├── src ├── strava_data │ ├── db │ │ ├── __init__.py │ │ ├── models.py │ │ └── dao.py │ ├── strava_api │ │ ├── visualisation │ │ │ ├── __init__.py │ │ │ ├── graphs_effort.py │ │ │ ├── graphs_pace.py │ │ │ ├── graphs_distribution.py │ │ │ ├── graphs_distance.py │ │ │ └── utils.py │ │ ├── processing │ │ │ ├── __init__.py │ │ │ └── transform.py │ │ ├── __init__.py │ │ └── client.py │ ├── ml │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── run_type_classifier.py │ │ ├── pace_forecast.py │ │ └── training_advisor.py │ ├── __init__.py │ ├── config.py │ └── auth.py ├── utils │ └── logger.py ├── get_tokens.py ├── generate_readme.py └── main.py ├── strava.sqlite ├── .bandit ├── Pace_by_Day.png ├── Activity_Heatmap.png ├── Cadence_Over_Time.png ├── Pace_Distribution.png ├── Rest_Days_Heatmap.png ├── Run_Days_Heatmap.png ├── Run_Type_Clusters.png ├── Cumulative_Distance.png ├── Time_Taken_Distance.png ├── VO2_Proxy_Over_Time.png ├── Forecast_Weekly_Pace.png ├── Longest_Run_per_Month.png ├── Run_Rest_Ratio_Heatmap.png ├── Running_Pace_over_Time.png ├── Median_1k_Pace_over_Time.png ├── Monthly_Distance_by_Year.png ├── Pace_Consistency_by_Run.png ├── Run_Start_Time_by_Month.png ├── Training_Load_Over_Time.png ├── A.I._Recommended_Training.png ├── Elevation_Gain_Distribution.png ├── Fastest_1k_Pace_over_Time.png ├── Number_of_Runs_per_Distance.png ├── Rolling_30_Day_Comparison.png ├── Run_Distance_Distribution.png ├── Total_Distance_Ran_by_Month.png ├── Run_Type_Distribution_By_Year.png ├── Elevation_Gain_per_KM_by_Month.png ├── Running_Pace_vs_Elevation_Change.png ├── Running_Pace_vs_Total_Distance.png ├── Time_Taken_Distance_Recent_Years.png ├── Training_Intensity_by_HeartRate_Zone.png ├── .github ├── dependabot.yml └── workflows │ ├── cleanup-runs.yml │ ├── test-code.yml │ ├── generate-stats.yml │ └── codeql-analysis.yml ├── LICENSE ├── pyproject.toml ├── .gitignore └── README.md /src/strava_data/db/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | strava_data db package. 
3 | """ 4 | -------------------------------------------------------------------------------- /strava.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/strava.sqlite -------------------------------------------------------------------------------- /.bandit: -------------------------------------------------------------------------------- 1 | [bandit] 2 | # B105 - Not a hardcoded password, it's a secrets passed in 3 | skips = B105 -------------------------------------------------------------------------------- /Pace_by_Day.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Pace_by_Day.png -------------------------------------------------------------------------------- /Activity_Heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Activity_Heatmap.png -------------------------------------------------------------------------------- /Cadence_Over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Cadence_Over_Time.png -------------------------------------------------------------------------------- /Pace_Distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Pace_Distribution.png -------------------------------------------------------------------------------- /Rest_Days_Heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Rest_Days_Heatmap.png -------------------------------------------------------------------------------- /Run_Days_Heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Run_Days_Heatmap.png -------------------------------------------------------------------------------- /Run_Type_Clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Run_Type_Clusters.png -------------------------------------------------------------------------------- /Cumulative_Distance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Cumulative_Distance.png -------------------------------------------------------------------------------- /Time_Taken_Distance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Time_Taken_Distance.png -------------------------------------------------------------------------------- /VO2_Proxy_Over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/VO2_Proxy_Over_Time.png -------------------------------------------------------------------------------- /Forecast_Weekly_Pace.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Forecast_Weekly_Pace.png -------------------------------------------------------------------------------- /Longest_Run_per_Month.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Longest_Run_per_Month.png -------------------------------------------------------------------------------- /Run_Rest_Ratio_Heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Run_Rest_Ratio_Heatmap.png -------------------------------------------------------------------------------- /Running_Pace_over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Running_Pace_over_Time.png -------------------------------------------------------------------------------- /Median_1k_Pace_over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Median_1k_Pace_over_Time.png -------------------------------------------------------------------------------- /Monthly_Distance_by_Year.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Monthly_Distance_by_Year.png -------------------------------------------------------------------------------- /Pace_Consistency_by_Run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Pace_Consistency_by_Run.png -------------------------------------------------------------------------------- /Run_Start_Time_by_Month.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Run_Start_Time_by_Month.png -------------------------------------------------------------------------------- /Training_Load_Over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Training_Load_Over_Time.png -------------------------------------------------------------------------------- /A.I._Recommended_Training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/A.I._Recommended_Training.png -------------------------------------------------------------------------------- /Elevation_Gain_Distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Elevation_Gain_Distribution.png -------------------------------------------------------------------------------- /Fastest_1k_Pace_over_Time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Fastest_1k_Pace_over_Time.png -------------------------------------------------------------------------------- /Number_of_Runs_per_Distance.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Number_of_Runs_per_Distance.png -------------------------------------------------------------------------------- /Rolling_30_Day_Comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Rolling_30_Day_Comparison.png -------------------------------------------------------------------------------- /Run_Distance_Distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Run_Distance_Distribution.png -------------------------------------------------------------------------------- /Total_Distance_Ran_by_Month.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Total_Distance_Ran_by_Month.png -------------------------------------------------------------------------------- /src/strava_data/strava_api/visualisation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualisation package for generating required charts. 3 | """ 4 | -------------------------------------------------------------------------------- /Run_Type_Distribution_By_Year.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Run_Type_Distribution_By_Year.png -------------------------------------------------------------------------------- /src/strava_data/strava_api/processing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Package for data processing, cleanup, and transformation logic. 
3 | """ 4 | -------------------------------------------------------------------------------- /Elevation_Gain_per_KM_by_Month.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Elevation_Gain_per_KM_by_Month.png -------------------------------------------------------------------------------- /Running_Pace_vs_Elevation_Change.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Running_Pace_vs_Elevation_Change.png -------------------------------------------------------------------------------- /Running_Pace_vs_Total_Distance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Running_Pace_vs_Total_Distance.png -------------------------------------------------------------------------------- /Time_Taken_Distance_Recent_Years.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Time_Taken_Distance_Recent_Years.png -------------------------------------------------------------------------------- /Training_Intensity_by_HeartRate_Zone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/c-wilkinson/StravaDataAnalysis/HEAD/Training_Intensity_by_HeartRate_Zone.png -------------------------------------------------------------------------------- /src/strava_data/strava_api/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | strava_api package containing logic to call the Strava endpoints and handle rate-limiting. 3 | """ 4 | -------------------------------------------------------------------------------- /src/strava_data/ml/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | strava_data.ml package. 3 | 4 | Contains all machine learning functionality for: 5 | - Forecasting future running pace 6 | - Model training and evaluation 7 | - ML visualisations 8 | """ 9 | -------------------------------------------------------------------------------- /src/strava_data/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | strava_data package. 
3 | 4 | Houses all core functionality for: 5 | - Authentication & config 6 | - Database models and data access 7 | - Strava API calls 8 | - Data processing 9 | - Visualization 10 | """ 11 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | - package-ecosystem: "pip" 8 | directory: "/" 9 | schedule: 10 | interval: "daily" 11 | -------------------------------------------------------------------------------- /.github/workflows/cleanup-runs.yml: -------------------------------------------------------------------------------- 1 | name: Cleanup All Old Workflow Runs 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" 6 | workflow_dispatch: 7 | 8 | jobs: 9 | cleanup: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Delete workflow runs 14 | uses: Mattraks/delete-workflow-runs@v2.1.0 15 | with: 16 | token: ${{ secrets.TOKEN_GITHUB }} 17 | repository: ${{ github.repository }} 18 | retain_days: 1 19 | -------------------------------------------------------------------------------- /src/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Central logger configuration. 3 | """ 4 | 5 | import logging 6 | import sys 7 | 8 | LOGGER = logging.getLogger("StravaDataAnalysis") 9 | LOGGER.setLevel(logging.INFO) 10 | 11 | handler = logging.StreamHandler(sys.stdout) 12 | handler.setLevel(logging.INFO) 13 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 14 | handler.setFormatter(formatter) 15 | LOGGER.addHandler(handler) 16 | 17 | 18 | def get_logger() -> logging.Logger: 19 | """ 20 | Returns the shared logger instance. 21 | """ 22 | return LOGGER 23 | -------------------------------------------------------------------------------- /.github/workflows/test-code.yml: -------------------------------------------------------------------------------- 1 | name: CodeTest 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | schedule: 9 | - cron: '25 11 * * *' 10 | workflow_dispatch: 11 | 12 | jobs: 13 | test-and-lint: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v6 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v6 22 | with: 23 | python-version: "3.11" 24 | 25 | - name: Install Poetry 26 | uses: abatilo/actions-poetry@v4 27 | 28 | - name: Install dependencies 29 | run: poetry install 30 | 31 | - name: Run pylint 32 | run: poetry run pylint src 33 | 34 | - name: Check formatting with black 35 | run: poetry run black --check src 36 | 37 | # - name: Run tests with pytest 38 | # run: poetry run pytest 39 | -------------------------------------------------------------------------------- /src/strava_data/ml/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared utilities for machine learning feature engineering. 3 | """ 4 | 5 | import pandas as pd 6 | 7 | 8 | def prepare_pace_summary(splits_df: pd.DataFrame, group_cols: list[str]) -> pd.DataFrame: 9 | """ 10 | Aggregates pace-based metrics by the given group columns (e.g. weekly or per run). 
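
    Example: a single 1,000 m split covered in 300 s gives
    pace_sec_km = 300 / (1000 / 1000) = 300, i.e. a 5:00/km pace. Splits
    outside the 950-1050 m band are discarded before aggregation.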
11 | """ 12 | data = splits_df.copy() 13 | data = data[(data["distance_m"] >= 950) & (data["distance_m"] <= 1050)] 14 | data["pace_sec_km"] = data["elapsed_time_s"] / (data["distance_m"] / 1000) 15 | data["distance_km"] = data["distance_m"] / 1000 16 | 17 | grouped = ( 18 | data.groupby(group_cols) 19 | .agg( 20 | distance_km=("distance_km", "sum"), 21 | pace_median=("pace_sec_km", "median"), 22 | pace_std=("pace_sec_km", "std"), 23 | split_count=("pace_sec_km", "count"), 24 | ) 25 | .reset_index() 26 | .dropna() 27 | ) 28 | 29 | return grouped 30 | -------------------------------------------------------------------------------- /src/strava_data/db/models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data classes or schemas representing Strava data in Python. 3 | """ 4 | 5 | from dataclasses import dataclass 6 | from typing import Optional 7 | 8 | 9 | @dataclass 10 | class Activity: 11 | # pylint: disable=too-many-instance-attributes 12 | """ 13 | Represents a single Strava activity row. 14 | """ 15 | 16 | activity_id: int 17 | name: str 18 | activity_type: str 19 | distance_m: float 20 | moving_time_s: int 21 | average_speed_m_s: float 22 | max_speed_m_s: float 23 | total_elevation_gain_m: float 24 | start_date_local: str 25 | average_cadence: float 26 | 27 | 28 | @dataclass 29 | class Split: 30 | # pylint: disable=too-many-instance-attributes 31 | """ 32 | Represents a single 1 km split from a Strava activity. 33 | """ 34 | 35 | split_id: int 36 | activity_id: int 37 | distance_m: float 38 | elapsed_time_s: int 39 | elevation_difference_m: float 40 | moving_time_s: int 41 | pace_zone: int 42 | split_index: int 43 | average_grade_adjusted_speed_m_s: float 44 | average_heartrate: Optional[float] 45 | start_date_local: str 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 |
24 | For more information, please refer to <https://unlicense.org>
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "strava_project"
3 | version = "0.1.1"
4 | description = "Codebase for Strava data analysis."
5 | authors = ["Craig Wilkinson"]
6 | readme = "README.md"
7 | packages = [{ include = "src" }]
8 |
9 | [tool.poetry.dependencies]
10 | python = "^3.11"
11 | pandas = "^2.3"
12 | matplotlib = "^3.10"
13 | seaborn = "^0.13"
14 | requests = "^2.32"
15 | pyAesCrypt = "^6.0"
16 | cryptography = ">=44.0.1"
17 | numpy = "^2.3"
18 | scikit-learn = "^1.8"
19 |
20 | [tool.poetry.group.dev.dependencies]
21 | pylint = "^4.0"
22 | pytest = "^9.0"
23 | black = "^25.12"
24 |
25 | [build-system]
26 | requires = ["poetry-core>=1.0.0"]
27 | build-backend = "poetry.core.masonry.api"
28 |
29 | [tool.black]
30 | line-length = 100
31 | target-version = ['py39']
32 | include = 'src/.*\.pyi?$'
33 | exclude = '''
34 | /(
35 |     \.venv
36 |   | \.git
37 |   | \.mypy_cache
38 |   | \.pytest_cache
39 |   | \.tox
40 |   | \.eggs
41 |   | \.idea
42 |   | __pycache__
43 |   | build
44 |   | dist
45 |   | tests
46 | )/
47 | '''
48 |
49 | [tool.pytest.ini_options]
50 | minversion = "6.0"
51 | addopts = "-ra -q"
52 | testpaths = ["src/tests"]
53 |
54 | [tool.pylint.'MAIN']
55 | max-line-length = 100
56 | disable = [
57 |     "missing-docstring",
58 |     "too-few-public-methods",
59 |     "too-many-arguments"
60 | ]
--------------------------------------------------------------------------------
/src/strava_data/config.py:
--------------------------------------------------------------------------------
1 | """
2 | Global configuration and environment variable handling for Strava secrets
3 | and optional encryption.
4 | """
5 |
6 | import os
7 |
8 | CONFIG_FILE = "config.txt"
9 |
10 | # pylint: disable=C0103
11 | # Attempt to read password / buffer size from local config file
12 | if os.path.isfile(CONFIG_FILE):
13 |     with open(CONFIG_FILE, "r", encoding="utf-8") as file_handle:
14 |         lines = file_handle.read().splitlines()
15 |         BUFFER_SIZE = int(lines[0].strip())  # First line
16 |         ENCRYPTION_PASSWORD = lines[1].strip()  # Second line
17 |         CLIENT_ID = lines[2].strip()  # Third line
18 |         CLIENT_SECRET = lines[3].strip()  # Fourth line
19 | else:
20 |     # Fallback to environment variables
21 |     BUFFER_SIZE = int(os.environ.get("BUFFERSIZE", 65536))  # default 64KB
22 |     ENCRYPTION_PASSWORD = os.environ.get("ENCRYPTIONPASSWORD", "default_password")
23 |     CLIENT_ID = os.environ.get("CLIENTID", "")
24 |     CLIENT_SECRET = os.environ.get("CLIENTSECRET", "")
25 | # pylint: enable=C0103
26 |
27 |
28 | def get_buffer_size() -> int:
29 |     """
30 |     Returns the buffer size used for file encryption/decryption.
31 |     Reads from config.txt if present, otherwise from environment variables.
32 |     """
33 |     return BUFFER_SIZE
34 |
35 |
36 | def get_encryption_password() -> str:
37 |     """
38 |     Returns the encryption password used for securing the database file.
39 |     Reads from config.txt if present, otherwise from environment variables.
40 |     """
41 |     return ENCRYPTION_PASSWORD
42 |
43 |
44 | def get_client_id() -> str:
45 |     """
46 |     Retrieves the Strava client ID, from config.txt if present, else the environment.
47 |     """
48 |     return CLIENT_ID
49 |
50 |
51 | def get_client_secret() -> str:
52 |     """
53 |     Retrieves the Strava client secret, from config.txt if present, else the environment.
54 |     """
55 |     return CLIENT_SECRET
56 |
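`config.py` supplies `get_buffer_size()` and `get_encryption_password()` to the database encryption helpers in `src/strava_data/db/dao.py`, which this section does not include. A minimal sketch of what those two helpers could look like with pyAesCrypt, assuming the temp-file name listed in `.gitignore`; the actual dao.py implementation may differ:

```python
import os

import pyAesCrypt

from strava_data.config import get_buffer_size, get_encryption_password

ENCRYPTED_DB = "strava.sqlite"       # encrypted copy committed to the repo
PLAINTEXT_DB = "strava_temp.sqlite"  # assumed: matches the temp name in .gitignore


def decrypt_database() -> None:
    """Decrypt the AES-encrypted database into a temporary plaintext file."""
    if os.path.isfile(ENCRYPTED_DB):
        pyAesCrypt.decryptFile(
            ENCRYPTED_DB, PLAINTEXT_DB, get_encryption_password(), get_buffer_size()
        )


def encrypt_database() -> None:
    """Re-encrypt the plaintext database and delete the temporary file."""
    if os.path.isfile(PLAINTEXT_DB):
        pyAesCrypt.encryptFile(
            PLAINTEXT_DB, ENCRYPTED_DB, get_encryption_password(), get_buffer_size()
        )
        os.remove(PLAINTEXT_DB)
```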
54 | """ 55 | return CLIENT_SECRET 56 | -------------------------------------------------------------------------------- /.github/workflows/generate-stats.yml: -------------------------------------------------------------------------------- 1 | name: Generate Stats and Update README 2 | 3 | on: 4 | schedule: 5 | - cron: '2,17,32,47 1,4,7,10,13,16,19,22 * * *' 6 | workflow_dispatch: 7 | jobs: 8 | Stats: 9 | runs-on: ubuntu-latest 10 | permissions: 11 | contents: write 12 | 13 | steps: 14 | - name: Check out code 15 | uses: actions/checkout@v6 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v6 19 | with: 20 | python-version: "3.11" 21 | 22 | - name: Install Poetry 23 | uses: abatilo/actions-poetry@v4 24 | 25 | - name: Install dependencies 26 | run: poetry install 27 | 28 | - name: Generate Stats 29 | run: | 30 | poetry run python src/main.py 31 | continue-on-error: true 32 | env: 33 | BUFFERSIZE: ${{ secrets.BUFFERSIZE }} 34 | ENCRYPTIONPASSWORD: ${{ secrets.ENCRYPTIONPASSWORD }} 35 | CLIENTID: ${{ secrets.CLIENTID }} 36 | CLIENTSECRET: ${{ secrets.CLIENTSECRET }} 37 | 38 | - name: Generate README 39 | run: | 40 | poetry run python src/generate_readme.py 41 | continue-on-error: true 42 | env: 43 | BUFFERSIZE: ${{ secrets.BUFFERSIZE }} 44 | ENCRYPTIONPASSWORD: ${{ secrets.ENCRYPTIONPASSWORD }} 45 | CLIENTID: ${{ secrets.CLIENTID }} 46 | CLIENTSECRET: ${{ secrets.CLIENTSECRET }} 47 | 48 | - name: Commit and push changes 49 | run: | 50 | git config --global user.name 'GithubBot' 51 | git config --global user.email 'GithubBot@9bc0ff44ae664378ab0252851a8954ad.com' 52 | git remote set-url origin https://x-access-token:${{ secrets.TOKEN_GITHUB }}@github.com/${{ github.repository }} 53 | git diff-index --quiet HEAD || git commit --allow-empty -am "Automated changes" 54 | git push 55 | env: 56 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 57 | -------------------------------------------------------------------------------- /src/strava_data/auth.py: -------------------------------------------------------------------------------- 1 | """ 2 | Handles Strava OAuth token retrieval and refresh logic. 3 | """ 4 | 5 | import json 6 | import time 7 | import requests 8 | from strava_data.config import get_client_id, get_client_secret 9 | from strava_data.db.dao import read_tokens, store_tokens 10 | from utils.logger import get_logger 11 | 12 | LOGGER = get_logger() 13 | 14 | 15 | def get_or_refresh_tokens() -> None: 16 | """ 17 | Reads existing tokens from the database. 18 | If expired, refreshes them via Strava OAuth. 19 | If none exist, the user must initially obtain them with a manual OAuth flow. 20 | """ 21 | tokens = read_tokens() 22 | if not tokens: 23 | LOGGER.info("No tokens found in the database. Please obtain them initially.") 24 | return 25 | 26 | expires_at = tokens.get("expires_at", 0) 27 | if expires_at < time.time(): 28 | LOGGER.info("Tokens expired. Refreshing now.") 29 | refresh_token = tokens.get("refresh_token", "") 30 | new_tokens = refresh_strava_tokens(refresh_token) 31 | if not new_tokens: 32 | raise RuntimeError("Token refresh failed") 33 | store_tokens(new_tokens) 34 | else: 35 | LOGGER.info("Tokens are still valid.") 36 | 37 | 38 | def refresh_strava_tokens(refresh_token: str) -> dict: 39 | """ 40 | Calls Strava's /oauth/token endpoint to refresh an expired token. 41 | 42 | :param refresh_token: The user's current refresh token from the DB. 43 | :return: Dictionary of new tokens, or an empty dict if refresh fails. 
44 | """ 45 | url = "https://www.strava.com/oauth/token" 46 | payload = { 47 | "client_id": get_client_id(), 48 | "client_secret": get_client_secret(), 49 | "grant_type": "refresh_token", 50 | "refresh_token": refresh_token, 51 | } 52 | 53 | payload_str = json.dumps(payload) 54 | LOGGER.info(payload_str) 55 | try: 56 | response = requests.post(url, data=payload, timeout=10) 57 | except requests.exceptions.Timeout: 58 | LOGGER.info("Token refresh request timed out.") 59 | return {} 60 | 61 | if response.ok: 62 | return response.json() 63 | 64 | LOGGER.info( 65 | "Failed to refresh tokens. Status: %s Response: %s", response.status_code, response.text 66 | ) 67 | return {} 68 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '25 11 * * *' 22 | workflow_dispatch: 23 | 24 | jobs: 25 | analyze: 26 | name: Analyze 27 | runs-on: ubuntu-latest 28 | permissions: 29 | actions: read 30 | contents: read 31 | security-events: write 32 | 33 | strategy: 34 | fail-fast: false 35 | matrix: 36 | language: [ 'python' ] 37 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 38 | # Learn more: 39 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 40 | 41 | steps: 42 | - name: Checkout repository 43 | uses: actions/checkout@v6 44 | 45 | # Initializes the CodeQL tools for scanning. 46 | - name: Initialize CodeQL 47 | uses: github/codeql-action/init@v4 48 | with: 49 | languages: ${{ matrix.language }} 50 | # If you wish to specify custom queries, you can do so here or in a config file. 51 | # By default, queries listed here will override any specified in a config file. 52 | # Prefix the list here with "+" to use these queries and those in the config file. 53 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 54 | 55 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 56 | # If this step fails, then you should remove it and run the build manually (see below) 57 | - name: Autobuild 58 | uses: github/codeql-action/autobuild@v4 59 | 60 | # ℹ️ Command-line programs to run using the OS shell. 
61 | # 📚 https://git.io/JvXDl 62 | 63 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 64 | # and modify them (or add more) to build your code if your project 65 | # uses a compiled language 66 | 67 | #- run: | 68 | # make bootstrap 69 | # make release 70 | 71 | - name: Perform CodeQL Analysis 72 | uses: github/codeql-action/analyze@v4 73 | -------------------------------------------------------------------------------- /src/get_tokens.py: -------------------------------------------------------------------------------- 1 | """ 2 | Allows a one-time, manual retrieval of Strava tokens using an authorization code. 3 | Usage: 4 | 1. Run: python get_tokens.py 5 | 2. Paste the authorization code when prompted. 6 | 3. The script will request tokens from Strava and store them in the encrypted database. 7 | """ 8 | 9 | import requests 10 | 11 | from strava_data.db.dao import decrypt_database, encrypt_database, store_tokens 12 | 13 | 14 | def main() -> None: 15 | """ 16 | Steps: 17 | 1. Prompt user for the 'authorization code' from the redirect URL. 18 | 2. Fetch tokens from Strava's /oauth/token endpoint. 19 | 3. Store tokens in the DB (encrypting at the end). 20 | """ 21 | print("=== Strava Token Retrieval ===") 22 | print( 23 | "After creating a Strava application and authorizing it, you obtain a code " 24 | "in the redirect URL." 25 | ) 26 | print( 27 | "Example redirect URL: http://localhost/exchange_token?state=&code=LONGCODEHERE" 28 | "&scope=read,activity:read_all,profile:read_all" 29 | ) 30 | print("Enter your LONGCODEHERE value below.\n") 31 | 32 | auth_code = input("Paste your Strava authorization code: ").strip() 33 | if not auth_code: 34 | print("No authorization code provided. Exiting.") 35 | return 36 | 37 | client_id = input("Paste your Strava client id: ").strip() 38 | if not client_id: 39 | print("No client id provided. Exiting.") 40 | return 41 | 42 | client_secret = input("Paste your Strava client secret: ").strip() 43 | if not client_secret: 44 | print("No client secret provided. Exiting.") 45 | return 46 | 47 | print("\nRequesting tokens from Strava...") 48 | try: 49 | response = requests.post( 50 | url="https://www.strava.com/oauth/token", 51 | data={ 52 | "client_id": client_id, 53 | "client_secret": client_secret, 54 | "code": auth_code, 55 | "grant_type": "authorization_code", 56 | }, 57 | timeout=10, 58 | ) 59 | except requests.exceptions.Timeout: 60 | print("Timeout occurred while requesting Strava tokens.") 61 | return 62 | 63 | strava_tokens = response.json() 64 | if "errors" in strava_tokens or "message" in strava_tokens: 65 | print("Failed to retrieve tokens. Strava responded with:") 66 | print(strava_tokens) 67 | return 68 | 69 | print("Successfully retrieved tokens!") 70 | print(strava_tokens) 71 | 72 | print("\nStoring tokens in the database...") 73 | decrypt_database() 74 | store_tokens(strava_tokens) 75 | encrypt_database() 76 | print("Tokens stored successfully. 
Database re-encrypted.\n") 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | config.txt 2 | strava_temp.sqlite 3 | *.lock 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ -------------------------------------------------------------------------------- /src/strava_data/strava_api/processing/transform.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data transformation utilities for activities and splits. 3 | """ 4 | 5 | import pandas as pd 6 | import numpy as np 7 | 8 | 9 | def transform_activities(activities_df: pd.DataFrame) -> pd.DataFrame: 10 | """ 11 | Cleans and enriches raw Strava activities. 
12 | 13 | :param activities_df: Raw DataFrame from Strava's /activities endpoint. 14 | :return: DataFrame with standardized columns and transformations. 15 | """ 16 | if activities_df.empty: 17 | return pd.DataFrame() 18 | 19 | activities_clean = activities_df.copy() 20 | 21 | activities_clean["distance_m"] = activities_clean["distance"] 22 | activities_clean["moving_time_s"] = activities_clean["moving_time"] 23 | activities_clean["average_speed_m_s"] = activities_clean["average_speed"] 24 | activities_clean["max_speed_m_s"] = activities_clean["max_speed"] 25 | activities_clean["total_elevation_gain_m"] = activities_clean["total_elevation_gain"] 26 | 27 | if "average_cadence" not in activities_clean.columns: 28 | activities_clean["average_cadence"] = 0.0 29 | 30 | activities_clean["start_date_local"] = activities_clean["start_date_local"] 31 | activities_clean["activity_type"] = np.where( 32 | activities_clean["type"].str.lower() == "run", "Run", activities_clean["type"] 33 | ) 34 | 35 | final_cols = [ 36 | "id", 37 | "name", 38 | "type", 39 | "distance_m", 40 | "moving_time_s", 41 | "average_speed_m_s", 42 | "max_speed_m_s", 43 | "total_elevation_gain_m", 44 | "start_date_local", 45 | "average_cadence", 46 | ] 47 | 48 | return activities_clean[final_cols].copy() 49 | 50 | 51 | def transform_splits(splits_df: pd.DataFrame) -> pd.DataFrame: 52 | """ 53 | Cleans and enriches splits data from Strava activities. 54 | 55 | :param splits_df: DataFrame from activity detail calls. 56 | :return: DataFrame with standardized columns for splits. 57 | """ 58 | if splits_df.empty: 59 | return pd.DataFrame() 60 | 61 | splits_clean = splits_df.copy() 62 | 63 | splits_clean["distance_m"] = splits_clean["distance"] 64 | splits_clean["elapsed_time_s"] = splits_clean["elapsed_time"] 65 | splits_clean["elevation_difference_m"] = splits_clean["elevation_difference"] 66 | splits_clean["moving_time_s"] = splits_clean["moving_time"] 67 | splits_clean["average_grade_adjusted_speed_m_s"] = splits_clean["average_grade_adjusted_speed"] 68 | splits_clean["average_heartrate"] = splits_clean.get("average_heartrate", np.nan) 69 | splits_clean["split_index"] = splits_clean["split"] 70 | splits_clean["start_date_local"] = splits_clean["start_date_local"] 71 | 72 | final_cols = [ 73 | "activity_id", 74 | "distance_m", 75 | "elapsed_time_s", 76 | "elevation_difference_m", 77 | "moving_time_s", 78 | "pace_zone", 79 | "split_index", 80 | "average_grade_adjusted_speed_m_s", 81 | "average_heartrate", 82 | "start_date_local", 83 | ] 84 | 85 | return splits_clean[final_cols].copy() 86 | -------------------------------------------------------------------------------- /src/strava_data/ml/run_type_classifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | Machine learning to classify run types (e.g. 
Easy, Tempo, Intervals, Long) 3 | """ 4 | 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | from sklearn.cluster import KMeans 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.metrics import silhouette_score 11 | 12 | from strava_data.ml.utils import prepare_pace_summary 13 | from strava_data.strava_api.visualisation.utils import ( 14 | prepare_dated_activities, 15 | save_and_close_plot, 16 | format_pace, 17 | add_title_with_attribution, 18 | TitleBoxConfig, 19 | ) 20 | from utils.logger import get_logger 21 | 22 | LOGGER = get_logger() 23 | 24 | RUN_TYPE_LABELS = { 25 | 0: "Easy", 26 | 1: "Tempo", 27 | 2: "Intervals", 28 | 3: "Long", 29 | } 30 | 31 | 32 | def build_run_features(splits_df: pd.DataFrame) -> pd.DataFrame: 33 | """ 34 | Aggregates split data into per-run features for clustering. 35 | """ 36 | data = prepare_dated_activities(splits_df) 37 | data["start_date"] = pd.to_datetime(data["start_date_local"]).dt.tz_localize(None) 38 | 39 | # Group by activity ID and start date to represent each run 40 | summary = prepare_pace_summary(data, group_cols=["activity_id", "start_date_local"]) 41 | 42 | summary["start_date"] = pd.to_datetime(summary["start_date_local"]) 43 | summary["day_of_week"] = summary["start_date"].dt.dayofweek 44 | summary["month"] = summary["start_date"].dt.month 45 | summary["year"] = summary["start_date"].dt.year 46 | 47 | return summary 48 | 49 | 50 | def cluster_run_types(data: pd.DataFrame, n_clusters: int = 4) -> pd.DataFrame: 51 | """ 52 | Applies KMeans clustering to classify run types. 53 | """ 54 | features = data[["distance_km", "pace_median", "pace_std", "split_count"]] 55 | scaler = StandardScaler() 56 | features_scaled = scaler.fit_transform(features) 57 | 58 | model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) 59 | cluster_labels = model.fit_predict(features_scaled) 60 | 61 | score = silhouette_score(features_scaled, cluster_labels) 62 | LOGGER.info("Silhouette Score: %.3f", score) 63 | 64 | data["run_type_cluster"] = cluster_labels 65 | data["run_type"] = data["run_type_cluster"].map(RUN_TYPE_LABELS) 66 | return data 67 | 68 | 69 | def plot_clusters(data: pd.DataFrame, output_path: str) -> None: 70 | """ 71 | Scatterplot of distance vs. pace coloured by run type. 72 | """ 73 | plt.figure(figsize=(10, 6)) 74 | sns.scatterplot( 75 | data=data, 76 | x="distance_km", 77 | y="pace_median", 78 | hue="run_type", 79 | palette="tab10", 80 | alpha=0.8, 81 | ) 82 | plt.xlabel("Distance (km)") 83 | plt.ylabel("Pace (mm:ss per km)") 84 | plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(format_pace)) 85 | plt.grid(True) 86 | plt.legend(title="Run Type") 87 | add_title_with_attribution( 88 | plt.gcf(), 89 | "Run Type Clustering", 90 | TitleBoxConfig(), 91 | ) 92 | save_and_close_plot(output_path) 93 | 94 | 95 | def plot_run_type_distribution_by_year(data: pd.DataFrame, output_path: str) -> None: 96 | """ 97 | Bar chart showing count of run types per year. 
98 | """ 99 | counts = data.groupby(["year", "run_type"]).size().reset_index(name="count") 100 | pivot = counts.pivot(index="year", columns="run_type", values="count").fillna(0) 101 | 102 | pivot.plot(kind="bar", stacked=True, figsize=(10, 6), colormap="tab10") 103 | plt.xlabel("Year") 104 | plt.ylabel("Number of Runs") 105 | plt.xticks(rotation=45) 106 | plt.legend(title="Run Type") 107 | plt.grid(True, axis="y") 108 | add_title_with_attribution( 109 | plt.gcf(), 110 | "Run Type Distribution by Year", 111 | TitleBoxConfig(), 112 | ) 113 | save_and_close_plot(output_path) 114 | 115 | 116 | def run_clustering_pipeline(splits_df: pd.DataFrame) -> pd.DataFrame: 117 | """ 118 | Runs the full clustering pipeline: feature prep, clustering, and visualisation. 119 | """ 120 | LOGGER.info("Loading and building features...") 121 | feature_data = build_run_features(splits_df) 122 | 123 | LOGGER.info("Running KMeans clustering...") 124 | clustered = cluster_run_types(feature_data, n_clusters=4) 125 | 126 | LOGGER.info("Plotting clusters...") 127 | plot_clusters(clustered, "Run_Type_Clusters.png") 128 | 129 | LOGGER.info("Plotting run type distribution...") 130 | plot_run_type_distribution_by_year(clustered, "Run_Type_Distribution_By_Year.png") 131 | 132 | return clustered 133 | -------------------------------------------------------------------------------- /src/strava_data/ml/pace_forecast.py: -------------------------------------------------------------------------------- 1 | """ 2 | Machine learning to forecast pace 3 | """ 4 | 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | from sklearn.linear_model import Ridge 8 | from sklearn.model_selection import TimeSeriesSplit, cross_val_score 9 | from sklearn.pipeline import Pipeline 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | from strava_data.ml.utils import prepare_pace_summary 13 | from strava_data.strava_api.visualisation.utils import ( 14 | prepare_dated_activities, 15 | format_pace, 16 | save_and_close_plot, 17 | add_title_with_attribution, 18 | TitleBoxConfig, 19 | ) 20 | from utils.logger import get_logger 21 | 22 | LOGGER = get_logger() 23 | 24 | 25 | def build_weekly_pace_features(splits_df: pd.DataFrame) -> pd.DataFrame: 26 | """ 27 | Aggregates ~1 km splits into weekly median pace and rolling stats. 28 | """ 29 | data = prepare_dated_activities(splits_df) 30 | data["start_date"] = pd.to_datetime(data["start_date_local"]).dt.tz_localize(None) 31 | data["week"] = data["start_date"].dt.to_period("W").apply(lambda r: r.start_time) 32 | summary = prepare_pace_summary(data, group_cols=["week"]) 33 | 34 | summary["pace_7d_avg"] = summary["pace_median"].rolling(window=2).mean() 35 | summary["pace_7d_std"] = summary["pace_median"].rolling(window=2).std() 36 | return summary.dropna() 37 | 38 | 39 | def train_forecast_model(data: pd.DataFrame): 40 | """ 41 | Trains a Ridge regression model using time-based cross-validation. 
42 | """ 43 | features = data[["pace_7d_avg", "pace_7d_std", "split_count"]] 44 | target = data["pace_median"] 45 | 46 | model = Pipeline([("scale", StandardScaler()), ("ridge", Ridge(alpha=1.0))]) 47 | 48 | tscv = TimeSeriesSplit(n_splits=5) 49 | scores = cross_val_score( 50 | model, features, target, cv=tscv, scoring="neg_root_mean_squared_error" 51 | ) 52 | LOGGER.info("CV RMSE: %.2f seconds", -scores.mean()) 53 | 54 | model.fit(features, target) 55 | return model 56 | 57 | 58 | def predict_next_week(model, latest_row: pd.Series): 59 | """ 60 | Uses trained model to predict next week's average pace. 61 | """ 62 | next_features = latest_row[["pace_7d_avg", "pace_7d_std", "split_count"]].to_frame().T 63 | predicted_pace = model.predict(next_features)[0] 64 | minutes = int(predicted_pace // 60) 65 | seconds = int(predicted_pace % 60) 66 | LOGGER.info("Forecasted pace for next week: %d:%02d per km", minutes, seconds) 67 | return predicted_pace 68 | 69 | 70 | def plot_forecast(weekly_data: pd.DataFrame, forecast_value: float, output_path: str) -> None: 71 | """ 72 | Plots weekly median pace and overlays the next week's forecast as an X with RMSE band. 73 | """ 74 | plt.figure(figsize=(10, 6)) 75 | plt.plot(weekly_data["week"], weekly_data["pace_median"], label="Actual Pace", marker="o") 76 | 77 | true_values = weekly_data["pace_median"] 78 | feature_values = weekly_data[["pace_7d_avg", "pace_7d_std", "split_count"]] 79 | model = Pipeline([("scale", StandardScaler()), ("ridge", Ridge(alpha=1.0))]) 80 | model.fit(feature_values, true_values) 81 | residuals = true_values - model.predict(feature_values) 82 | rmse = residuals.std() 83 | 84 | forecast_week = weekly_data["week"].max() + pd.Timedelta(weeks=1) 85 | plt.scatter( 86 | forecast_week, forecast_value, marker="x", color="red", s=100, label="Forecast Next Week" 87 | ) 88 | 89 | plt.fill_between( 90 | weekly_data["week"].astype("datetime64[ns]"), 91 | weekly_data["pace_median"] - rmse, 92 | weekly_data["pace_median"] + rmse, 93 | color="blue", 94 | alpha=0.1, 95 | label="±1 RMSE Band", 96 | ) 97 | 98 | plt.xlabel("Week") 99 | plt.ylabel("Pace (mm:ss)") 100 | plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(format_pace)) 101 | plt.legend() 102 | plt.grid(True) 103 | add_title_with_attribution( 104 | plt.gcf(), 105 | "Weekly Median Pace with Forecast", 106 | TitleBoxConfig(), 107 | ) 108 | save_and_close_plot(output_path) 109 | 110 | 111 | def run_forecast_pipeline(splits_df: pd.DataFrame) -> None: 112 | """ 113 | Orchestrates weekly pace forecast: feature prep, training, prediction, plotting. 114 | """ 115 | LOGGER.info("Building features from splits...") 116 | weekly_data = build_weekly_pace_features(splits_df) 117 | 118 | LOGGER.info("Training forecast model...") 119 | model = train_forecast_model(weekly_data) 120 | 121 | LOGGER.info("Predicting future pace...") 122 | latest_features = weekly_data.iloc[-1] 123 | forecast_value = predict_next_week(model, latest_features) 124 | 125 | LOGGER.info("Generating forecast chart...") 126 | plot_forecast(weekly_data, forecast_value, "Forecast_Weekly_Pace.png") 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # StravaDataAnalysis 2 | This repository extracts data from the Strava API, which is downstream of Garmin devices, stores it locally (encrypted), and generates visualizations. 
3 | 4 | If other people start using this, I'll try and streamline this process as much as I can. 5 | 6 | [![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](http://unlicense.org/) 7 | [![CodeFactor](https://www.codefactor.io/repository/github/c-wilkinson/stravadataanalysis/badge)](https://www.codefactor.io/repository/github/c-wilkinson/stravadataanalysis) 8 | [![Codacy Badge](https://api.codacy.com/project/badge/Grade/9f08e367a5594645aa30c1e31c54dbb8)](https://app.codacy.com/gh/c-wilkinson/StravaDataAnalysis?utm_source=github.com&utm_medium=referral&utm_content=c-wilkinson/StravaDataAnalysis&utm_campaign=Badge_Grade) 9 | [![CodeTest](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/test-code.yml/badge.svg)](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/test-code.yml) 10 | [![GenerateStats](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/generate-stats.yml/badge.svg)](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/generate-stats.yml) 11 | [![CodeQL](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/codeql-analysis.yml) 12 | 13 | ## Generated Content 14 | 📅 Stats last updated on: **2025-12-19 02:50:13** 15 | 16 | 🏃‍♂️ Most recent run: 0 years, 0 months, 3 days, 20 hours and 50 minutes 17 | 18 | ![A.I. Recommended Training](A.I._Recommended_Training.png?raw=true "A.I. Recommended Training") 19 | 20 | ![Activity Heatmap](Activity_Heatmap.png?raw=true "Activity Heatmap") 21 | 22 | ![Cadence Over Time](Cadence_Over_Time.png?raw=true "Cadence Over Time") 23 | 24 | ![Cumulative Distance](Cumulative_Distance.png?raw=true "Cumulative Distance") 25 | 26 | ![Elevation Gain Distribution](Elevation_Gain_Distribution.png?raw=true "Elevation Gain Distribution") 27 | 28 | ![Elevation Gain Per Km By Month](Elevation_Gain_per_KM_by_Month.png?raw=true "Elevation Gain Per Km By Month") 29 | 30 | ![Fastest 1K Pace Over Time](Fastest_1k_Pace_over_Time.png?raw=true "Fastest 1K Pace Over Time") 31 | 32 | ![Forecast Weekly Pace](Forecast_Weekly_Pace.png?raw=true "Forecast Weekly Pace") 33 | 34 | ![Longest Run Per Month](Longest_Run_per_Month.png?raw=true "Longest Run Per Month") 35 | 36 | ![Median 1K Pace Over Time](Median_1k_Pace_over_Time.png?raw=true "Median 1K Pace Over Time") 37 | 38 | ![Monthly Distance By Year](Monthly_Distance_by_Year.png?raw=true "Monthly Distance By Year") 39 | 40 | ![Number Of Runs Per Distance](Number_of_Runs_per_Distance.png?raw=true "Number Of Runs Per Distance") 41 | 42 | ![Pace Consistency By Run](Pace_Consistency_by_Run.png?raw=true "Pace Consistency By Run") 43 | 44 | ![Pace Distribution](Pace_Distribution.png?raw=true "Pace Distribution") 45 | 46 | ![Pace By Day](Pace_by_Day.png?raw=true "Pace By Day") 47 | 48 | ![Rest Days Heatmap](Rest_Days_Heatmap.png?raw=true "Rest Days Heatmap") 49 | 50 | ![Rolling 30 Day Comparison](Rolling_30_Day_Comparison.png?raw=true "Rolling 30 Day Comparison") 51 | 52 | ![Run Days Heatmap](Run_Days_Heatmap.png?raw=true "Run Days Heatmap") 53 | 54 | ![Run Distance Distribution](Run_Distance_Distribution.png?raw=true "Run Distance Distribution") 55 | 56 | ![Run Rest Ratio Heatmap](Run_Rest_Ratio_Heatmap.png?raw=true "Run Rest Ratio Heatmap") 57 | 58 | ![Run Start Time By Month](Run_Start_Time_by_Month.png?raw=true "Run Start Time By Month") 59 | 60 | ![Run Type Clusters](Run_Type_Clusters.png?raw=true "Run Type Clusters") 61 | 
62 | ![Run Type Distribution By Year](Run_Type_Distribution_By_Year.png?raw=true "Run Type Distribution By Year")
63 |
64 | ![Running Pace Over Time](Running_Pace_over_Time.png?raw=true "Running Pace Over Time")
65 |
66 | ![Running Pace Vs Elevation Change](Running_Pace_vs_Elevation_Change.png?raw=true "Running Pace Vs Elevation Change")
67 |
68 | ![Running Pace Vs Total Distance](Running_Pace_vs_Total_Distance.png?raw=true "Running Pace Vs Total Distance")
69 |
70 | ![Time Taken Distance](Time_Taken_Distance.png?raw=true "Time Taken Distance")
71 |
72 | ![Time Taken Distance Recent Years](Time_Taken_Distance_Recent_Years.png?raw=true "Time Taken Distance Recent Years")
73 |
74 | ![Total Distance Ran By Month](Total_Distance_Ran_by_Month.png?raw=true "Total Distance Ran By Month")
75 |
76 | ![Training Intensity By Heartrate Zone](Training_Intensity_by_HeartRate_Zone.png?raw=true "Training Intensity By Heartrate Zone")
77 |
78 | ![Training Load Over Time](Training_Load_Over_Time.png?raw=true "Training Load Over Time")
79 |
80 | ![Vo2 Proxy Over Time](VO2_Proxy_Over_Time.png?raw=true "Vo2 Proxy Over Time")
81 |
82 | ## Instructions
83 | As I'm sure is obvious, I'm teaching myself Python as I go, so the code quality is not likely to be great. Do with it as you wish.
84 |
85 | 1. To use, create an Application on Strava. This can be done here: https://www.strava.com/settings/api
86 |
87 | Give it a name, a website and an 'Authorization Callback Domain'. The 'Authorization Callback Domain' should be 'localhost'.
88 |
89 | 2. Copy and paste the following link into your browser, replacing {CLIENTIDHERE} with your numeric Client ID found on your Strava application settings page.
90 |
91 | > http://www.strava.com/oauth/authorize?client_id={CLIENTIDHERE}&response_type=code&redirect_uri=http://localhost/exchange_token&approval_prompt=force&scope=profile:read_all,activity:read_all
92 |
93 | Click authorise when you visit the above link.
94 |
95 | 3. You will go to a 404 not found page with a link that looks like this: -
96 |
97 | > http://localhost/exchange_token?state=&code={LONGCODEHERE}&scope=read,activity:read_all,profile:read_all
98 |
99 | Copy the code after '&code=' to save for step 4. You will also need your client ID and client secret found on your Strava application settings page.
100 |
101 | 4. Run 'get_tokens.py'. This will create the initial tokens required for the script.
102 |
103 | Once this has been completed, you can run 'main.py' which uses the tokens to get the data points. If the access_token has expired, it will refresh its tokens automatically during run time.
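As an aside to step 2, the same authorization URL can be assembled programmatically. A throwaway sketch using only the standard library; the client ID is a placeholder, and the parameters simply mirror the URL quoted above:

```python
from urllib.parse import urlencode

CLIENT_ID = "12345"  # placeholder - use the numeric ID from your Strava settings page

params = {
    "client_id": CLIENT_ID,
    "response_type": "code",
    "redirect_uri": "http://localhost/exchange_token",
    "approval_prompt": "force",
    "scope": "profile:read_all,activity:read_all",
}
# Prints the same authorization URL as step 2 of the instructions above
print("https://www.strava.com/oauth/authorize?" + urlencode(params))
```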
--------------------------------------------------------------------------------
/src/generate_readme.py:
--------------------------------------------------------------------------------
1 | """
2 | Generates an updated README.md at the top-level of the repository.
3 | """
4 |
5 | import os
6 | from datetime import datetime
7 | from dateutil.relativedelta import relativedelta
8 |
9 | from strava_data.db.dao import decrypt_database, encrypt_database, get_last_run_time
10 | from utils.logger import get_logger
11 |
12 | LOGGER = get_logger()
13 |
14 | # Build a path to the README.md in the top-level directory
15 | README_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "README.md")
16 |
17 |
18 | def generate_readme() -> None:
19 |     """
20 |     1. Decrypts the DB if needed.
21 |     2. Fetches the last run from the activities table.
22 |     3. Calculates how long ago it was.
23 |     4. Rebuilds README.md in the top-level directory with embedded graphs.
24 |     5. Encrypts DB again if desired.
25 |     """
26 |     LOGGER.info("Start generate_readme.")
27 |     decrypt_database()
28 |
29 |     last_updated = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
30 |     last_run_time = get_last_run_time()
31 |     time_string = "No runs found!"
32 |     if last_run_time is not None:
33 |         delta = relativedelta(datetime.now(), last_run_time)
34 |         time_string = (
35 |             f"{delta.years} years, "
36 |             f"{delta.months} months, "
37 |             f"{delta.days} days, "
38 |             f"{delta.hours} hours and "
39 |             f"{delta.minutes} minutes"
40 |         )
41 |
42 |     encrypt_database()
43 |     if os.path.exists(README_PATH):
44 |         os.remove(README_PATH)
45 |
46 |     readme_dir = os.path.dirname(README_PATH)
47 |
48 |     with open(README_PATH, "w", encoding="utf-8") as handle:
49 |         handle.write("# StravaDataAnalysis\n")
50 |         handle.write(
51 |             "This repository extracts data from the Strava API, which is downstream of Garmin "
52 |             "devices, stores it locally (encrypted), and generates visualizations.\n\n"
53 |             "If other people start using this, I'll try and streamline this process as much as I "
54 |             "can.\n\n"
55 |             "[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)]"
56 |             "(http://unlicense.org/)\n"
57 |             "[![CodeFactor]("
58 |             "https://www.codefactor.io/repository/github/c-wilkinson/stravadataanalysis/badge)]"
59 |             "(https://www.codefactor.io/repository/github/c-wilkinson/stravadataanalysis)\n"
60 |             "[![Codacy Badge]("
61 |             "https://api.codacy.com/project/badge/Grade/9f08e367a5594645aa30c1e31c54dbb8)]"
62 |             "(https://app.codacy.com/gh/c-wilkinson/StravaDataAnalysis?"
63 |             "utm_source=github.com&utm_medium=referral"
64 |             "&utm_content=c-wilkinson/StravaDataAnalysis&utm_campaign=Badge_Grade)\n"
65 |             "[![CodeTest](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/"
66 |             "test-code.yml/badge.svg)]"
67 |             "(https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/test-code.yml)\n"
68 |             "[![GenerateStats](https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/"
69 |             "generate-stats.yml/badge.svg)]"
70 |             "(https://github.com/c-wilkinson/StravaDataAnalysis/actions"
71 |             "/workflows/generate-stats.yml)\n"
72 |             "[![CodeQL](https://github.com/"
73 |             "c-wilkinson/StravaDataAnalysis/actions/workflows/codeql-analysis.yml/"
74 |             "badge.svg)]"
75 |             "(https://github.com/c-wilkinson/StravaDataAnalysis/actions/workflows/"
76 |             "codeql-analysis.yml)\n\n"
77 |         )
78 |         handle.write("## Generated Content\n")
79 |         handle.write(f"📅 Stats last updated on: **{last_updated}**\n\n")
80 |         handle.write(f"🏃‍♂️ Most recent run: {time_string}\n\n")
81 |
82 |         # Dynamically insert all PNG images
83 |         image_files = sorted(f for f in os.listdir(readme_dir) if f.endswith(".png"))
84 |         for image in image_files:
85 |             title = image.replace("_", " ").replace(".png", "").title()
86 |             LOGGER.info("Adding %s to readme.md", title)
87 |             handle.write(f'![{title}]({image}?raw=true "{title}")\n\n')
88 |
89 |         handle.write("## Instructions\n")
90 |         handle.write(
91 |             "As I'm sure is obvious, I'm teaching myself Python as I go, so the code "
92 |             "quality is not "
93 |             "likely to be great. Do with it as you wish.\n\n"
94 |             "1. To use, create an Application on Strava. This can be done here: "
95 |             "https://www.strava.com/settings/api\n\n"
96 |             "Give it a name, a website and an 'Authorization Callback Domain'. The "
97 |             "'Authorization Callback "
98 |             "Domain' should be 'localhost'.\n\n"
99 |             "2. Copy and paste the following link into your browser, replacing {CLIENTIDHERE} "
100 |             "with your numeric "
101 |             "Client ID found on your Strava application settings page.\n\n"
102 |             "> http://www.strava.com/oauth/authorize?client_id={CLIENTIDHERE}&"
103 |             "response_type=code&redirect_uri="
104 |             "http://localhost/exchange_token&approval_prompt=force&scope="
105 |             "profile:read_all,activity:read_all\n\n"
106 |             "Click authorise when you visit the above link.\n\n"
107 |             "3. You will go to a 404 not found page with a link that looks like this: -\n\n"
108 |             "> http://localhost/exchange_token?state=&code={LONGCODEHERE}"
109 |             "&scope=read,activity:read_all,"
110 |             "profile:read_all\n\n"
111 |             "Copy the code after '&code=' to save for step 4. You will also need your "
112 |             "client ID and client secret "
113 |             "found on your Strava application settings page.\n\n"
114 |             "4. Run 'get_tokens.py'. This will create the initial tokens required for "
115 |             "the script.\n\n"
116 |             "Once this has been completed, you can run 'main.py' which uses the tokens "
117 |             "to get the data points. "
118 |             "If the access_token has expired, it will refresh its tokens automatically "
119 |             "during run time."
120 |         )
121 |
122 |
123 | if __name__ == "__main__":
124 |     generate_readme()
125 |
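One subtlety in the image loop above: `str.title()` lowercases every character after the first letter of each word, which is why `VO2_Proxy_Over_Time.png` shows up in the generated README as "Vo2 Proxy Over Time". A quick demonstration:

```python
# Reproduces the title derivation from generate_readme.py
image = "VO2_Proxy_Over_Time.png"
title = image.replace("_", " ").replace(".png", "").title()
print(title)  # -> "Vo2 Proxy Over Time", not "VO2 Proxy Over Time"
```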
Copy and paste the following link into your browser, replacing {CLIENTIDHERE} " 100 | "with your numeric " 101 | "Client ID found on your Strava application settings page.\n\n" 102 | "> https://www.strava.com/oauth/authorize?client_id={CLIENTIDHERE}&" 103 | "response_type=code&redirect_uri=" 104 | "http://localhost/exchange_token&approval_prompt=force&scope=" 105 | "profile:read_all,activity:read_all\n\n" 106 | "Click authorise when you visit the above link.\n\n" 107 | "3. You will go to a 404 not found page with a link that looks like this:\n\n" 108 | "> http://localhost/exchange_token?state=&code={LONGCODEHERE}" 109 | "&scope=read,activity:read_all," 110 | "profile:read_all\n\n" 111 | "Copy the code after '&code=' and save it for step 4. You will also need your " 112 | "client ID and client secret " 113 | "found on your Strava application settings page.\n\n" 114 | "4. Run 'get_tokens.py'. This will create the initial tokens required for " 115 | "the script.\n\n" 116 | "Once this has been completed, you can run 'main.py', which uses the tokens " 117 | "to get the data points. " 118 | "If the access_token has expired, it will refresh its tokens automatically " 119 | "during run time." 120 | ) 121 | 122 | 123 | if __name__ == "__main__": 124 | generate_readme() 125 | -------------------------------------------------------------------------------- /src/strava_data/strava_api/visualisation/graphs_effort.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the effort chart functions, each saving a PNG file. 3 | """ 4 | 5 | import matplotlib.dates as mdates 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | 10 | from strava_data.strava_api.visualisation import utils 11 | 12 | 13 | def plot_elevation_gain_per_km_by_month(activities_df: pd.DataFrame, output_path: str) -> None: 14 | """ 15 | Plots average elevation gain per km for each month, per year. 16 | - X-axis: Month (Jan–Dec) 17 | - Y-axis: Elevation gain per km 18 | - Line series: one per year 19 | """ 20 | data = utils.prepare_dated_activities(activities_df) 21 | 22 | monthly_stats = ( 23 | data.groupby(["year", "month"]) 24 | .agg({"distance_km": "sum", "total_elevation_gain_m": "sum"}) 25 | .reset_index() 26 | ) 27 | 28 | monthly_stats = monthly_stats[monthly_stats["distance_km"] > 0] 29 | monthly_stats["elev_gain_per_km"] = ( 30 | monthly_stats["total_elevation_gain_m"] / monthly_stats["distance_km"] 31 | ) 32 | 33 | def plot_fn(axis): 34 | for year in sorted(monthly_stats["year"].unique()): 35 | year_data = monthly_stats[monthly_stats["year"] == year].sort_values("month") 36 | axis.plot( 37 | year_data["month"], year_data["elev_gain_per_km"], marker="o", label=str(year) 38 | ) 39 | utils.label_month_axis(axis) 40 | axis.legend(title="Year") 41 | 42 | # pylint: disable=R0801 43 | utils.plot_with_common_setup( 44 | title="Elevation Gain per km by Month", 45 | xlabel="Month", 46 | ylabel="Elevation Gain (m/km)", 47 | output_path=output_path, 48 | plot_func=plot_fn, 49 | ) 50 | # pylint: enable=R0801 51 | 52 | 53 | def plot_cadence_over_time(activities_df: pd.DataFrame, output_path: str) -> None: 54 | """ 55 | Scatter plot of average cadence over time with trend line. 
56 | - Filters to activities with cadence > 0 57 | """ 58 | data = utils.prepare_dated_activities(activities_df) 59 | data = data[data["average_cadence"] > 0] 60 | if data.empty: 61 | return 62 | 63 | data["start_date"] = pd.to_datetime(data["start_date_local"]) 64 | data = data.sort_values("start_date") 65 | data["start_date_num"] = mdates.date2num(data["start_date"]) 66 | 67 | def plot_fn(axis): 68 | sns.scatterplot(data=data, x="start_date", y="average_cadence", alpha=0.5, ax=axis) 69 | sns.regplot( 70 | data=data, 71 | x="start_date_num", 72 | y="average_cadence", 73 | scatter=False, 74 | color="black", 75 | line_kws={"linestyle": "--"}, 76 | ax=axis, 77 | ) 78 | for label in axis.get_xticklabels(): 79 | label.set_rotation(45) 80 | 81 | # pylint: disable=R0801 82 | utils.plot_with_common_setup( 83 | title="Average Cadence Over Time", 84 | xlabel="Date", 85 | ylabel="Cadence (steps per minute)", 86 | output_path=output_path, 87 | plot_func=plot_fn, 88 | ) 89 | # pylint: enable=R0801 90 | 91 | 92 | def plot_effort_score_over_time(activities_df: pd.DataFrame, output_path: str) -> None: 93 | """ 94 | Line plot showing calculated effort score over time. 95 | effort = (distance_km * 10) + (elevation_gain_m * 1.5) 96 | """ 97 | data = utils.prepare_dated_activities(activities_df) 98 | data["effort_score"] = (data["distance_km"] * 10) + (data["total_elevation_gain_m"] * 1.5) 99 | data["rolling_effort"] = data["effort_score"].rolling(window=7).mean()  # mean over the last 7 runs, not 7 calendar days 100 | 101 | def plot_fn(axis): 102 | axis.plot( 103 | data["start_date"], data["rolling_effort"], label="7-run Avg Effort", color="blue" 104 | ) 105 | axis.legend() 106 | axis.grid(True) 107 | 108 | # pylint: disable=R0801 109 | utils.plot_with_common_setup( 110 | title="Training Load (Effort Score) Over Time", 111 | xlabel="Date", 112 | ylabel="Effort Score", 113 | output_path=output_path, 114 | plot_func=plot_fn, 115 | ) 116 | # pylint: enable=R0801 117 | 118 | 119 | def plot_vo2_proxy_over_time(splits_df: pd.DataFrame, output_path: str) -> None: 120 | """ 121 | Estimates a VO₂ max–style fitness proxy using 1 km split pace over time. 122 | 123 | VO₂ proxy = 15.0 × (speed in m/s), where speed = distance / time for fastest split per month. 124 | 125 | Produces a line chart per year showing how top-end aerobic fitness changes across months. 
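    Worked example with illustrative numbers (not taken from the data): a
    fastest monthly 1 km split of 250 s gives speed 1000 / 250 = 4.0 m/s,
    so the proxy is 15.0 * 4.0 = 60.0.

        >>> 15.0 * (1000.0 / 250.0)
        60.0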
126 | """ 127 | data = utils.prepare_dated_activities(splits_df) 128 | if data.empty: 129 | return 130 | 131 | data["pace_sec_km"] = data["elapsed_time_s"] / data["distance_km"] 132 | data["speed_mps"] = data["distance_m"] / data["elapsed_time_s"] 133 | data["vo2_proxy"] = 15.0 * data["speed_mps"] 134 | data["year"] = pd.to_datetime(data["start_date_local"]).dt.year 135 | data["month"] = pd.to_datetime(data["start_date_local"]).dt.month 136 | 137 | monthly = data.groupby(["year", "month"])["vo2_proxy"].max().reset_index() 138 | 139 | rows = [] 140 | for year in sorted(monthly["year"].unique()): 141 | for month in range(1, 13): 142 | match = monthly[(monthly["year"] == year) & (monthly["month"] == month)] 143 | value = match["vo2_proxy"].values[0] if not match.empty else np.nan 144 | rows.append({"year": year, "month": month, "vo2_proxy": value}) 145 | 146 | plot_df = pd.DataFrame(rows) 147 | plot_df["vo2_proxy"] = plot_df.groupby("year")["vo2_proxy"].ffill() 148 | 149 | def plot_fn(axis): 150 | for year in sorted(plot_df["year"].unique()): 151 | sub = plot_df[plot_df["year"] == year] 152 | axis.plot(sub["month"], sub["vo2_proxy"], marker="o", label=str(year)) 153 | utils.label_month_axis(axis) 154 | axis.legend(title="Year") 155 | axis.grid(True) 156 | 157 | # pylint: disable=R0801 158 | utils.plot_with_common_setup( 159 | title="Estimated VO₂ Max Proxy Over Time", 160 | xlabel="Month", 161 | ylabel="VO₂ Proxy", 162 | output_path=output_path, 163 | plot_func=plot_fn, 164 | ) 165 | # pylint: enable=R0801 166 | -------------------------------------------------------------------------------- /src/strava_data/strava_api/client.py: -------------------------------------------------------------------------------- 1 | """ 2 | Client code to call Strava's API endpoints, with rate-limiting control. 3 | """ 4 | 5 | import time 6 | from typing import Optional 7 | from datetime import datetime 8 | 9 | import requests 10 | import pandas as pd 11 | 12 | from strava_data.db.dao import read_tokens, insert_activities, get_latest_activity_date 13 | from strava_data.strava_api.processing.transform import transform_activities 14 | from utils.logger import get_logger 15 | 16 | LOGGER = get_logger() 17 | 18 | MAX_REQUESTS_15_MIN = 100 19 | MAX_REQUESTS_DAY = 1000 20 | RATE_LIMIT_15_MIN_SECONDS = 15 * 60 21 | RATE_LIMIT_24_HOURS_SECONDS = 24 * 60 * 60 22 | 23 | 24 | class RateLimiter: 25 | def __init__(self): 26 | self.last_request_time = None 27 | self.request_count = 0 28 | 29 | def update(self): 30 | self.last_request_time = time.time() 31 | self.request_count += 1 32 | 33 | def reset(self): 34 | self.last_request_time = None 35 | self.request_count = 0 36 | 37 | def should_wait(self) -> bool: 38 | if not self.last_request_time: 39 | return False 40 | elapsed = time.time() - self.last_request_time 41 | return elapsed < RATE_LIMIT_15_MIN_SECONDS and self.request_count >= MAX_REQUESTS_15_MIN 42 | 43 | def wait_if_needed(self): 44 | if self.should_wait(): 45 | wait_time = RATE_LIMIT_15_MIN_SECONDS - (time.time() - self.last_request_time) 46 | LOGGER.warning("15-min rate limit reached. Waiting %f seconds.", wait_time) 47 | time.sleep(wait_time) 48 | self.reset() 49 | 50 | 51 | rate_limiter = RateLimiter() 52 | 53 | 54 | def fetch_activities(per_page: int = 30) -> pd.DataFrame: 55 | tokens = read_tokens() 56 | if not tokens: 57 | LOGGER.warning("No stored tokens. 
Returning empty DataFrame.") 58 | return pd.DataFrame() 59 | 60 | headers = {"Authorization": f"Bearer {tokens.get('access_token', '')}"} 61 | 62 | latest_str = get_latest_activity_date() 63 | if latest_str: 64 | latest_dt = datetime.strptime(latest_str, "%Y-%m-%dT%H:%M:%SZ") 65 | after_unix = int(latest_dt.timestamp()) 66 | LOGGER.info("Fetching activities after %s (UNIX %d)", latest_str, after_unix) 67 | else: 68 | after_unix = 0 69 | LOGGER.info("No existing activities in DB, fetching all from start.") 70 | 71 | all_activities = pd.DataFrame() 72 | page = 1 73 | 74 | while True: 75 | LOGGER.info("Fetching page %d of activities", page) 76 | params = {"per_page": per_page, "page": page, "after": after_unix} 77 | response_data = _make_api_request( 78 | "https://www.strava.com/api/v3/athlete/activities", headers, params 79 | ) 80 | 81 | if response_data is None: 82 | LOGGER.error("No data returned (None). Stopping fetch.") 83 | break 84 | 85 | if ( 86 | isinstance(response_data, dict) 87 | and response_data.get("message") == "Rate Limit Exceeded" 88 | ): 89 | LOGGER.warning( 90 | "Strava rate limit exceeded. Waiting %d seconds.", RATE_LIMIT_15_MIN_SECONDS 91 | ) 92 | time.sleep(RATE_LIMIT_15_MIN_SECONDS) 93 | rate_limiter.reset() 94 | continue 95 | 96 | if isinstance(response_data, list): 97 | page_df = pd.DataFrame(response_data) 98 | if page_df.empty: 99 | LOGGER.info("No more activities on page %d. Ending fetch.", page) 100 | break 101 | 102 | all_activities = pd.concat([all_activities, page_df], ignore_index=True) 103 | transformed_df = transform_activities(page_df) 104 | insert_activities(transformed_df) 105 | 106 | page += 1 107 | else: 108 | LOGGER.error("Unexpected response data type: %s", type(response_data)) 109 | break 110 | 111 | LOGGER.info("Fetched a total of %d activities", len(all_activities)) 112 | return all_activities 113 | 114 | 115 | def fetch_splits_if_needed(activities_df: pd.DataFrame) -> pd.DataFrame: 116 | tokens = read_tokens() 117 | if not tokens: 118 | return pd.DataFrame() 119 | 120 | headers = {"Authorization": f"Bearer {tokens.get('access_token', '')}"} 121 | all_splits = pd.DataFrame() 122 | 123 | for _, row in activities_df.iterrows(): 124 | if str(row.get("type", "")).lower() != "run": 125 | continue 126 | 127 | activity_id = row.get("id") 128 | if not activity_id: 129 | continue 130 | 131 | splits_url = f"https://www.strava.com/api/v3/activities/{activity_id}" 132 | splits_data = _make_api_request(splits_url, headers, None) 133 | 134 | if isinstance(splits_data, dict) and splits_data.get("message") == "Rate Limit Exceeded": 135 | LOGGER.warning( 136 | "Hit 429 fetching splits for activity %d. 
Waiting %d seconds, then retrying once.", 137 | activity_id, 138 | RATE_LIMIT_15_MIN_SECONDS, 139 | ) 140 | time.sleep(RATE_LIMIT_15_MIN_SECONDS) 141 | rate_limiter.reset() 142 | splits_data = _make_api_request(splits_url, headers, None)  # the single retry promised above 143 | 144 | if not splits_data or "splits_metric" not in splits_data: 145 | continue 146 | 147 | df_splits = pd.DataFrame(splits_data["splits_metric"]) 148 | df_splits["activity_id"] = activity_id 149 | df_splits["start_date_local"] = splits_data.get("start_date_local", "") 150 | all_splits = pd.concat([all_splits, df_splits], ignore_index=True) 151 | 152 | return all_splits 153 | 154 | 155 | def _make_api_request(url: str, headers: dict, params: Optional[dict]) -> "list | dict | None": 156 | rate_limiter.wait_if_needed() 157 | 158 | try: 159 | response = requests.get(url, headers=headers, params=params, timeout=10) 160 | except requests.exceptions.Timeout: 161 | LOGGER.error("Timeout occurred while making API request to %s", url) 162 | return None 163 | 164 | if response.status_code == 429: 165 | LOGGER.error("HTTP 429: Rate Limit Exceeded by Strava") 166 | rate_limiter.reset() 167 | return {"message": "Rate Limit Exceeded"} 168 | 169 | if not response.ok: 170 | LOGGER.error( 171 | "Request failed. Status: %d, Response: %s", 172 | response.status_code, 173 | response.text, 174 | ) 175 | return None 176 | 177 | rate_limiter.update() 178 | 179 | try: 180 | return response.json() 181 | except ValueError: 182 | LOGGER.error("Invalid JSON response from %s", url) 183 | return None 184 | 185 | 186 | def get_latest_activity_unix_timestamp() -> int: 187 | latest_str = get_latest_activity_date() 188 | if not latest_str: 189 | return 0 190 | 191 | latest_dt = datetime.strptime(latest_str, "%Y-%m-%dT%H:%M:%SZ") 192 | return int(latest_dt.timestamp()) 193 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main entry point for running Strava data retrieval, processing, and visualization. 3 | """ 4 | 5 | import argparse 6 | import pandas as pd 7 | 8 | from utils.logger import get_logger 9 | from strava_data.auth import get_or_refresh_tokens 10 | from strava_data.db.dao import ( 11 | decrypt_database, 12 | encrypt_database, 13 | init_database, 14 | insert_activities, 15 | insert_splits, 16 | load_all_activities, 17 | load_all_splits, 18 | ) 19 | from strava_data.strava_api.client import fetch_activities, fetch_splits_if_needed 20 | from strava_data.strava_api.processing.transform import transform_activities, transform_splits 21 | from strava_data.strava_api.visualisation import ( 22 | graphs_distribution, 23 | graphs_distance, 24 | graphs_pace, 25 | graphs_effort, 26 | ) 27 | from strava_data.strava_api.visualisation.utils import configure_matplotlib_styles 28 | from strava_data.ml.pace_forecast import run_forecast_pipeline 29 | from strava_data.ml.run_type_classifier import run_clustering_pipeline 30 | from strava_data.ml.training_advisor import generate_training_plan_chart 31 | 32 | configure_matplotlib_styles() 33 | LOGGER = get_logger() 34 | 35 | 36 | def main(skip_fetch: bool = False) -> None: 37 | """ 38 | Orchestrates the full flow: auth, DB prep, fetch, transform, visualize. 39 | """ 40 | LOGGER.info("Start main.") 41 | decrypt_database() 42 | init_database() 43 | 44 | if not skip_fetch: 45 | process_new_activities() 46 | else: 47 | LOGGER.info("Skipping fetch. 
Using existing database contents.") 48 | 49 | LOGGER.info("Running chart generation...") 50 | generate_charts_from_db() 51 | encrypt_database() 52 | LOGGER.info("Done.") 53 | 54 | 55 | def process_new_activities() -> None: 56 | """ 57 | Authenticates and processes newly fetched Strava activities and splits. 58 | """ 59 | get_or_refresh_tokens() 60 | new_activities = fetch_activities(per_page=50) 61 | 62 | if new_activities.empty: 63 | LOGGER.info("No new activities detected") 64 | return 65 | 66 | LOGGER.info("New activities detected, processing...") 67 | new_splits = fetch_splits_if_needed(new_activities) 68 | transformed_activities = transform_activities(new_activities) 69 | transformed_splits = transform_splits(new_splits) 70 | insert_activities(transformed_activities) 71 | insert_splits(transformed_splits) 72 | LOGGER.info("New activities processed") 73 | 74 | 75 | def generate_charts_from_db() -> None: 76 | """ 77 | Loads all data from the database and triggers chart generation. 78 | """ 79 | all_activities = load_all_activities() 80 | all_splits = load_all_splits() 81 | generate_required_charts(all_activities, all_splits) 82 | LOGGER.info("Running pace forecast pipeline...") 83 | run_forecast_pipeline(all_splits) 84 | LOGGER.info("Running run type clustering pipeline...") 85 | run_clustering_pipeline(all_splits) 86 | LOGGER.info("Generating training plan...") 87 | generate_training_plan_chart(all_activities, all_splits, "A.I._Recommended_Training.png") 88 | 89 | 90 | def generate_required_charts(activities_df: pd.DataFrame, splits_df: pd.DataFrame) -> None: 91 | """ 92 | Produces visualisations from activity and split data. 93 | """ 94 | generate_pace_and_distance_charts(activities_df, splits_df) 95 | generate_distribution_and_heatmaps(activities_df, splits_df) 96 | generate_time_series_and_trends(activities_df, splits_df) 97 | 98 | 99 | def generate_pace_and_distance_charts(activities_df: pd.DataFrame, splits_df: pd.DataFrame) -> None: 100 | LOGGER.info("Generate Running_Pace_vs_Elevation_Change") 101 | graphs_pace.plot_pace_vs_elevation_change(splits_df, "Running_Pace_vs_Elevation_Change.png") 102 | LOGGER.info("Generate Time_Taken_Distance") 103 | graphs_distance.plot_time_taken_over_distances(activities_df, "Time_Taken_Distance.png") 104 | LOGGER.info("Generate Time_Taken_Distance_Recent_Years") 105 | graphs_distance.plot_time_taken_over_distances_recent_years( 106 | activities_df, "Time_Taken_Distance_Recent_Years.png" 107 | ) 108 | LOGGER.info("Generate Running_Pace_over_Time") 109 | graphs_pace.plot_running_pace_over_time(splits_df, "Running_Pace_over_Time.png") 110 | LOGGER.info("Generate Running_Pace_vs_Total_Distance") 111 | graphs_distance.plot_pace_vs_total_distance(splits_df, "Running_Pace_vs_Total_Distance.png") 112 | LOGGER.info("Generate Number_of_Runs_per_Distance") 113 | graphs_distance.plot_number_of_runs_per_distance( 114 | activities_df, "Number_of_Runs_per_Distance.png" 115 | ) 116 | LOGGER.info("Generate Fastest_1k_Pace_over_Time") 117 | graphs_pace.plot_fastest_1km_pace_over_time(splits_df, "Fastest_1k_Pace_over_Time.png") 118 | LOGGER.info("Generate Median_1k_Pace_over_Time") 119 | graphs_pace.plot_median_1km_pace_over_time(splits_df, "Median_1k_Pace_over_Time.png") 120 | LOGGER.info("Generate Total_Distance_Ran_by_Month") 121 | graphs_distance.plot_total_distance_by_month(activities_df, "Total_Distance_Ran_by_Month.png") 122 | LOGGER.info("Generate Pace_by_Day") 123 | graphs_pace.plot_pace_by_day_of_week(splits_df, "Pace_by_Day.png") 124 | 125 | 126 | def 
generate_distribution_and_heatmaps( 127 | activities_df: pd.DataFrame, splits_df: pd.DataFrame 128 | ) -> None: 129 | LOGGER.info("Generate Activity_Heatmap") 130 | graphs_distribution.plot_heatmap_activities(activities_df, "Activity_Heatmap.png") 131 | LOGGER.info("Generate Run_Distance_Distribution") 132 | graphs_distribution.plot_run_distance_distribution( 133 | activities_df, "Run_Distance_Distribution.png" 134 | ) 135 | LOGGER.info("Generate Pace_Distribution") 136 | graphs_distribution.plot_pace_distribution(splits_df, "Pace_Distribution.png") 137 | LOGGER.info("Generate Elevation_Gain_Distribution") 138 | graphs_distribution.plot_elevation_gain_distribution( 139 | activities_df, "Elevation_Gain_Distribution.png" 140 | ) 141 | LOGGER.info("Generate Run_Days_Heatmap") 142 | graphs_distribution.plot_run_days_heatmap(activities_df, "Run_Days_Heatmap.png") 143 | LOGGER.info("Generate Rest_Days_Heatmap") 144 | graphs_distribution.plot_rest_days_heatmap(activities_df, "Rest_Days_Heatmap.png") 145 | LOGGER.info("Generate Run_Rest_Ratio_Heatmap") 146 | graphs_distribution.plot_run_rest_ratio_heatmap(activities_df, "Run_Rest_Ratio_Heatmap.png") 147 | 148 | 149 | def generate_time_series_and_trends(activities_df: pd.DataFrame, splits_df: pd.DataFrame) -> None: 150 | LOGGER.info("Generate Cumulative_Distance") 151 | graphs_distance.plot_cumulative_distance_over_time(activities_df, "Cumulative_Distance.png") 152 | LOGGER.info("Generate Longest_Run_per_Month") 153 | graphs_distance.plot_longest_run_per_month(activities_df, "Longest_Run_per_Month.png") 154 | LOGGER.info("Generate Elevation_Gain_per_KM_by_Month") 155 | graphs_effort.plot_elevation_gain_per_km_by_month( 156 | activities_df, "Elevation_Gain_per_KM_by_Month.png" 157 | ) 158 | LOGGER.info("Generate Run_Start_Time_by_Month") 159 | graphs_distribution.plot_run_start_time_distribution( 160 | activities_df, "Run_Start_Time_by_Month.png" 161 | ) 162 | LOGGER.info("Generate Monthly_Distance_by_Year") 163 | graphs_distance.plot_monthly_distance_by_year_grouped( 164 | activities_df, "Monthly_Distance_by_Year.png" 165 | ) 166 | LOGGER.info("Generate Rolling_30_Day_Comparison") 167 | graphs_distance.plot_rolling_distance(activities_df, "Rolling_30_Day_Comparison.png", window=30) 168 | LOGGER.info("Generate Cadence_Over_Time") 169 | graphs_effort.plot_cadence_over_time(activities_df, "Cadence_Over_Time.png") 170 | LOGGER.info("Generate Training_Intensity_by_HeartRate_Zone") 171 | graphs_distribution.plot_heart_rate_zone_distribution( 172 | splits_df, "Training_Intensity_by_HeartRate_Zone.png" 173 | ) 174 | LOGGER.info("Generate Pace_Consistency_by_Run") 175 | graphs_pace.plot_pace_variability_per_run(splits_df, "Pace_Consistency_by_Run.png") 176 | LOGGER.info("Generate Training_Load_Over_Time") 177 | graphs_effort.plot_effort_score_over_time(activities_df, "Training_Load_Over_Time.png") 178 | LOGGER.info("Generate VO2_Proxy_Over_Time") 179 | graphs_effort.plot_vo2_proxy_over_time(splits_df, "VO2_Proxy_Over_Time.png") 180 | 181 | 182 | if __name__ == "__main__": 183 | parser = argparse.ArgumentParser(description="Process and visualize Strava data.") 184 | parser.add_argument("--skip-fetch", action="store_true", help="Skip fetching new activities.") 185 | args = parser.parse_args() 186 | main(skip_fetch=args.skip_fetch) 187 | -------------------------------------------------------------------------------- /src/strava_data/db/dao.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data Access Object (DAO) 
layer for reading/writing tokens and activity data. 3 | Uses pyAesCrypt to decrypt/encrypt the SQLite database file, but only once at program start/end. 4 | """ 5 | 6 | import os 7 | import sqlite3 8 | from os import stat 9 | from typing import Any, Dict, Optional 10 | from datetime import datetime 11 | 12 | import pandas as pd 13 | import pyAesCrypt 14 | 15 | from strava_data.config import get_buffer_size, get_encryption_password 16 | from utils.logger import get_logger 17 | 18 | LOGGER = get_logger() 19 | 20 | ENCRYPTED_DB_FILE = "strava.sqlite" 21 | TEMP_DB_FILE = "strava_temp.sqlite" 22 | 23 | 24 | def decrypt_database() -> None: 25 | """ 26 | Decrypts strava.sqlite into strava_temp.sqlite (if strava.sqlite exists). 27 | If strava_temp.sqlite already exists, skip to avoid double-decryption. 28 | Call this once at program start. 29 | """ 30 | if os.path.exists(TEMP_DB_FILE): 31 | LOGGER.warning("Database appears already decrypted. Skipping decryption.") 32 | return 33 | 34 | if not os.path.exists(ENCRYPTED_DB_FILE): 35 | LOGGER.warning( 36 | "Encrypted database file %s not found. Creating a new database.", 37 | ENCRYPTED_DB_FILE, 38 | ) 39 | init_database() 40 | return 41 | 42 | enc_file_size = stat(ENCRYPTED_DB_FILE).st_size 43 | password = get_encryption_password() 44 | buffer_size = get_buffer_size() 45 | 46 | LOGGER.info("Decrypting database...") 47 | with open(ENCRYPTED_DB_FILE, "rb") as f_in, open(TEMP_DB_FILE, "wb") as f_out: 48 | pyAesCrypt.decryptStream(f_in, f_out, password, buffer_size, enc_file_size) 49 | 50 | LOGGER.info("Decryption complete. Working with the unencrypted file now.") 51 | 52 | 53 | def encrypt_database() -> None: 54 | """ 55 | Encrypts strava_temp.sqlite back into strava.sqlite and removes strava_temp.sqlite. 56 | Call this once at program end. 57 | """ 58 | if not os.path.exists(TEMP_DB_FILE): 59 | LOGGER.warning("No decrypted DB file %s found to encrypt. Skipping.", TEMP_DB_FILE) 60 | return 61 | 62 | password = get_encryption_password() 63 | buffer_size = get_buffer_size() 64 | 65 | if os.path.exists(ENCRYPTED_DB_FILE): 66 | os.remove(ENCRYPTED_DB_FILE) 67 | 68 | LOGGER.info("Encrypting database back to strava.sqlite...") 69 | with open(TEMP_DB_FILE, "rb") as f_in, open(ENCRYPTED_DB_FILE, "wb") as f_out: 70 | pyAesCrypt.encryptStream(f_in, f_out, password, buffer_size) 71 | 72 | os.remove(TEMP_DB_FILE) 73 | LOGGER.info("Encryption complete.") 74 | 75 | 76 | def init_database() -> None: 77 | """ 78 | Creates required tables if they do not already exist in strava_temp.sqlite. 79 | Assumes decrypt_database() has already been called. 
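    A sketch of the call order this module expects (it mirrors main.py):

        decrypt_database()
        init_database()
        # ... insert/load activities and splits ...
        encrypt_database()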
80 | """ 81 | conn = sqlite3.connect(TEMP_DB_FILE) 82 | cur = conn.cursor() 83 | 84 | cur.execute( 85 | """ 86 | CREATE TABLE IF NOT EXISTS config ( 87 | token_type TEXT, 88 | access_token TEXT, 89 | expires_at INTEGER, 90 | expires_in INTEGER, 91 | refresh_token TEXT 92 | ); 93 | """ 94 | ) 95 | 96 | cur.execute( 97 | """ 98 | CREATE TABLE IF NOT EXISTS activities ( 99 | activity_id INTEGER PRIMARY KEY, 100 | name TEXT, 101 | activity_type TEXT, 102 | distance_m REAL, 103 | moving_time_s INTEGER, 104 | average_speed_m_s REAL, 105 | max_speed_m_s REAL, 106 | total_elevation_gain_m REAL, 107 | start_date_local TEXT, 108 | average_cadence REAL, 109 | is_outdoor INTEGER 110 | ); 111 | """ 112 | ) 113 | 114 | cur.execute( 115 | """ 116 | CREATE TABLE IF NOT EXISTS splits ( 117 | split_row_id INTEGER PRIMARY KEY AUTOINCREMENT, 118 | activity_id INTEGER, 119 | distance_m REAL, 120 | elapsed_time_s INTEGER, 121 | elevation_difference_m REAL, 122 | moving_time_s INTEGER, 123 | pace_zone INTEGER, 124 | split_index INTEGER, 125 | average_grade_adjusted_speed_m_s REAL, 126 | average_heartrate REAL, 127 | start_date_local TEXT, 128 | FOREIGN KEY(activity_id) REFERENCES activities(activity_id) 129 | ); 130 | """ 131 | ) 132 | 133 | conn.commit() 134 | conn.close() 135 | 136 | 137 | def store_tokens(tokens: Dict[str, Any]) -> None: 138 | conn = sqlite3.connect(TEMP_DB_FILE) 139 | cur = conn.cursor() 140 | cur.execute("DELETE FROM config;") 141 | cur.execute( 142 | """ 143 | INSERT INTO config (token_type, access_token, expires_at, expires_in, refresh_token) 144 | VALUES (?, ?, ?, ?, ?); 145 | """, 146 | ( 147 | tokens.get("token_type"), 148 | tokens.get("access_token"), 149 | tokens.get("expires_at"), 150 | tokens.get("expires_in"), 151 | tokens.get("refresh_token"), 152 | ), 153 | ) 154 | conn.commit() 155 | conn.close() 156 | 157 | 158 | def read_tokens() -> Optional[Dict[str, Any]]: 159 | conn = sqlite3.connect(TEMP_DB_FILE) 160 | cur = conn.cursor() 161 | cur.execute( 162 | """ 163 | SELECT token_type, access_token, expires_at, expires_in, refresh_token 164 | FROM config 165 | LIMIT 1; 166 | """ 167 | ) 168 | row = cur.fetchone() 169 | conn.close() 170 | 171 | if row: 172 | return { 173 | "token_type": row[0], 174 | "access_token": row[1], 175 | "expires_at": row[2], 176 | "expires_in": row[3], 177 | "refresh_token": row[4], 178 | } 179 | return None 180 | 181 | 182 | def insert_activities(activities_df: pd.DataFrame) -> None: 183 | if activities_df.empty: 184 | return 185 | 186 | conn = sqlite3.connect(TEMP_DB_FILE) 187 | cur = conn.cursor() 188 | 189 | for _, row in activities_df.iterrows(): 190 | cur.execute( 191 | """ 192 | INSERT OR IGNORE INTO activities ( 193 | activity_id, 194 | name, 195 | activity_type, 196 | distance_m, 197 | moving_time_s, 198 | average_speed_m_s, 199 | max_speed_m_s, 200 | total_elevation_gain_m, 201 | start_date_local, 202 | average_cadence, 203 | is_outdoor 204 | ) 205 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?); 206 | """, 207 | ( 208 | row.get("id"), 209 | row.get("name"), 210 | row.get("type"), 211 | row.get("distance_m", 0.0), 212 | row.get("moving_time_s", 0), 213 | row.get("average_speed_m_s", 0.0), 214 | row.get("max_speed_m_s", 0.0), 215 | row.get("total_elevation_gain_m", 0.0), 216 | row.get("start_date_local", ""), 217 | row.get("average_cadence", 0.0), 218 | row.get("is_outdoor", 1 if row.get("is_outdoor") else 0), 219 | ), 220 | ) 221 | conn.commit() 222 | conn.close() 223 | 224 | 225 | def insert_splits(splits_df: pd.DataFrame) -> None: 226 | if 
splits_df.empty: 227 | return 228 | 229 | conn = sqlite3.connect(TEMP_DB_FILE) 230 | cur = conn.cursor() 231 | 232 | for _, row in splits_df.iterrows(): 233 | cur.execute( 234 | """ 235 | INSERT INTO splits ( 236 | activity_id, 237 | distance_m, 238 | elapsed_time_s, 239 | elevation_difference_m, 240 | moving_time_s, 241 | pace_zone, 242 | split_index, 243 | average_grade_adjusted_speed_m_s, 244 | average_heartrate, 245 | start_date_local 246 | ) 247 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?); 248 | """, 249 | ( 250 | row.get("activity_id"), 251 | row.get("distance_m", 0.0), 252 | row.get("elapsed_time_s", 0), 253 | row.get("elevation_difference_m", 0.0), 254 | row.get("moving_time_s", 0), 255 | row.get("pace_zone", 0), 256 | row.get("split_index", 0), 257 | row.get("average_grade_adjusted_speed_m_s", 0.0), 258 | row.get("average_heartrate", None), 259 | row.get("start_date_local", ""), 260 | ), 261 | ) 262 | 263 | conn.commit() 264 | conn.close() 265 | 266 | 267 | def get_latest_activity_date() -> Optional[str]: 268 | conn = sqlite3.connect(TEMP_DB_FILE) 269 | cur = conn.cursor() 270 | cur.execute("SELECT MAX(start_date_local) FROM activities;") 271 | row = cur.fetchone() 272 | conn.close() 273 | 274 | if row and row[0]: 275 | return row[0] 276 | return None 277 | 278 | 279 | def load_all_activities() -> pd.DataFrame: 280 | conn = sqlite3.connect(TEMP_DB_FILE) 281 | activities_df = pd.read_sql_query("SELECT * FROM activities;", conn) 282 | conn.close() 283 | return activities_df 284 | 285 | 286 | def load_all_splits() -> pd.DataFrame: 287 | conn = sqlite3.connect(TEMP_DB_FILE) 288 | splits_df = pd.read_sql_query("SELECT * FROM splits;", conn) 289 | conn.close() 290 | return splits_df 291 | 292 | 293 | def get_last_run_time(): 294 | """ 295 | Retrieves the latest run timestamp from 'activities' in strava_temp.sqlite, 296 | returning a datetime object or None if no runs exist. 297 | """ 298 | if not os.path.exists(TEMP_DB_FILE): 299 | return None 300 | 301 | conn = sqlite3.connect(TEMP_DB_FILE) 302 | cur = conn.cursor() 303 | 304 | query = "SELECT MAX(start_date_local) FROM activities;" 305 | cur.execute(query) 306 | row = cur.fetchone() 307 | conn.close() 308 | 309 | if not row or not row[0]: 310 | return None 311 | 312 | try: 313 | return datetime.strptime(row[0], "%Y-%m-%dT%H:%M:%SZ") 314 | except ValueError: 315 | return None 316 | -------------------------------------------------------------------------------- /src/strava_data/strava_api/visualisation/graphs_pace.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the pace chart functions, each saving a PNG file. 3 | """ 4 | 5 | import calendar 6 | import matplotlib.dates as mdates 7 | from matplotlib import ticker 8 | import numpy as np 9 | import pandas as pd 10 | import seaborn as sns 11 | 12 | from strava_data.strava_api.visualisation import utils 13 | 14 | 15 | def plot_pace_vs_elevation_change(splits_df: pd.DataFrame, output_path: str) -> None: 16 | """ 17 | Plot Running Pace vs. Elevation Change for 1km splits. 
18 | - y-axis: pace (mm:ss) 19 | - x-axis: elevation change (m) 20 | - Points coloured by year 21 | - Trend line included 22 | """ 23 | splits = utils.prepare_dated_activities(splits_df) 24 | splits = splits[ 25 | (splits["elevation_difference_m"] >= -100) & (splits["elevation_difference_m"] <= 100) 26 | ] 27 | splits["pace_s_km"] = splits["elapsed_time_s"] / (splits["distance_m"] / 1000) 28 | splits["year"] = pd.to_datetime(splits["start_date_local"]).dt.year 29 | 30 | def plot_fn(axis): 31 | sns.scatterplot( 32 | data=splits, 33 | x="elevation_difference_m", 34 | y="pace_s_km", 35 | hue="year", 36 | alpha=0.6, 37 | palette="viridis", 38 | ax=axis, 39 | ) 40 | sns.regplot( 41 | data=splits, 42 | x="elevation_difference_m", 43 | y="pace_s_km", 44 | scatter=False, 45 | color="black", 46 | line_kws={"linestyle": "--"}, 47 | ci=95, 48 | ax=axis, 49 | ) 50 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 51 | 52 | # pylint: disable=R0801 53 | utils.plot_with_common_setup( 54 | title="Running Pace vs. Elevation Change", 55 | xlabel="Elevation Change (m)", 56 | ylabel="Split Pace (mm:ss)", 57 | output_path=output_path, 58 | plot_func=plot_fn, 59 | ) 60 | # pylint: enable=R0801 61 | 62 | 63 | def plot_running_pace_over_time(splits_df: pd.DataFrame, output_path: str) -> None: 64 | """ 65 | Running pace over time: 66 | - y-axis: 1 km pace (mm:ss) 67 | - x-axis: date 68 | - Points for each ~1 km split 69 | - Trend line to show changes 70 | """ 71 | data = utils.prepare_dated_activities(splits_df) 72 | if data.empty: 73 | return 74 | 75 | data["pace_sec_km"] = data["elapsed_time_s"] / data["distance_km"] 76 | data["datetime_obj"] = pd.to_datetime(data["start_date_local"], errors="coerce") 77 | data["date_numeric"] = mdates.date2num(data["datetime_obj"]) 78 | data.sort_values("date_numeric", inplace=True) 79 | 80 | def plot_fn(axis): 81 | sns.scatterplot(data=data, x="date_numeric", y="pace_sec_km", alpha=0.5, ax=axis) 82 | sns.regplot( 83 | data=data, 84 | x="date_numeric", 85 | y="pace_sec_km", 86 | scatter=False, 87 | ci=95, 88 | color="black", 89 | line_kws={"linestyle": "--"}, 90 | ax=axis, 91 | ) 92 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 93 | axis.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m")) 94 | axis.set_xticks(data["date_numeric"][:: len(data) // 10 if len(data) >= 10 else 1]) 95 | axis.set_xticklabels( 96 | [ 97 | d.strftime("%Y-%m") 98 | for d in data["datetime_obj"].iloc[:: len(data) // 10 if len(data) >= 10 else 1] 99 | ], 100 | rotation=45, 101 | ) 102 | 103 | # pylint: disable=R0801 104 | utils.plot_with_common_setup( 105 | title="Running Pace Over Time", 106 | xlabel="Date", 107 | ylabel="Pace (mm:ss)", 108 | output_path=output_path, 109 | plot_func=plot_fn, 110 | ) 111 | # pylint: enable=R0801 112 | 113 | 114 | def plot_fastest_1km_pace_over_time(splits_df: pd.DataFrame, output_path: str) -> None: 115 | """ 116 | Plots the fastest 1km pace per month across all years. 
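    The monthly value is a plain groupby minimum; an illustrative example
    with made-up paces in seconds per km:

        >>> import pandas as pd
        >>> df = pd.DataFrame({"year": [2024, 2024], "month": [1, 1],
        ...                    "pace_sec_km": [312.0, 298.5]})
        >>> float(df.groupby(["year", "month"])["pace_sec_km"].min().iloc[0])
        298.5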
117 | """ 118 | split_data = utils.prepare_dated_activities(splits_df) 119 | if split_data.empty: 120 | return 121 | 122 | split_data["pace_sec_km"] = split_data["elapsed_time_s"] / split_data["distance_km"] 123 | split_data["year"] = pd.to_datetime(split_data["start_date_local"]).dt.year 124 | split_data["month"] = pd.to_datetime(split_data["start_date_local"]).dt.month 125 | 126 | monthly_fastest = split_data.groupby(["year", "month"])["pace_sec_km"].min().reset_index() 127 | 128 | all_years = sorted(monthly_fastest["year"].unique()) 129 | rows = [] 130 | for year in all_years: 131 | for month in range(1, 13): 132 | pace = monthly_fastest.loc[ 133 | (monthly_fastest["year"] == year) & (monthly_fastest["month"] == month), 134 | "pace_sec_km", 135 | ] 136 | pace_val = pace.values[0] if not pace.empty else np.nan 137 | rows.append({"year": year, "month": month, "pace_sec_km": pace_val}) 138 | 139 | plot_df = pd.DataFrame(rows) 140 | plot_df["pace_sec_km"] = plot_df.groupby("year")["pace_sec_km"].ffill() 141 | 142 | def plot_fn(axis): 143 | for year in sorted(plot_df["year"].unique()): 144 | year_data = plot_df[plot_df["year"] == year].sort_values("month") 145 | axis.plot(year_data["month"], year_data["pace_sec_km"], marker="o", label=str(year)) 146 | utils.label_month_axis(axis) 147 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 148 | axis.legend(title="Year") 149 | 150 | # pylint: disable=R0801 151 | utils.plot_with_common_setup( 152 | title="Fastest 1 km Pace Over Time", 153 | xlabel="Month", 154 | ylabel="Fastest Pace (mm:ss)", 155 | output_path=output_path, 156 | plot_func=plot_fn, 157 | ) 158 | # pylint: enable=R0801 159 | 160 | 161 | def plot_median_1km_pace_over_time(splits_df: pd.DataFrame, output_path: str) -> None: 162 | """ 163 | Plots the median 1km pace per month across all years. 
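    Months without runs are forward-filled within each year so the lines stay
    continuous; an illustrative example of the ffill step:

        >>> import pandas as pd
        >>> pd.Series([300.0, None, 280.0]).ffill().tolist()
        [300.0, 300.0, 280.0]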
164 | """ 165 | split_data = utils.prepare_dated_activities(splits_df) 166 | if split_data.empty: 167 | return 168 | 169 | split_data["pace_sec_km"] = split_data["elapsed_time_s"] / split_data["distance_km"] 170 | split_data["year"] = pd.to_datetime(split_data["start_date_local"]).dt.year 171 | split_data["month"] = pd.to_datetime(split_data["start_date_local"]).dt.month 172 | 173 | monthly_medians = split_data.groupby(["year", "month"])["pace_sec_km"].median().reset_index() 174 | 175 | all_years = monthly_medians["year"].unique() 176 | rows = [] 177 | for year in all_years: 178 | for month in range(1, 13): 179 | val = monthly_medians.loc[ 180 | (monthly_medians["year"] == year) & (monthly_medians["month"] == month), 181 | "pace_sec_km", 182 | ] 183 | pace_val = val.values[0] if not val.empty else np.nan 184 | rows.append({"year": year, "month": month, "pace_sec_km": pace_val}) 185 | 186 | plot_df = pd.DataFrame(rows) 187 | plot_df["pace_sec_km"] = plot_df.groupby("year")["pace_sec_km"].ffill() 188 | 189 | def plot_fn(axis): 190 | for year in sorted(plot_df["year"].unique()): 191 | year_data = plot_df[plot_df["year"] == year].sort_values("month") 192 | axis.plot( 193 | year_data["month"], 194 | year_data["pace_sec_km"], 195 | marker="o", 196 | linestyle="-", 197 | label=str(year), 198 | ) 199 | utils.label_month_axis(axis) 200 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 201 | axis.legend(title="Year") 202 | 203 | # pylint: disable=R0801 204 | utils.plot_with_common_setup( 205 | title="Median 1 km Pace Over Time", 206 | xlabel="Month", 207 | ylabel="Median Pace (mm:ss)", 208 | output_path=output_path, 209 | plot_func=plot_fn, 210 | ) 211 | # pylint: enable=R0801 212 | 213 | 214 | def plot_pace_by_day_of_week(splits_df: pd.DataFrame, output_path: str) -> None: 215 | """ 216 | Pace by Day of Week: 217 | - y-axis: 1 km pace (mm:ss), x-axis: day of week 218 | - Box plot filtered for ~1 km splits 219 | """ 220 | split_data = utils.prepare_dated_activities(splits_df) 221 | if split_data.empty: 222 | return 223 | 224 | split_data["pace_sec_km"] = split_data["elapsed_time_s"] / split_data["distance_km"] 225 | split_data["day_of_week"] = pd.to_datetime(split_data["start_date_local"]).dt.day_name() 226 | ordered_days = list(calendar.day_name) 227 | 228 | def plot_fn(axis): 229 | sns.boxplot(data=split_data, x="day_of_week", y="pace_sec_km", order=ordered_days, ax=axis) 230 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 231 | 232 | # pylint: disable=R0801 233 | utils.plot_with_common_setup( 234 | title="Pace by Day of Week", 235 | xlabel="Day of Week", 236 | ylabel="Pace (mm:ss)", 237 | output_path=output_path, 238 | plot_func=plot_fn, 239 | ) 240 | # pylint: enable=R0801 241 | 242 | 243 | def plot_pace_variability_per_run(splits_df: pd.DataFrame, output_path: str) -> None: 244 | """ 245 | Plots the standard deviation of pace (in sec/km) for each run over time. 246 | Only includes activities with at least 3 ~1 km splits. 
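    The plotted value is the sample standard deviation of a run's split paces;
    an illustrative example with made-up splits:

        >>> import pandas as pd
        >>> float(pd.Series([290.0, 300.0, 310.0]).std())
        10.0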
247 | """ 248 | splits = utils.prepare_dated_activities(splits_df) 249 | if splits.empty: 250 | return 251 | 252 | splits["pace_sec_km"] = splits["elapsed_time_s"] / splits["distance_km"] 253 | 254 | grouped = ( 255 | splits.groupby(["activity_id", "start_date_local"]) 256 | .agg(pace_std=("pace_sec_km", "std"), split_count=("pace_sec_km", "count")) 257 | .reset_index() 258 | ) 259 | 260 | grouped = grouped[grouped["split_count"] >= 3] 261 | grouped["date"] = pd.to_datetime(grouped["start_date_local"]) 262 | 263 | if grouped.empty: 264 | return 265 | 266 | def plot_fn(axis): 267 | sns.lineplot(data=grouped.sort_values("date"), x="date", y="pace_std", marker="o", ax=axis) 268 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 269 | for label in axis.get_xticklabels(): 270 | label.set_rotation(45) 271 | 272 | # pylint: disable=R0801 273 | utils.plot_with_common_setup( 274 | title="Pace Variability per Run (Standard Deviation)", 275 | xlabel="Date", 276 | ylabel="Pace Std Dev (mm:ss)", 277 | output_path=output_path, 278 | plot_func=plot_fn, 279 | ) 280 | # pylint: enable=R0801 281 | -------------------------------------------------------------------------------- /src/strava_data/strava_api/visualisation/graphs_distribution.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the distribution chart functions, each saving a PNG file. 3 | """ 4 | 5 | import calendar 6 | from matplotlib import ticker 7 | from matplotlib.colors import ListedColormap, BoundaryNorm 8 | import numpy as np 9 | import pandas as pd 10 | import seaborn as sns 11 | 12 | from strava_data.strava_api.visualisation import utils 13 | 14 | 15 | def plot_run_distance_distribution(activities_df: pd.DataFrame, output_path: str) -> None: 16 | """ 17 | KDE plot showing distribution of run distances, split by year. 18 | Highlights distance preferences and training evolution over time. 19 | """ 20 | data = utils.prepare_dated_activities(activities_df) 21 | 22 | def plot_fn(axis): 23 | for year in sorted(data["year"].unique()): 24 | year_data = data[data["year"] == year] 25 | if year_data["distance_km"].nunique() > 1: 26 | sns.kdeplot( 27 | year_data["distance_km"], 28 | fill=True, 29 | label=str(year), 30 | alpha=0.3, 31 | ax=axis, 32 | ) 33 | axis.set_xlim(left=0) 34 | axis.legend(title="Year") 35 | axis.grid(True, linestyle="--", linewidth=0.5) 36 | 37 | # pylint: disable=R0801 38 | utils.plot_with_common_setup( 39 | title="Run Distance Distribution by Year", 40 | xlabel="Distance (km)", 41 | ylabel="Density", 42 | output_path=output_path, 43 | plot_func=plot_fn, 44 | ) 45 | # pylint: enable=R0801 46 | 47 | 48 | def plot_pace_distribution(splits_df: pd.DataFrame, output_path: str) -> None: 49 | """ 50 | KDE plot showing distribution of paces (in mm:ss per km), one per year. 51 | Only includes ~1 km splits. 
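    Pace is carried as seconds per km and rendered as mm:ss by
    utils.format_pace; the conversion is a simple divmod (illustrative value):

        >>> divmod(315, 60)
        (5, 15)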
52 | """ 53 | data = utils.prepare_dated_activities(splits_df) 54 | if data.empty: 55 | return 56 | 57 | data["pace_sec_km"] = data["elapsed_time_s"] / data["distance_km"] 58 | 59 | def plot_fn(axis): 60 | for year in sorted(data["year"].unique()): 61 | year_data = data[data["year"] == year] 62 | if year_data["pace_sec_km"].nunique() > 1: 63 | sns.kdeplot( 64 | year_data["pace_sec_km"], 65 | fill=True, 66 | label=str(year), 67 | alpha=0.3, 68 | ax=axis, 69 | ) 70 | axis.xaxis.set_major_formatter(ticker.FuncFormatter(utils.format_pace)) 71 | axis.legend(title="Year") 72 | axis.grid(True) 73 | 74 | # pylint: disable=R0801 75 | utils.plot_with_common_setup( 76 | title="Pace Distribution by Year (1 km splits)", 77 | xlabel="Pace (mm:ss)", 78 | ylabel="Density", 79 | output_path=output_path, 80 | plot_func=plot_fn, 81 | ) 82 | # pylint: enable=R0801 83 | 84 | 85 | def plot_elevation_gain_distribution(activities_df: pd.DataFrame, output_path: str) -> None: 86 | """ 87 | KDE plots showing distribution of elevation gain per run, one per year. 88 | Highlights how hilly your training was year-to-year. 89 | """ 90 | data = utils.prepare_dated_activities(activities_df) 91 | data = data[data["total_elevation_gain_m"] != 0] 92 | 93 | def plot_fn(axis): 94 | for year in sorted(data["year"].unique()): 95 | year_data = data[data["year"] == year] 96 | if year_data["total_elevation_gain_m"].nunique() > 1: 97 | sns.kdeplot( 98 | year_data["total_elevation_gain_m"], 99 | fill=True, 100 | label=str(year), 101 | alpha=0.3, 102 | ax=axis, 103 | ) 104 | axis.legend(title="Year") 105 | axis.grid(True, linestyle="--", linewidth=0.5) 106 | 107 | # pylint: disable=R0801 108 | utils.plot_with_common_setup( 109 | title="Elevation Gain per Run (by Year)", 110 | xlabel="Elevation Gain (m)", 111 | ylabel="Density", 112 | output_path=output_path, 113 | plot_func=plot_fn, 114 | ) 115 | # pylint: enable=R0801 116 | 117 | 118 | def plot_heart_rate_zone_distribution(splits_df: pd.DataFrame, output_path: str) -> None: 119 | """ 120 | Stacked bar chart showing time spent in heart rate zones per month. 121 | Only includes ~1 km splits with valid heart rate data. 
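    A sketch of the aggregation shape (zone labels are whatever
    utils.classify_zone_dynamic returns; "Z2"/"Z4" here are hypothetical):

        >>> import pandas as pd
        >>> df = pd.DataFrame({"month_label": ["2024-01", "2024-01"],
        ...                    "hr_zone": ["Z2", "Z4"],
        ...                    "time_min": [30.0, 10.0]})
        >>> df.groupby(["month_label", "hr_zone"])["time_min"].sum().unstack().shape
        (1, 2)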
122 | """ 123 | data = utils.prepare_dated_activities(splits_df) 124 | data = data[pd.notnull(data["average_heartrate"])] 125 | 126 | if data.empty: 127 | return 128 | 129 | data["month_label"] = ( 130 | pd.to_datetime(data["start_date_local"]).dt.tz_localize(None).dt.to_period("M").astype(str) 131 | ) 132 | data["hr_zone"] = data.apply( 133 | lambda row: utils.classify_zone_dynamic(row["average_heartrate"], row["start_date_local"]), 134 | axis=1, 135 | ) 136 | data["time_min"] = data["elapsed_time_s"] / 60.0 137 | grouped = data.groupby(["month_label", "hr_zone"])["time_min"].sum().unstack().fillna(0) 138 | grouped = grouped.sort_index() 139 | 140 | def plot_fn(axis): 141 | grouped.plot(kind="bar", stacked=True, figsize=(14, 6), colormap="viridis", ax=axis) 142 | axis.set_xticks(range(len(grouped.index))) 143 | axis.set_xticklabels([str(label) for label in grouped.index], rotation=45) 144 | axis.legend(title="Heart Rate Zone") 145 | 146 | # pylint: disable=R0801 147 | utils.plot_with_common_setup( 148 | title="Training Intensity by Heart Rate Zone", 149 | xlabel="Month", 150 | ylabel="Time Spent (minutes)", 151 | output_path=output_path, 152 | plot_func=plot_fn, 153 | ) 154 | # pylint: enable=R0801 155 | 156 | 157 | def plot_run_start_time_distribution(activities_df: pd.DataFrame, output_path: str) -> None: 158 | """ 159 | Box plot showing distribution of run start times by month. 160 | - X-axis: Month (Jan–Dec) 161 | - Y-axis: Hour of day (0–23) 162 | """ 163 | if activities_df.empty: 164 | return 165 | 166 | data = utils.prepare_activities_with_distance(activities_df) 167 | data["start_time"] = pd.to_datetime(data["start_date_local"], errors="coerce") 168 | data["hour"] = data["start_time"].dt.hour 169 | 170 | if data[["month", "hour"]].dropna().empty: 171 | return 172 | 173 | def plot_fn(axis): 174 | sns.boxplot(data=data, x="month", y="hour", ax=axis) 175 | axis.set_xticks(ticks=range(0, 12)) 176 | axis.set_xticklabels(labels=calendar.month_abbr[1:13]) 177 | 178 | # pylint: disable=R0801 179 | utils.plot_with_common_setup( 180 | title="Distribution of Run Start Time by Month", 181 | xlabel="Month", 182 | ylabel="Start Hour of Day", 183 | output_path=output_path, 184 | plot_func=plot_fn, 185 | ) 186 | # pylint: enable=R0801 187 | 188 | 189 | def plot_run_days_heatmap(activities_df: pd.DataFrame, output_path: str) -> None: 190 | """ 191 | Heatmap showing number of days with runs per month. 192 | Highlights how consistently you trained. 
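    Multiple runs on the same calendar day count once, thanks to
    drop_duplicates(subset="date"); an illustrative example:

        >>> import pandas as pd
        >>> df = pd.DataFrame({"date": ["2024-01-01", "2024-01-01", "2024-01-02"]})
        >>> len(df.drop_duplicates(subset="date"))
        2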
193 | """ 194 | if activities_df.empty: 195 | return 196 | 197 | data = activities_df.copy() 198 | data["date"] = pd.to_datetime(data["start_date_local"]).dt.date 199 | data["year"] = pd.to_datetime(data["start_date_local"]).dt.year 200 | data["month"] = pd.to_datetime(data["start_date_local"]).dt.month 201 | 202 | run_days = data.drop_duplicates(subset="date") 203 | summary = run_days.groupby(["year", "month"]).size().reset_index(name="run_day_count") 204 | pivot = summary.pivot(index="year", columns="month", values="run_day_count") 205 | 206 | def plot_fn(axis): 207 | sns.heatmap( 208 | pivot, 209 | annot=pivot, 210 | fmt=".0f", 211 | cmap="Greens", 212 | cbar_kws={"label": "Run Days"}, 213 | mask=pivot.isna(), 214 | ax=axis, 215 | ) 216 | utils.label_month_axis_barplot(axis) 217 | axis.set_xlabel("Month") 218 | axis.set_ylabel("Year") 219 | 220 | # pylint: disable=R0801 221 | utils.plot_with_common_setup( 222 | title="Run Days per Month", 223 | xlabel="Month", 224 | ylabel="Year", 225 | output_path=output_path, 226 | plot_func=plot_fn, 227 | ) 228 | # pylint: enable=R0801 229 | 230 | 231 | def plot_rest_days_heatmap(activities_df: pd.DataFrame, output_path: str) -> None: 232 | """ 233 | Heatmap showing number of rest days per month. 234 | Only annotates months where rest days occurred. 235 | """ 236 | if activities_df.empty: 237 | return 238 | 239 | data = activities_df.copy() 240 | data["date"] = pd.to_datetime(data["start_date_local"]).dt.date 241 | 242 | start = data["date"].min() 243 | end = data["date"].max() 244 | full_dates = pd.DataFrame({"date": [d.date() for d in pd.date_range(start, end)]}) 245 | 246 | rest_days = full_dates[~full_dates["date"].isin(data["date"])].copy() 247 | rest_days["year"] = pd.to_datetime(rest_days["date"]).dt.year 248 | rest_days["month"] = pd.to_datetime(rest_days["date"]).dt.month 249 | 250 | rest_summary = rest_days.groupby(["year", "month"]).size().reset_index(name="rest_day_count") 251 | pivot = rest_summary.pivot(index="year", columns="month", values="rest_day_count") 252 | 253 | def plot_fn(axis): 254 | sns.heatmap( 255 | pivot, 256 | annot=pivot, 257 | fmt=".0f", 258 | cmap="Reds", 259 | cbar_kws={"label": "Rest Days"}, 260 | mask=pivot.isna(), 261 | ax=axis, 262 | ) 263 | utils.label_month_axis_barplot(axis) 264 | axis.set_xlabel("Month") 265 | axis.set_ylabel("Year") 266 | 267 | # pylint: disable=R0801 268 | utils.plot_with_common_setup( 269 | title="Rest Days per Month", 270 | xlabel="Month", 271 | ylabel="Year", 272 | output_path=output_path, 273 | plot_func=plot_fn, 274 | ) 275 | # pylint: enable=R0801 276 | 277 | 278 | def plot_run_rest_ratio_heatmap(activities_df: pd.DataFrame, output_path: str) -> None: 279 | """ 280 | Heatmap showing the run:rest ratio per month with colour-coded zones: 281 | - Green = Balanced (0.25–0.9) 282 | - Red = High (overtraining) 283 | - Yellow = Low (undertraining) 284 | """ 285 | if activities_df.empty: 286 | return 287 | 288 | data = activities_df.copy() 289 | data["date"] = pd.to_datetime(data["start_date_local"]).dt.date 290 | 291 | start = data["date"].min() 292 | end = data["date"].max() 293 | all_dates = pd.DataFrame({"date": [d.date() for d in pd.date_range(start, end)]}) 294 | all_dates["year"] = pd.to_datetime(all_dates["date"]).dt.year 295 | all_dates["month"] = pd.to_datetime(all_dates["date"]).dt.month 296 | 297 | run_dates = data.drop_duplicates(subset="date")[["date"]].copy() 298 | run_dates["ran"] = 1 299 | 300 | merged = all_dates.merge(run_dates, on="date", how="left") 301 | merged["ran"] = 
merged["ran"].fillna(0) 302 | 303 | summary = ( 304 | merged.groupby(["year", "month"])["ran"] 305 | .agg(run_days="sum", total_days="count") 306 | .reset_index() 307 | ) 308 | summary["run_rest_ratio"] = summary["run_days"] / summary["total_days"] 309 | pivot = summary.pivot(index="year", columns="month", values="run_rest_ratio") 310 | 311 | cmap = ListedColormap(["#FFD700", "#32CD32", "#FF6347"]) 312 | bounds = [0, 0.25, 0.9, 1.0] 313 | norm = BoundaryNorm(bounds, cmap.N) 314 | 315 | def plot_fn(axis): 316 | sns.heatmap( 317 | pivot, 318 | annot=pivot, 319 | fmt=".2f", 320 | cmap=cmap, 321 | norm=norm, 322 | cbar_kws={"label": "Run:Rest Ratio"}, 323 | mask=pivot.isna(), 324 | linewidths=0.5, 325 | linecolor="white", 326 | ax=axis, 327 | ) 328 | utils.label_month_axis_barplot(axis) 329 | axis.set_xlabel("Month") 330 | axis.set_ylabel("Year") 331 | 332 | # pylint: disable=R0801 333 | utils.plot_with_common_setup( 334 | title="Run:Rest Ratio per Month", 335 | xlabel="Month", 336 | ylabel="Year", 337 | output_path=output_path, 338 | plot_func=plot_fn, 339 | ) 340 | # pylint: enable=R0801 341 | 342 | 343 | def plot_heatmap_activities(activities_df: pd.DataFrame, output_path: str) -> None: 344 | """ 345 | Heatmap of Activities by Day and Hour: 346 | - x-axis: hour of day (0–23) 347 | - y-axis: day of week 348 | - cell = count of runs 349 | """ 350 | if activities_df.empty: 351 | return 352 | 353 | activity_data = activities_df.copy() 354 | dt_col = pd.to_datetime(activity_data["start_date_local"]) 355 | activity_data["weekday"] = dt_col.dt.weekday 356 | activity_data["hour"] = dt_col.dt.hour 357 | 358 | pivot = activity_data.groupby(["weekday", "hour"]).size().unstack(fill_value=0) 359 | 360 | def plot_fn(axis): 361 | sns.heatmap(pivot, cmap="YlGnBu", cbar_kws={"label": "Count of Runs"}, ax=axis) 362 | axis.set_xlabel("Hour of Day") 363 | axis.set_ylabel("Day of Week") 364 | ylabels = [calendar.day_name[i] for i in pivot.index] 365 | axis.set_yticks(ticks=np.arange(0.5, 7.5, 1)) 366 | axis.set_yticklabels(labels=ylabels, rotation=0) 367 | 368 | # pylint: disable=R0801 369 | utils.plot_with_common_setup( 370 | title="Heatmap of Activities by Day and Hour", 371 | xlabel="Hour of Day", 372 | ylabel="Day of Week", 373 | output_path=output_path, 374 | plot_func=plot_fn, 375 | ) 376 | # pylint: enable=R0801 377 | -------------------------------------------------------------------------------- /src/strava_data/ml/training_advisor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates a weekly training recommendation chart based on recent Strava activity. 3 | 4 | - Analyses the past 6 months to determine usual running days. 5 | - Reviews the last 8 weeks of runs to identify missing session types. 6 | - Uses historical pace data to tailor pace suggestions. 7 | - Assigns recommended runs to the days you typically train. 
8 | - Outputs a visual table chart to the supplied output_path (main.py passes A.I._Recommended_Training.png) 9 | """ 10 | 11 | from collections import Counter 12 | import pandas as pd 13 | import matplotlib.pyplot as plt 14 | from matplotlib.table import Table 15 | 16 | from strava_data.strava_api.visualisation.utils import ( 17 | prepare_dated_activities, 18 | format_pace, 19 | save_and_close_plot, 20 | ) 21 | from strava_data.ml.run_type_classifier import cluster_run_types, build_run_features 22 | from strava_data.ml.utils import prepare_pace_summary 23 | from utils.logger import get_logger 24 | 25 | LOGGER = get_logger() 26 | 27 | 28 | def generate_training_plan_chart( 29 | activities_df: pd.DataFrame, splits_df: pd.DataFrame, output_path: str 30 | ) -> None: 31 | """ 32 | Generates and saves a visual weekly training plan as a PNG file. 33 | 34 | This function analyses recent training behaviour and fitness gaps to suggest 35 | up to 5 structured runs per week. The resulting schedule balances intensity, 36 | avoids overtraining, and aligns with the user's typical running days. 37 | 38 | Parameters: 39 | activities_df (pd.DataFrame): DataFrame of all Strava activity-level data. 40 | splits_df (pd.DataFrame): DataFrame of all split-level running data. 41 | output_path (str): File path where the training plan image will be saved. 42 | """ 43 | LOGGER.info("Generating training recommendation chart...") 44 | 45 | ( 46 | preferred_days, 47 | run_counts, 48 | fast_pace, 49 | median_pace, 50 | slow_pace, 51 | max_recommendations, 52 | ) = _get_recent_data(activities_df, splits_df) 53 | 54 | recommendations = _generate_recommendations( 55 | run_counts, fast_pace, median_pace, slow_pace, max_recommendations 56 | ) 57 | 58 | assigned = _assign_runs_to_days(recommendations, preferred_days) 59 | _render_training_table(assigned, output_path) 60 | 61 | 62 | def _get_recent_data(activities_df, splits_df): 63 | """ 64 | Extracts recent training metadata needed to generate a personalised training plan. 65 | 66 | This includes: 67 | - The user's preferred training days (based on frequency over 6 months) 68 | - Run type distribution over the past 8 weeks 69 | - Recent representative pace metrics (fast, median, slow) 70 | - Target number of runs to recommend this week 71 | 72 | Parameters: 73 | activities_df (pd.DataFrame): DataFrame of all Strava activities. 74 | splits_df (pd.DataFrame): DataFrame of all split-level records. 75 | 76 | Returns: 77 | tuple: 78 | preferred_days (list): Ordered list of most common training days. 79 | run_counts (Counter): Frequency of run types over recent period. 80 | fast_pace (float): 25th percentile pace in seconds/km. 81 | median_pace (float): Median pace in seconds/km. 82 | slow_pace (float): 75th percentile pace in seconds/km. 83 | max_recommendations (int): Number of runs to recommend this week. 
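    The run count is clamped so the plan never suggests fewer than 3 or more
    than 5 sessions; illustrative examples of the max(3, min(5, n)) clamp
    used in the function body:

        >>> max(3, min(5, 6))
        5
        >>> max(3, min(5, 2))
        3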
84 | """ 85 | preferred_days, recent_data = _get_recent_days(activities_df) 86 | recent_splits = _get_recent_splits(splits_df) 87 | run_counts = _get_recent_run_counts(splits_df, recent_splits) 88 | pace_data = prepare_pace_summary(recent_splits, group_cols=["activity_id"]) 89 | median_pace = pace_data["pace_median"].median() 90 | fast_pace = pace_data["pace_median"].quantile(0.25) 91 | slow_pace = pace_data["pace_median"].quantile(0.75) 92 | 93 | weekly_counts = recent_data.groupby(recent_data["start_date"].dt.isocalendar().week).size() 94 | average_weekly_runs = int(round(weekly_counts.mean() + 0.5)) 95 | max_recommendations = max(3, min(5, average_weekly_runs)) 96 | 97 | return preferred_days, run_counts, fast_pace, median_pace, slow_pace, max_recommendations 98 | 99 | 100 | def _get_recent_run_counts(splits_df: pd.DataFrame, recent_splits: pd.DataFrame) -> Counter: 101 | """ 102 | Computes the frequency of run types over the recent training period. 103 | 104 | Uses clustering to classify runs into types (e.g. Long, Tempo, Recovery, Intervals), 105 | then filters to include only runs from the same window as recent_splits. 106 | 107 | Parameters: 108 | splits_df (pd.DataFrame): Full split-level dataset for building run features. 109 | recent_splits (pd.DataFrame): Date-filtered splits used to define the recent period. 110 | 111 | Returns: 112 | Counter: A mapping of run type labels to their counts over the recent period. 113 | """ 114 | run_features = build_run_features(splits_df) 115 | clustered = cluster_run_types(run_features) 116 | recent_runs = clustered[clustered["start_date"] >= recent_splits["start_date"].min()] 117 | return Counter(recent_runs["run_type"]) 118 | 119 | 120 | def _get_recent_days(activities_df): 121 | """ 122 | Identifies the user's most common training days over the past 6 months. 123 | 124 | Filters activities to the last 6 months and counts frequency of runs per day of the week. 125 | 126 | Parameters: 127 | activities_df (pd.DataFrame): DataFrame containing all activity records. 128 | 129 | Returns: 130 | tuple: 131 | preferred_days (list): Day names ordered by frequency of runs. 132 | recent_data (pd.DataFrame): Filtered DataFrame with activities from the past 6 months. 133 | """ 134 | six_months_ago = pd.Timestamp.now(tz="UTC") - pd.DateOffset(months=6) 135 | recent_data = prepare_dated_activities(activities_df) 136 | recent_data = recent_data[recent_data["start_date"] >= six_months_ago] 137 | recent_data["day"] = recent_data["start_date"].dt.day_name() 138 | day_freq = Counter(recent_data["day"]) 139 | preferred_days = [d for d, _ in day_freq.most_common()] 140 | return preferred_days, recent_data 141 | 142 | 143 | def _get_recent_splits(splits_df): 144 | """ 145 | Filters split-level data to include only entries from the past 8 weeks. 146 | 147 | Converts 'start_date_local' to UTC-naive datetime and filters by date threshold. 148 | 149 | Parameters: 150 | splits_df (pd.DataFrame): DataFrame containing all split records. 151 | 152 | Returns: 153 | pd.DataFrame: Filtered DataFrame containing only recent splits. 
154 | """ 155 | eight_weeks_ago = pd.Timestamp.now(tz="UTC") - pd.DateOffset(weeks=8) 156 | recent_splits = splits_df.copy() 157 | recent_splits["start_date"] = pd.to_datetime(recent_splits["start_date_local"]) 158 | return recent_splits[recent_splits["start_date"] >= eight_weeks_ago] 159 | 160 | 161 | def _suggest_bounds(pace: float, tolerance: float = 0.05) -> str: 162 | """ 163 | Returns a formatted pace range string based on a central pace value and tolerance. 164 | 165 | Parameters: 166 | pace (float): Central pace in seconds per kilometre. 167 | tolerance (float): Proportional margin around the pace (default is 5%). 168 | 169 | Returns: 170 | str: Formatted pace range string (e.g. "4:45 – 5:00"). 171 | """ 172 | low = pace * (1 - tolerance) 173 | high = pace * (1 + tolerance) 174 | return f"{format_pace(low, None)} – {format_pace(high, None)}" 175 | 176 | 177 | def _generate_recommendations(run_counts, fast_pace, median_pace, slow_pace, max_recommendations): 178 | """ 179 | Generates a list of recommended training sessions to improve fitness balance. 180 | 181 | Scores run types based on recent frequency and prioritises under-represented types. 182 | Uses recent pace data to personalise pace ranges for each session type. 183 | Limits output to a maximum number of recommendations. 184 | 185 | Parameters: 186 | run_counts (Counter): Frequency of each run type over the recent period. 187 | fast_pace (float): 25th percentile pace from recent runs (used for intervals). 188 | median_pace (float): Median pace from recent runs (used for tempo). 189 | slow_pace (float): 75th percentile pace from recent runs (used for long and recovery). 190 | max_recommendations (int): Maximum number of runs to recommend in the week. 191 | 192 | Returns: 193 | list: Recommended run dicts including type, distance, intensity, pace, and rationale. 
194 | """ 195 | 196 | run_scores = { 197 | "Long": 1.0 - (run_counts.get("Long", 0) / 4), 198 | "Tempo": 1.0 - (run_counts.get("Tempo", 0) / 4), 199 | "Intervals": 1.0 - (run_counts.get("Intervals", 0) / 3), 200 | "Recovery": 1.0 - (run_counts.get("Recovery", 0) / 2), 201 | } 202 | sorted_types = sorted(run_scores.items(), key=lambda x: x[1], reverse=True) 203 | recommendations = [] 204 | 205 | for run_type, _ in sorted_types: 206 | if len(recommendations) >= max_recommendations: 207 | break 208 | if run_type == "Intervals": 209 | recommendations.append( 210 | { 211 | "type": "Intervals", 212 | "intensity": "Hard", 213 | "distance": "6x400m", 214 | "pace": _suggest_bounds(fast_pace), 215 | "reason": "Include interval session to improve VO2 max.", 216 | } 217 | ) 218 | elif run_type == "Long": 219 | recommendations.append( 220 | { 221 | "type": "Long", 222 | "intensity": "Easy", 223 | "distance": "14–18 km", 224 | "pace": _suggest_bounds(slow_pace), 225 | "reason": "Endurance run builds aerobic fitness.", 226 | } 227 | ) 228 | elif run_type == "Tempo": 229 | recommendations.append( 230 | { 231 | "type": "Tempo", 232 | "intensity": "Moderate–Hard", 233 | "distance": "6–10 km", 234 | "pace": _suggest_bounds(median_pace), 235 | "reason": "Tempo runs increase lactate threshold.", 236 | } 237 | ) 238 | elif run_type == "Recovery": 239 | recommendations.append( 240 | { 241 | "type": "Recovery", 242 | "intensity": "Easy", 243 | "distance": "5 km", 244 | "pace": _suggest_bounds(slow_pace), 245 | "reason": "Recovery run supports adaptation.", 246 | } 247 | ) 248 | 249 | return recommendations 250 | 251 | 252 | def _assign_runs_to_days(recommendations, preferred_days): 253 | """ 254 | Assigns recommended training sessions to days of the week based on historical patterns. 255 | 256 | Prioritises days the user typically trains (based on recent activity), 257 | avoids back-to-back hard sessions, and fills up to the number of recommended runs. 258 | 259 | Parameters: 260 | recommendations (list): A list of run recommendation dicts, each containing type, pace, etc. 261 | preferred_days (list): Days of the week sorted by user's historical training frequency. 262 | 263 | Returns: 264 | dict: A mapping of day name to assigned run recommendation. 265 | """ 266 | full_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"] 267 | assigned = {} 268 | used_days = set() 269 | previous_day = None 270 | 271 | for rec in recommendations: 272 | for day_candidate in preferred_days: 273 | if day_candidate in used_days: 274 | continue 275 | if previous_day is not None: 276 | current_idx = full_week.index(day_candidate) 277 | previous_idx = full_week.index(previous_day) 278 | if abs(current_idx - previous_idx) in (0, 1): 279 | if rec["type"] not in ("Recovery", "Easy") and assigned[previous_day][ 280 | "type" 281 | ] not in ("Recovery", "Easy"): 282 | continue 283 | assigned[day_candidate] = rec 284 | used_days.add(day_candidate) 285 | previous_day = day_candidate 286 | break 287 | 288 | return assigned 289 | 290 | 291 | def _render_training_table(assigned, output_path): 292 | """ 293 | Renders a visual training plan as a PNG table. 294 | 295 | Creates a 7-row table showing daily training recommendations using matplotlib. 296 | Each row includes the day, run type, distance, pace range, intensity, and reason. 297 | Rest days are automatically filled for unassigned days. 298 | 299 | Parameters: 300 | assigned (dict): Mapping of day name (e.g. "Monday") to a run recommendation dict. 
301 | output_path (str): File path where the output PNG image should be saved. 302 | """ 303 | full_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"] 304 | _, plot_axis = plt.subplots(figsize=(12, len(full_week) * 0.8)) 305 | plot_axis.axis("off") 306 | table = Table(plot_axis, bbox=[0, 0, 1, 1]) 307 | col_labels = ["Day", "Run Type", "Distance", "Pace", "Intensity", "Reason"] 308 | cell_text = [] 309 | 310 | for day in full_week: 311 | if day in assigned: 312 | rec = assigned[day] 313 | row = [day, rec["type"], rec["distance"], rec["pace"], rec["intensity"], rec["reason"]] 314 | else: 315 | row = [day, "Rest", "–", "–", "–", "Scheduled rest day"] 316 | cell_text.append(row) 317 | 318 | table.auto_set_font_size(False) 319 | table.set_fontsize(9) 320 | for row_idx, row in enumerate([col_labels] + cell_text): 321 | for col_idx, val in enumerate(row): 322 | cell_width = [0.08, 0.12, 0.10, 0.10, 0.12, 0.48][col_idx] 323 | table.add_cell( 324 | row_idx, 325 | col_idx, 326 | cell_width, 327 | 1 / (len(cell_text) + 1), 328 | text=val, 329 | loc="center", 330 | facecolor="#f0f0f0" if row_idx == 0 else "white", 331 | ) 332 | 333 | plot_axis.add_table(table) 334 | plt.title("Suggested Training Plan (Next Week)", fontsize=14) 335 | save_and_close_plot(output_path) 336 | -------------------------------------------------------------------------------- /src/strava_data/strava_api/visualisation/graphs_distance.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the distance chart functions, each saving a PNG file. 3 | """ 4 | 5 | import calendar 6 | from datetime import datetime 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | import seaborn as sns 10 | from matplotlib import ticker 11 | 12 | from strava_data.strava_api.visualisation import utils 13 | 14 | 15 | def plot_time_taken_over_distances(activities_df: pd.DataFrame, output_path: str) -> None: 16 | """ 17 | Time Taken Over Distances: 18 | - y-axis: total time (hh:mm:ss) with 15-minute intervals 19 | - x-axis: total distance (km) with 5 km intervals 20 | - Points colored by year 21 | - Trend line per year (same color, not labeled) 22 | - Overall trend in dashed black, labeled 23 | - Last run marked with a red X 24 | - Filters out runs shorter than 0.5 km 25 | - Decay logic: +180s/km added at max distance + 2km 26 | """ 27 | if activities_df.empty: 28 | return 29 | 30 | data = utils.prepare_time_distance_data(activities_df) 31 | if data.empty: 32 | return 33 | 34 | decay_distance, decay_time = utils.calculate_decay_point(data) 35 | palette = sns.color_palette(n_colors=data["year"].nunique()) 36 | year_color_map = dict(zip(sorted(data["year"].unique()), palette)) 37 | 38 | def plot_fn(axis): 39 | for year in sorted(data["year"].unique()): 40 | year_data = data[data["year"] == year] 41 | sns.scatterplot( 42 | data=year_data, 43 | x="distance_km", 44 | y="time_seconds", 45 | color=year_color_map[year], 46 | alpha=0.5, 47 | label=year, 48 | ax=axis, 49 | ) 50 | 51 | last_run = data[data["is_last_run"]] 52 | if not last_run.empty: 53 | axis.plot( 54 | last_run["distance_km"], 55 | last_run["time_seconds"], 56 | "x", 57 | color="red", 58 | markersize=10, 59 | label="Last Run", 60 | ) 61 | 62 | for year in sorted(data["year"].unique()): 63 | sub = data[data["year"] == year][["distance_km", "time_seconds"]].copy() 64 | sub = pd.concat( 65 | [pd.DataFrame.from_records([{"distance_km": 0, "time_seconds": 0}]), sub] 66 | ) 67 | sns.regplot( 68 | data=sub, 
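                # The (0, 0) record prepended above pulls each year's fitted line toward the origin.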
69 | x="distance_km", 70 | y="time_seconds", 71 | scatter=False, 72 | ci=None, 73 | truncate=False, 74 | line_kws={"color": year_color_map[year], "alpha": 0.6}, 75 | ax=axis, 76 | ) 77 | 78 | overall = pd.concat( 79 | [ 80 | pd.DataFrame.from_records([{"distance_km": 0, "time_seconds": 0}]), 81 | data[["distance_km", "time_seconds"]], 82 | pd.DataFrame.from_records( 83 | [{"distance_km": decay_distance, "time_seconds": decay_time}] 84 | ), 85 | ] 86 | ) 87 | sns.regplot( 88 | data=overall, 89 | x="distance_km", 90 | y="time_seconds", 91 | scatter=False, 92 | ci=None, 93 | color="black", 94 | line_kws={"linestyle": "--"}, 95 | ax=axis, 96 | label="Overall Trend", 97 | truncate=False, 98 | ) 99 | 100 | axis.yaxis.set_major_formatter(ticker.FuncFormatter(utils.seconds_to_hms)) 101 | axis.yaxis.set_major_locator(ticker.MultipleLocator(15 * 60)) 102 | axis.xaxis.set_major_locator(ticker.MultipleLocator(5)) 103 | axis.set_xlim(0, (int(decay_distance / 5) + 1) * 5) 104 | axis.set_ylim(0, (int((decay_time * 1.05) / (15 * 60)) + 1) * (15 * 60)) 105 | axis.legend(title="Year") 106 | 107 | # pylint: disable=R0801 108 | utils.plot_with_common_setup( 109 | title="Time Taken Over Distances", 110 | xlabel="Distance (km)", 111 | ylabel="Time Taken (hh:mm:ss)", 112 | output_path=output_path, 113 | plot_func=plot_fn, 114 | ) 115 | # pylint: enable=R0801 116 | 117 | 118 | def plot_time_taken_over_distances_recent_years( 119 | activities_df: pd.DataFrame, 120 | output_path: str, 121 | ) -> None: 122 | """ 123 | Time Taken Over Distances (Recent Years Only): 124 | - Same as plot_time_taken_over_distances but filtered to current and previous year. 125 | """ 126 | if activities_df.empty: 127 | return 128 | 129 | activities_df["start_date_local"] = pd.to_datetime(activities_df["start_date_local"]) 130 | 131 | current_year = datetime.now().year 132 | years_to_include = {current_year, current_year - 1} 133 | filtered_df = activities_df[activities_df["start_date_local"].dt.year.isin(years_to_include)] 134 | 135 | # Reuse original plotting function on filtered data 136 | plot_time_taken_over_distances(filtered_df, output_path) 137 | 138 | 139 | def plot_pace_vs_total_distance(splits_df: pd.DataFrame, output_path: str) -> None: 140 | """ 141 | Running pace vs total distance of that run: 142 | - x-axis: total distance (km) 143 | - y-axis: average pace (mm:ss per km) 144 | - Points colored by year 145 | - Trend lines by year (matched color, not shown in legend) 146 | """ 147 | if splits_df.empty: 148 | return 149 | 150 | data = utils.prepare_pace_distance_data(splits_df) 151 | if data.empty: 152 | return 153 | 154 | max_distance = data["distance_km"].max() 155 | palette = sns.color_palette(n_colors=data["year"].nunique()) 156 | year_color_map = dict(zip(sorted(data["year"].unique()), palette)) 157 | 158 | def plot_fn(axis): 159 | for year in sorted(data["year"].unique()): 160 | year_data = data[data["year"] == year] 161 | sns.scatterplot( 162 | data=year_data, 163 | x="distance_km", 164 | y="pace_sec", 165 | color=year_color_map[year], 166 | alpha=0.5, 167 | label=year, 168 | ax=axis, 169 | ) 170 | 171 | for year in sorted(data["year"].unique()): 172 | year_data = data[data["year"] == year].copy() 173 | if year_data.empty: 174 | continue 175 | distance_max = year_data["distance_km"].max() 176 | pace_max = year_data["pace_sec"].max() 177 | decay_distance = distance_max + 2 178 | decay_pace = pace_max + 180 179 | 180 | extended_data = pd.concat( 181 | [ 182 | year_data, 183 | pd.DataFrame.from_records( 184 | 
[{"distance_km": decay_distance, "pace_sec": decay_pace}] 185 | ), 186 | ] 187 | ) 188 | sns.regplot( 189 | data=extended_data, 190 | x="distance_km", 191 | y="pace_sec", 192 | scatter=False, 193 | ci=None, 194 | truncate=False, 195 | line_kws={"color": year_color_map[year], "alpha": 0.6}, 196 | ax=axis, 197 | ) 198 | 199 | axis.set_xlim(0, max_distance + 3) 200 | axis.yaxis.set_major_formatter(plt.FuncFormatter(utils.format_pace)) 201 | axis.legend(title="Year") 202 | 203 | # pylint: disable=R0801 204 | utils.plot_with_common_setup( 205 | title="Running Pace vs. Total Distance", 206 | xlabel="Total Distance (km)", 207 | ylabel="Average Pace (mm:ss per km)", 208 | output_path=output_path, 209 | plot_func=plot_fn, 210 | ) 211 | # pylint: enable=R0801 212 | 213 | 214 | def plot_number_of_runs_per_distance(activities_df: pd.DataFrame, output_path: str) -> None: 215 | """ 216 | Number of runs per distance: 217 | - Bar graph showing grouped distances (<5 km, 5–10 km, etc.) 218 | - Bars per year + an overall bar 219 | """ 220 | data = utils.prepare_dated_activities(activities_df) 221 | data["distance_bin"] = pd.cut( 222 | data["distance_km"], 223 | bins=[0, 5, 10, 15, 20, 25, 30, 9999], 224 | labels=["<5", "5–10", "10–15", "15–20", "20–25", "25–30", "30+"], 225 | include_lowest=True, 226 | ) 227 | 228 | grouped = data.groupby(["distance_bin", "year"]).size().reset_index(name="count") 229 | 230 | def plot_fn(axis): 231 | sns.barplot(data=grouped, x="distance_bin", y="count", hue="year", errorbar=None, ax=axis) 232 | 233 | # pylint: disable=R0801 234 | utils.plot_with_common_setup( 235 | title="Number of Runs per Distance", 236 | xlabel="Distance Range (km)", 237 | ylabel="Count of Runs", 238 | output_path=output_path, 239 | plot_func=plot_fn, 240 | ) 241 | # pylint: enable=R0801 242 | 243 | 244 | def plot_total_distance_by_month(activities_df: pd.DataFrame, output_path: str) -> None: 245 | """ 246 | Total distance run by month: 247 | - x-axis: months (Jan–Dec) 248 | - y-axis: total distance run (km) 249 | - Separate line graph for each year 250 | """ 251 | data = utils.prepare_dated_activities(activities_df) 252 | monthly_totals = data.groupby(["year", "month"])["distance_km"].sum().reset_index() 253 | 254 | def plot_fn(axis): 255 | for year in sorted(monthly_totals["year"].unique()): 256 | year_data = monthly_totals[monthly_totals["year"] == year].sort_values("month") 257 | axis.plot( 258 | year_data["month"], 259 | year_data["distance_km"], 260 | marker="o", 261 | linestyle="-", 262 | label=str(year), 263 | ) 264 | utils.label_month_axis(axis) 265 | axis.legend(title="Year") 266 | 267 | # pylint: disable=R0801 268 | utils.plot_with_common_setup( 269 | title="Total Distance Run by Month", 270 | xlabel="Month", 271 | ylabel="Total Distance (km)", 272 | output_path=output_path, 273 | plot_func=plot_fn, 274 | ) 275 | # pylint: enable=R0801 276 | 277 | 278 | def plot_cumulative_distance_over_time(activities_df: pd.DataFrame, output_path: str) -> None: 279 | """ 280 | Cumulative distance per month: 281 | - x-axis: ['Jan', 'Feb', ..., 'Dec'] 282 | - y-axis: cumulative distance (km) 283 | - Separate line per year 284 | """ 285 | data = utils.prepare_dated_activities(activities_df) 286 | monthly_df = data.groupby(["year", "month"])["distance_km"].sum().reset_index() 287 | 288 | def plot_fn(axis): 289 | for year in sorted(monthly_df["year"].unique()): 290 | sub = monthly_df[monthly_df["year"] == year].copy() 291 | sub = sub.set_index("month").reindex(range(1, 13), fill_value=0).reset_index() 292 | 
sub["cum_dist"] = sub["distance_km"].cumsum() 293 | axis.plot(sub["month"], sub["cum_dist"], marker="o", label=str(year)) 294 | utils.label_month_axis(axis) 295 | axis.legend(title="Year") 296 | 297 | # pylint: disable=R0801 298 | utils.plot_with_common_setup( 299 | title="Cumulative Distance per Year", 300 | xlabel="Month", 301 | ylabel="Cumulative Distance (km)", 302 | output_path=output_path, 303 | plot_func=plot_fn, 304 | ) 305 | # pylint: enable=R0801 306 | 307 | 308 | def plot_monthly_distance_by_year_grouped(activities_df: pd.DataFrame, output_path: str) -> None: 309 | """ 310 | Clustered bar chart comparing total monthly distance by year. 311 | - X-axis: Month (Jan–Dec) 312 | - Y-axis: Total distance (km) 313 | - Grouped by year 314 | """ 315 | data = utils.prepare_dated_activities(activities_df) 316 | grouped = data.groupby(["month", "year"])["distance_km"].sum().reset_index() 317 | 318 | pivot = grouped.pivot(index="month", columns="year", values="distance_km").fillna(0) 319 | pivot = pivot.sort_index() 320 | month_labels = [calendar.month_abbr[m] for m in pivot.index] 321 | 322 | def plot_fn(axis): 323 | pivot.plot(kind="bar", width=0.8, ax=axis) 324 | axis.set_xticks(range(len(month_labels))) 325 | axis.set_xticklabels(month_labels, rotation=45) 326 | axis.legend(title="Year") 327 | 328 | # pylint: disable=R0801 329 | utils.plot_with_common_setup( 330 | title="Year-over-Year Monthly Distance Comparison", 331 | xlabel="Month", 332 | ylabel="Total Distance (km)", 333 | output_path=output_path, 334 | plot_func=plot_fn, 335 | ) 336 | # pylint: enable=R0801 337 | 338 | 339 | def plot_rolling_distance(activities_df: pd.DataFrame, output_path: str, window: int = 30) -> None: 340 | """ 341 | Line graph showing rolling X-day distance total. 342 | Default window = 30 days. 343 | """ 344 | data = utils.prepare_dated_activities(activities_df) 345 | daily = data.groupby("start_date")["distance_km"].sum().reset_index() 346 | daily["rolling_distance_km"] = daily["distance_km"].rolling(window=window).sum() 347 | 348 | def plot_fn(axis): 349 | axis.plot(daily["start_date"], daily["rolling_distance_km"], color="blue", linewidth=2) 350 | 351 | # pylint: disable=R0801 352 | utils.plot_with_common_setup( 353 | title=f"Rolling {window}-Day Distance", 354 | xlabel="Date", 355 | ylabel="Distance (km)", 356 | output_path=output_path, 357 | plot_func=plot_fn, 358 | ) 359 | # pylint: enable=R0801 360 | 361 | 362 | def plot_longest_run_per_month(activities_df: pd.DataFrame, output_path: str) -> None: 363 | """ 364 | Scatter plot of longest run per month across all years. 
365 | - X-axis: month (Jan–Dec) 366 | - Y-axis: longest run (km) 367 | - Points: one per year-month, only if a run occurred 368 | - Colour-coded by year 369 | """ 370 | data = utils.prepare_dated_activities(activities_df) 371 | longest = data.groupby(["year", "month"])["distance_km"].max().reset_index() 372 | 373 | def plot_fn(axis): 374 | for year in sorted(longest["year"].unique()): 375 | year_data = longest[longest["year"] == year] 376 | axis.scatter( 377 | year_data["month"], 378 | year_data["distance_km"], 379 | label=str(year), 380 | alpha=0.7, 381 | s=60, 382 | ) 383 | utils.label_month_axis(axis) 384 | axis.legend(title="Year") 385 | 386 | # pylint: disable=R0801 387 | utils.plot_with_common_setup( 388 | title="Longest Run per Month", 389 | xlabel="Month", 390 | ylabel="Distance (km)", 391 | output_path=output_path, 392 | plot_func=plot_fn, 393 | ) 394 | # pylint: enable=R0801 395 | -------------------------------------------------------------------------------- /src/strava_data/strava_api/visualisation/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for chart styling or other shared visualisation helpers. 3 | """ 4 | 5 | import calendar 6 | from dataclasses import dataclass 7 | import datetime 8 | from typing import Callable, Optional, Tuple, List 9 | 10 | import matplotlib.pyplot as plt 11 | from matplotlib.patches import FancyBboxPatch 12 | from matplotlib.text import Text 13 | import numpy as np 14 | import pandas as pd 15 | 16 | DOB = datetime.datetime(1985, 1, 26) 17 | 18 | 19 | @dataclass(frozen=True) 20 | class TitleBoxConfig: 21 | """Configuration for the title + attribution banner box.""" 22 | 23 | attribution: Optional[str] = "Data sourced from Garmin (synced via Strava)" 24 | fontsizes: Tuple[int, int] = (14, 9) # (title_fontsize, subtitle_fontsize) 25 | offsets: Tuple[float, float] = (0.03, 1.3) # (top_offset, line_height_scale) 26 | gap_and_pad: Tuple[float, float] = (0.02, 0.006) # (min_gap, box_pad) 27 | box_lr: Tuple[float, float] = (0.05, 0.95) # (box_left, box_right) 28 | 29 | 30 | def _occupied_content_top(fig: plt.Figure, renderer) -> float: 31 | """ 32 | Highest occupied y in figure coords considering axes *tight* bboxes 33 | (includes tick labels/rotations) and any visible legends. 34 | Falls back to _axes_top(fig) if nothing measurable is found. 
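
    Illustrative use (sketch; the figure must already be drawn so a renderer exists):

        renderer = fig.canvas.get_renderer()
        top = _occupied_content_top(fig, renderer)  # e.g. 0.93, in figure coords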
35 | """ 36 | tops: List[float] = [] 37 | try: 38 | inv = fig.transFigure.inverted() 39 | except (AttributeError, ValueError): 40 | inv = None 41 | 42 | for axis in getattr(fig, "axes", []): 43 | try: 44 | if not axis.get_visible(): 45 | continue 46 | except (AttributeError, ValueError): 47 | pass 48 | 49 | # Axes tight bbox (ticks/labels included) 50 | try: 51 | tight_bbox = axis.get_tightbbox(renderer) 52 | if tight_bbox is not None: 53 | tops.append(tight_bbox.transformed(inv).y1 if inv is not None else tight_bbox.y1) 54 | except (AttributeError, ValueError): 55 | pass 56 | 57 | # Legend bbox (if present) 58 | try: 59 | legend = axis.get_legend() 60 | if legend is not None and legend.get_visible(): 61 | legend_bbox = legend.get_window_extent(renderer=renderer) 62 | if legend_bbox is not None: 63 | tops.append( 64 | legend_bbox.transformed(inv).y1 if inv is not None else legend_bbox.y1 65 | ) 66 | except (AttributeError, ValueError): 67 | pass 68 | 69 | return max(tops) if tops else _axes_top(fig) 70 | 71 | 72 | def _reserve_space_above_axes( 73 | fig: plt.Figure, top_limit: float, *, min_bottom: float = 0.05 74 | ) -> None: 75 | """ 76 | Ensure no axes extend above top_limit (figure coords). 77 | Prefer shifting axes down; if that would push the bottom below min_bottom, 78 | trim from the top instead. 79 | """ 80 | for axis in fig.axes: 81 | if not axis.get_visible(): 82 | continue 83 | 84 | pos = axis.get_position() 85 | if pos.y1 <= top_limit + 1e-6: 86 | continue 87 | 88 | excess = pos.y1 - top_limit 89 | new_y0 = pos.y0 - excess 90 | new_y1 = top_limit 91 | 92 | # If we can't shift without going below min_bottom, shrink from the top 93 | if new_y0 < min_bottom: 94 | new_y0 = pos.y0 95 | new_y1 = top_limit 96 | 97 | axis.set_position([pos.x0, new_y0, pos.width, new_y1 - new_y0]) 98 | 99 | 100 | def _finalise_and_get_renderer(fig: plt.Figure): 101 | """Draw once so constrained layout is final and return the renderer.""" 102 | fig.canvas.draw() 103 | return fig.canvas.get_renderer() 104 | 105 | 106 | def _axes_top(fig: plt.Figure) -> float: 107 | """Top (y1) of the highest visible axes in figure coords.""" 108 | return max(ax.get_position().y1 for ax in fig.axes if ax.get_visible()) 109 | 110 | 111 | def _line_height(fig: plt.Figure, fontsize_pt: int, scale: float) -> float: 112 | """ 113 | Convert a font size in points to a figure-coordinate line height. 114 | 115 | Args: 116 | fig: Matplotlib figure. 117 | fontsize_pt: Font size in points. 118 | scale: Multiplier to adjust line spacing. 119 | 120 | Returns: 121 | Line height in figure coordinates. 122 | """ 123 | fig_h_in = fig.get_size_inches()[1] 124 | return (fontsize_pt / 72.0) / fig_h_in * scale 125 | 126 | 127 | def _place_texts( 128 | fig: plt.Figure, 129 | title: str, 130 | attribution: Optional[str], 131 | *, 132 | title_y: float, 133 | subtitle_y: float, 134 | title_fontsize: int, 135 | subtitle_fontsize: int, 136 | ) -> Tuple[Text, Optional[Text]]: 137 | """ 138 | Create title and subtitle Text artists at the given y positions. 139 | 140 | Returns: 141 | Tuple of (title_text, attribution_text_or_None). 
142 | """ 143 | title_txt = fig.text( 144 | 0.5, 145 | title_y, 146 | title, 147 | ha="center", 148 | va="bottom", 149 | fontsize=title_fontsize, 150 | color="black", 151 | zorder=3, 152 | ) 153 | attr_txt = None 154 | if attribution: 155 | attr_txt = fig.text( 156 | 0.5, 157 | subtitle_y, 158 | attribution, 159 | ha="center", 160 | va="bottom", 161 | fontsize=subtitle_fontsize, 162 | color="gray", 163 | zorder=3, 164 | ) 165 | return title_txt, attr_txt 166 | 167 | 168 | def _measure_text_bounds(fig: plt.Figure, renderer, artists: List[Text]) -> Tuple[float, float]: 169 | """ 170 | Return the vertical span of the given text elements in **figure coordinates**. 171 | 172 | This computes the minimum and maximum y values across all provided text objects 173 | after transforming their bounding boxes into the figure's coordinate system 174 | (0–1 in both x and y). 175 | 176 | Returns: 177 | (ymin, ymax) of all provided text artists. 178 | """ 179 | y_bounds: List[Tuple[float, float]] = [] 180 | for artist in artists: 181 | if artist is None: 182 | continue 183 | bounding_box = artist.get_window_extent(renderer).transformed(fig.transFigure.inverted()) 184 | y_bounds.append((bounding_box.ymin, bounding_box.ymax)) 185 | ymin = min(b[0] for b in y_bounds) 186 | ymax = max(b[1] for b in y_bounds) 187 | return ymin, ymax 188 | 189 | 190 | def _shift_texts(title_txt: Text, attr_txt: Optional[Text], shift: float) -> None: 191 | """ 192 | Shift title and attribution texts upward by a given amount. 193 | 194 | Args: 195 | title_txt: The title Text artist. 196 | attr_txt: The attribution Text artist (or None). 197 | shift: Amount to add to Y position in figure coords. 198 | """ 199 | x_title, y_title = title_txt.get_position() 200 | title_txt.set_position((x_title, y_title + shift)) 201 | if attr_txt is not None: 202 | x_attr, y_attr = attr_txt.get_position() 203 | attr_txt.set_position((x_attr, y_attr + shift)) 204 | 205 | 206 | def _lift_if_needed( 207 | fig: plt.Figure, 208 | *, 209 | min_gap: float, 210 | box_pad: float, 211 | title_txt: Text, 212 | attr_txt: Optional[Text], 213 | ) -> Tuple[float, float]: 214 | """ 215 | Ensure the box bottom clears the axes by min_gap. 216 | If needed, shift both texts upward (clamped to the figure top). 217 | 218 | Returns: 219 | (ymin, ymax) of the final text union after any shift. 
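
    Example (illustrative, not a doctest):

        ymin, ymax = _lift_if_needed(
            fig, min_gap=0.02, box_pad=0.006, title_txt=title_txt, attr_txt=attr_txt
        )
        # The returned bounds already reflect any upward shift of the banner.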
220 | """ 221 | renderer = _finalise_and_get_renderer(fig) 222 | 223 | ymin, ymax = _measure_text_bounds(fig, renderer, [title_txt, attr_txt]) 224 | box_bottom = ymin - box_pad 225 | occupied_top = _occupied_content_top(fig, renderer) 226 | 227 | # Small extra buffer if any legend is visible (avoids near misses) 228 | extra = ( 229 | 0.01 230 | if any((legend := ax.get_legend()) is not None and legend.get_visible() for ax in fig.axes) 231 | else 0.0 232 | ) 233 | 234 | required_bottom = occupied_top + (min_gap + extra) 235 | if box_bottom >= required_bottom: 236 | return ymin, ymax 237 | 238 | # Need to lift the banner 239 | shift = min( 240 | required_bottom - box_bottom, 241 | max(0.0, 0.995 - title_txt.get_position()[1]), 242 | ) 243 | _shift_texts(title_txt, attr_txt, shift) 244 | 245 | fig.canvas.draw() # re-measure after moving 246 | return _measure_text_bounds(fig, fig.canvas.get_renderer(), [title_txt, attr_txt]) 247 | 248 | 249 | def _draw_background_box( 250 | fig: plt.Figure, 251 | *, 252 | ymin: float, 253 | ymax: float, 254 | box_left: float, 255 | box_right: float, 256 | box_pad: float, 257 | ) -> None: 258 | """ 259 | Draw a white rounded rectangle behind the title and subtitle. 260 | 261 | Args: 262 | fig: Matplotlib figure. 263 | ymin: Lower y-bound of text union (figure coords). 264 | ymax: Upper y-bound of text union (figure coords). 265 | box_left: Left x-position of box (figure coords). 266 | box_right: Right x-position of box (figure coords). 267 | box_pad: Padding applied above/below the text union (figure coords). 268 | """ 269 | fig.patches.append( 270 | FancyBboxPatch( 271 | (box_left, ymin - box_pad), 272 | (box_right - box_left), 273 | (ymax - ymin) + 2 * box_pad, 274 | transform=fig.transFigure, 275 | boxstyle="round,pad=0.004,rounding_size=0.01", 276 | facecolor="white", 277 | edgecolor="lightgray", 278 | linewidth=0.8, 279 | alpha=0.95, 280 | zorder=2, 281 | ) 282 | ) 283 | 284 | 285 | def add_title_with_attribution( 286 | fig: plt.Figure, 287 | title: str, 288 | config: TitleBoxConfig = TitleBoxConfig(), 289 | ) -> None: 290 | """ 291 | Add a title and optional attribution above the plot area, automatically 292 | lifting them if they would overlap the axes, and drawing a rounded 293 | background box behind both lines. 294 | """ 295 | if not fig.axes: 296 | return 297 | 298 | axes_top = _axes_top(fig) 299 | subtitle_y = axes_top + config.offsets[0] 300 | title_y = subtitle_y + _line_height(fig, config.fontsizes[1], config.offsets[1]) 301 | 302 | title_txt, attr_txt = _place_texts( 303 | fig, 304 | title, 305 | config.attribution, 306 | title_y=title_y, 307 | subtitle_y=subtitle_y, 308 | title_fontsize=config.fontsizes[0], 309 | subtitle_fontsize=config.fontsizes[1], 310 | ) 311 | 312 | ymin, ymax = _lift_if_needed( 313 | fig, 314 | min_gap=config.gap_and_pad[0], 315 | box_pad=config.gap_and_pad[1], 316 | title_txt=title_txt, 317 | attr_txt=attr_txt, 318 | ) 319 | 320 | # Push axes down so they clear the banner by at least min_gap 321 | header_bottom = ymin - config.gap_and_pad[1] 322 | top_limit = header_bottom - config.gap_and_pad[0] 323 | _reserve_space_above_axes(fig, top_limit) 324 | fig.canvas.draw() 325 | 326 | _draw_background_box( 327 | fig, 328 | ymin=ymin, 329 | ymax=ymax, 330 | box_left=config.box_lr[0], 331 | box_right=config.box_lr[1], 332 | box_pad=config.gap_and_pad[1], 333 | ) 334 | 335 | 336 | def configure_matplotlib_styles() -> None: 337 | """ 338 | Apply consistent style settings across all charts. 
339 | """ 340 | plt.rcParams["figure.figsize"] = (10, 6) 341 | plt.rcParams["axes.labelsize"] = 12 342 | plt.rcParams["axes.titlesize"] = 14 343 | plt.rcParams["legend.fontsize"] = 12 344 | plt.rcParams["axes.grid"] = True 345 | 346 | 347 | def format_pace(value: float, _) -> str: 348 | """ 349 | Convert a time value in seconds into 'minutes:seconds' format. 350 | """ 351 | if not np.isfinite(value): 352 | return "" 353 | minutes = int(value // 60) 354 | seconds = int(value % 60) 355 | return f"{minutes}:{seconds:02d}" 356 | 357 | 358 | def classify_zone_dynamic(heart_rate: float, date_str: str) -> str: 359 | """ 360 | Classify heart rate into a dynamic training zone based on age at the run date. 361 | 362 | Zones are computed from a max HR of (220 - age) on the given date. 363 | """ 364 | try: 365 | run_date = pd.to_datetime(date_str) 366 | except (ValueError, TypeError): 367 | return "Unknown" 368 | 369 | age = run_date.year - DOB.year - ((run_date.month, run_date.day) < (DOB.month, DOB.day)) 370 | max_hr = 220 - age 371 | heart_pct = heart_rate / max_hr 372 | 373 | if heart_pct < 0.60: 374 | return "Z1 (<60%)" 375 | if heart_pct < 0.70: 376 | return "Z2 (60–70%)" 377 | if heart_pct < 0.80: 378 | return "Z3 (70–80%)" 379 | if heart_pct < 0.90: 380 | return "Z4 (80–90%)" 381 | return "Z5 (90–100%)" 382 | 383 | 384 | def prepare_pace_distance_data(splits_df: pd.DataFrame) -> pd.DataFrame: 385 | """ 386 | Aggregate and derive per-run pace metrics from individual split data. 387 | 388 | Adds: 389 | - pace_sec_km: seconds per kilometre for the run 390 | - distance_km, pace_sec, year 391 | """ 392 | splits = splits_df.copy() 393 | splits["pace_sec_km"] = splits["elapsed_time_s"] / (splits["distance_m"] / 1000) 394 | grouped_df = ( 395 | splits.groupby(["activity_id", "start_date_local"]) 396 | .agg({"distance_m": "sum", "elapsed_time_s": "sum"}) 397 | .reset_index() 398 | ) 399 | grouped_df["pace_sec_km"] = grouped_df["elapsed_time_s"] / (grouped_df["distance_m"] / 1000) 400 | grouped_df["distance_km"] = grouped_df["distance_m"] / 1000 401 | grouped_df["pace_sec"] = grouped_df["pace_sec_km"] 402 | grouped_df["year"] = pd.to_datetime(grouped_df["start_date_local"]).dt.year 403 | return grouped_df 404 | 405 | 406 | def prepare_time_distance_data(activities_df: pd.DataFrame) -> pd.DataFrame: 407 | """ 408 | Clean and enrich raw activities data for plotting time vs. distance trends. 409 | 410 | Adds: 411 | - distance_km, time_seconds, year, is_last_run 412 | and filters out very short activities (< 0.5 km). 413 | """ 414 | data = activities_df.copy() 415 | data["distance_km"] = data["distance_m"] / 1000.0 416 | data = data[data["distance_km"] >= 0.5] 417 | data["time_seconds"] = data["moving_time_s"] 418 | data["year"] = pd.to_datetime(data["start_date_local"]).dt.year 419 | last_run_date = pd.to_datetime(data["start_date_local"]).max() 420 | data["is_last_run"] = pd.to_datetime(data["start_date_local"]) == last_run_date 421 | return data 422 | 423 | 424 | def calculate_decay_point(data: pd.DataFrame) -> Tuple[float, float]: 425 | """ 426 | Compute an extrapolated decay point for visualising projected pacing trends. 
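
    Worked example (illustrative): with a maximum distance of 20 km and a maximum
    time of 7200 s (not necessarily from the same run), the average pace is
    7200 / 20 = 360 s/km, so the decay point is 22 km at 22 * (360 + 180) = 11880 s.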
427 | 428 | Returns: 429 | (decay_distance_km, decay_time_seconds) 430 | """ 431 | max_distance = data["distance_km"].max() 432 | max_time = data["time_seconds"].max() 433 | decay_distance = max_distance + 2 434 | average_pace = max_time / max_distance 435 | decay_time = decay_distance * (average_pace + 180) 436 | return decay_distance, decay_time 437 | 438 | 439 | def seconds_to_hms(value, _): 440 | """ 441 | Convert a numeric value (in seconds) to a HH:MM:SS formatted string. 442 | """ 443 | return str(datetime.timedelta(seconds=int(value))) 444 | 445 | 446 | def save_and_close_plot(output_path: str) -> None: 447 | """ 448 | Common helper to save matplotlib plots without switching layout engines. 449 | """ 450 | fig = plt.gcf() 451 | fig.savefig(output_path, dpi=150, bbox_inches="tight") 452 | plt.close(fig) 453 | 454 | 455 | def extract_year_month(dataframe: pd.DataFrame) -> pd.DataFrame: 456 | """ 457 | Add 'year' and 'month' columns based on 'start_date_local'. 458 | """ 459 | data = dataframe.copy() 460 | data["year"] = pd.to_datetime(data["start_date_local"]).dt.year 461 | data["month"] = pd.to_datetime(data["start_date_local"]).dt.month 462 | return data 463 | 464 | 465 | def prepare_activities_with_distance(activities_df: pd.DataFrame) -> pd.DataFrame: 466 | """ 467 | Copy and derive 'distance_km', 'year', 'month' from raw activities. 468 | """ 469 | if activities_df.empty: 470 | return pd.DataFrame() 471 | 472 | data = activities_df.copy() 473 | data["distance_km"] = data["distance_m"] / 1000.0 474 | data = extract_year_month(data) 475 | return data 476 | 477 | 478 | def prepare_1km_splits(splits_df: pd.DataFrame) -> pd.DataFrame: 479 | """ 480 | Filter splits to ~1 km and add 'distance_km' and 'year'. 481 | """ 482 | if splits_df.empty: 483 | return pd.DataFrame() 484 | 485 | data = splits_df.copy() 486 | data["distance_km"] = data["distance_m"] / 1000.0 487 | data = data[(data["distance_km"] >= 0.95) & (data["distance_km"] <= 1.05)] 488 | if data.empty: 489 | return pd.DataFrame() 490 | 491 | data["year"] = pd.to_datetime(data["start_date_local"]).dt.year 492 | return data 493 | 494 | 495 | def plot_with_common_setup( 496 | title: str, 497 | xlabel: str, 498 | ylabel: str, 499 | output_path: str, 500 | plot_func: Callable, 501 | *, 502 | attribution: Optional[str] = "Data sourced from Garmin (synced via Strava)", 503 | figsize: tuple[int, int] = (10, 5), 504 | ): 505 | """ 506 | Reusable wrapper to set up common plot structure and call the provided plot_func. 507 | 508 | Args: 509 | title: Figure title. 510 | xlabel: X-axis label. 511 | ylabel: Y-axis label. 512 | output_path: File path to save the figure. 513 | plot_func: Callable that accepts a Matplotlib axis and draws the plot. 514 | attribution: Optional attribution text for data source. 515 | figsize: Figure size in inches (width, height). 516 | """ 517 | fig, axis = plt.subplots(figsize=figsize, constrained_layout=True) 518 | plot_func(axis) 519 | axis.set_xlabel(xlabel) 520 | axis.set_ylabel(ylabel) 521 | axis.grid(True) 522 | add_title_with_attribution( 523 | fig, 524 | title, 525 | TitleBoxConfig(attribution=attribution), 526 | ) 527 | save_and_close_plot(output_path) 528 | 529 | 530 | def prepare_dated_activities(activities_df: pd.DataFrame) -> pd.DataFrame: 531 | """ 532 | Prepare an activities DataFrame for time series plotting. 533 | 534 | Adds a sorted 'start_date' timestamp column. 
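
    Example (illustrative, not a doctest):

        data = prepare_dated_activities(activities_df)
        # Rows come back sorted chronologically with 'distance_km', 'year',
        # 'month' and a parsed 'start_date' column available for plotting.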
535 | """ 536 | if activities_df.empty: 537 | return pd.DataFrame() 538 | data = prepare_activities_with_distance(activities_df) 539 | data["start_date"] = pd.to_datetime(data["start_date_local"]) 540 | return data.sort_values("start_date") 541 | 542 | 543 | def label_month_axis(axis): 544 | """ 545 | Apply consistent x-axis formatting for month-based plots. 546 | """ 547 | axis.set_xticks(range(1, 13)) 548 | axis.set_xticklabels(calendar.month_abbr[1:13], rotation=45) 549 | 550 | 551 | def label_month_axis_barplot(axis): 552 | """ 553 | Apply consistent x-axis formatting for month-based (bar) plots. 554 | """ 555 | axis.set_xticks(np.arange(12) + 0.5) 556 | axis.set_xticklabels(calendar.month_abbr[1:13], rotation=45) 557 | --------------------------------------------------------------------------------