├── .devcontainer └── devcontainer.json ├── .env.example ├── .github ├── ISSUE_TEMPLATE │ ├── clarify-concept.md │ ├── questions-about-the-course-material.md │ └── technical-troubleshooting-or-bugs.md ├── actions │ └── setup │ │ └── action.yaml └── workflows │ └── ml_pipelines.yaml ├── .gitignore ├── .python-version ├── INSTALL_AND_USAGE.md ├── LICENSE ├── Makefile ├── README.md ├── assets ├── 4_stage_recommender_architecture.png ├── github_actions_manual_trigger.png ├── github_actions_pipeline_done.png ├── github_actions_pipeline_progress.png ├── github_actions_secrets.png ├── hopsworks.png ├── hopsworks_deployments.png ├── streamlit_choose_advanced_settings.png ├── streamlit_choose_app_type.png ├── streamlit_choose_main_settings.png ├── system_architecture.png ├── two_tower_embedding_model.png └── ui_example.png ├── notebooks ├── 1_fp_computing_features.ipynb ├── 2_tp_training_retrieval_model.ipynb ├── 3_tp_training_ranking_model.ipynb ├── 4_ip_computing_item_embeddings.ipynb ├── 5_ip_creating_deployments.ipynb ├── 6_scheduling_materialization_jobs.ipynb └── 7_ip_creating_deployments_llm_ranking.ipynb ├── packages.txt ├── pyproject.toml ├── recsys ├── __init__.py ├── config.py ├── features │ ├── __init__.py │ ├── articles.py │ ├── customers.py │ ├── embeddings.py │ ├── interaction.py │ ├── ranking.py │ └── transactions.py ├── hopsworks_integration │ ├── __init__.py │ ├── constants.py │ ├── feature_store.py │ ├── llm_ranker │ │ └── requirements.txt │ ├── llm_ranking_serving.py │ ├── ranking_serving.py │ └── two_tower_serving.py ├── inference │ ├── __init__.py │ ├── llm_ranking_predictor.py │ ├── query_transformer.py │ ├── ranking_predictor.py │ └── ranking_transformer.py ├── raw_data_sources │ ├── __init__.py │ └── h_and_m.py ├── training │ ├── __init__.py │ ├── ranking.py │ └── two_tower.py └── ui │ ├── __init__.py │ ├── feature_group_updater.py │ ├── interaction_tracker.py │ ├── recommenders.py │ └── utils.py ├── streamlit_app.py ├── tools └── clean_hopsworks_resources.py └── uv.lock /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3", 3 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 4 | "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye", 5 | "customizations": { 6 | "codespaces": { 7 | "openFiles": [ 8 | "README.md", 9 | "streamlit_app.py" 10 | ] 11 | }, 12 | "vscode": { 13 | "settings": {}, 14 | "extensions": [ 15 | "ms-python.python", 16 | "ms-python.vscode-pylance" 17 | ] 18 | } 19 | }, 20 | "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y 2 | 3 | 4 | 5 | Decoding ML Logo 6 | 7 | 8 | 9 |
10 |

📬 Stay Updated

11 |

Join Decoding ML for proven content on designing, coding, and deploying production-grade AI systems with software engineering and MLOps best practices to help you ship AI applications. Every week, straight to your inbox.

12 |
13 | 14 | 15 | 16 | 17 |

18 | 19 | Subscribe Now 20 | 21 |

22 | 23 | ------ 24 | 25 | # 🚀 Installation and Usage Guide 26 | 27 | This guide will help you set up and run a machine learning pipeline that includes feature engineering, model training, and deployment using Hopsworks and OpenAI. 28 | 29 | # 📑 Table of Contents 30 | 31 | - [📋 Prerequisites](#-prerequisites) 32 | - [🎯 Getting Started](#-getting-started) 33 | - [⚡️ Running the H&M Personalized Recommender](#️-running-the-hm-personalized-recommender) 34 | - [🤖 Running the ML Pipelines in GitHub Actions](#-running-the-ml-pipelines-in-github-actions) 35 | - [🌐 Live Demo](#-live-demo) 36 | - [☁️ Deploying the Streamlit App](#️-deploying-the-streamlit-app) 37 | 38 | # 📋 Prerequisites 39 | 40 | ## Local Tools 41 | You'll need the following tools installed locally: 42 | 43 | | Tool | Version | Purpose | Installation Link | 44 | |------|---------|---------|------------------| 45 | | Python | 3.11 | Programming language runtime | [Download](https://www.python.org/downloads/) | 46 | | uv | ≥ 0.4.30 | Python package installer and virtual environment manager | [Download](https://github.com/astral-sh/uv) | 47 | | GNU Make | ≥ 3.81 | Build automation tool | [Download](https://www.gnu.org/software/make/) | 48 | | Git | ≥2.44.0 | Version control | [Download](https://git-scm.com/downloads) 49 | 50 | ## Cloud Services 51 | The project requires access to these cloud services: 52 | 53 | | Service | Purpose | Cost | Required Credentials | Setup Guide | 54 | |---------|---------|------|---------------------|-------------| 55 | | [Hopsworks](https://rebrand.ly/serverless-github) | AI Lakehouse for feature store, model registry, and serving | Free tier available | `HOPSWORKS_API_KEY` | [Create API Key](https://docs.hopsworks.ai/latest/user_guides/projects/api_key/create_api_key/) | 56 | | [GitHub Actions](https://github.com/features/actions) | Compute & Automation | Free for public repos | - | - | 57 | | [OpenAI API](https://openai.com/index/openai-api/) | LLM API for recommender system | Pay-per-use | `OPENAI_API_KEY` | [Quick Start Guide](https://platform.openai.com/docs/quickstart) | 58 | 59 | # 🎯 Getting Started 60 | 61 | ## 1. Clone the Repository 62 | 63 | Start by cloning the repository and navigating to the project directory: 64 | ``` 65 | git clone https://github.com/decodingml/personalized-recommender-course.git 66 | cd personalized-recommender-course 67 | ``` 68 | 69 | Next, we have to prepare your Python environment and its adjacent dependencies. 70 | 71 | ## 2. Installation 72 | 73 | Set up the project environment by running the following: 74 | ```bash 75 | make install 76 | ``` 77 | Test that you have Python 3.11.8 installed in your new `uv` environment: 78 | ```bash 79 | uv run python --version 80 | # Output: Python 3.11.8 81 | ``` 82 | 83 | This command will: 84 | - Create a virtual environment using `uv` 85 | - Activate the virtual environment 86 | - Install all dependencies from `pyproject.toml` 87 | 88 | > [!NOTE] 89 | > Normally, `uv` will pick the right Python version mentioned in `.python-version` and install it automatically if it is not on your system. If you are having any issues, explicitly install the right Python version by running `make install-python` 90 | 91 | ## 3. Environment Configuration 92 | 93 | Before running any components: 94 | 1. Create your environment file: 95 | ```bash 96 | cp .env.example .env 97 | ``` 98 | 2. Open `.env` and configure the required credentials following the inline comments and the recommendations from the [Cloud Services](#-prerequisites) section. 
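
For reference, a filled-in `.env` needs at least the two credentials listed in the [Cloud Services](#-prerequisites) table. A minimal sketch with placeholder values (replace them with your own keys):

```bash
# .env — illustrative placeholders, not real keys
HOPSWORKS_API_KEY=<your-hopsworks-api-key>
OPENAI_API_KEY=<your-openai-api-key>  # used only by the optional LLM ranking components
```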
99 | 100 | # ⚡️ Running the H&M Personalized Recommender 101 | 102 | ## Notebooks 103 | 104 | For instructions on exploring the Notebooks, check out the [📚 Course](https://github.com/decodingml/personalized-recommender-course?tab=readme-ov-file#-course-outline) section from the main [README](https://github.com/decodingml/personalized-recommender-course?tab=readme-ov-file#-course-outline). 105 | 106 | ## Running the ML Pipelines 107 | 108 | You can run the entire pipeline at once or execute individual components. 109 | 110 | ### Running Everything in One Go (Quick) 111 | 112 | Execute all the ML pipelines in a sequence: 113 | ```bash 114 | make all 115 | ``` 116 | It will take ~1.5 hours to run, depending on your machine. 117 | 118 | This runs the following steps: 119 | 1. Feature engineering 120 | 2. Retrieval model training 121 | 3. Ranking model training 122 | 4. Candidate embeddings creation 123 | 5. Inference pipeline deployment 124 | 6. Materialization job scheduling 125 | 126 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Deployments** 127 | 128 | Start the Streamlit UI: 129 | ```bash 130 | make start-ui 131 | ``` 132 | Accessible at `http://localhost:8501/` 133 | 134 | ### Running Individual Components (Recommended) 135 | 136 | Each component can be run separately: 137 | 138 | 1. **Feature Engineering** 139 | ```bash 140 | make feature-engineering 141 | ``` 142 | It will take ~1 hour to run, depending on your machine. 143 | 144 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Feature Store → Feature Groups** 145 | 146 | 2. **Retrieval Model Training** 147 | ```bash 148 | make train-retrieval 149 | ``` 150 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Model Registry** 151 | 152 | 3. **Ranking Model Training** 153 | ```bash 154 | make train-ranking 155 | ``` 156 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Model Registry** 157 | 158 | 4. **Embeddings Creation** 159 | ```bash 160 | make create-embeddings 161 | ``` 162 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Feature Store → Feature Groups** 163 | 164 | 5. **Deployment Creation** 165 | ```bash 166 | make create-deployments 167 | ``` 168 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Deployments** 169 | 170 |

171 | 172 | hopsworks_deployments 173 | 174 |

175 | 176 | Start the Streamlit UI: 177 | ```bash 178 | make start-ui 179 | ``` 180 | Accessible at `http://localhost:8501/` 181 | 182 | > [!IMPORTANT] 183 | > The demo is in 0-cost mode, which means that when there is no traffic, the deployment scales to 0 instances. The first time you interact with it, give it 1-2 minutes to warm up to 1+ instances. Afterward, everything will become smoother. 184 | 185 | 6. **Materialization Job Scheduling** 186 | ```bash 187 | make schedule-materialization-jobs 188 | ``` 189 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Compute → Ingestions** 190 | 191 | 7. **Deployment Creation with LLM Ranking (Optional)** 192 | 193 | Optional step to replace the standard deployments (created in Step 5) with the ones powered by LLMs: 194 | ```bash 195 | make create-deployments-llm-ranking 196 | ``` 197 | **NOTE**: If the script fails, go to [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Deployments**, forcefully stop all the deployments and run again. 198 | 199 | > [!WARNING] 200 | > The LLM Ranking deployment overrides the deployment from **5. Deployment Creation** 201 | 202 | Start the Streamlit UI that interfaces the LLM deployment: 203 | ```bash 204 | make start-ui-llm-ranking 205 | ``` 206 | Accessible at `http://localhost:8501/` 207 | 208 | > [!WARNING] 209 | > The Streamlit UI command is compatible only with its corresponding deployment. For example, running the deployment from **5. Deployment Creation** and `make start-ui-llm-ranking` won't work. 210 | 211 | ## Clean Up Resources 212 | 213 | Remove all created resources from [Hopsworks Serverless](https://rebrand.ly/serverless-github): 214 | ```bash 215 | make clean-hopsworks-resources 216 | ``` 217 | 218 | ### 🚨 Important Notes 219 | - Ensure UV is properly installed and configured before running any commands 220 | - All notebooks are executed using IPython through the UV virtual environment 221 | - Components should be run in the specified order when executing individually 222 | 223 | # 🤖 Running the ML Pipelines in GitHub Actions 224 | 225 | This project supports running ML pipelines automatically through GitHub Actions, providing an alternative to local or Colab execution. 226 | 227 | > [!NOTE] 228 | > This is handy when getting network errors, such as timeouts, on your local machine. GitHub Actions has an enterprise-level network that will run your ML pipelines smoothly. 229 | 230 | ## Pipeline Triggers 231 | 232 | The ML pipelines can be triggered in three ways: 233 | - Manual trigger through GitHub UI 234 | - Scheduled execution (configurable) 235 | - On push to main branch (configurable) 236 | 237 | ## Setup Process 238 | 239 | ### 1. Fork Repository 240 | Create your own copy of the repository to access GitHub Actions: 241 | ```bash 242 | # Use GitHub's UI to fork the repository 243 | https://github.com/original-repo/name → Your-Username/name 244 | ``` 245 | [📚 GitHub Fork Guide](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) 246 | 247 | ### 2. Configure Secrets 248 | Set up required environment variables as GitHub Actions secrets: 249 | 250 | **Option A: Using GitHub UI** 251 | 1. Navigate to: Repository → Settings → Secrets and variables → Actions 252 | 2. Click "New repository secret" 253 | 3. 
Add required secrets: 254 | - `HOPSWORKS_API_KEY` 255 | - `OPENAI_API_KEY` 256 | 257 | [📚 Set up GitHub Actions Secrets Guide](https://docs.github.com/en/actions/security-for-github-actions/security-guides/using-secrets-in-github-actions?tool=webui) 258 | 259 |

260 | 261 | GA Secrets 262 | 263 |

264 | 265 | **Option B: Using GitHub CLI** 266 | 267 | If you have `GitHub CLI` installed, instead of settings the GitHub Actions secrets manually, you can set them by running the following: 268 | 269 | ```bash 270 | gh secret set HOPSWORKS_API_KEY 271 | gh secret set OPENAI_API_KEY 272 | ``` 273 | 274 | ### 3. Execute Pipeline 275 | 276 | #### Manual Execution 277 | 1. Go to Actions → ML Pipelines 278 | 2. Click "Run workflow" 279 | 3. Select branch (default: main) 280 | 4. Click "Run workflow" 281 | 282 |

283 | 284 | GA Manual Trigger 285 | 286 |

287 | 288 | After triggering the pipeline, you will see it running, signaled by a yellow circle. Click on it to see the progress. 289 | 290 |

291 | 292 | GA Progress 293 | 294 |

295 | 296 | After it is finished, it should look like this: 297 | 298 |

299 | 300 | GA Done 301 | 302 |

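
If you prefer the terminal over the GitHub UI, the same `workflow_dispatch` trigger can be fired with the GitHub CLI (assuming `gh` is installed and authenticated against your fork):

```bash
# Trigger the ML pipelines workflow on the main branch of your fork
gh workflow run ml_pipelines.yaml --ref main

# Follow the latest run's progress from the terminal
gh run watch
```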
303 | 304 | #### Automated Execution 305 | 306 | Another option is to run the ML pipelines automatically on a schedule or when new commits are pushed to the main branch. 307 | 308 | Edit `.github/workflows/ml_pipelines.yaml` to enable automatic triggers: 309 | 310 | ```yaml 311 | name: ML Pipelines 312 | 313 | on: 314 | # schedule: # Uncomment to run the pipelines every 2 hours. All the pipelines take ~1.5 hours to run. 315 | # - cron: '0 */2 * * *' 316 | # push: # Uncomment to run pipelines on every new commit to main 317 | # branches: 318 | # - main 319 | workflow_dispatch: # Allows manual triggering from GitHub UI 320 | ``` 321 | 322 | ## Monitoring & Results 323 | 324 | 1. **Pipeline Progress** 325 | - View real-time execution in Actions tab 326 | - Each step shows detailed logs and status 327 | 328 | 2. **Output Verification** 329 | - Access results in [Hopsworks Serverless](https://rebrand.ly/serverless-github) 330 | - Check Feature Groups, Feature Views, Model Registry, and Deployments 331 | 332 | ## ⚠️ Important Notes 333 | - Full pipeline execution takes approximately 1.5 hours 334 | - Ensure sufficient GitHub Actions minutes available 335 | - Monitor usage when enabling automated triggers 336 | 337 | # 🌐 Live Demo 338 | 339 | Try out our deployed H&M real-time personalized recommender to see what you'll learn to build by the end of this course: 340 | [💻 Live H&M Recommender Streamlit Demo](https://decodingml-hands-on-personalized-recommender.streamlit.app/) 341 | 342 | > [!IMPORTANT] 343 | > The demo is in 0-cost mode, which means that when there is no traffic, the deployment scales to 0 instances. The first time you interact with it, give it 1-2 minutes to warm up to 1+ instances. Afterward, everything will become smoother. 344 | 345 |

346 | 347 | UI Example 348 | 349 |

350 | 351 | # ☁️ Deploying the Streamlit App 352 | 353 | Deploying a Streamlit App to their [cloud](https://streamlit.io/cloud) is free and straightforward after the GitHub repository is set in right place: 354 | 355 | - `uv.lock` - installing Python dependencies 356 | - `packages.txt` - installing system dependencies 357 | - `streamlit_app.py` - entrypoint to the Streamlit application 358 | 359 | ## Deployment Steps 360 | 361 | ### 1. Repository Setup 362 | Fork the repository if you haven't already: 363 | ```bash 364 | # Use GitHub's UI to fork the repository 365 | https://github.com/original-repo/name → Your-Username/name 366 | ``` 367 | [📚 GitHub Fork Guide](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) 368 | 369 | ### 2. Streamlit Cloud Setup 370 | 1. Create a free account on [Streamlit Cloud](https://docs.streamlit.io/deploy/streamlit-community-cloud/get-started) 371 | 2. Navigate to [New App Deployment](https://docs.streamlit.io/deploy/streamlit-community-cloud/deploy-your-app) 372 | 3. Configure deployment settings: 373 | 374 | | Setting | Configuration | Description | 375 | |---------|--------------|-------------| 376 | | App Type | ![App Type](assets/streamlit_choose_app_type.png) | Select "Deploy a public app from GitHub" | 377 | | Main Settings | ![Main Settings](assets/streamlit_choose_main_settings.png) | Configure your repository | 378 | | Advanced Settings | ![Advanced Settings](assets/streamlit_choose_advanced_settings.png) | Set Python 3.11 and `HOPSWORKS_API_KEY` | 379 | 380 | ## ⚠️ Important Notes 381 | - Ensure all required files are present in your repository 382 | - Python version must be set to 3.11 383 | - `HOPSWORKS_API_KEY` must be configured in environment variables 384 | - Repository must be public for free tier deployment 385 | 386 | [📚 More on Streamlit Cloud deployments](https://docs.streamlit.io/deploy) 387 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Crafted Intelligence SRL 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install-python: 2 | uv python install 3 | 4 | install: 5 | uv venv 6 | . 
.venv/bin/activate 7 | uv pip install --all-extras --requirement pyproject.toml 8 | 9 | start-ui: 10 | RANKING_MODEL_TYPE=ranking uv run python -m streamlit run streamlit_app.py 11 | 12 | start-ui-llm-ranking: 13 | RANKING_MODEL_TYPE=llmranking uv run python -m streamlit run streamlit_app.py 14 | 15 | clean-hopsworks-resources: 16 | uv run python tools/clean_hopsworks_resources.py 17 | 18 | all: feature-engineering train-retrieval train-ranking create-embeddings create-deployments schedule-materialization-jobs 19 | 20 | feature-engineering: 21 | uv run ipython notebooks/1_fp_computing_features.ipynb 22 | 23 | train-retrieval: 24 | uv run ipython notebooks/2_tp_training_retrieval_model.ipynb 25 | 26 | train-ranking: 27 | uv run ipython notebooks/3_tp_training_ranking_model.ipynb 28 | 29 | create-embeddings: 30 | uv run ipython notebooks/4_ip_computing_item_embeddings.ipynb 31 | 32 | create-deployments: 33 | uv run ipython notebooks/5_ip_creating_deployments.ipynb 34 | 35 | schedule-materialization-jobs: 36 | uv run ipython notebooks/6_scheduling_materialization_jobs.ipynb 37 | 38 | create-deployments-llm-ranking: 39 | uv run ipython notebooks/7_ip_creating_deployments_llm_ranking.ipynb 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |

Hands-on H&M Real-Time Personalized Recommender

3 |

Open-source course by Decoding ML in collaboration with Hopsworks.

4 |
5 | 6 |
7 | 8 |

9 | 10 | Architecture 11 | 12 |

13 | 14 | ## 🎯 What You'll Learn 15 | 16 | This hands-on course teaches you how to build and deploy a real-time personalized recommender system for H&M fashion articles. You'll learn: 17 | 18 | - To architect a modern ML system for real-time personalized recommenders. 19 | - To do feature engineering using modern tools such as Polars. 20 | - To design and train ML models for recommender systems powered by neural networks. 21 | - To use MLOps best practices by leveraging [Hopsworks AI Lakehouse](https://rebrand.ly/homepage-github). 22 | - To deploy the recommender on a Kubernetes cluster managed by [Hopsworks Serverless](https://rebrand.ly/serverless-github) using KServe. 23 | - To apply LLM techniques for personalized recommendations. 24 | 25 |

26 | 4_stage_recommender_architecture 27 | two_tower_embedding_model 28 |

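
The two-tower model shown above comes down to one idea: a query tower embeds customers and a candidate tower embeds articles into the same vector space, so finding good candidates becomes a nearest-neighbour search over item embeddings. The course's actual implementation lives in `recsys/training/two_tower.py`; the snippet below is only a minimal sketch of that idea with TensorFlow Recommenders (feature names and dimensions are illustrative):

```python
import tensorflow as tf
import tensorflow_recommenders as tfrs


class TwoTowerSketch(tfrs.Model):
    """Minimal two-tower retrieval model: not the course code, just the core idea."""

    def __init__(self, customer_ids: list[str], article_ids: list[str], emb_dim: int = 16):
        super().__init__()
        # Query tower: customer id -> embedding in the shared space.
        self.query_tower = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=customer_ids),
            tf.keras.layers.Embedding(len(customer_ids) + 1, emb_dim),
        ])
        # Candidate tower: article id -> embedding in the same space.
        self.item_tower = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=article_ids),
            tf.keras.layers.Embedding(len(article_ids) + 1, emb_dim),
        ])
        # Retrieval task: pull embeddings of observed (customer, article) pairs together.
        self.task = tfrs.tasks.Retrieval()

    def compute_loss(self, features: dict, training: bool = False) -> tf.Tensor:
        query_embeddings = self.query_tower(features["customer_id"])
        item_embeddings = self.item_tower(features["article_id"])
        return self.task(query_embeddings, item_embeddings)
```

Training this on (customer, purchased article) pairs is what lets the first retrieval stage of the 4-stage design run as a fast vector search instead of scoring every article.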
29 | 30 | ## 📖 About This Course 31 | 32 | This course is part of Decoding ML's open-source series, where we provide free hands-on resources for building GenAI and recommender systems. 33 | 34 | The **Hands-on H&M Real-Time Personalized Recommender**, in collaboration with [Hopsworks](https://rebrand.ly/homepage-github), is a 5-module course backed up by code, Notebooks and lessons that will teach you how to build an H&M real-time personalized recommender from scratch. 35 | 36 | By the end of this course, you will know how to architect, build and deploy a modern recommender. 37 | 38 | **What you'll do:** 39 | 40 | 1. Architect a scalable and modular ML system using the Feature/Training/Inference (FTI) architecture. 41 | 3. Feature engineering on top of our H&M data for collaborative and content-based filtering techniques for recommenders. 42 | 2. Use the two-tower network to Create user and item embeddings in the same vector space. 43 | 3. Implement an H&M real-time personalized recommender using the 4-stage recommender design and a vector database. 44 | 4. Use MLOps best practices, such as a feature store and a model registry. 45 | 5. Deploy the online inference pipeline to Kubernetes using KServe. 46 | 6. Deploy the offline ML pipelines to GitHub Actions. 47 | 7. Implement a web interface using Streamlit. 48 | 8. Improve the H&M real-time personalized recommender using LLMs. 49 | 50 | 🥷 With these skills, you'll become a ninja in building real-time personalized recommenders. 51 | 52 | ## 🌐 Live Demo 53 | 54 | Try out our deployed H&M real-time personalized recommender to see what you'll learn to build by the end of this course: 55 | [💻 Live H&M Recommender Streamlit Demo](https://decodingml-hands-on-personalized-recommender.streamlit.app/) 56 | 57 | > [!IMPORTANT] 58 | > The demo is in 0-cost mode, which means that when there is no traffic, the deployment scales to 0 instances. The first time you interact with it, give it 1-2 minutes to warm up to 1+ instances. Afterward, everything will become smoother. 59 | 60 |

61 | 62 | UI Example 63 | 64 |

65 | 66 | ---- 67 | 68 | 69 | 70 | 75 | 81 | 82 |
71 | 72 | Decoding ML Logo 73 | 74 | 76 |
77 |

📬 Stay Updated

78 |

Join Decoding ML for proven content on designing, coding, and deploying production-grade AI systems with software engineering and MLOps best practices to help you ship AI applications. Every week, straight to your inbox.

79 |
80 |
83 | 84 |

85 | 86 | Subscribe Now 87 | 88 |

89 | 90 | ## 👥 Who Should Join? 91 | 92 | **This course is ideal for:** 93 | - ML/AI engineers interested in building production-ready recommender systems 94 | - Data Engineers, Data Scientists, and Software Engineers wanting to understand the engineering behind recommenders 95 | 96 | **Note:** This course focuses on engineering practices and end-to-end system implementation rather than theoretical model optimization or research. 97 | 98 | ## 🎓 Prerequisites 99 | 100 | | Category | Requirements | 101 | |----------|-------------| 102 | | **Skills** | Basic understanding of Python and Machine Learning | 103 | | **Hardware** | Any modern laptop/workstation will do the job (no GPU or powerful computing power required). We also support Google Colab or GitHub Actions for compute.| 104 | | **Level** | Intermediate | 105 | 106 | 107 | ## 💰 Cost Structure 108 | 109 | All tools used throughout the course will stick to their free tier, except OpenAI's API, as follows: 110 | 111 | - Modules 1-4: Completely free 112 | - Module 5 (Optional): ~$1-2 for OpenAI API usage when building LLM-enhanced recommenders 113 | 114 | ## 🥂 Open-source Course: Participation is Open and Free 115 | 116 | As an open-source course, you don't have to enroll. Everything is self-paced, free of charge and with its resources freely accessible as follows: 117 | - **code**: this GitHub repository 118 | - **articles**: [Decoding ML](https://decodingml.substack.com/p/the-ultimate-recommender-system-framework) 119 | 120 | ## 📚 Course Outline 121 | 122 | This **open-source course consists of 5 comprehensive modules** covering theory, system design, and hands-on implementation. 123 | 124 | Our recommendation for each module: 125 | 1. Read the article 126 | 2. Run the Notebook to replicate our results (locally or on Colab) 127 | 3. Following the Notebook, go deeper into the code by reading the `recsys` Python module 128 | 129 | > [!NOTE] 130 | > Check the [INSTALL_AND_USAGE](https://github.com/decodingml/hands-on-personalized-recommender/blob/main/INSTALL_AND_USAGE.md) doc for a step-by-step installation and usage guide. 131 | 132 | | Module | Article | Description | Notebooks | 133 | |--------|-------|-------------|----------------| 134 | | 1 | [Building a TikTok-like recommender](https://decodingml.substack.com/p/33d3273e-b8e3-4d98-b160-c3d239343022) | Learn how to architect a recommender system using the 4-stage architecture and two-tower network. | **No code** | 135 | | 2 | [Feature pipelines for TikTok-like recommenders](https://decodingml.substack.com/p/feature-pipeline-for-tiktok-like) | Learn how to build a scalable feature pipeline using a feature store. | •[1_fp_computing_features.ipynb](notebooks/1_fp_computing_features.ipynb) | 136 | | 3 | [Training pipelines for TikTok-like recommenders](https://decodingml.substack.com/p/training-pipelines-for-tiktok-like) | Learn to train and evaluate the two-tower network and ranking model using MLOps best practices. | •[2_tp_training_retrieval_model.ipynb](notebooks/2_tp_training_retrieval_model.ipynb)
•[3_tp_training_ranking_model.ipynb](notebooks/3_tp_training_ranking_model.ipynb) | 137 | | 4 | [Deploy scalable TikTok-like recommenders](https://decodingml.substack.com/p/deploy-scalable-tiktok-like-recommenders) | Learn how to architect and deploy the inference pipelines for real-time recommendations using the 4-stage design. | •[4_ip_computing_item_embeddings.ipynb](notebooks/4_ip_computing_item_embeddings.ipynb)
•[5_ip_creating_deployments.ipynb](notebooks/5_ip_creating_deployments.ipynb)
•[6_scheduling_materialization_jobs.ipynb](notebooks/6_scheduling_materialization_jobs.ipynb) | 138 | | 5 | [Using LLMs to build TikTok-like recommenders](https://decodingml.substack.com/p/using-llms-to-build-tiktok-like-recommenders) | Learn how to enhance the H&M personalized recommender with LLMs. | •[7_ip_creating_deployments_llm_ranking.ipynb](notebooks/7_ip_creating_deployments_llm_ranking.ipynb) | 139 | 140 | ### Google Colab 141 | 142 | To run the Notebooks in Google Colab, copy-paste them into your Google Drive, open them with Google Colab, and run them as running them locally. At the beginning of each Notebook, we have a set of setup steps that will **prepare the code and Python environment automatically**. 143 | 144 | ---- 145 | 146 | 147 | 148 | 153 | 159 | 160 |
149 | 150 | Decoding ML Logo 151 | 152 | 154 |
155 |

📬 Stay Updated

156 |

Join Decoding ML for proven content on designing, coding, and deploying production-grade AI systems with software engineering and MLOps best practices to help you ship AI applications. Every week, straight to your inbox.

157 |
158 |
161 | 162 |

163 | 164 | Subscribe Now 165 | 166 |

167 | 168 | ## 🏗️ Project Structure 169 | 170 | At Decoding ML we teach how to build production ML systems, thus the course follows the structure of a real-world Python project: 171 | 172 | ```bash 173 | . 174 | ├── notebooks/ # Jupyter notebooks for each pipeline 175 | ├── recsys/ # Core recommender system package 176 | │ ├── config.py # Configuration and settings 177 | │ ... 178 | │ └── training/ # Training pipelines code 179 | ├── tools/ # Utility scripts 180 | ├── streamlit_app.py # Streamlit app entry point 181 | ├── .env.example # Example environment variables template 182 | ├── Makefile # Commands to install and run the project 183 | ├── pyproject.toml # Project dependencies 184 | ``` 185 | 186 | ## 👔 Dataset 187 | 188 | We will use the [H&M Personalized Fashion Recommendations](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations) dataset, available on Kaggle, open-source for academic research and education. 189 | 190 | It is an e-commerce dataset that contains fashion articles from the H&M clothes brand. 191 | 192 | It contains: 193 | - 105k articles 194 | - 137k customers 195 | - 31 million transactions 196 | 197 | More on the dataset in the feature engineering pipeline [Notebook](notebooks/1_fp_computing_features.ipynb) and [article](https://decodingml.substack.com/p/feature-pipeline-for-tiktok-like). 198 | 199 | ## 🚀 Getting Started 200 | 201 | For detailed installation and usage instructions, see our [INSTALL_AND_USAGE](https://github.com/decodingml/hands-on-personalized-recommender/blob/main/INSTALL_AND_USAGE.md) guide. 202 | 203 | **Recommendation:** While you can follow the installation guide directly, we strongly recommend reading the accompanying articles to gain a complete understanding of the recommender system. 204 | 205 | ## 💡 Questions and Troubleshooting 206 | 207 | Have questions or running into issues? We're here to help! 208 | 209 | Open a [GitHub issue](https://github.com/decodingml/hands-on-personalized-recommender/issues) for: 210 | - Questions about the course material 211 | - Technical troubleshooting 212 | - Clarification on concepts 213 | 214 | When having issues with [Hopsworks Serverless](https://rebrand.ly/serverless-github), the best place to ask questions is on [Hopsworks's Slack](https://join.slack.com/t/public-hopsworks/shared_invite/zt-1uf21vitz-rhHKNdIf8GEiOf1EJ6Wzsw), where their engineers can help you directly. 215 | 216 | ## 🥂 Contributing 217 | 218 | As an open-source course, we may not be able to fix all the bugs that arise. 219 | 220 | If you find any bugs and know how to fix them, support future readers by contributing to this course with your bug fix. 221 | 222 | We will deeply appreciate your support for the AI community and future readers 🤗 223 | 224 | ## Sponsors 225 | 226 | 227 | 228 | 231 | 232 | 233 | 238 | 239 |
229 | Hopsworks 230 |
234 | 235 | Hopsworks 236 | 237 |
240 | 241 | ## Contributors 242 | 243 | 244 | 245 | 252 | 259 | 266 | 273 | 274 |
246 | 247 | Paul Iusztin
248 | Paul Iusztin 249 |

250 | AI/ML Engineer 251 |
253 | 254 | Anca Ioana Muscalagiu
255 | Anca Ioana Muscalagiu 256 |

257 | AI/ML Engineer 258 |
260 | 261 | Paolo Perrone
262 | Paolo Perrone 263 |

264 | AI/ML Engineer 265 |
267 | 268 | Hopsworks
269 | Hopsworks's Engineering Team 270 |

271 | AI Lakehouse 272 |
275 | 276 | 277 | ## License 278 | 279 | This course is an open-source project released under the MIT license. Thus, as long you distribute our LICENSE and acknowledge your project is based on our work, you can safely clone or fork this project and use it as a source of inspiration for your educational projects (e.g., university, college degree, personal projects, etc.). 280 | 281 | ---- 282 | 283 | 284 | 285 | 290 | 296 | 297 |
286 | 287 | Decoding ML Logo 288 | 289 | 291 |
292 |

📬 Stay Updated

293 |

Join Decoding ML for proven content on designing, coding, and deploying production-grade AI systems with software engineering and MLOps best practices to help you ship AI applications. Every week, straight to your inbox.

294 |
295 |
298 | 299 |

300 | 301 | Subscribe Now 302 | 303 |

304 | -------------------------------------------------------------------------------- /assets/4_stage_recommender_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/4_stage_recommender_architecture.png -------------------------------------------------------------------------------- /assets/github_actions_manual_trigger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/github_actions_manual_trigger.png -------------------------------------------------------------------------------- /assets/github_actions_pipeline_done.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/github_actions_pipeline_done.png -------------------------------------------------------------------------------- /assets/github_actions_pipeline_progress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/github_actions_pipeline_progress.png -------------------------------------------------------------------------------- /assets/github_actions_secrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/github_actions_secrets.png -------------------------------------------------------------------------------- /assets/hopsworks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/hopsworks.png -------------------------------------------------------------------------------- /assets/hopsworks_deployments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/hopsworks_deployments.png -------------------------------------------------------------------------------- /assets/streamlit_choose_advanced_settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/streamlit_choose_advanced_settings.png -------------------------------------------------------------------------------- /assets/streamlit_choose_app_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/streamlit_choose_app_type.png -------------------------------------------------------------------------------- /assets/streamlit_choose_main_settings.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/streamlit_choose_main_settings.png -------------------------------------------------------------------------------- /assets/system_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/system_architecture.png -------------------------------------------------------------------------------- /assets/two_tower_embedding_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/two_tower_embedding_model.png -------------------------------------------------------------------------------- /assets/ui_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/ui_example.png -------------------------------------------------------------------------------- /notebooks/5_ip_creating_deployments.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import time\n", 10 | "\n", 11 | "notebook_start_time = time.time()" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Set up environment" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "⛳️ Local environment\n", 31 | "Adding the following directory to the PYTHONPATH: /Users/pauliusztin/Documents/01_projects/hopsworks_recsys/hands-on-recommender-system\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "import sys\n", 37 | "from pathlib import Path\n", 38 | "\n", 39 | "\n", 40 | "def is_google_colab() -> bool:\n", 41 | " if \"google.colab\" in str(get_ipython()):\n", 42 | " return True\n", 43 | " return False\n", 44 | "\n", 45 | "\n", 46 | "def clone_repository() -> None:\n", 47 | " !git clone https://github.com/decodingml/hands-on-recommender-system.git\n", 48 | " %cd hands-on-recommender-system/\n", 49 | "\n", 50 | "\n", 51 | "def install_dependencies() -> None:\n", 52 | " !pip install --upgrade uv\n", 53 | " !uv pip install --all-extras --system --requirement pyproject.toml\n", 54 | "\n", 55 | "\n", 56 | "if is_google_colab():\n", 57 | " clone_repository()\n", 58 | " install_dependencies()\n", 59 | "\n", 60 | " root_dir = str(Path().absolute())\n", 61 | " print(\"⛳️ Google Colab environment\")\n", 62 | "else:\n", 63 | " root_dir = str(Path().absolute().parent)\n", 64 | " print(\"⛳️ Local environment\")\n", 65 | "\n", 66 | "# Add the root directory to the `PYTHONPATH` to use the `recsys` Python module from the notebook.\n", 67 | "if root_dir not in sys.path:\n", 68 | " print(f\"Adding the following directory to the PYTHONPATH: {root_dir}\")\n", 69 | " sys.path.append(root_dir)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "# Online inference pipeline: Deploying and testing the real-time ML services\n", 77 | "\n", 78 | "In this notebook, we will dig into 
the inference pipeline and deploy it to Hopsworks as a real-time service." 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## 📝 Imports" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "import warnings\n", 95 | "\n", 96 | "warnings.filterwarnings(\"ignore\")\n", 97 | "\n", 98 | "from loguru import logger\n", 99 | "\n", 100 | "from recsys import hopsworks_integration" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## 🔮 Connect to Hopsworks Feature Store " 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stderr", 117 | "output_type": "stream", 118 | "text": [ 119 | "\u001b[32m2024-12-24 13:12:11.849\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mrecsys.hopsworks_integration.feature_store\u001b[0m:\u001b[36mget_feature_store\u001b[0m:\u001b[36m13\u001b[0m - \u001b[1mLoging to Hopsworks using HOPSWORKS_API_KEY env var.\u001b[0m\n" 120 | ] 121 | }, 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "2024-12-24 13:12:11,850 INFO: Initializing external client\n", 127 | "2024-12-24 13:12:11,850 INFO: Base URL: https://c.app.hopsworks.ai:443\n", 128 | "2024-12-24 13:12:13,423 INFO: Python Engine initialized.\n", 129 | "\n", 130 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1192098\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "project, fs = hopsworks_integration.get_feature_store()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "tags": [] 142 | }, 143 | "source": [ 144 | "# Deploying the ranking inference pipeline\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "You start by deploying your ranking model. 
Since it is a CatBoost model you need to implement a `Predict` class that tells Hopsworks how to load the model and how to use it:" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stderr", 161 | "output_type": "stream", 162 | "text": [ 163 | "Uploading: 100.000%|██████████| 4491/4491 elapsed<00:01 remaining<00:00\n", 164 | "Uploading: 100.000%|██████████| 1113/1113 elapsed<00:01 remaining<00:00\n" 165 | ] 166 | }, 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "Deployment created, explore it at https://c.app.hopsworks.ai:443/p/1192098/deployments/353319\n", 172 | "Before making predictions, start the deployment by using `.start()`\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "ranking_deployment = hopsworks_integration.ranking_serving.HopsworksRankingModel.deploy(\n", 178 | " project=project\n", 179 | ")" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "Now, we have to explicitly start the deployment:" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 6, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "name": "stderr", 196 | "output_type": "stream", 197 | "text": [ 198 | "Deployment is ready: 100%|██████████| 6/6 [00:47<00:00, 7.88s/it] " 199 | ] 200 | }, 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "Start making predictions by using `.predict()`\n" 206 | ] 207 | }, 208 | { 209 | "name": "stderr", 210 | "output_type": "stream", 211 | "text": [ 212 | "\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "ranking_deployment.start()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 7, 223 | "metadata": { 224 | "tags": [] 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "# Check logs in case of failure\n", 229 | "# ranking_deployment.get_logs(component=\"transformer\", tail=200)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "## Test the ranking inference pipeline\n" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 8, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "def get_top_recommendations(ranked_candidates, k=3):\n", 246 | " return [candidate[-1] for candidate in ranked_candidates[\"ranking\"][:k]]" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "Let's define a dummy test example to test our ranking deployment (only the `customer_id` has to match):" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 9, 259 | "metadata": { 260 | "tags": [] 261 | }, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "['592846001', '536139006', '408554004']" 267 | ] 268 | }, 269 | "execution_count": 9, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "test_ranking_input = [\n", 276 | " {\n", 277 | " \"customer_id\": \"d327d0ad9e30085a436933dfbb7f77cf42e38447993a078ed35d93e3fd350ecf\",\n", 278 | " \"month_sin\": 1.2246467991473532e-16,\n", 279 | " \"query_emb\": [\n", 280 | " 0.214135289,\n", 281 | " 0.571055949,\n", 282 | " 0.330709577,\n", 283 | " -0.225899458,\n", 284 | " -0.308674961,\n", 285 | " -0.0115124583,\n", 286 | " 0.0730511621,\n", 287 | " -0.495835781,\n", 288 | " 0.625569344,\n", 289 | " -0.0438038409,\n", 290 | " 0.263472944,\n", 291 | " 
-0.58485353,\n", 292 | " -0.307070434,\n", 293 | " 0.0414443575,\n", 294 | " -0.321789205,\n", 295 | " 0.966559,\n", 296 | " ],\n", 297 | " \"month_cos\": -1.0,\n", 298 | " }\n", 299 | " ]\n", 300 | "\n", 301 | "# Test ranking deployment\n", 302 | "ranked_candidates = ranking_deployment.predict(inputs=test_ranking_input)\n", 303 | "\n", 304 | "# Retrieve article ids of the top recommended items\n", 305 | "recommendations = get_top_recommendations(ranked_candidates[\"predictions\"], k=3)\n", 306 | "recommendations" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "Check logs in case of failure:" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 10, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "# ranking_deployment.get_logs(component=\"transformer\", tail=200)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "tags": [] 329 | }, 330 | "source": [ 331 | "# Deploying the query inference pipeline" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 11, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "name": "stdout", 341 | "output_type": "stream", 342 | "text": [ 343 | "2024-12-24 13:13:14,889 INFO: Closing external client and cleaning up certificates.\n", 344 | "Connection closed.\n", 345 | "2024-12-24 13:13:14,894 INFO: Initializing external client\n", 346 | "2024-12-24 13:13:14,895 INFO: Base URL: https://c.app.hopsworks.ai:443\n", 347 | "2024-12-24 13:13:16,223 INFO: Python Engine initialized.\n", 348 | "\n", 349 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1192098\n", 350 | "2024-12-24 13:13:17,497 INFO: Closing external client and cleaning up certificates.\n", 351 | "2024-12-24 13:13:17,501 INFO: Initializing external client\n", 352 | "2024-12-24 13:13:17,502 INFO: Base URL: https://c.app.hopsworks.ai:443\n", 353 | "2024-12-24 13:13:18,402 INFO: Closing external client and cleaning up certificates.\n", 354 | "Connection closed.\n", 355 | "2024-12-24 13:13:18,408 INFO: Initializing external client\n", 356 | "2024-12-24 13:13:18,408 INFO: Base URL: https://c.app.hopsworks.ai:443\n", 357 | "2024-12-24 13:13:19,727 INFO: Python Engine initialized.\n", 358 | "\n", 359 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1192098\n", 360 | "Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets\n" 361 | ] 362 | }, 363 | { 364 | "name": "stderr", 365 | "output_type": "stream", 366 | "text": [ 367 | "Uploading: 100.000%|██████████| 2948/2948 elapsed<00:05 remaining<00:00\n" 368 | ] 369 | }, 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "Deployment created, explore it at https://c.app.hopsworks.ai:443/p/1192098/deployments/353320\n", 375 | "Before making predictions, start the deployment by using `.start()`\n" 376 | ] 377 | } 378 | ], 379 | "source": [ 380 | "query_model_deployment = (\n", 381 | " hopsworks_integration.two_tower_serving.HopsworksQueryModel.deploy(ranking_model_type=\"ranking\")\n", 382 | ")" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "At this point, you have registered your deployment. 
To start it up you need to run:" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 12, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "name": "stderr", 399 | "output_type": "stream", 400 | "text": [ 401 | "Deployment is ready: 100%|██████████| 6/6 [00:26<00:00, 4.45s/it] " 402 | ] 403 | }, 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "Start making predictions by using `.predict()`\n" 409 | ] 410 | }, 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "\n" 416 | ] 417 | } 418 | ], 419 | "source": [ 420 | "query_model_deployment.start()" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 13, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "# Check logs in case of failure\n", 430 | "# query_model_deployment.get_logs(component=\"transformer\", tail=20)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "## Testing the inference pipeline \n", 438 | "\n", 439 | "Define a test input example:" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 14, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "data = [\n", 449 | " {\n", 450 | " \"customer_id\": \"d327d0ad9e30085a436933dfbb7f77cf42e38447993a078ed35d93e3fd350ecf\",\n", 451 | " \"transaction_date\": \"2022-11-15T12:16:25.330916\",\n", 452 | " }\n", 453 | "]" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "Test out the deployment:" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 15, 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "data": { 470 | "text/plain": [ 471 | "['670079001', '299768002', '324946001']" 472 | ] 473 | }, 474 | "execution_count": 15, 475 | "metadata": {}, 476 | "output_type": "execute_result" 477 | } 478 | ], 479 | "source": [ 480 | "ranked_candidates = query_model_deployment.predict(inputs=data)\n", 481 | "\n", 482 | "# Retrieve article ids of the top recommended items\n", 483 | "recommendations = get_top_recommendations(ranked_candidates[\"predictions\"], k=3)\n", 484 | "recommendations" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": {}, 490 | "source": [ 491 | "Check logs in case of failure:" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 16, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "# query_model_deployment.get_logs(component=\"transformer\", tail=200)" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "# Stopping the Hopsworks deployments " 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "Stop the deployment when you're not using it." 
515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 17, 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "name": "stderr", 524 | "output_type": "stream", 525 | "text": [ 526 | "Deployment is stopped: 100%|██████████| 4/4 [00:10<00:00, 2.67s/it] \n", 527 | "Deployment is stopped: 100%|██████████| 4/4 [00:10<00:00, 2.68s/it] \n" 528 | ] 529 | } 530 | ], 531 | "source": [ 532 | "ranking_deployment.stop()\n", 533 | "query_model_deployment.stop()" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "## Inspecting the deployments in Hopsworks UI \n", 541 | "\n", 542 | "View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Deployments**" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": {}, 548 | "source": [ 549 | "---" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 18, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "name": "stderr", 559 | "output_type": "stream", 560 | "text": [ 561 | "\u001b[32m2024-12-24 13:14:20.862\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1m⌛️ Notebook Execution time: 133.44 seconds ~ 2.22 minutes\u001b[0m\n" 562 | ] 563 | } 564 | ], 565 | "source": [ 566 | "notebook_end_time = time.time()\n", 567 | "notebook_execution_time = notebook_end_time - notebook_start_time\n", 568 | "\n", 569 | "logger.info(\n", 570 | " f\"⌛️ Notebook Execution time: {notebook_execution_time:.2f} seconds ~ {notebook_execution_time / 60:.2f} minutes\"\n", 571 | ")" 572 | ] 573 | }, 574 | { 575 | "cell_type": "markdown", 576 | "metadata": {}, 577 | "source": [ 578 | "# → Next Steps \n", 579 | "\n", 580 | "The last step is to schedule the materialization jobs." 
581 | ] 582 | } 583 | ], 584 | "metadata": { 585 | "kernelspec": { 586 | "display_name": "Python 3 (ipykernel)", 587 | "language": "python", 588 | "name": "python3" 589 | }, 590 | "language_info": { 591 | "codemirror_mode": { 592 | "name": "ipython", 593 | "version": 3 594 | }, 595 | "file_extension": ".py", 596 | "mimetype": "text/x-python", 597 | "name": "python", 598 | "nbconvert_exporter": "python", 599 | "pygments_lexer": "ipython3", 600 | "version": "3.11.8" 601 | } 602 | }, 603 | "nbformat": 4, 604 | "nbformat_minor": 4 605 | } 606 | -------------------------------------------------------------------------------- /notebooks/6_scheduling_materialization_jobs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6d91c23d", 6 | "metadata": {}, 7 | "source": [ 8 | "# Set up environment" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "20f093e1", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "⛳️ Local environment\n", 22 | "Adding the following directory to the PYTHONPATH: /Users/pauliusztin/Documents/01_projects/hopsworks_recsys/hands-on-recommender-system\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "import sys\n", 28 | "from pathlib import Path\n", 29 | "\n", 30 | "\n", 31 | "def is_google_colab() -> bool:\n", 32 | " if \"google.colab\" in str(get_ipython()):\n", 33 | " return True\n", 34 | " return False\n", 35 | "\n", 36 | "\n", 37 | "def clone_repository() -> None:\n", 38 | " !git clone https://github.com/decodingml/hands-on-recommender-system.git\n", 39 | " %cd hands-on-recommender-system/\n", 40 | "\n", 41 | "\n", 42 | "def install_dependencies() -> None:\n", 43 | " !pip install --upgrade uv\n", 44 | " !uv pip install --all-extras --system --requirement pyproject.toml\n", 45 | "\n", 46 | "\n", 47 | "if is_google_colab():\n", 48 | " clone_repository()\n", 49 | " install_dependencies()\n", 50 | "\n", 51 | " root_dir = str(Path().absolute())\n", 52 | " print(\"⛳️ Google Colab environment\")\n", 53 | "else:\n", 54 | " root_dir = str(Path().absolute().parent)\n", 55 | " print(\"⛳️ Local environment\")\n", 56 | "\n", 57 | "# Add the root directory to the `PYTHONPATH` to use the `recsys` Python module from the notebook.\n", 58 | "if root_dir not in sys.path:\n", 59 | " print(f\"Adding the following directory to the PYTHONPATH: {root_dir}\")\n", 60 | " sys.path.append(root_dir)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "6a8f7546", 66 | "metadata": {}, 67 | "source": [ 68 | "# Scheduling Hopsworks materialization jobs \n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "b204608b", 74 | "metadata": {}, 75 | "source": [ 76 | "## 📝 Imports" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "id": "06390a5b", 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | "/Users/pauliusztin/Documents/01_projects/hopsworks_recsys/hands-on-recommender-system/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 90 | " from .autonotebook import tqdm as notebook_tqdm\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "from datetime import datetime, timezone\n", 96 | "\n", 97 | "from recsys import hopsworks_integration" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "7a931086", 103 | "metadata": {}, 104 | "source": [ 105 | "## 🔮 Connect to Hopsworks Feature Store " 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 3, 111 | "id": "ef7a5e64", 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "\u001b[32m2024-12-24 13:15:04.623\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mrecsys.hopsworks_integration.feature_store\u001b[0m:\u001b[36mget_feature_store\u001b[0m:\u001b[36m13\u001b[0m - \u001b[1mLoging to Hopsworks using HOPSWORKS_API_KEY env var.\u001b[0m\n" 119 | ] 120 | }, 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "2024-12-24 13:15:04,625 INFO: Initializing external client\n", 126 | "2024-12-24 13:15:04,625 INFO: Base URL: https://c.app.hopsworks.ai:443\n", 127 | "2024-12-24 13:15:06,101 INFO: Python Engine initialized.\n", 128 | "\n", 129 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1192098\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "project, fs = hopsworks_integration.get_feature_store()\n", 135 | "\n", 136 | "jobs_api = project.get_jobs_api()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "id": "cea06db9", 142 | "metadata": {}, 143 | "source": [ 144 | "# Retrieving materialization jobs\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 4, 150 | "id": "92f62c85", 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "Job('interactions_1_offline_fg_materialization', 'SPARK')" 157 | ] 158 | }, 159 | "execution_count": 4, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "interactions_job = jobs_api.get_job(\"interactions_1_offline_fg_materialization\")\n", 166 | "interactions_job" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 5, 172 | "id": "128827f1", 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "Job('transactions_1_offline_fg_materialization', 'SPARK')" 179 | ] 180 | }, 181 | "execution_count": 5, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "transactions_job = jobs_api.get_job(\"transactions_1_offline_fg_materialization\")\n", 188 | "transactions_job" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "id": "f78e6278", 194 | "metadata": {}, 195 | "source": [ 196 | "# Running materialization jobs\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 6, 202 | "id": "f79e13a5", 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "Launching job: interactions_1_offline_fg_materialization\n", 210 | "Job started successfully, you can follow the progress at \n", 211 | "https://c.app.hopsworks.ai:443/p/1192098/jobs/named/interactions_1_offline_fg_materialization/executions\n", 212 | "2024-12-24 13:15:16,740 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED\n", 213 | "2024-12-24 13:15:19,916 INFO: Waiting for execution to finish. 
Current state: RUNNING. Final status: UNDEFINED\n", 214 | "2024-12-24 13:16:49,132 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED\n", 215 | "2024-12-24 13:16:49,297 INFO: Waiting for log aggregation to finish.\n", 216 | "2024-12-24 13:17:08,007 INFO: Execution finished successfully.\n" 217 | ] 218 | }, 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "Execution('SUCCEEDED', 'FINISHED', '2024-12-24T11:15:08.000Z', '-op offline_fg_materialization -path hdfs:///Projects/decoding/Resources/jobs/interactions_1_offline_fg_materialization/config_1735032952539')" 223 | ] 224 | }, 225 | "execution_count": 6, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "interactions_job_execution = interactions_job.run()\n", 232 | "interactions_job_execution" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 7, 238 | "id": "8dc5cc7e", 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "Launching job: transactions_1_offline_fg_materialization\n", 246 | "Job started successfully, you can follow the progress at \n", 247 | "https://c.app.hopsworks.ai:443/p/1192098/jobs/named/transactions_1_offline_fg_materialization/executions\n", 248 | "2024-12-24 13:17:16,894 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED\n", 249 | "2024-12-24 13:17:20,074 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED\n", 250 | "2024-12-24 13:18:49,133 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED\n", 251 | "2024-12-24 13:18:49,292 INFO: Waiting for log aggregation to finish.\n", 252 | "2024-12-24 13:19:01,338 INFO: Execution finished successfully.\n" 253 | ] 254 | }, 255 | { 256 | "data": { 257 | "text/plain": [ 258 | "Execution('SUCCEEDED', 'FINISHED', '2024-12-24T11:17:08.000Z', '-op offline_fg_materialization -path hdfs:///Projects/decoding/Resources/jobs/transactions_1_offline_fg_materialization/config_1735032811896')" 259 | ] 260 | }, 261 | "execution_count": 7, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "transactions_job_execution = transactions_job.run()\n", 268 | "transactions_job_execution" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "id": "2f45a3f3", 274 | "metadata": {}, 275 | "source": [ 276 | "## ⏰ Scheduling materialization jobs \n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 8, 282 | "id": "b95eb11a", 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "datetime.datetime(2024, 12, 26, 0, 0, tzinfo=datetime.timezone.utc)" 289 | ] 290 | }, 291 | "execution_count": 8, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "interactions_job.schedule(\n", 298 | " cron_expression=\"0 0 0 * * ?\", # Runs at midnight (00:00:00) every day\n", 299 | " start_time=datetime.now(tz=timezone.utc),\n", 300 | ")\n", 301 | "interactions_job.job_schedule.next_execution_date_time" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 9, 307 | "id": "97d546e5", 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "datetime.datetime(2024, 12, 26, 0, 0, tzinfo=datetime.timezone.utc)" 314 | ] 315 | }, 316 | "execution_count": 9, 317 | "metadata": {}, 318 | 
"output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "transactions_job.schedule(\n", 323 | " cron_expression=\"0 0 0 * * ?\", # Runs at midnight (00:00:00) every day\n", 324 | " start_time=datetime.now(tz=timezone.utc),\n", 325 | ")\n", 326 | "transactions_job.job_schedule.next_execution_date_time" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "id": "db866c1f", 332 | "metadata": {}, 333 | "source": [ 334 | "## Inspecting the materialization jobs in Hopsworks UI \n", 335 | "\n", 336 | "View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Compute → Ingestions**" 337 | ] 338 | } 339 | ], 340 | "metadata": { 341 | "kernelspec": { 342 | "display_name": "Python 3", 343 | "language": "python", 344 | "name": "python3" 345 | }, 346 | "language_info": { 347 | "codemirror_mode": { 348 | "name": "ipython", 349 | "version": 3 350 | }, 351 | "file_extension": ".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython3", 356 | "version": "3.11.8" 357 | } 358 | }, 359 | "nbformat": 4, 360 | "nbformat_minor": 5 361 | } 362 | -------------------------------------------------------------------------------- /packages.txt: -------------------------------------------------------------------------------- 1 | build-essential 2 | clang -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "hands-on-recommender-system" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = "~=3.11" 7 | dependencies = [ 8 | "altair>=4.2.2", 9 | "catboost==1.2", 10 | "hopsworks[python]>=4.1.2", 11 | "huggingface-hub==0.24.7", 12 | "ipykernel>=6.29.5", 13 | "langchain-openai==0.1.14", 14 | "langchain==0.2.6", 15 | "loguru>=0.7.2", 16 | "nbformat>=5.10.4", 17 | "polars==1.9.0", 18 | "pydantic-settings>=2.6.1", 19 | "sentence-transformers==2.2.2", 20 | "streamlit==1.28.2", 21 | "tensorflow-recommenders==0.7.2", 22 | "tensorflow==2.14", 23 | ] 24 | 25 | [dependency-groups] 26 | dev = [ 27 | "ruff>=0.7.2", 28 | ] 29 | -------------------------------------------------------------------------------- /recsys/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import features, inference, hopsworks_integration, raw_data_sources, training 2 | 3 | __all__ = [ 4 | "features", 5 | "inference", 6 | "hopsworks_integration", 7 | "raw_data_sources", 8 | "training", 9 | ] 10 | -------------------------------------------------------------------------------- /recsys/config.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from pathlib import Path 3 | from typing import Literal 4 | 5 | from pydantic import SecretStr 6 | from pydantic_settings import BaseSettings, SettingsConfigDict 7 | 8 | 9 | class CustomerDatasetSize(Enum): 10 | LARGE = "LARGE" 11 | MEDIUM = "MEDIUM" 12 | SMALL = "SMALL" 13 | 14 | 15 | class Settings(BaseSettings): 16 | model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") 17 | 18 | RECSYS_DIR: Path = Path(__file__).parent 19 | 20 | # Hopsworks 21 | HOPSWORKS_API_KEY: SecretStr | None = None 22 | 23 | # OpenAI 24 | OPENAI_MODEL_ID: str = "gpt-4o-mini" 25 | OPENAI_API_KEY: SecretStr | None = None 26 | 27 | # Feature engineering 28 | CUSTOMER_DATA_SIZE: CustomerDatasetSize = CustomerDatasetSize.SMALL 29 | FEATURES_EMBEDDING_MODEL_ID: str = "all-MiniLM-L6-v2" 30 | 31 | # Training 32 | TWO_TOWER_MODEL_EMBEDDING_SIZE: int = 16 33 | TWO_TOWER_MODEL_BATCH_SIZE: int = 2048 34 | TWO_TOWER_NUM_EPOCHS: int = 10 35 | TWO_TOWER_WEIGHT_DECAY: float = 0.001 36 | TWO_TOWER_LEARNING_RATE: float = 0.01 37 | TWO_TOWER_DATASET_VALIDATON_SPLIT_SIZE: float = 0.1 38 | TWO_TOWER_DATASET_TEST_SPLIT_SIZE: float = 0.1 39 | 40 | RANKING_DATASET_VALIDATON_SPLIT_SIZE: float = 0.1 41 | RANKING_LEARNING_RATE: float = 0.2 42 | RANKING_ITERATIONS: int = 100 43 | RANKING_SCALE_POS_WEIGHT: int = 10 44 | RANKING_EARLY_STOPPING_ROUNDS: int = 5 45 | 46 | # Inference 47 | RANKING_MODEL_TYPE: Literal["ranking", "llmranking"] = "ranking" 48 | CUSTOM_HOPSWORKS_INFERENCE_ENV: str = "custom_env_name" 49 | 50 | 51 | settings = Settings() 52 | -------------------------------------------------------------------------------- /recsys/features/__init__.py: -------------------------------------------------------------------------------- 1 | from . import articles, customers, embeddings, interaction, ranking, transactions 2 | 3 | __all__ = [ 4 | "articles", 5 | "customers", 6 | "embeddings", 7 | "interaction", 8 | "ranking", 9 | "transactions", 10 | ] 11 | -------------------------------------------------------------------------------- /recsys/features/articles.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import io 3 | import sys 4 | 5 | import polars as pl 6 | from tqdm.auto import tqdm 7 | from sentence_transformers import SentenceTransformer 8 | 9 | 10 | def get_article_id(df: pl.DataFrame) -> pl.Series: 11 | """ 12 | Extracts and returns the article_id column as a string. 13 | Parameters: 14 | - df (pl.DataFrame): Input DataFrame containing the 'article_id' column. 15 | Returns: 16 | - pl.Series: Series containing the 'article_id' column as strings. 17 | """ 18 | return df["article_id"].cast(pl.Utf8) 19 | 20 | 21 | def create_prod_name_length(df: pl.DataFrame) -> pl.Series: 22 | """ 23 | Creates a new column 'prod_name_length' representing the length of 'prod_name'. 24 | Parameters: 25 | - df (pl.DataFrame): Input DataFrame containing the 'prod_name' column. 26 | Returns: 27 | - pl.Series: Series containing the length of 'prod_name' for each row. 
28 | """ 29 | return df["prod_name"].str.len_chars() 30 | 31 | 32 | def create_article_description(row): 33 | description = f"{row['prod_name']} - {row['product_type_name']} in {row['product_group_name']}" 34 | description += f"\nAppearance: {row['graphical_appearance_name']}" 35 | description += f"\nColor: {row['perceived_colour_value_name']} {row['perceived_colour_master_name']} ({row['colour_group_name']})" 36 | description += f"\nCategory: {row['index_group_name']} - {row['section_name']} - {row['garment_group_name']}" 37 | 38 | if row["detail_desc"]: 39 | description += f"\nDetails: {row['detail_desc']}" 40 | 41 | return description 42 | 43 | 44 | def compute_features_articles(df: pl.DataFrame) -> pl.DataFrame: 45 | """ 46 | Prepares the input DataFrame by creating new features and dropping specific columns. 47 | Parameters: 48 | - df (pl.DataFrame): Input DataFrame. 49 | Returns: 50 | - pl.DataFrame: Processed DataFrame with new features and specific columns dropped. 51 | """ 52 | # Create new columns 53 | df = df.with_columns( 54 | [ 55 | get_article_id(df).alias("article_id"), 56 | create_prod_name_length(df).alias("prod_name_length"), 57 | pl.struct(df.columns) 58 | .map_elements(create_article_description) 59 | .alias("article_description"), 60 | ] 61 | ) 62 | 63 | # Add full image URLs. 64 | df = df.with_columns(image_url=pl.col("article_id").map_elements(get_image_url)) 65 | 66 | # Drop columns with null values 67 | df = df.select([col for col in df.columns if not df[col].is_null().any()]) 68 | 69 | # Remove 'detail_desc' column 70 | columns_to_drop = ["detail_desc", "detail_desc_length"] 71 | existing_columns = df.columns 72 | columns_to_keep = [col for col in existing_columns if col not in columns_to_drop] 73 | 74 | return df.select(columns_to_keep) 75 | 76 | 77 | def generate_embeddings_for_dataframe( 78 | df: pl.DataFrame, text_column: str, model: SentenceTransformer, batch_size: int = 32 79 | ) -> pl.DataFrame: 80 | """ 81 | Generate embeddings for a text column in a Polars DataFrame. 
82 | 83 | Args: 84 | df (pl.DataFrame): Input Polars DataFrame 85 | text_column (str): Name of the column containing text to embed 86 | model (SentenceTransformer): SentenceTransformer embedding model to use 87 | batch_size (int): Number of samples run at once through the embedding model 88 | 89 | Returns: 90 | pl.DataFrame: DataFrame with a new 'embedding' column 91 | """ 92 | 93 | @contextlib.contextmanager 94 | def suppress_stdout(): 95 | new_stdout = io.StringIO() 96 | old_stdout = sys.stdout 97 | sys.stdout = new_stdout 98 | try: 99 | yield new_stdout 100 | finally: 101 | sys.stdout = old_stdout 102 | 103 | total_rows = len(df) 104 | pbar = tqdm(total=total_rows, desc="Generating embeddings") 105 | 106 | # Create a new column with embeddings 107 | texts = df[text_column].to_list() 108 | 109 | all_embeddings = [] 110 | for i in range(0, len(texts), batch_size): 111 | batch_texts = texts[i : i + batch_size] 112 | with suppress_stdout(): 113 | batch_embeddings = model.encode( 114 | batch_texts, device=model.device, show_progress_bar=False 115 | ) 116 | all_embeddings.extend(batch_embeddings.tolist()) 117 | pbar.update(len(batch_texts)) 118 | 119 | df_with_embeddings = df.with_columns(embeddings=pl.Series(all_embeddings)) 120 | 121 | pbar.close() 122 | 123 | return df_with_embeddings 124 | 125 | 126 | def get_image_url(article_id): 127 | url_start = "https://repo.hops.works/dev/jdowling/h-and-m/images/0" 128 | 129 | # Convert article_id to string 130 | article_id_str = str(article_id) 131 | 132 | folder = article_id_str[:2] 133 | 134 | image_name = article_id_str 135 | 136 | return f"{url_start}{folder}/0{image_name}.jpg" 137 | -------------------------------------------------------------------------------- /recsys/features/customers.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import polars as pl 4 | from loguru import logger 5 | 6 | from recsys.config import CustomerDatasetSize 7 | 8 | 9 | class DatasetSampler: 10 | _SIZES = { 11 | CustomerDatasetSize.LARGE: 50_000, 12 | CustomerDatasetSize.MEDIUM: 5_000, 13 | CustomerDatasetSize.SMALL: 1_000, 14 | } 15 | 16 | def __init__(self, size: CustomerDatasetSize) -> None: 17 | self._size = size 18 | 19 | @classmethod 20 | def get_supported_sizes(cls) -> dict: 21 | return cls._SIZES 22 | 23 | def sample( 24 | self, customers_df: pl.DataFrame, transations_df: pl.DataFrame 25 | ) -> dict[str, pl.DataFrame]: 26 | random.seed(27) 27 | 28 | n_customers = self._SIZES[self._size] 29 | logger.info(f"Sampling {n_customers} customers.") 30 | customers_df = customers_df.sample(n=n_customers) 31 | 32 | logger.info( 33 | f"Number of transactions for all the customers: {transations_df.height}" 34 | ) 35 | transations_df = transations_df.join( 36 | customers_df.select("customer_id"), on="customer_id" 37 | ) 38 | logger.info( 39 | f"Number of transactions for the {n_customers} sampled customers: {transations_df.height}" 40 | ) 41 | 42 | return {"customers": customers_df, "transactions": transations_df} 43 | 44 | 45 | def fill_missing_club_member_status(df: pl.DataFrame) -> pl.DataFrame: 46 | """ 47 | Fill missing values in the 'club_member_status' column with 'ABSENT'. 48 | 49 | Parameters: 50 | - df (pl.DataFrame): Input DataFrame containing the 'club_member_status' column. 51 | 52 | Returns: 53 | - pl.DataFrame: DataFrame with filled 'club_member_status' column. 
54 | """ 55 | return df.with_columns(pl.col("club_member_status").fill_null("ABSENT")) 56 | 57 | 58 | def drop_na_age(df: pl.DataFrame) -> pl.DataFrame: 59 | """ 60 | Drop rows with null values in the 'age' column. 61 | 62 | Parameters: 63 | - df (pl.DataFrame): Input DataFrame containing the 'age' column. 64 | 65 | Returns: 66 | - pl.DataFrame: DataFrame with rows containing null 'age' values removed. 67 | """ 68 | return df.drop_nulls(subset=["age"]) 69 | 70 | 71 | def create_age_group() -> pl.Expr: 72 | """ 73 | Create an expression to categorize age into groups. 74 | 75 | Returns: 76 | - pl.Expr: Polars expression that categorizes 'age' into predefined age groups. 77 | """ 78 | return ( 79 | pl.when(pl.col("age").is_between(0, 18)) 80 | .then(pl.lit("0-18")) 81 | .when(pl.col("age").is_between(19, 25)) 82 | .then(pl.lit("19-25")) 83 | .when(pl.col("age").is_between(26, 35)) 84 | .then(pl.lit("26-35")) 85 | .when(pl.col("age").is_between(36, 45)) 86 | .then(pl.lit("36-45")) 87 | .when(pl.col("age").is_between(46, 55)) 88 | .then(pl.lit("46-55")) 89 | .when(pl.col("age").is_between(56, 65)) 90 | .then(pl.lit("56-65")) 91 | .otherwise(pl.lit("66+")) 92 | ).alias("age_group") 93 | 94 | 95 | def compute_features_customers( 96 | df: pl.DataFrame, drop_null_age: bool = False 97 | ) -> pl.DataFrame: 98 | """ 99 | Prepare customer data by performing several data cleaning and transformation steps. 100 | 101 | This function does the following: 102 | 1. Checks for required columns in the input DataFrame. 103 | 2. Fills missing club member status with 'ABSENT'. 104 | 3. Drops rows with missing age values. 105 | 4. Creates an age group category. 106 | 5. Casts the 'age' column to Float64. 107 | 6. Selects and orders specific columns in the output. 108 | 109 | Parameters: 110 | - df (pl.DataFrame): Input DataFrame containing customer data. 111 | 112 | Returns: 113 | - pl.DataFrame: Processed DataFrame with cleaned and transformed customer data. 114 | 115 | Raises: 116 | - ValueError: If any of the required columns are missing from the input DataFrame. 
117 | """ 118 | required_columns = ["customer_id", "club_member_status", "age", "postal_code"] 119 | missing_columns = [col for col in required_columns if col not in df.columns] 120 | if missing_columns: 121 | raise ValueError( 122 | f"Columns {', '.join(missing_columns)} not found in the DataFrame" 123 | ) 124 | 125 | df = ( 126 | df.pipe(fill_missing_club_member_status) 127 | .pipe(drop_na_age) 128 | .with_columns([create_age_group(), pl.col("age").cast(pl.Float64)]) 129 | .select( 130 | ["customer_id", "club_member_status", "age", "postal_code", "age_group"] 131 | ) 132 | ) 133 | 134 | if drop_null_age is True: 135 | df = df.drop_nulls(subset=["age"]) 136 | 137 | return df 138 | -------------------------------------------------------------------------------- /recsys/features/embeddings.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import tensorflow as tf 3 | 4 | 5 | def preprocess(train_df: pd.DataFrame, candidate_features: list) -> pd.DataFrame: 6 | # Select the candidate features from the training DataFrame 7 | item_df = train_df[candidate_features] 8 | 9 | # Drop duplicate rows based on the 'article_id' column to get unique candidate items 10 | item_df.drop_duplicates(subset="article_id", inplace=True) 11 | 12 | return item_df 13 | 14 | 15 | def embed(df: pd.DataFrame, candidate_model) -> pd.DataFrame: 16 | ds = tf.data.Dataset.from_tensor_slices({col: df[col] for col in df}) 17 | 18 | candidate_embeddings = ds.batch(2048).map( 19 | lambda x: (x["article_id"], candidate_model(x)) 20 | ) 21 | 22 | all_article_ids = tf.concat([batch[0] for batch in candidate_embeddings], axis=0) 23 | all_embeddings = tf.concat([batch[1] for batch in candidate_embeddings], axis=0) 24 | 25 | all_article_ids = all_article_ids.numpy().astype(int).tolist() 26 | all_embeddings = all_embeddings.numpy().tolist() 27 | 28 | embeddings_df = pd.DataFrame( 29 | { 30 | "article_id": all_article_ids, 31 | "embeddings": all_embeddings, 32 | } 33 | ) 34 | 35 | return embeddings_df 36 | -------------------------------------------------------------------------------- /recsys/features/interaction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import polars as pl 3 | from tqdm import tqdm 4 | 5 | 6 | def generate_interaction_data(trans_df): 7 | # Pre-compute unique values once 8 | unique_customers = trans_df["customer_id"].unique() 9 | all_articles = trans_df["article_id"].unique() 10 | all_articles_set = set(all_articles) 11 | 12 | interactions = [] 13 | 14 | def generate_timestamps(base_timestamp, count, min_hours, max_hours): 15 | hours = np.random.randint(min_hours, max_hours, size=count) 16 | return base_timestamp - (hours * 3600000) 17 | 18 | # Ratios to ensure more realistic interactions 19 | CLICK_BEFORE_PURCHASE_PROB = 0.9 20 | MIN_IGNORES = 40 21 | MAX_IGNORES = 60 22 | MIN_EXTRA_CLICKS = 5 23 | MAX_EXTRA_CLICKS = 8 24 | EXTRA_CLICKS_PROB = 0.95 25 | 26 | chunk_size = 1000 27 | for chunk_start in tqdm( 28 | range(0, len(unique_customers), chunk_size), desc="Processing customer chunks" 29 | ): 30 | chunk_end = min(chunk_start + chunk_size, len(unique_customers)) 31 | chunk_customers = unique_customers[chunk_start:chunk_end] 32 | 33 | chunk_transactions = trans_df.filter( 34 | pl.col("customer_id").is_in(chunk_customers) 35 | ) 36 | 37 | for customer_id in chunk_customers: 38 | customer_purchases = chunk_transactions.filter( 39 | pl.col("customer_id") == customer_id 40 | ) 41 | 42 | if 
len(customer_purchases) == 0: 43 | continue 44 | 45 | customer_articles = {"purchased": set(), "clicked": set(), "ignored": set()} 46 | last_purchase_timestamp = customer_purchases["t_dat"].max() 47 | 48 | # Generate more ignores first 49 | num_ignores = np.random.randint(MIN_IGNORES, MAX_IGNORES) 50 | available_articles = list(all_articles_set) 51 | 52 | if available_articles and num_ignores > 0: 53 | ignore_timestamps = generate_timestamps( 54 | last_purchase_timestamp, num_ignores, 1, 96 55 | ) 56 | selected_ignores = np.random.choice( 57 | available_articles, 58 | size=min(num_ignores, len(available_articles)), 59 | replace=False, 60 | ) 61 | 62 | # Generate multiple sets of ignores to increase the count 63 | for ts, art_id in zip(ignore_timestamps, selected_ignores): 64 | # Add 1-2 ignore events for the same article 65 | num_ignore_events = np.random.randint(1, 3) 66 | for _ in range(num_ignore_events): 67 | ignore_ts = ( 68 | ts - np.random.randint(1, 12) * 3600000 69 | ) # Add some random hours difference 70 | interactions.append( 71 | { 72 | "t_dat": ignore_ts, 73 | "customer_id": customer_id, 74 | "article_id": art_id, 75 | "interaction_score": 0, 76 | "prev_article_id": None, 77 | } 78 | ) 79 | customer_articles["ignored"].add(art_id) 80 | 81 | # Process purchases and their clicks 82 | purchase_rows = customer_purchases.iter_rows(named=True) 83 | for row in purchase_rows: 84 | purchase_timestamp = row["t_dat"] 85 | article_id = row["article_id"] 86 | 87 | # Add clicks before purchase 88 | if np.random.random() < CLICK_BEFORE_PURCHASE_PROB: 89 | num_pre_clicks = np.random.randint(1, 3) 90 | for _ in range(num_pre_clicks): 91 | click_timestamp = generate_timestamps( 92 | purchase_timestamp, 1, 1, 48 93 | )[0] 94 | interactions.append( 95 | { 96 | "t_dat": click_timestamp, 97 | "customer_id": customer_id, 98 | "article_id": article_id, 99 | "interaction_score": 1, 100 | "prev_article_id": None, 101 | } 102 | ) 103 | customer_articles["clicked"].add(article_id) 104 | 105 | # Add purchase 106 | interactions.append( 107 | { 108 | "t_dat": purchase_timestamp, 109 | "customer_id": customer_id, 110 | "article_id": article_id, 111 | "interaction_score": 2, 112 | "prev_article_id": None, 113 | } 114 | ) 115 | customer_articles["purchased"].add(article_id) 116 | 117 | # Generate extra clicks 118 | if np.random.random() < EXTRA_CLICKS_PROB: 119 | num_extra_clicks = np.random.randint( 120 | MIN_EXTRA_CLICKS, MAX_EXTRA_CLICKS + 1 121 | ) 122 | available_for_clicks = list( 123 | all_articles_set 124 | - customer_articles["purchased"] 125 | - customer_articles["clicked"] 126 | - customer_articles["ignored"] 127 | ) 128 | 129 | if available_for_clicks and num_extra_clicks > 0: 130 | click_timestamps = generate_timestamps( 131 | last_purchase_timestamp, num_extra_clicks, 1, 72 132 | ) 133 | selected_clicks = np.random.choice( 134 | available_for_clicks, 135 | size=min(num_extra_clicks, len(available_for_clicks)), 136 | replace=False, 137 | ) 138 | 139 | for ts, art_id in zip(click_timestamps, selected_clicks): 140 | interactions.append( 141 | { 142 | "t_dat": ts, 143 | "customer_id": customer_id, 144 | "article_id": art_id, 145 | "interaction_score": 1, 146 | "prev_article_id": None, 147 | } 148 | ) 149 | 150 | if not interactions: 151 | return pl.DataFrame( 152 | schema={ 153 | "t_dat": pl.Int64, 154 | "customer_id": pl.Utf8, 155 | "article_id": pl.Utf8, 156 | "interaction_score": pl.Int64, 157 | "prev_article_id": pl.Utf8, 158 | } 159 | ) 160 | 161 | interaction_df = pl.DataFrame(interactions) 
162 | sorted_df = interaction_df.sort(["customer_id", "t_dat"]) 163 | 164 | final_df = sorted_df.with_columns( 165 | [ 166 | pl.col("article_id") 167 | .alias("prev_article_id") 168 | .shift(1) 169 | .over("customer_id") 170 | .fill_null("START") 171 | ] 172 | ) 173 | 174 | return final_df 175 | -------------------------------------------------------------------------------- /recsys/features/ranking.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | def compute_ranking_dataset(trans_fg, articles_fg, customers_fg) -> pl.DataFrame: 4 | # Read data from the feature groups 5 | trans_df = trans_fg.select( 6 | ["article_id", "customer_id"] 7 | ).read(dataframe_type="polars") 8 | articles_df = articles_fg.select_except( 9 | ["article_description", "embeddings", "image_url"] 10 | ).read(dataframe_type="polars") 11 | customers_df = customers_fg.select(["customer_id", "age"]).read(dataframe_type="polars") 12 | 13 | # Convert article_id to string in both dataframes before joining 14 | trans_df = trans_df.with_columns(pl.col("article_id").cast(pl.Utf8)) 15 | articles_df = articles_df.with_columns(pl.col("article_id").cast(pl.Utf8)) 16 | 17 | # Merge operations 18 | df = trans_df.join(articles_df, on="article_id", how="left") 19 | df = df.join(customers_df, on="customer_id", how="left") 20 | 21 | # Select query features 22 | query_features = ["customer_id", "age", "article_id"] 23 | df = df.select(query_features) 24 | 25 | # Create positive pairs 26 | positive_pairs = df.clone() 27 | 28 | # Calculate number of negative pairs 29 | n_neg = len(positive_pairs) * 10 30 | 31 | # Create negative pairs DataFrame 32 | article_ids = (df.select("article_id") 33 | .unique() 34 | .sample(n=n_neg, with_replacement=True, seed=2) 35 | .get_column("article_id")) 36 | 37 | customer_ids = (df.select("customer_id") 38 | .sample(n=n_neg, with_replacement=True, seed=3) 39 | .get_column("customer_id")) 40 | 41 | other_features = (df.select(["age"]) 42 | .sample(n=n_neg, with_replacement=True, seed=4)) 43 | 44 | # Construct negative pairs 45 | negative_pairs = pl.DataFrame({ 46 | "article_id": article_ids, 47 | "customer_id": customer_ids, 48 | "age": other_features.get_column("age"), 49 | }) 50 | 51 | # Add labels 52 | positive_pairs = positive_pairs.with_columns(pl.lit(1).alias("label")) 53 | negative_pairs = negative_pairs.with_columns(pl.lit(0).alias("label")) 54 | 55 | # Concatenate positive and negative pairs 56 | ranking_df = pl.concat([ 57 | positive_pairs, 58 | negative_pairs.select(positive_pairs.columns) 59 | ]) 60 | 61 | # Process item features 62 | item_df = articles_fg.read(dataframe_type="polars") 63 | 64 | # Convert article_id to string in item_df before final join 65 | item_df = item_df.with_columns(pl.col("article_id").cast(pl.Utf8)) 66 | 67 | # Keep unique article_ids and select columns 68 | item_df = ( 69 | item_df.unique(subset=["article_id"]) 70 | .select([ 71 | "article_id", 72 | "product_type_name", 73 | "product_group_name", 74 | "graphical_appearance_name", 75 | "colour_group_name", 76 | "perceived_colour_value_name", 77 | "perceived_colour_master_name", 78 | "department_name", 79 | "index_name", 80 | "index_group_name", 81 | "section_name", 82 | "garment_group_name", 83 | ]) 84 | ) 85 | 86 | # Final merge with item features 87 | ranking_df = ranking_df.join(item_df, on="article_id", how="left") 88 | 89 | return ranking_df -------------------------------------------------------------------------------- 
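A minimal, illustrative usage sketch for `compute_ranking_dataset` above (not part of the repository): it assumes the `transactions`, `articles`, and `customers` feature groups created by `recsys/hopsworks_integration/feature_store.py` already exist in your Hopsworks project.

```python
# Hedged usage sketch — feature group names and versions mirror those used elsewhere in this repo.
from recsys import features, hopsworks_integration

project, fs = hopsworks_integration.get_feature_store()

# Feature groups written by the feature pipeline (see feature_store.py).
trans_fg = fs.get_feature_group(name="transactions", version=1)
articles_fg = fs.get_feature_group(name="articles", version=1)
customers_fg = fs.get_feature_group(name="customers", version=1)

# Positive pairs come from real purchases; roughly 10x negative pairs are sampled at
# random, then article metadata is joined in as ranking features.
ranking_df = features.ranking.compute_ranking_dataset(trans_fg, articles_fg, customers_fg)
print(ranking_df.get_column("label").value_counts())
```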
/recsys/features/transactions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import polars as pl 4 | from hopsworks import udf 5 | 6 | 7 | def convert_article_id_to_str(df: pl.DataFrame) -> pl.Series: 8 | """ 9 | Convert the 'article_id' column to string type. 10 | 11 | Parameters: 12 | - df (pl.DataFrame): Input DataFrame containing the 'article_id' column. 13 | 14 | Returns: 15 | - pl.Series: The 'article_id' column converted to string type. 16 | """ 17 | return df["article_id"].cast(pl.Utf8) 18 | 19 | 20 | def convert_t_dat_to_datetime(df: pl.DataFrame) -> pl.Series: 21 | """ 22 | Convert the 't_dat' column to datetime type. 23 | 24 | Parameters: 25 | - df (pl.DataFrame): Input DataFrame containing the 't_dat' column. 26 | 27 | Returns: 28 | - pl.Series: The 't_dat' column converted to datetime type. 29 | """ 30 | return pl.from_pandas(pd.to_datetime(df["t_dat"].to_pandas())) 31 | 32 | 33 | def get_year_feature(df: pl.DataFrame) -> pl.Series: 34 | """ 35 | Extract the year from the 't_dat' column. 36 | 37 | Parameters: 38 | - df (pl.DataFrame): Input DataFrame containing the 't_dat' column. 39 | 40 | Returns: 41 | - pl.Series: A series containing the year extracted from 't_dat'. 42 | """ 43 | return df["t_dat"].dt.year() 44 | 45 | 46 | def get_month_feature(df: pl.DataFrame) -> pl.Series: 47 | """ 48 | Extract the month from the 't_dat' column. 49 | 50 | Parameters: 51 | - df (pl.DataFrame): Input DataFrame containing the 't_dat' column. 52 | 53 | Returns: 54 | - pl.Series: A series containing the month extracted from 't_dat'. 55 | """ 56 | return df["t_dat"].dt.month() 57 | 58 | 59 | def get_day_feature(df: pl.DataFrame) -> pl.Series: 60 | """ 61 | Extract the day from the 't_dat' column. 62 | 63 | Parameters: 64 | - df (pl.DataFrame): Input DataFrame containing the 't_dat' column. 65 | 66 | Returns: 67 | - pl.Series: A series containing the day extracted from 't_dat'. 68 | """ 69 | return df["t_dat"].dt.day() 70 | 71 | 72 | def get_day_of_week_feature(df: pl.DataFrame) -> pl.Series: 73 | """ 74 | Extract the day of the week from the 't_dat' column. 75 | 76 | Parameters: 77 | - df (pl.DataFrame): Input DataFrame containing the 't_dat' column. 78 | 79 | Returns: 80 | - pl.Series: A series containing the day of the week extracted from 't_dat'. 81 | """ 82 | return df["t_dat"].dt.weekday() 83 | 84 | 85 | def calculate_month_sin_cos(month: pl.Series) -> pl.DataFrame: 86 | """ 87 | Calculate sine and cosine values for the month to capture cyclical patterns. 88 | 89 | Parameters: 90 | - month (pl.Series): A series containing month values. 91 | 92 | Returns: 93 | - pl.DataFrame: A DataFrame with 'month_sin' and 'month_cos' columns. 94 | """ 95 | C = 2 * np.pi / 12 96 | return pl.DataFrame( 97 | { 98 | "month_sin": month.apply(lambda x: np.sin(x * C)), 99 | "month_cos": month.apply(lambda x: np.cos(x * C)), 100 | } 101 | ) 102 | 103 | 104 | def convert_t_dat_to_epoch_milliseconds(df: pl.DataFrame) -> pl.Series: 105 | """ 106 | Convert the 't_dat' column to epoch milliseconds. 107 | 108 | Parameters: 109 | - df (pl.DataFrame): Input DataFrame containing the 't_dat' column. 110 | 111 | Returns: 112 | - pl.Series: A series with 't_dat' converted to epoch milliseconds. 
113 | """ 114 | return df["t_dat"].cast(pl.Int64) // 1_000_000 115 | 116 | @udf(return_type = float, mode="pandas") 117 | def month_sin(month :pd.Series): 118 | """ 119 | On-demand transformation function that computes the sine of the month for cyclical feature encoding. 120 | 121 | Parameters: 122 | - month (pd.Series): A pandas series that contains the months 123 | 124 | Returns: 125 | - pd.Series: The sine of months 126 | """ 127 | return np.sin(month * (2 * np.pi / 12)) 128 | 129 | @udf(return_type = float, mode="pandas") 130 | def month_cos(month :pd.Series): 131 | """ 132 | On-demand transformation function that computes the cosine of the month for cyclical feature encoding. 133 | 134 | Parameters: 135 | - month (pd.Series): A pandas series that contains the months 136 | 137 | Returns: 138 | - pd.Series: The cosine of months 139 | """ 140 | return np.cos(month * (2 * np.pi / 12)) 141 | 142 | 143 | def compute_features_transactions(df: pl.DataFrame) -> pl.DataFrame: 144 | """ 145 | Prepare transaction data by performing several data transformations. 146 | 147 | This function does the following: 148 | 1. Converts 'article_id' to string type. 149 | 2. Extracts year, month, day, and day of week from 't_dat'. 150 | 3. Converts 't_dat' to epoch milliseconds. 151 | Note that the cyclical 'month_sin' and 'month_cos' features are not computed here; 152 | they are provided by the on-demand transformation functions defined above. 153 | 154 | Parameters: 155 | - df (pl.DataFrame): Input DataFrame containing transaction data. 156 | 157 | Returns: 158 | - pl.DataFrame: Processed DataFrame with transformed transaction data. 159 | """ 160 | 161 | return ( 162 | df.with_columns( 163 | [ 164 | pl.col("article_id").cast(pl.Utf8).alias("article_id"), 165 | ] 166 | ) 167 | .with_columns( 168 | [ 169 | pl.col("t_dat").dt.year().alias("year"), 170 | pl.col("t_dat").dt.month().alias("month"), 171 | pl.col("t_dat").dt.day().alias("day"), 172 | pl.col("t_dat").dt.weekday().alias("day_of_week"), 173 | ] 174 | ) 175 | .with_columns([(pl.col("t_dat").cast(pl.Int64) // 1_000_000).alias("t_dat")]) 176 | ) 177 | -------------------------------------------------------------------------------- /recsys/hopsworks_integration/__init__.py: -------------------------------------------------------------------------------- 1 | from .
import feature_store, ranking_serving, two_tower_serving, llm_ranking_serving 2 | from .feature_store import get_feature_store 3 | 4 | __all__ = ["feature_store", "get_feature_store", "ranking_serving", "two_tower_serving", "llm_ranking_serving"] 5 | -------------------------------------------------------------------------------- /recsys/hopsworks_integration/constants.py: -------------------------------------------------------------------------------- 1 | from hsfs.feature import Feature 2 | 3 | ### Post ingestion format.### 4 | 5 | customer_feature_descriptions = [ 6 | {"name": "customer_id", "description": "Unique identifier for each customer."}, 7 | { 8 | "name": "club_member_status", 9 | "description": "Membership status of the customer in the club.", 10 | }, 11 | {"name": "age", "description": "Age of the customer."}, 12 | { 13 | "name": "postal_code", 14 | "description": "Postal code associated with the customer's address.", 15 | }, 16 | {"name": "age_group", "description": "Categorized age group of the customer."}, 17 | ] 18 | 19 | transactions_feature_descriptions = [ 20 | {"name": "t_dat", "description": "Timestamp of the data record."}, 21 | {"name": "customer_id", "description": "Unique identifier for each customer."}, 22 | {"name": "article_id", "description": "Identifier for the purchased article."}, 23 | {"name": "price", "description": "Price of the purchased article."}, 24 | {"name": "sales_channel_id", "description": "Identifier for the sales channel."}, 25 | {"name": "year", "description": "Year of the transaction."}, 26 | {"name": "month", "description": "Month of the transaction."}, 27 | {"name": "day", "description": "Day of the transaction."}, 28 | {"name": "day_of_week", "description": "Day of the week of the transaction."}, 29 | { 30 | "name": "month_sin", 31 | "description": "Sine of the month used for seasonal patterns.", 32 | }, 33 | { 34 | "name": "month_cos", 35 | "description": "Cosine of the month used for seasonal patterns.", 36 | }, 37 | ] 38 | 39 | interactions_feature_descriptions = [ 40 | {"name": "t_dat", "description": "Timestamp of the interaction."}, 41 | {"name": "customer_id", "description": "Unique identifier for each customer."}, 42 | { 43 | "name": "article_id", 44 | "description": "Identifier for the article that was interacted with.", 45 | }, 46 | { 47 | "name": "interaction_score", 48 | "description": "Type of interaction: 0 = ignore, 1 = click, 2 = purchase.", 49 | }, 50 | { 51 | "name": "prev_article_id", 52 | "description": "Previous article that the customer interacted with, useful for sequential recommendation patterns.", 53 | }, 54 | ] 55 | 56 | ranking_feature_descriptions = [ 57 | {"name": "customer_id", "description": "Unique identifier for each customer."}, 58 | {"name": "article_id", "description": "Identifier for the purchased article."}, 59 | {"name": "age", "description": "Age of the customer."}, 60 | {"name": "product_type_name", "description": "Name of the product type."}, 61 | {"name": "product_group_name", "description": "Name of the product group."}, 62 | { 63 | "name": "graphical_appearance_name", 64 | "description": "Name of the graphical appearance.", 65 | }, 66 | {"name": "colour_group_name", "description": "Name of the colour group."}, 67 | { 68 | "name": "perceived_colour_value_name", 69 | "description": "Name of the perceived colour value.", 70 | }, 71 | { 72 | "name": "perceived_colour_master_name", 73 | "description": "Name of the perceived colour master.", 74 | }, 75 | {"name": "department_name", "description": 
"Name of the department."}, 76 | {"name": "index_name", "description": "Name of the index."}, 77 | {"name": "index_group_name", "description": "Name of the index group."}, 78 | {"name": "section_name", "description": "Name of the section."}, 79 | {"name": "garment_group_name", "description": "Name of the garment group."}, 80 | { 81 | "name": "label", 82 | "description": "Label indicating whether the article was purchased (1) or not (0).", 83 | }, 84 | ] 85 | 86 | ### Pre ingestion format. ### 87 | 88 | article_feature_description = [ 89 | Feature( 90 | name="article_id", type="string", description="Identifier for the article." 91 | ), 92 | Feature( 93 | name="product_code", 94 | type="bigint", 95 | description="Code associated with the product.", 96 | ), 97 | Feature(name="prod_name", type="string", description="Name of the product."), 98 | Feature( 99 | name="product_type_no", 100 | type="bigint", 101 | description="Number associated with the product type.", 102 | ), 103 | Feature( 104 | name="product_type_name", type="string", description="Name of the product type." 105 | ), 106 | Feature( 107 | name="product_group_name", 108 | type="string", 109 | description="Name of the product group.", 110 | ), 111 | Feature( 112 | name="graphical_appearance_no", 113 | type="bigint", 114 | description="Number associated with graphical appearance.", 115 | ), 116 | Feature( 117 | name="graphical_appearance_name", 118 | type="string", 119 | description="Name of the graphical appearance.", 120 | ), 121 | Feature( 122 | name="colour_group_code", 123 | type="bigint", 124 | description="Code associated with the colour group.", 125 | ), 126 | Feature( 127 | name="colour_group_name", type="string", description="Name of the colour group." 128 | ), 129 | Feature( 130 | name="perceived_colour_value_id", 131 | type="bigint", 132 | description="ID associated with perceived colour value.", 133 | ), 134 | Feature( 135 | name="perceived_colour_value_name", 136 | type="string", 137 | description="Name of the perceived colour value.", 138 | ), 139 | Feature( 140 | name="perceived_colour_master_id", 141 | type="bigint", 142 | description="ID associated with perceived colour master.", 143 | ), 144 | Feature( 145 | name="perceived_colour_master_name", 146 | type="string", 147 | description="Name of the perceived colour master.", 148 | ), 149 | Feature( 150 | name="department_no", 151 | type="bigint", 152 | description="Number associated with the department.", 153 | ), 154 | Feature( 155 | name="department_name", type="string", description="Name of the department." 156 | ), 157 | Feature( 158 | name="index_code", type="string", description="Code associated with the index." 159 | ), 160 | Feature(name="index_name", type="string", description="Name of the index."), 161 | Feature( 162 | name="index_group_no", 163 | type="bigint", 164 | description="Number associated with the index group.", 165 | ), 166 | Feature( 167 | name="index_group_name", type="string", description="Name of the index group." 
168 | ), 169 | Feature( 170 | name="section_no", 171 | type="bigint", 172 | description="Number associated with the section.", 173 | ), 174 | Feature(name="section_name", type="string", description="Name of the section."), 175 | Feature( 176 | name="garment_group_no", 177 | type="bigint", 178 | description="Number associated with the garment group.", 179 | ), 180 | Feature( 181 | name="garment_group_name", 182 | type="string", 183 | description="Name of the garment group.", 184 | ), 185 | Feature( 186 | name="prod_name_length", 187 | type="bigint", 188 | description="Length of the product name.", 189 | ), 190 | Feature( 191 | name="article_description", 192 | type="string", 193 | online_type="VARCHAR(5800)", 194 | description="Description of the article.", 195 | ), 196 | Feature( 197 | name="embeddings", 198 | type="array", 199 | description="Vector embeddings of the article description.", 200 | ), 201 | Feature(name="image_url", type="string", description="URL of the product image."), 202 | ] 203 | -------------------------------------------------------------------------------- /recsys/hopsworks_integration/feature_store.py: -------------------------------------------------------------------------------- 1 | import hopsworks 2 | import pandas as pd 3 | from hsfs import embedding 4 | from loguru import logger 5 | 6 | from recsys.config import settings 7 | from recsys.hopsworks_integration import constants 8 | from recsys.features.transactions import month_cos, month_sin 9 | 10 | 11 | def get_feature_store(): 12 | if settings.HOPSWORKS_API_KEY: 13 | logger.info("Logging in to Hopsworks using the HOPSWORKS_API_KEY env var.") 14 | project = hopsworks.login( 15 | api_key_value=settings.HOPSWORKS_API_KEY.get_secret_value() 16 | ) 17 | else: 18 | logger.info("Logging in to Hopsworks using the cached API key.") 19 | project = hopsworks.login() 20 | return project, project.get_feature_store() 21 | 22 | 23 | ######################## 24 | #### Feature Groups #### 25 | ######################## 26 | 27 | 28 | def create_customers_feature_group(fs, df: pd.DataFrame, online_enabled: bool = True): 29 | customers_fg = fs.get_or_create_feature_group( 30 | name="customers", 31 | description="Customers data including age and postal code", 32 | version=1, 33 | primary_key=["customer_id"], 34 | online_enabled=online_enabled, 35 | ) 36 | customers_fg.insert(df, wait=True) 37 | 38 | for desc in constants.customer_feature_descriptions: 39 | customers_fg.update_feature_description(desc["name"], desc["description"]) 40 | 41 | return customers_fg 42 | 43 | 44 | def create_articles_feature_group( 45 | fs, 46 | df: pd.DataFrame, 47 | articles_description_embedding_dim: int, 48 | online_enabled: bool = True, 49 | ): 50 | # Create the Embedding Index for the articles description embedding.
51 | emb = embedding.EmbeddingIndex() 52 | emb.add_embedding("embeddings", articles_description_embedding_dim) 53 | 54 | articles_fg = fs.get_or_create_feature_group( 55 | name="articles", 56 | version=1, 57 | description="Fashion items data including type of item, visual description and category", 58 | primary_key=["article_id"], 59 | online_enabled=online_enabled, 60 | features=constants.article_feature_description, 61 | embedding_index=emb, 62 | ) 63 | articles_fg.insert(df, wait=True) 64 | 65 | return articles_fg 66 | 67 | 68 | def create_transactions_feature_group( 69 | fs, df: pd.DataFrame, online_enabled: bool = True 70 | ): 71 | trans_fg = fs.get_or_create_feature_group( 72 | name="transactions", 73 | version=1, 74 | description="Transactions data including customer, item, price, sales channel and transaction date", 75 | primary_key=["customer_id", "article_id"], 76 | online_enabled=online_enabled, 77 | transformation_functions=[month_sin, month_cos], 78 | event_time="t_dat", 79 | ) 80 | trans_fg.insert(df, wait=True) 81 | 82 | for desc in constants.transactions_feature_descriptions: 83 | trans_fg.update_feature_description(desc["name"], desc["description"]) 84 | 85 | return trans_fg 86 | 87 | 88 | def create_interactions_feature_group( 89 | fs, df: pd.DataFrame, online_enabled: bool = True 90 | ): 91 | interactions_fg = fs.get_or_create_feature_group( 92 | name="interactions", 93 | version=1, 94 | description="Customer interactions with articles including purchases, clicks, and ignores. Used for building recommendation systems and analyzing user behavior.", 95 | primary_key=["customer_id", "article_id"], 96 | online_enabled=online_enabled, 97 | event_time="t_dat", 98 | ) 99 | 100 | interactions_fg.insert( 101 | df, 102 | wait=True, 103 | ) 104 | 105 | for desc in constants.interactions_feature_descriptions: 106 | interactions_fg.update_feature_description(desc["name"], desc["description"]) 107 | 108 | return interactions_fg 109 | 110 | 111 | def create_ranking_feature_group( 112 | fs, df: pd.DataFrame, parents: list, online_enabled: bool = True 113 | ): 114 | rank_fg = fs.get_or_create_feature_group( 115 | name="ranking", 116 | version=1, 117 | description="Derived feature group for ranking", 118 | primary_key=["customer_id", "article_id"], 119 | parents=parents, 120 | online_enabled=online_enabled, 121 | ) 122 | rank_fg.insert(df, wait=True) 123 | 124 | for desc in constants.ranking_feature_descriptions: 125 | rank_fg.update_feature_description(desc["name"], desc["description"]) 126 | 127 | return rank_fg 128 | 129 | 130 | def create_candidate_embeddings_feature_group( 131 | fs, df: pd.DataFrame, online_enabled: bool = True 132 | ): 133 | embedding_index = embedding.EmbeddingIndex() 134 | 135 | embedding_index.add_embedding( 136 | "embeddings", # Embeddings feature name 137 | settings.TWO_TOWER_MODEL_EMBEDDING_SIZE, 138 | ) 139 | 140 | candidate_embeddings_fg = fs.get_or_create_feature_group( 141 | name="candidate_embeddings", 142 | embedding_index=embedding_index, # Specify the Embedding Index 143 | primary_key=["article_id"], 144 | version=1, 145 | description="Embeddings for each article.", 146 | online_enabled=online_enabled, 147 | ) 148 | candidate_embeddings_fg.insert(df, wait=True) 149 | 150 | return candidate_embeddings_fg 151 | 152 | 153 | ######################### 154 | ##### Feature Views ##### 155 | ######################### 156 | 157 | 158 | def create_retrieval_feature_view(fs): 159 | trans_fg = fs.get_feature_group(name="transactions", version=1) 160 | customers_fg 
= fs.get_feature_group(name="customers", version=1) 161 | articles_fg = fs.get_feature_group(name="articles", version=1) 162 | 163 | # You'll need to join these three data sources to make the data compatible 164 | # with our retrieval model. Recall that each row in the `transactions` feature group 165 | # records which customer bought which item. 166 | # You'll join this feature group with the `customers` and `articles` feature groups 167 | # to inject customer and item features into each row. 168 | selected_features = ( 169 | trans_fg.select( 170 | ["customer_id", "article_id", "t_dat", "price", "month_sin", "month_cos"] 171 | ) 172 | .join( 173 | customers_fg.select(["age", "club_member_status", "age_group"]), 174 | on="customer_id", 175 | ) 176 | .join( 177 | articles_fg.select(["garment_group_name", "index_group_name"]), 178 | on="article_id", 179 | ) 180 | ) 181 | 182 | feature_view = fs.get_or_create_feature_view( 183 | name="retrieval", 184 | query=selected_features, 185 | version=1, 186 | ) 187 | 188 | return feature_view 189 | 190 | 191 | def create_ranking_feature_views(fs): 192 | customers_fg = fs.get_feature_group( 193 | name="customers", 194 | version=1, 195 | ) 196 | 197 | articles_fg = fs.get_feature_group( 198 | name="articles", 199 | version=1, 200 | ) 201 | 202 | rank_fg = fs.get_feature_group( 203 | name="ranking", 204 | version=1, 205 | ) 206 | 207 | trans_fg = fs.get_feature_group( 208 | name="transactions", 209 | version=1) 210 | 211 | selected_features_customers = customers_fg.select_all() 212 | fs.get_or_create_feature_view( 213 | name="customers", 214 | query=selected_features_customers, 215 | version=1, 216 | ) 217 | 218 | selected_features_articles = articles_fg.select_except(["embeddings"]) 219 | fs.get_or_create_feature_view( 220 | name="articles", 221 | query=selected_features_articles, 222 | version=1, 223 | ) 224 | 225 | # Select features 226 | selected_features_ranking = rank_fg.select_except(["customer_id", "article_id"]).join(trans_fg.select(["month_sin", "month_cos"])) 227 | feature_view_ranking = fs.get_or_create_feature_view( 228 | name="ranking", 229 | query=selected_features_ranking, 230 | labels=["label"], 231 | version=1, 232 | ) 233 | 234 | return feature_view_ranking 235 | 236 | 237 | def create_candidate_embeddings_feature_view(fs, fg): 238 | feature_view = fs.get_or_create_feature_view( 239 | name="candidate_embeddings", 240 | version=1, 241 | description="Embeddings of each article", 242 | query=fg.select(["article_id"]), 243 | ) 244 | 245 | return feature_view 246 | -------------------------------------------------------------------------------- /recsys/hopsworks_integration/llm_ranker/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.2.6 2 | langchain-openai==0.1.14 3 | -------------------------------------------------------------------------------- /recsys/hopsworks_integration/llm_ranking_serving.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import hopsworks 4 | from hsml.transformer import Transformer 5 | 6 | from recsys.config import settings 7 | 8 | 9 | class HopsworksLLMRankingModel: 10 | deployment_name = "llmranking" 11 | 12 | @classmethod 13 | def register(cls, mr): 14 | local_model_path = str( 15 | settings.RECSYS_DIR / "inference" / "llm_ranking_predictor.py" 16 | ) 17 | ranking_model = mr.python.create_model( 18 | name="llm_ranking_model", 19 | description="LLM Ranking model that scores 
item candidates", 20 | ) 21 | ranking_model.save(local_model_path) 22 | 23 | @classmethod 24 | def deploy(cls): 25 | # Prepare secrets used in the deployment 26 | cls._prepare_secrets() 27 | 28 | project = hopsworks.login() 29 | cls._prepare_environment(project) 30 | mr = project.get_model_registry() 31 | dataset_api = project.get_dataset_api() 32 | 33 | ranking_model = mr.get_model(name="llm_ranking_model") 34 | # Copy transformer file into Hopsworks File System 35 | 36 | uploaded_file_path = dataset_api.upload( 37 | str( 38 | settings.RECSYS_DIR / "inference" / "ranking_transformer.py" 39 | ), # File name to be uploaded 40 | "Resources", # Destination directory in Hopsworks File System 41 | overwrite=True, # Overwrite the file if it already exists 42 | ) 43 | # Construct the path to the uploaded transformer script 44 | transformer_script_path = os.path.join( 45 | "/Projects", # Root directory for projects in Hopsworks 46 | project.name, # Name of the current project 47 | uploaded_file_path, # Path to the uploaded file within the project 48 | ) 49 | 50 | # Upload llm predictor file to Hopsworks 51 | uploaded_file_path = dataset_api.upload( 52 | str(settings.RECSYS_DIR / "inference" / "llm_ranking_predictor.py"), 53 | "Resources", 54 | overwrite=True, 55 | ) 56 | 57 | # Construct the path to the uploaded script 58 | predictor_script_path = os.path.join( 59 | "/Projects", 60 | project.name, 61 | uploaded_file_path, 62 | ) 63 | 64 | ranking_transformer = Transformer( 65 | script_file=transformer_script_path, 66 | resources={"num_instances": 0}, 67 | ) 68 | 69 | # Deploy ranking model 70 | ranking_deployment = ranking_model.deploy( 71 | name=cls.deployment_name, 72 | description="Deployment that search for item candidates and scores them based on customer metadata using " 73 | "GPT 4", 74 | script_file=predictor_script_path, 75 | resources={"num_instances": 0}, 76 | transformer=ranking_transformer, 77 | environment=settings.CUSTOM_HOPSWORKS_INFERENCE_ENV, 78 | ) 79 | 80 | return ranking_deployment 81 | 82 | @classmethod 83 | def _prepare_environment(cls, project): 84 | # Upload requirements file to Hopsworks 85 | dataset_api = project.get_dataset_api() 86 | 87 | requirements_path = dataset_api.upload( 88 | str( 89 | settings.RECSYS_DIR 90 | / "hopsworks_integration" 91 | / "llm_ranker" 92 | / "requirements.txt" 93 | ), 94 | "Resources", 95 | overwrite=True, 96 | ) 97 | 98 | # Check if custom env exists, if not create it 99 | env_api = project.get_environment_api() 100 | envs = env_api.get_environments() 101 | existing_envs = [env.name for env in envs] 102 | if settings.CUSTOM_HOPSWORKS_INFERENCE_ENV in existing_envs: 103 | env = env_api.get_environment(settings.CUSTOM_HOPSWORKS_INFERENCE_ENV) 104 | else: 105 | env = env_api.create_environment( 106 | name=settings.CUSTOM_HOPSWORKS_INFERENCE_ENV, 107 | base_environment_name="pandas-inference-pipeline", 108 | ) 109 | 110 | # Install the extra requirements in the Python environment on Hopsworks 111 | env.install_requirements(requirements_path) 112 | 113 | @classmethod 114 | def _prepare_secrets(cls): 115 | if not settings.OPENAI_API_KEY: 116 | raise ValueError( 117 | "Missing required secret: 'OPENAI_API_KEY'. Please ensure it is set in the .env file or config.py " 118 | "settings." 
119 | ) 120 | 121 | project = hopsworks.login( 122 | hostname_verification=False, 123 | api_key_value=settings.HOPSWORKS_API_KEY.get_secret_value(), 124 | ) 125 | secrets_api = hopsworks.get_secrets_api() 126 | secrets = secrets_api.get_secrets() 127 | existing_secret_keys = [secret.name for secret in secrets] 128 | if "OPENAI_API_KEY" in existing_secret_keys: 129 | secrets_api._delete(name="OPENAI_API_KEY") 130 | 131 | secrets_api.create_secret( 132 | "OPENAI_API_KEY", 133 | settings.OPENAI_API_KEY.get_secret_value(), 134 | project=project.name, 135 | ) 136 | -------------------------------------------------------------------------------- /recsys/hopsworks_integration/ranking_serving.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import joblib 4 | from hsml.transformer import Transformer 5 | 6 | from recsys.config import settings 7 | 8 | 9 | class HopsworksRankingModel: 10 | deployment_name = "ranking" 11 | 12 | def __init__(self, model): 13 | self._model = model 14 | 15 | def save_to_local(self, output_path: str = "ranking_model.pkl"): 16 | joblib.dump(self._model, output_path) 17 | 18 | return output_path 19 | 20 | def register(self, mr, feature_view, X_train, metrics): 21 | local_model_path = self.save_to_local() 22 | 23 | input_example = X_train.sample().to_dict("records") 24 | 25 | ranking_model = mr.python.create_model( 26 | name="ranking_model", 27 | description="Ranking model that scores item candidates", 28 | metrics=metrics, 29 | input_example=input_example, 30 | feature_view=feature_view, 31 | ) 32 | ranking_model.save(local_model_path) 33 | 34 | @classmethod 35 | def deploy(cls, project): 36 | mr = project.get_model_registry() 37 | dataset_api = project.get_dataset_api() 38 | 39 | ranking_model = mr.get_best_model( 40 | name="ranking_model", 41 | metric="fscore", 42 | direction="max", 43 | ) 44 | 45 | # Copy transformer file into Hopsworks File System 46 | uploaded_file_path = dataset_api.upload( 47 | str( 48 | settings.RECSYS_DIR / "inference" / "ranking_transformer.py" 49 | ), # File name to be uploaded 50 | "Resources", # Destination directory in Hopsworks File System 51 | overwrite=True, # Overwrite the file if it already exists 52 | ) 53 | # Construct the path to the uploaded transformer script 54 | transformer_script_path = os.path.join( 55 | "/Projects", # Root directory for projects in Hopsworks 56 | project.name, # Name of the current project 57 | uploaded_file_path, # Path to the uploaded file within the project 58 | ) 59 | 60 | # Upload predictor file to Hopsworks 61 | uploaded_file_path = dataset_api.upload( 62 | str(settings.RECSYS_DIR / "inference" / "ranking_predictor.py"), 63 | "Resources", 64 | overwrite=True, 65 | ) 66 | 67 | # Construct the path to the uploaded script 68 | predictor_script_path = os.path.join( 69 | "/Projects", 70 | project.name, 71 | uploaded_file_path, 72 | ) 73 | 74 | ranking_transformer = Transformer( 75 | script_file=transformer_script_path, 76 | resources={"num_instances": 0}, 77 | ) 78 | 79 | # Deploy ranking model 80 | ranking_deployment = ranking_model.deploy( 81 | name=cls.deployment_name, 82 | description="Deployment that search for item candidates and scores them based on customer metadata", 83 | script_file=predictor_script_path, 84 | resources={"num_instances": 0}, 85 | transformer=ranking_transformer, 86 | ) 87 | 88 | return ranking_deployment 89 | -------------------------------------------------------------------------------- 
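For context, a hedged sketch of how `HopsworksRankingModel` is typically driven (not part of the repository): `register` is called from the training pipeline, after which `deploy` creates the online deployment. The snippet assumes a `ranking_model` with an `fscore` metric has already been registered.

```python
# Illustrative sketch — assumes the training pipeline already registered "ranking_model".
from recsys.hopsworks_integration import get_feature_store
from recsys.hopsworks_integration.ranking_serving import HopsworksRankingModel

project, _ = get_feature_store()

# Uploads the transformer/predictor scripts and creates the "ranking" deployment
# in Hopsworks Model Serving.
ranking_deployment = HopsworksRankingModel.deploy(project)

# Start serving (the deployment can also be started from the Hopsworks UI).
ranking_deployment.start()
```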
/recsys/hopsworks_integration/two_tower_serving.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Literal 3 | 4 | import hopsworks 5 | import tensorflow as tf 6 | from hsml.transformer import Transformer 7 | from loguru import logger 8 | 9 | from recsys.config import settings 10 | from recsys.training.two_tower import ItemTower, QueryTower 11 | 12 | 13 | class HopsworksQueryModel: 14 | deployment_name = "query" 15 | 16 | def __init__(self, model: QueryTower) -> None: 17 | self.model = model 18 | 19 | def save_to_local(self, output_path: str = "query_model") -> str: 20 | # Define the input specifications for the instances 21 | instances_spec = { 22 | "customer_id": tf.TensorSpec( 23 | shape=(None,), dtype=tf.string, name="customer_id" 24 | ), # Specification for customer IDs 25 | "month_sin": tf.TensorSpec( 26 | shape=(None,), dtype=tf.float64, name="month_sin" 27 | ), # Specification for sine of month 28 | "month_cos": tf.TensorSpec( 29 | shape=(None,), dtype=tf.float64, name="month_cos" 30 | ), # Specification for cosine of month 31 | "age": tf.TensorSpec( 32 | shape=(None,), dtype=tf.float64, name="age" 33 | ), # Specification for age 34 | } 35 | 36 | query_module_module = QueryModelModule(model=self.model) 37 | # Get the concrete function for the query_model's compute_emb function using the specified input signatures 38 | inference_signatures = ( 39 | query_module_module.compute_embedding.get_concrete_function(instances_spec) 40 | ) 41 | 42 | # Save the query_model along with the concrete function signatures 43 | tf.saved_model.save( 44 | self.model, # The model to save 45 | output_path, # Path to save the model 46 | signatures=inference_signatures, # Concrete function signatures to include 47 | ) 48 | 49 | return output_path 50 | 51 | def register(self, mr, feature_view, query_df) -> None: 52 | local_model_path = self.save_to_local() 53 | 54 | # Sample a query example from the query DataFrame 55 | query_example = query_df.sample().to_dict("records") 56 | 57 | # Create a tensorflow model for the query_model in the Model Registry 58 | mr_query_model = mr.tensorflow.create_model( 59 | name="query_model", # Name of the model 60 | description="Model that generates query embeddings from user and transaction features", # Description of the model 61 | input_example=query_example, # Example input for the model 62 | feature_view=feature_view, 63 | ) 64 | 65 | # Save the query_model to the Model Registry 66 | mr_query_model.save(local_model_path) # Path to save the model 67 | 68 | @classmethod 69 | def deploy(cls, ranking_model_type: Literal["ranking", "llmranking"] = "ranking"): 70 | # Prepare secrets used in the deployment 71 | project = hopsworks.login() 72 | cls._prepare_secrets(ranking_model_type) 73 | 74 | mr = project.get_model_registry() 75 | dataset_api = project.get_dataset_api() 76 | 77 | # Retrieve the 'query_model' from the Model Registry 78 | query_model = mr.get_model( 79 | name="query_model", 80 | version=1, 81 | ) 82 | 83 | # Copy transformer file into Hopsworks File System 84 | uploaded_file_path = dataset_api.upload( 85 | str(settings.RECSYS_DIR / "inference" / "query_transformer.py"), 86 | "Models", 87 | overwrite=True, 88 | ) 89 | 90 | # Construct the path to the uploaded script 91 | transformer_script_path = os.path.join( 92 | "/Projects", 93 | project.name, 94 | uploaded_file_path, 95 | ) 96 | 97 | query_model_transformer = Transformer( 98 | script_file=transformer_script_path, 99 | 
resources={"num_instances": 0}, 100 | ) 101 | 102 | # Deploy the query model 103 | query_model_deployment = query_model.deploy( 104 | name=cls.deployment_name, 105 | description="Deployment that generates query embeddings from customer and item features using the query model", 106 | resources={"num_instances": 0}, 107 | transformer=query_model_transformer, 108 | ) 109 | 110 | return query_model_deployment 111 | 112 | @classmethod 113 | def _prepare_secrets(cls, ranking_model_type: Literal["ranking", "llmranking"]): 114 | project = hopsworks.login( 115 | hostname_verification=False, 116 | api_key_value=settings.HOPSWORKS_API_KEY.get_secret_value(), 117 | ) 118 | secrets_api = hopsworks.get_secrets_api() 119 | secrets = secrets_api.get_secrets() 120 | existing_secret_keys = [secret.name for secret in secrets] 121 | if "RANKING_MODEL_TYPE" in existing_secret_keys: 122 | secrets_api._delete(name="RANKING_MODEL_TYPE") 123 | 124 | secrets_api.create_secret( 125 | "RANKING_MODEL_TYPE", 126 | ranking_model_type, 127 | project=project.name, 128 | ) 129 | 130 | 131 | class QueryModelModule(tf.Module): 132 | def __init__(self, model: QueryTower) -> None: 133 | self.model = model 134 | 135 | @tf.function() 136 | def compute_embedding(self, instances): 137 | query_embedding = self.model(instances) 138 | 139 | return { 140 | "customer_id": instances["customer_id"], 141 | "month_sin": instances["month_sin"], 142 | "month_cos": instances["month_cos"], 143 | "query_emb": query_embedding, 144 | } 145 | 146 | 147 | class HopsworksCandidateModel: 148 | def __init__(self, model: ItemTower): 149 | self.model = model 150 | 151 | def save_to_local(self, output_path: str = "candidate_model") -> str: 152 | tf.saved_model.save( 153 | self.model, # The model to save 154 | output_path, # Path to save the model 155 | ) 156 | 157 | return output_path 158 | 159 | def register(self, mr, feature_view, item_df): 160 | local_model_path = self.save_to_local() 161 | 162 | # Sample a candidate example from the item DataFrame 163 | candidate_example = item_df.sample().to_dict("records") 164 | 165 | # Create a tensorflow model for the candidate_model in the Model Registry 166 | mr_candidate_model = mr.tensorflow.create_model( 167 | name="candidate_model", # Name of the model 168 | description="Model that generates candidate embeddings from item features", # Description of the model 169 | input_example=candidate_example, # Example input for the model 170 | feature_view=feature_view, 171 | ) 172 | 173 | # Save the candidate_model to the Model Registry 174 | mr_candidate_model.save(local_model_path) # Path to save the model 175 | 176 | @classmethod 177 | def download(cls, mr) -> tuple[ItemTower, dict]: 178 | models = mr.get_models(name="candidate_model") 179 | if len(models) == 0: 180 | raise RuntimeError( 181 | "No 'candidate_model' found in Hopsworks model registry." 182 | ) 183 | latest_model = max(models, key=lambda m: m.version) 184 | 185 | logger.info(f"Downloading 'candidate_model' version {latest_model.version}") 186 | model_path = latest_model.download() 187 | 188 | candidate_model = tf.saved_model.load(model_path) 189 | 190 | candidate_features = [ 191 | *candidate_model.signatures["serving_default"] 192 | .structured_input_signature[-1] 193 | .keys() 194 | ] 195 | return candidate_model, candidate_features 196 | -------------------------------------------------------------------------------- /recsys/inference/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import ( 2 | query_transformer, 3 | ranking_transformer, 4 | ranking_predictor, 5 | llm_ranking_predictor 6 | ) 7 | 8 | __all__ = [ 9 | "query_transformer", 10 | "ranking_transformer", 11 | "ranking_predictor", 12 | "llm_ranking_predictor" 13 | ] 14 | -------------------------------------------------------------------------------- /recsys/inference/llm_ranking_predictor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import hopsworks 4 | from langchain import PromptTemplate, LLMChain 5 | from langchain_core.output_parsers import BaseOutputParser 6 | from langchain_openai import ChatOpenAI 7 | 8 | 9 | class ScoreOutputParser(BaseOutputParser[float]): 10 | def parse(self, output) -> float: 11 | text = output['text'] 12 | # Extract the numeric part after "Probability:" 13 | if "Probability:" not in text: 14 | raise ValueError("Text does not contain 'Probability:' label.") 15 | probability_str = text.split("Probability:")[1].strip() 16 | probability = float(probability_str) 17 | 18 | # Ensure the probability is in the valid range [0, 1] 19 | if not (0.0 <= probability <= 1.0): 20 | raise ValueError("Probability value must be between 0 and 1.") 21 | 22 | return probability 23 | 24 | PROMPT_TEMPLATE = """ 25 | You are a helpful assistant specialized in predicting customer behavior. Your task is to analyze the features of a product and predict the probability of it being purchased by a customer. 26 | 27 | ### Instructions: 28 | 1. Use the provided features of the product to make your prediction. 29 | 2. Consider the following numeric and categorical features: 30 | - Numeric features: These are quantitative attributes, such as numerical identifiers or measurements. 31 | - Categorical features: These describe qualitative aspects, like product category, color, and material. 32 | 3. Your response should only include the probability of purchase for the positive class (e.g., likelihood of being purchased), as a value between 0 and 1. 33 | 34 | ### Product and User Features: 35 | Numeric features: 36 | - Age: {age} 37 | - Month Sin: {month_sin} 38 | - Month Cos: {month_cos} 39 | 40 | Categorical features: 41 | - Product Type: {product_type_name} 42 | - Product Group: {product_group_name} 43 | - Graphical Appearance: {graphical_appearance_name} 44 | - Colour Group: {colour_group_name} 45 | - Perceived Colour Value: {perceived_colour_value_name} 46 | - Perceived Colour Master Value: {perceived_colour_master_name} 47 | - Department Name: {department_name} 48 | - Index Name: {index_name} 49 | - Department: {index_group_name} 50 | - Sub-Department: {section_name} 51 | - Group: {garment_group_name} 52 | 53 | ### Your Task: 54 | Based on the features provided, predict the probability that the customer will purchase this product to 4-decimals precision. 
Provide the output in the following format: 55 | Probability: 56 | """ 57 | 58 | 59 | class Predict(object): 60 | def __init__(self): 61 | self.input_variables = ["age", "month_sin", "month_cos", "product_type_name", "product_group_name", 62 | "graphical_appearance_name", "colour_group_name", "perceived_colour_value_name", 63 | "perceived_colour_master_name", "department_name", "index_name", "index_group_name", 64 | "section_name", "garment_group_name"] 65 | self._retrieve_secrets() 66 | self.llm = self._build_lang_chain() 67 | self.parser = ScoreOutputParser() 68 | 69 | def _retrieve_secrets(self): 70 | project = hopsworks.login() 71 | secrets_api = hopsworks.get_secrets_api() 72 | self.openai_api_key = secrets_api.get_secret("OPENAI_API_KEY").value 73 | 74 | def predict(self, inputs): 75 | logging.info(f"✅ Inputs: {inputs}") 76 | 77 | # Extract ranking features and article IDs from the inputs limit to 20 candidates because otherwise the 78 | # inference time is over 60 seconds and the predict endpoint closes the socket 79 | features = inputs[0].pop("ranking_features")[:20] 80 | article_ids = inputs[0].pop("article_ids")[:20] 81 | 82 | # Preprocess features for OpenAI model input 83 | preprocessed_features_candidates = self._preprocess_features(features) 84 | logging.info(f"predict -> Preprocessed features: {preprocessed_features_candidates}") 85 | logging.info(f"Article IDs: {article_ids}") 86 | 87 | logging.info(f"🦅 Predicting with OpenAI model for {len(features)} instances") 88 | 89 | scores = [] 90 | for candidate in preprocessed_features_candidates: 91 | try: 92 | text = self.llm.invoke(candidate) 93 | score = self.parser.parse(text) 94 | except Exception as exception: 95 | logging.error(exception) 96 | # Add minimum default score in case of error 97 | score = 0 98 | scores.append(score) 99 | 100 | logging.info(f"LLM Scores: {scores}") 101 | 102 | return { 103 | "scores": scores, 104 | "article_ids": article_ids, 105 | } 106 | 107 | def _preprocess_features(self, features): 108 | """ 109 | Convert ranking features into a natural language description 110 | suitable for OpenAI model input. 
111 | """ 112 | preprocessed = [] 113 | for feature_set in features: 114 | # Example: Create a descriptive string for each feature set 115 | query_parameters = {} 116 | for key, value in zip(self.input_variables, feature_set): 117 | query_parameters[key] = value 118 | preprocessed.append(query_parameters) 119 | return preprocessed 120 | 121 | def _build_lang_chain(self): 122 | model = ChatOpenAI( 123 | model_name='gpt-4o-mini-2024-07-18', 124 | temperature=0.7, 125 | openai_api_key=self.openai_api_key, 126 | ) 127 | prompt = PromptTemplate( 128 | input_variables=self.input_variables, 129 | template=PROMPT_TEMPLATE, 130 | ) 131 | langchain = LLMChain( 132 | llm=model, 133 | prompt=prompt, 134 | verbose=True 135 | ) 136 | return langchain 137 | -------------------------------------------------------------------------------- /recsys/inference/query_transformer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | 4 | import hopsworks 5 | import nest_asyncio 6 | 7 | nest_asyncio.apply() 8 | import pandas as pd 9 | 10 | class Transformer(object): 11 | def __init__(self) -> None: 12 | # Connect to the Hopsworks 13 | project = hopsworks.login() 14 | ms = project.get_model_serving() 15 | 16 | self._retrieve_secrets() 17 | 18 | # Retrieve the 'customers' feature view 19 | fs = project.get_feature_store() 20 | self.customer_fv = fs.get_feature_view( 21 | name="customers", 22 | version=1, 23 | ) 24 | 25 | # Retrieve the "ranking" feature view and initialize the batch scoring server. 26 | self.ranking_fv = fs.get_feature_view(name="ranking", version=1) 27 | self.ranking_fv.init_batch_scoring(1) 28 | 29 | # Retrieve the ranking deployment 30 | self.ranking_server = ms.get_deployment(self.ranking_model_type) 31 | 32 | def _retrieve_secrets(self): 33 | project = hopsworks.login() 34 | secrets_api = hopsworks.get_secrets_api() 35 | try: 36 | self.ranking_model_type = secrets_api.get_secret("RANKING_MODEL_TYPE").value 37 | except Exception as e: 38 | logging.error(e) 39 | logging.error("Could not retrieve secret RANKING_MODEL_TYPE, defaulting to ranker") 40 | self.ranking_model_type = "ranking" 41 | 42 | def preprocess(self, inputs): 43 | # Check if the input data contains a key named "instances" 44 | # and extract the actual data if present 45 | inputs = inputs["instances"] if "instances" in inputs else inputs 46 | inputs = inputs[0] 47 | 48 | # Extract customer_id and transaction_date from the inputs 49 | customer_id = inputs["customer_id"] 50 | transaction_date = inputs["transaction_date"] 51 | 52 | # Extract month from the transaction_date 53 | month_of_purchase = datetime.fromisoformat(inputs.pop("transaction_date")) 54 | 55 | # Get customer features 56 | customer_features = self.customer_fv.get_feature_vector( 57 | {"customer_id": customer_id}, 58 | return_type="pandas", 59 | ) 60 | 61 | # Enrich inputs with customer age 62 | inputs["age"] = customer_features.age.values[0] 63 | 64 | # Calculate the sine and cosine of the month_of_purchase 65 | month_of_purchase = datetime.strptime( 66 | transaction_date, "%Y-%m-%dT%H:%M:%S.%f" 67 | ).month 68 | 69 | # Calculate the sine and cosine components for the month_of_purchase using on-demand transformation present in "ranking" feature view. 
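# Note (assumption): the on-demand transformation is expected to apply the same
# cyclic month encoding used when transactions are written by the UI (see
# feature_group_updater.py), i.e. month_sin = sin(2 * pi * month / 12) and
# month_cos = cos(2 * pi * month / 12), keeping serving-time features consistent
# with the features the models were trained on.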
70 | feature_vector = self.ranking_fv._batch_scoring_server.compute_on_demand_features( 71 | feature_vectors=pd.DataFrame([inputs]), request_parameters={"month": month_of_purchase} 72 | ).to_dict(orient="records")[0] 73 | 74 | inputs["month_sin"] = feature_vector["month_sin"] 75 | inputs["month_cos"] = feature_vector["month_cos"] 76 | 77 | return {"instances": [inputs]} 78 | 79 | def postprocess(self, outputs): 80 | # Return ordered ranking predictions 81 | return self.ranking_server.predict(inputs=outputs) 82 | -------------------------------------------------------------------------------- /recsys/inference/ranking_predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import joblib 3 | import numpy as np 4 | 5 | import logging 6 | 7 | class Predict(object): 8 | 9 | def __init__(self): 10 | self.model = joblib.load(os.environ["MODEL_FILES_PATH"] + "/ranking_model.pkl") 11 | 12 | def predict(self, inputs): 13 | 14 | logging.info(f"✅ Inputs: {inputs}") 15 | 16 | # Extract ranking features and article IDs from the inputs 17 | features = inputs[0].pop("ranking_features") 18 | article_ids = inputs[0].pop("article_ids") 19 | 20 | # Log the extracted features 21 | logging.info("predict -> " + str(features)) 22 | 23 | # Log the extracted article ids 24 | logging.info(f'Article IDs: {article_ids}') 25 | 26 | logging.info(f"🦅 Predicting...") 27 | 28 | # Predict probabilities for the positive class 29 | scores = self.model.predict_proba(features).tolist() 30 | 31 | # Get scores of positive class 32 | scores = np.asarray(scores)[:,1].tolist() 33 | 34 | # Return the predicted scores along with the corresponding article IDs 35 | return { 36 | "scores": scores, 37 | "article_ids": article_ids, 38 | } 39 | -------------------------------------------------------------------------------- /recsys/inference/ranking_transformer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import hopsworks 4 | import pandas as pd 5 | 6 | import nest_asyncio 7 | nest_asyncio.apply() 8 | 9 | class Transformer(object): 10 | def __init__(self): 11 | # Connect to Hopsworks 12 | project = hopsworks.login() 13 | self.fs = project.get_feature_store() 14 | 15 | # Retrieve 'transactions' feature group. 
16 | self.transactions_fg = self.fs.get_feature_group("transactions", 1) 17 | 18 | # Retrieve the 'articles' feature view 19 | self.articles_fv = self.fs.get_feature_view( 20 | name="articles", 21 | version=1, 22 | ) 23 | 24 | # Get list of feature names for articles 25 | self.articles_features = [feat.name for feat in self.articles_fv.schema] 26 | 27 | # Retrieve the 'customers' feature view 28 | self.customer_fv = self.fs.get_feature_view( 29 | name="customers", 30 | version=1, 31 | ) 32 | 33 | self.customer_fv.init_serving(1) 34 | 35 | # Retrieve the 'candidate_embeddings' feature view 36 | self.candidate_index = self.fs.get_feature_view( 37 | name="candidate_embeddings", 38 | version=1, 39 | ) 40 | 41 | # Retrieve ranking model 42 | mr = project.get_model_registry() 43 | model = mr.get_model( 44 | name="ranking_model", 45 | version=1, 46 | ) 47 | 48 | self.ranking_fv = model.get_feature_view(init=False) 49 | self.ranking_fv.init_batch_scoring(1) 50 | 51 | # Get the names of features expected by the ranking model 52 | self.ranking_model_feature_names = [ 53 | feature.name 54 | for feature 55 | in self.ranking_fv.schema 56 | if feature.name != 'label' 57 | ] 58 | 59 | def preprocess(self, inputs): 60 | # Extract the input instance 61 | inputs = inputs["instances"][0] 62 | 63 | # Extract customer_id from inputs 64 | customer_id = inputs["customer_id"] 65 | 66 | # Search for candidate items 67 | neighbors = self.candidate_index.find_neighbors( 68 | inputs["query_emb"], 69 | k=100, 70 | ) 71 | neighbors = [neighbor[0] for neighbor in neighbors] 72 | 73 | # Get IDs of items already bought by the customer 74 | already_bought_items_ids = ( 75 | self.transactions_fg.select("article_id").filter(self.transactions_fg.customer_id==customer_id).read(dataframe_type="pandas").values.reshape(-1).tolist() 76 | ) 77 | 78 | # Filter candidate items to exclude those already bought by the customer 79 | item_id_list = [ 80 | str(item_id) 81 | for item_id in neighbors 82 | if str(item_id) not in already_bought_items_ids 83 | ] 84 | item_id_df = pd.DataFrame({"article_id": item_id_list}) 85 | 86 | # Retrieve Article data for candidate items 87 | articles_data = [ 88 | self.articles_fv.get_feature_vector({"article_id": item_id}) 89 | for item_id in item_id_list 90 | ] 91 | 92 | logging.info("✅ Articles Data Retrieved!") 93 | 94 | articles_df = pd.DataFrame( 95 | data=articles_data, 96 | columns=self.articles_features, 97 | ) 98 | 99 | # Join candidate items with their features 100 | ranking_model_inputs = item_id_df.merge( 101 | articles_df, 102 | on="article_id", 103 | how="inner", 104 | ) 105 | 106 | logging.info("✅ Inputs are almost ready!") 107 | 108 | # Add customer features 109 | customer_features = self.customer_fv.get_feature_vector( 110 | {"customer_id": customer_id}, 111 | return_type="pandas", 112 | ) 113 | 114 | ranking_model_inputs["age"] = customer_features.age.values[0] 115 | ranking_model_inputs["month_sin"] = inputs["month_sin"] 116 | ranking_model_inputs["month_cos"] = inputs["month_cos"] 117 | 118 | # Select only the features required by the ranking model 119 | ranking_model_inputs = ranking_model_inputs[self.ranking_model_feature_names] 120 | 121 | logging.info("✅ Inputs are ready!") 122 | 123 | return { 124 | "inputs": [ 125 | { 126 | "ranking_features": ranking_model_inputs.values.tolist(), 127 | "article_ids": item_id_list, 128 | } 129 | ] 130 | } 131 | 132 | def postprocess(self, outputs): 133 | logging.info("✅ Predictions are ready!") 134 | 135 | # Merge prediction scores and 
corresponding article IDs into a list of tuples 136 | ranking = list(zip(outputs["scores"], outputs["article_ids"])) 137 | 138 | # Sort the ranking list by score in descending order 139 | ranking.sort(reverse=True) 140 | 141 | # Return the sorted ranking list 142 | return { 143 | "ranking": ranking, 144 | } 145 | -------------------------------------------------------------------------------- /recsys/raw_data_sources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/recsys/raw_data_sources/__init__.py -------------------------------------------------------------------------------- /recsys/raw_data_sources/h_and_m.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | 4 | def extract_articles_df() -> pl.DataFrame: 5 | return pl.read_csv("https://repo.hops.works/dev/jdowling/h-and-m/articles.csv", try_parse_dates=True) 6 | 7 | 8 | def extract_customers_df() -> pl.DataFrame: 9 | return pl.read_csv("https://repo.hops.works/dev/jdowling/h-and-m/customers.csv", try_parse_dates=True) 10 | 11 | 12 | def extract_transactions_df() -> pl.DataFrame: 13 | return pl.read_csv( 14 | "https://repo.hops.works/dev/jdowling/h-and-m/transactions_train.csv", try_parse_dates=True 15 | ) 16 | -------------------------------------------------------------------------------- /recsys/training/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ranking, two_tower 2 | 3 | __all__ = ["ranking", "two_tower"] 4 | -------------------------------------------------------------------------------- /recsys/training/ranking.py: -------------------------------------------------------------------------------- 1 | from catboost import CatBoostClassifier, Pool 2 | from loguru import logger 3 | from sklearn.metrics import classification_report, precision_recall_fscore_support 4 | 5 | from recsys.config import settings 6 | 7 | 8 | class RankingModelFactory: 9 | @classmethod 10 | def build(cls) -> CatBoostClassifier: 11 | return CatBoostClassifier( 12 | learning_rate=settings.RANKING_LEARNING_RATE, 13 | iterations=settings.RANKING_ITERATIONS, 14 | depth=10, 15 | scale_pos_weight=settings.RANKING_SCALE_POS_WEIGHT, 16 | early_stopping_rounds=settings.RANKING_EARLY_STOPPING_ROUNDS, 17 | use_best_model=True, 18 | ) 19 | 20 | 21 | class RankingModelTrainer: 22 | def __init__(self, model, train_dataset, eval_dataset) -> None: 23 | self._model = model 24 | 25 | self._X_train, self._y_train = train_dataset 26 | self._X_val, self._y_val = eval_dataset 27 | 28 | self._train_dataset, self._eval_dataset = self._initialize_dataset( 29 | train_dataset, eval_dataset 30 | ) 31 | 32 | def get_model(self): 33 | return self._model 34 | 35 | def _initialize_dataset(self, train_dataset, eval_dataset): 36 | X_train, y_train = train_dataset 37 | X_val, y_val = eval_dataset 38 | 39 | cat_features = list(X_train.select_dtypes(include=["string", "object"]).columns) 40 | 41 | pool_train = Pool(X_train, y_train, cat_features=cat_features) 42 | pool_val = Pool(X_val, y_val, cat_features=cat_features) 43 | 44 | return pool_train, pool_val 45 | 46 | def fit(self): 47 | self._model.fit( 48 | self._train_dataset, 49 | eval_set=self._eval_dataset, 50 | ) 51 | 52 | return self._model 53 | 54 | def evaluate(self, log: bool = False): 55 | preds = self._model.predict(self._eval_dataset) 56 | 57 
| precision, recall, fscore, _ = precision_recall_fscore_support( 58 | self._y_val, preds, average="binary" 59 | ) 60 | 61 | if log: 62 | logger.info(classification_report(self._y_val, preds)) 63 | 64 | return { 65 | "precision": precision, 66 | "recall": recall, 67 | "fscore": fscore, 68 | } 69 | 70 | def get_feature_importance(self) -> dict: 71 | feat_to_score = { 72 | feature: score 73 | for feature, score in zip( 74 | self._X_train.columns, 75 | self._model.feature_importances_, 76 | ) 77 | } 78 | 79 | feat_to_score = dict( 80 | sorted( 81 | feat_to_score.items(), 82 | key=lambda item: item[1], 83 | reverse=True, 84 | ) 85 | ) 86 | 87 | return feat_to_score 88 | -------------------------------------------------------------------------------- /recsys/training/two_tower.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_recommenders as tfrs 3 | from loguru import logger 4 | from tensorflow.keras.layers import Normalization, StringLookup 5 | 6 | from recsys.config import settings 7 | 8 | 9 | class QueryTowerFactory: 10 | def __init__(self, dataset: "TwoTowerDataset") -> None: 11 | self._dataset = dataset 12 | 13 | def build( 14 | self, embed_dim: int = settings.TWO_TOWER_MODEL_EMBEDDING_SIZE 15 | ) -> "QueryTower": 16 | return QueryTower( 17 | user_ids=self._dataset.properties["user_ids"], 18 | emb_dim=embed_dim, 19 | ) 20 | 21 | 22 | class QueryTower(tf.keras.Model): 23 | def __init__(self, user_ids: list, emb_dim: int) -> None: 24 | super().__init__() 25 | 26 | self.user_embedding = tf.keras.Sequential( 27 | [ 28 | StringLookup(vocabulary=user_ids, mask_token=None), 29 | tf.keras.layers.Embedding( 30 | # Add an additional embedding to account for unknown tokens. 31 | len(user_ids) + 1, 32 | emb_dim, 33 | ), 34 | ] 35 | ) 36 | 37 | self.normalized_age = Normalization(axis=None) 38 | 39 | self.fnn = tf.keras.Sequential( 40 | [ 41 | tf.keras.layers.Dense(emb_dim, activation="relu"), 42 | tf.keras.layers.Dense(emb_dim), 43 | ] 44 | ) 45 | 46 | def call(self, inputs): 47 | concatenated_inputs = tf.concat( 48 | [ 49 | self.user_embedding(inputs["customer_id"]), 50 | tf.reshape(self.normalized_age(inputs["age"]), (-1, 1)), 51 | tf.reshape(inputs["month_sin"], (-1, 1)), 52 | tf.reshape(inputs["month_cos"], (-1, 1)), 53 | ], 54 | axis=1, 55 | ) 56 | 57 | outputs = self.fnn(concatenated_inputs) 58 | 59 | return outputs 60 | 61 | 62 | class ItemTowerFactory: 63 | def __init__(self, dataset: "TwoTowerDataset") -> None: 64 | self._dataset = dataset 65 | 66 | def build( 67 | self, embed_dim: int = settings.TWO_TOWER_MODEL_EMBEDDING_SIZE 68 | ) -> "ItemTower": 69 | return ItemTower( 70 | item_ids=self._dataset.properties["item_ids"], 71 | garment_groups=self._dataset.properties["garment_groups"], 72 | index_groups=self._dataset.properties["index_groups"], 73 | emb_dim=embed_dim, 74 | ) 75 | 76 | 77 | class ItemTower(tf.keras.Model): 78 | def __init__( 79 | self, 80 | item_ids: list, 81 | garment_groups: list, 82 | index_groups: list, 83 | emb_dim: int, 84 | ): 85 | super().__init__() 86 | 87 | self.garment_groups = garment_groups 88 | self.index_groups = index_groups 89 | 90 | self.item_embedding = tf.keras.Sequential( 91 | [ 92 | StringLookup(vocabulary=item_ids, mask_token=None), 93 | tf.keras.layers.Embedding( 94 | # Add an additional embedding to account for unknown tokens. 
95 | len(item_ids) + 1, 96 | emb_dim, 97 | ), 98 | ] 99 | ) 100 | # Converts strings into integer indices (scikit-learn LabelEncoder analog) 101 | self.garment_group_tokenizer = StringLookup( 102 | vocabulary=garment_groups, 103 | mask_token=None, 104 | ) 105 | self.index_group_tokenizer = StringLookup( 106 | vocabulary=index_groups, 107 | mask_token=None, 108 | ) 109 | 110 | self.fnn = tf.keras.Sequential( 111 | [ 112 | tf.keras.layers.Dense(emb_dim, activation="relu"), 113 | tf.keras.layers.Dense(emb_dim), 114 | ] 115 | ) 116 | 117 | def call(self, inputs): 118 | garment_group_embedding = tf.one_hot( 119 | self.garment_group_tokenizer(inputs["garment_group_name"]), 120 | len(self.garment_groups), 121 | ) 122 | 123 | index_group_embedding = tf.one_hot( 124 | self.index_group_tokenizer(inputs["index_group_name"]), 125 | len(self.index_groups), 126 | ) 127 | 128 | concatenated_inputs = tf.concat( 129 | [ 130 | self.item_embedding(inputs["article_id"]), 131 | garment_group_embedding, 132 | index_group_embedding, 133 | ], 134 | axis=1, 135 | ) 136 | 137 | outputs = self.fnn(concatenated_inputs) 138 | 139 | return outputs 140 | 141 | 142 | class TwoTowerFactory: 143 | def __init__(self, dataset: "TwoTowerDataset") -> None: 144 | self._dataset = dataset 145 | 146 | def build( 147 | self, 148 | query_model: QueryTower, 149 | item_model: ItemTower, 150 | batch_size: int = settings.TWO_TOWER_MODEL_BATCH_SIZE, 151 | ) -> "TwoTowerModel": 152 | item_ds = self._dataset.get_items_subset() 153 | 154 | return TwoTowerModel( 155 | query_model, 156 | item_model, 157 | item_ds=item_ds, 158 | batch_size=batch_size, 159 | ) 160 | 161 | 162 | class TwoTowerModel(tf.keras.Model): 163 | def __init__( 164 | self, 165 | query_model: QueryTower, 166 | item_model: ItemTower, 167 | item_ds: tf.data.Dataset, 168 | batch_size: int, 169 | ) -> None: 170 | super().__init__() 171 | self.query_model = query_model 172 | self.item_model = item_model 173 | self.task = tfrs.tasks.Retrieval( 174 | metrics=tfrs.metrics.FactorizedTopK( 175 | candidates=item_ds.batch(batch_size).map(self.item_model) 176 | ) 177 | ) 178 | 179 | def train_step(self, batch) -> tf.Tensor: 180 | # Set up a gradient tape to record gradients. 181 | with tf.GradientTape() as tape: 182 | # Loss computation. 183 | user_embeddings = self.query_model(batch) 184 | item_embeddings = self.item_model(batch) 185 | loss = self.task( 186 | user_embeddings, 187 | item_embeddings, 188 | compute_metrics=False, 189 | ) 190 | 191 | # Handle regularization losses as well. 192 | regularization_loss = sum(self.losses) 193 | 194 | total_loss = loss + regularization_loss 195 | 196 | gradients = tape.gradient(total_loss, self.trainable_variables) 197 | self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) 198 | 199 | metrics = { 200 | "loss": loss, 201 | "regularization_loss": regularization_loss, 202 | "total_loss": total_loss, 203 | } 204 | 205 | return metrics 206 | 207 | def test_step(self, batch) -> tf.Tensor: 208 | # Loss computation. 209 | user_embeddings = self.query_model(batch) 210 | item_embeddings = self.item_model(batch) 211 | 212 | loss = self.task( 213 | user_embeddings, 214 | item_embeddings, 215 | compute_metrics=False, 216 | ) 217 | 218 | # Handle regularization losses as well. 
219 | regularization_loss = sum(self.losses) 220 | 221 | total_loss = loss + regularization_loss 222 | 223 | metrics = {metric.name: metric.result() for metric in self.metrics} 224 | metrics["loss"] = loss 225 | metrics["regularization_loss"] = regularization_loss 226 | metrics["total_loss"] = total_loss 227 | 228 | return metrics 229 | 230 | 231 | class TwoTowerDataset: 232 | def __init__(self, feature_view, batch_size: int) -> None: 233 | self._feature_view = feature_view 234 | self._batch_size = batch_size 235 | self._properties: dict | None 236 | 237 | @property 238 | def query_features(self) -> list[str]: 239 | return ["customer_id", "age", "month_sin", "month_cos"] 240 | 241 | @property 242 | def candidate_features(self) -> list[str]: 243 | return [ 244 | "article_id", 245 | "garment_group_name", 246 | "index_group_name", 247 | ] 248 | 249 | @property 250 | def properties(self) -> dict: 251 | assert self._properties is not None, "Call get_train_val_split() first." 252 | 253 | return self._properties 254 | 255 | def get_items_subset(self): 256 | item_df = self.properties["train_df"][self.candidate_features] 257 | item_df.drop_duplicates(subset="article_id", inplace=True) 258 | item_ds = self.df_to_ds(item_df) 259 | 260 | return item_ds 261 | 262 | def get_train_val_split(self): 263 | logger.info("Retrieving and creating train, val test split...") 264 | 265 | train_df, val_df, test_df, _, _, _ = ( 266 | self._feature_view.train_validation_test_split( 267 | validation_size=settings.TWO_TOWER_DATASET_VALIDATON_SPLIT_SIZE, 268 | test_size=settings.TWO_TOWER_DATASET_TEST_SPLIT_SIZE, 269 | description="Retrieval dataset splits", 270 | ) 271 | ) 272 | 273 | train_ds = ( 274 | self.df_to_ds(train_df) 275 | .batch(self._batch_size) 276 | .cache() 277 | .shuffle(self._batch_size * 10) 278 | ) 279 | val_ds = self.df_to_ds(val_df).batch(self._batch_size).cache() 280 | 281 | self._properties = { 282 | "train_df": train_df, 283 | "val_df": val_df, 284 | "query_df": train_df[self.query_features], 285 | "item_df": train_df[self.candidate_features], 286 | "user_ids": train_df["customer_id"].unique().tolist(), 287 | "item_ids": train_df["article_id"].unique().tolist(), 288 | "garment_groups": train_df["garment_group_name"].unique().tolist(), 289 | "index_groups": train_df["index_group_name"].unique().tolist(), 290 | } 291 | 292 | return train_ds, val_ds 293 | 294 | def df_to_ds(self, df): 295 | return tf.data.Dataset.from_tensor_slices({col: df[col] for col in df}) 296 | 297 | 298 | class TwoTowerTrainer: 299 | def __init__(self, dataset: TwoTowerDataset, model: TwoTowerModel) -> None: 300 | self._dataset = dataset 301 | self._model = model 302 | 303 | def train(self, train_ds, val_ds): 304 | self._initialize_query_model(train_ds) 305 | 306 | # Define an optimizer using AdamW with a learning rate of 0.01 307 | optimizer = tf.keras.optimizers.AdamW( 308 | weight_decay=settings.TWO_TOWER_WEIGHT_DECAY, 309 | learning_rate=settings.TWO_TOWER_LEARNING_RATE, 310 | ) 311 | 312 | # Compile the model using the specified optimizer 313 | self._model.compile(optimizer=optimizer) 314 | 315 | # Start training 316 | history = self._model.fit( 317 | train_ds, 318 | validation_data=val_ds, 319 | epochs=settings.TWO_TOWER_NUM_EPOCHS, 320 | ) 321 | 322 | return history 323 | 324 | def _initialize_query_model(self, train_ds): 325 | # Initialize age normalization layer. 326 | self._model.query_model.normalized_age.adapt(train_ds.map(lambda x: x["age"])) 327 | 328 | # Initialize model with inputs. 
329 | query_df = self._dataset.properties["query_df"] 330 | query_ds = self._dataset.df_to_ds(query_df).batch(1) 331 | self._model.query_model(next(iter(query_ds))) 332 | -------------------------------------------------------------------------------- /recsys/ui/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/recsys/ui/__init__.py -------------------------------------------------------------------------------- /recsys/ui/feature_group_updater.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import random 4 | from datetime import datetime 5 | 6 | import hopsworks 7 | import pandas as pd 8 | import streamlit as st 9 | 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class FeatureGroupUpdater: 15 | def __init__(self): 16 | """Initialize the FeatureGroup updater""" 17 | self._initialize_feature_groups() 18 | 19 | def _initialize_feature_groups(self) -> None: 20 | """Initialize connection to Hopsworks Feature Groups""" 21 | try: 22 | if "feature_group" not in st.session_state: 23 | logger.info("📡 Initializing Hopsworks Feature Groups connection...") 24 | project = hopsworks.login() 25 | fs = project.get_feature_store() 26 | 27 | # Initialize interactions feature group 28 | st.session_state.feature_group = fs.get_feature_group( 29 | name="interactions", 30 | version=1, 31 | ) 32 | 33 | # Initialize transactions feature group 34 | st.session_state.transactions_fg = fs.get_feature_group( 35 | name="transactions", 36 | version=1, 37 | ) 38 | logger.info("✅ Feature Groups connection established") 39 | 40 | except Exception as e: 41 | logger.error(f"Failed to initialize Feature Groups connection: {str(e)}") 42 | st.error( 43 | "❌ Failed to connect to Feature Groups. Check terminal for details." 
44 | ) 45 | raise 46 | 47 | def _prepare_transaction_for_insertion(self, purchase_data: dict) -> pd.DataFrame: 48 | """Prepare transaction data for insertion into transactions feature group""" 49 | try: 50 | timestamp = datetime.now() 51 | 52 | transaction = { 53 | "t_dat": int(timestamp.timestamp()), 54 | "customer_id": str(purchase_data["customer_id"]), 55 | "article_id": str(purchase_data["article_id"]), 56 | "price": round(random.uniform(10, 140), 2), 57 | "sales_channel_id": 2, 58 | "year": timestamp.year, 59 | "month": timestamp.month, 60 | "day": timestamp.day, 61 | "day_of_week": timestamp.weekday(), 62 | "month_sin": math.sin(2 * math.pi * timestamp.month / 12), 63 | "month_cos": math.cos(2 * math.pi * timestamp.month / 12), 64 | } 65 | 66 | df = pd.DataFrame([transaction]) 67 | 68 | # Ensure correct data types 69 | df["t_dat"] = df["t_dat"].astype("int64") 70 | df["customer_id"] = df["customer_id"].astype(str) 71 | df["article_id"] = df["article_id"].astype(str) 72 | df["price"] = df["price"].astype("float64") 73 | df["sales_channel_id"] = df["sales_channel_id"].astype("int64") 74 | df["year"] = df["year"].astype("int32") 75 | df["month"] = df["month"].astype("int32") 76 | df["day"] = df["day"].astype("int32") 77 | df["day_of_week"] = df["day_of_week"].astype("int32") 78 | df["month_sin"] = df["month_sin"].astype("float64") 79 | df["month_cos"] = df["month_cos"].astype("float64") 80 | 81 | logger.info(f"Prepared transaction for insertion: {transaction}") 82 | return df 83 | 84 | except Exception as e: 85 | logger.error(f"Error preparing transaction data: {str(e)}") 86 | return None 87 | 88 | def insert_transaction(self, purchase_data: dict) -> bool: 89 | """Insert a single transaction into transactions feature group""" 90 | try: 91 | transaction_df = self._prepare_transaction_for_insertion(purchase_data) 92 | 93 | if transaction_df is not None: 94 | logger.info("Inserting transaction...") 95 | with st.spinner("💫 Recording transaction..."): 96 | st.session_state.transactions_fg.multi_part_insert(transaction_df) 97 | logger.info("✅ Transaction inserted successfully") 98 | return True 99 | 100 | except Exception as e: 101 | logger.error(f"Failed to insert transaction: {str(e)}") 102 | st.error("❌ Failed to insert transaction. 
Check terminal for details.") 103 | 104 | return False 105 | 106 | def _prepare_interactions_for_insertion(self, df: pd.DataFrame) -> pd.DataFrame: 107 | """Prepare interactions dataframe for insertion""" 108 | if df is None or df.empty: 109 | return None 110 | 111 | try: 112 | # Convert timestamp to Unix timestamp if needed 113 | if not pd.api.types.is_integer_dtype(df["t_dat"]): 114 | df["t_dat"] = pd.to_datetime(df["t_dat"]).astype("int64") // 10**9 115 | 116 | prepared_df = pd.DataFrame( 117 | { 118 | "t_dat": df["t_dat"].astype("int64"), 119 | "customer_id": df["customer_id"].astype(str), 120 | "article_id": df["article_id"].astype(str), 121 | "interaction_score": df["interaction_score"].astype("int64"), 122 | "prev_article_id": df["prev_article_id"].astype(str), 123 | } 124 | ) 125 | 126 | logger.info("Prepared interaction for insertion") 127 | return prepared_df 128 | 129 | except Exception as e: 130 | logger.error(f"Error preparing interaction data: {str(e)}") 131 | return None 132 | 133 | def process_interactions(self, tracker, force: bool = False) -> bool: 134 | """Process and insert interactions immediately""" 135 | try: 136 | interactions_df = tracker.get_interactions_data() 137 | 138 | if interactions_df.empty: 139 | return False 140 | 141 | prepared_df = self._prepare_interactions_for_insertion(interactions_df) 142 | if prepared_df is not None: 143 | logger.info("Inserting interactions...") 144 | st.session_state.feature_group.multi_part_insert(prepared_df) 145 | logger.info("✅ Interactions inserted successfully") 146 | return True 147 | 148 | except Exception as e: 149 | logger.error(f"Error processing interactions: {str(e)}") 150 | return False 151 | 152 | return False 153 | 154 | 155 | def get_fg_updater(): 156 | """Get or create FeatureGroupUpdater instance""" 157 | if "fg_updater" not in st.session_state: 158 | st.session_state.fg_updater = FeatureGroupUpdater() 159 | return st.session_state.fg_updater 160 | -------------------------------------------------------------------------------- /recsys/ui/interaction_tracker.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from dataclasses import dataclass 3 | from datetime import datetime 4 | from enum import Enum, auto 5 | from typing import Dict, List, Optional, Set, Tuple 6 | 7 | import pandas as pd 8 | import streamlit as st 9 | 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class InteractionType(Enum): 15 | """Enum for interaction types and their corresponding scores""" 16 | 17 | PURCHASE = auto() 18 | CLICK = auto() 19 | IGNORE = auto() 20 | 21 | @property 22 | def score(self) -> int: 23 | return { 24 | InteractionType.PURCHASE: 2, 25 | InteractionType.CLICK: 1, 26 | InteractionType.IGNORE: 0, 27 | }[self] 28 | 29 | @classmethod 30 | def from_str(cls, value: str) -> "InteractionType": 31 | return {"purchase": cls.PURCHASE, "click": cls.CLICK, "ignore": cls.IGNORE}[ 32 | value.lower() 33 | ] 34 | 35 | 36 | @dataclass 37 | class Interaction: 38 | t_dat: int # Unix timestamp 39 | customer_id: str 40 | article_id: str 41 | interaction_type: str 42 | interaction_score: int 43 | prev_article_id: Optional[str] 44 | 45 | 46 | class InteractionTracker: 47 | def __init__(self): 48 | """Initialize interaction tracking containers""" 49 | # Key: (customer_id, article_id, type) -> Interaction 50 | self.interactions: Dict[Tuple[str, str, str], Interaction] = {} 51 | # Key: customer_id -> list of article_ids 52 | 
self.current_items: Dict[str, List[str]] = {} 53 | # Key: customer_id -> set of article_ids 54 | self.purchased_items: Dict[str, Set[str]] = {} 55 | # Key: customer_id -> article_id 56 | self.last_interaction: Dict[str, str] = {} 57 | logger.info("Initialized InteractionTracker") 58 | 59 | def track_shown_items(self, customer_id: str, items_with_scores: list): 60 | """Record items being shown with their scores""" 61 | if customer_id not in self.purchased_items: 62 | self.purchased_items[customer_id] = set() 63 | 64 | item_ids = [str(item_id) for item_id, _ in items_with_scores] 65 | self.current_items[customer_id] = item_ids 66 | 67 | # Record ignore interactions 68 | timestamp = int(datetime.now().timestamp()) 69 | 70 | for idx, item_id in enumerate(item_ids): 71 | if item_id not in self.purchased_items.get(customer_id, set()): 72 | prev_id = item_ids[idx - 1] if idx > 0 else item_id 73 | self._add_interaction( 74 | customer_id=customer_id, 75 | article_id=item_id, 76 | interaction_type="ignore", 77 | prev_article_id=prev_id, 78 | timestamp=timestamp, 79 | ) 80 | 81 | logger.info(f"Tracked {len(item_ids)} shown items for customer {customer_id}") 82 | 83 | def track(self, customer_id: str, article_id: str, interaction_type: str): 84 | """Record a user interaction""" 85 | article_id = str(article_id) 86 | 87 | if customer_id not in self.purchased_items: 88 | self.purchased_items[customer_id] = set() 89 | 90 | prev_article_id = self.last_interaction.get(customer_id, article_id) 91 | 92 | self._add_interaction( 93 | customer_id=customer_id, 94 | article_id=article_id, 95 | interaction_type=interaction_type, 96 | prev_article_id=prev_article_id, 97 | ) 98 | 99 | # Update tracking state and UI feedback 100 | int_type = InteractionType.from_str(interaction_type) 101 | if int_type == InteractionType.PURCHASE: 102 | self.purchased_items[customer_id].add(article_id) 103 | st.toast(f"🛍️ Purchased item {article_id}", icon="🛍️") 104 | logger.info( 105 | f"Tracked purchase of item {article_id} by customer {customer_id}" 106 | ) 107 | elif int_type == InteractionType.CLICK: 108 | st.toast(f"Viewed details of item {article_id}", icon="👁️") 109 | logger.info(f"Tracked click on item {article_id} by customer {customer_id}") 110 | 111 | if int_type in (InteractionType.CLICK, InteractionType.PURCHASE): 112 | self.last_interaction[customer_id] = article_id 113 | 114 | def _add_interaction( 115 | self, customer_id, article_id, interaction_type, prev_article_id, timestamp=None 116 | ): 117 | """Add interaction with duplicate handling using dictionary""" 118 | if timestamp is None: 119 | timestamp = int(datetime.now().timestamp()) 120 | 121 | key = (customer_id, article_id, interaction_type) 122 | int_type = InteractionType.from_str(interaction_type) 123 | 124 | self.interactions[key] = Interaction( 125 | t_dat=timestamp, 126 | customer_id=str(customer_id), 127 | article_id=str(article_id), 128 | interaction_type=interaction_type, 129 | interaction_score=int_type.score, 130 | prev_article_id=str(prev_article_id), 131 | ) 132 | 133 | logger.debug( 134 | f"Added {interaction_type} interaction: " 135 | f"customer={customer_id}, article={article_id}, score={int_type.score}" 136 | ) 137 | 138 | def get_interactions_data(self) -> pd.DataFrame: 139 | """Get all recorded interactions as a pandas DataFrame""" 140 | if not self.interactions: 141 | logger.info("No interactions recorded yet") 142 | return pd.DataFrame( 143 | columns=[ 144 | "t_dat", 145 | "customer_id", 146 | "article_id", 147 | "interaction_type", 148 | 
"interaction_score", 149 | "prev_article_id", 150 | ] 151 | ) 152 | 153 | df = pd.DataFrame([vars(i) for i in self.interactions.values()]) 154 | logger.info(f"Retrieved {len(df)} interactions") 155 | return df 156 | 157 | def should_show_item(self, customer_id: str, article_id: str) -> bool: 158 | """Check if an item should be shown (not purchased)""" 159 | return str(article_id) not in self.purchased_items.get(customer_id, set()) 160 | 161 | def get_current_items(self, customer_id: str) -> List[str]: 162 | """Get current items for a customer""" 163 | return self.current_items.get(customer_id, []) 164 | 165 | def clear_interactions(self): 166 | """Clear all recorded interactions while preserving purchased items""" 167 | self.interactions.clear() 168 | logger.info("Cleared all recorded interactions") 169 | 170 | 171 | def get_tracker(): 172 | """Get or create InteractionTracker instance""" 173 | if "interaction_tracker" not in st.session_state: 174 | st.session_state.interaction_tracker = InteractionTracker() 175 | logger.info("Created new InteractionTracker instance") 176 | return st.session_state.interaction_tracker 177 | -------------------------------------------------------------------------------- /recsys/ui/recommenders.py: -------------------------------------------------------------------------------- 1 | import time 2 | from datetime import datetime 3 | 4 | import streamlit as st 5 | from langchain.chains import LLMChain 6 | from langchain.prompts import PromptTemplate 7 | from langchain_openai import ChatOpenAI 8 | from sentence_transformers import SentenceTransformer 9 | 10 | from recsys.config import settings 11 | 12 | from .feature_group_updater import get_fg_updater 13 | from .interaction_tracker import get_tracker 14 | from .utils import ( 15 | fetch_and_process_image, 16 | get_item_image_url, 17 | print_header, 18 | process_description, 19 | ) 20 | 21 | 22 | def initialize_llm_state(): 23 | """Initialize all necessary session state variables for LLM recommendations""" 24 | if "llm_recommendations" not in st.session_state: 25 | st.session_state.llm_recommendations = [] 26 | if "outfit_summary" not in st.session_state: 27 | st.session_state.outfit_summary = "" 28 | if "llm_extra_items" not in st.session_state: 29 | st.session_state.llm_extra_items = {} 30 | 31 | 32 | def display_item(item_id, score, articles_fv, customer_id, tracker, source): 33 | """Display a single item with its interactions""" 34 | image_url = get_item_image_url(item_id, articles_fv) 35 | img = fetch_and_process_image(image_url) 36 | 37 | if img: 38 | st.image(img, use_column_width=True) 39 | st.write(f"**🎯 Score:** {score:.4f}") 40 | 41 | # View Details button 42 | details_key = f"{source}_details_{item_id}" 43 | if st.button("📝 View Details", key=details_key): 44 | tracker.track(customer_id, item_id, "click") 45 | with st.expander("Item Details", expanded=True): 46 | description = process_description( 47 | articles_fv.get_feature_vector({"article_id": item_id})[-2] 48 | ) 49 | st.write(description) 50 | 51 | # Buy button 52 | buy_key = f"{source}_buy_{item_id}" 53 | if st.button("🛒 Buy", key=buy_key): 54 | # Track interaction 55 | tracker.track(customer_id, item_id, "purchase") 56 | 57 | # Insert transaction 58 | fg_updater = get_fg_updater() 59 | purchase_data = {"customer_id": customer_id, "article_id": item_id} 60 | 61 | if fg_updater.insert_transaction(purchase_data): 62 | st.success(f"✅ Item {item_id} purchased!") 63 | st.experimental_rerun() 64 | else: 65 | st.error("Failed to record transaction, 
but purchase was tracked") 66 | 67 | 68 | def customer_recommendations( 69 | articles_fv, 70 | ranking_deployment, 71 | query_model_deployment, 72 | customer_id, 73 | max_retries: int = 5, 74 | retry_delay: int = 30, 75 | ): 76 | """Handle customer-based recommendations""" 77 | tracker = get_tracker() 78 | 79 | # Initialize or update recommendations 80 | if "customer_recs" not in st.session_state: 81 | st.session_state.customer_recs = [] 82 | st.session_state.prediction_time = None 83 | 84 | # Only get new predictions if: 85 | # 1. Button is clicked OR 86 | # 2. No recommendations exist OR 87 | # 3. Customer ID changed 88 | if ( 89 | st.sidebar.button("Get Recommendations", key="get_recommendations_button") 90 | or not st.session_state.customer_recs 91 | or "last_customer_id" not in st.session_state 92 | or st.session_state.last_customer_id != customer_id 93 | ): 94 | with st.spinner("🔮 Getting recommendations..."): 95 | # Format timestamp with microseconds 96 | current_time = datetime.now() 97 | formatted_timestamp = current_time.strftime("%Y-%m-%dT%H:%M:%S.%f") 98 | 99 | st.session_state.prediction_time = formatted_timestamp 100 | st.session_state.last_customer_id = customer_id 101 | 102 | # Get predictions from model using a retry mechanism in case of failure. 103 | deployment_input = [ 104 | {"customer_id": customer_id, "transaction_date": formatted_timestamp} 105 | ] 106 | warning_placeholder = None 107 | for attempt in range(max_retries): 108 | try: 109 | prediction = query_model_deployment.predict( 110 | inputs=deployment_input 111 | )["predictions"]["ranking"] 112 | if warning_placeholder: 113 | warning_placeholder.empty() 114 | break 115 | except Exception as e: 116 | if attempt < max_retries - 1: 117 | warning_placeholder = st.warning( 118 | f"⚠️ Failed to call the H&M recommender deployment. It's probably scaling from 0 to +1 instances, which may take 1-2 minutes. Retrying in {retry_delay} seconds..." 119 | ) 120 | time.sleep(retry_delay) 121 | else: 122 | st.error( 123 | f"❌ Failed to get predictions after {max_retries} retries" 124 | ) 125 | raise e 126 | 127 | # Filter out purchased items 128 | available_items = [ 129 | (item_id, score) 130 | for score, item_id in prediction 131 | if tracker.should_show_item(customer_id, item_id) 132 | ] 133 | 134 | # Store recommendations and extras 135 | st.session_state.customer_recs = available_items[:12] 136 | st.session_state.extra_recs = available_items[12:] 137 | 138 | # Track shown items 139 | tracker.track_shown_items( 140 | customer_id, 141 | [(item_id, score) for item_id, score in st.session_state.customer_recs], 142 | ) 143 | 144 | st.sidebar.success("✅ Got new recommendations") 145 | 146 | # Display recommendations 147 | print_header("📝 Top 12 Recommendations:") 148 | 149 | if not st.session_state.customer_recs: 150 | st.warning( 151 | "No recommendations available. Click 'Get Recommendations' to start." 
152 | ) 153 | return 154 | 155 | # Display items in 3x4 grid 156 | for row in range(3): 157 | cols = st.columns(4) 158 | for col in range(4): 159 | idx = row * 4 + col 160 | if idx < len(st.session_state.customer_recs): 161 | item_id, score = st.session_state.customer_recs[idx] 162 | if tracker.should_show_item(customer_id, item_id): 163 | with cols[col]: 164 | display_item( 165 | item_id, 166 | score, 167 | articles_fv, 168 | customer_id, 169 | tracker, 170 | "customer", 171 | ) 172 | else: 173 | # Replace purchased item with one from extras 174 | if st.session_state.extra_recs: 175 | new_item = st.session_state.extra_recs.pop(0) 176 | st.session_state.customer_recs.append(new_item) 177 | st.session_state.customer_recs.pop(idx) 178 | st.experimental_rerun() 179 | 180 | 181 | def get_fashion_chain(api_key): 182 | model = ChatOpenAI( 183 | model_name=settings.OPENAI_MODEL_ID, 184 | temperature=0.7, 185 | openai_api_key=api_key, 186 | ) 187 | template = """ 188 | You are a fashion recommender for H&M. 189 | 190 | Customer request: {user_input} 191 | 192 | Gender: {gender} 193 | 194 | Generate 3-5 necessary fashion items with detailed descriptions, tailored for an H&M-style dataset and appropriate for the specified gender. 195 | Each item description should be specific, suitable for creating embeddings, and relevant to the gender. 196 | 197 | STRICTLY FOLLOW the next response format: 198 | @ | @ | @ | | 199 | 200 | Example for male gender: 201 | 👖 Pants @ Slim-fit dark wash jeans with subtle distressing | 👕 Top @ Classic white cotton polo shirt with embroidered logo | 👟 Footwear @ Navy canvas sneakers with white soles | 🧥 Outerwear @ Lightweight olive green bomber jacket | 🕶️👔 Versatile casual look! Mix and match for various occasions. Add accessories for personal flair! 💼⌚ 202 | 203 | Example for female gender: 204 | 👗 Dress @ Floral print wrap dress with flutter sleeves | 👠 Footwear @ Strappy nude block heel sandals | 👜 Accessory @ Woven straw tote bag with leather handles | 🧥 Outerwear @ Cropped denim jacket with raw hem | 🌸👒 Perfect for a summer day out! Layer with the jacket for cooler evenings. Add a wide-brim hat for extra style! 💃🏻🕶️ 205 | 206 | Ensure each item category has a relevant emoji, each item description is detailed, unique, and appropriate for the specified gender. 207 | Make sure to take into account the gender when selecting items and descriptions. 208 | The final section should provide a brief summary and styling tips with relevant emojis. Tailor your recommendations to the specified gender. 209 | """ 210 | prompt = PromptTemplate( 211 | input_variables=["user_input", "gender"], 212 | template=template, 213 | ) 214 | fashion_chain = LLMChain(llm=model, prompt=prompt, verbose=True) 215 | return fashion_chain 216 | 217 | 218 | def get_fashion_recommendations(user_input, fashion_chain, gender): 219 | """Get recommendations from the LLM""" 220 | response = fashion_chain.run(user_input=user_input, gender=gender) 221 | items = response.strip().split(" | ") 222 | 223 | outfit_summary = items[-1] if len(items) > 1 else "No summary available." 
224 | item_descriptions = items[:-1] if len(items) > 1 else items 225 | 226 | parsed_items = [] 227 | for item in item_descriptions: 228 | try: 229 | emoji_category, description = item.split(" @ ", 1) 230 | emoji, category = emoji_category.split(" ", 1) 231 | parsed_items.append((emoji, category, description)) 232 | except ValueError: 233 | parsed_items.append(("🔷", "Item", item)) 234 | 235 | return parsed_items, outfit_summary 236 | 237 | 238 | def display_llm_item(item_data, col, articles_fv, customer_id, tracker): 239 | """Display a single LLM recommendation item and handle interactions""" 240 | description, item = item_data 241 | item_id = str(item[0]) 242 | 243 | image_url = get_item_image_url(item_id, articles_fv) 244 | img = fetch_and_process_image(image_url) 245 | 246 | if not img: 247 | return False 248 | 249 | col.image(img, use_column_width=True) 250 | 251 | # View Details button 252 | if col.button("📝 View Details", key=f"llm_details_{item_id}"): 253 | tracker.track(customer_id, item_id, "click") 254 | with col.expander("Item Details", expanded=True): 255 | col.write(process_description(item[-2])) 256 | 257 | # Buy button 258 | if col.button("🛒 Buy", key=f"llm_buy_{item_id}"): 259 | # Track interaction 260 | tracker.track(customer_id, item_id, "purchase") 261 | 262 | # Insert transaction 263 | fg_updater = get_fg_updater() 264 | purchase_data = {"customer_id": customer_id, "article_id": item_id} 265 | 266 | if fg_updater.insert_transaction(purchase_data): 267 | st.success(f"✅ Item {item_id} purchased!") 268 | return True 269 | else: 270 | st.error("Failed to record transaction, but purchase was tracked") 271 | 272 | return False 273 | 274 | 275 | def display_category_items(emoji, category, items, articles_fv, customer_id, tracker): 276 | """Display items for a category and handle purchases""" 277 | st.markdown(f"## {emoji} {category}") 278 | 279 | if items: 280 | st.write(f"**Recommendation: {items[0][0]}**") 281 | 282 | # Calculate number of rows needed 283 | items_per_row = 5 284 | num_rows = (len(items) + items_per_row - 1) // items_per_row 285 | 286 | need_rerun = False 287 | remaining_items = [] 288 | 289 | # Display items row by row 290 | for row in range(num_rows): 291 | start_idx = row * items_per_row 292 | end_idx = min(start_idx + items_per_row, len(items)) 293 | row_items = items[start_idx:end_idx] 294 | 295 | cols = st.columns(items_per_row) 296 | 297 | for idx, item_data in enumerate(row_items): 298 | if tracker.should_show_item(customer_id, item_data[1][0]): 299 | with cols[idx]: 300 | if display_llm_item( 301 | item_data, cols[idx], articles_fv, customer_id, tracker 302 | ): 303 | need_rerun = True 304 | else: 305 | remaining_items.append(item_data) 306 | 307 | st.markdown("---") 308 | return need_rerun, remaining_items 309 | return False, [] 310 | 311 | 312 | def llm_recommendations(articles_fv, api_key, customer_id): 313 | """Handle LLM-based recommendations with proper state management""" 314 | st.write("🤖 LLM Fashion Recommender") 315 | 316 | # Initialize session state 317 | initialize_llm_state() 318 | 319 | tracker = get_tracker() 320 | embedding_model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID) 321 | 322 | # Gender selection 323 | gender = st.selectbox("Select gender:", ("Male", "Female")) 324 | 325 | # Input options 326 | input_options = [ 327 | "I'm going to the beach for a week-long vacation. What items do I need?", 328 | "I have a formal winter wedding to attend next month. 
What should I wear?", 329 | "I'm starting a new job at a tech startup with a casual dress code. What items should I add to my wardrobe?", 330 | "Custom input", 331 | ] 332 | 333 | selected_input = st.selectbox( 334 | "Choose your fashion need or enter a custom one:", input_options 335 | ) 336 | 337 | user_request = "" 338 | if selected_input == "Custom input": 339 | user_request = st.text_input("Enter your custom fashion need:") 340 | else: 341 | user_request = selected_input 342 | 343 | # Generate recommendations button 344 | if st.button("Get LLM Recommendations") and user_request: 345 | with st.spinner("Generating recommendations..."): 346 | try: 347 | fashion_chain = get_fashion_chain(api_key) 348 | item_recommendations, summary = get_fashion_recommendations( 349 | user_request, fashion_chain, gender 350 | ) 351 | 352 | # Clear previous recommendations 353 | st.session_state.llm_recommendations = [] 354 | st.session_state.llm_extra_items = {} 355 | st.session_state.outfit_summary = summary 356 | 357 | for emoji, category, description in item_recommendations: 358 | similar_items = get_similar_items( 359 | description, embedding_model, articles_fv 360 | ) 361 | shown_items = [] 362 | extra_items = [] 363 | 364 | # Split items into shown and extra 365 | for item in similar_items: 366 | if len(shown_items) < 5 and tracker.should_show_item( 367 | customer_id, item[0] 368 | ): 369 | shown_items.append((description, item)) 370 | elif tracker.should_show_item(customer_id, item[0]): 371 | extra_items.append((description, item)) 372 | 373 | if shown_items: 374 | st.session_state.llm_recommendations.append( 375 | (emoji, category, shown_items) 376 | ) 377 | st.session_state.llm_extra_items[category] = extra_items 378 | 379 | # Track shown items 380 | tracker.track_shown_items( 381 | customer_id, [(item[1][0], 0.0) for item in shown_items] 382 | ) 383 | 384 | except Exception as e: 385 | st.error(f"An error occurred: {str(e)}") 386 | return 387 | 388 | # Display outfit summary if available 389 | if st.session_state.outfit_summary: 390 | st.markdown("## 🎨 Outfit Summary") 391 | st.markdown( 392 | f"
{st.session_state.outfit_summary}
", 393 | unsafe_allow_html=True, 394 | ) 395 | st.markdown("---") 396 | 397 | # Display recommendations by category 398 | updated_recommendations = [] 399 | need_rerun = False 400 | 401 | for emoji, category, items in st.session_state.llm_recommendations: 402 | if not items: 403 | continue 404 | 405 | st.markdown(f"## {emoji} {category}") 406 | st.write(f"**Recommendation: {items[0][0]}**") 407 | 408 | # Calculate number of columns needed 409 | n_items = len(items) 410 | n_cols = min(5, n_items) 411 | cols = st.columns(n_cols) 412 | 413 | # Track which items to keep 414 | remaining_items = [] 415 | category_updated = False 416 | 417 | # Display items 418 | for idx, item_data in enumerate(items): 419 | item_id = item_data[1][0] 420 | 421 | # Only show if not purchased 422 | if tracker.should_show_item(customer_id, item_id): 423 | with cols[idx % n_cols]: 424 | # Display and handle purchase 425 | was_purchased = display_llm_item( 426 | item_data, cols[idx % n_cols], articles_fv, customer_id, tracker 427 | ) 428 | 429 | if was_purchased: 430 | # Item was purchased, try to get replacement 431 | category_updated = True 432 | extra_items = st.session_state.llm_extra_items.get(category, []) 433 | 434 | if extra_items: 435 | # Add replacement item from extras 436 | new_item = extra_items.pop(0) 437 | remaining_items.append(new_item) 438 | st.session_state.llm_extra_items[category] = extra_items 439 | else: 440 | # Keep the item in display 441 | remaining_items.append(item_data) 442 | 443 | # If we still have items to display in this category 444 | if remaining_items: 445 | updated_recommendations.append((emoji, category, remaining_items)) 446 | 447 | if category_updated: 448 | need_rerun = True 449 | 450 | st.markdown("---") 451 | 452 | # Update recommendations and rerun if needed 453 | if need_rerun: 454 | st.session_state.llm_recommendations = updated_recommendations 455 | st.experimental_rerun() 456 | 457 | 458 | def get_similar_items(description, embedding_model, articles_fv): 459 | """Get similar items based on description embedding""" 460 | description_embedding = embedding_model.encode(description) 461 | 462 | return articles_fv.find_neighbors(description_embedding, k=25) 463 | -------------------------------------------------------------------------------- /recsys/ui/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from io import BytesIO 3 | 4 | import requests 5 | import streamlit as st 6 | from PIL import Image, UnidentifiedImageError 7 | 8 | from recsys import hopsworks_integration 9 | from recsys.config import settings 10 | 11 | def print_header(text, font_size=22): 12 | res = f'{text}' 13 | st.markdown(res, unsafe_allow_html=True) 14 | 15 | 16 | @st.cache_data() 17 | def fetch_and_process_image(image_url, width=200, height=300): 18 | try: 19 | response = requests.get(image_url) 20 | img = Image.open(BytesIO(response.content)) 21 | img = img.resize((width, height), Image.LANCZOS) 22 | return img 23 | except (UnidentifiedImageError, requests.RequestException, IOError): 24 | return None 25 | 26 | 27 | def process_description(description): 28 | details_match = re.search(r"Details: (.+?)(?:\n|$)", description) 29 | return details_match.group(1) if details_match else "No details available." 
30 | 31 | 32 | def get_item_image_url(item_id, articles_fv): 33 | article_feature_view = articles_fv.get_feature_vector({"article_id": item_id}) 34 | if not article_feature_view: 35 | return None 36 | 37 | return article_feature_view[-1] 38 | 39 | 40 | @st.cache_resource() 41 | def get_deployments(): 42 | project, fs = hopsworks_integration.get_feature_store() 43 | 44 | ms = project.get_model_serving() 45 | 46 | articles_fv = fs.get_feature_view( 47 | name="articles", 48 | version=1, 49 | ) 50 | 51 | query_model_deployment = ms.get_deployment( 52 | hopsworks_integration.two_tower_serving.HopsworksQueryModel.deployment_name 53 | ) 54 | 55 | ranking_deployment = ms.get_deployment( 56 | settings.RANKING_MODEL_TYPE 57 | ) 58 | 59 | ranking_deployment.start(await_running=180) 60 | query_model_deployment.start(await_running=180) 61 | 62 | return articles_fv, ranking_deployment, query_model_deployment 63 | -------------------------------------------------------------------------------- /streamlit_app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import streamlit as st 5 | 6 | from recsys.config import settings 7 | from recsys.ui.feature_group_updater import get_fg_updater 8 | from recsys.ui.interaction_tracker import get_tracker 9 | from recsys.ui.recommenders import customer_recommendations, llm_recommendations 10 | from recsys.ui.utils import get_deployments 11 | 12 | # Configure logging 13 | logging.basicConfig(level=logging.INFO) 14 | logger = logging.getLogger(__name__) 15 | 16 | # Constants 17 | CUSTOMER_IDS = [ 18 | "9e619265e3ae0d2ef96a71577c4aff3474bfa7dd0d60486b42bc8f921c3387c0", 19 | "a1f7201399574e78b0a1575c50e3b68d116f84e24c0f70c957083da99db6ab5f", 20 | "19fa659096de20f0c022b9727779e849813ccc82952b3d56e212ab18fa2c0bf3", 21 | "d9448c8585f1678937deb5118d95b09bf6f41fe00a65b1fb82c7d176c6bfc532", 22 | "b41d990c8a127dac386dd6c9f2a6ec4ac41185cd21ef2df0a952a8cbdf61ed5d", 23 | ] 24 | 25 | 26 | def initialize_page(): 27 | """Initialize Streamlit page configuration""" 28 | st.set_page_config(layout="wide", initial_sidebar_state="expanded") 29 | st.title("👒 Fashion Items Recommender") 30 | st.sidebar.title("⚙️ Configuration") 31 | 32 | 33 | def initialize_services(): 34 | """Initialize tracker, updater, and deployments""" 35 | tracker = get_tracker() 36 | fg_updater = get_fg_updater() 37 | 38 | logger.info("Initializing deployments...") 39 | with st.sidebar: 40 | with st.spinner("🚀 Starting Deployments..."): 41 | articles_fv, ranking_deployment, query_model_deployment = get_deployments() 42 | st.success("✅ Deployments Ready") 43 | 44 | # Stop deployments button 45 | if st.button( 46 | "⏹️ Stop Deployments", key="stop_deployments_button", type="secondary" 47 | ): 48 | ranking_deployment.stop() 49 | query_model_deployment.stop() 50 | st.success("Deployments stopped successfully!") 51 | 52 | return tracker, fg_updater, articles_fv, ranking_deployment, query_model_deployment 53 | 54 | 55 | def show_interaction_dashboard(tracker, fg_updater, page_selection): 56 | """Display interaction data and controls""" 57 | with st.sidebar.expander("📊 Interaction Dashboard", expanded=True): 58 | if page_selection == "LLM Recommendations": 59 | api_key = ( 60 | settings.OPENAI_API_KEY.get_secret_value() 61 | if settings.OPENAI_API_KEY 62 | and settings.OPENAI_API_KEY.get_secret_value() 63 | else None 64 | ) 65 | if not api_key: 66 | api_key = st.text_input( 67 | "🔑 OpenAI API Key:", type="password", key="openai_api_key" 68 | ) 69 | if 
api_key: 70 | os.environ["OPENAI_API_KEY"] = api_key 71 | else: 72 | st.warning("⚠️ Please enter OpenAI API Key for LLM Recommendations") 73 | st.divider() 74 | 75 | interaction_data = tracker.get_interactions_data() 76 | 77 | col1, col2, col3 = st.columns(3) 78 | total = len(interaction_data) 79 | clicks = len(interaction_data[interaction_data["interaction_score"] == 1]) 80 | purchases = len(interaction_data[interaction_data["interaction_score"] == 2]) 81 | 82 | col1.metric("Total", total) 83 | col2.metric("Clicks", clicks) 84 | col3.metric("Purchases", purchases) 85 | 86 | st.dataframe(interaction_data, hide_index=True) 87 | fg_updater.process_interactions(tracker, force=True) 88 | 89 | 90 | def handle_llm_page(articles_fv, customer_id): 91 | """Handle LLM recommendations page""" 92 | if "OPENAI_API_KEY" in os.environ: 93 | llm_recommendations(articles_fv, os.environ["OPENAI_API_KEY"], customer_id) 94 | else: 95 | st.warning("Please provide your OpenAI API Key in the Interaction Dashboard") 96 | 97 | 98 | def process_pending_interactions(tracker, fg_updater): 99 | """Process interactions immediately""" 100 | fg_updater.process_interactions(tracker, force=True) 101 | 102 | 103 | def main(): 104 | # Initialize page 105 | initialize_page() 106 | 107 | # Initialize services 108 | tracker, fg_updater, articles_fv, ranking_deployment, query_model_deployment = ( 109 | initialize_services() 110 | ) 111 | 112 | # Select customer 113 | customer_id = st.sidebar.selectbox( 114 | "👤 Select Customer:", CUSTOMER_IDS, key="selected_customer" 115 | ) 116 | 117 | # Page selection 118 | page_options = ["Customer Recommendations", "LLM Recommendations"] 119 | page_selection = st.sidebar.radio("📑 Choose Page:", page_options) 120 | 121 | # Process any pending interactions with notification 122 | process_pending_interactions(tracker, fg_updater) 123 | 124 | # Interaction dashboard with OpenAI API key field 125 | show_interaction_dashboard(tracker, fg_updater, page_selection) 126 | 127 | # Handle page content 128 | if page_selection == "Customer Recommendations": 129 | customer_recommendations( 130 | articles_fv, ranking_deployment, query_model_deployment, customer_id 131 | ) 132 | else: # LLM Recommendations 133 | handle_llm_page(articles_fv, customer_id) 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /tools/clean_hopsworks_resources.py: -------------------------------------------------------------------------------- 1 | import hopsworks 2 | 3 | # Login to Hopsworks 4 | project = hopsworks.login() 5 | 6 | 7 | # Get deployment registry 8 | mr = project.get_model_serving() 9 | 10 | # List all deployments 11 | deployments = mr.get_deployments() 12 | 13 | # Delete each deployment 14 | for deployment in deployments: 15 | print(f"Deleting deployment: {deployment.name}.") 16 | deployment.stop() 17 | deployment.delete() 18 | 19 | # Get the model registry 20 | mr = project.get_model_registry() 21 | 22 | # List all models 23 | for model_name in [ 24 | "llm_ranking_model", 25 | "ranking_model", 26 | "candidate_model", 27 | "query_model", 28 | ]: 29 | models = mr.get_models(name=model_name) 30 | 31 | # Delete each model 32 | for model in models: 33 | print(f"Deleting model: {model.name} (version: {model.version})") 34 | try: 35 | model.delete() 36 | except Exception: 37 | print(f"Failed to delete model {model_name}.") 38 | 39 | # Get feature store 40 | fs = project.get_feature_store() 41 | 42 | for feature_view in [ 43 | 
"retrieval", 44 | "articles", 45 | "customers", 46 | "candidate_embeddings", 47 | "ranking", 48 | ]: 49 | # Get all feature views 50 | try: 51 | feature_views = fs.get_feature_views(name=feature_view) 52 | except: 53 | print(f"Couldn't find feature view: {feature_view}. Skipping...") 54 | feature_views = [] 55 | 56 | # Delete each feature view 57 | for fv in feature_views: 58 | print(f"Deleting feature view: {fv.name} (version: {fv.version})") 59 | try: 60 | fv.delete() 61 | except Exception: 62 | print(f"Failed to delete feature view {fv.name}.") 63 | 64 | for feature_group in [ 65 | "customers", 66 | "articles", 67 | "transactions", 68 | "interactions", 69 | "candidate_embeddings", 70 | "ranking", 71 | ]: 72 | # Get all feature groups 73 | try: 74 | feature_groups = fs.get_feature_groups(name=feature_group) 75 | except: 76 | print(f"Couldn't find feature group: {feature_view}. Skipping...") 77 | feature_groups = [] 78 | 79 | # Delete each feature group 80 | for fg in feature_groups: 81 | print(f"Deleting feature group: {fg.name} (version: {fg.version})") 82 | try: 83 | fg.delete() 84 | except: 85 | print(f"Failed to delete feature group {fv.name}.") 86 | --------------------------------------------------------------------------------