├── .devcontainer └── devcontainer.json ├── .env.example ├── .github ├── ISSUE_TEMPLATE │ ├── clarify-concept.md │ ├── questions-about-the-course-material.md │ └── technical-troubleshooting-or-bugs.md ├── actions │ └── setup │ │ └── action.yaml └── workflows │ └── ml_pipelines.yaml ├── .gitignore ├── .python-version ├── INSTALL_AND_USAGE.md ├── LICENSE ├── Makefile ├── README.md ├── assets ├── 4_stage_recommender_architecture.png ├── github_actions_manual_trigger.png ├── github_actions_pipeline_done.png ├── github_actions_pipeline_progress.png ├── github_actions_secrets.png ├── hopsworks.png ├── hopsworks_deployments.png ├── streamlit_choose_advanced_settings.png ├── streamlit_choose_app_type.png ├── streamlit_choose_main_settings.png ├── system_architecture.png ├── two_tower_embedding_model.png └── ui_example.png ├── notebooks ├── 1_fp_computing_features.ipynb ├── 2_tp_training_retrieval_model.ipynb ├── 3_tp_training_ranking_model.ipynb ├── 4_ip_computing_item_embeddings.ipynb ├── 5_ip_creating_deployments.ipynb ├── 6_scheduling_materialization_jobs.ipynb └── 7_ip_creating_deployments_llm_ranking.ipynb ├── packages.txt ├── pyproject.toml ├── recsys ├── __init__.py ├── config.py ├── features │ ├── __init__.py │ ├── articles.py │ ├── customers.py │ ├── embeddings.py │ ├── interaction.py │ ├── ranking.py │ └── transactions.py ├── hopsworks_integration │ ├── __init__.py │ ├── constants.py │ ├── feature_store.py │ ├── llm_ranker │ │ └── requirements.txt │ ├── llm_ranking_serving.py │ ├── ranking_serving.py │ └── two_tower_serving.py ├── inference │ ├── __init__.py │ ├── llm_ranking_predictor.py │ ├── query_transformer.py │ ├── ranking_predictor.py │ └── ranking_transformer.py ├── raw_data_sources │ ├── __init__.py │ └── h_and_m.py ├── training │ ├── __init__.py │ ├── ranking.py │ └── two_tower.py └── ui │ ├── __init__.py │ ├── feature_group_updater.py │ ├── interaction_tracker.py │ ├── recommenders.py │ └── utils.py ├── streamlit_app.py ├── tools └── clean_hopsworks_resources.py └── uv.lock /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3", 3 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 4 | "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye", 5 | "customizations": { 6 | "codespaces": { 7 | "openFiles": [ 8 | "README.md", 9 | "streamlit_app.py" 10 | ] 11 | }, 12 | "vscode": { 13 | "settings": {}, 14 | "extensions": [ 15 | "ms-python.python", 16 | "ms-python.vscode-pylance" 17 | ] 18 | } 19 | }, 20 | "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y 2 | 3 | 4 | 5 | Decoding ML Logo 6 | 7 | 8 | 9 |
10 |

📬 Stay Updated

11 |

Join Decoding ML for proven content on designing, coding, and deploying production-grade AI systems with software engineering and MLOps best practices to help you ship AI applications. Every week, straight to your inbox.

12 |
13 | 14 | 15 | 16 | 17 |

18 | 19 | Subscribe Now 20 | 21 |

22 | 23 | ------ 24 | 25 | # 🚀 Installation and Usage Guide 26 | 27 | This guide will help you set up and run a machine learning pipeline that includes feature engineering, model training, and deployment using Hopsworks and OpenAI. 28 | 29 | # 📑 Table of Contents 30 | 31 | - [📋 Prerequisites](#-prerequisites) 32 | - [🎯 Getting Started](#-getting-started) 33 | - [⚡️ Running the H&M Personalized Recommender](#️-running-the-hm-personalized-recommender) 34 | - [🤖 Running the ML Pipelines in GitHub Actions](#-running-the-ml-pipelines-in-github-actions) 35 | - [🌐 Live Demo](#-live-demo) 36 | - [☁️ Deploying the Streamlit App](#️-deploying-the-streamlit-app) 37 | 38 | # 📋 Prerequisites 39 | 40 | ## Local Tools 41 | You'll need the following tools installed locally: 42 | 43 | | Tool | Version | Purpose | Installation Link | 44 | |------|---------|---------|------------------| 45 | | Python | 3.11 | Programming language runtime | [Download](https://www.python.org/downloads/) | 46 | | uv | ≥ 0.4.30 | Python package installer and virtual environment manager | [Download](https://github.com/astral-sh/uv) | 47 | | GNU Make | ≥ 3.81 | Build automation tool | [Download](https://www.gnu.org/software/make/) | 48 | | Git | ≥2.44.0 | Version control | [Download](https://git-scm.com/downloads) 49 | 50 | ## Cloud Services 51 | The project requires access to these cloud services: 52 | 53 | | Service | Purpose | Cost | Required Credentials | Setup Guide | 54 | |---------|---------|------|---------------------|-------------| 55 | | [Hopsworks](https://rebrand.ly/serverless-github) | AI Lakehouse for feature store, model registry, and serving | Free tier available | `HOPSWORKS_API_KEY` | [Create API Key](https://docs.hopsworks.ai/latest/user_guides/projects/api_key/create_api_key/) | 56 | | [GitHub Actions](https://github.com/features/actions) | Compute & Automation | Free for public repos | - | - | 57 | | [OpenAI API](https://openai.com/index/openai-api/) | LLM API for recommender system | Pay-per-use | `OPENAI_API_KEY` | [Quick Start Guide](https://platform.openai.com/docs/quickstart) | 58 | 59 | # 🎯 Getting Started 60 | 61 | ## 1. Clone the Repository 62 | 63 | Start by cloning the repository and navigating to the project directory: 64 | ``` 65 | git clone https://github.com/decodingml/personalized-recommender-course.git 66 | cd personalized-recommender-course 67 | ``` 68 | 69 | Next, we have to prepare your Python environment and its adjacent dependencies. 70 | 71 | ## 2. Installation 72 | 73 | Set up the project environment by running the following: 74 | ```bash 75 | make install 76 | ``` 77 | Test that you have Python 3.11.8 installed in your new `uv` environment: 78 | ```bash 79 | uv run python --version 80 | # Output: Python 3.11.8 81 | ``` 82 | 83 | This command will: 84 | - Create a virtual environment using `uv` 85 | - Activate the virtual environment 86 | - Install all dependencies from `pyproject.toml` 87 | 88 | > [!NOTE] 89 | > Normally, `uv` will pick the right Python version mentioned in `.python-version` and install it automatically if it is not on your system. If you are having any issues, explicitly install the right Python version by running `make install-python` 90 | 91 | ## 3. Environment Configuration 92 | 93 | Before running any components: 94 | 1. Create your environment file: 95 | ```bash 96 | cp .env.example .env 97 | ``` 98 | 2. Open `.env` and configure the required credentials following the inline comments and the recommendations from the [Cloud Services](#-prerequisites) section. 
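
For reference, a filled-in `.env` needs at least the two credentials listed in the [Cloud Services](#-prerequisites) table. A minimal sketch with placeholder values (replace them with your own keys):

```bash
# .env — illustrative placeholders, not real keys
HOPSWORKS_API_KEY=<your-hopsworks-api-key>
OPENAI_API_KEY=<your-openai-api-key>  # used only by the optional LLM ranking components
```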
99 | 100 | # ⚡️ Running the H&M Personalized Recommender 101 | 102 | ## Notebooks 103 | 104 | For instructions on exploring the Notebooks, check out the [📚 Course](https://github.com/decodingml/personalized-recommender-course?tab=readme-ov-file#-course-outline) section from the main [README](https://github.com/decodingml/personalized-recommender-course?tab=readme-ov-file#-course-outline). 105 | 106 | ## Running the ML Pipelines 107 | 108 | You can run the entire pipeline at once or execute individual components. 109 | 110 | ### Running Everything in One Go (Quick) 111 | 112 | Execute all the ML pipelines in a sequence: 113 | ```bash 114 | make all 115 | ``` 116 | It will take ~1.5 hours to run, depending on your machine. 117 | 118 | This runs the following steps: 119 | 1. Feature engineering 120 | 2. Retrieval model training 121 | 3. Ranking model training 122 | 4. Candidate embeddings creation 123 | 5. Inference pipeline deployment 124 | 6. Materialization job scheduling 125 | 126 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Deployments** 127 | 128 | Start the Streamlit UI: 129 | ```bash 130 | make start-ui 131 | ``` 132 | Accessible at `http://localhost:8501/` 133 | 134 | ### Running Individual Components (Recommended) 135 | 136 | Each component can be run separately: 137 | 138 | 1. **Feature Engineering** 139 | ```bash 140 | make feature-engineering 141 | ``` 142 | It will take ~1 hour to run, depending on your machine. 143 | 144 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Feature Store → Feature Groups** 145 | 146 | 2. **Retrieval Model Training** 147 | ```bash 148 | make train-retrieval 149 | ``` 150 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Model Registry** 151 | 152 | 3. **Ranking Model Training** 153 | ```bash 154 | make train-ranking 155 | ``` 156 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Model Registry** 157 | 158 | 4. **Embeddings Creation** 159 | ```bash 160 | make create-embeddings 161 | ``` 162 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Feature Store → Feature Groups** 163 | 164 | 5. **Deployment Creation** 165 | ```bash 166 | make create-deployments 167 | ``` 168 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Deployments** 169 | 170 |

171 | 172 | hopsworks_deployments 173 | 174 |

175 | 176 | Start the Streamlit UI: 177 | ```bash 178 | make start-ui 179 | ``` 180 | Accessible at `http://localhost:8501/` 181 | 182 | > [!IMPORTANT] 183 | > The demo is in 0-cost mode, which means that when there is no traffic, the deployment scales to 0 instances. The first time you interact with it, give it 1-2 minutes to warm up to 1+ instances. Afterward, everything will become smoother. 184 | 185 | 6. **Materialization Job Scheduling** 186 | ```bash 187 | make schedule-materialization-jobs 188 | ``` 189 | View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Compute → Ingestions** 190 | 191 | 7. **Deployment Creation with LLM Ranking (Optional)** 192 | 193 | Optional step to replace the standard deployments (created in Step 5) with the ones powered by LLMs: 194 | ```bash 195 | make create-deployments-llm-ranking 196 | ``` 197 | **NOTE**: If the script fails, go to [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Deployments**, forcefully stop all the deployments and run again. 198 | 199 | > [!WARNING] 200 | > The LLM Ranking deployment overrides the deployment from **5. Deployment Creation** 201 | 202 | Start the Streamlit UI that interfaces the LLM deployment: 203 | ```bash 204 | make start-ui-llm-ranking 205 | ``` 206 | Accessible at `http://localhost:8501/` 207 | 208 | > [!WARNING] 209 | > The Streamlit UI command is compatible only with its corresponding deployment. For example, running the deployment from **5. Deployment Creation** and `make start-ui-llm-ranking` won't work. 210 | 211 | ## Clean Up Resources 212 | 213 | Remove all created resources from [Hopsworks Serverless](https://rebrand.ly/serverless-github): 214 | ```bash 215 | make clean-hopsworks-resources 216 | ``` 217 | 218 | ### 🚨 Important Notes 219 | - Ensure UV is properly installed and configured before running any commands 220 | - All notebooks are executed using IPython through the UV virtual environment 221 | - Components should be run in the specified order when executing individually 222 | 223 | # 🤖 Running the ML Pipelines in GitHub Actions 224 | 225 | This project supports running ML pipelines automatically through GitHub Actions, providing an alternative to local or Colab execution. 226 | 227 | > [!NOTE] 228 | > This is handy when getting network errors, such as timeouts, on your local machine. GitHub Actions has an enterprise-level network that will run your ML pipelines smoothly. 229 | 230 | ## Pipeline Triggers 231 | 232 | The ML pipelines can be triggered in three ways: 233 | - Manual trigger through GitHub UI 234 | - Scheduled execution (configurable) 235 | - On push to main branch (configurable) 236 | 237 | ## Setup Process 238 | 239 | ### 1. Fork Repository 240 | Create your own copy of the repository to access GitHub Actions: 241 | ```bash 242 | # Use GitHub's UI to fork the repository 243 | https://github.com/original-repo/name → Your-Username/name 244 | ``` 245 | [📚 GitHub Fork Guide](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) 246 | 247 | ### 2. Configure Secrets 248 | Set up required environment variables as GitHub Actions secrets: 249 | 250 | **Option A: Using GitHub UI** 251 | 1. Navigate to: Repository → Settings → Secrets and variables → Actions 252 | 2. Click "New repository secret" 253 | 3. 
Add required secrets: 254 | - `HOPSWORKS_API_KEY` 255 | - `OPENAI_API_KEY` 256 | 257 | [📚 Set up GitHub Actions Secrets Guide](https://docs.github.com/en/actions/security-for-github-actions/security-guides/using-secrets-in-github-actions?tool=webui) 258 | 259 |

260 | 261 | GA Secrets 262 | 263 |

264 | 265 | **Option B: Using GitHub CLI** 266 | 267 | If you have `GitHub CLI` installed, instead of settings the GitHub Actions secrets manually, you can set them by running the following: 268 | 269 | ```bash 270 | gh secret set HOPSWORKS_API_KEY 271 | gh secret set OPENAI_API_KEY 272 | ``` 273 | 274 | ### 3. Execute Pipeline 275 | 276 | #### Manual Execution 277 | 1. Go to Actions → ML Pipelines 278 | 2. Click "Run workflow" 279 | 3. Select branch (default: main) 280 | 4. Click "Run workflow" 281 | 282 |

283 | 284 | GA Manual Trigger 285 | 286 |

287 | 288 | After triggering the pipeline, you will see it running, signaled by a yellow circle. Click on it to see the progress. 289 | 290 |

291 | 292 | GA Progress 293 | 294 |

295 | 296 | After it is finished, it should look like this: 297 | 298 |

299 | 300 | GA Done 301 | 302 |

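
If you prefer the terminal over the GitHub UI, the same `workflow_dispatch` trigger can be fired with the GitHub CLI (assuming `gh` is installed and authenticated against your fork):

```bash
# Trigger the ML pipelines workflow on the main branch of your fork
gh workflow run ml_pipelines.yaml --ref main

# Follow the latest run's progress from the terminal
gh run watch
```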
303 | 304 | #### Automated Execution 305 | 306 | Another option is to run the ML pipelines automatically on a schedule or when new commits are pushed to the main branch. 307 | 308 | Edit `.github/workflows/ml_pipelines.yaml` to enable automatic triggers: 309 | 310 | ```yaml 311 | name: ML Pipelines 312 | 313 | on: 314 | # schedule: # Uncomment to run the pipelines every 2 hours. All the pipelines take ~1.5 hours to run. 315 | # - cron: '0 */2 * * *' 316 | # push: # Uncomment to run pipelines on every new commit to main 317 | # branches: 318 | # - main 319 | workflow_dispatch: # Allows manual triggering from GitHub UI 320 | ``` 321 | 322 | ## Monitoring & Results 323 | 324 | 1. **Pipeline Progress** 325 | - View real-time execution in Actions tab 326 | - Each step shows detailed logs and status 327 | 328 | 2. **Output Verification** 329 | - Access results in [Hopsworks Serverless](https://rebrand.ly/serverless-github) 330 | - Check Feature Groups, Feature Views, Model Registry, and Deployments 331 | 332 | ## ⚠️ Important Notes 333 | - Full pipeline execution takes approximately 1.5 hours 334 | - Ensure sufficient GitHub Actions minutes available 335 | - Monitor usage when enabling automated triggers 336 | 337 | # 🌐 Live Demo 338 | 339 | Try out our deployed H&M real-time personalized recommender to see what you'll learn to build by the end of this course: 340 | [💻 Live H&M Recommender Streamlit Demo](https://decodingml-hands-on-personalized-recommender.streamlit.app/) 341 | 342 | > [!IMPORTANT] 343 | > The demo is in 0-cost mode, which means that when there is no traffic, the deployment scales to 0 instances. The first time you interact with it, give it 1-2 minutes to warm up to 1+ instances. Afterward, everything will become smoother. 344 | 345 |

346 | 347 | UI Example 348 | 349 |

350 | 351 | # ☁️ Deploying the Streamlit App 352 | 353 | Deploying a Streamlit App to their [cloud](https://streamlit.io/cloud) is free and straightforward after the GitHub repository is set in right place: 354 | 355 | - `uv.lock` - installing Python dependencies 356 | - `packages.txt` - installing system dependencies 357 | - `streamlit_app.py` - entrypoint to the Streamlit application 358 | 359 | ## Deployment Steps 360 | 361 | ### 1. Repository Setup 362 | Fork the repository if you haven't already: 363 | ```bash 364 | # Use GitHub's UI to fork the repository 365 | https://github.com/original-repo/name → Your-Username/name 366 | ``` 367 | [📚 GitHub Fork Guide](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) 368 | 369 | ### 2. Streamlit Cloud Setup 370 | 1. Create a free account on [Streamlit Cloud](https://docs.streamlit.io/deploy/streamlit-community-cloud/get-started) 371 | 2. Navigate to [New App Deployment](https://docs.streamlit.io/deploy/streamlit-community-cloud/deploy-your-app) 372 | 3. Configure deployment settings: 373 | 374 | | Setting | Configuration | Description | 375 | |---------|--------------|-------------| 376 | | App Type | ![App Type](assets/streamlit_choose_app_type.png) | Select "Deploy a public app from GitHub" | 377 | | Main Settings | ![Main Settings](assets/streamlit_choose_main_settings.png) | Configure your repository | 378 | | Advanced Settings | ![Advanced Settings](assets/streamlit_choose_advanced_settings.png) | Set Python 3.11 and `HOPSWORKS_API_KEY` | 379 | 380 | ## ⚠️ Important Notes 381 | - Ensure all required files are present in your repository 382 | - Python version must be set to 3.11 383 | - `HOPSWORKS_API_KEY` must be configured in environment variables 384 | - Repository must be public for free tier deployment 385 | 386 | [📚 More on Streamlit Cloud deployments](https://docs.streamlit.io/deploy) 387 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Crafted Intelligence SRL 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install-python: 2 | uv python install 3 | 4 | install: 5 | uv venv 6 | . 
.venv/bin/activate 7 | uv pip install --all-extras --requirement pyproject.toml 8 | 9 | start-ui: 10 | RANKING_MODEL_TYPE=ranking uv run python -m streamlit run streamlit_app.py 11 | 12 | start-ui-llm-ranking: 13 | RANKING_MODEL_TYPE=llmranking uv run python -m streamlit run streamlit_app.py 14 | 15 | clean-hopsworks-resources: 16 | uv run python tools/clean_hopsworks_resources.py 17 | 18 | all: feature-engineering train-retrieval train-ranking create-embeddings create-deployments schedule-materialization-jobs 19 | 20 | feature-engineering: 21 | uv run ipython notebooks/1_fp_computing_features.ipynb 22 | 23 | train-retrieval: 24 | uv run ipython notebooks/2_tp_training_retrieval_model.ipynb 25 | 26 | train-ranking: 27 | uv run ipython notebooks/3_tp_training_ranking_model.ipynb 28 | 29 | create-embeddings: 30 | uv run ipython notebooks/4_ip_computing_item_embeddings.ipynb 31 | 32 | create-deployments: 33 | uv run ipython notebooks/5_ip_creating_deployments.ipynb 34 | 35 | schedule-materialization-jobs: 36 | uv run ipython notebooks/6_scheduling_materialization_jobs.ipynb 37 | 38 | create-deployments-llm-ranking: 39 | uv run ipython notebooks/7_ip_creating_deployments_llm_ranking.ipynb 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |

Hands-on H&M Real-Time Personalized Recommender

3 |

Open-source course by Decoding ML in collaboration with Hopsworks.

4 |
5 | 6 |
7 | 8 |

9 | 10 | Architecture 11 | 12 |

13 | 14 | ## 🎯 What You'll Learn 15 | 16 | This hands-on course teaches you how to build and deploy a real-time personalized recommender system for H&M fashion articles. You'll learn: 17 | 18 | - To architect a modern ML system for real-time personalized recommenders. 19 | - To do feature engineering using modern tools such as Polars. 20 | - To design and train ML models for recommender systems powered by neural networks. 21 | - To use MLOps best practices by leveraging [Hopsworks AI Lakehouse](https://rebrand.ly/homepage-github). 22 | - To deploy the recommender on a Kubernetes cluster managed by [Hopsworks Serverless](https://rebrand.ly/serverless-github) using KServe. 23 | - To apply LLM techniques for personalized recommendations. 24 | 25 |

26 | 4_stage_recommender_architecture 27 | two_tower_embedding_model 28 |

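
The two-tower model shown above comes down to one idea: a query tower embeds customers and a candidate tower embeds articles into the same vector space, so finding good candidates becomes a nearest-neighbour search over item embeddings. The course's actual implementation lives in `recsys/training/two_tower.py`; the snippet below is only a minimal sketch of that idea with TensorFlow Recommenders (feature names and dimensions are illustrative):

```python
import tensorflow as tf
import tensorflow_recommenders as tfrs


class TwoTowerSketch(tfrs.Model):
    """Minimal two-tower retrieval model: not the course code, just the core idea."""

    def __init__(self, customer_ids: list[str], article_ids: list[str], emb_dim: int = 16):
        super().__init__()
        # Query tower: customer id -> embedding in the shared space.
        self.query_tower = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=customer_ids),
            tf.keras.layers.Embedding(len(customer_ids) + 1, emb_dim),
        ])
        # Candidate tower: article id -> embedding in the same space.
        self.item_tower = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=article_ids),
            tf.keras.layers.Embedding(len(article_ids) + 1, emb_dim),
        ])
        # Retrieval task: pull embeddings of observed (customer, article) pairs together.
        self.task = tfrs.tasks.Retrieval()

    def compute_loss(self, features: dict, training: bool = False) -> tf.Tensor:
        query_embeddings = self.query_tower(features["customer_id"])
        item_embeddings = self.item_tower(features["article_id"])
        return self.task(query_embeddings, item_embeddings)
```

Training this on (customer, purchased article) pairs is what lets the first retrieval stage of the 4-stage design run as a fast vector search instead of scoring every article.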
29 | 30 | ## 📖 About This Course 31 | 32 | This course is part of Decoding ML's open-source series, where we provide free hands-on resources for building GenAI and recommender systems. 33 | 34 | The **Hands-on H&M Real-Time Personalized Recommender**, in collaboration with [Hopsworks](https://rebrand.ly/homepage-github), is a 5-module course backed up by code, Notebooks and lessons that will teach you how to build an H&M real-time personalized recommender from scratch. 35 | 36 | By the end of this course, you will know how to architect, build and deploy a modern recommender. 37 | 38 | **What you'll do:** 39 | 40 | 1. Architect a scalable and modular ML system using the Feature/Training/Inference (FTI) architecture. 41 | 3. Feature engineering on top of our H&M data for collaborative and content-based filtering techniques for recommenders. 42 | 2. Use the two-tower network to Create user and item embeddings in the same vector space. 43 | 3. Implement an H&M real-time personalized recommender using the 4-stage recommender design and a vector database. 44 | 4. Use MLOps best practices, such as a feature store and a model registry. 45 | 5. Deploy the online inference pipeline to Kubernetes using KServe. 46 | 6. Deploy the offline ML pipelines to GitHub Actions. 47 | 7. Implement a web interface using Streamlit. 48 | 8. Improve the H&M real-time personalized recommender using LLMs. 49 | 50 | 🥷 With these skills, you'll become a ninja in building real-time personalized recommenders. 51 | 52 | ## 🌐 Live Demo 53 | 54 | Try out our deployed H&M real-time personalized recommender to see what you'll learn to build by the end of this course: 55 | [💻 Live H&M Recommender Streamlit Demo](https://decodingml-hands-on-personalized-recommender.streamlit.app/) 56 | 57 | > [!IMPORTANT] 58 | > The demo is in 0-cost mode, which means that when there is no traffic, the deployment scales to 0 instances. The first time you interact with it, give it 1-2 minutes to warm up to 1+ instances. Afterward, everything will become smoother. 59 | 60 |

61 | 62 | UI Example 63 | 64 |

65 | 66 | ---- 67 | 68 | 69 | 70 | 75 | 81 | 82 |
71 | 72 | Decoding ML Logo 73 | 74 | 76 |
77 |

📬 Stay Updated

78 |

Join Decoding ML for proven content on designing, coding, and deploying production-grade AI systems with software engineering and MLOps best practices to help you ship AI applications. Every week, straight to your inbox.

79 |
80 |
83 | 84 |

85 | 86 | Subscribe Now 87 | 88 |

89 | 90 | ## 👥 Who Should Join? 91 | 92 | **This course is ideal for:** 93 | - ML/AI engineers interested in building production-ready recommender systems 94 | - Data Engineers, Data Scientists, and Software Engineers wanting to understand the engineering behind recommenders 95 | 96 | **Note:** This course focuses on engineering practices and end-to-end system implementation rather than theoretical model optimization or research. 97 | 98 | ## 🎓 Prerequisites 99 | 100 | | Category | Requirements | 101 | |----------|-------------| 102 | | **Skills** | Basic understanding of Python and Machine Learning | 103 | | **Hardware** | Any modern laptop/workstation will do the job (no GPU or powerful computing power required). We also support Google Colab or GitHub Actions for compute.| 104 | | **Level** | Intermediate | 105 | 106 | 107 | ## 💰 Cost Structure 108 | 109 | All tools used throughout the course will stick to their free tier, except OpenAI's API, as follows: 110 | 111 | - Modules 1-4: Completely free 112 | - Module 5 (Optional): ~$1-2 for OpenAI API usage when building LLM-enhanced recommenders 113 | 114 | ## 🥂 Open-source Course: Participation is Open and Free 115 | 116 | As an open-source course, you don't have to enroll. Everything is self-paced, free of charge and with its resources freely accessible as follows: 117 | - **code**: this GitHub repository 118 | - **articles**: [Decoding ML](https://decodingml.substack.com/p/the-ultimate-recommender-system-framework) 119 | 120 | ## 📚 Course Outline 121 | 122 | This **open-source course consists of 5 comprehensive modules** covering theory, system design, and hands-on implementation. 123 | 124 | Our recommendation for each module: 125 | 1. Read the article 126 | 2. Run the Notebook to replicate our results (locally or on Colab) 127 | 3. Following the Notebook, go deeper into the code by reading the `recsys` Python module 128 | 129 | > [!NOTE] 130 | > Check the [INSTALL_AND_USAGE](https://github.com/decodingml/hands-on-personalized-recommender/blob/main/INSTALL_AND_USAGE.md) doc for a step-by-step installation and usage guide. 131 | 132 | | Module | Article | Description | Notebooks | 133 | |--------|-------|-------------|----------------| 134 | | 1 | [Building a TikTok-like recommender](https://decodingml.substack.com/p/33d3273e-b8e3-4d98-b160-c3d239343022) | Learn how to architect a recommender system using the 4-stage architecture and two-tower network. | **No code** | 135 | | 2 | [Feature pipelines for TikTok-like recommenders](https://decodingml.substack.com/p/feature-pipeline-for-tiktok-like) | Learn how to build a scalable feature pipeline using a feature store. | •[1_fp_computing_features.ipynb](notebooks/1_fp_computing_features.ipynb) | 136 | | 3 | [Training pipelines for TikTok-like recommenders](https://decodingml.substack.com/p/training-pipelines-for-tiktok-like) | Learn to train and evaluate the two-tower network and ranking model using MLOps best practices. | •[2_tp_training_retrieval_model.ipynb](notebooks/2_tp_training_retrieval_model.ipynb)
•[3_tp_training_ranking_model.ipynb](notebooks/3_tp_training_ranking_model.ipynb) | 137 | | 4 | [Deploy scalable TikTok-like recommenders](https://decodingml.substack.com/p/deploy-scalable-tiktok-like-recommenders) | Learn how to architect and deploy the inference pipelines for real-time recommendations using the 4-stage design. | •[4_ip_computing_item_embeddings.ipynb](notebooks/4_ip_computing_item_embeddings.ipynb)
•[5_ip_creating_deployments.ipynb](notebooks/5_ip_creating_deployments.ipynb)
•[6_scheduling_materialization_jobs.ipynb](notebooks/6_scheduling_materialization_jobs.ipynb) | 138 | | 5 | [Using LLMs to build TikTok-like recommenders](https://decodingml.substack.com/p/using-llms-to-build-tiktok-like-recommenders) | Learn how to enhance the H&M personalized recommender with LLMs. | •[7_ip_creating_deployments_llm_ranking.ipynb](notebooks/7_ip_creating_deployments_llm_ranking.ipynb) | 139 | 140 | ### Google Colab 141 | 142 | To run the Notebooks in Google Colab, copy-paste them into your Google Drive, open them with Google Colab, and run them as running them locally. At the beginning of each Notebook, we have a set of setup steps that will **prepare the code and Python environment automatically**. 143 | 144 | ---- 145 | 146 | 147 | 148 | 153 | 159 | 160 |
149 | 150 | Decoding ML Logo 151 | 152 | 154 |
155 |

📬 Stay Updated

156 |

Join Decoding ML for proven content on designing, coding, and deploying production-grade AI systems with software engineering and MLOps best practices to help you ship AI applications. Every week, straight to your inbox.

157 |
158 |
161 | 162 |

163 | 164 | Subscribe Now 165 | 166 |

167 | 168 | ## 🏗️ Project Structure 169 | 170 | At Decoding ML we teach how to build production ML systems, thus the course follows the structure of a real-world Python project: 171 | 172 | ```bash 173 | . 174 | ├── notebooks/ # Jupyter notebooks for each pipeline 175 | ├── recsys/ # Core recommender system package 176 | │ ├── config.py # Configuration and settings 177 | │ ... 178 | │ └── training/ # Training pipelines code 179 | ├── tools/ # Utility scripts 180 | ├── streamlit_app.py # Streamlit app entry point 181 | ├── .env.example # Example environment variables template 182 | ├── Makefile # Commands to install and run the project 183 | ├── pyproject.toml # Project dependencies 184 | ``` 185 | 186 | ## 👔 Dataset 187 | 188 | We will use the [H&M Personalized Fashion Recommendations](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations) dataset, available on Kaggle, open-source for academic research and education. 189 | 190 | It is an e-commerce dataset that contains fashion articles from the H&M clothes brand. 191 | 192 | It contains: 193 | - 105k articles 194 | - 137k customers 195 | - 31 million transactions 196 | 197 | More on the dataset in the feature engineering pipeline [Notebook](notebooks/1_fp_computing_features.ipynb) and [article](https://decodingml.substack.com/p/feature-pipeline-for-tiktok-like). 198 | 199 | ## 🚀 Getting Started 200 | 201 | For detailed installation and usage instructions, see our [INSTALL_AND_USAGE](https://github.com/decodingml/hands-on-personalized-recommender/blob/main/INSTALL_AND_USAGE.md) guide. 202 | 203 | **Recommendation:** While you can follow the installation guide directly, we strongly recommend reading the accompanying articles to gain a complete understanding of the recommender system. 204 | 205 | ## 💡 Questions and Troubleshooting 206 | 207 | Have questions or running into issues? We're here to help! 208 | 209 | Open a [GitHub issue](https://github.com/decodingml/hands-on-personalized-recommender/issues) for: 210 | - Questions about the course material 211 | - Technical troubleshooting 212 | - Clarification on concepts 213 | 214 | When having issues with [Hopsworks Serverless](https://rebrand.ly/serverless-github), the best place to ask questions is on [Hopsworks's Slack](https://join.slack.com/t/public-hopsworks/shared_invite/zt-1uf21vitz-rhHKNdIf8GEiOf1EJ6Wzsw), where their engineers can help you directly. 215 | 216 | ## 🥂 Contributing 217 | 218 | As an open-source course, we may not be able to fix all the bugs that arise. 219 | 220 | If you find any bugs and know how to fix them, support future readers by contributing to this course with your bug fix. 221 | 222 | We will deeply appreciate your support for the AI community and future readers 🤗 223 | 224 | ## Sponsors 225 | 226 | 227 | 228 | 231 | 232 | 233 | 238 | 239 |
229 | Hopsworks 230 |
234 | 235 | Hopsworks 236 | 237 |
240 | 241 | ## Contributors 242 | 243 | 244 | 245 | 252 | 259 | 266 | 273 | 274 |
246 | 247 | Paul Iusztin
248 | Paul Iusztin 249 |

250 | AI/ML Engineer 251 |
253 | 254 | Anca Ioana Muscalagiu
255 | Anca Ioana Muscalagiu 256 |

257 | AI/ML Engineer 258 |
260 | 261 | Paolo Perrone
262 | Paolo Perrone 263 |

264 | AI/ML Engineer 265 |
267 | 268 | Hopsworks
269 | Hopsworks's Engineering Team 270 |

271 | AI Lakehouse 272 |
275 | 276 | 277 | ## License 278 | 279 | This course is an open-source project released under the MIT license. Thus, as long you distribute our LICENSE and acknowledge your project is based on our work, you can safely clone or fork this project and use it as a source of inspiration for your educational projects (e.g., university, college degree, personal projects, etc.). 280 | 281 | ---- 282 | 283 | 284 | 285 | 290 | 296 | 297 |
286 | 287 | Decoding ML Logo 288 | 289 | 291 |
292 |

📬 Stay Updated

293 |

Join Decoding ML for proven content on designing, coding, and deploying production-grade AI systems with software engineering and MLOps best practices to help you ship AI applications. Every week, straight to your inbox.

294 |
295 |
298 | 299 |

300 | 301 | Subscribe Now 302 | 303 |

304 | -------------------------------------------------------------------------------- /assets/4_stage_recommender_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/4_stage_recommender_architecture.png -------------------------------------------------------------------------------- /assets/github_actions_manual_trigger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/github_actions_manual_trigger.png -------------------------------------------------------------------------------- /assets/github_actions_pipeline_done.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/github_actions_pipeline_done.png -------------------------------------------------------------------------------- /assets/github_actions_pipeline_progress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/github_actions_pipeline_progress.png -------------------------------------------------------------------------------- /assets/github_actions_secrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/github_actions_secrets.png -------------------------------------------------------------------------------- /assets/hopsworks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/hopsworks.png -------------------------------------------------------------------------------- /assets/hopsworks_deployments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/hopsworks_deployments.png -------------------------------------------------------------------------------- /assets/streamlit_choose_advanced_settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/streamlit_choose_advanced_settings.png -------------------------------------------------------------------------------- /assets/streamlit_choose_app_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/streamlit_choose_app_type.png -------------------------------------------------------------------------------- /assets/streamlit_choose_main_settings.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/streamlit_choose_main_settings.png -------------------------------------------------------------------------------- /assets/system_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/system_architecture.png -------------------------------------------------------------------------------- /assets/two_tower_embedding_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/two_tower_embedding_model.png -------------------------------------------------------------------------------- /assets/ui_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/assets/ui_example.png -------------------------------------------------------------------------------- /notebooks/5_ip_creating_deployments.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import time\n", 10 | "\n", 11 | "notebook_start_time = time.time()" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Set up environment" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "⛳️ Local environment\n", 31 | "Adding the following directory to the PYTHONPATH: /Users/pauliusztin/Documents/01_projects/hopsworks_recsys/hands-on-recommender-system\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "import sys\n", 37 | "from pathlib import Path\n", 38 | "\n", 39 | "\n", 40 | "def is_google_colab() -> bool:\n", 41 | " if \"google.colab\" in str(get_ipython()):\n", 42 | " return True\n", 43 | " return False\n", 44 | "\n", 45 | "\n", 46 | "def clone_repository() -> None:\n", 47 | " !git clone https://github.com/decodingml/hands-on-recommender-system.git\n", 48 | " %cd hands-on-recommender-system/\n", 49 | "\n", 50 | "\n", 51 | "def install_dependencies() -> None:\n", 52 | " !pip install --upgrade uv\n", 53 | " !uv pip install --all-extras --system --requirement pyproject.toml\n", 54 | "\n", 55 | "\n", 56 | "if is_google_colab():\n", 57 | " clone_repository()\n", 58 | " install_dependencies()\n", 59 | "\n", 60 | " root_dir = str(Path().absolute())\n", 61 | " print(\"⛳️ Google Colab environment\")\n", 62 | "else:\n", 63 | " root_dir = str(Path().absolute().parent)\n", 64 | " print(\"⛳️ Local environment\")\n", 65 | "\n", 66 | "# Add the root directory to the `PYTHONPATH` to use the `recsys` Python module from the notebook.\n", 67 | "if root_dir not in sys.path:\n", 68 | " print(f\"Adding the following directory to the PYTHONPATH: {root_dir}\")\n", 69 | " sys.path.append(root_dir)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "# Online inference pipeline: Deploying and testing the real-time ML services\n", 77 | "\n", 78 | "In this notebook, we will dig into 
the inference pipeline and deploy it to Hopsworks as a real-time service." 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## 📝 Imports" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "import warnings\n", 95 | "\n", 96 | "warnings.filterwarnings(\"ignore\")\n", 97 | "\n", 98 | "from loguru import logger\n", 99 | "\n", 100 | "from recsys import hopsworks_integration" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## 🔮 Connect to Hopsworks Feature Store " 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stderr", 117 | "output_type": "stream", 118 | "text": [ 119 | "\u001b[32m2024-12-24 13:12:11.849\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mrecsys.hopsworks_integration.feature_store\u001b[0m:\u001b[36mget_feature_store\u001b[0m:\u001b[36m13\u001b[0m - \u001b[1mLoging to Hopsworks using HOPSWORKS_API_KEY env var.\u001b[0m\n" 120 | ] 121 | }, 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "2024-12-24 13:12:11,850 INFO: Initializing external client\n", 127 | "2024-12-24 13:12:11,850 INFO: Base URL: https://c.app.hopsworks.ai:443\n", 128 | "2024-12-24 13:12:13,423 INFO: Python Engine initialized.\n", 129 | "\n", 130 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1192098\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "project, fs = hopsworks_integration.get_feature_store()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "tags": [] 142 | }, 143 | "source": [ 144 | "# Deploying the ranking inference pipeline\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "You start by deploying your ranking model. 
Since it is a CatBoost model you need to implement a `Predict` class that tells Hopsworks how to load the model and how to use it:" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stderr", 161 | "output_type": "stream", 162 | "text": [ 163 | "Uploading: 100.000%|██████████| 4491/4491 elapsed<00:01 remaining<00:00\n", 164 | "Uploading: 100.000%|██████████| 1113/1113 elapsed<00:01 remaining<00:00\n" 165 | ] 166 | }, 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "Deployment created, explore it at https://c.app.hopsworks.ai:443/p/1192098/deployments/353319\n", 172 | "Before making predictions, start the deployment by using `.start()`\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "ranking_deployment = hopsworks_integration.ranking_serving.HopsworksRankingModel.deploy(\n", 178 | " project=project\n", 179 | ")" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "Now, we have to explicitly start the deployment:" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 6, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "name": "stderr", 196 | "output_type": "stream", 197 | "text": [ 198 | "Deployment is ready: 100%|██████████| 6/6 [00:47<00:00, 7.88s/it] " 199 | ] 200 | }, 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "Start making predictions by using `.predict()`\n" 206 | ] 207 | }, 208 | { 209 | "name": "stderr", 210 | "output_type": "stream", 211 | "text": [ 212 | "\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "ranking_deployment.start()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 7, 223 | "metadata": { 224 | "tags": [] 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "# Check logs in case of failure\n", 229 | "# ranking_deployment.get_logs(component=\"transformer\", tail=200)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "## Test the ranking inference pipeline\n" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 8, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "def get_top_recommendations(ranked_candidates, k=3):\n", 246 | " return [candidate[-1] for candidate in ranked_candidates[\"ranking\"][:k]]" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "Let's define a dummy test example to test our ranking deployment (only the `customer_id` has to match):" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 9, 259 | "metadata": { 260 | "tags": [] 261 | }, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "['592846001', '536139006', '408554004']" 267 | ] 268 | }, 269 | "execution_count": 9, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "test_ranking_input = [\n", 276 | " {\n", 277 | " \"customer_id\": \"d327d0ad9e30085a436933dfbb7f77cf42e38447993a078ed35d93e3fd350ecf\",\n", 278 | " \"month_sin\": 1.2246467991473532e-16,\n", 279 | " \"query_emb\": [\n", 280 | " 0.214135289,\n", 281 | " 0.571055949,\n", 282 | " 0.330709577,\n", 283 | " -0.225899458,\n", 284 | " -0.308674961,\n", 285 | " -0.0115124583,\n", 286 | " 0.0730511621,\n", 287 | " -0.495835781,\n", 288 | " 0.625569344,\n", 289 | " -0.0438038409,\n", 290 | " 0.263472944,\n", 291 | " 
-0.58485353,\n", 292 | " -0.307070434,\n", 293 | " 0.0414443575,\n", 294 | " -0.321789205,\n", 295 | " 0.966559,\n", 296 | " ],\n", 297 | " \"month_cos\": -1.0,\n", 298 | " }\n", 299 | " ]\n", 300 | "\n", 301 | "# Test ranking deployment\n", 302 | "ranked_candidates = ranking_deployment.predict(inputs=test_ranking_input)\n", 303 | "\n", 304 | "# Retrieve article ids of the top recommended items\n", 305 | "recommendations = get_top_recommendations(ranked_candidates[\"predictions\"], k=3)\n", 306 | "recommendations" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "Check logs in case of failure:" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 10, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "# ranking_deployment.get_logs(component=\"transformer\", tail=200)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "tags": [] 329 | }, 330 | "source": [ 331 | "# Deploying the query inference pipeline" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 11, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "name": "stdout", 341 | "output_type": "stream", 342 | "text": [ 343 | "2024-12-24 13:13:14,889 INFO: Closing external client and cleaning up certificates.\n", 344 | "Connection closed.\n", 345 | "2024-12-24 13:13:14,894 INFO: Initializing external client\n", 346 | "2024-12-24 13:13:14,895 INFO: Base URL: https://c.app.hopsworks.ai:443\n", 347 | "2024-12-24 13:13:16,223 INFO: Python Engine initialized.\n", 348 | "\n", 349 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1192098\n", 350 | "2024-12-24 13:13:17,497 INFO: Closing external client and cleaning up certificates.\n", 351 | "2024-12-24 13:13:17,501 INFO: Initializing external client\n", 352 | "2024-12-24 13:13:17,502 INFO: Base URL: https://c.app.hopsworks.ai:443\n", 353 | "2024-12-24 13:13:18,402 INFO: Closing external client and cleaning up certificates.\n", 354 | "Connection closed.\n", 355 | "2024-12-24 13:13:18,408 INFO: Initializing external client\n", 356 | "2024-12-24 13:13:18,408 INFO: Base URL: https://c.app.hopsworks.ai:443\n", 357 | "2024-12-24 13:13:19,727 INFO: Python Engine initialized.\n", 358 | "\n", 359 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1192098\n", 360 | "Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets\n" 361 | ] 362 | }, 363 | { 364 | "name": "stderr", 365 | "output_type": "stream", 366 | "text": [ 367 | "Uploading: 100.000%|██████████| 2948/2948 elapsed<00:05 remaining<00:00\n" 368 | ] 369 | }, 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "Deployment created, explore it at https://c.app.hopsworks.ai:443/p/1192098/deployments/353320\n", 375 | "Before making predictions, start the deployment by using `.start()`\n" 376 | ] 377 | } 378 | ], 379 | "source": [ 380 | "query_model_deployment = (\n", 381 | " hopsworks_integration.two_tower_serving.HopsworksQueryModel.deploy(ranking_model_type=\"ranking\")\n", 382 | ")" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "At this point, you have registered your deployment. 
To start it up you need to run:" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 12, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "name": "stderr", 399 | "output_type": "stream", 400 | "text": [ 401 | "Deployment is ready: 100%|██████████| 6/6 [00:26<00:00, 4.45s/it] " 402 | ] 403 | }, 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "Start making predictions by using `.predict()`\n" 409 | ] 410 | }, 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "\n" 416 | ] 417 | } 418 | ], 419 | "source": [ 420 | "query_model_deployment.start()" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 13, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "# Check logs in case of failure\n", 430 | "# query_model_deployment.get_logs(component=\"transformer\", tail=20)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "## Testing the inference pipeline \n", 438 | "\n", 439 | "Define a test input example:" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 14, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "data = [\n", 449 | " {\n", 450 | " \"customer_id\": \"d327d0ad9e30085a436933dfbb7f77cf42e38447993a078ed35d93e3fd350ecf\",\n", 451 | " \"transaction_date\": \"2022-11-15T12:16:25.330916\",\n", 452 | " }\n", 453 | "]" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "Test out the deployment:" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 15, 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "data": { 470 | "text/plain": [ 471 | "['670079001', '299768002', '324946001']" 472 | ] 473 | }, 474 | "execution_count": 15, 475 | "metadata": {}, 476 | "output_type": "execute_result" 477 | } 478 | ], 479 | "source": [ 480 | "ranked_candidates = query_model_deployment.predict(inputs=data)\n", 481 | "\n", 482 | "# Retrieve article ids of the top recommended items\n", 483 | "recommendations = get_top_recommendations(ranked_candidates[\"predictions\"], k=3)\n", 484 | "recommendations" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": {}, 490 | "source": [ 491 | "Check logs in case of failure:" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 16, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "# query_model_deployment.get_logs(component=\"transformer\", tail=200)" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "# Stopping the Hopsworks deployments " 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "Stop the deployment when you're not using it." 
515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 17, 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "name": "stderr", 524 | "output_type": "stream", 525 | "text": [ 526 | "Deployment is stopped: 100%|██████████| 4/4 [00:10<00:00, 2.67s/it] \n", 527 | "Deployment is stopped: 100%|██████████| 4/4 [00:10<00:00, 2.68s/it] \n" 528 | ] 529 | } 530 | ], 531 | "source": [ 532 | "ranking_deployment.stop()\n", 533 | "query_model_deployment.stop()" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "## Inspecting the deployments in Hopsworks UI \n", 541 | "\n", 542 | "View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Deployments**" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": {}, 548 | "source": [ 549 | "---" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 18, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "name": "stderr", 559 | "output_type": "stream", 560 | "text": [ 561 | "\u001b[32m2024-12-24 13:14:20.862\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1m⌛️ Notebook Execution time: 133.44 seconds ~ 2.22 minutes\u001b[0m\n" 562 | ] 563 | } 564 | ], 565 | "source": [ 566 | "notebook_end_time = time.time()\n", 567 | "notebook_execution_time = notebook_end_time - notebook_start_time\n", 568 | "\n", 569 | "logger.info(\n", 570 | " f\"⌛️ Notebook Execution time: {notebook_execution_time:.2f} seconds ~ {notebook_execution_time / 60:.2f} minutes\"\n", 571 | ")" 572 | ] 573 | }, 574 | { 575 | "cell_type": "markdown", 576 | "metadata": {}, 577 | "source": [ 578 | "# → Next Steps \n", 579 | "\n", 580 | "The last step is to schedule the materialization jobs." 
581 | ] 582 | } 583 | ], 584 | "metadata": { 585 | "kernelspec": { 586 | "display_name": "Python 3 (ipykernel)", 587 | "language": "python", 588 | "name": "python3" 589 | }, 590 | "language_info": { 591 | "codemirror_mode": { 592 | "name": "ipython", 593 | "version": 3 594 | }, 595 | "file_extension": ".py", 596 | "mimetype": "text/x-python", 597 | "name": "python", 598 | "nbconvert_exporter": "python", 599 | "pygments_lexer": "ipython3", 600 | "version": "3.11.8" 601 | } 602 | }, 603 | "nbformat": 4, 604 | "nbformat_minor": 4 605 | } 606 | -------------------------------------------------------------------------------- /notebooks/6_scheduling_materialization_jobs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6d91c23d", 6 | "metadata": {}, 7 | "source": [ 8 | "# Set up environment" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "20f093e1", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "⛳️ Local environment\n", 22 | "Adding the following directory to the PYTHONPATH: /Users/pauliusztin/Documents/01_projects/hopsworks_recsys/hands-on-recommender-system\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "import sys\n", 28 | "from pathlib import Path\n", 29 | "\n", 30 | "\n", 31 | "def is_google_colab() -> bool:\n", 32 | " if \"google.colab\" in str(get_ipython()):\n", 33 | " return True\n", 34 | " return False\n", 35 | "\n", 36 | "\n", 37 | "def clone_repository() -> None:\n", 38 | " !git clone https://github.com/decodingml/hands-on-recommender-system.git\n", 39 | " %cd hands-on-recommender-system/\n", 40 | "\n", 41 | "\n", 42 | "def install_dependencies() -> None:\n", 43 | " !pip install --upgrade uv\n", 44 | " !uv pip install --all-extras --system --requirement pyproject.toml\n", 45 | "\n", 46 | "\n", 47 | "if is_google_colab():\n", 48 | " clone_repository()\n", 49 | " install_dependencies()\n", 50 | "\n", 51 | " root_dir = str(Path().absolute())\n", 52 | " print(\"⛳️ Google Colab environment\")\n", 53 | "else:\n", 54 | " root_dir = str(Path().absolute().parent)\n", 55 | " print(\"⛳️ Local environment\")\n", 56 | "\n", 57 | "# Add the root directory to the `PYTHONPATH` to use the `recsys` Python module from the notebook.\n", 58 | "if root_dir not in sys.path:\n", 59 | " print(f\"Adding the following directory to the PYTHONPATH: {root_dir}\")\n", 60 | " sys.path.append(root_dir)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "6a8f7546", 66 | "metadata": {}, 67 | "source": [ 68 | "# Scheduling Hopsworks materialization jobs \n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "b204608b", 74 | "metadata": {}, 75 | "source": [ 76 | "## 📝 Imports" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "id": "06390a5b", 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | "/Users/pauliusztin/Documents/01_projects/hopsworks_recsys/hands-on-recommender-system/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 90 | " from .autonotebook import tqdm as notebook_tqdm\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "from datetime import datetime, timezone\n", 96 | "\n", 97 | "from recsys import hopsworks_integration" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "7a931086", 103 | "metadata": {}, 104 | "source": [ 105 | "## 🔮 Connect to Hopsworks Feature Store " 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 3, 111 | "id": "ef7a5e64", 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "\u001b[32m2024-12-24 13:15:04.623\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mrecsys.hopsworks_integration.feature_store\u001b[0m:\u001b[36mget_feature_store\u001b[0m:\u001b[36m13\u001b[0m - \u001b[1mLoging to Hopsworks using HOPSWORKS_API_KEY env var.\u001b[0m\n" 119 | ] 120 | }, 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "2024-12-24 13:15:04,625 INFO: Initializing external client\n", 126 | "2024-12-24 13:15:04,625 INFO: Base URL: https://c.app.hopsworks.ai:443\n", 127 | "2024-12-24 13:15:06,101 INFO: Python Engine initialized.\n", 128 | "\n", 129 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1192098\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "project, fs = hopsworks_integration.get_feature_store()\n", 135 | "\n", 136 | "jobs_api = project.get_jobs_api()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "id": "cea06db9", 142 | "metadata": {}, 143 | "source": [ 144 | "# Retrieving materialization jobs\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 4, 150 | "id": "92f62c85", 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "Job('interactions_1_offline_fg_materialization', 'SPARK')" 157 | ] 158 | }, 159 | "execution_count": 4, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "interactions_job = jobs_api.get_job(\"interactions_1_offline_fg_materialization\")\n", 166 | "interactions_job" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 5, 172 | "id": "128827f1", 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "Job('transactions_1_offline_fg_materialization', 'SPARK')" 179 | ] 180 | }, 181 | "execution_count": 5, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "transactions_job = jobs_api.get_job(\"transactions_1_offline_fg_materialization\")\n", 188 | "transactions_job" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "id": "f78e6278", 194 | "metadata": {}, 195 | "source": [ 196 | "# Running materialization jobs\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 6, 202 | "id": "f79e13a5", 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "Launching job: interactions_1_offline_fg_materialization\n", 210 | "Job started successfully, you can follow the progress at \n", 211 | "https://c.app.hopsworks.ai:443/p/1192098/jobs/named/interactions_1_offline_fg_materialization/executions\n", 212 | "2024-12-24 13:15:16,740 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED\n", 213 | "2024-12-24 13:15:19,916 INFO: Waiting for execution to finish. 
Current state: RUNNING. Final status: UNDEFINED\n", 214 | "2024-12-24 13:16:49,132 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED\n", 215 | "2024-12-24 13:16:49,297 INFO: Waiting for log aggregation to finish.\n", 216 | "2024-12-24 13:17:08,007 INFO: Execution finished successfully.\n" 217 | ] 218 | }, 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "Execution('SUCCEEDED', 'FINISHED', '2024-12-24T11:15:08.000Z', '-op offline_fg_materialization -path hdfs:///Projects/decoding/Resources/jobs/interactions_1_offline_fg_materialization/config_1735032952539')" 223 | ] 224 | }, 225 | "execution_count": 6, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "interactions_job_execution = interactions_job.run()\n", 232 | "interactions_job_execution" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 7, 238 | "id": "8dc5cc7e", 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "Launching job: transactions_1_offline_fg_materialization\n", 246 | "Job started successfully, you can follow the progress at \n", 247 | "https://c.app.hopsworks.ai:443/p/1192098/jobs/named/transactions_1_offline_fg_materialization/executions\n", 248 | "2024-12-24 13:17:16,894 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED\n", 249 | "2024-12-24 13:17:20,074 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED\n", 250 | "2024-12-24 13:18:49,133 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED\n", 251 | "2024-12-24 13:18:49,292 INFO: Waiting for log aggregation to finish.\n", 252 | "2024-12-24 13:19:01,338 INFO: Execution finished successfully.\n" 253 | ] 254 | }, 255 | { 256 | "data": { 257 | "text/plain": [ 258 | "Execution('SUCCEEDED', 'FINISHED', '2024-12-24T11:17:08.000Z', '-op offline_fg_materialization -path hdfs:///Projects/decoding/Resources/jobs/transactions_1_offline_fg_materialization/config_1735032811896')" 259 | ] 260 | }, 261 | "execution_count": 7, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "transactions_job_execution = transactions_job.run()\n", 268 | "transactions_job_execution" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "id": "2f45a3f3", 274 | "metadata": {}, 275 | "source": [ 276 | "## ⏰ Scheduling materialization jobs \n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 8, 282 | "id": "b95eb11a", 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "datetime.datetime(2024, 12, 26, 0, 0, tzinfo=datetime.timezone.utc)" 289 | ] 290 | }, 291 | "execution_count": 8, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "interactions_job.schedule(\n", 298 | " cron_expression=\"0 0 0 * * ?\", # Runs at midnight (00:00:00) every day\n", 299 | " start_time=datetime.now(tz=timezone.utc),\n", 300 | ")\n", 301 | "interactions_job.job_schedule.next_execution_date_time" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 9, 307 | "id": "97d546e5", 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "datetime.datetime(2024, 12, 26, 0, 0, tzinfo=datetime.timezone.utc)" 314 | ] 315 | }, 316 | "execution_count": 9, 317 | "metadata": {}, 318 | 
"output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "transactions_job.schedule(\n", 323 | " cron_expression=\"0 0 0 * * ?\", # Runs at midnight (00:00:00) every day\n", 324 | " start_time=datetime.now(tz=timezone.utc),\n", 325 | ")\n", 326 | "transactions_job.job_schedule.next_execution_date_time" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "id": "db866c1f", 332 | "metadata": {}, 333 | "source": [ 334 | "## Inspecting the materialization jobs in Hopsworks UI \n", 335 | "\n", 336 | "View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Compute → Ingestions**" 337 | ] 338 | } 339 | ], 340 | "metadata": { 341 | "kernelspec": { 342 | "display_name": "Python 3", 343 | "language": "python", 344 | "name": "python3" 345 | }, 346 | "language_info": { 347 | "codemirror_mode": { 348 | "name": "ipython", 349 | "version": 3 350 | }, 351 | "file_extension": ".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython3", 356 | "version": "3.11.8" 357 | } 358 | }, 359 | "nbformat": 4, 360 | "nbformat_minor": 5 361 | } 362 | -------------------------------------------------------------------------------- /packages.txt: -------------------------------------------------------------------------------- 1 | build-essential 2 | clang -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "hands-on-recommender-system" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = "~=3.11" 7 | dependencies = [ 8 | "altair>=4.2.2", 9 | "catboost==1.2", 10 | "hopsworks[python]>=4.1.2", 11 | "huggingface-hub==0.24.7", 12 | "ipykernel>=6.29.5", 13 | "langchain-openai==0.1.14", 14 | "langchain==0.2.6", 15 | "loguru>=0.7.2", 16 | "nbformat>=5.10.4", 17 | "polars==1.9.0", 18 | "pydantic-settings>=2.6.1", 19 | "sentence-transformers==2.2.2", 20 | "streamlit==1.28.2", 21 | "tensorflow-recommenders==0.7.2", 22 | "tensorflow==2.14", 23 | ] 24 | 25 | [dependency-groups] 26 | dev = [ 27 | "ruff>=0.7.2", 28 | ] 29 | -------------------------------------------------------------------------------- /recsys/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import features, inference, hopsworks_integration, raw_data_sources, training 2 | 3 | __all__ = [ 4 | "features", 5 | "inference", 6 | "hopsworks_integration", 7 | "raw_data_sources", 8 | "training", 9 | ] 10 | -------------------------------------------------------------------------------- /recsys/config.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from pathlib import Path 3 | from typing import Literal 4 | 5 | from pydantic import SecretStr 6 | from pydantic_settings import BaseSettings, SettingsConfigDict 7 | 8 | 9 | class CustomerDatasetSize(Enum): 10 | LARGE = "LARGE" 11 | MEDIUM = "MEDIUM" 12 | SMALL = "SMALL" 13 | 14 | 15 | class Settings(BaseSettings): 16 | model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8") 17 | 18 | RECSYS_DIR: Path = Path(__file__).parent 19 | 20 | # Hopsworks 21 | HOPSWORKS_API_KEY: SecretStr | None = None 22 | 23 | # OpenAI 24 | OPENAI_MODEL_ID: str = "gpt-4o-mini" 25 | OPENAI_API_KEY: SecretStr | None = None 26 | 27 | # Feature engineering 28 | CUSTOMER_DATA_SIZE: CustomerDatasetSize = CustomerDatasetSize.SMALL 29 | FEATURES_EMBEDDING_MODEL_ID: str = "all-MiniLM-L6-v2" 30 | 31 | # Training 32 | TWO_TOWER_MODEL_EMBEDDING_SIZE: int = 16 33 | TWO_TOWER_MODEL_BATCH_SIZE: int = 2048 34 | TWO_TOWER_NUM_EPOCHS: int = 10 35 | TWO_TOWER_WEIGHT_DECAY: float = 0.001 36 | TWO_TOWER_LEARNING_RATE: float = 0.01 37 | TWO_TOWER_DATASET_VALIDATON_SPLIT_SIZE: float = 0.1 38 | TWO_TOWER_DATASET_TEST_SPLIT_SIZE: float = 0.1 39 | 40 | RANKING_DATASET_VALIDATON_SPLIT_SIZE: float = 0.1 41 | RANKING_LEARNING_RATE: float = 0.2 42 | RANKING_ITERATIONS: int = 100 43 | RANKING_SCALE_POS_WEIGHT: int = 10 44 | RANKING_EARLY_STOPPING_ROUNDS: int = 5 45 | 46 | # Inference 47 | RANKING_MODEL_TYPE: Literal["ranking", "llmranking"] = "ranking" 48 | CUSTOM_HOPSWORKS_INFERENCE_ENV: str = "custom_env_name" 49 | 50 | 51 | settings = Settings() 52 | -------------------------------------------------------------------------------- /recsys/features/__init__.py: -------------------------------------------------------------------------------- 1 | from . import articles, customers, embeddings, interaction, ranking, transactions 2 | 3 | __all__ = [ 4 | "articles", 5 | "customers", 6 | "embeddings", 7 | "interaction", 8 | "ranking", 9 | "transactions", 10 | ] 11 | -------------------------------------------------------------------------------- /recsys/features/articles.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import io 3 | import sys 4 | 5 | import polars as pl 6 | from tqdm.auto import tqdm 7 | from sentence_transformers import SentenceTransformer 8 | 9 | 10 | def get_article_id(df: pl.DataFrame) -> pl.Series: 11 | """ 12 | Extracts and returns the article_id column as a string. 13 | Parameters: 14 | - df (pl.DataFrame): Input DataFrame containing the 'article_id' column. 15 | Returns: 16 | - pl.Series: Series containing the 'article_id' column as strings. 17 | """ 18 | return df["article_id"].cast(pl.Utf8) 19 | 20 | 21 | def create_prod_name_length(df: pl.DataFrame) -> pl.Series: 22 | """ 23 | Creates a new column 'prod_name_length' representing the length of 'prod_name'. 24 | Parameters: 25 | - df (pl.DataFrame): Input DataFrame containing the 'prod_name' column. 26 | Returns: 27 | - pl.Series: Series containing the length of 'prod_name' for each row. 
28 | """ 29 | return df["prod_name"].str.len_chars() 30 | 31 | 32 | def create_article_description(row): 33 | description = f"{row['prod_name']} - {row['product_type_name']} in {row['product_group_name']}" 34 | description += f"\nAppearance: {row['graphical_appearance_name']}" 35 | description += f"\nColor: {row['perceived_colour_value_name']} {row['perceived_colour_master_name']} ({row['colour_group_name']})" 36 | description += f"\nCategory: {row['index_group_name']} - {row['section_name']} - {row['garment_group_name']}" 37 | 38 | if row["detail_desc"]: 39 | description += f"\nDetails: {row['detail_desc']}" 40 | 41 | return description 42 | 43 | 44 | def compute_features_articles(df: pl.DataFrame) -> pl.DataFrame: 45 | """ 46 | Prepares the input DataFrame by creating new features and dropping specific columns. 47 | Parameters: 48 | - df (pl.DataFrame): Input DataFrame. 49 | Returns: 50 | - pl.DataFrame: Processed DataFrame with new features and specific columns dropped. 51 | """ 52 | # Create new columns 53 | df = df.with_columns( 54 | [ 55 | get_article_id(df).alias("article_id"), 56 | create_prod_name_length(df).alias("prod_name_length"), 57 | pl.struct(df.columns) 58 | .map_elements(create_article_description) 59 | .alias("article_description"), 60 | ] 61 | ) 62 | 63 | # Add full image URLs. 64 | df = df.with_columns(image_url=pl.col("article_id").map_elements(get_image_url)) 65 | 66 | # Drop columns with null values 67 | df = df.select([col for col in df.columns if not df[col].is_null().any()]) 68 | 69 | # Remove 'detail_desc' column 70 | columns_to_drop = ["detail_desc", "detail_desc_length"] 71 | existing_columns = df.columns 72 | columns_to_keep = [col for col in existing_columns if col not in columns_to_drop] 73 | 74 | return df.select(columns_to_keep) 75 | 76 | 77 | def generate_embeddings_for_dataframe( 78 | df: pl.DataFrame, text_column: str, model: SentenceTransformer, batch_size: int = 32 79 | ) -> pl.DataFrame: 80 | """ 81 | Generate embeddings for a text column in a Polars DataFrame. 
82 | 83 | Args: 84 | df (pl.DataFrame): Input Polars DataFrame 85 | text_column (str): Name of the column containing text to embed 86 | model (SentenceTransformer): SentenceTransformer embedding model to use 87 | batch_size (int): Number of samples run at once through the embedding model 88 | 89 | Returns: 90 | pl.DataFrame: DataFrame with a new 'embedding' column 91 | """ 92 | 93 | @contextlib.contextmanager 94 | def suppress_stdout(): 95 | new_stdout = io.StringIO() 96 | old_stdout = sys.stdout 97 | sys.stdout = new_stdout 98 | try: 99 | yield new_stdout 100 | finally: 101 | sys.stdout = old_stdout 102 | 103 | total_rows = len(df) 104 | pbar = tqdm(total=total_rows, desc="Generating embeddings") 105 | 106 | # Create a new column with embeddings 107 | texts = df[text_column].to_list() 108 | 109 | all_embeddings = [] 110 | for i in range(0, len(texts), batch_size): 111 | batch_texts = texts[i : i + batch_size] 112 | with suppress_stdout(): 113 | batch_embeddings = model.encode( 114 | batch_texts, device=model.device, show_progress_bar=False 115 | ) 116 | all_embeddings.extend(batch_embeddings.tolist()) 117 | pbar.update(len(batch_texts)) 118 | 119 | df_with_embeddings = df.with_columns(embeddings=pl.Series(all_embeddings)) 120 | 121 | pbar.close() 122 | 123 | return df_with_embeddings 124 | 125 | 126 | def get_image_url(article_id): 127 | url_start = "https://repo.hops.works/dev/jdowling/h-and-m/images/0" 128 | 129 | # Convert article_id to string 130 | article_id_str = str(article_id) 131 | 132 | folder = article_id_str[:2] 133 | 134 | image_name = article_id_str 135 | 136 | return f"{url_start}{folder}/0{image_name}.jpg" 137 | -------------------------------------------------------------------------------- /recsys/features/customers.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import polars as pl 4 | from loguru import logger 5 | 6 | from recsys.config import CustomerDatasetSize 7 | 8 | 9 | class DatasetSampler: 10 | _SIZES = { 11 | CustomerDatasetSize.LARGE: 50_000, 12 | CustomerDatasetSize.MEDIUM: 5_000, 13 | CustomerDatasetSize.SMALL: 1_000, 14 | } 15 | 16 | def __init__(self, size: CustomerDatasetSize) -> None: 17 | self._size = size 18 | 19 | @classmethod 20 | def get_supported_sizes(cls) -> dict: 21 | return cls._SIZES 22 | 23 | def sample( 24 | self, customers_df: pl.DataFrame, transations_df: pl.DataFrame 25 | ) -> dict[str, pl.DataFrame]: 26 | random.seed(27) 27 | 28 | n_customers = self._SIZES[self._size] 29 | logger.info(f"Sampling {n_customers} customers.") 30 | customers_df = customers_df.sample(n=n_customers) 31 | 32 | logger.info( 33 | f"Number of transactions for all the customers: {transations_df.height}" 34 | ) 35 | transations_df = transations_df.join( 36 | customers_df.select("customer_id"), on="customer_id" 37 | ) 38 | logger.info( 39 | f"Number of transactions for the {n_customers} sampled customers: {transations_df.height}" 40 | ) 41 | 42 | return {"customers": customers_df, "transactions": transations_df} 43 | 44 | 45 | def fill_missing_club_member_status(df: pl.DataFrame) -> pl.DataFrame: 46 | """ 47 | Fill missing values in the 'club_member_status' column with 'ABSENT'. 48 | 49 | Parameters: 50 | - df (pl.DataFrame): Input DataFrame containing the 'club_member_status' column. 51 | 52 | Returns: 53 | - pl.DataFrame: DataFrame with filled 'club_member_status' column. 
54 | """ 55 | return df.with_columns(pl.col("club_member_status").fill_null("ABSENT")) 56 | 57 | 58 | def drop_na_age(df: pl.DataFrame) -> pl.DataFrame: 59 | """ 60 | Drop rows with null values in the 'age' column. 61 | 62 | Parameters: 63 | - df (pl.DataFrame): Input DataFrame containing the 'age' column. 64 | 65 | Returns: 66 | - pl.DataFrame: DataFrame with rows containing null 'age' values removed. 67 | """ 68 | return df.drop_nulls(subset=["age"]) 69 | 70 | 71 | def create_age_group() -> pl.Expr: 72 | """ 73 | Create an expression to categorize age into groups. 74 | 75 | Returns: 76 | - pl.Expr: Polars expression that categorizes 'age' into predefined age groups. 77 | """ 78 | return ( 79 | pl.when(pl.col("age").is_between(0, 18)) 80 | .then(pl.lit("0-18")) 81 | .when(pl.col("age").is_between(19, 25)) 82 | .then(pl.lit("19-25")) 83 | .when(pl.col("age").is_between(26, 35)) 84 | .then(pl.lit("26-35")) 85 | .when(pl.col("age").is_between(36, 45)) 86 | .then(pl.lit("36-45")) 87 | .when(pl.col("age").is_between(46, 55)) 88 | .then(pl.lit("46-55")) 89 | .when(pl.col("age").is_between(56, 65)) 90 | .then(pl.lit("56-65")) 91 | .otherwise(pl.lit("66+")) 92 | ).alias("age_group") 93 | 94 | 95 | def compute_features_customers( 96 | df: pl.DataFrame, drop_null_age: bool = False 97 | ) -> pl.DataFrame: 98 | """ 99 | Prepare customer data by performing several data cleaning and transformation steps. 100 | 101 | This function does the following: 102 | 1. Checks for required columns in the input DataFrame. 103 | 2. Fills missing club member status with 'ABSENT'. 104 | 3. Drops rows with missing age values. 105 | 4. Creates an age group category. 106 | 5. Casts the 'age' column to Float64. 107 | 6. Selects and orders specific columns in the output. 108 | 109 | Parameters: 110 | - df (pl.DataFrame): Input DataFrame containing customer data. 111 | 112 | Returns: 113 | - pl.DataFrame: Processed DataFrame with cleaned and transformed customer data. 114 | 115 | Raises: 116 | - ValueError: If any of the required columns are missing from the input DataFrame. 
117 | """ 118 | required_columns = ["customer_id", "club_member_status", "age", "postal_code"] 119 | missing_columns = [col for col in required_columns if col not in df.columns] 120 | if missing_columns: 121 | raise ValueError( 122 | f"Columns {', '.join(missing_columns)} not found in the DataFrame" 123 | ) 124 | 125 | df = ( 126 | df.pipe(fill_missing_club_member_status) 127 | .pipe(drop_na_age) 128 | .with_columns([create_age_group(), pl.col("age").cast(pl.Float64)]) 129 | .select( 130 | ["customer_id", "club_member_status", "age", "postal_code", "age_group"] 131 | ) 132 | ) 133 | 134 | if drop_null_age is True: 135 | df = df.drop_nulls(subset=["age"]) 136 | 137 | return df 138 | -------------------------------------------------------------------------------- /recsys/features/embeddings.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import tensorflow as tf 3 | 4 | 5 | def preprocess(train_df: pd.DataFrame, candidate_features: list) -> pd.DataFrame: 6 | # Select the candidate features from the training DataFrame 7 | item_df = train_df[candidate_features] 8 | 9 | # Drop duplicate rows based on the 'article_id' column to get unique candidate items 10 | item_df.drop_duplicates(subset="article_id", inplace=True) 11 | 12 | return item_df 13 | 14 | 15 | def embed(df: pd.DataFrame, candidate_model) -> pd.DataFrame: 16 | ds = tf.data.Dataset.from_tensor_slices({col: df[col] for col in df}) 17 | 18 | candidate_embeddings = ds.batch(2048).map( 19 | lambda x: (x["article_id"], candidate_model(x)) 20 | ) 21 | 22 | all_article_ids = tf.concat([batch[0] for batch in candidate_embeddings], axis=0) 23 | all_embeddings = tf.concat([batch[1] for batch in candidate_embeddings], axis=0) 24 | 25 | all_article_ids = all_article_ids.numpy().astype(int).tolist() 26 | all_embeddings = all_embeddings.numpy().tolist() 27 | 28 | embeddings_df = pd.DataFrame( 29 | { 30 | "article_id": all_article_ids, 31 | "embeddings": all_embeddings, 32 | } 33 | ) 34 | 35 | return embeddings_df 36 | -------------------------------------------------------------------------------- /recsys/features/interaction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import polars as pl 3 | from tqdm import tqdm 4 | 5 | 6 | def generate_interaction_data(trans_df): 7 | # Pre-compute unique values once 8 | unique_customers = trans_df["customer_id"].unique() 9 | all_articles = trans_df["article_id"].unique() 10 | all_articles_set = set(all_articles) 11 | 12 | interactions = [] 13 | 14 | def generate_timestamps(base_timestamp, count, min_hours, max_hours): 15 | hours = np.random.randint(min_hours, max_hours, size=count) 16 | return base_timestamp - (hours * 3600000) 17 | 18 | # Ratios to ensure more realistic interactions 19 | CLICK_BEFORE_PURCHASE_PROB = 0.9 20 | MIN_IGNORES = 40 21 | MAX_IGNORES = 60 22 | MIN_EXTRA_CLICKS = 5 23 | MAX_EXTRA_CLICKS = 8 24 | EXTRA_CLICKS_PROB = 0.95 25 | 26 | chunk_size = 1000 27 | for chunk_start in tqdm( 28 | range(0, len(unique_customers), chunk_size), desc="Processing customer chunks" 29 | ): 30 | chunk_end = min(chunk_start + chunk_size, len(unique_customers)) 31 | chunk_customers = unique_customers[chunk_start:chunk_end] 32 | 33 | chunk_transactions = trans_df.filter( 34 | pl.col("customer_id").is_in(chunk_customers) 35 | ) 36 | 37 | for customer_id in chunk_customers: 38 | customer_purchases = chunk_transactions.filter( 39 | pl.col("customer_id") == customer_id 40 | ) 41 | 42 | if 
len(customer_purchases) == 0: 43 | continue 44 | 45 | customer_articles = {"purchased": set(), "clicked": set(), "ignored": set()} 46 | last_purchase_timestamp = customer_purchases["t_dat"].max() 47 | 48 | # Generate more ignores first 49 | num_ignores = np.random.randint(MIN_IGNORES, MAX_IGNORES) 50 | available_articles = list(all_articles_set) 51 | 52 | if available_articles and num_ignores > 0: 53 | ignore_timestamps = generate_timestamps( 54 | last_purchase_timestamp, num_ignores, 1, 96 55 | ) 56 | selected_ignores = np.random.choice( 57 | available_articles, 58 | size=min(num_ignores, len(available_articles)), 59 | replace=False, 60 | ) 61 | 62 | # Generate multiple sets of ignores to increase the count 63 | for ts, art_id in zip(ignore_timestamps, selected_ignores): 64 | # Add 1-2 ignore events for the same article 65 | num_ignore_events = np.random.randint(1, 3) 66 | for _ in range(num_ignore_events): 67 | ignore_ts = ( 68 | ts - np.random.randint(1, 12) * 3600000 69 | ) # Add some random hours difference 70 | interactions.append( 71 | { 72 | "t_dat": ignore_ts, 73 | "customer_id": customer_id, 74 | "article_id": art_id, 75 | "interaction_score": 0, 76 | "prev_article_id": None, 77 | } 78 | ) 79 | customer_articles["ignored"].add(art_id) 80 | 81 | # Process purchases and their clicks 82 | purchase_rows = customer_purchases.iter_rows(named=True) 83 | for row in purchase_rows: 84 | purchase_timestamp = row["t_dat"] 85 | article_id = row["article_id"] 86 | 87 | # Add clicks before purchase 88 | if np.random.random() < CLICK_BEFORE_PURCHASE_PROB: 89 | num_pre_clicks = np.random.randint(1, 3) 90 | for _ in range(num_pre_clicks): 91 | click_timestamp = generate_timestamps( 92 | purchase_timestamp, 1, 1, 48 93 | )[0] 94 | interactions.append( 95 | { 96 | "t_dat": click_timestamp, 97 | "customer_id": customer_id, 98 | "article_id": article_id, 99 | "interaction_score": 1, 100 | "prev_article_id": None, 101 | } 102 | ) 103 | customer_articles["clicked"].add(article_id) 104 | 105 | # Add purchase 106 | interactions.append( 107 | { 108 | "t_dat": purchase_timestamp, 109 | "customer_id": customer_id, 110 | "article_id": article_id, 111 | "interaction_score": 2, 112 | "prev_article_id": None, 113 | } 114 | ) 115 | customer_articles["purchased"].add(article_id) 116 | 117 | # Generate extra clicks 118 | if np.random.random() < EXTRA_CLICKS_PROB: 119 | num_extra_clicks = np.random.randint( 120 | MIN_EXTRA_CLICKS, MAX_EXTRA_CLICKS + 1 121 | ) 122 | available_for_clicks = list( 123 | all_articles_set 124 | - customer_articles["purchased"] 125 | - customer_articles["clicked"] 126 | - customer_articles["ignored"] 127 | ) 128 | 129 | if available_for_clicks and num_extra_clicks > 0: 130 | click_timestamps = generate_timestamps( 131 | last_purchase_timestamp, num_extra_clicks, 1, 72 132 | ) 133 | selected_clicks = np.random.choice( 134 | available_for_clicks, 135 | size=min(num_extra_clicks, len(available_for_clicks)), 136 | replace=False, 137 | ) 138 | 139 | for ts, art_id in zip(click_timestamps, selected_clicks): 140 | interactions.append( 141 | { 142 | "t_dat": ts, 143 | "customer_id": customer_id, 144 | "article_id": art_id, 145 | "interaction_score": 1, 146 | "prev_article_id": None, 147 | } 148 | ) 149 | 150 | if not interactions: 151 | return pl.DataFrame( 152 | schema={ 153 | "t_dat": pl.Int64, 154 | "customer_id": pl.Utf8, 155 | "article_id": pl.Utf8, 156 | "interaction_score": pl.Int64, 157 | "prev_article_id": pl.Utf8, 158 | } 159 | ) 160 | 161 | interaction_df = pl.DataFrame(interactions) 
162 | sorted_df = interaction_df.sort(["customer_id", "t_dat"]) 163 | 164 | final_df = sorted_df.with_columns( 165 | [ 166 | pl.col("article_id") 167 | .alias("prev_article_id") 168 | .shift(1) 169 | .over("customer_id") 170 | .fill_null("START") 171 | ] 172 | ) 173 | 174 | return final_df 175 | -------------------------------------------------------------------------------- /recsys/features/ranking.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | def compute_ranking_dataset(trans_fg, articles_fg, customers_fg) -> pl.DataFrame: 4 | # Read data from the feature groups 5 | trans_df = trans_fg.select( 6 | ["article_id", "customer_id"] 7 | ).read(dataframe_type="polars") 8 | articles_df = articles_fg.select_except( 9 | ["article_description", "embeddings", "image_url"] 10 | ).read(dataframe_type="polars") 11 | customers_df = customers_fg.select(["customer_id", "age"]).read(dataframe_type="polars") 12 | 13 | # Convert article_id to string in both dataframes before joining 14 | trans_df = trans_df.with_columns(pl.col("article_id").cast(pl.Utf8)) 15 | articles_df = articles_df.with_columns(pl.col("article_id").cast(pl.Utf8)) 16 | 17 | # Merge operations 18 | df = trans_df.join(articles_df, on="article_id", how="left") 19 | df = df.join(customers_df, on="customer_id", how="left") 20 | 21 | # Select query features 22 | query_features = ["customer_id", "age", "article_id"] 23 | df = df.select(query_features) 24 | 25 | # Create positive pairs 26 | positive_pairs = df.clone() 27 | 28 | # Calculate number of negative pairs 29 | n_neg = len(positive_pairs) * 10 30 | 31 | # Create negative pairs DataFrame 32 | article_ids = (df.select("article_id") 33 | .unique() 34 | .sample(n=n_neg, with_replacement=True, seed=2) 35 | .get_column("article_id")) 36 | 37 | customer_ids = (df.select("customer_id") 38 | .sample(n=n_neg, with_replacement=True, seed=3) 39 | .get_column("customer_id")) 40 | 41 | other_features = (df.select(["age"]) 42 | .sample(n=n_neg, with_replacement=True, seed=4)) 43 | 44 | # Construct negative pairs 45 | negative_pairs = pl.DataFrame({ 46 | "article_id": article_ids, 47 | "customer_id": customer_ids, 48 | "age": other_features.get_column("age"), 49 | }) 50 | 51 | # Add labels 52 | positive_pairs = positive_pairs.with_columns(pl.lit(1).alias("label")) 53 | negative_pairs = negative_pairs.with_columns(pl.lit(0).alias("label")) 54 | 55 | # Concatenate positive and negative pairs 56 | ranking_df = pl.concat([ 57 | positive_pairs, 58 | negative_pairs.select(positive_pairs.columns) 59 | ]) 60 | 61 | # Process item features 62 | item_df = articles_fg.read(dataframe_type="polars") 63 | 64 | # Convert article_id to string in item_df before final join 65 | item_df = item_df.with_columns(pl.col("article_id").cast(pl.Utf8)) 66 | 67 | # Keep unique article_ids and select columns 68 | item_df = ( 69 | item_df.unique(subset=["article_id"]) 70 | .select([ 71 | "article_id", 72 | "product_type_name", 73 | "product_group_name", 74 | "graphical_appearance_name", 75 | "colour_group_name", 76 | "perceived_colour_value_name", 77 | "perceived_colour_master_name", 78 | "department_name", 79 | "index_name", 80 | "index_group_name", 81 | "section_name", 82 | "garment_group_name", 83 | ]) 84 | ) 85 | 86 | # Final merge with item features 87 | ranking_df = ranking_df.join(item_df, on="article_id", how="left") 88 | 89 | return ranking_df -------------------------------------------------------------------------------- 
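A minimal, illustrative usage sketch for `compute_ranking_dataset` above (not part of the repository): it assumes the `transactions`, `articles`, and `customers` feature groups created by `recsys/hopsworks_integration/feature_store.py` already exist in your Hopsworks project.

```python
# Hedged usage sketch — feature group names and versions mirror those used elsewhere in this repo.
from recsys import features, hopsworks_integration

project, fs = hopsworks_integration.get_feature_store()

# Feature groups written by the feature pipeline (see feature_store.py).
trans_fg = fs.get_feature_group(name="transactions", version=1)
articles_fg = fs.get_feature_group(name="articles", version=1)
customers_fg = fs.get_feature_group(name="customers", version=1)

# Positive pairs come from real purchases; roughly 10x negative pairs are sampled at
# random, then article metadata is joined in as ranking features.
ranking_df = features.ranking.compute_ranking_dataset(trans_fg, articles_fg, customers_fg)
print(ranking_df.get_column("label").value_counts())
```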
/recsys/features/transactions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import polars as pl 4 | from hopsworks import udf 5 | 6 | 7 | def convert_article_id_to_str(df: pl.DataFrame) -> pl.Series: 8 | """ 9 | Convert the 'article_id' column to string type. 10 | 11 | Parameters: 12 | - df (pl.DataFrame): Input DataFrame containing the 'article_id' column. 13 | 14 | Returns: 15 | - pl.Series: The 'article_id' column converted to string type. 16 | """ 17 | return df["article_id"].cast(pl.Utf8) 18 | 19 | 20 | def convert_t_dat_to_datetime(df: pl.DataFrame) -> pl.Series: 21 | """ 22 | Convert the 't_dat' column to datetime type. 23 | 24 | Parameters: 25 | - df (pl.DataFrame): Input DataFrame containing the 't_dat' column. 26 | 27 | Returns: 28 | - pl.Series: The 't_dat' column converted to datetime type. 29 | """ 30 | return pl.from_pandas(pd.to_datetime(df["t_dat"].to_pandas())) 31 | 32 | 33 | def get_year_feature(df: pl.DataFrame) -> pl.Series: 34 | """ 35 | Extract the year from the 't_dat' column. 36 | 37 | Parameters: 38 | - df (pl.DataFrame): Input DataFrame containing the 't_dat' column. 39 | 40 | Returns: 41 | - pl.Series: A series containing the year extracted from 't_dat'. 42 | """ 43 | return df["t_dat"].dt.year() 44 | 45 | 46 | def get_month_feature(df: pl.DataFrame) -> pl.Series: 47 | """ 48 | Extract the month from the 't_dat' column. 49 | 50 | Parameters: 51 | - df (pl.DataFrame): Input DataFrame containing the 't_dat' column. 52 | 53 | Returns: 54 | - pl.Series: A series containing the month extracted from 't_dat'. 55 | """ 56 | return df["t_dat"].dt.month() 57 | 58 | 59 | def get_day_feature(df: pl.DataFrame) -> pl.Series: 60 | """ 61 | Extract the day from the 't_dat' column. 62 | 63 | Parameters: 64 | - df (pl.DataFrame): Input DataFrame containing the 't_dat' column. 65 | 66 | Returns: 67 | - pl.Series: A series containing the day extracted from 't_dat'. 68 | """ 69 | return df["t_dat"].dt.day() 70 | 71 | 72 | def get_day_of_week_feature(df: pl.DataFrame) -> pl.Series: 73 | """ 74 | Extract the day of the week from the 't_dat' column. 75 | 76 | Parameters: 77 | - df (pl.DataFrame): Input DataFrame containing the 't_dat' column. 78 | 79 | Returns: 80 | - pl.Series: A series containing the day of the week extracted from 't_dat'. 81 | """ 82 | return df["t_dat"].dt.weekday() 83 | 84 | 85 | def calculate_month_sin_cos(month: pl.Series) -> pl.DataFrame: 86 | """ 87 | Calculate sine and cosine values for the month to capture cyclical patterns. 88 | 89 | Parameters: 90 | - month (pl.Series): A series containing month values. 91 | 92 | Returns: 93 | - pl.DataFrame: A DataFrame with 'month_sin' and 'month_cos' columns. 94 | """ 95 | C = 2 * np.pi / 12 96 | return pl.DataFrame( 97 | { 98 | "month_sin": month.apply(lambda x: np.sin(x * C)), 99 | "month_cos": month.apply(lambda x: np.cos(x * C)), 100 | } 101 | ) 102 | 103 | 104 | def convert_t_dat_to_epoch_milliseconds(df: pl.DataFrame) -> pl.Series: 105 | """ 106 | Convert the 't_dat' column to epoch milliseconds. 107 | 108 | Parameters: 109 | - df (pl.DataFrame): Input DataFrame containing the 't_dat' column. 110 | 111 | Returns: 112 | - pl.Series: A series with 't_dat' converted to epoch milliseconds. 
113 | """ 114 | return df["t_dat"].cast(pl.Int64) // 1_000_000 115 | 116 | @udf(return_type = float, mode="pandas") 117 | def month_sin(month :pd.Series): 118 | """ 119 | On-demand transformation function that computes the sine of the month for cyclical feature encoding. 120 | 121 | Parameters: 122 | - month (pd.Series): A pandas series that contains the months 123 | 124 | Returns: 125 | - pd.Series: The sine of months 126 | """ 127 | return np.sin(month * (2 * np.pi / 12)) 128 | 129 | @udf(return_type = float, mode="pandas") 130 | def month_cos(month :pd.Series): 131 | """ 132 | On-demand transformation function that computes the cosine of the month for cyclical feature encoding. 133 | 134 | Parameters: 135 | - month (pd.Series): A pandas series that contains the months 136 | 137 | Returns: 138 | - pd.Series: The cosine of months 139 | """ 140 | return np.cos(month * (2 * np.pi / 12)) 141 | 142 | 143 | def compute_features_transactions(df: pl.DataFrame) -> pl.DataFrame: 144 | """ 145 | Prepare transaction data by performing several data transformations. 146 | 147 | This function does the following: 148 | 1. Converts 'article_id' to string type. 149 | 2. Extracts year, month, day, and day of week from 't_dat'. 150 | 3. Converts 't_dat' to epoch milliseconds. 151 | Note that the cyclical 'month_sin' and 'month_cos' features are not computed here; 152 | they are provided by the on-demand transformation functions defined above. 153 | 154 | Parameters: 155 | - df (pl.DataFrame): Input DataFrame containing transaction data. 156 | 157 | Returns: 158 | - pl.DataFrame: Processed DataFrame with transformed transaction data. 159 | """ 160 | 161 | return ( 162 | df.with_columns( 163 | [ 164 | pl.col("article_id").cast(pl.Utf8).alias("article_id"), 165 | ] 166 | ) 167 | .with_columns( 168 | [ 169 | pl.col("t_dat").dt.year().alias("year"), 170 | pl.col("t_dat").dt.month().alias("month"), 171 | pl.col("t_dat").dt.day().alias("day"), 172 | pl.col("t_dat").dt.weekday().alias("day_of_week"), 173 | ] 174 | ) 175 | .with_columns([(pl.col("t_dat").cast(pl.Int64) // 1_000_000).alias("t_dat")]) 176 | ) 177 | -------------------------------------------------------------------------------- /recsys/hopsworks_integration/__init__.py: -------------------------------------------------------------------------------- 1 | from .
import feature_store, ranking_serving, two_tower_serving, llm_ranking_serving 2 | from .feature_store import get_feature_store 3 | 4 | __all__ = ["feature_store", "get_feature_store", "ranking_serving", "two_tower_serving", "llm_ranking_serving"] 5 | -------------------------------------------------------------------------------- /recsys/hopsworks_integration/constants.py: -------------------------------------------------------------------------------- 1 | from hsfs.feature import Feature 2 | 3 | ### Post ingestion format.### 4 | 5 | customer_feature_descriptions = [ 6 | {"name": "customer_id", "description": "Unique identifier for each customer."}, 7 | { 8 | "name": "club_member_status", 9 | "description": "Membership status of the customer in the club.", 10 | }, 11 | {"name": "age", "description": "Age of the customer."}, 12 | { 13 | "name": "postal_code", 14 | "description": "Postal code associated with the customer's address.", 15 | }, 16 | {"name": "age_group", "description": "Categorized age group of the customer."}, 17 | ] 18 | 19 | transactions_feature_descriptions = [ 20 | {"name": "t_dat", "description": "Timestamp of the data record."}, 21 | {"name": "customer_id", "description": "Unique identifier for each customer."}, 22 | {"name": "article_id", "description": "Identifier for the purchased article."}, 23 | {"name": "price", "description": "Price of the purchased article."}, 24 | {"name": "sales_channel_id", "description": "Identifier for the sales channel."}, 25 | {"name": "year", "description": "Year of the transaction."}, 26 | {"name": "month", "description": "Month of the transaction."}, 27 | {"name": "day", "description": "Day of the transaction."}, 28 | {"name": "day_of_week", "description": "Day of the week of the transaction."}, 29 | { 30 | "name": "month_sin", 31 | "description": "Sine of the month used for seasonal patterns.", 32 | }, 33 | { 34 | "name": "month_cos", 35 | "description": "Cosine of the month used for seasonal patterns.", 36 | }, 37 | ] 38 | 39 | interactions_feature_descriptions = [ 40 | {"name": "t_dat", "description": "Timestamp of the interaction."}, 41 | {"name": "customer_id", "description": "Unique identifier for each customer."}, 42 | { 43 | "name": "article_id", 44 | "description": "Identifier for the article that was interacted with.", 45 | }, 46 | { 47 | "name": "interaction_score", 48 | "description": "Type of interaction: 0 = ignore, 1 = click, 2 = purchase.", 49 | }, 50 | { 51 | "name": "prev_article_id", 52 | "description": "Previous article that the customer interacted with, useful for sequential recommendation patterns.", 53 | }, 54 | ] 55 | 56 | ranking_feature_descriptions = [ 57 | {"name": "customer_id", "description": "Unique identifier for each customer."}, 58 | {"name": "article_id", "description": "Identifier for the purchased article."}, 59 | {"name": "age", "description": "Age of the customer."}, 60 | {"name": "product_type_name", "description": "Name of the product type."}, 61 | {"name": "product_group_name", "description": "Name of the product group."}, 62 | { 63 | "name": "graphical_appearance_name", 64 | "description": "Name of the graphical appearance.", 65 | }, 66 | {"name": "colour_group_name", "description": "Name of the colour group."}, 67 | { 68 | "name": "perceived_colour_value_name", 69 | "description": "Name of the perceived colour value.", 70 | }, 71 | { 72 | "name": "perceived_colour_master_name", 73 | "description": "Name of the perceived colour master.", 74 | }, 75 | {"name": "department_name", "description": 
"Name of the department."}, 76 | {"name": "index_name", "description": "Name of the index."}, 77 | {"name": "index_group_name", "description": "Name of the index group."}, 78 | {"name": "section_name", "description": "Name of the section."}, 79 | {"name": "garment_group_name", "description": "Name of the garment group."}, 80 | { 81 | "name": "label", 82 | "description": "Label indicating whether the article was purchased (1) or not (0).", 83 | }, 84 | ] 85 | 86 | ### Pre ingestion format. ### 87 | 88 | article_feature_description = [ 89 | Feature( 90 | name="article_id", type="string", description="Identifier for the article." 91 | ), 92 | Feature( 93 | name="product_code", 94 | type="bigint", 95 | description="Code associated with the product.", 96 | ), 97 | Feature(name="prod_name", type="string", description="Name of the product."), 98 | Feature( 99 | name="product_type_no", 100 | type="bigint", 101 | description="Number associated with the product type.", 102 | ), 103 | Feature( 104 | name="product_type_name", type="string", description="Name of the product type." 105 | ), 106 | Feature( 107 | name="product_group_name", 108 | type="string", 109 | description="Name of the product group.", 110 | ), 111 | Feature( 112 | name="graphical_appearance_no", 113 | type="bigint", 114 | description="Number associated with graphical appearance.", 115 | ), 116 | Feature( 117 | name="graphical_appearance_name", 118 | type="string", 119 | description="Name of the graphical appearance.", 120 | ), 121 | Feature( 122 | name="colour_group_code", 123 | type="bigint", 124 | description="Code associated with the colour group.", 125 | ), 126 | Feature( 127 | name="colour_group_name", type="string", description="Name of the colour group." 128 | ), 129 | Feature( 130 | name="perceived_colour_value_id", 131 | type="bigint", 132 | description="ID associated with perceived colour value.", 133 | ), 134 | Feature( 135 | name="perceived_colour_value_name", 136 | type="string", 137 | description="Name of the perceived colour value.", 138 | ), 139 | Feature( 140 | name="perceived_colour_master_id", 141 | type="bigint", 142 | description="ID associated with perceived colour master.", 143 | ), 144 | Feature( 145 | name="perceived_colour_master_name", 146 | type="string", 147 | description="Name of the perceived colour master.", 148 | ), 149 | Feature( 150 | name="department_no", 151 | type="bigint", 152 | description="Number associated with the department.", 153 | ), 154 | Feature( 155 | name="department_name", type="string", description="Name of the department." 156 | ), 157 | Feature( 158 | name="index_code", type="string", description="Code associated with the index." 159 | ), 160 | Feature(name="index_name", type="string", description="Name of the index."), 161 | Feature( 162 | name="index_group_no", 163 | type="bigint", 164 | description="Number associated with the index group.", 165 | ), 166 | Feature( 167 | name="index_group_name", type="string", description="Name of the index group." 
168 | ), 169 | Feature( 170 | name="section_no", 171 | type="bigint", 172 | description="Number associated with the section.", 173 | ), 174 | Feature(name="section_name", type="string", description="Name of the section."), 175 | Feature( 176 | name="garment_group_no", 177 | type="bigint", 178 | description="Number associated with the garment group.", 179 | ), 180 | Feature( 181 | name="garment_group_name", 182 | type="string", 183 | description="Name of the garment group.", 184 | ), 185 | Feature( 186 | name="prod_name_length", 187 | type="bigint", 188 | description="Length of the product name.", 189 | ), 190 | Feature( 191 | name="article_description", 192 | type="string", 193 | online_type="VARCHAR(5800)", 194 | description="Description of the article.", 195 | ), 196 | Feature( 197 | name="embeddings", 198 | type="array", 199 | description="Vector embeddings of the article description.", 200 | ), 201 | Feature(name="image_url", type="string", description="URL of the product image."), 202 | ] 203 | -------------------------------------------------------------------------------- /recsys/hopsworks_integration/feature_store.py: -------------------------------------------------------------------------------- 1 | import hopsworks 2 | import pandas as pd 3 | from hsfs import embedding 4 | from loguru import logger 5 | 6 | from recsys.config import settings 7 | from recsys.hopsworks_integration import constants 8 | from recsys.features.transactions import month_cos, month_sin 9 | 10 | 11 | def get_feature_store(): 12 | if settings.HOPSWORKS_API_KEY: 13 | logger.info("Logging in to Hopsworks using the HOPSWORKS_API_KEY env var.") 14 | project = hopsworks.login( 15 | api_key_value=settings.HOPSWORKS_API_KEY.get_secret_value() 16 | ) 17 | else: 18 | logger.info("Logging in to Hopsworks using the cached API key.") 19 | project = hopsworks.login() 20 | return project, project.get_feature_store() 21 | 22 | 23 | ######################## 24 | #### Feature Groups #### 25 | ######################## 26 | 27 | 28 | def create_customers_feature_group(fs, df: pd.DataFrame, online_enabled: bool = True): 29 | customers_fg = fs.get_or_create_feature_group( 30 | name="customers", 31 | description="Customers data including age and postal code", 32 | version=1, 33 | primary_key=["customer_id"], 34 | online_enabled=online_enabled, 35 | ) 36 | customers_fg.insert(df, wait=True) 37 | 38 | for desc in constants.customer_feature_descriptions: 39 | customers_fg.update_feature_description(desc["name"], desc["description"]) 40 | 41 | return customers_fg 42 | 43 | 44 | def create_articles_feature_group( 45 | fs, 46 | df: pd.DataFrame, 47 | articles_description_embedding_dim: int, 48 | online_enabled: bool = True, 49 | ): 50 | # Create the Embedding Index for the articles description embedding.
51 | emb = embedding.EmbeddingIndex() 52 | emb.add_embedding("embeddings", articles_description_embedding_dim) 53 | 54 | articles_fg = fs.get_or_create_feature_group( 55 | name="articles", 56 | version=1, 57 | description="Fashion items data including type of item, visual description and category", 58 | primary_key=["article_id"], 59 | online_enabled=online_enabled, 60 | features=constants.article_feature_description, 61 | embedding_index=emb, 62 | ) 63 | articles_fg.insert(df, wait=True) 64 | 65 | return articles_fg 66 | 67 | 68 | def create_transactions_feature_group( 69 | fs, df: pd.DataFrame, online_enabled: bool = True 70 | ): 71 | trans_fg = fs.get_or_create_feature_group( 72 | name="transactions", 73 | version=1, 74 | description="Transactions data including customer, item, price, sales channel and transaction date", 75 | primary_key=["customer_id", "article_id"], 76 | online_enabled=online_enabled, 77 | transformation_functions=[month_sin, month_cos], 78 | event_time="t_dat", 79 | ) 80 | trans_fg.insert(df, wait=True) 81 | 82 | for desc in constants.transactions_feature_descriptions: 83 | trans_fg.update_feature_description(desc["name"], desc["description"]) 84 | 85 | return trans_fg 86 | 87 | 88 | def create_interactions_feature_group( 89 | fs, df: pd.DataFrame, online_enabled: bool = True 90 | ): 91 | interactions_fg = fs.get_or_create_feature_group( 92 | name="interactions", 93 | version=1, 94 | description="Customer interactions with articles including purchases, clicks, and ignores. Used for building recommendation systems and analyzing user behavior.", 95 | primary_key=["customer_id", "article_id"], 96 | online_enabled=online_enabled, 97 | event_time="t_dat", 98 | ) 99 | 100 | interactions_fg.insert( 101 | df, 102 | wait=True, 103 | ) 104 | 105 | for desc in constants.interactions_feature_descriptions: 106 | interactions_fg.update_feature_description(desc["name"], desc["description"]) 107 | 108 | return interactions_fg 109 | 110 | 111 | def create_ranking_feature_group( 112 | fs, df: pd.DataFrame, parents: list, online_enabled: bool = True 113 | ): 114 | rank_fg = fs.get_or_create_feature_group( 115 | name="ranking", 116 | version=1, 117 | description="Derived feature group for ranking", 118 | primary_key=["customer_id", "article_id"], 119 | parents=parents, 120 | online_enabled=online_enabled, 121 | ) 122 | rank_fg.insert(df, wait=True) 123 | 124 | for desc in constants.ranking_feature_descriptions: 125 | rank_fg.update_feature_description(desc["name"], desc["description"]) 126 | 127 | return rank_fg 128 | 129 | 130 | def create_candidate_embeddings_feature_group( 131 | fs, df: pd.DataFrame, online_enabled: bool = True 132 | ): 133 | embedding_index = embedding.EmbeddingIndex() 134 | 135 | embedding_index.add_embedding( 136 | "embeddings", # Embeddings feature name 137 | settings.TWO_TOWER_MODEL_EMBEDDING_SIZE, 138 | ) 139 | 140 | candidate_embeddings_fg = fs.get_or_create_feature_group( 141 | name="candidate_embeddings", 142 | embedding_index=embedding_index, # Specify the Embedding Index 143 | primary_key=["article_id"], 144 | version=1, 145 | description="Embeddings for each article.", 146 | online_enabled=online_enabled, 147 | ) 148 | candidate_embeddings_fg.insert(df, wait=True) 149 | 150 | return candidate_embeddings_fg 151 | 152 | 153 | ######################### 154 | ##### Feature Views ##### 155 | ######################### 156 | 157 | 158 | def create_retrieval_feature_view(fs): 159 | trans_fg = fs.get_feature_group(name="transactions", version=1) 160 | customers_fg 
= fs.get_feature_group(name="customers", version=1) 161 | articles_fg = fs.get_feature_group(name="articles", version=1) 162 | 163 | # You'll need to join these three data sources to make the data compatible 164 | # with our retrieval model. Recall that each row in the `transactions` feature group 165 | # records which customer bought which item. 166 | # You'll join this feature group with the `customers` and `articles` feature groups 167 | # to inject customer and item features into each row. 168 | selected_features = ( 169 | trans_fg.select( 170 | ["customer_id", "article_id", "t_dat", "price", "month_sin", "month_cos"] 171 | ) 172 | .join( 173 | customers_fg.select(["age", "club_member_status", "age_group"]), 174 | on="customer_id", 175 | ) 176 | .join( 177 | articles_fg.select(["garment_group_name", "index_group_name"]), 178 | on="article_id", 179 | ) 180 | ) 181 | 182 | feature_view = fs.get_or_create_feature_view( 183 | name="retrieval", 184 | query=selected_features, 185 | version=1, 186 | ) 187 | 188 | return feature_view 189 | 190 | 191 | def create_ranking_feature_views(fs): 192 | customers_fg = fs.get_feature_group( 193 | name="customers", 194 | version=1, 195 | ) 196 | 197 | articles_fg = fs.get_feature_group( 198 | name="articles", 199 | version=1, 200 | ) 201 | 202 | rank_fg = fs.get_feature_group( 203 | name="ranking", 204 | version=1, 205 | ) 206 | 207 | trans_fg = fs.get_feature_group( 208 | name="transactions", 209 | version=1) 210 | 211 | selected_features_customers = customers_fg.select_all() 212 | fs.get_or_create_feature_view( 213 | name="customers", 214 | query=selected_features_customers, 215 | version=1, 216 | ) 217 | 218 | selected_features_articles = articles_fg.select_except(["embeddings"]) 219 | fs.get_or_create_feature_view( 220 | name="articles", 221 | query=selected_features_articles, 222 | version=1, 223 | ) 224 | 225 | # Select features 226 | selected_features_ranking = rank_fg.select_except(["customer_id", "article_id"]).join(trans_fg.select(["month_sin", "month_cos"])) 227 | feature_view_ranking = fs.get_or_create_feature_view( 228 | name="ranking", 229 | query=selected_features_ranking, 230 | labels=["label"], 231 | version=1, 232 | ) 233 | 234 | return feature_view_ranking 235 | 236 | 237 | def create_candidate_embeddings_feature_view(fs, fg): 238 | feature_view = fs.get_or_create_feature_view( 239 | name="candidate_embeddings", 240 | version=1, 241 | description="Embeddings of each article", 242 | query=fg.select(["article_id"]), 243 | ) 244 | 245 | return feature_view 246 | -------------------------------------------------------------------------------- /recsys/hopsworks_integration/llm_ranker/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.2.6 2 | langchain-openai==0.1.14 3 | -------------------------------------------------------------------------------- /recsys/hopsworks_integration/llm_ranking_serving.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import hopsworks 4 | from hsml.transformer import Transformer 5 | 6 | from recsys.config import settings 7 | 8 | 9 | class HopsworksLLMRankingModel: 10 | deployment_name = "llmranking" 11 | 12 | @classmethod 13 | def register(cls, mr): 14 | local_model_path = str( 15 | settings.RECSYS_DIR / "inference" / "llm_ranking_predictor.py" 16 | ) 17 | ranking_model = mr.python.create_model( 18 | name="llm_ranking_model", 19 | description="LLM Ranking model that scores 
item candidates", 20 | ) 21 | ranking_model.save(local_model_path) 22 | 23 | @classmethod 24 | def deploy(cls): 25 | # Prepare secrets used in the deployment 26 | cls._prepare_secrets() 27 | 28 | project = hopsworks.login() 29 | cls._prepare_environment(project) 30 | mr = project.get_model_registry() 31 | dataset_api = project.get_dataset_api() 32 | 33 | ranking_model = mr.get_model(name="llm_ranking_model") 34 | # Copy transformer file into Hopsworks File System 35 | 36 | uploaded_file_path = dataset_api.upload( 37 | str( 38 | settings.RECSYS_DIR / "inference" / "ranking_transformer.py" 39 | ), # File name to be uploaded 40 | "Resources", # Destination directory in Hopsworks File System 41 | overwrite=True, # Overwrite the file if it already exists 42 | ) 43 | # Construct the path to the uploaded transformer script 44 | transformer_script_path = os.path.join( 45 | "/Projects", # Root directory for projects in Hopsworks 46 | project.name, # Name of the current project 47 | uploaded_file_path, # Path to the uploaded file within the project 48 | ) 49 | 50 | # Upload llm predictor file to Hopsworks 51 | uploaded_file_path = dataset_api.upload( 52 | str(settings.RECSYS_DIR / "inference" / "llm_ranking_predictor.py"), 53 | "Resources", 54 | overwrite=True, 55 | ) 56 | 57 | # Construct the path to the uploaded script 58 | predictor_script_path = os.path.join( 59 | "/Projects", 60 | project.name, 61 | uploaded_file_path, 62 | ) 63 | 64 | ranking_transformer = Transformer( 65 | script_file=transformer_script_path, 66 | resources={"num_instances": 0}, 67 | ) 68 | 69 | # Deploy ranking model 70 | ranking_deployment = ranking_model.deploy( 71 | name=cls.deployment_name, 72 | description="Deployment that search for item candidates and scores them based on customer metadata using " 73 | "GPT 4", 74 | script_file=predictor_script_path, 75 | resources={"num_instances": 0}, 76 | transformer=ranking_transformer, 77 | environment=settings.CUSTOM_HOPSWORKS_INFERENCE_ENV, 78 | ) 79 | 80 | return ranking_deployment 81 | 82 | @classmethod 83 | def _prepare_environment(cls, project): 84 | # Upload requirements file to Hopsworks 85 | dataset_api = project.get_dataset_api() 86 | 87 | requirements_path = dataset_api.upload( 88 | str( 89 | settings.RECSYS_DIR 90 | / "hopsworks_integration" 91 | / "llm_ranker" 92 | / "requirements.txt" 93 | ), 94 | "Resources", 95 | overwrite=True, 96 | ) 97 | 98 | # Check if custom env exists, if not create it 99 | env_api = project.get_environment_api() 100 | envs = env_api.get_environments() 101 | existing_envs = [env.name for env in envs] 102 | if settings.CUSTOM_HOPSWORKS_INFERENCE_ENV in existing_envs: 103 | env = env_api.get_environment(settings.CUSTOM_HOPSWORKS_INFERENCE_ENV) 104 | else: 105 | env = env_api.create_environment( 106 | name=settings.CUSTOM_HOPSWORKS_INFERENCE_ENV, 107 | base_environment_name="pandas-inference-pipeline", 108 | ) 109 | 110 | # Install the extra requirements in the Python environment on Hopsworks 111 | env.install_requirements(requirements_path) 112 | 113 | @classmethod 114 | def _prepare_secrets(cls): 115 | if not settings.OPENAI_API_KEY: 116 | raise ValueError( 117 | "Missing required secret: 'OPENAI_API_KEY'. Please ensure it is set in the .env file or config.py " 118 | "settings." 
119 | ) 120 | 121 | project = hopsworks.login( 122 | hostname_verification=False, 123 | api_key_value=settings.HOPSWORKS_API_KEY.get_secret_value(), 124 | ) 125 | secrets_api = hopsworks.get_secrets_api() 126 | secrets = secrets_api.get_secrets() 127 | existing_secret_keys = [secret.name for secret in secrets] 128 | if "OPENAI_API_KEY" in existing_secret_keys: 129 | secrets_api._delete(name="OPENAI_API_KEY") 130 | 131 | secrets_api.create_secret( 132 | "OPENAI_API_KEY", 133 | settings.OPENAI_API_KEY.get_secret_value(), 134 | project=project.name, 135 | ) 136 | -------------------------------------------------------------------------------- /recsys/hopsworks_integration/ranking_serving.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import joblib 4 | from hsml.transformer import Transformer 5 | 6 | from recsys.config import settings 7 | 8 | 9 | class HopsworksRankingModel: 10 | deployment_name = "ranking" 11 | 12 | def __init__(self, model): 13 | self._model = model 14 | 15 | def save_to_local(self, output_path: str = "ranking_model.pkl"): 16 | joblib.dump(self._model, output_path) 17 | 18 | return output_path 19 | 20 | def register(self, mr, feature_view, X_train, metrics): 21 | local_model_path = self.save_to_local() 22 | 23 | input_example = X_train.sample().to_dict("records") 24 | 25 | ranking_model = mr.python.create_model( 26 | name="ranking_model", 27 | description="Ranking model that scores item candidates", 28 | metrics=metrics, 29 | input_example=input_example, 30 | feature_view=feature_view, 31 | ) 32 | ranking_model.save(local_model_path) 33 | 34 | @classmethod 35 | def deploy(cls, project): 36 | mr = project.get_model_registry() 37 | dataset_api = project.get_dataset_api() 38 | 39 | ranking_model = mr.get_best_model( 40 | name="ranking_model", 41 | metric="fscore", 42 | direction="max", 43 | ) 44 | 45 | # Copy transformer file into Hopsworks File System 46 | uploaded_file_path = dataset_api.upload( 47 | str( 48 | settings.RECSYS_DIR / "inference" / "ranking_transformer.py" 49 | ), # File name to be uploaded 50 | "Resources", # Destination directory in Hopsworks File System 51 | overwrite=True, # Overwrite the file if it already exists 52 | ) 53 | # Construct the path to the uploaded transformer script 54 | transformer_script_path = os.path.join( 55 | "/Projects", # Root directory for projects in Hopsworks 56 | project.name, # Name of the current project 57 | uploaded_file_path, # Path to the uploaded file within the project 58 | ) 59 | 60 | # Upload predictor file to Hopsworks 61 | uploaded_file_path = dataset_api.upload( 62 | str(settings.RECSYS_DIR / "inference" / "ranking_predictor.py"), 63 | "Resources", 64 | overwrite=True, 65 | ) 66 | 67 | # Construct the path to the uploaded script 68 | predictor_script_path = os.path.join( 69 | "/Projects", 70 | project.name, 71 | uploaded_file_path, 72 | ) 73 | 74 | ranking_transformer = Transformer( 75 | script_file=transformer_script_path, 76 | resources={"num_instances": 0}, 77 | ) 78 | 79 | # Deploy ranking model 80 | ranking_deployment = ranking_model.deploy( 81 | name=cls.deployment_name, 82 | description="Deployment that search for item candidates and scores them based on customer metadata", 83 | script_file=predictor_script_path, 84 | resources={"num_instances": 0}, 85 | transformer=ranking_transformer, 86 | ) 87 | 88 | return ranking_deployment 89 | -------------------------------------------------------------------------------- 
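For context, a hedged sketch of how `HopsworksRankingModel` is typically driven (not part of the repository): `register` is called from the training pipeline, after which `deploy` creates the online deployment. The snippet assumes a `ranking_model` with an `fscore` metric has already been registered.

```python
# Illustrative sketch — assumes the training pipeline already registered "ranking_model".
from recsys.hopsworks_integration import get_feature_store
from recsys.hopsworks_integration.ranking_serving import HopsworksRankingModel

project, _ = get_feature_store()

# Uploads the transformer/predictor scripts and creates the "ranking" deployment
# in Hopsworks Model Serving.
ranking_deployment = HopsworksRankingModel.deploy(project)

# Start serving (the deployment can also be started from the Hopsworks UI).
ranking_deployment.start()
```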
/recsys/hopsworks_integration/two_tower_serving.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Literal 3 | 4 | import hopsworks 5 | import tensorflow as tf 6 | from hsml.transformer import Transformer 7 | from loguru import logger 8 | 9 | from recsys.config import settings 10 | from recsys.training.two_tower import ItemTower, QueryTower 11 | 12 | 13 | class HopsworksQueryModel: 14 | deployment_name = "query" 15 | 16 | def __init__(self, model: QueryTower) -> None: 17 | self.model = model 18 | 19 | def save_to_local(self, output_path: str = "query_model") -> str: 20 | # Define the input specifications for the instances 21 | instances_spec = { 22 | "customer_id": tf.TensorSpec( 23 | shape=(None,), dtype=tf.string, name="customer_id" 24 | ), # Specification for customer IDs 25 | "month_sin": tf.TensorSpec( 26 | shape=(None,), dtype=tf.float64, name="month_sin" 27 | ), # Specification for sine of month 28 | "month_cos": tf.TensorSpec( 29 | shape=(None,), dtype=tf.float64, name="month_cos" 30 | ), # Specification for cosine of month 31 | "age": tf.TensorSpec( 32 | shape=(None,), dtype=tf.float64, name="age" 33 | ), # Specification for age 34 | } 35 | 36 | query_module_module = QueryModelModule(model=self.model) 37 | # Get the concrete function for the query_model's compute_emb function using the specified input signatures 38 | inference_signatures = ( 39 | query_module_module.compute_embedding.get_concrete_function(instances_spec) 40 | ) 41 | 42 | # Save the query_model along with the concrete function signatures 43 | tf.saved_model.save( 44 | self.model, # The model to save 45 | output_path, # Path to save the model 46 | signatures=inference_signatures, # Concrete function signatures to include 47 | ) 48 | 49 | return output_path 50 | 51 | def register(self, mr, feature_view, query_df) -> None: 52 | local_model_path = self.save_to_local() 53 | 54 | # Sample a query example from the query DataFrame 55 | query_example = query_df.sample().to_dict("records") 56 | 57 | # Create a tensorflow model for the query_model in the Model Registry 58 | mr_query_model = mr.tensorflow.create_model( 59 | name="query_model", # Name of the model 60 | description="Model that generates query embeddings from user and transaction features", # Description of the model 61 | input_example=query_example, # Example input for the model 62 | feature_view=feature_view, 63 | ) 64 | 65 | # Save the query_model to the Model Registry 66 | mr_query_model.save(local_model_path) # Path to save the model 67 | 68 | @classmethod 69 | def deploy(cls, ranking_model_type: Literal["ranking", "llmranking"] = "ranking"): 70 | # Prepare secrets used in the deployment 71 | project = hopsworks.login() 72 | cls._prepare_secrets(ranking_model_type) 73 | 74 | mr = project.get_model_registry() 75 | dataset_api = project.get_dataset_api() 76 | 77 | # Retrieve the 'query_model' from the Model Registry 78 | query_model = mr.get_model( 79 | name="query_model", 80 | version=1, 81 | ) 82 | 83 | # Copy transformer file into Hopsworks File System 84 | uploaded_file_path = dataset_api.upload( 85 | str(settings.RECSYS_DIR / "inference" / "query_transformer.py"), 86 | "Models", 87 | overwrite=True, 88 | ) 89 | 90 | # Construct the path to the uploaded script 91 | transformer_script_path = os.path.join( 92 | "/Projects", 93 | project.name, 94 | uploaded_file_path, 95 | ) 96 | 97 | query_model_transformer = Transformer( 98 | script_file=transformer_script_path, 99 | 
resources={"num_instances": 0}, 100 | ) 101 | 102 | # Deploy the query model 103 | query_model_deployment = query_model.deploy( 104 | name=cls.deployment_name, 105 | description="Deployment that generates query embeddings from customer and item features using the query model", 106 | resources={"num_instances": 0}, 107 | transformer=query_model_transformer, 108 | ) 109 | 110 | return query_model_deployment 111 | 112 | @classmethod 113 | def _prepare_secrets(cls, ranking_model_type: Literal["ranking", "llmranking"]): 114 | project = hopsworks.login( 115 | hostname_verification=False, 116 | api_key_value=settings.HOPSWORKS_API_KEY.get_secret_value(), 117 | ) 118 | secrets_api = hopsworks.get_secrets_api() 119 | secrets = secrets_api.get_secrets() 120 | existing_secret_keys = [secret.name for secret in secrets] 121 | if "RANKING_MODEL_TYPE" in existing_secret_keys: 122 | secrets_api._delete(name="RANKING_MODEL_TYPE") 123 | 124 | secrets_api.create_secret( 125 | "RANKING_MODEL_TYPE", 126 | ranking_model_type, 127 | project=project.name, 128 | ) 129 | 130 | 131 | class QueryModelModule(tf.Module): 132 | def __init__(self, model: QueryTower) -> None: 133 | self.model = model 134 | 135 | @tf.function() 136 | def compute_embedding(self, instances): 137 | query_embedding = self.model(instances) 138 | 139 | return { 140 | "customer_id": instances["customer_id"], 141 | "month_sin": instances["month_sin"], 142 | "month_cos": instances["month_cos"], 143 | "query_emb": query_embedding, 144 | } 145 | 146 | 147 | class HopsworksCandidateModel: 148 | def __init__(self, model: ItemTower): 149 | self.model = model 150 | 151 | def save_to_local(self, output_path: str = "candidate_model") -> str: 152 | tf.saved_model.save( 153 | self.model, # The model to save 154 | output_path, # Path to save the model 155 | ) 156 | 157 | return output_path 158 | 159 | def register(self, mr, feature_view, item_df): 160 | local_model_path = self.save_to_local() 161 | 162 | # Sample a candidate example from the item DataFrame 163 | candidate_example = item_df.sample().to_dict("records") 164 | 165 | # Create a tensorflow model for the candidate_model in the Model Registry 166 | mr_candidate_model = mr.tensorflow.create_model( 167 | name="candidate_model", # Name of the model 168 | description="Model that generates candidate embeddings from item features", # Description of the model 169 | input_example=candidate_example, # Example input for the model 170 | feature_view=feature_view, 171 | ) 172 | 173 | # Save the candidate_model to the Model Registry 174 | mr_candidate_model.save(local_model_path) # Path to save the model 175 | 176 | @classmethod 177 | def download(cls, mr) -> tuple[ItemTower, dict]: 178 | models = mr.get_models(name="candidate_model") 179 | if len(models) == 0: 180 | raise RuntimeError( 181 | "No 'candidate_model' found in Hopsworks model registry." 182 | ) 183 | latest_model = max(models, key=lambda m: m.version) 184 | 185 | logger.info(f"Downloading 'candidate_model' version {latest_model.version}") 186 | model_path = latest_model.download() 187 | 188 | candidate_model = tf.saved_model.load(model_path) 189 | 190 | candidate_features = [ 191 | *candidate_model.signatures["serving_default"] 192 | .structured_input_signature[-1] 193 | .keys() 194 | ] 195 | return candidate_model, candidate_features 196 | -------------------------------------------------------------------------------- /recsys/inference/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import ( 2 | query_transformer, 3 | ranking_transformer, 4 | ranking_predictor, 5 | llm_ranking_predictor 6 | ) 7 | 8 | __all__ = [ 9 | "query_transformer", 10 | "ranking_transformer", 11 | "ranking_predictor", 12 | "llm_ranking_predictor" 13 | ] 14 | -------------------------------------------------------------------------------- /recsys/inference/llm_ranking_predictor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import hopsworks 4 | from langchain import PromptTemplate, LLMChain 5 | from langchain_core.output_parsers import BaseOutputParser 6 | from langchain_openai import ChatOpenAI 7 | 8 | 9 | class ScoreOutputParser(BaseOutputParser[float]): 10 | def parse(self, output) -> float: 11 | text = output['text'] 12 | # Extract the numeric part after "Probability:" 13 | if "Probability:" not in text: 14 | raise ValueError("Text does not contain 'Probability:' label.") 15 | probability_str = text.split("Probability:")[1].strip() 16 | probability = float(probability_str) 17 | 18 | # Ensure the probability is in the valid range [0, 1] 19 | if not (0.0 <= probability <= 1.0): 20 | raise ValueError("Probability value must be between 0 and 1.") 21 | 22 | return probability 23 | 24 | PROMPT_TEMPLATE = """ 25 | You are a helpful assistant specialized in predicting customer behavior. Your task is to analyze the features of a product and predict the probability of it being purchased by a customer. 26 | 27 | ### Instructions: 28 | 1. Use the provided features of the product to make your prediction. 29 | 2. Consider the following numeric and categorical features: 30 | - Numeric features: These are quantitative attributes, such as numerical identifiers or measurements. 31 | - Categorical features: These describe qualitative aspects, like product category, color, and material. 32 | 3. Your response should only include the probability of purchase for the positive class (e.g., likelihood of being purchased), as a value between 0 and 1. 33 | 34 | ### Product and User Features: 35 | Numeric features: 36 | - Age: {age} 37 | - Month Sin: {month_sin} 38 | - Month Cos: {month_cos} 39 | 40 | Categorical features: 41 | - Product Type: {product_type_name} 42 | - Product Group: {product_group_name} 43 | - Graphical Appearance: {graphical_appearance_name} 44 | - Colour Group: {colour_group_name} 45 | - Perceived Colour Value: {perceived_colour_value_name} 46 | - Perceived Colour Master Value: {perceived_colour_master_name} 47 | - Department Name: {department_name} 48 | - Index Name: {index_name} 49 | - Department: {index_group_name} 50 | - Sub-Department: {section_name} 51 | - Group: {garment_group_name} 52 | 53 | ### Your Task: 54 | Based on the features provided, predict the probability that the customer will purchase this product to 4-decimals precision. 
Provide the output in the following format: 55 | Probability: 56 | """ 57 | 58 | 59 | class Predict(object): 60 | def __init__(self): 61 | self.input_variables = ["age", "month_sin", "month_cos", "product_type_name", "product_group_name", 62 | "graphical_appearance_name", "colour_group_name", "perceived_colour_value_name", 63 | "perceived_colour_master_name", "department_name", "index_name", "index_group_name", 64 | "section_name", "garment_group_name"] 65 | self._retrieve_secrets() 66 | self.llm = self._build_lang_chain() 67 | self.parser = ScoreOutputParser() 68 | 69 | def _retrieve_secrets(self): 70 | project = hopsworks.login() 71 | secrets_api = hopsworks.get_secrets_api() 72 | self.openai_api_key = secrets_api.get_secret("OPENAI_API_KEY").value 73 | 74 | def predict(self, inputs): 75 | logging.info(f"✅ Inputs: {inputs}") 76 | 77 | # Extract ranking features and article IDs from the inputs limit to 20 candidates because otherwise the 78 | # inference time is over 60 seconds and the predict endpoint closes the socket 79 | features = inputs[0].pop("ranking_features")[:20] 80 | article_ids = inputs[0].pop("article_ids")[:20] 81 | 82 | # Preprocess features for OpenAI model input 83 | preprocessed_features_candidates = self._preprocess_features(features) 84 | logging.info(f"predict -> Preprocessed features: {preprocessed_features_candidates}") 85 | logging.info(f"Article IDs: {article_ids}") 86 | 87 | logging.info(f"🦅 Predicting with OpenAI model for {len(features)} instances") 88 | 89 | scores = [] 90 | for candidate in preprocessed_features_candidates: 91 | try: 92 | text = self.llm.invoke(candidate) 93 | score = self.parser.parse(text) 94 | except Exception as exception: 95 | logging.error(exception) 96 | # Add minimum default score in case of error 97 | score = 0 98 | scores.append(score) 99 | 100 | logging.info(f"LLM Scores: {scores}") 101 | 102 | return { 103 | "scores": scores, 104 | "article_ids": article_ids, 105 | } 106 | 107 | def _preprocess_features(self, features): 108 | """ 109 | Convert ranking features into a natural language description 110 | suitable for OpenAI model input. 
111 | """ 112 | preprocessed = [] 113 | for feature_set in features: 114 | # Example: Create a descriptive string for each feature set 115 | query_parameters = {} 116 | for key, value in zip(self.input_variables, feature_set): 117 | query_parameters[key] = value 118 | preprocessed.append(query_parameters) 119 | return preprocessed 120 | 121 | def _build_lang_chain(self): 122 | model = ChatOpenAI( 123 | model_name='gpt-4o-mini-2024-07-18', 124 | temperature=0.7, 125 | openai_api_key=self.openai_api_key, 126 | ) 127 | prompt = PromptTemplate( 128 | input_variables=self.input_variables, 129 | template=PROMPT_TEMPLATE, 130 | ) 131 | langchain = LLMChain( 132 | llm=model, 133 | prompt=prompt, 134 | verbose=True 135 | ) 136 | return langchain 137 | -------------------------------------------------------------------------------- /recsys/inference/query_transformer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | 4 | import hopsworks 5 | import nest_asyncio 6 | 7 | nest_asyncio.apply() 8 | import pandas as pd 9 | 10 | class Transformer(object): 11 | def __init__(self) -> None: 12 | # Connect to the Hopsworks 13 | project = hopsworks.login() 14 | ms = project.get_model_serving() 15 | 16 | self._retrieve_secrets() 17 | 18 | # Retrieve the 'customers' feature view 19 | fs = project.get_feature_store() 20 | self.customer_fv = fs.get_feature_view( 21 | name="customers", 22 | version=1, 23 | ) 24 | 25 | # Retrieve the "ranking" feature view and initialize the batch scoring server. 26 | self.ranking_fv = fs.get_feature_view(name="ranking", version=1) 27 | self.ranking_fv.init_batch_scoring(1) 28 | 29 | # Retrieve the ranking deployment 30 | self.ranking_server = ms.get_deployment(self.ranking_model_type) 31 | 32 | def _retrieve_secrets(self): 33 | project = hopsworks.login() 34 | secrets_api = hopsworks.get_secrets_api() 35 | try: 36 | self.ranking_model_type = secrets_api.get_secret("RANKING_MODEL_TYPE").value 37 | except Exception as e: 38 | logging.error(e) 39 | logging.error("Could not retrieve secret RANKING_MODEL_TYPE, defaulting to ranker") 40 | self.ranking_model_type = "ranking" 41 | 42 | def preprocess(self, inputs): 43 | # Check if the input data contains a key named "instances" 44 | # and extract the actual data if present 45 | inputs = inputs["instances"] if "instances" in inputs else inputs 46 | inputs = inputs[0] 47 | 48 | # Extract customer_id and transaction_date from the inputs 49 | customer_id = inputs["customer_id"] 50 | transaction_date = inputs["transaction_date"] 51 | 52 | # Extract month from the transaction_date 53 | month_of_purchase = datetime.fromisoformat(inputs.pop("transaction_date")) 54 | 55 | # Get customer features 56 | customer_features = self.customer_fv.get_feature_vector( 57 | {"customer_id": customer_id}, 58 | return_type="pandas", 59 | ) 60 | 61 | # Enrich inputs with customer age 62 | inputs["age"] = customer_features.age.values[0] 63 | 64 | # Calculate the sine and cosine of the month_of_purchase 65 | month_of_purchase = datetime.strptime( 66 | transaction_date, "%Y-%m-%dT%H:%M:%S.%f" 67 | ).month 68 | 69 | # Calculate the sine and cosine components for the month_of_purchase using on-demand transformation present in "ranking" feature view. 
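# Note (assumption): the on-demand transformation is expected to apply the same
# cyclic month encoding used when transactions are written by the UI (see
# feature_group_updater.py), i.e. month_sin = sin(2 * pi * month / 12) and
# month_cos = cos(2 * pi * month / 12), keeping serving-time features consistent
# with the features the models were trained on.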
70 | feature_vector = self.ranking_fv._batch_scoring_server.compute_on_demand_features( 71 | feature_vectors=pd.DataFrame([inputs]), request_parameters={"month": month_of_purchase} 72 | ).to_dict(orient="records")[0] 73 | 74 | inputs["month_sin"] = feature_vector["month_sin"] 75 | inputs["month_cos"] = feature_vector["month_cos"] 76 | 77 | return {"instances": [inputs]} 78 | 79 | def postprocess(self, outputs): 80 | # Return ordered ranking predictions 81 | return self.ranking_server.predict(inputs=outputs) 82 | -------------------------------------------------------------------------------- /recsys/inference/ranking_predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import joblib 3 | import numpy as np 4 | 5 | import logging 6 | 7 | class Predict(object): 8 | 9 | def __init__(self): 10 | self.model = joblib.load(os.environ["MODEL_FILES_PATH"] + "/ranking_model.pkl") 11 | 12 | def predict(self, inputs): 13 | 14 | logging.info(f"✅ Inputs: {inputs}") 15 | 16 | # Extract ranking features and article IDs from the inputs 17 | features = inputs[0].pop("ranking_features") 18 | article_ids = inputs[0].pop("article_ids") 19 | 20 | # Log the extracted features 21 | logging.info("predict -> " + str(features)) 22 | 23 | # Log the extracted article ids 24 | logging.info(f'Article IDs: {article_ids}') 25 | 26 | logging.info(f"🦅 Predicting...") 27 | 28 | # Predict probabilities for the positive class 29 | scores = self.model.predict_proba(features).tolist() 30 | 31 | # Get scores of positive class 32 | scores = np.asarray(scores)[:,1].tolist() 33 | 34 | # Return the predicted scores along with the corresponding article IDs 35 | return { 36 | "scores": scores, 37 | "article_ids": article_ids, 38 | } 39 | -------------------------------------------------------------------------------- /recsys/inference/ranking_transformer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import hopsworks 4 | import pandas as pd 5 | 6 | import nest_asyncio 7 | nest_asyncio.apply() 8 | 9 | class Transformer(object): 10 | def __init__(self): 11 | # Connect to Hopsworks 12 | project = hopsworks.login() 13 | self.fs = project.get_feature_store() 14 | 15 | # Retrieve 'transactions' feature group. 
16 | self.transactions_fg = self.fs.get_feature_group("transactions", 1) 17 | 18 | # Retrieve the 'articles' feature view 19 | self.articles_fv = self.fs.get_feature_view( 20 | name="articles", 21 | version=1, 22 | ) 23 | 24 | # Get list of feature names for articles 25 | self.articles_features = [feat.name for feat in self.articles_fv.schema] 26 | 27 | # Retrieve the 'customers' feature view 28 | self.customer_fv = self.fs.get_feature_view( 29 | name="customers", 30 | version=1, 31 | ) 32 | 33 | self.customer_fv.init_serving(1) 34 | 35 | # Retrieve the 'candidate_embeddings' feature view 36 | self.candidate_index = self.fs.get_feature_view( 37 | name="candidate_embeddings", 38 | version=1, 39 | ) 40 | 41 | # Retrieve ranking model 42 | mr = project.get_model_registry() 43 | model = mr.get_model( 44 | name="ranking_model", 45 | version=1, 46 | ) 47 | 48 | self.ranking_fv = model.get_feature_view(init=False) 49 | self.ranking_fv.init_batch_scoring(1) 50 | 51 | # Get the names of features expected by the ranking model 52 | self.ranking_model_feature_names = [ 53 | feature.name 54 | for feature 55 | in self.ranking_fv.schema 56 | if feature.name != 'label' 57 | ] 58 | 59 | def preprocess(self, inputs): 60 | # Extract the input instance 61 | inputs = inputs["instances"][0] 62 | 63 | # Extract customer_id from inputs 64 | customer_id = inputs["customer_id"] 65 | 66 | # Search for candidate items 67 | neighbors = self.candidate_index.find_neighbors( 68 | inputs["query_emb"], 69 | k=100, 70 | ) 71 | neighbors = [neighbor[0] for neighbor in neighbors] 72 | 73 | # Get IDs of items already bought by the customer 74 | already_bought_items_ids = ( 75 | self.transactions_fg.select("article_id").filter(self.transactions_fg.customer_id==customer_id).read(dataframe_type="pandas").values.reshape(-1).tolist() 76 | ) 77 | 78 | # Filter candidate items to exclude those already bought by the customer 79 | item_id_list = [ 80 | str(item_id) 81 | for item_id in neighbors 82 | if str(item_id) not in already_bought_items_ids 83 | ] 84 | item_id_df = pd.DataFrame({"article_id": item_id_list}) 85 | 86 | # Retrieve Article data for candidate items 87 | articles_data = [ 88 | self.articles_fv.get_feature_vector({"article_id": item_id}) 89 | for item_id in item_id_list 90 | ] 91 | 92 | logging.info("✅ Articles Data Retrieved!") 93 | 94 | articles_df = pd.DataFrame( 95 | data=articles_data, 96 | columns=self.articles_features, 97 | ) 98 | 99 | # Join candidate items with their features 100 | ranking_model_inputs = item_id_df.merge( 101 | articles_df, 102 | on="article_id", 103 | how="inner", 104 | ) 105 | 106 | logging.info("✅ Inputs are almost ready!") 107 | 108 | # Add customer features 109 | customer_features = self.customer_fv.get_feature_vector( 110 | {"customer_id": customer_id}, 111 | return_type="pandas", 112 | ) 113 | 114 | ranking_model_inputs["age"] = customer_features.age.values[0] 115 | ranking_model_inputs["month_sin"] = inputs["month_sin"] 116 | ranking_model_inputs["month_cos"] = inputs["month_cos"] 117 | 118 | # Select only the features required by the ranking model 119 | ranking_model_inputs = ranking_model_inputs[self.ranking_model_feature_names] 120 | 121 | logging.info("✅ Inputs are ready!") 122 | 123 | return { 124 | "inputs": [ 125 | { 126 | "ranking_features": ranking_model_inputs.values.tolist(), 127 | "article_ids": item_id_list, 128 | } 129 | ] 130 | } 131 | 132 | def postprocess(self, outputs): 133 | logging.info("✅ Predictions are ready!") 134 | 135 | # Merge prediction scores and 
corresponding article IDs into a list of tuples 136 | ranking = list(zip(outputs["scores"], outputs["article_ids"])) 137 | 138 | # Sort the ranking list by score in descending order 139 | ranking.sort(reverse=True) 140 | 141 | # Return the sorted ranking list 142 | return { 143 | "ranking": ranking, 144 | } 145 | -------------------------------------------------------------------------------- /recsys/raw_data_sources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/recsys/raw_data_sources/__init__.py -------------------------------------------------------------------------------- /recsys/raw_data_sources/h_and_m.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | 3 | 4 | def extract_articles_df() -> pl.DataFrame: 5 | return pl.read_csv("https://repo.hops.works/dev/jdowling/h-and-m/articles.csv", try_parse_dates=True) 6 | 7 | 8 | def extract_customers_df() -> pl.DataFrame: 9 | return pl.read_csv("https://repo.hops.works/dev/jdowling/h-and-m/customers.csv", try_parse_dates=True) 10 | 11 | 12 | def extract_transactions_df() -> pl.DataFrame: 13 | return pl.read_csv( 14 | "https://repo.hops.works/dev/jdowling/h-and-m/transactions_train.csv", try_parse_dates=True 15 | ) 16 | -------------------------------------------------------------------------------- /recsys/training/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ranking, two_tower 2 | 3 | __all__ = ["ranking", "two_tower"] 4 | -------------------------------------------------------------------------------- /recsys/training/ranking.py: -------------------------------------------------------------------------------- 1 | from catboost import CatBoostClassifier, Pool 2 | from loguru import logger 3 | from sklearn.metrics import classification_report, precision_recall_fscore_support 4 | 5 | from recsys.config import settings 6 | 7 | 8 | class RankingModelFactory: 9 | @classmethod 10 | def build(cls) -> CatBoostClassifier: 11 | return CatBoostClassifier( 12 | learning_rate=settings.RANKING_LEARNING_RATE, 13 | iterations=settings.RANKING_ITERATIONS, 14 | depth=10, 15 | scale_pos_weight=settings.RANKING_SCALE_POS_WEIGHT, 16 | early_stopping_rounds=settings.RANKING_EARLY_STOPPING_ROUNDS, 17 | use_best_model=True, 18 | ) 19 | 20 | 21 | class RankingModelTrainer: 22 | def __init__(self, model, train_dataset, eval_dataset) -> None: 23 | self._model = model 24 | 25 | self._X_train, self._y_train = train_dataset 26 | self._X_val, self._y_val = eval_dataset 27 | 28 | self._train_dataset, self._eval_dataset = self._initialize_dataset( 29 | train_dataset, eval_dataset 30 | ) 31 | 32 | def get_model(self): 33 | return self._model 34 | 35 | def _initialize_dataset(self, train_dataset, eval_dataset): 36 | X_train, y_train = train_dataset 37 | X_val, y_val = eval_dataset 38 | 39 | cat_features = list(X_train.select_dtypes(include=["string", "object"]).columns) 40 | 41 | pool_train = Pool(X_train, y_train, cat_features=cat_features) 42 | pool_val = Pool(X_val, y_val, cat_features=cat_features) 43 | 44 | return pool_train, pool_val 45 | 46 | def fit(self): 47 | self._model.fit( 48 | self._train_dataset, 49 | eval_set=self._eval_dataset, 50 | ) 51 | 52 | return self._model 53 | 54 | def evaluate(self, log: bool = False): 55 | preds = self._model.predict(self._eval_dataset) 56 | 57 
| precision, recall, fscore, _ = precision_recall_fscore_support( 58 | self._y_val, preds, average="binary" 59 | ) 60 | 61 | if log: 62 | logger.info(classification_report(self._y_val, preds)) 63 | 64 | return { 65 | "precision": precision, 66 | "recall": recall, 67 | "fscore": fscore, 68 | } 69 | 70 | def get_feature_importance(self) -> dict: 71 | feat_to_score = { 72 | feature: score 73 | for feature, score in zip( 74 | self._X_train.columns, 75 | self._model.feature_importances_, 76 | ) 77 | } 78 | 79 | feat_to_score = dict( 80 | sorted( 81 | feat_to_score.items(), 82 | key=lambda item: item[1], 83 | reverse=True, 84 | ) 85 | ) 86 | 87 | return feat_to_score 88 | -------------------------------------------------------------------------------- /recsys/training/two_tower.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_recommenders as tfrs 3 | from loguru import logger 4 | from tensorflow.keras.layers import Normalization, StringLookup 5 | 6 | from recsys.config import settings 7 | 8 | 9 | class QueryTowerFactory: 10 | def __init__(self, dataset: "TwoTowerDataset") -> None: 11 | self._dataset = dataset 12 | 13 | def build( 14 | self, embed_dim: int = settings.TWO_TOWER_MODEL_EMBEDDING_SIZE 15 | ) -> "QueryTower": 16 | return QueryTower( 17 | user_ids=self._dataset.properties["user_ids"], 18 | emb_dim=embed_dim, 19 | ) 20 | 21 | 22 | class QueryTower(tf.keras.Model): 23 | def __init__(self, user_ids: list, emb_dim: int) -> None: 24 | super().__init__() 25 | 26 | self.user_embedding = tf.keras.Sequential( 27 | [ 28 | StringLookup(vocabulary=user_ids, mask_token=None), 29 | tf.keras.layers.Embedding( 30 | # Add an additional embedding to account for unknown tokens. 31 | len(user_ids) + 1, 32 | emb_dim, 33 | ), 34 | ] 35 | ) 36 | 37 | self.normalized_age = Normalization(axis=None) 38 | 39 | self.fnn = tf.keras.Sequential( 40 | [ 41 | tf.keras.layers.Dense(emb_dim, activation="relu"), 42 | tf.keras.layers.Dense(emb_dim), 43 | ] 44 | ) 45 | 46 | def call(self, inputs): 47 | concatenated_inputs = tf.concat( 48 | [ 49 | self.user_embedding(inputs["customer_id"]), 50 | tf.reshape(self.normalized_age(inputs["age"]), (-1, 1)), 51 | tf.reshape(inputs["month_sin"], (-1, 1)), 52 | tf.reshape(inputs["month_cos"], (-1, 1)), 53 | ], 54 | axis=1, 55 | ) 56 | 57 | outputs = self.fnn(concatenated_inputs) 58 | 59 | return outputs 60 | 61 | 62 | class ItemTowerFactory: 63 | def __init__(self, dataset: "TwoTowerDataset") -> None: 64 | self._dataset = dataset 65 | 66 | def build( 67 | self, embed_dim: int = settings.TWO_TOWER_MODEL_EMBEDDING_SIZE 68 | ) -> "ItemTower": 69 | return ItemTower( 70 | item_ids=self._dataset.properties["item_ids"], 71 | garment_groups=self._dataset.properties["garment_groups"], 72 | index_groups=self._dataset.properties["index_groups"], 73 | emb_dim=embed_dim, 74 | ) 75 | 76 | 77 | class ItemTower(tf.keras.Model): 78 | def __init__( 79 | self, 80 | item_ids: list, 81 | garment_groups: list, 82 | index_groups: list, 83 | emb_dim: int, 84 | ): 85 | super().__init__() 86 | 87 | self.garment_groups = garment_groups 88 | self.index_groups = index_groups 89 | 90 | self.item_embedding = tf.keras.Sequential( 91 | [ 92 | StringLookup(vocabulary=item_ids, mask_token=None), 93 | tf.keras.layers.Embedding( 94 | # Add an additional embedding to account for unknown tokens. 
95 | len(item_ids) + 1, 96 | emb_dim, 97 | ), 98 | ] 99 | ) 100 | # Converts strings into integer indices (scikit-learn LabelEncoder analog) 101 | self.garment_group_tokenizer = StringLookup( 102 | vocabulary=garment_groups, 103 | mask_token=None, 104 | ) 105 | self.index_group_tokenizer = StringLookup( 106 | vocabulary=index_groups, 107 | mask_token=None, 108 | ) 109 | 110 | self.fnn = tf.keras.Sequential( 111 | [ 112 | tf.keras.layers.Dense(emb_dim, activation="relu"), 113 | tf.keras.layers.Dense(emb_dim), 114 | ] 115 | ) 116 | 117 | def call(self, inputs): 118 | garment_group_embedding = tf.one_hot( 119 | self.garment_group_tokenizer(inputs["garment_group_name"]), 120 | len(self.garment_groups), 121 | ) 122 | 123 | index_group_embedding = tf.one_hot( 124 | self.index_group_tokenizer(inputs["index_group_name"]), 125 | len(self.index_groups), 126 | ) 127 | 128 | concatenated_inputs = tf.concat( 129 | [ 130 | self.item_embedding(inputs["article_id"]), 131 | garment_group_embedding, 132 | index_group_embedding, 133 | ], 134 | axis=1, 135 | ) 136 | 137 | outputs = self.fnn(concatenated_inputs) 138 | 139 | return outputs 140 | 141 | 142 | class TwoTowerFactory: 143 | def __init__(self, dataset: "TwoTowerDataset") -> None: 144 | self._dataset = dataset 145 | 146 | def build( 147 | self, 148 | query_model: QueryTower, 149 | item_model: ItemTower, 150 | batch_size: int = settings.TWO_TOWER_MODEL_BATCH_SIZE, 151 | ) -> "TwoTowerModel": 152 | item_ds = self._dataset.get_items_subset() 153 | 154 | return TwoTowerModel( 155 | query_model, 156 | item_model, 157 | item_ds=item_ds, 158 | batch_size=batch_size, 159 | ) 160 | 161 | 162 | class TwoTowerModel(tf.keras.Model): 163 | def __init__( 164 | self, 165 | query_model: QueryTower, 166 | item_model: ItemTower, 167 | item_ds: tf.data.Dataset, 168 | batch_size: int, 169 | ) -> None: 170 | super().__init__() 171 | self.query_model = query_model 172 | self.item_model = item_model 173 | self.task = tfrs.tasks.Retrieval( 174 | metrics=tfrs.metrics.FactorizedTopK( 175 | candidates=item_ds.batch(batch_size).map(self.item_model) 176 | ) 177 | ) 178 | 179 | def train_step(self, batch) -> tf.Tensor: 180 | # Set up a gradient tape to record gradients. 181 | with tf.GradientTape() as tape: 182 | # Loss computation. 183 | user_embeddings = self.query_model(batch) 184 | item_embeddings = self.item_model(batch) 185 | loss = self.task( 186 | user_embeddings, 187 | item_embeddings, 188 | compute_metrics=False, 189 | ) 190 | 191 | # Handle regularization losses as well. 192 | regularization_loss = sum(self.losses) 193 | 194 | total_loss = loss + regularization_loss 195 | 196 | gradients = tape.gradient(total_loss, self.trainable_variables) 197 | self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) 198 | 199 | metrics = { 200 | "loss": loss, 201 | "regularization_loss": regularization_loss, 202 | "total_loss": total_loss, 203 | } 204 | 205 | return metrics 206 | 207 | def test_step(self, batch) -> tf.Tensor: 208 | # Loss computation. 209 | user_embeddings = self.query_model(batch) 210 | item_embeddings = self.item_model(batch) 211 | 212 | loss = self.task( 213 | user_embeddings, 214 | item_embeddings, 215 | compute_metrics=False, 216 | ) 217 | 218 | # Handle regularization losses as well. 
219 | regularization_loss = sum(self.losses) 220 | 221 | total_loss = loss + regularization_loss 222 | 223 | metrics = {metric.name: metric.result() for metric in self.metrics} 224 | metrics["loss"] = loss 225 | metrics["regularization_loss"] = regularization_loss 226 | metrics["total_loss"] = total_loss 227 | 228 | return metrics 229 | 230 | 231 | class TwoTowerDataset: 232 | def __init__(self, feature_view, batch_size: int) -> None: 233 | self._feature_view = feature_view 234 | self._batch_size = batch_size 235 | self._properties: dict | None 236 | 237 | @property 238 | def query_features(self) -> list[str]: 239 | return ["customer_id", "age", "month_sin", "month_cos"] 240 | 241 | @property 242 | def candidate_features(self) -> list[str]: 243 | return [ 244 | "article_id", 245 | "garment_group_name", 246 | "index_group_name", 247 | ] 248 | 249 | @property 250 | def properties(self) -> dict: 251 | assert self._properties is not None, "Call get_train_val_split() first." 252 | 253 | return self._properties 254 | 255 | def get_items_subset(self): 256 | item_df = self.properties["train_df"][self.candidate_features] 257 | item_df.drop_duplicates(subset="article_id", inplace=True) 258 | item_ds = self.df_to_ds(item_df) 259 | 260 | return item_ds 261 | 262 | def get_train_val_split(self): 263 | logger.info("Retrieving and creating train, val test split...") 264 | 265 | train_df, val_df, test_df, _, _, _ = ( 266 | self._feature_view.train_validation_test_split( 267 | validation_size=settings.TWO_TOWER_DATASET_VALIDATON_SPLIT_SIZE, 268 | test_size=settings.TWO_TOWER_DATASET_TEST_SPLIT_SIZE, 269 | description="Retrieval dataset splits", 270 | ) 271 | ) 272 | 273 | train_ds = ( 274 | self.df_to_ds(train_df) 275 | .batch(self._batch_size) 276 | .cache() 277 | .shuffle(self._batch_size * 10) 278 | ) 279 | val_ds = self.df_to_ds(val_df).batch(self._batch_size).cache() 280 | 281 | self._properties = { 282 | "train_df": train_df, 283 | "val_df": val_df, 284 | "query_df": train_df[self.query_features], 285 | "item_df": train_df[self.candidate_features], 286 | "user_ids": train_df["customer_id"].unique().tolist(), 287 | "item_ids": train_df["article_id"].unique().tolist(), 288 | "garment_groups": train_df["garment_group_name"].unique().tolist(), 289 | "index_groups": train_df["index_group_name"].unique().tolist(), 290 | } 291 | 292 | return train_ds, val_ds 293 | 294 | def df_to_ds(self, df): 295 | return tf.data.Dataset.from_tensor_slices({col: df[col] for col in df}) 296 | 297 | 298 | class TwoTowerTrainer: 299 | def __init__(self, dataset: TwoTowerDataset, model: TwoTowerModel) -> None: 300 | self._dataset = dataset 301 | self._model = model 302 | 303 | def train(self, train_ds, val_ds): 304 | self._initialize_query_model(train_ds) 305 | 306 | # Define an optimizer using AdamW with a learning rate of 0.01 307 | optimizer = tf.keras.optimizers.AdamW( 308 | weight_decay=settings.TWO_TOWER_WEIGHT_DECAY, 309 | learning_rate=settings.TWO_TOWER_LEARNING_RATE, 310 | ) 311 | 312 | # Compile the model using the specified optimizer 313 | self._model.compile(optimizer=optimizer) 314 | 315 | # Start training 316 | history = self._model.fit( 317 | train_ds, 318 | validation_data=val_ds, 319 | epochs=settings.TWO_TOWER_NUM_EPOCHS, 320 | ) 321 | 322 | return history 323 | 324 | def _initialize_query_model(self, train_ds): 325 | # Initialize age normalization layer. 326 | self._model.query_model.normalized_age.adapt(train_ds.map(lambda x: x["age"])) 327 | 328 | # Initialize model with inputs. 
329 | query_df = self._dataset.properties["query_df"] 330 | query_ds = self._dataset.df_to_ds(query_df).batch(1) 331 | self._model.query_model(next(iter(query_ds))) 332 | -------------------------------------------------------------------------------- /recsys/ui/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decodingml/personalized-recommender-course/6f421432d8e623d68a06581415a97b0ad09d1e3c/recsys/ui/__init__.py -------------------------------------------------------------------------------- /recsys/ui/feature_group_updater.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import random 4 | from datetime import datetime 5 | 6 | import hopsworks 7 | import pandas as pd 8 | import streamlit as st 9 | 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class FeatureGroupUpdater: 15 | def __init__(self): 16 | """Initialize the FeatureGroup updater""" 17 | self._initialize_feature_groups() 18 | 19 | def _initialize_feature_groups(self) -> None: 20 | """Initialize connection to Hopsworks Feature Groups""" 21 | try: 22 | if "feature_group" not in st.session_state: 23 | logger.info("📡 Initializing Hopsworks Feature Groups connection...") 24 | project = hopsworks.login() 25 | fs = project.get_feature_store() 26 | 27 | # Initialize interactions feature group 28 | st.session_state.feature_group = fs.get_feature_group( 29 | name="interactions", 30 | version=1, 31 | ) 32 | 33 | # Initialize transactions feature group 34 | st.session_state.transactions_fg = fs.get_feature_group( 35 | name="transactions", 36 | version=1, 37 | ) 38 | logger.info("✅ Feature Groups connection established") 39 | 40 | except Exception as e: 41 | logger.error(f"Failed to initialize Feature Groups connection: {str(e)}") 42 | st.error( 43 | "❌ Failed to connect to Feature Groups. Check terminal for details." 
44 | ) 45 | raise 46 | 47 | def _prepare_transaction_for_insertion(self, purchase_data: dict) -> pd.DataFrame: 48 | """Prepare transaction data for insertion into transactions feature group""" 49 | try: 50 | timestamp = datetime.now() 51 | 52 | transaction = { 53 | "t_dat": int(timestamp.timestamp()), 54 | "customer_id": str(purchase_data["customer_id"]), 55 | "article_id": str(purchase_data["article_id"]), 56 | "price": round(random.uniform(10, 140), 2), 57 | "sales_channel_id": 2, 58 | "year": timestamp.year, 59 | "month": timestamp.month, 60 | "day": timestamp.day, 61 | "day_of_week": timestamp.weekday(), 62 | "month_sin": math.sin(2 * math.pi * timestamp.month / 12), 63 | "month_cos": math.cos(2 * math.pi * timestamp.month / 12), 64 | } 65 | 66 | df = pd.DataFrame([transaction]) 67 | 68 | # Ensure correct data types 69 | df["t_dat"] = df["t_dat"].astype("int64") 70 | df["customer_id"] = df["customer_id"].astype(str) 71 | df["article_id"] = df["article_id"].astype(str) 72 | df["price"] = df["price"].astype("float64") 73 | df["sales_channel_id"] = df["sales_channel_id"].astype("int64") 74 | df["year"] = df["year"].astype("int32") 75 | df["month"] = df["month"].astype("int32") 76 | df["day"] = df["day"].astype("int32") 77 | df["day_of_week"] = df["day_of_week"].astype("int32") 78 | df["month_sin"] = df["month_sin"].astype("float64") 79 | df["month_cos"] = df["month_cos"].astype("float64") 80 | 81 | logger.info(f"Prepared transaction for insertion: {transaction}") 82 | return df 83 | 84 | except Exception as e: 85 | logger.error(f"Error preparing transaction data: {str(e)}") 86 | return None 87 | 88 | def insert_transaction(self, purchase_data: dict) -> bool: 89 | """Insert a single transaction into transactions feature group""" 90 | try: 91 | transaction_df = self._prepare_transaction_for_insertion(purchase_data) 92 | 93 | if transaction_df is not None: 94 | logger.info("Inserting transaction...") 95 | with st.spinner("💫 Recording transaction..."): 96 | st.session_state.transactions_fg.multi_part_insert(transaction_df) 97 | logger.info("✅ Transaction inserted successfully") 98 | return True 99 | 100 | except Exception as e: 101 | logger.error(f"Failed to insert transaction: {str(e)}") 102 | st.error("❌ Failed to insert transaction. 
Check terminal for details.") 103 | 104 | return False 105 | 106 | def _prepare_interactions_for_insertion(self, df: pd.DataFrame) -> pd.DataFrame: 107 | """Prepare interactions dataframe for insertion""" 108 | if df is None or df.empty: 109 | return None 110 | 111 | try: 112 | # Convert timestamp to Unix timestamp if needed 113 | if not pd.api.types.is_integer_dtype(df["t_dat"]): 114 | df["t_dat"] = pd.to_datetime(df["t_dat"]).astype("int64") // 10**9 115 | 116 | prepared_df = pd.DataFrame( 117 | { 118 | "t_dat": df["t_dat"].astype("int64"), 119 | "customer_id": df["customer_id"].astype(str), 120 | "article_id": df["article_id"].astype(str), 121 | "interaction_score": df["interaction_score"].astype("int64"), 122 | "prev_article_id": df["prev_article_id"].astype(str), 123 | } 124 | ) 125 | 126 | logger.info("Prepared interaction for insertion") 127 | return prepared_df 128 | 129 | except Exception as e: 130 | logger.error(f"Error preparing interaction data: {str(e)}") 131 | return None 132 | 133 | def process_interactions(self, tracker, force: bool = False) -> bool: 134 | """Process and insert interactions immediately""" 135 | try: 136 | interactions_df = tracker.get_interactions_data() 137 | 138 | if interactions_df.empty: 139 | return False 140 | 141 | prepared_df = self._prepare_interactions_for_insertion(interactions_df) 142 | if prepared_df is not None: 143 | logger.info("Inserting interactions...") 144 | st.session_state.feature_group.multi_part_insert(prepared_df) 145 | logger.info("✅ Interactions inserted successfully") 146 | return True 147 | 148 | except Exception as e: 149 | logger.error(f"Error processing interactions: {str(e)}") 150 | return False 151 | 152 | return False 153 | 154 | 155 | def get_fg_updater(): 156 | """Get or create FeatureGroupUpdater instance""" 157 | if "fg_updater" not in st.session_state: 158 | st.session_state.fg_updater = FeatureGroupUpdater() 159 | return st.session_state.fg_updater 160 | -------------------------------------------------------------------------------- /recsys/ui/interaction_tracker.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from dataclasses import dataclass 3 | from datetime import datetime 4 | from enum import Enum, auto 5 | from typing import Dict, List, Optional, Set, Tuple 6 | 7 | import pandas as pd 8 | import streamlit as st 9 | 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class InteractionType(Enum): 15 | """Enum for interaction types and their corresponding scores""" 16 | 17 | PURCHASE = auto() 18 | CLICK = auto() 19 | IGNORE = auto() 20 | 21 | @property 22 | def score(self) -> int: 23 | return { 24 | InteractionType.PURCHASE: 2, 25 | InteractionType.CLICK: 1, 26 | InteractionType.IGNORE: 0, 27 | }[self] 28 | 29 | @classmethod 30 | def from_str(cls, value: str) -> "InteractionType": 31 | return {"purchase": cls.PURCHASE, "click": cls.CLICK, "ignore": cls.IGNORE}[ 32 | value.lower() 33 | ] 34 | 35 | 36 | @dataclass 37 | class Interaction: 38 | t_dat: int # Unix timestamp 39 | customer_id: str 40 | article_id: str 41 | interaction_type: str 42 | interaction_score: int 43 | prev_article_id: Optional[str] 44 | 45 | 46 | class InteractionTracker: 47 | def __init__(self): 48 | """Initialize interaction tracking containers""" 49 | # Key: (customer_id, article_id, type) -> Interaction 50 | self.interactions: Dict[Tuple[str, str, str], Interaction] = {} 51 | # Key: customer_id -> list of article_ids 52 | 
self.current_items: Dict[str, List[str]] = {} 53 | # Key: customer_id -> set of article_ids 54 | self.purchased_items: Dict[str, Set[str]] = {} 55 | # Key: customer_id -> article_id 56 | self.last_interaction: Dict[str, str] = {} 57 | logger.info("Initialized InteractionTracker") 58 | 59 | def track_shown_items(self, customer_id: str, items_with_scores: list): 60 | """Record items being shown with their scores""" 61 | if customer_id not in self.purchased_items: 62 | self.purchased_items[customer_id] = set() 63 | 64 | item_ids = [str(item_id) for item_id, _ in items_with_scores] 65 | self.current_items[customer_id] = item_ids 66 | 67 | # Record ignore interactions 68 | timestamp = int(datetime.now().timestamp()) 69 | 70 | for idx, item_id in enumerate(item_ids): 71 | if item_id not in self.purchased_items.get(customer_id, set()): 72 | prev_id = item_ids[idx - 1] if idx > 0 else item_id 73 | self._add_interaction( 74 | customer_id=customer_id, 75 | article_id=item_id, 76 | interaction_type="ignore", 77 | prev_article_id=prev_id, 78 | timestamp=timestamp, 79 | ) 80 | 81 | logger.info(f"Tracked {len(item_ids)} shown items for customer {customer_id}") 82 | 83 | def track(self, customer_id: str, article_id: str, interaction_type: str): 84 | """Record a user interaction""" 85 | article_id = str(article_id) 86 | 87 | if customer_id not in self.purchased_items: 88 | self.purchased_items[customer_id] = set() 89 | 90 | prev_article_id = self.last_interaction.get(customer_id, article_id) 91 | 92 | self._add_interaction( 93 | customer_id=customer_id, 94 | article_id=article_id, 95 | interaction_type=interaction_type, 96 | prev_article_id=prev_article_id, 97 | ) 98 | 99 | # Update tracking state and UI feedback 100 | int_type = InteractionType.from_str(interaction_type) 101 | if int_type == InteractionType.PURCHASE: 102 | self.purchased_items[customer_id].add(article_id) 103 | st.toast(f"🛍️ Purchased item {article_id}", icon="🛍️") 104 | logger.info( 105 | f"Tracked purchase of item {article_id} by customer {customer_id}" 106 | ) 107 | elif int_type == InteractionType.CLICK: 108 | st.toast(f"Viewed details of item {article_id}", icon="👁️") 109 | logger.info(f"Tracked click on item {article_id} by customer {customer_id}") 110 | 111 | if int_type in (InteractionType.CLICK, InteractionType.PURCHASE): 112 | self.last_interaction[customer_id] = article_id 113 | 114 | def _add_interaction( 115 | self, customer_id, article_id, interaction_type, prev_article_id, timestamp=None 116 | ): 117 | """Add interaction with duplicate handling using dictionary""" 118 | if timestamp is None: 119 | timestamp = int(datetime.now().timestamp()) 120 | 121 | key = (customer_id, article_id, interaction_type) 122 | int_type = InteractionType.from_str(interaction_type) 123 | 124 | self.interactions[key] = Interaction( 125 | t_dat=timestamp, 126 | customer_id=str(customer_id), 127 | article_id=str(article_id), 128 | interaction_type=interaction_type, 129 | interaction_score=int_type.score, 130 | prev_article_id=str(prev_article_id), 131 | ) 132 | 133 | logger.debug( 134 | f"Added {interaction_type} interaction: " 135 | f"customer={customer_id}, article={article_id}, score={int_type.score}" 136 | ) 137 | 138 | def get_interactions_data(self) -> pd.DataFrame: 139 | """Get all recorded interactions as a pandas DataFrame""" 140 | if not self.interactions: 141 | logger.info("No interactions recorded yet") 142 | return pd.DataFrame( 143 | columns=[ 144 | "t_dat", 145 | "customer_id", 146 | "article_id", 147 | "interaction_type", 148 | 
"interaction_score", 149 | "prev_article_id", 150 | ] 151 | ) 152 | 153 | df = pd.DataFrame([vars(i) for i in self.interactions.values()]) 154 | logger.info(f"Retrieved {len(df)} interactions") 155 | return df 156 | 157 | def should_show_item(self, customer_id: str, article_id: str) -> bool: 158 | """Check if an item should be shown (not purchased)""" 159 | return str(article_id) not in self.purchased_items.get(customer_id, set()) 160 | 161 | def get_current_items(self, customer_id: str) -> List[str]: 162 | """Get current items for a customer""" 163 | return self.current_items.get(customer_id, []) 164 | 165 | def clear_interactions(self): 166 | """Clear all recorded interactions while preserving purchased items""" 167 | self.interactions.clear() 168 | logger.info("Cleared all recorded interactions") 169 | 170 | 171 | def get_tracker(): 172 | """Get or create InteractionTracker instance""" 173 | if "interaction_tracker" not in st.session_state: 174 | st.session_state.interaction_tracker = InteractionTracker() 175 | logger.info("Created new InteractionTracker instance") 176 | return st.session_state.interaction_tracker 177 | -------------------------------------------------------------------------------- /recsys/ui/recommenders.py: -------------------------------------------------------------------------------- 1 | import time 2 | from datetime import datetime 3 | 4 | import streamlit as st 5 | from langchain.chains import LLMChain 6 | from langchain.prompts import PromptTemplate 7 | from langchain_openai import ChatOpenAI 8 | from sentence_transformers import SentenceTransformer 9 | 10 | from recsys.config import settings 11 | 12 | from .feature_group_updater import get_fg_updater 13 | from .interaction_tracker import get_tracker 14 | from .utils import ( 15 | fetch_and_process_image, 16 | get_item_image_url, 17 | print_header, 18 | process_description, 19 | ) 20 | 21 | 22 | def initialize_llm_state(): 23 | """Initialize all necessary session state variables for LLM recommendations""" 24 | if "llm_recommendations" not in st.session_state: 25 | st.session_state.llm_recommendations = [] 26 | if "outfit_summary" not in st.session_state: 27 | st.session_state.outfit_summary = "" 28 | if "llm_extra_items" not in st.session_state: 29 | st.session_state.llm_extra_items = {} 30 | 31 | 32 | def display_item(item_id, score, articles_fv, customer_id, tracker, source): 33 | """Display a single item with its interactions""" 34 | image_url = get_item_image_url(item_id, articles_fv) 35 | img = fetch_and_process_image(image_url) 36 | 37 | if img: 38 | st.image(img, use_column_width=True) 39 | st.write(f"**🎯 Score:** {score:.4f}") 40 | 41 | # View Details button 42 | details_key = f"{source}_details_{item_id}" 43 | if st.button("📝 View Details", key=details_key): 44 | tracker.track(customer_id, item_id, "click") 45 | with st.expander("Item Details", expanded=True): 46 | description = process_description( 47 | articles_fv.get_feature_vector({"article_id": item_id})[-2] 48 | ) 49 | st.write(description) 50 | 51 | # Buy button 52 | buy_key = f"{source}_buy_{item_id}" 53 | if st.button("🛒 Buy", key=buy_key): 54 | # Track interaction 55 | tracker.track(customer_id, item_id, "purchase") 56 | 57 | # Insert transaction 58 | fg_updater = get_fg_updater() 59 | purchase_data = {"customer_id": customer_id, "article_id": item_id} 60 | 61 | if fg_updater.insert_transaction(purchase_data): 62 | st.success(f"✅ Item {item_id} purchased!") 63 | st.experimental_rerun() 64 | else: 65 | st.error("Failed to record transaction, 
but purchase was tracked") 66 | 67 | 68 | def customer_recommendations( 69 | articles_fv, 70 | ranking_deployment, 71 | query_model_deployment, 72 | customer_id, 73 | max_retries: int = 5, 74 | retry_delay: int = 30, 75 | ): 76 | """Handle customer-based recommendations""" 77 | tracker = get_tracker() 78 | 79 | # Initialize or update recommendations 80 | if "customer_recs" not in st.session_state: 81 | st.session_state.customer_recs = [] 82 | st.session_state.prediction_time = None 83 | 84 | # Only get new predictions if: 85 | # 1. Button is clicked OR 86 | # 2. No recommendations exist OR 87 | # 3. Customer ID changed 88 | if ( 89 | st.sidebar.button("Get Recommendations", key="get_recommendations_button") 90 | or not st.session_state.customer_recs 91 | or "last_customer_id" not in st.session_state 92 | or st.session_state.last_customer_id != customer_id 93 | ): 94 | with st.spinner("🔮 Getting recommendations..."): 95 | # Format timestamp with microseconds 96 | current_time = datetime.now() 97 | formatted_timestamp = current_time.strftime("%Y-%m-%dT%H:%M:%S.%f") 98 | 99 | st.session_state.prediction_time = formatted_timestamp 100 | st.session_state.last_customer_id = customer_id 101 | 102 | # Get predictions from model using a retry mechanism in case of failure. 103 | deployment_input = [ 104 | {"customer_id": customer_id, "transaction_date": formatted_timestamp} 105 | ] 106 | warning_placeholder = None 107 | for attempt in range(max_retries): 108 | try: 109 | prediction = query_model_deployment.predict( 110 | inputs=deployment_input 111 | )["predictions"]["ranking"] 112 | if warning_placeholder: 113 | warning_placeholder.empty() 114 | break 115 | except Exception as e: 116 | if attempt < max_retries - 1: 117 | warning_placeholder = st.warning( 118 | f"⚠️ Failed to call the H&M recommender deployment. It's probably scaling from 0 to +1 instances, which may take 1-2 minutes. Retrying in {retry_delay} seconds..." 119 | ) 120 | time.sleep(retry_delay) 121 | else: 122 | st.error( 123 | f"❌ Failed to get predictions after {max_retries} retries" 124 | ) 125 | raise e 126 | 127 | # Filter out purchased items 128 | available_items = [ 129 | (item_id, score) 130 | for score, item_id in prediction 131 | if tracker.should_show_item(customer_id, item_id) 132 | ] 133 | 134 | # Store recommendations and extras 135 | st.session_state.customer_recs = available_items[:12] 136 | st.session_state.extra_recs = available_items[12:] 137 | 138 | # Track shown items 139 | tracker.track_shown_items( 140 | customer_id, 141 | [(item_id, score) for item_id, score in st.session_state.customer_recs], 142 | ) 143 | 144 | st.sidebar.success("✅ Got new recommendations") 145 | 146 | # Display recommendations 147 | print_header("📝 Top 12 Recommendations:") 148 | 149 | if not st.session_state.customer_recs: 150 | st.warning( 151 | "No recommendations available. Click 'Get Recommendations' to start." 
152 | ) 153 | return 154 | 155 | # Display items in 3x4 grid 156 | for row in range(3): 157 | cols = st.columns(4) 158 | for col in range(4): 159 | idx = row * 4 + col 160 | if idx < len(st.session_state.customer_recs): 161 | item_id, score = st.session_state.customer_recs[idx] 162 | if tracker.should_show_item(customer_id, item_id): 163 | with cols[col]: 164 | display_item( 165 | item_id, 166 | score, 167 | articles_fv, 168 | customer_id, 169 | tracker, 170 | "customer", 171 | ) 172 | else: 173 | # Replace purchased item with one from extras 174 | if st.session_state.extra_recs: 175 | new_item = st.session_state.extra_recs.pop(0) 176 | st.session_state.customer_recs.append(new_item) 177 | st.session_state.customer_recs.pop(idx) 178 | st.experimental_rerun() 179 | 180 | 181 | def get_fashion_chain(api_key): 182 | model = ChatOpenAI( 183 | model_name=settings.OPENAI_MODEL_ID, 184 | temperature=0.7, 185 | openai_api_key=api_key, 186 | ) 187 | template = """ 188 | You are a fashion recommender for H&M. 189 | 190 | Customer request: {user_input} 191 | 192 | Gender: {gender} 193 | 194 | Generate 3-5 necessary fashion items with detailed descriptions, tailored for an H&M-style dataset and appropriate for the specified gender. 195 | Each item description should be specific, suitable for creating embeddings, and relevant to the gender. 196 | 197 | STRICTLY FOLLOW the next response format: 198 | @ | @ | @ | | 199 | 200 | Example for male gender: 201 | 👖 Pants @ Slim-fit dark wash jeans with subtle distressing | 👕 Top @ Classic white cotton polo shirt with embroidered logo | 👟 Footwear @ Navy canvas sneakers with white soles | 🧥 Outerwear @ Lightweight olive green bomber jacket | 🕶️👔 Versatile casual look! Mix and match for various occasions. Add accessories for personal flair! 💼⌚ 202 | 203 | Example for female gender: 204 | 👗 Dress @ Floral print wrap dress with flutter sleeves | 👠 Footwear @ Strappy nude block heel sandals | 👜 Accessory @ Woven straw tote bag with leather handles | 🧥 Outerwear @ Cropped denim jacket with raw hem | 🌸👒 Perfect for a summer day out! Layer with the jacket for cooler evenings. Add a wide-brim hat for extra style! 💃🏻🕶️ 205 | 206 | Ensure each item category has a relevant emoji, each item description is detailed, unique, and appropriate for the specified gender. 207 | Make sure to take into account the gender when selecting items and descriptions. 208 | The final section should provide a brief summary and styling tips with relevant emojis. Tailor your recommendations to the specified gender. 209 | """ 210 | prompt = PromptTemplate( 211 | input_variables=["user_input", "gender"], 212 | template=template, 213 | ) 214 | fashion_chain = LLMChain(llm=model, prompt=prompt, verbose=True) 215 | return fashion_chain 216 | 217 | 218 | def get_fashion_recommendations(user_input, fashion_chain, gender): 219 | """Get recommendations from the LLM""" 220 | response = fashion_chain.run(user_input=user_input, gender=gender) 221 | items = response.strip().split(" | ") 222 | 223 | outfit_summary = items[-1] if len(items) > 1 else "No summary available." 
224 | item_descriptions = items[:-1] if len(items) > 1 else items 225 | 226 | parsed_items = [] 227 | for item in item_descriptions: 228 | try: 229 | emoji_category, description = item.split(" @ ", 1) 230 | emoji, category = emoji_category.split(" ", 1) 231 | parsed_items.append((emoji, category, description)) 232 | except ValueError: 233 | parsed_items.append(("🔷", "Item", item)) 234 | 235 | return parsed_items, outfit_summary 236 | 237 | 238 | def display_llm_item(item_data, col, articles_fv, customer_id, tracker): 239 | """Display a single LLM recommendation item and handle interactions""" 240 | description, item = item_data 241 | item_id = str(item[0]) 242 | 243 | image_url = get_item_image_url(item_id, articles_fv) 244 | img = fetch_and_process_image(image_url) 245 | 246 | if not img: 247 | return False 248 | 249 | col.image(img, use_column_width=True) 250 | 251 | # View Details button 252 | if col.button("📝 View Details", key=f"llm_details_{item_id}"): 253 | tracker.track(customer_id, item_id, "click") 254 | with col.expander("Item Details", expanded=True): 255 | col.write(process_description(item[-2])) 256 | 257 | # Buy button 258 | if col.button("🛒 Buy", key=f"llm_buy_{item_id}"): 259 | # Track interaction 260 | tracker.track(customer_id, item_id, "purchase") 261 | 262 | # Insert transaction 263 | fg_updater = get_fg_updater() 264 | purchase_data = {"customer_id": customer_id, "article_id": item_id} 265 | 266 | if fg_updater.insert_transaction(purchase_data): 267 | st.success(f"✅ Item {item_id} purchased!") 268 | return True 269 | else: 270 | st.error("Failed to record transaction, but purchase was tracked") 271 | 272 | return False 273 | 274 | 275 | def display_category_items(emoji, category, items, articles_fv, customer_id, tracker): 276 | """Display items for a category and handle purchases""" 277 | st.markdown(f"## {emoji} {category}") 278 | 279 | if items: 280 | st.write(f"**Recommendation: {items[0][0]}**") 281 | 282 | # Calculate number of rows needed 283 | items_per_row = 5 284 | num_rows = (len(items) + items_per_row - 1) // items_per_row 285 | 286 | need_rerun = False 287 | remaining_items = [] 288 | 289 | # Display items row by row 290 | for row in range(num_rows): 291 | start_idx = row * items_per_row 292 | end_idx = min(start_idx + items_per_row, len(items)) 293 | row_items = items[start_idx:end_idx] 294 | 295 | cols = st.columns(items_per_row) 296 | 297 | for idx, item_data in enumerate(row_items): 298 | if tracker.should_show_item(customer_id, item_data[1][0]): 299 | with cols[idx]: 300 | if display_llm_item( 301 | item_data, cols[idx], articles_fv, customer_id, tracker 302 | ): 303 | need_rerun = True 304 | else: 305 | remaining_items.append(item_data) 306 | 307 | st.markdown("---") 308 | return need_rerun, remaining_items 309 | return False, [] 310 | 311 | 312 | def llm_recommendations(articles_fv, api_key, customer_id): 313 | """Handle LLM-based recommendations with proper state management""" 314 | st.write("🤖 LLM Fashion Recommender") 315 | 316 | # Initialize session state 317 | initialize_llm_state() 318 | 319 | tracker = get_tracker() 320 | embedding_model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID) 321 | 322 | # Gender selection 323 | gender = st.selectbox("Select gender:", ("Male", "Female")) 324 | 325 | # Input options 326 | input_options = [ 327 | "I'm going to the beach for a week-long vacation. What items do I need?", 328 | "I have a formal winter wedding to attend next month. 
What should I wear?", 329 | "I'm starting a new job at a tech startup with a casual dress code. What items should I add to my wardrobe?", 330 | "Custom input", 331 | ] 332 | 333 | selected_input = st.selectbox( 334 | "Choose your fashion need or enter a custom one:", input_options 335 | ) 336 | 337 | user_request = "" 338 | if selected_input == "Custom input": 339 | user_request = st.text_input("Enter your custom fashion need:") 340 | else: 341 | user_request = selected_input 342 | 343 | # Generate recommendations button 344 | if st.button("Get LLM Recommendations") and user_request: 345 | with st.spinner("Generating recommendations..."): 346 | try: 347 | fashion_chain = get_fashion_chain(api_key) 348 | item_recommendations, summary = get_fashion_recommendations( 349 | user_request, fashion_chain, gender 350 | ) 351 | 352 | # Clear previous recommendations 353 | st.session_state.llm_recommendations = [] 354 | st.session_state.llm_extra_items = {} 355 | st.session_state.outfit_summary = summary 356 | 357 | for emoji, category, description in item_recommendations: 358 | similar_items = get_similar_items( 359 | description, embedding_model, articles_fv 360 | ) 361 | shown_items = [] 362 | extra_items = [] 363 | 364 | # Split items into shown and extra 365 | for item in similar_items: 366 | if len(shown_items) < 5 and tracker.should_show_item( 367 | customer_id, item[0] 368 | ): 369 | shown_items.append((description, item)) 370 | elif tracker.should_show_item(customer_id, item[0]): 371 | extra_items.append((description, item)) 372 | 373 | if shown_items: 374 | st.session_state.llm_recommendations.append( 375 | (emoji, category, shown_items) 376 | ) 377 | st.session_state.llm_extra_items[category] = extra_items 378 | 379 | # Track shown items 380 | tracker.track_shown_items( 381 | customer_id, [(item[1][0], 0.0) for item in shown_items] 382 | ) 383 | 384 | except Exception as e: 385 | st.error(f"An error occurred: {str(e)}") 386 | return 387 | 388 | # Display outfit summary if available 389 | if st.session_state.outfit_summary: 390 | st.markdown("## 🎨 Outfit Summary") 391 | st.markdown( 392 | f"
{st.session_state.outfit_summary}
", 393 | unsafe_allow_html=True, 394 | ) 395 | st.markdown("---") 396 | 397 | # Display recommendations by category 398 | updated_recommendations = [] 399 | need_rerun = False 400 | 401 | for emoji, category, items in st.session_state.llm_recommendations: 402 | if not items: 403 | continue 404 | 405 | st.markdown(f"## {emoji} {category}") 406 | st.write(f"**Recommendation: {items[0][0]}**") 407 | 408 | # Calculate number of columns needed 409 | n_items = len(items) 410 | n_cols = min(5, n_items) 411 | cols = st.columns(n_cols) 412 | 413 | # Track which items to keep 414 | remaining_items = [] 415 | category_updated = False 416 | 417 | # Display items 418 | for idx, item_data in enumerate(items): 419 | item_id = item_data[1][0] 420 | 421 | # Only show if not purchased 422 | if tracker.should_show_item(customer_id, item_id): 423 | with cols[idx % n_cols]: 424 | # Display and handle purchase 425 | was_purchased = display_llm_item( 426 | item_data, cols[idx % n_cols], articles_fv, customer_id, tracker 427 | ) 428 | 429 | if was_purchased: 430 | # Item was purchased, try to get replacement 431 | category_updated = True 432 | extra_items = st.session_state.llm_extra_items.get(category, []) 433 | 434 | if extra_items: 435 | # Add replacement item from extras 436 | new_item = extra_items.pop(0) 437 | remaining_items.append(new_item) 438 | st.session_state.llm_extra_items[category] = extra_items 439 | else: 440 | # Keep the item in display 441 | remaining_items.append(item_data) 442 | 443 | # If we still have items to display in this category 444 | if remaining_items: 445 | updated_recommendations.append((emoji, category, remaining_items)) 446 | 447 | if category_updated: 448 | need_rerun = True 449 | 450 | st.markdown("---") 451 | 452 | # Update recommendations and rerun if needed 453 | if need_rerun: 454 | st.session_state.llm_recommendations = updated_recommendations 455 | st.experimental_rerun() 456 | 457 | 458 | def get_similar_items(description, embedding_model, articles_fv): 459 | """Get similar items based on description embedding""" 460 | description_embedding = embedding_model.encode(description) 461 | 462 | return articles_fv.find_neighbors(description_embedding, k=25) 463 | -------------------------------------------------------------------------------- /recsys/ui/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from io import BytesIO 3 | 4 | import requests 5 | import streamlit as st 6 | from PIL import Image, UnidentifiedImageError 7 | 8 | from recsys import hopsworks_integration 9 | from recsys.config import settings 10 | 11 | def print_header(text, font_size=22): 12 | res = f'{text}' 13 | st.markdown(res, unsafe_allow_html=True) 14 | 15 | 16 | @st.cache_data() 17 | def fetch_and_process_image(image_url, width=200, height=300): 18 | try: 19 | response = requests.get(image_url) 20 | img = Image.open(BytesIO(response.content)) 21 | img = img.resize((width, height), Image.LANCZOS) 22 | return img 23 | except (UnidentifiedImageError, requests.RequestException, IOError): 24 | return None 25 | 26 | 27 | def process_description(description): 28 | details_match = re.search(r"Details: (.+?)(?:\n|$)", description) 29 | return details_match.group(1) if details_match else "No details available." 
30 | 31 | 32 | def get_item_image_url(item_id, articles_fv): 33 | article_feature_view = articles_fv.get_feature_vector({"article_id": item_id}) 34 | if not article_feature_view: 35 | return None 36 | 37 | return article_feature_view[-1] 38 | 39 | 40 | @st.cache_resource() 41 | def get_deployments(): 42 | project, fs = hopsworks_integration.get_feature_store() 43 | 44 | ms = project.get_model_serving() 45 | 46 | articles_fv = fs.get_feature_view( 47 | name="articles", 48 | version=1, 49 | ) 50 | 51 | query_model_deployment = ms.get_deployment( 52 | hopsworks_integration.two_tower_serving.HopsworksQueryModel.deployment_name 53 | ) 54 | 55 | ranking_deployment = ms.get_deployment( 56 | settings.RANKING_MODEL_TYPE 57 | ) 58 | 59 | ranking_deployment.start(await_running=180) 60 | query_model_deployment.start(await_running=180) 61 | 62 | return articles_fv, ranking_deployment, query_model_deployment 63 | -------------------------------------------------------------------------------- /streamlit_app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import streamlit as st 5 | 6 | from recsys.config import settings 7 | from recsys.ui.feature_group_updater import get_fg_updater 8 | from recsys.ui.interaction_tracker import get_tracker 9 | from recsys.ui.recommenders import customer_recommendations, llm_recommendations 10 | from recsys.ui.utils import get_deployments 11 | 12 | # Configure logging 13 | logging.basicConfig(level=logging.INFO) 14 | logger = logging.getLogger(__name__) 15 | 16 | # Constants 17 | CUSTOMER_IDS = [ 18 | "9e619265e3ae0d2ef96a71577c4aff3474bfa7dd0d60486b42bc8f921c3387c0", 19 | "a1f7201399574e78b0a1575c50e3b68d116f84e24c0f70c957083da99db6ab5f", 20 | "19fa659096de20f0c022b9727779e849813ccc82952b3d56e212ab18fa2c0bf3", 21 | "d9448c8585f1678937deb5118d95b09bf6f41fe00a65b1fb82c7d176c6bfc532", 22 | "b41d990c8a127dac386dd6c9f2a6ec4ac41185cd21ef2df0a952a8cbdf61ed5d", 23 | ] 24 | 25 | 26 | def initialize_page(): 27 | """Initialize Streamlit page configuration""" 28 | st.set_page_config(layout="wide", initial_sidebar_state="expanded") 29 | st.title("👒 Fashion Items Recommender") 30 | st.sidebar.title("⚙️ Configuration") 31 | 32 | 33 | def initialize_services(): 34 | """Initialize tracker, updater, and deployments""" 35 | tracker = get_tracker() 36 | fg_updater = get_fg_updater() 37 | 38 | logger.info("Initializing deployments...") 39 | with st.sidebar: 40 | with st.spinner("🚀 Starting Deployments..."): 41 | articles_fv, ranking_deployment, query_model_deployment = get_deployments() 42 | st.success("✅ Deployments Ready") 43 | 44 | # Stop deployments button 45 | if st.button( 46 | "⏹️ Stop Deployments", key="stop_deployments_button", type="secondary" 47 | ): 48 | ranking_deployment.stop() 49 | query_model_deployment.stop() 50 | st.success("Deployments stopped successfully!") 51 | 52 | return tracker, fg_updater, articles_fv, ranking_deployment, query_model_deployment 53 | 54 | 55 | def show_interaction_dashboard(tracker, fg_updater, page_selection): 56 | """Display interaction data and controls""" 57 | with st.sidebar.expander("📊 Interaction Dashboard", expanded=True): 58 | if page_selection == "LLM Recommendations": 59 | api_key = ( 60 | settings.OPENAI_API_KEY.get_secret_value() 61 | if settings.OPENAI_API_KEY 62 | and settings.OPENAI_API_KEY.get_secret_value() 63 | else None 64 | ) 65 | if not api_key: 66 | api_key = st.text_input( 67 | "🔑 OpenAI API Key:", type="password", key="openai_api_key" 68 | ) 69 | if 
api_key: 70 | os.environ["OPENAI_API_KEY"] = api_key 71 | else: 72 | st.warning("⚠️ Please enter OpenAI API Key for LLM Recommendations") 73 | st.divider() 74 | 75 | interaction_data = tracker.get_interactions_data() 76 | 77 | col1, col2, col3 = st.columns(3) 78 | total = len(interaction_data) 79 | clicks = len(interaction_data[interaction_data["interaction_score"] == 1]) 80 | purchases = len(interaction_data[interaction_data["interaction_score"] == 2]) 81 | 82 | col1.metric("Total", total) 83 | col2.metric("Clicks", clicks) 84 | col3.metric("Purchases", purchases) 85 | 86 | st.dataframe(interaction_data, hide_index=True) 87 | fg_updater.process_interactions(tracker, force=True) 88 | 89 | 90 | def handle_llm_page(articles_fv, customer_id): 91 | """Handle LLM recommendations page""" 92 | if "OPENAI_API_KEY" in os.environ: 93 | llm_recommendations(articles_fv, os.environ["OPENAI_API_KEY"], customer_id) 94 | else: 95 | st.warning("Please provide your OpenAI API Key in the Interaction Dashboard") 96 | 97 | 98 | def process_pending_interactions(tracker, fg_updater): 99 | """Process interactions immediately""" 100 | fg_updater.process_interactions(tracker, force=True) 101 | 102 | 103 | def main(): 104 | # Initialize page 105 | initialize_page() 106 | 107 | # Initialize services 108 | tracker, fg_updater, articles_fv, ranking_deployment, query_model_deployment = ( 109 | initialize_services() 110 | ) 111 | 112 | # Select customer 113 | customer_id = st.sidebar.selectbox( 114 | "👤 Select Customer:", CUSTOMER_IDS, key="selected_customer" 115 | ) 116 | 117 | # Page selection 118 | page_options = ["Customer Recommendations", "LLM Recommendations"] 119 | page_selection = st.sidebar.radio("📑 Choose Page:", page_options) 120 | 121 | # Process any pending interactions with notification 122 | process_pending_interactions(tracker, fg_updater) 123 | 124 | # Interaction dashboard with OpenAI API key field 125 | show_interaction_dashboard(tracker, fg_updater, page_selection) 126 | 127 | # Handle page content 128 | if page_selection == "Customer Recommendations": 129 | customer_recommendations( 130 | articles_fv, ranking_deployment, query_model_deployment, customer_id 131 | ) 132 | else: # LLM Recommendations 133 | handle_llm_page(articles_fv, customer_id) 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /tools/clean_hopsworks_resources.py: -------------------------------------------------------------------------------- 1 | import hopsworks 2 | 3 | # Login to Hopsworks 4 | project = hopsworks.login() 5 | 6 | 7 | # Get deployment registry 8 | mr = project.get_model_serving() 9 | 10 | # List all deployments 11 | deployments = mr.get_deployments() 12 | 13 | # Delete each deployment 14 | for deployment in deployments: 15 | print(f"Deleting deployment: {deployment.name}.") 16 | deployment.stop() 17 | deployment.delete() 18 | 19 | # Get the model registry 20 | mr = project.get_model_registry() 21 | 22 | # List all models 23 | for model_name in [ 24 | "llm_ranking_model", 25 | "ranking_model", 26 | "candidate_model", 27 | "query_model", 28 | ]: 29 | models = mr.get_models(name=model_name) 30 | 31 | # Delete each model 32 | for model in models: 33 | print(f"Deleting model: {model.name} (version: {model.version})") 34 | try: 35 | model.delete() 36 | except Exception: 37 | print(f"Failed to delete model {model_name}.") 38 | 39 | # Get feature store 40 | fs = project.get_feature_store() 41 | 42 | for feature_view in [ 43 | 
"retrieval", 44 | "articles", 45 | "customers", 46 | "candidate_embeddings", 47 | "ranking", 48 | ]: 49 | # Get all feature views 50 | try: 51 | feature_views = fs.get_feature_views(name=feature_view) 52 | except: 53 | print(f"Couldn't find feature view: {feature_view}. Skipping...") 54 | feature_views = [] 55 | 56 | # Delete each feature view 57 | for fv in feature_views: 58 | print(f"Deleting feature view: {fv.name} (version: {fv.version})") 59 | try: 60 | fv.delete() 61 | except Exception: 62 | print(f"Failed to delete feature view {fv.name}.") 63 | 64 | for feature_group in [ 65 | "customers", 66 | "articles", 67 | "transactions", 68 | "interactions", 69 | "candidate_embeddings", 70 | "ranking", 71 | ]: 72 | # Get all feature groups 73 | try: 74 | feature_groups = fs.get_feature_groups(name=feature_group) 75 | except: 76 | print(f"Couldn't find feature group: {feature_view}. Skipping...") 77 | feature_groups = [] 78 | 79 | # Delete each feature group 80 | for fg in feature_groups: 81 | print(f"Deleting feature group: {fg.name} (version: {fg.version})") 82 | try: 83 | fg.delete() 84 | except: 85 | print(f"Failed to delete feature group {fv.name}.") 86 | --------------------------------------------------------------------------------