├── dev-requirements.txt
├── images
    ├── feature-store-training.png
    ├── feature_store_diagram.png
    ├── feature-store-training-v2.png
    ├── feature-store-vector-line.png
    ├── feature-store-vector-screen.png
    ├── feature_store_demo_diagram.png
    ├── feature_store_how_it_works.png
    ├── features-catalog-transaction.png
    └── feature-store-graph.svg
├── requirements.txt
├── .flake8
├── pyproject.toml
├── .test
├── setup.py
├── src
    ├── __init__.py
    ├── get_vector.py
    ├── serving.py
    ├── date_adjust.py
    ├── train_sklearn.py
    └── train_workflow.py
├── tests
    └── test_data_prep.py
├── README.md
├── .github
    └── workflows
    │   └── ci.yaml
├── Makefile
├── project_setup.py
├── LICENSE
├── 05-real-time-serving-pipeline.ipynb
├── 04-train-test-pipeline.ipynb
└── 02-interactive-data-preparation.ipynb


/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | black
3 | isort
4 | flake8
5 | 


--------------------------------------------------------------------------------
/images/feature-store-training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-fraud/HEAD/images/feature-store-training.png


--------------------------------------------------------------------------------
/images/feature_store_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-fraud/HEAD/images/feature_store_diagram.png


--------------------------------------------------------------------------------
/images/feature-store-training-v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-fraud/HEAD/images/feature-store-training-v2.png


--------------------------------------------------------------------------------
/images/feature-store-vector-line.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-fraud/HEAD/images/feature-store-vector-line.png


--------------------------------------------------------------------------------
/images/feature-store-vector-screen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-fraud/HEAD/images/feature-store-vector-screen.png


--------------------------------------------------------------------------------
/images/feature_store_demo_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-fraud/HEAD/images/feature_store_demo_diagram.png


--------------------------------------------------------------------------------
/images/feature_store_how_it_works.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-fraud/HEAD/images/feature_store_how_it_works.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn
2 | seaborn
3 | pandas==2.1.4
4 | mlrun
5 | redis
6 | s3fs
7 | matplotlib
8 | plotly
9 | graphviz


--------------------------------------------------------------------------------
/images/features-catalog-transaction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlrun/demo-fraud/HEAD/images/features-catalog-transaction.png


--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 85
3 | extend-ignore = E203, W503
4 | 
5 | # exclude these dirs
6 | exclude = .git,venv,playground
7 | 
8 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.isort]
2 | profile = "black"
3 | multi_line_output = 3
4 | #line_length = 85
5 | 
6 | [tool.black]
7 | line-length = 85
8 | 


--------------------------------------------------------------------------------
/.test:
--------------------------------------------------------------------------------
1 | 01-exploratory-data-analysis.ipynb
2 | 02-interactive-data-preparation.ipynb
3 | 03-ingest-with-feature-store.ipynb
4 | 04-train-test-pipeline.ipynb
5 | 05-real-time-serving-pipeline.ipynb


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import find_packages, setup
 2 | 
 3 | project_name = "demo_fraud"
 4 | with open("README.md", "r", encoding="utf-8") as fh:
 5 |     long_description = fh.read()
 6 | 
 7 | setup(
 8 |     name=project_name,
 9 |     packages=[project_name],
10 |     package_dir={project_name: "src"},
11 |     version="0.1.0",
12 |     description="my desc",
13 |     author="Yaron",
14 |     author_email="author@example.com",
15 |     license="MIT",
16 |     long_description=long_description,
17 |     long_description_content_type="text/markdown",
18 |     python_requires=">=3.7",
19 | )
20 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Iguazio
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | 


--------------------------------------------------------------------------------
/src/get_vector.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Iguazio
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | import mlrun
16 | import mlrun.feature_store as fstore
17 | 
18 | from mlrun.datastore.targets import ParquetTarget
19 | 
20 | 
def get_offline_features(feature_vector, features, label_feature):
    """
    Build a feature vector object and request its offline features.

    :param feature_vector: first positional argument of ``fstore.FeatureVector``
                           (presumably the vector name -- confirm against the mlrun API)
    :param features:       second positional argument of ``fstore.FeatureVector``
                           (presumably the list of feature references -- confirm)
    :param label_feature:  forwarded as the ``label_feature`` keyword

    :returns: the object returned by ``FeatureVector.get_offline_features`` when it is
              called with a default-constructed ``ParquetTarget``
    """
    vector = fstore.FeatureVector(
        feature_vector,
        features,
        label_feature=label_feature,
        description="Predicting a fraudulent transaction",
    )
    return vector.get_offline_features(target=ParquetTarget())


--------------------------------------------------------------------------------
/tests/test_data_prep.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import pandas as pd
 4 | 
 5 | 
 6 | class MyTest(unittest.TestCase):
 7 |     def test_(self):
 8 |         data = self.get_data()
 9 |         assert data is not None
10 | 
11 |     def get_data(self):
12 |         return pd.DataFrame(
13 |             {
14 |                 "key": [
15 |                     "2009-06-15 17:26:21.0000001",
16 |                     "2010-01-05 16:52:16.0000002",
17 |                     "2011-08-18 00:35:00.00000049",
18 |                 ],
19 |                 "fare_amount": [4.5, 16.9, 5.7],
20 |                 "pickup_datetime": [
21 |                     "2009-06-15 17:26:21 UTC",
22 |                     "2010-01-05 16:52:16 UTC",
23 |                     "2011-08-18 00:35:00 UTC",
24 |                 ],
25 |                 "pickup_longitude": [-73.844311, -74.016048, -73.982738],
26 |                 "pickup_latitude": [40.721319, 40.711303, 40.76127],
27 |                 "dropoff_longitude": [-73.84161, -73.979268, -73.991242],
28 |                 "dropoff_latitude": [40.712278, 40.782004, 40.750562],
29 |                 "passenger_count": [1, 1, 2],
30 |             }
31 |         )
32 | 


--------------------------------------------------------------------------------
/src/serving.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Iguazio
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | 
16 | 
17 | import numpy as np
18 | from cloudpickle import load
19 | from mlrun.serving.v2_serving import V2ModelServer
20 | 
21 | 
class ClassifierModel(V2ModelServer):
    """Model serving classifier example"""

    def load(self):
        """
        Load the pickled model and store it on ``self.model``.

        The model file path is obtained from ``self.get_model(".pkl")``; the second
        element of the returned pair is not used here.
        """
        model_file, _ = self.get_model(".pkl")
        # NOTE(review): unpickling executes arbitrary code -- only serve model
        # artifacts that come from a trusted source.
        # The context manager closes the file handle even if unpickling fails.
        with open(model_file, "rb") as model_fh:
            self.model = load(model_fh)

    def predict(self, body: dict) -> list:
        """
        Generate model predictions from sample

        :param body: request body; ``body["inputs"]`` is converted with ``np.asarray``
                     and passed to ``self.model.predict``
        :returns: the prediction array converted to a plain list
        """
        print(f"Input -> {body['inputs']}")
        feats = np.asarray(body["inputs"])
        result: np.ndarray = self.model.predict(feats)
        return result.tolist()
36 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Feature-Store End-to-End Demo
 2 | 
 3 | 
 4 | This demo shows the usage of MLRun and the feature store. 
 5 | 
 6 | > - This demo works with the online feature store, which is currently not part of the Open Source default deployment.
 7 | 
 8 | The demo showcases:
 9 | 
10 | - [**Data Exploration**](01-exploratory-data-analysis.ipynb)
11 | - [**Offline Data prepare and train**](02-interactive-data-preparation.ipynb)
12 | - [**Data ingestion & preparation**](03-ingest-with-feature-store.ipynb)
13 | - [**Building An Automated ML Pipeline**](04-train-test-pipeline.ipynb)
14 | - [**Model serving**](05-real-time-serving-pipeline.ipynb)
15 | 
 16 | Fraud prevention specifically is a challenge as it requires processing raw transactions and events in real time and being able to
17 | quickly respond and block transactions before they occur. Consider, for example, a case where you would like to evaluate the
18 | average transaction amount. When training the model, it is common to take a DataFrame and just calculate the average. However,
19 | when dealing with real-time/online scenarios, this average has to be calculated incrementally.
20 | 
21 | In this demo we will learn how to **Ingest** different data sources to our **Feature Store**. Specifically, we will consider 2 types of data: 
22 | 
23 | - **Transactions**: Monetary activity between 2 parties to transfer funds.
 24 | - **Events**: Activity performed by a party, such as a login or a password change.
25 | 
26 | ![](./images/feature_store_demo_diagram.png)
27 | 
 28 | We will walk through the creation of an ingestion pipeline for each data source, with all the needed preprocessing and validation. We will run the pipeline locally within the notebook and then launch a real-time function to **ingest live data** or schedule a cron job to run the task when needed.
29 | 
30 | Following the ingestion, we will create a feature vector, select the most relevant features and create a final model. We will then deploy the model and showcase the feature vector and model serving.
31 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     branches:
 6 |     - development
 7 |   push:
 8 |     branches:
 9 |     - main
10 | 
11 | jobs:
12 |   lint:
13 |     name: Lint code (Python ${{ matrix.python-version }})
14 |     runs-on: ubuntu-latest
15 |     strategy:
16 |       matrix:
17 |         python-version: [3.9]
18 |     steps:
19 |     - uses: actions/checkout@v3
20 |     - name: Set up python ${{ matrix.python-version }}
21 |       uses: actions/setup-python@v4
22 |       with:
23 |         python-version: ${{ matrix.python-version }}
24 |     - uses: actions/cache@v2
25 |       with:
26 |         path: ~/.cache/pip
27 |         key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/dev-requirements.txt') }}
28 |         restore-keys: |
29 |           ${{ runner.os }}-pip-${{ matrix.python-version }}-
30 |           ${{ runner.os }}-pip-
31 |     - name: Install dependencies
32 |       run: |
33 |         python -m pip install --upgrade pip~=22.3.0
34 |         pip install -r dev-requirements.txt
35 |     - name: Lint
36 |       run: make lint
37 | 
38 | 
39 |   tests:
40 |     name: Run tests (Python ${{ matrix.python-version }})
41 |     runs-on: ubuntu-latest
42 |     strategy:
43 |       matrix:
44 |         python-version: [3.9]
45 |     steps:
46 |     - uses: actions/checkout@v3
47 |     - name: Set up python ${{ matrix.python-version }}
48 |       uses: actions/setup-python@v4
49 |       with:
50 |         python-version: ${{ matrix.python-version }}
51 |     - uses: actions/cache@v2
52 |       with:
53 |         path: ~/.cache/pip
54 |         key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('requirements.txt') }}
55 |         restore-keys: |
56 |           ${{ runner.os }}-pip-${{ matrix.python-version }}-
57 |           ${{ runner.os }}-pip-
58 |     - name: Install automation scripts dependencies and add mlrun to dev packages
59 |       run: pip install -r requirements.txt -r dev-requirements.txt
60 |     - name: Test package
61 |       run: make test
62 | 


--------------------------------------------------------------------------------
/src/date_adjust.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Iguazio
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | 
16 | 
17 | # Helper functions to adjust the timestamps of our data
18 | # while keeping the order of the selected events and
19 | # the relative distance from one event to the other
20 | import pandas as pd
21 | 
22 | 
def date_adjustment(
    sample: pd.Timestamp,
    data_max: pd.Timestamp,
    new_max: pd.Timestamp,
    old_data_period: pd.Timedelta,
    new_data_period: pd.Timedelta,
) -> pd.Timestamp:
    """
    Map one timestamp from the original time span onto the new time span.

    The sample's distance from the original maximum is expressed as a fraction of
    the original period; the same fraction of the new period is then subtracted
    from the new maximum.

    :param sample: The sample's timestamp
    :param data_max: The original data's max timestamp
    :param new_max: The new data's max timestamp
    :param old_data_period: The original data's time period
    :param new_data_period: The new data's time period

    :returns: The adjusted timestamp
    """
    # Fraction of the old period that lies between the sample and the old maximum.
    distance_fraction = (data_max - sample) / old_data_period
    return new_max - new_data_period * distance_fraction
45 | 
46 | 
def adjust_data_timespan(
    dataframe: pd.DataFrame,
    timestamp_col: str = "timestamp",
    new_period: str = "2d",
    new_max_date_str: str = "now",
) -> pd.DataFrame:
    """
    Adjust the dataframe timestamps to the new time period

    The input frame is not modified; a copy with rescaled timestamps, sorted by
    the timestamp column, is returned.

    :param dataframe: The dataframe to adjust
    :param timestamp_col: The timestamp column name
    :param new_period: The new time period (parsed with ``pd.Timedelta``)
    :param new_max_date_str: The new max date (parsed with ``pd.Timestamp``)

    :returns: The adjusted dataframe
    """
    # Calculate old time period from the requested column (not a hard-coded
    # "timestamp" attribute), so that custom column names work.
    # NOTE(review): if every timestamp is identical the old period is zero and the
    # per-row rescaling divides by it -- confirm callers never pass such data.
    timestamps = dataframe[timestamp_col]
    data_max = timestamps.max()
    old_data_period = data_max - timestamps.min()

    # Set new time period
    new_data_period = pd.Timedelta(new_period)
    new_max = pd.Timestamp(new_max_date_str)

    # Apply the timestamp change on a copy
    df = dataframe.copy()
    df[timestamp_col] = df[timestamp_col].apply(
        lambda x: date_adjustment(
            x, data_max, new_max, old_data_period, new_data_period
        )
    )
    df.sort_values(by=timestamp_col, axis=0, inplace=True)
    return df
83 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | PYTHON_INTERPRETER = python3
 3 | SHARED_DIR ?= ~/mlrun-data
 4 | HOST_MNT_DIR ?= $(SHARED_DIR)
 5 | MLRUN_TAG ?= 1.2.1
 6 | HOST_IP ?=$$(ip route get 1.2.3.4 | awk '{print $$7}')
 7 | CONDA_ENV ?= mlrun
 8 | SHELL=/bin/bash
 9 | CONDA_PY_VER ?= 3.7
10 | CONDA_ACTIVATE = source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate ; conda activate
11 | 
12 | #################################################################################
13 | # COMMANDS                                                                      #
14 | #################################################################################
15 | 
16 | .PHONY: help
17 | help: ## Display available commands
18 | 	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
19 | 
20 | .PHONY: all
21 | all:
22 | 	$(error please pick a target)
23 | 
24 | .PHONY: install-requirements
25 | install-requirements: ## Install all requirements needed for development
26 | 	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt -r dev-requirements.txt
27 | 
28 | 
29 | .PHONY: package-wheel
30 | package-wheel: clean ## Build python package wheel
31 | 	python setup.py bdist_wheel
32 | 
33 | .PHONY: clean
34 | clean: ## Clean python package build artifacts
35 | 	rm -rf build
36 | 	rm -rf dist
37 | 	find . -type f -name "*.py[co]" -delete
38 | 	find . -type d -name "__pycache__" -delete
39 | 
40 | .PHONY: fmt
41 | fmt: ## Format the code (using black and isort)
42 | 	@echo "Running black fmt..."
43 | 	$(PYTHON_INTERPRETER) -m black src tests
44 | 	$(PYTHON_INTERPRETER) -m isort src tests
45 | 
46 | .PHONY: lint
47 | lint: fmt-check flake8 ## Run lint on the code
48 | 
49 | .PHONY: fmt-check
50 | fmt-check: ## Format and check the code (using black and isort)
51 | 	@echo "Running black+isort fmt check..."
52 | 	$(PYTHON_INTERPRETER) -m black --check --diff src tests
53 | 	$(PYTHON_INTERPRETER) -m isort --check --diff src tests
54 | 
55 | .PHONY: flake8
56 | flake8: ## Run flake8 lint
57 | 	@echo "Running flake8 lint..."
58 | 	$(PYTHON_INTERPRETER) -m flake8 src tests
59 | 
60 | .PHONY: test
61 | test: clean ## Run tests
62 | 	$(PYTHON_INTERPRETER) -m pytest -v --capture=no --disable-warnings -rf tests
63 | 
64 | .PHONY: mlrun-docker
65 | mlrun-docker: ## Start MLRun & Nuclio containers (using Docker compose)
66 | 	mkdir $(SHARED_DIR) -p
67 | 	@echo "HOST_IP=$(HOST_IP)" > .env
68 | 	SHARED_DIR=$(SHARED_DIR) HOST_MNT_DIR=$(HOST_MNT_DIR) TAG=$(MLRUN_TAG) docker-compose -f compose.yaml up -d
69 | 	@echo "use docker-compose stop / logs commands to stop or view logs"
70 | 
71 | .PHONY: mlrun-api
72 | mlrun-api: ## Run MLRun DB locally (as process)
73 | 	@echo "Installing MLRun API dependencies ..."
74 | 	MLRUN_IGNORE_ENV_FILE=1 mlrun db -b
75 | 
# mlrun.env is optional: the `if` keeps the recipe going when the file is missing or
# empty (a bare `test -s ... && ...` would return non-zero and abort the target).
.PHONY: conda-env
conda-env: ## Create a conda environment
	@echo "Creating new conda environment $(CONDA_ENV)..."
	conda create -n $(CONDA_ENV) -y python=$(CONDA_PY_VER) ipykernel graphviz pip
	if test -s ./mlrun.env; then conda env config vars set -n $(CONDA_ENV) MLRUN_ENV_FILE=$$(realpath ./mlrun.env); fi
	@echo "Installing requirements.txt..."
	$(CONDA_ACTIVATE) $(CONDA_ENV); pip install -r requirements.txt -r dev-requirements.txt
	@echo -e "\nTo run mlrun API as a local process type:\n  conda activate $(CONDA_ENV) && make mlrun-api"
84 | 


--------------------------------------------------------------------------------
/src/train_sklearn.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2024 Iguazio
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | #
 15 | 
 16 | 
 17 | import pandas as pd
 18 | from sklearn.model_selection import train_test_split
 19 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
 20 | from sklearn.model_selection import RandomizedSearchCV
 21 | from sklearn.ensemble import RandomForestClassifier
 22 | 
 23 | 
 24 | def prepare_data_to_train(
 25 |     transactions_data_p: pd.DataFrame,
 26 |     user_events_data_p: pd.DataFrame,
 27 |     labels_set: pd.DataFrame,
 28 | ) -> pd.DataFrame:
 29 |     """
 30 |     This function prepare data to train and test
 31 | 
 32 |     :param transactions_data_p: transactions data
 33 |     :param user_events_data_p: user events data
 34 |     :param labels_set: labels data
 35 |     :return: train and test data
 36 |     """
 37 |     transactions_data_p.drop(columns=["age", "target", "device"], inplace=True)
 38 |     transactions_data_p.sort_values(by="timestamp", inplace=True)
 39 |     user_events_data_p.sort_values(by="timestamp", inplace=True)
 40 | 
 41 |     merged_df = pd.merge_asof(
 42 |         transactions_data_p,
 43 |         user_events_data_p,
 44 |         on="timestamp",
 45 |         by="source",
 46 |     )
 47 | 
 48 |     data_for_train = (
 49 |         pd.merge_asof(merged_df, labels_set, on="timestamp", by="source")
 50 |         .drop(columns=["source", "timestamp"])
 51 |         .dropna()
 52 |     )
 53 | 
 54 |     lable = data_for_train["label"]
 55 |     data_for_train.drop(columns=["label"], inplace=True)
 56 | 
 57 |     return train_test_split(data_for_train, lable, test_size=0.2, random_state=42)
 58 | 
 59 | 
 60 | def train_and_val(
 61 |     X_train: pd.DataFrame,
 62 |     X_test: pd.DataFrame,
 63 |     y_train: pd.DataFrame,
 64 |     y_test: pd.DataFrame,
 65 | ) -> RandomForestClassifier:
 66 |     """
 67 |     This function train and validate the model
 68 | 
 69 |     :param X_train: train data
 70 |     :param X_test: test data
 71 |     :param y_train: train labels
 72 |     :param y_test: test labels
 73 |     :return: model
 74 |     """
 75 |     grid_search = {
 76 |         "bootstrap": [True, False],
 77 |         "max_depth": [
 78 |             10,
 79 |             30,
 80 |             50,
 81 |             100,
 82 |         ],
 83 |         "max_features": ["log2", "sqrt"],
 84 |         "min_samples_leaf": [1, 2, 4],
 85 |         "min_samples_split": [2, 5, 10],
 86 |         "n_estimators": [50, 100, 500],
 87 |     }
 88 | 
 89 |     rf = RandomForestClassifier()
 90 |     rfc = RandomizedSearchCV(
 91 |         estimator=rf,
 92 |         param_distributions=grid_search,
 93 |         n_iter=100,
 94 |         cv=3,
 95 |         verbose=2,
 96 |         random_state=42,
 97 |         n_jobs=-1,
 98 |     )
 99 |     rfc.fit(X_train, y_train)
100 | 
101 |     # Make predictions on the test set
102 |     y_pred = rfc.best_estimator_.predict(X_test)
103 | 
104 |     # Calculate evaluation metrics
105 |     accuracy = accuracy_score(y_test, y_pred)
106 |     precision = precision_score(y_test, y_pred)
107 |     recall = recall_score(y_test, y_pred)
108 |     f1 = f1_score(y_test, y_pred)
109 | 
110 |     # Print the evaluation metrics
111 |     print("Accuracy:", accuracy)
112 |     print("Precision:", precision)
113 |     print("Recall:", recall)
114 |     print("F1 Score:", f1)
115 |     return rfc.best_estimator_
116 | 


--------------------------------------------------------------------------------
/project_setup.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2024 Iguazio
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #   http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | 
 16 | import mlrun
 17 | import os
 18 | from mlrun.datastore.datastore_profile import DatastoreProfileRedis, DatastoreProfileKafkaStream, register_temporary_client_datastore_profile
 19 | 
 20 | def setup(project: mlrun.projects.MlrunProject) -> mlrun.projects.MlrunProject:
 21 |     """
 22 |     Creating the project for this demo. This function is expected to be called automatically when
 23 |     calling the function `mlrun.get_or_create_project`.
 24 | 
 25 |     :returns: a fully prepared project for this demo.
 26 |     """
 27 |     # Set the project git source:
 28 |     source = project.get_param(key="source")
 29 |     if not source:
 30 |         source = "git://github.com/mlrun/demo-fraud.git"
 31 |     print(f"Project Source: {source}")
 32 |     project.set_source(source=source, pull_at_runtime=True)
 33 |     
 34 |     if project.get_param("pre_load_data"):
 35 |         print("pre_load_data")
 36 | 
 37 |     # Refresh MLRun hub to the most up-to-date version:
 38 |     mlrun.get_run_db().get_hub_catalog(source_name="default", force_refresh=True)
 39 | 
 40 |     # Set the functions:
 41 |     
 42 |     project.set_function(
 43 |         func="src/get_vector.py",
 44 |         name="get-vector",
 45 |         handler="get_offline_features",
 46 |         kind="job",
 47 |     ).save()
 48 |     project.set_function(f"db://{project.name}/get-vector", name="get-vector")
 49 |     
 50 |     _set_function(
 51 |         project=project,
 52 |         func="hub://feature_selection",
 53 |         name="feature-selection",
 54 |         kind="job",
 55 |     )
 56 | 
 57 |     _set_function(
 58 |         project=project,
 59 |         func="hub://auto_trainer",
 60 |         name="train",
 61 |         kind="job",
 62 |     )
 63 |     _set_function(
 64 |         project=project,
 65 |         func="hub://auto_trainer",
 66 |         name="evaluate",
 67 |         kind="job",
 68 |     )
 69 | 
 70 |     _set_function(
 71 |         project=project,
 72 |         func="hub://v2_model_server",
 73 |         name="serving",
 74 |         kind="serving",
 75 |     )
 76 | 
 77 |     # Set the training workflow:
 78 |     project.set_workflow("main", "src/train_workflow.py", embed=True)
 79 | 
 80 |     # Set data source for feature store
 81 |     _set_datasource(project)
 82 |     
 83 |     # Save and return the project:
 84 |     project.save()
 85 |     return project
 86 | 
 87 | 
 88 | def _set_function(
 89 |     project: mlrun.projects.MlrunProject,
 90 |     func: str,
 91 |     name: str,
 92 |     kind: str,
 93 |     node_name: str = None,
 94 |     image: str = None,
 95 | ):
 96 |     # Set the given function:
 97 |     with_repo = not func.startswith("hub://")
 98 |     mlrun_function = project.set_function(
 99 |         func=func,
100 |         name=name,
101 |         kind=kind,
102 |         with_repo=with_repo,
103 |         image=image,
104 |     )
105 |     if node_name:
106 |         mlrun_function.with_node_selection(node_name=node_name)
107 |     # Save:
108 |     mlrun_function.save()
109 | 
110 |     project.set_function(f"db://{project.name}/{name}", name=name)
111 | 
112 | 
def _set_datasource(project: mlrun.projects.MlrunProject):
    """
    Register the "fraud-tsdb" and "fraud-stream" datastore profiles on the project
    and set the stream-related project parameters.

    In community-edition mode (``mlrun.mlconf.is_ce_mode()``) the profiles are
    Redis and Kafka, configured from the ``REDIS_URI``, ``REDIS_USER``,
    ``REDIS_PASSWORD``, ``KAFKA_SERVICE_HOST`` and ``KAFKA_SERVICE_PORT``
    environment variables; these values are also stored as project secrets.
    Otherwise both profiles are V3IO profiles using the configured access key.

    Finally, every string-valued project parameter has ``{{run.project}}``
    replaced by the project name.

    :param project: the project to configure

    :raises AssertionError: in community-edition mode when ``REDIS_URI`` is not set
    """
    if mlrun.mlconf.is_ce_mode():
        # If running on community edition - use redis and kafka.
        redis_uri = os.environ.get('REDIS_URI', None)
        redis_user = os.environ.get('REDIS_USER', '')
        redis_password = os.environ.get('REDIS_PASSWORD', '')
        # The kafka host always has a value (the in-cluster service name is the
        # default), so only the redis endpoint needs validating.
        kafka_host = os.environ.get(
            'KAFKA_SERVICE_HOST',
            f"kafka-stream.{os.environ.get('MLRUN_NAMESPACE', 'mlrun')}.svc.cluster.local",
        )
        kafka_port = os.environ.get('KAFKA_SERVICE_PORT', '9092')
        assert redis_uri is not None, "ERROR - When running on community edition, redis endpoint is required to run fraud-demo."
        kafka_uri = f"{kafka_host}:{kafka_port}"

        project.set_secrets({'REDIS_URI': redis_uri,
                             'REDIS_USER': redis_user,
                             'REDIS_PASSWORD': redis_password,
                             'KAFKA_SERVICE_HOST': kafka_host,
                             'KAFKA_SERVICE_PORT': kafka_port})

        # Redis datastore-profile
        tsdb_profile = DatastoreProfileRedis(
            name="fraud-tsdb",
            endpoint_url=redis_uri,
            username=redis_user,
            password=redis_password,
        )
        # Kafka datastore-profile
        stream_profile = DatastoreProfileKafkaStream(
            name='fraud-stream',
            brokers=kafka_uri,
            topics=[],
        )
        project.params['online_target'] = "ds://fraud-tsdb"
        for fs in ['transactions', 'events', 'labels']:
            project.params[fs] = os.path.join(mlrun.mlconf.artifact_path, fs + '.pq')

        # dealing with kafka
        project.params['transaction_stream'] = f'kafka://{kafka_uri}?topic=transactions'
        project.params['events_stream'] = f'kafka://{kafka_uri}?topic=events'

        register_temporary_client_datastore_profile(tsdb_profile)
        register_temporary_client_datastore_profile(stream_profile)

    else:
        # V3IO profiles are only needed (and only built) outside community edition.
        tsdb_profile = mlrun.datastore.DatastoreProfileV3io(name="fraud-tsdb", v3io_access_key=mlrun.mlconf.get_v3io_access_key())
        stream_profile = mlrun.datastore.DatastoreProfileV3io(name="fraud-stream", v3io_access_key=mlrun.mlconf.get_v3io_access_key())
        project.params['transaction_stream'] = f'v3io:///projects/{project.name}/streams/transaction'
        project.params['events_stream'] = f'v3io:///projects/{project.name}/streams/events'

    project.register_datastore_profile(tsdb_profile)
    project.register_datastore_profile(stream_profile)

    # Only string parameters can hold the placeholder; other values (for example a
    # boolean "pre_load_data") have no .replace() and are left as they are.
    for key, value in project.params.items():
        if isinstance(value, str):
            project.params[key] = value.replace('{{run.project}}', project.name)
168 |         
169 | 


--------------------------------------------------------------------------------
/src/train_workflow.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2024 Iguazio
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | #
 15 | import mlrun
 16 | from kfp import dsl
 17 | import os
 18 | 
 19 | from mlrun.model import HyperParamOptions
 20 | from mlrun.datastore.datastore_profile import DatastoreProfileKafkaStream, DatastoreProfileTDEngine
 21 | 
 22 | 
 23 | # Create a Kubeflow Pipelines pipeline
 24 | @dsl.pipeline(
 25 |     name="Fraud Detection Pipeline",
 26 |     description="Detecting fraud from a transactions dataset",
 27 | )
 28 | def pipeline(vector_name="transactions-fraud", features=[], label_column="is_error"):
 29 |     """
 30 |     This pipeline will train a model to detect fraud from a transactions dataset.
 31 |     :param vector_name: The name of the feature vector to use
 32 |     :param features: A list of features to use
 33 |     :param label_column: The name of the label column
 34 | 
 35 |     :returns: None
 36 |     """
 37 |     
 38 |     # Get the project
 39 |     project = mlrun.get_current_project()  
 40 | 
 41 |     # Get FeatureVector
 42 |     get_vector_func = project.get_function("get-vector")
 43 |     get_vector_run = project.run_function(
 44 |         get_vector_func,
 45 |         name="get-vector",
 46 |         params={
 47 |             "feature_vector": vector_name,
 48 |             "features": features,
 49 |             "label_feature": label_column,
 50 |             "target": {"name": "parquet", "kind": "parquet"},
 51 |             "update_stats": True,
 52 |         },
 53 |         outputs = [
 54 |             "feature_vector"
 55 |         ]
 56 |     )
 57 |     
 58 |     # Feature selection
 59 |     feature_selection_func = project.get_function("feature-selection")
 60 |     feature_selection_run = project.run_function(
 61 |         feature_selection_func,
 62 |         name="feature-selection",
 63 |         params={
 64 |             "output_vector_name": "short",
 65 |             "label_column": project.get_param("label_column", "label"),
 66 |             "k": 18,
 67 |             "min_votes": 2,
 68 |             "ignore_type_errors": True,
 69 |         },
 70 |         inputs={
 71 |             "df_artifact": project.get_artifact_uri(vector_name, "feature-vector")
 72 |         },
 73 |         outputs=[
 74 |             "feature_scores",
 75 |             "selected_features_count",
 76 |             "top_features_vector",
 77 |             "selected_features",
 78 |         ],
 79 |     ).after(get_vector_run)
 80 | 
 81 |     # train with hyper-paremeters
 82 |     train_func = project.get_function("train")
 83 |     train_run = project.run_function(
 84 |         train_func,
 85 |         name="train",
 86 |         handler="train",
 87 |         params={
 88 |             "sample": -1,
 89 |             "label_column": project.get_param("label_column", "label"),
 90 |             "test_size": 0.10,
 91 |         },
 92 |         hyperparams={
 93 |             "model_name": [
 94 |                 "transaction_fraud_rf",
 95 |                 "transaction_fraud_xgboost",
 96 |                 "transaction_fraud_adaboost",
 97 |             ],
 98 |             "model_class": [
 99 |                 "sklearn.ensemble.RandomForestClassifier",
100 |                 "sklearn.linear_model.LogisticRegression",
101 |                 "sklearn.ensemble.AdaBoostClassifier",
102 |             ],
103 |         },
104 |         hyper_param_options=HyperParamOptions(
105 |             strategy="list", selector="max.accuracy"
106 |         ),
107 |         inputs={"dataset": feature_selection_run.outputs["top_features_vector"]},
108 |         outputs=["model", "test_set"],
109 |     ).after(feature_selection_run)
110 | 
111 |     # test and visualize your model
112 |     test_func = project.get_function("evaluate")
113 |     test_run = mlrun.run_function(
114 |         test_func,
115 |         name="evaluate",
116 |         handler="evaluate",
117 |         params={
118 |             "label_columns": project.get_param("label_column", "label"),
119 |             "model": train_run.outputs["model"],
120 |             "drop_columns": project.get_param("label_column", "label"),
121 |         },
122 |         inputs={"dataset": train_run.outputs["test_set"]},
123 |     ).after(train_run)
124 | 
125 |     # Create a serverless function from the hub, add a feature enrichment router
126 |     # This will enrich and impute the request with data from the feature vector
127 |     serving_func = project.get_function("serving")
128 |     serving_func.set_topology(
129 |         "router",
130 |         mlrun.serving.routers.EnrichmentModelRouter(
131 |             feature_vector_uri="short", impute_policy={"*": "$mean"}
132 |         ),
133 |         exist_ok=True,
134 |     )
135 | 
136 |     # Enable model monitoring
137 |     serving_func.set_tracking()
138 | 
139 |     if mlrun.mlconf.is_ce_mode():
140 |         # Use default service
141 |         tsdb_profile = DatastoreProfileTDEngine(name="fraud-monitoring-tsdb",
142 |                                         user='root',
143 |                                         password='taosdata',
144 |                                         host=f"tdengine-tsdb.{os.environ.get('MLRUN_NAMESPACE', 'mlrun')}.svc.cluster.local",
145 |                                         port='6041')
146 |         project.register_datastore_profile(tsdb_profile)
147 | 
148 |         kafka_host = os.environ.get('KAFKA_SERVICE_HOST', f"kafka-stream.{os.environ.get('MLRUN_NAMESPACE', 'mlrun')}.svc.cluster.local")
149 |         kafka_port = os.environ.get('KAFKA_SERVICE_PORT', '9092')
150 | 
151 |         stream_profile = DatastoreProfileKafkaStream(
152 |             name='fraud-monitoring-stream',
153 |             brokers=f"{kafka_host}:{kafka_port}",
154 |             topics=[],
155 |         )
156 |         project.register_datastore_profile(stream_profile)
157 | 
158 |         project.set_model_monitoring_credentials(
159 |             tsdb_profile_name=tsdb_profile.name,
160 |             stream_profile_name=stream_profile.name,
161 |             replace_creds=True
162 |         )
163 | 
164 |     else:
165 |         project.set_model_monitoring_credentials(
166 |             tsdb_profile_name='fraud-tsdb',
167 |             stream_profile_name='fraud-stream',
168 |             replace_creds=True
169 |         )
170 | 
171 |     serving_func.save()
172 |     # deploy the model server, pass a list of trained models to serve
173 |     deploy = project.deploy_function(
174 |         serving_func,
175 |         models=[{"key": "fraud", "model_path": train_run.outputs["model"]}],
176 |     ).after(train_run)
177 | 


--------------------------------------------------------------------------------
/images/feature-store-graph.svg:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
  3 |  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
  4 | <!-- Generated by graphviz version 2.40.1 (20161225.0304)
  5 |  -->
  6 | <!-- Title: mlrun&#45;flow Pages: 1 -->
  7 | <svg width="1012pt" height="98pt"
  8 |  viewBox="0.00 0.00 1012.34 98.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
  9 | <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 94)">
 10 | <title>mlrun&#45;flow</title>
 11 | <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-94 1008.3418,-94 1008.3418,4 -4,4"/>
 12 | <!-- _start -->
 13 | <g id="node1" class="node">
 14 | <title>_start</title>
 15 | <polygon fill="#d3d3d3" stroke="#000000" points="38.5476,-27.0493 40.698,-27.1479 42.8263,-27.2953 44.9236,-27.4913 46.9815,-27.7353 48.9917,-28.0266 50.9463,-28.3645 52.8377,-28.7479 54.6587,-29.1759 56.4025,-29.6472 58.0628,-30.1606 59.634,-30.7147 61.1107,-31.308 62.4882,-31.9388 63.7625,-32.6054 64.9302,-33.3059 65.9882,-34.0385 66.9343,-34.8012 67.7669,-35.5918 68.4849,-36.4082 69.0878,-37.2481 69.5758,-38.1093 69.9496,-38.9894 70.2102,-39.886 70.3595,-40.7965 70.3997,-41.7186 70.3334,-42.6497 70.1636,-43.5873 69.8937,-44.5287 69.5276,-45.4713 69.0691,-46.4127 68.5225,-47.3503 67.8923,-48.2814 67.1831,-49.2035 66.3996,-50.114 65.5464,-51.0106 64.6285,-51.8907 63.6504,-52.7519 62.617,-53.5918 61.5329,-54.4082 60.4024,-55.1988 59.2299,-55.9615 58.0197,-56.6941 56.7755,-57.3946 55.5012,-58.0612 54.2002,-58.692 52.8757,-59.2853 51.5309,-59.8394 50.1684,-60.3528 48.7908,-60.8241 47.4003,-61.2521 45.9989,-61.6355 44.5886,-61.9734 43.1708,-62.2647 41.7472,-62.5087 40.3189,-62.7047 38.8872,-62.8521 37.4531,-62.9507 36.0175,-63 34.5815,-63 33.146,-62.9507 31.7119,-62.8521 30.2801,-62.7047 28.8519,-62.5087 27.4282,-62.2647 26.0105,-61.9734 24.6001,-61.6355 23.1988,-61.2521 21.8083,-60.8241 20.4306,-60.3528 19.0681,-59.8394 17.7233,-59.2853 16.3989,-58.692 15.0979,-58.0612 13.8236,-57.3946 12.5794,-56.6941 11.3691,-55.9615 10.1967,-55.1988 9.0662,-54.4082 7.982,-53.5918 6.9486,-52.7519 5.9706,-51.8907 5.0526,-51.0106 4.1995,-50.114 3.4159,-49.2035 2.7067,-48.2814 2.0765,-47.3503 1.53,-46.4127 1.0715,-45.4713 .7053,-44.5287 .4355,-43.5873 .2657,-42.6497 .1993,-41.7186 .2395,-40.7965 .3888,-39.886 .6495,-38.9894 1.0232,-38.1093 1.5112,-37.2481 2.1141,-36.4082 2.8321,-35.5918 3.6647,-34.8012 4.6109,-34.0385 5.6689,-33.3059 6.8365,-32.6054 8.1108,-31.9388 9.4884,-31.308 10.9651,-30.7147 12.5362,-30.1606 14.1966,-29.6472 15.9404,-29.1759 17.7614,-28.7479 19.6528,-28.3645 21.6074,-28.0266 23.6176,-27.7353 25.6755,-27.4913 27.7728,-27.2953 29.901,-27.1479 32.0515,-27.0493 
34.2154,-27 36.3837,-27 38.5476,-27.0493"/>
 16 | <text text-anchor="middle" x="35.2995" y="-41.3" font-family="Times,serif" font-size="14.00" fill="#000000">start</text>
 17 | </g>
 18 | <!-- MyMap -->
 19 | <g id="node2" class="node">
 20 | <title>MyMap</title>
 21 | <ellipse fill="none" stroke="#000000" cx="152.745" cy="-45" rx="46.2923" ry="18"/>
 22 | <text text-anchor="middle" x="152.745" y="-41.3" font-family="Times,serif" font-size="14.00" fill="#000000">MyMap</text>
 23 | </g>
 24 | <!-- _start&#45;&gt;MyMap -->
 25 | <g id="edge1" class="edge">
 26 | <title>_start&#45;&gt;MyMap</title>
 27 | <path fill="none" stroke="#000000" d="M69.9018,-45C78.1609,-45 87.1905,-45 96.1538,-45"/>
 28 | <polygon fill="#000000" stroke="#000000" points="96.2009,-48.5001 106.2009,-45 96.2009,-41.5001 96.2009,-48.5001"/>
 29 | </g>
 30 | <!-- storey.Extend -->
 31 | <g id="node3" class="node">
 32 | <title>storey.Extend</title>
 33 | <ellipse fill="none" stroke="#000000" cx="310.2842" cy="-45" rx="75.2868" ry="18"/>
 34 | <text text-anchor="middle" x="310.2842" y="-41.3" font-family="Times,serif" font-size="14.00" fill="#000000">storey.Extend</text>
 35 | </g>
 36 | <!-- MyMap&#45;&gt;storey.Extend -->
 37 | <g id="edge2" class="edge">
 38 | <title>MyMap&#45;&gt;storey.Extend</title>
 39 | <path fill="none" stroke="#000000" d="M199.1598,-45C207.1838,-45 215.7499,-45 224.4316,-45"/>
 40 | <polygon fill="#000000" stroke="#000000" points="224.6509,-48.5001 234.6509,-45 224.6508,-41.5001 224.6509,-48.5001"/>
 41 | </g>
 42 | <!-- filter -->
 43 | <g id="node4" class="node">
 44 | <title>filter</title>
 45 | <ellipse fill="none" stroke="#000000" cx="455.4745" cy="-45" rx="33.5952" ry="18"/>
 46 | <text text-anchor="middle" x="455.4745" y="-41.3" font-family="Times,serif" font-size="14.00" fill="#000000">filter</text>
 47 | </g>
 48 | <!-- storey.Extend&#45;&gt;filter -->
 49 | <g id="edge3" class="edge">
 50 | <title>storey.Extend&#45;&gt;filter</title>
 51 | <path fill="none" stroke="#000000" d="M385.8561,-45C394.6736,-45 403.4325,-45 411.6112,-45"/>
 52 | <polygon fill="#000000" stroke="#000000" points="411.6214,-48.5001 421.6213,-45 411.6213,-41.5001 411.6214,-48.5001"/>
 53 | </g>
 54 | <!-- FeaturesetValidator -->
 55 | <g id="node5" class="node">
 56 | <title>FeaturesetValidator</title>
 57 | <ellipse fill="none" stroke="#000000" cx="627.9624" cy="-45" rx="102.8821" ry="18"/>
 58 | <text text-anchor="middle" x="627.9624" y="-41.3" font-family="Times,serif" font-size="14.00" fill="#000000">FeaturesetValidator</text>
 59 | </g>
 60 | <!-- filter&#45;&gt;FeaturesetValidator -->
 61 | <g id="edge4" class="edge">
 62 | <title>filter&#45;&gt;FeaturesetValidator</title>
 63 | <path fill="none" stroke="#000000" d="M489.5525,-45C497.3395,-45 506.0352,-45 515.1713,-45"/>
 64 | <polygon fill="#000000" stroke="#000000" points="515.2357,-48.5001 525.2357,-45 515.2356,-41.5001 515.2357,-48.5001"/>
 65 | </g>
 66 | <!-- Aggregates -->
 67 | <g id="node6" class="node">
 68 | <title>Aggregates</title>
 69 | <ellipse fill="none" stroke="#000000" cx="830.9976" cy="-45" rx="64.189" ry="18"/>
 70 | <text text-anchor="middle" x="830.9976" y="-41.3" font-family="Times,serif" font-size="14.00" fill="#000000">Aggregates</text>
 71 | </g>
 72 | <!-- FeaturesetValidator&#45;&gt;Aggregates -->
 73 | <g id="edge5" class="edge">
 74 | <title>FeaturesetValidator&#45;&gt;Aggregates</title>
 75 | <path fill="none" stroke="#000000" d="M730.6696,-45C739.3404,-45 747.9997,-45 756.3944,-45"/>
 76 | <polygon fill="#000000" stroke="#000000" points="756.4857,-48.5001 766.4857,-45 756.4857,-41.5001 756.4857,-48.5001"/>
 77 | </g>
 78 | <!-- parquet -->
 79 | <g id="node7" class="node">
 80 | <title>parquet</title>
 81 | <path fill="none" stroke="#000000" d="M1004.3418,-86.7273C1004.3418,-88.5331 987.982,-90 967.8418,-90 947.7017,-90 931.3418,-88.5331 931.3418,-86.7273 931.3418,-86.7273 931.3418,-57.2727 931.3418,-57.2727 931.3418,-55.4669 947.7017,-54 967.8418,-54 987.982,-54 1004.3418,-55.4669 1004.3418,-57.2727 1004.3418,-57.2727 1004.3418,-86.7273 1004.3418,-86.7273"/>
 82 | <path fill="none" stroke="#000000" d="M1004.3418,-86.7273C1004.3418,-84.9214 987.982,-83.4545 967.8418,-83.4545 947.7017,-83.4545 931.3418,-84.9214 931.3418,-86.7273"/>
 83 | <text text-anchor="middle" x="967.8418" y="-68.3" font-family="Times,serif" font-size="14.00" fill="#000000">parquet</text>
 84 | </g>
 85 | <!-- Aggregates&#45;&gt;parquet -->
 86 | <g id="edge6" class="edge">
 87 | <title>Aggregates&#45;&gt;parquet</title>
 88 | <path fill="none" stroke="#000000" d="M883.9051,-55.4389C896.2444,-57.8735 909.2943,-60.4483 921.3324,-62.8235"/>
 89 | <polygon fill="#000000" stroke="#000000" points="920.6803,-66.2622 931.1687,-64.7642 922.0353,-59.3946 920.6803,-66.2622"/>
 90 | </g>
 91 | <!-- nosql -->
 92 | <g id="node8" class="node">
 93 | <title>nosql</title>
 94 | <path fill="none" stroke="#000000" d="M995.3418,-32.7273C995.3418,-34.5331 983.0159,-36 967.8418,-36 952.6678,-36 940.3418,-34.5331 940.3418,-32.7273 940.3418,-32.7273 940.3418,-3.2727 940.3418,-3.2727 940.3418,-1.4669 952.6678,0 967.8418,0 983.0159,0 995.3418,-1.4669 995.3418,-3.2727 995.3418,-3.2727 995.3418,-32.7273 995.3418,-32.7273"/>
 95 | <path fill="none" stroke="#000000" d="M995.3418,-32.7273C995.3418,-30.9214 983.0159,-29.4545 967.8418,-29.4545 952.6678,-29.4545 940.3418,-30.9214 940.3418,-32.7273"/>
 96 | <text text-anchor="middle" x="967.8418" y="-14.3" font-family="Times,serif" font-size="14.00" fill="#000000">nosql</text>
 97 | </g>
 98 | <!-- Aggregates&#45;&gt;nosql -->
 99 | <g id="edge7" class="edge">
100 | <title>Aggregates&#45;&gt;nosql</title>
101 | <path fill="none" stroke="#000000" d="M883.9051,-34.5611C899.3986,-31.5042 916.0123,-28.2262 930.3369,-25.3999"/>
102 | <polygon fill="#000000" stroke="#000000" points="931.1806,-28.801 940.314,-23.4314 929.8256,-21.9334 931.1806,-28.801"/>
103 | </g>
104 | </g>
105 | </svg>
106 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/05-real-time-serving-pipeline.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Real-time Application Pipeline\n",
  8 |     "\n",
  9 |     "In this example, we define an application pipeline that accepts a user request, enriches the request with real-time features from the feature store, and feeds the features into a three-legged ensemble that uses the newly trained models.\n",
 10 |     "\n",
 11 |     "You would typically need to implement and deploy multiple microservices and complex logic to build such an application pipeline. But, with MLRun, you can define it in a few lines of code and deploy it automatically into elastic serverless functions. In addition, the MLRun serving framework will automatically support real-time feature imputing, model monitoring, and so on without requiring extra coding."
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "metadata": {},
 18 |    "outputs": [
 19 |     {
 20 |      "name": "stdout",
 21 |      "output_type": "stream",
 22 |      "text": [
 23 |       "The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n"
 24 |      ]
 25 |     }
 26 |    ],
 27 |    "source": [
 28 |     "import mlrun\n",
 29 |     "\n",
 30 |     "project = mlrun.get_or_create_project(\n",
 31 |     "    name=\"fraud-demo\",\n",
 32 |     "    context=\"./\",\n",
 33 |     "    user_project=True,\n",
 34 |     "    )"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "## Defining a custom serving class\n",
 42 |     "\n",
 43 |     "MLRun has many built-in model-serving classes for different frameworks (Sklearn, Xgboost, PyTorch, TensorFlow, ONNX, Hugging Face models, and so on). You can also build your custom model serving class as demonstrated in Example 7-24. The serving class must support the load() method for loading the model and the predict() method for making a prediction. You can read MLRun documentation to see all the hooks and advanced usage."
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 2,
 49 |    "metadata": {},
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "# mlrun: start-code"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": 3,
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "import numpy as np\n",
 62 |     "from cloudpickle import load\n",
 63 |     "from mlrun.serving.v2_serving import V2ModelServer\n",
 64 |     "\n",
 65 |     "class ClassifierModel(V2ModelServer):\n",
 66 |     "    \n",
 67 |     "    def load(self):\n",
 68 |     "        \"\"\"load and initialize the model and/or other elements\"\"\"\n",
 69 |     "        model_file, extra_data = self.get_model('.pkl')\n",
 70 |     "        self.model = load(open(model_file, 'rb'))\n",
 71 |     "        \n",
 72 |     "    def predict(self, body: dict) -> list:\n",
 73 |     "        \"\"\"Generate model predictions from sample\"\"\"\n",
 74 |     "        print(f\"Input -> {body['inputs']}\")\n",
 75 |     "        feats = np.asarray(body['inputs'])\n",
 76 |     "        result: np.ndarray = self.model.predict(feats)\n",
 77 |     "        return result.tolist()"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 4,
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "# mlrun: end-code"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "markdown",
 91 |    "metadata": {},
 92 |    "source": [
 93 |     "## Building an Application Pipeline with Enrichment and Ensemble\n",
 94 |     "\n",
 95 |     "MLRun serving can produce managed real-time serverless pipelines from various tasks, including MLRun models or standard model files. The pipelines use the Nuclio real-time serverless engine, which can be deployed anywhere. Nuclio is a high-performance open-source serverless framework focused on data, I/O, and compute-intensive workloads.\n",
 96 |     "\n",
 97 |     "The EnrichmentVotingEnsemble router class auto-enriches the request with data from the feature store. The router input accepts a list of inference requests (each request can be a dict or list of incoming features/keys). It enriches the request with data from the specified feature vector (feature_vector_uri), forwards the vector to one or more models in an ensemble, and returns an aggregated prediction value (for example, the average result across the three models).\n",
 98 |     "The features can often have null values (None, NaN, Inf). The Enrichment_ routers can substitute the null value with a fixed or statistical value per feature. This is done through the `impute_policy` parameter, which accepts the impute policy per feature (where * is used to specify the default). The value can be a fixed number for constants or $mean, $max, $min, $std, $count for statistical values to substitute the value with the equivalent feature stats (taken from the feature store).\n",
 99 |     "The code in Example 7-24 defines a new serving function with the ClassifierModel class code (in serving.py) and a router topology (using the EnrichmentVotingEnsemble router class) with three child models."
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 5,
105 |    "metadata": {},
106 |    "outputs": [
107 |     {
108 |      "data": {
109 |       "text/plain": [
110 |        "[('transaction_fraud_rf',\n",
111 |        "  {'updated': '2024-10-08 13:21:47.319818+00:00',\n",
112 |        "   'project': 'fraud-demo-felipe',\n",
113 |        "   'key': 'model',\n",
114 |        "   'tree': 'e27f0d6d-8f14-4642-abee-43b9867631ef',\n",
115 |        "   'tag': 'latest',\n",
116 |        "   'labels': {'workflow-id': 'e27f0d6d-8f14-4642-abee-43b9867631ef',\n",
117 |        "    'framework': 'sklearn'},\n",
118 |        "   'iter': 1,\n",
119 |        "   'hash': 'd31c0e672e4ac97438e866612c9fb02c1a3a1732'}),\n",
120 |        " ('transaction_fraud_xgboost',\n",
121 |        "  {'updated': '2024-10-08 13:21:18.851606+00:00',\n",
122 |        "   'project': 'fraud-demo-felipe',\n",
123 |        "   'key': 'model',\n",
124 |        "   'tree': 'e27f0d6d-8f14-4642-abee-43b9867631ef',\n",
125 |        "   'tag': 'latest',\n",
126 |        "   'labels': {'workflow-id': 'e27f0d6d-8f14-4642-abee-43b9867631ef',\n",
127 |        "    'framework': 'sklearn'},\n",
128 |        "   'iter': 2,\n",
129 |        "   'hash': 'd40c64ec2d081899089ec7e34288c87a175a848f'}),\n",
130 |        " ('transaction_fraud_adaboost',\n",
131 |        "  {'updated': '2024-10-08 13:21:22.048244+00:00',\n",
132 |        "   'project': 'fraud-demo-felipe',\n",
133 |        "   'key': 'model',\n",
134 |        "   'tree': 'e27f0d6d-8f14-4642-abee-43b9867631ef',\n",
135 |        "   'tag': 'latest',\n",
136 |        "   'labels': {'workflow-id': 'e27f0d6d-8f14-4642-abee-43b9867631ef',\n",
137 |        "    'framework': 'sklearn'},\n",
138 |        "   'iter': 3,\n",
139 |        "   'hash': 'db02f4acce087779cbc634285ea1647206a5fc84'})]"
140 |       ]
141 |      },
142 |      "execution_count": 5,
143 |      "metadata": {},
144 |      "output_type": "execute_result"
145 |     }
146 |    ],
147 |    "source": [
148 |     "[(m.spec.db_key, m.metadata.to_dict()) for m in project.list_models('', tag='latest')]"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 6,
154 |    "metadata": {},
155 |    "outputs": [
156 |     {
157 |      "data": {
158 |       "image/svg+xml": [
159 |        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
160 |        "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
161 |        " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
162 |        "<!-- Generated by graphviz version 2.43.0 (0)\n",
163 |        " -->\n",
164 |        "<!-- Title: mlrun&#45;flow Pages: 1 -->\n",
165 |        "<svg width=\"800pt\" height=\"196pt\"\n",
166 |        " viewBox=\"0.00 0.00 799.73 196.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
167 |        "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 192)\">\n",
168 |        "<title>mlrun&#45;flow</title>\n",
169 |        "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-192 795.73,-192 795.73,4 -4,4\"/>\n",
170 |        "<!-- _start -->\n",
171 |        "<g id=\"node1\" class=\"node\">\n",
172 |        "<title>_start</title>\n",
173 |        "<polygon fill=\"lightgrey\" stroke=\"black\" points=\"364.89,-152.05 367.04,-152.15 369.17,-152.3 371.26,-152.49 373.32,-152.74 375.33,-153.03 377.29,-153.36 379.18,-153.75 381,-154.18 382.74,-154.65 384.4,-155.16 385.98,-155.71 387.45,-156.31 388.83,-156.94 390.1,-157.61 391.27,-158.31 392.33,-159.04 393.28,-159.8 394.11,-160.59 394.83,-161.41 395.43,-162.25 395.92,-163.11 396.29,-163.99 396.55,-164.89 396.7,-165.8 396.74,-166.72 396.67,-167.65 396.5,-168.59 396.23,-169.53 395.87,-170.47 395.41,-171.41 394.86,-172.35 394.23,-173.28 393.52,-174.2 392.74,-175.11 391.89,-176.01 390.97,-176.89 389.99,-177.75 388.96,-178.59 387.87,-179.41 386.74,-180.2 385.57,-180.96 384.36,-181.69 383.12,-182.39 381.84,-183.06 380.54,-183.69 379.22,-184.29 377.87,-184.84 376.51,-185.35 375.13,-185.82 373.74,-186.25 372.34,-186.64 370.93,-186.97 369.51,-187.26 368.09,-187.51 366.66,-187.7 365.23,-187.85 363.79,-187.95 362.36,-188 360.92,-188 359.49,-187.95 358.05,-187.85 356.62,-187.7 355.19,-187.51 353.77,-187.26 352.35,-186.97 350.94,-186.64 349.54,-186.25 348.15,-185.82 346.77,-185.35 345.41,-184.84 344.06,-184.29 342.74,-183.69 341.44,-183.06 340.16,-182.39 338.92,-181.69 337.71,-180.96 336.54,-180.2 335.41,-179.41 334.32,-178.59 333.29,-177.75 332.31,-176.89 331.39,-176.01 330.54,-175.11 329.76,-174.2 329.05,-173.28 328.42,-172.35 327.87,-171.41 327.41,-170.47 327.05,-169.53 326.78,-168.59 326.61,-167.65 326.54,-166.72 326.58,-165.8 326.73,-164.89 326.99,-163.99 327.36,-163.11 327.85,-162.25 328.46,-161.41 329.17,-160.59 330.01,-159.8 330.95,-159.04 332.01,-158.31 333.18,-157.61 334.45,-156.94 335.83,-156.31 337.31,-155.71 338.88,-155.16 340.54,-154.65 342.28,-154.18 344.1,-153.75 345.99,-153.36 347.95,-153.03 349.96,-152.74 352.02,-152.49 354.11,-152.3 356.24,-152.15 358.39,-152.05 360.56,-152 362.72,-152 364.89,-152.05\"/>\n",
174 |        "<text text-anchor=\"middle\" x=\"361.64\" y=\"-166.3\" font-family=\"Times,serif\" font-size=\"14.00\">start</text>\n",
175 |        "</g>\n",
176 |        "<g id=\"node2\" class=\"node\">\n",
177 |        "<title></title>\n",
178 |        "<polygon fill=\"none\" stroke=\"black\" points=\"388.64,-86.54 388.64,-101.46 372.82,-112 350.46,-112 334.64,-101.46 334.64,-86.54 350.46,-76 372.82,-76 388.64,-86.54\"/>\n",
179 |        "<polygon fill=\"none\" stroke=\"black\" points=\"392.64,-84.4 392.64,-103.6 374.04,-116 349.25,-116 330.64,-103.6 330.64,-84.4 349.25,-72 374.04,-72 392.64,-84.4\"/>\n",
180 |        "</g>\n",
181 |        "<!-- _start&#45;&gt; -->\n",
182 |        "<g id=\"edge1\" class=\"edge\">\n",
183 |        "<title>_start&#45;&gt;</title>\n",
184 |        "<path fill=\"none\" stroke=\"black\" d=\"M361.64,-151.84C361.64,-144.16 361.64,-134.88 361.64,-126.05\"/>\n",
185 |        "<polygon fill=\"black\" stroke=\"black\" points=\"365.14,-126.03 361.64,-116.03 358.14,-126.03 365.14,-126.03\"/>\n",
186 |        "</g>\n",
187 |        "<!-- transaction_fraud_rf -->\n",
188 |        "<g id=\"node3\" class=\"node\">\n",
189 |        "<title>transaction_fraud_rf</title>\n",
190 |        "<ellipse fill=\"none\" stroke=\"black\" cx=\"104.64\" cy=\"-18\" rx=\"104.78\" ry=\"18\"/>\n",
191 |        "<text text-anchor=\"middle\" x=\"104.64\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\">transaction_fraud_rf</text>\n",
192 |        "</g>\n",
193 |        "<!-- &#45;&gt;transaction_fraud_rf -->\n",
194 |        "<g id=\"edge2\" class=\"edge\">\n",
195 |        "<title>&#45;&gt;transaction_fraud_rf</title>\n",
196 |        "<path fill=\"none\" stroke=\"black\" d=\"M330.9,-84.15C290.22,-72.43 217.59,-51.52 165.59,-36.55\"/>\n",
197 |        "<polygon fill=\"black\" stroke=\"black\" points=\"166.39,-33.14 155.81,-33.73 164.45,-39.86 166.39,-33.14\"/>\n",
198 |        "</g>\n",
199 |        "<!-- transaction_fraud_xgboost -->\n",
200 |        "<g id=\"node4\" class=\"node\">\n",
201 |        "<title>transaction_fraud_xgboost</title>\n",
202 |        "<ellipse fill=\"none\" stroke=\"black\" cx=\"361.64\" cy=\"-18\" rx=\"133.78\" ry=\"18\"/>\n",
203 |        "<text text-anchor=\"middle\" x=\"361.64\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\">transaction_fraud_xgboost</text>\n",
204 |        "</g>\n",
205 |        "<!-- &#45;&gt;transaction_fraud_xgboost -->\n",
206 |        "<g id=\"edge3\" class=\"edge\">\n",
207 |        "<title>&#45;&gt;transaction_fraud_xgboost</title>\n",
208 |        "<path fill=\"none\" stroke=\"black\" d=\"M361.64,-71.99C361.64,-64.06 361.64,-54.91 361.64,-46.48\"/>\n",
209 |        "<polygon fill=\"black\" stroke=\"black\" points=\"365.14,-46.31 361.64,-36.31 358.14,-46.31 365.14,-46.31\"/>\n",
210 |        "</g>\n",
211 |        "<!-- transaction_fraud_adaboost -->\n",
212 |        "<g id=\"node5\" class=\"node\">\n",
213 |        "<title>transaction_fraud_adaboost</title>\n",
214 |        "<ellipse fill=\"none\" stroke=\"black\" cx=\"652.64\" cy=\"-18\" rx=\"139.18\" ry=\"18\"/>\n",
215 |        "<text text-anchor=\"middle\" x=\"652.64\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\">transaction_fraud_adaboost</text>\n",
216 |        "</g>\n",
217 |        "<!-- &#45;&gt;transaction_fraud_adaboost -->\n",
218 |        "<g id=\"edge4\" class=\"edge\">\n",
219 |        "<title>&#45;&gt;transaction_fraud_adaboost</title>\n",
220 |        "<path fill=\"none\" stroke=\"black\" d=\"M392.78,-85.08C437.78,-73.64 522.21,-52.17 582.62,-36.81\"/>\n",
221 |        "<polygon fill=\"black\" stroke=\"black\" points=\"583.65,-40.16 592.48,-34.3 581.92,-33.37 583.65,-40.16\"/>\n",
222 |        "</g>\n",
223 |        "</g>\n",
224 |        "</svg>\n"
225 |       ],
226 |       "text/plain": [
227 |        "<graphviz.graphs.Digraph at 0x7fc6e54b5520>"
228 |       ]
229 |      },
230 |      "execution_count": 6,
231 |      "metadata": {},
232 |      "output_type": "execute_result"
233 |     }
234 |    ],
235 |    "source": [
236 |     "# Create the serving function from your code above\n",
237 |     "serving_fn = project.set_function('src/serving.py', name='test-function',\n",
238 |     "                                  image=\"mlrun/mlrun\", kind=\"serving\")\n",
239 |     "serving_fn.set_topology(\n",
240 |     "    \"router\",\n",
241 |     "    mlrun.serving.routers.EnrichmentVotingEnsemble(\n",
242 |     "        feature_vector_uri=\"short\",\n",
243 |     "        impute_policy={\"*\": \"$mean\"}),\n",
244 |     ")\n",
245 |     "# add the 3 trained models to the Ensemble\n",
246 |     "for model in project.list_models('', tag='latest'):\n",
247 |     "    name = model.spec.db_key\n",
248 |     "    serving_fn.add_model(name, class_name=\"ClassifierModel\", model_path=model.uri)\n",
249 |     "# Plot the ensemble configuration\n",
250 |     "serving_fn.spec.graph.plot()"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "markdown",
255 |    "metadata": {},
256 |    "source": [
257 |     "## Test the Application Pipeline Locally\n",
258 |     "\n",
259 |     "Before deploying the serving function, you can test it in the current notebook and check the model output."
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": 7,
265 |    "metadata": {},
266 |    "outputs": [
267 |     {
268 |      "name": "stdout",
269 |      "output_type": "stream",
270 |      "text": [
271 |       "> 2024-10-08 13:41:43,004 [info] model transaction_fraud_rf was loaded\n",
272 |       "> 2024-10-08 13:41:43,033 [info] model transaction_fraud_xgboost was loaded\n",
273 |       "> 2024-10-08 13:41:43,064 [info] model transaction_fraud_adaboost was loaded\n"
274 |      ]
275 |     },
276 |     {
277 |      "name": "stderr",
278 |      "output_type": "stream",
279 |      "text": [
280 |       "Trying to unpickle estimator DecisionTreeClassifier from version 1.5.2 when using version 1.5.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
281 |       "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
282 |       "Trying to unpickle estimator RandomForestClassifier from version 1.5.2 when using version 1.5.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
283 |       "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
284 |       "Trying to unpickle estimator LogisticRegression from version 1.5.2 when using version 1.5.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
285 |       "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
286 |       "Trying to unpickle estimator AdaBoostClassifier from version 1.5.2 when using version 1.5.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
287 |       "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n"
288 |      ]
289 |     }
290 |    ],
291 |    "source": [
292 |     "# Create a mock server from the serving function\n",
293 |     "local_server = serving_fn.to_mock_server()"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": 8,
299 |    "metadata": {},
300 |    "outputs": [
301 |     {
302 |      "name": "stdout",
303 |      "output_type": "stream",
304 |      "text": [
305 |       "Input -> [[82.38333699999998, 176.010817, 0.0, 38.79236982490288, 134.16, 417.81, 11.0, 37.982727272727274, 134.16, 1275.0599999999997, 44.0, 28.978636363636358, 90.0, 1.0, 2.0]]\n",
306 |       "Input -> [[82.38333699999998, 176.010817, 0.0, 38.79236982490288, 134.16, 417.81, 11.0, 37.982727272727274, 134.16, 1275.0599999999997, 44.0, 28.978636363636358, 90.0, 1.0, 2.0]]\n",
307 |       "Input -> [[82.38333699999998, 176.010817, 0.0, 38.79236982490288, 134.16, 417.81, 11.0, 37.982727272727274, 134.16, 1275.0599999999997, 44.0, 28.978636363636358, 90.0, 1.0, 2.0]]\n"
308 |      ]
309 |     },
310 |     {
311 |      "name": "stderr",
312 |      "output_type": "stream",
313 |      "text": [
314 |       "X does not have valid feature names, but RandomForestClassifier was fitted with feature names\n",
315 |       "X does not have valid feature names, but LogisticRegression was fitted with feature names\n",
316 |       "X does not have valid feature names, but AdaBoostClassifier was fitted with feature names\n"
317 |      ]
318 |     },
319 |     {
320 |      "data": {
321 |       "text/plain": [
322 |        "{'id': 'f7f45c8d7e3a4c6993d6e10cc0f91acd',\n",
323 |        " 'model_name': 'VotingEnsemble',\n",
324 |        " 'outputs': [0],\n",
325 |        " 'model_version': 'v1'}"
326 |       ]
327 |      },
328 |      "execution_count": 8,
329 |      "metadata": {},
330 |      "output_type": "execute_result"
331 |     }
332 |    ],
333 |    "source": [
334 |     "# Choose an id for your test\n",
335 |     "sample_id = 'C1000148617'\n",
336 |     "\n",
337 |     "# Send your sample ID for prediction\n",
338 |     "local_server.test(path='/v2/models/infer',\n",
339 |     "            body={'inputs': [[sample_id]]})\n",
340 |     "\n",
341 |     "# notice the input vector is printed 3 times (once per child model) and is enriched with data from the feature store"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "markdown",
346 |    "metadata": {
347 |     "tags": []
348 |    },
349 |    "source": [
350 |     "### Accessing the real-time feature vector directly\n",
351 |     "\n",
352 |     "If you would like to access the real-time features directly from your application instead of using the EnrichmentVotingEnsemble, you can call the feature store `get_online_feature_service()` method. This method is used internally in the EnrichmentVotingEnsemble router class."
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": 9,
358 |    "metadata": {},
359 |    "outputs": [
360 |     {
361 |      "data": {
362 |       "text/plain": [
363 |        "[{'amount_max_2h': 82.38333699999998,\n",
364 |        "  'amount_max_12h': 134.16,\n",
365 |        "  'amount_max_24h': 134.16,\n",
366 |        "  'amount_count_2h': 0.0,\n",
367 |        "  'amount_count_12h': 11.0,\n",
368 |        "  'amount_count_24h': 44.0,\n",
369 |        "  'amount_sum_2h': 176.010817,\n",
370 |        "  'amount_sum_12h': 417.81,\n",
371 |        "  'amount_sum_24h': 1275.0599999999997,\n",
372 |        "  'es_transportation_sum_14d': 90.0,\n",
373 |        "  'es_health_sum_14d': 1.0,\n",
374 |        "  'es_otherservices_sum_14d': 2.0,\n",
375 |        "  'amount_avg_2h': 38.79236982490288,\n",
376 |        "  'amount_avg_12h': 37.982727272727274,\n",
377 |        "  'amount_avg_24h': 28.978636363636358}]"
378 |       ]
379 |      },
380 |      "execution_count": 9,
381 |      "metadata": {},
382 |      "output_type": "execute_result"
383 |     }
384 |    ],
385 |    "source": [
386 |     "import mlrun.feature_store as fs\n",
387 |     "\n",
388 |     "# Create the online feature service\n",
389 |     "svc = fs.get_feature_vector('short:latest').get_online_feature_service(impute_policy={\"*\": \"$mean\"})\n",
390 |     "\n",
391 |     "# Get sample feature vector\n",
392 |     "sample_fv = svc.get([{'source': sample_id}])\n",
393 |     "sample_fv"
394 |    ]
395 |   },
396 |   {
397 |    "cell_type": "markdown",
398 |    "metadata": {},
399 |    "source": [
400 |     "## Deploying the function on the Kubernetes cluster\n",
401 |     "\n",
402 |     "You can now deploy the function. Once deployed, you get a function with an HTTP trigger that can be called from other locations."
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": null,
408 |    "metadata": {},
409 |    "outputs": [
410 |     {
411 |      "name": "stdout",
412 |      "output_type": "stream",
413 |      "text": [
414 |       "> 2024-10-08 13:41:43,535 [info] Starting remote function deploy\n",
415 |       "2024-10-08 13:41:43  (info) Deploying function\n",
416 |       "2024-10-08 13:41:44  (info) Building\n",
417 |       "2024-10-08 13:41:44  (info) Staging files and preparing base images\n",
418 |       "2024-10-08 13:41:44  (warn) Using user provided base image, runtime interpreter version is provided by the base image\n",
419 |       "2024-10-08 13:41:44  (info) Building processor image\n",
420 |       "2024-10-08 13:42:49  (info) Build complete\n",
421 |       "2024-10-08 13:43:13  (info) Function deploy complete\n",
422 |       "> 2024-10-08 13:43:14,979 [info] Successfully deployed function: {\"external_invocation_urls\":[\"fraud-demo-felipe-test-function.default-tenant.app.cust-cs-illl--3-6-0.iguazio-cd2.com/\"],\"internal_invocation_urls\":[\"nuclio-fraud-demo-felipe-test-function.default-tenant.svc.cluster.local:8080\"]}\n"
423 |      ]
424 |     },
425 |     {
426 |      "data": {
427 |       "text/plain": [
428 |        "'http://fraud-demo-felipe-test-function.default-tenant.app.cust-cs-illl--3-6-0.iguazio-cd2.com/'"
429 |       ]
430 |      },
431 |      "execution_count": 10,
432 |      "metadata": {},
433 |      "output_type": "execute_result"
434 |     }
435 |    ],
436 |    "source": [
437 |     "# Enable model monitoring\n",
438 |     "serving_fn.set_tracking()\n",
439 |     "\n",
440 |     "# Deploy the serving function\n",
441 |     "serving_fn.deploy()"
442 |    ]
443 |   },
444 |   {
445 |    "cell_type": "markdown",
446 |    "metadata": {},
447 |    "source": [
448 |     "## Test the server\n",
449 |     "\n",
450 |     "You can test the serving function and examine the model output."
451 |    ]
452 |   },
453 |   {
454 |    "cell_type": "code",
455 |    "execution_count": 11,
456 |    "metadata": {},
457 |    "outputs": [
458 |     {
459 |      "name": "stdout",
460 |      "output_type": "stream",
461 |      "text": [
462 |       "> 2024-10-08 13:43:15,027 [info] Invoking function: {\"method\":\"POST\",\"path\":\"http://nuclio-fraud-demo-felipe-test-function.default-tenant.svc.cluster.local:8080/v2/models/infer\"}\n"
463 |      ]
464 |     },
465 |     {
466 |      "data": {
467 |       "text/plain": [
468 |        "{'id': 'ee5635d0-bb82-4036-a71c-379696d9ed79',\n",
469 |        " 'model_name': 'VotingEnsemble',\n",
470 |        " 'outputs': [0],\n",
471 |        " 'model_version': 'v1'}"
472 |       ]
473 |      },
474 |      "execution_count": 11,
475 |      "metadata": {},
476 |      "output_type": "execute_result"
477 |     }
478 |    ],
479 |    "source": [
480 |     "# Choose an id for your test\n",
481 |     "sample_id = 'C1000148617'\n",
482 |     "\n",
483 |     "model_inference_path = '/v2/models/infer'\n",
484 |     "\n",
485 |     "# Send your sample ID for prediction\n",
486 |     "serving_fn.invoke(path='/v2/models/infer',\n",
487 |     "                  body={'inputs': [[sample_id]]})"
488 |    ]
489 |   },
490 |   {
491 |    "cell_type": "markdown",
492 |    "metadata": {},
493 |    "source": [
494 |     "You can also directly query the feature store values, which are used in the enrichment."
495 |    ]
496 |   },
497 |   {
498 |    "cell_type": "markdown",
499 |    "metadata": {},
500 |    "source": [
501 |     "### Simulate incoming data"
502 |    ]
503 |   },
504 |   {
505 |    "cell_type": "code",
506 |    "execution_count": 12,
507 |    "metadata": {},
508 |    "outputs": [],
509 |    "source": [
510 |     "# Load the dataset\n",
511 |     "data = mlrun.get_dataitem(mlrun.get_sample_path(\"data/fraud-demo-mlrun-fs-docs/data.csv\")).as_df()\n",
512 |     "\n",
513 |     "# use only first 10k\n",
514 |     "data = data.sort_values(by='source', axis=0)[:10000]\n",
515 |     "\n",
516 |     "# keys\n",
517 |     "sample_ids = data['source'].to_list()"
518 |    ]
519 |   },
520 |   {
521 |    "cell_type": "code",
522 |    "execution_count": 13,
523 |    "metadata": {},
524 |    "outputs": [
525 |     {
526 |      "name": "stdout",
527 |      "output_type": "stream",
528 |      "text": [
529 |       "> 2024-10-08 13:43:19,205 [info] Invoking function: {\"method\":\"POST\",\"path\":\"http://nuclio-fraud-demo-felipe-test-function.default-tenant.svc.cluster.local:8080/v2/models/infer\"}\n",
530 |       "{'id': 'efa462b4-36ed-4f63-8d4d-8d6474fff08d', 'model_name': 'VotingEnsemble', 'outputs': [0], 'model_version': 'v1'}\n",
531 |       "> 2024-10-08 13:43:20,718 [info] Invoking function: {\"method\":\"POST\",\"path\":\"http://nuclio-fraud-demo-felipe-test-function.default-tenant.svc.cluster.local:8080/v2/models/infer\"}\n",
532 |       "{'id': 'b930f851-fd9a-498a-8b7f-4e84f3bd42ec', 'model_name': 'VotingEnsemble', 'outputs': [0], 'model_version': 'v1'}\n",
533 |       "> 2024-10-08 13:43:21,987 [info] Invoking function: {\"method\":\"POST\",\"path\":\"http://nuclio-fraud-demo-felipe-test-function.default-tenant.svc.cluster.local:8080/v2/models/infer\"}\n",
534 |       "{'id': 'c064d2e8-377a-4384-9a47-7c5881f83b0a', 'model_name': 'VotingEnsemble', 'outputs': [0], 'model_version': 'v1'}\n",
535 |       "> 2024-10-08 13:43:22,928 [info] Invoking function: {\"method\":\"POST\",\"path\":\"http://nuclio-fraud-demo-felipe-test-function.default-tenant.svc.cluster.local:8080/v2/models/infer\"}\n",
536 |       "{'id': 'd4d6a214-fcf5-4c94-9551-db129c1f52f2', 'model_name': 'VotingEnsemble', 'outputs': [0], 'model_version': 'v1'}\n",
537 |       "> 2024-10-08 13:43:23,263 [info] Invoking function: {\"method\":\"POST\",\"path\":\"http://nuclio-fraud-demo-felipe-test-function.default-tenant.svc.cluster.local:8080/v2/models/infer\"}\n",
538 |       "{'id': '72419b36-0e6e-4ade-8b23-70b3407115d6', 'model_name': 'VotingEnsemble', 'outputs': [0], 'model_version': 'v1'}\n",
539 |       "> 2024-10-08 13:43:24,141 [info] Invoking function: {\"method\":\"POST\",\"path\":\"http://nuclio-fraud-demo-felipe-test-function.default-tenant.svc.cluster.local:8080/v2/models/infer\"}\n",
540 |       "{'id': 'aa27481c-5903-4f02-9ce4-5540fb45d775', 'model_name': 'VotingEnsemble', 'outputs': [0], 'model_version': 'v1'}\n",
541 |       "> 2024-10-08 13:43:25,491 [info] Invoking function: {\"method\":\"POST\",\"path\":\"http://nuclio-fraud-demo-felipe-test-function.default-tenant.svc.cluster.local:8080/v2/models/infer\"}\n",
542 |       "{'id': 'fdc379d7-7a9c-452b-91ac-93b29abaffab', 'model_name': 'VotingEnsemble', 'outputs': [0], 'model_version': 'v1'}\n",
543 |       "> 2024-10-08 13:43:26,392 [info] Invoking function: {\"method\":\"POST\",\"path\":\"http://nuclio-fraud-demo-felipe-test-function.default-tenant.svc.cluster.local:8080/v2/models/infer\"}\n",
544 |       "{'id': '015fbeab-7f98-4295-a05e-a2fb38d54b3a', 'model_name': 'VotingEnsemble', 'outputs': [0], 'model_version': 'v1'}\n",
545 |       "> 2024-10-08 13:43:27,417 [info] Invoking function: {\"method\":\"POST\",\"path\":\"http://nuclio-fraud-demo-felipe-test-function.default-tenant.svc.cluster.local:8080/v2/models/infer\"}\n",
546 |       "{'id': '59733a3f-ffc8-4828-88e7-0e46ff9a129a', 'model_name': 'VotingEnsemble', 'outputs': [0], 'model_version': 'v1'}\n",
547 |       "> 2024-10-08 13:43:28,915 [info] Invoking function: {\"method\":\"POST\",\"path\":\"http://nuclio-fraud-demo-felipe-test-function.default-tenant.svc.cluster.local:8080/v2/models/infer\"}\n",
548 |       "{'id': '05e984c5-556d-415d-a4f4-e3eb389f2213', 'model_name': 'VotingEnsemble', 'outputs': [0], 'model_version': 'v1'}\n"
549 |      ]
550 |     }
551 |    ],
552 |    "source": [
553 |     "from random import choice, uniform\n",
554 |     "from time import sleep\n",
555 |     "\n",
556 |     "# Sending random requests\n",
557 |     "for _ in range(10):\n",
558 |     "    data_point = choice(sample_ids)\n",
559 |     "    try:\n",
560 |     "        resp = serving_fn.invoke(path=model_inference_path, body={'inputs': [[data_point]]})\n",
561 |     "        print(resp)\n",
562 |     "        sleep(uniform(0.2, 1.7))\n",
563 |     "    except OSError:\n",
564 |     "        pass"
565 |    ]
566 |   },
567 |   {
568 |    "cell_type": "markdown",
569 |    "metadata": {
570 |     "pycharm": {
571 |      "name": "#%% md\n"
572 |     }
573 |    },
574 |    "source": [
575 |     "## Done!\n",
576 |     "\n",
577 |     "You've completed the fraud-detection demo. \n"
578 |    ]
579 |   }
580 |  ],
581 |  "metadata": {
582 |   "kernelspec": {
583 |    "display_name": "Python 3 (ipykernel)",
584 |    "language": "python",
585 |    "name": "python3"
586 |   },
587 |   "language_info": {
588 |    "codemirror_mode": {
589 |     "name": "ipython",
590 |     "version": 3
591 |    },
592 |    "file_extension": ".py",
593 |    "mimetype": "text/x-python",
594 |    "name": "python",
595 |    "nbconvert_exporter": "python",
596 |    "pygments_lexer": "ipython3",
597 |    "version": "3.11.5"
598 |   }
599 |  },
600 |  "nbformat": 4,
601 |  "nbformat_minor": 4
602 | }
603 | 


--------------------------------------------------------------------------------
/04-train-test-pipeline.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Model Training and Validation Pipeline\n",
  8 |     "Now that you have created features, you can use them to train one or more models. In this section, you will generate feature vectors with multiple features from one or more feature sets and feed them into an automated ML training and testing pipeline to create high-quality models.\n",
  9 |     "\n",
 10 |     "The ML pipeline can be triggered and tracked manually during the interactive development, or it can be saved (into Git) and be executed automatically on a given schedule or as a reaction to different events (such as code modification, CI/CD, data changes, model drift, and so on). See [MLRun project and CI/CD documentation](https://docs.mlrun.org/en/stable/projects/project.html) for details.\n"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "### Saving and loading projects from GIT\n",
 18 |     "\n",
 19 |     "After you have saved your project and its elements (functions, workflows, artifacts, etc.), you can commit all your changes to a \n",
 20 |     "GIT repository. This can be done using standard GIT tools or using MLRun `project` methods such as `pull`, `push`, \n",
 21 |     "`remote`, which call the Git API for you.\n",
 22 |     "\n",
 23 |     "Projects can then be loaded from Git using MLRun `load_project` method, for example: \n",
 24 |     "\n",
 25 |     "    project = mlrun.load_project(\"./myproj\", \"git://github.com/mlrun/project-demo.git\", name=project_name)\n",
 26 |     "    \n",
 27 |     "or using MLRun CLI:\n",
 28 |     "\n",
 29 |     "    mlrun project -n myproj -u \"git://github.com/mlrun/project-demo.git\" ./myproj\n",
 30 |     "    \n",
 31 |     "Projects can also be loaded or created using the MLRun `get_or_create_project` method.\n",
 32 |     "    \n",
 33 |     "Read [CI/CD integration](https://docs.mlrun.org/en/stable/projects/ci-integration.html) for more details."
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## MLRun installation and configuration\n",
 41 |     "Before running this notebook, make sure the `mlrun` package is installed (`pip install mlrun`) and that you have configured access to the MLRun service."
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": null,
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "# Install MLRun if not installed, run this only once (restart the notebook after the install !!!)\n",
 51 |     "%pip install mlrun"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [
 59 |     {
 60 |      "name": "stdout",
 61 |      "output_type": "stream",
 62 |      "text": [
 63 |       "The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n",
 64 |       "> 2024-10-08 13:19:58,328 [info] Project loaded successfully: {\"project_name\":\"fraud-demo-felipe\"}\n"
 65 |      ]
 66 |     }
 67 |    ],
 68 |    "source": [
 69 |     "import mlrun\n",
 70 |     "\n",
 71 |     "project = mlrun.get_or_create_project(\n",
 72 |     "    name=\"fraud-demo\",\n",
 73 |     "    context=\"./\",\n",
 74 |     "    user_project=True,\n",
 75 |     "    )"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "markdown",
 80 |    "metadata": {},
 81 |    "source": [
 82 |     "## Creating and Evaluating a Feature Vector\n",
 83 |     "\n",
 84 |     "Models are trained with multiple features, which can arrive from different feature sets and be collected into training (feature) vectors. Feature stores know how to correctly combine the features into a vector by implementing smart JOINs and assessing the time dimension (time traveling).\n",
 85 |     "To define a feature vector, you need to specify a name, the list of features it contains, the target features (labels), and other optional parameters. Features are specified as `<FeatureSet>.<Feature>` or `<FeatureSet>.*` (all the features in a feature set). The following part demonstrates how to create and use a feature vector.\n"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "### Define the list of features"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 2,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "# Import MLRun's Feature Store\n",
102 |     "import mlrun.feature_store as fstore\n",
103 |     "\n",
104 |     "# Define the list of features to use\n",
105 |     "features = ['events.*',\n",
106 |     "            'transactions.amount_max_2h', \n",
107 |     "            'transactions.amount_sum_2h', \n",
108 |     "            'transactions.amount_count_2h',\n",
109 |     "            'transactions.amount_avg_2h', \n",
110 |     "            'transactions.amount_max_12h', \n",
111 |     "            'transactions.amount_sum_12h',\n",
112 |     "            'transactions.amount_count_12h', \n",
113 |     "            'transactions.amount_avg_12h', \n",
114 |     "            'transactions.amount_max_24h',\n",
115 |     "            'transactions.amount_sum_24h', \n",
116 |     "            'transactions.amount_count_24h', \n",
117 |     "            'transactions.amount_avg_24h',\n",
118 |     "            'transactions.es_transportation_sum_14d', \n",
119 |     "            'transactions.es_health_sum_14d',\n",
120 |     "            'transactions.es_otherservices_sum_14d', \n",
121 |     "            'transactions.es_food_sum_14d',\n",
122 |     "            'transactions.es_hotelservices_sum_14d', \n",
123 |     "            'transactions.es_barsandrestaurants_sum_14d',\n",
124 |     "            'transactions.es_tech_sum_14d', \n",
125 |     "            'transactions.es_sportsandtoys_sum_14d',\n",
126 |     "            'transactions.es_wellnessandbeauty_sum_14d', \n",
127 |     "            'transactions.es_hyper_sum_14d',\n",
128 |     "            'transactions.es_fashion_sum_14d', \n",
129 |     "            'transactions.es_home_sum_14d', \n",
130 |     "            'transactions.es_travel_sum_14d', \n",
131 |     "            'transactions.es_leisure_sum_14d',\n",
132 |     "            'transactions.gender_F',\n",
133 |     "            'transactions.gender_M',\n",
134 |     "            'transactions.step', \n",
135 |     "            'transactions.amount', \n",
136 |     "            'transactions.timestamp_hour',\n",
137 |     "            'transactions.timestamp_day_of_week']"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "metadata": {},
143 |    "source": [
144 |     "### Create a feature vector"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": 3,
150 |    "metadata": {},
151 |    "outputs": [],
152 |    "source": [
153 |     "# Define the feature vector name for future reference\n",
154 |     "fv_name = 'transactions-fraud'\n",
155 |     "\n",
156 |     "# Define the feature vector using the feature store (fstore)\n",
157 |     "transactions_fv = fstore.FeatureVector(fv_name, \n",
158 |     "                          features, \n",
159 |     "                          label_feature=\"labels.label\",\n",
160 |     "                          description='Predicting a fraudulent transaction')\n",
161 |     "\n",
162 |     "# Save the feature vector in the feature store\n",
163 |     "transactions_fv.save()"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": 4,
169 |    "metadata": {},
170 |    "outputs": [
171 |     {
172 |      "name": "stdout",
173 |      "output_type": "stream",
174 |      "text": [
175 |       "> 2024-10-08 13:20:06,351 [info] Merger detected timestamp resolution incompatibility between feature set labels and others: datetime64[us] and datetime64[ms]. Converting feature set timestamp column 'timestamp' to type datetime64[us].\n",
176 |       "> 2024-10-08 13:20:06,399 [info] wrote target: {'size': 151272, 'name': 'parquet', 'updated': '2024-10-08T13:20:06.399382+00:00', 'partitioned': True, 'path': 'v3io:///projects/fraud-demo-felipe/FeatureStore/transactions-fraud/parquet/vectors/transactions-fraud-latest.parquet', 'kind': 'parquet', 'status': 'ready'}\n"
177 |      ]
178 |     }
179 |    ],
180 |    "source": [
181 |     "# test get_offline_features function\n",
182 |     "\n",
183 |     "from mlrun.datastore.targets import ParquetTarget\n",
184 |     "data = transactions_fv.get_offline_features(target=ParquetTarget())\n"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": 5,
190 |    "metadata": {},
191 |    "outputs": [
192 |     {
193 |      "data": {
194 |       "text/plain": [
195 |        "<mlrun.feature_store.feature_vector.OfflineVectorResponse at 0x7f7deccdcfa0>"
196 |       ]
197 |      },
198 |      "execution_count": 5,
199 |      "metadata": {},
200 |      "output_type": "execute_result"
201 |     }
202 |    ],
203 |    "source": [
204 |     "data"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 8,
210 |    "metadata": {},
211 |    "outputs": [
212 |     {
213 |      "data": {
214 |       "text/html": [
215 |        "<div>\n",
216 |        "<style scoped>\n",
217 |        "    .dataframe tbody tr th:only-of-type {\n",
218 |        "        vertical-align: middle;\n",
219 |        "    }\n",
220 |        "\n",
221 |        "    .dataframe tbody tr th {\n",
222 |        "        vertical-align: top;\n",
223 |        "    }\n",
224 |        "\n",
225 |        "    .dataframe thead th {\n",
226 |        "        text-align: right;\n",
227 |        "    }\n",
228 |        "</style>\n",
229 |        "<table border=\"1\" class=\"dataframe\">\n",
230 |        "  <thead>\n",
231 |        "    <tr style=\"text-align: right;\">\n",
232 |        "      <th></th>\n",
233 |        "      <th>event_password_change</th>\n",
234 |        "      <th>event_details_change</th>\n",
235 |        "      <th>event_login</th>\n",
236 |        "      <th>amount_max_2h</th>\n",
237 |        "      <th>amount_sum_2h</th>\n",
238 |        "      <th>amount_count_2h</th>\n",
239 |        "      <th>amount_avg_2h</th>\n",
240 |        "      <th>amount_max_12h</th>\n",
241 |        "      <th>amount_sum_12h</th>\n",
242 |        "      <th>amount_count_12h</th>\n",
243 |        "      <th>...</th>\n",
244 |        "      <th>es_home_sum_14d</th>\n",
245 |        "      <th>es_travel_sum_14d</th>\n",
246 |        "      <th>es_leisure_sum_14d</th>\n",
247 |        "      <th>gender_F</th>\n",
248 |        "      <th>gender_M</th>\n",
249 |        "      <th>step</th>\n",
250 |        "      <th>amount</th>\n",
251 |        "      <th>timestamp_hour</th>\n",
252 |        "      <th>timestamp_day_of_week</th>\n",
253 |        "      <th>label</th>\n",
254 |        "    </tr>\n",
255 |        "  </thead>\n",
256 |        "  <tbody>\n",
257 |        "    <tr>\n",
258 |        "      <th>0</th>\n",
259 |        "      <td>0</td>\n",
260 |        "      <td>0</td>\n",
261 |        "      <td>1</td>\n",
262 |        "      <td>1.83</td>\n",
263 |        "      <td>1.83</td>\n",
264 |        "      <td>1.0</td>\n",
265 |        "      <td>1.830000</td>\n",
266 |        "      <td>1.83</td>\n",
267 |        "      <td>1.83</td>\n",
268 |        "      <td>1.0</td>\n",
269 |        "      <td>...</td>\n",
270 |        "      <td>0.0</td>\n",
271 |        "      <td>0.0</td>\n",
272 |        "      <td>0.0</td>\n",
273 |        "      <td>0.0</td>\n",
274 |        "      <td>1.0</td>\n",
275 |        "      <td>72.0</td>\n",
276 |        "      <td>1.83</td>\n",
277 |        "      <td>7.0</td>\n",
278 |        "      <td>6.0</td>\n",
279 |        "      <td>0.0</td>\n",
280 |        "    </tr>\n",
281 |        "    <tr>\n",
282 |        "      <th>1</th>\n",
283 |        "      <td>0</td>\n",
284 |        "      <td>0</td>\n",
285 |        "      <td>1</td>\n",
286 |        "      <td>18.72</td>\n",
287 |        "      <td>40.22</td>\n",
288 |        "      <td>3.0</td>\n",
289 |        "      <td>13.406667</td>\n",
290 |        "      <td>18.72</td>\n",
291 |        "      <td>40.22</td>\n",
292 |        "      <td>3.0</td>\n",
293 |        "      <td>...</td>\n",
294 |        "      <td>0.0</td>\n",
295 |        "      <td>0.0</td>\n",
296 |        "      <td>0.0</td>\n",
297 |        "      <td>0.0</td>\n",
298 |        "      <td>1.0</td>\n",
299 |        "      <td>66.0</td>\n",
300 |        "      <td>18.72</td>\n",
301 |        "      <td>7.0</td>\n",
302 |        "      <td>6.0</td>\n",
303 |        "      <td>0.0</td>\n",
304 |        "    </tr>\n",
305 |        "    <tr>\n",
306 |        "      <th>2</th>\n",
307 |        "      <td>1</td>\n",
308 |        "      <td>0</td>\n",
309 |        "      <td>0</td>\n",
310 |        "      <td>25.92</td>\n",
311 |        "      <td>64.86</td>\n",
312 |        "      <td>3.0</td>\n",
313 |        "      <td>21.620000</td>\n",
314 |        "      <td>25.92</td>\n",
315 |        "      <td>64.86</td>\n",
316 |        "      <td>3.0</td>\n",
317 |        "      <td>...</td>\n",
318 |        "      <td>0.0</td>\n",
319 |        "      <td>0.0</td>\n",
320 |        "      <td>0.0</td>\n",
321 |        "      <td>0.0</td>\n",
322 |        "      <td>1.0</td>\n",
323 |        "      <td>27.0</td>\n",
324 |        "      <td>25.92</td>\n",
325 |        "      <td>7.0</td>\n",
326 |        "      <td>6.0</td>\n",
327 |        "      <td>0.0</td>\n",
328 |        "    </tr>\n",
329 |        "    <tr>\n",
330 |        "      <th>3</th>\n",
331 |        "      <td>1</td>\n",
332 |        "      <td>0</td>\n",
333 |        "      <td>0</td>\n",
334 |        "      <td>24.75</td>\n",
335 |        "      <td>30.17</td>\n",
336 |        "      <td>2.0</td>\n",
337 |        "      <td>15.085000</td>\n",
338 |        "      <td>24.75</td>\n",
339 |        "      <td>30.17</td>\n",
340 |        "      <td>2.0</td>\n",
341 |        "      <td>...</td>\n",
342 |        "      <td>0.0</td>\n",
343 |        "      <td>0.0</td>\n",
344 |        "      <td>0.0</td>\n",
345 |        "      <td>0.0</td>\n",
346 |        "      <td>1.0</td>\n",
347 |        "      <td>141.0</td>\n",
348 |        "      <td>24.75</td>\n",
349 |        "      <td>7.0</td>\n",
350 |        "      <td>6.0</td>\n",
351 |        "      <td>0.0</td>\n",
352 |        "    </tr>\n",
353 |        "    <tr>\n",
354 |        "      <th>4</th>\n",
355 |        "      <td>1</td>\n",
356 |        "      <td>0</td>\n",
357 |        "      <td>0</td>\n",
358 |        "      <td>64.18</td>\n",
359 |        "      <td>65.17</td>\n",
360 |        "      <td>2.0</td>\n",
361 |        "      <td>32.585000</td>\n",
362 |        "      <td>64.18</td>\n",
363 |        "      <td>65.17</td>\n",
364 |        "      <td>2.0</td>\n",
365 |        "      <td>...</td>\n",
366 |        "      <td>0.0</td>\n",
367 |        "      <td>0.0</td>\n",
368 |        "      <td>0.0</td>\n",
369 |        "      <td>1.0</td>\n",
370 |        "      <td>0.0</td>\n",
371 |        "      <td>124.0</td>\n",
372 |        "      <td>64.18</td>\n",
373 |        "      <td>7.0</td>\n",
374 |        "      <td>6.0</td>\n",
375 |        "      <td>0.0</td>\n",
376 |        "    </tr>\n",
377 |        "  </tbody>\n",
378 |        "</table>\n",
379 |        "<p>5 rows × 36 columns</p>\n",
380 |        "</div>"
381 |       ],
382 |       "text/plain": [
383 |        "   event_password_change  event_details_change  event_login  amount_max_2h  \\\n",
384 |        "0                      0                     0            1           1.83   \n",
385 |        "1                      0                     0            1          18.72   \n",
386 |        "2                      1                     0            0          25.92   \n",
387 |        "3                      1                     0            0          24.75   \n",
388 |        "4                      1                     0            0          64.18   \n",
389 |        "\n",
390 |        "   amount_sum_2h  amount_count_2h  amount_avg_2h  amount_max_12h  \\\n",
391 |        "0           1.83              1.0       1.830000            1.83   \n",
392 |        "1          40.22              3.0      13.406667           18.72   \n",
393 |        "2          64.86              3.0      21.620000           25.92   \n",
394 |        "3          30.17              2.0      15.085000           24.75   \n",
395 |        "4          65.17              2.0      32.585000           64.18   \n",
396 |        "\n",
397 |        "   amount_sum_12h  amount_count_12h  ...  es_home_sum_14d  es_travel_sum_14d  \\\n",
398 |        "0            1.83               1.0  ...              0.0                0.0   \n",
399 |        "1           40.22               3.0  ...              0.0                0.0   \n",
400 |        "2           64.86               3.0  ...              0.0                0.0   \n",
401 |        "3           30.17               2.0  ...              0.0                0.0   \n",
402 |        "4           65.17               2.0  ...              0.0                0.0   \n",
403 |        "\n",
404 |        "   es_leisure_sum_14d  gender_F  gender_M   step  amount  timestamp_hour  \\\n",
405 |        "0                 0.0       0.0       1.0   72.0    1.83             7.0   \n",
406 |        "1                 0.0       0.0       1.0   66.0   18.72             7.0   \n",
407 |        "2                 0.0       0.0       1.0   27.0   25.92             7.0   \n",
408 |        "3                 0.0       0.0       1.0  141.0   24.75             7.0   \n",
409 |        "4                 0.0       1.0       0.0  124.0   64.18             7.0   \n",
410 |        "\n",
411 |        "   timestamp_day_of_week  label  \n",
412 |        "0                    6.0    0.0  \n",
413 |        "1                    6.0    0.0  \n",
414 |        "2                    6.0    0.0  \n",
415 |        "3                    6.0    0.0  \n",
416 |        "4                    6.0    0.0  \n",
417 |        "\n",
418 |        "[5 rows x 36 columns]"
419 |       ]
420 |      },
421 |      "execution_count": 8,
422 |      "metadata": {},
423 |      "output_type": "execute_result"
424 |     }
425 |    ],
426 |    "source": [
427 |     "data.to_dataframe().head()"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "markdown",
432 |    "metadata": {},
433 |    "source": [
434 |     "Once you have defined the feature vector, you can use `get_offline_features()` to generate the vector dataset and return it as a dataframe or materialize it into a file (CSV or Parquet). The preceding cells demonstrate how to retrieve a vector, materialize it, and view its results."
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "markdown",
439 |    "metadata": {},
440 |    "source": [
441 |     "## Building and Running an Automated Training and Validation Pipeline\n",
442 |     "\n",
443 |     "MLRun allows the building of distributed ML pipelines that can handle data processing, automated feature selection, training, optimization, testing, deployments, and so on. Pipelines are composed of steps that run or deploy custom or library (from the MLRun hub) serverless functions. Pipelines can be run locally (for debugging or small-scale tasks), on a scalable Kubernetes cluster (using Kubeflow), or in a CI/CD system.\n",
444 |     "\n",
445 |     "The example consists of the following pipeline steps (all but the first using pre-defined MLRun hub functions):\n",
446 |     "\n",
447 |     "1. Materialize a feature vector (using `src/get_vector`). \n",
448 |     "2. Select the optimal features (using `hub://feature_selection`).\n",
449 |     "3. Train the model with multiple algorithms (using `hub://auto_trainer`).\n",
450 |     "4. Evaluate the model (using `hub://auto_trainer`).\n",
451 |     "5. Deploy the model and its application to the test cluster (using `hub://v2_model_server`). The next section will explain the model and application pipeline in detail.\n",
452 |     "\n",
453 |     "Each step can accept the previous steps’ results or data, and generate results, multiple visual artifacts/charts, versioned data objects, and registered models.\n",
454 |     "\n",
455 |     "We have defined the workflow in [`src/train_workflow.py`](./src/train_workflow.py). "
456 |    ]
457 |   },
458 |   {
459 |    "cell_type": "markdown",
460 |    "metadata": {
461 |     "tags": []
462 |    },
463 |    "source": [
464 |     "## Running the ML pipeline"
465 |    ]
466 |   },
467 |   {
468 |    "cell_type": "markdown",
469 |    "metadata": {},
470 |    "source": [
471 |     "The workflow/pipeline can be executed using the MLRun SDK (`project.run()` method) or using CLI commands (`mlrun project`), and can run directly from the source repo (Git). See details in [MLRun Projects and Automation documentation](https://docs.mlrun.org/en/stable/projects/project.html).\n",
472 |     "\n",
473 |     "You can set arguments and destinations for the different artifacts when you run the workflow. The pipeline progress and results are shown in the notebook. Alternatively, you can check the progress, logs, artifacts, and more, in the MLRun UI or the CI/CD system. The next part demonstrates how to run the pipeline with custom arguments using the SDK."
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "code",
478 |    "execution_count": null,
479 |    "metadata": {
480 |     "tags": []
481 |    },
482 |    "outputs": [
483 |     {
484 |      "name": "stdout",
485 |      "output_type": "stream",
486 |      "text": [
487 |       "> 2024-10-08 13:20:13,879 [warning] WARNING!, You seem to have uncommitted git changes, use .push()\n"
488 |      ]
489 |     },
490 |     {
491 |      "name": "stderr",
492 |      "output_type": "stream",
493 |      "text": [
494 |       "Missing type name was inferred as \"JsonArray\" based on the value \"[]\".\n"
495 |      ]
496 |     },
497 |     {
498 |      "name": "stdout",
499 |      "output_type": "stream",
500 |      "text": [
501 |       "> 2024-10-08 13:20:14,738 [info] Pipeline submitted successfully: {\"id\":\"e27f0d6d-8f14-4642-abee-43b9867631ef\",\"pipeline_name\":\"fraud-demo-felipe-main 2024-10-08 13-20-14\"}\n",
502 |       "> 2024-10-08 13:20:14,738 [info] Pipeline run id=e27f0d6d-8f14-4642-abee-43b9867631ef, check UI for progress\n"
503 |      ]
504 |     },
505 |     {
506 |      "data": {
507 |       "text/html": [
508 |        "Workflow started in project fraud-demo-felipe id=e27f0d6d-8f14-4642-abee-43b9867631ef<div><a href=\"https://dashboard.default-tenant.app.cust-cs-illl--3-6-0.iguazio-cd2.com/mlprojects/fraud-demo-felipe/jobs/monitor-workflows/workflow/e27f0d6d-8f14-4642-abee-43b9867631ef\" target=\"_blank\">click here to view progress</a></div>"
509 |       ],
510 |       "text/plain": [
511 |        "<IPython.core.display.HTML object>"
512 |       ]
513 |      },
514 |      "metadata": {},
515 |      "output_type": "display_data"
516 |     },
517 |     {
518 |      "data": {
519 |       "text/html": [
520 |        "<div>Pipeline running (id=e27f0d6d-8f14-4642-abee-43b9867631ef), <a href=\"https://dashboard.default-tenant.app.cust-cs-illl--3-6-0.iguazio-cd2.com/mlprojects/fraud-demo-felipe/jobs/monitor-workflows/workflow/e27f0d6d-8f14-4642-abee-43b9867631ef\" target=\"_blank\"><b>click here</b></a> to view the details in MLRun UI</div>"
521 |       ],
522 |       "text/plain": [
523 |        "<IPython.core.display.HTML object>"
524 |       ]
525 |      },
526 |      "metadata": {},
527 |      "output_type": "display_data"
528 |     },
529 |     {
530 |      "data": {
531 |       "image/svg+xml": [
532 |        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
533 |        "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
534 |        " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
535 |        "<!-- Generated by graphviz version 2.43.0 (0)\n",
536 |        " -->\n",
537 |        "<!-- Title: kfp Pages: 1 -->\n",
538 |        "<svg width=\"8pt\" height=\"8pt\"\n",
539 |        " viewBox=\"0.00 0.00 8.00 8.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
540 |        "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 4)\">\n",
541 |        "<title>kfp</title>\n",
542 |        "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-4 4,-4 4,4 -4,4\"/>\n",
543 |        "</g>\n",
544 |        "</svg>\n"
545 |       ],
546 |       "text/plain": [
547 |        "<graphviz.graphs.Digraph at 0x7f7e34b02370>"
548 |       ]
549 |      },
550 |      "metadata": {},
551 |      "output_type": "display_data"
552 |     },
553 |     {
554 |      "name": "stdout",
555 |      "output_type": "stream",
556 |      "text": [
557 |       "> 2024-10-08 13:20:14,810 [info] Started run workflow fraud-demo-felipe-main with run id = 'e27f0d6d-8f14-4642-abee-43b9867631ef' by kfp engine\n",
558 |       "> 2024-10-08 13:20:14,811 [info] Waiting for pipeline run completion: {\"project\":\"<mlrun.projects.project.MlrunProject object at 0x7f7decca0a30>\",\"run_id\":\"e27f0d6d-8f14-4642-abee-43b9867631ef\"}\n"
559 |      ]
560 |     },
561 |     {
562 |      "data": {
563 |       "image/svg+xml": [
564 |        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
565 |        "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
566 |        " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
567 |        "<!-- Generated by graphviz version 2.43.0 (0)\n",
568 |        " -->\n",
569 |        "<!-- Title: kfp Pages: 1 -->\n",
570 |        "<svg width=\"248pt\" height=\"260pt\"\n",
571 |        " viewBox=\"0.00 0.00 248.05 260.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
572 |        "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 256)\">\n",
573 |        "<title>kfp</title>\n",
574 |        "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-256 244.05,-256 244.05,4 -4,4\"/>\n",
575 |        "<!-- fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;2645070770 -->\n",
576 |        "<g id=\"node1\" class=\"node\">\n",
577 |        "<title>fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;2645070770</title>\n",
578 |        "<ellipse fill=\"green\" stroke=\"black\" cx=\"50.05\" cy=\"-18\" rx=\"50.09\" ry=\"18\"/>\n",
579 |        "<text text-anchor=\"middle\" x=\"50.05\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\">evaluate</text>\n",
580 |        "</g>\n",
581 |        "<!-- fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;3318924573 -->\n",
582 |        "<g id=\"node2\" class=\"node\">\n",
583 |        "<title>fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;3318924573</title>\n",
584 |        "<ellipse fill=\"green\" stroke=\"black\" cx=\"114.05\" cy=\"-234\" rx=\"57.69\" ry=\"18\"/>\n",
585 |        "<text text-anchor=\"middle\" x=\"114.05\" y=\"-230.3\" font-family=\"Times,serif\" font-size=\"14.00\">get&#45;vector</text>\n",
586 |        "</g>\n",
587 |        "<!-- fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;3653472374 -->\n",
588 |        "<g id=\"node3\" class=\"node\">\n",
589 |        "<title>fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;3653472374</title>\n",
590 |        "<ellipse fill=\"green\" stroke=\"black\" cx=\"114.05\" cy=\"-162\" rx=\"89.08\" ry=\"18\"/>\n",
591 |        "<text text-anchor=\"middle\" x=\"114.05\" y=\"-158.3\" font-family=\"Times,serif\" font-size=\"14.00\">feature&#45;selection</text>\n",
592 |        "</g>\n",
593 |        "<!-- fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;3318924573&#45;&gt;fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;3653472374 -->\n",
594 |        "<g id=\"edge1\" class=\"edge\">\n",
595 |        "<title>fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;3318924573&#45;&gt;fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;3653472374</title>\n",
596 |        "<path fill=\"none\" stroke=\"black\" d=\"M114.05,-215.7C114.05,-207.98 114.05,-198.71 114.05,-190.11\"/>\n",
597 |        "<polygon fill=\"black\" stroke=\"black\" points=\"117.55,-190.1 114.05,-180.1 110.55,-190.1 117.55,-190.1\"/>\n",
598 |        "</g>\n",
599 |        "<!-- fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;67864529 -->\n",
600 |        "<g id=\"node4\" class=\"node\">\n",
601 |        "<title>fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;67864529</title>\n",
602 |        "<ellipse fill=\"green\" stroke=\"black\" cx=\"114.05\" cy=\"-90\" rx=\"33.29\" ry=\"18\"/>\n",
603 |        "<text text-anchor=\"middle\" x=\"114.05\" y=\"-86.3\" font-family=\"Times,serif\" font-size=\"14.00\">train</text>\n",
604 |        "</g>\n",
605 |        "<!-- fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;3653472374&#45;&gt;fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;67864529 -->\n",
606 |        "<g id=\"edge2\" class=\"edge\">\n",
607 |        "<title>fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;3653472374&#45;&gt;fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;67864529</title>\n",
608 |        "<path fill=\"none\" stroke=\"black\" d=\"M114.05,-143.7C114.05,-135.98 114.05,-126.71 114.05,-118.11\"/>\n",
609 |        "<polygon fill=\"black\" stroke=\"black\" points=\"117.55,-118.1 114.05,-108.1 110.55,-118.1 117.55,-118.1\"/>\n",
610 |        "</g>\n",
611 |        "<!-- fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;67864529&#45;&gt;fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;2645070770 -->\n",
612 |        "<g id=\"edge4\" class=\"edge\">\n",
613 |        "<title>fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;67864529&#45;&gt;fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;2645070770</title>\n",
614 |        "<path fill=\"none\" stroke=\"black\" d=\"M99.84,-73.46C91.63,-64.48 81.11,-52.98 71.87,-42.87\"/>\n",
615 |        "<polygon fill=\"black\" stroke=\"black\" points=\"74.43,-40.49 65.1,-35.47 69.27,-45.21 74.43,-40.49\"/>\n",
616 |        "</g>\n",
617 |        "<!-- fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;922820315 -->\n",
618 |        "<g id=\"node5\" class=\"node\">\n",
619 |        "<title>fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;922820315</title>\n",
620 |        "<polygon fill=\"green\" stroke=\"black\" points=\"240.05,-36 122.05,-36 118.05,-32 118.05,0 236.05,0 240.05,-4 240.05,-36\"/>\n",
621 |        "<polyline fill=\"none\" stroke=\"black\" points=\"236.05,-32 118.05,-32 \"/>\n",
622 |        "<polyline fill=\"none\" stroke=\"black\" points=\"236.05,-32 236.05,0 \"/>\n",
623 |        "<polyline fill=\"none\" stroke=\"black\" points=\"236.05,-32 240.05,-36 \"/>\n",
624 |        "<text text-anchor=\"middle\" x=\"179.05\" y=\"-14.3\" font-family=\"Times,serif\" font-size=\"14.00\">deploy&#45;serving</text>\n",
625 |        "</g>\n",
626 |        "<!-- fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;67864529&#45;&gt;fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;922820315 -->\n",
627 |        "<g id=\"edge3\" class=\"edge\">\n",
628 |        "<title>fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;67864529&#45;&gt;fraud&#45;detection&#45;pipeline&#45;6l7vc&#45;922820315</title>\n",
629 |        "<path fill=\"none\" stroke=\"black\" d=\"M128.47,-73.46C136.5,-64.82 146.68,-53.85 155.81,-44.03\"/>\n",
630 |        "<polygon fill=\"black\" stroke=\"black\" points=\"158.59,-46.17 162.83,-36.46 153.46,-41.41 158.59,-46.17\"/>\n",
631 |        "</g>\n",
632 |        "</g>\n",
633 |        "</svg>\n"
634 |       ],
635 |       "text/plain": [
636 |        "<graphviz.graphs.Digraph at 0x7f7deca13fd0>"
637 |       ]
638 |      },
639 |      "metadata": {},
640 |      "output_type": "display_data"
641 |     },
642 |     {
643 |      "data": {
644 |       "text/html": [
645 |        "<h2>Run Results</h2><h3>[info] Workflow e27f0d6d-8f14-4642-abee-43b9867631ef finished, state=Succeeded</h3><br>click the hyper links below to see detailed results<br><table border=\"1\" class=\"dataframe\">\n",
646 |        "  <thead>\n",
647 |        "    <tr style=\"text-align: right;\">\n",
648 |        "      <th>uid</th>\n",
649 |        "      <th>start</th>\n",
650 |        "      <th>state</th>\n",
651 |        "      <th>kind</th>\n",
652 |        "      <th>name</th>\n",
653 |        "      <th>parameters</th>\n",
654 |        "      <th>results</th>\n",
655 |        "    </tr>\n",
656 |        "  </thead>\n",
657 |        "  <tbody>\n",
658 |        "    <tr>\n",
659 |        "      <td><div title=\"82435092a42244458f09090df8fcc251\"><a href=\"https://dashboard.default-tenant.app.cust-cs-illl--3-6-0.iguazio-cd2.com/mlprojects/fraud-demo-felipe/jobs/monitor/82435092a42244458f09090df8fcc251/overview\" target=\"_blank\" >...f8fcc251</a></div></td>\n",
660 |        "      <td>Oct 08 13:21:44</td>\n",
661 |        "      <td>completed</td>\n",
662 |        "      <td>run</td>\n",
663 |        "      <td>evaluate</td>\n",
664 |        "      <td><div class=\"dictlist\">label_columns=label</div><div class=\"dictlist\">model=store://artifacts/fraud-demo-felipe/transaction_fraud_rf:latest@e27f0d6d-8f14-4642-abee-43b9867631ef</div><div class=\"dictlist\">drop_columns=label</div></td>\n",
665 |        "      <td><div class=\"dictlist\">evaluation_accuracy=0.9905</div><div class=\"dictlist\">evaluation_f1_score=0.17391304347826086</div><div class=\"dictlist\">evaluation_precision_score=0.4</div><div class=\"dictlist\">evaluation_recall_score=0.1111111111111111</div></td>\n",
666 |        "    </tr>\n",
667 |        "    <tr>\n",
668 |        "      <td><div title=\"4eef9c89992842cf90ca121a674058e1\"><a href=\"https://dashboard.default-tenant.app.cust-cs-illl--3-6-0.iguazio-cd2.com/mlprojects/fraud-demo-felipe/jobs/monitor/4eef9c89992842cf90ca121a674058e1/overview\" target=\"_blank\" >...674058e1</a></div></td>\n",
669 |        "      <td>Oct 08 13:21:10</td>\n",
670 |        "      <td>completed</td>\n",
671 |        "      <td>run</td>\n",
672 |        "      <td>train</td>\n",
673 |        "      <td><div class=\"dictlist\">sample=-1</div><div class=\"dictlist\">label_column=label</div><div class=\"dictlist\">test_size=0.1</div></td>\n",
674 |        "      <td><div class=\"dictlist\">best_iteration=1</div><div class=\"dictlist\">accuracy=0.9905</div><div class=\"dictlist\">f1_score=0.17391304347826086</div><div class=\"dictlist\">precision_score=0.4</div><div class=\"dictlist\">recall_score=0.1111111111111111</div></td>\n",
675 |        "    </tr>\n",
676 |        "    <tr>\n",
677 |        "      <td><div title=\"f1054ceef99c4b85beb54850af1f9176\"><a href=\"https://dashboard.default-tenant.app.cust-cs-illl--3-6-0.iguazio-cd2.com/mlprojects/fraud-demo-felipe/jobs/monitor/f1054ceef99c4b85beb54850af1f9176/overview\" target=\"_blank\" >...af1f9176</a></div></td>\n",
678 |        "      <td>Oct 08 13:20:42</td>\n",
679 |        "      <td>completed</td>\n",
680 |        "      <td>run</td>\n",
681 |        "      <td>feature-selection</td>\n",
682 |        "      <td><div class=\"dictlist\">output_vector_name=short</div><div class=\"dictlist\">label_column=label</div><div class=\"dictlist\">k=18</div><div class=\"dictlist\">min_votes=2</div><div class=\"dictlist\">ignore_type_errors=True</div></td>\n",
683 |        "      <td><div class=\"dictlist\">top_features_vector=store://feature-vectors/fraud-demo-felipe/short</div></td>\n",
684 |        "    </tr>\n",
685 |        "    <tr>\n",
686 |        "      <td><div title=\"9d5ec830a46640eaae20e2cac1e2831f\"><a href=\"https://dashboard.default-tenant.app.cust-cs-illl--3-6-0.iguazio-cd2.com/mlprojects/fraud-demo-felipe/jobs/monitor/9d5ec830a46640eaae20e2cac1e2831f/overview\" target=\"_blank\" >...c1e2831f</a></div></td>\n",
687 |        "      <td>Oct 08 13:20:21</td>\n",
688 |        "      <td>completed</td>\n",
689 |        "      <td>run</td>\n",
690 |        "      <td>get-vector</td>\n",
691 |        "      <td><div class=\"dictlist\">feature_vector=transactions-fraud</div><div class=\"dictlist\">features=['events.*', 'transactions.amount_max_2h', 'transactions.amount_sum_2h', 'transactions.amount_count_2h', 'transactions.amount_avg_2h', 'transactions.amount_max_12h', 'transactions.amount_sum_12h', 'transactions.amount_count_12h', 'transactions.amount_avg_12h', 'transactions.amount_max_24h', 'transactions.amount_sum_24h', 'transactions.amount_count_24h', 'transactions.amount_avg_24h', 'transactions.es_transportation_sum_14d', 'transactions.es_health_sum_14d', 'transactions.es_otherservices_sum_14d', 'transactions.es_food_sum_14d', 'transactions.es_hotelservices_sum_14d', 'transactions.es_barsandrestaurants_sum_14d', 'transactions.es_tech_sum_14d', 'transactions.es_sportsandtoys_sum_14d', 'transactions.es_wellnessandbeauty_sum_14d', 'transactions.es_hyper_sum_14d', 'transactions.es_fashion_sum_14d', 'transactions.es_home_sum_14d', 'transactions.es_travel_sum_14d', 'transactions.es_leisure_sum_14d', 'transactions.gender_F', 'transactions.gender_M', 'transactions.step', 'transactions.amount', 'transactions.timestamp_hour', 'transactions.timestamp_day_of_week']</div><div class=\"dictlist\">label_feature=labels.label</div><div class=\"dictlist\">target={'name': 'parquet', 'kind': 'parquet'}</div><div class=\"dictlist\">update_stats=True</div></td>\n",
692 |        "      <td><div class=\"dictlist\">return=<mlrun.feature_store.feature_vector.OfflineVectorResponse object at 0x7f1aba100fa0></div></td>\n",
693 |        "    </tr>\n",
694 |        "  </tbody>\n",
695 |        "</table>"
696 |       ],
697 |       "text/plain": [
698 |        "<IPython.core.display.HTML object>"
699 |       ]
700 |      },
701 |      "metadata": {},
702 |      "output_type": "display_data"
703 |     }
704 |    ],
705 |    "source": [
706 |     "import json\n",
707 |     "run_id = project.run(\n",
708 |     "    'main',\n",
709 |     "    arguments={'vector_name':\"transactions-fraud\",\n",
710 |     "               'features': json.dumps(features),\n",
711 |     "                'label_column':\"labels.label\",\n",
712 |     "              }, \n",
713 |     "    dirty=True, watch=True, engine='remote')"
714 |    ]
715 |   },
716 |   {
717 |    "cell_type": "markdown",
718 |    "metadata": {},
719 |    "source": [
720 |     "## Test the model endpoint\n"
721 |    ]
722 |   },
723 |   {
724 |    "cell_type": "markdown",
725 |    "metadata": {},
726 |    "source": [
727 |     "Now that your model is deployed using the pipeline, you can invoke it as usual:"
728 |    ]
729 |   },
730 |   {
731 |    "cell_type": "code",
732 |    "execution_count": 10,
733 |    "metadata": {},
734 |    "outputs": [
735 |     {
736 |      "name": "stdout",
737 |      "output_type": "stream",
738 |      "text": [
739 |       "> 2024-10-08 13:41:17,914 [info] Invoking function: {\"method\":\"POST\",\"path\":\"http://nuclio-fraud-demo-felipe-serving.default-tenant.svc.cluster.local:8080/v2/models/fraud/infer\"}\n"
740 |      ]
741 |     },
742 |     {
743 |      "data": {
744 |       "text/plain": [
745 |        "{'id': 'c9b5036c-8957-48dc-adba-d7f60ccd7812',\n",
746 |        " 'model_name': 'fraud',\n",
747 |        " 'outputs': [0],\n",
748 |        " 'timestamp': '2024-10-08 13:41:17.934699+00:00',\n",
749 |        " 'model_version': 'latest'}"
750 |       ]
751 |      },
752 |      "execution_count": 10,
753 |      "metadata": {},
754 |      "output_type": "execute_result"
755 |     }
756 |    ],
757 |    "source": [
758 |     "# Define your serving function\n",
759 |     "serving_fn = project.get_function('serving')\n",
760 |     "\n",
761 |     "# Choose an id for your test\n",
762 |     "sample_id = 'C1000148617'\n",
763 |     "model_inference_path = '/v2/models/fraud/infer'\n",
764 |     "\n",
765 |     "# Send our sample ID for prediction\n",
766 |     "serving_fn.invoke(path=model_inference_path,\n",
767 |     "                  body={'inputs': [[sample_id]]})"
768 |    ]
769 |   },
770 |   {
771 |    "cell_type": "markdown",
772 |    "metadata": {},
773 |    "source": [
774 |     "## Done!\n",
775 |     "\n",
776 |     "You've completed part 4 - the model training with the feature store.\n",
777 |     "Proceed to [Part 5](05-real-time-serving-pipeline.ipynb) to learn how to deploy real-time application pipelines."
778 |    ]
779 |   }
780 |  ],
781 |  "metadata": {
782 |   "kernelspec": {
783 |    "display_name": "Python 3 (ipykernel)",
784 |    "language": "python",
785 |    "name": "python3"
786 |   },
787 |   "language_info": {
788 |    "codemirror_mode": {
789 |     "name": "ipython",
790 |     "version": 3
791 |    },
792 |    "file_extension": ".py",
793 |    "mimetype": "text/x-python",
794 |    "name": "python",
795 |    "nbconvert_exporter": "python",
796 |    "pygments_lexer": "ipython3",
797 |    "version": "3.11.5"
798 |   }
799 |  },
800 |  "nbformat": 4,
801 |  "nbformat_minor": 4
802 | }
803 | 


--------------------------------------------------------------------------------
/02-interactive-data-preparation.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Interactive Data Preparation\n"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "markdown",
  12 |    "metadata": {},
  13 |    "source": [
  14 |     "Before training the model, you should clean the data and create meaningful features that will be good predictors for the target variable (was there a fraud?). This notebook (`02-interactive-data-preparation.ipynb`) demonstrates how to interactively build features for training the model. While this approach is simple, it is unsuitable for production environments with continuous data ingestion, large scale, or real-time requirements. In the next section, you will implement the same logic for production using a feature store."
  15 |    ]
  16 |   },
  17 |   {
  18 |    "cell_type": "markdown",
  19 |    "metadata": {},
  20 |    "source": [
  21 |     "The training set is built from three datasets: credit transactions, user events, and labels indicating if there was fraud. In this example, we prepare each dataset separately and combine them later for training."
  22 |    ]
  23 |   },
  24 |   {
  25 |    "cell_type": "markdown",
  26 |    "metadata": {},
  27 |    "source": [
  28 |     "## Preparing the Credit Transaction Dataset"
  29 |    ]
  30 |   },
  31 |   {
  32 |    "cell_type": "markdown",
  33 |    "metadata": {},
  34 |    "source": [
  35 |     "The following transformations create more meaningful features, which can have a more significant impact on the prediction than the raw data:\n",
  36 |     "    \n",
  37 |     "- Extracting the date components (hour, day of week) from the timestamp.\n",
  38 |     "- One-hot encoding for the age groups, transaction category, and the gender.\n",
  39 |     "- Aggregating the amount (avg., sum, count, max over 2/12/24 hour time windows).\n",
  40 |     "- Aggregating the transactions per category (over 14 days time windows).\n"
  41 |    ]
  42 |   },
  43 |   {
  44 |    "cell_type": "markdown",
  45 |    "metadata": {},
  46 |    "source": [
  47 |     "#### Building categorical features"
  48 |    ]
  49 |   },
  50 |   {
  51 |    "cell_type": "code",
  52 |    "execution_count": 12,
  53 |    "metadata": {},
  54 |    "outputs": [
  55 |     {
  56 |      "data": {
  57 |       "text/html": [
  58 |        "<div>\n",
  59 |        "<style scoped>\n",
  60 |        "    .dataframe tbody tr th:only-of-type {\n",
  61 |        "        vertical-align: middle;\n",
  62 |        "    }\n",
  63 |        "\n",
  64 |        "    .dataframe tbody tr th {\n",
  65 |        "        vertical-align: top;\n",
  66 |        "    }\n",
  67 |        "\n",
  68 |        "    .dataframe thead th {\n",
  69 |        "        text-align: right;\n",
  70 |        "    }\n",
  71 |        "</style>\n",
  72 |        "<table border=\"1\" class=\"dataframe\">\n",
  73 |        "  <thead>\n",
  74 |        "    <tr style=\"text-align: right;\">\n",
  75 |        "      <th></th>\n",
  76 |        "      <th>step</th>\n",
  77 |        "      <th>age</th>\n",
  78 |        "      <th>gender</th>\n",
  79 |        "      <th>zipcodeOri</th>\n",
  80 |        "      <th>zipMerchant</th>\n",
  81 |        "      <th>category</th>\n",
  82 |        "      <th>amount</th>\n",
  83 |        "      <th>fraud</th>\n",
  84 |        "      <th>timestamp</th>\n",
  85 |        "      <th>source</th>\n",
  86 |        "      <th>target</th>\n",
  87 |        "      <th>device</th>\n",
  88 |        "    </tr>\n",
  89 |        "  </thead>\n",
  90 |        "  <tbody>\n",
  91 |        "    <tr>\n",
  92 |        "      <th>274633</th>\n",
  93 |        "      <td>91</td>\n",
  94 |        "      <td>5</td>\n",
  95 |        "      <td>F</td>\n",
  96 |        "      <td>28007</td>\n",
  97 |        "      <td>28007</td>\n",
  98 |        "      <td>es_transportation</td>\n",
  99 |        "      <td>26.92</td>\n",
 100 |        "      <td>0</td>\n",
 101 |        "      <td>2024-10-06 07:02:33.778149000</td>\n",
 102 |        "      <td>C1022153336</td>\n",
 103 |        "      <td>M1823072687</td>\n",
 104 |        "      <td>33832bb8607545df97632a7ab02d69c4</td>\n",
 105 |        "    </tr>\n",
 106 |        "    <tr>\n",
 107 |        "      <th>286902</th>\n",
 108 |        "      <td>94</td>\n",
 109 |        "      <td>2</td>\n",
 110 |        "      <td>M</td>\n",
 111 |        "      <td>28007</td>\n",
 112 |        "      <td>28007</td>\n",
 113 |        "      <td>es_transportation</td>\n",
 114 |        "      <td>48.22</td>\n",
 115 |        "      <td>0</td>\n",
 116 |        "      <td>2024-10-06 07:02:52.071774913</td>\n",
 117 |        "      <td>C1006176917</td>\n",
 118 |        "      <td>M348934600</td>\n",
 119 |        "      <td>fadd829c49e74ffa86c8da3be75ada53</td>\n",
 120 |        "    </tr>\n",
 121 |        "    <tr>\n",
 122 |        "      <th>416998</th>\n",
 123 |        "      <td>131</td>\n",
 124 |        "      <td>3</td>\n",
 125 |        "      <td>M</td>\n",
 126 |        "      <td>28007</td>\n",
 127 |        "      <td>28007</td>\n",
 128 |        "      <td>es_transportation</td>\n",
 129 |        "      <td>17.56</td>\n",
 130 |        "      <td>0</td>\n",
 131 |        "      <td>2024-10-06 07:02:57.178944939</td>\n",
 132 |        "      <td>C1010936270</td>\n",
 133 |        "      <td>M348934600</td>\n",
 134 |        "      <td>58d0422a50bc40c89d2b4977b2f1beea</td>\n",
 135 |        "    </tr>\n",
 136 |        "  </tbody>\n",
 137 |        "</table>\n",
 138 |        "</div>"
 139 |       ],
 140 |       "text/plain": [
 141 |        "        step age gender  zipcodeOri  zipMerchant           category  amount  \\\n",
 142 |        "274633    91   5      F       28007        28007  es_transportation   26.92   \n",
 143 |        "286902    94   2      M       28007        28007  es_transportation   48.22   \n",
 144 |        "416998   131   3      M       28007        28007  es_transportation   17.56   \n",
 145 |        "\n",
 146 |        "        fraud                     timestamp       source       target  \\\n",
 147 |        "274633      0 2024-10-06 07:02:33.778149000  C1022153336  M1823072687   \n",
 148 |        "286902      0 2024-10-06 07:02:52.071774913  C1006176917   M348934600   \n",
 149 |        "416998      0 2024-10-06 07:02:57.178944939  C1010936270   M348934600   \n",
 150 |        "\n",
 151 |        "                                  device  \n",
 152 |        "274633  33832bb8607545df97632a7ab02d69c4  \n",
 153 |        "286902  fadd829c49e74ffa86c8da3be75ada53  \n",
 154 |        "416998  58d0422a50bc40c89d2b4977b2f1beea  "
 155 |       ]
 156 |      },
 157 |      "execution_count": 12,
 158 |      "metadata": {},
 159 |      "output_type": "execute_result"
 160 |     }
 161 |    ],
 162 |    "source": [
 163 |     "import pandas as pd\n",
 164 |     "from src.date_adjust import adjust_data_timespan\n",
 165 |     "import mlrun\n",
 166 |     "\n",
 167 |     "# Fetch the transactions and event datasets from mlrun data samples \n",
 168 |     "data_path = mlrun.get_sample_path(\"data/fraud-demo-mlrun-fs-docs/\")\n",
 169 |     "transactions_data = pd.read_csv(data_path + \"data.csv\", parse_dates=['timestamp'])\n",
 170 |     "\n",
 171 |     "# use only first 10k\n",
 172 |     "transactions_data = transactions_data.sort_values(by='source', axis=0)[:10000]\n",
 173 |     "\n",
 174 |     "# Adjust the samples timestamp for the past 2 days\n",
 175 |     "transactions_data = adjust_data_timespan(transactions_data, new_period='2d')\n",
 176 |     "\n",
 177 |     "# Preview\n",
 178 |     "transactions_data.head(3)"
 179 |    ]
 180 |   },
 181 |   {
 182 |    "cell_type": "code",
 183 |    "execution_count": 13,
 184 |    "metadata": {},
 185 |    "outputs": [
 186 |     {
 187 |      "data": {
 188 |       "text/plain": [
 189 |        "Index(['step', 'age', 'gender', 'zipcodeOri', 'zipMerchant', 'category',\n",
 190 |        "       'amount', 'fraud', 'timestamp', 'source', 'target', 'device'],\n",
 191 |        "      dtype='object')"
 192 |       ]
 193 |      },
 194 |      "execution_count": 13,
 195 |      "metadata": {},
 196 |      "output_type": "execute_result"
 197 |     }
 198 |    ],
 199 |    "source": [
 200 |     "transactions_data.columns"
 201 |    ]
 202 |   },
 203 |   {
 204 |    "cell_type": "markdown",
 205 |    "metadata": {},
 206 |    "source": [
 207 |     "The next part is aggregating the transaction amounts by time windows and transaction categories, providing you with a long list of derived features that can potentially help make better predictions."
 208 |    ]
 209 |   },
 210 |   {
 211 |    "cell_type": "code",
 212 |    "execution_count": 14,
 213 |    "metadata": {},
 214 |    "outputs": [
 215 |     {
 216 |      "data": {
 217 |       "text/html": [
 218 |        "<div>\n",
 219 |        "<style scoped>\n",
 220 |        "    .dataframe tbody tr th:only-of-type {\n",
 221 |        "        vertical-align: middle;\n",
 222 |        "    }\n",
 223 |        "\n",
 224 |        "    .dataframe tbody tr th {\n",
 225 |        "        vertical-align: top;\n",
 226 |        "    }\n",
 227 |        "\n",
 228 |        "    .dataframe thead th {\n",
 229 |        "        text-align: right;\n",
 230 |        "    }\n",
 231 |        "</style>\n",
 232 |        "<table border=\"1\" class=\"dataframe\">\n",
 233 |        "  <thead>\n",
 234 |        "    <tr style=\"text-align: right;\">\n",
 235 |        "      <th></th>\n",
 236 |        "      <th>step</th>\n",
 237 |        "      <th>age</th>\n",
 238 |        "      <th>zipcodeOri</th>\n",
 239 |        "      <th>zipMerchant</th>\n",
 240 |        "      <th>amount</th>\n",
 241 |        "      <th>fraud</th>\n",
 242 |        "      <th>timestamp</th>\n",
 243 |        "      <th>source</th>\n",
 244 |        "      <th>target</th>\n",
 245 |        "      <th>device</th>\n",
 246 |        "      <th>...</th>\n",
 247 |        "      <th>category_es_hyper</th>\n",
 248 |        "      <th>category_es_leisure</th>\n",
 249 |        "      <th>category_es_otherservices</th>\n",
 250 |        "      <th>category_es_sportsandtoys</th>\n",
 251 |        "      <th>category_es_tech</th>\n",
 252 |        "      <th>category_es_transportation</th>\n",
 253 |        "      <th>category_es_travel</th>\n",
 254 |        "      <th>category_es_wellnessandbeauty</th>\n",
 255 |        "      <th>gender_F</th>\n",
 256 |        "      <th>gender_M</th>\n",
 257 |        "    </tr>\n",
 258 |        "  </thead>\n",
 259 |        "  <tbody>\n",
 260 |        "    <tr>\n",
 261 |        "      <th>274633</th>\n",
 262 |        "      <td>91</td>\n",
 263 |        "      <td>5</td>\n",
 264 |        "      <td>28007</td>\n",
 265 |        "      <td>28007</td>\n",
 266 |        "      <td>26.92</td>\n",
 267 |        "      <td>0</td>\n",
 268 |        "      <td>2024-10-06 07:02:33.778149000</td>\n",
 269 |        "      <td>C1022153336</td>\n",
 270 |        "      <td>M1823072687</td>\n",
 271 |        "      <td>33832bb8607545df97632a7ab02d69c4</td>\n",
 272 |        "      <td>...</td>\n",
 273 |        "      <td>False</td>\n",
 274 |        "      <td>False</td>\n",
 275 |        "      <td>False</td>\n",
 276 |        "      <td>False</td>\n",
 277 |        "      <td>False</td>\n",
 278 |        "      <td>True</td>\n",
 279 |        "      <td>False</td>\n",
 280 |        "      <td>False</td>\n",
 281 |        "      <td>True</td>\n",
 282 |        "      <td>False</td>\n",
 283 |        "    </tr>\n",
 284 |        "    <tr>\n",
 285 |        "      <th>286902</th>\n",
 286 |        "      <td>94</td>\n",
 287 |        "      <td>2</td>\n",
 288 |        "      <td>28007</td>\n",
 289 |        "      <td>28007</td>\n",
 290 |        "      <td>48.22</td>\n",
 291 |        "      <td>0</td>\n",
 292 |        "      <td>2024-10-06 07:02:52.071774913</td>\n",
 293 |        "      <td>C1006176917</td>\n",
 294 |        "      <td>M348934600</td>\n",
 295 |        "      <td>fadd829c49e74ffa86c8da3be75ada53</td>\n",
 296 |        "      <td>...</td>\n",
 297 |        "      <td>False</td>\n",
 298 |        "      <td>False</td>\n",
 299 |        "      <td>False</td>\n",
 300 |        "      <td>False</td>\n",
 301 |        "      <td>False</td>\n",
 302 |        "      <td>True</td>\n",
 303 |        "      <td>False</td>\n",
 304 |        "      <td>False</td>\n",
 305 |        "      <td>False</td>\n",
 306 |        "      <td>True</td>\n",
 307 |        "    </tr>\n",
 308 |        "    <tr>\n",
 309 |        "      <th>416998</th>\n",
 310 |        "      <td>131</td>\n",
 311 |        "      <td>3</td>\n",
 312 |        "      <td>28007</td>\n",
 313 |        "      <td>28007</td>\n",
 314 |        "      <td>17.56</td>\n",
 315 |        "      <td>0</td>\n",
 316 |        "      <td>2024-10-06 07:02:57.178944939</td>\n",
 317 |        "      <td>C1010936270</td>\n",
 318 |        "      <td>M348934600</td>\n",
 319 |        "      <td>58d0422a50bc40c89d2b4977b2f1beea</td>\n",
 320 |        "      <td>...</td>\n",
 321 |        "      <td>False</td>\n",
 322 |        "      <td>False</td>\n",
 323 |        "      <td>False</td>\n",
 324 |        "      <td>False</td>\n",
 325 |        "      <td>False</td>\n",
 326 |        "      <td>True</td>\n",
 327 |        "      <td>False</td>\n",
 328 |        "      <td>False</td>\n",
 329 |        "      <td>False</td>\n",
 330 |        "      <td>True</td>\n",
 331 |        "    </tr>\n",
 332 |        "    <tr>\n",
 333 |        "      <th>334543</th>\n",
 334 |        "      <td>108</td>\n",
 335 |        "      <td>4</td>\n",
 336 |        "      <td>28007</td>\n",
 337 |        "      <td>28007</td>\n",
 338 |        "      <td>4.50</td>\n",
 339 |        "      <td>0</td>\n",
 340 |        "      <td>2024-10-06 07:03:09.471696118</td>\n",
 341 |        "      <td>C1033736586</td>\n",
 342 |        "      <td>M1823072687</td>\n",
 343 |        "      <td>30b269ae55984e5584f1dd5f642ac1a3</td>\n",
 344 |        "      <td>...</td>\n",
 345 |        "      <td>False</td>\n",
 346 |        "      <td>False</td>\n",
 347 |        "      <td>False</td>\n",
 348 |        "      <td>False</td>\n",
 349 |        "      <td>False</td>\n",
 350 |        "      <td>True</td>\n",
 351 |        "      <td>False</td>\n",
 352 |        "      <td>False</td>\n",
 353 |        "      <td>True</td>\n",
 354 |        "      <td>False</td>\n",
 355 |        "    </tr>\n",
 356 |        "    <tr>\n",
 357 |        "      <th>210647</th>\n",
 358 |        "      <td>72</td>\n",
 359 |        "      <td>4</td>\n",
 360 |        "      <td>28007</td>\n",
 361 |        "      <td>28007</td>\n",
 362 |        "      <td>1.83</td>\n",
 363 |        "      <td>0</td>\n",
 364 |        "      <td>2024-10-06 07:03:43.360778001</td>\n",
 365 |        "      <td>C1019071188</td>\n",
 366 |        "      <td>M348934600</td>\n",
 367 |        "      <td>97bee3503a984f59aa6139b59f933c0b</td>\n",
 368 |        "      <td>...</td>\n",
 369 |        "      <td>False</td>\n",
 370 |        "      <td>False</td>\n",
 371 |        "      <td>False</td>\n",
 372 |        "      <td>False</td>\n",
 373 |        "      <td>False</td>\n",
 374 |        "      <td>True</td>\n",
 375 |        "      <td>False</td>\n",
 376 |        "      <td>False</td>\n",
 377 |        "      <td>False</td>\n",
 378 |        "      <td>True</td>\n",
 379 |        "    </tr>\n",
 380 |        "  </tbody>\n",
 381 |        "</table>\n",
 382 |        "<p>5 rows × 30 columns</p>\n",
 383 |        "</div>"
 384 |       ],
 385 |       "text/plain": [
 386 |        "        step age  zipcodeOri  zipMerchant  amount  fraud  \\\n",
 387 |        "274633    91   5       28007        28007   26.92      0   \n",
 388 |        "286902    94   2       28007        28007   48.22      0   \n",
 389 |        "416998   131   3       28007        28007   17.56      0   \n",
 390 |        "334543   108   4       28007        28007    4.50      0   \n",
 391 |        "210647    72   4       28007        28007    1.83      0   \n",
 392 |        "\n",
 393 |        "                           timestamp       source       target  \\\n",
 394 |        "274633 2024-10-06 07:02:33.778149000  C1022153336  M1823072687   \n",
 395 |        "286902 2024-10-06 07:02:52.071774913  C1006176917   M348934600   \n",
 396 |        "416998 2024-10-06 07:02:57.178944939  C1010936270   M348934600   \n",
 397 |        "334543 2024-10-06 07:03:09.471696118  C1033736586  M1823072687   \n",
 398 |        "210647 2024-10-06 07:03:43.360778001  C1019071188   M348934600   \n",
 399 |        "\n",
 400 |        "                                  device  ...  category_es_hyper  \\\n",
 401 |        "274633  33832bb8607545df97632a7ab02d69c4  ...              False   \n",
 402 |        "286902  fadd829c49e74ffa86c8da3be75ada53  ...              False   \n",
 403 |        "416998  58d0422a50bc40c89d2b4977b2f1beea  ...              False   \n",
 404 |        "334543  30b269ae55984e5584f1dd5f642ac1a3  ...              False   \n",
 405 |        "210647  97bee3503a984f59aa6139b59f933c0b  ...              False   \n",
 406 |        "\n",
 407 |        "        category_es_leisure category_es_otherservices  \\\n",
 408 |        "274633                False                     False   \n",
 409 |        "286902                False                     False   \n",
 410 |        "416998                False                     False   \n",
 411 |        "334543                False                     False   \n",
 412 |        "210647                False                     False   \n",
 413 |        "\n",
 414 |        "        category_es_sportsandtoys  category_es_tech  \\\n",
 415 |        "274633                      False             False   \n",
 416 |        "286902                      False             False   \n",
 417 |        "416998                      False             False   \n",
 418 |        "334543                      False             False   \n",
 419 |        "210647                      False             False   \n",
 420 |        "\n",
 421 |        "        category_es_transportation  category_es_travel  \\\n",
 422 |        "274633                        True               False   \n",
 423 |        "286902                        True               False   \n",
 424 |        "416998                        True               False   \n",
 425 |        "334543                        True               False   \n",
 426 |        "210647                        True               False   \n",
 427 |        "\n",
 428 |        "        category_es_wellnessandbeauty  gender_F  gender_M  \n",
 429 |        "274633                          False      True     False  \n",
 430 |        "286902                          False     False      True  \n",
 431 |        "416998                          False     False      True  \n",
 432 |        "334543                          False      True     False  \n",
 433 |        "210647                          False     False      True  \n",
 434 |        "\n",
 435 |        "[5 rows x 30 columns]"
 436 |       ]
 437 |      },
 438 |      "execution_count": 14,
 439 |      "metadata": {},
 440 |      "output_type": "execute_result"
 441 |     }
 442 |    ],
 443 |    "source": [
 444 |     "processed_transactions = transactions_data\n",
 445 |     "\n",
 446 |     "# Generate day and hour columns from the timestamp\n",
 447 |     "processed_transactions['day_of_week'] = processed_transactions['timestamp'].dt.weekday\n",
 448 |     "processed_transactions['hour'] = processed_transactions['timestamp'].dt.hour\n",
 449 |     "\n",
 450 |     "# Map age groups\n",
 451 |     "processed_transactions[\"age_mapped\"] = processed_transactions[\"age\"].map(\n",
 452 |     "    lambda x: {'U': '0'}.get(x, x)\n",
 453 |     ")\n",
 454 |     "\n",
 455 |     "# encode categories and gender groups (using one hot encoding)\n",
 456 |     "processed_transactions = pd.get_dummies(processed_transactions, columns=['category', 'gender'])\n",
 457 |     "processed_transactions.head()"
 458 |    ]
 459 |   },
 460 |   {
 461 |    "cell_type": "code",
 462 |    "execution_count": 15,
 463 |    "metadata": {},
 464 |    "outputs": [],
 465 |    "source": [
 466 |     "transactions_for_agg = processed_transactions.set_index(['timestamp'],)\n",
 467 |     "\n",
 468 |     "# Group/Aggregate amount stats (mean, max, ..) by time windows\n",
 469 |     "windows=['2H', '12H', '24H']\n",
 470 |     "operation = ['mean','sum', 'count','max']\n",
 471 |     "for window in windows:\n",
 472 |     "    for op in operation:\n",
 473 |     "        processed_transactions[f'amount_{op}_{window}'] = transactions_for_agg.groupby(['source', pd.Grouper(freq=window)])['amount'].transform(op).values"
 474 |    ]
 475 |   },
 476 |   {
 477 |    "cell_type": "code",
 478 |    "execution_count": 16,
 479 |    "metadata": {},
 480 |    "outputs": [
 481 |     {
 482 |      "data": {
 483 |       "text/html": [
 484 |        "<div>\n",
 485 |        "<style scoped>\n",
 486 |        "    .dataframe tbody tr th:only-of-type {\n",
 487 |        "        vertical-align: middle;\n",
 488 |        "    }\n",
 489 |        "\n",
 490 |        "    .dataframe tbody tr th {\n",
 491 |        "        vertical-align: top;\n",
 492 |        "    }\n",
 493 |        "\n",
 494 |        "    .dataframe thead th {\n",
 495 |        "        text-align: right;\n",
 496 |        "    }\n",
 497 |        "</style>\n",
 498 |        "<table border=\"1\" class=\"dataframe\">\n",
 499 |        "  <thead>\n",
 500 |        "    <tr style=\"text-align: right;\">\n",
 501 |        "      <th></th>\n",
 502 |        "      <th>step</th>\n",
 503 |        "      <th>age</th>\n",
 504 |        "      <th>zipcodeOri</th>\n",
 505 |        "      <th>zipMerchant</th>\n",
 506 |        "      <th>amount</th>\n",
 507 |        "      <th>fraud</th>\n",
 508 |        "      <th>timestamp</th>\n",
 509 |        "      <th>target</th>\n",
 510 |        "      <th>device</th>\n",
 511 |        "      <th>day_of_week</th>\n",
 512 |        "      <th>...</th>\n",
 513 |        "      <th>es_barsandrestaurants_sum_14D</th>\n",
 514 |        "      <th>es_tech_sum_14D</th>\n",
 515 |        "      <th>es_sportsandtoys_sum_14D</th>\n",
 516 |        "      <th>es_wellnessandbeauty_sum_14D</th>\n",
 517 |        "      <th>es_hyper_sum_14D</th>\n",
 518 |        "      <th>es_fashion_sum_14D</th>\n",
 519 |        "      <th>es_home_sum_14D</th>\n",
 520 |        "      <th>es_contents_sum_14D</th>\n",
 521 |        "      <th>es_travel_sum_14D</th>\n",
 522 |        "      <th>es_leisure_sum_14D</th>\n",
 523 |        "    </tr>\n",
 524 |        "    <tr>\n",
 525 |        "      <th>source</th>\n",
 526 |        "      <th></th>\n",
 527 |        "      <th></th>\n",
 528 |        "      <th></th>\n",
 529 |        "      <th></th>\n",
 530 |        "      <th></th>\n",
 531 |        "      <th></th>\n",
 532 |        "      <th></th>\n",
 533 |        "      <th></th>\n",
 534 |        "      <th></th>\n",
 535 |        "      <th></th>\n",
 536 |        "      <th></th>\n",
 537 |        "      <th></th>\n",
 538 |        "      <th></th>\n",
 539 |        "      <th></th>\n",
 540 |        "      <th></th>\n",
 541 |        "      <th></th>\n",
 542 |        "      <th></th>\n",
 543 |        "      <th></th>\n",
 544 |        "      <th></th>\n",
 545 |        "      <th></th>\n",
 546 |        "      <th></th>\n",
 547 |        "    </tr>\n",
 548 |        "  </thead>\n",
 549 |        "  <tbody>\n",
 550 |        "    <tr>\n",
 551 |        "      <th>C1022153336</th>\n",
 552 |        "      <td>91</td>\n",
 553 |        "      <td>5</td>\n",
 554 |        "      <td>28007</td>\n",
 555 |        "      <td>28007</td>\n",
 556 |        "      <td>26.92</td>\n",
 557 |        "      <td>0</td>\n",
 558 |        "      <td>2024-10-06 07:02:33.778149000</td>\n",
 559 |        "      <td>M1823072687</td>\n",
 560 |        "      <td>33832bb8607545df97632a7ab02d69c4</td>\n",
 561 |        "      <td>6</td>\n",
 562 |        "      <td>...</td>\n",
 563 |        "      <td>1</td>\n",
 564 |        "      <td>1</td>\n",
 565 |        "      <td>1</td>\n",
 566 |        "      <td>1</td>\n",
 567 |        "      <td>0</td>\n",
 568 |        "      <td>1</td>\n",
 569 |        "      <td>0</td>\n",
 570 |        "      <td>0</td>\n",
 571 |        "      <td>0</td>\n",
 572 |        "      <td>0</td>\n",
 573 |        "    </tr>\n",
 574 |        "    <tr>\n",
 575 |        "      <th>C1006176917</th>\n",
 576 |        "      <td>94</td>\n",
 577 |        "      <td>2</td>\n",
 578 |        "      <td>28007</td>\n",
 579 |        "      <td>28007</td>\n",
 580 |        "      <td>48.22</td>\n",
 581 |        "      <td>0</td>\n",
 582 |        "      <td>2024-10-06 07:02:52.071774913</td>\n",
 583 |        "      <td>M348934600</td>\n",
 584 |        "      <td>fadd829c49e74ffa86c8da3be75ada53</td>\n",
 585 |        "      <td>6</td>\n",
 586 |        "      <td>...</td>\n",
 587 |        "      <td>4</td>\n",
 588 |        "      <td>0</td>\n",
 589 |        "      <td>1</td>\n",
 590 |        "      <td>1</td>\n",
 591 |        "      <td>0</td>\n",
 592 |        "      <td>2</td>\n",
 593 |        "      <td>0</td>\n",
 594 |        "      <td>0</td>\n",
 595 |        "      <td>0</td>\n",
 596 |        "      <td>0</td>\n",
 597 |        "    </tr>\n",
 598 |        "    <tr>\n",
 599 |        "      <th>C1010936270</th>\n",
 600 |        "      <td>131</td>\n",
 601 |        "      <td>3</td>\n",
 602 |        "      <td>28007</td>\n",
 603 |        "      <td>28007</td>\n",
 604 |        "      <td>17.56</td>\n",
 605 |        "      <td>0</td>\n",
 606 |        "      <td>2024-10-06 07:02:57.178944939</td>\n",
 607 |        "      <td>M348934600</td>\n",
 608 |        "      <td>58d0422a50bc40c89d2b4977b2f1beea</td>\n",
 609 |        "      <td>6</td>\n",
 610 |        "      <td>...</td>\n",
 611 |        "      <td>4</td>\n",
 612 |        "      <td>0</td>\n",
 613 |        "      <td>0</td>\n",
 614 |        "      <td>6</td>\n",
 615 |        "      <td>6</td>\n",
 616 |        "      <td>0</td>\n",
 617 |        "      <td>0</td>\n",
 618 |        "      <td>0</td>\n",
 619 |        "      <td>0</td>\n",
 620 |        "      <td>0</td>\n",
 621 |        "    </tr>\n",
 622 |        "    <tr>\n",
 623 |        "      <th>C1033736586</th>\n",
 624 |        "      <td>108</td>\n",
 625 |        "      <td>4</td>\n",
 626 |        "      <td>28007</td>\n",
 627 |        "      <td>28007</td>\n",
 628 |        "      <td>4.50</td>\n",
 629 |        "      <td>0</td>\n",
 630 |        "      <td>2024-10-06 07:03:09.471696118</td>\n",
 631 |        "      <td>M1823072687</td>\n",
 632 |        "      <td>30b269ae55984e5584f1dd5f642ac1a3</td>\n",
 633 |        "      <td>6</td>\n",
 634 |        "      <td>...</td>\n",
 635 |        "      <td>3</td>\n",
 636 |        "      <td>2</td>\n",
 637 |        "      <td>0</td>\n",
 638 |        "      <td>1</td>\n",
 639 |        "      <td>3</td>\n",
 640 |        "      <td>0</td>\n",
 641 |        "      <td>2</td>\n",
 642 |        "      <td>0</td>\n",
 643 |        "      <td>1</td>\n",
 644 |        "      <td>0</td>\n",
 645 |        "    </tr>\n",
 646 |        "    <tr>\n",
 647 |        "      <th>C1019071188</th>\n",
 648 |        "      <td>72</td>\n",
 649 |        "      <td>4</td>\n",
 650 |        "      <td>28007</td>\n",
 651 |        "      <td>28007</td>\n",
 652 |        "      <td>1.83</td>\n",
 653 |        "      <td>0</td>\n",
 654 |        "      <td>2024-10-06 07:03:43.360778001</td>\n",
 655 |        "      <td>M348934600</td>\n",
 656 |        "      <td>97bee3503a984f59aa6139b59f933c0b</td>\n",
 657 |        "      <td>6</td>\n",
 658 |        "      <td>...</td>\n",
 659 |        "      <td>1</td>\n",
 660 |        "      <td>0</td>\n",
 661 |        "      <td>0</td>\n",
 662 |        "      <td>0</td>\n",
 663 |        "      <td>1</td>\n",
 664 |        "      <td>4</td>\n",
 665 |        "      <td>0</td>\n",
 666 |        "      <td>1</td>\n",
 667 |        "      <td>1</td>\n",
 668 |        "      <td>0</td>\n",
 669 |        "    </tr>\n",
 670 |        "  </tbody>\n",
 671 |        "</table>\n",
 672 |        "<p>5 rows × 56 columns</p>\n",
 673 |        "</div>"
 674 |       ],
 675 |       "text/plain": [
 676 |        "             step age  zipcodeOri  zipMerchant  amount  fraud  \\\n",
 677 |        "source                                                          \n",
 678 |        "C1022153336    91   5       28007        28007   26.92      0   \n",
 679 |        "C1006176917    94   2       28007        28007   48.22      0   \n",
 680 |        "C1010936270   131   3       28007        28007   17.56      0   \n",
 681 |        "C1033736586   108   4       28007        28007    4.50      0   \n",
 682 |        "C1019071188    72   4       28007        28007    1.83      0   \n",
 683 |        "\n",
 684 |        "                                timestamp       target  \\\n",
 685 |        "source                                                   \n",
 686 |        "C1022153336 2024-10-06 07:02:33.778149000  M1823072687   \n",
 687 |        "C1006176917 2024-10-06 07:02:52.071774913   M348934600   \n",
 688 |        "C1010936270 2024-10-06 07:02:57.178944939   M348934600   \n",
 689 |        "C1033736586 2024-10-06 07:03:09.471696118  M1823072687   \n",
 690 |        "C1019071188 2024-10-06 07:03:43.360778001   M348934600   \n",
 691 |        "\n",
 692 |        "                                       device  day_of_week  ...  \\\n",
 693 |        "source                                                      ...   \n",
 694 |        "C1022153336  33832bb8607545df97632a7ab02d69c4            6  ...   \n",
 695 |        "C1006176917  fadd829c49e74ffa86c8da3be75ada53            6  ...   \n",
 696 |        "C1010936270  58d0422a50bc40c89d2b4977b2f1beea            6  ...   \n",
 697 |        "C1033736586  30b269ae55984e5584f1dd5f642ac1a3            6  ...   \n",
 698 |        "C1019071188  97bee3503a984f59aa6139b59f933c0b            6  ...   \n",
 699 |        "\n",
 700 |        "             es_barsandrestaurants_sum_14D es_tech_sum_14D  \\\n",
 701 |        "source                                                       \n",
 702 |        "C1022153336                              1               1   \n",
 703 |        "C1006176917                              4               0   \n",
 704 |        "C1010936270                              4               0   \n",
 705 |        "C1033736586                              3               2   \n",
 706 |        "C1019071188                              1               0   \n",
 707 |        "\n",
 708 |        "             es_sportsandtoys_sum_14D  es_wellnessandbeauty_sum_14D  \\\n",
 709 |        "source                                                                \n",
 710 |        "C1022153336                         1                             1   \n",
 711 |        "C1006176917                         1                             1   \n",
 712 |        "C1010936270                         0                             6   \n",
 713 |        "C1033736586                         0                             1   \n",
 714 |        "C1019071188                         0                             0   \n",
 715 |        "\n",
 716 |        "             es_hyper_sum_14D  es_fashion_sum_14D  es_home_sum_14D  \\\n",
 717 |        "source                                                               \n",
 718 |        "C1022153336                 0                   1                0   \n",
 719 |        "C1006176917                 0                   2                0   \n",
 720 |        "C1010936270                 6                   0                0   \n",
 721 |        "C1033736586                 3                   0                2   \n",
 722 |        "C1019071188                 1                   4                0   \n",
 723 |        "\n",
 724 |        "             es_contents_sum_14D  es_travel_sum_14D  es_leisure_sum_14D  \n",
 725 |        "source                                                                   \n",
 726 |        "C1022153336                    0                  0                   0  \n",
 727 |        "C1006176917                    0                  0                   0  \n",
 728 |        "C1010936270                    0                  0                   0  \n",
 729 |        "C1033736586                    0                  1                   0  \n",
 730 |        "C1019071188                    1                  1                   0  \n",
 731 |        "\n",
 732 |        "[5 rows x 56 columns]"
 733 |       ]
 734 |      },
 735 |      "execution_count": 16,
 736 |      "metadata": {},
 737 |      "output_type": "execute_result"
 738 |     }
 739 |    ],
 740 |    "source": [
 741 |     "# Count each source's transactions per category in 14-day windows (sum of the boolean category_* indicator columns)\n",
 742 |     "main_categories = [\"es_transportation\", \"es_health\", \"es_otherservices\",\n",
 743 |     "       \"es_food\", \"es_hotelservices\", \"es_barsandrestaurants\",\n",
 744 |     "       \"es_tech\", \"es_sportsandtoys\", \"es_wellnessandbeauty\",\n",
 745 |     "       \"es_hyper\", \"es_fashion\", \"es_home\", \"es_contents\",\n",
 746 |     "       \"es_travel\", \"es_leisure\"]\n",
 747 |     "for category in main_categories:\n",
 748 |     "    processed_transactions[f'{category}_sum_14D'] = transactions_for_agg.groupby(['source', pd.Grouper(freq='14D')])[f'category_{category}'].transform('sum').values\n",
 749 |     "\n",
 750 |     "processed_transactions.set_index(['source'], inplace=True)\n",
 751 |     "processed_transactions.head()"
 752 |    ]
 753 |   },
 754 |   {
 755 |    "cell_type": "code",
 756 |    "execution_count": 17,
 757 |    "metadata": {},
 758 |    "outputs": [
 759 |     {
 760 |      "data": {
 761 |       "text/plain": [
 762 |        "step                                       int64\n",
 763 |        "age                                       object\n",
 764 |        "zipcodeOri                                 int64\n",
 765 |        "zipMerchant                                int64\n",
 766 |        "amount                                   float64\n",
 767 |        "fraud                                      int64\n",
 768 |        "timestamp                         datetime64[ns]\n",
 769 |        "target                                    object\n",
 770 |        "device                                    object\n",
 771 |        "day_of_week                                int32\n",
 772 |        "hour                                       int32\n",
 773 |        "age_mapped                                object\n",
 774 |        "category_es_barsandrestaurants              bool\n",
 775 |        "category_es_contents                        bool\n",
 776 |        "category_es_fashion                         bool\n",
 777 |        "category_es_food                            bool\n",
 778 |        "category_es_health                          bool\n",
 779 |        "category_es_home                            bool\n",
 780 |        "category_es_hotelservices                   bool\n",
 781 |        "category_es_hyper                           bool\n",
 782 |        "category_es_leisure                         bool\n",
 783 |        "category_es_otherservices                   bool\n",
 784 |        "category_es_sportsandtoys                   bool\n",
 785 |        "category_es_tech                            bool\n",
 786 |        "category_es_transportation                  bool\n",
 787 |        "category_es_travel                          bool\n",
 788 |        "category_es_wellnessandbeauty               bool\n",
 789 |        "gender_F                                    bool\n",
 790 |        "gender_M                                    bool\n",
 791 |        "amount_mean_2H                           float64\n",
 792 |        "amount_sum_2H                            float64\n",
 793 |        "amount_count_2H                            int64\n",
 794 |        "amount_max_2H                            float64\n",
 795 |        "amount_mean_12H                          float64\n",
 796 |        "amount_sum_12H                           float64\n",
 797 |        "amount_count_12H                           int64\n",
 798 |        "amount_max_12H                           float64\n",
 799 |        "amount_mean_24H                          float64\n",
 800 |        "amount_sum_24H                           float64\n",
 801 |        "amount_count_24H                           int64\n",
 802 |        "amount_max_24H                           float64\n",
 803 |        "es_transportation_sum_14D                  int64\n",
 804 |        "es_health_sum_14D                          int64\n",
 805 |        "es_otherservices_sum_14D                   int64\n",
 806 |        "es_food_sum_14D                            int64\n",
 807 |        "es_hotelservices_sum_14D                   int64\n",
 808 |        "es_barsandrestaurants_sum_14D              int64\n",
 809 |        "es_tech_sum_14D                            int64\n",
 810 |        "es_sportsandtoys_sum_14D                   int64\n",
 811 |        "es_wellnessandbeauty_sum_14D               int64\n",
 812 |        "es_hyper_sum_14D                           int64\n",
 813 |        "es_fashion_sum_14D                         int64\n",
 814 |        "es_home_sum_14D                            int64\n",
 815 |        "es_contents_sum_14D                        int64\n",
 816 |        "es_travel_sum_14D                          int64\n",
 817 |        "es_leisure_sum_14D                         int64\n",
 818 |        "dtype: object"
 819 |       ]
 820 |      },
 821 |      "execution_count": 17,
 822 |      "metadata": {},
 823 |      "output_type": "execute_result"
 824 |     }
 825 |    ],
 826 |    "source": [
 827 |     "processed_transactions.dtypes"
 828 |    ]
 829 |   },
 830 |   {
 831 |    "cell_type": "markdown",
 832 |    "metadata": {},
 833 |    "source": [
 834 |     "## Preparing the User Events(Activities) Dataset"
 835 |    ]
 836 |   },
 837 |   {
 838 |    "cell_type": "markdown",
 839 |    "metadata": {},
 840 |    "source": [
 841 |     "The events dataset contains user activities, such as logins, detail changes, and password changes, that can hint at a fraud attempt. The next part shows how to load the events dataset and create categorical features per event type."
 842 |    ]
 843 |   },
 844 |   {
 845 |    "cell_type": "markdown",
 846 |    "metadata": {},
 847 |    "source": [
 848 |     "### Processing the events dataset"
 849 |    ]
 850 |   },
 851 |   {
 852 |    "cell_type": "code",
 853 |    "execution_count": 18,
 854 |    "metadata": {},
 855 |    "outputs": [
 856 |     {
 857 |      "data": {
 858 |       "text/html": [
 859 |        "<div>\n",
 860 |        "<style scoped>\n",
 861 |        "    .dataframe tbody tr th:only-of-type {\n",
 862 |        "        vertical-align: middle;\n",
 863 |        "    }\n",
 864 |        "\n",
 865 |        "    .dataframe tbody tr th {\n",
 866 |        "        vertical-align: top;\n",
 867 |        "    }\n",
 868 |        "\n",
 869 |        "    .dataframe thead th {\n",
 870 |        "        text-align: right;\n",
 871 |        "    }\n",
 872 |        "</style>\n",
 873 |        "<table border=\"1\" class=\"dataframe\">\n",
 874 |        "  <thead>\n",
 875 |        "    <tr style=\"text-align: right;\">\n",
 876 |        "      <th></th>\n",
 877 |        "      <th>source</th>\n",
 878 |        "      <th>event</th>\n",
 879 |        "      <th>timestamp</th>\n",
 880 |        "    </tr>\n",
 881 |        "  </thead>\n",
 882 |        "  <tbody>\n",
 883 |        "    <tr>\n",
 884 |        "      <th>45553</th>\n",
 885 |        "      <td>C137986193</td>\n",
 886 |        "      <td>password_change</td>\n",
 887 |        "      <td>2024-10-06 07:02:34.836980000</td>\n",
 888 |        "    </tr>\n",
 889 |        "    <tr>\n",
 890 |        "      <th>24134</th>\n",
 891 |        "      <td>C1940951230</td>\n",
 892 |        "      <td>details_change</td>\n",
 893 |        "      <td>2024-10-06 07:02:35.885162091</td>\n",
 894 |        "    </tr>\n",
 895 |        "    <tr>\n",
 896 |        "      <th>64444</th>\n",
 897 |        "      <td>C247537602</td>\n",
 898 |        "      <td>login</td>\n",
 899 |        "      <td>2024-10-06 07:02:37.539945103</td>\n",
 900 |        "    </tr>\n",
 901 |        "  </tbody>\n",
 902 |        "</table>\n",
 903 |        "</div>"
 904 |       ],
 905 |       "text/plain": [
 906 |        "            source            event                     timestamp\n",
 907 |        "45553   C137986193  password_change 2024-10-06 07:02:34.836980000\n",
 908 |        "24134  C1940951230   details_change 2024-10-06 07:02:35.885162091\n",
 909 |        "64444   C247537602            login 2024-10-06 07:02:37.539945103"
 910 |       ]
 911 |      },
 912 |      "execution_count": 18,
 913 |      "metadata": {},
 914 |      "output_type": "execute_result"
 915 |     }
 916 |    ],
 917 |    "source": [
 918 |     "# Fetch the user_events dataset from the server\n",
 919 |     "user_events_data = pd.read_csv(data_path + \"events.csv\", \n",
 920 |     "                               index_col=0, quotechar=\"\\'\", parse_dates=['timestamp'])\n",
 921 |     "\n",
 922 |     "# Adjust to the last 2 days to see the latest aggregations in the online feature vectors\n",
 923 |     "user_events_data = adjust_data_timespan(user_events_data, new_period='2d')\n",
 924 |     "\n",
 925 |     "# Preview\n",
 926 |     "user_events_data.head(3)"
 927 |    ]
 928 |   },
 929 |   {
 930 |    "cell_type": "code",
 931 |    "execution_count": 19,
 932 |    "metadata": {},
 933 |    "outputs": [
 934 |     {
 935 |      "data": {
 936 |       "text/html": [
 937 |        "<div>\n",
 938 |        "<style scoped>\n",
 939 |        "    .dataframe tbody tr th:only-of-type {\n",
 940 |        "        vertical-align: middle;\n",
 941 |        "    }\n",
 942 |        "\n",
 943 |        "    .dataframe tbody tr th {\n",
 944 |        "        vertical-align: top;\n",
 945 |        "    }\n",
 946 |        "\n",
 947 |        "    .dataframe thead th {\n",
 948 |        "        text-align: right;\n",
 949 |        "    }\n",
 950 |        "</style>\n",
 951 |        "<table border=\"1\" class=\"dataframe\">\n",
 952 |        "  <thead>\n",
 953 |        "    <tr style=\"text-align: right;\">\n",
 954 |        "      <th></th>\n",
 955 |        "      <th>timestamp</th>\n",
 956 |        "      <th>event_details_change</th>\n",
 957 |        "      <th>event_login</th>\n",
 958 |        "      <th>event_password_change</th>\n",
 959 |        "    </tr>\n",
 960 |        "    <tr>\n",
 961 |        "      <th>source</th>\n",
 962 |        "      <th></th>\n",
 963 |        "      <th></th>\n",
 964 |        "      <th></th>\n",
 965 |        "      <th></th>\n",
 966 |        "    </tr>\n",
 967 |        "  </thead>\n",
 968 |        "  <tbody>\n",
 969 |        "    <tr>\n",
 970 |        "      <th>C137986193</th>\n",
 971 |        "      <td>2024-10-06 07:02:34.836980000</td>\n",
 972 |        "      <td>False</td>\n",
 973 |        "      <td>False</td>\n",
 974 |        "      <td>True</td>\n",
 975 |        "    </tr>\n",
 976 |        "    <tr>\n",
 977 |        "      <th>C1940951230</th>\n",
 978 |        "      <td>2024-10-06 07:02:35.885162091</td>\n",
 979 |        "      <td>True</td>\n",
 980 |        "      <td>False</td>\n",
 981 |        "      <td>False</td>\n",
 982 |        "    </tr>\n",
 983 |        "    <tr>\n",
 984 |        "      <th>C247537602</th>\n",
 985 |        "      <td>2024-10-06 07:02:37.539945103</td>\n",
 986 |        "      <td>False</td>\n",
 987 |        "      <td>True</td>\n",
 988 |        "      <td>False</td>\n",
 989 |        "    </tr>\n",
 990 |        "    <tr>\n",
 991 |        "      <th>C470079617</th>\n",
 992 |        "      <td>2024-10-06 07:02:38.830394428</td>\n",
 993 |        "      <td>False</td>\n",
 994 |        "      <td>False</td>\n",
 995 |        "      <td>True</td>\n",
 996 |        "    </tr>\n",
 997 |        "    <tr>\n",
 998 |        "      <th>C1142118359</th>\n",
 999 |        "      <td>2024-10-06 07:02:39.620686830</td>\n",
1000 |        "      <td>False</td>\n",
1001 |        "      <td>True</td>\n",
1002 |        "      <td>False</td>\n",
1003 |        "    </tr>\n",
1004 |        "  </tbody>\n",
1005 |        "</table>\n",
1006 |        "</div>"
1007 |       ],
1008 |       "text/plain": [
1009 |        "                                timestamp  event_details_change  event_login  \\\n",
1010 |        "source                                                                         \n",
1011 |        "C137986193  2024-10-06 07:02:34.836980000                 False        False   \n",
1012 |        "C1940951230 2024-10-06 07:02:35.885162091                  True        False   \n",
1013 |        "C247537602  2024-10-06 07:02:37.539945103                 False         True   \n",
1014 |        "C470079617  2024-10-06 07:02:38.830394428                 False        False   \n",
1015 |        "C1142118359 2024-10-06 07:02:39.620686830                 False         True   \n",
1016 |        "\n",
1017 |        "             event_password_change  \n",
1018 |        "source                              \n",
1019 |        "C137986193                    True  \n",
1020 |        "C1940951230                  False  \n",
1021 |        "C247537602                   False  \n",
1022 |        "C470079617                    True  \n",
1023 |        "C1142118359                  False  "
1024 |       ]
1025 |      },
1026 |      "execution_count": 19,
1027 |      "metadata": {},
1028 |      "output_type": "execute_result"
1029 |     }
1030 |    ],
1031 |    "source": [
1032 |     "# Generate categorical features from the event type\n",
1033 |     "processed_events = user_events_data\n",
1034 |     "processed_events = pd.get_dummies(processed_events, columns=['event'])\n",
1035 |     "processed_events.set_index(['source'], inplace=True)\n",
1036 |     "processed_events.head()"
1037 |    ]
1038 |   },
1039 |   {
1040 |    "cell_type": "markdown",
1041 |    "metadata": {},
1042 |    "source": [
1043 |     "## Extracting Labels and Training a Model"
1044 |    ]
1045 |   },
1046 |   {
1047 |    "cell_type": "markdown",
1048 |    "metadata": {},
1049 |    "source": [
1050 |     "The final step is to generate a target label column (the fraud yes/no indication) and train a basic model to evaluate your assumptions. The next part demonstrates how to create the labels dataset and use sklearn to train and evaluate a basic model."
1051 |    ]
1052 |   },
1053 |   {
1054 |    "cell_type": "markdown",
1055 |    "metadata": {},
1056 |    "source": [
1057 |     "### Label df"
1058 |    ]
1059 |   },
1060 |   {
1061 |    "cell_type": "code",
1062 |    "execution_count": 20,
1063 |    "metadata": {},
1064 |    "outputs": [],
1065 |    "source": [
1066 |     "def create_labels(df):\n",
1067 |     "    labels = df[['fraud','timestamp']].copy()\n",
1068 |     "    labels = labels.rename(columns={\"fraud\": \"label\"})\n",
1069 |     "    labels['timestamp'] = labels['timestamp'].astype(\"datetime64[ns]\")\n",
1070 |     "    labels['label'] = labels['label'].astype(int)\n",
1071 |     "    return labels"
1072 |    ]
1073 |   },
1074 |   {
1075 |    "cell_type": "code",
1076 |    "execution_count": 21,
1077 |    "metadata": {},
1078 |    "outputs": [
1079 |     {
1080 |      "data": {
1081 |       "text/html": [
1082 |        "<div>\n",
1083 |        "<style scoped>\n",
1084 |        "    .dataframe tbody tr th:only-of-type {\n",
1085 |        "        vertical-align: middle;\n",
1086 |        "    }\n",
1087 |        "\n",
1088 |        "    .dataframe tbody tr th {\n",
1089 |        "        vertical-align: top;\n",
1090 |        "    }\n",
1091 |        "\n",
1092 |        "    .dataframe thead th {\n",
1093 |        "        text-align: right;\n",
1094 |        "    }\n",
1095 |        "</style>\n",
1096 |        "<table border=\"1\" class=\"dataframe\">\n",
1097 |        "  <thead>\n",
1098 |        "    <tr style=\"text-align: right;\">\n",
1099 |        "      <th></th>\n",
1100 |        "      <th>label</th>\n",
1101 |        "      <th>timestamp</th>\n",
1102 |        "    </tr>\n",
1103 |        "    <tr>\n",
1104 |        "      <th>source</th>\n",
1105 |        "      <th></th>\n",
1106 |        "      <th></th>\n",
1107 |        "    </tr>\n",
1108 |        "  </thead>\n",
1109 |        "  <tbody>\n",
1110 |        "    <tr>\n",
1111 |        "      <th>C1022153336</th>\n",
1112 |        "      <td>0</td>\n",
1113 |        "      <td>2024-10-06 07:02:33.778149000</td>\n",
1114 |        "    </tr>\n",
1115 |        "    <tr>\n",
1116 |        "      <th>C1006176917</th>\n",
1117 |        "      <td>0</td>\n",
1118 |        "      <td>2024-10-06 07:02:52.071774913</td>\n",
1119 |        "    </tr>\n",
1120 |        "    <tr>\n",
1121 |        "      <th>C1010936270</th>\n",
1122 |        "      <td>0</td>\n",
1123 |        "      <td>2024-10-06 07:02:57.178944939</td>\n",
1124 |        "    </tr>\n",
1125 |        "    <tr>\n",
1126 |        "      <th>C1033736586</th>\n",
1127 |        "      <td>0</td>\n",
1128 |        "      <td>2024-10-06 07:03:09.471696118</td>\n",
1129 |        "    </tr>\n",
1130 |        "    <tr>\n",
1131 |        "      <th>C1019071188</th>\n",
1132 |        "      <td>0</td>\n",
1133 |        "      <td>2024-10-06 07:03:43.360778001</td>\n",
1134 |        "    </tr>\n",
1135 |        "  </tbody>\n",
1136 |        "</table>\n",
1137 |        "</div>"
1138 |       ],
1139 |       "text/plain": [
1140 |        "             label                     timestamp\n",
1141 |        "source                                          \n",
1142 |        "C1022153336      0 2024-10-06 07:02:33.778149000\n",
1143 |        "C1006176917      0 2024-10-06 07:02:52.071774913\n",
1144 |        "C1010936270      0 2024-10-06 07:02:57.178944939\n",
1145 |        "C1033736586      0 2024-10-06 07:03:09.471696118\n",
1146 |        "C1019071188      0 2024-10-06 07:03:43.360778001"
1147 |       ]
1148 |      },
1149 |      "execution_count": 21,
1150 |      "metadata": {},
1151 |      "output_type": "execute_result"
1152 |     }
1153 |    ],
1154 |    "source": [
1155 |     "# Create the target label dataset (fraud indication)\n",
1156 |     "labels_set = create_labels(processed_transactions)\n",
1157 |     "labels_set.head()"
1158 |    ]
1159 |   },
1160 |   {
1161 |    "cell_type": "markdown",
1162 |    "metadata": {},
1163 |    "source": [
1164 |     "## Train"
1165 |    ]
1166 |   },
1167 |   {
1168 |    "cell_type": "code",
1169 |    "execution_count": 22,
1170 |    "metadata": {},
1171 |    "outputs": [
1172 |     {
1173 |      "name": "stdout",
1174 |      "output_type": "stream",
1175 |      "text": [
1176 |       "Fitting 3 folds for each of 100 candidates, totalling 300 fits\n",
1177 |       "Accuracy: 1.0\n",
1178 |       "Precision: 1.0\n",
1179 |       "Recall: 1.0\n",
1180 |       "F1 Score: 1.0\n"
1181 |      ]
1182 |     },
1183 |     {
1184 |      "data": {
1185 |       "text/html": [
1186 |        "<style>#sk-container-id-2 {\n",
1187 |        "  /* Definition of color scheme common for light and dark mode */\n",
1188 |        "  --sklearn-color-text: black;\n",
1189 |        "  --sklearn-color-line: gray;\n",
1190 |        "  /* Definition of color scheme for unfitted estimators */\n",
1191 |        "  --sklearn-color-unfitted-level-0: #fff5e6;\n",
1192 |        "  --sklearn-color-unfitted-level-1: #f6e4d2;\n",
1193 |        "  --sklearn-color-unfitted-level-2: #ffe0b3;\n",
1194 |        "  --sklearn-color-unfitted-level-3: chocolate;\n",
1195 |        "  /* Definition of color scheme for fitted estimators */\n",
1196 |        "  --sklearn-color-fitted-level-0: #f0f8ff;\n",
1197 |        "  --sklearn-color-fitted-level-1: #d4ebff;\n",
1198 |        "  --sklearn-color-fitted-level-2: #b3dbfd;\n",
1199 |        "  --sklearn-color-fitted-level-3: cornflowerblue;\n",
1200 |        "\n",
1201 |        "  /* Specific color for light theme */\n",
1202 |        "  --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
1203 |        "  --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
1204 |        "  --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
1205 |        "  --sklearn-color-icon: #696969;\n",
1206 |        "\n",
1207 |        "  @media (prefers-color-scheme: dark) {\n",
1208 |        "    /* Redefinition of color scheme for dark theme */\n",
1209 |        "    --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
1210 |        "    --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
1211 |        "    --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
1212 |        "    --sklearn-color-icon: #878787;\n",
1213 |        "  }\n",
1214 |        "}\n",
1215 |        "\n",
1216 |        "#sk-container-id-2 {\n",
1217 |        "  color: var(--sklearn-color-text);\n",
1218 |        "}\n",
1219 |        "\n",
1220 |        "#sk-container-id-2 pre {\n",
1221 |        "  padding: 0;\n",
1222 |        "}\n",
1223 |        "\n",
1224 |        "#sk-container-id-2 input.sk-hidden--visually {\n",
1225 |        "  border: 0;\n",
1226 |        "  clip: rect(1px 1px 1px 1px);\n",
1227 |        "  clip: rect(1px, 1px, 1px, 1px);\n",
1228 |        "  height: 1px;\n",
1229 |        "  margin: -1px;\n",
1230 |        "  overflow: hidden;\n",
1231 |        "  padding: 0;\n",
1232 |        "  position: absolute;\n",
1233 |        "  width: 1px;\n",
1234 |        "}\n",
1235 |        "\n",
1236 |        "#sk-container-id-2 div.sk-dashed-wrapped {\n",
1237 |        "  border: 1px dashed var(--sklearn-color-line);\n",
1238 |        "  margin: 0 0.4em 0.5em 0.4em;\n",
1239 |        "  box-sizing: border-box;\n",
1240 |        "  padding-bottom: 0.4em;\n",
1241 |        "  background-color: var(--sklearn-color-background);\n",
1242 |        "}\n",
1243 |        "\n",
1244 |        "#sk-container-id-2 div.sk-container {\n",
1245 |        "  /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
1246 |        "     but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
1247 |        "     so we also need the `!important` here to be able to override the\n",
1248 |        "     default hidden behavior on the sphinx rendered scikit-learn.org.\n",
1249 |        "     See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
1250 |        "  display: inline-block !important;\n",
1251 |        "  position: relative;\n",
1252 |        "}\n",
1253 |        "\n",
1254 |        "#sk-container-id-2 div.sk-text-repr-fallback {\n",
1255 |        "  display: none;\n",
1256 |        "}\n",
1257 |        "\n",
1258 |        "div.sk-parallel-item,\n",
1259 |        "div.sk-serial,\n",
1260 |        "div.sk-item {\n",
1261 |        "  /* draw centered vertical line to link estimators */\n",
1262 |        "  background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
1263 |        "  background-size: 2px 100%;\n",
1264 |        "  background-repeat: no-repeat;\n",
1265 |        "  background-position: center center;\n",
1266 |        "}\n",
1267 |        "\n",
1268 |        "/* Parallel-specific style estimator block */\n",
1269 |        "\n",
1270 |        "#sk-container-id-2 div.sk-parallel-item::after {\n",
1271 |        "  content: \"\";\n",
1272 |        "  width: 100%;\n",
1273 |        "  border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
1274 |        "  flex-grow: 1;\n",
1275 |        "}\n",
1276 |        "\n",
1277 |        "#sk-container-id-2 div.sk-parallel {\n",
1278 |        "  display: flex;\n",
1279 |        "  align-items: stretch;\n",
1280 |        "  justify-content: center;\n",
1281 |        "  background-color: var(--sklearn-color-background);\n",
1282 |        "  position: relative;\n",
1283 |        "}\n",
1284 |        "\n",
1285 |        "#sk-container-id-2 div.sk-parallel-item {\n",
1286 |        "  display: flex;\n",
1287 |        "  flex-direction: column;\n",
1288 |        "}\n",
1289 |        "\n",
1290 |        "#sk-container-id-2 div.sk-parallel-item:first-child::after {\n",
1291 |        "  align-self: flex-end;\n",
1292 |        "  width: 50%;\n",
1293 |        "}\n",
1294 |        "\n",
1295 |        "#sk-container-id-2 div.sk-parallel-item:last-child::after {\n",
1296 |        "  align-self: flex-start;\n",
1297 |        "  width: 50%;\n",
1298 |        "}\n",
1299 |        "\n",
1300 |        "#sk-container-id-2 div.sk-parallel-item:only-child::after {\n",
1301 |        "  width: 0;\n",
1302 |        "}\n",
1303 |        "\n",
1304 |        "/* Serial-specific style estimator block */\n",
1305 |        "\n",
1306 |        "#sk-container-id-2 div.sk-serial {\n",
1307 |        "  display: flex;\n",
1308 |        "  flex-direction: column;\n",
1309 |        "  align-items: center;\n",
1310 |        "  background-color: var(--sklearn-color-background);\n",
1311 |        "  padding-right: 1em;\n",
1312 |        "  padding-left: 1em;\n",
1313 |        "}\n",
1314 |        "\n",
1315 |        "\n",
1316 |        "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
1317 |        "clickable and can be expanded/collapsed.\n",
1318 |        "- Pipeline and ColumnTransformer use this feature and define the default style\n",
1319 |        "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
1320 |        "*/\n",
1321 |        "\n",
1322 |        "/* Pipeline and ColumnTransformer style (default) */\n",
1323 |        "\n",
1324 |        "#sk-container-id-2 div.sk-toggleable {\n",
1325 |        "  /* Default theme specific background. It is overwritten whether we have a\n",
1326 |        "  specific estimator or a Pipeline/ColumnTransformer */\n",
1327 |        "  background-color: var(--sklearn-color-background);\n",
1328 |        "}\n",
1329 |        "\n",
1330 |        "/* Toggleable label */\n",
1331 |        "#sk-container-id-2 label.sk-toggleable__label {\n",
1332 |        "  cursor: pointer;\n",
1333 |        "  display: block;\n",
1334 |        "  width: 100%;\n",
1335 |        "  margin-bottom: 0;\n",
1336 |        "  padding: 0.5em;\n",
1337 |        "  box-sizing: border-box;\n",
1338 |        "  text-align: center;\n",
1339 |        "}\n",
1340 |        "\n",
1341 |        "#sk-container-id-2 label.sk-toggleable__label-arrow:before {\n",
1342 |        "  /* Arrow on the left of the label */\n",
1343 |        "  content: \"▸\";\n",
1344 |        "  float: left;\n",
1345 |        "  margin-right: 0.25em;\n",
1346 |        "  color: var(--sklearn-color-icon);\n",
1347 |        "}\n",
1348 |        "\n",
1349 |        "#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {\n",
1350 |        "  color: var(--sklearn-color-text);\n",
1351 |        "}\n",
1352 |        "\n",
1353 |        "/* Toggleable content - dropdown */\n",
1354 |        "\n",
1355 |        "#sk-container-id-2 div.sk-toggleable__content {\n",
1356 |        "  max-height: 0;\n",
1357 |        "  max-width: 0;\n",
1358 |        "  overflow: hidden;\n",
1359 |        "  text-align: left;\n",
1360 |        "  /* unfitted */\n",
1361 |        "  background-color: var(--sklearn-color-unfitted-level-0);\n",
1362 |        "}\n",
1363 |        "\n",
1364 |        "#sk-container-id-2 div.sk-toggleable__content.fitted {\n",
1365 |        "  /* fitted */\n",
1366 |        "  background-color: var(--sklearn-color-fitted-level-0);\n",
1367 |        "}\n",
1368 |        "\n",
1369 |        "#sk-container-id-2 div.sk-toggleable__content pre {\n",
1370 |        "  margin: 0.2em;\n",
1371 |        "  border-radius: 0.25em;\n",
1372 |        "  color: var(--sklearn-color-text);\n",
1373 |        "  /* unfitted */\n",
1374 |        "  background-color: var(--sklearn-color-unfitted-level-0);\n",
1375 |        "}\n",
1376 |        "\n",
1377 |        "#sk-container-id-2 div.sk-toggleable__content.fitted pre {\n",
1378 |        "  /* unfitted */\n",
1379 |        "  background-color: var(--sklearn-color-fitted-level-0);\n",
1380 |        "}\n",
1381 |        "\n",
1382 |        "#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
1383 |        "  /* Expand drop-down */\n",
1384 |        "  max-height: 200px;\n",
1385 |        "  max-width: 100%;\n",
1386 |        "  overflow: auto;\n",
1387 |        "}\n",
1388 |        "\n",
1389 |        "#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
1390 |        "  content: \"▾\";\n",
1391 |        "}\n",
1392 |        "\n",
1393 |        "/* Pipeline/ColumnTransformer-specific style */\n",
1394 |        "\n",
1395 |        "#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1396 |        "  color: var(--sklearn-color-text);\n",
1397 |        "  background-color: var(--sklearn-color-unfitted-level-2);\n",
1398 |        "}\n",
1399 |        "\n",
1400 |        "#sk-container-id-2 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1401 |        "  background-color: var(--sklearn-color-fitted-level-2);\n",
1402 |        "}\n",
1403 |        "\n",
1404 |        "/* Estimator-specific style */\n",
1405 |        "\n",
1406 |        "/* Colorize estimator box */\n",
1407 |        "#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1408 |        "  /* unfitted */\n",
1409 |        "  background-color: var(--sklearn-color-unfitted-level-2);\n",
1410 |        "}\n",
1411 |        "\n",
1412 |        "#sk-container-id-2 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
1413 |        "  /* fitted */\n",
1414 |        "  background-color: var(--sklearn-color-fitted-level-2);\n",
1415 |        "}\n",
1416 |        "\n",
1417 |        "#sk-container-id-2 div.sk-label label.sk-toggleable__label,\n",
1418 |        "#sk-container-id-2 div.sk-label label {\n",
1419 |        "  /* The background is the default theme color */\n",
1420 |        "  color: var(--sklearn-color-text-on-default-background);\n",
1421 |        "}\n",
1422 |        "\n",
1423 |        "/* On hover, darken the color of the background */\n",
1424 |        "#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {\n",
1425 |        "  color: var(--sklearn-color-text);\n",
1426 |        "  background-color: var(--sklearn-color-unfitted-level-2);\n",
1427 |        "}\n",
1428 |        "\n",
1429 |        "/* Label box, darken color on hover, fitted */\n",
1430 |        "#sk-container-id-2 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
1431 |        "  color: var(--sklearn-color-text);\n",
1432 |        "  background-color: var(--sklearn-color-fitted-level-2);\n",
1433 |        "}\n",
1434 |        "\n",
1435 |        "/* Estimator label */\n",
1436 |        "\n",
1437 |        "#sk-container-id-2 div.sk-label label {\n",
1438 |        "  font-family: monospace;\n",
1439 |        "  font-weight: bold;\n",
1440 |        "  display: inline-block;\n",
1441 |        "  line-height: 1.2em;\n",
1442 |        "}\n",
1443 |        "\n",
1444 |        "#sk-container-id-2 div.sk-label-container {\n",
1445 |        "  text-align: center;\n",
1446 |        "}\n",
1447 |        "\n",
1448 |        "/* Estimator-specific */\n",
1449 |        "#sk-container-id-2 div.sk-estimator {\n",
1450 |        "  font-family: monospace;\n",
1451 |        "  border: 1px dotted var(--sklearn-color-border-box);\n",
1452 |        "  border-radius: 0.25em;\n",
1453 |        "  box-sizing: border-box;\n",
1454 |        "  margin-bottom: 0.5em;\n",
1455 |        "  /* unfitted */\n",
1456 |        "  background-color: var(--sklearn-color-unfitted-level-0);\n",
1457 |        "}\n",
1458 |        "\n",
1459 |        "#sk-container-id-2 div.sk-estimator.fitted {\n",
1460 |        "  /* fitted */\n",
1461 |        "  background-color: var(--sklearn-color-fitted-level-0);\n",
1462 |        "}\n",
1463 |        "\n",
1464 |        "/* on hover */\n",
1465 |        "#sk-container-id-2 div.sk-estimator:hover {\n",
1466 |        "  /* unfitted */\n",
1467 |        "  background-color: var(--sklearn-color-unfitted-level-2);\n",
1468 |        "}\n",
1469 |        "\n",
1470 |        "#sk-container-id-2 div.sk-estimator.fitted:hover {\n",
1471 |        "  /* fitted */\n",
1472 |        "  background-color: var(--sklearn-color-fitted-level-2);\n",
1473 |        "}\n",
1474 |        "\n",
1475 |        "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
1476 |        "\n",
1477 |        "/* Common style for \"i\" and \"?\" */\n",
1478 |        "\n",
1479 |        ".sk-estimator-doc-link,\n",
1480 |        "a:link.sk-estimator-doc-link,\n",
1481 |        "a:visited.sk-estimator-doc-link {\n",
1482 |        "  float: right;\n",
1483 |        "  font-size: smaller;\n",
1484 |        "  line-height: 1em;\n",
1485 |        "  font-family: monospace;\n",
1486 |        "  background-color: var(--sklearn-color-background);\n",
1487 |        "  border-radius: 1em;\n",
1488 |        "  height: 1em;\n",
1489 |        "  width: 1em;\n",
1490 |        "  text-decoration: none !important;\n",
1491 |        "  margin-left: 1ex;\n",
1492 |        "  /* unfitted */\n",
1493 |        "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
1494 |        "  color: var(--sklearn-color-unfitted-level-1);\n",
1495 |        "}\n",
1496 |        "\n",
1497 |        ".sk-estimator-doc-link.fitted,\n",
1498 |        "a:link.sk-estimator-doc-link.fitted,\n",
1499 |        "a:visited.sk-estimator-doc-link.fitted {\n",
1500 |        "  /* fitted */\n",
1501 |        "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
1502 |        "  color: var(--sklearn-color-fitted-level-1);\n",
1503 |        "}\n",
1504 |        "\n",
1505 |        "/* On hover */\n",
1506 |        "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
1507 |        ".sk-estimator-doc-link:hover,\n",
1508 |        "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
1509 |        ".sk-estimator-doc-link:hover {\n",
1510 |        "  /* unfitted */\n",
1511 |        "  background-color: var(--sklearn-color-unfitted-level-3);\n",
1512 |        "  color: var(--sklearn-color-background);\n",
1513 |        "  text-decoration: none;\n",
1514 |        "}\n",
1515 |        "\n",
1516 |        "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
1517 |        ".sk-estimator-doc-link.fitted:hover,\n",
1518 |        "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
1519 |        ".sk-estimator-doc-link.fitted:hover {\n",
1520 |        "  /* fitted */\n",
1521 |        "  background-color: var(--sklearn-color-fitted-level-3);\n",
1522 |        "  color: var(--sklearn-color-background);\n",
1523 |        "  text-decoration: none;\n",
1524 |        "}\n",
1525 |        "\n",
1526 |        "/* Span, style for the box shown on hovering the info icon */\n",
1527 |        ".sk-estimator-doc-link span {\n",
1528 |        "  display: none;\n",
1529 |        "  z-index: 9999;\n",
1530 |        "  position: relative;\n",
1531 |        "  font-weight: normal;\n",
1532 |        "  right: .2ex;\n",
1533 |        "  padding: .5ex;\n",
1534 |        "  margin: .5ex;\n",
1535 |        "  width: min-content;\n",
1536 |        "  min-width: 20ex;\n",
1537 |        "  max-width: 50ex;\n",
1538 |        "  color: var(--sklearn-color-text);\n",
1539 |        "  box-shadow: 2pt 2pt 4pt #999;\n",
1540 |        "  /* unfitted */\n",
1541 |        "  background: var(--sklearn-color-unfitted-level-0);\n",
1542 |        "  border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
1543 |        "}\n",
1544 |        "\n",
1545 |        ".sk-estimator-doc-link.fitted span {\n",
1546 |        "  /* fitted */\n",
1547 |        "  background: var(--sklearn-color-fitted-level-0);\n",
1548 |        "  border: var(--sklearn-color-fitted-level-3);\n",
1549 |        "}\n",
1550 |        "\n",
1551 |        ".sk-estimator-doc-link:hover span {\n",
1552 |        "  display: block;\n",
1553 |        "}\n",
1554 |        "\n",
1555 |        "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
1556 |        "\n",
1557 |        "#sk-container-id-2 a.estimator_doc_link {\n",
1558 |        "  float: right;\n",
1559 |        "  font-size: 1rem;\n",
1560 |        "  line-height: 1em;\n",
1561 |        "  font-family: monospace;\n",
1562 |        "  background-color: var(--sklearn-color-background);\n",
1563 |        "  border-radius: 1rem;\n",
1564 |        "  height: 1rem;\n",
1565 |        "  width: 1rem;\n",
1566 |        "  text-decoration: none;\n",
1567 |        "  /* unfitted */\n",
1568 |        "  color: var(--sklearn-color-unfitted-level-1);\n",
1569 |        "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
1570 |        "}\n",
1571 |        "\n",
1572 |        "#sk-container-id-2 a.estimator_doc_link.fitted {\n",
1573 |        "  /* fitted */\n",
1574 |        "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
1575 |        "  color: var(--sklearn-color-fitted-level-1);\n",
1576 |        "}\n",
1577 |        "\n",
1578 |        "/* On hover */\n",
1579 |        "#sk-container-id-2 a.estimator_doc_link:hover {\n",
1580 |        "  /* unfitted */\n",
1581 |        "  background-color: var(--sklearn-color-unfitted-level-3);\n",
1582 |        "  color: var(--sklearn-color-background);\n",
1583 |        "  text-decoration: none;\n",
1584 |        "}\n",
1585 |        "\n",
1586 |        "#sk-container-id-2 a.estimator_doc_link.fitted:hover {\n",
1587 |        "  /* fitted */\n",
1588 |        "  background-color: var(--sklearn-color-fitted-level-3);\n",
1589 |        "}\n",
1590 |        "</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(bootstrap=False, max_depth=100, min_samples_leaf=2,\n",
1591 |        "                       n_estimators=50)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" checked><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;RandomForestClassifier<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestClassifier.html\">?<span>Documentation for RandomForestClassifier</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestClassifier(bootstrap=False, max_depth=100, min_samples_leaf=2,\n",
1592 |        "                       n_estimators=50)</pre></div> </div></div></div></div>"
1593 |       ],
1594 |       "text/plain": [
1595 |        "RandomForestClassifier(bootstrap=False, max_depth=100, min_samples_leaf=2,\n",
1596 |        "                       n_estimators=50)"
1597 |       ]
1598 |      },
1599 |      "execution_count": 22,
1600 |      "metadata": {},
1601 |      "output_type": "execute_result"
1602 |     }
1603 |    ],
1604 |    "source": [
1605 |     "# Train a model based on the transactions, events, and labels\n",
1606 |     "from src.train_sklearn import train_and_val, prepare_data_to_train\n",
1607 |     "\n",
1608 |     "X_train, X_test, y_train, y_test = prepare_data_to_train(processed_transactions, processed_events, labels_set)\n",
1609 |     "rf_best = train_and_val(X_train, X_test, y_train, y_test)\n",
1610 |     "\n",
1611 |     "# Display the fitted model returned by train_and_val\n",
1612 |     "rf_best"
1613 |    ]
1614 |   },
1615 |   {
1616 |    "cell_type": "markdown",
1617 |    "metadata": {},
1618 |    "source": [
1619 |     "## Done!\n",
1620 |     "\n",
1621 |     "You've completed Part 2: interactive data preparation.\n",
1622 |     "Proceed to [Part 3](03-ingest-with-feature-store.ipynb) to learn how to build data ingestion services with the Feature Store."
1623 |    ]
1624 |   }
1625 |  ],
1626 |  "metadata": {
1627 |   "kernelspec": {
1628 |    "display_name": "Python 3 (ipykernel)",
1629 |    "language": "python",
1630 |    "name": "python3"
1631 |   },
1632 |   "language_info": {
1633 |    "codemirror_mode": {
1634 |     "name": "ipython",
1635 |     "version": 3
1636 |    },
1637 |    "file_extension": ".py",
1638 |    "mimetype": "text/x-python",
1639 |    "name": "python",
1640 |    "nbconvert_exporter": "python",
1641 |    "pygments_lexer": "ipython3",
1642 |    "version": "3.11.5"
1643 |   },
1644 |   "toc-showtags": false
1645 |  },
1646 |  "nbformat": 4,
1647 |  "nbformat_minor": 4
1648 | }
1649 | 


--------------------------------------------------------------------------------