├── .gitignore ├── README.md ├── imgs ├── diagram1.png ├── diagram2.png ├── icon.png └── livestream.png ├── scikit_churn └── __init__.py ├── setup.py └── why-scikit-churn.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | *.parquet -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | > This repository is currently mainly here to get the ball rolling on some ideas. If there is sufficient interest we'll gladly invest more time in it, but first we'd like to get some feedback. 4 | > 5 | > We're mainly eager to confirm if this line of work is relevant to industry. 6 | 7 | ## Livestream 8 | 9 | 10 | [![](imgs/livestream.png)](https://www.youtube.com/watch?v=uevp7zJTM_c) 11 | 12 | This work was part of a [YouTube livestream](https://www.youtube.com/watch?v=uevp7zJTM_c) and it may help to watch for some extra context. 13 | 14 | # scikit-churn 15 | 16 | This repository explores some moments where data leakage can occur in churn-related use-cases. You may need more than a scikit-learn pipeline to protect you against temporal data leaks. 17 | 18 | The approach for now is to construct a generator that can generate `X`,`y` pairs from temporal data. You'd still need to provide a function to construct features, but the generator can at least guarantee that you're only generating features that don't peek into the future. 19 | 20 | Imagine that you're interested in doing some churn research. That is to say, you're interested in predicting if somebody is going to be around in the future. Then you might want to construct your data using the following conceptual model. 21 | 22 | ![imgs/diagram1.png](imgs/diagram1.png) 23 | 24 | There's a "checking" period where you're going to check who is still around. And this is information that you can use to construct a label. There's also an "input" period in which you're going to construct features for a machine learning model. 25 | 26 | But here is where you have to be very careful! You should only use the checking period to understand which customers are still around. If you use any data from this period to generete ML features, or from any part after the input period, then you are at risk of leaking data that your machine learning model does not have access to in real life. 27 | 28 | So one of the first features this library offers is to give you a safe way to generate these `X`, `y` pairs over time. Conceptually, it currently allows you to construct features from timeslots over time. 29 | 30 | ![imgs/diagram2.png](imgs/diagram2.png) 31 | 32 | To do such things in this library, you might do something like: 33 | 34 | ```python 35 | from skchurn import dataset_generator 36 | 37 | gen = dataset_generator( 38 | df, 39 | user_id="player_id", 40 | time_col="datetime", 41 | info_period=90, 42 | checking_period=30, 43 | start_date=datetime(2007, 1, 1), 44 | end_date=datetime(2007, 12, 31), 45 | feature_pipeline=feature_pipeline 46 | ) 47 | 48 | for X, y in gen: 49 | scorers = { 50 | "accuracy": make_scorer(accuracy_score), 51 | "precision": make_scorer(precision_score), 52 | "recall": make_scorer(recall_score) 53 | } 54 | # Cross validate your pipeline as you might normally. Maybe even gridsearch? 55 | print(pl.DataFrame(cross_validate(pipe, X, y, cv=5, scoring=scorers))) 56 | ``` 57 | 58 | The `dataset_generator` has the following arguments. 59 | 60 | - `df`: a Polars dataframe that contains logs over time for users 61 | - `user_id`: the column name that depicts the user id 62 | - `feature_pipeline`: a Polars compatible function that generatres ML features to go in `X` from the inserted dataframe 63 | - `input_period`: the number of days that the input period lasts 64 | - `checking_period`: the number of days that the checking period lasts 65 | - `start_date`: the start date for X,y-pair generation 66 | - `end_date`: the end date for X,y-pair generation 67 | - `step`: stepsize over time for new X,y-pairs. defaults to a month. 68 | - `time_col`: column name that depicts the datetime stamp 69 | 70 | You can explore this function by copying it from the `scikit_churn/__init__.py` file. 71 | 72 | This repository may grow, but for now we're eager to understand industry problems around these kinds of churn analyses better. We're particularily interested to see what one might do to remedy these temporal leakage issues for scikit-learn applications. 73 | -------------------------------------------------------------------------------- /imgs/diagram1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/scikit-churn/f5452c594d24e2a816b98971a912632de04b92ba/imgs/diagram1.png -------------------------------------------------------------------------------- /imgs/diagram2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/scikit-churn/f5452c594d24e2a816b98971a912632de04b92ba/imgs/diagram2.png -------------------------------------------------------------------------------- /imgs/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/scikit-churn/f5452c594d24e2a816b98971a912632de04b92ba/imgs/icon.png -------------------------------------------------------------------------------- /imgs/livestream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/scikit-churn/f5452c594d24e2a816b98971a912632de04b92ba/imgs/livestream.png -------------------------------------------------------------------------------- /scikit_churn/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import polars as pl 3 | from datetime import datetime, timedelta 4 | 5 | 6 | def churn_dataset_generator(df, user_id, feature_pipeline, 7 | info_period=180, 8 | checking_period=180, 9 | start_date=datetime(2007, 1, 1), 10 | end_date=datetime(2007, 12, 31), 11 | step="1mo", 12 | time_col="datetime"): 13 | """ 14 | Generates X,y pairs for churn related machine learning, with way less temporal data leaks to worry about. 15 | 16 | Arguments: 17 | 18 | - df: a Polars dataframe that contains logs over time for users 19 | - user_id: the column name that depicts the user id 20 | - feature_pipeline: a Polars compatible function that generatres ML features to go in `X` 21 | - input_period: the number of days that the input period lasts 22 | - checking_period: the number of days that the checking period lasts 23 | - start_date: the start date for X,y-pair generation 24 | - end_date: the end date for X,y-pair generation 25 | - step: stepsize over time for new X,y-pairs. defaults to a month. 26 | - time_col: column name that depicts the datetime stamp 27 | """ 28 | cutoff_start = pl.datetime_range(start_date, end_date, step, eager=True).alias(time_col) 29 | min_date = df[time_col].min() 30 | max_date = df[time_col].max() 31 | 32 | for start in cutoff_start.to_list(): 33 | info_period_start = start - timedelta(days=info_period) 34 | checking_period_end = start + timedelta(days=checking_period) 35 | if info_period_start < min_date: 36 | continue 37 | if checking_period_end > max_date: 38 | continue 39 | print(info_period_start, start, checking_period_end, min_date, max_date) 40 | train_info = df.filter(pl.col(time_col) < start, pl.col(time_col) >= (start - timedelta(days=info_period))) 41 | valid_info = df.filter(pl.col(time_col) >= start, pl.col(time_col) < (start + timedelta(days=checking_period))) 42 | 43 | 44 | target = valid_info.select("player_id").unique().with_columns(target=True) 45 | 46 | ml_df = (train_info 47 | .pipe(feature_pipeline) 48 | .join(target, on=user_id, how="left") 49 | .with_columns(target=pl.when(pl.col("target")).then(True).otherwise(False))) 50 | 51 | X = ml_df.drop("target", "player_id") 52 | y = np.array(ml_df["target"]).astype(int) 53 | 54 | yield X, y -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | 5 | base_packages = [ 6 | "scikit-learn", "polars" 7 | ] 8 | 9 | def read(fname): 10 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 11 | 12 | 13 | setup( 14 | name="scikit-churn", 15 | version="0.0.1", 16 | description="Remedy temporal data-leaks", 17 | author="Vincent D. Warmerdam", 18 | packages=find_packages(exclude=["notebooks"]), 19 | package_data={}, 20 | long_description=read("README.md"), 21 | long_description_content_type="text/markdown", 22 | install_requires=base_packages, 23 | classifiers=[ 24 | "Intended Audience :: Developers", 25 | "Intended Audience :: Science/Research", 26 | "Programming Language :: Python :: 3", 27 | "Programming Language :: Python :: 3.8", 28 | "Programming Language :: Python :: 3.9", 29 | "Programming Language :: Python :: 3.10", 30 | "Programming Language :: Python :: 3.11", 31 | "Programming Language :: Python :: 3.12", 32 | "License :: OSI Approved :: MIT License", 33 | "Topic :: Scientific/Engineering", 34 | ], 35 | license_files=["LICENSE"], 36 | ) 37 | 38 | -------------------------------------------------------------------------------- /why-scikit-churn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f013ee40-4c76-4347-94d2-b14b8b725eea", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import polars as pl\n", 11 | "import numpy as np" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "d801dad1", 17 | "metadata": {}, 18 | "source": [ 19 | "Dataset link: [GitHub - koaning/wow-avatar-datasets](https://github.com/koaning/wow-avatar-datasets/blob/main/wow-full.parquet)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "id": "3541e18e-da7b-4452-a465-1e15653dd898", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "df = pl.read_parquet(\"wow-full.parquet\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "id": "619df8f5-40fc-4443-8c92-a78ead790398", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 48 | "shape: (5, 7)
player_idguildlevelraceclasswheredatetime
i32f64i8catcatcatdatetime[ms]
4634053.026"Orc""Hunter""Razorfen Kraul…2007-06-12 03:17:48
31887null10"Orc""Hunter""Durotar"2007-06-12 03:17:48
47258null15"Orc""Warrior""The Barrens"2007-06-12 03:17:48
17448null43"Orc""Hunter""Silverpine For…2007-06-12 03:17:48
45159104.057"Orc""Warlock""Winterspring"2007-06-12 03:17:53
" 49 | ], 50 | "text/plain": [ 51 | "shape: (5, 7)\n", 52 | "┌───────────┬───────┬───────┬──────┬─────────┬───────────────────┬─────────────────────┐\n", 53 | "│ player_id ┆ guild ┆ level ┆ race ┆ class ┆ where ┆ datetime │\n", 54 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 55 | "│ i32 ┆ f64 ┆ i8 ┆ cat ┆ cat ┆ cat ┆ datetime[ms] │\n", 56 | "╞═══════════╪═══════╪═══════╪══════╪═════════╪═══════════════════╪═════════════════════╡\n", 57 | "│ 46340 ┆ 53.0 ┆ 26 ┆ Orc ┆ Hunter ┆ Razorfen Kraul ┆ 2007-06-12 03:17:48 │\n", 58 | "│ 31887 ┆ null ┆ 10 ┆ Orc ┆ Hunter ┆ Durotar ┆ 2007-06-12 03:17:48 │\n", 59 | "│ 47258 ┆ null ┆ 15 ┆ Orc ┆ Warrior ┆ The Barrens ┆ 2007-06-12 03:17:48 │\n", 60 | "│ 17448 ┆ null ┆ 43 ┆ Orc ┆ Hunter ┆ Silverpine Forest ┆ 2007-06-12 03:17:48 │\n", 61 | "│ 45159 ┆ 104.0 ┆ 57 ┆ Orc ┆ Warlock ┆ Winterspring ┆ 2007-06-12 03:17:53 │\n", 62 | "└───────────┴───────┴───────┴──────┴─────────┴───────────────────┴─────────────────────┘" 63 | ] 64 | }, 65 | "execution_count": 3, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "df.head()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "e617ba91-f77a-4149-b777-04f68cb0edf8", 78 | "metadata": { 79 | "scrolled": true 80 | }, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "application/javascript": [ 85 | "(function(root) {\n", 86 | " function now() {\n", 87 | " return new Date();\n", 88 | " }\n", 89 | "\n", 90 | " var force = true;\n", 91 | " var py_version = '3.3.4'.replace('rc', '-rc.').replace('.dev', '-dev.');\n", 92 | " var reloading = false;\n", 93 | " var Bokeh = root.Bokeh;\n", 94 | "\n", 95 | " if (typeof (root._bokeh_timeout) === \"undefined\" || force) {\n", 96 | " root._bokeh_timeout = Date.now() + 5000;\n", 97 | " root._bokeh_failed_load = false;\n", 98 | " }\n", 99 | "\n", 100 | " function run_callbacks() {\n", 101 | " try {\n", 102 | " root._bokeh_onload_callbacks.forEach(function(callback) {\n", 103 | " if (callback != null)\n", 104 | " callback();\n", 105 | " });\n", 106 | " } finally {\n", 107 | " delete root._bokeh_onload_callbacks;\n", 108 | " }\n", 109 | " console.debug(\"Bokeh: all callbacks have finished\");\n", 110 | " }\n", 111 | "\n", 112 | " function load_libs(css_urls, js_urls, js_modules, js_exports, callback) {\n", 113 | " if (css_urls == null) css_urls = [];\n", 114 | " if (js_urls == null) js_urls = [];\n", 115 | " if (js_modules == null) js_modules = [];\n", 116 | " if (js_exports == null) js_exports = {};\n", 117 | "\n", 118 | " root._bokeh_onload_callbacks.push(callback);\n", 119 | "\n", 120 | " if (root._bokeh_is_loading > 0) {\n", 121 | " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", 122 | " return null;\n", 123 | " }\n", 124 | " if (js_urls.length === 0 && js_modules.length === 0 && Object.keys(js_exports).length === 0) {\n", 125 | " run_callbacks();\n", 126 | " return null;\n", 127 | " }\n", 128 | " if (!reloading) {\n", 129 | " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", 130 | " }\n", 131 | "\n", 132 | " function on_load() {\n", 133 | " root._bokeh_is_loading--;\n", 134 | " if (root._bokeh_is_loading === 0) {\n", 135 | " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", 136 | " run_callbacks()\n", 137 | " }\n", 138 | " }\n", 139 | " window._bokeh_on_load = on_load\n", 140 | "\n", 141 | " function on_error() {\n", 142 | " console.error(\"failed to load \" + url);\n", 143 | " }\n", 144 | "\n", 145 | " var skip = [];\n", 146 | " if (window.requirejs) {\n", 147 | " window.requirejs.config({'packages': {}, 'paths': {'jspanel': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/jspanel', 'jspanel-modal': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/extensions/modal/jspanel.modal', 'jspanel-tooltip': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/extensions/tooltip/jspanel.tooltip', 'jspanel-hint': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/extensions/hint/jspanel.hint', 'jspanel-layout': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/extensions/layout/jspanel.layout', 'jspanel-contextmenu': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/extensions/contextmenu/jspanel.contextmenu', 'jspanel-dock': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/extensions/dock/jspanel.dock', 'gridstack': 'https://cdn.jsdelivr.net/npm/gridstack@7.2.3/dist/gridstack-all', 'notyf': 'https://cdn.jsdelivr.net/npm/notyf@3/notyf.min'}, 'shim': {'jspanel': {'exports': 'jsPanel'}, 'gridstack': {'exports': 'GridStack'}}});\n", 148 | " require([\"jspanel\"], function(jsPanel) {\n", 149 | "\twindow.jsPanel = jsPanel\n", 150 | "\ton_load()\n", 151 | " })\n", 152 | " require([\"jspanel-modal\"], function() {\n", 153 | "\ton_load()\n", 154 | " })\n", 155 | " require([\"jspanel-tooltip\"], function() {\n", 156 | "\ton_load()\n", 157 | " })\n", 158 | " require([\"jspanel-hint\"], function() {\n", 159 | "\ton_load()\n", 160 | " })\n", 161 | " require([\"jspanel-layout\"], function() {\n", 162 | "\ton_load()\n", 163 | " })\n", 164 | " require([\"jspanel-contextmenu\"], function() {\n", 165 | "\ton_load()\n", 166 | " })\n", 167 | " require([\"jspanel-dock\"], function() {\n", 168 | "\ton_load()\n", 169 | " })\n", 170 | " require([\"gridstack\"], function(GridStack) {\n", 171 | "\twindow.GridStack = GridStack\n", 172 | "\ton_load()\n", 173 | " })\n", 174 | " require([\"notyf\"], function() {\n", 175 | "\ton_load()\n", 176 | " })\n", 177 | " root._bokeh_is_loading = css_urls.length + 9;\n", 178 | " } else {\n", 179 | " root._bokeh_is_loading = css_urls.length + js_urls.length + js_modules.length + Object.keys(js_exports).length;\n", 180 | " }\n", 181 | "\n", 182 | " var existing_stylesheets = []\n", 183 | " var links = document.getElementsByTagName('link')\n", 184 | " for (var i = 0; i < links.length; i++) {\n", 185 | " var link = links[i]\n", 186 | " if (link.href != null) {\n", 187 | "\texisting_stylesheets.push(link.href)\n", 188 | " }\n", 189 | " }\n", 190 | " for (var i = 0; i < css_urls.length; i++) {\n", 191 | " var url = css_urls[i];\n", 192 | " if (existing_stylesheets.indexOf(url) !== -1) {\n", 193 | "\ton_load()\n", 194 | "\tcontinue;\n", 195 | " }\n", 196 | " const element = document.createElement(\"link\");\n", 197 | " element.onload = on_load;\n", 198 | " element.onerror = on_error;\n", 199 | " element.rel = \"stylesheet\";\n", 200 | " element.type = \"text/css\";\n", 201 | " element.href = url;\n", 202 | " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", 203 | " document.body.appendChild(element);\n", 204 | " } if (((window['jsPanel'] !== undefined) && (!(window['jsPanel'] instanceof HTMLElement))) || window.requirejs) {\n", 205 | " var urls = ['https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/jspanel.js', 'https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/extensions/modal/jspanel.modal.js', 'https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/extensions/tooltip/jspanel.tooltip.js', 'https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/extensions/hint/jspanel.hint.js', 'https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/extensions/layout/jspanel.layout.js', 'https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/extensions/contextmenu/jspanel.contextmenu.js', 'https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/extensions/dock/jspanel.dock.js'];\n", 206 | " for (var i = 0; i < urls.length; i++) {\n", 207 | " skip.push(urls[i])\n", 208 | " }\n", 209 | " } if (((window['GridStack'] !== undefined) && (!(window['GridStack'] instanceof HTMLElement))) || window.requirejs) {\n", 210 | " var urls = ['https://cdn.holoviz.org/panel/1.3.8/dist/bundled/gridstack/gridstack@7.2.3/dist/gridstack-all.js'];\n", 211 | " for (var i = 0; i < urls.length; i++) {\n", 212 | " skip.push(urls[i])\n", 213 | " }\n", 214 | " } if (((window['Notyf'] !== undefined) && (!(window['Notyf'] instanceof HTMLElement))) || window.requirejs) {\n", 215 | " var urls = ['https://cdn.holoviz.org/panel/1.3.8/dist/bundled/notificationarea/notyf@3/notyf.min.js'];\n", 216 | " for (var i = 0; i < urls.length; i++) {\n", 217 | " skip.push(urls[i])\n", 218 | " }\n", 219 | " } var existing_scripts = []\n", 220 | " var scripts = document.getElementsByTagName('script')\n", 221 | " for (var i = 0; i < scripts.length; i++) {\n", 222 | " var script = scripts[i]\n", 223 | " if (script.src != null) {\n", 224 | "\texisting_scripts.push(script.src)\n", 225 | " }\n", 226 | " }\n", 227 | " for (var i = 0; i < js_urls.length; i++) {\n", 228 | " var url = js_urls[i];\n", 229 | " if (skip.indexOf(url) !== -1 || existing_scripts.indexOf(url) !== -1) {\n", 230 | "\tif (!window.requirejs) {\n", 231 | "\t on_load();\n", 232 | "\t}\n", 233 | "\tcontinue;\n", 234 | " }\n", 235 | " var element = document.createElement('script');\n", 236 | " element.onload = on_load;\n", 237 | " element.onerror = on_error;\n", 238 | " element.async = false;\n", 239 | " element.src = url;\n", 240 | " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", 241 | " document.head.appendChild(element);\n", 242 | " }\n", 243 | " for (var i = 0; i < js_modules.length; i++) {\n", 244 | " var url = js_modules[i];\n", 245 | " if (skip.indexOf(url) !== -1 || existing_scripts.indexOf(url) !== -1) {\n", 246 | "\tif (!window.requirejs) {\n", 247 | "\t on_load();\n", 248 | "\t}\n", 249 | "\tcontinue;\n", 250 | " }\n", 251 | " var element = document.createElement('script');\n", 252 | " element.onload = on_load;\n", 253 | " element.onerror = on_error;\n", 254 | " element.async = false;\n", 255 | " element.src = url;\n", 256 | " element.type = \"module\";\n", 257 | " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", 258 | " document.head.appendChild(element);\n", 259 | " }\n", 260 | " for (const name in js_exports) {\n", 261 | " var url = js_exports[name];\n", 262 | " if (skip.indexOf(url) >= 0 || root[name] != null) {\n", 263 | "\tif (!window.requirejs) {\n", 264 | "\t on_load();\n", 265 | "\t}\n", 266 | "\tcontinue;\n", 267 | " }\n", 268 | " var element = document.createElement('script');\n", 269 | " element.onerror = on_error;\n", 270 | " element.async = false;\n", 271 | " element.type = \"module\";\n", 272 | " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", 273 | " element.textContent = `\n", 274 | " import ${name} from \"${url}\"\n", 275 | " window.${name} = ${name}\n", 276 | " window._bokeh_on_load()\n", 277 | " `\n", 278 | " document.head.appendChild(element);\n", 279 | " }\n", 280 | " if (!js_urls.length && !js_modules.length) {\n", 281 | " on_load()\n", 282 | " }\n", 283 | " };\n", 284 | "\n", 285 | " function inject_raw_css(css) {\n", 286 | " const element = document.createElement(\"style\");\n", 287 | " element.appendChild(document.createTextNode(css));\n", 288 | " document.body.appendChild(element);\n", 289 | " }\n", 290 | "\n", 291 | " var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js\", \"https://cdn.holoviz.org/panel/1.3.8/dist/panel.min.js\"];\n", 292 | " var js_modules = [];\n", 293 | " var js_exports = {};\n", 294 | " var css_urls = [];\n", 295 | " var inline_js = [ function(Bokeh) {\n", 296 | " Bokeh.set_log_level(\"info\");\n", 297 | " },\n", 298 | "function(Bokeh) {} // ensure no trailing comma for IE\n", 299 | " ];\n", 300 | "\n", 301 | " function run_inline_js() {\n", 302 | " if ((root.Bokeh !== undefined) || (force === true)) {\n", 303 | " for (var i = 0; i < inline_js.length; i++) {\n", 304 | "\ttry {\n", 305 | " inline_js[i].call(root, root.Bokeh);\n", 306 | "\t} catch(e) {\n", 307 | "\t if (!reloading) {\n", 308 | "\t throw e;\n", 309 | "\t }\n", 310 | "\t}\n", 311 | " }\n", 312 | " // Cache old bokeh versions\n", 313 | " if (Bokeh != undefined && !reloading) {\n", 314 | "\tvar NewBokeh = root.Bokeh;\n", 315 | "\tif (Bokeh.versions === undefined) {\n", 316 | "\t Bokeh.versions = new Map();\n", 317 | "\t}\n", 318 | "\tif (NewBokeh.version !== Bokeh.version) {\n", 319 | "\t Bokeh.versions.set(NewBokeh.version, NewBokeh)\n", 320 | "\t}\n", 321 | "\troot.Bokeh = Bokeh;\n", 322 | " }} else if (Date.now() < root._bokeh_timeout) {\n", 323 | " setTimeout(run_inline_js, 100);\n", 324 | " } else if (!root._bokeh_failed_load) {\n", 325 | " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", 326 | " root._bokeh_failed_load = true;\n", 327 | " }\n", 328 | " root._bokeh_is_initializing = false\n", 329 | " }\n", 330 | "\n", 331 | " function load_or_wait() {\n", 332 | " // Implement a backoff loop that tries to ensure we do not load multiple\n", 333 | " // versions of Bokeh and its dependencies at the same time.\n", 334 | " // In recent versions we use the root._bokeh_is_initializing flag\n", 335 | " // to determine whether there is an ongoing attempt to initialize\n", 336 | " // bokeh, however for backward compatibility we also try to ensure\n", 337 | " // that we do not start loading a newer (Panel>=1.0 and Bokeh>3) version\n", 338 | " // before older versions are fully initialized.\n", 339 | " if (root._bokeh_is_initializing && Date.now() > root._bokeh_timeout) {\n", 340 | " root._bokeh_is_initializing = false;\n", 341 | " root._bokeh_onload_callbacks = undefined;\n", 342 | " console.log(\"Bokeh: BokehJS was loaded multiple times but one version failed to initialize.\");\n", 343 | " load_or_wait();\n", 344 | " } else if (root._bokeh_is_initializing || (typeof root._bokeh_is_initializing === \"undefined\" && root._bokeh_onload_callbacks !== undefined)) {\n", 345 | " setTimeout(load_or_wait, 100);\n", 346 | " } else {\n", 347 | " root._bokeh_is_initializing = true\n", 348 | " root._bokeh_onload_callbacks = []\n", 349 | " var bokeh_loaded = Bokeh != null && (Bokeh.version === py_version || (Bokeh.versions !== undefined && Bokeh.versions.has(py_version)));\n", 350 | " if (!reloading && !bokeh_loaded) {\n", 351 | "\troot.Bokeh = undefined;\n", 352 | " }\n", 353 | " load_libs(css_urls, js_urls, js_modules, js_exports, function() {\n", 354 | "\tconsole.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", 355 | "\trun_inline_js();\n", 356 | " });\n", 357 | " }\n", 358 | " }\n", 359 | " // Give older versions of the autoload script a head-start to ensure\n", 360 | " // they initialize before we start loading newer version.\n", 361 | " setTimeout(load_or_wait, 100)\n", 362 | "}(window));" 363 | ], 364 | "application/vnd.holoviews_load.v0+json": "(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n var py_version = '3.3.4'.replace('rc', '-rc.').replace('.dev', '-dev.');\n var reloading = false;\n var Bokeh = root.Bokeh;\n\n if (typeof (root._bokeh_timeout) === \"undefined\" || force) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks;\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, js_modules, js_exports, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n if (js_modules == null) js_modules = [];\n if (js_exports == null) js_exports = {};\n\n root._bokeh_onload_callbacks.push(callback);\n\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls.length === 0 && js_modules.length === 0 && Object.keys(js_exports).length === 0) {\n run_callbacks();\n return null;\n }\n if (!reloading) {\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n }\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n window._bokeh_on_load = on_load\n\n function on_error() {\n console.error(\"failed to load \" + url);\n }\n\n var skip = [];\n if (window.requirejs) {\n window.requirejs.config({'packages': {}, 'paths': {'jspanel': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/jspanel', 'jspanel-modal': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/extensions/modal/jspanel.modal', 'jspanel-tooltip': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/extensions/tooltip/jspanel.tooltip', 'jspanel-hint': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/extensions/hint/jspanel.hint', 'jspanel-layout': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/extensions/layout/jspanel.layout', 'jspanel-contextmenu': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/extensions/contextmenu/jspanel.contextmenu', 'jspanel-dock': 'https://cdn.jsdelivr.net/npm/jspanel4@4.12.0/dist/extensions/dock/jspanel.dock', 'gridstack': 'https://cdn.jsdelivr.net/npm/gridstack@7.2.3/dist/gridstack-all', 'notyf': 'https://cdn.jsdelivr.net/npm/notyf@3/notyf.min'}, 'shim': {'jspanel': {'exports': 'jsPanel'}, 'gridstack': {'exports': 'GridStack'}}});\n require([\"jspanel\"], function(jsPanel) {\n\twindow.jsPanel = jsPanel\n\ton_load()\n })\n require([\"jspanel-modal\"], function() {\n\ton_load()\n })\n require([\"jspanel-tooltip\"], function() {\n\ton_load()\n })\n require([\"jspanel-hint\"], function() {\n\ton_load()\n })\n require([\"jspanel-layout\"], function() {\n\ton_load()\n })\n require([\"jspanel-contextmenu\"], function() {\n\ton_load()\n })\n require([\"jspanel-dock\"], function() {\n\ton_load()\n })\n require([\"gridstack\"], function(GridStack) {\n\twindow.GridStack = GridStack\n\ton_load()\n })\n require([\"notyf\"], function() {\n\ton_load()\n })\n root._bokeh_is_loading = css_urls.length + 9;\n } else {\n root._bokeh_is_loading = css_urls.length + js_urls.length + js_modules.length + Object.keys(js_exports).length;\n }\n\n var existing_stylesheets = []\n var links = document.getElementsByTagName('link')\n for (var i = 0; i < links.length; i++) {\n var link = links[i]\n if (link.href != null) {\n\texisting_stylesheets.push(link.href)\n }\n }\n for (var i = 0; i < css_urls.length; i++) {\n var url = css_urls[i];\n if (existing_stylesheets.indexOf(url) !== -1) {\n\ton_load()\n\tcontinue;\n }\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n } if (((window['jsPanel'] !== undefined) && (!(window['jsPanel'] instanceof HTMLElement))) || window.requirejs) {\n var urls = ['https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/jspanel.js', 'https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/extensions/modal/jspanel.modal.js', 'https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/extensions/tooltip/jspanel.tooltip.js', 'https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/extensions/hint/jspanel.hint.js', 'https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/extensions/layout/jspanel.layout.js', 'https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/extensions/contextmenu/jspanel.contextmenu.js', 'https://cdn.holoviz.org/panel/1.3.8/dist/bundled/floatpanel/jspanel4@4.12.0/dist/extensions/dock/jspanel.dock.js'];\n for (var i = 0; i < urls.length; i++) {\n skip.push(urls[i])\n }\n } if (((window['GridStack'] !== undefined) && (!(window['GridStack'] instanceof HTMLElement))) || window.requirejs) {\n var urls = ['https://cdn.holoviz.org/panel/1.3.8/dist/bundled/gridstack/gridstack@7.2.3/dist/gridstack-all.js'];\n for (var i = 0; i < urls.length; i++) {\n skip.push(urls[i])\n }\n } if (((window['Notyf'] !== undefined) && (!(window['Notyf'] instanceof HTMLElement))) || window.requirejs) {\n var urls = ['https://cdn.holoviz.org/panel/1.3.8/dist/bundled/notificationarea/notyf@3/notyf.min.js'];\n for (var i = 0; i < urls.length; i++) {\n skip.push(urls[i])\n }\n } var existing_scripts = []\n var scripts = document.getElementsByTagName('script')\n for (var i = 0; i < scripts.length; i++) {\n var script = scripts[i]\n if (script.src != null) {\n\texisting_scripts.push(script.src)\n }\n }\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n if (skip.indexOf(url) !== -1 || existing_scripts.indexOf(url) !== -1) {\n\tif (!window.requirejs) {\n\t on_load();\n\t}\n\tcontinue;\n }\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n for (var i = 0; i < js_modules.length; i++) {\n var url = js_modules[i];\n if (skip.indexOf(url) !== -1 || existing_scripts.indexOf(url) !== -1) {\n\tif (!window.requirejs) {\n\t on_load();\n\t}\n\tcontinue;\n }\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n element.type = \"module\";\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n for (const name in js_exports) {\n var url = js_exports[name];\n if (skip.indexOf(url) >= 0 || root[name] != null) {\n\tif (!window.requirejs) {\n\t on_load();\n\t}\n\tcontinue;\n }\n var element = document.createElement('script');\n element.onerror = on_error;\n element.async = false;\n element.type = \"module\";\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n element.textContent = `\n import ${name} from \"${url}\"\n window.${name} = ${name}\n window._bokeh_on_load()\n `\n document.head.appendChild(element);\n }\n if (!js_urls.length && !js_modules.length) {\n on_load()\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js\", \"https://cdn.holoviz.org/panel/1.3.8/dist/panel.min.js\"];\n var js_modules = [];\n var js_exports = {};\n var css_urls = [];\n var inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {} // ensure no trailing comma for IE\n ];\n\n function run_inline_js() {\n if ((root.Bokeh !== undefined) || (force === true)) {\n for (var i = 0; i < inline_js.length; i++) {\n\ttry {\n inline_js[i].call(root, root.Bokeh);\n\t} catch(e) {\n\t if (!reloading) {\n\t throw e;\n\t }\n\t}\n }\n // Cache old bokeh versions\n if (Bokeh != undefined && !reloading) {\n\tvar NewBokeh = root.Bokeh;\n\tif (Bokeh.versions === undefined) {\n\t Bokeh.versions = new Map();\n\t}\n\tif (NewBokeh.version !== Bokeh.version) {\n\t Bokeh.versions.set(NewBokeh.version, NewBokeh)\n\t}\n\troot.Bokeh = Bokeh;\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n }\n root._bokeh_is_initializing = false\n }\n\n function load_or_wait() {\n // Implement a backoff loop that tries to ensure we do not load multiple\n // versions of Bokeh and its dependencies at the same time.\n // In recent versions we use the root._bokeh_is_initializing flag\n // to determine whether there is an ongoing attempt to initialize\n // bokeh, however for backward compatibility we also try to ensure\n // that we do not start loading a newer (Panel>=1.0 and Bokeh>3) version\n // before older versions are fully initialized.\n if (root._bokeh_is_initializing && Date.now() > root._bokeh_timeout) {\n root._bokeh_is_initializing = false;\n root._bokeh_onload_callbacks = undefined;\n console.log(\"Bokeh: BokehJS was loaded multiple times but one version failed to initialize.\");\n load_or_wait();\n } else if (root._bokeh_is_initializing || (typeof root._bokeh_is_initializing === \"undefined\" && root._bokeh_onload_callbacks !== undefined)) {\n setTimeout(load_or_wait, 100);\n } else {\n root._bokeh_is_initializing = true\n root._bokeh_onload_callbacks = []\n var bokeh_loaded = Bokeh != null && (Bokeh.version === py_version || (Bokeh.versions !== undefined && Bokeh.versions.has(py_version)));\n if (!reloading && !bokeh_loaded) {\n\troot.Bokeh = undefined;\n }\n load_libs(css_urls, js_urls, js_modules, js_exports, function() {\n\tconsole.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n\trun_inline_js();\n });\n }\n }\n // Give older versions of the autoload script a head-start to ensure\n // they initialize before we start loading newer version.\n setTimeout(load_or_wait, 100)\n}(window));" 365 | }, 366 | "metadata": {}, 367 | "output_type": "display_data" 368 | }, 369 | { 370 | "data": { 371 | "application/javascript": [ 372 | "\n", 373 | "if ((window.PyViz === undefined) || (window.PyViz instanceof HTMLElement)) {\n", 374 | " window.PyViz = {comms: {}, comm_status:{}, kernels:{}, receivers: {}, plot_index: []}\n", 375 | "}\n", 376 | "\n", 377 | "\n", 378 | " function JupyterCommManager() {\n", 379 | " }\n", 380 | "\n", 381 | " JupyterCommManager.prototype.register_target = function(plot_id, comm_id, msg_handler) {\n", 382 | " if (window.comm_manager || ((window.Jupyter !== undefined) && (Jupyter.notebook.kernel != null))) {\n", 383 | " var comm_manager = window.comm_manager || Jupyter.notebook.kernel.comm_manager;\n", 384 | " comm_manager.register_target(comm_id, function(comm) {\n", 385 | " comm.on_msg(msg_handler);\n", 386 | " });\n", 387 | " } else if ((plot_id in window.PyViz.kernels) && (window.PyViz.kernels[plot_id])) {\n", 388 | " window.PyViz.kernels[plot_id].registerCommTarget(comm_id, function(comm) {\n", 389 | " comm.onMsg = msg_handler;\n", 390 | " });\n", 391 | " } else if (typeof google != 'undefined' && google.colab.kernel != null) {\n", 392 | " google.colab.kernel.comms.registerTarget(comm_id, (comm) => {\n", 393 | " var messages = comm.messages[Symbol.asyncIterator]();\n", 394 | " function processIteratorResult(result) {\n", 395 | " var message = result.value;\n", 396 | " console.log(message)\n", 397 | " var content = {data: message.data, comm_id};\n", 398 | " var buffers = []\n", 399 | " for (var buffer of message.buffers || []) {\n", 400 | " buffers.push(new DataView(buffer))\n", 401 | " }\n", 402 | " var metadata = message.metadata || {};\n", 403 | " var msg = {content, buffers, metadata}\n", 404 | " msg_handler(msg);\n", 405 | " return messages.next().then(processIteratorResult);\n", 406 | " }\n", 407 | " return messages.next().then(processIteratorResult);\n", 408 | " })\n", 409 | " }\n", 410 | " }\n", 411 | "\n", 412 | " JupyterCommManager.prototype.get_client_comm = function(plot_id, comm_id, msg_handler) {\n", 413 | " if (comm_id in window.PyViz.comms) {\n", 414 | " return window.PyViz.comms[comm_id];\n", 415 | " } else if (window.comm_manager || ((window.Jupyter !== undefined) && (Jupyter.notebook.kernel != null))) {\n", 416 | " var comm_manager = window.comm_manager || Jupyter.notebook.kernel.comm_manager;\n", 417 | " var comm = comm_manager.new_comm(comm_id, {}, {}, {}, comm_id);\n", 418 | " if (msg_handler) {\n", 419 | " comm.on_msg(msg_handler);\n", 420 | " }\n", 421 | " } else if ((plot_id in window.PyViz.kernels) && (window.PyViz.kernels[plot_id])) {\n", 422 | " var comm = window.PyViz.kernels[plot_id].connectToComm(comm_id);\n", 423 | " comm.open();\n", 424 | " if (msg_handler) {\n", 425 | " comm.onMsg = msg_handler;\n", 426 | " }\n", 427 | " } else if (typeof google != 'undefined' && google.colab.kernel != null) {\n", 428 | " var comm_promise = google.colab.kernel.comms.open(comm_id)\n", 429 | " comm_promise.then((comm) => {\n", 430 | " window.PyViz.comms[comm_id] = comm;\n", 431 | " if (msg_handler) {\n", 432 | " var messages = comm.messages[Symbol.asyncIterator]();\n", 433 | " function processIteratorResult(result) {\n", 434 | " var message = result.value;\n", 435 | " var content = {data: message.data};\n", 436 | " var metadata = message.metadata || {comm_id};\n", 437 | " var msg = {content, metadata}\n", 438 | " msg_handler(msg);\n", 439 | " return messages.next().then(processIteratorResult);\n", 440 | " }\n", 441 | " return messages.next().then(processIteratorResult);\n", 442 | " }\n", 443 | " }) \n", 444 | " var sendClosure = (data, metadata, buffers, disposeOnDone) => {\n", 445 | " return comm_promise.then((comm) => {\n", 446 | " comm.send(data, metadata, buffers, disposeOnDone);\n", 447 | " });\n", 448 | " };\n", 449 | " var comm = {\n", 450 | " send: sendClosure\n", 451 | " };\n", 452 | " }\n", 453 | " window.PyViz.comms[comm_id] = comm;\n", 454 | " return comm;\n", 455 | " }\n", 456 | " window.PyViz.comm_manager = new JupyterCommManager();\n", 457 | " \n", 458 | "\n", 459 | "\n", 460 | "var JS_MIME_TYPE = 'application/javascript';\n", 461 | "var HTML_MIME_TYPE = 'text/html';\n", 462 | "var EXEC_MIME_TYPE = 'application/vnd.holoviews_exec.v0+json';\n", 463 | "var CLASS_NAME = 'output';\n", 464 | "\n", 465 | "/**\n", 466 | " * Render data to the DOM node\n", 467 | " */\n", 468 | "function render(props, node) {\n", 469 | " var div = document.createElement(\"div\");\n", 470 | " var script = document.createElement(\"script\");\n", 471 | " node.appendChild(div);\n", 472 | " node.appendChild(script);\n", 473 | "}\n", 474 | "\n", 475 | "/**\n", 476 | " * Handle when a new output is added\n", 477 | " */\n", 478 | "function handle_add_output(event, handle) {\n", 479 | " var output_area = handle.output_area;\n", 480 | " var output = handle.output;\n", 481 | " if ((output.data == undefined) || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n", 482 | " return\n", 483 | " }\n", 484 | " var id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", 485 | " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", 486 | " if (id !== undefined) {\n", 487 | " var nchildren = toinsert.length;\n", 488 | " var html_node = toinsert[nchildren-1].children[0];\n", 489 | " html_node.innerHTML = output.data[HTML_MIME_TYPE];\n", 490 | " var scripts = [];\n", 491 | " var nodelist = html_node.querySelectorAll(\"script\");\n", 492 | " for (var i in nodelist) {\n", 493 | " if (nodelist.hasOwnProperty(i)) {\n", 494 | " scripts.push(nodelist[i])\n", 495 | " }\n", 496 | " }\n", 497 | "\n", 498 | " scripts.forEach( function (oldScript) {\n", 499 | " var newScript = document.createElement(\"script\");\n", 500 | " var attrs = [];\n", 501 | " var nodemap = oldScript.attributes;\n", 502 | " for (var j in nodemap) {\n", 503 | " if (nodemap.hasOwnProperty(j)) {\n", 504 | " attrs.push(nodemap[j])\n", 505 | " }\n", 506 | " }\n", 507 | " attrs.forEach(function(attr) { newScript.setAttribute(attr.name, attr.value) });\n", 508 | " newScript.appendChild(document.createTextNode(oldScript.innerHTML));\n", 509 | " oldScript.parentNode.replaceChild(newScript, oldScript);\n", 510 | " });\n", 511 | " if (JS_MIME_TYPE in output.data) {\n", 512 | " toinsert[nchildren-1].children[1].textContent = output.data[JS_MIME_TYPE];\n", 513 | " }\n", 514 | " output_area._hv_plot_id = id;\n", 515 | " if ((window.Bokeh !== undefined) && (id in Bokeh.index)) {\n", 516 | " window.PyViz.plot_index[id] = Bokeh.index[id];\n", 517 | " } else {\n", 518 | " window.PyViz.plot_index[id] = null;\n", 519 | " }\n", 520 | " } else if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", 521 | " var bk_div = document.createElement(\"div\");\n", 522 | " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", 523 | " var script_attrs = bk_div.children[0].attributes;\n", 524 | " for (var i = 0; i < script_attrs.length; i++) {\n", 525 | " toinsert[toinsert.length - 1].childNodes[1].setAttribute(script_attrs[i].name, script_attrs[i].value);\n", 526 | " }\n", 527 | " // store reference to server id on output_area\n", 528 | " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", 529 | " }\n", 530 | "}\n", 531 | "\n", 532 | "/**\n", 533 | " * Handle when an output is cleared or removed\n", 534 | " */\n", 535 | "function handle_clear_output(event, handle) {\n", 536 | " var id = handle.cell.output_area._hv_plot_id;\n", 537 | " var server_id = handle.cell.output_area._bokeh_server_id;\n", 538 | " if (((id === undefined) || !(id in PyViz.plot_index)) && (server_id !== undefined)) { return; }\n", 539 | " var comm = window.PyViz.comm_manager.get_client_comm(\"hv-extension-comm\", \"hv-extension-comm\", function () {});\n", 540 | " if (server_id !== null) {\n", 541 | " comm.send({event_type: 'server_delete', 'id': server_id});\n", 542 | " return;\n", 543 | " } else if (comm !== null) {\n", 544 | " comm.send({event_type: 'delete', 'id': id});\n", 545 | " }\n", 546 | " delete PyViz.plot_index[id];\n", 547 | " if ((window.Bokeh !== undefined) & (id in window.Bokeh.index)) {\n", 548 | " var doc = window.Bokeh.index[id].model.document\n", 549 | " doc.clear();\n", 550 | " const i = window.Bokeh.documents.indexOf(doc);\n", 551 | " if (i > -1) {\n", 552 | " window.Bokeh.documents.splice(i, 1);\n", 553 | " }\n", 554 | " }\n", 555 | "}\n", 556 | "\n", 557 | "/**\n", 558 | " * Handle kernel restart event\n", 559 | " */\n", 560 | "function handle_kernel_cleanup(event, handle) {\n", 561 | " delete PyViz.comms[\"hv-extension-comm\"];\n", 562 | " window.PyViz.plot_index = {}\n", 563 | "}\n", 564 | "\n", 565 | "/**\n", 566 | " * Handle update_display_data messages\n", 567 | " */\n", 568 | "function handle_update_output(event, handle) {\n", 569 | " handle_clear_output(event, {cell: {output_area: handle.output_area}})\n", 570 | " handle_add_output(event, handle)\n", 571 | "}\n", 572 | "\n", 573 | "function register_renderer(events, OutputArea) {\n", 574 | " function append_mime(data, metadata, element) {\n", 575 | " // create a DOM node to render to\n", 576 | " var toinsert = this.create_output_subarea(\n", 577 | " metadata,\n", 578 | " CLASS_NAME,\n", 579 | " EXEC_MIME_TYPE\n", 580 | " );\n", 581 | " this.keyboard_manager.register_events(toinsert);\n", 582 | " // Render to node\n", 583 | " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", 584 | " render(props, toinsert[0]);\n", 585 | " element.append(toinsert);\n", 586 | " return toinsert\n", 587 | " }\n", 588 | "\n", 589 | " events.on('output_added.OutputArea', handle_add_output);\n", 590 | " events.on('output_updated.OutputArea', handle_update_output);\n", 591 | " events.on('clear_output.CodeCell', handle_clear_output);\n", 592 | " events.on('delete.Cell', handle_clear_output);\n", 593 | " events.on('kernel_ready.Kernel', handle_kernel_cleanup);\n", 594 | "\n", 595 | " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", 596 | " safe: true,\n", 597 | " index: 0\n", 598 | " });\n", 599 | "}\n", 600 | "\n", 601 | "if (window.Jupyter !== undefined) {\n", 602 | " try {\n", 603 | " var events = require('base/js/events');\n", 604 | " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", 605 | " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", 606 | " register_renderer(events, OutputArea);\n", 607 | " }\n", 608 | " } catch(err) {\n", 609 | " }\n", 610 | "}\n" 611 | ], 612 | "application/vnd.holoviews_load.v0+json": "\nif ((window.PyViz === undefined) || (window.PyViz instanceof HTMLElement)) {\n window.PyViz = {comms: {}, comm_status:{}, kernels:{}, receivers: {}, plot_index: []}\n}\n\n\n function JupyterCommManager() {\n }\n\n JupyterCommManager.prototype.register_target = function(plot_id, comm_id, msg_handler) {\n if (window.comm_manager || ((window.Jupyter !== undefined) && (Jupyter.notebook.kernel != null))) {\n var comm_manager = window.comm_manager || Jupyter.notebook.kernel.comm_manager;\n comm_manager.register_target(comm_id, function(comm) {\n comm.on_msg(msg_handler);\n });\n } else if ((plot_id in window.PyViz.kernels) && (window.PyViz.kernels[plot_id])) {\n window.PyViz.kernels[plot_id].registerCommTarget(comm_id, function(comm) {\n comm.onMsg = msg_handler;\n });\n } else if (typeof google != 'undefined' && google.colab.kernel != null) {\n google.colab.kernel.comms.registerTarget(comm_id, (comm) => {\n var messages = comm.messages[Symbol.asyncIterator]();\n function processIteratorResult(result) {\n var message = result.value;\n console.log(message)\n var content = {data: message.data, comm_id};\n var buffers = []\n for (var buffer of message.buffers || []) {\n buffers.push(new DataView(buffer))\n }\n var metadata = message.metadata || {};\n var msg = {content, buffers, metadata}\n msg_handler(msg);\n return messages.next().then(processIteratorResult);\n }\n return messages.next().then(processIteratorResult);\n })\n }\n }\n\n JupyterCommManager.prototype.get_client_comm = function(plot_id, comm_id, msg_handler) {\n if (comm_id in window.PyViz.comms) {\n return window.PyViz.comms[comm_id];\n } else if (window.comm_manager || ((window.Jupyter !== undefined) && (Jupyter.notebook.kernel != null))) {\n var comm_manager = window.comm_manager || Jupyter.notebook.kernel.comm_manager;\n var comm = comm_manager.new_comm(comm_id, {}, {}, {}, comm_id);\n if (msg_handler) {\n comm.on_msg(msg_handler);\n }\n } else if ((plot_id in window.PyViz.kernels) && (window.PyViz.kernels[plot_id])) {\n var comm = window.PyViz.kernels[plot_id].connectToComm(comm_id);\n comm.open();\n if (msg_handler) {\n comm.onMsg = msg_handler;\n }\n } else if (typeof google != 'undefined' && google.colab.kernel != null) {\n var comm_promise = google.colab.kernel.comms.open(comm_id)\n comm_promise.then((comm) => {\n window.PyViz.comms[comm_id] = comm;\n if (msg_handler) {\n var messages = comm.messages[Symbol.asyncIterator]();\n function processIteratorResult(result) {\n var message = result.value;\n var content = {data: message.data};\n var metadata = message.metadata || {comm_id};\n var msg = {content, metadata}\n msg_handler(msg);\n return messages.next().then(processIteratorResult);\n }\n return messages.next().then(processIteratorResult);\n }\n }) \n var sendClosure = (data, metadata, buffers, disposeOnDone) => {\n return comm_promise.then((comm) => {\n comm.send(data, metadata, buffers, disposeOnDone);\n });\n };\n var comm = {\n send: sendClosure\n };\n }\n window.PyViz.comms[comm_id] = comm;\n return comm;\n }\n window.PyViz.comm_manager = new JupyterCommManager();\n \n\n\nvar JS_MIME_TYPE = 'application/javascript';\nvar HTML_MIME_TYPE = 'text/html';\nvar EXEC_MIME_TYPE = 'application/vnd.holoviews_exec.v0+json';\nvar CLASS_NAME = 'output';\n\n/**\n * Render data to the DOM node\n */\nfunction render(props, node) {\n var div = document.createElement(\"div\");\n var script = document.createElement(\"script\");\n node.appendChild(div);\n node.appendChild(script);\n}\n\n/**\n * Handle when a new output is added\n */\nfunction handle_add_output(event, handle) {\n var output_area = handle.output_area;\n var output = handle.output;\n if ((output.data == undefined) || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n return\n }\n var id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n if (id !== undefined) {\n var nchildren = toinsert.length;\n var html_node = toinsert[nchildren-1].children[0];\n html_node.innerHTML = output.data[HTML_MIME_TYPE];\n var scripts = [];\n var nodelist = html_node.querySelectorAll(\"script\");\n for (var i in nodelist) {\n if (nodelist.hasOwnProperty(i)) {\n scripts.push(nodelist[i])\n }\n }\n\n scripts.forEach( function (oldScript) {\n var newScript = document.createElement(\"script\");\n var attrs = [];\n var nodemap = oldScript.attributes;\n for (var j in nodemap) {\n if (nodemap.hasOwnProperty(j)) {\n attrs.push(nodemap[j])\n }\n }\n attrs.forEach(function(attr) { newScript.setAttribute(attr.name, attr.value) });\n newScript.appendChild(document.createTextNode(oldScript.innerHTML));\n oldScript.parentNode.replaceChild(newScript, oldScript);\n });\n if (JS_MIME_TYPE in output.data) {\n toinsert[nchildren-1].children[1].textContent = output.data[JS_MIME_TYPE];\n }\n output_area._hv_plot_id = id;\n if ((window.Bokeh !== undefined) && (id in Bokeh.index)) {\n window.PyViz.plot_index[id] = Bokeh.index[id];\n } else {\n window.PyViz.plot_index[id] = null;\n }\n } else if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n var bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n var script_attrs = bk_div.children[0].attributes;\n for (var i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].childNodes[1].setAttribute(script_attrs[i].name, script_attrs[i].value);\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n}\n\n/**\n * Handle when an output is cleared or removed\n */\nfunction handle_clear_output(event, handle) {\n var id = handle.cell.output_area._hv_plot_id;\n var server_id = handle.cell.output_area._bokeh_server_id;\n if (((id === undefined) || !(id in PyViz.plot_index)) && (server_id !== undefined)) { return; }\n var comm = window.PyViz.comm_manager.get_client_comm(\"hv-extension-comm\", \"hv-extension-comm\", function () {});\n if (server_id !== null) {\n comm.send({event_type: 'server_delete', 'id': server_id});\n return;\n } else if (comm !== null) {\n comm.send({event_type: 'delete', 'id': id});\n }\n delete PyViz.plot_index[id];\n if ((window.Bokeh !== undefined) & (id in window.Bokeh.index)) {\n var doc = window.Bokeh.index[id].model.document\n doc.clear();\n const i = window.Bokeh.documents.indexOf(doc);\n if (i > -1) {\n window.Bokeh.documents.splice(i, 1);\n }\n }\n}\n\n/**\n * Handle kernel restart event\n */\nfunction handle_kernel_cleanup(event, handle) {\n delete PyViz.comms[\"hv-extension-comm\"];\n window.PyViz.plot_index = {}\n}\n\n/**\n * Handle update_display_data messages\n */\nfunction handle_update_output(event, handle) {\n handle_clear_output(event, {cell: {output_area: handle.output_area}})\n handle_add_output(event, handle)\n}\n\nfunction register_renderer(events, OutputArea) {\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n var toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n render(props, toinsert[0]);\n element.append(toinsert);\n return toinsert\n }\n\n events.on('output_added.OutputArea', handle_add_output);\n events.on('output_updated.OutputArea', handle_update_output);\n events.on('clear_output.CodeCell', handle_clear_output);\n events.on('delete.Cell', handle_clear_output);\n events.on('kernel_ready.Kernel', handle_kernel_cleanup);\n\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n safe: true,\n index: 0\n });\n}\n\nif (window.Jupyter !== undefined) {\n try {\n var events = require('base/js/events');\n var OutputArea = require('notebook/js/outputarea').OutputArea;\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n } catch(err) {\n }\n}\n" 613 | }, 614 | "metadata": {}, 615 | "output_type": "display_data" 616 | }, 617 | { 618 | "data": { 619 | "text/html": [ 620 | "" 636 | ] 637 | }, 638 | "metadata": {}, 639 | "output_type": "display_data" 640 | }, 641 | { 642 | "data": { 643 | "application/vnd.holoviews_exec.v0+json": "", 644 | "text/html": [ 645 | "
\n", 646 | "
\n", 647 | "
\n", 648 | "" 710 | ] 711 | }, 712 | "metadata": { 713 | "application/vnd.holoviews_exec.v0+json": { 714 | "id": "p1002" 715 | } 716 | }, 717 | "output_type": "display_data" 718 | }, 719 | { 720 | "data": {}, 721 | "metadata": {}, 722 | "output_type": "display_data" 723 | }, 724 | { 725 | "data": { 726 | "application/vnd.holoviews_exec.v0+json": "", 727 | "text/html": [ 728 | "
\n", 729 | "
\n", 730 | "
\n", 731 | "" 793 | ], 794 | "text/plain": [ 795 | ":Curve [date] (len)" 796 | ] 797 | }, 798 | "execution_count": 4, 799 | "metadata": { 800 | "application/vnd.holoviews_exec.v0+json": { 801 | "id": "p1004" 802 | } 803 | }, 804 | "output_type": "execute_result" 805 | } 806 | ], 807 | "source": [ 808 | "(df\n", 809 | " .with_columns(date=pl.col(\"datetime\").dt.date())\n", 810 | " .group_by(\"date\")\n", 811 | " .len()\n", 812 | " .plot(\"date\", \"len\"))" 813 | ] 814 | }, 815 | { 816 | "cell_type": "markdown", 817 | "id": "d8a3b40f-6b27-4742-938e-378f69e9d648", 818 | "metadata": {}, 819 | "source": [ 820 | "Let us now move towards Machine Learning." 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 5, 826 | "id": "5d19a584-3c94-4c04-b99e-3bc50a3cf4ea", 827 | "metadata": {}, 828 | "outputs": [], 829 | "source": [ 830 | "df_target = (df\n", 831 | " .group_by(\"player_id\")\n", 832 | " .agg(pl.col(\"datetime\").max())\n", 833 | " .with_columns(target=pl.col(\"datetime\").dt.year() >= 2007 )\n", 834 | " .drop(\"datetime\"))" 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": 6, 840 | "id": "b855bfea-2e88-4ae7-920c-4f84d8191d43", 841 | "metadata": {}, 842 | "outputs": [], 843 | "source": [ 844 | "ml_df = (df\n", 845 | " .group_by(\"player_id\")\n", 846 | " .agg(\n", 847 | " pl.col(\"level\").max(),\n", 848 | " pl.col(\"class\").first(),\n", 849 | " pl.col(\"race\").first(),\n", 850 | " pl.len().alias(\"n_row\"),\n", 851 | " )\n", 852 | " .join(df_target, on=\"player_id\")\n", 853 | " .drop(\"player_id\")\n", 854 | " .filter(pl.col(\"n_row\") > 10)\n", 855 | ")" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 7, 861 | "id": "fb897ee0-91f3-4292-a911-643c8e051ac5", 862 | "metadata": {}, 863 | "outputs": [ 864 | { 865 | "data": { 866 | "text/html": [ 867 | "
\n", 874 | "shape: (5, 4)
levelclassracen_row
i8catcatu32
18"Hunter""Troll"106
57"Death Knight""Blood Elf"16
8"Hunter""Troll"30
70"Priest""Undead"4289
12"Paladin""Blood Elf"31
" 875 | ], 876 | "text/plain": [ 877 | "shape: (5, 4)\n", 878 | "┌───────┬──────────────┬───────────┬───────┐\n", 879 | "│ level ┆ class ┆ race ┆ n_row │\n", 880 | "│ --- ┆ --- ┆ --- ┆ --- │\n", 881 | "│ i8 ┆ cat ┆ cat ┆ u32 │\n", 882 | "╞═══════╪══════════════╪═══════════╪═══════╡\n", 883 | "│ 18 ┆ Hunter ┆ Troll ┆ 106 │\n", 884 | "│ 57 ┆ Death Knight ┆ Blood Elf ┆ 16 │\n", 885 | "│ 8 ┆ Hunter ┆ Troll ┆ 30 │\n", 886 | "│ 70 ┆ Priest ┆ Undead ┆ 4289 │\n", 887 | "│ 12 ┆ Paladin ┆ Blood Elf ┆ 31 │\n", 888 | "└───────┴──────────────┴───────────┴───────┘" 889 | ] 890 | }, 891 | "execution_count": 7, 892 | "metadata": {}, 893 | "output_type": "execute_result" 894 | } 895 | ], 896 | "source": [ 897 | "y = np.array(ml_df['target'])\n", 898 | "X = ml_df.drop(\"target\")\n", 899 | "X.head()" 900 | ] 901 | }, 902 | { 903 | "cell_type": "code", 904 | "execution_count": 8, 905 | "id": "94e670be-d177-45ab-8a6f-99fdf8d31ec7", 906 | "metadata": {}, 907 | "outputs": [], 908 | "source": [ 909 | "from skrub import SelectCols\n", 910 | "from sklearn.pipeline import make_pipeline, make_union\n", 911 | "from sklearn.preprocessing import OneHotEncoder\n", 912 | "from sklearn.linear_model import LogisticRegression\n", 913 | "from sklearn.model_selection import cross_validate\n", 914 | "from sklearn.metrics import make_scorer, recall_score, precision_score, accuracy_score" 915 | ] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": 9, 920 | "id": "1de987f5-6de9-47f8-93c9-8200bc167d5b", 921 | "metadata": {}, 922 | "outputs": [], 923 | "source": [ 924 | "pipe = make_pipeline(\n", 925 | " make_union(\n", 926 | " make_pipeline(\n", 927 | " SelectCols([\"class\", \"race\"]),\n", 928 | " OneHotEncoder(handle_unknown=\"infrequent_if_exist\")\n", 929 | " ),\n", 930 | " make_pipeline(\n", 931 | " SelectCols([\"level\"]),\n", 932 | " )\n", 933 | " ),\n", 934 | " LogisticRegression(max_iter=2_000)\n", 935 | ")" 936 | ] 937 | }, 938 | { 939 | "cell_type": "code", 940 | "execution_count": 10, 941 | "id": "e4e329ad-6b91-4644-b532-49e38056f926", 942 | "metadata": {}, 943 | "outputs": [ 944 | { 945 | "data": { 946 | "text/plain": [ 947 | "0.8308071286730513" 948 | ] 949 | }, 950 | "execution_count": 10, 951 | "metadata": {}, 952 | "output_type": "execute_result" 953 | } 954 | ], 955 | "source": [ 956 | "import numpy as np\n", 957 | "\n", 958 | "np.mean(pipe.fit(X, y).predict(X) == y)" 959 | ] 960 | }, 961 | { 962 | "cell_type": "code", 963 | "execution_count": 11, 964 | "id": "63b69b6e-88e0-41ba-9588-c04d3beb36a7", 965 | "metadata": {}, 966 | "outputs": [ 967 | { 968 | "data": { 969 | "text/plain": [ 970 | "{'fit_time': array([0.1542871 , 0.08812308, 0.21422696, 0.1478188 , 0.13812685]),\n", 971 | " 'score_time': array([0.00765991, 0.00752473, 0.00674915, 0.00745916, 0.00724912]),\n", 972 | " 'test_accuracy': array([0.83084077, 0.83084077, 0.83071509, 0.83081951, 0.83081951]),\n", 973 | " 'test_precision': array([0.83084077, 0.83084077, 0.83071509, 0.83081951, 0.83081951]),\n", 974 | " 'test_recall': array([1., 1., 1., 1., 1.])}" 975 | ] 976 | }, 977 | "execution_count": 11, 978 | "metadata": {}, 979 | "output_type": "execute_result" 980 | } 981 | ], 982 | "source": [ 983 | "from sklearn.metrics import precision_score, accuracy_score, recall_score, make_scorer\n", 984 | "from sklearn.model_selection import cross_validate\n", 985 | "\n", 986 | "y = np.array(ml_df['target'])\n", 987 | "X = ml_df.drop(\"target\")\n", 988 | "\n", 989 | "scorers = {\n", 990 | " \"accuracy\": make_scorer(accuracy_score), \n", 991 | " \"precision\": make_scorer(precision_score), \n", 992 | " \"recall\": make_scorer(recall_score)\n", 993 | "}\n", 994 | "cross_validate(pipe, X, y, cv=5, scoring=scorers)" 995 | ] 996 | }, 997 | { 998 | "cell_type": "markdown", 999 | "id": "07f6c225-f27e-46b9-ba67-067e4b50a342", 1000 | "metadata": {}, 1001 | "source": [ 1002 | "Ah yes. These numbers all give us confidence ... but alas ... this is a dataleak that can become a black hole. " 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "code", 1007 | "execution_count": 12, 1008 | "id": "48632e3a-605f-4b0e-acd6-ca6fec9704eb", 1009 | "metadata": {}, 1010 | "outputs": [ 1011 | { 1012 | "data": { 1013 | "text/plain": [ 1014 | "(0.8308071286730513, (39783,))" 1015 | ] 1016 | }, 1017 | "execution_count": 12, 1018 | "metadata": {}, 1019 | "output_type": "execute_result" 1020 | } 1021 | ], 1022 | "source": [ 1023 | "np.mean(y), y.shape" 1024 | ] 1025 | }, 1026 | { 1027 | "cell_type": "markdown", 1028 | "id": "3d373658-a72d-4601-b9a3-2b8c8d5ad6bc", 1029 | "metadata": {}, 1030 | "source": [ 1031 | "Let's write a safety mechanism now." 1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "code", 1036 | "execution_count": 13, 1037 | "id": "51ac8774-7638-400c-83ce-a4a11a41c01b", 1038 | "metadata": {}, 1039 | "outputs": [], 1040 | "source": [ 1041 | "from datetime import datetime, timedelta\n", 1042 | "\n", 1043 | "def churn_dataset_generator(df, user_id, feature_pipeline, \n", 1044 | " info_period=180, \n", 1045 | " checking_period=180, \n", 1046 | " start_date=datetime(2007, 1, 1), \n", 1047 | " end_date=datetime(2007, 12, 31), \n", 1048 | " step=\"1mo\", \n", 1049 | " time_col=\"datetime\"):\n", 1050 | " \"\"\"\n", 1051 | " Generates X,y pairs for churn related machine learning, with way less temporal data leaks to worry about. \n", 1052 | "\n", 1053 | " Arguments:\n", 1054 | "\n", 1055 | " - df: a Polars dataframe that contains logs over time for users\n", 1056 | " - user_id: the column name that depicts the user id\n", 1057 | " - feature_pipeline: a Polars compatible function that generatres ML features to go in `X`\n", 1058 | " - input_period: the number of days that the input period lasts\n", 1059 | " - checking_period: the number of days that the checking period lasts\n", 1060 | " - start_date: the start date for X,y-pair generation\n", 1061 | " - end_date: the end date for X,y-pair generation\n", 1062 | " - step: stepsize over time for new X,y-pairs. defaults to a month. \n", 1063 | " - time_col: column name that depicts the datetime stamp\n", 1064 | " \"\"\"\n", 1065 | " cutoff_start = pl.datetime_range(start_date, end_date, step, eager=True).alias(time_col)\n", 1066 | " min_date = df[time_col].min()\n", 1067 | " max_date = df[time_col].max()\n", 1068 | " \n", 1069 | " for start in cutoff_start.to_list():\n", 1070 | " info_period_start = start - timedelta(days=info_period)\n", 1071 | " checking_period_end = start + timedelta(days=checking_period)\n", 1072 | " if info_period_start < min_date:\n", 1073 | " continue\n", 1074 | " if checking_period_end > max_date:\n", 1075 | " continue\n", 1076 | " print(info_period_start, start, checking_period_end, min_date, max_date)\n", 1077 | " train_info = df.filter(pl.col(time_col) < start, pl.col(time_col) >= (start - timedelta(days=info_period)))\n", 1078 | " valid_info = df.filter(pl.col(time_col) >= start, pl.col(time_col) < (start + timedelta(days=checking_period)))\n", 1079 | " \n", 1080 | " \n", 1081 | " target = valid_info.select(\"player_id\").unique().with_columns(target=True)\n", 1082 | "\n", 1083 | " ml_df = (train_info\n", 1084 | " .pipe(feature_pipeline)\n", 1085 | " .join(target, on=user_id, how=\"left\")\n", 1086 | " .with_columns(target=pl.when(pl.col(\"target\")).then(True).otherwise(False)))\n", 1087 | " \n", 1088 | " X = ml_df.drop(\"target\", \"player_id\")\n", 1089 | " y = np.array(ml_df[\"target\"]).astype(int)\n", 1090 | " \n", 1091 | " yield X, y" 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "markdown", 1096 | "id": "e56c34d9-175b-4ac7-9cf5-daedcc8714e9", 1097 | "metadata": {}, 1098 | "source": [ 1099 | "Now, when we run the aggregation, we won't steal data from the future that we can't use" 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "code", 1104 | "execution_count": 14, 1105 | "id": "9e14641c-3ee3-4f8d-b3c7-581363db4207", 1106 | "metadata": {}, 1107 | "outputs": [ 1108 | { 1109 | "name": "stdout", 1110 | "output_type": "stream", 1111 | "text": [ 1112 | "2006-09-03 00:00:00 2007-01-01 00:00:00 2007-05-01 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59\n", 1113 | "shape: (5, 5)\n", 1114 | "┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐\n", 1115 | "│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │\n", 1116 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1117 | "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1118 | "╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡\n", 1119 | "│ 0.027518 ┆ 0.003589 ┆ 0.700935 ┆ 0.762815 ┆ 0.639476 │\n", 1120 | "│ 0.026381 ┆ 0.003645 ┆ 0.708333 ┆ 0.767123 ┆ 0.652586 │\n", 1121 | "│ 0.023318 ┆ 0.003554 ┆ 0.702882 ┆ 0.767075 ┆ 0.638019 │\n", 1122 | "│ 0.021533 ┆ 0.004087 ┆ 0.716511 ┆ 0.768719 ┆ 0.672489 │\n", 1123 | "│ 0.025888 ┆ 0.003689 ┆ 0.718458 ┆ 0.77892 ┆ 0.661572 │\n", 1124 | "└──────────┴────────────┴───────────────┴────────────────┴─────────────┘\n", 1125 | "2006-10-04 00:00:00 2007-02-01 00:00:00 2007-06-01 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59\n", 1126 | "shape: (5, 5)\n", 1127 | "┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐\n", 1128 | "│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │\n", 1129 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1130 | "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1131 | "╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡\n", 1132 | "│ 0.022433 ┆ 0.003778 ┆ 0.716697 ┆ 0.776 ┆ 0.626705 │\n", 1133 | "│ 0.029459 ┆ 0.004338 ┆ 0.712922 ┆ 0.772809 ┆ 0.62069 │\n", 1134 | "│ 0.022286 ┆ 0.003783 ┆ 0.700073 ┆ 0.757386 ┆ 0.607759 │\n", 1135 | "│ 0.020075 ┆ 0.004238 ┆ 0.713656 ┆ 0.760204 ┆ 0.642241 │\n", 1136 | "│ 0.026213 ┆ 0.003821 ┆ 0.720631 ┆ 0.774109 ┆ 0.640086 │\n", 1137 | "└──────────┴────────────┴───────────────┴────────────────┴─────────────┘\n", 1138 | "2006-11-01 00:00:00 2007-03-01 00:00:00 2007-06-29 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59\n", 1139 | "shape: (5, 5)\n", 1140 | "┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐\n", 1141 | "│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │\n", 1142 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1143 | "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1144 | "╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡\n", 1145 | "│ 0.02656 ┆ 0.003851 ┆ 0.728537 ┆ 0.7802 ┆ 0.614449 │\n", 1146 | "│ 0.031687 ┆ 0.003921 ┆ 0.71975 ┆ 0.768116 ┆ 0.606581 │\n", 1147 | "│ 0.034845 ┆ 0.003556 ┆ 0.720097 ┆ 0.772769 ┆ 0.600858 │\n", 1148 | "│ 0.027189 ┆ 0.004105 ┆ 0.727399 ┆ 0.766031 ┆ 0.632332 │\n", 1149 | "│ 0.033105 ┆ 0.003992 ┆ 0.718011 ┆ 0.762288 ┆ 0.610157 │\n", 1150 | "└──────────┴────────────┴───────────────┴────────────────┴─────────────┘\n", 1151 | "2006-12-02 00:00:00 2007-04-01 00:00:00 2007-07-30 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59\n", 1152 | "shape: (5, 5)\n", 1153 | "┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐\n", 1154 | "│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │\n", 1155 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1156 | "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1157 | "╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡\n", 1158 | "│ 0.04597 ┆ 0.005289 ┆ 0.740741 ┆ 0.781631 ┆ 0.614591 │\n", 1159 | "│ 0.032287 ┆ 0.009541 ┆ 0.731046 ┆ 0.776062 ┆ 0.59292 │\n", 1160 | "│ 0.027807 ┆ 0.003864 ┆ 0.73036 ┆ 0.77113 ┆ 0.598379 │\n", 1161 | "│ 0.051411 ┆ 0.004183 ┆ 0.736535 ┆ 0.769936 ┆ 0.619013 │\n", 1162 | "│ 0.029864 ┆ 0.00384 ┆ 0.719039 ┆ 0.750933 ┆ 0.59322 │\n", 1163 | "└──────────┴────────────┴───────────────┴────────────────┴─────────────┘\n", 1164 | "2007-01-01 00:00:00 2007-05-01 00:00:00 2007-08-29 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59\n", 1165 | "shape: (5, 5)\n", 1166 | "┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐\n", 1167 | "│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │\n", 1168 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1169 | "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1170 | "╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡\n", 1171 | "│ 0.080514 ┆ 0.005606 ┆ 0.737787 ┆ 0.752617 ┆ 0.469436 │\n", 1172 | "│ 0.036413 ┆ 0.005756 ┆ 0.735969 ┆ 0.743948 ┆ 0.473903 │\n", 1173 | "│ 0.035378 ┆ 0.004893 ┆ 0.737332 ┆ 0.747664 ┆ 0.474496 │\n", 1174 | "│ 0.036143 ┆ 0.005316 ┆ 0.731425 ┆ 0.731193 ┆ 0.472716 │\n", 1175 | "│ 0.038411 ┆ 0.004761 ┆ 0.74 ┆ 0.743913 ┆ 0.489614 │\n", 1176 | "└──────────┴────────────┴───────────────┴────────────────┴─────────────┘\n", 1177 | "2007-02-01 00:00:00 2007-06-01 00:00:00 2007-09-29 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59\n", 1178 | "shape: (5, 5)\n", 1179 | "┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐\n", 1180 | "│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │\n", 1181 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1182 | "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1183 | "╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡\n", 1184 | "│ 0.043345 ┆ 0.0059 ┆ 0.748093 ┆ 0.73487 ┆ 0.4559 │\n", 1185 | "│ 0.090296 ┆ 0.005925 ┆ 0.749788 ┆ 0.735992 ┆ 0.461859 │\n", 1186 | "│ 0.050923 ┆ 0.005866 ┆ 0.750159 ┆ 0.740347 ┆ 0.457364 │\n", 1187 | "│ 0.059539 ┆ 0.005827 ┆ 0.753761 ┆ 0.729297 ┆ 0.488372 │\n", 1188 | "│ 0.094732 ┆ 0.008577 ┆ 0.751642 ┆ 0.737983 ┆ 0.466905 │\n", 1189 | "└──────────┴────────────┴───────────────┴────────────────┴─────────────┘\n", 1190 | "2007-03-03 00:00:00 2007-07-01 00:00:00 2007-10-29 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59\n", 1191 | "shape: (5, 5)\n", 1192 | "┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐\n", 1193 | "│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │\n", 1194 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1195 | "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1196 | "╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡\n", 1197 | "│ 0.036745 ┆ 0.006585 ┆ 0.75852 ┆ 0.720574 ┆ 0.442681 │\n", 1198 | "│ 0.05794 ┆ 0.006206 ┆ 0.756767 ┆ 0.714015 ┆ 0.443269 │\n", 1199 | "│ 0.045632 ┆ 0.007245 ┆ 0.761636 ┆ 0.726066 ┆ 0.450323 │\n", 1200 | "│ 0.059822 ┆ 0.005823 ┆ 0.765336 ┆ 0.727523 ┆ 0.466196 │\n", 1201 | "│ 0.045603 ┆ 0.005517 ┆ 0.758862 ┆ 0.723671 ┆ 0.440329 │\n", 1202 | "└──────────┴────────────┴───────────────┴────────────────┴─────────────┘\n", 1203 | "2007-04-03 00:00:00 2007-08-01 00:00:00 2007-11-29 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59\n", 1204 | "shape: (5, 5)\n", 1205 | "┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐\n", 1206 | "│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │\n", 1207 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1208 | "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1209 | "╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡\n", 1210 | "│ 0.051568 ┆ 0.006031 ┆ 0.770666 ┆ 0.724568 ┆ 0.444903 │\n", 1211 | "│ 0.051029 ┆ 0.005851 ┆ 0.780556 ┆ 0.741876 ┆ 0.470831 │\n", 1212 | "│ 0.036348 ┆ 0.006044 ┆ 0.76913 ┆ 0.72158 ┆ 0.441367 │\n", 1213 | "│ 0.06095 ┆ 0.006783 ┆ 0.763345 ┆ 0.699907 ┆ 0.442546 │\n", 1214 | "│ 0.04729 ┆ 0.005771 ┆ 0.774729 ┆ 0.728545 ┆ 0.460224 │\n", 1215 | "└──────────┴────────────┴───────────────┴────────────────┴─────────────┘\n", 1216 | "2007-05-04 00:00:00 2007-09-01 00:00:00 2007-12-30 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59\n", 1217 | "shape: (5, 5)\n", 1218 | "┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐\n", 1219 | "│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │\n", 1220 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1221 | "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1222 | "╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡\n", 1223 | "│ 0.045014 ┆ 0.007876 ┆ 0.763756 ┆ 0.766059 ┆ 0.479304 │\n", 1224 | "│ 0.046918 ┆ 0.008443 ┆ 0.76843 ┆ 0.763952 ┆ 0.5009 │\n", 1225 | "│ 0.041252 ┆ 0.005527 ┆ 0.759295 ┆ 0.751887 ┆ 0.478104 │\n", 1226 | "│ 0.044612 ┆ 0.007882 ┆ 0.762856 ┆ 0.748871 ┆ 0.497301 │\n", 1227 | "│ 0.049095 ┆ 0.005729 ┆ 0.756481 ┆ 0.742777 ┆ 0.478104 │\n", 1228 | "└──────────┴────────────┴───────────────┴────────────────┴─────────────┘\n", 1229 | "2007-06-03 00:00:00 2007-10-01 00:00:00 2008-01-29 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59\n", 1230 | "shape: (5, 5)\n", 1231 | "┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐\n", 1232 | "│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │\n", 1233 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1234 | "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1235 | "╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡\n", 1236 | "│ 0.046044 ┆ 0.005438 ┆ 0.761076 ┆ 0.771845 ┆ 0.483577 │\n", 1237 | "│ 0.053092 ┆ 0.005279 ┆ 0.751543 ┆ 0.742481 ┆ 0.480828 │\n", 1238 | "│ 0.092787 ┆ 0.011915 ┆ 0.763448 ┆ 0.760989 ┆ 0.505782 │\n", 1239 | "│ 0.071995 ┆ 0.005879 ┆ 0.759259 ┆ 0.749773 ┆ 0.503348 │\n", 1240 | "│ 0.045665 ┆ 0.005584 ┆ 0.752646 ┆ 0.743231 ┆ 0.48448 │\n", 1241 | "└──────────┴────────────┴───────────────┴────────────────┴─────────────┘\n", 1242 | "2007-07-04 00:00:00 2007-11-01 00:00:00 2008-02-29 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59\n", 1243 | "shape: (5, 5)\n", 1244 | "┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐\n", 1245 | "│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │\n", 1246 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1247 | "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1248 | "╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡\n", 1249 | "│ 0.040068 ┆ 0.004758 ┆ 0.751118 ┆ 0.782359 ┆ 0.495747 │\n", 1250 | "│ 0.044423 ┆ 0.005457 ┆ 0.744996 ┆ 0.761374 ┆ 0.498177 │\n", 1251 | "│ 0.047837 ┆ 0.005099 ┆ 0.751118 ┆ 0.761566 ┆ 0.520365 │\n", 1252 | "│ 0.045359 ┆ 0.005647 ┆ 0.741229 ┆ 0.742021 ┆ 0.508815 │\n", 1253 | "│ 0.03557 ┆ 0.004908 ┆ 0.747587 ┆ 0.761167 ┆ 0.507599 │\n", 1254 | "└──────────┴────────────┴───────────────┴────────────────┴─────────────┘\n", 1255 | "2007-08-03 00:00:00 2007-12-01 00:00:00 2008-03-30 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59\n", 1256 | "shape: (5, 5)\n", 1257 | "┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐\n", 1258 | "│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │\n", 1259 | "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", 1260 | "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", 1261 | "╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡\n", 1262 | "│ 0.039326 ┆ 0.004498 ┆ 0.727571 ┆ 0.760512 ┆ 0.503937 │\n", 1263 | "│ 0.029854 ┆ 0.004492 ┆ 0.731603 ┆ 0.757923 ┆ 0.521502 │\n", 1264 | "│ 0.030006 ┆ 0.004545 ┆ 0.742692 ┆ 0.765599 ┆ 0.54997 │\n", 1265 | "│ 0.038146 ┆ 0.004963 ┆ 0.737837 ┆ 0.764298 ┆ 0.534545 │\n", 1266 | "│ 0.04888 ┆ 0.004644 ┆ 0.733552 ┆ 0.766607 ┆ 0.517262 │\n", 1267 | "└──────────┴────────────┴───────────────┴────────────────┴─────────────┘\n" 1268 | ] 1269 | } 1270 | ], 1271 | "source": [ 1272 | "def feature_pipeline(dataf):\n", 1273 | " return (dataf\n", 1274 | " .group_by(\"player_id\")\n", 1275 | " .agg(\n", 1276 | " pl.col(\"race\").first(), \n", 1277 | " pl.col(\"class\").first(), \n", 1278 | " pl.col(\"level\").max(), \n", 1279 | " pl.len().alias(\"n_row\")))\n", 1280 | "\n", 1281 | "gen = churn_dataset_generator(df, user_id=\"player_id\", info_period=120, checking_period=120, feature_pipeline=feature_pipeline)\n", 1282 | "\n", 1283 | "for X, y in gen:\n", 1284 | " scorers = {\n", 1285 | " \"accuracy\": make_scorer(accuracy_score), \n", 1286 | " \"precision\": make_scorer(precision_score), \n", 1287 | " \"recall\": make_scorer(recall_score)\n", 1288 | " }\n", 1289 | " print(pl.DataFrame(cross_validate(pipe, X, y, cv=5, scoring=scorers)))" 1290 | ] 1291 | }, 1292 | { 1293 | "cell_type": "code", 1294 | "execution_count": null, 1295 | "id": "2814d967-b0bc-4c42-8de4-530f52acb496", 1296 | "metadata": {}, 1297 | "outputs": [], 1298 | "source": [] 1299 | } 1300 | ], 1301 | "metadata": { 1302 | "kernelspec": { 1303 | "display_name": "Python 3 (ipykernel)", 1304 | "language": "python", 1305 | "name": "python3" 1306 | }, 1307 | "language_info": { 1308 | "codemirror_mode": { 1309 | "name": "ipython", 1310 | "version": 3 1311 | }, 1312 | "file_extension": ".py", 1313 | "mimetype": "text/x-python", 1314 | "name": "python", 1315 | "nbconvert_exporter": "python", 1316 | "pygments_lexer": "ipython3", 1317 | "version": "3.11.5" 1318 | } 1319 | }, 1320 | "nbformat": 4, 1321 | "nbformat_minor": 5 1322 | } 1323 | --------------------------------------------------------------------------------