├── README.md ├── config.yaml ├── LICENSE ├── .gitignore ├── dask-kubernetes-cluster-step.md └── notebooks ├── data-preprocessing.ipynb └── eda.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # [Kaggle-TalkingData AdTracking Fraud Detection Challenge](https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection#description) 2 | Can you detect fraudulent click traffic for mobile app ads? 3 | 4 | ## Objective 5 | 6 | For this competition, our objective is to predict whether a user will download an app after clicking a mobile 7 | app advertisement. 8 | 9 | ## [Data](https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/data) 10 | 11 | - `train.csv`: 7537.65MB of the training set 12 | - `test.csv`: 863.27MB of the test set 13 | - `sample_submission.csv`: 195.58MB 14 | - `train_sample.csv`: 4.08MB 15 | 16 | 17 | ## [Notebook](http://nbviewer.jupyter.org/github/andersy005/kaggle-talkingdata-adtracking-fraud-detection/blob/master/kaggle-dask-gce.ipynb) 18 | 19 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | # config.yaml 2 | 3 | worker: 4 | replicas: 8 5 | limits: 6 | cpu: 4 7 | memory: 26 GiB 8 | pipPackages: >- 9 | git+https://github.com/dask/gcsfs.git 10 | git+https://github.com/ioam/holoviews.git 11 | git+https://github.com/ahuang11/holoext.git 12 | git+https://github.com/pydata/xarray.git 13 | git+https://github.com/scikit-learn-contrib/imbalanced-learn.git 14 | condaPackages: >- 15 | -c conda-forge 16 | fastparquet 17 | pyarrow 18 | seaborn 19 | bokeh 20 | scikit-learn 21 | 22 | 23 | 24 | # We want to keep the same packages on the worker and jupyter environments 25 | jupyter: 26 | pipPackages: >- 27 | git+https://github.com/dask/gcsfs.git 28 | git+https://github.com/ioam/holoviews.git 29 | git+https://github.com/ahuang11/holoext.git 30 | 
git+https://github.com/pydata/xarray.git 31 | git+https://github.com/scikit-learn-contrib/imbalanced-learn.git 32 | condaPackages: >- 33 | -c conda-forge 34 | fastparquet 35 | pyarrow 36 | seaborn 37 | bokeh 38 | scikit-learn 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Anderson Banihirwe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | data/ 103 | dask-worker-space/ 104 | -------------------------------------------------------------------------------- /dask-kubernetes-cluster-step.md: 
-------------------------------------------------------------------------------- 1 | ## Dask Cluster on Google Cloud 2 | 3 | ### Create a Kubernetes cluster on Google Cloud 4 | 5 | Create a Kubernetes cluster on Google Cloud, by typing in the following command: 6 | 7 | gcloud container clusters create kaggle --num-nodes=3 --machine-type=n1-standard-2 --zone=us-central1-b 8 | 9 | To test if your cluster is initialized, run: 10 | 11 | abanihirwe@yellowstone:~/devel/kaggle/kaggle-talkingdata-adtracking-fraud-detection$ kubectl get node 12 | NAME STATUS ROLES AGE VERSION 13 | gke-kaggle-default-pool-09161020-gtkz Ready <none> 1m v1.8.7-gke.1 14 | gke-kaggle-default-pool-09161020-n963 Ready <none> 1m v1.8.7-gke.1 15 | gke-kaggle-default-pool-09161020-s2tc Ready <none> 1m v1.8.7-gke.1 16 | abanihirwe@yellowstone:~/devel/kaggle/kaggle-talkingdata-adtracking-fraud-detection$ 17 | 18 | Give your account super-user permissions, allowing you to perform all the actions needed to set up JupyterHub. 19 | 20 | kubectl create clusterrolebinding cluster-admin-binding --clusterrole=cluster-admin --user=<your-google-account-email> 21 | 22 | 23 | ### Setting up Helm 24 | 25 | Installation 26 | 27 | curl https://raw.githubusercontent.com/kubernetes/helm/master/scripts/get | bash 28 | 29 | Initialization 30 | 31 | After installing helm on your machine, initialize helm on your Kubernetes cluster. At the terminal, enter: 32 | 33 | kubectl --namespace kube-system create sa tiller 34 | kubectl create clusterrolebinding tiller --clusterrole cluster-admin --serviceaccount=kube-system:tiller 35 | helm init --service-account tiller 36 | 37 | Helm Install Dask 38 | 39 | helm repo add dask https://dask.github.io/helm-chart 40 | helm repo update 41 | 42 | Now you can launch Dask on your Kubernetes cluster using the Dask Helm chart: 43 | 44 | helm install dask/dask 45 | 46 | This deploys a dask-scheduler, several dask-worker processes, and also a Jupyter server. 
47 | 48 | Verify Deployment 49 | 50 | abanihirwe@yellowstone:~/devel/kaggle/kaggle-talkingdata-adtracking-fraud-detection$ kubectl get pods 51 | NAME READY STATUS RESTARTS AGE 52 | veering-ocelot-jupyter-7945df487c-qnwz6 0/1 ContainerCreating 0 2m 53 | veering-ocelot-scheduler-6cbdb49dc6-zc8bt 1/1 Running 0 2m 54 | veering-ocelot-worker-d8cb65c5-gfcn5 1/1 Running 0 2m 55 | veering-ocelot-worker-d8cb65c5-r6hwv 1/1 Running 0 2m 56 | veering-ocelot-worker-d8cb65c5-rpd6q 0/1 ContainerCreating 0 2m 57 | abanihirwe@yellowstone:~/devel/kaggle/kaggle-talkingdata-adtracking-fraud-detection$ kubectl get pods 58 | NAME READY STATUS RESTARTS AGE 59 | veering-ocelot-jupyter-7945df487c-qnwz6 1/1 Running 0 2m 60 | veering-ocelot-scheduler-6cbdb49dc6-zc8bt 1/1 Running 0 2m 61 | veering-ocelot-worker-d8cb65c5-gfcn5 1/1 Running 0 2m 62 | veering-ocelot-worker-d8cb65c5-r6hwv 1/1 Running 0 2m 63 | veering-ocelot-worker-d8cb65c5-rpd6q 1/1 Running 0 2m 64 | abanihirwe@yellowstone:~/devel/kaggle/kaggle-talkingdata-adtracking-fraud-detection$ kubectl get services 65 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 66 | kubernetes ClusterIP 10.43.240.1 <none> 443/TCP 13m 67 | veering-ocelot-jupyter LoadBalancer 10.43.254.47 35.225.254.167 80:31746/TCP 2m 68 | veering-ocelot-scheduler LoadBalancer 10.43.241.39 35.194.26.249 8786:31932/TCP,80:32597/TCP 2m 69 | 70 | Connect to Dask and Jupyter 71 | 72 | abanihirwe@yellowstone:~/devel/kaggle/kaggle-talkingdata-adtracking-fraud-detection$ kubectl get services 73 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 74 | kubernetes ClusterIP 10.43.240.1 <none> 443/TCP 19m 75 | veering-ocelot-jupyter LoadBalancer 10.43.254.47 35.225.254.167 80:31746/TCP 8m 76 | veering-ocelot-scheduler LoadBalancer 10.43.241.39 35.194.26.249 8786:31932/TCP,80:32597/TCP 8m 77 | 78 | We can navigate to the EXTERNAL-IP addresses of the two LoadBalancer services from any web browser. One (the scheduler service) is the Dask diagnostic dashboard. The other is the Jupyter server. 
You can log into the Jupyter notebook server with the password, dask 79 | 80 | ### Configure Environment 81 | 82 | helm upgrade veering-ocelot dask/dask -f config.yaml 83 | -------------------------------------------------------------------------------- /notebooks/data-preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "application/vnd.jupyter.widget-view+json": { 11 | "model_id": "353dd68f36bd486c9454c045cbac7880", 12 | "version_major": 2, 13 | "version_minor": 0 14 | }, 15 | "text/html": [ 16 | "

Failed to display Jupyter Widget of type VBox.

\n", 17 | "

\n", 18 | " If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n", 19 | " that the widgets JavaScript is still loading. If this message persists, it\n", 20 | " likely means that the widgets JavaScript library is either not installed or\n", 21 | " not enabled. See the Jupyter\n", 22 | " Widgets Documentation for setup instructions.\n", 23 | "

\n", 24 | "

\n", 25 | " If you're reading this message in another frontend (for example, a static\n", 26 | " rendering on GitHub or NBViewer),\n", 27 | " it may mean that your frontend doesn't currently support widgets.\n", 28 | "

\n" 29 | ], 30 | "text/plain": [ 31 | "VBox(children=(HTML(value='

KubeCluster

'), HBox(children=(HTML(value='\\n
\\n \\n \\n \\n \\n \\n
Workers 0
Cores 0
Memory 0 B
\\n
\\n', layout=Layout(min_width='150px')), Accordion(children=(HBox(children=(IntText(value=0, description='Workers', layout=Layout(width='150px')), Button(description='Scale', layout=Layout(width='150px'), style=ButtonStyle()))), HBox(children=(IntText(value=0, description='Minimum', layout=Layout(width='150px')), IntText(value=0, description='Maximum', layout=Layout(width='150px')), Button(description='Adapt', layout=Layout(width='150px'), style=ButtonStyle())))), layout=Layout(min_width='500px'), selected_index=None, _titles={'0': 'Manual Scaling', '1': 'Adaptive Scaling'}))), HTML(value='

Dashboard: /user/andersy005/proxy/8787/status

\\n')))" 32 | ] 33 | }, 34 | "metadata": {}, 35 | "output_type": "display_data" 36 | } 37 | ], 38 | "source": [ 39 | "from dask_kubernetes import KubeCluster\n", 40 | "cluster = KubeCluster(n_workers=16)\n", 41 | "cluster" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "from dask.distributed import Client\n", 51 | "client = Client(cluster)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/html": [ 62 | "\n", 63 | "\n", 64 | "\n", 71 | "\n", 79 | "\n", 80 | "
\n", 65 | "

Client

\n", 66 | "\n", 70 | "
\n", 72 | "

Cluster

\n", 73 | "
    \n", 74 | "
  • Workers: 14
  • \n", 75 | "
  • Cores: 28
  • \n", 76 | "
  • Memory: 84.00 GB
  • \n", 77 | "
\n", 78 | "
" 81 | ], 82 | "text/plain": [ 83 | "" 84 | ] 85 | }, 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "client" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "import dask.dataframe as dd" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 6, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "dtypes = {\n", 111 | " 'ip':'category',\n", 112 | " 'app': 'category',\n", 113 | " 'device': 'category',\n", 114 | " 'os': 'category',\n", 115 | " 'channel': 'category',\n", 116 | " 'is_attributed': 'uint8'\n", 117 | " }" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 7, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "df = dd.read_csv('gcs://kaggle-talkingdata-bucket/train.csv', \n", 127 | " parse_dates=['click_time', 'attributed_time'], \n", 128 | " dtype=dtypes)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 8, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/html": [ 139 | "
\n", 140 | "\n", 153 | "\n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | "
ipappdeviceoschannelclick_timeattributed_timeis_attributed
08323031133792017-11-06 14:32:21NaT0
11735731193792017-11-06 14:33:34NaT0
23581031133792017-11-06 14:34:12NaT0
345745141134782017-11-06 14:34:52NaT0
416100731133792017-11-06 14:35:08NaT0
\n", 225 | "
" 226 | ], 227 | "text/plain": [ 228 | " ip app device os channel click_time attributed_time \\\n", 229 | "0 83230 3 1 13 379 2017-11-06 14:32:21 NaT \n", 230 | "1 17357 3 1 19 379 2017-11-06 14:33:34 NaT \n", 231 | "2 35810 3 1 13 379 2017-11-06 14:34:12 NaT \n", 232 | "3 45745 14 1 13 478 2017-11-06 14:34:52 NaT \n", 233 | "4 161007 3 1 13 379 2017-11-06 14:35:08 NaT \n", 234 | "\n", 235 | " is_attributed \n", 236 | "0 0 \n", 237 | "1 0 \n", 238 | "2 0 \n", 239 | "3 0 \n", 240 | "4 0 " 241 | ] 242 | }, 243 | "execution_count": 8, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "df.head()" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 9, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "from collections import Counter" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 10, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "Counter({0: 184447044, 1: 456846})" 270 | ] 271 | }, 272 | "execution_count": 10, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | }, 276 | { 277 | "name": "stderr", 278 | "output_type": "stream", 279 | "text": [ 280 | "distributed.core - WARNING - Event loop was unresponsive in Scheduler for 11.83s. This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.\n" 281 | ] 282 | } 283 | ], 284 | "source": [ 285 | "Counter(df.is_attributed.compute())" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 11, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/html": [ 296 | "
\n", 297 | "\n", 310 | "\n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | "
ipappdeviceoschannelclick_timeattributed_timeis_attributed
08323031133792017-11-06 14:32:21NaT0
11735731193792017-11-06 14:33:34NaT0
23581031133792017-11-06 14:34:12NaT0
345745141134782017-11-06 14:34:52NaT0
416100731133792017-11-06 14:35:08NaT0
\n", 382 | "
" 383 | ], 384 | "text/plain": [ 385 | " ip app device os channel click_time attributed_time \\\n", 386 | "0 83230 3 1 13 379 2017-11-06 14:32:21 NaT \n", 387 | "1 17357 3 1 19 379 2017-11-06 14:33:34 NaT \n", 388 | "2 35810 3 1 13 379 2017-11-06 14:34:12 NaT \n", 389 | "3 45745 14 1 13 478 2017-11-06 14:34:52 NaT \n", 390 | "4 161007 3 1 13 379 2017-11-06 14:35:08 NaT \n", 391 | "\n", 392 | " is_attributed \n", 393 | "0 0 \n", 394 | "1 0 \n", 395 | "2 0 \n", 396 | "3 0 \n", 397 | "4 0 " 398 | ] 399 | }, 400 | "execution_count": 11, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "df1 = df[df.is_attributed == 0]\n", 407 | "df1.head()" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 12, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/plain": [ 418 | "184447044" 419 | ] 420 | }, 421 | "execution_count": 12, 422 | "metadata": {}, 423 | "output_type": "execute_result" 424 | } 425 | ], 426 | "source": [ 427 | "len(df1)" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 13, 433 | "metadata": {}, 434 | "outputs": [ 435 | { 436 | "data": { 437 | "text/html": [ 438 | "
\n", 439 | "\n", 452 | "\n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | "
ipappdeviceoschannelclick_timeattributed_timeis_attributed
10320415835113212017-11-06 15:41:072017-11-07 08:17:191
15042969291222152017-11-06 16:00:022017-11-07 10:05:221
17986451635113212017-11-06 16:00:022017-11-06 23:40:501
2102172429351462742017-11-06 16:00:032017-11-07 00:55:291
3056199085351132742017-11-06 16:00:042017-11-06 23:04:541
\n", 524 | "
" 525 | ], 526 | "text/plain": [ 527 | " ip app device os channel click_time attributed_time \\\n", 528 | "103 204158 35 1 13 21 2017-11-06 15:41:07 2017-11-07 08:17:19 \n", 529 | "1504 29692 9 1 22 215 2017-11-06 16:00:02 2017-11-07 10:05:22 \n", 530 | "1798 64516 35 1 13 21 2017-11-06 16:00:02 2017-11-06 23:40:50 \n", 531 | "2102 172429 35 1 46 274 2017-11-06 16:00:03 2017-11-07 00:55:29 \n", 532 | "3056 199085 35 1 13 274 2017-11-06 16:00:04 2017-11-06 23:04:54 \n", 533 | "\n", 534 | " is_attributed \n", 535 | "103 1 \n", 536 | "1504 1 \n", 537 | "1798 1 \n", 538 | "2102 1 \n", 539 | "3056 1 " 540 | ] 541 | }, 542 | "execution_count": 13, 543 | "metadata": {}, 544 | "output_type": "execute_result" 545 | } 546 | ], 547 | "source": [ 548 | "df2 = df[df.is_attributed == 1]\n", 549 | "df2.head()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 14, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "data": { 559 | "text/plain": [ 560 | "456846" 561 | ] 562 | }, 563 | "execution_count": 14, 564 | "metadata": {}, 565 | "output_type": "execute_result" 566 | } 567 | ], 568 | "source": [ 569 | "len(df2)" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 15, 575 | "metadata": {}, 576 | "outputs": [ 577 | { 578 | "data": { 579 | "text/plain": [ 580 | "553338" 581 | ] 582 | }, 583 | "execution_count": 15, 584 | "metadata": {}, 585 | "output_type": "execute_result" 586 | } 587 | ], 588 | "source": [ 589 | "df3 = df1.sample(0.003, random_state=42)\n", 590 | "len(df3)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 16, 596 | "metadata": {}, 597 | "outputs": [ 598 | { 599 | "data": { 600 | "text/html": [ 601 | "
\n", 602 | "\n", 615 | "\n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | "
ipappdeviceoschannelclick_timeattributed_timeis_attributed
102609785171181221212017-11-06 16:22:27NaT0
8019517595151322452017-11-06 16:17:08NaT0
11474341568027191532017-11-06 16:25:33NaT0
3077371990231132802017-11-06 16:05:54NaT0
7603712514161194592017-11-06 16:01:21NaT0
\n", 687 | "
" 688 | ], 689 | "text/plain": [ 690 | " ip app device os channel click_time attributed_time \\\n", 691 | "1026097 85171 18 1 22 121 2017-11-06 16:22:27 NaT \n", 692 | "801951 7595 15 1 32 245 2017-11-06 16:17:08 NaT \n", 693 | "1147434 15680 27 1 9 153 2017-11-06 16:25:33 NaT \n", 694 | "307737 19902 3 1 13 280 2017-11-06 16:05:54 NaT \n", 695 | "76037 125141 6 1 19 459 2017-11-06 16:01:21 NaT \n", 696 | "\n", 697 | " is_attributed \n", 698 | "1026097 0 \n", 699 | "801951 0 \n", 700 | "1147434 0 \n", 701 | "307737 0 \n", 702 | "76037 0 " 703 | ] 704 | }, 705 | "execution_count": 16, 706 | "metadata": {}, 707 | "output_type": "execute_result" 708 | } 709 | ], 710 | "source": [ 711 | "df3.head()" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": 17, 717 | "metadata": {}, 718 | "outputs": [ 719 | { 720 | "data": { 721 | "text/html": [ 722 | "
\n", 723 | "\n", 736 | "\n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | "
ipappdeviceoschannelclick_timeattributed_timeis_attributed
10320415835113212017-11-06 15:41:072017-11-07 08:17:191
15042969291222152017-11-06 16:00:022017-11-07 10:05:221
17986451635113212017-11-06 16:00:022017-11-06 23:40:501
2102172429351462742017-11-06 16:00:032017-11-07 00:55:291
3056199085351132742017-11-06 16:00:042017-11-06 23:04:541
\n", 808 | "
" 809 | ], 810 | "text/plain": [ 811 | " ip app device os channel click_time attributed_time \\\n", 812 | "103 204158 35 1 13 21 2017-11-06 15:41:07 2017-11-07 08:17:19 \n", 813 | "1504 29692 9 1 22 215 2017-11-06 16:00:02 2017-11-07 10:05:22 \n", 814 | "1798 64516 35 1 13 21 2017-11-06 16:00:02 2017-11-06 23:40:50 \n", 815 | "2102 172429 35 1 46 274 2017-11-06 16:00:03 2017-11-07 00:55:29 \n", 816 | "3056 199085 35 1 13 274 2017-11-06 16:00:04 2017-11-06 23:04:54 \n", 817 | "\n", 818 | " is_attributed \n", 819 | "103 1 \n", 820 | "1504 1 \n", 821 | "1798 1 \n", 822 | "2102 1 \n", 823 | "3056 1 " 824 | ] 825 | }, 826 | "execution_count": 17, 827 | "metadata": {}, 828 | "output_type": "execute_result" 829 | } 830 | ], 831 | "source": [ 832 | "data = dd.concat([df2, df3], axis=0)\n", 833 | "data.head()" 834 | ] 835 | }, 836 | { 837 | "cell_type": "code", 838 | "execution_count": 18, 839 | "metadata": {}, 840 | "outputs": [ 841 | { 842 | "data": { 843 | "text/plain": [ 844 | "1010184" 845 | ] 846 | }, 847 | "execution_count": 18, 848 | "metadata": {}, 849 | "output_type": "execute_result" 850 | } 851 | ], 852 | "source": [ 853 | "len(data)" 854 | ] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": 19, 859 | "metadata": {}, 860 | "outputs": [], 861 | "source": [ 862 | "data = client.persist(data.repartition(npartitions=1))" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": 20, 868 | "metadata": {}, 869 | "outputs": [], 870 | "source": [ 871 | "df = data.compute()" 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "execution_count": 21, 877 | "metadata": {}, 878 | "outputs": [], 879 | "source": [ 880 | "df.to_parquet('data/talking_data.parquet')" 881 | ] 882 | }, 883 | { 884 | "cell_type": "code", 885 | "execution_count": 22, 886 | "metadata": {}, 887 | "outputs": [], 888 | "source": [ 889 | "client.close()" 890 | ] 891 | }, 892 | { 893 | "cell_type": "code", 894 | "execution_count": 23, 895 | "metadata": {}, 896 | 
"outputs": [ 897 | { 898 | "data": { 899 | "text/html": [ 900 | "

Client

\n", 901 | "
    \n", 902 | "
  • Scheduler: not connected\n", 903 | "
\n" 904 | ], 905 | "text/plain": [ 906 | "" 907 | ] 908 | }, 909 | "execution_count": 23, 910 | "metadata": {}, 911 | "output_type": "execute_result" 912 | } 913 | ], 914 | "source": [ 915 | "client" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": null, 921 | "metadata": {}, 922 | "outputs": [], 923 | "source": [] 924 | } 925 | ], 926 | "metadata": { 927 | "kernelspec": { 928 | "display_name": "Python [default]", 929 | "language": "python", 930 | "name": "python3" 931 | }, 932 | "language_info": { 933 | "codemirror_mode": { 934 | "name": "ipython", 935 | "version": 3 936 | }, 937 | "file_extension": ".py", 938 | "mimetype": "text/x-python", 939 | "name": "python", 940 | "nbconvert_exporter": "python", 941 | "pygments_lexer": "ipython3", 942 | "version": "3.6.4" 943 | } 944 | }, 945 | "nbformat": 4, 946 | "nbformat_minor": 2 947 | } 948 | -------------------------------------------------------------------------------- /notebooks/eda.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# [Kaggle - TalkingData AdTracking Fraud Detection Challenge](https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection)\n", 8 | "\n", 9 | "Our task is to predict whether a click on some advertising is fraudulent given a few basic attributes about the device that made the click. What sets this competition apart is the sheer scale of the dataset: **with 240 million rows**.\n", 10 | "\n", 11 | "Looking at the evaluation page, we can see that the evaluation metric used is **ROC-AUC** (the area under the curve on a Receiver Operating Characteristic graph). In other words:\n", 12 | "\n", 13 | "- This competition is a **binary classification** problem - i.e. our target variable is a binary attribute (Is the user making the click fraudulent or not?) 
and our goal is to classify users into \"fraudulent\" or \"not fraudulent\" as well as possible\n", 14 | "\n", 15 | "- Unlike metrics such as [LogLoss](http://www.exegetic.biz/blog/2015/12/making-sense-logarithmic-loss/), the AUC score only depends on **how well you can separate the two classes**. In practice, this means that only the order of your predictions matters.\n", 16 | "\n", 17 | " - As a result of this, any rescaling done to your model's output probabilities will have no effect on your score. In some other competitions, adding a constant or multiplier to your predictions to rescale it to the distribution can help but that doesn't apply here.\n", 18 | " \n", 19 | "If you want a more intuitive explanation of how AUC works, I recommend [this post](https://stats.stackexchange.com/questions/132777/what-does-auc-stand-for-and-what-is-it).\n", 20 | " \n", 21 | "Let's dive right in by looking at the data we're given:\n", 22 | "\n", 23 | "Due to the sheer scale of the dataset, it is most likely that this dataset won't fit in the memory of most laptops. 
One solution to this is to use **Dask** " 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Ignore warnings\n", 33 | "import warnings\n", 34 | "warnings.filterwarnings(\"ignore\")\n", 35 | "import pathlib\n", 36 | "import dask.dataframe as dd\n", 37 | "from distributed import Client, progress\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "DATA = pathlib.Path('data')" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "sample_submission.csv test.csv train.csv\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "ls {DATA}" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "train_filepath = DATA / 'train.csv'" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "client = Client(processes=False)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/html": [ 92 | "\n", 93 | "\n", 94 | "\n", 101 | "\n", 109 | "\n", 110 | "
\n", 95 | "

Client

\n", 96 | "\n", 100 | "
\n", 102 | "

Cluster

\n", 103 | "
    \n", 104 | "
  • Workers: 1
  • \n", 105 | "
  • Cores: 4
  • \n", 106 | "
  • Memory: 12.50 GB
  • \n", 107 | "
\n", 108 | "
" 111 | ], 112 | "text/plain": [ 113 | "" 114 | ] 115 | }, 116 | "execution_count": 6, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "client" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 7, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "dtypes = {\n", 132 | " 'ip':'uint32',\n", 133 | " 'app': 'uint16',\n", 134 | " 'device': 'uint16',\n", 135 | " 'os': 'uint16',\n", 136 | " 'channel': 'uint16',\n", 137 | " 'is_attributed': 'uint8'\n", 138 | " }\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "train_df = dd.read_csv(train_filepath, blocksize=100e6, \n", 148 | " parse_dates=['click_time', 'attributed_time'], dtype=dtypes,\n", 149 | " storage_options={'anon': True})" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 9, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/html": [ 160 | "
\n", 161 | "\n", 174 | "\n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | "
ipappdeviceoschannelclick_timeattributed_timeis_attributed
08323031133792017-11-06 14:32:21NaT0
11735731193792017-11-06 14:33:34NaT0
23581031133792017-11-06 14:34:12NaT0
345745141134782017-11-06 14:34:52NaT0
416100731133792017-11-06 14:35:08NaT0
\n", 246 | "
" 247 | ], 248 | "text/plain": [ 249 | " ip app device os channel click_time attributed_time \\\n", 250 | "0 83230 3 1 13 379 2017-11-06 14:32:21 NaT \n", 251 | "1 17357 3 1 19 379 2017-11-06 14:33:34 NaT \n", 252 | "2 35810 3 1 13 379 2017-11-06 14:34:12 NaT \n", 253 | "3 45745 14 1 13 478 2017-11-06 14:34:52 NaT \n", 254 | "4 161007 3 1 13 379 2017-11-06 14:35:08 NaT \n", 255 | "\n", 256 | " is_attributed \n", 257 | "0 0 \n", 258 | "1 0 \n", 259 | "2 0 \n", 260 | "3 0 \n", 261 | "4 0 " 262 | ] 263 | }, 264 | "execution_count": 9, 265 | "metadata": {}, 266 | "output_type": "execute_result" 267 | } 268 | ], 269 | "source": [ 270 | "train_df.head()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 10, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/html": [ 281 | "
\n", 282 | "\n", 295 | "\n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | "
ipappdeviceoschannelclick_timeattributed_timeis_attributed
925194121312121103402017-11-09 16:00:00NaT0
9251954689431192112017-11-09 16:00:00NaT0
92519632012611132742017-11-09 16:00:00NaT0
925197189286121372592017-11-09 16:00:00NaT0
925198106485111191372017-11-09 16:00:00NaT0
\n", 367 | "
" 368 | ], 369 | "text/plain": [ 370 | " ip app device os channel click_time attributed_time \\\n", 371 | "925194 121312 12 1 10 340 2017-11-09 16:00:00 NaT \n", 372 | "925195 46894 3 1 19 211 2017-11-09 16:00:00 NaT \n", 373 | "925196 320126 1 1 13 274 2017-11-09 16:00:00 NaT \n", 374 | "925197 189286 12 1 37 259 2017-11-09 16:00:00 NaT \n", 375 | "925198 106485 11 1 19 137 2017-11-09 16:00:00 NaT \n", 376 | "\n", 377 | " is_attributed \n", 378 | "925194 0 \n", 379 | "925195 0 \n", 380 | "925196 0 \n", 381 | "925197 0 \n", 382 | "925198 0 " 383 | ] 384 | }, 385 | "execution_count": 10, 386 | "metadata": {}, 387 | "output_type": "execute_result" 388 | } 389 | ], 390 | "source": [ 391 | "train_df.tail()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "## Looking at the features\n", 399 | "\n", 400 | "Each row of the training data contains a click record, with the following features:\n", 401 | "\n", 402 | "- ip: ip address of click\n", 403 | "- app: app id for marketing\n", 404 | "- device: device type id of user mobile phone (e.g., iphone 6 plus, iphone 7, huawei mate 7, etc.)\n", 405 | "- os: os version id of user mobile phone\n", 406 | "- channel: channel id of mobile ad publisher\n", 407 | "- click_time: timestamp of click (UTC)\n", 408 | "- attributed_time: if the user downloads the app after clicking an ad, this is the time of the app download\n", 409 | "- is_attributed: the target that is to be predicted, indicating the app was downloaded\n", 410 | "\n", 411 | "**NOTE:**\n", 412 | "\n", 413 | "- By looking at the data samples above, you'll notice that all these variables are encoded - meaning we don't know what the actual value corresponds to - each value has instead been assigned an ID which we're given. 
This has likely been done because data such as IP addresses are sensitive, although it does unfortunately reduce the amount of feature engineering we can do on these.\n", 414 | " \n", 415 | "- The attributed_time variable is only available in the training set - it's not immediately useful for classification but it could be used for some interesting analysis (for example, one could fill in the variable in the test set by building a model to predict it).\n" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 11, 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "data": { 425 | "text/plain": [ 426 | "0.002470721410998979" 427 | ] 428 | }, 429 | "execution_count": 11, 430 | "metadata": {}, 431 | "output_type": "execute_result" 432 | } 433 | ], 434 | "source": [ 435 | "train_df.is_attributed.mean().compute()" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 12, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "name": "stdout", 445 | "output_type": "stream", 446 | "text": [ 447 | "CPU times: user 5min 55s, sys: 18.2 s, total: 6min 13s\n", 448 | "Wall time: 4min 19s\n" 449 | ] 450 | }, 451 | { 452 | "data": { 453 | "text/plain": [ 454 | "184903890" 455 | ] 456 | }, 457 | "execution_count": 12, 458 | "metadata": {}, 459 | "output_type": "execute_result" 460 | } 461 | ], 462 | "source": [ 463 | "%time len(train_df)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "We can see that the training set consists of **184,903,890 rows**." 
471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 16, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "name": "stdout", 480 | "output_type": "stream", 481 | "text": [ 482 | "CPU times: user 1h 1min 17s, sys: 2min 39s, total: 1h 3min 57s\n", 483 | "Wall time: 44min 37s\n" 484 | ] 485 | } 486 | ], 487 | "source": [ 488 | "%%time \n", 489 | "means = {}\n", 490 | "weights = {}\n", 491 | "cols = ['ip', 'app', 'device', 'os', 'channel']\n", 492 | "for col in cols:\n", 493 | " means[col] = train_df.groupby(col)['is_attributed'].mean().compute()#.to_dict()\n", 494 | " weights[col] = train_df[col].value_counts().compute()#.to_dict()" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 17, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/plain": [ 505 | "{'app': app\n", 506 | " 0 0.309421\n", 507 | " 1 0.000212\n", 508 | " 2 0.000262\n", 509 | " 3 0.000303\n", 510 | " 4 0.000040\n", 511 | " 5 0.072598\n", 512 | " 6 0.000083\n", 513 | " 7 0.000670\n", 514 | " 8 0.001842\n", 515 | " 9 0.001144\n", 516 | " 10 0.050549\n", 517 | " 11 0.001555\n", 518 | " 12 0.000109\n", 519 | " 13 0.000167\n", 520 | " 14 0.000250\n", 521 | " 15 0.000214\n", 522 | " 16 0.230298\n", 523 | " 17 0.000640\n", 524 | " 18 0.000503\n", 525 | " 19 0.143450\n", 526 | " 20 0.002176\n", 527 | " 21 0.000133\n", 528 | " 22 0.000245\n", 529 | " 23 0.000019\n", 530 | " 24 0.000491\n", 531 | " 25 0.000047\n", 532 | " 26 0.000467\n", 533 | " 27 0.001720\n", 534 | " 28 0.000082\n", 535 | " 29 0.061275\n", 536 | " ... 
\n", 537 | " 768 0.300000\n", 538 | " 753 0.250000\n", 539 | " 748 0.000000\n", 540 | " 742 0.000000\n", 541 | " 755 0.000000\n", 542 | " 745 0.000000\n", 543 | " 756 0.000000\n", 544 | " 763 1.000000\n", 545 | " 760 0.000000\n", 546 | " 761 0.000000\n", 547 | " 424 0.000000\n", 548 | " 766 0.000000\n", 549 | " 410 0.000000\n", 550 | " 757 0.000000\n", 551 | " 759 0.000000\n", 552 | " 438 0.000000\n", 553 | " 743 0.000000\n", 554 | " 765 0.000000\n", 555 | " 767 0.000000\n", 556 | " 532 0.000000\n", 557 | " 744 0.000000\n", 558 | " 746 0.000000\n", 559 | " 747 0.000000\n", 560 | " 749 0.000000\n", 561 | " 764 0.000000\n", 562 | " 201 0.000000\n", 563 | " 741 0.000000\n", 564 | " 750 0.000000\n", 565 | " 751 0.000000\n", 566 | " 437 0.000000\n", 567 | " Name: is_attributed, Length: 706, dtype: float64, 'channel': channel\n", 568 | " 0 0.077345\n", 569 | " 3 0.000413\n", 570 | " 13 0.000121\n", 571 | " 15 0.000173\n", 572 | " 17 0.000188\n", 573 | " 18 0.000525\n", 574 | " 19 0.000224\n", 575 | " 21 0.140053\n", 576 | " 22 0.001292\n", 577 | " 24 0.000170\n", 578 | " 30 0.000976\n", 579 | " 101 0.015451\n", 580 | " 105 0.000381\n", 581 | " 107 0.000504\n", 582 | " 108 0.017480\n", 583 | " 110 0.003594\n", 584 | " 111 0.000237\n", 585 | " 113 0.117789\n", 586 | " 115 0.000142\n", 587 | " 116 0.000197\n", 588 | " 118 0.000588\n", 589 | " 120 0.001813\n", 590 | " 121 0.000438\n", 591 | " 122 0.000495\n", 592 | " 123 0.001204\n", 593 | " 124 0.000199\n", 594 | " 125 0.000062\n", 595 | " 126 0.001376\n", 596 | " 128 0.000204\n", 597 | " 129 0.001214\n", 598 | " ... 
\n", 599 | " 165 0.000000\n", 600 | " 473 0.000000\n", 601 | " 29 0.000000\n", 602 | " 256 0.000000\n", 603 | " 169 0.076923\n", 604 | " 162 0.083333\n", 605 | " 217 0.000000\n", 606 | " 216 0.007752\n", 607 | " 146 0.000000\n", 608 | " 233 0.262295\n", 609 | " 455 0.000000\n", 610 | " 470 0.000000\n", 611 | " 149 0.000000\n", 612 | " 172 0.000000\n", 613 | " 227 0.000000\n", 614 | " 422 0.000000\n", 615 | " 475 0.000000\n", 616 | " 248 0.000000\n", 617 | " 394 0.000000\n", 618 | " 354 0.000000\n", 619 | " 127 0.001126\n", 620 | " 476 0.000000\n", 621 | " 434 0.000000\n", 622 | " 448 0.001167\n", 623 | " 221 0.000000\n", 624 | " 322 0.000060\n", 625 | " 490 0.000020\n", 626 | " 404 0.000000\n", 627 | " 474 0.000000\n", 628 | " 500 0.000000\n", 629 | " Name: is_attributed, Length: 202, dtype: float64, 'device': device\n", 630 | " 0 0.098525\n", 631 | " 1 0.001758\n", 632 | " 2 0.000274\n", 633 | " 4 0.186807\n", 634 | " 6 0.233670\n", 635 | " 7 0.197398\n", 636 | " 8 0.149325\n", 637 | " 9 0.221194\n", 638 | " 11 0.159669\n", 639 | " 14 0.196864\n", 640 | " 15 0.155689\n", 641 | " 16 0.217010\n", 642 | " 17 0.174107\n", 643 | " 18 0.218481\n", 644 | " 19 0.179949\n", 645 | " 20 0.125554\n", 646 | " 21 0.211786\n", 647 | " 22 0.187500\n", 648 | " 24 0.190647\n", 649 | " 25 0.169653\n", 650 | " 27 0.117021\n", 651 | " 29 0.156863\n", 652 | " 30 0.190998\n", 653 | " 31 0.000000\n", 654 | " 32 0.294118\n", 655 | " 33 0.206108\n", 656 | " 34 0.153846\n", 657 | " 35 0.203846\n", 658 | " 36 0.236926\n", 659 | " 37 0.191710\n", 660 | " ... 
\n", 661 | " 4204 0.000000\n", 662 | " 4213 0.500000\n", 663 | " 4221 0.000000\n", 664 | " 282 0.000000\n", 665 | " 1842 0.000000\n", 666 | " 2151 0.000000\n", 667 | " 2654 0.000000\n", 668 | " 2800 0.000000\n", 669 | " 3876 0.000000\n", 670 | " 3900 0.000000\n", 671 | " 3946 0.000000\n", 672 | " 3960 0.000000\n", 673 | " 3991 0.500000\n", 674 | " 4010 0.000000\n", 675 | " 4083 0.000000\n", 676 | " 4125 0.500000\n", 677 | " 4145 0.000000\n", 678 | " 4176 0.000000\n", 679 | " 4188 0.000000\n", 680 | " 4197 0.000000\n", 681 | " 4202 0.000000\n", 682 | " 4207 0.000000\n", 683 | " 1824 0.000000\n", 684 | " 1900 0.000000\n", 685 | " 3878 0.000000\n", 686 | " 3914 0.000000\n", 687 | " 4108 0.000000\n", 688 | " 4120 0.000000\n", 689 | " 4133 0.000000\n", 690 | " 4142 0.000000\n", 691 | " Name: is_attributed, Length: 3475, dtype: float64, 'ip': ip\n", 692 | " 9 0.001489\n", 693 | " 10 0.002542\n", 694 | " 20 0.000670\n", 695 | " 25 0.004484\n", 696 | " 27 0.001598\n", 697 | " 31 0.002887\n", 698 | " 32 0.004167\n", 699 | " 36 0.000890\n", 700 | " 39 0.003356\n", 701 | " 45 0.000533\n", 702 | " 52 0.001873\n", 703 | " 59 0.001965\n", 704 | " 60 0.002985\n", 705 | " 61 0.000932\n", 706 | " 63 0.001556\n", 707 | " 81 0.000000\n", 708 | " 83 0.002798\n", 709 | " 85 0.001018\n", 710 | " 88 0.000468\n", 711 | " 92 0.000673\n", 712 | " 95 0.000747\n", 713 | " 113 0.008929\n", 714 | " 117 0.000493\n", 715 | " 120 0.001989\n", 716 | " 122 0.000949\n", 717 | " 126 0.002320\n", 718 | " 127 0.002334\n", 719 | " 133 0.001736\n", 720 | " 147 0.001054\n", 721 | " 150 0.001592\n", 722 | " ... 
\n", 723 | " 359087 0.333333\n", 724 | " 359264 1.000000\n", 725 | " 359329 1.000000\n", 726 | " 359653 1.000000\n", 727 | " 360159 1.000000\n", 728 | " 360373 1.000000\n", 729 | " 360476 1.000000\n", 730 | " 360879 1.000000\n", 731 | " 361131 1.000000\n", 732 | " 361374 0.500000\n", 733 | " 361773 0.333333\n", 734 | " 362054 0.500000\n", 735 | " 362057 1.000000\n", 736 | " 362147 0.500000\n", 737 | " 362161 0.400000\n", 738 | " 362203 0.166667\n", 739 | " 362246 1.000000\n", 740 | " 362345 0.000000\n", 741 | " 362401 0.600000\n", 742 | " 363000 0.100000\n", 743 | " 363059 0.333333\n", 744 | " 363107 1.000000\n", 745 | " 363821 1.000000\n", 746 | " 363824 0.500000\n", 747 | " 363912 1.000000\n", 748 | " 364131 1.000000\n", 749 | " 364360 1.000000\n", 750 | " 364553 1.000000\n", 751 | " 364736 0.500000\n", 752 | " 364750 1.000000\n", 753 | " Name: is_attributed, Length: 277396, dtype: float64, 'os': os\n", 754 | " 0 0.104272\n", 755 | " 1 0.001035\n", 756 | " 2 0.000249\n", 757 | " 3 0.000845\n", 758 | " 4 0.009622\n", 759 | " 5 0.000761\n", 760 | " 6 0.001184\n", 761 | " 7 0.025889\n", 762 | " 8 0.001063\n", 763 | " 9 0.000730\n", 764 | " 10 0.001195\n", 765 | " 11 0.004547\n", 766 | " 12 0.000756\n", 767 | " 13 0.001549\n", 768 | " 14 0.001545\n", 769 | " 15 0.000995\n", 770 | " 16 0.001371\n", 771 | " 17 0.001180\n", 772 | " 18 0.001302\n", 773 | " 19 0.001803\n", 774 | " 20 0.001257\n", 775 | " 21 0.165016\n", 776 | " 22 0.001769\n", 777 | " 23 0.000681\n", 778 | " 24 0.126777\n", 779 | " 25 0.001219\n", 780 | " 26 0.001398\n", 781 | " 27 0.003512\n", 782 | " 28 0.001078\n", 783 | " 29 0.188840\n", 784 | " ... 
\n", 785 | " 902 0.000000\n", 786 | " 914 0.000000\n", 787 | " 924 0.000000\n", 788 | " 929 0.000000\n", 789 | " 384 0.000000\n", 790 | " 869 0.000000\n", 791 | " 916 0.000000\n", 792 | " 925 0.000000\n", 793 | " 937 0.000000\n", 794 | " 939 0.000000\n", 795 | " 954 0.000000\n", 796 | " 500 0.000000\n", 797 | " 501 0.000000\n", 798 | " 897 0.000000\n", 799 | " 932 0.000000\n", 800 | " 951 0.000000\n", 801 | " 568 0.000000\n", 802 | " 870 0.000000\n", 803 | " 881 0.000000\n", 804 | " 888 0.000000\n", 805 | " 905 0.000000\n", 806 | " 911 0.000000\n", 807 | " 921 0.000000\n", 808 | " 944 0.000000\n", 809 | " 883 0.000000\n", 810 | " 915 0.000000\n", 811 | " 936 0.000000\n", 812 | " 949 0.000000\n", 813 | " 893 0.000000\n", 814 | " 910 0.000000\n", 815 | " Name: is_attributed, Length: 800, dtype: float64}" 816 | ] 817 | }, 818 | "execution_count": 17, 819 | "metadata": {}, 820 | "output_type": "execute_result" 821 | } 822 | ], 823 | "source": [ 824 | "means" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": 18, 830 | "metadata": {}, 831 | "outputs": [ 832 | { 833 | "data": { 834 | "text/plain": [ 835 | "{'app': 3 33911780\n", 836 | " 12 24179003\n", 837 | " 2 21642136\n", 838 | " 9 16458268\n", 839 | " 15 15958970\n", 840 | " 18 15756587\n", 841 | " 14 10027169\n", 842 | " 1 5796274\n", 843 | " 13 4329409\n", 844 | " 8 3731948\n", 845 | " 21 3616407\n", 846 | " 11 3466971\n", 847 | " 26 3126136\n", 848 | " 23 2675259\n", 849 | " 6 2464136\n", 850 | " 64 1893969\n", 851 | " 7 1764954\n", 852 | " 20 1758934\n", 853 | " 25 1467907\n", 854 | " 28 1311496\n", 855 | " 27 1296189\n", 856 | " 24 1259100\n", 857 | " 19 922902\n", 858 | " 17 797335\n", 859 | " 22 684604\n", 860 | " 10 684043\n", 861 | " 29 652090\n", 862 | " 32 485426\n", 863 | " 5 375533\n", 864 | " 151 188490\n", 865 | " ... 
\n", 866 | " 678 1\n", 867 | " 679 1\n", 868 | " 558 1\n", 869 | " 681 1\n", 870 | " 404 1\n", 871 | " 653 1\n", 872 | " 684 1\n", 873 | " 410 1\n", 874 | " 687 1\n", 875 | " 608 1\n", 876 | " 689 1\n", 877 | " 691 1\n", 878 | " 673 1\n", 879 | " 671 1\n", 880 | " 571 1\n", 881 | " 669 1\n", 882 | " 668 1\n", 883 | " 667 1\n", 884 | " 572 1\n", 885 | " 665 1\n", 886 | " 664 1\n", 887 | " 578 1\n", 888 | " 661 1\n", 889 | " 580 1\n", 890 | " 659 1\n", 891 | " 582 1\n", 892 | " 657 1\n", 893 | " 656 1\n", 894 | " 655 1\n", 895 | " 626 1\n", 896 | " Name: app, Length: 706, dtype: int64, 'channel': 280 15065927\n", 897 | " 245 8873025\n", 898 | " 107 8457471\n", 899 | " 477 7188340\n", 900 | " 134 5924250\n", 901 | " 259 5787004\n", 902 | " 153 5537977\n", 903 | " 265 5446388\n", 904 | " 178 5271408\n", 905 | " 121 4692438\n", 906 | " 205 4359180\n", 907 | " 145 3583945\n", 908 | " 442 3574611\n", 909 | " 459 3477150\n", 910 | " 379 3402636\n", 911 | " 439 2901762\n", 912 | " 489 2711240\n", 913 | " 128 2703163\n", 914 | " 135 2663913\n", 915 | " 466 2655396\n", 916 | " 237 2632794\n", 917 | " 480 2618441\n", 918 | " 469 2588380\n", 919 | " 140 2562964\n", 920 | " 122 2526161\n", 921 | " 219 2349454\n", 922 | " 137 2302316\n", 923 | " 232 2275338\n", 924 | " 435 2192870\n", 925 | " 101 2147174\n", 926 | " ... 
\n", 927 | " 223 118\n", 928 | " 225 96\n", 929 | " 233 61\n", 930 | " 14 22\n", 931 | " 471 20\n", 932 | " 458 16\n", 933 | " 251 14\n", 934 | " 248 14\n", 935 | " 256 13\n", 936 | " 169 13\n", 937 | " 162 12\n", 938 | " 238 9\n", 939 | " 29 6\n", 940 | " 473 5\n", 941 | " 475 3\n", 942 | " 142 2\n", 943 | " 476 2\n", 944 | " 165 2\n", 945 | " 149 2\n", 946 | " 470 1\n", 947 | " 146 1\n", 948 | " 434 1\n", 949 | " 422 1\n", 950 | " 394 1\n", 951 | " 354 1\n", 952 | " 227 1\n", 953 | " 221 1\n", 954 | " 217 1\n", 955 | " 172 1\n", 956 | " 500 1\n", 957 | " Name: channel, Length: 202, dtype: int64, 'device': 1 174330052\n", 958 | " 2 8105054\n", 959 | " 0 1033413\n", 960 | " 3032 692891\n", 961 | " 3543 266596\n", 962 | " 3866 178274\n", 963 | " 59 24018\n", 964 | " 5 14379\n", 965 | " 6 12569\n", 966 | " 40 12351\n", 967 | " 16 10147\n", 968 | " 18 6742\n", 969 | " 21 5430\n", 970 | " 33 5337\n", 971 | " 30 4199\n", 972 | " 154 4011\n", 973 | " 67 3770\n", 974 | " 114 3585\n", 975 | " 37 3474\n", 976 | " 88 3262\n", 977 | " 7 3075\n", 978 | " 46 3001\n", 979 | " 109 2973\n", 980 | " 50 2770\n", 981 | " 78 2767\n", 982 | " 97 2537\n", 983 | " 60 2335\n", 984 | " 82 2287\n", 985 | " 203 2265\n", 986 | " 56 2247\n", 987 | " ... 
\n", 988 | " 3278 1\n", 989 | " 1040 1\n", 990 | " 3279 1\n", 991 | " 3281 1\n", 992 | " 3285 1\n", 993 | " 3286 1\n", 994 | " 1032 1\n", 995 | " 3289 1\n", 996 | " 3290 1\n", 997 | " 3291 1\n", 998 | " 3297 1\n", 999 | " 3301 1\n", 1000 | " 1019 1\n", 1001 | " 3302 1\n", 1002 | " 3303 1\n", 1003 | " 3305 1\n", 1004 | " 3309 1\n", 1005 | " 3310 1\n", 1006 | " 3311 1\n", 1007 | " 994 1\n", 1008 | " 993 1\n", 1009 | " 3313 1\n", 1010 | " 3315 1\n", 1011 | " 3316 1\n", 1012 | " 3317 1\n", 1013 | " 3318 1\n", 1014 | " 3319 1\n", 1015 | " 3321 1\n", 1016 | " 975 1\n", 1017 | " 2061 1\n", 1018 | " Name: device, Length: 3475, dtype: int64, 'ip': 5348 1238734\n", 1019 | " 5314 1171448\n", 1020 | " 73516 770451\n", 1021 | " 73487 763854\n", 1022 | " 53454 498186\n", 1023 | " 114276 427453\n", 1024 | " 26995 401495\n", 1025 | " 95766 378693\n", 1026 | " 17149 310996\n", 1027 | " 105475 302192\n", 1028 | " 100275 276799\n", 1029 | " 43793 261970\n", 1030 | " 105560 260049\n", 1031 | " 86767 257649\n", 1032 | " 111025 247187\n", 1033 | " 137052 217614\n", 1034 | " 201182 212448\n", 1035 | " 5178 211556\n", 1036 | " 49602 200053\n", 1037 | " 5147 197994\n", 1038 | " 48282 188092\n", 1039 | " 48212 185885\n", 1040 | " 48240 182754\n", 1041 | " 48170 180609\n", 1042 | " 123994 176285\n", 1043 | " 209663 173423\n", 1044 | " 93587 171417\n", 1045 | " 84896 170166\n", 1046 | " 45745 169149\n", 1047 | " 119369 159276\n", 1048 | " ... 
\n", 1049 | " 148131 1\n", 1050 | " 148134 1\n", 1051 | " 148215 1\n", 1052 | " 254103 1\n", 1053 | " 148213 1\n", 1054 | " 310971 1\n", 1055 | " 148202 1\n", 1056 | " 148200 1\n", 1057 | " 148195 1\n", 1058 | " 148194 1\n", 1059 | " 148193 1\n", 1060 | " 254088 1\n", 1061 | " 283959 1\n", 1062 | " 235754 1\n", 1063 | " 254094 1\n", 1064 | " 221440 1\n", 1065 | " 148173 1\n", 1066 | " 254095 1\n", 1067 | " 254098 1\n", 1068 | " 310984 1\n", 1069 | " 148165 1\n", 1070 | " 310986 1\n", 1071 | " 221445 1\n", 1072 | " 254100 1\n", 1073 | " 221449 1\n", 1074 | " 148150 1\n", 1075 | " 310993 1\n", 1076 | " 148143 1\n", 1077 | " 310996 1\n", 1078 | " 233017 1\n", 1079 | " Name: ip, Length: 277396, dtype: int64, 'os': 19 44181914\n", 1080 | " 13 39782808\n", 1081 | " 17 9639776\n", 1082 | " 18 8974159\n", 1083 | " 22 7414405\n", 1084 | " 8 5278317\n", 1085 | " 10 5262422\n", 1086 | " 6 4632561\n", 1087 | " 15 4408470\n", 1088 | " 9 4370878\n", 1089 | " 25 4156096\n", 1090 | " 20 4013563\n", 1091 | " 16 3081268\n", 1092 | " 37 2905944\n", 1093 | " 3 2904808\n", 1094 | " 14 2503681\n", 1095 | " 41 2467180\n", 1096 | " 1 2215593\n", 1097 | " 12 2006410\n", 1098 | " 23 1844222\n", 1099 | " 27 1787478\n", 1100 | " 35 1687198\n", 1101 | " 32 1658075\n", 1102 | " 47 1450205\n", 1103 | " 53 1356478\n", 1104 | " 11 1354955\n", 1105 | " 28 1317819\n", 1106 | " 30 1156169\n", 1107 | " 26 881482\n", 1108 | " 31 767158\n", 1109 | " ... 
\n", 1110 | " 698 1\n", 1111 | " 695 1\n", 1112 | " 694 1\n", 1113 | " 690 1\n", 1114 | " 273 1\n", 1115 | " 688 1\n", 1116 | " 295 1\n", 1117 | " 687 1\n", 1118 | " 686 1\n", 1119 | " 684 1\n", 1120 | " 683 1\n", 1121 | " 701 1\n", 1122 | " 702 1\n", 1123 | " 240 1\n", 1124 | " 712 1\n", 1125 | " 724 1\n", 1126 | " 720 1\n", 1127 | " 717 1\n", 1128 | " 249 1\n", 1129 | " 715 1\n", 1130 | " 713 1\n", 1131 | " 708 1\n", 1132 | " 270 1\n", 1133 | " 707 1\n", 1134 | " 706 1\n", 1135 | " 263 1\n", 1136 | " 264 1\n", 1137 | " 267 1\n", 1138 | " 704 1\n", 1139 | " 728 1\n", 1140 | " Name: os, Length: 800, dtype: int64}" 1141 | ] 1142 | }, 1143 | "execution_count": 18, 1144 | "metadata": {}, 1145 | "output_type": "execute_result" 1146 | } 1147 | ], 1148 | "source": [ 1149 | "weights" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "code", 1154 | "execution_count": 46, 1155 | "metadata": {}, 1156 | "outputs": [ 1157 | { 1158 | "data": { 1159 | "text/html": [ 1160 | "
\n", 1161 | "\n", 1174 | "\n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | "
appchanneldeviceipos
00.3094210.0773450.098525NaN0.104272
10.000212NaN0.0017580.1914890.001035
20.000262NaN0.000274NaN0.000249
30.0003030.000413NaNNaN0.000845
40.0000400.0858470.186807NaN0.009622
\n", 1228 | "
" 1229 | ], 1230 | "text/plain": [ 1231 | " app channel device ip os\n", 1232 | "0 0.309421 0.077345 0.098525 NaN 0.104272\n", 1233 | "1 0.000212 NaN 0.001758 0.191489 0.001035\n", 1234 | "2 0.000262 NaN 0.000274 NaN 0.000249\n", 1235 | "3 0.000303 0.000413 NaN NaN 0.000845\n", 1236 | "4 0.000040 0.085847 0.186807 NaN 0.009622" 1237 | ] 1238 | }, 1239 | "execution_count": 46, 1240 | "metadata": {}, 1241 | "output_type": "execute_result" 1242 | } 1243 | ], 1244 | "source": [ 1245 | "means_df = pd.DataFrame(means)\n", 1246 | "means_df.head()" 1247 | ] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": 47, 1252 | "metadata": {}, 1253 | "outputs": [ 1254 | { 1255 | "data": { 1256 | "text/html": [ 1257 | "
\n", 1258 | "\n", 1271 | "\n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | "
appchanneldeviceipos
count706.000000202.0000003475.000000277396.000000800.000000
mean0.0642040.0432710.1497520.2560610.007943
std0.1618920.1380980.2304850.3514940.057189
min0.0000000.0000000.0000000.0000000.000000
25%0.0000000.0001530.0000000.0022000.000000
50%0.0000000.0004570.0769230.0666670.000000
75%0.0086990.0036150.2000000.3333330.000000
max1.0000000.9524501.0000001.0000000.925237
\n", 1349 | "
" 1350 | ], 1351 | "text/plain": [ 1352 | " app channel device ip os\n", 1353 | "count 706.000000 202.000000 3475.000000 277396.000000 800.000000\n", 1354 | "mean 0.064204 0.043271 0.149752 0.256061 0.007943\n", 1355 | "std 0.161892 0.138098 0.230485 0.351494 0.057189\n", 1356 | "min 0.000000 0.000000 0.000000 0.000000 0.000000\n", 1357 | "25% 0.000000 0.000153 0.000000 0.002200 0.000000\n", 1358 | "50% 0.000000 0.000457 0.076923 0.066667 0.000000\n", 1359 | "75% 0.008699 0.003615 0.200000 0.333333 0.000000\n", 1360 | "max 1.000000 0.952450 1.000000 1.000000 0.925237" 1361 | ] 1362 | }, 1363 | "execution_count": 47, 1364 | "metadata": {}, 1365 | "output_type": "execute_result" 1366 | } 1367 | ], 1368 | "source": [ 1369 | "means_df.describe()" 1370 | ] 1371 | }, 1372 | { 1373 | "cell_type": "code", 1374 | "execution_count": 51, 1375 | "metadata": {}, 1376 | "outputs": [], 1377 | "source": [ 1378 | "means_df.to_csv('cols_means.csv', index=False)" 1379 | ] 1380 | }, 1381 | { 1382 | "cell_type": "code", 1383 | "execution_count": 64, 1384 | "metadata": {}, 1385 | "outputs": [ 1386 | { 1387 | "data": { 1388 | "text/plain": [ 1389 | "10499" 1390 | ] 1391 | }, 1392 | "execution_count": 64, 1393 | "metadata": {}, 1394 | "output_type": "execute_result" 1395 | } 1396 | ], 1397 | "source": [ 1398 | "weights['ip'].nunique()" 1399 | ] 1400 | }, 1401 | { 1402 | "cell_type": "code", 1403 | "execution_count": 48, 1404 | "metadata": {}, 1405 | "outputs": [ 1406 | { 1407 | "data": { 1408 | "text/html": [ 1409 | "
\n", 1410 | "\n", 1423 | "\n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | "
appchanneldeviceipos
03248.01642.01033413.0NaN364804.0
15796274.0NaN174330052.047.02215593.0
221642136.0NaN8105054.0NaN691125.0
333911780.0875627.0NaNNaN2904808.0
4126275.0862.01713.0NaN593103.0
\n", 1477 | "
" 1478 | ], 1479 | "text/plain": [ 1480 | " app channel device ip os\n", 1481 | "0 3248.0 1642.0 1033413.0 NaN 364804.0\n", 1482 | "1 5796274.0 NaN 174330052.0 47.0 2215593.0\n", 1483 | "2 21642136.0 NaN 8105054.0 NaN 691125.0\n", 1484 | "3 33911780.0 875627.0 NaN NaN 2904808.0\n", 1485 | "4 126275.0 862.0 1713.0 NaN 593103.0" 1486 | ] 1487 | }, 1488 | "execution_count": 48, 1489 | "metadata": {}, 1490 | "output_type": "execute_result" 1491 | } 1492 | ], 1493 | "source": [ 1494 | "weights_df = pd.DataFrame(weights)\n", 1495 | "weights_df.head()" 1496 | ] 1497 | }, 1498 | { 1499 | "cell_type": "code", 1500 | "execution_count": 52, 1501 | "metadata": {}, 1502 | "outputs": [ 1503 | { 1504 | "data": { 1505 | "text/html": [ 1506 | "
\n", 1507 | "\n", 1520 | "\n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | "
appchanneldeviceipos
count7.060000e+022.020000e+023.475000e+032.773960e+058.000000e+02
mean2.619035e+059.153658e+055.320975e+046.665701e+022.311299e+05
std2.118369e+061.815411e+062.960519e+065.446831e+032.221556e+06
min1.000000e+001.000000e+001.000000e+001.000000e+001.000000e+00
25%2.000000e+004.234500e+031.000000e+003.000000e+001.000000e+00
50%2.900000e+011.077880e+054.000000e+001.300000e+013.000000e+00
75%7.532500e+021.019082e+061.800000e+011.490000e+027.350000e+01
max3.391178e+071.506593e+071.743301e+081.238734e+064.418191e+07
\n", 1598 | "
" 1599 | ], 1600 | "text/plain": [ 1601 | " app channel device ip os\n", 1602 | "count 7.060000e+02 2.020000e+02 3.475000e+03 2.773960e+05 8.000000e+02\n", 1603 | "mean 2.619035e+05 9.153658e+05 5.320975e+04 6.665701e+02 2.311299e+05\n", 1604 | "std 2.118369e+06 1.815411e+06 2.960519e+06 5.446831e+03 2.221556e+06\n", 1605 | "min 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00\n", 1606 | "25% 2.000000e+00 4.234500e+03 1.000000e+00 3.000000e+00 1.000000e+00\n", 1607 | "50% 2.900000e+01 1.077880e+05 4.000000e+00 1.300000e+01 3.000000e+00\n", 1608 | "75% 7.532500e+02 1.019082e+06 1.800000e+01 1.490000e+02 7.350000e+01\n", 1609 | "max 3.391178e+07 1.506593e+07 1.743301e+08 1.238734e+06 4.418191e+07" 1610 | ] 1611 | }, 1612 | "execution_count": 52, 1613 | "metadata": {}, 1614 | "output_type": "execute_result" 1615 | } 1616 | ], 1617 | "source": [ 1618 | "weights_df.describe()" 1619 | ] 1620 | }, 1621 | { 1622 | "cell_type": "code", 1623 | "execution_count": 65, 1624 | "metadata": {}, 1625 | "outputs": [], 1626 | "source": [ 1627 | "weights_df.to_csv('weights_df.csv', index=False)" 1628 | ] 1629 | }, 1630 | { 1631 | "cell_type": "markdown", 1632 | "metadata": {}, 1633 | "source": [ 1634 | "For each of our encoded values, let's look at the number of unique values:" 1635 | ] 1636 | }, 1637 | { 1638 | "cell_type": "code", 1639 | "execution_count": 21, 1640 | "metadata": {}, 1641 | "outputs": [ 1642 | { 1643 | "name": "stdout", 1644 | "output_type": "stream", 1645 | "text": [ 1646 | "CPU times: user 30min 45s, sys: 1min 10s, total: 31min 56s\n", 1647 | "Wall time: 23min 2s\n" 1648 | ] 1649 | } 1650 | ], 1651 | "source": [ 1652 | "%%time \n", 1653 | "uniques = [len(train_df[col].unique().compute()) for col in cols]" 1654 | ] 1655 | }, 1656 | { 1657 | "cell_type": "code", 1658 | "execution_count": 22, 1659 | "metadata": {}, 1660 | "outputs": [ 1661 | { 1662 | "data": { 1663 | "text/plain": [ 1664 | "[277396, 706, 3475, 800, 202]" 1665 | ] 1666 | }, 1667 | 
"execution_count": 22, 1668 | "metadata": {}, 1669 | "output_type": "execute_result" 1670 | } 1671 | ], 1672 | "source": [ 1673 | "uniques" 1674 | ] 1675 | }, 1676 | { 1677 | "cell_type": "code", 1678 | "execution_count": 66, 1679 | "metadata": {}, 1680 | "outputs": [], 1681 | "source": [ 1682 | "import pandas as pd" 1683 | ] 1684 | }, 1685 | { 1686 | "cell_type": "code", 1687 | "execution_count": 39, 1688 | "metadata": {}, 1689 | "outputs": [ 1690 | { 1691 | "data": { 1692 | "text/html": [ 1693 | "
\n", 1694 | "\n", 1707 | "\n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | "
colsunique_counts
0ip277396
1app706
2device3475
3os800
4channel202
\n", 1743 | "
" 1744 | ], 1745 | "text/plain": [ 1746 | " cols unique_counts\n", 1747 | "0 ip 277396\n", 1748 | "1 app 706\n", 1749 | "2 device 3475\n", 1750 | "3 os 800\n", 1751 | "4 channel 202" 1752 | ] 1753 | }, 1754 | "execution_count": 39, 1755 | "metadata": {}, 1756 | "output_type": "execute_result" 1757 | } 1758 | ], 1759 | "source": [ 1760 | "cols_df = pd.DataFrame({'cols': cols, 'unique_counts': uniques})\n", 1761 | "cols_df.head()" 1762 | ] 1763 | }, 1764 | { 1765 | "cell_type": "code", 1766 | "execution_count": 44, 1767 | "metadata": {}, 1768 | "outputs": [], 1769 | "source": [ 1770 | "cols_df.to_csv('cols_unique_counts.csv', index=False)" 1771 | ] 1772 | }, 1773 | { 1774 | "cell_type": "code", 1775 | "execution_count": 45, 1776 | "metadata": {}, 1777 | "outputs": [ 1778 | { 1779 | "data": { 1780 | "text/html": [ 1781 | "
\n", 1782 | "\n", 1795 | "\n", 1796 | " \n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " \n", 1808 | " \n", 1809 | " \n", 1810 | " \n", 1811 | " \n", 1812 | " \n", 1813 | " \n", 1814 | " \n", 1815 | " \n", 1816 | " \n", 1817 | " \n", 1818 | " \n", 1819 | " \n", 1820 | " \n", 1821 | " \n", 1822 | " \n", 1823 | " \n", 1824 | " \n", 1825 | " \n", 1826 | " \n", 1827 | " \n", 1828 | " \n", 1829 | " \n", 1830 | "
colsunique_counts
0ip277396
1app706
2device3475
3os800
4channel202
\n", 1831 | "
" 1832 | ], 1833 | "text/plain": [ 1834 | " cols unique_counts\n", 1835 | "0 ip 277396\n", 1836 | "1 app 706\n", 1837 | "2 device 3475\n", 1838 | "3 os 800\n", 1839 | "4 channel 202" 1840 | ] 1841 | }, 1842 | "execution_count": 45, 1843 | "metadata": {}, 1844 | "output_type": "execute_result" 1845 | } 1846 | ], 1847 | "source": [ 1848 | "pd.read_csv('cols_unique_counts.csv')" 1849 | ] 1850 | }, 1851 | { 1852 | "cell_type": "code", 1853 | "execution_count": 68, 1854 | "metadata": {}, 1855 | "outputs": [ 1856 | { 1857 | "name": "stdout", 1858 | "output_type": "stream", 1859 | "text": [ 1860 | "CPU times: user 5min 59s, sys: 14.1 s, total: 6min 13s\n", 1861 | "Wall time: 4min 21s\n" 1862 | ] 1863 | } 1864 | ], 1865 | "source": [ 1866 | "%time target = train_df.is_attributed.values.compute()" 1867 | ] 1868 | }, 1869 | { 1870 | "cell_type": "code", 1871 | "execution_count": 73, 1872 | "metadata": {}, 1873 | "outputs": [ 1874 | { 1875 | "data": { 1876 | "text/plain": [ 1877 | "0.0024707214109990216" 1878 | ] 1879 | }, 1880 | "execution_count": 73, 1881 | "metadata": {}, 1882 | "output_type": "execute_result" 1883 | } 1884 | ], 1885 | "source": [ 1886 | "1 - (target == 0).mean()" 1887 | ] 1888 | }, 1889 | { 1890 | "cell_type": "code", 1891 | "execution_count": null, 1892 | "metadata": {}, 1893 | "outputs": [], 1894 | "source": [] 1895 | } 1896 | ], 1897 | "metadata": { 1898 | "kernelspec": { 1899 | "display_name": "Python 3", 1900 | "language": "python", 1901 | "name": "python3" 1902 | }, 1903 | "language_info": { 1904 | "codemirror_mode": { 1905 | "name": "ipython", 1906 | "version": 3 1907 | }, 1908 | "file_extension": ".py", 1909 | "mimetype": "text/x-python", 1910 | "name": "python", 1911 | "nbconvert_exporter": "python", 1912 | "pygments_lexer": "ipython3", 1913 | "version": "3.6.4" 1914 | }, 1915 | "toc": { 1916 | "nav_menu": {}, 1917 | "number_sections": false, 1918 | "sideBar": true, 1919 | "skip_h1_title": false, 1920 | "title_cell": "Table of Contents", 1921 | 
"title_sidebar": "Contents", 1922 | "toc_cell": false, 1923 | "toc_position": { 1924 | "height": "calc(100% - 180px)", 1925 | "left": "10px", 1926 | "top": "150px", 1927 | "width": "186px" 1928 | }, 1929 | "toc_section_display": true, 1930 | "toc_window_display": false 1931 | } 1932 | }, 1933 | "nbformat": 4, 1934 | "nbformat_minor": 2 1935 | } 1936 | --------------------------------------------------------------------------------