├── .gitignore
├── README.md
├── __init__.py
├── application.conf
├── factors
│   ├── __init__.py
│   └── factors_pd.py
├── junk
│   ├── __init__.py
│   ├── blend_submissions.py
│   ├── collect_libffm_predictions.py
│   ├── compute_libffm_auc.py
│   ├── convert_tsv_to_columns.py
│   ├── create_days_index.py
│   ├── create_submission_index.py
│   ├── create_subsample_index.py
│   ├── create_train_day_8_9_hour_4_5_9_10_13_14_index.py
│   ├── create_train_day_8_9_index.py
│   ├── create_train_index.py
│   └── print_submission_days_hours.py
├── lib
│   ├── __init__.py
│   ├── columns.py
│   ├── hocon.py
│   ├── project.py
│   ├── quality.py
│   └── utils.py
├── models
│   ├── __init__.py
│   ├── catboost_.py
│   ├── libffm.py
│   └── lightgbm_.py
└── preprocessing
    ├── eda.ipynb
    └── merge_test_sets.ipynb

/.gitignore:
--------------------------------------------------------------------------------
1 | submissions
2 | .idea
3 | 
4 | .DS_Store
5 | 
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | 
11 | # C extensions
12 | *.so
13 | 
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 | 
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 | 
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 | 
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | .hypothesis/
52 | .pytest_cache/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | 
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 | 
67 | # Scrapy stuff:
68 | .scrapy
69 | 
70 | # Sphinx documentation
71 | docs/_build/
72 | 
73 | # PyBuilder
74 | target/
75 | 
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 | 
79 | # pyenv
80 | .python-version
81 | 
82 | # celery beat schedule file
83 | celerybeat-schedule
84 | 
85 | # SageMath parsed files
86 | *.sage.py
87 | 
88 | # Environments
89 | .env
90 | .venv
91 | env/
92 | venv/
93 | ENV/
94 | env.bak/
95 | venv.bak/
96 | 
97 | # Spyder project settings
98 | .spyderproject
99 | .spyproject
100 | 
101 | # Rope project settings
102 | .ropeproject
103 | 
104 | # mkdocs documentation
105 | /site
106 | 
107 | # mypy
108 | .mypy_cache/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Kaggle TalkingData Ad Tracking Fraud Detection Challenge
2 | 
3 | Scripts for the competition
4 | https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection
5 | 
6 | Final ranking #55 with private leaderboard score 0.9824250
7 | 
8 | * Aggregate factors from Baris' kernel computed on all data with test_supplement
9 | * Prev/next click times (1-, 2-, 3-step) computed on all data with test_supplement
10 | * LIBFFM out-of-fold using 5 folds
11 | * LGBM on 50% of the data (using all is_attributed=1 and 50% of is_attributed=0)
12 | * Average of a few LGBM models trained on different subsets of data and factors
13 | * Training with 96GB RAM
14 | * Store data column-wise in binary formats for fast loading
15 | * HOCON configurations are awesome
16 | 
17 | EDA [preprocessing/eda.ipynb](preprocessing/eda.ipynb) - distributions of
frequencies in train.csv 18 | 19 | Merge test datasets [preprocessing/merge_test_sets.ipynb](preprocessing/merge_test_sets.ipynb) - script for merging test datasets 20 | test.csv 21 | test_supplement.csv 22 | 23 | Baris kernel 24 | https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977?scriptVersionId=3224614 25 | https://www.kaggle.com/aharless/kaggle-runnable-version-of-baris-kanber-s-lightgbm 26 | 27 | Next/prev clicks without hashing trick 28 | https://www.kaggle.com/asydorchuk/nextclick-calculation-without-hashing-trick 29 | 30 | Submissions 31 | 32 | model | dump | train_set | params | val AUC | lb AUC | factors 33 | --- | --- | --- | --- | --- | --- | --- 34 | lgbm | lgbm_08 | d_8_9_h_4_5_9_... | it=250 lr=0.2 md=3 nl=7 scw=300 | 0.983xx | 0.9795 | baris 35 | lgbm | lgbm_09 | na_10pct | it=500 lr=0.1 md=3 nl=7 scw=na | 0.98498 | 0.9803 | baris 36 | lgbm | lgbm_11 | d_8_9_h_4_5_9_... | it=2000 lr=0.01 md=3 nl=7 scw=300 | 0.98332 | 0.9791 | baris 37 | cbst | cbst_09 | na_10pct | it=500 lr=0.05 md=6 rms=0.7 | 0.98221 | 0.9766 | baris 38 | lgbm | lgbm_12 | na_10pct | it=500 lr=0.1 md=3 nl=7 scw=na | 0.98564 | 0.9806 | baris + t2 39 | lgbm | lgbm_13 | na_10pct | it=1000 lr=0.1 md=3 nl=7 scw=na | 0.98610 | 0.9810 | baris + t2 40 | lgbm | lgbm_14 | na_10pct | it=1000 lr=0.1 md=3 nl=7 scw=na | 0.9855x | 0.9810 | baris + t2 + t3 41 | lgbm | lgbm_15 | na_10pct | it=1370 lr=0.1 md=3 nl=7 scw=na | 0.98567 | 0.9811 | baris + t2 + t3 42 | lgbm | lgbm_16 | na_10pct | it=1360 lr=0.1 md=3 nl=7 scw=na | 0.98544 | 0.9809 | baris + t2 + libffm 43 | lgbm | lgbm_17 | na_10pct | it=820 lr=0.1 md=4 nl=15 scw=na | 0.98580 | 0.9810 | baris + t2 + libffm 44 | blend | | | logit, weights=1.0 | - | 0.9812 | blend lgbm_13..lgbm_17 45 | lgbm | lgbm_18 | na_20pct | it=1500 lr=0.1 md=3 nl=7 scw=na | 0.98571 | 0.9808 | baris + t2 + libffm + tc2 46 | lgbm | lgbm_19 | na_50pct | it=2500 lr=0.1 md=3 nl=7 scw=na | 0.98595 | 0.9811 | baris + t2 47 | blend | | | logit, weights=1.0 | - | 0.9813 | blend lgbm_15 + lgbm_19 48 | lgbm | lgbm_19 | na_50pct | it=1500, all attributed | - | 0.9810 | baris + t2 + libffm 49 | lgbm | lgbm_20 | na_50pct_2 | it=1500, all attributed | - | 0.9810 | baris + t2 + libffm 50 | blend | | | logit, weights=1.0 | - | 0.9813 | blend lgbm_13..lgbm_20 51 | 52 | Factors strength (lgbm_19) 53 | 54 | feature | gain | split 55 | --- | --- | --- 56 | libffm_oof | 82.34410715832054 | 363 57 | ip_app_device_os_t_next | 4.858614532478393 | 555 58 | app | 4.285458537390102 | 1057 59 | ip_nunique_channel | 1.7134587347643337 | 173 60 | channel | 1.2185210564277669 | 2220 61 | os | 0.9917807676129159 | 1516 62 | hour | 0.8615137822058627 | 1078 63 | ip_nunique_app | 0.5879513478431967 | 157 64 | ip_nunique_device | 0.5397789939897564 | 190 65 | ip_day_hour_count | 0.461685682523349 | 191 66 | ip_app_count | 0.42650563120670515 | 99 67 | ip_app_device_os_t_next_2 | 0.35302018803960517 | 194 68 | ip_device_os_nunique_app | 0.3238393490783555 | 198 69 | ip_day_nunique_hour | 0.30262413717998854 | 76 70 | ip_app_os_count | 0.2384258533635993 | 84 71 | ip_device_os_cumcount_app | 0.11690381200637338 | 58 72 | ip_app_device_os_t_prev | 0.07133512725949863 | 95 73 | device | 0.06228725573269818 | 70 74 | ip_app_nunique_os | 0.06172545078968752 | 141 75 | ip_cumcount_os | 0.05417631528656273 | 44 76 | ip_app_channel_mean_hour | 0.04849352774733603 | 144 77 | day | 0.017391508495188064 | 61 78 | app_nunique_channel | 0.014128035383901352 | 35 79 | ip_app_os_var_hour | 0.01325167381501645 | 52 80 | 
ip_app_device_os_t_prev_2 | 0.012127423364797763 | 60 81 | ip_app_channel_var_day | 0.011027400971738753 | 39 82 | ip_day_channel_var_hour | 0.009866716722734953 | 50 83 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stys/kaggle-talkingdata-adtracking-fraud-detection/cf2f2d838807f0044f48f7d261e00be184db669b/__init__.py -------------------------------------------------------------------------------- /application.conf: -------------------------------------------------------------------------------- 1 | data = { 2 | dir = ../data 3 | files = { 4 | train = train.csv 5 | test = test.csv 6 | test_supplement = test_supplement.csv 7 | test_merged = test_merged.csv 8 | } 9 | ntrain = 184903890 10 | } 11 | 12 | factors = { 13 | # hash_id = { 14 | # ip_app_device_os_hash = { 15 | # columns = [ip, app, device, os] 16 | # groupby = [ip, app, device, os] 17 | # num_bits = 27 18 | # dtype = uint32 19 | # } 20 | # } 21 | 22 | click_time_no_hash = { 23 | ip_app_device_os_t_prev = { 24 | columns = [ip, app, device, os, epoch] 25 | groupby = [ip, app, device, os] 26 | reverse = false 27 | dtype = uint32 28 | }, 29 | ip_app_device_os_t_next = { 30 | columns = [ip, app, device, os, epoch] 31 | groupby = [ip, app, device, os] 32 | reverse = true 33 | dtype = uint32 34 | }, 35 | ip_app_device_os_t_prev_2 = { 36 | columns = [ip, app, device, os, epoch] 37 | groupby = [ip, app, device, os] 38 | reverse = false 39 | step = 2 40 | dtype = uint32 41 | }, 42 | ip_app_device_os_t_next_2 = { 43 | columns = [ip, app, device, os, epoch] 44 | groupby = [ip, app, device, os] 45 | reverse = true 46 | step = 2 47 | dtype = uint32 48 | } 49 | ip_app_device_os_t_prev_3 = { 50 | columns = [ip, app, device, os, epoch] 51 | groupby = [ip, app, device, os] 52 | reverse = false 53 | step = 3 54 | dtype = uint32 55 | }, 56 | ip_app_device_os_t_next_3 = { 57 | columns = [ip, app, device, os, epoch] 58 | groupby = [ip, app, device, os] 59 | reverse = true 60 | step = 3 61 | dtype = uint32 62 | } 63 | ip_app_device_os_channel_t_prev = { 64 | columns = [ip, app, device, os, channel, epoch] 65 | groupby = [ip, app, device, os, channel] 66 | reverse = false 67 | dtype = uint32 68 | }, 69 | ip_app_device_os_channel_t_next = { 70 | columns = [ip, app, device, os, channel, epoch] 71 | groupby = [ip, app, device, os, channel] 72 | reverse = true 73 | dtype = uint32 74 | }, 75 | } 76 | 77 | aggr = { 78 | ip_nunique_channel = { 79 | columns = [ip, channel] 80 | groupby = [ip] 81 | select = channel 82 | aggr = nunique 83 | dtype = uint8 84 | }, 85 | ip_device_os_cumcount_app = { 86 | columns = [ip, device, os, app] 87 | groupby = [ip, device, os] 88 | select = app 89 | aggr = cumcount 90 | dtype = uint32 91 | }, 92 | ip_day_nunique_hour = { 93 | columns = [ip, day, hour] 94 | groupby = [ip, day] 95 | select = hour 96 | aggr = nunique 97 | dtype = uint32 98 | }, 99 | ip_nunique_app = { 100 | columns = [ip, app] 101 | groupby = [ip] 102 | select = app 103 | aggr = nunique 104 | dtype = uint8 105 | }, 106 | ip_app_nunique_os = { 107 | columns = [ip, app, os] 108 | groupby = [ip, app] 109 | select = os 110 | aggr = nunique 111 | dtype = uint8 112 | }, 113 | ip_nunique_device = { 114 | columns = [ip, device] 115 | groupby = [ip] 116 | select = device 117 | aggr = nunique 118 | dtype = uint16 119 | }, 120 | app_nunique_channel = { 121 | columns = [app, channel] 122 | groupby = [app] 123 | 
select = channel 124 | aggr = nunique 125 | dtype = uint32 126 | }, 127 | ip_cumcount_os = { 128 | columns = [ip, os] 129 | groupby = [ip] 130 | select = os 131 | aggr = cumcount 132 | dtype = uint32 133 | }, 134 | ip_device_os_nunique_app = { 135 | columns = [ip, device, os, app] 136 | groupby = [ip, device, os] 137 | select = app 138 | aggr = nunique 139 | dtype = uint32 140 | }, 141 | ip_day_hour_count = { 142 | columns = [ip, day, hour, channel] 143 | groupby = [ip, day, hour] 144 | select = channel 145 | aggr = count 146 | dtype = uint32 147 | }, 148 | ip_app_count = { 149 | columns = [ip, app, channel] 150 | groupby = [ip, app] 151 | select = channel 152 | aggr = count 153 | dtype = uint32 154 | }, 155 | ip_app_os_count = { 156 | columns = [ip, app, os, channel] 157 | groupby = [ip, app, os] 158 | select = channel 159 | aggr = count 160 | dtype = uint32 161 | }, 162 | ip_day_channel_var_hour = { 163 | columns = [ip, day, channel, hour] 164 | groupby = [ip, day, channel] 165 | select = hour 166 | aggr = var 167 | dtype = float32 168 | }, 169 | ip_app_os_var_hour = { 170 | columns = [ip, app, os, hour] 171 | groupby = [ip, app, os] 172 | select = hour 173 | aggr = var 174 | dtype = float32 175 | }, 176 | ip_app_channel_var_day = { 177 | columns = [ip, app, channel, day] 178 | groupby = [ip, app, channel] 179 | select = day 180 | aggr = var 181 | dtype = float32 182 | }, 183 | ip_app_channel_mean_hour = { 184 | columns = [ip, app, channel, hour] 185 | groupby = [ip, app, channel] 186 | select = hour 187 | aggr = mean 188 | dtype = float32 189 | } 190 | } 191 | } 192 | 193 | factors_pd = { 194 | dump = { 195 | dir = "../dumps/factors_baris_04/" 196 | } 197 | source = "../data/columns" 198 | factors = ${factors} 199 | } 200 | 201 | catboost = { 202 | dump = { 203 | dir = "../dumps/catboost_10" 204 | } 205 | 206 | data = { 207 | dir = "../data/columns" 208 | train = { 209 | index = subsample_not_attributed_10pct 210 | } 211 | test = { 212 | index = submission 213 | } 214 | } 215 | 216 | target = is_attributed 217 | 218 | features = [ 219 | app, 220 | device, 221 | os, 222 | channel, 223 | hour, 224 | day, 225 | ip_app_device_os_t_next, 226 | ip_app_device_os_t_prev, 227 | ip_app_device_os_t_next_2, 228 | ip_app_device_os_t_prev_2, 229 | ip_nunique_channel, 230 | ip_device_os_cumcount_app, 231 | ip_day_nunique_hour, 232 | ip_nunique_app, 233 | ip_app_nunique_os, 234 | ip_nunique_device, 235 | app_nunique_channel, 236 | ip_cumcount_os, 237 | ip_device_os_nunique_app, 238 | ip_day_hour_count, 239 | ip_app_count, 240 | ip_app_os_count, 241 | ip_day_channel_var_hour, 242 | ip_app_os_var_hour, 243 | ip_app_channel_var_day, 244 | ip_app_channel_mean_hour 245 | ] 246 | 247 | categorical_features = [ 248 | app, 249 | device, 250 | os, 251 | channel, 252 | hour, 253 | day 254 | ] 255 | 256 | options = { 257 | eval_metric = AUC 258 | learning_rate = 0.1 259 | iterations = 500 260 | depth = 6 261 | rsm=0.7 262 | simple_ctr = [Counter, Counter, Counter, Counter, Counter, Counter] 263 | } 264 | 265 | hyperopt = { 266 | enabled = false 267 | max_evals = 25 268 | space = { 269 | l2_leaf_reg = { 270 | expression = loguniform, 271 | params = { 272 | low = 3.0 273 | high = 5.0 274 | } 275 | } 276 | learning_rate = { 277 | expression = loguniform, 278 | params = { 279 | low = -5.0 280 | high = -0.5 281 | } 282 | } 283 | } 284 | } 285 | } 286 | 287 | lightgbm = { 288 | dump = { 289 | dir = "../dumps/lightgbm_20" 290 | } 291 | 292 | data = { 293 | dir = "../data/columns" 294 | train = { 295 | index = 
subsample_not_attributed_50pct_2 296 | # index = subsample_not_attributed_50pct 297 | # index = subsample_not_attributed_10pct 298 | # index = days_8_9_hours_4_5_9_10_13_14_attributed 299 | } 300 | test = { 301 | index = submission 302 | } 303 | } 304 | 305 | label = is_attributed 306 | 307 | features = [ 308 | app, 309 | device, 310 | os, 311 | channel, 312 | hour, 313 | day, 314 | ip_app_device_os_t_next, 315 | ip_app_device_os_t_prev, 316 | ip_app_device_os_t_prev_2, 317 | ip_app_device_os_t_next_2, 318 | ip_nunique_channel, 319 | ip_device_os_cumcount_app, 320 | ip_day_nunique_hour, 321 | ip_nunique_app, 322 | ip_app_nunique_os, 323 | ip_nunique_device, 324 | app_nunique_channel, 325 | ip_cumcount_os, 326 | ip_device_os_nunique_app, 327 | ip_day_hour_count, 328 | ip_app_count, 329 | ip_app_os_count, 330 | ip_day_channel_var_hour, 331 | ip_app_os_var_hour, 332 | ip_app_channel_var_day, 333 | ip_app_channel_mean_hour, 334 | libffm_oof 335 | ] 336 | 337 | categorical_features = [ 338 | app, 339 | device, 340 | os, 341 | channel, 342 | hour, 343 | day 344 | ] 345 | 346 | params = { 347 | objective = binary 348 | metric = auc 349 | learning_rate = 0.1 350 | max_depth = 3 351 | num_leaves = 7 352 | min_child_samples = 100 353 | min_child_weight = 0 354 | max_bin = 100 355 | subsample = 0.7 356 | subsample_freq = 1 357 | colsample_bytree = 0.9 358 | scale_pos_weight = 100.0 359 | early_stopping_rounds = 1000 360 | num_threads = 15 361 | } 362 | 363 | options = { 364 | num_boost_round = 1500 365 | verbose_eval = 10 366 | } 367 | 368 | valid_size = 0 369 | } 370 | 371 | libffm = { 372 | dump = { 373 | dir = "../dumps/libffm_00" 374 | } 375 | 376 | data = { 377 | dir = "../data/columns" 378 | } 379 | 380 | options = { 381 | learning_rate = "0.1" 382 | factor = "12" 383 | lambda = "0.00002" 384 | num_iter = "2" 385 | } 386 | } 387 | 388 | -------------------------------------------------------------------------------- /factors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stys/kaggle-talkingdata-adtracking-fraud-detection/cf2f2d838807f0044f48f7d261e00be184db669b/factors/__init__.py -------------------------------------------------------------------------------- /factors/factors_pd.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import logging 3 | from os.path import abspath, join as join_path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from lib.project import project 9 | from lib.columns import DataFrameCols 10 | from lib.utils import makedirs 11 | 12 | 13 | class Factors(object): 14 | 15 | def datetimes(self, df): 16 | df['day'] = pd.to_datetime(df['click_time']).dt.day.astype('uint8') 17 | df['hour'] = pd.to_datetime(df['click_time']).dt.hour.astype('uint8') 18 | return df 19 | 20 | def aggr(self, df, name, groupby, select, aggr, dtype, **other): 21 | """ Baris' aggregates 22 | https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977?scriptVersionId=3224614 23 | https://www.kaggle.com/aharless/kaggle-runnable-version-of-baris-kanber-s-lightgbm 24 | """ 25 | grouped = df[groupby + [select]].groupby(groupby)[select] 26 | if aggr == 'count': 27 | count = grouped.count().reset_index().rename(columns={select: name}) 28 | df = df.merge(count, on=groupby, how='left') 29 | df[name] = df[name].astype(dtype) 30 | if aggr == 'nunique': 31 | nunique = grouped.nunique().reset_index().rename(columns={select: name}) 32 | df = df.merge(nunique, 
on=groupby, how='left')
33 |             df[name] = df[name].astype(dtype)
34 |         if aggr == 'mean':
35 |             mean = grouped.mean().reset_index().rename(columns={select: name}).fillna(0)
36 |             df = df.merge(mean, on=groupby, how='left')
37 |             df[name] = df[name].astype(dtype)
38 |         if aggr == 'var':
39 |             var = grouped.var().reset_index().rename(columns={select: name}).fillna(0)
40 |             df = df.merge(var, on=groupby, how='left')
41 |             df[name] = df[name].astype(dtype)
42 |         if aggr == 'cumcount':
43 |             cumcount = grouped.cumcount()
44 |             df[name] = cumcount.values
45 |             df[name] = df[name].astype(dtype)
46 | 
47 |         del grouped
48 |         gc.collect()
49 | 
50 |         return df
51 | 
52 |     def hash_id(self, df, name, groupby, num_bits=27, salt='salt', **other):
53 |         d = (1 << num_bits)
54 | 
55 |         def hashfcn(row):
56 |             if row['id'] % 1000 == 0:
57 |                 logging.info(row['id'])
58 |             return hash(salt + '_'.join(map(str, [row[k] for k in groupby]))) % d
59 | 
60 |         df[name] = df.apply(hashfcn, axis=1, reduce=False).astype(np.uint32)
61 |         return df
62 | 
63 |     def click_time(self, df, name, hash_id, reverse=False, num_bits=27, **other):
64 |         """ Baris' time to prev/next click
65 |         https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977?scriptVersionId=3224614
66 |         https://www.kaggle.com/aharless/kaggle-runnable-version-of-baris-kanber-s-lightgbm
67 |         """
68 |         d = (1 << num_bits)
69 | 
70 |         epochs = df['epoch'].values
71 |         ids = df[hash_id].values
72 |         if reverse:
73 |             ids = reversed(ids)
74 |             epochs = reversed(epochs)
75 | 
76 |         unknown = np.iinfo(np.uint32).max
77 |         buf = np.full(d, unknown, dtype=np.uint32)
78 |         prev_click = np.full(df.shape[0], unknown, dtype=np.uint32)
79 | 
80 |         for i, (_id, t) in enumerate(zip(ids, epochs)):
81 |             t_prev = buf[_id]
82 |             buf[_id] = t
83 |             if t_prev != unknown:
84 |                 if not reverse:
85 |                     prev_click[i] = t - t_prev
86 |                 else:
87 |                     prev_click[i] = t_prev - t
88 | 
89 |         if not reverse:
90 |             df[name] = prev_click
91 |         else:
92 |             df[name] = np.flipud(prev_click)
93 | 
94 |         return df
95 | 
96 |     def click_time_no_hash(self, df, name, groupby, step=1, reverse=False, **other):
97 |         """ Compute previous/next click time without hashing trick
98 |         https://www.kaggle.com/asydorchuk/nextclick-calculation-without-hashing-trick
99 |         """
100 |         if not reverse:
101 |             df[name] = df['epoch'] - df.groupby(groupby)['epoch'].shift(step).fillna(0)
102 |         else:
103 |             df[name] = df.groupby(groupby)['epoch'].shift(-step).fillna(3000000000) - df['epoch']
104 | 
105 |         return df
106 | 
107 | def main(conf):
108 |     dump_dir = abspath(conf['factors_pd']['dump']['dir'])
109 |     makedirs(dump_dir)
110 | 
111 |     data_dir = abspath(conf['factors_pd']['source'])
112 |     dfc = DataFrameCols(data_dir)
113 | 
114 |     computer = Factors()
115 |     for group in conf['factors_pd']['factors']:
116 |         logging.info('Compute factors group: %s', group)
117 |         for factor in conf['factors_pd']['factors'][group]:
118 |             logging.info('Compute factor: %s', factor)
119 |             spec = conf['factors_pd']['factors'][group][factor]
120 |             df = dfc.load_df(['id'] + spec['columns'])
121 |             df = getattr(computer, group)(df, factor, **spec)
122 |             df.sort_values(by=['id'], inplace=True)
123 |             if conf['factors_pd']['factors'][group][factor].get('factors', None) is None:
124 |                 dfc.write_column(factor, df[factor].values)
125 |             else:
126 |                 for fout in conf['factors_pd']['factors'][group][factor].get('factors'):
127 |                     fname = factor + '_' + fout
128 |                     dfc.write_column(fname, df[fname].values)
129 |             del df
130 |             gc.collect()
131 | 
132 | if __name__ == '__main__':
133 |     main(project().conf)
134 |
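A minimal sketch of the groupby/shift trick that click_time_no_hash above relies on (toy rows; the column names follow the repo's schema and the fill values 0 and 3000000000 mirror the method's defaults):

import pandas as pd

clicks = pd.DataFrame({
    'ip':     [1, 1, 1, 2],
    'app':    [3, 3, 3, 7],
    'device': [0, 0, 0, 0],
    'os':     [5, 5, 5, 5],
    'epoch':  [100, 160, 400, 100],
})

group = ['ip', 'app', 'device', 'os']
# seconds since the previous click of the same (ip, app, device, os) combination;
# the first click of a combination falls back to epoch - 0
clicks['t_prev'] = clicks['epoch'] - clicks.groupby(group)['epoch'].shift(1).fillna(0)
# seconds until the next click; the last click of a combination gets a large sentinel delta
clicks['t_next'] = clicks.groupby(group)['epoch'].shift(-1).fillna(3000000000) - clicks['epoch']
print(clicks[['epoch', 't_prev', 't_next']])

Passing step=2 or step=3 to the real method shifts by two or three clicks instead of one, which is how the 2- and 3-step prev/next factors in application.conf are produced.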
-------------------------------------------------------------------------------- /junk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stys/kaggle-talkingdata-adtracking-fraud-detection/cf2f2d838807f0044f48f7d261e00be184db669b/junk/__init__.py -------------------------------------------------------------------------------- /junk/blend_submissions.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from scipy.special import logit, expit 7 | 8 | 9 | if __name__ == '__main__': 10 | parser = ArgumentParser() 11 | parser.add_argument('--submissions', nargs='+') 12 | parser.add_argument('--weights', nargs='+', type=float) 13 | parser.add_argument('--mix-logits', action='store_true') 14 | parser.add_argument('--output-file') 15 | args = parser.parse_args() 16 | 17 | n = 18790469 18 | blend = np.zeros(n, dtype=np.float32) 19 | wnorm = sum(args.weights) 20 | for j, fname in enumerate(args.submissions): 21 | df = pd.read_csv(fname) 22 | df.sort_values(by=['click_id'], inplace=True) 23 | values = df['is_attributed'].values 24 | if args.mix_logits: 25 | values = logit(values) 26 | blend += args.weights[j] * values / wnorm 27 | 28 | if args.mix_logits: 29 | blend = expit(blend) 30 | 31 | df_out = pd.DataFrame(data={'click_id': np.arange(n, dtype=np.int32), 'is_attributed': blend}) 32 | df_out.to_csv(args.output_file, header=True, index=False) 33 | -------------------------------------------------------------------------------- /junk/collect_libffm_predictions.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | from os.path import abspath, join as join_path 4 | from argparse import ArgumentParser 5 | 6 | import numpy as np 7 | 8 | from scipy.special import logit 9 | from sklearn.metrics import roc_auc_score, log_loss 10 | 11 | from lib.columns import DataFrameCols 12 | 13 | 14 | if __name__ == '__main__': 15 | parser = ArgumentParser() 16 | parser.add_argument('dump') 17 | args = parser.parse_args() 18 | 19 | dumpdir = abspath(args.dump) 20 | datadir = abspath('../data/columns') 21 | 22 | dfc = DataFrameCols(datadir) 23 | df = dfc.load_df(columns=['id', 'is_attributed']) 24 | df['p'] = 0 25 | 26 | df_train = df[df['is_attributed'] >= 0] 27 | df_test = df[df['is_attributed'] == -1] 28 | print(df_test.shape[0]) 29 | 30 | with open(join_path(dumpdir, 'folds.pkl'), 'rb') as f: 31 | folds = pickle.load(f) 32 | 33 | p_test_avg = np.zeros(df_test.shape[0]) 34 | for j_fold, (fold_idx, valid_idx) in enumerate(folds): 35 | valid_pred_file = join_path(dumpdir, 'valid_pred_%d.txt' % j_fold) 36 | with open(valid_pred_file, 'r') as f: 37 | p_valid = np.array([float(s) for s in f.readlines()]) 38 | 39 | y_valid = df_train.loc[valid_idx, 'is_attributed'].values 40 | auc_valid = roc_auc_score(y_valid, p_valid) 41 | print('Fold %d validation auc=%f' % (j_fold, auc_valid)) 42 | 43 | df_train.loc[valid_idx, 'p'] = logit(p_valid) 44 | 45 | test_pred_file = join_path(dumpdir, 'test_pred_%d.txt' % j_fold) 46 | with open(test_pred_file, 'r') as f: 47 | p_test = np.array([float(s) for s in f.readlines()]) 48 | p_test_avg += logit(p_test) 49 | 50 | df_test.loc[:, 'p'] = p_test_avg / 5 51 | df_all = df_train.append(df_test, ignore_index=True) 52 | df_all.sort_values(by=['id'], inplace=True) 53 | dfc.write_column('libffm_oof', df_all['p'].values) 54 
| 55 | 56 | -------------------------------------------------------------------------------- /junk/compute_libffm_auc.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from os.path import abspath, join as join_path 3 | from argparse import ArgumentParser 4 | 5 | import numpy as np 6 | from sklearn.metrics import roc_auc_score, log_loss 7 | 8 | from lib.columns import DataFrameCols 9 | 10 | 11 | if __name__ == '__main__': 12 | parser = ArgumentParser() 13 | parser.add_argument('dump', type=str) 14 | parser.add_argument('fold', type=int) 15 | args = parser.parse_args() 16 | 17 | dumpdir = abspath(args.dump) 18 | datadir = abspath('../data/columns') 19 | 20 | dfc = DataFrameCols(datadir) 21 | df = dfc.load_df(columns=['id', 'is_attributed']) 22 | df = df[df['is_attributed'] >= 0] 23 | 24 | with open(join_path(dumpdir, 'folds.pkl'), 'rb') as f: 25 | folds = pickle.load(f) 26 | 27 | train_pred_file = join_path(dumpdir, 'train_pred_%d.txt' % args.fold) 28 | with open(train_pred_file, 'r') as f: 29 | p_train = np.array([float(s) for s in f.readlines()]) 30 | 31 | valid_pred_file = join_path(dumpdir, 'valid_pred_%d.txt' % args.fold) 32 | with open(valid_pred_file, 'r') as f: 33 | p_valid = np.array([float(s) for s in f.readlines()]) 34 | 35 | fold_idx = folds[args.fold][0] 36 | valid_idx = folds[args.fold][1] 37 | 38 | y_train = df.loc[fold_idx, 'is_attributed'].values 39 | y_valid = df.loc[valid_idx, 'is_attributed'].values 40 | 41 | print('Train results: log_loss=%f, auc=%f' % (log_loss(y_train, p_train), roc_auc_score(y_train, p_train))) 42 | print('Valid results: log_loss=%f, auc=%f' % (log_loss(y_valid, p_valid), roc_auc_score(y_valid, p_valid))) 43 | 44 | 45 | 46 | # ffm-train -p valid_fold_0.txt -l 0.0002 -k 4 -t 2 train_fold_0.txt model_fold_0.bin 47 | # Train results: log_loss=0.007449, auc=0.964776 48 | # Valid results: log_loss=0.007866, auc=0.961628 49 | 50 | 51 | -------------------------------------------------------------------------------- /junk/convert_tsv_to_columns.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from lib.columns import DataFrameCols 5 | from lib.utils import makedirs 6 | 7 | if __name__ == '__main__': 8 | df = pd.read_csv('../data/train_test_merged/train_test_merged.tsv', sep='\t', parse_dates=['click_time']) 9 | 10 | rename_columns = {} 11 | for col in df.columns: 12 | if col.startswith('# '): 13 | rename_columns[col] = col[2:] 14 | df.rename(columns=rename_columns, inplace=True) 15 | df.sort_values(by=['id'], inplace=True) 16 | 17 | dtypes = { 18 | 'id': 'uint32', 19 | 'ip': 'uint32', 20 | 'app': 'uint16', 21 | 'device': 'uint16', 22 | 'os': 'uint16', 23 | 'channel': 'uint16', 24 | 'click_id': 'int32', 25 | 'click_id_submission': 'int32', 26 | 'is_attributed': 'int8' 27 | } 28 | 29 | test_dir = '../data/columns' 30 | makedirs(test_dir) 31 | 32 | dfc = DataFrameCols(test_dir) 33 | for col, dtype in dtypes.items(): 34 | print(col, dtype) 35 | dfc.write_column(col, df[col].astype(dtype).values) 36 | 37 | dfc.write_column('day', pd.to_datetime(df['click_time']).dt.day.astype('uint8').values) 38 | dfc.write_column('hour', pd.to_datetime(df['click_time']).dt.hour.astype('uint8').values) 39 | dfc.write_column('epoch', (df['click_time'].astype(np.int64) // 10 ** 9).values) 40 | -------------------------------------------------------------------------------- /junk/create_days_index.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | day_col = dfc.load_column('day') 8 | hour_col = dfc.load_column('hour') 9 | is_attributed_col = dfc.load_column('is_attributed') 10 | 11 | hours = np.unique(hour_col, return_counts=True)[0] 12 | 13 | for h in hours: 14 | all = np.where((day_col == 9) & (hour_col == h))[0].shape[0] 15 | attributed = np.where((day_col == 9) & (hour_col == h) & (is_attributed_col >= 0))[0].shape[0] 16 | 17 | print(h, all, attributed) 18 | 19 | # hour all attributed 20 | # 0 3318301 3318301 21 | # 1 3082862 3082862 22 | # 2 3068887 3068887 23 | # 3 3351149 3351149 24 | # 4 4032691 4032691 25 | # 5 3671741 3671741 26 | # 6 3570940 3570940 27 | # 7 3186240 3186240 28 | # 8 2804701 2804701 29 | # 9 2986204 2986204 30 | # 10 3304199 3304199 31 | # 11 3347741 3347741 32 | # 12 3363917 3363917 33 | # 13 3457523 3457523 34 | # 14 3443348 3443283 !!! hour 14 has small fraction of not attributed events 35 | # 15 3026679 3026111 36 | # 16 2495595 447 37 | # 17 1265180 0 38 | # 18 762056 0 39 | # 19 526096 0 40 | # 20 432411 0 41 | # 21 571504 0 42 | # 22 1325626 0 43 | # 23 2423959 0 -------------------------------------------------------------------------------- /junk/create_submission_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | 8 | click_id_submission = dfc.load_column(col='click_id_submission') 9 | index = np.where(click_id_submission >= 0)[0].astype(np.uint32) 10 | dfc.write_index('submission', index) 11 | -------------------------------------------------------------------------------- /junk/create_subsample_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | 8 | is_attributed_col = dfc.load_column('is_attributed') 9 | subsample = np.random.choice([0, 1], size=is_attributed_col.shape[0], p=[0.5, 0.5]) 10 | subsample_idx = np.where((is_attributed_col == 1) | ((is_attributed_col == 0) & (subsample == 1)))[0] 11 | 12 | print(subsample_idx.shape[0]) 13 | dfc.write_index('subsample_not_attributed_50pct_2', subsample_idx) 14 | -------------------------------------------------------------------------------- /junk/create_train_day_8_9_hour_4_5_9_10_13_14_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | 8 | day_col = dfc.load_column('day') 9 | hour_col = dfc.load_column('hour') 10 | is_attributed_col = dfc.load_column('is_attributed') 11 | 12 | hidx = (hour_col == 4) | (hour_col == 5) | (hour_col == 9) | (hour_col == 10) | (hour_col == 13) | (hour_col == 14) 13 | index = np.where((is_attributed_col >= 0) & (day_col > 7) & hidx)[0] 14 | 15 | dfc.write_index('days_8_9_hours_4_5_9_10_13_14_attributed', index) 16 | -------------------------------------------------------------------------------- /junk/create_train_day_8_9_index.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | 8 | day_col = dfc.load_column('day') 9 | is_attributed_col = dfc.load_column('is_attributed') 10 | index = np.where((is_attributed_col >= 0) & (day_col > 7))[0] 11 | 12 | dfc.write_index('days_8_9_attributed', index) 13 | -------------------------------------------------------------------------------- /junk/create_train_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | 8 | is_attributed_col = dfc.load_column('is_attributed') 9 | index = np.where((is_attributed_col >= 0))[0] 10 | 11 | print(index.shape[0]) 12 | dfc.write_index('train', index) 13 | -------------------------------------------------------------------------------- /junk/print_submission_days_hours.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | 8 | submission_idx = dfc.load_index('submission') 9 | day_col = dfc.load_column('day', index=submission_idx) 10 | hour_col = dfc.load_column('hour', index=submission_idx) 11 | 12 | print('Submission days:') 13 | days = np.unique(day_col, return_counts=True) 14 | for (d, c) in zip(days[0], days[1]): 15 | print(d, c) 16 | 17 | print('Submission hours') 18 | hours = np.unique(hour_col, return_counts=True) 19 | for (h, c) in zip(hours[0], hours[1]): 20 | print(h, c) 21 | 22 | # Submission days: 23 | # 10 18790469 24 | # Submission hours 25 | # 4 3344125 26 | # 5 2858427 27 | # 6 381 28 | # 9 2984808 29 | # 10 3127993 30 | # 11 413 31 | # 13 3212566 32 | # 14 3261257 33 | # 15 499 34 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stys/kaggle-talkingdata-adtracking-fraud-detection/cf2f2d838807f0044f48f7d261e00be184db669b/lib/__init__.py -------------------------------------------------------------------------------- /lib/columns.py: -------------------------------------------------------------------------------- 1 | from os.path import join, isfile 2 | 3 | import sys 4 | import ast 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from argparse import ArgumentParser 9 | 10 | 11 | class DataFrameCols(object): 12 | COL_EXT = '.bin' 13 | IDX_EXT = '.idx' 14 | META = 'meta' 15 | 16 | def __init__(self, workdir): 17 | self.workdir = workdir 18 | self.meta = DataFrameCols.read_meta(workdir) 19 | 20 | @staticmethod 21 | def read_meta(workdir): 22 | meta_file = join(workdir, DataFrameCols.META) 23 | if not isfile(meta_file): 24 | return {} 25 | else: 26 | with open(join(workdir, DataFrameCols.META), 'r') as fmeta: 27 | return ast.literal_eval(fmeta.read()) 28 | 29 | def _write_meta(self): 30 | with open(join(self.workdir, DataFrameCols.META), 'w') as fmeta: 31 | fmeta.write(str(self.meta)) 32 | 33 | def load_column(self, col, arange=None, index=None): 34 | arr = np.fromfile(join(self.workdir, col + DataFrameCols.COL_EXT), dtype=self.meta[col]) 35 | if arange is not None: 36 | 
start_index = arange[0]
37 |             end_index = arange[1]
38 |             return arr[start_index:end_index]
39 |         elif index is not None:
40 |             return arr[index]
41 |         else:
42 |             return arr
43 | 
44 |     def write_column(self, name, arr, arange=None, index=None):
45 |         if name in self.meta:
46 |             assert self.meta[name] == arr.dtype
47 |         else:
48 |             self.meta[name] = arr.dtype.str
49 | 
50 |         if arange is not None:
51 |             start_index = arange[0]
52 |             end_index = arange[1]
53 |             arr[start_index:end_index].tofile(join(self.workdir, name + DataFrameCols.COL_EXT))
54 |         else:
55 |             arr[index].tofile(join(self.workdir, name + DataFrameCols.COL_EXT))
56 | 
57 |         self._write_meta()
58 | 
59 |     def load_df(self, columns=None, arange=None, index=None):
60 |         data = dict()
61 |         columns = columns or self.meta.keys()
62 |         for col in columns:
63 |             data[col] = self.load_column(col, arange, index)
64 |         return pd.DataFrame(data=data)
65 | 
66 |     def write_df(self, df, arange=None, index=None):
67 |         for i, col in enumerate(df.columns):
68 |             self.write_column(col, df[col].values, arange, index)
69 | 
70 |     def load_index(self, name):
71 |         return np.fromfile(join(self.workdir, name + DataFrameCols.IDX_EXT), dtype=np.uint32)
72 | 
73 |     def write_index(self, name, arr):
74 |         arr.astype(np.uint32).tofile(join(self.workdir, name + DataFrameCols.IDX_EXT))
75 | 
76 | 
77 | if __name__ == '__main__':
78 |     parser = ArgumentParser()
79 |     parser.add_argument('path', default='.')
80 |     parser.add_argument('-f', '--fields', nargs='+', default=None)
81 |     parser.add_argument('--range-start', type=int, default=None)
82 |     parser.add_argument('--range-end', type=int, default=None)
83 |     parser.add_argument('--index', default=None)
84 |     args = parser.parse_args()
85 | 
86 |     dfc = DataFrameCols(args.path)
87 |     df = dfc.load_df(columns=args.fields, arange=(args.range_start, args.range_end))
88 |     df.to_csv(sys.stdout, header=True, index=False, sep='\t')
89 | 
--------------------------------------------------------------------------------
/lib/hocon.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from pyhocon import HOCONConverter
4 | 
5 | 
6 | def write_config(conf, filename, output_format):
7 |     lines = HOCONConverter.convert(conf, output_format=output_format, indent=4)
8 |     with open(filename, 'w') as fh:
9 |         fh.writelines(lines)
10 | 
11 | 
12 | def config2json(conf):
13 |     lines = HOCONConverter.convert(conf, indent=0)
14 |     return ''.join(lines).replace('\n', ' ')
--------------------------------------------------------------------------------
/lib/project.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import logging
3 | import re
4 | from collections import namedtuple
5 | from argparse import ArgumentParser
6 | from pyhocon import ConfigFactory, ConfigTree
7 | 
8 | logging.basicConfig(format='%(asctime)s %(levelname)s %(filename)s:%(lineno)d %(message)s', level=logging.DEBUG, datefmt='%Y-%m-%d %I:%M:%S')
9 | 
10 | Project = namedtuple('Project', ['conf'])
11 | instance = None
12 | 
13 | 
14 | def project(argv=sys.argv):
15 |     global instance
16 | 
17 |     if instance is not None:
18 |         return instance
19 |     else:
20 |         pattern = re.compile('-D(.*)=(.*)')
21 |         conf_override = dict()
22 |         argv_filtered = []
23 |         for a in argv:
24 |             m = pattern.match(a)
25 |             if m is not None:
26 |                 conf_override[m.group(1)] = m.group(2)
27 |             else:
28 |                 argv_filtered.append(a)
29 | 
30 |         parser = ArgumentParser()
31 |         parser.add_argument('--conf', default='application.conf')
32 |         args, other = parser.parse_known_args(argv_filtered)
33 | 
34 |         conf = ConfigFactory.parse_file(args.conf)
35 |         conf_override = ConfigFactory.from_dict(conf_override)
36 |         conf_merged = ConfigTree.merge_configs(conf, conf_override)
37 | 
38 |         instance = Project(conf=conf_merged)
39 | 
40 |     return instance
41 | 
--------------------------------------------------------------------------------
/lib/quality.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import numpy as np
4 | 
5 | 
6 | def reliability_curve(labels, predictions, nbins, sample_weights=None):
7 |     """ Reliability curve for binary classification tasks
8 |     Group samples into bins by value of predicted probability and compute empirical probability in each bin
9 | 
10 |     :param labels: true target values in {0, 1}
11 |     :param predictions: predicted probabilities in [0.0, 1.0]
12 |     :param nbins: number of bins
13 |     :param sample_weights: use the same sample weights that were used for training
14 |     :return:
15 |     """
16 | 
17 |     labels = np.array(labels)
18 |     predictions = np.array(predictions)
19 |     weights = sample_weights if sample_weights is not None else np.ones(len(labels))
20 | 
21 |     assert len(labels) == len(predictions)
22 |     assert len(labels) >= nbins
23 | 
24 |     ns = int(len(labels) / nbins)
25 |     rem = len(labels) - ns * nbins
26 | 
27 |     sort_idx = np.argsort(predictions)
28 |     count = np.zeros(nbins)
29 |     avg_pred = np.zeros(nbins)
30 |     avg_label = np.zeros(nbins)
31 |     weight_total = np.zeros(nbins)
32 | 
33 |     jbin = 0
34 |     for j, idx in enumerate(sort_idx):
35 |         avg_pred[jbin] += predictions[idx]
36 |         avg_label[jbin] += labels[idx] * weights[idx]
37 |         weight_total[jbin] += weights[idx]
38 |         count[jbin] += 1
39 |         if rem > 0 and count[jbin] == ns + 1:
40 |             jbin += 1
41 |             rem -= 1
42 |         elif rem == 0 and count[jbin] == ns:
43 |             jbin += 1
44 | 
45 |     return avg_label / weight_total, avg_pred / count
46 | 
--------------------------------------------------------------------------------
/lib/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from os import makedirs as os_makedirs
3 | import errno
4 | 
5 | 
6 | def makedirs(path):
7 |     try:
8 |         os_makedirs(path)
9 |     except OSError as e:
10 |         if e.errno != errno.EEXIST:
11 |             raise
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stys/kaggle-talkingdata-adtracking-fraud-detection/cf2f2d838807f0044f48f7d261e00be184db669b/models/__init__.py
--------------------------------------------------------------------------------
/models/catboost_.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import logging
4 | import json
5 | from os import chdir, getcwd
6 | from os.path import join as join_path, abspath
7 | from copy import deepcopy
8 | 
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.metrics import log_loss, roc_auc_score
11 | from hyperopt import fmin, hp, STATUS_OK, Trials, tpe
12 | 
13 | from catboost import CatBoostClassifier
14 | 
15 | from lib.project import project
16 | from lib.columns import DataFrameCols
17 | from lib.utils import makedirs
18 | from lib.hocon import write_config, config2json
19 | from lib.quality import reliability_curve
20 | 
21 | 
22 | def quality(labels, pred):
23 |     return dict(
24 |         ll=log_loss(labels, pred),
25 |         auc=roc_auc_score(labels, pred),
26 |         reliability=list(map(lambda x: x.tolist(), reliability_curve(labels, pred, nbins=100)))
27 |     )
28 | 
29 | 
30 | def train_catboost(train_df, valid_df, target, features, categorical_features, options):
31 |     logging.info('Training catboost with options: %s', options)
32 | 
33 |     cat_features = list(train_df[features].columns.get_loc(c) for c in categorical_features)
34 | 
35 |     model = CatBoostClassifier(**options)
36 |     model.fit(X=train_df[features].values, y=train_df[target].values, cat_features=cat_features,
37 |               eval_set=(valid_df[features].values, valid_df[target].values))
38 | 
39 |     model.save_model('model.bin')
40 | 
41 |     train_quality = quality(train_df[target].values, model.predict_proba(train_df[features].values)[:, 1])
42 |     logging.info('Train quality: %s', train_quality)
43 | 
44 |     valid_quality = quality(valid_df[target].values, model.predict_proba(valid_df[features].values)[:, 1])
45 |     logging.info('Validation quality: %s', valid_quality)
46 | 
47 |     return train_quality, valid_quality, model
48 | 
49 | 
50 | def get_hyperopt_objective(train_df, valid_df, target, features, categorical_features, catboost_options):
51 |     """ Construct hyperopt objective function """
52 |     hyperopt_trial = 0
53 | 
54 |     def hyperobj(params):
55 |         nonlocal hyperopt_trial
56 |         hyperopt_trial += 1
57 |         logging.info('Hyperopt trial %d, params=%s' % (hyperopt_trial, params))
58 | 
59 |         options = deepcopy(catboost_options)
60 |         for p in params:
61 |             options[p] = params[p]
62 | 
63 |         work_dir = getcwd()
64 |         trial_dir = abspath(join_path(work_dir, 'trial_%d' % hyperopt_trial))
65 |         makedirs(trial_dir)
66 |         chdir(trial_dir)
67 |         logging.info('Trial directory: %s', trial_dir)
68 | 
69 |         logging.info('Train catboost with options: %s' % config2json(options))
70 |         train_quality, valid_quality, model = train_catboost(
71 |             train_df, valid_df, target, features, categorical_features, options)
72 | 
73 |         model.save_model('model')
74 |         chdir(work_dir)
75 | 
76 |         return {
77 |             'loss': 1.0 - valid_quality['auc'],
78 |             'status': STATUS_OK,
79 |             'options': config2json(options),
80 |             'quality': {
81 |                 'train': train_quality,
82 |                 'valid': valid_quality
83 |             },
84 |             'model': {
85 |                 'file': join_path(trial_dir, 'model')
86 |             }
87 |         }
88 | 
89 |     return hyperobj
90 | 
91 | 
92 | def train_catboost_with_hyperopt(train_df, valid_df, target, features, categorical_features, catboost_options, hyperopt_options):
93 |     logging.info('Running hyper parameters optimization: %s', config2json(hyperopt_options))
94 | 
95 |     space = dict()
96 |     for param, opts in hyperopt_options['space'].items():
97 |         expression = getattr(hp, opts['expression'])
98 |         space[param] = expression(label=param, **opts['params'])
99 | 
100 |     fcn = get_hyperopt_objective(train_df, valid_df, target, features, categorical_features, catboost_options)
101 | 
102 |     trials = Trials()
103 |     opt = fmin(
104 |         fn=fcn,
105 |         space=space,
106 |         algo=tpe.suggest,
107 |         trials=trials,
108 |         max_evals=hyperopt_options['max_evals']
109 |     )
110 | 
111 |     with open('hyperopt_trials.json', 'w') as f:
112 |         json.dump(trials.results, f, indent=4)
113 | 
114 |     logging.info('Best parameters: %s', opt)
115 | 
116 |     best_trial, best_trial_result = min(enumerate(trials.results), key=lambda r: r[1]['loss'])
117 |     logging.info('Best model %d: AUC=%s, model=%s' % (
118 |         best_trial, best_trial_result['quality']['valid']['auc'], best_trial_result['model']['file']))
119 | 
120 |     best_model = CatBoostClassifier()
121 |
best_model.load_model(best_trial_result['model']['file']) 122 | return best_trial_result['quality']['train'], best_trial_result['quality']['valid'], best_model 123 | 124 | 125 | if __name__ == '__main__': 126 | conf = project().conf 127 | 128 | dump_dir = abspath(conf['catboost']['dump']['dir']) 129 | makedirs(dump_dir) 130 | 131 | write_config(conf, join_path(dump_dir, 'application.conf'), 'hocon') 132 | write_config(conf, join_path(dump_dir, 'application.json'), 'json') 133 | logging.getLogger().addHandler(logging.FileHandler(join_path(dump_dir, 'application.log'))) 134 | 135 | logging.info('Kaggle Talking Data') 136 | logging.info('Train Catboost') 137 | logging.info('Dump: %s', dump_dir) 138 | 139 | target = conf['catboost']['target'] 140 | features = conf['catboost']['features'] 141 | categorical_features = conf['catboost']['categorical_features'] 142 | logging.info('Target: %s', target) 143 | logging.info('Features: %s', config2json(features)) 144 | logging.info('Categorical features: %s', categorical_features) 145 | 146 | data_dir = abspath(conf['catboost']['data']['dir']) 147 | dfc = DataFrameCols(data_dir) 148 | 149 | train_index_name = conf['catboost']['data']['train']['index'] 150 | train_index = dfc.load_index(train_index_name) 151 | train_df = dfc.load_df(columns=[target] + features, index=train_index) 152 | train_df, valid_df = train_test_split(train_df, test_size=0.1) 153 | 154 | catboost_options = conf['catboost']['options'] 155 | logging.info('Using catboost options: %s', catboost_options) 156 | 157 | work_dir = getcwd() 158 | chdir(dump_dir) 159 | 160 | hyperopt_options = conf['catboost']['hyperopt'] 161 | if hyperopt_options['enabled']: 162 | train_quality, valid_quality, model = train_catboost_with_hyperopt(train_df, valid_df, target, features, categorical_features, catboost_options, hyperopt_options) 163 | else: 164 | train_quality, valid_quality, model = train_catboost(train_df, valid_df, target, features, categorical_features, catboost_options) 165 | 166 | chdir(work_dir) 167 | 168 | valid_pred = model.predict_proba(valid_df[features].values)[:, 1] 169 | valid_quality = quality(valid_df[target].values, valid_pred) 170 | logging.info('Cross-check best model validation score: AUC=%s' % valid_quality['auc']) 171 | 172 | # load model 173 | # model = CatBoostClassifier() 174 | # model.load_model(join_path(dump_dir, 'model.bin')) 175 | 176 | test_index_name = conf['catboost']['data']['test']['index'] 177 | test_index = dfc.load_index(test_index_name) 178 | test_df = dfc.load_df(columns=features + ['click_id_submission'], index=test_index) 179 | test_df['is_attributed'] = model.predict_proba(test_df[features].values)[:, 1] 180 | test_df = test_df[['click_id_submission', 'is_attributed']].rename(columns={'click_id_submission': 'click_id'}) 181 | test_df.sort_values(by='click_id', inplace=True) 182 | test_df.to_csv(join_path(dump_dir, 'submission.csv'), header=True, index=False) 183 | -------------------------------------------------------------------------------- /models/libffm.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import logging 3 | import subprocess 4 | import pickle 5 | import csv 6 | 7 | from os import chdir 8 | from os.path import abspath, join as join_path 9 | 10 | import numpy as np 11 | 12 | from sklearn.model_selection import StratifiedKFold 13 | from sklearn.metrics import roc_auc_score 14 | 15 | from lib.project import project 16 | from lib.columns import DataFrameCols 17 | from lib.utils import 
makedirs 18 | 19 | 20 | def write_libffm_data(df, target, fields, shifts): 21 | df['data'] = df[target].astype(str) 22 | for k, v in shifts.items(): 23 | print(k) 24 | df['data'] += ' %d:' % fields[k] 25 | df['data'] += (df[k] + v).astype(str) 26 | df['data'] += ':1' 27 | df.drop(columns=[k], inplace=True) 28 | gc.collect() 29 | return df 30 | 31 | 32 | def main(conf): 33 | dump_dir = abspath(conf['libffm']['dump']['dir']) 34 | makedirs(dump_dir) 35 | 36 | data_dir = abspath(conf['libffm']['data']['dir']) 37 | dfc = DataFrameCols(data_dir) 38 | 39 | target = 'is_attributed' 40 | fields = {'ip': 0, 'app': 1, 'device': 2, 'os': 3, 'channel': 4} 41 | shifts = {'ip': 0, 'app': 364779, 'device': 365548, 'os': 369776, 'channel': 370733} 42 | 43 | # 1) write test data 44 | # logging.info('Writing test data in libffm format') 45 | # df = dfc.load_df(columns=['id', target] + list(fields.keys())) 46 | # df = df[df[target] == -1] 47 | # df[target] = 0 # do we need this? 48 | # df = write_libffm_data(df, target, fields, shifts) 49 | test_fname = join_path(dump_dir, 'test.txt') 50 | # df[['data']].to_csv(test_fname, header=False, index=False, quoting=csv.QUOTE_NONE) 51 | # del df 52 | # gc.collect() 53 | # exit() 54 | 55 | # 2) write training folds 56 | # logging.info('Writing k-fold training data') 57 | # df = dfc.load_df(columns=['id', target] + list(fields.keys())) 58 | # df = df[df[target] >= 0] 59 | # df = write_libffm_data(df, target, fields, shifts) 60 | # 61 | # folds = [] 62 | # skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337) 63 | # for fold_idx, valid_idx in skf.split(df['id'].values, df[target].values): 64 | # folds.append((fold_idx, valid_idx)) 65 | # 66 | # with open(join_path(dump_dir, 'folds.pkl'), 'wb') as f: 67 | # pickle.dump(folds, f) 68 | # 69 | # for j_fold, (fold_idx, valid_idx) in enumerate(folds): 70 | # logging.info('Writing fold %d in libffm format', j_fold) 71 | # train_fname = join_path(dump_dir, 'train_fold_%d.txt' % j_fold) 72 | # df.loc[fold_idx, ['data']].to_csv(train_fname, header=False, index=False, quoting=csv.QUOTE_NONE) 73 | # valid_fname = join_path(dump_dir, 'valid_fold_%d.txt' % j_fold) 74 | # df.loc[valid_idx, ['data']].to_csv(valid_fname, header=False, index=False, quoting=csv.QUOTE_NONE) 75 | # 76 | # del df 77 | # gc.collect() 78 | # exit() 79 | 80 | df = dfc.load_df(columns=['id', target]) 81 | df = df[df[target] >= 0] 82 | 83 | with open(join_path(dump_dir, 'folds.pkl'), 'rb') as f: 84 | folds = pickle.load(f) 85 | 86 | chdir(dump_dir) 87 | for j_fold, (fold_idx, valid_idx) in enumerate(folds): 88 | logging.info('Training on fold %d', j_fold) 89 | train_fname = join_path(dump_dir, 'train_fold_%d.txt' % j_fold) 90 | valid_fname = join_path(dump_dir, 'valid_fold_%d.txt' % j_fold) 91 | model_fname = join_path(dump_dir, 'model_%d.bin' % j_fold) 92 | proc = subprocess.run([ 93 | 'ffm-train', 94 | '-p', valid_fname, 95 | '-l', str(conf['libffm']['options']['lambda']), 96 | '-k', str(conf['libffm']['options']['factor']), 97 | '-r', str(conf['libffm']['options']['learning_rate']), 98 | '-t', str(conf['libffm']['options']['num_iter']), 99 | train_fname, 100 | model_fname 101 | ], stdout=subprocess.PIPE, check=True) 102 | 103 | logging.info('Running command %s', ' '.join(proc.args)) 104 | logging.info('Process return code %d', proc.returncode) 105 | logging.info(proc.stdout.decode('utf-8')) 106 | 107 | train_pred_file = join_path(dump_dir, 'train_pred_%d.txt' % j_fold) 108 | proc = subprocess.run([ 109 | 'ffm-predict', 110 | train_fname, 111 | 
model_fname, 112 | train_pred_file 113 | ], stdout=subprocess.PIPE, check=True) 114 | 115 | logging.info('Running command %s', ' '.join(proc.args)) 116 | logging.info('Process return code %d', proc.returncode) 117 | 118 | with open(train_pred_file, 'r') as f: 119 | p_train = np.array([float(s) for s in f.readlines()], dtype=np.float32) 120 | auc_train = roc_auc_score(df.loc[fold_idx, target].values, p_train) 121 | 122 | valid_pred_file = join_path(dump_dir, 'valid_pred_%d.txt' % j_fold) 123 | proc = subprocess.run([ 124 | 'ffm-predict', 125 | valid_fname, 126 | model_fname, 127 | valid_pred_file 128 | ], stdout=subprocess.PIPE, check=True) 129 | 130 | logging.info('Running command %s', ' '.join(proc.args)) 131 | logging.info('Process return code %d', proc.returncode) 132 | 133 | with open(valid_pred_file, 'r') as f: 134 | p_valid = np.array([float(s) for s in f.readlines()], dtype=np.float32) 135 | auc_valid = roc_auc_score(df.loc[valid_idx, target].values, p_valid) 136 | 137 | logging.info('Fold quality: auc_train=%f auc_valid=%f', auc_train, auc_valid) 138 | 139 | test_pred_file = join_path(dump_dir, 'test_pred_%d.txt' % j_fold) 140 | proc = subprocess.run([ 141 | 'ffm-predict', 142 | test_fname, 143 | model_fname, 144 | test_pred_file 145 | ], stdout=subprocess.PIPE, check=True) 146 | 147 | logging.info('Running command %s', ' '.join(proc.args)) 148 | logging.info('Process return code %d', proc.returncode) 149 | 150 | 151 | if __name__ == '__main__': 152 | main(project().conf) 153 | -------------------------------------------------------------------------------- /models/lightgbm_.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import logging 3 | from os.path import abspath, join as join_path 4 | 5 | import pandas as pd 6 | 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import log_loss, roc_auc_score 9 | 10 | import lightgbm as lgb 11 | 12 | from lib.project import project 13 | from lib.columns import DataFrameCols 14 | from lib.utils import makedirs 15 | from lib.hocon import write_config, config2json 16 | from lib.quality import reliability_curve 17 | 18 | 19 | def quality(labels, pred): 20 | return dict( 21 | ll=log_loss(labels, pred), 22 | auc=roc_auc_score(labels, pred), 23 | reliability=list(map(lambda x: x.tolist(), reliability_curve(labels, pred, nbins=100))) 24 | ) 25 | 26 | 27 | def train_lightgbm(params, train_dataset, valid_dataset=None, **options): 28 | logging.info('Training LightGBM with params: %s', config2json(params)) 29 | if valid_dataset is not None: 30 | model = lgb.train(params, train_dataset, valid_sets=[train_dataset, valid_dataset], **options) 31 | else: 32 | model = lgb.train(params, train_dataset, valid_sets=[train_dataset], **options) 33 | return model 34 | 35 | 36 | def main(conf): 37 | dump_dir = conf['lightgbm']['dump']['dir'] 38 | makedirs(dump_dir) 39 | 40 | write_config(conf, join_path(dump_dir, 'application.conf'), 'hocon') 41 | write_config(conf, join_path(dump_dir, 'application.json'), 'json') 42 | logging.getLogger().addHandler(logging.FileHandler(join_path(dump_dir, 'application.log'))) 43 | 44 | logging.info('Kaggle Talking Data') 45 | 46 | label = conf['lightgbm']['label'] 47 | features = conf['lightgbm']['features'] 48 | categorical_features = conf['lightgbm']['categorical_features'] 49 | logging.info('Label: %s', label) 50 | logging.info('Features: %s', features) 51 | logging.info('Categorical features: %s', categorical_features) 52 | 53 | data_dir = 
abspath(conf['lightgbm']['data']['dir']) 54 | dfc = DataFrameCols(data_dir) 55 | train_index_name = conf['lightgbm']['data']['train']['index'] 56 | train_index = dfc.load_index(train_index_name) 57 | 58 | df = dfc.load_df(columns=[label] + features, index=train_index) 59 | 60 | if conf['lightgbm']['valid_size'] > 0: 61 | train_df, valid_df = train_test_split(df, test_size=conf['lightgbm']['valid_size']) 62 | 63 | train_dataset = lgb.Dataset(data=train_df[features].values, label=train_df[label].values, feature_name=features, 64 | categorical_feature=categorical_features) 65 | valid_dataset = lgb.Dataset(data=valid_df[features].values, label=valid_df[label].values, feature_name=features, 66 | categorical_feature=categorical_features) 67 | 68 | del train_df 69 | del valid_df 70 | gc.collect() 71 | else: 72 | train_dataset = lgb.Dataset(data=df[features].values, label=df[label].values, feature_name=features, 73 | categorical_feature=categorical_features) 74 | valid_dataset = None 75 | 76 | params = conf['lightgbm']['params'] 77 | options = conf['lightgbm']['options'] 78 | model = train_lightgbm(params, train_dataset, valid_dataset, **options) 79 | model.save_model(join_path(dump_dir, 'model.bin')) 80 | del train_dataset 81 | del valid_dataset 82 | gc.collect() 83 | 84 | # load model 85 | # model = lgb.Booster(model_file=join_path(dump_dir, 'model.bin')) 86 | 87 | # train_label = train_df[label].values 88 | # train_pred = model.predict(train_df[features]) 89 | # train_quality = quality(train_label, train_pred) 90 | # logging.info('Train quality: %s', train_quality) 91 | # 92 | # valid_label = valid_df[label].values 93 | # valid_pred = model.predict(valid_df[features]) 94 | # valid_quality = quality(valid_label, valid_pred) 95 | # logging.info('Valid quality: %s', valid_quality) 96 | 97 | test_index_name = conf['lightgbm']['data']['test']['index'] 98 | test_index = dfc.load_index(test_index_name) 99 | test_df = dfc.load_df(columns=features + ['click_id_submission'], index=test_index) 100 | test_df['is_attributed'] = model.predict(test_df[features]) 101 | test_df = test_df[['click_id_submission', 'is_attributed']].rename(columns={'click_id_submission': 'click_id'}) 102 | test_df.sort_values(by='click_id', inplace=True) 103 | test_df.to_csv(join_path(dump_dir, 'submission.csv'), header=True, index=False) 104 | 105 | gain = model.feature_importance('gain') 106 | ft = pd.DataFrame({ 107 | 'feature': model.feature_name(), 108 | 'split': model.feature_importance('split'), 109 | 'gain': 100 * gain / gain.sum()} 110 | ).sort_values('gain', ascending=False) 111 | ft.to_csv(join_path(dump_dir, 'feature_strength.csv'), header=True, index=False, sep='\t') 112 | 113 | if __name__ == '__main__': 114 | main(project().conf) 115 | -------------------------------------------------------------------------------- /preprocessing/merge_test_sets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Create normalized dataset\n", 10 | "# - duplicates https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/52752\n", 11 | "# - join test_supplement.csv and test.csv\n", 12 | "# - sort by click_time" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 
27 | "execution_count": 20, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "DATA_DIR = '../data/mnt/ssd/kaggle-talkingdata2/competition_files'\n", 32 | "TRAIN_SAMPLE_FILE = DATA_DIR + '/train_sample.csv'\n", 33 | "TRAIN_FILE = DATA_DIR + '/train.csv'\n", 34 | "TEST_FILE = DATA_DIR + '/test.csv'\n", 35 | "TEST_SUPPLEMENT_FILE = DATA_DIR + '/test_supplement.csv'\n", 36 | "TEST_JOINED_FILE = DATA_DIR + '/test_joined.csv'" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# test_supplement.csv\n", 46 | "df_test_supplement = pd.read_csv(TEST_SUPPLEMENT_FILE)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# test.csv: this is a subset of test_supplement.csv which is used to score submissions\n", 56 | "df_test = pd.read_csv(TEST_FILE)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 6, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# join test_supplement.csv and test.csv\n", 66 | "df_test_joined = df_test_supplement.merge(df_test, how='left', on=['ip', 'app', 'device', 'os', 'channel', 'click_time'], suffixes=['', '_submission'])" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 7, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# remove extra duplicates\n", 76 | "# note: pandas consider duplicates rows that are identical even if there are nans in some columns\n", 77 | "duplicated_idx = df_test_joined.duplicated(subset=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id_submission'], keep='first')\n", 78 | "df_test_joined_dedup = df_test_joined[(~ duplicated_idx) | (df_test_joined['click_id_submission'].isnull())]" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 8, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# check that all subsmission clicks are preserved after join and remove of extra duplicates\n", 88 | "df_test_joined_dedup['click_id_submission'].value_counts().sum()\n", 89 | "assert df_test.shape[0] == df_test_joined_dedup['click_id_submission'].value_counts().shape[0]\n", 90 | "assert df_test.shape[0] == df_test_joined_dedup['click_id_submission'].value_counts().sum()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 9, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "33\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "# note: a small number of events from test_supplement is lost after deduplications. 
\n", 108 | "# Assuming these were events in test_supplement.csv which were not present in test.csv but still were duplicates of events from test.csv\n", 109 | "print(df_test_supplement.shape[0] - df_test_joined_dedup.shape[0])" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 10, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "del(df_test_supplement)\n", 119 | "del(df_test)\n", 120 | "del(df_test_joined)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 11, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "df_test_joined_dedup.sort_values(by=['click_time', 'ip', 'app', 'device', 'os', 'channel'], inplace=True)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 16, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "df_test_joined_dedup['click_id_submission'] = df_test_joined_dedup['click_id_submission'].fillna(value=-1).astype(int)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 21, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "df_test_joined_dedup.to_csv(TEST_JOINED_FILE, index=False)" 148 | ] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": "Python [conda env:kaggle-talking-data]", 154 | "language": "python", 155 | "name": "conda-env-kaggle-talking-data-py" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3.0 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": "text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.6.4" 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 0 172 | } --------------------------------------------------------------------------------