├── .gitignore
├── README.md
├── __init__.py
├── application.conf
├── factors
│   ├── __init__.py
│   └── factors_pd.py
├── junk
│   ├── __init__.py
│   ├── blend_submissions.py
│   ├── collect_libffm_predictions.py
│   ├── compute_libffm_auc.py
│   ├── convert_tsv_to_columns.py
│   ├── create_days_index.py
│   ├── create_submission_index.py
│   ├── create_subsample_index.py
│   ├── create_train_day_8_9_hour_4_5_9_10_13_14_index.py
│   ├── create_train_day_8_9_index.py
│   ├── create_train_index.py
│   └── print_submission_days_hours.py
├── lib
│   ├── __init__.py
│   ├── columns.py
│   ├── hocon.py
│   ├── project.py
│   ├── quality.py
│   └── utils.py
├── models
│   ├── __init__.py
│   ├── catboost_.py
│   ├── libffm.py
│   └── lightgbm_.py
└── preprocessing
    ├── eda.ipynb
    └── merge_test_sets.ipynb

/.gitignore:
--------------------------------------------------------------------------------
1 | submissions
2 | .idea
3 | 
4 | .DS_Store
5 | 
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | 
11 | # C extensions
12 | *.so
13 | 
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 | 
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 | 
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 | 
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | .hypothesis/
52 | .pytest_cache/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | 
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 | 
67 | # Scrapy stuff:
68 | .scrapy
69 | 
70 | # Sphinx documentation
71 | docs/_build/
72 | 
73 | # PyBuilder
74 | target/
75 | 
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 | 
79 | # pyenv
80 | .python-version
81 | 
82 | # celery beat schedule file
83 | celerybeat-schedule
84 | 
85 | # SageMath parsed files
86 | *.sage.py
87 | 
88 | # Environments
89 | .env
90 | .venv
91 | env/
92 | venv/
93 | ENV/
94 | env.bak/
95 | venv.bak/
96 | 
97 | # Spyder project settings
98 | .spyderproject
99 | .spyproject
100 | 
101 | # Rope project settings
102 | .ropeproject
103 | 
104 | # mkdocs documentation
105 | /site
106 | 
107 | # mypy
108 | .mypy_cache/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Kaggle TalkingData Ad Tracking Fraud Detection Challenge
2 | 
3 | Scripts for the competition
4 | https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection
5 | 
6 | Final ranking #55 with private leaderboard score 0.9824250
7 | 
8 | * Aggregate factors from Baris' kernel computed on all data with test_supplement
9 | * Prev/next click times (1-, 2-, 3-step) computed on all data with test_supplement
10 | * LIBFFM out-of-fold using 5 folds
11 | * LGBM on 50% of the data (using all is_attributed=1 and 50% of is_attributed=0)
12 | * Average of a few LGBM models trained on different subsets of data and factors
13 | * Training with 96GB RAM
14 | * Store data column-wise in binary formats for fast loading
15 | * HOCON configurations are awesome
16 | 
17 | EDA [preprocessing/eda.ipynb](preprocessing/eda.ipynb) - distributions of
frequencies in train.csv 18 | 19 | Merge test datasets [preprocessing/merge_test_sets.ipynb](preprocessing/merge_test_sets.ipynb) - script for merging test datasets 20 | test.csv 21 | test_supplement.csv 22 | 23 | Baris kernel 24 | https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977?scriptVersionId=3224614 25 | https://www.kaggle.com/aharless/kaggle-runnable-version-of-baris-kanber-s-lightgbm 26 | 27 | Next/prev clicks without hashing trick 28 | https://www.kaggle.com/asydorchuk/nextclick-calculation-without-hashing-trick 29 | 30 | Submissions 31 | 32 | model | dump | train_set | params | val AUC | lb AUC | factors 33 | --- | --- | --- | --- | --- | --- | --- 34 | lgbm | lgbm_08 | d_8_9_h_4_5_9_... | it=250 lr=0.2 md=3 nl=7 scw=300 | 0.983xx | 0.9795 | baris 35 | lgbm | lgbm_09 | na_10pct | it=500 lr=0.1 md=3 nl=7 scw=na | 0.98498 | 0.9803 | baris 36 | lgbm | lgbm_11 | d_8_9_h_4_5_9_... | it=2000 lr=0.01 md=3 nl=7 scw=300 | 0.98332 | 0.9791 | baris 37 | cbst | cbst_09 | na_10pct | it=500 lr=0.05 md=6 rms=0.7 | 0.98221 | 0.9766 | baris 38 | lgbm | lgbm_12 | na_10pct | it=500 lr=0.1 md=3 nl=7 scw=na | 0.98564 | 0.9806 | baris + t2 39 | lgbm | lgbm_13 | na_10pct | it=1000 lr=0.1 md=3 nl=7 scw=na | 0.98610 | 0.9810 | baris + t2 40 | lgbm | lgbm_14 | na_10pct | it=1000 lr=0.1 md=3 nl=7 scw=na | 0.9855x | 0.9810 | baris + t2 + t3 41 | lgbm | lgbm_15 | na_10pct | it=1370 lr=0.1 md=3 nl=7 scw=na | 0.98567 | 0.9811 | baris + t2 + t3 42 | lgbm | lgbm_16 | na_10pct | it=1360 lr=0.1 md=3 nl=7 scw=na | 0.98544 | 0.9809 | baris + t2 + libffm 43 | lgbm | lgbm_17 | na_10pct | it=820 lr=0.1 md=4 nl=15 scw=na | 0.98580 | 0.9810 | baris + t2 + libffm 44 | blend | | | logit, weights=1.0 | - | 0.9812 | blend lgbm_13..lgbm_17 45 | lgbm | lgbm_18 | na_20pct | it=1500 lr=0.1 md=3 nl=7 scw=na | 0.98571 | 0.9808 | baris + t2 + libffm + tc2 46 | lgbm | lgbm_19 | na_50pct | it=2500 lr=0.1 md=3 nl=7 scw=na | 0.98595 | 0.9811 | baris + t2 47 | blend | | | logit, weights=1.0 | - | 0.9813 | blend lgbm_15 + lgbm_19 48 | lgbm | lgbm_19 | na_50pct | it=1500, all attributed | - | 0.9810 | baris + t2 + libffm 49 | lgbm | lgbm_20 | na_50pct_2 | it=1500, all attributed | - | 0.9810 | baris + t2 + libffm 50 | blend | | | logit, weights=1.0 | - | 0.9813 | blend lgbm_13..lgbm_20 51 | 52 | Factors strength (lgbm_19) 53 | 54 | feature | gain | split 55 | --- | --- | --- 56 | libffm_oof | 82.34410715832054 | 363 57 | ip_app_device_os_t_next | 4.858614532478393 | 555 58 | app | 4.285458537390102 | 1057 59 | ip_nunique_channel | 1.7134587347643337 | 173 60 | channel | 1.2185210564277669 | 2220 61 | os | 0.9917807676129159 | 1516 62 | hour | 0.8615137822058627 | 1078 63 | ip_nunique_app | 0.5879513478431967 | 157 64 | ip_nunique_device | 0.5397789939897564 | 190 65 | ip_day_hour_count | 0.461685682523349 | 191 66 | ip_app_count | 0.42650563120670515 | 99 67 | ip_app_device_os_t_next_2 | 0.35302018803960517 | 194 68 | ip_device_os_nunique_app | 0.3238393490783555 | 198 69 | ip_day_nunique_hour | 0.30262413717998854 | 76 70 | ip_app_os_count | 0.2384258533635993 | 84 71 | ip_device_os_cumcount_app | 0.11690381200637338 | 58 72 | ip_app_device_os_t_prev | 0.07133512725949863 | 95 73 | device | 0.06228725573269818 | 70 74 | ip_app_nunique_os | 0.06172545078968752 | 141 75 | ip_cumcount_os | 0.05417631528656273 | 44 76 | ip_app_channel_mean_hour | 0.04849352774733603 | 144 77 | day | 0.017391508495188064 | 61 78 | app_nunique_channel | 0.014128035383901352 | 35 79 | ip_app_os_var_hour | 0.01325167381501645 | 52 80 | 
ip_app_device_os_t_prev_2 | 0.012127423364797763 | 60 81 | ip_app_channel_var_day | 0.011027400971738753 | 39 82 | ip_day_channel_var_hour | 0.009866716722734953 | 50 83 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stys/kaggle-talkingdata-adtracking-fraud-detection/cf2f2d838807f0044f48f7d261e00be184db669b/__init__.py -------------------------------------------------------------------------------- /application.conf: -------------------------------------------------------------------------------- 1 | data = { 2 | dir = ../data 3 | files = { 4 | train = train.csv 5 | test = test.csv 6 | test_supplement = test_supplement.csv 7 | test_merged = test_merged.csv 8 | } 9 | ntrain = 184903890 10 | } 11 | 12 | factors = { 13 | # hash_id = { 14 | # ip_app_device_os_hash = { 15 | # columns = [ip, app, device, os] 16 | # groupby = [ip, app, device, os] 17 | # num_bits = 27 18 | # dtype = uint32 19 | # } 20 | # } 21 | 22 | click_time_no_hash = { 23 | ip_app_device_os_t_prev = { 24 | columns = [ip, app, device, os, epoch] 25 | groupby = [ip, app, device, os] 26 | reverse = false 27 | dtype = uint32 28 | }, 29 | ip_app_device_os_t_next = { 30 | columns = [ip, app, device, os, epoch] 31 | groupby = [ip, app, device, os] 32 | reverse = true 33 | dtype = uint32 34 | }, 35 | ip_app_device_os_t_prev_2 = { 36 | columns = [ip, app, device, os, epoch] 37 | groupby = [ip, app, device, os] 38 | reverse = false 39 | step = 2 40 | dtype = uint32 41 | }, 42 | ip_app_device_os_t_next_2 = { 43 | columns = [ip, app, device, os, epoch] 44 | groupby = [ip, app, device, os] 45 | reverse = true 46 | step = 2 47 | dtype = uint32 48 | } 49 | ip_app_device_os_t_prev_3 = { 50 | columns = [ip, app, device, os, epoch] 51 | groupby = [ip, app, device, os] 52 | reverse = false 53 | step = 3 54 | dtype = uint32 55 | }, 56 | ip_app_device_os_t_next_3 = { 57 | columns = [ip, app, device, os, epoch] 58 | groupby = [ip, app, device, os] 59 | reverse = true 60 | step = 3 61 | dtype = uint32 62 | } 63 | ip_app_device_os_channel_t_prev = { 64 | columns = [ip, app, device, os, channel, epoch] 65 | groupby = [ip, app, device, os, channel] 66 | reverse = false 67 | dtype = uint32 68 | }, 69 | ip_app_device_os_channel_t_next = { 70 | columns = [ip, app, device, os, channel, epoch] 71 | groupby = [ip, app, device, os, channel] 72 | reverse = true 73 | dtype = uint32 74 | }, 75 | } 76 | 77 | aggr = { 78 | ip_nunique_channel = { 79 | columns = [ip, channel] 80 | groupby = [ip] 81 | select = channel 82 | aggr = nunique 83 | dtype = uint8 84 | }, 85 | ip_device_os_cumcount_app = { 86 | columns = [ip, device, os, app] 87 | groupby = [ip, device, os] 88 | select = app 89 | aggr = cumcount 90 | dtype = uint32 91 | }, 92 | ip_day_nunique_hour = { 93 | columns = [ip, day, hour] 94 | groupby = [ip, day] 95 | select = hour 96 | aggr = nunique 97 | dtype = uint32 98 | }, 99 | ip_nunique_app = { 100 | columns = [ip, app] 101 | groupby = [ip] 102 | select = app 103 | aggr = nunique 104 | dtype = uint8 105 | }, 106 | ip_app_nunique_os = { 107 | columns = [ip, app, os] 108 | groupby = [ip, app] 109 | select = os 110 | aggr = nunique 111 | dtype = uint8 112 | }, 113 | ip_nunique_device = { 114 | columns = [ip, device] 115 | groupby = [ip] 116 | select = device 117 | aggr = nunique 118 | dtype = uint16 119 | }, 120 | app_nunique_channel = { 121 | columns = [app, channel] 122 | groupby = [app] 123 | 
select = channel 124 | aggr = nunique 125 | dtype = uint32 126 | }, 127 | ip_cumcount_os = { 128 | columns = [ip, os] 129 | groupby = [ip] 130 | select = os 131 | aggr = cumcount 132 | dtype = uint32 133 | }, 134 | ip_device_os_nunique_app = { 135 | columns = [ip, device, os, app] 136 | groupby = [ip, device, os] 137 | select = app 138 | aggr = nunique 139 | dtype = uint32 140 | }, 141 | ip_day_hour_count = { 142 | columns = [ip, day, hour, channel] 143 | groupby = [ip, day, hour] 144 | select = channel 145 | aggr = count 146 | dtype = uint32 147 | }, 148 | ip_app_count = { 149 | columns = [ip, app, channel] 150 | groupby = [ip, app] 151 | select = channel 152 | aggr = count 153 | dtype = uint32 154 | }, 155 | ip_app_os_count = { 156 | columns = [ip, app, os, channel] 157 | groupby = [ip, app, os] 158 | select = channel 159 | aggr = count 160 | dtype = uint32 161 | }, 162 | ip_day_channel_var_hour = { 163 | columns = [ip, day, channel, hour] 164 | groupby = [ip, day, channel] 165 | select = hour 166 | aggr = var 167 | dtype = float32 168 | }, 169 | ip_app_os_var_hour = { 170 | columns = [ip, app, os, hour] 171 | groupby = [ip, app, os] 172 | select = hour 173 | aggr = var 174 | dtype = float32 175 | }, 176 | ip_app_channel_var_day = { 177 | columns = [ip, app, channel, day] 178 | groupby = [ip, app, channel] 179 | select = day 180 | aggr = var 181 | dtype = float32 182 | }, 183 | ip_app_channel_mean_hour = { 184 | columns = [ip, app, channel, hour] 185 | groupby = [ip, app, channel] 186 | select = hour 187 | aggr = mean 188 | dtype = float32 189 | } 190 | } 191 | } 192 | 193 | factors_pd = { 194 | dump = { 195 | dir = "../dumps/factors_baris_04/" 196 | } 197 | source = "../data/columns" 198 | factors = ${factors} 199 | } 200 | 201 | catboost = { 202 | dump = { 203 | dir = "../dumps/catboost_10" 204 | } 205 | 206 | data = { 207 | dir = "../data/columns" 208 | train = { 209 | index = subsample_not_attributed_10pct 210 | } 211 | test = { 212 | index = submission 213 | } 214 | } 215 | 216 | target = is_attributed 217 | 218 | features = [ 219 | app, 220 | device, 221 | os, 222 | channel, 223 | hour, 224 | day, 225 | ip_app_device_os_t_next, 226 | ip_app_device_os_t_prev, 227 | ip_app_device_os_t_next_2, 228 | ip_app_device_os_t_prev_2, 229 | ip_nunique_channel, 230 | ip_device_os_cumcount_app, 231 | ip_day_nunique_hour, 232 | ip_nunique_app, 233 | ip_app_nunique_os, 234 | ip_nunique_device, 235 | app_nunique_channel, 236 | ip_cumcount_os, 237 | ip_device_os_nunique_app, 238 | ip_day_hour_count, 239 | ip_app_count, 240 | ip_app_os_count, 241 | ip_day_channel_var_hour, 242 | ip_app_os_var_hour, 243 | ip_app_channel_var_day, 244 | ip_app_channel_mean_hour 245 | ] 246 | 247 | categorical_features = [ 248 | app, 249 | device, 250 | os, 251 | channel, 252 | hour, 253 | day 254 | ] 255 | 256 | options = { 257 | eval_metric = AUC 258 | learning_rate = 0.1 259 | iterations = 500 260 | depth = 6 261 | rsm=0.7 262 | simple_ctr = [Counter, Counter, Counter, Counter, Counter, Counter] 263 | } 264 | 265 | hyperopt = { 266 | enabled = false 267 | max_evals = 25 268 | space = { 269 | l2_leaf_reg = { 270 | expression = loguniform, 271 | params = { 272 | low = 3.0 273 | high = 5.0 274 | } 275 | } 276 | learning_rate = { 277 | expression = loguniform, 278 | params = { 279 | low = -5.0 280 | high = -0.5 281 | } 282 | } 283 | } 284 | } 285 | } 286 | 287 | lightgbm = { 288 | dump = { 289 | dir = "../dumps/lightgbm_20" 290 | } 291 | 292 | data = { 293 | dir = "../data/columns" 294 | train = { 295 | index = 
subsample_not_attributed_50pct_2 296 | # index = subsample_not_attributed_50pct 297 | # index = subsample_not_attributed_10pct 298 | # index = days_8_9_hours_4_5_9_10_13_14_attributed 299 | } 300 | test = { 301 | index = submission 302 | } 303 | } 304 | 305 | label = is_attributed 306 | 307 | features = [ 308 | app, 309 | device, 310 | os, 311 | channel, 312 | hour, 313 | day, 314 | ip_app_device_os_t_next, 315 | ip_app_device_os_t_prev, 316 | ip_app_device_os_t_prev_2, 317 | ip_app_device_os_t_next_2, 318 | ip_nunique_channel, 319 | ip_device_os_cumcount_app, 320 | ip_day_nunique_hour, 321 | ip_nunique_app, 322 | ip_app_nunique_os, 323 | ip_nunique_device, 324 | app_nunique_channel, 325 | ip_cumcount_os, 326 | ip_device_os_nunique_app, 327 | ip_day_hour_count, 328 | ip_app_count, 329 | ip_app_os_count, 330 | ip_day_channel_var_hour, 331 | ip_app_os_var_hour, 332 | ip_app_channel_var_day, 333 | ip_app_channel_mean_hour, 334 | libffm_oof 335 | ] 336 | 337 | categorical_features = [ 338 | app, 339 | device, 340 | os, 341 | channel, 342 | hour, 343 | day 344 | ] 345 | 346 | params = { 347 | objective = binary 348 | metric = auc 349 | learning_rate = 0.1 350 | max_depth = 3 351 | num_leaves = 7 352 | min_child_samples = 100 353 | min_child_weight = 0 354 | max_bin = 100 355 | subsample = 0.7 356 | subsample_freq = 1 357 | colsample_bytree = 0.9 358 | scale_pos_weight = 100.0 359 | early_stopping_rounds = 1000 360 | num_threads = 15 361 | } 362 | 363 | options = { 364 | num_boost_round = 1500 365 | verbose_eval = 10 366 | } 367 | 368 | valid_size = 0 369 | } 370 | 371 | libffm = { 372 | dump = { 373 | dir = "../dumps/libffm_00" 374 | } 375 | 376 | data = { 377 | dir = "../data/columns" 378 | } 379 | 380 | options = { 381 | learning_rate = "0.1" 382 | factor = "12" 383 | lambda = "0.00002" 384 | num_iter = "2" 385 | } 386 | } 387 | 388 | -------------------------------------------------------------------------------- /factors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stys/kaggle-talkingdata-adtracking-fraud-detection/cf2f2d838807f0044f48f7d261e00be184db669b/factors/__init__.py -------------------------------------------------------------------------------- /factors/factors_pd.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import logging 3 | from os.path import abspath, join as join_path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from lib.project import project 9 | from lib.columns import DataFrameCols 10 | from lib.utils import makedirs 11 | 12 | 13 | class Factors(object): 14 | 15 | def datetimes(self, df): 16 | df['day'] = pd.to_datetime(df['click_time']).dt.day.astype('uint8') 17 | df['hour'] = pd.to_datetime(df['click_time']).dt.hour.astype('uint8') 18 | return df 19 | 20 | def aggr(self, df, name, groupby, select, aggr, dtype, **other): 21 | """ Baris' aggregates 22 | https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977?scriptVersionId=3224614 23 | https://www.kaggle.com/aharless/kaggle-runnable-version-of-baris-kanber-s-lightgbm 24 | """ 25 | grouped = df[groupby + [select]].groupby(groupby)[select] 26 | if aggr == 'count': 27 | count = grouped.count().reset_index().rename(columns={select: name}) 28 | df = df.merge(count, on=groupby, how='left') 29 | df[name] = df[name].astype(dtype) 30 | if aggr == 'nunique': 31 | nunique = grouped.nunique().reset_index().rename(columns={select: name}) 32 | df = df.merge(nunique, 
on=groupby, how='left')
33 |             df[name] = df[name].astype(dtype)
34 |         if aggr == 'mean':
35 |             mean = grouped.mean().reset_index().rename(columns={select: name}).fillna(0)
36 |             df = df.merge(mean, on=groupby, how='left')
37 |             df[name] = df[name].astype(dtype)
38 |         if aggr == 'var':
39 |             var = grouped.var().reset_index().rename(columns={select: name}).fillna(0)
40 |             df = df.merge(var, on=groupby, how='left')
41 |             df[name] = df[name].astype(dtype)
42 |         if aggr == 'cumcount':
43 |             cumcount = grouped.cumcount()
44 |             df[name] = cumcount.values
45 |             df[name] = df[name].astype(dtype)
46 | 
47 |         del grouped
48 |         gc.collect()
49 | 
50 |         return df
51 | 
52 |     def hash_id(self, df, name, groupby, num_bits=27, salt='salt', **other):
53 |         d = (1 << num_bits)
54 | 
55 |         def hashfcn(row):
56 |             if row['id'] % 1000 == 0:
57 |                 logging.info(row['id'])
58 |             return hash(salt + '_'.join(map(str, [row[k] for k in groupby]))) % d
59 | 
60 |         df[name] = df.apply(hashfcn, axis=1, reduce=False).astype(np.uint32)
61 |         return df
62 | 
63 |     def click_time(self, df, name, hash_id, reverse=False, num_bits=27, **other):
64 |         """ Baris' time to prev/next click
65 |         https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977?scriptVersionId=3224614
66 |         https://www.kaggle.com/aharless/kaggle-runnable-version-of-baris-kanber-s-lightgbm
67 |         """
68 |         d = (1 << num_bits)
69 | 
70 |         epochs = df['epoch'].values
71 |         ids = df[hash_id].values
72 |         if reverse:
73 |             ids = reversed(ids)
74 |             epochs = reversed(epochs)
75 | 
76 |         unknown = np.iinfo(np.uint32).max
77 |         buf = np.full(d, unknown, dtype=np.uint32)
78 |         prev_click = np.full(df.shape[0], unknown, dtype=np.uint32)
79 | 
80 |         for i, (_id, t) in enumerate(zip(ids, epochs)):
81 |             t_prev = buf[_id]
82 |             buf[_id] = t
83 |             if t_prev != unknown:
84 |                 if not reverse:
85 |                     prev_click[i] = t - t_prev
86 |                 else:
87 |                     prev_click[i] = t_prev - t
88 | 
89 |         if not reverse:
90 |             df[name] = prev_click
91 |         else:
92 |             df[name] = np.flipud(prev_click)
93 | 
94 |         return df
95 | 
96 |     def click_time_no_hash(self, df, name, groupby, step=1, reverse=False, **other):
97 |         """ Compute previous/next click time without hashing trick
98 |         https://www.kaggle.com/asydorchuk/nextclick-calculation-without-hashing-trick
99 |         """
100 |         if not reverse:
101 |             df[name] = df['epoch'] - df.groupby(groupby)['epoch'].shift(step).fillna(0)
102 |         else:
103 |             df[name] = df.groupby(groupby)['epoch'].shift(-step).fillna(3000000000) - df['epoch']
104 | 
105 |         return df
106 | 
107 | def main(conf):
108 |     dump_dir = abspath(conf['factors_pd']['dump']['dir'])
109 |     makedirs(dump_dir)
110 | 
111 |     data_dir = abspath(conf['factors_pd']['source'])
112 |     dfc = DataFrameCols(data_dir)
113 | 
114 |     computer = Factors()
115 |     for group in conf['factors_pd']['factors']:
116 |         logging.info('Compute factors group: %s', group)
117 |         for factor in conf['factors_pd']['factors'][group]:
118 |             logging.info('Compute factor: %s', factor)
119 |             spec = conf['factors_pd']['factors'][group][factor]
120 |             df = dfc.load_df(['id'] + spec['columns'])
121 |             df = getattr(computer, group)(df, factor, **spec)
122 |             df.sort_values(by=['id'], inplace=True)
123 |             if conf['factors_pd']['factors'][group][factor].get('factors', None) is None:
124 |                 dfc.write_column(factor, df[factor].values)
125 |             else:
126 |                 for fout in conf['factors_pd']['factors'][group][factor].get('factors'):
127 |                     fname = factor + '_' + fout
128 |                     dfc.write_column(fname, df[fname].values)
129 |             del df
130 |             gc.collect()
131 | 
132 | if __name__ == '__main__':
133 |     main(project().conf)
134 |
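A minimal sketch of the groupby/shift trick that click_time_no_hash above relies on (toy rows; the column names follow the repo's schema and the fill values 0 and 3000000000 mirror the method's defaults):

import pandas as pd

clicks = pd.DataFrame({
    'ip':     [1, 1, 1, 2],
    'app':    [3, 3, 3, 7],
    'device': [0, 0, 0, 0],
    'os':     [5, 5, 5, 5],
    'epoch':  [100, 160, 400, 100],
})

group = ['ip', 'app', 'device', 'os']
# seconds since the previous click of the same (ip, app, device, os) combination;
# the first click of a combination falls back to epoch - 0
clicks['t_prev'] = clicks['epoch'] - clicks.groupby(group)['epoch'].shift(1).fillna(0)
# seconds until the next click; the last click of a combination gets a large sentinel delta
clicks['t_next'] = clicks.groupby(group)['epoch'].shift(-1).fillna(3000000000) - clicks['epoch']
print(clicks[['epoch', 't_prev', 't_next']])

Passing step=2 or step=3 to the real method shifts by two or three clicks instead of one, which is how the 2- and 3-step prev/next factors in application.conf are produced.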
-------------------------------------------------------------------------------- /junk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stys/kaggle-talkingdata-adtracking-fraud-detection/cf2f2d838807f0044f48f7d261e00be184db669b/junk/__init__.py -------------------------------------------------------------------------------- /junk/blend_submissions.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from scipy.special import logit, expit 7 | 8 | 9 | if __name__ == '__main__': 10 | parser = ArgumentParser() 11 | parser.add_argument('--submissions', nargs='+') 12 | parser.add_argument('--weights', nargs='+', type=float) 13 | parser.add_argument('--mix-logits', action='store_true') 14 | parser.add_argument('--output-file') 15 | args = parser.parse_args() 16 | 17 | n = 18790469 18 | blend = np.zeros(n, dtype=np.float32) 19 | wnorm = sum(args.weights) 20 | for j, fname in enumerate(args.submissions): 21 | df = pd.read_csv(fname) 22 | df.sort_values(by=['click_id'], inplace=True) 23 | values = df['is_attributed'].values 24 | if args.mix_logits: 25 | values = logit(values) 26 | blend += args.weights[j] * values / wnorm 27 | 28 | if args.mix_logits: 29 | blend = expit(blend) 30 | 31 | df_out = pd.DataFrame(data={'click_id': np.arange(n, dtype=np.int32), 'is_attributed': blend}) 32 | df_out.to_csv(args.output_file, header=True, index=False) 33 | -------------------------------------------------------------------------------- /junk/collect_libffm_predictions.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | from os.path import abspath, join as join_path 4 | from argparse import ArgumentParser 5 | 6 | import numpy as np 7 | 8 | from scipy.special import logit 9 | from sklearn.metrics import roc_auc_score, log_loss 10 | 11 | from lib.columns import DataFrameCols 12 | 13 | 14 | if __name__ == '__main__': 15 | parser = ArgumentParser() 16 | parser.add_argument('dump') 17 | args = parser.parse_args() 18 | 19 | dumpdir = abspath(args.dump) 20 | datadir = abspath('../data/columns') 21 | 22 | dfc = DataFrameCols(datadir) 23 | df = dfc.load_df(columns=['id', 'is_attributed']) 24 | df['p'] = 0 25 | 26 | df_train = df[df['is_attributed'] >= 0] 27 | df_test = df[df['is_attributed'] == -1] 28 | print(df_test.shape[0]) 29 | 30 | with open(join_path(dumpdir, 'folds.pkl'), 'rb') as f: 31 | folds = pickle.load(f) 32 | 33 | p_test_avg = np.zeros(df_test.shape[0]) 34 | for j_fold, (fold_idx, valid_idx) in enumerate(folds): 35 | valid_pred_file = join_path(dumpdir, 'valid_pred_%d.txt' % j_fold) 36 | with open(valid_pred_file, 'r') as f: 37 | p_valid = np.array([float(s) for s in f.readlines()]) 38 | 39 | y_valid = df_train.loc[valid_idx, 'is_attributed'].values 40 | auc_valid = roc_auc_score(y_valid, p_valid) 41 | print('Fold %d validation auc=%f' % (j_fold, auc_valid)) 42 | 43 | df_train.loc[valid_idx, 'p'] = logit(p_valid) 44 | 45 | test_pred_file = join_path(dumpdir, 'test_pred_%d.txt' % j_fold) 46 | with open(test_pred_file, 'r') as f: 47 | p_test = np.array([float(s) for s in f.readlines()]) 48 | p_test_avg += logit(p_test) 49 | 50 | df_test.loc[:, 'p'] = p_test_avg / 5 51 | df_all = df_train.append(df_test, ignore_index=True) 52 | df_all.sort_values(by=['id'], inplace=True) 53 | dfc.write_column('libffm_oof', df_all['p'].values) 54 
| 55 | 56 | -------------------------------------------------------------------------------- /junk/compute_libffm_auc.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from os.path import abspath, join as join_path 3 | from argparse import ArgumentParser 4 | 5 | import numpy as np 6 | from sklearn.metrics import roc_auc_score, log_loss 7 | 8 | from lib.columns import DataFrameCols 9 | 10 | 11 | if __name__ == '__main__': 12 | parser = ArgumentParser() 13 | parser.add_argument('dump', type=str) 14 | parser.add_argument('fold', type=int) 15 | args = parser.parse_args() 16 | 17 | dumpdir = abspath(args.dump) 18 | datadir = abspath('../data/columns') 19 | 20 | dfc = DataFrameCols(datadir) 21 | df = dfc.load_df(columns=['id', 'is_attributed']) 22 | df = df[df['is_attributed'] >= 0] 23 | 24 | with open(join_path(dumpdir, 'folds.pkl'), 'rb') as f: 25 | folds = pickle.load(f) 26 | 27 | train_pred_file = join_path(dumpdir, 'train_pred_%d.txt' % args.fold) 28 | with open(train_pred_file, 'r') as f: 29 | p_train = np.array([float(s) for s in f.readlines()]) 30 | 31 | valid_pred_file = join_path(dumpdir, 'valid_pred_%d.txt' % args.fold) 32 | with open(valid_pred_file, 'r') as f: 33 | p_valid = np.array([float(s) for s in f.readlines()]) 34 | 35 | fold_idx = folds[args.fold][0] 36 | valid_idx = folds[args.fold][1] 37 | 38 | y_train = df.loc[fold_idx, 'is_attributed'].values 39 | y_valid = df.loc[valid_idx, 'is_attributed'].values 40 | 41 | print('Train results: log_loss=%f, auc=%f' % (log_loss(y_train, p_train), roc_auc_score(y_train, p_train))) 42 | print('Valid results: log_loss=%f, auc=%f' % (log_loss(y_valid, p_valid), roc_auc_score(y_valid, p_valid))) 43 | 44 | 45 | 46 | # ffm-train -p valid_fold_0.txt -l 0.0002 -k 4 -t 2 train_fold_0.txt model_fold_0.bin 47 | # Train results: log_loss=0.007449, auc=0.964776 48 | # Valid results: log_loss=0.007866, auc=0.961628 49 | 50 | 51 | -------------------------------------------------------------------------------- /junk/convert_tsv_to_columns.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from lib.columns import DataFrameCols 5 | from lib.utils import makedirs 6 | 7 | if __name__ == '__main__': 8 | df = pd.read_csv('../data/train_test_merged/train_test_merged.tsv', sep='\t', parse_dates=['click_time']) 9 | 10 | rename_columns = {} 11 | for col in df.columns: 12 | if col.startswith('# '): 13 | rename_columns[col] = col[2:] 14 | df.rename(columns=rename_columns, inplace=True) 15 | df.sort_values(by=['id'], inplace=True) 16 | 17 | dtypes = { 18 | 'id': 'uint32', 19 | 'ip': 'uint32', 20 | 'app': 'uint16', 21 | 'device': 'uint16', 22 | 'os': 'uint16', 23 | 'channel': 'uint16', 24 | 'click_id': 'int32', 25 | 'click_id_submission': 'int32', 26 | 'is_attributed': 'int8' 27 | } 28 | 29 | test_dir = '../data/columns' 30 | makedirs(test_dir) 31 | 32 | dfc = DataFrameCols(test_dir) 33 | for col, dtype in dtypes.items(): 34 | print(col, dtype) 35 | dfc.write_column(col, df[col].astype(dtype).values) 36 | 37 | dfc.write_column('day', pd.to_datetime(df['click_time']).dt.day.astype('uint8').values) 38 | dfc.write_column('hour', pd.to_datetime(df['click_time']).dt.hour.astype('uint8').values) 39 | dfc.write_column('epoch', (df['click_time'].astype(np.int64) // 10 ** 9).values) 40 | -------------------------------------------------------------------------------- /junk/create_days_index.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | day_col = dfc.load_column('day') 8 | hour_col = dfc.load_column('hour') 9 | is_attributed_col = dfc.load_column('is_attributed') 10 | 11 | hours = np.unique(hour_col, return_counts=True)[0] 12 | 13 | for h in hours: 14 | all = np.where((day_col == 9) & (hour_col == h))[0].shape[0] 15 | attributed = np.where((day_col == 9) & (hour_col == h) & (is_attributed_col >= 0))[0].shape[0] 16 | 17 | print(h, all, attributed) 18 | 19 | # hour all attributed 20 | # 0 3318301 3318301 21 | # 1 3082862 3082862 22 | # 2 3068887 3068887 23 | # 3 3351149 3351149 24 | # 4 4032691 4032691 25 | # 5 3671741 3671741 26 | # 6 3570940 3570940 27 | # 7 3186240 3186240 28 | # 8 2804701 2804701 29 | # 9 2986204 2986204 30 | # 10 3304199 3304199 31 | # 11 3347741 3347741 32 | # 12 3363917 3363917 33 | # 13 3457523 3457523 34 | # 14 3443348 3443283 !!! hour 14 has small fraction of not attributed events 35 | # 15 3026679 3026111 36 | # 16 2495595 447 37 | # 17 1265180 0 38 | # 18 762056 0 39 | # 19 526096 0 40 | # 20 432411 0 41 | # 21 571504 0 42 | # 22 1325626 0 43 | # 23 2423959 0 -------------------------------------------------------------------------------- /junk/create_submission_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | 8 | click_id_submission = dfc.load_column(col='click_id_submission') 9 | index = np.where(click_id_submission >= 0)[0].astype(np.uint32) 10 | dfc.write_index('submission', index) 11 | -------------------------------------------------------------------------------- /junk/create_subsample_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | 8 | is_attributed_col = dfc.load_column('is_attributed') 9 | subsample = np.random.choice([0, 1], size=is_attributed_col.shape[0], p=[0.5, 0.5]) 10 | subsample_idx = np.where((is_attributed_col == 1) | ((is_attributed_col == 0) & (subsample == 1)))[0] 11 | 12 | print(subsample_idx.shape[0]) 13 | dfc.write_index('subsample_not_attributed_50pct_2', subsample_idx) 14 | -------------------------------------------------------------------------------- /junk/create_train_day_8_9_hour_4_5_9_10_13_14_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | 8 | day_col = dfc.load_column('day') 9 | hour_col = dfc.load_column('hour') 10 | is_attributed_col = dfc.load_column('is_attributed') 11 | 12 | hidx = (hour_col == 4) | (hour_col == 5) | (hour_col == 9) | (hour_col == 10) | (hour_col == 13) | (hour_col == 14) 13 | index = np.where((is_attributed_col >= 0) & (day_col > 7) & hidx)[0] 14 | 15 | dfc.write_index('days_8_9_hours_4_5_9_10_13_14_attributed', index) 16 | -------------------------------------------------------------------------------- /junk/create_train_day_8_9_index.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | 8 | day_col = dfc.load_column('day') 9 | is_attributed_col = dfc.load_column('is_attributed') 10 | index = np.where((is_attributed_col >= 0) & (day_col > 7))[0] 11 | 12 | dfc.write_index('days_8_9_attributed', index) 13 | -------------------------------------------------------------------------------- /junk/create_train_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | 8 | is_attributed_col = dfc.load_column('is_attributed') 9 | index = np.where((is_attributed_col >= 0))[0] 10 | 11 | print(index.shape[0]) 12 | dfc.write_index('train', index) 13 | -------------------------------------------------------------------------------- /junk/print_submission_days_hours.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib.columns import DataFrameCols 3 | 4 | if __name__ == '__main__': 5 | workdir = '../data/columns' 6 | dfc = DataFrameCols(workdir) 7 | 8 | submission_idx = dfc.load_index('submission') 9 | day_col = dfc.load_column('day', index=submission_idx) 10 | hour_col = dfc.load_column('hour', index=submission_idx) 11 | 12 | print('Submission days:') 13 | days = np.unique(day_col, return_counts=True) 14 | for (d, c) in zip(days[0], days[1]): 15 | print(d, c) 16 | 17 | print('Submission hours') 18 | hours = np.unique(hour_col, return_counts=True) 19 | for (h, c) in zip(hours[0], hours[1]): 20 | print(h, c) 21 | 22 | # Submission days: 23 | # 10 18790469 24 | # Submission hours 25 | # 4 3344125 26 | # 5 2858427 27 | # 6 381 28 | # 9 2984808 29 | # 10 3127993 30 | # 11 413 31 | # 13 3212566 32 | # 14 3261257 33 | # 15 499 34 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stys/kaggle-talkingdata-adtracking-fraud-detection/cf2f2d838807f0044f48f7d261e00be184db669b/lib/__init__.py -------------------------------------------------------------------------------- /lib/columns.py: -------------------------------------------------------------------------------- 1 | from os.path import join, isfile 2 | 3 | import sys 4 | import ast 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from argparse import ArgumentParser 9 | 10 | 11 | class DataFrameCols(object): 12 | COL_EXT = '.bin' 13 | IDX_EXT = '.idx' 14 | META = 'meta' 15 | 16 | def __init__(self, workdir): 17 | self.workdir = workdir 18 | self.meta = DataFrameCols.read_meta(workdir) 19 | 20 | @staticmethod 21 | def read_meta(workdir): 22 | meta_file = join(workdir, DataFrameCols.META) 23 | if not isfile(meta_file): 24 | return {} 25 | else: 26 | with open(join(workdir, DataFrameCols.META), 'r') as fmeta: 27 | return ast.literal_eval(fmeta.read()) 28 | 29 | def _write_meta(self): 30 | with open(join(self.workdir, DataFrameCols.META), 'w') as fmeta: 31 | fmeta.write(str(self.meta)) 32 | 33 | def load_column(self, col, arange=None, index=None): 34 | arr = np.fromfile(join(self.workdir, col + DataFrameCols.COL_EXT), dtype=self.meta[col]) 35 | if arange is not None: 36 | 
start_index = arange[0]
37 |             end_index = arange[1]
38 |             return arr[start_index:end_index]
39 |         elif index is not None:
40 |             return arr[index]
41 |         else:
42 |             return arr
43 | 
44 |     def write_column(self, name, arr, arange=None, index=None):
45 |         if name in self.meta:
46 |             assert self.meta[name] == arr.dtype
47 |         else:
48 |             self.meta[name] = arr.dtype.str
49 | 
50 |         if arange is not None:
51 |             start_index = arange[0]
52 |             end_index = arange[1]
53 |             arr[start_index:end_index].tofile(join(self.workdir, name + DataFrameCols.COL_EXT))
54 |         else:
55 |             arr[index].tofile(join(self.workdir, name + DataFrameCols.COL_EXT))
56 | 
57 |         self._write_meta()
58 | 
59 |     def load_df(self, columns=None, arange=None, index=None):
60 |         data = dict()
61 |         columns = columns or self.meta.keys()
62 |         for col in columns:
63 |             data[col] = self.load_column(col, arange, index)
64 |         return pd.DataFrame(data=data)
65 | 
66 |     def write_df(self, df, arange=None, index=None):
67 |         for i, col in enumerate(df.columns):
68 |             self.write_column(col, df[col].values, arange, index)
69 | 
70 |     def load_index(self, name):
71 |         return np.fromfile(join(self.workdir, name + DataFrameCols.IDX_EXT), dtype=np.uint32)
72 | 
73 |     def write_index(self, name, arr):
74 |         arr.astype(np.uint32).tofile(join(self.workdir, name + DataFrameCols.IDX_EXT))
75 | 
76 | 
77 | if __name__ == '__main__':
78 |     parser = ArgumentParser()
79 |     parser.add_argument('path', default='.')
80 |     parser.add_argument('-f', '--fields', nargs='+', default=None)
81 |     parser.add_argument('--range-start', type=int, default=None)
82 |     parser.add_argument('--range-end', type=int, default=None)
83 |     parser.add_argument('--index', default=None)
84 |     args = parser.parse_args()
85 | 
86 |     dfc = DataFrameCols(args.path)
87 |     df = dfc.load_df(columns=args.fields, arange=(args.range_start, args.range_end))
88 |     df.to_csv(sys.stdout, header=True, index=False, sep='\t')
89 | 
--------------------------------------------------------------------------------
/lib/hocon.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from pyhocon import HOCONConverter
4 | 
5 | 
6 | def write_config(conf, filename, output_format):
7 |     lines = HOCONConverter.convert(conf, output_format=output_format, indent=4)
8 |     with open(filename, 'w') as fh:
9 |         fh.writelines(lines)
10 | 
11 | 
12 | def config2json(conf):
13 |     lines = HOCONConverter.convert(conf, indent=0)
14 |     return ''.join(lines).replace('\n', ' ')
--------------------------------------------------------------------------------
/lib/project.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import logging
3 | import re
4 | from collections import namedtuple
5 | from argparse import ArgumentParser
6 | from pyhocon import ConfigFactory, ConfigTree
7 | 
8 | logging.basicConfig(format='%(asctime)s %(levelname)s %(filename)s:%(lineno)d %(message)s', level=logging.DEBUG, datefmt='%Y-%m-%d %I:%M:%S')
9 | 
10 | Project = namedtuple('Project', ['conf'])
11 | instance = None
12 | 
13 | 
14 | def project(argv=sys.argv):
15 |     global instance
16 | 
17 |     if instance is not None:
18 |         return instance
19 |     else:
20 |         pattern = re.compile('-D(.*)=(.*)')
21 |         conf_override = dict()
22 |         argv_filtered = []
23 |         for a in argv:
24 |             m = pattern.match(a)
25 |             if m is not None:
26 |                 conf_override[m.group(1)] = m.group(2)
27 |             else:
28 |                 argv_filtered.append(a)
29 | 
30 |         parser = ArgumentParser()
31 |         parser.add_argument('--conf', default='application.conf')
32 |         args, other = parser.parse_known_args(argv_filtered)
33 | 
34 |         conf = ConfigFactory.parse_file(args.conf)
35 |         conf_override = ConfigFactory.from_dict(conf_override)
36 |         conf_merged = ConfigTree.merge_configs(conf, conf_override)
37 | 
38 |         instance = Project(conf=conf_merged)
39 | 
40 |     return instance
41 | 
--------------------------------------------------------------------------------
/lib/quality.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import numpy as np
4 | 
5 | 
6 | def reliability_curve(labels, predictions, nbins, sample_weights=None):
7 |     """ Reliability curve for binary classification tasks
8 |     Group samples into bins by value of predicted probability and compute empirical probability in each bin
9 | 
10 |     :param labels: true target values in {0, 1}
11 |     :param predictions: predicted probabilities in [0.0, 1.0]
12 |     :param nbins: number of bins
13 |     :param sample_weights: use the same sample weights that were used for training
14 |     :return:
15 |     """
16 | 
17 |     labels = np.array(labels)
18 |     predictions = np.array(predictions)
19 |     weights = sample_weights if sample_weights is not None else np.ones(len(labels))
20 | 
21 |     assert len(labels) == len(predictions)
22 |     assert len(labels) >= nbins
23 | 
24 |     ns = int(len(labels) / nbins)
25 |     rem = len(labels) - ns * nbins
26 | 
27 |     sort_idx = np.argsort(predictions)
28 |     count = np.zeros(nbins)
29 |     avg_pred = np.zeros(nbins)
30 |     avg_label = np.zeros(nbins)
31 |     weight_total = np.zeros(nbins)
32 | 
33 |     jbin = 0
34 |     for j, idx in enumerate(sort_idx):
35 |         avg_pred[jbin] += predictions[idx]
36 |         avg_label[jbin] += labels[idx] * weights[idx]
37 |         weight_total[jbin] += weights[idx]
38 |         count[jbin] += 1
39 |         if rem > 0 and count[jbin] == ns + 1:
40 |             jbin += 1
41 |             rem -= 1
42 |         elif rem == 0 and count[jbin] == ns:
43 |             jbin += 1
44 | 
45 |     return avg_label / weight_total, avg_pred / count
46 | 
--------------------------------------------------------------------------------
/lib/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from os import makedirs as os_makedirs
3 | import errno
4 | 
5 | 
6 | def makedirs(path):
7 |     try:
8 |         os_makedirs(path)
9 |     except OSError as e:
10 |         if e.errno != errno.EEXIST:
11 |             raise
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stys/kaggle-talkingdata-adtracking-fraud-detection/cf2f2d838807f0044f48f7d261e00be184db669b/models/__init__.py
--------------------------------------------------------------------------------
/models/catboost_.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import logging
4 | import json
5 | from os import chdir, getcwd
6 | from os.path import join as join_path, abspath
7 | from copy import deepcopy
8 | 
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.metrics import log_loss, roc_auc_score
11 | from hyperopt import fmin, hp, STATUS_OK, Trials, tpe
12 | 
13 | from catboost import CatBoostClassifier
14 | 
15 | from lib.project import project
16 | from lib.columns import DataFrameCols
17 | from lib.utils import makedirs
18 | from lib.hocon import write_config, config2json
19 | from lib.quality import reliability_curve
20 | 
21 | 
22 | def quality(labels, pred):
23 |     return dict(
24 |         ll=log_loss(labels, pred),
25 |         auc=roc_auc_score(labels, pred),
26 |         reliability=list(map(lambda x: x.tolist(), reliability_curve(labels, pred, nbins=100)))
27 |     )
28 | 
29 | 
30 | def train_catboost(train_df, valid_df, target, features, categorical_features, options):
31 |     logging.info('Training catboost with options: %s', options)
32 | 
33 |     cat_features = list(train_df[features].columns.get_loc(c) for c in categorical_features)
34 | 
35 |     model = CatBoostClassifier(**options)
36 |     model.fit(X=train_df[features].values, y=train_df[target].values, cat_features=cat_features,
37 |               eval_set=(valid_df[features].values, valid_df[target].values))
38 | 
39 |     model.save_model('model.bin')
40 | 
41 |     train_quality = quality(train_df[target].values, model.predict_proba(train_df[features].values)[:, 1])
42 |     logging.info('Train quality: %s', train_quality)
43 | 
44 |     valid_quality = quality(valid_df[target].values, model.predict_proba(valid_df[features].values)[:, 1])
45 |     logging.info('Validation quality: %s', valid_quality)
46 | 
47 |     return train_quality, valid_quality, model
48 | 
49 | 
50 | def get_hyperopt_objective(train_df, valid_df, target, features, categorical_features, catboost_options):
51 |     """ Construct hyperopt objective function """
52 |     hyperopt_trial = 0
53 | 
54 |     def hyperobj(params):
55 |         nonlocal hyperopt_trial
56 |         hyperopt_trial += 1
57 |         logging.info('Hyperopt trial %d, params=%s' % (hyperopt_trial, params))
58 | 
59 |         options = deepcopy(catboost_options)
60 |         for p in params:
61 |             options[p] = params[p]
62 | 
63 |         work_dir = getcwd()
64 |         trial_dir = abspath(join_path(work_dir, 'trial_%d' % hyperopt_trial))
65 |         makedirs(trial_dir)
66 |         chdir(trial_dir)
67 |         logging.info('Trial directory: %s', trial_dir)
68 | 
69 |         logging.info('Train catboost with options: %s' % config2json(options))
70 |         train_quality, valid_quality, model = train_catboost(
71 |             train_df, valid_df, target, features, categorical_features, options)
72 | 
73 |         model.save_model('model')
74 |         chdir(work_dir)
75 | 
76 |         return {
77 |             'loss': 1.0 - valid_quality['auc'],
78 |             'status': STATUS_OK,
79 |             'options': config2json(options),
80 |             'quality': {
81 |                 'train': train_quality,
82 |                 'valid': valid_quality
83 |             },
84 |             'model': {
85 |                 'file': join_path(trial_dir, 'model')
86 |             }
87 |         }
88 | 
89 |     return hyperobj
90 | 
91 | 
92 | def train_catboost_with_hyperopt(train_df, valid_df, target, features, categorical_features, catboost_options, hyperopt_options):
93 |     logging.info('Running hyper parameters optimization: %s', config2json(hyperopt_options))
94 | 
95 |     space = dict()
96 |     for param, opts in hyperopt_options['space'].items():
97 |         expression = getattr(hp, opts['expression'])
98 |         space[param] = expression(label=param, **opts['params'])
99 | 
100 |     fcn = get_hyperopt_objective(train_df, valid_df, target, features, categorical_features, catboost_options)
101 | 
102 |     trials = Trials()
103 |     opt = fmin(
104 |         fn=fcn,
105 |         space=space,
106 |         algo=tpe.suggest,
107 |         trials=trials,
108 |         max_evals=hyperopt_options['max_evals']
109 |     )
110 | 
111 |     with open('hyperopt_trials.json', 'w') as f:
112 |         json.dump(trials.results, f, indent=4)
113 | 
114 |     logging.info('Best parameters: %s', opt)
115 | 
116 |     best_trial, best_trial_result = min(enumerate(trials.results), key=lambda r: r[1]['loss'])
117 |     logging.info('Best model %d: AUC=%s, model=%s' % (
118 |         best_trial, best_trial_result['quality']['valid']['auc'], best_trial_result['model']['file']))
119 | 
120 |     best_model = CatBoostClassifier()
121 |
best_model.load_model(best_trial_result['model']['file']) 122 | return best_trial_result['quality']['train'], best_trial_result['quality']['valid'], best_model 123 | 124 | 125 | if __name__ == '__main__': 126 | conf = project().conf 127 | 128 | dump_dir = abspath(conf['catboost']['dump']['dir']) 129 | makedirs(dump_dir) 130 | 131 | write_config(conf, join_path(dump_dir, 'application.conf'), 'hocon') 132 | write_config(conf, join_path(dump_dir, 'application.json'), 'json') 133 | logging.getLogger().addHandler(logging.FileHandler(join_path(dump_dir, 'application.log'))) 134 | 135 | logging.info('Kaggle Talking Data') 136 | logging.info('Train Catboost') 137 | logging.info('Dump: %s', dump_dir) 138 | 139 | target = conf['catboost']['target'] 140 | features = conf['catboost']['features'] 141 | categorical_features = conf['catboost']['categorical_features'] 142 | logging.info('Target: %s', target) 143 | logging.info('Features: %s', config2json(features)) 144 | logging.info('Categorical features: %s', categorical_features) 145 | 146 | data_dir = abspath(conf['catboost']['data']['dir']) 147 | dfc = DataFrameCols(data_dir) 148 | 149 | train_index_name = conf['catboost']['data']['train']['index'] 150 | train_index = dfc.load_index(train_index_name) 151 | train_df = dfc.load_df(columns=[target] + features, index=train_index) 152 | train_df, valid_df = train_test_split(train_df, test_size=0.1) 153 | 154 | catboost_options = conf['catboost']['options'] 155 | logging.info('Using catboost options: %s', catboost_options) 156 | 157 | work_dir = getcwd() 158 | chdir(dump_dir) 159 | 160 | hyperopt_options = conf['catboost']['hyperopt'] 161 | if hyperopt_options['enabled']: 162 | train_quality, valid_quality, model = train_catboost_with_hyperopt(train_df, valid_df, target, features, categorical_features, catboost_options, hyperopt_options) 163 | else: 164 | train_quality, valid_quality, model = train_catboost(train_df, valid_df, target, features, categorical_features, catboost_options) 165 | 166 | chdir(work_dir) 167 | 168 | valid_pred = model.predict_proba(valid_df[features].values)[:, 1] 169 | valid_quality = quality(valid_df[target].values, valid_pred) 170 | logging.info('Cross-check best model validation score: AUC=%s' % valid_quality['auc']) 171 | 172 | # load model 173 | # model = CatBoostClassifier() 174 | # model.load_model(join_path(dump_dir, 'model.bin')) 175 | 176 | test_index_name = conf['catboost']['data']['test']['index'] 177 | test_index = dfc.load_index(test_index_name) 178 | test_df = dfc.load_df(columns=features + ['click_id_submission'], index=test_index) 179 | test_df['is_attributed'] = model.predict_proba(test_df[features].values)[:, 1] 180 | test_df = test_df[['click_id_submission', 'is_attributed']].rename(columns={'click_id_submission': 'click_id'}) 181 | test_df.sort_values(by='click_id', inplace=True) 182 | test_df.to_csv(join_path(dump_dir, 'submission.csv'), header=True, index=False) 183 | -------------------------------------------------------------------------------- /models/libffm.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import logging 3 | import subprocess 4 | import pickle 5 | import csv 6 | 7 | from os import chdir 8 | from os.path import abspath, join as join_path 9 | 10 | import numpy as np 11 | 12 | from sklearn.model_selection import StratifiedKFold 13 | from sklearn.metrics import roc_auc_score 14 | 15 | from lib.project import project 16 | from lib.columns import DataFrameCols 17 | from lib.utils import 
makedirs 18 | 19 | 20 | def write_libffm_data(df, target, fields, shifts): 21 | df['data'] = df[target].astype(str) 22 | for k, v in shifts.items(): 23 | print(k) 24 | df['data'] += ' %d:' % fields[k] 25 | df['data'] += (df[k] + v).astype(str) 26 | df['data'] += ':1' 27 | df.drop(columns=[k], inplace=True) 28 | gc.collect() 29 | return df 30 | 31 | 32 | def main(conf): 33 | dump_dir = abspath(conf['libffm']['dump']['dir']) 34 | makedirs(dump_dir) 35 | 36 | data_dir = abspath(conf['libffm']['data']['dir']) 37 | dfc = DataFrameCols(data_dir) 38 | 39 | target = 'is_attributed' 40 | fields = {'ip': 0, 'app': 1, 'device': 2, 'os': 3, 'channel': 4} 41 | shifts = {'ip': 0, 'app': 364779, 'device': 365548, 'os': 369776, 'channel': 370733} 42 | 43 | # 1) write test data 44 | # logging.info('Writing test data in libffm format') 45 | # df = dfc.load_df(columns=['id', target] + list(fields.keys())) 46 | # df = df[df[target] == -1] 47 | # df[target] = 0 # do we need this? 48 | # df = write_libffm_data(df, target, fields, shifts) 49 | test_fname = join_path(dump_dir, 'test.txt') 50 | # df[['data']].to_csv(test_fname, header=False, index=False, quoting=csv.QUOTE_NONE) 51 | # del df 52 | # gc.collect() 53 | # exit() 54 | 55 | # 2) write training folds 56 | # logging.info('Writing k-fold training data') 57 | # df = dfc.load_df(columns=['id', target] + list(fields.keys())) 58 | # df = df[df[target] >= 0] 59 | # df = write_libffm_data(df, target, fields, shifts) 60 | # 61 | # folds = [] 62 | # skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337) 63 | # for fold_idx, valid_idx in skf.split(df['id'].values, df[target].values): 64 | # folds.append((fold_idx, valid_idx)) 65 | # 66 | # with open(join_path(dump_dir, 'folds.pkl'), 'wb') as f: 67 | # pickle.dump(folds, f) 68 | # 69 | # for j_fold, (fold_idx, valid_idx) in enumerate(folds): 70 | # logging.info('Writing fold %d in libffm format', j_fold) 71 | # train_fname = join_path(dump_dir, 'train_fold_%d.txt' % j_fold) 72 | # df.loc[fold_idx, ['data']].to_csv(train_fname, header=False, index=False, quoting=csv.QUOTE_NONE) 73 | # valid_fname = join_path(dump_dir, 'valid_fold_%d.txt' % j_fold) 74 | # df.loc[valid_idx, ['data']].to_csv(valid_fname, header=False, index=False, quoting=csv.QUOTE_NONE) 75 | # 76 | # del df 77 | # gc.collect() 78 | # exit() 79 | 80 | df = dfc.load_df(columns=['id', target]) 81 | df = df[df[target] >= 0] 82 | 83 | with open(join_path(dump_dir, 'folds.pkl'), 'rb') as f: 84 | folds = pickle.load(f) 85 | 86 | chdir(dump_dir) 87 | for j_fold, (fold_idx, valid_idx) in enumerate(folds): 88 | logging.info('Training on fold %d', j_fold) 89 | train_fname = join_path(dump_dir, 'train_fold_%d.txt' % j_fold) 90 | valid_fname = join_path(dump_dir, 'valid_fold_%d.txt' % j_fold) 91 | model_fname = join_path(dump_dir, 'model_%d.bin' % j_fold) 92 | proc = subprocess.run([ 93 | 'ffm-train', 94 | '-p', valid_fname, 95 | '-l', str(conf['libffm']['options']['lambda']), 96 | '-k', str(conf['libffm']['options']['factor']), 97 | '-r', str(conf['libffm']['options']['learning_rate']), 98 | '-t', str(conf['libffm']['options']['num_iter']), 99 | train_fname, 100 | model_fname 101 | ], stdout=subprocess.PIPE, check=True) 102 | 103 | logging.info('Running command %s', ' '.join(proc.args)) 104 | logging.info('Process return code %d', proc.returncode) 105 | logging.info(proc.stdout.decode('utf-8')) 106 | 107 | train_pred_file = join_path(dump_dir, 'train_pred_%d.txt' % j_fold) 108 | proc = subprocess.run([ 109 | 'ffm-predict', 110 | train_fname, 111 | 
model_fname, 112 | train_pred_file 113 | ], stdout=subprocess.PIPE, check=True) 114 | 115 | logging.info('Running command %s', ' '.join(proc.args)) 116 | logging.info('Process return code %d', proc.returncode) 117 | 118 | with open(train_pred_file, 'r') as f: 119 | p_train = np.array([float(s) for s in f.readlines()], dtype=np.float32) 120 | auc_train = roc_auc_score(df.loc[fold_idx, target].values, p_train) 121 | 122 | valid_pred_file = join_path(dump_dir, 'valid_pred_%d.txt' % j_fold) 123 | proc = subprocess.run([ 124 | 'ffm-predict', 125 | valid_fname, 126 | model_fname, 127 | valid_pred_file 128 | ], stdout=subprocess.PIPE, check=True) 129 | 130 | logging.info('Running command %s', ' '.join(proc.args)) 131 | logging.info('Process return code %d', proc.returncode) 132 | 133 | with open(valid_pred_file, 'r') as f: 134 | p_valid = np.array([float(s) for s in f.readlines()], dtype=np.float32) 135 | auc_valid = roc_auc_score(df.loc[valid_idx, target].values, p_valid) 136 | 137 | logging.info('Fold quality: auc_train=%f auc_valid=%f', auc_train, auc_valid) 138 | 139 | test_pred_file = join_path(dump_dir, 'test_pred_%d.txt' % j_fold) 140 | proc = subprocess.run([ 141 | 'ffm-predict', 142 | test_fname, 143 | model_fname, 144 | test_pred_file 145 | ], stdout=subprocess.PIPE, check=True) 146 | 147 | logging.info('Running command %s', ' '.join(proc.args)) 148 | logging.info('Process return code %d', proc.returncode) 149 | 150 | 151 | if __name__ == '__main__': 152 | main(project().conf) 153 | -------------------------------------------------------------------------------- /models/lightgbm_.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import logging 3 | from os.path import abspath, join as join_path 4 | 5 | import pandas as pd 6 | 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import log_loss, roc_auc_score 9 | 10 | import lightgbm as lgb 11 | 12 | from lib.project import project 13 | from lib.columns import DataFrameCols 14 | from lib.utils import makedirs 15 | from lib.hocon import write_config, config2json 16 | from lib.quality import reliability_curve 17 | 18 | 19 | def quality(labels, pred): 20 | return dict( 21 | ll=log_loss(labels, pred), 22 | auc=roc_auc_score(labels, pred), 23 | reliability=list(map(lambda x: x.tolist(), reliability_curve(labels, pred, nbins=100))) 24 | ) 25 | 26 | 27 | def train_lightgbm(params, train_dataset, valid_dataset=None, **options): 28 | logging.info('Training LightGBM with params: %s', config2json(params)) 29 | if valid_dataset is not None: 30 | model = lgb.train(params, train_dataset, valid_sets=[train_dataset, valid_dataset], **options) 31 | else: 32 | model = lgb.train(params, train_dataset, valid_sets=[train_dataset], **options) 33 | return model 34 | 35 | 36 | def main(conf): 37 | dump_dir = conf['lightgbm']['dump']['dir'] 38 | makedirs(dump_dir) 39 | 40 | write_config(conf, join_path(dump_dir, 'application.conf'), 'hocon') 41 | write_config(conf, join_path(dump_dir, 'application.json'), 'json') 42 | logging.getLogger().addHandler(logging.FileHandler(join_path(dump_dir, 'application.log'))) 43 | 44 | logging.info('Kaggle Talking Data') 45 | 46 | label = conf['lightgbm']['label'] 47 | features = conf['lightgbm']['features'] 48 | categorical_features = conf['lightgbm']['categorical_features'] 49 | logging.info('Label: %s', label) 50 | logging.info('Features: %s', features) 51 | logging.info('Categorical features: %s', categorical_features) 52 | 53 | data_dir = 
abspath(conf['lightgbm']['data']['dir']) 54 | dfc = DataFrameCols(data_dir) 55 | train_index_name = conf['lightgbm']['data']['train']['index'] 56 | train_index = dfc.load_index(train_index_name) 57 | 58 | df = dfc.load_df(columns=[label] + features, index=train_index) 59 | 60 | if conf['lightgbm']['valid_size'] > 0: 61 | train_df, valid_df = train_test_split(df, test_size=conf['lightgbm']['valid_size']) 62 | 63 | train_dataset = lgb.Dataset(data=train_df[features].values, label=train_df[label].values, feature_name=features, 64 | categorical_feature=categorical_features) 65 | valid_dataset = lgb.Dataset(data=valid_df[features].values, label=valid_df[label].values, feature_name=features, 66 | categorical_feature=categorical_features) 67 | 68 | del train_df 69 | del valid_df 70 | gc.collect() 71 | else: 72 | train_dataset = lgb.Dataset(data=df[features].values, label=df[label].values, feature_name=features, 73 | categorical_feature=categorical_features) 74 | valid_dataset = None 75 | 76 | params = conf['lightgbm']['params'] 77 | options = conf['lightgbm']['options'] 78 | model = train_lightgbm(params, train_dataset, valid_dataset, **options) 79 | model.save_model(join_path(dump_dir, 'model.bin')) 80 | del train_dataset 81 | del valid_dataset 82 | gc.collect() 83 | 84 | # load model 85 | # model = lgb.Booster(model_file=join_path(dump_dir, 'model.bin')) 86 | 87 | # train_label = train_df[label].values 88 | # train_pred = model.predict(train_df[features]) 89 | # train_quality = quality(train_label, train_pred) 90 | # logging.info('Train quality: %s', train_quality) 91 | # 92 | # valid_label = valid_df[label].values 93 | # valid_pred = model.predict(valid_df[features]) 94 | # valid_quality = quality(valid_label, valid_pred) 95 | # logging.info('Valid quality: %s', valid_quality) 96 | 97 | test_index_name = conf['lightgbm']['data']['test']['index'] 98 | test_index = dfc.load_index(test_index_name) 99 | test_df = dfc.load_df(columns=features + ['click_id_submission'], index=test_index) 100 | test_df['is_attributed'] = model.predict(test_df[features]) 101 | test_df = test_df[['click_id_submission', 'is_attributed']].rename(columns={'click_id_submission': 'click_id'}) 102 | test_df.sort_values(by='click_id', inplace=True) 103 | test_df.to_csv(join_path(dump_dir, 'submission.csv'), header=True, index=False) 104 | 105 | gain = model.feature_importance('gain') 106 | ft = pd.DataFrame({ 107 | 'feature': model.feature_name(), 108 | 'split': model.feature_importance('split'), 109 | 'gain': 100 * gain / gain.sum()} 110 | ).sort_values('gain', ascending=False) 111 | ft.to_csv(join_path(dump_dir, 'feature_strength.csv'), header=True, index=False, sep='\t') 112 | 113 | if __name__ == '__main__': 114 | main(project().conf) 115 | -------------------------------------------------------------------------------- /preprocessing/merge_test_sets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Create normalized dataset\n", 10 | "# - duplicates https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/52752\n", 11 | "# - join test_supplement.csv and test.csv\n", 12 | "# - sort by click_time" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 
27 | "execution_count": 20, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "DATA_DIR = '../data/mnt/ssd/kaggle-talkingdata2/competition_files'\n", 32 | "TRAIN_SAMPLE_FILE = DATA_DIR + '/train_sample.csv'\n", 33 | "TRAIN_FILE = DATA_DIR + '/train.csv'\n", 34 | "TEST_FILE = DATA_DIR + '/test.csv'\n", 35 | "TEST_SUPPLEMENT_FILE = DATA_DIR + '/test_supplement.csv'\n", 36 | "TEST_JOINED_FILE = DATA_DIR + '/test_joined.csv'" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# test_supplement.csv\n", 46 | "df_test_supplement = pd.read_csv(TEST_SUPPLEMENT_FILE)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# test.csv: this is a subset of test_supplement.csv which is used to score submissions\n", 56 | "df_test = pd.read_csv(TEST_FILE)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 6, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# join test_supplement.csv and test.csv\n", 66 | "df_test_joined = df_test_supplement.merge(df_test, how='left', on=['ip', 'app', 'device', 'os', 'channel', 'click_time'], suffixes=['', '_submission'])" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 7, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# remove extra duplicates\n", 76 | "# note: pandas consider duplicates rows that are identical even if there are nans in some columns\n", 77 | "duplicated_idx = df_test_joined.duplicated(subset=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id_submission'], keep='first')\n", 78 | "df_test_joined_dedup = df_test_joined[(~ duplicated_idx) | (df_test_joined['click_id_submission'].isnull())]" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 8, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# check that all subsmission clicks are preserved after join and remove of extra duplicates\n", 88 | "df_test_joined_dedup['click_id_submission'].value_counts().sum()\n", 89 | "assert df_test.shape[0] == df_test_joined_dedup['click_id_submission'].value_counts().shape[0]\n", 90 | "assert df_test.shape[0] == df_test_joined_dedup['click_id_submission'].value_counts().sum()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 9, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "33\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "# note: a small number of events from test_supplement is lost after deduplications. 
\n", 108 | "# Assuming these were events in test_supplement.csv which were not present in test.csv but still were duplicates of events from test.csv\n", 109 | "print(df_test_supplement.shape[0] - df_test_joined_dedup.shape[0])" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 10, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "del(df_test_supplement)\n", 119 | "del(df_test)\n", 120 | "del(df_test_joined)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 11, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "df_test_joined_dedup.sort_values(by=['click_time', 'ip', 'app', 'device', 'os', 'channel'], inplace=True)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 16, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "df_test_joined_dedup['click_id_submission'] = df_test_joined_dedup['click_id_submission'].fillna(value=-1).astype(int)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 21, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "df_test_joined_dedup.to_csv(TEST_JOINED_FILE, index=False)" 148 | ] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": "Python [conda env:kaggle-talking-data]", 154 | "language": "python", 155 | "name": "conda-env-kaggle-talking-data-py" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3.0 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": "text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.6.4" 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 0 172 | } --------------------------------------------------------------------------------