├── 0_prepare_splits.py ├── 1_svm_data.py ├── 1_train_ftrl.py ├── 1_train_svm.py ├── 2_extract_leaked_docs.py ├── 2_leak_features.py ├── 3_doc_similarity_features.py ├── 4_categorical_data_join.py ├── 4_categorical_data_unwrap_columnwise.py ├── 4_mean_target_value.py ├── 5_best_mtv_features_xgb.py ├── 5_mtv_et.py ├── 5_mtv_xgb.py ├── 6_1_generate_ffm_data.py ├── 6_2_split_ffm_to_subfolds.py ├── 6_3_run_ffm.sh ├── 6_4_put_ffm_subfolds_together.py ├── 7_ensemble_data_prep.py ├── 7_ensemble_xgb.py ├── README.md ├── categorical_features.txt ├── ftrl.py ├── mapk.R ├── ml_metrics_auc.py └── submission.R /0_prepare_splits.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | import feather 5 | 6 | # prepare train split 7 | df_all = pd.read_csv("../data/clicks_train.csv") 8 | df_all.display_id = df_all.display_id.astype('uint32') 9 | df_all.ad_id = df_all.ad_id.astype('uint32') 10 | df_all.clicked = df_all.clicked.astype('uint8') 11 | 12 | ids = df_all.display_id.unique() 13 | np.random.seed(1) 14 | np.random.shuffle(ids) 15 | 16 | val_size = int(len(ids) * 0.5) 17 | val_display_ids = set(ids[:val_size]) 18 | 19 | df_all['fold'] = 0 20 | 21 | is_val = df_all.display_id.isin(val_display_ids) 22 | df_all.loc[is_val, 'fold'] = 1 23 | df_all.fold = df_all.fold.astype('uint8') 24 | 25 | feather.write_dataframe(df_all, 'tmp/clicks_train_50_50.feather') 26 | 27 | 28 | # prepare test data 29 | 30 | df_test = pd.read_csv("../data/clicks_test.csv") 31 | df_test.display_id = df_test.display_id.astype('uint32') 32 | df_test.ad_id = df_test.ad_id.astype('uint32') 33 | 34 | feather.write_dataframe(df_test, 'tmp/clicks_test.feather') -------------------------------------------------------------------------------- /1_svm_data.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | from time import time 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from tqdm import tqdm 10 | import feather 11 | 12 | 13 | # events data 14 | 15 | def paths(tokens): 16 | all_paths = ['_'.join(tokens[0:(i+1)]) for i in range(len(tokens))] 17 | return ' '.join(all_paths) 18 | 19 | def unwrap_geo(geo): 20 | geo = geo.split('>') 21 | return paths(geo) 22 | 23 | 24 | df_events = pd.read_csv("../data/events.csv") 25 | df_events.geo_location.fillna('', inplace=1) 26 | 27 | geo_str = df_events.geo_location.apply(unwrap_geo) 28 | 29 | 30 | ts = (df_events.timestamp + 1465876799998) / 1000 31 | df_events.timestamp = pd.to_datetime(ts, unit='s') 32 | 33 | 34 | dt = df_events.timestamp.dt 35 | 36 | dow = 'dow_' + dt.dayofweek.astype('str') 37 | hours = 'hour_' + dt.hour.astype('str') 38 | dow_hour = 'dow_hour_' + dt.dayofweek.astype('str') + '_' + dt.hour.astype('str') 39 | 40 | display_str = 'u_' + df_events.uuid + ' ' + \ 41 | 'd_' + df_events.document_id.astype('str') + ' ' + \ 42 | 'p_' + df_events.platform.astype('str') + ' ' + \ 43 | dow + ' ' + hours + ' ' + dow_hour + ' ' + \ 44 | geo_str 45 | 46 | df_events_processed = pd.DataFrame() 47 | df_events_processed['display_id'] = df_events.display_id 48 | df_events_processed['display_str'] = display_str 49 | 50 | 51 | # ad documents data 52 | 53 | df_promoted = pd.read_csv("../data/promoted_content.csv") 54 | 55 | ad_string = 'addoc_' + df_promoted.document_id.astype('str') + ' ' \ 56 | 'campaign_' + df_promoted.campaign_id.astype('str') + ' ' \ 57 | 'adv_' + df_promoted.advertiser_id.astype('str') 58 | 59 | 
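# keep only ad_id and its token string; these ad tokens are later concatenated with the display tokens into one text document per (display_id, ad_id) pair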
df_promoted_processed = pd.DataFrame() 60 | df_promoted_processed['ad_id'] = df_promoted.ad_id 61 | df_promoted_processed['promoted_ad_str'] = ad_string 62 | 63 | 64 | ad_to_idx = dict(zip(df_promoted_processed.ad_id, df_promoted_processed.index)) 65 | 66 | 67 | # processing data in batches 68 | 69 | def prepare_batch(batch): 70 | batch = batch.reset_index(drop=1) 71 | 72 | promoted_idx = batch.ad_id.apply(ad_to_idx.get) 73 | promoted_ad_str = df_promoted_processed.promoted_ad_str.iloc[promoted_idx] 74 | 75 | display_str = df_events_processed.display_str.iloc[batch.display_id - 1] 76 | 77 | promoted_ad_str.reset_index(drop=1, inplace=1) 78 | display_str.reset_index(drop=1, inplace=1) 79 | 80 | batch['ad_display_str'] = promoted_ad_str + ' ' + display_str 81 | return batch 82 | 83 | 84 | def append_to_csv(batch, csv_file): 85 | props = dict(encoding='utf-8', index=False) 86 | if not os.path.exists(csv_file): 87 | batch.to_csv(csv_file, **props) 88 | else: 89 | batch.to_csv(csv_file, mode='a', header=False, **props) 90 | 91 | def delete_file_if_exists(filename): 92 | if os.path.exists(filename): 93 | os.remove(filename) 94 | 95 | def chunk_dataframe(df, n): 96 | for i in range(0, len(df), n): 97 | yield df.iloc[i:i+n] 98 | 99 | 100 | # preparing data for train & test 101 | 102 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 103 | 104 | delete_file_if_exists('tmp/svm_features_train.csv') 105 | 106 | for batch in tqdm(chunk_dataframe(df_all, n=1000000)): 107 | batch = prepare_batch(batch) 108 | append_to_csv(batch, 'tmp/svm_features_train.csv') 109 | 110 | 111 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 112 | 113 | delete_file_if_exists('tmp/svm_features_test.csv') 114 | 115 | for batch in tqdm(chunk_dataframe(df_test, n=1000000)): 116 | batch = prepare_batch(batch) 117 | append_to_csv(batch, 'tmp/svm_features_test.csv') 118 | -------------------------------------------------------------------------------- /1_train_ftrl.py: -------------------------------------------------------------------------------- 1 | # use pypy for running this script 2 | 3 | import re 4 | from time import time 5 | from csv import DictReader 6 | from time import time 7 | 8 | import ftrl 9 | from ml_metrics_auc import auc 10 | 11 | spaces = re.compile(r' +') 12 | 13 | 14 | # model parameters 15 | 16 | alpha = 0.1 17 | beta = 0.0 18 | L1 = 2.0 19 | L2 = 0.0 20 | 21 | D = 2 ** 25 22 | 23 | interactions = True 24 | n_epochs = 1 25 | show_auc = False 26 | 27 | models = {} 28 | models['0'] = ftrl.FtrlProximal(alpha, beta, L1, L2, D, interactions) 29 | models['1'] = ftrl.FtrlProximal(alpha, beta, L1, L2, D, interactions) 30 | model_full = ftrl.FtrlProximal(alpha, beta, L1, L2, D, interactions) 31 | 32 | 33 | # training the models 34 | 35 | 36 | t0 = time() 37 | 38 | print('trainning models...') 39 | 40 | for i in range(n_epochs): 41 | print('epoch %d...' 
% i) 42 | 43 | with open('tmp/svm_features_train.csv', 'r') as f: 44 | reader = DictReader(f) 45 | 46 | cnt = 0 47 | for row in reader: 48 | y = int(row['clicked']) 49 | 50 | x = spaces.split(row['ad_display_str'].strip()) 51 | 52 | if row['fold'] == '0': 53 | fold = '1' 54 | else: # '1' 55 | fold = '0' 56 | 57 | models[fold].fit(x, y) 58 | model_full.fit(x, y) 59 | 60 | cnt = cnt + 1 61 | if cnt % 1000000 == 0: 62 | print('processed %dth row' % cnt) 63 | 64 | 65 | print('training took %0.3fm' % ((time() - t0) / 60)) 66 | 67 | 68 | # validation and oof prediction 69 | 70 | print('validating models...') 71 | 72 | t0 = time() 73 | 74 | all_y = {'0': [], '1': []} 75 | all_pred = {'0': [], '1': []} 76 | 77 | f_pred = {} 78 | f_pred['0'] = open('predictions/ftrl_pred_0.txt', 'w') 79 | f_pred['0'].write('y_actual,y_pred\n') 80 | 81 | f_pred['1'] = open('predictions/ftrl_pred_1.txt', 'w') 82 | f_pred['1'].write('y_actual,y_pred\n') 83 | 84 | with open('tmp/svm_features_train.csv', 'r') as f: 85 | reader = DictReader(f) 86 | 87 | cnt = 0 88 | for row in reader: 89 | y = int(row['clicked']) 90 | fold = row['fold'] 91 | 92 | x = spaces.split(row['ad_display_str'].strip()) 93 | y_pred = models[fold].predict(x) 94 | 95 | all_y[fold].append(y) 96 | all_pred[fold].append(y_pred) 97 | f_pred[fold].write('%s,%s\n' % (y, y_pred)) 98 | 99 | cnt = cnt + 1 100 | if cnt % 1000000 == 0: 101 | print('processed %dth row' % cnt) 102 | if show_auc and cnt % 5000000 == 0: 103 | auc0 = auc(all_y['0'], all_pred['0']) 104 | auc1 = auc(all_y['1'], all_pred['1']) 105 | print('auc: %.4f, %.4f' % (auc0, auc1)) 106 | 107 | auc0 = auc(all_y['0'], all_pred['0']) 108 | auc1 = auc(all_y['1'], all_pred['1']) 109 | print('final auc: %.4f, %.4f' % (auc0, auc1)) 110 | 111 | f_pred['0'].close() 112 | f_pred['1'].close() 113 | 114 | print('predict took %0.3fm' % ((time() - t0) / 60)) 115 | del all_y, all_pred 116 | 117 | 118 | # predicting the results on test 119 | 120 | print('applying the model to the test data...') 121 | 122 | t0 = time() 123 | 124 | f_pred = open('predictions/ftrl_pred_test.txt', 'w') 125 | f_pred.write('y_pred\n') 126 | 127 | with open('tmp/svm_features_test.csv', 'r') as f: 128 | reader = DictReader(f) 129 | 130 | cnt = 0 131 | for row in reader: 132 | x = spaces.split(row['ad_display_str'].strip()) 133 | y_pred = model_full.predict(x) 134 | f_pred.write('%s\n' % y_pred) 135 | 136 | cnt = cnt + 1 137 | if cnt % 1000000 == 0: 138 | print('processed %dth row' % cnt) 139 | 140 | f_pred.close() 141 | 142 | print('predict took %0.3fm' % ((time() - t0) / 60)) 143 | -------------------------------------------------------------------------------- /1_train_svm.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from time import time 4 | 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from sklearn.feature_extraction.text import HashingVectorizer 9 | from sklearn.svm import LinearSVC 10 | from sklearn.metrics import roc_auc_score 11 | 12 | 13 | # building the data for train 14 | 15 | df_all = pd.read_csv('tmp/svm_features_train.csv') 16 | 17 | text_vec = HashingVectorizer(dtype=np.uint8, n_features=10000000, norm=None, 18 | lowercase=False, binary=True, token_pattern='\\S+', 19 | non_negative=True) 20 | 21 | t0 = time() 22 | X = text_vec.transform(df_all.ad_display_str) 23 | 24 | print('building the train matrix took %.4fm' % (time() - t0) / 60) 25 | 26 | 27 | fold = df_all.fold.values 28 | 29 | X_0 = X[fold == 0] 30 | X_1 = X[fold == 1] 31 | 32 | y = 
df_all.clicked.values 33 | y_0 = y[fold == 0] 34 | y_1 = y[fold == 1] 35 | 36 | 37 | # fitting the model for fold 1 38 | 39 | C = 0.1 40 | 41 | t0 = time() 42 | 43 | model_1 = LinearSVC(penalty='l1', dual=False, C=C, random_state=1) 44 | model_1.fit(X_0, y_0) 45 | 46 | y_pred = model_1.decision_function(X_1) 47 | auc = roc_auc_score(y_1, y_pred) 48 | 49 | np.save('predictions/svm_1_preds.npy', y_pred) 50 | 51 | print('C=%s, took %.3fs, auc=%.3f' % (C, time() - t0, auc)) 52 | 53 | 54 | # fitting the model for fold 0 55 | 56 | t0 = time() 57 | 58 | model_0 = LinearSVC(penalty='l1', dual=False, C=C, random_state=1) 59 | model_0.fit(X_1, y_1) 60 | 61 | y_pred = model_0.decision_function(X_0) 62 | auc = roc_auc_score(y_0, y_pred) 63 | 64 | np.save('predictions/svm_0_preds.npy', y_pred) 65 | 66 | print('C=%s, took %.3fs, auc=%.3f' % (C, time() - t0, auc)) 67 | 68 | 69 | # predictions for test 70 | 71 | df_test = pd.read_csv('tmp/svm_features_test.csv') 72 | 73 | t0 = time() 74 | X_test = text_vec.transform(df_test.ad_display_str) 75 | 76 | print('building the test matrix took %.4fm' % ((time() - t0) / 60)) 77 | 78 | pred_0 = model_0.decision_function(X_test) 79 | pred_1 = model_1.decision_function(X_test) 80 | pred_final = (pred_0 + pred_1) / 2 81 | 82 | np.save('predictions/svm_test_preds.npy', pred_final) -------------------------------------------------------------------------------- /2_extract_leaked_docs.py: -------------------------------------------------------------------------------- 1 | # run it with pypy 2 | # taken from 3 | # https://www.kaggle.com/jiweiliu/outbrain-click-prediction/extract-leak-in-30-mins-with-small-memory 4 | 5 | import csv 6 | import os 7 | 8 | leak = {} 9 | 10 | with open('../data/promoted_content.csv') as f: 11 | promoted = csv.DictReader(f) 12 | for c, row in enumerate(promoted): 13 | if row['document_id'] != '': 14 | leak[row['document_id']] = 1 15 | 16 | 17 | with open('../data/page_views.csv') as f: 18 | page_views = csv.DictReader(f) 19 | for c, row in enumerate(page_views): 20 | if c % 1000000 == 0: 21 | print c 22 | 23 | doc_id = row['document_id'] 24 | 25 | if doc_id not in leak: 26 | continue 27 | 28 | if leak[doc_id] == 1: 29 | leak[doc_id] = set() 30 | 31 | lu = len(leak[doc_id]) 32 | leak[doc_id].add(row['uuid']) 33 | 34 | 35 | with open('tmp/leaked_docs.csv', 'w') as fo: 36 | fo.write('document_id,uuids\n') 37 | for k, v in leak.items(): 38 | if v == 1: 39 | continue 40 | 41 | fo.write('%s,%s\n' % (k, ' '.join(v))) 42 | -------------------------------------------------------------------------------- /2_leak_features.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import feather 4 | import sys 5 | 6 | import csv 7 | csv.field_size_limit(sys.maxsize) 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | # reading the leaked documents 13 | 14 | docs_size = {} 15 | leak_uuid_dict = {} 16 | 17 | with open("tmp/leaked_docs.csv") as f: 18 | reader = csv.DictReader(f) 19 | leak_uuid_dict = {} 20 | 21 | for row in reader: 22 | doc_id = int(row['document_id']) 23 | uuids = row['uuids'].split(' ') 24 | leak_uuid_dict[doc_id] = set(uuids) 25 | docs_size[doc_id] = len(uuids) 26 | 27 | 28 | # 29 | 30 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 31 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 32 | 33 | 34 | # getting user ids and document ids 35 | 36 | df_events = pd.read_csv('../data/events.csv', usecols=['uuid']) 37 | df_ads = pd.read_csv('../data/promoted_content.csv', 38 | 
usecols=['ad_id', 'document_id']) 39 | 40 | # joining doc_id and ad_id 41 | 42 | ad_to_idx = dict(zip(df_ads.ad_id, df_ads.index)) 43 | 44 | ad_idx = df_all.ad_id.apply(ad_to_idx.get) 45 | ad_document_id = df_ads.document_id.iloc[ad_idx].reset_index(drop=1) 46 | df_all['ad_document_id'] = ad_document_id 47 | 48 | ad_idx = df_test.ad_id.apply(ad_to_idx.get) 49 | ad_document_id = df_ads.document_id.iloc[ad_idx].reset_index(drop=1) 50 | df_test['ad_document_id'] = ad_document_id 51 | 52 | # joining display_id and user 53 | 54 | df_all['uuid'] = df_events.iloc[df_all.display_id - 1].reset_index(drop=1) 55 | df_test['uuid'] = df_events.iloc[df_test.display_id - 1].reset_index(drop=1) 56 | 57 | 58 | # extracting the leak 59 | 60 | def is_leak(doc_id, uuid): 61 | if doc_id in leak_uuid_dict: 62 | if uuid in leak_uuid_dict[doc_id]: 63 | return 1 64 | return 0 65 | 66 | df_all['leak'] = df_all.ad_document_id.combine(df_all.uuid, is_leak) 67 | df_test['leak'] = df_test.ad_document_id.combine(df_test.uuid, is_leak) 68 | 69 | df_all['doc_known_views'] = df_all.ad_document_id.apply(lambda d: docs_size.get(d, 0)) 70 | df_test['doc_known_views'] = df_test.ad_document_id.apply(lambda d: docs_size.get(d, 0)) 71 | 72 | df_train_0 = df_all[df_all.fold == 0] 73 | df_train_1 = df_all[df_all.fold == 1] 74 | 75 | np.save('features/leak_0.npy', df_train_0.leak.values) 76 | np.save('features/leak_1.npy', df_train_1.leak.values) 77 | np.save('features/leak_test.npy', df_test.leak.values) 78 | 79 | np.save('features/doc_known_views_0.npy', df_train_0.doc_known_views.values) 80 | np.save('features/doc_known_views_1.npy', df_train_1.doc_known_views.values) 81 | np.save('features/doc_known_views_test.npy', df_test.doc_known_views.values) -------------------------------------------------------------------------------- /3_doc_similarity_features.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import csv 4 | from tqdm import tqdm 5 | from collections import defaultdict, Counter 6 | from math import log 7 | 8 | import numpy as np 9 | 10 | from sklearn.feature_extraction import DictVectorizer 11 | from sklearn.preprocessing import normalize 12 | from sklearn.decomposition import TruncatedSVD 13 | 14 | import feather 15 | import os 16 | 17 | 18 | # display_id to document_id mapping 19 | 20 | display_doc_ids = [] 21 | 22 | with open('../data/events.csv') as f: 23 | reader = csv.DictReader(f) 24 | 25 | for row in tqdm(reader): 26 | doc_id = int(row['document_id']) 27 | display_doc_ids.append(doc_id) 28 | 29 | 30 | # ad_id to document_id mapping 31 | 32 | ad_doc_id = {} 33 | 34 | with open('../data/promoted_content.csv') as f: 35 | reader = csv.DictReader(f) 36 | 37 | for row in tqdm(reader): 38 | ad_id = int(row['ad_id']) 39 | doc_id = int(row['document_id']) 40 | ad_doc_id[ad_id] = doc_id 41 | 42 | 43 | 44 | # reading document data 45 | 46 | categories = defaultdict(list) 47 | 48 | with open('../data/documents_categories.csv') as f: 49 | reader = csv.DictReader(f) 50 | 51 | for row in tqdm(reader): 52 | doc_id = int(row['document_id']) 53 | cat = 'cat_' + row['category_id'] 54 | conf = float(row['confidence_level']) 55 | categories[doc_id].append((cat, conf)) 56 | 57 | entities = defaultdict(list) 58 | 59 | with open('../data/documents_entities.csv') as f: 60 | reader = csv.DictReader(f) 61 | 62 | for row in tqdm(reader): 63 | doc_id = int(row['document_id']) 64 | en = 'entity_' + row['entity_id'] 65 | conf = float(row['confidence_level']) 66 | 
entities[doc_id].append((en, conf)) 67 | 68 | topics = defaultdict(list) 69 | 70 | with open('../data/documents_topics.csv') as f: 71 | reader = csv.DictReader(f) 72 | 73 | for row in tqdm(reader): 74 | doc_id = int(row['document_id']) 75 | t = 'topic_' + row['topic_id'] 76 | conf = float(row['confidence_level']) 77 | topics[doc_id].append((t, conf)) 78 | 79 | 80 | 81 | doc_ids = [] 82 | doc_values = [] 83 | values_cnt = Counter() 84 | 85 | with open('../data/documents_meta.csv') as f: 86 | reader = csv.DictReader(f) 87 | 88 | for row in tqdm(reader): 89 | doc_id = int(row['document_id']) 90 | 91 | source = 'src_' + row['source_id'] 92 | if not source: 93 | source = 'src_unk' 94 | 95 | publisher = 'pub_' + row['publisher_id'] 96 | if not publisher: 97 | publisher = 'pub_unk' 98 | 99 | doc_vector = [(source, 1.0), (publisher, 1.0)] 100 | doc_vector.extend(categories[doc_id]) 101 | doc_vector.extend(entities[doc_id]) 102 | doc_vector.extend(topics[doc_id]) 103 | 104 | doc_ids.append(doc_id) 105 | doc_values.append(dict(doc_vector)) 106 | 107 | values_cnt.update([n for (n, _) in doc_vector]) 108 | 109 | 110 | doc_id_to_idx = {d: i for (i, d) in enumerate(doc_ids)} 111 | 112 | 113 | # discard infrequent and calculate idf 114 | 115 | min_df = 5 116 | freq = {t for (t, c) in values_cnt.items() if c >= min_df} 117 | 118 | N = len(doc_ids) 119 | log_N = log(N) 120 | 121 | idf = {k: log_N - log(v) for (k, v) in values_cnt.items() if k in freq} 122 | 123 | 124 | def discard_infreq(in_dict): 125 | return {k: w for (k, w) in in_dict.items() if k in freq} 126 | 127 | def idf_transform(in_dict): 128 | return {k: w * idf[k] for (k, w) in in_dict.items()} 129 | 130 | doc_values = [discard_infreq(d) for d in doc_values] 131 | idf_doc_values = [idf_transform(d) for d in doc_values] 132 | 133 | 134 | # vectorizing the documents 135 | 136 | dv = DictVectorizer(dtype=np.float32) 137 | X_idf = dv.fit_transform(idf_doc_values) 138 | 139 | del dv 140 | del values_cnt, idf, freq, doc_values, idf_doc_values 141 | del categories, entities, topics, doc_ids, doc_values 142 | 143 | 144 | # lsi 145 | 146 | svd_idf = TruncatedSVD(n_components=150, random_state=1) 147 | svd_idf.fit(X_idf) 148 | 149 | 150 | # processing data in batches 151 | 152 | def append_to_csv(batch, csv_file): 153 | props = dict(encoding='utf-8', index=False) 154 | if not os.path.exists(csv_file): 155 | batch.to_csv(csv_file, **props) 156 | else: 157 | batch.to_csv(csv_file, mode='a', header=False, **props) 158 | 159 | def delete_file_if_exists(filename): 160 | if os.path.exists(filename): 161 | os.remove(filename) 162 | 163 | def chunk_dataframe(df, n): 164 | for i in range(0, len(df), n): 165 | yield df.iloc[i:i+n] 166 | 167 | 168 | def prepare_batch(batch): 169 | batch = batch.reset_index(drop=1) 170 | 171 | display_docs = (batch.display_id - 1).apply(display_doc_ids.__getitem__) 172 | display_docs_idx = display_docs.apply(doc_id_to_idx.get) 173 | 174 | ad_docs = batch.ad_id.apply(ad_doc_id.get) 175 | ad_docs_idx = ad_docs.apply(doc_id_to_idx.get) 176 | 177 | X1 = X_idf[display_docs_idx.values] 178 | X2 = X_idf[ad_docs_idx.values] 179 | 180 | dot = X1.multiply(X2).sum(axis=1) 181 | batch['doc_idf_dot'] = np.asarray(dot).reshape(-1) 182 | 183 | X1_svd = svd_idf.transform(X1) 184 | X2_svd = svd_idf.transform(X2) 185 | 186 | batch['doc_idf_dot_lsa'] = (X1_svd * X2_svd).sum(axis=1) 187 | 188 | X1 = normalize(X1.astype(np.float)) 189 | X2 = normalize(X2.astype(np.float)) 190 | 191 | dot = X1.multiply(X2).sum(axis=1) 192 | batch['doc_idf_cos'] = 
np.asarray(dot).reshape(-1) 193 | 194 | return batch 195 | 196 | 197 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 198 | 199 | delete_file_if_exists('tmp/doc_features_train.csv') 200 | 201 | for batch in tqdm(chunk_dataframe(df_all, n=1000000)): 202 | batch = prepare_batch(batch) 203 | append_to_csv(batch, 'tmp/doc_features_train.csv') 204 | 205 | del df_all 206 | 207 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 208 | 209 | delete_file_if_exists('tmp/doc_features_test.csv') 210 | 211 | for batch in tqdm(chunk_dataframe(df_test, n=1000000)): 212 | batch = prepare_batch(batch) 213 | append_to_csv(batch, 'tmp/doc_features_test.csv') 214 | 215 | del df_test 216 | 217 | del svd_idf, X_idf 218 | 219 | 220 | # now processing the features and saving them as feather files 221 | 222 | types = dict(display_id='uint32', ad_id='uint32', clicked='uint8', fold='uint8', 223 | doc_idf_dot='float32', doc_idf_dot_lsa='float32', doc_idf_cos='float32') 224 | df_all = pd.read_csv('tmp/doc_features_train.csv', dtype=types) 225 | 226 | del types['clicked'], types['fold'] 227 | df_test = pd.read_csv('tmp/doc_features_test.csv', dtype=types) 228 | 229 | 230 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 231 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 232 | del df_train_0['fold'], df_train_1['fold'], df_all 233 | 234 | cols_to_rank = ['doc_idf_dot', 'doc_idf_dot_lsa', 'doc_idf_cos'] 235 | 236 | for f in tqdm(cols_to_rank): 237 | for df in [df_train_0, df_train_1, df_test]: 238 | df['%s_rank' % f] = df.groupby('display_id')[f].rank(ascending=0) 239 | df['%s_rank' % f] = df['%s_rank' % f].astype('uint8') 240 | 241 | 242 | feather.write_dataframe(df_train_0, 'features/docs_df_train_0.feather') 243 | feather.write_dataframe(df_train_1, 'features/docs_df_train_1.doc.feather') 244 | feather.write_dataframe(df_test, 'features/docs_df_test.feather') -------------------------------------------------------------------------------- /4_categorical_data_join.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import xgboost as xgb 8 | import feather 9 | from tqdm import tqdm 10 | 11 | from sklearn.preprocessing import LabelEncoder 12 | 13 | from itertools import combinations 14 | 15 | 16 | 17 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 18 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 19 | 20 | 21 | # event features: 22 | # - geo 23 | # - time 24 | # - user 25 | # - platform 26 | 27 | 28 | df_display = pd.read_csv('../data/events.csv') 29 | df_display.geo_location.fillna('', inplace=1) 30 | 31 | # geo features 32 | 33 | df_geo = df_display.geo_location.str.split('>', expand=True) 34 | df_geo.fillna('*', inplace=1) 35 | df_geo.columns = ['geo_0', 'geo_1', 'geo_2'] 36 | del df_geo['geo_2'] 37 | df_geo['geo_second_lev'] = df_geo['geo_0'] + '>' + df_geo['geo_1'] 38 | del df_geo['geo_1'] 39 | 40 | df_display['geo_0'] = df_geo['geo_0'] 41 | df_display['geo_1'] = df_geo['geo_second_lev'] 42 | df_display.rename(columns={'geo_location': 'geo_2'}, inplace=1) 43 | del df_geo 44 | 45 | # time features 46 | 47 | ts = (df_display.timestamp + 1465876799998) / 1000 - (4 * 60 * 60) 48 | df_display.timestamp = pd.to_datetime(ts, unit='s') 49 | 50 | dt = df_display.timestamp.dt 51 | df_display['day'] = dt.dayofweek.astype('str') 52 | df_display['hour'] = dt.hour.astype('str') 53 | 54 | del df_display['timestamp'], dt, 
ts 55 | 56 | # platform 57 | 58 | df_display.platform = df_display.platform.astype('str') 59 | del df_display['display_id'] 60 | 61 | 62 | # user: convert to base 32 to occupy less space 63 | 64 | df_display['user_id'] = LabelEncoder().fit_transform(df_display.uuid) 65 | del df_display['uuid'] 66 | 67 | def base32(i): 68 | return np.base_repr(i, base=32) 69 | 70 | df_display['user_id'] = df_display['user_id'].apply(base32) 71 | 72 | 73 | 74 | # document features: 75 | # - top category 76 | # - top entity 77 | # - top topic 78 | # - meta: publisher, source 79 | 80 | df_ads = pd.read_csv('../data/promoted_content.csv') 81 | ad_to_idx = dict(zip(df_ads.ad_id, df_ads.index)) 82 | 83 | ads_docs = set(df_display.document_id) 84 | ads_docs.update(df_ads.document_id) 85 | 86 | 87 | # document categories 88 | 89 | df_doc_cat = pd.read_csv('../data/documents_categories.csv') 90 | 91 | df_doc_cat = df_doc_cat.drop_duplicates(subset='document_id', keep='first') 92 | df_doc_cat = df_doc_cat[df_doc_cat.confidence_level >= 0.8] 93 | df_doc_cat = df_doc_cat[df_doc_cat.document_id.isin(ads_docs)] 94 | 95 | cat_counts = df_doc_cat.category_id.value_counts() 96 | freq_cats = set(cat_counts[cat_counts >= 5].index) 97 | 98 | df_doc_cat = df_doc_cat[df_doc_cat.category_id.isin(freq_cats)] 99 | 100 | doc_top_cat = dict(zip(df_doc_cat.document_id, df_doc_cat.category_id)) 101 | del freq_cats, cat_counts, df_doc_cat 102 | 103 | 104 | # document entities: hash them to occupy less space 105 | 106 | D = 2 ** 24 107 | def entity_name_reduce(entity): 108 | return '%x' % abs(hash(entity) % D) 109 | 110 | 111 | df_doc_entities = pd.read_csv('../data/documents_entities.csv') 112 | 113 | df_doc_entities = df_doc_entities[df_doc_entities.confidence_level >= 0.8] 114 | df_doc_entities = df_doc_entities[df_doc_entities.document_id.isin(ads_docs)] 115 | 116 | df_doc_entities = df_doc_entities.drop_duplicates(subset='document_id', keep='first') 117 | df_doc_entities = df_doc_entities.reset_index(drop=1) 118 | 119 | df_doc_entities.entity_id = df_doc_entities.entity_id.apply(entity_name_reduce) 120 | 121 | entity_counts = df_doc_entities.entity_id.value_counts() 122 | freq_entites = set(entity_counts[entity_counts >= 5].index) 123 | df_doc_entities = df_doc_entities[df_doc_entities.entity_id.isin(freq_entites)] 124 | 125 | doc_top_entity = dict(zip(df_doc_entities.document_id, df_doc_entities.entity_id)) 126 | 127 | del df_doc_entities, entity_counts, freq_entites 128 | 129 | 130 | # document topics 131 | 132 | df_doc_topics = pd.read_csv('../data/documents_topics.csv') 133 | 134 | df_doc_topics = df_doc_topics[df_doc_topics.confidence_level >= 0.8] 135 | df_doc_topics = df_doc_topics[df_doc_topics.document_id.isin(ads_docs)] 136 | 137 | df_doc_topics = df_doc_topics.drop_duplicates(subset='document_id', keep='first') 138 | df_doc_topics = df_doc_topics.reset_index(drop=1) 139 | 140 | topic_cnt = df_doc_topics.topic_id.value_counts() 141 | freq_topics = set(topic_cnt[topic_cnt >= 5].index) 142 | 143 | df_doc_topics = df_doc_topics[df_doc_topics.topic_id.isin(freq_topics)] 144 | doc_top_topic = dict(zip(df_doc_topics.document_id, df_doc_topics.topic_id)) 145 | 146 | del df_doc_topics, topic_cnt, freq_topics 147 | 148 | 149 | # document meta info 150 | 151 | df_doc_meta = pd.read_csv('../data/documents_meta.csv') 152 | df_doc_meta = df_doc_meta[df_doc_meta.document_id.isin(ads_docs)] 153 | del df_doc_meta['publish_time'] 154 | 155 | df_doc_meta.source_id.fillna(0, inplace=1) 156 | df_doc_meta.source_id = 
df_doc_meta.source_id.astype('uint32') 157 | 158 | df_doc_meta.publisher_id.fillna(0, inplace=1) 159 | df_doc_meta.publisher_id = df_doc_meta.publisher_id.astype('uint32') 160 | 161 | df_doc_meta = df_doc_meta.reset_index(drop=1) 162 | meta_idx = dict(zip(df_doc_meta.document_id, df_doc_meta.index)) 163 | 164 | 165 | 166 | # to avoid confusion, let's rename document_id columns 167 | 168 | df_display.rename(columns={'document_id': 'on_document_id'}, inplace=1) 169 | df_ads.rename(columns={'document_id': 'ad_document_id'}, inplace=1) 170 | 171 | 172 | # we will do everything in batches 173 | def prepare_batch(batch): 174 | batch = batch.reset_index(drop=1) 175 | 176 | batch_display = df_display.iloc[batch.display_id - 1].reset_index(drop=1) 177 | 178 | batch_ad_ids = batch.ad_id.apply(ad_to_idx.get) 179 | batch_ads = df_ads.iloc[batch_ad_ids].reset_index(drop=1) 180 | del batch_ads['ad_id'] 181 | 182 | batch_meta_idx = batch_ads.ad_document_id.apply(meta_idx.get) 183 | batch_ad_doc_meta = df_doc_meta.iloc[batch_meta_idx].reset_index(drop=1) 184 | 185 | batch_ad_doc_meta['top_entity'] = \ 186 | batch_ad_doc_meta.document_id.apply(lambda did: doc_top_entity.get(did, 'unk')) 187 | batch_ad_doc_meta['top_topic'] = \ 188 | batch_ad_doc_meta.document_id.apply(lambda did: doc_top_topic.get(did, 'unk')) 189 | batch_ad_doc_meta['top_cat'] = \ 190 | batch_ad_doc_meta.document_id.apply(lambda did: doc_top_cat.get(did, 'unk')) 191 | 192 | del batch_ad_doc_meta['document_id'] 193 | 194 | batch_ad_doc_meta.columns = ['ad_doc_%s' % c for c in batch_ad_doc_meta.columns] 195 | 196 | batch_meta_idx = batch_display.on_document_id.apply(meta_idx.get) 197 | batch_on_doc_meta = df_doc_meta.iloc[batch_meta_idx].reset_index(drop=1) 198 | 199 | batch_on_doc_meta['top_entity'] = \ 200 | batch_on_doc_meta.document_id.apply(lambda did: doc_top_entity.get(did, 'unk')) 201 | batch_on_doc_meta['top_topic'] = \ 202 | batch_on_doc_meta.document_id.apply(lambda did: doc_top_topic.get(did, 'unk')) 203 | batch_on_doc_meta['top_cat'] = \ 204 | batch_on_doc_meta.document_id.apply(lambda did: doc_top_cat.get(did, 'unk')) 205 | 206 | del batch_on_doc_meta['document_id'] 207 | 208 | batch_on_doc_meta.columns = ['on_doc_%s' % c for c in batch_on_doc_meta.columns] 209 | 210 | joined_batch = pd.concat([batch, batch_ads, batch_display, 211 | batch_ad_doc_meta, batch_on_doc_meta], axis=1) 212 | 213 | for c in ['ad_doc_source_id', 'ad_doc_publisher_id', 'ad_document_id', 'ad_doc_top_cat', 214 | 'on_doc_source_id', 'on_doc_publisher_id', 'on_document_id', 'on_doc_top_cat', 215 | 'ad_id', 'campaign_id', 'advertiser_id']: 216 | joined_batch[c] = joined_batch[c].astype('str') 217 | 218 | joined_batch.fillna('unk', inplace=1) 219 | all_features = set(joined_batch.columns) - {'clicked', 'fold', 'display_id'} 220 | 221 | for c in sorted(all_features): 222 | if 'on_doc' in c or 'geo' in c or c in {'day', 'hour', 'user_id', 'ad_id'}: 223 | continue 224 | 225 | for c2 in ['day', 'hour', 'geo_0', 'geo_1', 'geo_2']: 226 | joined_batch['%s_%s' % (c, c2)] = joined_batch[c] + '_' + joined_batch[c2] 227 | 228 | two_way_comb = sorted(all_features - {'day', 'hour', 'geo_0', 'geo_1', 'geo_2'}) 229 | 230 | combs = list(combinations(two_way_comb, 2)) 231 | 232 | for c1, c2 in combs: 233 | if 'on_doc' in c1 and 'on_doc' in c2: 234 | continue 235 | joined_batch['%s_%s' % (c1, c2)] = joined_batch[c1].astype('str') + '_' + joined_batch[c2].astype('str') 236 | 237 | return joined_batch 238 | 239 | 240 | 241 | def append_to_csv(batch, csv_file): 242 | props = 
dict(encoding='utf-8', index=False) 243 | if not os.path.exists(csv_file): 244 | batch.to_csv(csv_file, **props) 245 | else: 246 | batch.to_csv(csv_file, mode='a', header=False, **props) 247 | 248 | def delete_file_if_exists(filename): 249 | if os.path.exists(filename): 250 | os.remove(filename) 251 | 252 | def chunk_dataframe(df, n): 253 | for i in range(0, len(df), n): 254 | yield df.iloc[i:i+n] 255 | 256 | 257 | # apply to train 258 | 259 | df = feather.read_dataframe('tmp/clicks_train_50_50.feather') 260 | 261 | delete_file_if_exists('tmp/categorical_joined_train.csv') 262 | 263 | for batch in tqdm(chunk_dataframe(df, n=100000)): 264 | batch = prepare_batch(batch) 265 | append_to_csv(batch, 'tmp/categorical_joined_train.csv') 266 | 267 | 268 | # apply to test 269 | 270 | df = feather.read_dataframe('tmp/clicks_test.feather') 271 | 272 | delete_file_if_exists('tmp/categorical_joined_test.csv') 273 | 274 | for batch in tqdm(chunk_dataframe(df, n=100000)): 275 | batch = prepare_batch(batch) 276 | append_to_csv(batch, 'tmp/categorical_joined_test.csv') 277 | -------------------------------------------------------------------------------- /4_categorical_data_unwrap_columnwise.py: -------------------------------------------------------------------------------- 1 | # run with pypy 2 | 3 | from tqdm import tqdm 4 | import csv 5 | 6 | train_file = 'tmp/categorical_joined_train.csv' 7 | test_file = 'tmp/categorical_joined_test.csv' 8 | 9 | 10 | def copy_columnwise(filename, result_dir): 11 | with open(filename) as f: 12 | reader = csv.DictReader(f) 13 | files = {} 14 | 15 | for f in reader.fieldnames: 16 | files[f] = open(result_dir + '/' + f + '.txt', 'w') 17 | 18 | for row in tqdm(reader): 19 | for k, v in row.items(): 20 | files[k].write(v + '\n') 21 | 22 | for f in files.values(): 23 | f.flush() 24 | f.close() 25 | 26 | print('copy train...') 27 | copy_columnwise(train_file, 'tmp/categorical/train') 28 | 29 | print('copy test...') 30 | copy_columnwise(test_file, 'tmp/categorical/test') 31 | -------------------------------------------------------------------------------- /4_mean_target_value.py: -------------------------------------------------------------------------------- 1 | # run with cat categorical_features.txt | parallel --jobs 6 python 04_mean_target_value.py {} 2 | # coding: utf-8 3 | 4 | import sys 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from time import time 10 | import feather 11 | 12 | column = sys.argv[1] 13 | print('processing column %s...' 
% column) 14 | 15 | C = 12 16 | 17 | 18 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 19 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 20 | 21 | train_col = pd.read_csv('tmp/categorical/train/' + column + '.txt', header=None, dtype='str') 22 | df_all[column] = train_col[0] 23 | 24 | test_col = pd.read_csv('tmp/categorical/test/' + column + '.txt', header=None, dtype='str') 25 | df_test[column] = test_col[0] 26 | 27 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 28 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 29 | del df_train_0['fold'], df_train_1['fold'], df_all['fold'] 30 | 31 | 32 | # fold 0 train 33 | 34 | print('training on fold 0, predicting on 1') 35 | 36 | t0 = time() 37 | 38 | m0 = (df_train_0.clicked == 1).mean() 39 | 40 | cnt_clicked_0 = df_train_0[df_train_0.clicked == 1][column].value_counts() 41 | cnt_all_0 = df_train_0[column].value_counts() 42 | 43 | probs_1 = (cnt_clicked_0 + C * m0) / (cnt_all_0 + C) 44 | probs_1 = probs_1[df_train_1[column]].reset_index(drop=1) 45 | probs_1.fillna(m0, inplace=1) 46 | 47 | df_train_1['prob'] = probs_1 48 | 49 | print('took %0.3fs' % (time() - t0)) 50 | 51 | 52 | # fold 1 train 53 | 54 | print('training on fold 1, predicting on 0') 55 | 56 | t0 = time() 57 | 58 | m1 = (df_train_1.clicked == 1).mean() 59 | cnt_clicked_1 = df_train_1[df_train_1.clicked == 1][column].value_counts() 60 | cnt_all_1 = df_train_1[column].value_counts() 61 | 62 | probs_0 = (cnt_clicked_1 + C * m1) / (cnt_all_1 + C) 63 | probs_0 = probs_0[df_train_0[column]].reset_index(drop=1) 64 | probs_0.fillna(m1, inplace=1) 65 | 66 | df_train_0['prob'] = probs_0 67 | 68 | print('took %0.3fs' % (time() - t0)) 69 | 70 | 71 | # full train 72 | 73 | print('training on all data, predicting on test') 74 | 75 | t0 = time() 76 | 77 | m = (df_all.clicked == 1).mean() 78 | cnt_clicked = df_all[df_all.clicked == 1][column].value_counts() 79 | cnt_all = df_all[column].value_counts() 80 | 81 | probs = (cnt_clicked + C * m) / (cnt_all + C) 82 | probs = probs[df_test[column]].reset_index(drop=1) 83 | probs.fillna(m, inplace=1) 84 | 85 | df_test['prob'] = probs 86 | 87 | print('took %0.3fs' % (time() - t0)) 88 | 89 | 90 | # saving the results 91 | 92 | np.save('features/mtv/' + column + '_pred_0.npy', probs_0.values) 93 | np.save('features/mtv/' + column + '_pred_1.npy', probs_1.values) 94 | np.save('features/mtv/' + column + '_pred_test.npy', probs.values) 95 | 96 | 97 | # creating the rank features 98 | 99 | print('creating the ranking features...') 100 | 101 | t0 = time() 102 | 103 | f = column 104 | 105 | for df in [df_train_0, df_train_1, df_test]: 106 | df['%s_rank' % f] = df.groupby('display_id')[f].rank(method='max', ascending=0) 107 | df['%s_rank' % f] = df['%s_rank' % f].astype('uint8') 108 | 109 | print('took %0.3fs' % (time() - t0)) 110 | 111 | 112 | np.save('features/mtv/' + column + '_pred_rank_0.npy', df_train_0['%s_rank' % f].values) 113 | np.save('features/mtv/' + column + '_pred_rank_1.npy', df_train_1['%s_rank' % f].values) 114 | np.save('features/mtv/' + column + '_pred_rank_test.npy', df_test['%s_rank' % f].values) -------------------------------------------------------------------------------- /5_best_mtv_features_xgb.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import xgboost as xgb 6 | 7 | 8 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 9 | df_test = 
feather.read_dataframe('tmp/clicks_test.feather') 10 | 11 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 12 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 13 | del df_train_0['fold'], df_train_1['fold'], df_all 14 | 15 | features = list(pd.read_csv('categorical_features.txt', header=None)[0]) 16 | 17 | 18 | # training a small model to select best features 19 | # first, load the data 20 | 21 | 22 | df_train = df_train_0[:2000000].copy() 23 | df_val = df_train_1[:1000000].copy() 24 | # note: df_train_0 and df_train_1 are reused below, so they must not be deleted here 25 | 26 | for f in features: 27 | print('loading data for %s...' % f) 28 | pred_0 = 'features/mtv/%s_pred_0.npy' % f 29 | pred_1 = 'features/mtv/%s_pred_1.npy' % f 30 | rank_0 = 'features/mtv/%s_pred_rank_0.npy' % f 31 | rank_1 = 'features/mtv/%s_pred_rank_1.npy' % f 32 | 33 | df_train[f] = np.load(pred_0)[:2000000] 34 | df_val[f] = np.load(pred_1)[:1000000] 35 | df_train[f + '_rank'] = np.load(rank_0)[:2000000] 36 | df_val[f + '_rank'] = np.load(rank_1)[:1000000] 37 | 38 | 39 | ignore = {'display_id', 'ad_id', 'clicked'} 40 | columns = sorted(set(df_train.columns) - ignore) 41 | 42 | X_t = df_train[columns].values 43 | y_t = df_train.clicked.values 44 | 45 | X_v = df_val[columns].values 46 | y_v = df_val.clicked.values 47 | 48 | 49 | dtrain = xgb.DMatrix(X_t, y_t, feature_names=columns) 50 | dval = xgb.DMatrix(X_v, y_v, feature_names=columns) 51 | 52 | watchlist = [(dtrain, 'train'), (dval, 'val')] 53 | del X_t, X_v, y_t, y_v 54 | 55 | 56 | # train a small model and save only important features 57 | 58 | xgb_pars = { 59 | 'eta': 0.3, 60 | 'gamma': 0.0, 61 | 'max_depth': 6, 62 | 'min_child_weight': 100, 63 | 'max_delta_step': 0, 64 | 'subsample': 1, 65 | 'colsample_bytree': 0.6, 66 | 'colsample_bylevel': 1, 67 | 'lambda': 1, 68 | 'alpha': 0, 69 | 'tree_method': 'approx', 70 | 'objective': 'binary:logistic', 71 | 'eval_metric': 'auc', 72 | 'nthread': 12, 73 | 'seed': 42, 74 | 'silent': 1 75 | } 76 | 77 | model = xgb.train(xgb_pars, dtrain, num_boost_round=20, verbose_eval=1, 78 | evals=watchlist) 79 | 80 | scores = model.get_score(importance_type='gain') 81 | useful_features = [f for (f, s) in scores.items() if s >= 50.0] 82 | 83 | 84 | # now let's put everything together in a data frame and save the result 85 | 86 | for f in useful_features: 87 | if '_rank' in f: 88 | base_name = f[:-5] + '_pred_rank' 89 | else: 90 | base_name = f + '_pred' 91 | 92 | df_train_0[f] = np.load('features/mtv/%s_0.npy' % base_name) 93 | df_train_1[f] = np.load('features/mtv/%s_1.npy' % base_name) 94 | df_test[f] = np.load('features/mtv/%s_test.npy' % base_name) 95 | 96 | 97 | # also add the doc features 98 | 99 | df_train_0_doc = feather.read_dataframe('features/docs_df_train_0.feather') 100 | df_train_1_doc = feather.read_dataframe('features/docs_df_train_1.feather') 101 | df_test_doc = feather.read_dataframe('features/docs_df_test.feather') 102 | 103 | doc_features = ['doc_idf_dot', 'doc_idf_dot_lsa', 'doc_idf_cos', 104 | 'doc_idf_dot_rank', 'doc_idf_dot_lsa_rank', 'doc_idf_cos_rank'] 105 | 106 | for f in doc_features: 107 | df_train_0[f] = df_train_0_doc[f] 108 | df_train_1[f] = df_train_1_doc[f] 109 | df_test[f] = df_test_doc[f] 110 | 111 | 112 | df_train_0['doc_known_views'] = np.load('features/doc_known_views_0.npy') 113 | df_train_1['doc_known_views'] = np.load('features/doc_known_views_1.npy') 114 | df_test['doc_known_views'] = np.load('features/doc_known_views_test.npy') 115 | 116 | 117 | # now save everything 118 | 119 | feather.write_dataframe(df_train_0, 
'tmp/mtv_df_train_0.feather') 120 | feather.write_dataframe(df_train_1, 'tmp/mtv_df_train_1.feather') 121 | feather.write_dataframe(df_test, 'tmp/mtv_df_test.feather') 122 | -------------------------------------------------------------------------------- /5_mtv_et.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import feather 4 | import gc 5 | 6 | from sklearn.metrics import roc_auc_score 7 | from sklearn.ensemble import ExtraTreesClassifier 8 | 9 | 10 | df_train_1 = feather.read_dataframe('tmp/mtv_df_train_1.feather') 11 | features = sorted(set(df_train_1.columns) - {'display_id', 'clicked'}) 12 | 13 | y_1 = df_train_1.clicked.values 14 | X_1 = df_train_1[features].values 15 | 16 | del df_train_1 17 | gc.collect() 18 | 19 | 20 | df_train_0 = feather.read_dataframe('tmp/mtv_df_train_0.feather') 21 | 22 | y_0 = df_train_0.clicked.values 23 | X_0 = df_train_0[features].values 24 | 25 | del df_train_0 26 | gc.collect() 27 | 28 | 29 | # training a model 30 | 31 | n_estimators = 100 32 | 33 | et_params = dict( 34 | criterion='entropy', 35 | max_depth=40, 36 | min_samples_split=6, 37 | min_samples_leaf=6, 38 | max_features=6, 39 | bootstrap=False, 40 | n_jobs=-1, 41 | random_state=1 42 | ) 43 | 44 | 45 | et0 = ExtraTreesClassifier(warm_start=True, **et_params) 46 | et1 = ExtraTreesClassifier(warm_start=True, **et_params) 47 | 48 | for n in range(10, n_estimators + 1, 10): 49 | et0.n_estimators = n 50 | et0.fit(X_1, y_1) 51 | pred_0 = et0.predict_proba(X_0)[:, 1] 52 | s0 = roc_auc_score(y_0, pred_0) 53 | 54 | et1.n_estimators = n 55 | et1.fit(X_0, y_0) 56 | pred_1 = et1.predict_proba(X_1)[:, 1] 57 | s1 = roc_auc_score(y_1, pred_1) 58 | 59 | scores = (s0, s1) 60 | scores_text = ', '.join('%0.5f' % s for s in scores) 61 | print('%3d, %0.4f, [%s]' % (n, np.mean(scores), scores_text)) 62 | 63 | print('final scores:', scores) 64 | 65 | 66 | pred_0 = et0.predict_proba(X_0)[:, 1].astype('float32') 67 | pred_1 = et1.predict_proba(X_1)[:, 1].astype('float32') 68 | del et0, et1 69 | 70 | 71 | np.save('predictions/et_pred0.npy', pred_0) 72 | np.save('predictions/et_pred1.npy', pred_1) 73 | 74 | 75 | # training on full dataset 76 | 77 | print('full model...') 78 | 79 | X = np.concatenate([X_0, X_1]) 80 | del X_0, X_1 81 | gc.collect() 82 | 83 | y = np.concatenate([y_0, y_1]) 84 | del y_0, y_1 85 | gc.collect() 86 | 87 | 88 | et_full = ExtraTreesClassifier(warm_start=True, **et_params) 89 | et_full.n_estimators = n 90 | et_full.fit(X, y) 91 | 92 | del X, y 93 | gc.collect() 94 | 95 | 96 | 97 | # making predictions for test 98 | 99 | df_test = feather.read_dataframe('tmp/mtv_df_test.feather') 100 | 101 | X_test = df_test[features].values 102 | del df_test 103 | 104 | 105 | pred_test = et_full.predict_proba(X_test)[:, 1].astype('float32') 106 | np.save('predictions/et_pred_test.npy', pred_test) -------------------------------------------------------------------------------- /5_mtv_xgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import xgboost as xgb 4 | import feather 5 | import gc 6 | 7 | 8 | df_train_1 = feather.read_dataframe('tmp/mtv_df_train_1.feather') 9 | features = sorted(set(df_train_1.columns) - {'display_id', 'clicked'}) 10 | 11 | y_1 = df_train_1.clicked.values 12 | X_1 = df_train_1[features].values 13 | del df_train_1 14 | 15 | dfold1 = xgb.DMatrix(X_1, y_1, feature_names=features) 16 | del X_1, y_1 17 | gc.collect() 18 | 
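# note: besides the out-of-fold probabilities, this script also exports the leaf index of
# every tree (predict with pred_leaf=True); 6_1_generate_ffm_data.py feeds those leaf
# indices to libffm as extra categorical fields. Illustrative sketch only, not part of
# the pipeline:
#   leaves = model.predict(dmatrix, pred_leaf=True)  # ndarray of shape (n_rows, n_trees)
#   leaves[0]                                        # one leaf id per boosting round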
19 | 20 | df_train_0 = feather.read_dataframe('tmp/mtv_df_train_0.feather') 21 | 22 | y_0 = df_train_0.clicked.values 23 | X_0 = df_train_0[features].values 24 | del df_train_0 25 | gc.collect() 26 | 27 | dfold0 = xgb.DMatrix(X_0, y_0, feature_names=features) 28 | del X_0, y_0 29 | gc.collect() 30 | 31 | 32 | 33 | # training a model 34 | 35 | n_estimators = 100 36 | 37 | xgb_pars = { 38 | 'eta': 0.2, 39 | 'gamma': 0.5, 40 | 'max_depth': 6, 41 | 'min_child_weight': 1, 42 | 'max_delta_step': 0, 43 | 'subsample': 1, 44 | 'colsample_bytree': 0.5, 45 | 'colsample_bylevel': 0.5, 46 | 'lambda': 1, 47 | 'alpha': 0, 48 | 'tree_method': 'approx', 49 | 'objective': 'binary:logistic', 50 | 'eval_metric': 'auc', 51 | 'nthread': 20, 52 | 'seed': 42, 53 | 'silent': 1 54 | } 55 | 56 | 57 | print('training model on fold 0...') 58 | 59 | watchlist = [(dfold0, 'train'), (dfold1, 'val')] 60 | model_fold1 = xgb.train(xgb_pars, dfold0, num_boost_round=n_estimators, 61 | verbose_eval=1, evals=watchlist) 62 | 63 | print('training model on fold 1...') 64 | 65 | watchlist = [(dfold1, 'train'), (dfold0, 'val')] 66 | model_fold0 = xgb.train(xgb_pars, dfold1, num_boost_round=n_estimators, 67 | verbose_eval=1, evals=watchlist) 68 | 69 | 70 | pred0 = model_fold0.predict(dfold0) 71 | pred1 = model_fold1.predict(dfold1) 72 | 73 | np.save('predictions/xgb_mtv_pred0.npy', pred0) 74 | np.save('predictions/xgb_mtv_pred1.npy', pred1) 75 | 76 | 77 | # saving the training leaves 78 | 79 | leaves0 = model_fold0.predict(dfold0, pred_leaf=True).astype('uint8') 80 | 81 | np.save('tmp/xgb_model_0_leaves.npy', leaves0) 82 | del leaves0 83 | gc.collect() 84 | 85 | 86 | leaves1 = model_fold1.predict(dfold1, pred_leaf=True).astype('uint8') 87 | 88 | np.save('tmp/xgb_model_1_leaves.npy', leaves1) 89 | del leaves1 90 | gc.collect() 91 | 92 | 93 | 94 | # making prediction for test and getting the leaves 95 | 96 | df_test = feather.read_dataframe('tmp/mtv_df_test.feather') 97 | 98 | 99 | X_test = df_test[features].values 100 | del df_test 101 | gc.collect() 102 | 103 | dtest = xgb.DMatrix(X_test, feature_names=features) 104 | del X_test 105 | gc.collect() 106 | 107 | 108 | pred0_test = model_fold0.predict(dtest) 109 | pred1_test = model_fold1.predict(dtest) 110 | pred_test = (pred0_test + pred1_test) / 2 111 | 112 | np.save('predictions/xgb_mtv_pred_test.npy', pred_test) 113 | 114 | 115 | # predicting leaves for test 116 | 117 | leaves0_test = model_fold0.predict(dtest, pred_leaf=True).astype('uint8') 118 | np.save('tmp/xgb_model_0_test_leaves.npy', leaves0_test) 119 | 120 | del leaves0_test 121 | gc.collect() 122 | 123 | leaves1_test = model_fold1.predict(dtest, pred_leaf=True).astype('uint8') 124 | np.save('tmp/xgb_model_1_test_leaves.npy', leaves1_test) 125 | 126 | del leaves1_test 127 | gc.collect() -------------------------------------------------------------------------------- /6_1_generate_ffm_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from tqdm import tqdm 5 | import feather 6 | import gc 7 | 8 | 9 | D = 2 ** 20 10 | 11 | # display features 12 | USER = '0' 13 | ON_DOC = '1' 14 | PLATFORM = '2' 15 | 16 | # ads features 17 | AD = '3' 18 | AD_DOC = '4' 19 | CAMPAIGN = '5' 20 | ADVERTISER = '6' 21 | 22 | # document features 23 | ON_SRC = '7' 24 | ON_PUBLISHER = '8' 25 | 26 | AD_SRC = '9' 27 | AD_PUBLISHER = '10' 28 | 29 | 30 | def hash_element(el): 31 | h = hash(el) % D 32 | if h < 0: 33 | h = h + D 34 | return str(h) 35 | 36 | 37 | # reading the events 
features 38 | 39 | df_events = pd.read_csv("../data/events.csv", usecols=['uuid', 'document_id', 'platform']) 40 | 41 | user_str = USER + ':' + df_events.uuid.apply(hash_element) + ':1' 42 | doc_str = ON_DOC + ':' + df_events.document_id.apply(hash_element) + ':1' 43 | platforms = PLATFORM + ':' + df_events.platform.astype('str') + ':1' 44 | 45 | df_events_processed = pd.DataFrame() 46 | df_events_processed['display_str'] = user_str + ' ' + doc_str + ' ' + platforms 47 | df_events_processed['document_id'] = df_events.document_id 48 | 49 | del df_events, user_str, doc_str, platforms 50 | 51 | 52 | # reading the ads features 53 | 54 | df_ads = pd.read_csv("../data/promoted_content.csv") 55 | ad_to_doc = dict(zip(df_ads.ad_id, df_ads.document_id)) 56 | 57 | ad_str = AD + ':' + df_ads.ad_id.astype(str) + ':1 ' + \ 58 | AD_DOC + ':' + df_ads.document_id.apply(hash_element) + ':1 ' + \ 59 | CAMPAIGN + ':' + df_ads.campaign_id.astype(str) + ':1 ' + \ 60 | ADVERTISER + ':' + df_ads.advertiser_id.astype(str) + ':1' 61 | 62 | ad_str_dict = dict(zip(df_ads.ad_id, ad_str)) 63 | 64 | del ad_str, df_ads 65 | 66 | 67 | # reading the document meta features - others aren't included 68 | 69 | df_doc_meta = pd.read_csv('../data/documents_meta.csv') 70 | 71 | df_doc_meta.source_id.fillna(0, inplace=1) 72 | df_doc_meta.source_id = df_doc_meta.source_id.astype('int32') 73 | df_doc_meta.publisher_id.fillna(0, inplace=1) 74 | df_doc_meta.publisher_id = df_doc_meta.publisher_id.astype('int32') 75 | del df_doc_meta['publish_time'] 76 | 77 | meta_src = df_doc_meta.source_id.astype('str') + ':1 ' 78 | meta_src_dict = dict(zip(df_doc_meta.document_id, meta_src)) 79 | 80 | meta_pub = df_doc_meta.publisher_id.astype('str') + ':1' 81 | meta_pub_dict = dict(zip(df_doc_meta.document_id, meta_pub)) 82 | 83 | del df_doc_meta, meta_src, meta_pub 84 | 85 | # generating the ffm data 86 | 87 | leaves_start = 11 88 | 89 | def ffm_feature_string(display_id, ad_id, leaves, label=None): 90 | ad_doc_id = ad_to_doc[ad_id] 91 | 92 | ad_features = ad_str_dict[ad_id] # 93 | 94 | disp_row = df_events_processed.iloc[display_id - 1] 95 | on_doc_id = disp_row.document_id 96 | disp_features = disp_row.display_str # 97 | 98 | on_src = ON_SRC + ':' + meta_src_dict[on_doc_id] 99 | on_pub = ON_PUBLISHER + ':' + meta_pub_dict[on_doc_id] 100 | 101 | ad_src = AD_SRC + ':' + meta_src_dict[ad_doc_id] 102 | ad_pub = AD_PUBLISHER + ':' + meta_pub_dict[ad_doc_id] 103 | 104 | leaves_features = [] 105 | 106 | for i, leaf in enumerate(leaves): 107 | leaves_features.append('%d:%d:1' % (leaves_start + i, leaf)) 108 | 109 | leaves_features = ' '.join(leaves_features) 110 | 111 | result = disp_features + ' ' + ad_features + ' ' + \ 112 | on_src + ' ' + on_pub + ' ' + \ 113 | ad_src + ' ' + ad_pub + ' ' + \ 114 | leaves_features 115 | 116 | if label is None: 117 | return '0 ' + result 118 | else: 119 | return str(label) + ' ' + result 120 | 121 | 122 | # generating the data for train 123 | 124 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 125 | 126 | leaves_0 = np.load('tmp/xgb_model_0_leaves.npy') 127 | leaves_1 = np.load('tmp/xgb_model_1_leaves.npy') 128 | 129 | 130 | f_0 = open('ffm/ffm_xgb_0.txt', 'w') 131 | f_1 = open('ffm/ffm_xgb_1.txt', 'w') 132 | cnt_0 = 0 133 | cnt_1 = 0 134 | 135 | for row in tqdm(df_all.itertuples()): 136 | display_id = row.display_id 137 | ad_id = row.ad_id 138 | fold = row.fold 139 | label = row.clicked 140 | 141 | if fold == 0: 142 | row = ffm_feature_string(display_id, ad_id, leaves_0[cnt_0], label) 143 | 
f_0.write(row + '\n') 144 | cnt_0 = cnt_0 + 1 145 | else: 146 | row = ffm_feature_string(display_id, ad_id, leaves_1[cnt_1], label) 147 | f_1.write(row + '\n') 148 | cnt_1 = cnt_1 + 1 149 | 150 | f_0.close() 151 | f_1.close() 152 | 153 | 154 | del df_all, leaves_0, leaves_1 155 | gc.collect() 156 | 157 | 158 | # generating the data for test 159 | 160 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 161 | 162 | leaves_0 = np.load('tmp/xgb_model_0_test_leaves.npy') 163 | leaves_1 = np.load('tmp/xgb_model_1_test_leaves.npy') 164 | 165 | f_0 = open('ffm/ffm_xgb_test_0.txt', 'w') 166 | f_1 = open('ffm/ffm_xgb_test_1.txt', 'w') 167 | 168 | cnt = 0 169 | 170 | for row in tqdm(df_test.itertuples()): 171 | display_id = row.display_id 172 | ad_id = row.ad_id 173 | 174 | row = ffm_feature_string(display_id, ad_id, leaves_0[cnt]) 175 | f_0.write(row + '\n') 176 | 177 | row = ffm_feature_string(display_id, ad_id, leaves_1[cnt]) 178 | f_1.write(row + '\n') 179 | 180 | cnt = cnt + 1 181 | 182 | f_0.close() 183 | f_1.close() -------------------------------------------------------------------------------- /6_2_split_ffm_to_subfolds.py: -------------------------------------------------------------------------------- 1 | import feather 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 6 | 7 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 8 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 9 | del df_train_0['fold'], df_train_1['fold'], df_all 10 | 11 | 12 | # define subfolds for each fold 13 | np.random.seed(1) 14 | 15 | uniq0 = df_train_0.display_id.unique() 16 | uniq1 = df_train_1.display_id.unique() 17 | 18 | np.random.shuffle(uniq0) 19 | np.random.shuffle(uniq1) 20 | 21 | n0 = len(uniq0) // 2 22 | fold_0_0 = set(uniq0[:n0]) 23 | 24 | n1 = len(uniq1) // 2 25 | fold_1_0 = set(uniq1[:n1]) 26 | 27 | 28 | df_train_0['subfold'] = df_train_0.display_id.isin(fold_0_0).astype('uint8') 29 | df_train_1['subfold'] = df_train_1.display_id.isin(fold_1_0).astype('uint8') 30 | 31 | np.save('tmp/fold_0_split.npy', df_train_0.subfold.values) 32 | np.save('tmp/fold_1_split.npy', df_train_1.subfold.values) 33 | 34 | 35 | # split fold 0 into subfolds 36 | 37 | f_0 = open('ffm/ffm_xgb_0_0.txt', 'w') 38 | f_1 = open('ffm/ffm_xgb_0_1.txt', 'w') 39 | 40 | with open('ffm/ffm_xgb_0.txt', 'r') as f_in: 41 | for subfold, line in tqdm(zip(df_train_0.subfold, f_in)): 42 | if subfold == 0: 43 | f_0.write(line) 44 | else: 45 | f_1.write(line) 46 | 47 | f_0.close() 48 | f_1.close() 49 | 50 | 51 | # split fold 1 into subfolds 52 | 53 | f_0 = open('ffm/ffm_xgb_1_0.txt', 'w') 54 | f_1 = open('ffm/ffm_xgb_1_1.txt', 'w') 55 | 56 | with open('ffm/ffm_xgb_1.txt', 'r') as f_in: 57 | for subfold, line in tqdm(zip(df_train_1.subfold, f_in)): 58 | if subfold == 0: 59 | f_0.write(line) 60 | else: 61 | f_1.write(line) 62 | 63 | f_0.close() 64 | f_1.close() 65 | 66 | -------------------------------------------------------------------------------- /6_3_run_ffm.sh: -------------------------------------------------------------------------------- 1 | # assumes libffm is on PATH 2 | 3 | PARAMS='-s 12 -k 5 -l 0.000001 -t 5' 4 | 5 | cd ffm 6 | 7 | 8 | # fold 0 9 | 10 | ffm-train $PARAMS -p ffm_xgb_0_0.txt ffm_xgb_0_1.txt ffm_0_0.bin 11 | ffm-predict ffm_xgb_0_0.txt ffm_0_0.bin pred_0_0.txt 12 | 13 | ffm-train $PARAMS -p ffm_xgb_0_1.txt ffm_xgb_0_0.txt ffm_0_1.bin 14 | ffm-predict ffm_xgb_0_1.txt ffm_0_1.bin pred_0_1.txt 15 | 16 | ffm-train $PARAMS ffm_xgb_0.txt ffm_0_full.bin 17 | 18 | 19 | # fold 
1 20 | 21 | ffm-train $PARAMS -p ffm_xgb_1_0.txt ffm_xgb_1_1.txt ffm_1_0.bin 22 | ffm-predict ffm_xgb_1_0.txt ffm_1_0.bin pred_1_0.txt 23 | 24 | ffm-train $PARAMS -p ffm_xgb_1_1.txt ffm_xgb_1_0.txt ffm_1_1.bin 25 | ffm-predict ffm_xgb_1_1.txt ffm_1_1.bin pred_1_1.txt 26 | 27 | ffm-train $PARAMS ffm_xgb_1.txt ffm_1_full.bin 28 | 29 | 30 | # predict for test 31 | 32 | ffm-predict ffm_xgb_test_0.txt ffm_0_full.bin pred_test_0.txt 33 | ffm-predict ffm_xgb_test_1.txt ffm_1_full.bin pred_test_1.txt -------------------------------------------------------------------------------- /6_4_put_ffm_subfolds_together.py: -------------------------------------------------------------------------------- 1 | import feather 2 | import numpy as np 3 | 4 | 5 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 6 | 7 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 8 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 9 | del df_train_0['fold'], df_train_1['fold'], df_all 10 | 11 | df_train_0['fold'] = np.load('tmp/fold_0_split.npy') 12 | df_train_1['fold'] = np.load('tmp/fold_1_split.npy') 13 | 14 | 15 | # predictions of two subfolds of fold 0 16 | 17 | pred_0_0 = pd.read_csv('ffm/pred_0_0.txt', header=None, dtype='float32') 18 | pred_0_0 = pred_0_0[0] 19 | 20 | pred_0_1 = pd.read_csv('ffm/pred_0_1.txt', header=None, dtype='float32') 21 | pred_0_1 = pred_0_1[0] 22 | 23 | 24 | df_train_0.loc[df_train_0.fold == 0, 'ffm_xgb'] = pred_0_0.values 25 | df_train_0.loc[df_train_0.fold == 1, 'ffm_xgb'] = pred_0_1.values 26 | ffm_xgb_0 = df_train_0.ffm_xgb.astype('float32') 27 | 28 | np.save('predictions/ffm_0.npy', ffm_xgb_0.values) 29 | 30 | 31 | # predictions of two subfolds of fold 1 32 | 33 | pred_1_0 = pd.read_csv('ffm/pred_1_0.txt', header=None, dtype='float32') 34 | pred_1_0 = pred_1_0[0] 35 | 36 | pred_1_1 = pd.read_csv('ffm/pred_1_1.txt', header=None, dtype='float32') 37 | pred_1_1 = pred_1_1[0] 38 | 39 | 40 | df_train_1.loc[df_train_1.fold == 0, 'ffm_xgb'] = pred_1_0.values 41 | df_train_1.loc[df_train_1.fold == 1, 'ffm_xgb'] = pred_1_1.values 42 | ffm_xgb_1 = df_train_1.ffm_xgb.astype('float32') 43 | 44 | np.save('predictions/ffm_1.npy', ffm_xgb_1.values) 45 | 46 | 47 | # test predictions 48 | 49 | pred_test_0 = pd.read_csv('ffm/pred_test_0.txt', header=None, dtype='float32') 50 | pred_test_1 = pd.read_csv('ffm/pred_test_1.txt', header=None, dtype='float32') 51 | 52 | pred_test = (pred_test_0[0] + pred_test_1[0]) / 2 53 | pred_test = pred_test.astype('float32') 54 | 55 | np.save('predictions/ffm_test.npy', pred_test.values) -------------------------------------------------------------------------------- /7_ensemble_data_prep.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import feather 4 | 5 | from tqdm import tqdm 6 | 7 | 8 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 9 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 10 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 11 | del df_train_0['fold'], df_train_1['fold'], df_all 12 | 13 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 14 | 15 | 16 | # read svm predictions 17 | 18 | df_train_0['svm'] = np.load('predictions/svm_0_preds.npy') 19 | df_train_0['svm'] = df_train_0['svm'].astype('float32') 20 | 21 | df_train_1['svm'] = np.load('predictions/svm_1_preds.npy') 22 | df_train_1['svm'] = df_train_1['svm'].astype('float32') 23 | 24 | df_test['svm'] = 
-------------------------------------------------------------------------------- /7_ensemble_data_prep.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import feather 4 | 5 | from tqdm import tqdm 6 | 7 | 8 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 9 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 10 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 11 | del df_train_0['fold'], df_train_1['fold'], df_all 12 | 13 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 14 | 15 | 16 | # read svm predictions 17 | 18 | df_train_0['svm'] = np.load('predictions/svm_0_preds.npy') 19 | df_train_0['svm'] = df_train_0['svm'].astype('float32') 20 | 21 | df_train_1['svm'] = np.load('predictions/svm_1_preds.npy') 22 | df_train_1['svm'] = df_train_1['svm'].astype('float32') 23 | 24 | df_test['svm'] = np.load('predictions/svm_test_preds.npy') 25 | df_test['svm'] = df_test['svm'].astype('float32') 26 | 27 | 28 | # read ftrl predictions 29 | 30 | ftrl_0 = pd.read_csv('predictions/ftrl_pred_0.txt') 31 | df_train_0['ftrl'] = ftrl_0.y_pred.astype('float32') 32 | 33 | ftrl_1 = pd.read_csv('predictions/ftrl_pred_1.txt') 34 | df_train_1['ftrl'] = ftrl_1.y_pred.astype('float32') 35 | 36 | ftrl_test = pd.read_csv('predictions/ftrl_pred_test.txt') 37 | df_test['ftrl'] = ftrl_test.y_pred.astype('float32') 38 | 39 | 40 | # read xgb predictions 41 | 42 | df_train_0['xgb_mtv'] = np.load('predictions/xgb_mtv_pred0.npy') 43 | df_train_1['xgb_mtv'] = np.load('predictions/xgb_mtv_pred1.npy') 44 | df_test['xgb_mtv'] = np.load('predictions/xgb_mtv_pred_test.npy') 45 | 46 | 47 | # read et predictions 48 | 49 | df_train_0['et_mtv'] = np.load('predictions/et_pred0.npy') 50 | df_train_1['et_mtv'] = np.load('predictions/et_pred1.npy') 51 | df_test['et_mtv'] = np.load('predictions/et_pred_test.npy') 52 | 53 | 54 | # read ffm predictions 55 | 56 | df_train_0['ffm'] = np.load('predictions/ffm_0.npy') 57 | df_train_1['ffm'] = np.load('predictions/ffm_1.npy') 58 | df_test['ffm'] = np.load('predictions/ffm_test.npy') 59 | 60 | 61 | # read the leak features 62 | 63 | df_train_0['leak'] = np.load('features/leak_0.npy') 64 | df_train_0['leak'] = df_train_0['leak'].astype('uint8') 65 | 66 | df_train_1['leak'] = np.load('features/leak_1.npy') 67 | df_train_1['leak'] = df_train_1['leak'].astype('uint8') 68 | 69 | df_test['leak'] = np.load('features/leak_test.npy') 70 | df_test['leak'] = df_test['leak'].astype('uint8') 71 | 72 | 73 | df_train_0['doc_known_views'] = np.load('features/doc_known_views_0.npy') 74 | df_train_0['doc_known_views'] = df_train_0['doc_known_views'].astype('uint32') 75 | 76 | df_train_1['doc_known_views'] = np.load('features/doc_known_views_1.npy') 77 | df_train_1['doc_known_views'] = df_train_1['doc_known_views'].astype('uint32') 78 | 79 | df_test['doc_known_views'] = np.load('features/doc_known_views_test.npy') 80 | df_test['doc_known_views'] = df_test['doc_known_views'].astype('uint32') 81 | 82 | 83 | # rank features 84 | 85 | cols_to_rank = ['svm', 'ftrl', 'xgb_mtv', 'et_mtv', 'ffm'] 86 | 87 | 88 | for f in tqdm(cols_to_rank): 89 | for df in [df_train_0, df_train_1, df_test]: 90 | df['%s_rank' % f] = df.groupby('display_id')[f].rank(method='dense', ascending=0) 91 | df['%s_rank' % f] = df['%s_rank' % f].astype('uint8') 92 | 93 | 94 | # some mean target value features 95 | 96 | mtv_features = ['ad_document_id_on_doc_publisher_id', 97 | 'ad_doc_source_id_on_doc_publisher_id', 98 | 'ad_document_id_on_doc_source_id'] 99 | 100 | for f in mtv_features: 101 | df_train_0[f] = np.load('features/mte/%s_pred_0.npy' % f) 102 | df_train_0['%s_rank' % f] = np.load('features/mte/%s_pred_rank_0.npy' % f) 103 | 104 | df_train_1[f] = np.load('features/mte/%s_pred_1.npy' % f) 105 | df_train_1['%s_rank' % f] = np.load('features/mte/%s_pred_rank_1.npy' % f) 106 | 107 | df_test[f] = np.load('features/mte/%s_pred_test.npy' % f) 108 | df_test['%s_rank' % f] = np.load('features/mte/%s_pred_rank_test.npy' % f) 109 | 110 | 111 | # now save everything 112 | 113 | feather.write_dataframe(df_train_0, 'tmp/df_train_0_ensemble.feather') 114 | feather.write_dataframe(df_train_1, 'tmp/df_train_1_ensemble.feather') 115 | feather.write_dataframe(df_test, 'tmp/df_test_ensemble.feather') 116 | -------------------------------------------------------------------------------- /7_ensemble_xgb.py: -------------------------------------------------------------------------------- 1 | 
import pandas as pd 2 | import numpy as np 3 | import xgboost as xgb 4 | import feather 5 | import gc 6 | 7 | # prapare the data matrices 8 | 9 | 10 | df_train_0 = feather.read_dataframe('tmp/df_train_0_ensemble.feather') 11 | 12 | ignore = {'display_id', 'ad_id', 'clicked', 'fold'} 13 | columns = sorted(set(df_train_0.columns) - ignore) 14 | 15 | group0_sizes = df_train_0.display_id.value_counts(sort=False) 16 | group0_sizes.sort_index(inplace=1) 17 | group0_sizes = group0_sizes.values.astype('uint8') 18 | 19 | y_0 = df_train_0.clicked.values 20 | X_0 = df_train_0[columns].values 21 | del df_train_0 22 | gc.collect() 23 | 24 | dfold0 = xgb.DMatrix(X_0, y_0, feature_names=columns) 25 | dfold0.set_group(group0_sizes) 26 | 27 | del X_0, y_0 28 | gc.collect() 29 | 30 | 31 | 32 | df_train_1 = feather.read_dataframe('tmp/df_train_1_ensemble.feather') 33 | 34 | group1_sizes = df_train_1.display_id.value_counts(sort=False) 35 | group1_sizes.sort_index(inplace=1) 36 | group1_sizes = group1_sizes.values.astype('uint8') 37 | 38 | y_1 = df_train_1.clicked.values 39 | X_1 = df_train_1[columns].values 40 | del df_train_1 41 | gc.collect() 42 | 43 | dfold1 = xgb.DMatrix(X_1, y_1, feature_names=columns) 44 | dfold1.set_group(group1_sizes) 45 | 46 | del X_1, y_1 47 | gc.collect() 48 | 49 | watchlist = [(dfold0, 'train'), (dfold1, 'val')] 50 | 51 | 52 | # train the model 53 | 54 | n_estimators = 1000 55 | 56 | xgb_pars = { 57 | 'eta': 0.15, 58 | 'gamma': 0.0, 59 | 'max_depth': 8, 60 | 'min_child_weight': 1, 61 | 'max_delta_step': 0, 62 | 'subsample': 0.6, 63 | 'colsample_bytree': 0.6, 64 | 'colsample_bylevel': 1, 65 | 'lambda': 1, 66 | 'alpha': 0, 67 | 'tree_method': 'approx', 68 | 'objective': 'rank:pairwise', 69 | 'eval_metric': 'map@12', 70 | 'nthread': 12, 71 | 'seed': 42, 72 | 'silent': 1 73 | } 74 | 75 | 76 | # train the model 77 | 78 | model = xgb.train(xgb_pars, dfold0, num_boost_round=n_estimators, 79 | verbose_eval=1, evals=watchlist) 80 | 81 | del dfold0, dfold1, watchlist 82 | gc.collect() 83 | 84 | 85 | # test predict 86 | 87 | df_test = feather.read_dataframe('tmp/df_test_ensemble.feather') 88 | 89 | group_test_sizes = df_test.display_id.value_counts(sort=False) 90 | group_test_sizes.sort_index(inplace=1) 91 | group_test_sizes = group_test_sizes.values.astype('uint8') 92 | 93 | X_test = df_test[columns].values 94 | df_test = df_test[['display_id', 'ad_id']].copy() 95 | 96 | dtest = xgb.DMatrix(X_test, feature_names=columns) 97 | dtest.set_group(group_test_sizes) 98 | del X_test 99 | 100 | 101 | test_pred = model.predict(dtest) 102 | df_test['pred'] = test_pred 103 | 104 | 105 | feather.write_dataframe(df_test, 'final_submission.feather') 106 | 107 | # now run `Rscript submission.R final_submission.feather xgb_submission.csv` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Outbrain Click Prediction challenge solution 2 | 3 | - The goal of the competition is to predict which ad will be clicked on 4 | - See https://www.kaggle.com/c/outbrain-click-prediction for more details 5 | - This is `ololo`'s part of the 13th place solution to the challenge (team "diaman & ololo") 6 | - The presentation of the solution: http://www.slideshare.net/AlexeyGrigorev/outbrain-click-prediction-71724151 7 | - `diaman`'s solution can be found at https://github.com/dselivanov/kaggle-outbrain 8 | 9 | 10 | ## Overview: 11 | 12 | The part of the solution is a combination of 5 models: 13 | 14 
| - SVM and FTRL on basic features: 15 | - event features: user id, document id, platform id, day, hour and geo 16 | - ad features: ad document id, campaign id, advertiser id 17 | - XGB and ET on MTV (Mean Target Value) features: 18 | - all categorical features that the previous models used 19 | - document features like publisher, source, top category, topic and entity 20 | - interactions between these features 21 | - also, document similarity features: the cosine similarity between the ad document and the page the ad is shown on 22 | - FFM with the following features: 23 | - all categorical features from the above, except document similarity, categories, topics and entities 24 | - XGB leaves from the previous step (see slide 9 from [this presentation](http://www.csie.ntu.edu.tw/~r01922136/kaggle-2014-criteo.pdf) for the description of the idea; a short sketch is also included at the end of this README) 25 | - The models are combined with an XGB model (`rank:pairwise` objective) 26 | 27 | To get the 13th position, models from [diaman](https://www.kaggle.com/dselivanov) should also be added 28 | 29 | ## Files description 30 | 31 | - `0_prepare_splits.py` splits the training dataset into two folds 32 | - `1_svm_data.py` prepares the data for SVM and FTRL 33 | - `1_train_ftrl.py` and `1_train_svm.py` train models on the data from `1_svm_data.py` 34 | - `2_extract_leaked_docs.py` and `2_leak_features.py` extract the leak 35 | - `3_doc_similarity_features.py` calculates the TF-IDF similarity between the document the user is on and the ad document 36 | - `4_categorical_data_join.py` and `4_categorical_data_unwrap_columnwise.py` prepare data for the MTV features calculation 37 | - `4_mean_target_value.py` calculates MTV for all features from `categorical_features.txt` 38 | - `5_best_mtv_features_xgb.py` builds an XGB model on a small part of the data and selects the best features to be used for XGB and ET 39 | - `5_mtv_et.py` trains an ET model on the MTV features 40 | - `5_mtv_xgb.py` trains an XGB model on the MTV features and creates the leaf features to be used in FFM 41 | - `6_1_generate_ffm_data.py` creates the input files to be read by libffm 42 | - `6_2_split_ffm_to_subfolds.py` splits each fold into two subfolds (the original folds can't be reused because the leaf features are not transferable between folds) 43 | - `6_3_run_ffm.sh` runs libffm for training the FFM models 44 | - `6_4_put_ffm_subfolds_together.py` puts the FFM predictions from each fold/subfold together 45 | - `7_ensemble_data_prep.py` puts all the features and model predictions together for ensembling 46 | - `7_ensemble_xgb.py` trains the second-level XGB model on top of all these features 47 | 48 | The files should be run in the above order 49 | 50 | Diaman's features should be included into `7_ensemble_data_prep.py` - and the rest can stay unchanged.
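For reference, the "XGB leaves" idea mentioned in the overview boils down to the following (a sketch with hypothetical `X_train`, `y_train` and `X_val` matrices; the actual pipeline does this in `5_mtv_xgb.py` and `6_1_generate_ffm_data.py`):

```python
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train({'objective': 'binary:logistic', 'max_depth': 6},
                  dtrain, num_boost_round=50)

# pred_leaf=True returns, for every row, the index of the leaf it falls into
# in each of the 50 trees; each tree becomes one categorical feature
# (one FFM field), and the leaf index is its value
leaves = model.predict(xgb.DMatrix(X_val), pred_leaf=True)  # shape: (n_rows, 50)
```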
51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /categorical_features.txt: -------------------------------------------------------------------------------- 1 | ad_id 2 | ad_document_id 3 | ad_doc_source_id 4 | ad_doc_publisher_id 5 | ad_doc_top_entity 6 | ad_doc_top_topic 7 | ad_doc_top_cat 8 | ad_doc_publisher_id_day 9 | ad_doc_publisher_id_hour 10 | ad_doc_publisher_id_geo_0 11 | ad_doc_publisher_id_geo_1 12 | ad_doc_publisher_id_geo_2 13 | ad_doc_source_id_day 14 | ad_doc_source_id_hour 15 | ad_doc_source_id_geo_0 16 | ad_doc_source_id_geo_1 17 | ad_doc_source_id_geo_2 18 | ad_doc_top_cat_day 19 | ad_doc_top_cat_hour 20 | ad_doc_top_cat_geo_0 21 | ad_doc_top_cat_geo_1 22 | ad_doc_top_cat_geo_2 23 | ad_doc_top_entity_day 24 | ad_doc_top_entity_hour 25 | ad_doc_top_entity_geo_0 26 | ad_doc_top_entity_geo_1 27 | ad_doc_top_entity_geo_2 28 | ad_doc_top_topic_day 29 | ad_doc_top_topic_hour 30 | ad_doc_top_topic_geo_0 31 | ad_doc_top_topic_geo_1 32 | ad_doc_top_topic_geo_2 33 | ad_document_id_day 34 | ad_document_id_hour 35 | ad_document_id_geo_0 36 | ad_document_id_geo_1 37 | ad_document_id_geo_2 38 | advertiser_id_day 39 | advertiser_id_hour 40 | advertiser_id_geo_0 41 | advertiser_id_geo_1 42 | advertiser_id_geo_2 43 | ad_doc_publisher_id_ad_doc_source_id 44 | ad_doc_publisher_id_ad_doc_top_cat 45 | ad_doc_publisher_id_ad_doc_top_entity 46 | ad_doc_publisher_id_ad_doc_top_topic 47 | ad_doc_publisher_id_ad_document_id 48 | ad_doc_publisher_id_ad_id 49 | ad_doc_publisher_id_advertiser_id 50 | ad_doc_publisher_id_campaign_id 51 | ad_doc_publisher_id_on_doc_publisher_id 52 | ad_doc_publisher_id_on_doc_source_id 53 | ad_doc_publisher_id_on_doc_top_cat 54 | ad_doc_publisher_id_on_doc_top_entity 55 | ad_doc_publisher_id_on_doc_top_topic 56 | ad_doc_publisher_id_on_document_id 57 | ad_doc_publisher_id_platform 58 | ad_doc_publisher_id_user_id 59 | ad_doc_source_id_ad_doc_top_cat 60 | ad_doc_source_id_ad_doc_top_entity 61 | ad_doc_source_id_ad_doc_top_topic 62 | ad_doc_source_id_ad_document_id 63 | ad_doc_source_id_ad_id 64 | ad_doc_source_id_advertiser_id 65 | ad_doc_source_id_campaign_id 66 | ad_doc_source_id_on_doc_publisher_id 67 | ad_doc_source_id_on_doc_source_id 68 | ad_doc_source_id_on_doc_top_cat 69 | ad_doc_source_id_on_doc_top_entity 70 | ad_doc_source_id_on_doc_top_topic 71 | ad_doc_source_id_on_document_id 72 | ad_doc_source_id_platform 73 | ad_doc_source_id_user_id 74 | ad_doc_top_cat_ad_doc_top_entity 75 | ad_doc_top_cat_ad_doc_top_topic 76 | ad_doc_top_cat_ad_document_id 77 | ad_doc_top_cat_ad_id 78 | ad_doc_top_cat_advertiser_id 79 | ad_doc_top_cat_campaign_id 80 | ad_doc_top_cat_on_doc_publisher_id 81 | ad_doc_top_cat_on_doc_source_id 82 | ad_doc_top_cat_on_doc_top_cat 83 | ad_doc_top_cat_on_doc_top_entity 84 | ad_doc_top_cat_on_doc_top_topic 85 | ad_doc_top_cat_on_document_id 86 | ad_doc_top_cat_platform 87 | ad_doc_top_cat_user_id 88 | ad_doc_top_entity_ad_doc_top_topic 89 | ad_doc_top_entity_ad_document_id 90 | ad_doc_top_entity_ad_id 91 | ad_doc_top_entity_advertiser_id 92 | ad_doc_top_entity_campaign_id 93 | ad_doc_top_entity_on_doc_publisher_id 94 | ad_doc_top_entity_on_doc_source_id 95 | ad_doc_top_entity_on_doc_top_cat 96 | ad_doc_top_entity_on_doc_top_entity 97 | ad_doc_top_entity_on_doc_top_topic 98 | ad_doc_top_entity_on_document_id 99 | ad_doc_top_entity_platform 100 | ad_doc_top_entity_user_id 101 | ad_doc_top_topic_ad_document_id 102 | ad_doc_top_topic_ad_id 103 | ad_doc_top_topic_advertiser_id 104 | 
ad_doc_top_topic_campaign_id 105 | ad_doc_top_topic_on_doc_publisher_id 106 | ad_doc_top_topic_on_doc_source_id 107 | ad_doc_top_topic_on_doc_top_cat 108 | ad_doc_top_topic_on_doc_top_entity 109 | ad_doc_top_topic_on_doc_top_topic 110 | ad_doc_top_topic_on_document_id 111 | ad_doc_top_topic_platform 112 | ad_doc_top_topic_user_id 113 | ad_document_id_campaign_id 114 | ad_document_id_on_doc_publisher_id 115 | ad_document_id_on_doc_source_id 116 | ad_document_id_on_doc_top_cat 117 | ad_document_id_on_doc_top_entity 118 | ad_document_id_on_doc_top_topic 119 | ad_document_id_on_document_id 120 | ad_document_id_platform 121 | ad_document_id_user_id 122 | ad_id_advertiser_id 123 | ad_id_campaign_id 124 | ad_id_on_doc_publisher_id 125 | ad_id_on_doc_source_id 126 | ad_id_on_doc_top_cat 127 | ad_id_on_doc_top_entity 128 | ad_id_on_doc_top_topic 129 | ad_id_on_document_id 130 | ad_id_platform 131 | ad_id_user_id 132 | advertiser_id_user_id 133 | campaign_id_user_id 134 | on_doc_publisher_id_user_id 135 | on_doc_source_id_user_id 136 | on_doc_top_cat_user_id 137 | on_doc_top_entity_user_id 138 | on_doc_top_topic_user_id 139 | on_document_id_user_id 140 | platform_user_id -------------------------------------------------------------------------------- /ftrl.py: -------------------------------------------------------------------------------- 1 | 2 | from math import exp, log, sqrt 3 | 4 | # implementation taken from kaggle scripts: 5 | # https://www.kaggle.com/sudalairajkumar/outbrain-click-prediction/ftrl-starter-with-leakage-vars/code 6 | 7 | 8 | def hash_element(el, D): 9 | h = hash(el) % D 10 | if h < 0: 11 | h = h + D 12 | return h 13 | 14 | def hash_elements(elements, D): 15 | return [hash_element(el, D) for el in elements] 16 | 17 | 18 | class FtrlProximal(object): 19 | ''' Our main algorithm: Follow the regularized leader - proximal 20 | 21 | In short, 22 | this is an adaptive-learning-rate sparse logistic-regression with 23 | efficient L1-L2-regularization 24 | 25 | Reference: 26 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 27 | ''' 28 | 29 | def __init__(self, alpha, beta, L1, L2, D, interactions): 30 | # parameters 31 | self.alpha = alpha 32 | self.beta = beta 33 | self.L1 = L1 34 | self.L2 = L2 35 | 36 | # feature related parameters 37 | self.D = D 38 | 39 | self.interactions = interactions 40 | 41 | # model 42 | # n: squared sum of past gradients 43 | # z: weights 44 | # w: lazy weights 45 | self.n = [0.0] * (D + 1) 46 | self.z = [0.0] * (D + 1) 47 | self.w = {} 48 | 49 | def to_indices(self, x): 50 | res = hash_elements(x, self.D) 51 | 52 | if self.interactions: 53 | sorted_x = sorted(x) 54 | len_x = len(sorted_x) 55 | 56 | for i in range(len_x): 57 | for j in range(i + 1, len_x): 58 | h = hash_element(sorted_x[i] + '_' + sorted_x[j], self.D) 59 | res.append(h) 60 | 61 | return res 62 | 63 | def predict(self, x): 64 | x_hashed = self.to_indices(x) 65 | return self.predict_hashed(x_hashed) 66 | 67 | def predict_hashed(self, x): 68 | ''' Get probability estimation on x 69 | 70 | INPUT: 71 | x: features 72 | 73 | OUTPUT: 74 | probability of p(y = 1 | x; w) 75 | ''' 76 | 77 | # parameters 78 | alpha = self.alpha 79 | beta = self.beta 80 | L1 = self.L1 81 | L2 = self.L2 82 | 83 | # model 84 | n = self.n 85 | z = self.z 86 | w = {} 87 | 88 | # wTx is the inner product of w and x 89 | wTx = 0. 90 | 91 | indices = [0] 92 | for i in x: 93 | indices.append(i + 1) 94 | 95 | for i in indices: 96 | sign = -1. if z[i] < 0 else 1. 
# get sign of z[i] 97 | 98 | # build w on the fly using z and n, hence the name - lazy weights 99 | # we are doing this at prediction instead of update time is because 100 | # this allows us for not storing the complete w 101 | if sign * z[i] <= L1: 102 | # w[i] vanishes due to L1 regularization 103 | w[i] = 0.0 104 | else: 105 | # apply prediction time L1, L2 regularization to z and get w 106 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 107 | 108 | wTx += w[i] 109 | 110 | # cache the current w for update stage 111 | self.w = w 112 | 113 | # bounded sigmoid function, this is the probability estimation 114 | return 1.0 / (1.0 + exp(-max(min(wTx, 35.0), -35.0))) 115 | 116 | def update(self, x, p, y): 117 | ''' Update model using x, p, y 118 | 119 | INPUT: 120 | x: a list of indices 121 | p: probability prediction of our model 122 | y: answer 123 | 124 | MODIFIES: 125 | self.n: increase by squared gradient 126 | self.z: weights 127 | ''' 128 | 129 | # parameter 130 | alpha = self.alpha 131 | 132 | # model 133 | n = self.n 134 | z = self.z 135 | w = self.w 136 | 137 | # gradient under logloss 138 | g = p - y 139 | 140 | indices = [0] 141 | for i in x: 142 | indices.append(i + 1) 143 | 144 | # update z and n 145 | for i in indices: 146 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 147 | z[i] += g - sigma * w[i] 148 | n[i] += g * g 149 | 150 | def fit(self, x, y): 151 | x_hashed = self.to_indices(x) 152 | p = self.predict_hashed(x_hashed) 153 | self.update(x_hashed, p, y) -------------------------------------------------------------------------------- /mapk.R: -------------------------------------------------------------------------------- 1 | # usage 2 | # Rscript mapk.R pred.feather 3 | 4 | library(methods) 5 | library(data.table) 6 | library(feather) 7 | 8 | input_cmd_args = commandArgs(trailingOnly = TRUE) 9 | path = path.expand(input_cmd_args[[1]]) 10 | 11 | message(Sys.time(), " reading ", path) 12 | dt = read_feather(path) 13 | setDT(dt) 14 | dt[, p_neg:=-pred] 15 | message(Sys.time(), " sorting") 16 | setkey(dt, display_id, p_neg) 17 | message(Sys.time(), " calculating map...") 18 | map = dt[ , .(map_12 = 1 / which(clicked == 1)), by = display_id][['map_12']] 19 | message(Sys.time(), " MAP@12 = ", mean(map)) -------------------------------------------------------------------------------- /ml_metrics_auc.py: -------------------------------------------------------------------------------- 1 | # implementation of auc is taken from ml_metrics: 2 | # https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 3 | 4 | def tied_rank(x): 5 | """ 6 | Computes the tied rank of elements in x. 7 | This function computes the tied rank of elements in x. 8 | Parameters 9 | ---------- 10 | x : list of numbers, numpy array 11 | Returns 12 | ------- 13 | score : list of numbers 14 | The tied rank f each element in x 15 | """ 16 | sorted_x = sorted(zip(x,range(len(x)))) 17 | r = [0 for k in x] 18 | cur_val = sorted_x[0][0] 19 | last_rank = 0 20 | for i in range(len(sorted_x)): 21 | if cur_val != sorted_x[i][0]: 22 | cur_val = sorted_x[i][0] 23 | for j in range(last_rank, i): 24 | r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 25 | last_rank = i 26 | if i==len(sorted_x)-1: 27 | for j in range(last_rank, i+1): 28 | r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 29 | return r 30 | 31 | def auc(actual, posterior): 32 | """ 33 | Computes the area under the receiver-operater characteristic (AUC) 34 | This function computes the AUC error metric for binary classification. 
35 | Parameters 36 | ---------- 37 | actual : list of binary numbers, numpy array 38 | The ground truth value 39 | posterior : same type as actual 40 | Defines a ranking on the binary numbers, from most likely to 41 | be positive to least likely to be positive. 42 | Returns 43 | ------- 44 | score : double 45 | The mean squared error between actual and posterior 46 | """ 47 | r = tied_rank(posterior) 48 | num_positive = len([0 for x in actual if x==1]) 49 | num_negative = len(actual)-num_positive 50 | sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) 51 | auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / 52 | (num_negative*num_positive)) 53 | return auc -------------------------------------------------------------------------------- /submission.R: -------------------------------------------------------------------------------- 1 | # usage: 2 | # Rscript submission.R final_submission.feather xgb_submission.csv 3 | 4 | library(methods) 5 | library(data.table) 6 | library(feather) 7 | 8 | 9 | input_cmd_args = commandArgs(trailingOnly = TRUE) 10 | 11 | path = path.expand(input_cmd_args[[1]]) 12 | 13 | out_path = path.expand(input_cmd_args[[2]]) 14 | out_path = paste0(out_path, ".gz") 15 | 16 | 17 | 18 | message(Sys.time(), " reading ", path) 19 | 20 | dt = read_feather(path) 21 | setDT(dt) 22 | dt[, p_neg:=-pred] 23 | 24 | message(Sys.time(), " sorting") 25 | setkey(dt, display_id, p_neg) 26 | 27 | 28 | message(Sys.time(), " generating submission") 29 | 30 | submission = dt[ , .(ad_id = paste(ad_id, collapse = " ")), keyby = display_id] 31 | 32 | write.table(submission, file = gzfile(out_path, compression = 1), row.names = F, quote = F, sep = ",", append = F) 33 | 34 | message(Sys.time(), " DONE") --------------------------------------------------------------------------------
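For completeness, the same step that submission.R performs can also be done directly in pandas (a sketch, assuming final_submission.feather contains display_id, ad_id and pred as written by 7_ensemble_xgb.py):

import feather

df = feather.read_dataframe('final_submission.feather')

# sort ads within each display by descending prediction and collapse them
# into the space-separated ad_id list expected by Kaggle
df = df.sort_values(['display_id', 'pred'], ascending=[True, False])
submission = df.groupby('display_id')['ad_id'].apply(lambda ids: ' '.join(map(str, ids)))
submission.reset_index().to_csv('xgb_submission.csv', index=False)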