├── 0_prepare_splits.py ├── 1_svm_data.py ├── 1_train_ftrl.py ├── 1_train_svm.py ├── 2_extract_leaked_docs.py ├── 2_leak_features.py ├── 3_doc_similarity_features.py ├── 4_categorical_data_join.py ├── 4_categorical_data_unwrap_columnwise.py ├── 4_mean_target_value.py ├── 5_best_mtv_features_xgb.py ├── 5_mtv_et.py ├── 5_mtv_xgb.py ├── 6_1_generate_ffm_data.py ├── 6_2_split_ffm_to_subfolds.py ├── 6_3_run_ffm.sh ├── 6_4_put_ffm_subfolds_together.py ├── 7_ensemble_data_prep.py ├── 7_ensemble_xgb.py ├── README.md ├── categorical_features.txt ├── ftrl.py ├── mapk.R ├── ml_metrics_auc.py └── submission.R /0_prepare_splits.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | import feather 5 | 6 | # prepare train split 7 | df_all = pd.read_csv("../data/clicks_train.csv") 8 | df_all.display_id = df_all.display_id.astype('uint32') 9 | df_all.ad_id = df_all.ad_id.astype('uint32') 10 | df_all.clicked = df_all.clicked.astype('uint8') 11 | 12 | ids = df_all.display_id.unique() 13 | np.random.seed(1) 14 | np.random.shuffle(ids) 15 | 16 | val_size = int(len(ids) * 0.5) 17 | val_display_ids = set(ids[:val_size]) 18 | 19 | df_all['fold'] = 0 20 | 21 | is_val = df_all.display_id.isin(val_display_ids) 22 | df_all.loc[is_val, 'fold'] = 1 23 | df_all.fold = df_all.fold.astype('uint8') 24 | 25 | feather.write_dataframe(df_all, 'tmp/clicks_train_50_50.feather') 26 | 27 | 28 | # prepare test data 29 | 30 | df_test = pd.read_csv("../data/clicks_test.csv") 31 | df_test.display_id = df_test.display_id.astype('uint32') 32 | df_test.ad_id = df_test.ad_id.astype('uint32') 33 | 34 | feather.write_dataframe(df_test, 'tmp/clicks_test.feather') -------------------------------------------------------------------------------- /1_svm_data.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | from time import time 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from tqdm import tqdm 10 | import feather 11 | 12 | 13 | # events data 14 | 15 | def paths(tokens): 16 | all_paths = ['_'.join(tokens[0:(i+1)]) for i in range(len(tokens))] 17 | return ' '.join(all_paths) 18 | 19 | def unwrap_geo(geo): 20 | geo = geo.split('>') 21 | return paths(geo) 22 | 23 | 24 | df_events = pd.read_csv("../data/events.csv") 25 | df_events.geo_location.fillna('', inplace=1) 26 | 27 | geo_str = df_events.geo_location.apply(unwrap_geo) 28 | 29 | 30 | ts = (df_events.timestamp + 1465876799998) / 1000 31 | df_events.timestamp = pd.to_datetime(ts, unit='s') 32 | 33 | 34 | dt = df_events.timestamp.dt 35 | 36 | dow = 'dow_' + dt.dayofweek.astype('str') 37 | hours = 'hour_' + dt.hour.astype('str') 38 | dow_hour = 'dow_hour_' + dt.dayofweek.astype('str') + '_' + dt.hour.astype('str') 39 | 40 | display_str = 'u_' + df_events.uuid + ' ' + \ 41 | 'd_' + df_events.document_id.astype('str') + ' ' + \ 42 | 'p_' + df_events.platform.astype('str') + ' ' + \ 43 | dow + ' ' + hours + ' ' + dow_hour + ' ' + \ 44 | geo_str 45 | 46 | df_events_processed = pd.DataFrame() 47 | df_events_processed['display_id'] = df_events.display_id 48 | df_events_processed['display_str'] = display_str 49 | 50 | 51 | # ad documents data 52 | 53 | df_promoted = pd.read_csv("../data/promoted_content.csv") 54 | 55 | ad_string = 'addoc_' + df_promoted.document_id.astype('str') + ' ' \ 56 | 'campaign_' + df_promoted.campaign_id.astype('str') + ' ' \ 57 | 'adv_' + df_promoted.advertiser_id.astype('str') 58 | 59 | 
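# keep only ad_id and its token string; these ad tokens are later concatenated with the display tokens into one text document per (display_id, ad_id) pair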
df_promoted_processed = pd.DataFrame() 60 | df_promoted_processed['ad_id'] = df_promoted.ad_id 61 | df_promoted_processed['promoted_ad_str'] = ad_string 62 | 63 | 64 | ad_to_idx = dict(zip(df_promoted_processed.ad_id, df_promoted_processed.index)) 65 | 66 | 67 | # processing data in batches 68 | 69 | def prepare_batch(batch): 70 | batch = batch.reset_index(drop=1) 71 | 72 | promoted_idx = batch.ad_id.apply(ad_to_idx.get) 73 | promoted_ad_str = df_promoted_processed.promoted_ad_str.iloc[promoted_idx] 74 | 75 | display_str = df_events_processed.display_str.iloc[batch.display_id - 1] 76 | 77 | promoted_ad_str.reset_index(drop=1, inplace=1) 78 | display_str.reset_index(drop=1, inplace=1) 79 | 80 | batch['ad_display_str'] = promoted_ad_str + ' ' + display_str 81 | return batch 82 | 83 | 84 | def append_to_csv(batch, csv_file): 85 | props = dict(encoding='utf-8', index=False) 86 | if not os.path.exists(csv_file): 87 | batch.to_csv(csv_file, **props) 88 | else: 89 | batch.to_csv(csv_file, mode='a', header=False, **props) 90 | 91 | def delete_file_if_exists(filename): 92 | if os.path.exists(filename): 93 | os.remove(filename) 94 | 95 | def chunk_dataframe(df, n): 96 | for i in range(0, len(df), n): 97 | yield df.iloc[i:i+n] 98 | 99 | 100 | # preparing data for train & test 101 | 102 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 103 | 104 | delete_file_if_exists('tmp/svm_features_train.csv') 105 | 106 | for batch in tqdm(chunk_dataframe(df_all, n=1000000)): 107 | batch = prepare_batch(batch) 108 | append_to_csv(batch, 'tmp/svm_features_train.csv') 109 | 110 | 111 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 112 | 113 | delete_file_if_exists('tmp/svm_features_test.csv') 114 | 115 | for batch in tqdm(chunk_dataframe(df_test, n=1000000)): 116 | batch = prepare_batch(batch) 117 | append_to_csv(batch, 'tmp/svm_features_test.csv') 118 | -------------------------------------------------------------------------------- /1_train_ftrl.py: -------------------------------------------------------------------------------- 1 | # use pypy for running this script 2 | 3 | import re 4 | from time import time 5 | from csv import DictReader 6 | from time import time 7 | 8 | import ftrl 9 | from ml_metrics_auc import auc 10 | 11 | spaces = re.compile(r' +') 12 | 13 | 14 | # model parameters 15 | 16 | alpha = 0.1 17 | beta = 0.0 18 | L1 = 2.0 19 | L2 = 0.0 20 | 21 | D = 2 ** 25 22 | 23 | interactions = True 24 | n_epochs = 1 25 | show_auc = False 26 | 27 | models = {} 28 | models['0'] = ftrl.FtrlProximal(alpha, beta, L1, L2, D, interactions) 29 | models['1'] = ftrl.FtrlProximal(alpha, beta, L1, L2, D, interactions) 30 | model_full = ftrl.FtrlProximal(alpha, beta, L1, L2, D, interactions) 31 | 32 | 33 | # training the models 34 | 35 | 36 | t0 = time() 37 | 38 | print('trainning models...') 39 | 40 | for i in range(n_epochs): 41 | print('epoch %d...' 
% i) 42 | 43 | with open('tmp/svm_features_train.csv', 'r') as f: 44 | reader = DictReader(f) 45 | 46 | cnt = 0 47 | for row in reader: 48 | y = int(row['clicked']) 49 | 50 | x = spaces.split(row['ad_display_str'].strip()) 51 | 52 | if row['fold'] == '0': 53 | fold = '1' 54 | else: # '1' 55 | fold = '0' 56 | 57 | models[fold].fit(x, y) 58 | model_full.fit(x, y) 59 | 60 | cnt = cnt + 1 61 | if cnt % 1000000 == 0: 62 | print('processed %dth row' % cnt) 63 | 64 | 65 | print('training took %0.3fm' % ((time() - t0) / 60)) 66 | 67 | 68 | # validation and oof prediction 69 | 70 | print('validating models...') 71 | 72 | t0 = time() 73 | 74 | all_y = {'0': [], '1': []} 75 | all_pred = {'0': [], '1': []} 76 | 77 | f_pred = {} 78 | f_pred['0'] = open('predictions/ftrl_pred_0.txt', 'w') 79 | f_pred['0'].write('y_actual,y_pred\n') 80 | 81 | f_pred['1'] = open('predictions/ftrl_pred_1.txt', 'w') 82 | f_pred['1'].write('y_actual,y_pred\n') 83 | 84 | with open('tmp/svm_features_train.csv', 'r') as f: 85 | reader = DictReader(f) 86 | 87 | cnt = 0 88 | for row in reader: 89 | y = int(row['clicked']) 90 | fold = row['fold'] 91 | 92 | x = spaces.split(row['ad_display_str'].strip()) 93 | y_pred = models[fold].predict(x) 94 | 95 | all_y[fold].append(y) 96 | all_pred[fold].append(y_pred) 97 | f_pred[fold].write('%s,%s\n' % (y, y_pred)) 98 | 99 | cnt = cnt + 1 100 | if cnt % 1000000 == 0: 101 | print('processed %dth row' % cnt) 102 | if show_auc and cnt % 5000000 == 0: 103 | auc0 = auc(all_y['0'], all_pred['0']) 104 | auc1 = auc(all_y['1'], all_pred['1']) 105 | print('auc: %.4f, %.4f' % (auc0, auc1)) 106 | 107 | auc0 = auc(all_y['0'], all_pred['0']) 108 | auc1 = auc(all_y['1'], all_pred['1']) 109 | print('final auc: %.4f, %.4f' % (auc0, auc1)) 110 | 111 | f_pred['0'].close() 112 | f_pred['1'].close() 113 | 114 | print('predict took %0.3fm' % ((time() - t0) / 60)) 115 | del all_y, all_pred 116 | 117 | 118 | # predicting the results on test 119 | 120 | print('applying the model to the test data...') 121 | 122 | t0 = time() 123 | 124 | f_pred = open('predictions/ftrl_pred_test.txt', 'w') 125 | f_pred.write('y_pred\n') 126 | 127 | with open('tmp/svm_features_test.csv', 'r') as f: 128 | reader = DictReader(f) 129 | 130 | cnt = 0 131 | for row in reader: 132 | x = spaces.split(row['ad_display_str'].strip()) 133 | y_pred = model_full.predict(x) 134 | f_pred.write('%s\n' % y_pred) 135 | 136 | cnt = cnt + 1 137 | if cnt % 1000000 == 0: 138 | print('processed %dth row' % cnt) 139 | 140 | f_pred.close() 141 | 142 | print('predict took %0.3fm' % ((time() - t0) / 60)) 143 | -------------------------------------------------------------------------------- /1_train_svm.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from time import time 4 | 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from sklearn.feature_extraction.text import HashingVectorizer 9 | from sklearn.svm import LinearSVC 10 | from sklearn.metrics import roc_auc_score 11 | 12 | 13 | # building the data for train 14 | 15 | df_all = pd.read_csv('tmp/svm_features_train.csv') 16 | 17 | text_vec = HashingVectorizer(dtype=np.uint8, n_features=10000000, norm=None, 18 | lowercase=False, binary=True, token_pattern='\\S+', 19 | non_negative=True) 20 | 21 | t0 = time() 22 | X = text_vec.transform(df_all.ad_display_str) 23 | 24 | print('building the train matrix took %.4fm' % (time() - t0) / 60) 25 | 26 | 27 | fold = df_all.fold.values 28 | 29 | X_0 = X[fold == 0] 30 | X_1 = X[fold == 1] 31 | 32 | y = 
df_all.clicked.values 33 | y_0 = y[fold == 0] 34 | y_1 = y[fold == 1] 35 | 36 | 37 | # fitting the model for fold 1 38 | 39 | C = 0.1 40 | 41 | t0 = time() 42 | 43 | model_1 = LinearSVC(penalty='l1', dual=False, C=C, random_state=1) 44 | model_1.fit(X_0, y_0) 45 | 46 | y_pred = model_1.decision_function(X_1) 47 | auc = roc_auc_score(y_1, y_pred) 48 | 49 | np.save('predictions/svm_1_preds.npy', y_pred) 50 | 51 | print('C=%s, took %.3fs, auc=%.3f' % (C, time() - t0, auc)) 52 | 53 | 54 | # fitting the model for fold 0 55 | 56 | t0 = time() 57 | 58 | model_0 = LinearSVC(penalty='l1', dual=False, C=C, random_state=1) 59 | model_0.fit(X_1, y_1) 60 | 61 | y_pred = model_0.decision_function(X_0) 62 | auc = roc_auc_score(y_0, y_pred) 63 | 64 | np.save('predictions/svm_0_preds.npy', y_pred) 65 | 66 | print('C=%s, took %.3fs, auc=%.3f' % (C, time() - t0, auc)) 67 | 68 | 69 | # predictions for test 70 | 71 | df_test = pd.read_csv('tmp/svm_features_test.csv') 72 | 73 | t0 = time() 74 | X_test = text_vec.transform(df_test.ad_display_str) 75 | 76 | print('building the test matrix took %.4fm' % ((time() - t0) / 60)) 77 | 78 | pred_0 = model_0.decision_function(X_test) 79 | pred_1 = model_1.decision_function(X_test) 80 | pred_final = (pred_0 + pred_1) / 2 81 | 82 | np.save('predictions/svm_test_preds.npy', pred_final) -------------------------------------------------------------------------------- /2_extract_leaked_docs.py: -------------------------------------------------------------------------------- 1 | # run it with pypy 2 | # taken from 3 | # https://www.kaggle.com/jiweiliu/outbrain-click-prediction/extract-leak-in-30-mins-with-small-memory 4 | 5 | import csv 6 | import os 7 | 8 | leak = {} 9 | 10 | with open('../data/promoted_content.csv') as f: 11 | promoted = csv.DictReader(f) 12 | for c, row in enumerate(promoted): 13 | if row['document_id'] != '': 14 | leak[row['document_id']] = 1 15 | 16 | 17 | with open('../data/page_views.csv') as f: 18 | page_views = csv.DictReader(f) 19 | for c, row in enumerate(page_views): 20 | if c % 1000000 == 0: 21 | print c 22 | 23 | doc_id = row['document_id'] 24 | 25 | if doc_id not in leak: 26 | continue 27 | 28 | if leak[doc_id] == 1: 29 | leak[doc_id] = set() 30 | 31 | lu = len(leak[doc_id]) 32 | leak[doc_id].add(row['uuid']) 33 | 34 | 35 | with open('tmp/leaked_docs.csv', 'w') as fo: 36 | fo.write('document_id,uuids\n') 37 | for k, v in leak.items(): 38 | if v == 1: 39 | continue 40 | 41 | fo.write('%s,%s\n' % (k, ' '.join(v))) 42 | -------------------------------------------------------------------------------- /2_leak_features.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import feather 4 | import sys 5 | 6 | import csv 7 | csv.field_size_limit(sys.maxsize) 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | # reading the leaked documents 13 | 14 | docs_size = {} 15 | leak_uuid_dict = {} 16 | 17 | with open("tmp/leaked_docs.csv") as f: 18 | reader = csv.DictReader(f) 19 | leak_uuid_dict = {} 20 | 21 | for row in reader: 22 | doc_id = int(row['document_id']) 23 | uuids = row['uuids'].split(' ') 24 | leak_uuid_dict[doc_id] = set(uuids) 25 | docs_size[doc_id] = len(uuids) 26 | 27 | 28 | # 29 | 30 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 31 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 32 | 33 | 34 | # getting user ids and document ids 35 | 36 | df_events = pd.read_csv('../data/events.csv', usecols=['uuid']) 37 | df_ads = pd.read_csv('../data/promoted_content.csv', 38 | 
usecols=['ad_id', 'document_id']) 39 | 40 | # joining doc_id and ad_id 41 | 42 | ad_to_idx = dict(zip(df_ads.ad_id, df_ads.index)) 43 | 44 | ad_idx = df_all.ad_id.apply(ad_to_idx.get) 45 | ad_document_id = df_ads.document_id.iloc[ad_idx].reset_index(drop=1) 46 | df_all['ad_document_id'] = ad_document_id 47 | 48 | ad_idx = df_test.ad_id.apply(ad_to_idx.get) 49 | ad_document_id = df_ads.document_id.iloc[ad_idx].reset_index(drop=1) 50 | df_test['ad_document_id'] = ad_document_id 51 | 52 | # joining display_id and user 53 | 54 | df_all['uuid'] = df_events.iloc[df_all.display_id - 1].reset_index(drop=1) 55 | df_test['uuid'] = df_events.iloc[df_test.display_id - 1].reset_index(drop=1) 56 | 57 | 58 | # extracting the leak 59 | 60 | def is_leak(doc_id, uuid): 61 | if doc_id in leak_uuid_dict: 62 | if uuid in leak_uuid_dict[doc_id]: 63 | return 1 64 | return 0 65 | 66 | df_all['leak'] = df_all.ad_document_id.combine(df_all.uuid, is_leak) 67 | df_test['leak'] = df_test.ad_document_id.combine(df_test.uuid, is_leak) 68 | 69 | df_all['doc_known_views'] = df_all.ad_document_id.apply(lambda d: docs_size.get(d, 0)) 70 | df_test['doc_known_views'] = df_test.ad_document_id.apply(lambda d: docs_size.get(d, 0)) 71 | 72 | df_train_0 = df_all[df_all.fold == 0] 73 | df_train_1 = df_all[df_all.fold == 1] 74 | 75 | np.save('features/leak_0.npy', df_train_0.leak.values) 76 | np.save('features/leak_1.npy', df_train_1.leak.values) 77 | np.save('features/leak_test.npy', df_test.leak.values) 78 | 79 | np.save('features/doc_known_views_0.npy', df_train_0.doc_known_views.values) 80 | np.save('features/doc_known_views_1.npy', df_train_1.doc_known_views.values) 81 | np.save('features/doc_known_views_test.npy', df_test.doc_known_views.values) -------------------------------------------------------------------------------- /3_doc_similarity_features.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import csv 4 | from tqdm import tqdm 5 | from collections import defaultdict, Counter 6 | from math import log 7 | 8 | import numpy as np 9 | 10 | from sklearn.feature_extraction import DictVectorizer 11 | from sklearn.preprocessing import normalize 12 | from sklearn.decomposition import TruncatedSVD 13 | 14 | import feather 15 | import os 16 | 17 | 18 | # display_id to document_id mapping 19 | 20 | display_doc_ids = [] 21 | 22 | with open('../data/events.csv') as f: 23 | reader = csv.DictReader(f) 24 | 25 | for row in tqdm(reader): 26 | doc_id = int(row['document_id']) 27 | display_doc_ids.append(doc_id) 28 | 29 | 30 | # ad_id to document_id mapping 31 | 32 | ad_doc_id = {} 33 | 34 | with open('../data/promoted_content.csv') as f: 35 | reader = csv.DictReader(f) 36 | 37 | for row in tqdm(reader): 38 | ad_id = int(row['ad_id']) 39 | doc_id = int(row['document_id']) 40 | ad_doc_id[ad_id] = doc_id 41 | 42 | 43 | 44 | # reading document data 45 | 46 | categories = defaultdict(list) 47 | 48 | with open('../data/documents_categories.csv') as f: 49 | reader = csv.DictReader(f) 50 | 51 | for row in tqdm(reader): 52 | doc_id = int(row['document_id']) 53 | cat = 'cat_' + row['category_id'] 54 | conf = float(row['confidence_level']) 55 | categories[doc_id].append((cat, conf)) 56 | 57 | entities = defaultdict(list) 58 | 59 | with open('../data/documents_entities.csv') as f: 60 | reader = csv.DictReader(f) 61 | 62 | for row in tqdm(reader): 63 | doc_id = int(row['document_id']) 64 | en = 'entity_' + row['entity_id'] 65 | conf = float(row['confidence_level']) 66 | 
entities[doc_id].append((en, conf)) 67 | 68 | topics = defaultdict(list) 69 | 70 | with open('../data/documents_topics.csv') as f: 71 | reader = csv.DictReader(f) 72 | 73 | for row in tqdm(reader): 74 | doc_id = int(row['document_id']) 75 | t = 'topic_' + row['topic_id'] 76 | conf = float(row['confidence_level']) 77 | topics[doc_id].append((t, conf)) 78 | 79 | 80 | 81 | doc_ids = [] 82 | doc_values = [] 83 | values_cnt = Counter() 84 | 85 | with open('../data/documents_meta.csv') as f: 86 | reader = csv.DictReader(f) 87 | 88 | for row in tqdm(reader): 89 | doc_id = int(row['document_id']) 90 | 91 | source = 'src_' + row['source_id'] 92 | if not source: 93 | source = 'src_unk' 94 | 95 | publisher = 'pub_' + row['publisher_id'] 96 | if not publisher: 97 | publisher = 'pub_unk' 98 | 99 | doc_vector = [(source, 1.0), (publisher, 1.0)] 100 | doc_vector.extend(categories[doc_id]) 101 | doc_vector.extend(entities[doc_id]) 102 | doc_vector.extend(topics[doc_id]) 103 | 104 | doc_ids.append(doc_id) 105 | doc_values.append(dict(doc_vector)) 106 | 107 | values_cnt.update([n for (n, _) in doc_vector]) 108 | 109 | 110 | doc_id_to_idx = {d: i for (i, d) in enumerate(doc_ids)} 111 | 112 | 113 | # discard infrequent and calculate idf 114 | 115 | min_df = 5 116 | freq = {t for (t, c) in values_cnt.items() if c >= min_df} 117 | 118 | N = len(doc_ids) 119 | log_N = log(N) 120 | 121 | idf = {k: log_N - log(v) for (k, v) in values_cnt.items() if k in freq} 122 | 123 | 124 | def discard_infreq(in_dict): 125 | return {k: w for (k, w) in in_dict.items() if k in freq} 126 | 127 | def idf_transform(in_dict): 128 | return {k: w * idf[k] for (k, w) in in_dict.items()} 129 | 130 | doc_values = [discard_infreq(d) for d in doc_values] 131 | idf_doc_values = [idf_transform(d) for d in doc_values] 132 | 133 | 134 | # vectorizing the documents 135 | 136 | dv = DictVectorizer(dtype=np.float32) 137 | X_idf = dv.fit_transform(idf_doc_values) 138 | 139 | del dv 140 | del values_cnt, idf, freq, doc_values, idf_doc_values 141 | del categories, entities, topics, doc_ids, doc_values 142 | 143 | 144 | # lsi 145 | 146 | svd_idf = TruncatedSVD(n_components=150, random_state=1) 147 | svd_idf.fit(X_idf) 148 | 149 | 150 | # processing data in batches 151 | 152 | def append_to_csv(batch, csv_file): 153 | props = dict(encoding='utf-8', index=False) 154 | if not os.path.exists(csv_file): 155 | batch.to_csv(csv_file, **props) 156 | else: 157 | batch.to_csv(csv_file, mode='a', header=False, **props) 158 | 159 | def delete_file_if_exists(filename): 160 | if os.path.exists(filename): 161 | os.remove(filename) 162 | 163 | def chunk_dataframe(df, n): 164 | for i in range(0, len(df), n): 165 | yield df.iloc[i:i+n] 166 | 167 | 168 | def prepare_batch(batch): 169 | batch = batch.reset_index(drop=1) 170 | 171 | display_docs = (batch.display_id - 1).apply(display_doc_ids.__getitem__) 172 | display_docs_idx = display_docs.apply(doc_id_to_idx.get) 173 | 174 | ad_docs = batch.ad_id.apply(ad_doc_id.get) 175 | ad_docs_idx = ad_docs.apply(doc_id_to_idx.get) 176 | 177 | X1 = X_idf[display_docs_idx.values] 178 | X2 = X_idf[ad_docs_idx.values] 179 | 180 | dot = X1.multiply(X2).sum(axis=1) 181 | batch['doc_idf_dot'] = np.asarray(dot).reshape(-1) 182 | 183 | X1_svd = svd_idf.transform(X1) 184 | X2_svd = svd_idf.transform(X2) 185 | 186 | batch['doc_idf_dot_lsa'] = (X1_svd * X2_svd).sum(axis=1) 187 | 188 | X1 = normalize(X1.astype(np.float)) 189 | X2 = normalize(X2.astype(np.float)) 190 | 191 | dot = X1.multiply(X2).sum(axis=1) 192 | batch['doc_idf_cos'] = 
np.asarray(dot).reshape(-1) 193 | 194 | return batch 195 | 196 | 197 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 198 | 199 | delete_file_if_exists('tmp/doc_features_train.csv') 200 | 201 | for batch in tqdm(chunk_dataframe(df_all, n=1000000)): 202 | batch = prepare_batch(batch) 203 | append_to_csv(batch, 'tmp/doc_features_train.csv') 204 | 205 | del df_all 206 | 207 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 208 | 209 | delete_file_if_exists('tmp/doc_features_test.csv') 210 | 211 | for batch in tqdm(chunk_dataframe(df_test, n=1000000)): 212 | batch = prepare_batch(batch) 213 | append_to_csv(batch, 'tmp/doc_features_test.csv') 214 | 215 | del df_test 216 | 217 | del svd_idf, X_idf 218 | 219 | 220 | # now processing the features and saving them as feather files 221 | 222 | types = dict(display_id='uint32', ad_id='uint32', clicked='uint8', fold='uint8', 223 | doc_idf_dot='float32', doc_idf_dot_lsa='float32', doc_idf_cos='float32') 224 | df_all = pd.read_csv('tmp/doc_features_train.csv', dtype=types) 225 | 226 | del types['clicked'], types['fold'] 227 | df_test = pd.read_csv('tmp/doc_features_test.csv', dtype=types) 228 | 229 | 230 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 231 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 232 | del df_train_0['fold'], df_train_1['fold'], df_all 233 | 234 | cols_to_rank = ['doc_idf_dot', 'doc_idf_dot_lsa', 'doc_idf_cos'] 235 | 236 | for f in tqdm(cols_to_rank): 237 | for df in [df_train_0, df_train_1, df_test]: 238 | df['%s_rank' % f] = df.groupby('display_id')[f].rank(ascending=0) 239 | df['%s_rank' % f] = df['%s_rank' % f].astype('uint8') 240 | 241 | 242 | feather.write_dataframe(df_train_0, 'features/docs_df_train_0.feather') 243 | feather.write_dataframe(df_train_1, 'features/docs_df_train_1.doc.feather') 244 | feather.write_dataframe(df_test, 'features/docs_df_test.feather') -------------------------------------------------------------------------------- /4_categorical_data_join.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import xgboost as xgb 8 | import feather 9 | from tqdm import tqdm 10 | 11 | from sklearn.preprocessing import LabelEncoder 12 | 13 | from itertools import combinations 14 | 15 | 16 | 17 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 18 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 19 | 20 | 21 | # event features: 22 | # - geo 23 | # - time 24 | # - user 25 | # - platform 26 | 27 | 28 | df_display = pd.read_csv('../data/events.csv') 29 | df_display.geo_location.fillna('', inplace=1) 30 | 31 | # geo features 32 | 33 | df_geo = df_display.geo_location.str.split('>', expand=True) 34 | df_geo.fillna('*', inplace=1) 35 | df_geo.columns = ['geo_0', 'geo_1', 'geo_2'] 36 | del df_geo['geo_2'] 37 | df_geo['geo_second_lev'] = df_geo['geo_0'] + '>' + df_geo['geo_1'] 38 | del df_geo['geo_1'] 39 | 40 | df_display['geo_0'] = df_geo['geo_0'] 41 | df_display['geo_1'] = df_geo['geo_second_lev'] 42 | df_display.rename(columns={'geo_location': 'geo_2'}, inplace=1) 43 | del df_geo 44 | 45 | # time features 46 | 47 | ts = (df_display.timestamp + 1465876799998) / 1000 - (4 * 60 * 60) 48 | df_display.timestamp = pd.to_datetime(ts, unit='s') 49 | 50 | dt = df_display.timestamp.dt 51 | df_display['day'] = dt.dayofweek.astype('str') 52 | df_display['hour'] = dt.hour.astype('str') 53 | 54 | del df_display['timestamp'], dt, 
ts 55 | 56 | # platform 57 | 58 | df_display.platform = df_display.platform.astype('str') 59 | del df_display['display_id'] 60 | 61 | 62 | # user: convert to base 32 to occupy less space 63 | 64 | df_display['user_id'] = LabelEncoder().fit_transform(df_display.uuid) 65 | del df_display['uuid'] 66 | 67 | def base32(i): 68 | return np.base_repr(i, base=32) 69 | 70 | df_display['user_id'] = df_display['user_id'].apply(base32) 71 | 72 | 73 | 74 | # document features: 75 | # - top category 76 | # - top entity 77 | # - top topic 78 | # - meta: publisher, source 79 | 80 | df_ads = pd.read_csv('../data/promoted_content.csv') 81 | ad_to_idx = dict(zip(df_ads.ad_id, df_ads.index)) 82 | 83 | ads_docs = set(df_display.document_id) 84 | ads_docs.update(df_ads.document_id) 85 | 86 | 87 | # document categories 88 | 89 | df_doc_cat = pd.read_csv('../data/documents_categories.csv') 90 | 91 | df_doc_cat = df_doc_cat.drop_duplicates(subset='document_id', keep='first') 92 | df_doc_cat = df_doc_cat[df_doc_cat.confidence_level >= 0.8] 93 | df_doc_cat = df_doc_cat[df_doc_cat.document_id.isin(ads_docs)] 94 | 95 | cat_counts = df_doc_cat.category_id.value_counts() 96 | freq_cats = set(cat_counts[cat_counts >= 5].index) 97 | 98 | df_doc_cat = df_doc_cat[df_doc_cat.category_id.isin(freq_cats)] 99 | 100 | doc_top_cat = dict(zip(df_doc_cat.document_id, df_doc_cat.category_id)) 101 | del freq_cats, cat_counts, df_doc_cat 102 | 103 | 104 | # document entities: hash them to occupy less space 105 | 106 | D = 2 ** 24 107 | def entity_name_reduce(entity): 108 | return '%x' % abs(hash(entity) % D) 109 | 110 | 111 | df_doc_entities = pd.read_csv('../data/documents_entities.csv') 112 | 113 | df_doc_entities = df_doc_entities[df_doc_entities.confidence_level >= 0.8] 114 | df_doc_entities = df_doc_entities[df_doc_entities.document_id.isin(ads_docs)] 115 | 116 | df_doc_entities = df_doc_entities.drop_duplicates(subset='document_id', keep='first') 117 | df_doc_entities = df_doc_entities.reset_index(drop=1) 118 | 119 | df_doc_entities.entity_id = df_doc_entities.entity_id.apply(entity_name_reduce) 120 | 121 | entity_counts = df_doc_entities.entity_id.value_counts() 122 | freq_entites = set(entity_counts[entity_counts >= 5].index) 123 | df_doc_entities = df_doc_entities[df_doc_entities.entity_id.isin(freq_entites)] 124 | 125 | doc_top_entity = dict(zip(df_doc_entities.document_id, df_doc_entities.entity_id)) 126 | 127 | del df_doc_entities, entity_counts, freq_entites 128 | 129 | 130 | # document topics 131 | 132 | df_doc_topics = pd.read_csv('../data/documents_topics.csv') 133 | 134 | df_doc_topics = df_doc_topics[df_doc_topics.confidence_level >= 0.8] 135 | df_doc_topics = df_doc_topics[df_doc_topics.document_id.isin(ads_docs)] 136 | 137 | df_doc_topics = df_doc_topics.drop_duplicates(subset='document_id', keep='first') 138 | df_doc_topics = df_doc_topics.reset_index(drop=1) 139 | 140 | topic_cnt = df_doc_topics.topic_id.value_counts() 141 | freq_topics = set(topic_cnt[topic_cnt >= 5].index) 142 | 143 | df_doc_topics = df_doc_topics[df_doc_topics.topic_id.isin(freq_topics)] 144 | doc_top_topic = dict(zip(df_doc_topics.document_id, df_doc_topics.topic_id)) 145 | 146 | del df_doc_topics, topic_cnt, freq_topics 147 | 148 | 149 | # document meta info 150 | 151 | df_doc_meta = pd.read_csv('../data/documents_meta.csv') 152 | df_doc_meta = df_doc_meta[df_doc_meta.document_id.isin(ads_docs)] 153 | del df_doc_meta['publish_time'] 154 | 155 | df_doc_meta.source_id.fillna(0, inplace=1) 156 | df_doc_meta.source_id = 
df_doc_meta.source_id.astype('uint32') 157 | 158 | df_doc_meta.publisher_id.fillna(0, inplace=1) 159 | df_doc_meta.publisher_id = df_doc_meta.publisher_id.astype('uint32') 160 | 161 | df_doc_meta = df_doc_meta.reset_index(drop=1) 162 | meta_idx = dict(zip(df_doc_meta.document_id, df_doc_meta.index)) 163 | 164 | 165 | 166 | # to avoid confusion, let's rename document_id columns 167 | 168 | df_display.rename(columns={'document_id': 'on_document_id'}, inplace=1) 169 | df_ads.rename(columns={'document_id': 'ad_document_id'}, inplace=1) 170 | 171 | 172 | # we will do everything in batches 173 | def prepare_batch(batch): 174 | batch = batch.reset_index(drop=1) 175 | 176 | batch_display = df_display.iloc[batch.display_id - 1].reset_index(drop=1) 177 | 178 | batch_ad_ids = batch.ad_id.apply(ad_to_idx.get) 179 | batch_ads = df_ads.iloc[batch_ad_ids].reset_index(drop=1) 180 | del batch_ads['ad_id'] 181 | 182 | batch_meta_idx = batch_ads.ad_document_id.apply(meta_idx.get) 183 | batch_ad_doc_meta = df_doc_meta.iloc[batch_meta_idx].reset_index(drop=1) 184 | 185 | batch_ad_doc_meta['top_entity'] = \ 186 | batch_ad_doc_meta.document_id.apply(lambda did: doc_top_entity.get(did, 'unk')) 187 | batch_ad_doc_meta['top_topic'] = \ 188 | batch_ad_doc_meta.document_id.apply(lambda did: doc_top_topic.get(did, 'unk')) 189 | batch_ad_doc_meta['top_cat'] = \ 190 | batch_ad_doc_meta.document_id.apply(lambda did: doc_top_cat.get(did, 'unk')) 191 | 192 | del batch_ad_doc_meta['document_id'] 193 | 194 | batch_ad_doc_meta.columns = ['ad_doc_%s' % c for c in batch_ad_doc_meta.columns] 195 | 196 | batch_meta_idx = batch_display.on_document_id.apply(meta_idx.get) 197 | batch_on_doc_meta = df_doc_meta.iloc[batch_meta_idx].reset_index(drop=1) 198 | 199 | batch_on_doc_meta['top_entity'] = \ 200 | batch_on_doc_meta.document_id.apply(lambda did: doc_top_entity.get(did, 'unk')) 201 | batch_on_doc_meta['top_topic'] = \ 202 | batch_on_doc_meta.document_id.apply(lambda did: doc_top_topic.get(did, 'unk')) 203 | batch_on_doc_meta['top_cat'] = \ 204 | batch_on_doc_meta.document_id.apply(lambda did: doc_top_cat.get(did, 'unk')) 205 | 206 | del batch_on_doc_meta['document_id'] 207 | 208 | batch_on_doc_meta.columns = ['on_doc_%s' % c for c in batch_on_doc_meta.columns] 209 | 210 | joined_batch = pd.concat([batch, batch_ads, batch_display, 211 | batch_ad_doc_meta, batch_on_doc_meta], axis=1) 212 | 213 | for c in ['ad_doc_source_id', 'ad_doc_publisher_id', 'ad_document_id', 'ad_doc_top_cat', 214 | 'on_doc_source_id', 'on_doc_publisher_id', 'on_document_id', 'on_doc_top_cat', 215 | 'ad_id', 'campaign_id', 'advertiser_id']: 216 | joined_batch[c] = joined_batch[c].astype('str') 217 | 218 | joined_batch.fillna('unk', inplace=1) 219 | all_features = set(joined_batch.columns) - {'clicked', 'fold', 'display_id'} 220 | 221 | for c in sorted(all_features): 222 | if 'on_doc' in c or 'geo' in c or c in {'day', 'hour', 'user_id', 'ad_id'}: 223 | continue 224 | 225 | for c2 in ['day', 'hour', 'geo_0', 'geo_1', 'geo_2']: 226 | joined_batch['%s_%s' % (c, c2)] = joined_batch[c] + '_' + joined_batch[c2] 227 | 228 | two_way_comb = sorted(all_features - {'day', 'hour', 'geo_0', 'geo_1', 'geo_2'}) 229 | 230 | combs = list(combinations(two_way_comb, 2)) 231 | 232 | for c1, c2 in combs: 233 | if 'on_doc' in c1 and 'on_doc' in c2: 234 | continue 235 | joined_batch['%s_%s' % (c1, c2)] = joined_batch[c1].astype('str') + '_' + joined_batch[c2].astype('str') 236 | 237 | return joined_batch 238 | 239 | 240 | 241 | def append_to_csv(batch, csv_file): 242 | props = 
dict(encoding='utf-8', index=False) 243 | if not os.path.exists(csv_file): 244 | batch.to_csv(csv_file, **props) 245 | else: 246 | batch.to_csv(csv_file, mode='a', header=False, **props) 247 | 248 | def delete_file_if_exists(filename): 249 | if os.path.exists(filename): 250 | os.remove(filename) 251 | 252 | def chunk_dataframe(df, n): 253 | for i in range(0, len(df), n): 254 | yield df.iloc[i:i+n] 255 | 256 | 257 | # apply to train 258 | 259 | df = feather.read_dataframe('tmp/clicks_train_50_50.feather') 260 | 261 | delete_file_if_exists('tmp/categorical_joined_train.csv') 262 | 263 | for batch in tqdm(chunk_dataframe(df, n=100000)): 264 | batch = prepare_batch(batch) 265 | append_to_csv(batch, 'tmp/categorical_joined_train.csv') 266 | 267 | 268 | # apply to test 269 | 270 | df = feather.read_dataframe('tmp/clicks_test.feather') 271 | 272 | delete_file_if_exists('tmp/categorical_joined_test.csv') 273 | 274 | for batch in tqdm(chunk_dataframe(df, n=100000)): 275 | batch = prepare_batch(batch) 276 | append_to_csv(batch, 'tmp/categorical_joined_test.csv') 277 | -------------------------------------------------------------------------------- /4_categorical_data_unwrap_columnwise.py: -------------------------------------------------------------------------------- 1 | # run with pypy 2 | 3 | from tqdm import tqdm 4 | import csv 5 | 6 | train_file = 'tmp/categorical_joined_train.csv' 7 | test_file = 'tmp/categorical_joined_test.csv' 8 | 9 | 10 | def copy_columnwise(filename, result_dir): 11 | with open(filename) as f: 12 | reader = csv.DictReader(f) 13 | files = {} 14 | 15 | for f in reader.fieldnames: 16 | files[f] = open(result_dir + '/' + f + '.txt', 'w') 17 | 18 | for row in tqdm(reader): 19 | for k, v in row.items(): 20 | files[k].write(v + '\n') 21 | 22 | for f in files.values(): 23 | f.flush() 24 | f.close() 25 | 26 | print('copy train...') 27 | copy_columnwise(train_file, 'tmp/categorical/train') 28 | 29 | print('copy test...') 30 | copy_columnwise(test_file, 'tmp/categorical/test') 31 | -------------------------------------------------------------------------------- /4_mean_target_value.py: -------------------------------------------------------------------------------- 1 | # run with cat categorical_features.txt | parallel --jobs 6 python 04_mean_target_value.py {} 2 | # coding: utf-8 3 | 4 | import sys 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from time import time 10 | import feather 11 | 12 | column = sys.argv[1] 13 | print('processing column %s...' 
% column) 14 | 15 | C = 12 16 | 17 | 18 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 19 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 20 | 21 | train_col = pd.read_csv('tmp/categorical/train/' + column + '.txt', header=None, dtype='str') 22 | df_all[column] = train_col[0] 23 | 24 | test_col = pd.read_csv('tmp/categorical/test/' + column + '.txt', header=None, dtype='str') 25 | df_test[column] = test_col[0] 26 | 27 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 28 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 29 | del df_train_0['fold'], df_train_1['fold'], df_all['fold'] 30 | 31 | 32 | # fold 0 train 33 | 34 | print('training on fold 0, predicting on 1') 35 | 36 | t0 = time() 37 | 38 | m0 = (df_train_0.clicked == 1).mean() 39 | 40 | cnt_clicked_0 = df_train_0[df_train_0.clicked == 1][column].value_counts() 41 | cnt_all_0 = df_train_0[column].value_counts() 42 | 43 | probs_1 = (cnt_clicked_0 + C * m0) / (cnt_all_0 + C) 44 | probs_1 = probs_1[df_train_1[column]].reset_index(drop=1) 45 | probs_1.fillna(m0, inplace=1) 46 | 47 | df_train_1['prob'] = probs_1 48 | 49 | print('took %0.3fs' % (time() - t0)) 50 | 51 | 52 | # fold 1 train 53 | 54 | print('training on fold 1, predicting on 0') 55 | 56 | t0 = time() 57 | 58 | m1 = (df_train_1.clicked == 1).mean() 59 | cnt_clicked_1 = df_train_1[df_train_1.clicked == 1][column].value_counts() 60 | cnt_all_1 = df_train_1[column].value_counts() 61 | 62 | probs_0 = (cnt_clicked_1 + C * m1) / (cnt_all_1 + C) 63 | probs_0 = probs_0[df_train_0[column]].reset_index(drop=1) 64 | probs_0.fillna(m1, inplace=1) 65 | 66 | df_train_0['prob'] = probs_0 67 | 68 | print('took %0.3fs' % (time() - t0)) 69 | 70 | 71 | # full train 72 | 73 | print('training on all data, predicting on test') 74 | 75 | t0 = time() 76 | 77 | m = (df_all.clicked == 1).mean() 78 | cnt_clicked = df_all[df_all.clicked == 1][column].value_counts() 79 | cnt_all = df_all[column].value_counts() 80 | 81 | probs = (cnt_clicked + C * m) / (cnt_all + C) 82 | probs = probs[df_test[column]].reset_index(drop=1) 83 | probs.fillna(m, inplace=1) 84 | 85 | df_test['prob'] = probs 86 | 87 | print('took %0.3fs' % (time() - t0)) 88 | 89 | 90 | # saving the results 91 | 92 | np.save('features/mtv/' + column + '_pred_0.npy', probs_0.values) 93 | np.save('features/mtv/' + column + '_pred_1.npy', probs_1.values) 94 | np.save('features/mtv/' + column + '_pred_test.npy', probs.values) 95 | 96 | 97 | # creating the rank features 98 | 99 | print('creating the ranking features...') 100 | 101 | t0 = time() 102 | 103 | f = column 104 | 105 | for df in [df_train_0, df_train_1, df_test]: 106 | df['%s_rank' % f] = df.groupby('display_id')[f].rank(method='max', ascending=0) 107 | df['%s_rank' % f] = df['%s_rank' % f].astype('uint8') 108 | 109 | print('took %0.3fs' % (time() - t0)) 110 | 111 | 112 | np.save('features/mtv/' + column + '_pred_rank_0.npy', df_train_0['%s_rank' % f].values) 113 | np.save('features/mtv/' + column + '_pred_rank_1.npy', df_train_1['%s_rank' % f].values) 114 | np.save('features/mtv/' + column + '_pred_rank_test.npy', df_test['%s_rank' % f].values) -------------------------------------------------------------------------------- /5_best_mtv_features_xgb.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import xgboost as xgb 6 | 7 | 8 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 9 | df_test = 
feather.read_dataframe('tmp/clicks_test.feather') 10 | 11 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 12 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 13 | del df_train_0['fold'], df_train_1['fold'], df_all 14 | 15 | features = list(pd.read_csv('categorical_features.txt', header=None)[0]) 16 | 17 | 18 | # training a small model to select best features 19 | # first, load the data 20 | 21 | 22 | df_train = df_train_0[:2000000].copy() 23 | df_val = df_train_1[:1000000].copy() 24 | # note: df_train_0 and df_train_1 are reused below, so they must not be deleted here 25 | 26 | for f in features: 27 | print('loading data for %s...' % f) 28 | pred_0 = 'features/mtv/%s_pred_0.npy' % f 29 | pred_1 = 'features/mtv/%s_pred_1.npy' % f 30 | rank_0 = 'features/mtv/%s_pred_rank_0.npy' % f 31 | rank_1 = 'features/mtv/%s_pred_rank_1.npy' % f 32 | 33 | df_train[f] = np.load(pred_0)[:2000000] 34 | df_val[f] = np.load(pred_1)[:1000000] 35 | df_train[f + '_rank'] = np.load(rank_0)[:2000000] 36 | df_val[f + '_rank'] = np.load(rank_1)[:1000000] 37 | 38 | 39 | ignore = {'display_id', 'ad_id', 'clicked'} 40 | columns = sorted(set(df_train.columns) - ignore) 41 | 42 | X_t = df_train[columns].values 43 | y_t = df_train.clicked.values 44 | 45 | X_v = df_val[columns].values 46 | y_v = df_val.clicked.values 47 | 48 | 49 | dtrain = xgb.DMatrix(X_t, y_t, feature_names=columns) 50 | dval = xgb.DMatrix(X_v, y_v, feature_names=columns) 51 | 52 | watchlist = [(dtrain, 'train'), (dval, 'val')] 53 | del X_t, X_v, y_t, y_v 54 | 55 | 56 | # train a small model and save only important features 57 | 58 | xgb_pars = { 59 | 'eta': 0.3, 60 | 'gamma': 0.0, 61 | 'max_depth': 6, 62 | 'min_child_weight': 100, 63 | 'max_delta_step': 0, 64 | 'subsample': 1, 65 | 'colsample_bytree': 0.6, 66 | 'colsample_bylevel': 1, 67 | 'lambda': 1, 68 | 'alpha': 0, 69 | 'tree_method': 'approx', 70 | 'objective': 'binary:logistic', 71 | 'eval_metric': 'auc', 72 | 'nthread': 12, 73 | 'seed': 42, 74 | 'silent': 1 75 | } 76 | 77 | model = xgb.train(xgb_pars, dtrain, num_boost_round=20, verbose_eval=1, 78 | evals=watchlist) 79 | 80 | scores = model.get_score(importance_type='gain') 81 | useful_features = [f for (f, s) in scores.items() if s >= 50.0] 82 | 83 | 84 | # now let's put everything together in a data frame and save the result 85 | 86 | for f in useful_features: 87 | if '_rank' in f: 88 | base_name = f[:-5] + '_pred_rank' 89 | else: 90 | base_name = f + '_pred' 91 | 92 | df_train_0[f] = np.load('features/mtv/%s_0.npy' % base_name) 93 | df_train_1[f] = np.load('features/mtv/%s_1.npy' % base_name) 94 | df_test[f] = np.load('features/mtv/%s_test.npy' % base_name) 95 | 96 | 97 | # also add the doc features 98 | 99 | df_train_0_doc = feather.read_dataframe('features/docs_df_train_0.feather') 100 | df_train_1_doc = feather.read_dataframe('features/docs_df_train_1.feather') 101 | df_test_doc = feather.read_dataframe('features/docs_df_test.feather') 102 | 103 | doc_features = ['doc_idf_dot', 'doc_idf_dot_lsa', 'doc_idf_cos', 104 | 'doc_idf_dot_rank', 'doc_idf_dot_lsa_rank', 'doc_idf_cos_rank'] 105 | 106 | for f in doc_features: 107 | df_train_0[f] = df_train_0_doc[f] 108 | df_train_1[f] = df_train_1_doc[f] 109 | df_test[f] = df_test_doc[f] 110 | 111 | 112 | df_train_0['doc_known_views'] = np.load('features/doc_known_views_0.npy') 113 | df_train_1['doc_known_views'] = np.load('features/doc_known_views_1.npy') 114 | df_test['doc_known_views'] = np.load('features/doc_known_views_test.npy') 115 | 116 | 117 | # now save everything 118 | 119 | feather.write_dataframe(df_train_0, 
'tmp/mtv_df_train_0.feather') 120 | feather.write_dataframe(df_train_1, 'tmp/mtv_df_train_1.feather') 121 | feather.write_dataframe(df_test, 'tmp/mtv_df_test.feather') 122 | -------------------------------------------------------------------------------- /5_mtv_et.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import feather 4 | import gc 5 | 6 | from sklearn.metrics import roc_auc_score 7 | from sklearn.ensemble import ExtraTreesClassifier 8 | 9 | 10 | df_train_1 = feather.read_dataframe('tmp/mtv_df_train_1.feather') 11 | features = sorted(set(df_train_1.columns) - {'display_id', 'clicked'}) 12 | 13 | y_1 = df_train_1.clicked.values 14 | X_1 = df_train_1[features].values 15 | 16 | del df_train_1 17 | gc.collect() 18 | 19 | 20 | df_train_0 = feather.read_dataframe('tmp/mtv_df_train_0.feather') 21 | 22 | y_0 = df_train_0.clicked.values 23 | X_0 = df_train_0[features].values 24 | 25 | del df_train_0 26 | gc.collect() 27 | 28 | 29 | # training a model 30 | 31 | n_estimators = 100 32 | 33 | et_params = dict( 34 | criterion='entropy', 35 | max_depth=40, 36 | min_samples_split=6, 37 | min_samples_leaf=6, 38 | max_features=6, 39 | bootstrap=False, 40 | n_jobs=-1, 41 | random_state=1 42 | ) 43 | 44 | 45 | et0 = ExtraTreesClassifier(warm_start=True, **et_params) 46 | et1 = ExtraTreesClassifier(warm_start=True, **et_params) 47 | 48 | for n in range(10, n_estimators + 1, 10): 49 | et0.n_estimators = n 50 | et0.fit(X_1, y_1) 51 | pred_0 = et0.predict_proba(X_0)[:, 1] 52 | s0 = roc_auc_score(y_0, pred_0) 53 | 54 | et1.n_estimators = n 55 | et1.fit(X_0, y_0) 56 | pred_1 = et1.predict_proba(X_1)[:, 1] 57 | s1 = roc_auc_score(y_1, pred_1) 58 | 59 | scores = (s0, s1) 60 | scores_text = ', '.join('%0.5f' % s for s in scores) 61 | print('%3d, %0.4f, [%s]' % (n, np.mean(scores), scores_text)) 62 | 63 | print('final scores:', scores) 64 | 65 | 66 | pred_0 = et0.predict_proba(X_0)[:, 1].astype('float32') 67 | pred_1 = et1.predict_proba(X_1)[:, 1].astype('float32') 68 | del et0, et1 69 | 70 | 71 | np.save('predictions/et_pred0.npy', pred_0) 72 | np.save('predictions/et_pred1.npy', pred_1) 73 | 74 | 75 | # training on full dataset 76 | 77 | print('full model...') 78 | 79 | X = np.concatenate([X_0, X_1]) 80 | del X_0, X_1 81 | gc.collect() 82 | 83 | y = np.concatenate([y_0, y_1]) 84 | del y_0, y_1 85 | gc.collect() 86 | 87 | 88 | et_full = ExtraTreesClassifier(warm_start=True, **et_params) 89 | et_full.n_estimators = n 90 | et_full.fit(X, y) 91 | 92 | del X, y 93 | gc.collect() 94 | 95 | 96 | 97 | # making predictions for test 98 | 99 | df_test = feather.read_dataframe('tmp/mtv_df_test.feather') 100 | 101 | X_test = df_test[features].values 102 | del df_test 103 | 104 | 105 | pred_test = et_full.predict_proba(X_test)[:, 1].astype('float32') 106 | np.save('predictions/et_pred_test.npy', pred_test) -------------------------------------------------------------------------------- /5_mtv_xgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import xgboost as xgb 4 | import feather 5 | import gc 6 | 7 | 8 | df_train_1 = feather.read_dataframe('tmp/mtv_df_train_1.feather') 9 | features = sorted(set(df_train_1.columns) - {'display_id', 'clicked'}) 10 | 11 | y_1 = df_train_1.clicked.values 12 | X_1 = df_train_1[features].values 13 | del df_train_1 14 | 15 | dfold1 = xgb.DMatrix(X_1, y_1, feature_names=features) 16 | del X_1, y_1 17 | gc.collect() 18 | 
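# note: besides the out-of-fold probabilities, this script also exports the leaf index of
# every tree (predict with pred_leaf=True); 6_1_generate_ffm_data.py feeds those leaf
# indices to libffm as extra categorical fields. Illustrative sketch only, not part of
# the pipeline:
#   leaves = model.predict(dmatrix, pred_leaf=True)  # ndarray of shape (n_rows, n_trees)
#   leaves[0]                                        # one leaf id per boosting round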
19 | 20 | df_train_0 = feather.read_dataframe('tmp/mtv_df_train_0.feather') 21 | 22 | y_0 = df_train_0.clicked.values 23 | X_0 = df_train_0[features].values 24 | del df_train_0 25 | gc.collect() 26 | 27 | dfold0 = xgb.DMatrix(X_0, y_0, feature_names=features) 28 | del X_0, y_0 29 | gc.collect() 30 | 31 | 32 | 33 | # training a model 34 | 35 | n_estimators = 100 36 | 37 | xgb_pars = { 38 | 'eta': 0.2, 39 | 'gamma': 0.5, 40 | 'max_depth': 6, 41 | 'min_child_weight': 1, 42 | 'max_delta_step': 0, 43 | 'subsample': 1, 44 | 'colsample_bytree': 0.5, 45 | 'colsample_bylevel': 0.5, 46 | 'lambda': 1, 47 | 'alpha': 0, 48 | 'tree_method': 'approx', 49 | 'objective': 'binary:logistic', 50 | 'eval_metric': 'auc', 51 | 'nthread': 20, 52 | 'seed': 42, 53 | 'silent': 1 54 | } 55 | 56 | 57 | print('training model on fold 0...') 58 | 59 | watchlist = [(dfold0, 'train'), (dfold1, 'val')] 60 | model_fold1 = xgb.train(xgb_pars, dfold0, num_boost_round=n_estimators, 61 | verbose_eval=1, evals=watchlist) 62 | 63 | print('training model on fold 1...') 64 | 65 | watchlist = [(dfold1, 'train'), (dfold0, 'val')] 66 | model_fold0 = xgb.train(xgb_pars, dfold1, num_boost_round=n_estimators, 67 | verbose_eval=1, evals=watchlist) 68 | 69 | 70 | pred0 = model_fold0.predict(dfold0) 71 | pred1 = model_fold1.predict(dfold1) 72 | 73 | np.save('predictions/xgb_mtv_pred0.npy', pred0) 74 | np.save('predictions/xgb_mtv_pred1.npy', pred1) 75 | 76 | 77 | # saving the training leaves 78 | 79 | leaves0 = model_fold0.predict(dfold0, pred_leaf=True).astype('uint8') 80 | 81 | np.save('tmp/xgb_model_0_leaves.npy', leaves0) 82 | del leaves0 83 | gc.collect() 84 | 85 | 86 | leaves1 = model_fold1.predict(dfold1, pred_leaf=True).astype('uint8') 87 | 88 | np.save('tmp/xgb_model_1_leaves.npy', leaves1) 89 | del leaves1 90 | gc.collect() 91 | 92 | 93 | 94 | # making prediction for test and getting the leaves 95 | 96 | df_test = feather.read_dataframe('tmp/mtv_df_test.feather') 97 | 98 | 99 | X_test = df_test[features].values 100 | del df_test 101 | gc.collect() 102 | 103 | dtest = xgb.DMatrix(X_test, feature_names=features) 104 | del X_test 105 | gc.collect() 106 | 107 | 108 | pred0_test = model_fold0.predict(dtest) 109 | pred1_test = model_fold1.predict(dtest) 110 | pred_test = (pred0_test + pred1_test) / 2 111 | 112 | np.save('predictions/xgb_mtv_pred_test.npy', pred_test) 113 | 114 | 115 | # predicting leaves for test 116 | 117 | leaves0_test = model_fold0.predict(dtest, pred_leaf=True).astype('uint8') 118 | np.save('tmp/xgb_model_0_test_leaves.npy', leaves0_test) 119 | 120 | del leaves0_test 121 | gc.collect() 122 | 123 | leaves1_test = model_fold1.predict(dtest, pred_leaf=True).astype('uint8') 124 | np.save('tmp/xgb_model_1_test_leaves.npy', leaves1_test) 125 | 126 | del leaves1_test 127 | gc.collect() -------------------------------------------------------------------------------- /6_1_generate_ffm_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from tqdm import tqdm 5 | import feather 6 | import gc 7 | 8 | 9 | D = 2 ** 20 10 | 11 | # display features 12 | USER = '0' 13 | ON_DOC = '1' 14 | PLATFORM = '2' 15 | 16 | # ads features 17 | AD = '3' 18 | AD_DOC = '4' 19 | CAMPAIGN = '5' 20 | ADVERTISER = '6' 21 | 22 | # document features 23 | ON_SRC = '7' 24 | ON_PUBLISHER = '8' 25 | 26 | AD_SRC = '9' 27 | AD_PUBLISHER = '10' 28 | 29 | 30 | def hash_element(el): 31 | h = hash(el) % D 32 | if h < 0: 33 | h = h + D 34 | return str(h) 35 | 36 | 37 | # reading the events 
features 38 | 39 | df_events = pd.read_csv("../data/events.csv", usecols=['uuid', 'document_id', 'platform']) 40 | 41 | user_str = USER + ':' + df_events.uuid.apply(hash_element) + ':1' 42 | doc_str = ON_DOC + ':' + df_events.document_id.apply(hash_element) + ':1' 43 | platforms = PLATFORM + ':' + df_events.platform.astype('str') + ':1' 44 | 45 | df_events_processed = pd.DataFrame() 46 | df_events_processed['display_str'] = user_str + ' ' + doc_str + ' ' + platforms 47 | df_events_processed['document_id'] = df_events.document_id 48 | 49 | del df_events, user_str, doc_str, platforms 50 | 51 | 52 | # reading the ads features 53 | 54 | df_ads = pd.read_csv("../data/promoted_content.csv") 55 | ad_to_doc = dict(zip(df_ads.ad_id, df_ads.document_id)) 56 | 57 | ad_str = AD + ':' + df_ads.ad_id.astype(str) + ':1 ' + \ 58 | AD_DOC + ':' + df_ads.document_id.apply(hash_element) + ':1 ' + \ 59 | CAMPAIGN + ':' + df_ads.campaign_id.astype(str) + ':1 ' + \ 60 | ADVERTISER + ':' + df_ads.advertiser_id.astype(str) + ':1' 61 | 62 | ad_str_dict = dict(zip(df_ads.ad_id, ad_str)) 63 | 64 | del ad_str, df_ads 65 | 66 | 67 | # reading the document meta features - others aren't included 68 | 69 | df_doc_meta = pd.read_csv('../data/documents_meta.csv') 70 | 71 | df_doc_meta.source_id.fillna(0, inplace=1) 72 | df_doc_meta.source_id = df_doc_meta.source_id.astype('int32') 73 | df_doc_meta.publisher_id.fillna(0, inplace=1) 74 | df_doc_meta.publisher_id = df_doc_meta.publisher_id.astype('int32') 75 | del df_doc_meta['publish_time'] 76 | 77 | meta_src = df_doc_meta.source_id.astype('str') + ':1 ' 78 | meta_src_dict = dict(zip(df_doc_meta.document_id, meta_src)) 79 | 80 | meta_pub = df_doc_meta.publisher_id.astype('str') + ':1' 81 | meta_pub_dict = dict(zip(df_doc_meta.document_id, meta_pub)) 82 | 83 | del df_doc_meta, meta_src, meta_pub 84 | 85 | # generating the ffm data 86 | 87 | leaves_start = 11 88 | 89 | def ffm_feature_string(display_id, ad_id, leaves, label=None): 90 | ad_doc_id = ad_to_doc[ad_id] 91 | 92 | ad_features = ad_str_dict[ad_id] # 93 | 94 | disp_row = df_events_processed.iloc[display_id - 1] 95 | on_doc_id = disp_row.document_id 96 | disp_features = disp_row.display_str # 97 | 98 | on_src = ON_SRC + ':' + meta_src_dict[on_doc_id] 99 | on_pub = ON_PUBLISHER + ':' + meta_pub_dict[on_doc_id] 100 | 101 | ad_src = AD_SRC + ':' + meta_src_dict[ad_doc_id] 102 | ad_pub = AD_PUBLISHER + ':' + meta_pub_dict[ad_doc_id] 103 | 104 | leaves_features = [] 105 | 106 | for i, leaf in enumerate(leaves): 107 | leaves_features.append('%d:%d:1' % (leaves_start + i, leaf)) 108 | 109 | leaves_features = ' '.join(leaves_features) 110 | 111 | result = disp_features + ' ' + ad_features + ' ' + \ 112 | on_src + ' ' + on_pub + ' ' + \ 113 | ad_src + ' ' + ad_pub + ' ' + \ 114 | leaves_features 115 | 116 | if label is None: 117 | return '0 ' + result 118 | else: 119 | return str(label) + ' ' + result 120 | 121 | 122 | # generating the data for train 123 | 124 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 125 | 126 | leaves_0 = np.load('tmp/xgb_model_0_leaves.npy') 127 | leaves_1 = np.load('tmp/xgb_model_1_leaves.npy') 128 | 129 | 130 | f_0 = open('ffm/ffm_xgb_0.txt', 'w') 131 | f_1 = open('ffm/ffm_xgb_1.txt', 'w') 132 | cnt_0 = 0 133 | cnt_1 = 0 134 | 135 | for row in tqdm(df_all.itertuples()): 136 | display_id = row.display_id 137 | ad_id = row.ad_id 138 | fold = row.fold 139 | label = row.clicked 140 | 141 | if fold == 0: 142 | row = ffm_feature_string(display_id, ad_id, leaves_0[cnt_0], label) 143 | 
f_0.write(row + '\n') 144 | cnt_0 = cnt_0 + 1 145 | else: 146 | row = ffm_feature_string(display_id, ad_id, leaves_1[cnt_1], label) 147 | f_1.write(row + '\n') 148 | cnt_1 = cnt_1 + 1 149 | 150 | f_0.close() 151 | f_1.close() 152 | 153 | 154 | del df_all, leaves_0, leaves_1 155 | gc.collect() 156 | 157 | 158 | # generating the data for test 159 | 160 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 161 | 162 | leaves_0 = np.load('tmp/xgb_model_0_test_leaves.npy') 163 | leaves_1 = np.load('tmp/xgb_model_1_test_leaves.npy') 164 | 165 | f_0 = open('ffm/ffm_xgb_test_0.txt', 'w') 166 | f_1 = open('ffm/ffm_xgb_test_1.txt', 'w') 167 | 168 | cnt = 0 169 | 170 | for row in tqdm(df_test.itertuples()): 171 | display_id = row.display_id 172 | ad_id = row.ad_id 173 | 174 | row = ffm_feature_string(display_id, ad_id, leaves_0[cnt]) 175 | f_0.write(row + '\n') 176 | 177 | row = ffm_feature_string(display_id, ad_id, leaves_1[cnt]) 178 | f_1.write(row + '\n') 179 | 180 | cnt = cnt + 1 181 | 182 | f_0.close() 183 | f_1.close() -------------------------------------------------------------------------------- /6_2_split_ffm_to_subfolds.py: -------------------------------------------------------------------------------- 1 | import feather 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 6 | 7 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 8 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 9 | del df_train_0['fold'], df_train_1['fold'], df_all 10 | 11 | 12 | # define subfolds for each fold 13 | np.random.seed(1) 14 | 15 | uniq0 = df_train_0.display_id.unique() 16 | uniq1 = df_train_1.display_id.unique() 17 | 18 | np.random.shuffle(uniq0) 19 | np.random.shuffle(uniq1) 20 | 21 | n0 = len(uniq0) // 2 22 | fold_0_0 = set(uniq0[:n0]) 23 | 24 | n1 = len(uniq1) // 2 25 | fold_1_0 = set(uniq1[:n1]) 26 | 27 | 28 | df_train_0['subfold'] = df_train_0.display_id.isin(fold_0_0).astype('uint8') 29 | df_train_1['subfold'] = df_train_1.display_id.isin(fold_1_0).astype('uint8') 30 | 31 | np.save('tmp/fold_0_split.npy', df_train_0.subfold.values) 32 | np.save('tmp/fold_1_split.npy', df_train_1.subfold.values) 33 | 34 | 35 | # split fold 0 into subfolds 36 | 37 | f_0 = open('ffm/ffm_xgb_0_0.txt', 'w') 38 | f_1 = open('ffm/ffm_xgb_0_1.txt', 'w') 39 | 40 | with open('ffm/ffm_xgb_0.txt', 'r') as f_in: 41 | for subfold, line in tqdm(zip(df_train_0.subfold, f_in)): 42 | if subfold == 0: 43 | f_0.write(line) 44 | else: 45 | f_1.write(line) 46 | 47 | f_0.close() 48 | f_1.close() 49 | 50 | 51 | # split fold 1 into subfolds 52 | 53 | f_0 = open('ffm/ffm_xgb_1_0.txt', 'w') 54 | f_1 = open('ffm/ffm_xgb_1_1.txt', 'w') 55 | 56 | with open('ffm/ffm_xgb_1.txt', 'r') as f_in: 57 | for subfold, line in tqdm(zip(df_train_1.subfold, f_in)): 58 | if subfold == 0: 59 | f_0.write(line) 60 | else: 61 | f_1.write(line) 62 | 63 | f_0.close() 64 | f_1.close() 65 | 66 | -------------------------------------------------------------------------------- /6_3_run_ffm.sh: -------------------------------------------------------------------------------- 1 | # assumes libffm is on PATH 2 | 3 | PARAMS='-s 12 -k 5 -l 0.000001 -t 5' 4 | 5 | cd ffm 6 | 7 | 8 | # fold 0 9 | 10 | ffm-train $PARAMS -p ffm_xgb_0_0.txt ffm_xgb_0_1.txt ffm_0_0.bin 11 | ffm-predict ffm_xgb_0_0.txt ffm_0_0.bin pred_0_0.txt 12 | 13 | ffm-train $PARAMS -p ffm_xgb_0_1.txt ffm_xgb_0_0.txt ffm_0_1.bin 14 | ffm-predict ffm_xgb_0_1.txt ffm_0_1.bin pred_0_1.txt 15 | 16 | ffm-train $PARAMS ffm_xgb_0.txt ffm_0_full.bin 17 | 18 | 19 | # fold 
1 20 | 21 | ffm-train $PARAMS -p ffm_xgb_1_0.txt ffm_xgb_1_1.txt ffm_1_0.bin 22 | ffm-predict ffm_xgb_1_0.txt ffm_1_0.bin pred_1_0.txt 23 | 24 | ffm-train $PARAMS -p ffm_xgb_1_1.txt ffm_xgb_1_0.txt ffm_1_1.bin 25 | ffm-predict ffm_xgb_1_1.txt ffm_1_1.bin pred_1_1.txt 26 | 27 | ffm-train $PARAMS ffm_xgb_1.txt ffm_1_full.bin 28 | 29 | 30 | # predict for test 31 | 32 | ffm-predict ffm_xgb_test_0.txt ffm_0_full.bin pred_test_0.txt 33 | ffm-predict ffm_xgb_test_1.txt ffm_1_full.bin pred_test_1.txt -------------------------------------------------------------------------------- /6_4_put_ffm_subfolds_together.py: -------------------------------------------------------------------------------- 1 | import feather 2 | import numpy as np 3 | 4 | 5 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 6 | 7 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 8 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 9 | del df_train_0['fold'], df_train_1['fold'], df_all 10 | 11 | df_train_0['fold'] = np.load('tmp/fold_0_split.npy') 12 | df_train_1['fold'] = np.load('tmp/fold_1_split.npy') 13 | 14 | 15 | # predictions of two subfolds of fold 0 16 | 17 | pred_0_0 = pd.read_csv('ffm/pred_0_0.txt', header=None, dtype='float32') 18 | pred_0_0 = pred_0_0[0] 19 | 20 | pred_0_1 = pd.read_csv('ffm/pred_0_1.txt', header=None, dtype='float32') 21 | pred_0_1 = pred_0_1[0] 22 | 23 | 24 | df_train_0.loc[df_train_0.fold == 0, 'ffm_xgb'] = pred_0_0.values 25 | df_train_0.loc[df_train_0.fold == 1, 'ffm_xgb'] = pred_0_1.values 26 | ffm_xgb_0 = df_train_0.ffm_xgb.astype('float32') 27 | 28 | np.save('predictions/ffm_0.npy', ffm_xgb_0.values) 29 | 30 | 31 | # predictions of two subfolds of fold 1 32 | 33 | pred_1_0 = pd.read_csv('ffm/pred_1_0.txt', header=None, dtype='float32') 34 | pred_1_0 = pred_1_0[0] 35 | 36 | pred_1_1 = pd.read_csv('ffm/pred_1_1.txt', header=None, dtype='float32') 37 | pred_1_1 = pred_1_1[0] 38 | 39 | 40 | df_train_1.loc[df_train_1.fold == 0, 'ffm_xgb'] = pred_1_0.values 41 | df_train_1.loc[df_train_1.fold == 1, 'ffm_xgb'] = pred_1_1.values 42 | ffm_xgb_1 = df_train_1.ffm_xgb.astype('float32') 43 | 44 | np.save('predictions/ffm_1.npy', ffm_xgb_1.values) 45 | 46 | 47 | # test predictions 48 | 49 | pred_test_0 = pd.read_csv('ffm/pred_test_0.txt', header=None, dtype='float32') 50 | pred_test_1 = pd.read_csv('ffm/pred_test_1.txt', header=None, dtype='float32') 51 | 52 | pred_test = (pred_test_0[0] + pred_test_1[0]) / 2 53 | pred_test = pred_test.astype('float32') 54 | 55 | np.save('predictions/ffm_test.npy', pred_test.values) -------------------------------------------------------------------------------- /7_ensemble_data_prep.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import feather 4 | 5 | from tqdm import tqdm 6 | 7 | 8 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 9 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 10 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 11 | del df_train_0['fold'], df_train_1['fold'], df_all 12 | 13 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 14 | 15 | 16 | # read svm predictions 17 | 18 | df_train_0['svm'] = np.load('predictions/svm_0_preds.npy') 19 | df_train_0['svm'] = df_train_0['svm'].astype('float32') 20 | 21 | df_train_1['svm'] = np.load('predictions/svm_1_preds.npy') 22 | df_train_1['svm'] = df_train_1['svm'].astype('float32') 23 | 24 | df_test['svm'] = 
-------------------------------------------------------------------------------- /7_ensemble_data_prep.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import feather 4 | 5 | from tqdm import tqdm 6 | 7 | 8 | df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather') 9 | df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1) 10 | df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1) 11 | del df_train_0['fold'], df_train_1['fold'], df_all 12 | 13 | df_test = feather.read_dataframe('tmp/clicks_test.feather') 14 | 15 | 16 | # read svm predictions 17 | 18 | df_train_0['svm'] = np.load('predictions/svm_0_preds.npy') 19 | df_train_0['svm'] = df_train_0['svm'].astype('float32') 20 | 21 | df_train_1['svm'] = np.load('predictions/svm_1_preds.npy') 22 | df_train_1['svm'] = df_train_1['svm'].astype('float32') 23 | 24 | df_test['svm'] = np.load('predictions/svm_test_preds.npy') 25 | df_test['svm'] = df_test['svm'].astype('float32') 26 | 27 | 28 | # read ftrl predictions 29 | 30 | ftrl_0 = pd.read_csv('predictions/ftrl_pred_0.txt') 31 | df_train_0['ftrl'] = ftrl_0.y_pred.astype('float32') 32 | 33 | ftrl_1 = pd.read_csv('predictions/ftrl_pred_1.txt') 34 | df_train_1['ftrl'] = ftrl_1.y_pred.astype('float32') 35 | 36 | ftrl_test = pd.read_csv('predictions/ftrl_pred_test.txt') 37 | df_test['ftrl'] = ftrl_test.y_pred.astype('float32') 38 | 39 | 40 | # read xgb predictions 41 | 42 | df_train_0['xgb_mtv'] = np.load('predictions/xgb_mtv_pred0.npy') 43 | df_train_1['xgb_mtv'] = np.load('predictions/xgb_mtv_pred1.npy') 44 | df_test['xgb_mtv'] = np.load('predictions/xgb_mtv_pred_test.npy') 45 | 46 | 47 | # read et predictions 48 | 49 | df_train_0['et_mtv'] = np.load('predictions/et_pred0.npy') 50 | df_train_1['et_mtv'] = np.load('predictions/et_pred1.npy') 51 | df_test['et_mtv'] = np.load('predictions/et_pred_test.npy') 52 | 53 | 54 | # read ffm predictions 55 | 56 | df_train_0['ffm'] = np.load('predictions/ffm_0.npy') 57 | df_train_1['ffm'] = np.load('predictions/ffm_1.npy') 58 | df_test['ffm'] = np.load('predictions/ffm_test.npy') 59 | 60 | 61 | # read the leak features 62 | 63 | df_train_0['leak'] = np.load('features/leak_0.npy') 64 | df_train_0['leak'] = df_train_0['leak'].astype('uint8') 65 | 66 | df_train_1['leak'] = np.load('features/leak_1.npy') 67 | df_train_1['leak'] = df_train_1['leak'].astype('uint8') 68 | 69 | df_test['leak'] = np.load('features/leak_test.npy') 70 | df_test['leak'] = df_test['leak'].astype('uint8') 71 | 72 | 73 | df_train_0['doc_known_views'] = np.load('features/doc_known_views_0.npy') 74 | df_train_0['doc_known_views'] = df_train_0['doc_known_views'].astype('uint32') 75 | 76 | df_train_1['doc_known_views'] = np.load('features/doc_known_views_1.npy') 77 | df_train_1['doc_known_views'] = df_train_1['doc_known_views'].astype('uint32') 78 | 79 | df_test['doc_known_views'] = np.load('features/doc_known_views_test.npy') 80 | df_test['doc_known_views'] = df_test['doc_known_views'].astype('uint32') 81 | 82 | 83 | # rank features 84 | 85 | cols_to_rank = ['svm', 'ftrl', 'xgb_mtv', 'et_mtv', 'ffm'] 86 | 87 | 88 | for f in tqdm(cols_to_rank): 89 | for df in [df_train_0, df_train_1, df_test]: 90 | df['%s_rank' % f] = df.groupby('display_id')[f].rank(method='dense', ascending=0) 91 | df['%s_rank' % f] = df['%s_rank' % f].astype('uint8') 92 | 93 | 94 | # some mean target value features 95 | 96 | mtv_features = ['ad_document_id_on_doc_publisher_id', 97 | 'ad_doc_source_id_on_doc_publisher_id', 98 | 'ad_document_id_on_doc_source_id'] 99 | 100 | for f in mtv_features: 101 | df_train_0[f] = np.load('features/mte/%s_pred_0.npy' % f) 102 | df_train_0['%s_rank' % f] = np.load('features/mte/%s_pred_rank_0.npy' % f) 103 | 104 | df_train_1[f] = np.load('features/mte/%s_pred_1.npy' % f) 105 | df_train_1['%s_rank' % f] = np.load('features/mte/%s_pred_rank_1.npy' % f) 106 | 107 | df_test[f] = np.load('features/mte/%s_pred_test.npy' % f) 108 | df_test['%s_rank' % f] = np.load('features/mte/%s_pred_rank_test.npy' % f) 109 | 110 | 111 | # now save everything 112 | 113 | feather.write_dataframe(df_train_0, 'tmp/df_train_0_ensemble.feather') 114 | feather.write_dataframe(df_train_1, 'tmp/df_train_1_ensemble.feather') 115 | feather.write_dataframe(df_test, 'tmp/df_test_ensemble.feather') 116 | -------------------------------------------------------------------------------- /7_ensemble_xgb.py: -------------------------------------------------------------------------------- 1 | 
import pandas as pd 2 | import numpy as np 3 | import xgboost as xgb 4 | import feather 5 | import gc 6 | 7 | # prapare the data matrices 8 | 9 | 10 | df_train_0 = feather.read_dataframe('tmp/df_train_0_ensemble.feather') 11 | 12 | ignore = {'display_id', 'ad_id', 'clicked', 'fold'} 13 | columns = sorted(set(df_train_0.columns) - ignore) 14 | 15 | group0_sizes = df_train_0.display_id.value_counts(sort=False) 16 | group0_sizes.sort_index(inplace=1) 17 | group0_sizes = group0_sizes.values.astype('uint8') 18 | 19 | y_0 = df_train_0.clicked.values 20 | X_0 = df_train_0[columns].values 21 | del df_train_0 22 | gc.collect() 23 | 24 | dfold0 = xgb.DMatrix(X_0, y_0, feature_names=columns) 25 | dfold0.set_group(group0_sizes) 26 | 27 | del X_0, y_0 28 | gc.collect() 29 | 30 | 31 | 32 | df_train_1 = feather.read_dataframe('tmp/df_train_1_ensemble.feather') 33 | 34 | group1_sizes = df_train_1.display_id.value_counts(sort=False) 35 | group1_sizes.sort_index(inplace=1) 36 | group1_sizes = group1_sizes.values.astype('uint8') 37 | 38 | y_1 = df_train_1.clicked.values 39 | X_1 = df_train_1[columns].values 40 | del df_train_1 41 | gc.collect() 42 | 43 | dfold1 = xgb.DMatrix(X_1, y_1, feature_names=columns) 44 | dfold1.set_group(group1_sizes) 45 | 46 | del X_1, y_1 47 | gc.collect() 48 | 49 | watchlist = [(dfold0, 'train'), (dfold1, 'val')] 50 | 51 | 52 | # train the model 53 | 54 | n_estimators = 1000 55 | 56 | xgb_pars = { 57 | 'eta': 0.15, 58 | 'gamma': 0.0, 59 | 'max_depth': 8, 60 | 'min_child_weight': 1, 61 | 'max_delta_step': 0, 62 | 'subsample': 0.6, 63 | 'colsample_bytree': 0.6, 64 | 'colsample_bylevel': 1, 65 | 'lambda': 1, 66 | 'alpha': 0, 67 | 'tree_method': 'approx', 68 | 'objective': 'rank:pairwise', 69 | 'eval_metric': 'map@12', 70 | 'nthread': 12, 71 | 'seed': 42, 72 | 'silent': 1 73 | } 74 | 75 | 76 | # train the model 77 | 78 | model = xgb.train(xgb_pars, dfold0, num_boost_round=n_estimators, 79 | verbose_eval=1, evals=watchlist) 80 | 81 | del dfold0, dfold1, watchlist 82 | gc.collect() 83 | 84 | 85 | # test predict 86 | 87 | df_test = feather.read_dataframe('tmp/df_test_ensemble.feather') 88 | 89 | group_test_sizes = df_test.display_id.value_counts(sort=False) 90 | group_test_sizes.sort_index(inplace=1) 91 | group_test_sizes = group_test_sizes.values.astype('uint8') 92 | 93 | X_test = df_test[columns].values 94 | df_test = df_test[['display_id', 'ad_id']].copy() 95 | 96 | dtest = xgb.DMatrix(X_test, feature_names=columns) 97 | dtest.set_group(group_test_sizes) 98 | del X_test 99 | 100 | 101 | test_pred = model.predict(dtest) 102 | df_test['pred'] = test_pred 103 | 104 | 105 | feather.write_dataframe(df_test, 'final_submission.feather') 106 | 107 | # now run `Rscript submission.R final_submission.feather xgb_submission.csv` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Outbrain Click Prediction challenge solution 2 | 3 | - The goal of the competition is to predict which ad will be clicked on 4 | - See https://www.kaggle.com/c/outbrain-click-prediction for more details 5 | - This is `ololo`'s part of the 13th place solution to the challenge (team "diaman & ololo") 6 | - The presentation of the solution: http://www.slideshare.net/AlexeyGrigorev/outbrain-click-prediction-71724151 7 | - `diaman`'s solution can be found at https://github.com/dselivanov/kaggle-outbrain 8 | 9 | 10 | ## Overview: 11 | 12 | The part of the solution is a combination of 5 models: 13 | 14 
| - SVM and FTRL on basic features: 15 | - event features: user id, document id, platform id, day, hour and geo 16 | - ad features: ad document id, campaign id, advertiser id 17 | - XGB and ET on MTV (Mean Target Value) features: 18 | - all categorical features that the previous models used 19 | - document features like publisher, source, top category, topic and entity 20 | - interactions between these features 21 | - also, document similarity features: the cosine similarity between the ad document and the page the ad is shown on 22 | - FFM with the following features: 23 | - all categorical features from the above, except document similarity, categories, topics and entities 24 | - XGB leaves from the previous step (see slide 9 from [this presentation](http://www.csie.ntu.edu.tw/~r01922136/kaggle-2014-criteo.pdf) for the description of the idea; a short sketch is also included at the end of this README) 25 | - The models are combined with an XGB model (`rank:pairwise` objective) 26 | 27 | To get the 13th position, models from [diaman](https://www.kaggle.com/dselivanov) should also be added 28 | 29 | ## Files description 30 | 31 | - `0_prepare_splits.py` splits the training dataset into two folds 32 | - `1_svm_data.py` prepares the data for SVM and FTRL 33 | - `1_train_ftrl.py` and `1_train_svm.py` train models on the data from `1_svm_data.py` 34 | - `2_extract_leaked_docs.py` and `2_leak_features.py` extract the leak 35 | - `3_doc_similarity_features.py` calculates the TF-IDF similarity between the document the user is on and the ad document 36 | - `4_categorical_data_join.py` and `4_categorical_data_unwrap_columnwise.py` prepare data for the MTV features calculation 37 | - `4_mean_target_value.py` calculates MTV for all features from `categorical_features.txt` 38 | - `5_best_mtv_features_xgb.py` builds an XGB model on a small part of the data and selects the best features to be used for XGB and ET 39 | - `5_mtv_et.py` trains an ET model on the MTV features 40 | - `5_mtv_xgb.py` trains an XGB model on the MTV features and creates the leaf features to be used in FFM 41 | - `6_1_generate_ffm_data.py` creates the input files to be read by libffm 42 | - `6_2_split_ffm_to_subfolds.py` splits each fold into two subfolds (the original folds can't be reused because the leaf features are not transferable between folds) 43 | - `6_3_run_ffm.sh` runs libffm for training the FFM models 44 | - `6_4_put_ffm_subfolds_together.py` puts the FFM predictions from each fold/subfold together 45 | - `7_ensemble_data_prep.py` puts all the features and model predictions together for ensembling 46 | - `7_ensemble_xgb.py` trains the second-level XGB model on top of all these features 47 | 48 | The files should be run in the above order 49 | 50 | Diaman's features should be included into `7_ensemble_data_prep.py` - and the rest can stay unchanged.
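For reference, the "XGB leaves" idea mentioned in the overview boils down to the following (a sketch with hypothetical `X_train`, `y_train` and `X_val` matrices; the actual pipeline does this in `5_mtv_xgb.py` and `6_1_generate_ffm_data.py`):

```python
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train({'objective': 'binary:logistic', 'max_depth': 6},
                  dtrain, num_boost_round=50)

# pred_leaf=True returns, for every row, the index of the leaf it falls into
# in each of the 50 trees; each tree becomes one categorical feature
# (one FFM field), and the leaf index is its value
leaves = model.predict(xgb.DMatrix(X_val), pred_leaf=True)  # shape: (n_rows, 50)
```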
51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /categorical_features.txt: -------------------------------------------------------------------------------- 1 | ad_id 2 | ad_document_id 3 | ad_doc_source_id 4 | ad_doc_publisher_id 5 | ad_doc_top_entity 6 | ad_doc_top_topic 7 | ad_doc_top_cat 8 | ad_doc_publisher_id_day 9 | ad_doc_publisher_id_hour 10 | ad_doc_publisher_id_geo_0 11 | ad_doc_publisher_id_geo_1 12 | ad_doc_publisher_id_geo_2 13 | ad_doc_source_id_day 14 | ad_doc_source_id_hour 15 | ad_doc_source_id_geo_0 16 | ad_doc_source_id_geo_1 17 | ad_doc_source_id_geo_2 18 | ad_doc_top_cat_day 19 | ad_doc_top_cat_hour 20 | ad_doc_top_cat_geo_0 21 | ad_doc_top_cat_geo_1 22 | ad_doc_top_cat_geo_2 23 | ad_doc_top_entity_day 24 | ad_doc_top_entity_hour 25 | ad_doc_top_entity_geo_0 26 | ad_doc_top_entity_geo_1 27 | ad_doc_top_entity_geo_2 28 | ad_doc_top_topic_day 29 | ad_doc_top_topic_hour 30 | ad_doc_top_topic_geo_0 31 | ad_doc_top_topic_geo_1 32 | ad_doc_top_topic_geo_2 33 | ad_document_id_day 34 | ad_document_id_hour 35 | ad_document_id_geo_0 36 | ad_document_id_geo_1 37 | ad_document_id_geo_2 38 | advertiser_id_day 39 | advertiser_id_hour 40 | advertiser_id_geo_0 41 | advertiser_id_geo_1 42 | advertiser_id_geo_2 43 | ad_doc_publisher_id_ad_doc_source_id 44 | ad_doc_publisher_id_ad_doc_top_cat 45 | ad_doc_publisher_id_ad_doc_top_entity 46 | ad_doc_publisher_id_ad_doc_top_topic 47 | ad_doc_publisher_id_ad_document_id 48 | ad_doc_publisher_id_ad_id 49 | ad_doc_publisher_id_advertiser_id 50 | ad_doc_publisher_id_campaign_id 51 | ad_doc_publisher_id_on_doc_publisher_id 52 | ad_doc_publisher_id_on_doc_source_id 53 | ad_doc_publisher_id_on_doc_top_cat 54 | ad_doc_publisher_id_on_doc_top_entity 55 | ad_doc_publisher_id_on_doc_top_topic 56 | ad_doc_publisher_id_on_document_id 57 | ad_doc_publisher_id_platform 58 | ad_doc_publisher_id_user_id 59 | ad_doc_source_id_ad_doc_top_cat 60 | ad_doc_source_id_ad_doc_top_entity 61 | ad_doc_source_id_ad_doc_top_topic 62 | ad_doc_source_id_ad_document_id 63 | ad_doc_source_id_ad_id 64 | ad_doc_source_id_advertiser_id 65 | ad_doc_source_id_campaign_id 66 | ad_doc_source_id_on_doc_publisher_id 67 | ad_doc_source_id_on_doc_source_id 68 | ad_doc_source_id_on_doc_top_cat 69 | ad_doc_source_id_on_doc_top_entity 70 | ad_doc_source_id_on_doc_top_topic 71 | ad_doc_source_id_on_document_id 72 | ad_doc_source_id_platform 73 | ad_doc_source_id_user_id 74 | ad_doc_top_cat_ad_doc_top_entity 75 | ad_doc_top_cat_ad_doc_top_topic 76 | ad_doc_top_cat_ad_document_id 77 | ad_doc_top_cat_ad_id 78 | ad_doc_top_cat_advertiser_id 79 | ad_doc_top_cat_campaign_id 80 | ad_doc_top_cat_on_doc_publisher_id 81 | ad_doc_top_cat_on_doc_source_id 82 | ad_doc_top_cat_on_doc_top_cat 83 | ad_doc_top_cat_on_doc_top_entity 84 | ad_doc_top_cat_on_doc_top_topic 85 | ad_doc_top_cat_on_document_id 86 | ad_doc_top_cat_platform 87 | ad_doc_top_cat_user_id 88 | ad_doc_top_entity_ad_doc_top_topic 89 | ad_doc_top_entity_ad_document_id 90 | ad_doc_top_entity_ad_id 91 | ad_doc_top_entity_advertiser_id 92 | ad_doc_top_entity_campaign_id 93 | ad_doc_top_entity_on_doc_publisher_id 94 | ad_doc_top_entity_on_doc_source_id 95 | ad_doc_top_entity_on_doc_top_cat 96 | ad_doc_top_entity_on_doc_top_entity 97 | ad_doc_top_entity_on_doc_top_topic 98 | ad_doc_top_entity_on_document_id 99 | ad_doc_top_entity_platform 100 | ad_doc_top_entity_user_id 101 | ad_doc_top_topic_ad_document_id 102 | ad_doc_top_topic_ad_id 103 | ad_doc_top_topic_advertiser_id 104 | 
ad_doc_top_topic_campaign_id 105 | ad_doc_top_topic_on_doc_publisher_id 106 | ad_doc_top_topic_on_doc_source_id 107 | ad_doc_top_topic_on_doc_top_cat 108 | ad_doc_top_topic_on_doc_top_entity 109 | ad_doc_top_topic_on_doc_top_topic 110 | ad_doc_top_topic_on_document_id 111 | ad_doc_top_topic_platform 112 | ad_doc_top_topic_user_id 113 | ad_document_id_campaign_id 114 | ad_document_id_on_doc_publisher_id 115 | ad_document_id_on_doc_source_id 116 | ad_document_id_on_doc_top_cat 117 | ad_document_id_on_doc_top_entity 118 | ad_document_id_on_doc_top_topic 119 | ad_document_id_on_document_id 120 | ad_document_id_platform 121 | ad_document_id_user_id 122 | ad_id_advertiser_id 123 | ad_id_campaign_id 124 | ad_id_on_doc_publisher_id 125 | ad_id_on_doc_source_id 126 | ad_id_on_doc_top_cat 127 | ad_id_on_doc_top_entity 128 | ad_id_on_doc_top_topic 129 | ad_id_on_document_id 130 | ad_id_platform 131 | ad_id_user_id 132 | advertiser_id_user_id 133 | campaign_id_user_id 134 | on_doc_publisher_id_user_id 135 | on_doc_source_id_user_id 136 | on_doc_top_cat_user_id 137 | on_doc_top_entity_user_id 138 | on_doc_top_topic_user_id 139 | on_document_id_user_id 140 | platform_user_id -------------------------------------------------------------------------------- /ftrl.py: -------------------------------------------------------------------------------- 1 | 2 | from math import exp, log, sqrt 3 | 4 | # implementation taken from kaggle scripts: 5 | # https://www.kaggle.com/sudalairajkumar/outbrain-click-prediction/ftrl-starter-with-leakage-vars/code 6 | 7 | 8 | def hash_element(el, D): 9 | h = hash(el) % D 10 | if h < 0: 11 | h = h + D 12 | return h 13 | 14 | def hash_elements(elements, D): 15 | return [hash_element(el, D) for el in elements] 16 | 17 | 18 | class FtrlProximal(object): 19 | ''' Our main algorithm: Follow the regularized leader - proximal 20 | 21 | In short, 22 | this is an adaptive-learning-rate sparse logistic-regression with 23 | efficient L1-L2-regularization 24 | 25 | Reference: 26 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 27 | ''' 28 | 29 | def __init__(self, alpha, beta, L1, L2, D, interactions): 30 | # parameters 31 | self.alpha = alpha 32 | self.beta = beta 33 | self.L1 = L1 34 | self.L2 = L2 35 | 36 | # feature related parameters 37 | self.D = D 38 | 39 | self.interactions = interactions 40 | 41 | # model 42 | # n: squared sum of past gradients 43 | # z: weights 44 | # w: lazy weights 45 | self.n = [0.0] * (D + 1) 46 | self.z = [0.0] * (D + 1) 47 | self.w = {} 48 | 49 | def to_indices(self, x): 50 | res = hash_elements(x, self.D) 51 | 52 | if self.interactions: 53 | sorted_x = sorted(x) 54 | len_x = len(sorted_x) 55 | 56 | for i in range(len_x): 57 | for j in range(i + 1, len_x): 58 | h = hash_element(sorted_x[i] + '_' + sorted_x[j], self.D) 59 | res.append(h) 60 | 61 | return res 62 | 63 | def predict(self, x): 64 | x_hashed = self.to_indices(x) 65 | return self.predict_hashed(x_hashed) 66 | 67 | def predict_hashed(self, x): 68 | ''' Get probability estimation on x 69 | 70 | INPUT: 71 | x: features 72 | 73 | OUTPUT: 74 | probability of p(y = 1 | x; w) 75 | ''' 76 | 77 | # parameters 78 | alpha = self.alpha 79 | beta = self.beta 80 | L1 = self.L1 81 | L2 = self.L2 82 | 83 | # model 84 | n = self.n 85 | z = self.z 86 | w = {} 87 | 88 | # wTx is the inner product of w and x 89 | wTx = 0. 90 | 91 | indices = [0] 92 | for i in x: 93 | indices.append(i + 1) 94 | 95 | for i in indices: 96 | sign = -1. if z[i] < 0 else 1. 
# get sign of z[i] 97 | 98 | # build w on the fly using z and n, hence the name - lazy weights 99 | # we are doing this at prediction instead of update time is because 100 | # this allows us for not storing the complete w 101 | if sign * z[i] <= L1: 102 | # w[i] vanishes due to L1 regularization 103 | w[i] = 0.0 104 | else: 105 | # apply prediction time L1, L2 regularization to z and get w 106 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 107 | 108 | wTx += w[i] 109 | 110 | # cache the current w for update stage 111 | self.w = w 112 | 113 | # bounded sigmoid function, this is the probability estimation 114 | return 1.0 / (1.0 + exp(-max(min(wTx, 35.0), -35.0))) 115 | 116 | def update(self, x, p, y): 117 | ''' Update model using x, p, y 118 | 119 | INPUT: 120 | x: a list of indices 121 | p: probability prediction of our model 122 | y: answer 123 | 124 | MODIFIES: 125 | self.n: increase by squared gradient 126 | self.z: weights 127 | ''' 128 | 129 | # parameter 130 | alpha = self.alpha 131 | 132 | # model 133 | n = self.n 134 | z = self.z 135 | w = self.w 136 | 137 | # gradient under logloss 138 | g = p - y 139 | 140 | indices = [0] 141 | for i in x: 142 | indices.append(i + 1) 143 | 144 | # update z and n 145 | for i in indices: 146 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 147 | z[i] += g - sigma * w[i] 148 | n[i] += g * g 149 | 150 | def fit(self, x, y): 151 | x_hashed = self.to_indices(x) 152 | p = self.predict_hashed(x_hashed) 153 | self.update(x_hashed, p, y) -------------------------------------------------------------------------------- /mapk.R: -------------------------------------------------------------------------------- 1 | # usage 2 | # Rscript mapk.R pred.feather 3 | 4 | library(methods) 5 | library(data.table) 6 | library(feather) 7 | 8 | input_cmd_args = commandArgs(trailingOnly = TRUE) 9 | path = path.expand(input_cmd_args[[1]]) 10 | 11 | message(Sys.time(), " reading ", path) 12 | dt = read_feather(path) 13 | setDT(dt) 14 | dt[, p_neg:=-pred] 15 | message(Sys.time(), " sorting") 16 | setkey(dt, display_id, p_neg) 17 | message(Sys.time(), " calculating map...") 18 | map = dt[ , .(map_12 = 1 / which(clicked == 1)), by = display_id][['map_12']] 19 | message(Sys.time(), " MAP@12 = ", mean(map)) -------------------------------------------------------------------------------- /ml_metrics_auc.py: -------------------------------------------------------------------------------- 1 | # implementation of auc is taken from ml_metrics: 2 | # https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 3 | 4 | def tied_rank(x): 5 | """ 6 | Computes the tied rank of elements in x. 7 | This function computes the tied rank of elements in x. 8 | Parameters 9 | ---------- 10 | x : list of numbers, numpy array 11 | Returns 12 | ------- 13 | score : list of numbers 14 | The tied rank f each element in x 15 | """ 16 | sorted_x = sorted(zip(x,range(len(x)))) 17 | r = [0 for k in x] 18 | cur_val = sorted_x[0][0] 19 | last_rank = 0 20 | for i in range(len(sorted_x)): 21 | if cur_val != sorted_x[i][0]: 22 | cur_val = sorted_x[i][0] 23 | for j in range(last_rank, i): 24 | r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 25 | last_rank = i 26 | if i==len(sorted_x)-1: 27 | for j in range(last_rank, i+1): 28 | r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 29 | return r 30 | 31 | def auc(actual, posterior): 32 | """ 33 | Computes the area under the receiver-operater characteristic (AUC) 34 | This function computes the AUC error metric for binary classification. 
35 | Parameters 36 | ---------- 37 | actual : list of binary numbers, numpy array 38 | The ground truth value 39 | posterior : same type as actual 40 | Defines a ranking on the binary numbers, from most likely to 41 | be positive to least likely to be positive. 42 | Returns 43 | ------- 44 | score : double 45 | The mean squared error between actual and posterior 46 | """ 47 | r = tied_rank(posterior) 48 | num_positive = len([0 for x in actual if x==1]) 49 | num_negative = len(actual)-num_positive 50 | sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) 51 | auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / 52 | (num_negative*num_positive)) 53 | return auc -------------------------------------------------------------------------------- /submission.R: -------------------------------------------------------------------------------- 1 | # usage: 2 | # Rscript submission.R final_submission.feather xgb_submission.csv 3 | 4 | library(methods) 5 | library(data.table) 6 | library(feather) 7 | 8 | 9 | input_cmd_args = commandArgs(trailingOnly = TRUE) 10 | 11 | path = path.expand(input_cmd_args[[1]]) 12 | 13 | out_path = path.expand(input_cmd_args[[2]]) 14 | out_path = paste0(out_path, ".gz") 15 | 16 | 17 | 18 | message(Sys.time(), " reading ", path) 19 | 20 | dt = read_feather(path) 21 | setDT(dt) 22 | dt[, p_neg:=-pred] 23 | 24 | message(Sys.time(), " sorting") 25 | setkey(dt, display_id, p_neg) 26 | 27 | 28 | message(Sys.time(), " generating submission") 29 | 30 | submission = dt[ , .(ad_id = paste(ad_id, collapse = " ")), keyby = display_id] 31 | 32 | write.table(submission, file = gzfile(out_path, compression = 1), row.names = F, quote = F, sep = ",", append = F) 33 | 34 | message(Sys.time(), " DONE") --------------------------------------------------------------------------------
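For completeness, the same step that submission.R performs can also be done directly in pandas (a sketch, assuming final_submission.feather contains display_id, ad_id and pred as written by 7_ensemble_xgb.py):

import feather

df = feather.read_dataframe('final_submission.feather')

# sort ads within each display by descending prediction and collapse them
# into the space-separated ad_id list expected by Kaggle
df = df.sort_values(['display_id', 'pred'], ascending=[True, False])
submission = df.groupby('display_id')['ad_id'].apply(lambda ids: ' '.join(map(str, ids)))
submission.reset_index().to_csv('xgb_submission.csv', index=False)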