├── LICENSE
├── README.md
└── clf.py

/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2014, Kyle Kastner
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the {organization} nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
kaggle-criteo
=============

Code for the Criteo Display Advertising Challenge: http://www.kaggle.com/c/criteo-display-ad-challenge
--------------------------------------------------------------------------------
/clf.py:
--------------------------------------------------------------------------------
"""
Modified from code by: https://www.kaggle.com/users/185835/tinrtgu
Forum post: https://www.kaggle.com/c/criteo-display-ad-challenge/forums/t/10322/beat-the-benchmark-with-less-then-200mb-of-memory
"""

# Authors: tinrtgu
#          Kyle Kastner
# License: BSD 3 Clause

from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt
import numpy as np


# parameters #################################################################

train = 'train.csv'          # path to training file
rev_train = 'rev_train.csv'  # path to reversed copy of the training file
test = 'test.csv'            # path to testing file

D = 2 ** 20    # number of weights used for learning
alpha = .1     # learning rate for sgd optimization
n_models = 11  # number of models for bagging/random subset

# function definitions #######################################################

# A. Bounded logloss
# INPUT:
#     p: our prediction
#     y: real answer
# OUTPUT:
#     logarithmic loss of p given y
def logloss(p, y):
    p = max(min(p, 1. - 1e-6), 1e-6)
    return -log(p) if y == 1. else -log(1. - p)
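

# A quick worked example of why the clipping above matters (values are
# approximate): without the bound, a fully confident wrong prediction gives
# -log(0) = inf, which would wreck the running loss totals in training_loop.
# With it:
#     logloss(0.0, 1.)  ->  -log(1e-6)      ~ 13.82  (worst case, finite)
#     logloss(0.5, 1.)  ->  -log(0.5)       ~ 0.693
#     logloss(1.0, 1.)  ->  -log(1. - 1e-6) ~ 1e-6   (never exactly 0)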


# B. Apply the hash trick to the original csv row
# for simplicity, we treat both integer and categorical features as categorical
# INPUT:
#     csv_row: a csv dictionary, ex: {'Label': '1', 'I1': '357', 'I2': '', ...}
#     D: the max index that we can hash to
# OUTPUT:
#     x: a list of indices whose feature values are 1
def get_x(csv_row, D):
    x = [0]  # 0 is the index of the bias term
    for key, value in csv_row.items():
        index = int(value + key[1:], 16) % D  # weakest hash ever ;)
        x.append(index)
    return x  # x contains indices of features that have a value of 1


# C. Get probability estimation on x
# INPUT:
#     x: features
#     w: weights
# OUTPUT:
#     estimated probability p(y = 1 | x; w)
def get_p(x, w):
    wTx = 0.
    for i in x:  # do wTx
        wTx += w[i] * 1.  # w[i] * x[i], but if i in x we got x[i] = 1.
    return 1. / (1. + exp(-max(min(wTx, 20.), -20.)))  # bounded sigmoid


# D. Update given model
# INPUT:
#     w: weights
#     n: a counter that counts the number of times we encounter a feature
#        this is used for the adaptive learning rate
#     x: features
#     p: prediction of our model
#     y: answer
# OUTPUT:
#     w: updated model
#     n: updated count
def update_w(w, n, x, p, y):
    for i in x:
        # alpha / (sqrt(n) + 1) is the adaptive learning rate heuristic
        # (p - y) * x[i] is the current gradient
        # note that in our case, if i in x then x[i] = 1
        w[i] -= (p - y) * alpha / (sqrt(n[i]) + 1.)
        n[i] += 1.

    return w, n


def training_loop(w_arr, n_arr, include_prob=.3, reverse=False):
    # one pass over the training data; each model sees a random ~include_prob
    # fraction of the rows, which is what decorrelates the bag
    loss_arr = [0.] * len(w_arr)
    if reverse:
        dr = DictReader(open(rev_train))
    else:
        dr = DictReader(open(train))

    random_state = np.random.RandomState(1999)
    for t, row in enumerate(dr):
        y = 1. if row['Label'] == '1' else 0.

        del row['Label']  # can't let the model peek the answer
        del row['Id']  # we don't need the Id

        # main training procedure
        # step 1, get the hashed features
        x = get_x(row, D)

        for i in range(len(w_arr)):
            if random_state.random_sample() < include_prob:
                # step 2, get prediction
                p = get_p(x, w_arr[i])

                # for progress validation, useless for learning our model
                loss_arr[i] += logloss(p, y)

                # step 3, update model with answer
                w_arr[i], n_arr[i] = update_w(w_arr[i], n_arr[i], x, p, y)

        if t % 1000000 == 0 and t > 1:
            # for progress validation, useless for learning our model
            # (t * include_prob approximates how many rows each model has seen)
            for i in range(len(w_arr)):
                print('Model %d\t %s\tencountered: %d\tcurrent logloss: %f' % (
                    i, datetime.now(), t, loss_arr[i] / (t * include_prob)))
    return w_arr, n_arr


def bag_prediction(x, w_arr):
    # combine the bag with min: only predict a click as strongly as the most
    # skeptical model in the bag does
    return np.min([get_p(x, w) for w in w_arr])
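

# min is a deliberately conservative combiner; the more conventional bagging
# aggregate is the mean of the per-model probabilities. A minimal alternative
# sketch (bag_prediction_mean is a hypothetical helper, not used below):
def bag_prediction_mean(x, w_arr):
    # average the probability estimates over all models in the bag
    return np.mean([get_p(x, w) for w in w_arr])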


# training and testing #######################################################
# initialize our model
# note: build each model's lists independently; [[0.] * D] * n_models would
# alias one shared inner list across all n_models entries, so every "bagged"
# model would train the exact same weights
w_arr = [[0.] * D for _ in range(n_models)]  # weights
n_arr = [[0.] * D for _ in range(n_models)]  # times we've encountered a feature

w_arr, n_arr = training_loop(w_arr, n_arr, reverse=True)
w_arr, n_arr = training_loop(w_arr, n_arr)
w_arr, n_arr = training_loop(w_arr, n_arr)

# testing (build kaggle's submission file)
with open('submission.csv', 'w') as submission:
    submission.write('Id,Predicted\n')
    for t, row in enumerate(DictReader(open(test))):
        Id = row['Id']
        del row['Id']
        x = get_x(row, D)
        p = bag_prediction(x, w_arr)
        submission.write('%s,%f\n' % (Id, p))
--------------------------------------------------------------------------------