├── LICENSE
├── README.md
└── clf.py

/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2014, Kyle Kastner
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the {organization} nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
kaggle-criteo
=============

Code for the Criteo Display Advertising Challenge: http://www.kaggle.com/c/criteo-display-ad-challenge
--------------------------------------------------------------------------------
/clf.py:
--------------------------------------------------------------------------------
"""
Modified from code by: https://www.kaggle.com/users/185835/tinrtgu
Forum post: https://www.kaggle.com/c/criteo-display-ad-challenge/forums/t/10322/beat-the-benchmark-with-less-then-200mb-of-memory
"""

# Authors: tinrtgu
#          Kyle Kastner
# License: BSD 3 Clause

from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt
import numpy as np


# parameters #################################################################

train = 'train.csv'          # path to training file
rev_train = 'rev_train.csv'  # path to reversed copy of the training file
test = 'test.csv'            # path to testing file

D = 2 ** 20    # number of weights used for learning
alpha = .1     # learning rate for sgd optimization
n_models = 11  # number of models for bagging/random subset

# function definitions #######################################################

# A. Bounded logloss
# INPUT:
#     p: our prediction
#     y: real answer
# OUTPUT:
#     logarithmic loss of p given y
def logloss(p, y):
    p = max(min(p, 1. - 1e-6), 1e-6)
    return -log(p) if y == 1. else -log(1. - p)
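

# A quick worked example of why the clipping above matters (values are
# approximate): without the bound, a fully confident wrong prediction gives
# -log(0) = inf, which would wreck the running loss totals in training_loop.
# With it:
#     logloss(0.0, 1.)  ->  -log(1e-6)      ~ 13.82  (worst case, finite)
#     logloss(0.5, 1.)  ->  -log(0.5)       ~ 0.693
#     logloss(1.0, 1.)  ->  -log(1. - 1e-6) ~ 1e-6   (never exactly 0)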


# B. Apply the hash trick to the original csv row
# for simplicity, we treat both integer and categorical features as categorical
# INPUT:
#     csv_row: a csv dictionary, ex: {'Label': '1', 'I1': '357', 'I2': '', ...}
#     D: the max index that we can hash to
# OUTPUT:
#     x: a list of indices whose feature values are 1
def get_x(csv_row, D):
    x = [0]  # 0 is the index of the bias term
    for key, value in csv_row.items():
        index = int(value + key[1:], 16) % D  # weakest hash ever ;)
        x.append(index)
    return x  # x contains indices of features that have a value of 1


# C. Get probability estimation on x
# INPUT:
#     x: features
#     w: weights
# OUTPUT:
#     estimated probability p(y = 1 | x; w)
def get_p(x, w):
    wTx = 0.
    for i in x:  # do wTx
        wTx += w[i] * 1.  # w[i] * x[i], but if i in x we got x[i] = 1.
    return 1. / (1. + exp(-max(min(wTx, 20.), -20.)))  # bounded sigmoid


# D. Update given model
# INPUT:
#     w: weights
#     n: a counter that counts the number of times we encounter a feature
#        this is used for the adaptive learning rate
#     x: features
#     p: prediction of our model
#     y: answer
# OUTPUT:
#     w: updated model
#     n: updated count
def update_w(w, n, x, p, y):
    for i in x:
        # alpha / (sqrt(n) + 1) is the adaptive learning rate heuristic
        # (p - y) * x[i] is the current gradient
        # note that in our case, if i in x then x[i] = 1
        w[i] -= (p - y) * alpha / (sqrt(n[i]) + 1.)
        n[i] += 1.

    return w, n


def training_loop(w_arr, n_arr, include_prob=.3, reverse=False):
    # one pass over the training data; each model sees a random ~include_prob
    # fraction of the rows, which is what decorrelates the bag
    loss_arr = [0.] * len(w_arr)
    if reverse:
        dr = DictReader(open(rev_train))
    else:
        dr = DictReader(open(train))

    random_state = np.random.RandomState(1999)
    for t, row in enumerate(dr):
        y = 1. if row['Label'] == '1' else 0.

        del row['Label']  # can't let the model peek the answer
        del row['Id']  # we don't need the Id

        # main training procedure
        # step 1, get the hashed features
        x = get_x(row, D)

        for i in range(len(w_arr)):
            if random_state.random_sample() < include_prob:
                # step 2, get prediction
                p = get_p(x, w_arr[i])

                # for progress validation, useless for learning our model
                loss_arr[i] += logloss(p, y)

                # step 3, update model with answer
                w_arr[i], n_arr[i] = update_w(w_arr[i], n_arr[i], x, p, y)

        if t % 1000000 == 0 and t > 1:
            # for progress validation, useless for learning our model
            # (t * include_prob approximates how many rows each model has seen)
            for i in range(len(w_arr)):
                print('Model %d\t %s\tencountered: %d\tcurrent logloss: %f' % (
                    i, datetime.now(), t, loss_arr[i] / (t * include_prob)))
    return w_arr, n_arr


def bag_prediction(x, w_arr):
    # combine the bag with min: only predict a click as strongly as the most
    # skeptical model in the bag does
    return np.min([get_p(x, w) for w in w_arr])
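

# min is a deliberately conservative combiner; the more conventional bagging
# aggregate is the mean of the per-model probabilities. A minimal alternative
# sketch (bag_prediction_mean is a hypothetical helper, not used below):
def bag_prediction_mean(x, w_arr):
    # average the probability estimates over all models in the bag
    return np.mean([get_p(x, w) for w in w_arr])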


# training and testing #######################################################
# initialize our model
# note: build each model's lists independently; [[0.] * D] * n_models would
# alias one shared inner list across all n_models entries, so every "bagged"
# model would train the exact same weights
w_arr = [[0.] * D for _ in range(n_models)]  # weights
n_arr = [[0.] * D for _ in range(n_models)]  # times we've encountered a feature

w_arr, n_arr = training_loop(w_arr, n_arr, reverse=True)
w_arr, n_arr = training_loop(w_arr, n_arr)
w_arr, n_arr = training_loop(w_arr, n_arr)

# testing (build kaggle's submission file)
with open('submission.csv', 'w') as submission:
    submission.write('Id,Predicted\n')
    for t, row in enumerate(DictReader(open(test))):
        Id = row['Id']
        del row['Id']
        x = get_x(row, D)
        p = bag_prediction(x, w_arr)
        submission.write('%s,%f\n' % (Id, p))
--------------------------------------------------------------------------------