"""LambdaMart
==========

LambdaMart python implementation
"""
import math
from collections import defaultdict
from copy import deepcopy
from functools import partial
from itertools import chain
from multiprocessing import Pool
from optparse import OptionParser
import time

import numpy as np

try:
    from sklearn.tree import DecisionTreeRegressor
except ImportError:
    # scikit-learn is only needed to fit trees; the ranking helpers below
    # stay importable (and testable) without it.
    DecisionTreeRegressor = None


class Ensemble:
    """Additive ensemble of fitted trees that share one learning rate."""

    def __init__(self, rate):
        """Create an empty ensemble whose trees are all scaled by `rate`."""
        self.trees = []
        self.rate = rate

    def __len__(self):
        """Return the number of trees currently in the ensemble."""
        return len(self.trees)

    def add(self, tree):
        """Append a fitted tree (any object exposing `predict`)."""
        self.trees.append(tree)

    def eval_one(self, object):
        """Return the ensemble output for a single feature vector."""
        return self.eval([object])[0]

    def eval(self, objects):
        """Return the rate-scaled sum of every tree's prediction.

        The result is a 1-D array with one entry per row of `objects`;
        an empty ensemble yields all zeros.
        """
        results = np.zeros(len(objects))
        for tree in self.trees:
            results += tree.predict(objects) * self.rate
        return results

    def remove(self, number):
        """Drop the last `number` trees.

        Raises:
            ValueError: if `number` is negative.
        """
        if number < 0:
            raise ValueError("number of trees to remove must be >= 0")
        if number == 0:
            # `self.trees[:-0]` is `self.trees[:0]`, which would silently
            # discard the whole ensemble instead of removing nothing.
            return
        self.trees = self.trees[:-number]


def groupby(score, query):
    """Split `score` into arrays, one per run of consecutive equal `query`.

    Values are grouped only while the query id stays the same from one row
    to the next, so rows of the same query are expected to be contiguous.
    Returns a list of numpy arrays (a real list, so `len()` works on both
    Python 2 and Python 3).
    """
    groups = []
    current = None
    for s, q in zip(score, query):
        if not groups or q != current:
            groups.append([])
            current = q
        groups[-1].append(s)
    return [np.array(group) for group in groups]


def point_dcg(arg):
    """Return the DCG contribution of one `(position, label)` pair.

    `position` is 0-based; the gain is `2 ** label - 1` and the discount
    is `log2(position + 2)`.
    """
    i, label = arg
    return (2 ** label - 1) / math.log(i + 2, 2)


def dcg(scores):
    """Return the DCG of labels listed in ranked order."""
    return sum(map(point_dcg, enumerate(scores)))


def ndcg(page, k=10):
    """Return NDCG@k for `page`, the true labels in model-ranked order.

    The ideal ranking is the `k` largest labels in descending order, so
    the numerator and the denominator are both cut at the same depth.
    Returns 1.0 when the ideal DCG is zero.
    """
    page = np.asarray(page)
    model_top = page[:k]
    # Sort the full page and cut at k; the previous hard-coded 10 made the
    # ideal DCG cover a different number of positions whenever k != 10.
    true_top = np.sort(page)[::-1][:k]

    max_dcg = dcg(true_top)
    if max_dcg == 0:
        return 1.0

    return dcg(model_top) / max_dcg


def score(prediction, true_score, query, k=10):
    """Return the mean NDCG@k over all queries.

    Each query's labels are reordered by descending model prediction and
    scored with `ndcg`. Returns 0.0 when there are no queries at all.
    """
    true_pages = groupby(true_score, query)
    model_pages = groupby(prediction, query)

    total_ndcg = []
    for true_page, model_page in zip(true_pages, model_pages):
        page = true_page[np.argsort(model_page)[::-1]]
        total_ndcg.append(ndcg(page, k))

    if not total_ndcg:
        return 0.0
    return sum(total_ndcg) / len(total_ndcg)


def query_lambdas(page, k=10):
    """Return the LambdaRank pseudo-responses for one query.

    Args:
        page: `(true_page, model_page)` - true labels and current model
            scores of the query's documents, in the same document order.
        k: depth used for the ideal DCG that normalises the swap deltas.

    Returns:
        A 1-D array aligned with the input document order. For every pair
        where document i has a larger label than document j, the value
        `rho * |delta NDCG of swapping their ranks|` is added to entry i
        and subtracted from entry j.
    """
    true_page, model_page = page
    true_page = np.asarray(true_page, dtype=float)
    model_page = np.asarray(model_page, dtype=float)

    size = len(true_page)
    lambdas = np.zeros(size)

    idcg = dcg(np.sort(true_page)[::-1][:k])
    if size < 2 or idcg == 0:
        # No pair to compare, or nothing to normalise by.
        return lambdas

    # rank[d] is the 0-based position of document d when the page is
    # sorted by descending model score; swap deltas depend on these ranks,
    # not on where the document happens to sit in the input.
    order = np.argsort(model_page)[::-1]
    rank = np.empty(size, dtype=int)
    rank[order] = np.arange(size)

    gains = 2.0 ** true_page - 1
    discounts = 1.0 / np.log2(rank + 2.0)

    for i in range(size):
        for j in range(size):
            if true_page[i] > true_page[j]:
                # DCG change when documents i and j trade rank positions.
                delta_dcg = (gains[i] - gains[j]) * (discounts[j] - discounts[i])
                delta_ndcg = abs(delta_dcg / idcg)

                rho = 1 / (1 + math.exp(model_page[i] - model_page[j]))

                lam = rho * delta_ndcg

                lambdas[j] -= lam
                lambdas[i] += lam
    return lambdas


def compute_lambdas(prediction, true_score, query, k=10):
    """Return the pseudo-responses of every document, query by query.

    The per-query work is spread over a process pool; the result is a flat
    list in the same order as the inputs.
    """
    true_pages = groupby(true_score, query)
    model_pages = groupby(prediction, query)

    print("%d pages" % len(true_pages))

    pool = Pool()
    try:
        lambdas = pool.map(partial(query_lambdas, k=k),
                           list(zip(true_pages, model_pages)))
    finally:
        # This function runs once per boosting iteration; without an
        # explicit shutdown every call would leave its workers behind.
        pool.close()
        pool.join()
    return list(chain.from_iterable(lambdas))


def mart_responces(prediction, true_score):
    """Return plain regression residuals (true score minus prediction)."""
    return true_score - prediction


def learn(train_file, n_trees=10, learning_rate=0.1, k=10, validate=False):
    """Fit a boosted tree ensemble on lambda pseudo-responses.

    The training file is a comma-separated table with one header row.
    Column 0 holds the relevance label, column 1 the query id, and columns
    3 onwards the features; column 2 is not used for training.

    Args:
        train_file: path (or file object) accepted by `numpy.loadtxt`.
        n_trees: number of boosting iterations, one tree each.
        learning_rate: shrinkage applied to every tree's prediction.
        k: depth passed to the lambda computation and to the NDCG report.
        validate: accepted for interface compatibility; not used by the
            code below.

    Returns:
        The fitted `Ensemble`.
    """
    # Function-scope imports keep this block self-contained: `time.clock`
    # no longer exists on Python >= 3.8, and the tree learner is only
    # needed once training actually runs.
    from timeit import default_timer
    from sklearn.tree import DecisionTreeRegressor

    print("Loading train file")
    # ndmin=2 keeps the column slicing valid for a single-row file.
    train = np.loadtxt(train_file, delimiter=",", skiprows=1, ndmin=2)

    scores = train[:, 0]
    queries = train[:, 1]
    features = train[:, 3:]

    ensemble = Ensemble(learning_rate)

    print("Training starts...")
    model_output = np.zeros(len(features))

    for i in range(n_trees):
        print(" Iteration: " + str(i + 1))

        # Compute pseudo responses (lambdas),
        # which act as the training label for each document.
        start = default_timer()
        print(" --generating labels")
        lambdas = compute_lambdas(model_output, scores, queries, k)
        print(" --done " + str(default_timer() - start) + " sec")

        # Create a tree and append it to the model.
        print(" --fitting tree")
        start = default_timer()
        tree = DecisionTreeRegressor(max_depth=6)
        tree.fit(features, lambdas)
        print(" ---done " + str(default_timer() - start) + " sec")

        print(" --adding tree to ensemble")
        ensemble.add(tree)

        # Update the running model score.
        print(" --generating step prediction")
        prediction = tree.predict(features)

        print(" --updating full model output")
        model_output += learning_rate * prediction

        start = default_timer()
        print(" --scoring on train")
        train_score = score(model_output, scores, queries, k)
        print(" --iteration train score " + str(train_score) + ", took "
              + str(default_timer() - start) + "sec to calculate")

    # Final quality check through the ensemble itself. The original called
    # an undefined `compute_ndcg` here, which raised NameError only after
    # every tree had been trained.
    train_score = score(ensemble.eval(features), scores, queries, k)
    print("final train score " + str(train_score))

    print("Finished sucessfully.")
    print("------------------------------------------------")
    return ensemble


def predict(model, fn):
    """Score the documents in `fn` and write them to ``result.csv``.

    The input has the same layout as the training file (one header row;
    query id in column 1, document id in column 2, features from column 3).
    One `query, score, doc_id` row is written per document to
    ``result.csv`` in the current working directory.

    Returns:
        The string "OK".
    """
    import csv
    import sys

    data = np.loadtxt(fn, delimiter=",", skiprows=1, ndmin=2)

    queries = data[:, 1]
    doc_id = data[:, 2]
    features = data[:, 3:]

    results = np.asarray(model.eval(features))

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline="" on Python 3.
    if sys.version_info[0] >= 3:
        out = open("result.csv", "w", newline="")
    else:
        out = open("result.csv", "wb")
    with out:
        writer = csv.writer(out)
        # tolist() yields plain Python floats, so every cell is written as
        # an ordinary decimal literal.
        writer.writerows(zip(queries.tolist(), results.tolist(),
                             doc_id.tolist()))
    return "OK"


def main():
    """Parse the command line, train a model and optionally predict."""
    parser = OptionParser()
    parser.add_option("-t", "--train", action="store", type="string", dest="train_file")
    parser.add_option("-v", "--validation", action="store_true", dest="validate")
    parser.add_option("-p", "--predict", action="store", type="string", dest="predict_file")

    options, _ = parser.parse_args()
    if not options.train_file:
        parser.error("a training file is required (-t/--train)")

    # NOTE(review): the original script also set `iterations = 30` and
    # `learning_rate = 0.001` but never passed them to learn(); the settings
    # that were actually in effect are kept - confirm which were intended.
    model = learn(options.train_file,
                  validate=options.validate,
                  n_trees=200)

    if options.predict_file:
        predict(model, options.predict_file)


if __name__ == "__main__":
    main()