├── .DS_Store
├── .gitignore
├── README.pdf
├── avito-context-click-py
│   ├── convert_csv_to_libffm.py
│   ├── convert_csv_to_libsvm.py
│   ├── train_ftrl.py
│   ├── train_pylearn.py
│   ├── train_scikit.py
│   ├── train_xgb.py
│   ├── train_xgb_dtry.py
│   └── util_rpython.py
├── avito-context-click-r
│   ├── .Rapp.history
│   ├── .Rprofile
│   ├── .Rproj.user
│   │   ├── 4B3CD3A5
│   │   │   ├── graphics-r3
│   │   │   │   └── empty.png
│   │   │   ├── pcs
│   │   │   │   ├── files-pane.pper
│   │   │   │   ├── source-pane.pper
│   │   │   │   ├── windowlayoutstate.pper
│   │   │   │   └── workbench-pane.pper
│   │   │   ├── persistent-state
│   │   │   ├── saved_source_markers
│   │   │   └── sdb
│   │   │       ├── prop
│   │   │       │   ├── 19AE70ED
│   │   │       │   ├── 23B66537
│   │   │       │   ├── 266CD89
│   │   │       │   ├── 3B4F6947
│   │   │       │   ├── 6A3DD511
│   │   │       │   ├── 6EC2D3AD
│   │   │       │   ├── 6F75496B
│   │   │       │   ├── 9EF9E6CB
│   │   │       │   ├── A7A18FB5
│   │   │       │   ├── BBE2842F
│   │   │       │   ├── C2BB45F6
│   │   │       │   ├── D970D594
│   │   │       │   ├── F8AE1A87
│   │   │       │   ├── FB96E70
│   │   │       │   └── INDEX
│   │   │       └── s-8FDFA111
│   │   │           └── lock_file
│   │   ├── 73D1DC80
│   │   │   ├── persistent-state
│   │   │   └── saved_source_markers
│   │   └── D01F76BA
│   │       ├── pcs
│   │       │   ├── files-pane.pper
│   │       │   ├── source-pane.pper
│   │       │   ├── windowlayoutstate.pper
│   │       │   └── workbench-pane.pper
│   │       └── sdb
│   │           └── prop
│   │               ├── 61214431
│   │               ├── 186F7A30
│   │               ├── 2191579E
│   │               ├── 30E2EF33
│   │               ├── 31E02146
│   │               ├── 396ED20
│   │               ├── 3ACE6FF1
│   │               ├── 51638EA1
│   │               ├── 5C8CFDEE
│   │               ├── 6D0737A6
│   │               ├── 84994D68
│   │               ├── B4D42750
│   │               ├── B9BF59A7
│   │               ├── BD7C1F23
│   │               ├── C3A0AE51
│   │               ├── D0B7BE74
│   │               ├── DC31BA58
│   │               ├── E0E39D19
│   │               ├── F66A3FFA
│   │               ├── FC1B5EBA
│   │               └── INDEX
│   ├── _fn.base.R
│   ├── _fn.base.cpp
│   ├── _utils.R
│   ├── avito-context-click-r.Rproj
│   ├── data.build.R
│   ├── data.build.dtry.R
│   ├── data.build.tree.R
│   ├── data.combine.R
│   ├── main.R
│   ├── train.l1.fm.01.R
│   ├── train.l1.fm.02.R
│   ├── train.l1.fm.03.R
│   ├── train.l1.fm.04.R
│   ├── train.l1.fm.05.R
│   ├── train.l1.ftrl.04.R
│   ├── train.l1.ftrl.05.R
│   ├── train.l1.ftrl.06.R
│   ├── train.l1.xgb.03.R
│   ├── train.l1.xgb.05.R
│   ├── train.l2.xgb.02.R
│   ├── train.xgb.dtry.R
│   └── train.zens.R
├── data
│   ├── input
│   │   └── empty.csv
│   ├── log
│   │   └── data_build
│   │       └── empty.log
│   ├── output-libffm
│   │   └── empty.libffm
│   ├── output-libsvm
│   │   └── empty.libsvm
│   ├── output-py
│   │   └── empty.csv
│   ├── output-r
│   │   └── empty.csv
│   ├── rdata
│   │   └── empty.csv
│   ├── submission
│   │   └── empty.csv
│   └── template
│       └── zens_nn.yaml
└── fm
    ├── .DS_Store
    ├── Makefile
    ├── README
    ├── fm
    ├── fm.dSYM
    │   └── Contents
    │       ├── Info.plist
    │       └── Resources
    │           └── DWARF
    │               └── fm
    └── src
        ├── common.cpp
        ├── common.h
        ├── timer.cpp
        ├── timer.h
        └── train.cpp

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
--------------------------------------------------------------------------------
/README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/README.pdf
--------------------------------------------------------------------------------
/avito-context-click-py/convert_csv_to_libffm.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import warnings
3 | import os
4 | import gzip
5 | from csv import DictReader
6 | from collections import namedtuple
7 | from datetime import datetime
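# NOTE (annotation, not in the original source): convert_csv_2_libffm below
# emits one line per selected CSV row in the libffm format
#     <label> <field>:<feature>:<value> ...
# Categorical columns are one-hot encoded (value 1.0, one global feature id
# per distinct value); numeric columns keep a fixed feature id and carry
# their raw value. With -old_format only bare feature ids are written, which
# is why numeric columns are rejected in that mode.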
8 | import sys 9 | 10 | OutInfo = namedtuple('OutInfo', ['writer', 'selector']) 11 | 12 | 13 | def convert_csv_2_libffm(input_files, out_selector, col_out, col_in_cat, col_in_num, old_format, silent): 14 | 15 | if old_format and len(col_in_num) > 0: 16 | raise ValueError('Old format doesn''t support numeric columns') 17 | 18 | start = datetime.now() 19 | out_lst = [OutInfo(writer=open_write(out_file), selector=expr) for out_file, expr in out_selector.items()] 20 | invalid_output = {'', 'na', "nan", 'NA', 'NaN'} 21 | 22 | feat_map = {} 23 | feat_index = 1 24 | for col_in in col_in_cat: 25 | feat_map[col_in] = {} 26 | for col_in in col_in_num: 27 | feat_map[col_in] = feat_index 28 | feat_index += 1 29 | 30 | row_count = 0 31 | for input_file in input_files: 32 | for row in DictReader(open_read(input_file)): 33 | 34 | row_count += 1 35 | 36 | out_select = [out_stream.selector(input_file, row) for out_stream in out_lst] 37 | 38 | if sum(out_select) > 0: 39 | cur_row = row[col_out] 40 | if cur_row in invalid_output: 41 | cur_row = '0' 42 | 43 | col_index = 1 44 | 45 | for col_in in col_in_cat: 46 | col_map = feat_map[col_in] 47 | col_val = row[col_in] 48 | 49 | if col_val in col_map: 50 | col_feat = col_map[col_val] 51 | else: 52 | col_feat = col_map[col_val] = feat_index 53 | feat_index += 1 54 | 55 | if old_format: 56 | cur_row += ' ' + str(col_feat) 57 | else: 58 | cur_row += ' ' + str(col_index) + ':' + str(col_feat) + ':1.0' 59 | 60 | col_index += 1 61 | 62 | for col_in in col_in_num: 63 | col_feat = feat_map[col_in] 64 | cur_row += ' ' + str(col_index) + ':' + str(col_feat) + ':' + row[col_in] 65 | 66 | col_index += 1 67 | 68 | for i, out_stream in enumerate(out_lst): 69 | if out_select[i]: 70 | out_stream.writer.write(cur_row) 71 | out_stream.writer.write('\n') 72 | 73 | if not silent and row_count % 10000000 == 0: 74 | print('Lines read: %d, Elapsed time: %s' % (row_count, str(datetime.now() - start))) 75 | 76 | for out_stream in out_lst: 77 | out_stream.writer.close() 78 | 79 | if not silent: 80 | print('Total lines read: %d, Elapsed time: %s' % (row_count, str(datetime.now() - start))) 81 | 82 | 83 | def open_read(path): 84 | 85 | if not os.path.exists(path) and os.path.exists(path + '.gz'): 86 | path += '.gz' 87 | print('\nOpening %s to read.' 
% path) 88 | 89 | if path == 'sys.stdin': 90 | return sys.stdin 91 | 92 | if path.endswith('.gz'): 93 | return gzip.open(path, mode='rt') 94 | else: 95 | return open(path, mode='rt') 96 | 97 | 98 | def open_write(path): 99 | path_dir = os.path.dirname(path) 100 | if not os.path.exists(path_dir): 101 | os.makedirs(path_dir) 102 | 103 | return open(path, 'w') 104 | 105 | if __name__ == "__main__": 106 | 107 | # -input_files ../data/output-r/data.all.lr.csv.gz 108 | # -out_selector 109 | # "{'../data/output-libffm/fm_01/data.val.tr.fm': lambda file, row: row['SearchType'] in ['hist', 'tr'], 110 | # '../data/output-libffm/fm_01/data.val.tt.fm': lambda file, row: row['SearchType'] in ['val'], 111 | # '../data/output-libffm/fm_01/data.test.tr.fm': lambda file, row: row['SearchType'] in ['hist', 'tr', 'val'], 112 | # '../data/output-libffm/fm_01/data.test.tt.fm': lambda file, row: row['SearchType'] in ['test']}" 113 | # -col_out IsClick 114 | # -col_in_cat 115 | # AdCatID AdHistCTRBin AdID AdParams AdPriceBin AdTitleSZBin 116 | # Position 117 | # SearchAdCount SearchAdT1Count SearchAdT2Count SearchAdT3Count 118 | # SearchCatID SearchLocID SearchParamsSZBin SearchQuerySZBin SearchRussian 119 | # UserID UserIPID UserPrevQryDateBin UserQryTotalTimeBin 120 | 121 | parser = argparse.ArgumentParser(description='Conver a csv file to libffm format', 122 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 123 | 124 | parser.add_argument('-input_files', 125 | default=[], 126 | type=str, 127 | nargs='+', 128 | help='CSV with training data') 129 | 130 | parser.add_argument('-out_selector', 131 | required=True, 132 | type=str, 133 | help='Dictionary with format {"out_file" : lambda file, row: is_instance(file, row)}' 134 | ' to select instances') 135 | 136 | parser.add_argument('-col_out', 137 | required=True, 138 | type=str, 139 | help='Output column.') 140 | 141 | parser.add_argument('-col_in_cat', 142 | default=[], 143 | type=str, 144 | nargs='+', 145 | help='List of the names of the categorical input columns') 146 | 147 | parser.add_argument('-col_in_num', 148 | default=[], 149 | type=str, 150 | nargs='+', 151 | help='List of the names of the numerical input columns') 152 | 153 | parser.add_argument('-silent', 154 | default=False, 155 | action='store_true', 156 | help='Don''t print execution information') 157 | 158 | parser.add_argument('-old_format', 159 | default=False, 160 | action='store_true', 161 | help="Outputs files in the old format") 162 | 163 | args = vars(parser.parse_args()) 164 | args['out_selector'] = eval(args['out_selector']) 165 | 166 | # if not sys.stdin.isatty(): 167 | # args['input_files'].append('sys.stdin') 168 | 169 | if not args['silent']: 170 | print('\n\n\n\n\n\n\n\n\n') 171 | print(args) 172 | 173 | with warnings.catch_warnings(): 174 | warnings.filterwarnings("ignore", category=Warning) 175 | convert_csv_2_libffm(**args) 176 | 177 | 178 | -------------------------------------------------------------------------------- /avito-context-click-py/convert_csv_to_libsvm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import warnings 3 | import os 4 | import gzip 5 | from csv import DictReader 6 | from collections import namedtuple 7 | from datetime import datetime 8 | 9 | OutInfo = namedtuple('OutInfo', ['writer', 'out_file', 'selector', 'weight_builder', 'weight_writer']) 10 | 11 | 12 | def convert_csv_2_libsvm(input_files, feat_map_file, out_selector, 13 | col_out, col_in_cat, col_in_num, missing_values, 14 | 
feat_start, weight_builder_dict): 15 | 16 | start = datetime.now() 17 | out_lst = [OutInfo(writer=open_write(out_file), 18 | out_file=out_file, 19 | selector=expr, 20 | weight_builder=weight_builder_dict[out_file] if out_file in weight_builder_dict else None, 21 | weight_writer=open_write(out_file + '.weight') if out_file in weight_builder_dict else None) 22 | for out_file, expr in out_selector.items()] 23 | missing_values = set(missing_values) 24 | 25 | feat_map = {} 26 | feat_map_list = ['skip' + str(ix) for ix in range(feat_start)] 27 | for col_in in col_in_cat: 28 | feat_map[col_in] = {} 29 | for col_in in col_in_num: 30 | feat_map[col_in] = len(feat_map_list) 31 | feat_map_list.append(col_in + '\tfloat') 32 | 33 | row_count = 0 34 | for input_file in input_files: 35 | for row in DictReader(open_read(input_file)): 36 | 37 | row_count += 1 38 | 39 | out_select = [out_stream.selector(input_file, row) for out_stream in out_lst] 40 | 41 | if sum(out_select) > 0: 42 | cur_row = row[col_out] 43 | if cur_row in missing_values: 44 | cur_row = '0' 45 | 46 | for col_in in col_in_cat: 47 | col_map = feat_map[col_in] 48 | col_val = row[col_in] 49 | if col_val not in missing_values: 50 | if col_val in col_map: 51 | col_feat = col_map[col_val] 52 | else: 53 | col_feat = col_map[col_val] = len(feat_map_list) 54 | feat_map_list.append(col_in + '=' + col_val + '\ti') 55 | cur_row += ' ' + str(col_feat) + ':1' 56 | 57 | for col_in in col_in_num: 58 | col_val = row[col_in] 59 | if col_val not in missing_values: 60 | col_feat = feat_map[col_in] 61 | cur_row += ' ' + str(col_feat) + ':' + col_val 62 | 63 | for i, out_stream in enumerate(out_lst): 64 | if out_select[i]: 65 | out_stream.writer.write(cur_row) 66 | out_stream.writer.write('\n') 67 | if out_stream.weight_builder is not None: 68 | cur_weight = str(out_stream.weight_builder(input_file, row)) 69 | out_stream.weight_writer.write(cur_weight) 70 | out_stream.weight_writer.write('\n') 71 | 72 | if row_count % 10000000 == 0: 73 | print('Lines read: %d, Elapsed time: %s' % (row_count, str(datetime.now() - start))) 74 | 75 | for out_stream in out_lst: 76 | out_stream.writer.close() 77 | if out_stream.weight_writer is not None: 78 | out_stream.weight_writer.close() 79 | 80 | if feat_map_file is not None: 81 | with open_write(feat_map_file) as fmap_out: 82 | for ix, fvalue in enumerate(feat_map_list): 83 | fmap_out.write(str(ix) + '\t' + fvalue + '\n') 84 | 85 | print('Total lines read: %d, Elapsed time: %s' % (row_count, str(datetime.now() - start))) 86 | 87 | 88 | def open_read(path): 89 | 90 | if not os.path.exists(path) and os.path.exists(path + '.gz'): 91 | path += '.gz' 92 | print('\nOpening %s to read.' 
% path) 93 | 94 | if path.endswith('.gz'): 95 | return gzip.open(path, mode='rt') 96 | else: 97 | return open(path, mode='rt') 98 | 99 | 100 | def open_write(path): 101 | path_dir = os.path.dirname(path) 102 | if not os.path.exists(path_dir): 103 | os.makedirs(path_dir) 104 | 105 | return open(path, 'w') 106 | 107 | if __name__ == "__main__": 108 | 109 | # -input_files ../data/dmitry/data.all.tree.dl.csv.samp 110 | # -out_selector 111 | # "{'../data/dmitry/data.all.tree.dl.csv.samp.libsvm': lambda file, row: True}" 112 | # -feat_map_file ../data/dmitry/data.all.tree.dl.csv.samp.fmap 113 | # -col_out IsClick 114 | # -col_in_cat 115 | # AdCatID 116 | # -col_in_num 117 | # AdHistCTR AdPrice Position UserQryTotalTime 118 | 119 | parser = argparse.ArgumentParser(description='Conver a csv file to libffm format', 120 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 121 | 122 | parser.add_argument('-input_files', 123 | default=[], 124 | type=str, 125 | nargs='+', 126 | help='CSV with training data') 127 | 128 | parser.add_argument('-feat_map_file', 129 | default=None, 130 | type=str, 131 | help='Path to output feat map') 132 | 133 | parser.add_argument('-out_selector', 134 | required=True, 135 | type=str, 136 | help='Dictionary with format {"out_file" : lambda file, row: is_instance(file, row)}' 137 | ' to select instances') 138 | 139 | parser.add_argument('-col_out', 140 | required=True, 141 | type=str, 142 | help='Output column.') 143 | 144 | parser.add_argument('-col_in_cat', 145 | default=[], 146 | type=str, 147 | nargs='+', 148 | help='List of the names of the categorical input columns') 149 | 150 | parser.add_argument('-col_in_num', 151 | default=[], 152 | type=str, 153 | nargs='+', 154 | help='List of the names of the numerical input columns') 155 | 156 | parser.add_argument('-missing_values', 157 | default=['', 'na', "nan", 'NA', 'NaN'], 158 | type=str, 159 | nargs='+', 160 | help='List of the names of the numerical input columns') 161 | 162 | parser.add_argument('-feat_start', 163 | default=0, 164 | type=int, 165 | help='Starting index of features') 166 | 167 | parser.add_argument('-weight_builder_dict', 168 | default='{}', 169 | type=str, 170 | help='create weight features') 171 | 172 | args = vars(parser.parse_args()) 173 | args['out_selector'] = eval(args['out_selector']) 174 | if args['weight_builder_dict'] is not None: 175 | args['weight_builder_dict'] = eval(args['weight_builder_dict']) 176 | 177 | print('\n' + (' '*300) + '\n') 178 | print(args) 179 | 180 | with warnings.catch_warnings(): 181 | warnings.filterwarnings("ignore", category=Warning) 182 | convert_csv_2_libsvm(**args) 183 | -------------------------------------------------------------------------------- /avito-context-click-py/train_ftrl.py: -------------------------------------------------------------------------------- 1 | """ 2 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 3 | Version 2, December 2004 4 | 5 | Copyright (C) 2004 Sam Hocevar 6 | 7 | Everyone is permitted to copy and distribute verbatim or modified 8 | copies of this license document, and changing it is allowed as long 9 | as the name is changed. 10 | 11 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 12 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 13 | 14 | 0. You just DO WHAT THE FUCK YOU WANT TO. 
15 | """ 16 | 17 | from datetime import datetime 18 | from csv import DictReader 19 | from math import exp, log, sqrt, copysign 20 | import os 21 | import gzip 22 | import argparse 23 | import warnings 24 | import random 25 | import pickle as pkl 26 | import inspect 27 | 28 | 29 | class DataObject(object): 30 | 31 | def __props__(self): 32 | attribs = inspect.getmembers(self, lambda attr: not(inspect.isroutine(attr))) 33 | return [a for a in attribs if not(a[0].startswith('__') and a[0].endswith('__'))] 34 | 35 | def __repr__(self): 36 | return '{}({})'.format(self.__class__.__name__, repr(self.__props__())) 37 | 38 | 39 | class FTRLProximal(object): 40 | """ Our main algorithm: Follow the regularized leader - proximal 41 | 42 | In short, 43 | this is an adaptive-learning-rate sparse logistic-regression with 44 | efficient L1-L2-regularization 45 | 46 | Reference: 47 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 48 | """ 49 | 50 | class StatusData(DataObject): 51 | 52 | def __init__(self): 53 | self.start = datetime.now() 54 | self.loss = 0. 55 | self.count = 0 56 | self.loss_check = 0. 57 | self.count_check = 0 58 | self.total_count = 0 59 | 60 | class InstanceData(DataObject): 61 | 62 | def __init__(self, x, y, row): 63 | self.x = x 64 | self.y = y 65 | self.row = row 66 | 67 | def __init__(self, alpha, beta, l1, l2, bits, two_way, dropout, seed, col_out, 68 | col_in_cat, test_pred_col, test_pred_extra_cols, no_pred_suffix): 69 | # parameters 70 | self.alpha = alpha 71 | self.beta = beta 72 | self.l1 = l1 73 | self.l2 = l2 74 | 75 | # feature related parameters 76 | self.two_way = two_way 77 | self.dropout = max(min(dropout, 1.), 0.) 78 | self.rnd = random.Random() 79 | self.rnd.seed(seed) 80 | self.seed = seed 81 | 82 | self.bits = bits 83 | self.key_cache = {} 84 | self.d = 2 ** bits 85 | self.n = [0.] * self.d 86 | self.z = [0.] * self.d 87 | 88 | self.epoch = 1 89 | self.col_out = col_out 90 | self.col_in_cat = col_in_cat 91 | self.test_pred_col = test_pred_col 92 | self.test_pred_extra_cols = test_pred_extra_cols 93 | self.no_pred_suffix = no_pred_suffix 94 | 95 | def _calc_col_index(self, val_id): 96 | index_offset = 1 97 | return (abs(hash(val_id)) % (self.d-index_offset)) + index_offset 98 | 99 | def _open_pred_path(self, outpath): 100 | return open(outpath + ('' if self.no_pred_suffix else "." + str(self.epoch)), 'w') 101 | 102 | @staticmethod 103 | def _get_prob(wtx): 104 | return 1. / (1. + exp(-max(min(wtx, 35.), -35.))) 105 | 106 | def _get_w(self, i): 107 | sign = copysign(1, self.z[i]) 108 | if sign * self.z[i] <= self.l1: 109 | return 0. 110 | else: 111 | return (sign * self.l1 - self.z[i]) / ((self.beta + sqrt(self.n[i])) / self.alpha + self.l2) 112 | 113 | def _predict(self, x): 114 | """ Get probability estimation on x 115 | 116 | INPUT: 117 | x: features 118 | dropped: if the weight was dropped 119 | OUTPUT: 120 | probability of p(y = 1 | x; w) 121 | """ 122 | wtx = sum([self._get_w(i) for i in x]) 123 | return FTRLProximal._get_prob(wtx) 124 | 125 | def _update(self, x, y): 126 | """ Update model using x, y 127 | 128 | INPUT: 129 | x: feature, a list of indices 130 | y: answer 131 | 132 | MODIFIES: 133 | self.n: increase by squared gradient 134 | self.z: weights 135 | """ 136 | 137 | w = [0.]*(len(x)+1) 138 | wtx = 0. 139 | for j, i in enumerate(x): 140 | if self.dropout > 0. 
and self.rnd.random() < self.dropout: 141 | w[j] = None 142 | else: 143 | w[j] = self._get_w(i) 144 | wtx += w[j] 145 | # wtx /= (1.-self.dropout) 146 | 147 | p = FTRLProximal._get_prob(wtx) 148 | g = p - y 149 | 150 | # update z and n 151 | for j, i in enumerate(x): 152 | # implement dropout as overfitting prevention 153 | if w[j] is None: 154 | continue 155 | 156 | sigma = (sqrt(self.n[i] + g * g) - sqrt(self.n[i])) / self.alpha 157 | self.z[i] += g - sigma * w[j] 158 | self.n[i] += g * g 159 | 160 | @staticmethod 161 | def _open_path(path): 162 | 163 | if not os.path.exists(path) and os.path.exists(path + '.gz'): 164 | path += '.gz' 165 | print('\nOpening %s to read.' % path) 166 | 167 | # noinspection PyUnresolvedReferences 168 | if path.endswith('.gz'): 169 | return gzip.open(path, mode='rt') 170 | else: 171 | return open(path, mode='rt') 172 | 173 | def _data(self, path): 174 | 175 | for row in DictReader(FTRLProximal._open_path(path)): 176 | 177 | data = FTRLProximal.InstanceData(x=[0] * (len(self.col_in_cat) + len(self.two_way) + 1), y=None, row=row) 178 | if self.col_out in row: 179 | row_val = row[self.col_out] 180 | data.y = float(row_val) if row_val == '0' or row_val == '1' else None 181 | 182 | # bias is 0 183 | ix = 1 184 | 185 | # one-hot encode features 186 | for col_nam in self.col_in_cat: 187 | data.x[ix] = self._calc_col_index(str(ix+self.seed) + "_" + row[col_nam]) 188 | ix += 1 189 | # one-hot encode two way features 190 | for cols_two_way in self.two_way: 191 | data.x[ix] = self._calc_col_index(str(ix+self.seed) + "_" + 192 | row[cols_two_way[0]] + "_" + row[cols_two_way[1]]) 193 | ix += 1 194 | yield data 195 | 196 | @staticmethod 197 | def log_loss(pred, actual): 198 | """ FUNCTION: Bounded logloss 199 | 200 | INPUT: 201 | p: our prediction 202 | y: real answer 203 | 204 | OUTPUT: 205 | logarithmic loss of p given y 206 | """ 207 | 208 | pred = max(min(pred, 1. - 10e-15), 10e-15) 209 | return -log(pred) if actual == 1. else -log(1. - pred) 210 | 211 | def fit(self, path): 212 | 213 | self._on_start_fit(path=path) 214 | 215 | print('\n\n\nTraining started...') 216 | fit_status = FTRLProximal.StatusData() 217 | 218 | for data in self._data(path): 219 | self._fit_instance(fit_status=fit_status, data=data) 220 | 221 | self._on_end_fit(fit_status=fit_status) 222 | 223 | def _on_start_fit(self, path): 224 | pass 225 | 226 | def _fit_instance(self, fit_status, data): 227 | 228 | fit_status.total_count += 1 229 | if fit_status.total_count % 20 == 0: 230 | p = self._predict(data.x) 231 | cur_loss = FTRLProximal.log_loss(p, data.y) 232 | fit_status.loss += cur_loss 233 | fit_status.loss_check += cur_loss 234 | fit_status.count += 1 235 | fit_status.count_check += 1 236 | 237 | self._update(data.x, data.y) 238 | 239 | if fit_status.total_count % 10000000 == 0: 240 | print('Epoch %d (%d instances), train logloss: %f (since last %f - %d samples), elapsed time: %s' % ( 241 | self.epoch, fit_status.total_count, fit_status.loss / fit_status.count, 242 | fit_status.loss_check / fit_status.count_check, fit_status.count_check, 243 | str(datetime.now() - fit_status.start))) 244 | fit_status.loss_check = 0. 
245 | fit_status.count_check = 0 246 | 247 | def _on_end_fit(self, fit_status): 248 | print('Epoch %d finished (%d total instances), train logloss: %f (since last %f - %d samples),' 249 | ' elapsed time: %s' % ( 250 | self.epoch, fit_status.total_count, fit_status.loss / fit_status.count, 251 | fit_status.loss_check / fit_status.count_check, fit_status.count_check, 252 | str(datetime.now() - fit_status.start))) 253 | fit_status.loss_check = 0. 254 | fit_status.count_check = 0 255 | self.epoch += 1 256 | 257 | def pred_proba(self, path): 258 | print('\n\n\nPredicting started...') 259 | pred_status = FTRLProximal.StatusData() 260 | pred_lst = [] 261 | 262 | for data in self._data(path): 263 | pred = self._predict(data.x) 264 | pred_lst.append(pred) 265 | self._update_pred_status(data=data, pred=pred, pred_status=pred_status) 266 | 267 | self._on_end_pred_proba(pred_status=pred_status) 268 | return pred_lst 269 | 270 | def save_pred_proba(self, inpath, outpath): 271 | 272 | print('\n\n\nSaving prediction started...') 273 | pred_status = FTRLProximal.StatusData() 274 | 275 | with self._open_pred_path(outpath) as out_file: 276 | self._write_pred_headers(out_file) 277 | for data in self._data(inpath): 278 | self._write_instance_prediction(pred_status=pred_status, out_file=out_file, data=data) 279 | 280 | self._on_end_pred_proba(pred_status=pred_status) 281 | 282 | def _write_pred_headers(self, out_file): 283 | out_file.write('%s\n' % ','.join(self.test_pred_extra_cols + [self.test_pred_col])) 284 | 285 | def _write_instance_prediction(self, pred_status, out_file, data): 286 | pred = self._predict(data.x) 287 | values = [data.row[col_nam] for col_nam in self.test_pred_extra_cols] + [str(pred)] 288 | out_file.write('%s\n' % ','.join(values)) 289 | self._update_pred_status(data=data, pred=pred, pred_status=pred_status) 290 | 291 | # noinspection PyMethodMayBeStatic 292 | def _update_pred_status(self, data, pred, pred_status): 293 | if data.y is not None: 294 | pred_status.loss += FTRLProximal.log_loss(pred, data.y) 295 | pred_status.count += 1 296 | pred_status.total_count += 1 297 | 298 | # noinspection PyMethodMayBeStatic 299 | def _on_end_pred_proba(self, pred_status): 300 | if pred_status.count > 0: 301 | print('Prediction logloss: %f (%d instances)' % 302 | (pred_status.loss / pred_status.count, pred_status.count)) 303 | print('Prediction saving time (%d instances): %s' % 304 | (pred_status.total_count, str(datetime.now() - pred_status.start))) 305 | 306 | def fit_and_save_pred_proba(self, inpath, outpath, train_is_test_col): 307 | 308 | self._on_start_fit(path=inpath) 309 | 310 | print('\n\n\nTraining and predicting started...') 311 | fit_status = FTRLProximal.StatusData() 312 | pred_status = FTRLProximal.StatusData() 313 | 314 | with self._open_pred_path(outpath) as out_file: 315 | self._write_pred_headers(out_file) 316 | for data in self._data(inpath): 317 | if data.row[train_is_test_col] == '1': 318 | self._write_instance_prediction(pred_status=pred_status, out_file=out_file, data=data) 319 | else: 320 | self._fit_instance(fit_status=fit_status, data=data) 321 | 322 | self._on_end_fit(fit_status=fit_status) 323 | self._on_end_pred_proba(pred_status=pred_status) 324 | 325 | @staticmethod 326 | def load_model(model_file): 327 | start = datetime.now() 328 | print('\n\nLoading model from \'%s\'' % model_file) 329 | with gzip.open(model_file, 'rb') as model_stream: 330 | model = pkl.load(model_stream) 331 | print('Loading model time: %s' % (str(datetime.now() - start))) 332 | return model 333 | 
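# NOTE (annotation, not in the original source): _get_w above is the
# FTRL-proximal closed-form weight from the McMahan et al. paper cited in
# the class docstring:
#     w_i = 0                                                    if |z_i| <= l1
#     w_i = (sign(z_i) * l1 - z_i) / ((beta + sqrt(n_i)) / alpha + l2)  otherwise
# and _update does the per-coordinate bookkeeping with gradient g = p - y:
#     sigma = (sqrt(n_i + g^2) - sqrt(n_i)) / alpha
#     z_i  += g - sigma * w_i
#     n_i  += g^2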
334 | @staticmethod 335 | def save_model(model, model_file): 336 | start = datetime.now() 337 | print('\n\nSaving model to \'%s\'' % model_file) 338 | with gzip.open(model_file, 'wb') as model_stream: 339 | pkl.dump(model, model_stream, pkl.HIGHEST_PROTOCOL) 340 | print('Saving model time: %s' % (str(datetime.now() - start))) 341 | 342 | 343 | def train_and_predict(train_file, train_is_test_col, test_file, test_pred_file, train_model_file, epochs, load_model, 344 | alpha, beta, l1, l2, bits, two_way, dropout, seed, 345 | col_out, col_in_cat, test_pred_col, test_pred_extra_cols, 346 | no_pred_suffix): 347 | 348 | start = datetime.now() 349 | 350 | if load_model and os.path.exists(train_model_file): 351 | learner = FTRLProximal.load_model(train_model_file) 352 | else: 353 | learner = FTRLProximal(alpha=alpha, beta=beta, l1=l1, l2=l2, bits=bits, two_way=two_way, 354 | dropout=dropout, seed=seed, 355 | col_out=col_out, col_in_cat=col_in_cat, 356 | test_pred_col=test_pred_col, 357 | test_pred_extra_cols=test_pred_extra_cols, no_pred_suffix=no_pred_suffix) 358 | 359 | # training 360 | if train_file is not None: 361 | for e in range(epochs): 362 | if train_is_test_col is None: 363 | learner.fit(path=train_file) 364 | else: 365 | learner.fit_and_save_pred_proba(inpath=train_file, outpath=test_pred_file, 366 | train_is_test_col=train_is_test_col) 367 | 368 | # predicting 369 | if test_file is not None: 370 | learner.save_pred_proba(inpath=test_file, outpath=test_pred_file) 371 | 372 | if train_model_file is not None: 373 | FTRLProximal.save_model(learner, train_model_file) 374 | 375 | print('Total elapsed time: %s' % (str(datetime.now() - start))) 376 | 377 | 378 | def arg_to_list(arg_map, name, sort=True): 379 | if not isinstance(arg_map[name], list): 380 | arg_map[name] = [arg_map[name]] 381 | if sort: 382 | arg_map[name] = sorted(arg_map[name]) 383 | 384 | 385 | def arg_build_2way(arg_map): 386 | 387 | two_way_arg = arg_map['two_way'] 388 | input_cols = sorted(arg_map['col_in_cat']) 389 | 390 | two_way_lst = [] 391 | two_way_added = set() 392 | 393 | l = len(input_cols) 394 | for two_way in two_way_arg: 395 | two_way = sorted(two_way.split(' ')) 396 | for i in range(l): 397 | for j in range(i + 1, l): 398 | if input_cols[i].startswith(two_way[0]) and input_cols[j].startswith(two_way[1]): 399 | input_key = input_cols[i] + '\t' + input_cols[j] 400 | if input_key not in two_way_added: 401 | two_way_added.add(input_key) 402 | two_way_lst.append([input_cols[i], input_cols[j]]) 403 | arg_map['two_way'] = two_way_lst 404 | 405 | 406 | def arg_replace(arg_map, name, old, new): 407 | if arg_map[name] is not None: 408 | arg_map[name] = arg_map[name].replace(old, new) 409 | 410 | if __name__ == "__main__": 411 | 412 | # -train_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/ftrl_05/data.val.all.csv 413 | # -train_is_test_col IsTestRow 414 | # -test_pred_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/ftrl_05/data.val.all.tmp.pred 415 | # -col_out IsClick 416 | # -col_in_cat AdID UserID Position 417 | # -col_query_id SearchID 418 | # -min_freq 1 419 | 420 | parser = argparse.ArgumentParser(description='Train and predict using FTRL proximal algorithm', 421 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 422 | 423 | parser.add_argument('-train_file', 424 | default=None, 425 | type=str, 426 | help='CSV with training data') 427 | 428 | parser.add_argument('-train_model_file', 429 | default=None, 430 | type=str, 431 | help='Path to save the trained model to be 
used later to resume training')
432 | 
433 |     parser.add_argument('-train_is_test_col',
434 |                         default=None,
435 |                         type=str,
436 |                         help='If given, it is assumed that test instances are written in TRAIN_FILE with'
437 |                              ' TRAIN_IS_TEST_COL set to 1 for test instances; those test instances will be'
438 |                              ' predicted on the fly in such cases')
439 | 
440 |     parser.add_argument('-test_file',
441 |                         default=None,
442 |                         type=str,
443 |                         help='CSV with testing data')
444 | 
445 |     parser.add_argument('-test_pred_file',
446 |                         default=None,
447 |                         type=str,
448 |                         help='Path to save predictions')
449 | 
450 |     parser.add_argument('-test_pred_col',
451 |                         default='Pred',
452 |                         type=str,
453 |                         help='Name of the prediction column that will be written to TEST_PRED_FILE')
454 | 
455 |     parser.add_argument('-test_pred_extra_cols',
456 |                         default=[],
457 |                         type=str,
458 |                         nargs='+',
459 |                         help='Extra columns that will be copied from TEST_FILE to TEST_PRED_FILE')
460 | 
461 |     parser.add_argument('-no_pred_suffix',
462 |                         default=False,
463 |                         action='store_true',
464 |                         help="Removes the epoch suffix from the prediction file")
465 | 
466 |     parser.add_argument('-load_model',
467 |                         default=False,
468 |                         action='store_true',
469 |                         help="Loads a saved model to resume training, if it exists")
470 | 
471 |     parser.add_argument('-col_out',
472 |                         required=True,
473 |                         type=str,
474 |                         help='Name of the output column')
475 | 
476 |     parser.add_argument('-col_in_cat',
477 |                         required=True,
478 |                         type=str,
479 |                         nargs='+',
480 |                         help='List of the names of the categorical input columns')
481 | 
482 |     parser.add_argument('-two_way',
483 |                         default=[],
484 |                         type=str,
485 |                         nargs='+',
486 |                         help='Two-way feature list in format \'F1 F2\', so all fields starting with \'F1\' '
487 |                              'will be combined with all fields starting with \'F2\'')
488 | 
489 |     parser.add_argument('-alpha',
490 |                         default=.1,
491 |                         type=float,
492 |                         help='Learning rate')
493 | 
494 |     parser.add_argument('-beta',
495 |                         default=1.,
496 |                         type=float,
497 |                         help='Smoothing parameter for adaptive learning rate')
498 | 
499 |     parser.add_argument('-l1',
500 |                         default=1.,
501 |                         type=float,
502 |                         help='L1 regularization; a larger value means more regularization')
503 | 
504 |     parser.add_argument('-l2',
505 |                         default=1.,
506 |                         type=float,
507 |                         help='L2 regularization; a larger value means more regularization')
508 | 
509 |     parser.add_argument('-bits',
510 |                         default=24,
511 |                         type=int,
512 |                         help='Bits to use with the hashing trick to define weights'
513 |                              ' (a -1 value will make it take longer but it will'
514 |                              ' make sure there will be no collisions)')
515 | 
516 |     parser.add_argument('-dropout',
517 |                         default=0.,
518 |                         type=float,
519 |                         help='Fraction of weights to drop out at each update')
520 | 
521 |     parser.add_argument('-seed',
522 |                         default=1,
523 |                         type=int,
524 |                         help='Seed to use for random operations (like dropout) and hash offset,'
525 |                              ' so changing the seed will also change the hash collisions')
526 | 
527 |     parser.add_argument('-epochs',
528 |                         default=1,
529 |                         type=int,
530 |                         help='Learn training data for N passes')
531 | 
532 |     args = vars(parser.parse_args())
533 | 
534 |     arg_to_list(arg_map=args, name='col_in_cat')
535 |     arg_to_list(arg_map=args, name='two_way')
536 |     arg_to_list(arg_map=args, name='test_pred_extra_cols')
537 | 
538 |     arg_build_2way(arg_map=args)
539 | 
540 |     arg_replace(arg_map=args, name='train_model_file', old='{TRAIN_FILE}', new=args['train_file'])
541 | 
542 |     print('\n\n' + (' '*100) + '\n\n')
543 |     print(args)
544 | 
545 |     with warnings.catch_warnings():
546 | 
warnings.filterwarnings("ignore", category=Warning) 547 | train_and_predict(**args) 548 | -------------------------------------------------------------------------------- /avito-context-click-py/train_pylearn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gc 3 | import os 4 | 5 | import numpy as np 6 | import csv 7 | import gzip 8 | 9 | from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix 10 | from pylearn2.utils.string_utils import preprocess 11 | from pylearn2.config import yaml_parse 12 | from pylearn2.utils import serial 13 | 14 | 15 | class CSVDataset(DenseDesignMatrix): 16 | def __init__(self, path, n_labels=2, start=None, stop=None, del_raw=True, x_only=False): 17 | self.del_raw = del_raw 18 | path = preprocess(path) 19 | 20 | x, y = CSVDataset._load_data(path, del_raw=del_raw) 21 | if np.isnan(np.min(y)): 22 | y = None 23 | else: 24 | y = y.astype(int).reshape(-1, 1) 25 | 26 | if start is not None: 27 | if stop is None: 28 | stop = x.shape[0] 29 | assert start >= 0 30 | assert start < stop 31 | if not (stop <= x.shape[0]): 32 | raise ValueError("stop must be less than the # of examples but " + 33 | "stop is " + str(stop) + " and there are " + str(x.shape[0]) + 34 | " examples.") 35 | x = x[start:stop, :] 36 | if y is not None: 37 | y = y[start:stop, :] 38 | 39 | if x_only: 40 | y = None 41 | n_labels = None 42 | 43 | super(CSVDataset, self).__init__(X=x, y=y, y_labels=n_labels) 44 | 45 | @staticmethod 46 | def _open_path(path): 47 | 48 | if not os.path.exists(path) and os.path.exists(path + '.gz'): 49 | path += '.gz' 50 | 51 | if path.endswith('.gz'): 52 | return gzip.open(path, mode='rt') 53 | else: 54 | return open(path, mode='rt') 55 | 56 | @staticmethod 57 | def _load_data(path, del_raw): 58 | 59 | npy_path = path + '.npz' 60 | 61 | if os.path.exists(npy_path): 62 | if not os.path.exists(path) or os.path.getmtime(npy_path) > os.path.getmtime(path): 63 | data = np.load(npy_path) 64 | return data['x'], data['y'] 65 | 66 | # Convert the .csv file to numpy 67 | y_list = [] 68 | x_list = [] 69 | with CSVDataset._open_path(path) as csv_file: 70 | 71 | invalid_y = {'', 'na', "nan", 'NA', 'NaN'} 72 | 73 | is_header = True 74 | for row in csv.reader(csv_file): 75 | if is_header: 76 | is_header = False 77 | else: 78 | y_list.append(float(row[0]) if row[0] not in invalid_y else np.nan) 79 | x_list.append(list(map(float, row[1:]))) 80 | 81 | x = np.array(x_list, dtype=np.float32) 82 | y = np.array(y_list, dtype=np.float32) 83 | 84 | np.savez_compressed(npy_path, x=x, y=y) 85 | if del_raw: 86 | os.remove(path) 87 | return x, y 88 | 89 | 90 | def train(config, config_args): 91 | 92 | # Load config replacing tags 93 | with open(config, 'r') as f: 94 | config = ''.join(f.readlines()) 95 | for nam in config_args: 96 | config = config.replace('${' + nam + "}", config_args[nam]) 97 | 98 | train_obj = yaml_parse.load(config) 99 | 100 | try: 101 | iter(train_obj) 102 | iterable = True 103 | except TypeError: 104 | iterable = False 105 | 106 | # # Undo our custom logging setup. 107 | # restore_defaults() 108 | # root_logger = logging.getLogger() 109 | # formatter = CustomFormatter(prefix='%(asctime)s ', only_from='pylearn2') 110 | # handler = CustomStreamHandler(formatter=formatter) 111 | # root_logger.addHandler(handler) 112 | # root_logger.setLevel(logging.INFO) 113 | 114 | if iterable: 115 | for number, subobj in enumerate(iter(train_obj)): 116 | # Execute this training phase. 
117 | subobj.main_loop() 118 | del subobj 119 | gc.collect() 120 | else: 121 | train_obj.main_loop() 122 | 123 | 124 | def add_config_args(cfg_args, cfg_dict=None): 125 | if cfg_dict is None: 126 | cfg_dict = {} 127 | cfg_args = vars(cfg_args) 128 | for nam in cfg_args: 129 | val = str(cfg_args[nam]) 130 | cfg_dict[nam] = val 131 | cfg_dict[nam.lower()] = val 132 | cfg_dict[nam.upper()] = val 133 | return cfg_dict 134 | 135 | 136 | def predict(model_file, test_data_file, test_pred_file): 137 | 138 | model = serial.load(model_file) 139 | dataset = CSVDataset(path=test_data_file, x_only=True) 140 | 141 | # use smallish batches to avoid running out of memory 142 | batch_size = 100 143 | model.set_batch_size(batch_size) 144 | # dataset must be multiple of batch size of some batches will have 145 | # different sizes. theano convolution requires a hard-coded batch size 146 | n_row = dataset.X.shape[0] 147 | extra = batch_size - n_row % batch_size 148 | assert (n_row + extra) % batch_size == 0 149 | if extra > 0: 150 | dataset.X = np.concatenate((dataset.X, np.zeros((extra, dataset.X.shape[1]), 151 | dtype=dataset.X.dtype)), axis=0) 152 | assert dataset.X.shape[0] % batch_size == 0 153 | 154 | x_theano = model.get_input_space().make_batch_theano() 155 | y_theano = model.fprop(x_theano) 156 | 157 | # from theano import tensor as T 158 | from theano import function 159 | f = function([x_theano], y_theano) 160 | 161 | y = [] 162 | for i in range(int(dataset.X.shape[0] / batch_size)): 163 | x_arg = dataset.X[i*batch_size:(i+1)*batch_size, :] 164 | if x_theano.ndim > 2: 165 | x_arg = dataset.get_topological_view(x_arg) 166 | y.append(f(x_arg.astype(x_theano.dtype))) 167 | 168 | y = np.concatenate(y) 169 | 170 | y = y[:n_row, :] 171 | 172 | # wirtes prediction to output 173 | n_col = y.shape[1] 174 | with open(test_pred_file, 'w') as out: 175 | for r in range(n_row): 176 | for c in range(n_col): 177 | if n_col == 2 and c == 0: 178 | continue 179 | if c > 0 and n_col > 2: 180 | out.write(',') 181 | out.write('%f' % (y[r, c])) 182 | 183 | out.write('\n') 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | # -train_config /home/lucas/ml-r-tb/contest/avito-context-click/data/template/zens_nn.yaml 189 | # -model_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/zens_nn/data.1.model.pkl 190 | # -train_data_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/zens_nn/data.1.tr.csv 191 | # -val_data_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/zens_nn/data.1.val.csv 192 | # -test_data_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/zens_nn/data.1.test.csv 193 | # -test_pred_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/zens_nn/data.1.test.pred 194 | # -n_features 5 195 | 196 | parser = argparse.ArgumentParser( 197 | description="Launch an training from a YAML configuration file.", 198 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 199 | ) 200 | 201 | parser.add_argument('-train_config', 202 | type=str, 203 | default=None, 204 | help='A YAML configuration file specifying the training procedure') 205 | 206 | parser.add_argument('-model_file', 207 | type=str, 208 | default=None, 209 | help='File with model path used to predict') 210 | 211 | parser.add_argument('-test_data_file', 212 | type=str, 213 | default=None, 214 | help='File data to predict') 215 | 216 | parser.add_argument('-test_pred_file', 217 | type=str, 218 | default=None, 219 | help='File to output predictions to') 220 | 221 | args, 
extra_args = parser.parse_known_args() 222 | for arg in extra_args: 223 | if arg.startswith("-"): 224 | parser.add_argument(arg, type=str) 225 | 226 | args = parser.parse_args() 227 | 228 | # if args.train_config is not None: 229 | # config_dict = add_config_args(cfg_args=args) 230 | # train(config=args.train_config, config_args=config_dict) 231 | 232 | if args.test_data_file is not None and args.model_file is not None and args.test_pred_file is not None: 233 | predict(model_file=args.model_file, test_data_file=args.test_data_file, test_pred_file=args.test_pred_file) 234 | -------------------------------------------------------------------------------- /avito-context-click-py/train_scikit.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import sklearn 3 | from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, \ 4 | RandomForestRegressor, RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor 5 | from sklearn.linear_model import LogisticRegression 6 | import sklearn.metrics as sk_metrics 7 | import argparse 8 | import pandas as pd 9 | import timeit 10 | import numpy as np 11 | import ml_metrics 12 | import ast 13 | import itertools 14 | import os 15 | import math 16 | import random 17 | from sklearn.pipeline import Pipeline 18 | import six 19 | import inspect 20 | import gzip 21 | # noinspection PyUnresolvedReferences 22 | from six.moves import cPickle as pkl 23 | 24 | 25 | def get_class(kls): 26 | parts = kls.split('.') 27 | module = ".".join(parts[:-1]) 28 | m = __import__(module) 29 | for comp in parts[1:]: 30 | m = getattr(m, comp) 31 | return m 32 | 33 | 34 | def predict_test(feat_importance_fun, mappings, model, na_fill_value, predict, silent, staged_predict, 35 | target_col, test_data_file, test_metric, test_pred_file, x_cols, metric_type, weight_col): 36 | if not silent: 37 | print("Predicting : %s to %s" % (test_data_file, test_pred_file)) 38 | 39 | test_x = load_pd_df(test_data_file) 40 | if mappings is not None: 41 | for col in test_x.columns: 42 | if col in mappings: 43 | test_x[col] = test_x[col].map(mappings[col]).fillna(na_fill_value) 44 | else: 45 | test_x[col] = test_x[col].fillna(na_fill_value) 46 | test_y = None 47 | if target_col in test_x.columns: 48 | test_y = test_x[target_col][test_x[target_col] != na_fill_value] 49 | test_y2 = test_x[target_col][pd.notnull(test_x[target_col])] 50 | if len(test_y) != len(test_x) or len(test_y2) != len(test_x): 51 | test_y = None 52 | del test_y2 53 | 54 | test_weight = None 55 | if weight_col is not None: 56 | if weight_col in test_x.columns: 57 | test_weight = test_x[weight_col] 58 | del test_x[weight_col] 59 | 60 | test_x = test_x[x_cols] 61 | 62 | test_pred = predict((model, test_x)) 63 | if test_pred.shape[1] == 1: 64 | test_pred = pd.DataFrame({'pred': test_pred[:, 0]}) 65 | elif test_pred.shape[1] == 2: 66 | test_pred = pd.DataFrame({'pred': test_pred[:, 1]}) 67 | else: 68 | test_pred_df = None 69 | for c in range(test_pred.shape[1]): 70 | if test_pred_df is None: 71 | test_pred_df = pd.DataFrame({'pred0': test_pred[:, c]}) 72 | else: 73 | test_pred_df['pred' + str(c)] = test_pred[:, c] 74 | test_pred = test_pred_df 75 | 76 | if not silent and test_y is not None: 77 | print_stages(test_y=test_y, stage_predictions=staged_predict((model, test_x)), 78 | test_metric=test_metric, metric_type=metric_type, test_weight=test_weight) 79 | 80 | if not silent: 81 | feat_importance = feat_importance_fun(model) 82 | if feat_importance is not 
None:
83 |             feat_importance = pd.DataFrame({'Features': x_cols,
84 |                                             'Importance': feat_importance})
85 |             pd.set_option('max_columns', len(test_x.columns))
86 |             pd.set_option('max_rows', len(test_x))
87 |             print("Feature importances:")
88 |             feat_importance.sort(columns='Importance', ascending=False, inplace=True)
89 |             feat_importance.index = range(1, len(feat_importance) + 1)
90 |             print(feat_importance)
91 | 
92 |     test_pred.to_csv(test_pred_file, index=False)
93 | 
94 | 
95 | def load_pd_df(file_name, del_old=True, bin_suffix='.bin.pkl'):
96 |     ret_val = None
97 |     bin_file_name = file_name + bin_suffix
98 |     if os.path.isfile(bin_file_name):
99 |         if not os.path.isfile(file_name) or os.path.getmtime(bin_file_name) > os.path.getmtime(file_name):
100 |             ret_val = load_model_bin(model_file=bin_file_name)
101 |             print("Loading %s cache file" % bin_file_name)
102 | 
103 |     if ret_val is None:
104 |         print("Loading %s raw file" % file_name)
105 |         ret_val = pd.read_csv(file_name)
106 |         print("Saving %s cache file" % bin_file_name)
107 |         save_model_bin(model=ret_val, model_file=bin_file_name)
108 |         if del_old:
109 |             print("Erasing %s raw file" % file_name)
110 |             os.remove(file_name)
111 | 
112 |     return ret_val
113 | 
114 | 
115 | def data_filter(data, filter_dict):
116 |     if len(filter_dict) > 0:
117 |         for filter_col in filter_dict:
118 |             data = data[data[filter_col] == filter_dict[filter_col]]
119 |     return data
120 | 
121 | 
122 | def train_and_predict(train_data_file, test_data_file, target_col, test_pred_file,
123 |                       test_data_file2, test_pred_file2,
124 |                       model_type, model_file, fit_args, test_metric, na_fill_value,
125 |                       silent, skip_mapping, load_model, train_filter, metric_type, load_type,
126 |                       bootstrap, bootstrap_seed, weight_col):
127 |     start = timeit.default_timer()
128 | 
129 |     train_x = load_pd_df(train_data_file)
130 | 
131 |     len_train_before = len(train_x)
132 |     train_x = data_filter(train_x, train_filter)
133 |     if not silent:
134 |         print("Train has %d instances (was %d before filtering)" % (len(train_x), len_train_before))
135 | 
136 |     mappings = None if skip_mapping else dict()
137 |     if mappings is not None:
138 |         data_all = train_x.append(load_pd_df(test_data_file))
139 |         if test_data_file2 is not None:
140 |             data_all = data_all.append(load_pd_df(test_data_file2))
141 |         if not silent:
142 |             print("Mapping unknown and category values...")
143 |         for col in train_x.columns:
144 |             if col not in [target_col]:
145 |                 if data_all[col].dtype == np.dtype('object'):
146 |                     s = np.unique(data_all[col].fillna(na_fill_value).values)
147 |                     mappings[col] = pd.Series([x[0] for x in enumerate(s)], index=s)
148 |                     train_x[col] = train_x[col].map(mappings[col]).fillna(na_fill_value)
149 |                 else:
150 |                     train_x[col] = train_x[col].fillna(na_fill_value)
151 |         del data_all
152 |     train_y = train_x[target_col]
153 |     del train_x[target_col]
154 | 
155 |     extra_fit_args = dict()
156 |     if weight_col is not None:
157 |         extra_fit_args['sample_weight'] = train_x[weight_col].values
158 |         del train_x[weight_col]
159 | 
160 |     if 0 < bootstrap < 1.0:
161 |         if bootstrap_seed is not None:
162 |             if not silent:
163 |                 print("Setting bootstrap seed to %d" % bootstrap_seed)
164 |             np.random.seed(bootstrap_seed)
165 |             random.seed(bootstrap_seed)
166 |         bootstrap_len = int(math.floor(bootstrap * len(train_x)))
167 |         bootstrap_ix = random.sample(range(len(train_x)), bootstrap_len)
168 |         train_x = train_x.iloc[bootstrap_ix]
169 |         train_x.reset_index()
170 |         train_y = train_y.iloc[bootstrap_ix]
171 |         train_y.reset_index()
172 | 
173 |     x_cols = 
train_x.columns 174 | feat_importance_fun = lambda fitted_model: fitted_model.feature_importances_ 175 | predict = lambda fitted_model, pred_x: fitted_model.predict(pred_x) 176 | staged_predict = lambda fitted_model, pred_x: [predict(fitted_model, pred_x)] 177 | 178 | model = None 179 | if load_model and os.path.exists(model_file): 180 | if not silent: 181 | print("Loading model %s" % model_file) 182 | model = load_model_bin(model_file=model_file) 183 | 184 | if model_type == "RandomForestRegressor": 185 | if model is None: 186 | model = RandomForestRegressor(**fit_args) 187 | model.fit(X=train_x, y=train_y, **extra_fit_args) 188 | predict = lambda fitted_model, pred_x: continuous_predict(model=fitted_model, x=pred_x) 189 | 190 | elif model_type == "RandomForestClassifier": 191 | if model is None: 192 | model = RandomForestClassifier(**fit_args) 193 | model.fit(X=train_x, y=train_y, **extra_fit_args) 194 | predict = lambda fitted_model, pred_x: pred_proba(model=fitted_model, x=pred_x) 195 | staged_predict = lambda fitted_model, pred_x: [predict(fitted_model, pred_x)] 196 | 197 | elif model_type == "ExtraTreesRegressor": 198 | if model is None: 199 | model = ExtraTreesRegressor(**fit_args) 200 | model.fit(X=train_x, y=train_y, **extra_fit_args) 201 | predict = lambda fitted_model, pred_x: continuous_predict(model=fitted_model, x=pred_x) 202 | 203 | elif model_type == "ExtraTreesClassifier": 204 | if model is None: 205 | model = ExtraTreesClassifier(**fit_args) 206 | model.fit(X=train_x, y=train_y, **extra_fit_args) 207 | predict = lambda fitted_model, pred_x: pred_proba(model=fitted_model, x=pred_x) 208 | staged_predict = lambda fitted_model, pred_x: [predict(fitted_model, pred_x)] 209 | 210 | elif model_type == "GradientBoostingRegressor": 211 | if model is None: 212 | model = GradientBoostingRegressor(**fit_args) 213 | model.fit(X=train_x, y=train_y, **extra_fit_args) 214 | elif load_type == "fit_more": 215 | model.warm_start = True 216 | model.n_estimators += fit_args['n_estimators'] 217 | model.fit(X=train_x, y=train_y) 218 | predict = lambda fitted_model, pred_x: continuous_predict(model=fitted_model, x=pred_x) 219 | staged_predict = lambda fitted_model, pred_x: staged_pred_continuous(model=fitted_model, x=pred_x) 220 | if load_type == "pred_at" and fit_args['n_estimators'] < model.n_estimators: 221 | if not silent: 222 | print("Predict using %d trees" % fit_args['n_estimators']) 223 | predict = lambda fitted_model, pred_x: staged_pred_continuous_at_n(model=fitted_model, x=pred_x, 224 | n=fit_args['n_estimators']) 225 | elif model_type == "GradientBoostingClassifier": 226 | if model is None: 227 | model = GradientBoostingClassifier(**fit_args) 228 | model.fit(X=train_x, y=train_y, **extra_fit_args) 229 | elif load_type == "fit_more": 230 | model.warm_start = True 231 | model.n_estimators += fit_args['n_estimators'] 232 | model.fit(X=train_x, y=train_y) 233 | staged_predict = lambda fitted_model, pred_x: staged_pred_proba(model=fitted_model, x=pred_x) 234 | predict = lambda fitted_model, pred_x: pred_proba(model=fitted_model, x=pred_x) 235 | if load_type == "pred_at" and fit_args['n_estimators'] < model.n_estimators: 236 | if not silent: 237 | print("Predict using %d trees" % fit_args['n_estimators']) 238 | predict = lambda fitted_model, pred_x: staged_pred_proba_at_n(model=fitted_model, x=pred_x, 239 | n=fit_args['n_estimators']) 240 | elif model_type == "LogisticRegression": 241 | if model is None: 242 | model = LogisticRegression(**fit_args) 243 | model.fit(X=train_x, y=train_y) 244 
| predict = lambda fitted_model, pred_x: pred_proba(model=fitted_model, x=pred_x) 245 | staged_predict = lambda fitted_model, pred_x: [predict(fitted_model, pred_x)] 246 | feat_importance_fun = lambda fitted_model: None 247 | 248 | elif model_type == "SVC": 249 | if model is None: 250 | model = sklearn.svm.SVC(**fit_args) 251 | model.fit(X=train_x, y=train_y) 252 | predict = lambda fitted_model, pred_x: pred_proba(model=fitted_model, x=pred_x) 253 | staged_predict = lambda fitted_model, pred_x: [predict(fitted_model, pred_x)] 254 | feat_importance_fun = lambda fitted_model: None 255 | 256 | elif model_type == "Pipeline": 257 | if model is None: 258 | model = Pipeline([ 259 | ('pre_process', get_class(fit_args['pre_process']['name'])(**fit_args['pre_process']['args'])), 260 | ('model', get_class(fit_args['model']['name'])(**fit_args['model']['args'])) 261 | ]) 262 | model.fit(X=train_x, y=train_y) 263 | predict = lambda fitted_model, pred_x: pred_proba(model=fitted_model, x=pred_x) 264 | staged_predict = lambda fitted_model, pred_x: [predict(fitted_model, pred_x)] 265 | feat_importance_fun = lambda fitted_model: None 266 | 267 | if not silent: 268 | print("Saving model %s" % model_file) 269 | save_model_bin(model=model, model_file=model_file) 270 | 271 | if not silent: 272 | stop = timeit.default_timer() 273 | print("Train time: %d s" % (stop - start)) 274 | 275 | del train_x, train_y 276 | 277 | start_pred = timeit.default_timer() 278 | predict_test(feat_importance_fun=feat_importance_fun, 279 | mappings=mappings, 280 | model=model, 281 | na_fill_value=na_fill_value, 282 | predict=predict, 283 | silent=silent, 284 | staged_predict=staged_predict, 285 | target_col=target_col, 286 | test_data_file=test_data_file, 287 | test_metric=test_metric, 288 | test_pred_file=test_pred_file, 289 | x_cols=x_cols, 290 | metric_type=metric_type, 291 | weight_col=weight_col) 292 | if not silent: 293 | stop = timeit.default_timer() 294 | print("Predict time: %d s" % (stop - start_pred)) 295 | 296 | if test_data_file2 is not None: 297 | start_pred = timeit.default_timer() 298 | predict_test(feat_importance_fun=lambda fitted_model: None, 299 | mappings=mappings, 300 | model=model, 301 | na_fill_value=na_fill_value, 302 | predict=predict, 303 | silent=silent, 304 | staged_predict=staged_predict, 305 | target_col=target_col, 306 | test_data_file=test_data_file2, 307 | test_metric=test_metric, 308 | test_pred_file=test_pred_file2, 309 | x_cols=x_cols, 310 | metric_type=metric_type, 311 | weight_col=weight_col) 312 | if not silent: 313 | stop = timeit.default_timer() 314 | print("Predict2 time: %d s" % (stop - start_pred)) 315 | 316 | if not silent: 317 | stop = timeit.default_timer() 318 | print("Total time: %d s" % (stop - start)) 319 | 320 | 321 | def staged_pred_proba(model, x): 322 | for pred in model.staged_predict_proba(x): 323 | yield prob_pred(pred) 324 | 325 | 326 | def staged_pred_proba_at_n(model, x, n): 327 | return nth(staged_pred_proba(model=model, x=x), n) 328 | 329 | 330 | def pred_proba(model, x): 331 | return prob_pred(model.predict_proba(X=x)) 332 | 333 | 334 | def prob_pred(pred): 335 | return pred 336 | 337 | 338 | def staged_pred_continuous(model, x): 339 | for pred in model.staged_predict(x): 340 | yield to_2dim(pred) 341 | 342 | 343 | def staged_pred_continuous_at_n(model, x, n): 344 | return nth(staged_pred_continuous(model=model, x=x), n) 345 | 346 | 347 | def continuous_predict(model, x): 348 | return to_2dim(model.predict(X=x)) 349 | 350 | 351 | def to_2dim(array_val): 352 | return 
np.array(array_val, ndmin=2).transpose() 353 | 354 | 355 | def nth(iterable, n): 356 | return next(itertools.islice(iterable, n, None)) 357 | 358 | 359 | def avg_eval_metric(eval_metric, test_y, prediction, metric_type): 360 | if prediction.shape[1] == 1: 361 | return eval_metric(test_y, prediction[:, 0]) 362 | elif prediction.shape[1] == 2: 363 | return eval_metric(test_y, prediction[:, 1]) 364 | else: 365 | metric_val = 0.0 366 | metric_count = 0.0 367 | if metric_type == "cumulative": 368 | cur_pred = np.zeros(prediction.shape[0]) 369 | for c in range(prediction.shape[1] - 1): 370 | # noinspection PyTypeChecker 371 | cur_actual = np.array(np.array(test_y) <= c).astype(int) 372 | cur_pred += prediction[:, c] 373 | metric_val += eval_metric(cur_actual, cur_pred) 374 | metric_count += 1.0 375 | else: 376 | for c in range(prediction.shape[1]): 377 | # noinspection PyTypeChecker 378 | cur_actual = np.array(np.array(test_y) == c).astype(int) 379 | metric_val += eval_metric(cur_actual, prediction[:, c]) 380 | metric_count += 1.0 381 | if metric_type == "sum": 382 | metric_count = 1.0 383 | return metric_val / metric_count 384 | 385 | 386 | def print_stages(test_y, stage_predictions, test_metric, metric_type, test_weight): 387 | if hasattr(ml_metrics, test_metric): 388 | eval_metric = getattr(ml_metrics, test_metric) 389 | else: 390 | eval_metric = getattr(sk_metrics, test_metric) 391 | if test_weight is not None: 392 | metric_args = inspect.getargspec(eval_metric)[0] 393 | if 'weight' in metric_args: 394 | eval_metric_orig = eval_metric 395 | eval_metric = lambda act, pred: eval_metric_orig(act, pred, test_weight) 396 | count = 0 397 | iters = [] 398 | loss = [] 399 | count_factor = 50 400 | for prediction in stage_predictions: 401 | count += 1 402 | if count in [1, 10, 50] or count % count_factor == 0: 403 | iters.append(count) 404 | loss.append(avg_eval_metric(eval_metric, test_y, prediction, metric_type=metric_type)) 405 | if count > 1000: 406 | count_factor = 500 407 | elif count > 500: 408 | count_factor = 200 409 | elif count > 250: 410 | count_factor = 100 411 | loss_df = pd.DataFrame({'Iteration': iters, 'Loss': loss}) 412 | loss_df.rename(columns={'Loss': test_metric}, inplace=True) 413 | pd.set_option('max_columns', len(loss_df.columns)) 414 | pd.set_option('max_rows', len(loss_df)) 415 | print("Loss:") 416 | print(loss_df) 417 | 418 | 419 | def load_model_bin(model_file): 420 | with gzip.open(model_file, 'rb') as model_file: 421 | return pkl.load(model_file) 422 | 423 | 424 | def save_model_bin(model, model_file): 425 | with gzip.open(model_file, 'wb') as model_file: 426 | pkl.dump(model, model_file, pkl.HIGHEST_PROTOCOL) 427 | 428 | 429 | if __name__ == "__main__": 430 | 431 | parser = argparse.ArgumentParser(description='Train and predict data using some sklearn algorithms.', 432 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 433 | 434 | parser.add_argument('-train_data_file', 435 | # default='../data/output-py/gbr_01/gbr_01_k_1_tr.csv', 436 | required=True, 437 | type=str, 438 | help='CSV with training data.') 439 | 440 | parser.add_argument('-test_data_file', 441 | # default='../data/output-py/gbr_01/gbr_01_k_1_test.csv', 442 | required=True, 443 | type=str, 444 | help='CSV with testing data.') 445 | 446 | parser.add_argument('-test_data_file2', 447 | default=None, 448 | type=str, 449 | help='CSV with testing data.') 450 | 451 | parser.add_argument('-test_pred_file', 452 | # default='../data/output-py/gbr_01/gbr_01_k_1_test_pred.csv', 453 | required=True, 454 | 
type=str, 455 | help='Path to output testing predictions.') 456 | 457 | parser.add_argument('-test_pred_file2', 458 | default=None, 459 | type=str, 460 | help='Path to output testing predictions.') 461 | 462 | parser.add_argument('-test_metric', 463 | # default='normalized_weighted_gini', 464 | required=True, 465 | type=str, 466 | help='Metric to compute on test set. Any metric on ml_metrics or sklearn.metrics') 467 | 468 | parser.add_argument('-target_col', 469 | # default='target', 470 | required=True, 471 | type=str, 472 | help='Name of target variable.') 473 | 474 | parser.add_argument('-weight_col', 475 | # default='weight', 476 | default=None, 477 | type=str, 478 | help='Name of weight column.') 479 | 480 | parser.add_argument('-metric_type', 481 | default='auto', 482 | type=str, 483 | help='Type of metric to evaluate.', 484 | choices=[ 485 | "auto", 486 | "cumulative", 487 | "sum"]) 488 | 489 | parser.add_argument('-model_type', 490 | # default='GradientBoostingRegressor', 491 | required=True, 492 | type=str, 493 | help='Type of model to fit.', 494 | choices=["RandomForestRegressor", 495 | "RandomForestClassifier", 496 | "ExtraTreesRegressor", 497 | "ExtraTreesClassifier", 498 | "GradientBoostingRegressor", 499 | "GradientBoostingClassifier", 500 | "LogisticRegression", 501 | "SVC", 502 | "Pipeline"]) 503 | 504 | parser.add_argument('-model_file', 505 | # default='../data/output-py/gbr_01/gbr_01_k_1_tr.csv.pkl', 506 | required=True, 507 | type=str, 508 | help='File to save the model to.') 509 | 510 | parser.add_argument('-na_fill_value', 511 | default=-20000, 512 | type=int, 513 | help='Value to fill in NAs.') 514 | 515 | parser.add_argument('-skip_mapping', 516 | # default=True, 517 | default=False, 518 | action='store_true', 519 | help='Skip na filling and category mapping.') 520 | 521 | parser.add_argument('-fit_args', 522 | # default='{\"n_estimators\": 10, \"learning_rate\": 0.001, \"loss\": \"ls\", ' 523 | # '\"max_features\": 5, \"max_depth\": 7, \"random_state\": 788954, ' 524 | # '\"subsample\": 1, \"verbose\": 50}', 525 | required=True, 526 | type=str, 527 | help='String in dictionary form of fit params.') 528 | 529 | parser.add_argument('-silent', 530 | default=False, 531 | action='store_true', 532 | help="Don't print execution information.") 533 | 534 | parser.add_argument('-train_filter', 535 | default='{}', 536 | type=str, 537 | help="String in dictionary form of filters applied to the training data.") 538 | 539 | parser.add_argument('-load_model', 540 | default=False, 541 | action='store_true', 542 | help="Load saved model if it exists.") 543 | 544 | parser.add_argument('-load_type', 545 | default='fit_more', 546 | type=str, 547 | help='Type of model loading.', 548 | choices=[ 549 | "fit_more", 550 | "pred_at"]) 551 | 552 | parser.add_argument('-bootstrap', 553 | default=0, 554 | type=float, 555 | help="Do bootstrap sampling.") 556 | 557 | parser.add_argument('-bootstrap_seed', 558 | default=None, 559 | type=int, 560 | help='Bootstrap seed.') 561 | 562 | args = vars(parser.parse_args()) 563 | 564 | args['train_filter'] = ast.literal_eval(args['train_filter']) 565 | args['fit_args'] = ast.literal_eval(args['fit_args']) 566 | for key in args['fit_args']: 567 | if isinstance(args['fit_args'][key], six.string_types): 568 | if args['fit_args'][key] in args: 569 | args['fit_args'][key] = args[args['fit_args'][key]] 570 | 571 | if not args['silent']: 572 | print(args) 573 | 574 | with warnings.catch_warnings(): 575 | warnings.filterwarnings("ignore", category=Warning) 576 | train_and_predict(**args) 577 |
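One pattern worth noting from train_scikit.py above: rather than refitting a boosted model once per iteration count, print_stages walks the estimator's staged prediction generator and scores only sampled iterations. Below is a minimal, self-contained sketch of that idea on synthetic data (plain scikit-learn; the dataset and parameter values are illustrative, not the script's):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss

# synthetic stand-in for the competition data
X, y = make_classification(n_samples=2000, random_state=0)
X_tr, y_tr, X_te, y_te = X[:1500], y[:1500], X[1500:], y[1500:]

model = GradientBoostingClassifier(n_estimators=200, random_state=0)
model.fit(X_tr, y_tr)

# staged_predict_proba yields one probability matrix per boosting round,
# so the whole loss curve costs a single pass over the test set
for i, proba in enumerate(model.staged_predict_proba(X_te), start=1):
    if i in (1, 10, 50) or i % 50 == 0:  # same sampling idea as print_stages
        print(i, log_loss(y_te, proba[:, 1]))

Because it is a generator, each staged probability matrix is scored and discarded before the next boosting round is materialized, so memory stays flat.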
-------------------------------------------------------------------------------- /avito-context-click-py/train_xgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | import xgboost as xgb 4 | import os 5 | 6 | if __name__ == "__main__": 7 | 8 | parser = argparse.ArgumentParser(description='train xgb model', 9 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 10 | 11 | parser.add_argument( 12 | '-train', 13 | default="/home/lucas/ml-r-tb/contest/avito-context-click/avito-context-click-r/" 14 | "../data/output-r/data.val.tr.full.libsvm.head#data.val.tr.full.cache", 15 | type=str, 16 | help='LibSVM file with training data') 17 | parser.add_argument( 18 | '-test', 19 | default="/home/lucas/ml-r-tb/contest/avito-context-click/avito-context-click-r/" 20 | "../data/output-r/data.val.tr.full.libsvm.head#data.val.tr.full.cache", 21 | type=str, 22 | help='LibSVM file with test data') 23 | parser.add_argument( 24 | '-pred', 25 | default="/home/lucas/ml-r-tb/contest/avito-context-click/avito-context-click-r/" 26 | "../data/output-r/data.val.tt.full.pred", 27 | type=str, 28 | help='Prediction path') 29 | parser.add_argument( 30 | '-epoch', 31 | default=4, 32 | type=int, 33 | help='Epochs') 34 | 35 | args = parser.parse_args() 36 | 37 | pred_file = args.pred 38 | 39 | xg_params = { 40 | 'objective': 'binary:logistic', 41 | 'eta': 0.2, 42 | 'max_depth': 10, 43 | 'eval_metric': 'logloss', 44 | 'silent': 1, 45 | 'nthread': 6, 46 | 'gamma': 0.8, 47 | 'min_child_weight': 4, 48 | 'colsample_bytree': 0.7, 49 | 'colsample_bylevel': 0.8 50 | } 51 | num_round = 75 52 | xg_data_tr = xgb.DMatrix(args.train) 53 | xg_data_tst = xgb.DMatrix(args.test) 54 | 55 | for e in range(args.epoch): 56 | print("processing iteration %d" % e) 57 | xg_params['seed'] = 3015 + (10*e) 58 | model = xgb.train( 59 | params=list(xg_params.items()), 60 | dtrain=xg_data_tr, 61 | num_boost_round=num_round, 62 | evals=[(xg_data_tst, 'val')]  # watchlist scored with eval_metric each round 63 | ) 64 | pred_list = model.predict(xg_data_tst) 65 | pred_list = [round(x, 6) for x in pred_list] 66 | preds = pd.DataFrame(pred_list, columns=['IsClick']) 67 | preds.to_csv(os.path.splitext(pred_file)[0] + '.epoch' + str(e) + '.csv', index=False) 68 | -------------------------------------------------------------------------------- /avito-context-click-py/train_xgb_dtry.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sys 4 | import getopt 5 | from sklearn import feature_extraction 6 | from sklearn.cross_validation import StratifiedKFold 7 | from sklearn.preprocessing import LabelEncoder 8 | #sys.path.append('/Users/ef/xgboost/wrapper') 9 | import xgboost as xgb 10 | import random 11 | from sklearn.metrics import log_loss 12 | import os 13 | 14 | def load_train_data(path): 15 | df = pd.read_csv(path) 16 | y = df['IsClick'].values 17 | ids = df['ID'].values 18 | df.drop(['IsClick','ID'], axis=1, inplace=True) 19 | #df.drop(['AdTitleWordLikeli'], axis=1, inplace=True) 20 | #df.drop(['AdIDRareWordCount'], axis=1, inplace=True) 21 | #df.drop(['IPIDlikeli'], axis=1, inplace=True) 22 | #df.drop(['IPIDUserAgentOSIDlikeli'], axis=1, inplace=True) 23 | #df.drop(['UserAdViewTotalCount', 'UserAdViewUniqueCount','UserAdViewTotalCount2', 'UserAdViewUniqueCount2'], axis=1, inplace=True) 24 | #df.drop(['LocationUserUniqueCount', 'CategoryUserUniqueCount'], axis=1, inplace=True) 25 | #df.drop(['AdPosition1Count', 'AdPosition7Count'], axis=1, inplace=True) 26 |
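# The commented-out drops here and below look like leftovers from manual
# feature-selection runs; uncommenting a line excludes that feature group
# before training.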
#df.drop(['UserAgentIDlikeli', 'UserAgentOSIDlikeli', 'UserDeviceIDlikeli', 'UserAgentFamilyIDlikeli'], axis=1, inplace=True) 27 | #df.drop(['AdCategoryPriceDeviation'], axis=1, inplace=True) 28 | X = df.values.copy().astype(np.float32) 29 | #np.random.seed(seed=2015) 30 | #np.random.shuffle(X) 31 | return X, y, ids 32 | 33 | def load_test_data(path): 34 | df = pd.read_csv(path) 35 | if 'IsClick' in df.columns.values: 36 | df.drop(['IsClick'], axis=1, inplace=True) 37 | ids = df['ID'].values 38 | df.drop(['ID'], axis=1, inplace=True) 39 | #df.drop(['AdTitleWordLikeli'], axis=1, inplace=True) 40 | #df.drop(['AdIDRareWordCount'], axis=1, inplace=True) 41 | #df.drop(['IPIDlikeli'], axis=1, inplace=True) 42 | #df.drop(['IPIDUserAgentOSIDlikeli'], axis=1, inplace=True) 43 | #df.drop(['UserAdViewTotalCount', 'UserAdViewUniqueCount','UserAdViewTotalCount2', 'UserAdViewUniqueCount2'], axis=1, inplace=True) 44 | #df.drop(['LocationUserUniqueCount', 'CategoryUserUniqueCount'], axis=1, inplace=True) 45 | #df.drop(['AdPosition1Count', 'AdPosition7Count'], axis=1, inplace=True) 46 | #df.drop(['UserAgentIDlikeli', 'UserAgentOSIDlikeli', 'UserDeviceIDlikeli', 'UserAgentFamilyIDlikeli'], axis=1, inplace=True) 47 | #df.drop(['AdCategoryPriceDeviation'], axis=1, inplace=True) 48 | X = df.values.copy().astype(np.float32) 49 | return X, ids 50 | 51 | opts, args = getopt.getopt(sys.argv[1:], "t:v:p:e:", ["train=", "test=", "pred=", "epoch="]) 52 | opts = {x[0]:x[1] for x in opts} 53 | train_file = opts['--train'] 54 | test_file = opts['--test'] 55 | pred_file = opts['--pred'] 56 | epoch = int(opts['--epoch']) 57 | 58 | X, y, ids_train = load_train_data(train_file) 59 | X_test, ids_test = load_test_data(test_file) 60 | num_features = X.shape[1] 61 | 62 | param = {} 63 | param['objective'] = 'binary:logistic' 64 | param['eta'] = 0.2 #0.1 65 | param['max_depth'] = 10 66 | param['eval_metric'] = 'logloss' 67 | param['silent'] = 1 68 | param['nthread'] = 6 69 | param['gamma'] = 0.8 70 | param['min_child_weight'] = 4 #10 71 | #param['subsample'] = 0.8 72 | param['colsample_bytree'] = 0.7 73 | param['colsample_bylevel'] = 0.8 74 | num_round = 75 #85 75 | 76 | for e in range(epoch): 77 | print "processing iteration", e 78 | param['seed'] = 3015 + (10*e) 79 | plst = list(param.items()) 80 | 81 | index_shuffle = [i for i in range(X.shape[0])] 82 | random.shuffle(index_shuffle) 83 | xgmat_train = xgb.DMatrix( X[index_shuffle,:], label=y[index_shuffle], missing = -1.0) 84 | bst = xgb.train( plst, xgmat_train, num_round ); 85 | #fscore = [ (v,k) for k,v in bst.get_fscore().iteritems() ] 86 | #fscore.sort(reverse=True) 87 | #print fscore 88 | xgmat_test = xgb.DMatrix( X_test, missing = -1.0 ) 89 | pred_list = bst.predict( xgmat_test ) 90 | pred_list = [round(x, 5) for x in pred_list] 91 | preds = pd.DataFrame(pred_list, columns=['IsClick']) 92 | preds['ID'] = ids_test 93 | preds.to_csv(os.path.splitext(pred_file)[0] + '.epoch' + str(e) + '.csv', index=False) 94 | -------------------------------------------------------------------------------- /avito-context-click-py/util_rpython.py: -------------------------------------------------------------------------------- 1 | from sklearn.calibration import CalibratedClassifierCV 2 | from sklearn.cross_validation import KFold 3 | import numpy as np 4 | 5 | 6 | def calibrate_probs(y_val, prob_val, prob_test, n_folds=2, method='isotonic', random_state=5968): 7 | """ Calling from R: 8 | 9 | suppressMessages(library("rPython")) # Load RPython 10 | python.load("path/to/util_rpython.py") 
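# Optional args (n_folds=2, method='isotonic', random_state=5968) keep
# the Python defaults from the signature above unless passed explicitly.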
11 | 12 | data.pred.calib <- python.call('calibrate_probs', 13 | y_val=y_val, # Actual values from validation 14 | prob_val=pred_val, # Predicted values from validation 15 | prob_test=pred_test) # Predicted values from test 16 | 17 | # data.pred.calib will be a list, so to get the calibrated predictions for each value we do: 18 | calib_pred_val = data.pred.calib$val 19 | calib_pred_test = data.pred.calib$test 20 | 21 | """ 22 | 23 | y_val = np.asarray(y_val, dtype=float) 24 | prob_val = np.asarray(prob_val, dtype=float).reshape((-1, 1)) 25 | prob_test = np.asarray(prob_test, dtype=float).reshape((-1, 1)) 26 | 27 | prob_clb_val = np.zeros(len(y_val)) 28 | prob_clb_test = np.zeros(len(prob_test)) 29 | 30 | kf_val_full = KFold(len(y_val), n_folds=n_folds, random_state=random_state) 31 | 32 | for ix_train, ix_test in kf_val_full: 33 | kf_val_inner = KFold(len(ix_train), n_folds=n_folds, random_state=random_state) 34 | clf = CalibratedClassifierCV(method=method, cv=kf_val_inner) 35 | clf.fit(prob_val[ix_train], y_val[ix_train]) 36 | prob_clb_val[ix_test] = clf.predict_proba(prob_val[ix_test])[:, 1] 37 | prob_clb_test += clf.predict_proba(prob_test)[:, 1]/n_folds 38 | 39 | return {'val': list(prob_clb_val), 'test': list(prob_clb_test)} 40 | -------------------------------------------------------------------------------- /avito-context-click-r/.Rapp.history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/avito-context-click-r/.Rapp.history -------------------------------------------------------------------------------- /avito-context-click-r/.Rprofile: -------------------------------------------------------------------------------- 1 | 2 | source("_fn.base.R") 3 | -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/graphics-r3/empty.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/avito-context-click-r/.Rproj.user/4B3CD3A5/graphics-r3/empty.png -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/pcs/files-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "path" : "~/ml-r-tb/contest/avito-context-click/avito-context-click-r", 3 | "sortOrder" : [ 4 | { 5 | "ascending" : true, 6 | "columnIndex" : 2 7 | } 8 | ] 9 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/pcs/source-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "activeTab" : -1 3 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/pcs/windowlayoutstate.pper: -------------------------------------------------------------------------------- 1 | { 2 | "left" : { 3 | "panelheight" : 582, 4 | "splitterpos" : 261, 5 | "topwindowstate" : "HIDE", 6 | "windowheight" : 653 7 | }, 8 | "right" : { 9 | "panelheight" : 582, 10 | "splitterpos" : 391, 11 | "topwindowstate" : "NORMAL", 12 | "windowheight" : 653 13 | } 14 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/pcs/workbench-pane.pper: 
-------------------------------------------------------------------------------- 1 | { 2 | "TabSet1" : 2, 3 | "TabSet2" : 1 4 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/persistent-state: -------------------------------------------------------------------------------- 1 | abend="1" 2 | active-client-id="ceb8a635-f5e6-4d93-be42-1e0ba4cb32c2" 3 | -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/saved_source_markers: -------------------------------------------------------------------------------- 1 | {"active_set":"","sets":[]} -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/19AE70ED: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/23B66537: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/266CD89: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/3B4F6947: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/6A3DD511: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/6EC2D3AD: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/6F75496B: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/9EF9E6CB: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/A7A18FB5: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/BBE2842F: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/C2BB45F6: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/D970D594: -------------------------------------------------------------------------------- 1 | { 2 | } 
-------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/F8AE1A87: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/FB96E70: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/INDEX: -------------------------------------------------------------------------------- 1 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Fdata.build.R="C2BB45F6" 2 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Fdata.combine.R="FB96E70" 3 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.fm.01.R="9EF9E6CB" 4 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.fm.02.R="23B66537" 5 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.fm.03.R="F8AE1A87" 6 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.fm.04.R="A7A18FB5" 7 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.fm.05.R="6EC2D3AD" 8 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.ftrl.04.R="266CD89" 9 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.ftrl.05.R="BBE2842F" 10 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.ftrl.06.R="19AE70ED" 11 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.xgb.03.R="3B4F6947" 12 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.xgb.05.R="6F75496B" 13 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l2.xgb.02.R="6A3DD511" 14 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.zens.R="D970D594" 15 | -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/s-8FDFA111/lock_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/s-8FDFA111/lock_file -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/73D1DC80/persistent-state: -------------------------------------------------------------------------------- 1 | build-last-errors="[]" 2 | build-last-errors-base-dir="" 3 | build-last-outputs="[]" 4 | compile_pdf_state="{\"errors\":[],\"output\":\"\",\"running\":false,\"tab_visible\":false,\"target_file\":\"\"}" 5 | console_procs="[]" 6 | files.monitored-path="" 7 | find-in-files-state="{\"handle\":\"\",\"input\":\"\",\"path\":\"\",\"regex\":false,\"results\":{\"file\":[],\"line\":[],\"lineValue\":[],\"matchOff\":[],\"matchOn\":[]},\"running\":false}" 8 | imageDirtyState="0" 9 | saveActionState="0" 10 | -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/73D1DC80/saved_source_markers: -------------------------------------------------------------------------------- 1 | {"active_set":"","sets":[]} 
-------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/pcs/files-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "path" : "~/Documents/eclipse/AvitoContext2015/final_model/avito-context-click-r", 3 | "sortOrder" : [ 4 | { 5 | "ascending" : true, 6 | "columnIndex" : 2 7 | } 8 | ] 9 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/pcs/source-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "activeTab" : -1 3 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/pcs/windowlayoutstate.pper: -------------------------------------------------------------------------------- 1 | { 2 | "left" : { 3 | "panelheight" : 737, 4 | "splitterpos" : 310, 5 | "topwindowstate" : "MAXIMIZE", 6 | "windowheight" : 775 7 | }, 8 | "right" : { 9 | "panelheight" : 737, 10 | "splitterpos" : 465, 11 | "topwindowstate" : "HIDE", 12 | "windowheight" : 775 13 | } 14 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/pcs/workbench-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "TabSet1" : 0, 3 | "TabSet2" : 0 4 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/186F7A30: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/2191579E: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/30E2EF33: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/31E02146: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/396ED20: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/3ACE6FF1: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/51638EA1: -------------------------------------------------------------------------------- 1 | { 2 | "tempName" : "Untitled1" 3 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/5C8CFDEE: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/61214431: 
-------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/6D0737A6: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/84994D68: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/B4D42750: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/B9BF59A7: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/BD7C1F23: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/C3A0AE51: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/D0B7BE74: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/DC31BA58: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/E0E39D19: -------------------------------------------------------------------------------- 1 | { 2 | "tempName" : "Untitled1" 3 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/F66A3FFA: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/FC1B5EBA: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/INDEX: -------------------------------------------------------------------------------- 1 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2F_fn.base.R="B4D42750" 2 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2F_utils.R="396ED20" 3 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Fdata.build.R="C3A0AE51" 4 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Fdata.build.dtry.R="DC31BA58" 5 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Fdata.build.tree.R="51638EA1" 6 | 
~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Fdata.combine.R="BD7C1F23" 7 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Fmain.R="E0E39D19" 8 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.fm.01.R="5C8CFDEE" 9 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.fm.02.R="3ACE6FF1" 10 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.fm.03.R="2191579E" 11 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.fm.04.R="F66A3FFA" 12 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.fm.05.R="D0B7BE74" 13 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.ftrl.04.R="31E02146" 14 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.ftrl.05.R="84994D68" 15 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.ftrl.06.R="6D0737A6" 16 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.xgb.03.R="B9BF59A7" 17 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.xgb.05.R="61214431" 18 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l2.xgb.02.R="30E2EF33" 19 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.xgb.dtry.R="FC1B5EBA" 20 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.zens.R="186F7A30" 21 | -------------------------------------------------------------------------------- /avito-context-click-r/_fn.base.cpp: -------------------------------------------------------------------------------- 1 | #include <Rcpp.h> 2 | #include <cmath> 3 | 4 | #define MIN(a, b) ((a < b) ? a : b) 5 | #define MAX(a, b) ((a > b) ? a : b) 6 | #define BOUND(x, xmin, xmax) (MAX(xmin, MIN(xmax, x))) 7 | #define LOG_LOSS(act, pred) (-(act * log(pred) + (1 - act) * log(1 - pred))) 8 | 9 | using namespace Rcpp; 10 | 11 | 12 | double fn_opt_base(NumericMatrix x, NumericVector y, NumericVector params, int type) { 13 | 14 | int rows = x.nrow(); 15 | int cols = x.ncol(); 16 | 17 | int offset_params = 0; 18 | double bias = 0.0; 19 | if (params.size() > cols) { 20 | offset_params = 1; 21 | bias = params[0]; 22 | } 23 | 24 | double pred_max = 1.0 - 1e-15; 25 | double pred_min = 1e-15; 26 | 27 | double total_err = 0.0; 28 | for (int r = 0; r < rows; ++r) { 29 | 30 | double y_pred = (type==2)?
1.0 : 0.0; 31 | for (int c = 0; c < cols; ++c) { 32 | if (type==2) { 33 | y_pred *= pow(x(r,c), params[c+offset_params]); 34 | } else { 35 | y_pred += x(r,c)*params[c+offset_params]; 36 | } 37 | } 38 | y_pred += bias; 39 | y_pred = BOUND(y_pred, pred_min, pred_max); 40 | total_err += LOG_LOSS(y[r], y_pred); 41 | } 42 | 43 | return total_err/(double)rows; 44 | } 45 | 46 | 47 | // [[Rcpp::export]] 48 | double fn_opt_gm(NumericMatrix x, NumericVector y, NumericVector params) { 49 | return fn_opt_base(x, y, params, 2); 50 | } 51 | 52 | // [[Rcpp::export]] 53 | double fn_opt_am(NumericMatrix x, NumericVector y, NumericVector params) { 54 | return fn_opt_base(x, y, params, 1); 55 | } 56 | -------------------------------------------------------------------------------- /avito-context-click-r/_utils.R: -------------------------------------------------------------------------------- 1 | options(scipen=999) 2 | 3 | fn.stats <- function(data.all, flist.col, target.col) { 4 | setnames(data.all, target.col, "target") 5 | data.stats <- c() 6 | for (f in flist.col) { 7 | setnames(data.all, f, "feature") 8 | stats.row <- c(length(unique(data.all[,feature])), 9 | length(unique(data.all[!is.na(target),feature])), 10 | length(unique(data.all[is.na(target),feature])), 11 | max(data.all[,feature]), 12 | max(data.all[!is.na(target),feature]), 13 | max(data.all[is.na(target),feature]), 14 | min(data.all[,feature]), 15 | min(data.all[!is.na(target),feature]), 16 | min(data.all[is.na(target),feature]), 17 | mean(data.all[,feature]), 18 | mean(data.all[!is.na(target),feature]), 19 | mean(data.all[is.na(target),feature]), 20 | median(data.all[,feature]), 21 | median(data.all[!is.na(target),feature]), 22 | median(data.all[is.na(target),feature]), 23 | sd(data.all[,feature]), 24 | sd(data.all[!is.na(target),feature]), 25 | sd(data.all[is.na(target),feature])) 26 | data.stats <- rbind(data.stats, stats.row) 27 | setnames(data.all, "feature", f) 28 | } 29 | colnames(data.stats) <- c("unique","unique_tr","unique_test", 30 | "max","max_tr","max_test", 31 | "min","min_tr","min_test", 32 | "mean","mean_tr","mean_test", 33 | "median", "median_tr", "median_test", 34 | "sd", "sd_tr", "sd_test") 35 | data.stats <- as.data.table(data.stats) 36 | data.stats[, feature := flist.col] 37 | setnames(data.all, "target", target.col) 38 | return (data.stats) 39 | } 40 | 41 | fn.multilogloss <- function(data.actual, data.predicted) { 42 | actual <- as.matrix(data.actual) 43 | predicted <- as.matrix(data.predicted) 44 | probs <- rowSums(actual*predicted) 45 | probs[which(probs>0.999999)] <- 0.999999 46 | probs[which(probs<0.000001)] <- 0.000001 47 | return(-(1/nrow(actual))*sum(log(probs))) 48 | } 49 | 50 | fn.logloss <- function(actual, predicted, pred.min=0.000001, pred.max=0.999999) { 51 | predicted[which(predicted > pred.max)] <- pred.max 52 | predicted[which(predicted < pred.min)] <- pred.min 53 | error <- sum(-actual*log(predicted) - (1-actual)*log(1-predicted))/length(actual) 54 | return (error) 55 | } 56 | 57 | fn.mcrmse <- function(actual, predicted) { 58 | if (is.vector(predicted) & is.vector(actual)) { 59 | ix <- which(!is.na(actual)) 60 | nsamples <- length(ix) 61 | return (sqrt(sum((actual[ix] - predicted[ix])^2)/nsamples)) 62 | } 63 | if (ncol(actual) != ncol(predicted)) return (NULL) 64 | if (nrow(actual) != nrow(predicted)) return (NULL) 65 | ix <- which(!is.na(actual[,1])) 66 | nsamples <- length(ix) 67 | error <- 0 68 | #cat("Errors by targets:") 69 | errors <- c() 70 | for (i in 1:ncol(actual)) { 71 | error.col <- 
sqrt(sum((actual[ix,i] - predicted[ix,i])^2)/nsamples) 72 | errors <- c(errors, error.col) 73 | error <- error + error.col 74 | #cat(colnames(actual)[i],":",error.col,";") 75 | } 76 | #cat("\n") 77 | errors <- c(errors, error/ncol(actual)) 78 | return (errors) 79 | } 80 | 81 | fn.memory.usage <- function() { 82 | return (sum(sort( sapply(ls(globalenv()),function(x){object.size(get(x))})))) 83 | } 84 | 85 | fn.write.batches.csv <- function(data, train.file, col.names, sep, nchunks = 4, continue.chunks=FALSE) { 86 | options(scipen=999) 87 | if (nchunks == 1) { 88 | write.table( 89 | data, 90 | file=train.file, 91 | row.names = F, quote = F, na = "", sep = sep, 92 | append = FALSE, col.names = col.names 93 | ) 94 | } else { 95 | nr <- nrow(data) 96 | ix <- seq(1, nr, round(nr/nchunks)) 97 | if (ix[length(ix)] != nr) { 98 | ix <- c(ix, nr+1) 99 | } else { 100 | ix[length(ix)] <- nr+1 101 | } 102 | gc() 103 | for (i in 1:(length(ix)-1)) { 104 | cat("Processing chunk", i, "...\n") 105 | if (i == 1 & !continue.chunks) { 106 | write.table( 107 | data[ix[i]:(ix[i+1]-1),], 108 | file=train.file, 109 | row.names = F, quote = F, na = "", sep = sep, 110 | append = FALSE, col.names = col.names 111 | ) 112 | } else { 113 | write.table( 114 | data[ix[i]:(ix[i+1]-1),], 115 | file=train.file, 116 | row.names = F, quote = F, na = "", sep = sep, 117 | append = TRUE, col.names = FALSE 118 | ) 119 | } 120 | invisible(gc()) 121 | } 122 | } 123 | } 124 | 125 | fn.optim <- function(y, x) { 126 | 127 | x <- as.matrix(x) 128 | pars0 <- rep(0.0, ncol(x)) 129 | 130 | #error to minimize 131 | fn.loss <- function(pars) { 132 | y.pred <- 1 / (1 + exp(-as.numeric(x %*% pars))) 133 | y.pred <- pmax(y.pred, 10^(-6)) 134 | y.pred <- pmin(y.pred, 1-10^(-6)) 135 | sum(-y*log(y.pred) - (1-y)*log(1-y.pred))/length(y) 136 | } 137 | 138 | cat ("Initial loss:", fn.loss(pars0), "\n") 139 | opt.result <- optim(pars0, 140 | fn.loss, 141 | #method = "Brent", 142 | #method = "L-BFGS-B", 143 | #lower = 0.0, 144 | #upper = 10.0, 145 | control = list(trace = T,maxit=5000)) 146 | return (opt.result$par) 147 | } 148 | -------------------------------------------------------------------------------- /avito-context-click-r/avito-context-click-r.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: No 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /avito-context-click-r/data.build.tree.R: -------------------------------------------------------------------------------- 1 | ############################################################## 2 | ## create tree features data 3 | ############################################################## 4 | cat("Tree data... 
\n") 5 | 6 | fn.register.wk(1) 7 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 8 | 9 | fn.init.worker('data_build/build_tree') 10 | 11 | cat('\nMerging data.all.search.small + data.all.search.cont \n') 12 | data.all.tree <- merge( 13 | data.all.search.small, 14 | data.all.search.cont[ID %in% data.all.search.small$ID], 15 | by="ID") 16 | fn.soar.unload(data.all.search.small, data.all.search.cont) 17 | invisible(gc()) 18 | 19 | cat('\nAdding data.all.search.info\n') 20 | data.all.tree <- merge( 21 | data.all.tree, 22 | data.all.search.info[SearchID %in% unique(data.all.tree$SearchID)], 23 | by="SearchID") 24 | fn.soar.unload(data.all.search.info) 25 | invisible(gc()) 26 | 27 | cat('\nAdding data.all.search.info.cont\n') 28 | data.all.tree <- merge( 29 | data.all.tree, 30 | data.all.search.info.cont[SearchID %in% unique(data.all.tree$SearchID)], 31 | by="SearchID") 32 | fn.soar.unload(data.all.search.info.cont) 33 | invisible(gc()) 34 | 35 | cat('\nAdding data.all.prob.1way\n') 36 | data.all.tree <- merge( 37 | data.all.tree, 38 | data.all.prob.1way, 39 | by="ID") 40 | fn.soar.unload(data.all.prob.1way) 41 | invisible(gc()) 42 | 43 | cat('\nAdding data.all.prob.2way.ad.us\n') 44 | data.all.tree <- merge( 45 | data.all.tree, 46 | data.all.prob.2way.ad.us, 47 | by="ID") 48 | fn.soar.unload(data.all.prob.2way.ad.us) 49 | invisible(gc()) 50 | 51 | cat('\nAdding data.all.prob.2way.ad.srch\n') 52 | data.all.tree <- merge( 53 | data.all.tree, 54 | data.all.prob.2way.ad.srch, 55 | by="ID") 56 | fn.soar.unload(data.all.prob.2way.ad.srch) 57 | invisible(gc()) 58 | 59 | cat('\nAdding data.all.prob.2way.us.srch\n') 60 | data.all.tree <- merge( 61 | data.all.tree, 62 | data.all.prob.2way.us.srch, 63 | by="ID") 64 | fn.soar.unload(data.all.prob.2way.us.srch) 65 | invisible(gc()) 66 | 67 | cat('\nAdding data.all.prob.2way.srch\n') 68 | data.all.tree <- merge( 69 | data.all.tree, 70 | data.all.prob.2way.srch, 71 | by="ID") 72 | fn.soar.unload(data.all.prob.2way.srch) 73 | invisible(gc()) 74 | 75 | cat('\nRemoving unwanted columns\n') 76 | cols.excl <- c( 77 | "UserID", "UserIPID", "UserAgentID", "UserDeviceID", 78 | "AdID", "AdParams", 79 | "SearchLocID" 80 | ) 81 | 82 | for (col.nam in cols.excl) { 83 | data.all.tree[, col.nam := NULL, with=F] 84 | } 85 | invisible(gc()) 86 | 87 | cat('\nSorting columns\n') 88 | setkeyv(data.all.tree, c("SearchDate", "SearchID", "Position")) 89 | invisible(gc()) 90 | 91 | 92 | cat('\nFilling NAs\n') 93 | cols.extra <- c("ID", "SearchID", "SearchType", "IsClick") 94 | cols.in <- sort(setdiff(colnames(data.all.tree), cols.extra)) 95 | 96 | for (col.nam in cols.in) { 97 | if (any(is.na(data.all.tree[[col.nam]]))) { 98 | setnames(data.all.tree, col.nam, 'change_val') 99 | data.all.tree[is.na(change_val), change_val := -1] 100 | setnames(data.all.tree, 'change_val', col.nam) 101 | } 102 | } 103 | 104 | invisible(gc()) 105 | cols.in.tree <- cols.in 106 | cat('\nSaving\n') 107 | setcolorder(data.all.tree, c(cols.extra, cols.in)) 108 | Store(data.all.tree, cols.in) 109 | invisible(gc()) 110 | 111 | 112 | data.l2.pred <- fn.load.ens( 113 | ens.cols = c( 114 | "data.ftrl.04.pred", 115 | "data.ftrl.05.pred", 116 | "data.ftrl.06.pred", 117 | "data.fm.05.pred", 118 | "data.fm.04.pred", 119 | "data.fm.03.pred", 120 | "data.fm.02.pred", 121 | "data.fm.01.pred" 122 | ), print.err = F) 123 | data.l2.all.tree <- merge(data.all.tree, data.l2.pred, 124 | by="ID") 125 | rm(data.l2.pred) 126 | fn.soar.unload(data.all.tree) 127 | 128 | col.num <- sapply(data.l2.all.tree, is.numeric) 
129 | col.num <- names(col.num)[col.num] 130 | col.num <- col.num[(col.num %like% '(^Prob)|(^ftrl)|(^fm)|(^Ratio)|AdHistCTR')] 131 | 132 | for (col.nam in col.num) { 133 | setnames(data.l2.all.tree, col.nam, 'change_val') 134 | data.l2.all.tree[, change_val := round(change_val, digits = 6)] 135 | setnames(data.l2.all.tree, 'change_val', col.nam) 136 | } 137 | 138 | Store(data.l2.all.tree) 139 | invisible(gc()) 140 | 141 | fn.clean.worker() 142 | } 143 | fn.kill.wk() 144 | 145 | ############################################################# 146 | # Probability features - full dataset 147 | ############################################################# 148 | cat("Probability features full data... \n") 149 | 150 | fn.register.wk(1) 151 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 152 | 153 | fn.init.worker('data_build/prob_features_full') 154 | 155 | cols.in.1way <- c( 156 | "AdID", "AdCatID", "AdParams", 157 | "UserID", "UserIPID", "UserAgentID", 158 | "UserAgentOSID", "UserDeviceID", "UserAgentFamilyID", 159 | "SearchLocID", "SearchCatID" 160 | ) 161 | data.all.prob.full.1way <- fn.build.prob.full(cols.in.1way) 162 | Store(data.all.prob.full.1way) 163 | invisible(gc()) 164 | 165 | cols.in.2way <- c( 166 | "AdID", "AdCatID", "AdParams", 167 | "UserID", "UserIPID", "UserAgentID", 168 | "UserAgentOSID", "UserDeviceID", "UserAgentFamilyID", 169 | "SearchLocID", "SearchCatID" 170 | ) 171 | 172 | data.all.prob.full.2way.ad.us <- fn.build.prob.full( 173 | fn.build.interaction(cols.in.2way, c("Ad", "Us"))) 174 | Store(data.all.prob.full.2way.ad.us) 175 | invisible(gc()) 176 | 177 | data.all.prob.full.2way.ad.srch <- fn.build.prob.full( 178 | fn.build.interaction(cols.in.2way, c("Ad", "Search"))) 179 | Store(data.all.prob.full.2way.ad.srch) 180 | invisible(gc()) 181 | 182 | data.all.prob.full.2way.srch <- fn.build.prob.full( 183 | fn.build.interaction(cols.in.2way, c("Search", "Search"))) 184 | invisible(gc()) 185 | Store(data.all.prob.full.2way.srch) 186 | invisible(gc()) 187 | 188 | cols.in.2way.us.srch.1 <- c( 189 | "UserAgentID", "UserAgentOSID", "UserDeviceID", "UserAgentFamilyID", 190 | "SearchLocID", "SearchCatID" 191 | ) 192 | 193 | data.all.prob.full.2way.us.srch.1 <- fn.build.prob.full( 194 | fn.build.interaction(cols.in.2way.us.srch.1, c("Us", "Search"))) 195 | invisible(gc()) 196 | Store(data.all.prob.full.2way.us.srch.1) 197 | invisible(gc()) 198 | 199 | cols.in.2way.us.srch.2 <- c( 200 | "UserID", "UserIPID", 201 | "SearchLocID", "SearchCatID" 202 | ) 203 | 204 | data.all.prob.full.2way.us.srch.2 <- fn.build.prob.full( 205 | fn.build.interaction(cols.in.2way.us.srch.2, c("Us", "Search"))) 206 | invisible(gc()) 207 | Store(data.all.prob.full.2way.us.srch.2) 208 | invisible(gc()) 209 | 210 | 211 | 212 | cat("\nMerging probabilities\n") 213 | data.all.prob.full <- data.all.prob.full.1way 214 | setkey(data.all.prob.full, ID) 215 | 216 | setkey(data.all.prob.full.2way.ad.us, ID) 217 | fn.check.id(data.all.prob.full, 218 | data.all.prob.full.2way.ad.us) 219 | for (col.nam in colnames(data.all.prob.full.2way.ad.us)[-1]) { 220 | data.all.prob.full[ 221 | , col.nam := data.all.prob.full.2way.ad.us[[col.nam]], 222 | with=F] 223 | } 224 | fn.soar.unload(data.all.prob.full.2way.ad.us) 225 | invisible(gc()) 226 | 227 | 228 | 229 | setkey(data.all.prob.full.2way.ad.srch, ID) 230 | fn.check.id(data.all.prob.full, 231 | data.all.prob.full.2way.ad.srch) 232 | for (col.nam in colnames(data.all.prob.full.2way.ad.srch)[-1]) { 233 | data.all.prob.full[ 234 | , col.nam := 
data.all.prob.full.2way.ad.srch[[col.nam]], 235 | with=F] 236 | } 237 | fn.soar.unload(data.all.prob.full.2way.ad.srch) 238 | invisible(gc()) 239 | 240 | 241 | 242 | setkey(data.all.prob.full.2way.us.srch.1, ID) 243 | fn.check.id(data.all.prob.full, 244 | data.all.prob.full.2way.us.srch.1) 245 | for (col.nam in colnames(data.all.prob.full.2way.us.srch.1)[-1]) { 246 | data.all.prob.full[ 247 | , col.nam := data.all.prob.full.2way.us.srch.1[[col.nam]], 248 | with=F] 249 | } 250 | fn.soar.unload(data.all.prob.full.2way.us.srch.1) 251 | invisible(gc()) 252 | 253 | 254 | setkey(data.all.prob.full.2way.us.srch.2, ID) 255 | fn.check.id(data.all.prob.full, 256 | data.all.prob.full.2way.us.srch.2) 257 | for (col.nam in colnames(data.all.prob.full.2way.us.srch.2)[-1]) { 258 | data.all.prob.full[ 259 | , col.nam := data.all.prob.full.2way.us.srch.2[[col.nam]], 260 | with=F] 261 | } 262 | fn.soar.unload(data.all.prob.full.2way.us.srch.2) 263 | invisible(gc()) 264 | 265 | 266 | 267 | setkey(data.all.prob.full.2way.srch, ID) 268 | fn.check.id(data.all.prob.full, 269 | data.all.prob.full.2way.srch) 270 | for (col.nam in colnames(data.all.prob.full.2way.srch)[-1]) { 271 | data.all.prob.full[ 272 | , col.nam := data.all.prob.full.2way.srch[[col.nam]], 273 | with=F] 274 | } 275 | fn.soar.unload(data.all.prob.full.2way.srch) 276 | invisible(gc()) 277 | 278 | 279 | cat("\nSaving dataset csv\n") 280 | setkey(data.all.prob.full, ID) 281 | data.all.prob.full[, ID := NULL] 282 | fn.write.csv.chunk(data=data.all.prob.full, 283 | file=fn.out.file("data.all.prob.full.csv"), 284 | compress=F) 285 | 286 | fn.clean.worker() 287 | } 288 | fn.kill.wk() 289 | 290 | 291 | 292 | ############################################################## 293 | ## full tree model data 294 | ############################################################## 295 | tic() 296 | cat("full tree model data... 
\n") 297 | 298 | 299 | fn.register.wk(1) 300 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 301 | 302 | fn.init.worker('data_build/build_tree_full') 303 | 304 | 305 | cat('\nMerging data.all.search.info + data.all.search.info.cont \n') 306 | data.all.tree.full <- merge(data.all.search.info, 307 | data.all.search.info.cont, 308 | by="SearchID") 309 | fn.soar.unload(data.all.search.info, data.all.search.info.cont) 310 | invisible(gc()) 311 | 312 | setkey(data.all.tree.full, "SearchID") 313 | data.all.tree.full <- data.all.tree.full[J(data.all.search$SearchID)] 314 | if (!all(data.all.tree.full$SearchID == data.all.search$SearchID)) { 315 | stop('SearchIDs do not match') 316 | } 317 | data.all.tree.full[, ID := data.all.search$ID] 318 | 319 | fn.to.data.all.tree.full <- function(data.add) { 320 | fn.check.id(data.all.tree.full, data.add) 321 | for (col.nam in colnames(data.add)) { 322 | if (col.nam %ni% colnames(data.all.tree.full)) { 323 | data.all.tree.full[, col.nam := data.add[[col.nam]], with=F] 324 | } 325 | } 326 | invisible(NULL) 327 | } 328 | 329 | cat('\nAdding data.all.search\n') 330 | fn.to.data.all.tree.full(data.all.search) 331 | fn.soar.unload(data.all.search) 332 | 333 | cat('\nAdding data.all.search.cont\n') 334 | fn.to.data.all.tree.full(data.all.search.cont) 335 | fn.soar.unload(data.all.search.cont) 336 | 337 | setkeyv(data.all.tree.full, c("SearchDate", "SearchID", "Position")) 338 | 339 | # cat('\nRemoving unwanted columns\n') 340 | # cols.excl <- c( 341 | # "UserID", "UserIPID", "UserAgentID", "UserDeviceID", 342 | # "AdID", "AdParams", "SearchLocID" 343 | # ) 344 | # 345 | # for (col.nam in cols.excl) { 346 | # data.all.tree.full[, col.nam := NULL, with=F] 347 | # } 348 | # invisible(gc()) 349 | 350 | cat('\nSorting columns\n') 351 | setkeyv(data.all.tree.full, c("SearchDate", "SearchID", "Position")) 352 | invisible(gc()) 353 | 354 | 355 | cat('\nFilling NAs\n') 356 | cols.extra <- c("ID", "SearchID", "SearchType", "IsClick") 357 | cols.in <- sort(setdiff(colnames(data.all.tree.full), cols.extra)) 358 | 359 | for (col.nam in cols.in) { 360 | if (any(is.na(data.all.tree.full[[col.nam]]))) { 361 | setnames(data.all.tree.full, col.nam, 'change_val') 362 | data.all.tree.full[is.na(change_val), change_val := -1] 363 | setnames(data.all.tree.full, 'change_val', col.nam) 364 | } 365 | } 366 | 367 | invisible(gc()) 368 | 369 | cat('\nSaving\n') 370 | setcolorder(data.all.tree.full, c(cols.extra, cols.in)) 371 | Store(data.all.tree.full) 372 | invisible(gc()) 373 | 374 | setkey(data.all.tree.full, ID) 375 | cat("\nSaving dataset csv\n") 376 | fn.write.csv.chunk(data=data.all.tree.full, 377 | file=fn.out.file("data.all.tree.full.csv"), 378 | compress=F) 379 | 380 | cat('\nMerging data\n') 381 | system( 382 | paste( 383 | "paste -d ',' ", 384 | fn.out.file("data.all.tree.full.csv"), 385 | fn.out.file("data.all.prob.full.csv"), 386 | "> ", fn.out.file("data.all.tree.full.merge.csv") 387 | ) 388 | ) 389 | cat('\nCompressing data\n') 390 | system( 391 | paste( 392 | "pigz -f", fn.out.file("data.all.tree.full.merge.csv") 393 | ) 394 | ) 395 | 396 | cat('\nSaving libsvm data\n') 397 | cols.in.tree.full <- c( 398 | "AdCatID","AdHistCTR","AdID","AdParams","AdPrice", 399 | "AdTitleSZ","CountAdSearch","CountAdSearchCat", 400 | "CountAdSearchLoc","CountAdUsers","CountIPUser", 401 | "CountUserAd","CountUserAdDupT1","CountUserAdDupT3", 402 | "CountUserAdT1","CountUserAdT3","CountUserSearch", 403 | "CountUserSearchCategory","CountUserSearchLocation", 404 | 
"Position","RatioAdPos1","RatioSearchRuss","SearchAdCount", 405 | "SearchAdT1Count","SearchAdT2Count","SearchAdT3Count", 406 | "SearchCatID","SearchDate","SearchLocID","SearchOrdUsrAsc", 407 | "SearchOrdUsrDesc","SearchParamsSZ","SearchQuerySZ", 408 | "SearchRussian","UserAgentFamilyID","UserAgentID", 409 | "UserAgentOSID","UserDeviceID","UserID","UserIPID", 410 | "UserLogged","UserPrevPhoneRequest", 411 | "UserPrevPrevPrevQryDate","UserPrevPrevQryDate", 412 | "UserPrevQryDate","UserPrevVisitReq","UserPrevVisitReqUni", 413 | "UserQryTotalTime","ProbAdID","ProbAdCatID","ProbAdParams", 414 | "ProbUserID","ProbUserIPID","ProbUserAgentID", 415 | "ProbUserAgentOSID","ProbUserDeviceID", 416 | "ProbUserAgentFamilyID","ProbSearchLocID", 417 | "ProbSearchCatID","ProbAdCatIDUserAgentFamilyID", 418 | "ProbAdIDUserAgentFamilyID","ProbAdCatIDUserAgentOSID", 419 | "ProbAdIDUserAgentOSID","ProbAdCatIDUserID","ProbAdIDUserID", 420 | "ProbAdCatIDUserIPID","ProbAdIDUserIPID", 421 | "ProbAdCatIDSearchCatID","ProbAdIDSearchCatID", 422 | "ProbAdCatIDSearchLocID","ProbAdIDSearchLocID", 423 | "ProbSearchCatIDUserAgentFamilyID", 424 | "ProbSearchLocIDUserAgentFamilyID", 425 | "ProbSearchCatIDUserAgentOSID", 426 | "ProbSearchLocIDUserAgentOSID","ProbSearchCatIDUserID", 427 | "ProbSearchLocIDUserID","ProbSearchCatIDUserIPID", 428 | "ProbSearchLocIDUserIPID", 429 | "ProbSearchLocIDSearchCatID") 430 | extra.tr.sel <- "int(row[\"SearchOrdUsrDesc\"]) <= 10 and" # row[\"SearchDate\"]) >= 1431396000 431 | system(paste( 432 | "cd ../avito-context-click-py &&", 433 | "pypy -u convert_csv_to_libsvm.py", 434 | "-input_files ../data/output-r/data.all.tree.full.merge.csv", 435 | "-out_selector '{", 436 | "\"../data/output-r/data.val.tr.full.libsvm\": lambda file, row: ", extra.tr.sel, " row[\"SearchType\"] in [\"hist\", \"tr\"],", 437 | "\"../data/output-r/data.val.tt.full.libsvm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 438 | "\"../data/output-r/data.test.tr.full.libsvm\": lambda file, row: ", extra.tr.sel, " row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 439 | "\"../data/output-r/data.test.tt.full.libsvm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 440 | "}'", 441 | "-weight_builder_dict '{", 442 | "\"../data/output-r/data.val.tr.full.libsvm\": lambda file, row: 1/(float(row[\"SearchOrdUsrDesc\"])),", 443 | "\"../data/output-r/data.test.tr.full.libsvm\": lambda file, row: 1/(float(row[\"SearchOrdUsrDesc\"]))", 444 | "}'", 445 | "-feat_map_file ../data/output-r/data.all.full.fmap", 446 | "-col_out IsClick", 447 | "-col_in_num", paste(cols.in.tree.full, collapse=' '), 448 | "-missing_values '' 'na' 'nan' 'NA' 'NaN' '-1'", 449 | ">> ../data/log/data_build/build_tree_full.log 2>&1")) 450 | 451 | fn.clean.worker() 452 | } 453 | fn.kill.wk() 454 | 455 | 456 | 457 | 458 | 459 | ############################################################## 460 | ## Ensenble cross validation scheme 461 | ############################################################## 462 | tic() 463 | cat("Ensenble cross validation... 
\n") 464 | 465 | data.cv.ens <- fn.cv.ens.folds() 466 | Store(data.cv.ens) 467 | 468 | toc() 469 | -------------------------------------------------------------------------------- /avito-context-click-r/data.combine.R: -------------------------------------------------------------------------------- 1 | # ############################################################# 2 | # # merge lucas and dmitry data 3 | # ############################################################# 4 | # 5 | tic() 6 | cat("Merging datasets... \n") 7 | 8 | fn.register.wk(1) 9 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 10 | 11 | fn.init.worker('data_build/combine_datasets') 12 | 13 | cat("\nLoading datasets\n") 14 | load(fn.rdata.file('data.reduced.all.RData')) 15 | 16 | setkey(data.reduced.all, ID) 17 | setkey(data.l2.all.tree, ID) 18 | 19 | cat("\nCalculating common cols\n") 20 | col.common <- intersect(colnames(data.reduced.all), 21 | colnames(data.l2.all.tree)) 22 | col.common <- unique(c(col.common, "HistCTR", "Price")) 23 | 24 | col.uniq.dtry <- sapply(data.reduced.all, 25 | function(x) length(unique(x))) 26 | col.uniq.dtry <- col.uniq.dtry[!names(col.uniq.dtry) %in% 27 | col.common] 28 | 29 | col.uniq.lucas <- sapply(data.l2.all.tree, 30 | function(x) length(unique(x))) 31 | col.uniq.lucas <- col.uniq.lucas[ 32 | !names(col.uniq.lucas) %in% 33 | col.common] 34 | 35 | # check for length and then for value 36 | cols.match <- list() 37 | for (ix in 1:length(col.uniq.lucas)) { 38 | col.uniq <- col.uniq.lucas[ix] 39 | col.same <- col.uniq.dtry[col.uniq.dtry == col.uniq] 40 | if (length(col.same) >= 1) { 41 | col.lucas.nam <- names(col.uniq) 42 | for (col.dtry.nam in names(col.same)) { 43 | if (all(data.reduced.all[[col.dtry.nam]] == 44 | data.l2.all.tree[[col.lucas.nam]])) { 45 | cols.match[[col.lucas.nam]] <- col.dtry.nam 46 | col.common <- unique(c(col.common, col.dtry.nam)) 47 | } 48 | } 49 | } 50 | } 51 | 52 | cat("\nCopying and cols and saving RData\n") 53 | for (col.nam in setdiff(col.common, "ID")) { 54 | data.reduced.all[, col.nam := NULL, with=F] 55 | } 56 | 57 | data.all.tree.dl <- data.l2.all.tree 58 | fn.soar.unload(data.l2.all.tree) 59 | 60 | setkey(data.reduced.all, ID) 61 | setkey(data.all.tree.dl, ID) 62 | 63 | for (col.nam in setdiff(colnames(data.reduced.all), "ID")) { 64 | data.all.tree.dl[, col.nam := data.reduced.all[[col.nam]], with=F] 65 | data.reduced.all[, col.nam := NULL, with=F] 66 | invisible(gc()) 67 | } 68 | rm(data.reduced.all) 69 | 70 | save(data.all.tree.dl, file=fn.rdata.file('data.all.tree.dl.RData')) 71 | 72 | cat("\nSaving dataset csv\n") 73 | fn.write.csv.chunk(data=data.all.tree.dl, 74 | file=fn.out.file("data.all.tree.dl.csv"), 75 | compress=F) 76 | 77 | cols.extra <- c("ID", "SearchID", "SearchType", "IsClick") 78 | cols.in <- sort(setdiff(colnames(data.all.tree.dl), 79 | c(cols.extra))) 80 | 81 | cols.in.combine <- cols.in 82 | Store(cols.in.combine) 83 | 84 | rm(data.all.tree.dl) 85 | invisible(gc()) 86 | 87 | system(paste( 88 | "cd ../avito-context-click-py &&", 89 | "pypy -u convert_csv_to_libsvm.py", 90 | "-input_files ../data/output-r/data.all.tree.dl.csv", 91 | "-out_selector '{", 92 | "\"../data/output-libsvm/data.val.tr.libsvm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\"],", 93 | "\"../data/output-libsvm/data.val.tt.libsvm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 94 | "\"../data/output-libsvm/data.test.tr.libsvm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 95 | 
"\"../data/output-libsvm/data.test.tt.libsvm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 96 | "}'", 97 | "-col_out IsClick", 98 | "-col_in_num", paste(cols.in.combine, collapse=' '), 99 | "-missing_values '' 'na' 'nan' 'NA' 'NaN' '-1'", 100 | ">> ../data/log/data_build/combine_datasets.log 2>&1")) 101 | 102 | system(paste( 103 | "cd ../avito-context-click-py &&", 104 | "pypy -u convert_csv_to_libsvm.py", 105 | "-input_files ../data/output-r/data.all.tree.dl.csv", 106 | "-out_selector '{", 107 | "\"../data/output-libsvm/data.val.tr.nllh.libsvm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 108 | "\"../data/output-libsvm/data.val.tt.nllh.libsvm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 109 | "\"../data/output-libsvm/data.test.tr.nllh.libsvm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 110 | "\"../data/output-libsvm/data.test.tt.nllh.libsvm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 111 | "}'", 112 | "-feat_map_file ../data/output-libsvm/data.all.nllh.fmap", 113 | "-col_out IsClick", 114 | "-col_in_num", paste(cols.in.combine[!cols.in.combine %like% 'likeli'], 115 | collapse=' '), 116 | "-missing_values '' 'na' 'nan' 'NA' 'NaN' '-1'", 117 | ">> ../data/log/data_build/combine_datasets.log 2>&1")) 118 | 119 | system(paste( 120 | "cd ../avito-context-click-py &&", 121 | "pypy -u convert_csv_to_libsvm.py", 122 | "-input_files ../data/output-r/data.all.tree.dl.csv", 123 | "-out_selector '{", 124 | "\"../data/output-libsvm/data.val.tr.nprob.libsvm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 125 | "\"../data/output-libsvm/data.val.tt.nprob.libsvm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 126 | "\"../data/output-libsvm/data.test.tr.nprob.libsvm\": lambda file, row: row[\"SearchType\"] in [\"tr\", \"val\"],", 127 | "\"../data/output-libsvm/data.test.tt.nprob.libsvm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 128 | "}'", 129 | "-weight_builder_dict '{", 130 | "\"../data/output-libsvm/data.val.tr.nprob.libsvm\": lambda file, row: 1/(float(row[\"SearchOrdUsrDesc\"])-3),", 131 | "\"../data/output-libsvm/data.test.tr.nprob.libsvm\": lambda file, row: 1/(float(row[\"SearchOrdUsrDesc\"])-0)", 132 | "}'", 133 | "-feat_map_file ../data/output-libsvm/data.all.nprob.fmap", 134 | "-col_out IsClick", 135 | "-col_in_num", paste(cols.in.combine[!cols.in.combine %like% '^Prob'], 136 | collapse=' '), 137 | "-missing_values '' 'na' 'nan' 'NA' 'NaN' '-1'", 138 | ">> ../data/log/data_build/combine_datasets.log 2>&1")) 139 | 140 | load(fn.rdata.file('data.full.all.RData')) 141 | setkey(data.full.all, ID) 142 | data.full.all[, ID := NULL] 143 | cat("\nSaving dataset csv\n") 144 | fn.write.csv.chunk(data=data.full.all, 145 | file=fn.out.file("data.dtry.full.all.csv"), 146 | compress=F) 147 | rm(data.full.all) 148 | invisible(gc()) 149 | # 150 | cat('\nMerging data\n') 151 | system( 152 | paste( 153 | "paste -d ',' ", 154 | fn.out.file("data.all.tree.full.csv"), 155 | fn.out.file("data.all.prob.full.csv"), 156 | fn.out.file("data.dtry.full.all.csv"), 157 | "> ", fn.out.file("data.all.tree.full.combine.csv") 158 | ) 159 | ) 160 | #cat('\nCompressing data\n') 161 | #system( 162 | # paste( 163 | # "pigz -f", fn.out.file("data.all.tree.full.combine.csv") 164 | # ) 165 | #) 166 | 167 | cat('\nSaving libsvm data\n') 168 | cols.in.combine.full <- c( 169 | "AdCatID","AdHistCTR","AdID","AdParams","AdPrice", 170 | "AdTitleSZ","CountAdSearch","CountAdSearchCat", 171 | "CountAdSearchLoc","CountAdUsers","CountIPUser", 172 | 
"CountUserAd","CountUserAdDupT1","CountUserAdDupT3", 173 | "CountUserAdT1","CountUserAdT3","CountUserSearch", 174 | "CountUserSearchCategory","CountUserSearchLocation", 175 | "Position","RatioAdPos1","RatioSearchRuss","SearchAdCount", 176 | "SearchAdT1Count","SearchAdT2Count","SearchAdT3Count", 177 | "SearchCatID","SearchDate","SearchLocID","SearchOrdUsrAsc", 178 | "SearchOrdUsrDesc","SearchParamsSZ","SearchQuerySZ", 179 | "SearchRussian","UserAgentFamilyID","UserAgentID", 180 | "UserAgentOSID","UserDeviceID","UserID","UserIPID", 181 | "UserLogged","UserPrevPhoneRequest", 182 | "UserPrevPrevPrevQryDate","UserPrevPrevQryDate", 183 | "UserPrevQryDate","UserPrevVisitReq","UserPrevVisitReqUni", 184 | "UserQryTotalTime", 185 | "ProbAdID","ProbAdCatID","ProbAdParams", 186 | "ProbUserID","ProbUserIPID","ProbUserAgentID", 187 | "ProbUserAgentOSID","ProbUserDeviceID", 188 | "ProbUserAgentFamilyID","ProbSearchLocID", 189 | "ProbSearchCatID","ProbAdCatIDUserAgentFamilyID", 190 | "ProbAdIDUserAgentFamilyID","ProbAdCatIDUserAgentOSID", 191 | "ProbAdIDUserAgentOSID","ProbAdCatIDUserID","ProbAdIDUserID", 192 | "ProbAdCatIDUserIPID","ProbAdIDUserIPID", 193 | "ProbAdCatIDSearchCatID","ProbAdIDSearchCatID", 194 | "ProbAdCatIDSearchLocID","ProbAdIDSearchLocID", 195 | "ProbSearchCatIDUserAgentFamilyID", 196 | "ProbSearchLocIDUserAgentFamilyID", 197 | "ProbSearchCatIDUserAgentOSID", 198 | "ProbSearchLocIDUserAgentOSID","ProbSearchCatIDUserID", 199 | "ProbSearchLocIDUserID","ProbSearchCatIDUserIPID", 200 | "ProbSearchLocIDUserIPID", 201 | 'SearchDayYear', 'SearchPosition2Count', 'SearchPosition6Count', 202 | 'SearchPosition7Count', 'AdPosition1Count', 'AdPosition7Count', 203 | 'SearchParamsCount', 'LocationUserUniqueCount', 'CategoryUserUniqueCount', 204 | 'SearchIDPreviousAge', 'AdParamsSize', 'AdParamsCount', 'UserAdCount', 205 | 'AdCategoryPriceDeviation', 'UserAdViewTotalCount', 'UserAdViewUniqueCount', 206 | 'UserAdCategoryPriceMean', 'UserAdCategoryPriceMedian', 207 | 'UserAdCategoryPriceMin', 'UserAdCategoryPriceMax', 'UserAdViewTotalCount2', 208 | 'UserAdViewUniqueCount2', 'UserAdCategoryPriceMean2', 209 | 'UserAdCategoryPriceMedian2', 'UserAdCategoryPriceMin2', 210 | 'UserAdCategoryPriceMax2' 211 | ) 212 | 213 | extra.tr.sel <- "int(row[\"SearchOrdUsrDesc\"]) <= 7 and" # row[\"SearchDate\"]) >= 1431396000 214 | system(paste( 215 | "cd ../avito-context-click-py &&", 216 | "pypy -u convert_csv_to_libsvm.py", 217 | "-input_files ../data/output-r/data.all.tree.full.combine.csv", 218 | "-out_selector '{", 219 | "\"../data/output-libsvm/data.val.tr.full.libsvm\": lambda file, row: ", extra.tr.sel, " row[\"SearchType\"] in [\"hist\", \"tr\"],", 220 | "\"../data/output-libsvm/data.val.tt.full.libsvm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 221 | "\"../data/output-libsvm/data.test.tr.full.libsvm\": lambda file, row: ", extra.tr.sel, " row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 222 | "\"../data/output-libsvm/data.test.tt.full.libsvm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 223 | "}'", 224 | "-feat_map_file ../data/output-libsvm/data.all.full.fmap", 225 | "-col_out IsClick", 226 | "-col_in_num", paste(unique(cols.in.combine.full), collapse=' '), 227 | "-missing_values '' 'na' 'nan' 'NA' 'NaN' '-1'", 228 | ">> ../data/log/data_build/combine_datasets.log 2>&1")) 229 | 230 | 231 | cat("\nDone!\n") 232 | 233 | fn.clean.worker() 234 | } 235 | fn.kill.wk() 236 | 237 | -------------------------------------------------------------------------------- 
/avito-context-click-r/main.R: -------------------------------------------------------------------------------- 1 | source("_fn.base.R") 2 | source("_utils.R") 3 | source("data.build.R") 4 | source("train.l1.fm.01.R") 5 | source("train.l1.fm.02.R") 6 | source("train.l1.fm.03.R") 7 | source("train.l1.fm.04.R") 8 | source("train.l1.fm.05.R") 9 | source("train.l1.ftrl.04.R") 10 | source("train.l1.ftrl.05.R") 11 | source("train.l1.ftrl.06.R") 12 | 13 | source("data.build.tree.R") 14 | source("data.build.dtry.R") 15 | source("data.combine.R") 16 | 17 | source("train.l1.xgb.03.R") 18 | source("train.l1.xgb.05.R") 19 | source("train.l2.xgb.02.R") 20 | source("train.xgb.dtry.R") 21 | 22 | source("train.zens.R") 23 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.fm.01.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("fm_01/build_fm") 9 | system(paste( 10 | "cd ../avito-context-click-py &&", 11 | "pypy -u convert_csv_to_libffm.py", 12 | "-input_files ../data/output-r/data.all.lr.csv", 13 | "-out_selector '{", 14 | "\"../data/output-libffm/fm_01/data.val.tr.small.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 15 | "\"../data/output-libffm/fm_01/data.tr.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\"],", 16 | "\"../data/output-libffm/fm_01/data.tr.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 17 | "\"../data/output-libffm/fm_01/data.val.tr.small.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 18 | "\"../data/output-libffm/fm_01/data.val.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\"],", 19 | "\"../data/output-libffm/fm_01/data.val.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 20 | "\"../data/output-libffm/fm_01/data.test.tr.small.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\", \"val\"],", 21 | "\"../data/output-libffm/fm_01/data.test.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 22 | "\"../data/output-libffm/fm_01/data.test.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 23 | "}'", 24 | "-col_out IsClick", 25 | "-col_in_cat", 26 | " AdCatID AdHistCTRBin AdID AdParams AdPriceBin AdTitleSZBin", 27 | " Position", 28 | " SearchAdCount SearchAdT1Count SearchAdT2Count SearchAdT3Count", 29 | " SearchCatID SearchLocID SearchParamsSZBin SearchQuerySZBin SearchRussian", 30 | " UserID UserIPID UserPrevQryDateBin UserQryTotalTimeBin", 31 | "-old_format", 32 | ">> ../data/log/fm_01/build_fm.log 2>&1")) 33 | fn.clean.worker() 34 | NULL 35 | } 36 | fn.kill.wk() 37 | 38 | ############################################################# 39 | # train phase 40 | ############################################################# 41 | fn.register.wk(2) 42 | data.fm.01.pred.tmp <- foreach(test.type=c("tr", "val", "test"), .combine=rbind, 43 | .noexport=all.noexport) %dopar% { 44 | 45 | log.name <- paste0("fm_01/fm_01_",test.type) 46 | fn.init.worker(log.name) 47 | 48 | system(paste( 49 | "../fm/fm", 50 | "-k 16 -t 20 -r 0.02 -s 6 -l 0.00001", 51 | paste0("../data/output-libffm/fm_01/data.",test.type,".tt.fm "), 52 | paste0("../data/output-libffm/fm_01/data.",test.type,".tr.fm "), 53 | " >> ", 
paste0("../data/log/",log.name,".log"), " 2>&1")) 54 | 55 | data.pred <- data.table( 56 | ID = data.all.lr.id[SearchType==test.type,ID], 57 | Pred = scan(paste0("../data/output-libffm/fm_01/data.",test.type,".tt.fm.out")) 58 | ) 59 | fn.print.err(data.pred) 60 | 61 | fn.clean.worker() 62 | data.pred 63 | } 64 | fn.kill.wk() 65 | 66 | data.fm.01.pred.tmp <- data.fm.01.pred.tmp[order(ID)] 67 | Store(data.fm.01.pred.tmp) 68 | 69 | data.fm.01.pred <- copy(data.fm.01.pred.tmp) 70 | 71 | fn.print.err(data.fm.01.pred) 72 | # Size Loss 73 | # 1 7888752 0.04148 - tr 74 | # 1 8512834 0.04334 - val 75 | # 1 16401586 0.04244 - all 76 | 77 | Store(data.fm.01.pred) 78 | 79 | # fn.write.submission(data.fm.01.pred, "data.fm.01.pred") 80 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.fm.02.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("fm_02/build_fm") 9 | system(paste( 10 | "cd ../avito-context-click-py &&", 11 | "pypy -u convert_csv_to_libffm.py", 12 | "-input_files ../data/output-r/data.all.lr.csv", 13 | "-out_selector '{", 14 | "\"../data/output-libffm/fm_02/data.tr.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\"],", 15 | "\"../data/output-libffm/fm_02/data.tr.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 16 | "\"../data/output-libffm/fm_02/data.val.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\"],", 17 | "\"../data/output-libffm/fm_02/data.val.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 18 | "\"../data/output-libffm/fm_02/data.test.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 19 | "\"../data/output-libffm/fm_02/data.test.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 20 | "}'", 21 | "-col_out IsClick", 22 | "-col_in_cat", 23 | " AdCatID AdHistCTRBin AdParams AdPriceBin AdTitleSZBin", 24 | " Position SearchAdT1Count", 25 | "-old_format", 26 | ">> ../data/log/fm_02/build_fm.log 2>&1")) 27 | fn.clean.worker() 28 | NULL 29 | } 30 | fn.kill.wk() 31 | 32 | ############################################################# 33 | # train phase 34 | ############################################################# 35 | fn.register.wk(1) # , "tr", "test" 36 | data.fm.02.pred.tmp <- foreach(test.type=c("val", "tr", "test"), .combine=rbind, 37 | .noexport=all.noexport) %dopar% { 38 | 39 | log.name <- paste0("fm_02/fm_02_",test.type) 40 | fn.init.worker(log.name) 41 | 42 | system(paste( 43 | "../fm/fm", 44 | "-k 12 -t 5 -r 0.015 -s 6 -l 0.00001", 45 | paste0("../data/output-libffm/fm_02/data.",test.type,".tt.fm "), 46 | paste0("../data/output-libffm/fm_02/data.",test.type,".tr.fm "), 47 | " >> ", paste0("../data/log/",log.name,".log"), " 2>&1")) 48 | 49 | data.pred <- data.table( 50 | ID = data.all.lr.id[SearchType==test.type,ID], 51 | Pred = scan(paste0("../data/output-libffm/fm_02/data.",test.type,".tt.fm.out")) 52 | ) 53 | fn.print.err(data.pred) 54 | 55 | fn.clean.worker() 56 | data.pred 57 | } 58 | fn.kill.wk() 59 | 60 | data.fm.02.pred.tmp <- data.fm.02.pred.tmp[order(ID)] 61 | Store(data.fm.02.pred.tmp) 62 | 63 | data.fm.02.pred <- copy(data.fm.02.pred.tmp) 64 | 65 | fn.print.err(data.fm.02.pred) 66 | # Size Loss 67 | # 1 7888752 
0.04474 - tr 68 | # 1 8512834 0.04682 - val 69 | # 1 16401586 0.04582 - all 70 | 71 | Store(data.fm.02.pred) 72 | 73 | # fn.write.submission(data.fm.02.pred, "data.fm.02.pred") 74 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.fm.03.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("fm_03/build_fm") 9 | system(paste( 10 | "cd ../avito-context-click-py &&", 11 | "pypy -u convert_csv_to_libffm.py", 12 | "-input_files ../data/output-r/data.all.lr.csv", 13 | "-out_selector '{", 14 | "\"../data/output-libffm/fm_03/data.tr.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\"],", 15 | "\"../data/output-libffm/fm_03/data.tr.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 16 | "\"../data/output-libffm/fm_03/data.val.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\"],", 17 | "\"../data/output-libffm/fm_03/data.val.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 18 | "\"../data/output-libffm/fm_03/data.test.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 19 | "\"../data/output-libffm/fm_03/data.test.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 20 | "}'", 21 | "-col_out IsClick", 22 | "-col_in_cat", 23 | " SearchAdCount SearchAdT1Count SearchAdT2Count SearchAdT3Count ", 24 | " SearchCatID SearchLocID", 25 | " SearchParamsSZBin SearchQuerySZBin SearchRussian ", 26 | " Position", 27 | "-old_format", 28 | ">> ../data/log/fm_03/build_fm.log 2>&1")) 29 | fn.clean.worker() 30 | NULL 31 | } 32 | fn.kill.wk() 33 | 34 | ############################################################# 35 | # train phase 36 | ############################################################# 37 | fn.register.wk(1) 38 | data.fm.03.pred.tmp <- foreach(test.type=c("val", "tr", "test"), .combine=rbind, 39 | .noexport=all.noexport) %dopar% { 40 | 41 | log.name <- paste0("fm_03/fm_03_",test.type) 42 | fn.init.worker(log.name) 43 | 44 | system(paste( 45 | "../fm/fm", 46 | "-k 12 -t 3 -r 0.008 -s 12 -l 0.00001", 47 | paste0("../data/output-libffm/fm_03/data.",test.type,".tt.fm "), 48 | paste0("../data/output-libffm/fm_03/data.",test.type,".tr.fm "), 49 | " >> ", paste0("../data/log/",log.name,".log"), " 2>&1")) 50 | 51 | data.pred <- data.table( 52 | ID = data.all.lr.id[SearchType==test.type,ID], 53 | Pred = scan(paste0("../data/output-libffm/fm_03/data.",test.type,".tt.fm.out")) 54 | ) 55 | fn.print.err(data.pred) 56 | 57 | fn.clean.worker() 58 | data.pred 59 | } 60 | fn.kill.wk() 61 | 62 | data.fm.03.pred.tmp <- data.fm.03.pred.tmp[order(ID)] 63 | Store(data.fm.03.pred.tmp) 64 | 65 | data.fm.03.pred <- copy(data.fm.03.pred.tmp) 66 | 67 | fn.print.err(data.fm.03.pred) 68 | # Size Loss 69 | # 1 7888752 0.04507 - tr 70 | # 1 8512834 0.04713 - val 71 | # 1 16401586 0.04614 - all 72 | 73 | Store(data.fm.03.pred) 74 | 75 | # fn.write.submission(data.fm.03.pred, "data.fm.03.pred") 76 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.fm.04.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | 
############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("fm_04/build_fm") 9 | system(paste( 10 | "cd ../avito-context-click-py &&", 11 | "pypy -u convert_csv_to_libffm.py", 12 | "-input_files ../data/output-r/data.all.lr.csv", 13 | "-out_selector '{", 14 | "\"../data/output-libffm/fm_04/data.tr.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\"],", 15 | "\"../data/output-libffm/fm_04/data.tr.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 16 | "\"../data/output-libffm/fm_04/data.val.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\"],", 17 | "\"../data/output-libffm/fm_04/data.val.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 18 | "\"../data/output-libffm/fm_04/data.test.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 19 | "\"../data/output-libffm/fm_04/data.test.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 20 | "}'", 21 | "-col_out IsClick", 22 | "-col_in_cat", 23 | " UserAgentFamilyID UserAgentID UserAgentOSID UserDeviceID ", 24 | " UserPrevPhoneRequest ", 25 | " UserPrevPrevPrevQryDateBin UserPrevPrevQryDateBin ", 26 | " UserPrevQryDateBin UserPrevVisitReq UserPrevVisitReqUni ", 27 | " UserQryTotalTimeBin", 28 | " Position", 29 | "-old_format", 30 | ">> ../data/log/fm_04/build_fm.log 2>&1")) 31 | fn.clean.worker() 32 | NULL 33 | } 34 | fn.kill.wk() 35 | 36 | ############################################################# 37 | # train phase 38 | ############################################################# 39 | fn.register.wk(1) 40 | data.fm.04.pred.tmp <- foreach(test.type=c("val", "tr", "test"), .combine=rbind, 41 | .noexport=all.noexport) %dopar% { 42 | 43 | log.name <- paste0("fm_04/fm_04_",test.type) 44 | fn.init.worker(log.name) 45 | 46 | system(paste( 47 | "../fm/fm", 48 | "-k 12 -t 5 -r 0.004 -s 12 -l 0.00001", 49 | paste0("../data/output-libffm/fm_04/data.",test.type,".tt.fm "), 50 | paste0("../data/output-libffm/fm_04/data.",test.type,".tr.fm "), 51 | " >> ", paste0("../data/log/",log.name,".log"), " 2>&1")) 52 | 53 | data.pred <- data.table( 54 | ID = data.all.lr.id[SearchType==test.type,ID], 55 | Pred = scan(paste0("../data/output-libffm/fm_04/data.",test.type,".tt.fm.out")) 56 | ) 57 | fn.print.err(data.pred) 58 | 59 | fn.clean.worker() 60 | data.pred 61 | } 62 | fn.kill.wk() 63 | 64 | data.fm.04.pred.tmp <- data.fm.04.pred.tmp[order(ID)] 65 | Store(data.fm.04.pred.tmp) 66 | 67 | data.fm.04.pred <- copy(data.fm.04.pred.tmp) 68 | 69 | fn.print.err(data.fm.04.pred) 70 | # Size Loss 71 | # 1 7888752 0.04888 - tr 72 | # 1 8512834 0.05135 - val 73 | # 1 16401586 0.05017 - all 74 | 75 | Store(data.fm.04.pred) 76 | 77 | # fn.write.submission(data.fm.04.pred, "data.fm.04.pred") 78 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.fm.05.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("fm_05/build_fm") 9 | system(paste( 10 | "cd ../avito-context-click-py &&", 11 | "pypy -u convert_csv_to_libffm.py", 12 | "-input_files ../data/output-r/data.all.lr.csv", 13 | "-out_selector '{", 14 | 
"\"../data/output-libffm/fm_05/data.tr.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\"],", 15 | "\"../data/output-libffm/fm_05/data.tr.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 16 | "\"../data/output-libffm/fm_05/data.val.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\"],", 17 | "\"../data/output-libffm/fm_05/data.val.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 18 | "\"../data/output-libffm/fm_05/data.test.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 19 | "\"../data/output-libffm/fm_05/data.test.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 20 | "}'", 21 | "-col_out IsClick", 22 | "-col_in_cat", 23 | " CountAdSearchBin CountAdSearchCatBin CountAdSearchLocBin ", 24 | " CountAdUsersBin CountIPUserBin CountUserAdBin ", 25 | " CountUserAdDupT1Bin CountUserAdDupT3Bin CountUserAdT1Bin ", 26 | " CountUserAdT3Bin CountUserSearchBin CountUserSearchCategoryBin ", 27 | " CountUserSearchLocationBin ", 28 | " RatioAdPos1Bin RatioSearchRussBin ", 29 | " Position", 30 | "-old_format", 31 | ">> ../data/log/fm_05/build_fm.log 2>&1")) 32 | fn.clean.worker() 33 | NULL 34 | } 35 | fn.kill.wk() 36 | 37 | ############################################################# 38 | # train phase 39 | ############################################################# 40 | fn.register.wk(1) # 41 | data.fm.05.pred.tmp <- foreach(test.type=c("val" , "tr", "test"), .combine=rbind, 42 | .noexport=all.noexport) %dopar% { 43 | 44 | log.name <- paste0("fm_05/fm_05_",test.type) 45 | fn.init.worker(log.name) 46 | 47 | system(paste( 48 | "../fm/fm", 49 | "-k 12 -t 5 -r 0.004 -s 12 -l 0.00001", 50 | paste0("../data/output-libffm/fm_05/data.",test.type,".tt.fm "), 51 | paste0("../data/output-libffm/fm_05/data.",test.type,".tr.fm "), 52 | " >> ", paste0("../data/log/",log.name,".log"), " 2>&1")) 53 | 54 | data.pred <- data.table( 55 | ID = data.all.lr.id[SearchType==test.type,ID], 56 | Pred = scan(paste0("../data/output-libffm/fm_05/data.",test.type,".tt.fm.out")) 57 | ) 58 | fn.print.err(data.pred) 59 | 60 | fn.clean.worker() 61 | data.pred 62 | } 63 | fn.kill.wk() 64 | 65 | data.fm.05.pred.tmp <- data.fm.05.pred.tmp[order(ID)] 66 | Store(data.fm.05.pred.tmp) 67 | 68 | data.fm.05.pred <- copy(data.fm.05.pred.tmp) 69 | 70 | fn.print.err(data.fm.05.pred) 71 | # Size Loss 72 | # 1 7888752 0.04992 - tr 73 | # 1 8512834 0.04812 - val 74 | # 1 16401586 0.04905 - all 75 | 76 | Store(data.fm.05.pred) 77 | 78 | # fn.write.submission(data.fm.05.pred, "data.fm.05.pred") 79 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.ftrl.04.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("ftrl_04/build_csv") 9 | 10 | cat("\nLoading data...\n") 11 | data.all.cur <- data.all.lr 12 | # fn.soar.unload(data.all.lr) 13 | 14 | cols.extra <- c("ID", "SearchID", "SearchType", "IsClick", "SearchDate") 15 | cols.in <- setdiff(colnames(data.all.cur), cols.extra) 16 | 17 | # data.all.cur[, SearchType:=NULL] 18 | 19 | for (jx in 1:2) { 20 | fold.name <- paste0("ftrl_04_", jx) 21 | 22 | if (jx == 1) { 23 | setkeyv(data.all.cur, c("SearchDate", "SearchID", "Position")) 24 | } else { 25 | 
setkeyv(data.all.cur, c("Position", "SearchCatID", "SearchRussian", 26 | "SearchDate", "SearchID")) 27 | } 28 | data.search.type <- data.all.cur$SearchType 29 | for (test.type in c("tr", "val", "test")) { 30 | cat("\nTest type", test.type, jx, "...\n") 31 | 32 | data.fold <- fn.create.data.fold(fold.name, test.type) 33 | data.fold$writedir <- fn.py.file(data.fold$basename) 34 | dir.create(data.fold$writedir, showWarnings = F, recursive = T) 35 | 36 | data.fold$col.out <- "IsClick" 37 | data.fold$cols.in <- cols.in 38 | 39 | tr.type <- fn.lr.tr.type(test.type) 40 | cat("\ntr.type", paste(tr.type, collapse=", "), "...\n") 41 | 42 | data.fold$tr.idx <- which(data.search.type %in% unique(tr.type)) 43 | cat("\ntr.idx", length(data.fold$tr.idx), "...\n") 44 | 45 | data.fold$test.idx <- which(data.search.type %in% unique(test.type)) 46 | cat("\ntest.idx", length(data.fold$test.idx), "...\n") 47 | data.fold$test.pred.file <- fn.file.data.fold(data.fold, "eval.pred") 48 | 49 | data.all.cur[, IsTestRow:=0] 50 | data.all.cur[data.fold$test.idx, IsTestRow:=1] 51 | all.ix <- sort(unique(c(data.fold$tr.idx, data.fold$test.idx))) 52 | cat("\nall.ix", length(all.ix), "...\n") 53 | 54 | cat("\nSaving", test.type, jx, "csv...\n") 55 | data.fold$all.file <- fn.file.data.fold(data.fold, "all.csv") 56 | fn.write.csv.chunk( 57 | data=data.all.cur, subset=all.ix, 58 | file=data.fold$all.file, row.names = F, compress = F 59 | ) 60 | data.all.cur[, IsTestRow:=0] 61 | 62 | cat("\nSaving", test.type, "data.fold...\n") 63 | fn.save.data.fold(data.fold) 64 | } 65 | } 66 | NULL 67 | } 68 | fn.kill.wk() 69 | 70 | ############################################################# 71 | # train phase 72 | ############################################################# 73 | train.grid = expand.grid( 74 | test.type=c("tr", "val", "test"), 75 | jx=1:2, 76 | stringsAsFactors=F 77 | ) 78 | 79 | fn.register.wk(nrow(train.grid)) 80 | data.ftrl.04.pred.tmp <- foreach( 81 | r=1:nrow(train.grid), .combine=rbind, .noexport=all.noexport) %dopar% { 82 | 83 | test.type <- train.grid$test.type[r] 84 | jx <- train.grid$jx[r] 85 | 86 | fn.init.fold.worker(paste0("ftrl_04_", jx), test.type) 87 | # fn.clean.worker() 88 | 89 | epochs <- 1 90 | system(paste( 91 | "cd ../avito-context-click-py && pypy -u train_ftrl.py", 92 | "-train_file", data.fold$all.file, 93 | # "-train_model_file {TRAIN_FILE}.pklz", 94 | "-test_pred_file", data.fold$test.pred.file, 95 | "-test_pred_extra_cols ID -test_pred_col Pred", 96 | "-col_out IsClick", 97 | "-col_in_cat", 98 | " AdCatID AdHistCTRBin AdID AdParams AdPriceBin AdTitleSZBin ", 99 | " Position ", 100 | " SearchAdCount SearchAdT1Count SearchAdT2Count SearchAdT3Count ", 101 | " SearchCatID SearchLocID", 102 | " SearchParamsSZBin SearchQuerySZBin SearchRussian ", 103 | " UserAgentFamilyID UserAgentID UserAgentOSID UserDeviceID ", 104 | " UserID UserIPID UserPrevPhoneRequest ", 105 | " UserPrevPrevPrevQryDateBin UserPrevPrevQryDateBin ", 106 | " UserPrevQryDateBin UserPrevVisitReq UserPrevVisitReqUni ", 107 | "-train_is_test_col IsTestRow", 108 | "-bits 27 -alpha 0.07 -beta 1.0 -l1 0.01 -l2 1. 
-dropout 0", 109 | "-two_way 'Ad Us' 'Us Search' 'Ad Search' 'Ad Pos' 'Us Pos' 'Pos Search'", 110 | "-seed 7 -epochs", epochs, 111 | # "-load_model", 112 | " >> ", paste0("../data/log/", data.fold$logname, ".log"), " 2>&1" 113 | )) 114 | 115 | data.fold$test.pred <- fread(paste(data.fold$test.pred.file, epochs, sep=".")) 116 | fn.print.err(data.fold$test.pred) 117 | 118 | fn.clean.worker() 119 | 120 | data.fold$test.pred 121 | } 122 | fn.kill.wk() 123 | 124 | data.ftrl.04.pred.tmp <- data.ftrl.04.pred.tmp[order(ID)] 125 | Store(data.ftrl.04.pred.tmp) 126 | 127 | data.ftrl.04.pred <- data.ftrl.04.pred.tmp[ 128 | ,list( 129 | Pred=sum(Pred)/.N 130 | ), by="ID" 131 | ] 132 | 133 | fn.print.err(data.ftrl.04.pred) 134 | # Size Loss 135 | # 1 7888752 0.04148 - tr 136 | # 1 8512834 0.04335 - val 137 | # 1 16401586 0.04245 - all 138 | 139 | 140 | Store(data.ftrl.04.pred) 141 | # 142 | # # fn.write.submission(data.ftrl.04.pred, "data.ftrl.04.pred") 143 | 144 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.ftrl.05.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("ftrl_05/build_csv") 9 | 10 | cat("\nLoading data...\n") 11 | data.all.cur <- data.all.lr 12 | # fn.soar.unload(data.all.lr) 13 | 14 | cols.extra <- c("ID", "SearchID", "SearchType", "IsClick", "SearchDate") 15 | cols.in <- setdiff(colnames(data.all.cur), cols.extra) 16 | 17 | # data.all.cur[, SearchType:=NULL] 18 | 19 | for (jx in 1) { # :2 20 | fold.name <- paste0("ftrl_05_", jx) 21 | 22 | if (jx == 1) { 23 | setkeyv(data.all.cur, c("SearchDate", "SearchID", "Position")) 24 | } else { 25 | setkeyv(data.all.cur, c("Position", "SearchCatID", "SearchRussian", 26 | "SearchDate", "SearchID")) 27 | } 28 | data.search.type <- data.all.cur$SearchType 29 | for (test.type in c("val", "test")) { # "tr", 30 | cat("\nTest type", test.type, jx, "...\n") 31 | 32 | data.fold <- fn.create.data.fold(fold.name, test.type) 33 | data.fold$writedir <- fn.py.file(data.fold$basename) 34 | dir.create(data.fold$writedir, showWarnings = F, recursive = T) 35 | 36 | data.fold$col.out <- "IsClick" 37 | data.fold$cols.in <- cols.in 38 | 39 | tr.type <- fn.lr.tr.type(test.type) 40 | cat("\ntr.type", paste(tr.type, collapse=", "), "...\n") 41 | 42 | data.fold$tr.idx <- which(data.search.type %in% unique(tr.type)) 43 | cat("\ntr.idx", length(data.fold$tr.idx), "...\n") 44 | 45 | data.fold$test.idx <- which(data.search.type %in% unique(test.type)) 46 | cat("\ntest.idx", length(data.fold$test.idx), "...\n") 47 | data.fold$test.pred.file <- fn.file.data.fold(data.fold, "eval.pred") 48 | 49 | data.all.cur[, IsTestRow:=0] 50 | data.all.cur[data.fold$test.idx, IsTestRow:=1] 51 | all.ix <- sort(unique(c(data.fold$tr.idx, data.fold$test.idx))) 52 | cat("\nall.ix", length(all.ix), "...\n") 53 | 54 | cat("\nSaving", test.type, jx, "csv...\n") 55 | data.fold$all.file <- fn.file.data.fold(data.fold, "all.csv") 56 | fn.write.csv.chunk( 57 | data=data.all.cur, subset=all.ix, 58 | file=data.fold$all.file, row.names = F, compress = F 59 | ) 60 | data.all.cur[, IsTestRow:=0] 61 | 62 | cat("\nSaving", test.type, "data.fold...\n") 63 | fn.save.data.fold(data.fold) 64 | } 65 | } 66 | NULL 67 | } 68 | fn.kill.wk() 69 | 70 | 
############################################################# 71 | # train phase 72 | ############################################################# 73 | train.grid = expand.grid( 74 | test.type=c("tr", "val", "test"), 75 | jx=1:2, 76 | stringsAsFactors=F 77 | ) 78 | 79 | fn.register.wk(nrow(train.grid)) 80 | data.ftrl.05.pred.tmp <- foreach( 81 | r=1:nrow(train.grid), .combine=rbind, .noexport=all.noexport) %dopar% { 82 | 83 | test.type <- train.grid$test.type[r] 84 | jx <- train.grid$jx[r] 85 | 86 | fn.init.fold.worker(paste0("ftrl_05_", jx), test.type) 87 | # fn.clean.worker() 88 | 89 | epochs <- 2 90 | system(paste( 91 | "cd ../avito-context-click-py && pypy -u train_ftrl.py", 92 | "-train_file", data.fold$all.file, 93 | # "-train_model_file {TRAIN_FILE}.pklz", 94 | "-test_pred_file", data.fold$test.pred.file, 95 | "-test_pred_extra_cols ID -test_pred_col Pred", 96 | "-col_out IsClick", 97 | "-col_in_cat", 98 | " AdCatID AdHistCTRBin AdID AdParams AdPriceBin AdTitleSZBin ", 99 | " CountAdSearchBin CountAdSearchCatBin CountAdSearchLocBin ", 100 | " CountAdUsersBin CountIPUserBin CountUserAdBin ", 101 | " CountUserAdDupT1Bin CountUserAdDupT3Bin CountUserAdT1Bin ", 102 | " CountUserAdT3Bin CountUserSearchBin CountUserSearchCategoryBin ", 103 | " CountUserSearchLocationBin ", 104 | " Position ", 105 | " RatioAdPos1Bin RatioSearchRussBin ", 106 | " SearchAdCount SearchAdT1Count SearchAdT2Count SearchAdT3Count ", 107 | " SearchCatID SearchLocID SearchOrdUsrAsc SearchOrdUsrDesc ", 108 | " SearchParamsSZBin SearchQuerySZBin SearchRussian ", 109 | " UserAgentFamilyID UserAgentID UserAgentOSID UserDeviceID ", 110 | " UserID UserIPID UserLogged UserPrevPhoneRequest ", 111 | " UserPrevPrevPrevQryDateBin UserPrevPrevQryDateBin ", 112 | " UserPrevQryDateBin UserPrevVisitReq UserPrevVisitReqUni ", 113 | " UserQryTotalTimeBin", 114 | "-train_is_test_col IsTestRow", 115 | "-bits 27 -alpha .008 -beta .1 -l1 0.1 -l2 0.15 -dropout 0", 116 | "-two_way 'Ad Us' 'Us Search' 'Ad Search' 'Ad Pos' 'Us Pos' 'Pos Search'", 117 | "-seed 7 -epochs", epochs, 118 | # "-load_model", 119 | " >> ", paste0("../data/log/", data.fold$logname, ".log"), " 2>&1" 120 | )) 121 | 122 | data.fold$test.pred <- fread(paste(data.fold$test.pred.file, epochs, sep=".")) 123 | fn.print.err(data.fold$test.pred) 124 | 125 | fn.clean.worker() 126 | 127 | data.fold$test.pred 128 | } 129 | fn.kill.wk() 130 | 131 | data.ftrl.05.pred.tmp <- data.ftrl.05.pred.tmp[order(ID)] 132 | Store(data.ftrl.05.pred.tmp) 133 | 134 | data.ftrl.05.pred <- data.ftrl.05.pred.tmp[ 135 | ,list( 136 | Pred=sum(Pred)/.N 137 | ), by="ID" 138 | ] 139 | 140 | fn.print.err(data.ftrl.05.pred) 141 | # Size Loss 142 | # 1 7888752 0.04122 - tr 143 | # 1 8512834 0.04314 - val 144 | # 1 16401586 0.04222 - all 145 | 146 | Store(data.ftrl.05.pred) 147 | # 148 | # # fn.write.submission(data.ftrl.05.pred, "data.ftrl.05.pred") 149 | 150 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.ftrl.06.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("ftrl_06/build_csv") 9 | 10 | cat("\nLoading data...\n") 11 | data.all.cur <- data.all.lr 12 | # fn.soar.unload(data.all.lr) 13 | 14 | cols.extra <- c("ID", "SearchID", 
"SearchType", "IsClick", "SearchDate") 15 | cols.in <- setdiff(colnames(data.all.cur), cols.extra) 16 | 17 | # data.all.cur[, SearchType:=NULL] 18 | 19 | for (jx in 1:2) { 20 | fold.name <- paste0("ftrl_06_", jx) 21 | 22 | if (jx == 1) { 23 | setkeyv(data.all.cur, c("SearchDate", "SearchID", "Position")) 24 | } else { 25 | setkeyv(data.all.cur, c("Position", "SearchCatID", "SearchRussian", 26 | "SearchDate", "SearchID")) 27 | } 28 | data.search.type <- data.all.cur$SearchType 29 | for (test.type in c("tr", "val", "test")) { 30 | cat("\nTest type", test.type, jx, "...\n") 31 | 32 | data.fold <- fn.create.data.fold(fold.name, test.type) 33 | data.fold$writedir <- fn.py.file(data.fold$basename) 34 | dir.create(data.fold$writedir, showWarnings = F, recursive = T) 35 | 36 | data.fold$col.out <- "IsClick" 37 | data.fold$cols.in <- cols.in 38 | 39 | tr.type <- fn.lr.tr.type(test.type) 40 | cat("\ntr.type", paste(tr.type, collapse=", "), "...\n") 41 | 42 | data.fold$tr.idx <- which(data.search.type %in% unique(tr.type)) 43 | cat("\ntr.idx", length(data.fold$tr.idx), "...\n") 44 | 45 | data.fold$test.idx <- which(data.search.type %in% unique(test.type)) 46 | cat("\ntest.idx", length(data.fold$test.idx), "...\n") 47 | data.fold$test.pred.file <- fn.file.data.fold(data.fold, "eval.pred") 48 | 49 | data.all.cur[, IsTestRow:=0] 50 | data.all.cur[data.fold$test.idx, IsTestRow:=1] 51 | all.ix <- sort(unique(c(data.fold$tr.idx, data.fold$test.idx))) 52 | cat("\nall.ix", length(all.ix), "...\n") 53 | 54 | cat("\nSaving", test.type, jx, "csv...\n") 55 | data.fold$all.file <- fn.file.data.fold(data.fold, "all.csv") 56 | fn.write.csv.chunk( 57 | data=data.all.cur, subset=all.ix, 58 | file=data.fold$all.file, row.names = F, compress = F 59 | ) 60 | data.all.cur[, IsTestRow:=0] 61 | 62 | cat("\nSaving", test.type, "data.fold...\n") 63 | fn.save.data.fold(data.fold) 64 | } 65 | } 66 | NULL 67 | } 68 | fn.kill.wk() 69 | 70 | ############################################################# 71 | # train phase 72 | ############################################################# 73 | train.grid = expand.grid( 74 | test.type=c("tr", "val", "test"), 75 | jx=1:2, 76 | stringsAsFactors=F 77 | ) 78 | 79 | fn.register.wk(nrow(train.grid)) 80 | data.ftrl.06.pred.tmp <- foreach( 81 | r=1:nrow(train.grid), .combine=rbind, .noexport=all.noexport) %dopar% { 82 | 83 | test.type <- train.grid$test.type[r] 84 | jx <- train.grid$jx[r] 85 | 86 | fn.init.fold.worker(paste0("ftrl_06_", jx), test.type) 87 | # fn.clean.worker() 88 | 89 | epochs <- 2 90 | system(paste( 91 | "cd ../avito-context-click-py && pypy -u train_ftrl.py", 92 | "-train_file", data.fold$all.file, 93 | "-test_pred_file", data.fold$test.pred.file, 94 | "-test_pred_extra_cols ID -test_pred_col Pred", 95 | "-col_out IsClick", 96 | "-col_in_cat", 97 | " AdCatID AdHistCTRBin AdID AdParams AdPriceBin AdTitleSZBin ", 98 | " CountAdSearchBin CountAdSearchCatBin CountAdSearchLocBin ", 99 | " CountAdUsersBin CountIPUserBin CountUserAdBin ", 100 | " CountUserAdDupT1Bin CountUserAdDupT3Bin CountUserAdT1Bin ", 101 | " CountUserAdT3Bin CountUserSearchBin CountUserSearchCategoryBin ", 102 | " CountUserSearchLocationBin ", 103 | " Position ", 104 | " RatioAdPos1Bin RatioSearchRussBin ", 105 | " SearchAdCount SearchAdT1Count SearchAdT2Count SearchAdT3Count ", 106 | " SearchCatID SearchLocID SearchOrdUsrAsc SearchOrdUsrDesc ", 107 | " SearchParamsSZBin SearchQuerySZBin SearchRussian ", 108 | " UserAgentFamilyID UserAgentID UserAgentOSID UserDeviceID ", 109 | " UserID UserIPID UserLogged 
UserPrevPhoneRequest ", 110 | " UserPrevPrevPrevQryDateBin UserPrevPrevQryDateBin ", 111 | " UserPrevQryDateBin UserPrevVisitReq UserPrevVisitReqUni ", 112 | " UserQryTotalTimeBin", 113 | "-train_is_test_col IsTestRow", 114 | "-bits 27 -alpha 0.07 -beta 1.0 -l1 0.01 -l2 1. -dropout 0", 115 | "-two_way 'AdID SearchCatID' 'AdID UserID' 'AdCatID SearchCatID'", 116 | " 'AdID SearchLocID' 'SearchCatID UserID' 'AdCatID UserID'", 117 | " 'SearchLocID UserID' 'AdID Pos' 'AdCatID Pos' 'SearchCatID Pos'", 118 | " 'SearchLocID Pos' 'UserID Pos' 'SearchRussian Pos'", 119 | " 'SearchAdT1 AdID' 'SearchAdT1 AdCatID' 'SearchAdT1 Pos'", 120 | " 'AdID UserAgentOSID' 'AdID UserAgentFamilyID' 'AdCatID AdPriceBin'", 121 | " 'AdPriceBin UserID' ", 122 | "-seed 7 -epochs", epochs, 123 | # "-load_model", 124 | " >> ", paste0("../data/log/", data.fold$logname, ".log"), " 2>&1" 125 | )) 126 | 127 | data.fold$test.pred <- fread(paste(data.fold$test.pred.file, epochs, sep=".")) 128 | fn.print.err(data.fold$test.pred) 129 | 130 | fn.clean.worker() 131 | 132 | data.fold$test.pred 133 | } 134 | fn.kill.wk() 135 | 136 | data.ftrl.06.pred.tmp <- data.ftrl.06.pred.tmp[order(ID)] 137 | Store(data.ftrl.06.pred.tmp) 138 | 139 | data.ftrl.06.pred <- data.ftrl.06.pred.tmp[ 140 | ,list( 141 | Pred=sum(Pred)/.N 142 | ), by="ID" 143 | ] 144 | 145 | fn.print.err(data.ftrl.06.pred) 146 | # Size Loss 147 | # 1 7888752 0.04163 - tr 148 | # 1 8512834 0.04359 - val 149 | # 1 16401586 0.04265 - all 150 | 151 | Store(data.ftrl.06.pred) 152 | # 153 | # # fn.write.submission(data.ftrl.06.pred, "data.ftrl.06.pred") 154 | 155 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.xgb.03.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # train model 3 | ############################################################# 4 | 5 | fn.register.wk(1, seed=5471887) # 5471887 6 | data.xgb.03.pred.tmp <- foreach( 7 | test.type=c("test"), .combine=rbind, .noexport=all.noexport) %dopar% { 8 | 9 | fn.init.new.fold.worker("xgb_03", paste0(test.type)) 10 | # fn.clean.worker() 11 | 12 | 13 | tr.type <- fn.lr.tr.type(test.type) 14 | # tr.type <- fn.tree.tr.type(test.type) 15 | 16 | data.fold$tr.idx <- which(data.all.tree$SearchType %in% tr.type) 17 | data.fold$test.idx <- which(!data.all.tree$SearchType %in% tr.type) 18 | data.fold$val.idx <- data.fold$test.idx[!is.na( 19 | data.all.tree$IsClick[data.fold$test.idx])] 20 | 21 | cols.extra <- c("ID", "SearchID", "SearchType", "IsClick") 22 | cols.in <- sort(setdiff(colnames(data.all.tree), 23 | c(cols.extra))) 24 | 25 | cat("\n\nTr size:", length(data.fold$tr.idx), 26 | ", Val size:", length(data.fold$val.idx), 27 | ", Test size:", length(data.fold$test.idx), 28 | "...\n") 29 | 30 | data.tr <- fn.xgb.matrix( 31 | data=data.all.tree, subset=data.fold$tr.idx, col.in=cols.in) 32 | 33 | if (length(data.fold$val.idx) > 0) { 34 | data.val <- fn.xgb.matrix( 35 | data=data.all.tree, subset=data.fold$val.idx, col.in=cols.in) 36 | data.watch = list(val=data.val) 37 | } else { 38 | data.watch = list(tr=data.tr) 39 | } 40 | 41 | 42 | data.test <- fn.xgb.matrix( 43 | data=data.all.tree, subset=data.fold$test.idx, col.in=cols.in) 44 | 45 | data.fold$test.pred <- data.table( 46 | ID = data.all.tree$ID[data.fold$test.idx], 47 | Pred = 0.0, 48 | n = 0 49 | ) 50 | 51 | avg.ix <- which(data.fold$test.pred$ID > 0) 52 | if (length(avg.ix) == 0) { 53 | avg.ix <- 
1:nrow(data.fold$test.pred) 54 | } 55 | # print(length(avg.ix)) 56 | 57 | fn.soar.unload(data.all.tree) 58 | 59 | data.fold$params = list( 60 | objective = "binary:logistic", 61 | eval_metric = "logloss", 62 | nthread = 12, 63 | eta = 0.2, 64 | max_depth = 10, 65 | gamma = 0.8, 66 | colsample_bytree = 0.7, 67 | colsample_bylevel = 0.8 68 | ) 69 | 70 | data.fold$nrounds <- 75 71 | 72 | cat("\nParams:\n") 73 | print(data.fold$params) 74 | 75 | n.models <- 20 76 | for (ix in 1:n.models) { 77 | 78 | cat("\n\nTraining ", ix, "...\n") 79 | 80 | set.seed(ix + 89475560) 81 | 82 | suppressMessages(library("xgboost")) 83 | model = xgb.train( 84 | data = data.tr, 85 | watchlist=data.watch, 86 | params = data.fold$params, 87 | nrounds = data.fold$nrounds, 88 | verbose = 1) 89 | 90 | 91 | ntreelimit <- data.fold$nrounds 92 | try.pred <- T 93 | 94 | while (try.pred) { 95 | pred.cur <- predict(model, data.test, ntreelimit=ntreelimit) 96 | pred.cur.avg <- mean(pred.cur[avg.ix]) 97 | 98 | cat("\nCurrent prediction avg of", length(avg.ix), 99 | "instances:", pred.cur.avg, "\n") 100 | if (test.type == "val" || 101 | (pred.cur.avg >= 0.008 && pred.cur.avg <= 0.012)) { 102 | 103 | try.pred <- F 104 | # data.fold$test.pred[ , Pred := (Pred*n + pred.cur)/(n+1)] 105 | # data.fold$test.pred[, n := n+1] 106 | 107 | data.fold$test.pred[ , Pred := ((Pred^n)*pred.cur)^(1/(n+1))] 108 | data.fold$test.pred[, n := n+1] 109 | 110 | fn.save.data.fold(data.fold) 111 | cat("\nPrediction with", ntreelimit ,"trees included\n") 112 | } else { 113 | cat("\nPrediction with", ntreelimit ,"trees discarded\n") 114 | ntreelimit <- ntreelimit - 5 115 | try.pred <- ntreelimit >= 60 116 | } 117 | } 118 | 119 | cat("\nPrediction status using", data.fold$test.pred$n[1], "iteration(s) :\n") 120 | fn.print.err(data.fold$test.pred) 121 | 122 | set.seed(Sys.time()) 123 | rm(pred.cur, pred.cur.avg) 124 | invisible(gc()) 125 | 126 | } 127 | 128 | # data.fold$importance <- xgb.importance( 129 | # feature_names=cols.in, model=model) 130 | # 131 | # cat("\n\nFeature importance:\n") 132 | # print(data.fold$importance) 133 | 134 | fn.clean.worker() 135 | 136 | data.fold$test.pred 137 | 138 | } 139 | fn.kill.wk() 140 | 141 | data.xgb.03.pred.tmp <- data.xgb.03.pred.tmp[ 142 | order(ID),list(Pred=mean(Pred)), by="ID"] 143 | Store(data.xgb.03.pred.tmp) 144 | 145 | data.xgb.03.pred <- copy(data.xgb.03.pred.tmp) 146 | 147 | ############################################################# 148 | # save data 149 | ############################################################# 150 | 151 | fn.print.err(data.xgb.03.pred) 152 | # Size Loss 153 | # 1 8512834 0.04256 154 | 155 | Store(data.xgb.03.pred) # 0.04086 156 | 157 | cat('Test avg:', mean(data.xgb.03.pred[ID > 0]$Pred), "\n") 158 | # Test avg: 0.008966203 159 | 160 | # fn.write.submission(data.xgb.03.pred, "data.xgb.03.pred") 161 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.xgb.05.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # train model 3 | ############################################################# 4 | 5 | wk.seed <- 5471887 6 | fn.register.wk(1, seed=wk.seed) 7 | data.l1.xgb.05.pred.tmp <- foreach( 8 | test.type=c("test"), .combine=rbind, .noexport=all.noexport) %dopar% { 9 | 10 | fn.init.new.fold.worker("l1_xgb_05", paste0(test.type, "2")) 11 | # fn.clean.worker() 12 | 13 | cat("\n\nSeed:", wk.seed, "...\n") 14 | 15 | 16 | 
data.tr <- xgb.DMatrix(fn.libsvm.file( 17 | paste0("data.", test.type, ".tr.full.libsvm"))) 18 | data.test <- xgb.DMatrix(fn.libsvm.file( 19 | paste0("data.", test.type, ".tt.full.libsvm"))) 20 | 21 | eval_metric = "logloss" 22 | data.watch = list(val=data.test) 23 | if (test.type == "val") { 24 | eval_metric <- function(preds, dtrain) { 25 | labels <- as.numeric(getinfo(dtrain, "label")) 26 | preds <- as.numeric(preds) 27 | err <- round(fn.log.loss(actual=labels, pred=preds), digits=5) 28 | return(list(metric = "logloss", value = err)) 29 | } 30 | } 31 | 32 | data.fold$test.pred <- data.table( 33 | ID = sort(data.all.search.small[SearchType == test.type, ID]), 34 | Pred = 0.0, 35 | n = 0 36 | ) 37 | 38 | fn.soar.unload(data.all.search.small) 39 | 40 | # Num rounds 66 41 | # Eta 0,5 42 | # Maxdepth 10 43 | # Colsample 0,375 44 | # Minchildweight 10 45 | 46 | data.fold$params = list( 47 | objective = "binary:logistic", 48 | eval_metric = eval_metric, 49 | nthread = 6, 50 | eta = 0.18, 51 | max_depth = 10, 52 | gamma = 0.8, 53 | colsample_bytree = 0.7, 54 | min_child_weight = 5, 55 | colsample_bylevel = 0.8 56 | ) 57 | 58 | data.fold$nrounds <- 75 59 | 60 | cat("\nParams:\n") 61 | print(data.fold$params) 62 | 63 | n.models <- 10 64 | for (ix in 1:n.models) { 65 | 66 | cat("\n\nTraining ", ix, "of", n.models,"...\n") 67 | 68 | set.seed(ix + 89475560) 69 | 70 | model = xgb.train( 71 | data = data.tr, 72 | watchlist=data.watch, 73 | params = data.fold$params, 74 | nrounds = data.fold$nrounds, 75 | verbose = 1) 76 | 77 | 78 | ntreelimit <- data.fold$nrounds 79 | try.pred <- T 80 | 81 | while (try.pred) { 82 | pred.cur <- xgboost::predict(model, data.test, ntreelimit=ntreelimit) 83 | pred.cur.avg <- mean(pred.cur) 84 | 85 | cat("\nCurrent prediction avg of", length(pred.cur), 86 | "instances:", pred.cur.avg, "\n") 87 | if (test.type == "val" || 88 | (pred.cur.avg >= 0.006 && pred.cur.avg <= 0.016)) { 89 | 90 | try.pred <- F 91 | data.fold$test.pred[ , Pred := (Pred*n + pred.cur)/(n+1)] 92 | data.fold$test.pred[, n := n+1] 93 | 94 | 95 | fn.save.data.fold(data.fold) 96 | cat("\nPrediction with", ntreelimit ,"trees included\n") 97 | } else { 98 | cat("\nPrediction with", ntreelimit ,"trees discarded\n") 99 | ntreelimit <- ntreelimit - 5 100 | try.pred <- ntreelimit >= 60 101 | } 102 | } 103 | 104 | cat("\nPrediction status using", data.fold$test.pred$n[1], "iteration(s) :\n") 105 | fn.print.err(data.fold$test.pred) 106 | 107 | set.seed(Sys.time()) 108 | rm(pred.cur, pred.cur.avg) 109 | invisible(gc()) 110 | 111 | } 112 | 113 | # cat("\n\nFeature importance:\n") 114 | # data.fold$importance <- xgb.importance( 115 | # feature_names=cols.in.combine, model=model) 116 | # print(data.fold$importance) 117 | 118 | fn.clean.worker() 119 | 120 | data.fold$test.pred 121 | 122 | } 123 | fn.kill.wk() 124 | 125 | data.l1.xgb.05.pred.tmp <- data.l1.xgb.05.pred.tmp[order(ID)] 126 | Store(data.l1.xgb.05.pred.tmp) 127 | 128 | for (ix in "2") { 129 | test.type <- "test" 130 | cat("\nLoading",test.type, ix,"...\n") 131 | 132 | fn.init.fold.worker("l1_xgb_05", paste0(test.type, ix), no.log=T) 133 | pred.nam <- paste("data.l1.xgb.05.pred", test.type, ix, sep=".") 134 | assign(pred.nam, data.fold$test.pred[order(ID)]) 135 | cat("Saving",pred.nam,"...\n") 136 | Store(list=pred.nam) 137 | } 138 | 139 | data.l1.xgb.05.pred.tmp <- rbind( 140 | data.l1.xgb.05.pred.test.2 141 | )[order(ID), list(Pred=sum(Pred*n)/sum(n)), by="ID"] 142 | 143 | 144 | data.l1.xgb.05.pred <- copy(data.l1.xgb.05.pred.tmp) 145 | 146 | 
############################################################# 147 | # save data 148 | ############################################################# 149 | 150 | # fn.print.err(data.l1.xgb.05.pred) 151 | 152 | 153 | Store(data.l1.xgb.05.pred) # 0.04076 154 | 155 | cat('Test avg:', mean(data.l1.xgb.05.pred[ID > 0]$Pred), "\n") 156 | # Test avg: 0.007848369 157 | 158 | # fn.write.submission(data.l1.xgb.05.pred, "data.l1.xgb.05.pred") 159 | 160 | 161 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l2.xgb.02.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # train model 3 | ############################################################# 4 | 5 | wk.seed <- 5471887 + 19117 6 | fn.register.wk(1, seed=wk.seed) 7 | data.l2.xgb.02.pred.tmp <- foreach( 8 | test.type=c("test"), .combine=rbind, .noexport=all.noexport) %dopar% { 9 | 10 | fn.init.new.fold.worker("l2_xgb_02", paste0(test.type, "1b")) 11 | # fn.clean.worker() 12 | 13 | cat("\n\nSeed:", wk.seed, "...\n") 14 | 15 | data.tr <- xgb.DMatrix(fn.libsvm.file(paste0("data.", test.type, ".tr.libsvm"))) 16 | data.test <- xgb.DMatrix(fn.libsvm.file(paste0("data.", test.type, ".tt.libsvm"))) 17 | 18 | eval_metric = "logloss" 19 | data.watch = list(val=data.test) 20 | if (test.type == "val") { 21 | eval_metric <- function(preds, dtrain) { 22 | labels <- as.numeric(getinfo(dtrain, "label")) 23 | preds <- as.numeric(preds) 24 | err <- round(fn.log.loss(actual=labels, pred=preds), digits=5) 25 | return(list(metric = "logloss", value = err)) 26 | } 27 | } 28 | 29 | data.fold$test.pred <- data.table( 30 | ID = sort(data.all.search.small[SearchType == test.type, ID]), 31 | Pred = 0.0, 32 | n = 0 33 | ) 34 | 35 | fn.soar.unload(data.all.search.small) 36 | 37 | 38 | data.fold$params = list( 39 | objective = "binary:logistic", 40 | eval_metric = eval_metric, 41 | nthread = 6, 42 | eta = 0.18, 43 | max_depth = 10, 44 | gamma = 0.8, 45 | colsample_bytree = 0.7, 46 | colsample_bylevel = 0.8 47 | ) 48 | 49 | data.fold$nrounds <- 75 50 | 51 | cat("\nParams:\n") 52 | print(data.fold$params) 53 | 54 | n.models <- 20 55 | for (ix in 1:n.models) { 56 | 57 | cat("\n\nTraining ", ix, "of", n.models,"...\n") 58 | 59 | set.seed(ix + 89475560) 60 | 61 | model = xgb.train( 62 | data = data.tr, 63 | watchlist=data.watch, 64 | params = data.fold$params, 65 | nrounds = data.fold$nrounds, 66 | verbose = 1) 67 | 68 | 69 | ntreelimit <- data.fold$nrounds 70 | try.pred <- T 71 | 72 | while (try.pred) { 73 | pred.cur <- xgboost::predict(model, data.test, ntreelimit=ntreelimit) 74 | pred.cur.avg <- mean(pred.cur) 75 | 76 | cat("\nCurrent prediction avg of", length(pred.cur), 77 | "instances:", pred.cur.avg, "\n") 78 | if (test.type == "val" || 79 | (pred.cur.avg >= 0.006 && pred.cur.avg <= 0.012)) { 80 | 81 | try.pred <- F 82 | data.fold$test.pred[ , Pred := (Pred*n + pred.cur)/(n+1)] 83 | data.fold$test.pred[, n := n+1] 84 | 85 | 86 | fn.save.data.fold(data.fold) 87 | cat("\nPrediction with", ntreelimit ,"trees included\n") 88 | } else { 89 | cat("\nPrediction with", ntreelimit ,"trees discarded\n") 90 | ntreelimit <- ntreelimit - 5 91 | try.pred <- ntreelimit >= 60 92 | } 93 | } 94 | 95 | cat("\nPrediction status using", data.fold$test.pred$n[1], "iteration(s) :\n") 96 | fn.print.err(data.fold$test.pred) 97 | 98 | set.seed(Sys.time()) 99 | rm(pred.cur, pred.cur.avg) 100 | invisible(gc()) 101 | 102 | } 103 | 104 | # 
cat("\n\nFeature importance:\n") 105 | # data.fold$importance <- xgb.importance( 106 | # feature_names=cols.in.combine, model=model) 107 | # print(data.fold$importance) 108 | 109 | fn.clean.worker() 110 | 111 | data.fold$test.pred 112 | 113 | } 114 | fn.kill.wk() 115 | 116 | data.l2.xgb.02.pred.tmp <- data.l2.xgb.02.pred.tmp[order(ID)] 117 | Store(data.l2.xgb.02.pred.tmp) 118 | 119 | data.l2.xgb.02.pred <- copy(data.l2.xgb.02.pred.tmp) 120 | 121 | ############################################################# 122 | # save data 123 | ############################################################# 124 | 125 | fn.print.err(data.l2.xgb.02.pred) 126 | # Size Loss 127 | # 1 8512834 0.04155 128 | 129 | Store(data.l2.xgb.02.pred) # 0.04043 130 | 131 | cat('Test avg:', mean(data.l2.xgb.02.pred[ID > 0]$Pred), "\n") 132 | # Test avg: 0.008131078 133 | 134 | # fn.write.submission(data.l2.xgb.02.pred, "data.l2.xgb.02.pred") 135 | 136 | data.l2.xgb.02.pred.calib <- fn.calibrate.prob.wk(data.l2.xgb.02.pred) 137 | fn.print.err(data.l2.xgb.02.pred.calib) 138 | 139 | Store(data.l2.xgb.02.pred.calib) 140 | 141 | -------------------------------------------------------------------------------- /avito-context-click-r/train.xgb.dtry.R: -------------------------------------------------------------------------------- 1 | load(fn.rdata.file("data.reduced.all.RData")) 2 | flist <- setdiff(colnames(data.reduced.all), c("SearchType", "SearchDayYear")) 3 | 4 | write.table(as.data.frame(data.reduced.all[SearchType==1][,flist,with=F]), 5 | file = fn.out.file("train.xgb.csv"), 6 | quote = F, 7 | sep = ",", 8 | row.names = F, 9 | col.names = T) 10 | 11 | write.table(as.data.frame(data.reduced.all[SearchType==2][,flist,with=F]), 12 | file = fn.out.file("val.xgb.csv"), 13 | quote = F, 14 | sep = ",", 15 | row.names = F, 16 | col.names = T) 17 | 18 | write.table(as.data.frame(data.reduced.all[SearchType==3][,flist,with=F]), 19 | file = fn.out.file("test.xgb.csv"), 20 | quote = F, 21 | sep = ",", 22 | row.names = F, 23 | col.names = T) 24 | 25 | write.table(as.data.frame(data.reduced.all[SearchType==1 | SearchType==2][,flist,with=F]), 26 | file = fn.out.file("train.val.xgb.csv"), 27 | quote = F, 28 | sep = ",", 29 | row.names = F, 30 | col.names = T) 31 | 32 | n <- nrow(data.reduced.all[SearchType==1]) 33 | set.seed(23243) 34 | ix.train <- sample(c(1:n), 0.2*n) 35 | n <- nrow(data.reduced.all[SearchType==2]) 36 | set.seed(102903) 37 | ix.val <- sample(c(1:n), 0.2*n) 38 | 39 | write.table(as.data.frame(data.reduced.all[SearchType==1][ix.train][,flist,with=F]), 40 | file = fn.out.file("train.part.xgb.csv"), 41 | quote = F, 42 | sep = ",", 43 | row.names = F, 44 | col.names = T) 45 | 46 | write.table(as.data.frame(data.reduced.all[SearchType==2][ix.val][,flist,with=F]), 47 | file = fn.out.file("val.part.xgb.csv"), 48 | quote = F, 49 | sep = ",", 50 | row.names = F, 51 | col.names = T) 52 | val.part.actual <- data.reduced.all[SearchType==2][ix.val][,list(ID,IsClick)] 53 | setkey(val.part.actual, ID) 54 | rm(data.reduced.all) 55 | gc() 56 | 57 | 58 | ### lb part ### 59 | system(paste("cd ../avito-context-click-py && python train_xgb_dtry.py", 60 | "--train", fn.out.file("train.val.xgb.csv"), 61 | "--test", fn.out.file("test.xgb.csv"), 62 | "--pred", fn.py.file("test.pred.xgb.csv"), 63 | "--epoch", 15, 64 | ">> ../data/log/xgb.dtry.log 2>&1")) 65 | 66 | test.pred <- list() 67 | for (i in c(0:14)) { 68 | test.pred[[length(test.pred)+1]] <- fread(fn.py.file(paste0("test.pred.xgb.epoch",i,".csv"))) 69 | 
cat(mean(test.pred[[length(test.pred)]]$IsClick), "...\n") 70 | } 71 | test.pred <- rbindlist(test.pred, use.names=T, fill=F) 72 | test.pred <- test.pred[,list(IsClick = round(mean(IsClick), 6)), by="ID"] 73 | 74 | write.table(as.data.frame(test.pred), 75 | file = fn.submission.file("dtry.xgb9__0.0409xx"), 76 | quote = F, 77 | sep = ",", 78 | row.names = F, 79 | col.names = T) 80 | 81 | 82 | ### cv part ### 83 | #system(paste("cd ../avito-context-click-py && python train_xgb_dtry.py", 84 | # "--train", fn.out.file("train.part.xgb.csv"), 85 | # "--test", fn.out.file("val.part.xgb.csv"), 86 | # "--pred", fn.py.file("val.part.pred.xgb.csv"), 87 | # "--epoch", 15, 88 | # ">> ../data/log/xgb.dtry.log 2>&1")) 89 | 90 | #val.part.pred <- list() 91 | #for (i in c(0:4)) { 92 | # val.part.pred[[length(val.part.pred)+1]] <- fread(fn.py.file(paste0("val.part.pred.xgb.epoch",i,".csv"))) 93 | # cat(mean(val.part.pred[[length(val.part.pred)]]$IsClick), "...\n") 94 | #} 95 | #val.part.pred <- rbindlist(val.part.pred, use.names=T, fill=F) 96 | #val.part.pred <- val.part.pred[,list(IsClick = round(mean(IsClick), 6)), by="ID"] 97 | #setkey(val.part.pred, ID) 98 | #fn.logloss(val.part.actual$IsClick, val.part.pred$IsClick) 99 | -------------------------------------------------------------------------------- /avito-context-click-r/train.zens.R: -------------------------------------------------------------------------------- 1 | ############################################################## 2 | ## csvs 3 | ############################################################## 4 | 5 | 6 | data.sub.dtry.0409xx <- fread(fn.submission.file( 7 | "dtry.xgb9__0.0409xx"))[order(ID)] 8 | 9 | data.l1.xgb.03.pred <- data.l1.xgb.03.pred[ID > 0] 10 | 11 | data.sub.ens.0404x <- merge(data.l1.xgb.03.pred, 12 | data.sub.dtry.0409xx, 13 | suffixes=c(".l",".d"), 14 | by="ID") 15 | setkey(data.sub.ens.0404x, ID) 16 | data.sub.ens.0404x[, IsClick := IsClick.l^0.6 * IsClick.d^0.4 ] 17 | 18 | data.l2.xgb.02.pred.val <- data.l2.xgb.02.pred[ID < 0] 19 | data.l2.xgb.02.pred <- data.l2.xgb.02.pred.calib[ID > 0] 20 | setkey(data.l2.xgb.02.pred, ID) # 0.04043 21 | 22 | 23 | data.l1.xgb.05.pred <- data.l1.xgb.05.pred[ID > 0] 24 | setkey(data.l1.xgb.05.pred, ID) # 0.04076 25 | 26 | 27 | 28 | data.sub.ens <- data.sub.ens.0404x[, list(ID)] 29 | 30 | data.sub.ens[, Pred.l2 := data.l2.xgb.02.pred$Pred] 31 | data.sub.ens[, Pred.Ens1 := 32 | data.sub.ens.0404x$IsClick^0.4 33 | * Pred.l2^0.6] 34 | data.sub.ens[, Pred.Ens2 := 35 | Pred.Ens1^0.9 36 | * data.l1.xgb.05.pred$Pred^0.1] 37 | 38 | data.sub.ens[, Pred := Pred.Ens2] 39 | data.sub.ens[, Pred := Pred.Ens2*1.1] 40 | 41 | cat('Test avg:', mean(data.sub.ens$Pred), "\n") 42 | # Test avg: 0.007941962 43 | 44 | fn.write.submission(data.sub.ens, "data.sub.ens", mean.adj=T) 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /data/input/empty.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/input/empty.csv -------------------------------------------------------------------------------- /data/log/data_build/empty.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/log/data_build/empty.log -------------------------------------------------------------------------------- 
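Note on train.zens.R above: the final submission blends model predictions with weighted geometric means, p1^w * p2^(1-w), rather than arithmetic averages. A small self-contained R sketch of that blend under toy data; the log.loss helper below is an assumed stand-in for the repo's fn.log.loss/fn.logloss (defined in _fn.base.R/_utils.R, not shown here), and the weights are illustrative:

log.loss <- function(actual, pred, eps=1e-15) {
  # clip predictions away from exact 0/1 so log() stays finite
  pred <- pmin(pmax(pred, eps), 1 - eps)
  -mean(actual * log(pred) + (1 - actual) * log(1 - pred))
}
geo.blend <- function(p1, p2, w) p1^w * p2^(1 - w)  # weighted geometric mean
set.seed(1)
actual <- rbinom(1000, 1, 0.01)  # clicks are rare, roughly 1% positives
p1 <- actual * 0.02 + runif(1000, 0, 0.02)  # two noisy toy "models"
p2 <- actual * 0.02 + runif(1000, 0, 0.02)
for (w in c(0, 0.4, 0.6, 1)) {
  cat("w =", w, " logloss =", log.loss(actual, geo.blend(p1, p2, w)), "\n")
}

By the AM-GM inequality a geometric mean of probabilities is never above the arithmetic mean, so this blend systematically lowers the predicted CTR; that is consistent with the final Pred := Pred.Ens2*1.1 rescaling and the mean.adj=T flag passed to fn.write.submission in train.zens.R.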
/data/output-libffm/empty.libffm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/output-libffm/empty.libffm -------------------------------------------------------------------------------- /data/output-libsvm/empty.libsvm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/output-libsvm/empty.libsvm -------------------------------------------------------------------------------- /data/output-py/empty.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/output-py/empty.csv -------------------------------------------------------------------------------- /data/output-r/empty.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/output-r/empty.csv -------------------------------------------------------------------------------- /data/rdata/empty.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/rdata/empty.csv -------------------------------------------------------------------------------- /data/submission/empty.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/submission/empty.csv -------------------------------------------------------------------------------- /data/template/zens_nn.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | # The "&train" syntax lets us refer back to this object as "*train" elsewhere in the yaml file 3 | dataset: &train !obj:train_pylearn.CSVDataset { 4 | path: '${TRAIN_DATA_FILE}', 5 | }, 6 | # Here we specify the model to train as being an MLP 7 | model: !obj:pylearn2.models.mlp.MLP { 8 | batch_size: 100000, 9 | layers : [ 10 | # We use two hidden layers with rectified linear activations 11 | !obj:pylearn2.models.mlp.RectifiedLinear { 12 | layer_name: 'h0', 13 | dim: 100, 14 | irange: .05, 15 | # Rather than using weight decay, we constrain the norms of the weight vectors 16 | # max_col_norm: 2., 17 | }, 18 | !obj:pylearn2.models.mlp.Softmax { 19 | layer_name: 'y', 20 | init_bias_target_marginals: *train, 21 | # Initialize the weights to all 0s 22 | irange: .0, 23 | n_classes: 2, 24 | } 25 | ], 26 | nvis: ${N_FEATURES}, 27 | }, 28 | # We train using SGD and momentum 29 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 30 | learning_rate: .05, 31 | train_iteration_mode: 'even_shuffled_sequential', 32 | monitor_iteration_mode : 'even_shuffled_sequential', 33 | learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum { 34 | init_momentum: .05, 35 | nesterov_momentum: True, 36 | }, 37 | # We monitor how well we're doing during training on a validation set 38 | monitoring_dataset: 39 | { 40 | 'train' : *train, 41 | 'valid' : !obj:train_pylearn.CSVDataset { 42 | path: '${VAL_DATA_FILE}', 43 | } 44 | }, 45 | # We 
stop when validation set classification error hasn't decreased for 10 epochs 46 | termination_criterion: !obj:pylearn2.termination_criteria.MonitorBased { 47 | channel_name: 'valid_objective', 48 | prop_decrease: 0., 49 | N: 10 50 | }, 51 | cost: !obj:pylearn2.costs.mlp.dropout.Dropout { 52 | default_input_include_prob: .5, 53 | default_input_scale: 2., 54 | }, 55 | }, 56 | # We save the model whenever we improve on the validation set classification error 57 | extensions: [ 58 | !obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 59 | channel_name: 'valid_y_misclass', 60 | save_path: '${MODEL_FILE}' 61 | }, 62 | ], 63 | save_freq: 0, 64 | } 65 | -------------------------------------------------------------------------------- /fm/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/fm/.DS_Store -------------------------------------------------------------------------------- /fm/Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++-4.9.2 2 | CXXFLAGS = -Wall -Wno-format -Wconversion -O3 -fPIC -std=c++0x -fopenmp 3 | MAIN = fm 4 | FILES = common.cpp timer.cpp 5 | SRCS = $(FILES:%.cpp=src/%.cpp) 6 | HEADERS = $(FILES:%.cpp=src/%.h) 7 | 8 | all: $(MAIN) 9 | 10 | fm: src/train.cpp $(SRCS) $(HEADERS) 11 | $(CXX) $(CXXFLAGS) -o $@ $< $(SRCS) 12 | 13 | clean: 14 | rm -f $(MAIN) 15 | -------------------------------------------------------------------------------- /fm/README: -------------------------------------------------------------------------------- 1 | Data Format 2 | =========== 3 | The input of this factorization machine solver consists of a label vector (y) 4 | and a binary sparse matrix (X). The input format is: 5 | 6 |