├── .DS_Store
├── .gitignore
├── README.pdf
├── avito-context-click-py
│   ├── convert_csv_to_libffm.py
│   ├── convert_csv_to_libsvm.py
│   ├── train_ftrl.py
│   ├── train_pylearn.py
│   ├── train_scikit.py
│   ├── train_xgb.py
│   ├── train_xgb_dtry.py
│   └── util_rpython.py
├── avito-context-click-r
│   ├── .Rapp.history
│   ├── .Rprofile
│   ├── .Rproj.user
│   │   ├── 4B3CD3A5
│   │   │   ├── graphics-r3
│   │   │   │   └── empty.png
│   │   │   ├── pcs
│   │   │   │   ├── files-pane.pper
│   │   │   │   ├── source-pane.pper
│   │   │   │   ├── windowlayoutstate.pper
│   │   │   │   └── workbench-pane.pper
│   │   │   ├── persistent-state
│   │   │   ├── saved_source_markers
│   │   │   └── sdb
│   │   │       ├── prop
│   │   │       │   ├── 19AE70ED
│   │   │       │   ├── 23B66537
│   │   │       │   ├── 266CD89
│   │   │       │   ├── 3B4F6947
│   │   │       │   ├── 6A3DD511
│   │   │       │   ├── 6EC2D3AD
│   │   │       │   ├── 6F75496B
│   │   │       │   ├── 9EF9E6CB
│   │   │       │   ├── A7A18FB5
│   │   │       │   ├── BBE2842F
│   │   │       │   ├── C2BB45F6
│   │   │       │   ├── D970D594
│   │   │       │   ├── F8AE1A87
│   │   │       │   ├── FB96E70
│   │   │       │   └── INDEX
│   │   │       └── s-8FDFA111
│   │   │           └── lock_file
│   │   ├── 73D1DC80
│   │   │   ├── persistent-state
│   │   │   └── saved_source_markers
│   │   └── D01F76BA
│   │       ├── pcs
│   │       │   ├── files-pane.pper
│   │       │   ├── source-pane.pper
│   │       │   ├── windowlayoutstate.pper
│   │       │   └── workbench-pane.pper
│   │       └── sdb
│   │           └── prop
│   │               ├── 61214431
│   │               ├── 186F7A30
│   │               ├── 2191579E
│   │               ├── 30E2EF33
│   │               ├── 31E02146
│   │               ├── 396ED20
│   │               ├── 3ACE6FF1
│   │               ├── 51638EA1
│   │               ├── 5C8CFDEE
│   │               ├── 6D0737A6
│   │               ├── 84994D68
│   │               ├── B4D42750
│   │               ├── B9BF59A7
│   │               ├── BD7C1F23
│   │               ├── C3A0AE51
│   │               ├── D0B7BE74
│   │               ├── DC31BA58
│   │               ├── E0E39D19
│   │               ├── F66A3FFA
│   │               ├── FC1B5EBA
│   │               └── INDEX
│   ├── _fn.base.R
│   ├── _fn.base.cpp
│   ├── _utils.R
│   ├── avito-context-click-r.Rproj
│   ├── data.build.R
│   ├── data.build.dtry.R
│   ├── data.build.tree.R
│   ├── data.combine.R
│   ├── main.R
│   ├── train.l1.fm.01.R
│   ├── train.l1.fm.02.R
│   ├── train.l1.fm.03.R
│   ├── train.l1.fm.04.R
│   ├── train.l1.fm.05.R
│   ├── train.l1.ftrl.04.R
│   ├── train.l1.ftrl.05.R
│   ├── train.l1.ftrl.06.R
│   ├── train.l1.xgb.03.R
│   ├── train.l1.xgb.05.R
│   ├── train.l2.xgb.02.R
│   ├── train.xgb.dtry.R
│   └── train.zens.R
├── data
│   ├── input
│   │   └── empty.csv
│   ├── log
│   │   └── data_build
│   │       └── empty.log
│   ├── output-libffm
│   │   └── empty.libffm
│   ├── output-libsvm
│   │   └── empty.libsvm
│   ├── output-py
│   │   └── empty.csv
│   ├── output-r
│   │   └── empty.csv
│   ├── rdata
│   │   └── empty.csv
│   ├── submission
│   │   └── empty.csv
│   └── template
│       └── zens_nn.yaml
└── fm
    ├── .DS_Store
    ├── Makefile
    ├── README
    ├── fm
    ├── fm.dSYM
    │   └── Contents
    │       ├── Info.plist
    │       └── Resources
    │           └── DWARF
    │               └── fm
    └── src
        ├── common.cpp
        ├── common.h
        ├── timer.cpp
        ├── timer.h
        └── train.cpp

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
--------------------------------------------------------------------------------
/README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/README.pdf
--------------------------------------------------------------------------------
/avito-context-click-py/convert_csv_to_libffm.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import warnings
3 | import os
4 | import gzip
5 | from csv import DictReader
6 | from collections import namedtuple
7 | from datetime import datetime
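# NOTE (annotation, not in the original source): convert_csv_2_libffm below
# emits one line per selected CSV row in the libffm format
#     <label> <field>:<feature>:<value> ...
# Categorical columns are one-hot encoded (value 1.0, one global feature id
# per distinct value); numeric columns keep a fixed feature id and carry
# their raw value. With -old_format only bare feature ids are written, which
# is why numeric columns are rejected in that mode.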
8 | import sys 9 | 10 | OutInfo = namedtuple('OutInfo', ['writer', 'selector']) 11 | 12 | 13 | def convert_csv_2_libffm(input_files, out_selector, col_out, col_in_cat, col_in_num, old_format, silent): 14 | 15 | if old_format and len(col_in_num) > 0: 16 | raise ValueError('Old format doesn''t support numeric columns') 17 | 18 | start = datetime.now() 19 | out_lst = [OutInfo(writer=open_write(out_file), selector=expr) for out_file, expr in out_selector.items()] 20 | invalid_output = {'', 'na', "nan", 'NA', 'NaN'} 21 | 22 | feat_map = {} 23 | feat_index = 1 24 | for col_in in col_in_cat: 25 | feat_map[col_in] = {} 26 | for col_in in col_in_num: 27 | feat_map[col_in] = feat_index 28 | feat_index += 1 29 | 30 | row_count = 0 31 | for input_file in input_files: 32 | for row in DictReader(open_read(input_file)): 33 | 34 | row_count += 1 35 | 36 | out_select = [out_stream.selector(input_file, row) for out_stream in out_lst] 37 | 38 | if sum(out_select) > 0: 39 | cur_row = row[col_out] 40 | if cur_row in invalid_output: 41 | cur_row = '0' 42 | 43 | col_index = 1 44 | 45 | for col_in in col_in_cat: 46 | col_map = feat_map[col_in] 47 | col_val = row[col_in] 48 | 49 | if col_val in col_map: 50 | col_feat = col_map[col_val] 51 | else: 52 | col_feat = col_map[col_val] = feat_index 53 | feat_index += 1 54 | 55 | if old_format: 56 | cur_row += ' ' + str(col_feat) 57 | else: 58 | cur_row += ' ' + str(col_index) + ':' + str(col_feat) + ':1.0' 59 | 60 | col_index += 1 61 | 62 | for col_in in col_in_num: 63 | col_feat = feat_map[col_in] 64 | cur_row += ' ' + str(col_index) + ':' + str(col_feat) + ':' + row[col_in] 65 | 66 | col_index += 1 67 | 68 | for i, out_stream in enumerate(out_lst): 69 | if out_select[i]: 70 | out_stream.writer.write(cur_row) 71 | out_stream.writer.write('\n') 72 | 73 | if not silent and row_count % 10000000 == 0: 74 | print('Lines read: %d, Elapsed time: %s' % (row_count, str(datetime.now() - start))) 75 | 76 | for out_stream in out_lst: 77 | out_stream.writer.close() 78 | 79 | if not silent: 80 | print('Total lines read: %d, Elapsed time: %s' % (row_count, str(datetime.now() - start))) 81 | 82 | 83 | def open_read(path): 84 | 85 | if not os.path.exists(path) and os.path.exists(path + '.gz'): 86 | path += '.gz' 87 | print('\nOpening %s to read.' 
% path) 88 | 89 | if path == 'sys.stdin': 90 | return sys.stdin 91 | 92 | if path.endswith('.gz'): 93 | return gzip.open(path, mode='rt') 94 | else: 95 | return open(path, mode='rt') 96 | 97 | 98 | def open_write(path): 99 | path_dir = os.path.dirname(path) 100 | if not os.path.exists(path_dir): 101 | os.makedirs(path_dir) 102 | 103 | return open(path, 'w') 104 | 105 | if __name__ == "__main__": 106 | 107 | # -input_files ../data/output-r/data.all.lr.csv.gz 108 | # -out_selector 109 | # "{'../data/output-libffm/fm_01/data.val.tr.fm': lambda file, row: row['SearchType'] in ['hist', 'tr'], 110 | # '../data/output-libffm/fm_01/data.val.tt.fm': lambda file, row: row['SearchType'] in ['val'], 111 | # '../data/output-libffm/fm_01/data.test.tr.fm': lambda file, row: row['SearchType'] in ['hist', 'tr', 'val'], 112 | # '../data/output-libffm/fm_01/data.test.tt.fm': lambda file, row: row['SearchType'] in ['test']}" 113 | # -col_out IsClick 114 | # -col_in_cat 115 | # AdCatID AdHistCTRBin AdID AdParams AdPriceBin AdTitleSZBin 116 | # Position 117 | # SearchAdCount SearchAdT1Count SearchAdT2Count SearchAdT3Count 118 | # SearchCatID SearchLocID SearchParamsSZBin SearchQuerySZBin SearchRussian 119 | # UserID UserIPID UserPrevQryDateBin UserQryTotalTimeBin 120 | 121 | parser = argparse.ArgumentParser(description='Conver a csv file to libffm format', 122 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 123 | 124 | parser.add_argument('-input_files', 125 | default=[], 126 | type=str, 127 | nargs='+', 128 | help='CSV with training data') 129 | 130 | parser.add_argument('-out_selector', 131 | required=True, 132 | type=str, 133 | help='Dictionary with format {"out_file" : lambda file, row: is_instance(file, row)}' 134 | ' to select instances') 135 | 136 | parser.add_argument('-col_out', 137 | required=True, 138 | type=str, 139 | help='Output column.') 140 | 141 | parser.add_argument('-col_in_cat', 142 | default=[], 143 | type=str, 144 | nargs='+', 145 | help='List of the names of the categorical input columns') 146 | 147 | parser.add_argument('-col_in_num', 148 | default=[], 149 | type=str, 150 | nargs='+', 151 | help='List of the names of the numerical input columns') 152 | 153 | parser.add_argument('-silent', 154 | default=False, 155 | action='store_true', 156 | help='Don''t print execution information') 157 | 158 | parser.add_argument('-old_format', 159 | default=False, 160 | action='store_true', 161 | help="Outputs files in the old format") 162 | 163 | args = vars(parser.parse_args()) 164 | args['out_selector'] = eval(args['out_selector']) 165 | 166 | # if not sys.stdin.isatty(): 167 | # args['input_files'].append('sys.stdin') 168 | 169 | if not args['silent']: 170 | print('\n\n\n\n\n\n\n\n\n') 171 | print(args) 172 | 173 | with warnings.catch_warnings(): 174 | warnings.filterwarnings("ignore", category=Warning) 175 | convert_csv_2_libffm(**args) 176 | 177 | 178 | -------------------------------------------------------------------------------- /avito-context-click-py/convert_csv_to_libsvm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import warnings 3 | import os 4 | import gzip 5 | from csv import DictReader 6 | from collections import namedtuple 7 | from datetime import datetime 8 | 9 | OutInfo = namedtuple('OutInfo', ['writer', 'out_file', 'selector', 'weight_builder', 'weight_writer']) 10 | 11 | 12 | def convert_csv_2_libsvm(input_files, feat_map_file, out_selector, 13 | col_out, col_in_cat, col_in_num, missing_values, 14 | 
feat_start, weight_builder_dict): 15 | 16 | start = datetime.now() 17 | out_lst = [OutInfo(writer=open_write(out_file), 18 | out_file=out_file, 19 | selector=expr, 20 | weight_builder=weight_builder_dict[out_file] if out_file in weight_builder_dict else None, 21 | weight_writer=open_write(out_file + '.weight') if out_file in weight_builder_dict else None) 22 | for out_file, expr in out_selector.items()] 23 | missing_values = set(missing_values) 24 | 25 | feat_map = {} 26 | feat_map_list = ['skip' + str(ix) for ix in range(feat_start)] 27 | for col_in in col_in_cat: 28 | feat_map[col_in] = {} 29 | for col_in in col_in_num: 30 | feat_map[col_in] = len(feat_map_list) 31 | feat_map_list.append(col_in + '\tfloat') 32 | 33 | row_count = 0 34 | for input_file in input_files: 35 | for row in DictReader(open_read(input_file)): 36 | 37 | row_count += 1 38 | 39 | out_select = [out_stream.selector(input_file, row) for out_stream in out_lst] 40 | 41 | if sum(out_select) > 0: 42 | cur_row = row[col_out] 43 | if cur_row in missing_values: 44 | cur_row = '0' 45 | 46 | for col_in in col_in_cat: 47 | col_map = feat_map[col_in] 48 | col_val = row[col_in] 49 | if col_val not in missing_values: 50 | if col_val in col_map: 51 | col_feat = col_map[col_val] 52 | else: 53 | col_feat = col_map[col_val] = len(feat_map_list) 54 | feat_map_list.append(col_in + '=' + col_val + '\ti') 55 | cur_row += ' ' + str(col_feat) + ':1' 56 | 57 | for col_in in col_in_num: 58 | col_val = row[col_in] 59 | if col_val not in missing_values: 60 | col_feat = feat_map[col_in] 61 | cur_row += ' ' + str(col_feat) + ':' + col_val 62 | 63 | for i, out_stream in enumerate(out_lst): 64 | if out_select[i]: 65 | out_stream.writer.write(cur_row) 66 | out_stream.writer.write('\n') 67 | if out_stream.weight_builder is not None: 68 | cur_weight = str(out_stream.weight_builder(input_file, row)) 69 | out_stream.weight_writer.write(cur_weight) 70 | out_stream.weight_writer.write('\n') 71 | 72 | if row_count % 10000000 == 0: 73 | print('Lines read: %d, Elapsed time: %s' % (row_count, str(datetime.now() - start))) 74 | 75 | for out_stream in out_lst: 76 | out_stream.writer.close() 77 | if out_stream.weight_writer is not None: 78 | out_stream.weight_writer.close() 79 | 80 | if feat_map_file is not None: 81 | with open_write(feat_map_file) as fmap_out: 82 | for ix, fvalue in enumerate(feat_map_list): 83 | fmap_out.write(str(ix) + '\t' + fvalue + '\n') 84 | 85 | print('Total lines read: %d, Elapsed time: %s' % (row_count, str(datetime.now() - start))) 86 | 87 | 88 | def open_read(path): 89 | 90 | if not os.path.exists(path) and os.path.exists(path + '.gz'): 91 | path += '.gz' 92 | print('\nOpening %s to read.' 
% path) 93 | 94 | if path.endswith('.gz'): 95 | return gzip.open(path, mode='rt') 96 | else: 97 | return open(path, mode='rt') 98 | 99 | 100 | def open_write(path): 101 | path_dir = os.path.dirname(path) 102 | if not os.path.exists(path_dir): 103 | os.makedirs(path_dir) 104 | 105 | return open(path, 'w') 106 | 107 | if __name__ == "__main__": 108 | 109 | # -input_files ../data/dmitry/data.all.tree.dl.csv.samp 110 | # -out_selector 111 | # "{'../data/dmitry/data.all.tree.dl.csv.samp.libsvm': lambda file, row: True}" 112 | # -feat_map_file ../data/dmitry/data.all.tree.dl.csv.samp.fmap 113 | # -col_out IsClick 114 | # -col_in_cat 115 | # AdCatID 116 | # -col_in_num 117 | # AdHistCTR AdPrice Position UserQryTotalTime 118 | 119 | parser = argparse.ArgumentParser(description='Conver a csv file to libffm format', 120 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 121 | 122 | parser.add_argument('-input_files', 123 | default=[], 124 | type=str, 125 | nargs='+', 126 | help='CSV with training data') 127 | 128 | parser.add_argument('-feat_map_file', 129 | default=None, 130 | type=str, 131 | help='Path to output feat map') 132 | 133 | parser.add_argument('-out_selector', 134 | required=True, 135 | type=str, 136 | help='Dictionary with format {"out_file" : lambda file, row: is_instance(file, row)}' 137 | ' to select instances') 138 | 139 | parser.add_argument('-col_out', 140 | required=True, 141 | type=str, 142 | help='Output column.') 143 | 144 | parser.add_argument('-col_in_cat', 145 | default=[], 146 | type=str, 147 | nargs='+', 148 | help='List of the names of the categorical input columns') 149 | 150 | parser.add_argument('-col_in_num', 151 | default=[], 152 | type=str, 153 | nargs='+', 154 | help='List of the names of the numerical input columns') 155 | 156 | parser.add_argument('-missing_values', 157 | default=['', 'na', "nan", 'NA', 'NaN'], 158 | type=str, 159 | nargs='+', 160 | help='List of the names of the numerical input columns') 161 | 162 | parser.add_argument('-feat_start', 163 | default=0, 164 | type=int, 165 | help='Starting index of features') 166 | 167 | parser.add_argument('-weight_builder_dict', 168 | default='{}', 169 | type=str, 170 | help='create weight features') 171 | 172 | args = vars(parser.parse_args()) 173 | args['out_selector'] = eval(args['out_selector']) 174 | if args['weight_builder_dict'] is not None: 175 | args['weight_builder_dict'] = eval(args['weight_builder_dict']) 176 | 177 | print('\n' + (' '*300) + '\n') 178 | print(args) 179 | 180 | with warnings.catch_warnings(): 181 | warnings.filterwarnings("ignore", category=Warning) 182 | convert_csv_2_libsvm(**args) 183 | -------------------------------------------------------------------------------- /avito-context-click-py/train_ftrl.py: -------------------------------------------------------------------------------- 1 | """ 2 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 3 | Version 2, December 2004 4 | 5 | Copyright (C) 2004 Sam Hocevar 6 | 7 | Everyone is permitted to copy and distribute verbatim or modified 8 | copies of this license document, and changing it is allowed as long 9 | as the name is changed. 10 | 11 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 12 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 13 | 14 | 0. You just DO WHAT THE FUCK YOU WANT TO. 
15 | """ 16 | 17 | from datetime import datetime 18 | from csv import DictReader 19 | from math import exp, log, sqrt, copysign 20 | import os 21 | import gzip 22 | import argparse 23 | import warnings 24 | import random 25 | import pickle as pkl 26 | import inspect 27 | 28 | 29 | class DataObject(object): 30 | 31 | def __props__(self): 32 | attribs = inspect.getmembers(self, lambda attr: not(inspect.isroutine(attr))) 33 | return [a for a in attribs if not(a[0].startswith('__') and a[0].endswith('__'))] 34 | 35 | def __repr__(self): 36 | return '{}({})'.format(self.__class__.__name__, repr(self.__props__())) 37 | 38 | 39 | class FTRLProximal(object): 40 | """ Our main algorithm: Follow the regularized leader - proximal 41 | 42 | In short, 43 | this is an adaptive-learning-rate sparse logistic-regression with 44 | efficient L1-L2-regularization 45 | 46 | Reference: 47 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 48 | """ 49 | 50 | class StatusData(DataObject): 51 | 52 | def __init__(self): 53 | self.start = datetime.now() 54 | self.loss = 0. 55 | self.count = 0 56 | self.loss_check = 0. 57 | self.count_check = 0 58 | self.total_count = 0 59 | 60 | class InstanceData(DataObject): 61 | 62 | def __init__(self, x, y, row): 63 | self.x = x 64 | self.y = y 65 | self.row = row 66 | 67 | def __init__(self, alpha, beta, l1, l2, bits, two_way, dropout, seed, col_out, 68 | col_in_cat, test_pred_col, test_pred_extra_cols, no_pred_suffix): 69 | # parameters 70 | self.alpha = alpha 71 | self.beta = beta 72 | self.l1 = l1 73 | self.l2 = l2 74 | 75 | # feature related parameters 76 | self.two_way = two_way 77 | self.dropout = max(min(dropout, 1.), 0.) 78 | self.rnd = random.Random() 79 | self.rnd.seed(seed) 80 | self.seed = seed 81 | 82 | self.bits = bits 83 | self.key_cache = {} 84 | self.d = 2 ** bits 85 | self.n = [0.] * self.d 86 | self.z = [0.] * self.d 87 | 88 | self.epoch = 1 89 | self.col_out = col_out 90 | self.col_in_cat = col_in_cat 91 | self.test_pred_col = test_pred_col 92 | self.test_pred_extra_cols = test_pred_extra_cols 93 | self.no_pred_suffix = no_pred_suffix 94 | 95 | def _calc_col_index(self, val_id): 96 | index_offset = 1 97 | return (abs(hash(val_id)) % (self.d-index_offset)) + index_offset 98 | 99 | def _open_pred_path(self, outpath): 100 | return open(outpath + ('' if self.no_pred_suffix else "." + str(self.epoch)), 'w') 101 | 102 | @staticmethod 103 | def _get_prob(wtx): 104 | return 1. / (1. + exp(-max(min(wtx, 35.), -35.))) 105 | 106 | def _get_w(self, i): 107 | sign = copysign(1, self.z[i]) 108 | if sign * self.z[i] <= self.l1: 109 | return 0. 110 | else: 111 | return (sign * self.l1 - self.z[i]) / ((self.beta + sqrt(self.n[i])) / self.alpha + self.l2) 112 | 113 | def _predict(self, x): 114 | """ Get probability estimation on x 115 | 116 | INPUT: 117 | x: features 118 | dropped: if the weight was dropped 119 | OUTPUT: 120 | probability of p(y = 1 | x; w) 121 | """ 122 | wtx = sum([self._get_w(i) for i in x]) 123 | return FTRLProximal._get_prob(wtx) 124 | 125 | def _update(self, x, y): 126 | """ Update model using x, y 127 | 128 | INPUT: 129 | x: feature, a list of indices 130 | y: answer 131 | 132 | MODIFIES: 133 | self.n: increase by squared gradient 134 | self.z: weights 135 | """ 136 | 137 | w = [0.]*(len(x)+1) 138 | wtx = 0. 139 | for j, i in enumerate(x): 140 | if self.dropout > 0. 
and self.rnd.random() < self.dropout: 141 | w[j] = None 142 | else: 143 | w[j] = self._get_w(i) 144 | wtx += w[j] 145 | # wtx /= (1.-self.dropout) 146 | 147 | p = FTRLProximal._get_prob(wtx) 148 | g = p - y 149 | 150 | # update z and n 151 | for j, i in enumerate(x): 152 | # implement dropout as overfitting prevention 153 | if w[j] is None: 154 | continue 155 | 156 | sigma = (sqrt(self.n[i] + g * g) - sqrt(self.n[i])) / self.alpha 157 | self.z[i] += g - sigma * w[j] 158 | self.n[i] += g * g 159 | 160 | @staticmethod 161 | def _open_path(path): 162 | 163 | if not os.path.exists(path) and os.path.exists(path + '.gz'): 164 | path += '.gz' 165 | print('\nOpening %s to read.' % path) 166 | 167 | # noinspection PyUnresolvedReferences 168 | if path.endswith('.gz'): 169 | return gzip.open(path, mode='rt') 170 | else: 171 | return open(path, mode='rt') 172 | 173 | def _data(self, path): 174 | 175 | for row in DictReader(FTRLProximal._open_path(path)): 176 | 177 | data = FTRLProximal.InstanceData(x=[0] * (len(self.col_in_cat) + len(self.two_way) + 1), y=None, row=row) 178 | if self.col_out in row: 179 | row_val = row[self.col_out] 180 | data.y = float(row_val) if row_val == '0' or row_val == '1' else None 181 | 182 | # bias is 0 183 | ix = 1 184 | 185 | # one-hot encode features 186 | for col_nam in self.col_in_cat: 187 | data.x[ix] = self._calc_col_index(str(ix+self.seed) + "_" + row[col_nam]) 188 | ix += 1 189 | # one-hot encode two way features 190 | for cols_two_way in self.two_way: 191 | data.x[ix] = self._calc_col_index(str(ix+self.seed) + "_" + 192 | row[cols_two_way[0]] + "_" + row[cols_two_way[1]]) 193 | ix += 1 194 | yield data 195 | 196 | @staticmethod 197 | def log_loss(pred, actual): 198 | """ FUNCTION: Bounded logloss 199 | 200 | INPUT: 201 | p: our prediction 202 | y: real answer 203 | 204 | OUTPUT: 205 | logarithmic loss of p given y 206 | """ 207 | 208 | pred = max(min(pred, 1. - 10e-15), 10e-15) 209 | return -log(pred) if actual == 1. else -log(1. - pred) 210 | 211 | def fit(self, path): 212 | 213 | self._on_start_fit(path=path) 214 | 215 | print('\n\n\nTraining started...') 216 | fit_status = FTRLProximal.StatusData() 217 | 218 | for data in self._data(path): 219 | self._fit_instance(fit_status=fit_status, data=data) 220 | 221 | self._on_end_fit(fit_status=fit_status) 222 | 223 | def _on_start_fit(self, path): 224 | pass 225 | 226 | def _fit_instance(self, fit_status, data): 227 | 228 | fit_status.total_count += 1 229 | if fit_status.total_count % 20 == 0: 230 | p = self._predict(data.x) 231 | cur_loss = FTRLProximal.log_loss(p, data.y) 232 | fit_status.loss += cur_loss 233 | fit_status.loss_check += cur_loss 234 | fit_status.count += 1 235 | fit_status.count_check += 1 236 | 237 | self._update(data.x, data.y) 238 | 239 | if fit_status.total_count % 10000000 == 0: 240 | print('Epoch %d (%d instances), train logloss: %f (since last %f - %d samples), elapsed time: %s' % ( 241 | self.epoch, fit_status.total_count, fit_status.loss / fit_status.count, 242 | fit_status.loss_check / fit_status.count_check, fit_status.count_check, 243 | str(datetime.now() - fit_status.start))) 244 | fit_status.loss_check = 0. 
245 | fit_status.count_check = 0 246 | 247 | def _on_end_fit(self, fit_status): 248 | print('Epoch %d finished (%d total instances), train logloss: %f (since last %f - %d samples),' 249 | ' elapsed time: %s' % ( 250 | self.epoch, fit_status.total_count, fit_status.loss / fit_status.count, 251 | fit_status.loss_check / fit_status.count_check, fit_status.count_check, 252 | str(datetime.now() - fit_status.start))) 253 | fit_status.loss_check = 0. 254 | fit_status.count_check = 0 255 | self.epoch += 1 256 | 257 | def pred_proba(self, path): 258 | print('\n\n\nPredicting started...') 259 | pred_status = FTRLProximal.StatusData() 260 | pred_lst = [] 261 | 262 | for data in self._data(path): 263 | pred = self._predict(data.x) 264 | pred_lst.append(pred) 265 | self._update_pred_status(data=data, pred=pred, pred_status=pred_status) 266 | 267 | self._on_end_pred_proba(pred_status=pred_status) 268 | return pred_lst 269 | 270 | def save_pred_proba(self, inpath, outpath): 271 | 272 | print('\n\n\nSaving prediction started...') 273 | pred_status = FTRLProximal.StatusData() 274 | 275 | with self._open_pred_path(outpath) as out_file: 276 | self._write_pred_headers(out_file) 277 | for data in self._data(inpath): 278 | self._write_instance_prediction(pred_status=pred_status, out_file=out_file, data=data) 279 | 280 | self._on_end_pred_proba(pred_status=pred_status) 281 | 282 | def _write_pred_headers(self, out_file): 283 | out_file.write('%s\n' % ','.join(self.test_pred_extra_cols + [self.test_pred_col])) 284 | 285 | def _write_instance_prediction(self, pred_status, out_file, data): 286 | pred = self._predict(data.x) 287 | values = [data.row[col_nam] for col_nam in self.test_pred_extra_cols] + [str(pred)] 288 | out_file.write('%s\n' % ','.join(values)) 289 | self._update_pred_status(data=data, pred=pred, pred_status=pred_status) 290 | 291 | # noinspection PyMethodMayBeStatic 292 | def _update_pred_status(self, data, pred, pred_status): 293 | if data.y is not None: 294 | pred_status.loss += FTRLProximal.log_loss(pred, data.y) 295 | pred_status.count += 1 296 | pred_status.total_count += 1 297 | 298 | # noinspection PyMethodMayBeStatic 299 | def _on_end_pred_proba(self, pred_status): 300 | if pred_status.count > 0: 301 | print('Prediction logloss: %f (%d instances)' % 302 | (pred_status.loss / pred_status.count, pred_status.count)) 303 | print('Prediction saving time (%d instances): %s' % 304 | (pred_status.total_count, str(datetime.now() - pred_status.start))) 305 | 306 | def fit_and_save_pred_proba(self, inpath, outpath, train_is_test_col): 307 | 308 | self._on_start_fit(path=inpath) 309 | 310 | print('\n\n\nTraining and predicting started...') 311 | fit_status = FTRLProximal.StatusData() 312 | pred_status = FTRLProximal.StatusData() 313 | 314 | with self._open_pred_path(outpath) as out_file: 315 | self._write_pred_headers(out_file) 316 | for data in self._data(inpath): 317 | if data.row[train_is_test_col] == '1': 318 | self._write_instance_prediction(pred_status=pred_status, out_file=out_file, data=data) 319 | else: 320 | self._fit_instance(fit_status=fit_status, data=data) 321 | 322 | self._on_end_fit(fit_status=fit_status) 323 | self._on_end_pred_proba(pred_status=pred_status) 324 | 325 | @staticmethod 326 | def load_model(model_file): 327 | start = datetime.now() 328 | print('\n\nLoading model from \'%s\'' % model_file) 329 | with gzip.open(model_file, 'rb') as model_stream: 330 | model = pkl.load(model_stream) 331 | print('Loading model time: %s' % (str(datetime.now() - start))) 332 | return model 333 | 
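# NOTE (annotation, not in the original source): _get_w above is the
# FTRL-proximal closed-form weight from the McMahan et al. paper cited in
# the class docstring:
#     w_i = 0                                                    if |z_i| <= l1
#     w_i = (sign(z_i) * l1 - z_i) / ((beta + sqrt(n_i)) / alpha + l2)  otherwise
# and _update does the per-coordinate bookkeeping with gradient g = p - y:
#     sigma = (sqrt(n_i + g^2) - sqrt(n_i)) / alpha
#     z_i  += g - sigma * w_i
#     n_i  += g^2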
334 | @staticmethod 335 | def save_model(model, model_file): 336 | start = datetime.now() 337 | print('\n\nSaving model to \'%s\'' % model_file) 338 | with gzip.open(model_file, 'wb') as model_stream: 339 | pkl.dump(model, model_stream, pkl.HIGHEST_PROTOCOL) 340 | print('Saving model time: %s' % (str(datetime.now() - start))) 341 | 342 | 343 | def train_and_predict(train_file, train_is_test_col, test_file, test_pred_file, train_model_file, epochs, load_model, 344 | alpha, beta, l1, l2, bits, two_way, dropout, seed, 345 | col_out, col_in_cat, test_pred_col, test_pred_extra_cols, 346 | no_pred_suffix): 347 | 348 | start = datetime.now() 349 | 350 | if load_model and os.path.exists(train_model_file): 351 | learner = FTRLProximal.load_model(train_model_file) 352 | else: 353 | learner = FTRLProximal(alpha=alpha, beta=beta, l1=l1, l2=l2, bits=bits, two_way=two_way, 354 | dropout=dropout, seed=seed, 355 | col_out=col_out, col_in_cat=col_in_cat, 356 | test_pred_col=test_pred_col, 357 | test_pred_extra_cols=test_pred_extra_cols, no_pred_suffix=no_pred_suffix) 358 | 359 | # training 360 | if train_file is not None: 361 | for e in range(epochs): 362 | if train_is_test_col is None: 363 | learner.fit(path=train_file) 364 | else: 365 | learner.fit_and_save_pred_proba(inpath=train_file, outpath=test_pred_file, 366 | train_is_test_col=train_is_test_col) 367 | 368 | # predicting 369 | if test_file is not None: 370 | learner.save_pred_proba(inpath=test_file, outpath=test_pred_file) 371 | 372 | if train_model_file is not None: 373 | FTRLProximal.save_model(learner, train_model_file) 374 | 375 | print('Total elapsed time: %s' % (str(datetime.now() - start))) 376 | 377 | 378 | def arg_to_list(arg_map, name, sort=True): 379 | if not isinstance(arg_map[name], list): 380 | arg_map[name] = [arg_map[name]] 381 | if sort: 382 | arg_map[name] = sorted(arg_map[name]) 383 | 384 | 385 | def arg_build_2way(arg_map): 386 | 387 | two_way_arg = arg_map['two_way'] 388 | input_cols = sorted(arg_map['col_in_cat']) 389 | 390 | two_way_lst = [] 391 | two_way_added = set() 392 | 393 | l = len(input_cols) 394 | for two_way in two_way_arg: 395 | two_way = sorted(two_way.split(' ')) 396 | for i in range(l): 397 | for j in range(i + 1, l): 398 | if input_cols[i].startswith(two_way[0]) and input_cols[j].startswith(two_way[1]): 399 | input_key = input_cols[i] + '\t' + input_cols[j] 400 | if input_key not in two_way_added: 401 | two_way_added.add(input_key) 402 | two_way_lst.append([input_cols[i], input_cols[j]]) 403 | arg_map['two_way'] = two_way_lst 404 | 405 | 406 | def arg_replace(arg_map, name, old, new): 407 | if arg_map[name] is not None: 408 | arg_map[name] = arg_map[name].replace(old, new) 409 | 410 | if __name__ == "__main__": 411 | 412 | # -train_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/ftrl_05/data.val.all.csv 413 | # -train_is_test_col IsTestRow 414 | # -test_pred_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/ftrl_05/data.val.all.tmp.pred 415 | # -col_out IsClick 416 | # -col_in_cat AdID UserID Position 417 | # -col_query_id SearchID 418 | # -min_freq 1 419 | 420 | parser = argparse.ArgumentParser(description='Train and predict using FTRL proximal algorithm', 421 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 422 | 423 | parser.add_argument('-train_file', 424 | default=None, 425 | type=str, 426 | help='CSV with training data') 427 | 428 | parser.add_argument('-train_model_file', 429 | default=None, 430 | type=str, 431 | help='Path to save the trained model to be 
used later to resume training')
432 | 
433 |     parser.add_argument('-train_is_test_col',
434 |                         default=None,
435 |                         type=str,
436 |                         help='If given, it is assumed that test instances are written in TRAIN_FILE with'
437 |                              ' TRAIN_IS_TEST_COL set to 1 for test instances; those test instances will be'
438 |                              ' predicted on the fly in such cases')
439 | 
440 |     parser.add_argument('-test_file',
441 |                         default=None,
442 |                         type=str,
443 |                         help='CSV with testing data')
444 | 
445 |     parser.add_argument('-test_pred_file',
446 |                         default=None,
447 |                         type=str,
448 |                         help='Path to save predictions')
449 | 
450 |     parser.add_argument('-test_pred_col',
451 |                         default='Pred',
452 |                         type=str,
453 |                         help='Name of the prediction column that will be written to TEST_PRED_FILE')
454 | 
455 |     parser.add_argument('-test_pred_extra_cols',
456 |                         default=[],
457 |                         type=str,
458 |                         nargs='+',
459 |                         help='Extra columns that will be copied from TEST_FILE to TEST_PRED_FILE')
460 | 
461 |     parser.add_argument('-no_pred_suffix',
462 |                         default=False,
463 |                         action='store_true',
464 |                         help="Removes the epoch suffix from the prediction file")
465 | 
466 |     parser.add_argument('-load_model',
467 |                         default=False,
468 |                         action='store_true',
469 |                         help="Loads a saved model to resume training, if it exists")
470 | 
471 |     parser.add_argument('-col_out',
472 |                         required=True,
473 |                         type=str,
474 |                         help='Name of the output column')
475 | 
476 |     parser.add_argument('-col_in_cat',
477 |                         required=True,
478 |                         type=str,
479 |                         nargs='+',
480 |                         help='List of the names of the categorical input columns')
481 | 
482 |     parser.add_argument('-two_way',
483 |                         default=[],
484 |                         type=str,
485 |                         nargs='+',
486 |                         help='Two-way feature list in format \'F1 F2\', so all fields starting with \'F1\' '
487 |                              'will be combined with all fields starting with \'F2\'')
488 | 
489 |     parser.add_argument('-alpha',
490 |                         default=.1,
491 |                         type=float,
492 |                         help='Learning rate')
493 | 
494 |     parser.add_argument('-beta',
495 |                         default=1.,
496 |                         type=float,
497 |                         help='Smoothing parameter for adaptive learning rate')
498 | 
499 |     parser.add_argument('-l1',
500 |                         default=1.,
501 |                         type=float,
502 |                         help='L1 regularization; a larger value means more regularization')
503 | 
504 |     parser.add_argument('-l2',
505 |                         default=1.,
506 |                         type=float,
507 |                         help='L2 regularization; a larger value means more regularization')
508 | 
509 |     parser.add_argument('-bits',
510 |                         default=24,
511 |                         type=int,
512 |                         help='Bits to use with the hashing trick to define weights'
513 |                              ' (a -1 value will make it take longer but it will'
514 |                              ' make sure there will be no collisions)')
515 | 
516 |     parser.add_argument('-dropout',
517 |                         default=0.,
518 |                         type=float,
519 |                         help='Fraction of weights to drop out at each update')
520 | 
521 |     parser.add_argument('-seed',
522 |                         default=1,
523 |                         type=int,
524 |                         help='Seed to use for random operations (like dropout) and hash offset,'
525 |                              ' so changing the seed will also change the hash collisions')
526 | 
527 |     parser.add_argument('-epochs',
528 |                         default=1,
529 |                         type=int,
530 |                         help='Learn training data for N passes')
531 | 
532 |     args = vars(parser.parse_args())
533 | 
534 |     arg_to_list(arg_map=args, name='col_in_cat')
535 |     arg_to_list(arg_map=args, name='two_way')
536 |     arg_to_list(arg_map=args, name='test_pred_extra_cols')
537 | 
538 |     arg_build_2way(arg_map=args)
539 | 
540 |     arg_replace(arg_map=args, name='train_model_file', old='{TRAIN_FILE}', new=args['train_file'])
541 | 
542 |     print('\n\n' + (' '*100) + '\n\n')
543 |     print(args)
544 | 
545 |     with warnings.catch_warnings():
546 | 
warnings.filterwarnings("ignore", category=Warning) 547 | train_and_predict(**args) 548 | -------------------------------------------------------------------------------- /avito-context-click-py/train_pylearn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gc 3 | import os 4 | 5 | import numpy as np 6 | import csv 7 | import gzip 8 | 9 | from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix 10 | from pylearn2.utils.string_utils import preprocess 11 | from pylearn2.config import yaml_parse 12 | from pylearn2.utils import serial 13 | 14 | 15 | class CSVDataset(DenseDesignMatrix): 16 | def __init__(self, path, n_labels=2, start=None, stop=None, del_raw=True, x_only=False): 17 | self.del_raw = del_raw 18 | path = preprocess(path) 19 | 20 | x, y = CSVDataset._load_data(path, del_raw=del_raw) 21 | if np.isnan(np.min(y)): 22 | y = None 23 | else: 24 | y = y.astype(int).reshape(-1, 1) 25 | 26 | if start is not None: 27 | if stop is None: 28 | stop = x.shape[0] 29 | assert start >= 0 30 | assert start < stop 31 | if not (stop <= x.shape[0]): 32 | raise ValueError("stop must be less than the # of examples but " + 33 | "stop is " + str(stop) + " and there are " + str(x.shape[0]) + 34 | " examples.") 35 | x = x[start:stop, :] 36 | if y is not None: 37 | y = y[start:stop, :] 38 | 39 | if x_only: 40 | y = None 41 | n_labels = None 42 | 43 | super(CSVDataset, self).__init__(X=x, y=y, y_labels=n_labels) 44 | 45 | @staticmethod 46 | def _open_path(path): 47 | 48 | if not os.path.exists(path) and os.path.exists(path + '.gz'): 49 | path += '.gz' 50 | 51 | if path.endswith('.gz'): 52 | return gzip.open(path, mode='rt') 53 | else: 54 | return open(path, mode='rt') 55 | 56 | @staticmethod 57 | def _load_data(path, del_raw): 58 | 59 | npy_path = path + '.npz' 60 | 61 | if os.path.exists(npy_path): 62 | if not os.path.exists(path) or os.path.getmtime(npy_path) > os.path.getmtime(path): 63 | data = np.load(npy_path) 64 | return data['x'], data['y'] 65 | 66 | # Convert the .csv file to numpy 67 | y_list = [] 68 | x_list = [] 69 | with CSVDataset._open_path(path) as csv_file: 70 | 71 | invalid_y = {'', 'na', "nan", 'NA', 'NaN'} 72 | 73 | is_header = True 74 | for row in csv.reader(csv_file): 75 | if is_header: 76 | is_header = False 77 | else: 78 | y_list.append(float(row[0]) if row[0] not in invalid_y else np.nan) 79 | x_list.append(list(map(float, row[1:]))) 80 | 81 | x = np.array(x_list, dtype=np.float32) 82 | y = np.array(y_list, dtype=np.float32) 83 | 84 | np.savez_compressed(npy_path, x=x, y=y) 85 | if del_raw: 86 | os.remove(path) 87 | return x, y 88 | 89 | 90 | def train(config, config_args): 91 | 92 | # Load config replacing tags 93 | with open(config, 'r') as f: 94 | config = ''.join(f.readlines()) 95 | for nam in config_args: 96 | config = config.replace('${' + nam + "}", config_args[nam]) 97 | 98 | train_obj = yaml_parse.load(config) 99 | 100 | try: 101 | iter(train_obj) 102 | iterable = True 103 | except TypeError: 104 | iterable = False 105 | 106 | # # Undo our custom logging setup. 107 | # restore_defaults() 108 | # root_logger = logging.getLogger() 109 | # formatter = CustomFormatter(prefix='%(asctime)s ', only_from='pylearn2') 110 | # handler = CustomStreamHandler(formatter=formatter) 111 | # root_logger.addHandler(handler) 112 | # root_logger.setLevel(logging.INFO) 113 | 114 | if iterable: 115 | for number, subobj in enumerate(iter(train_obj)): 116 | # Execute this training phase. 
117 | subobj.main_loop() 118 | del subobj 119 | gc.collect() 120 | else: 121 | train_obj.main_loop() 122 | 123 | 124 | def add_config_args(cfg_args, cfg_dict=None): 125 | if cfg_dict is None: 126 | cfg_dict = {} 127 | cfg_args = vars(cfg_args) 128 | for nam in cfg_args: 129 | val = str(cfg_args[nam]) 130 | cfg_dict[nam] = val 131 | cfg_dict[nam.lower()] = val 132 | cfg_dict[nam.upper()] = val 133 | return cfg_dict 134 | 135 | 136 | def predict(model_file, test_data_file, test_pred_file): 137 | 138 | model = serial.load(model_file) 139 | dataset = CSVDataset(path=test_data_file, x_only=True) 140 | 141 | # use smallish batches to avoid running out of memory 142 | batch_size = 100 143 | model.set_batch_size(batch_size) 144 | # dataset must be multiple of batch size of some batches will have 145 | # different sizes. theano convolution requires a hard-coded batch size 146 | n_row = dataset.X.shape[0] 147 | extra = batch_size - n_row % batch_size 148 | assert (n_row + extra) % batch_size == 0 149 | if extra > 0: 150 | dataset.X = np.concatenate((dataset.X, np.zeros((extra, dataset.X.shape[1]), 151 | dtype=dataset.X.dtype)), axis=0) 152 | assert dataset.X.shape[0] % batch_size == 0 153 | 154 | x_theano = model.get_input_space().make_batch_theano() 155 | y_theano = model.fprop(x_theano) 156 | 157 | # from theano import tensor as T 158 | from theano import function 159 | f = function([x_theano], y_theano) 160 | 161 | y = [] 162 | for i in range(int(dataset.X.shape[0] / batch_size)): 163 | x_arg = dataset.X[i*batch_size:(i+1)*batch_size, :] 164 | if x_theano.ndim > 2: 165 | x_arg = dataset.get_topological_view(x_arg) 166 | y.append(f(x_arg.astype(x_theano.dtype))) 167 | 168 | y = np.concatenate(y) 169 | 170 | y = y[:n_row, :] 171 | 172 | # wirtes prediction to output 173 | n_col = y.shape[1] 174 | with open(test_pred_file, 'w') as out: 175 | for r in range(n_row): 176 | for c in range(n_col): 177 | if n_col == 2 and c == 0: 178 | continue 179 | if c > 0 and n_col > 2: 180 | out.write(',') 181 | out.write('%f' % (y[r, c])) 182 | 183 | out.write('\n') 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | # -train_config /home/lucas/ml-r-tb/contest/avito-context-click/data/template/zens_nn.yaml 189 | # -model_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/zens_nn/data.1.model.pkl 190 | # -train_data_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/zens_nn/data.1.tr.csv 191 | # -val_data_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/zens_nn/data.1.val.csv 192 | # -test_data_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/zens_nn/data.1.test.csv 193 | # -test_pred_file /home/lucas/ml-r-tb/contest/avito-context-click/data/output-py/zens_nn/data.1.test.pred 194 | # -n_features 5 195 | 196 | parser = argparse.ArgumentParser( 197 | description="Launch an training from a YAML configuration file.", 198 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 199 | ) 200 | 201 | parser.add_argument('-train_config', 202 | type=str, 203 | default=None, 204 | help='A YAML configuration file specifying the training procedure') 205 | 206 | parser.add_argument('-model_file', 207 | type=str, 208 | default=None, 209 | help='File with model path used to predict') 210 | 211 | parser.add_argument('-test_data_file', 212 | type=str, 213 | default=None, 214 | help='File data to predict') 215 | 216 | parser.add_argument('-test_pred_file', 217 | type=str, 218 | default=None, 219 | help='File to output predictions to') 220 | 221 | args, 
extra_args = parser.parse_known_args() 222 | for arg in extra_args: 223 | if arg.startswith("-"): 224 | parser.add_argument(arg, type=str) 225 | 226 | args = parser.parse_args() 227 | 228 | # if args.train_config is not None: 229 | # config_dict = add_config_args(cfg_args=args) 230 | # train(config=args.train_config, config_args=config_dict) 231 | 232 | if args.test_data_file is not None and args.model_file is not None and args.test_pred_file is not None: 233 | predict(model_file=args.model_file, test_data_file=args.test_data_file, test_pred_file=args.test_pred_file) 234 | -------------------------------------------------------------------------------- /avito-context-click-py/train_scikit.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import sklearn 3 | from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, \ 4 | RandomForestRegressor, RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor 5 | from sklearn.linear_model import LogisticRegression 6 | import sklearn.metrics as sk_metrics 7 | import argparse 8 | import pandas as pd 9 | import timeit 10 | import numpy as np 11 | import ml_metrics 12 | import ast 13 | import itertools 14 | import os 15 | import math 16 | import random 17 | from sklearn.pipeline import Pipeline 18 | import six 19 | import inspect 20 | import gzip 21 | # noinspection PyUnresolvedReferences 22 | from six.moves import cPickle as pkl 23 | 24 | 25 | def get_class(kls): 26 | parts = kls.split('.') 27 | module = ".".join(parts[:-1]) 28 | m = __import__(module) 29 | for comp in parts[1:]: 30 | m = getattr(m, comp) 31 | return m 32 | 33 | 34 | def predict_test(feat_importance_fun, mappings, model, na_fill_value, predict, silent, staged_predict, 35 | target_col, test_data_file, test_metric, test_pred_file, x_cols, metric_type, weight_col): 36 | if not silent: 37 | print("Predicting : %s to %s" % (test_data_file, test_pred_file)) 38 | 39 | test_x = load_pd_df(test_data_file) 40 | if mappings is not None: 41 | for col in test_x.columns: 42 | if col in mappings: 43 | test_x[col] = test_x[col].map(mappings[col]).fillna(na_fill_value) 44 | else: 45 | test_x[col] = test_x[col].fillna(na_fill_value) 46 | test_y = None 47 | if target_col in test_x.columns: 48 | test_y = test_x[target_col][test_x[target_col] != na_fill_value] 49 | test_y2 = test_x[target_col][pd.notnull(test_x[target_col])] 50 | if len(test_y) != len(test_x) or len(test_y2) != len(test_x): 51 | test_y = None 52 | del test_y2 53 | 54 | test_weight = None 55 | if weight_col is not None: 56 | if weight_col in test_x.columns: 57 | test_weight = test_x[weight_col] 58 | del test_x[weight_col] 59 | 60 | test_x = test_x[x_cols] 61 | 62 | test_pred = predict((model, test_x)) 63 | if test_pred.shape[1] == 1: 64 | test_pred = pd.DataFrame({'pred': test_pred[:, 0]}) 65 | elif test_pred.shape[1] == 2: 66 | test_pred = pd.DataFrame({'pred': test_pred[:, 1]}) 67 | else: 68 | test_pred_df = None 69 | for c in range(test_pred.shape[1]): 70 | if test_pred_df is None: 71 | test_pred_df = pd.DataFrame({'pred0': test_pred[:, c]}) 72 | else: 73 | test_pred_df['pred' + str(c)] = test_pred[:, c] 74 | test_pred = test_pred_df 75 | 76 | if not silent and test_y is not None: 77 | print_stages(test_y=test_y, stage_predictions=staged_predict((model, test_x)), 78 | test_metric=test_metric, metric_type=metric_type, test_weight=test_weight) 79 | 80 | if not silent: 81 | feat_importance = feat_importance_fun(model) 82 | if feat_importance is not 
None:
83 |             feat_importance = pd.DataFrame({'Features': x_cols,
84 |                                             'Importance': feat_importance})
85 |             pd.set_option('max_columns', len(test_x.columns))
86 |             pd.set_option('max_rows', len(test_x))
87 |             print("Feature importances:")
88 |             feat_importance.sort(columns='Importance', ascending=False, inplace=True)
89 |             feat_importance.index = range(1, len(feat_importance) + 1)
90 |             print(feat_importance)
91 | 
92 |     test_pred.to_csv(test_pred_file, index=False)
93 | 
94 | 
95 | def load_pd_df(file_name, del_old=True, bin_suffix='.bin.pkl'):
96 |     ret_val = None
97 |     bin_file_name = file_name + bin_suffix
98 |     if os.path.isfile(bin_file_name):
99 |         if not os.path.isfile(file_name) or os.path.getmtime(bin_file_name) > os.path.getmtime(file_name):
100 |             ret_val = load_model_bin(model_file=bin_file_name)
101 |             print("Loading %s cache file" % bin_file_name)
102 | 
103 |     if ret_val is None:
104 |         print("Loading %s raw file" % file_name)
105 |         ret_val = pd.read_csv(file_name)
106 |         print("Saving %s cache file" % bin_file_name)
107 |         save_model_bin(model=ret_val, model_file=bin_file_name)
108 |         if del_old:
109 |             print("Erasing %s raw file" % file_name)
110 |             os.remove(file_name)
111 | 
112 |     return ret_val
113 | 
114 | 
115 | def data_filter(data, filter_dict):
116 |     if len(filter_dict) > 0:
117 |         for filter_col in filter_dict:
118 |             data = data[data[filter_col] == filter_dict[filter_col]]
119 |     return data
120 | 
121 | 
122 | def train_and_predict(train_data_file, test_data_file, target_col, test_pred_file,
123 |                       test_data_file2, test_pred_file2,
124 |                       model_type, model_file, fit_args, test_metric, na_fill_value,
125 |                       silent, skip_mapping, load_model, train_filter, metric_type, load_type,
126 |                       bootstrap, bootstrap_seed, weight_col):
127 |     start = timeit.default_timer()
128 | 
129 |     train_x = load_pd_df(train_data_file)
130 | 
131 |     len_train_before = len(train_x)
132 |     train_x = data_filter(train_x, train_filter)
133 |     if not silent:
134 |         print("Train has %d instances (was %d before filtering)" % (len(train_x), len_train_before))
135 | 
136 |     mappings = None if skip_mapping else dict()
137 |     if mappings is not None:
138 |         data_all = train_x.append(load_pd_df(test_data_file))
139 |         if test_data_file2 is not None:
140 |             data_all = data_all.append(load_pd_df(test_data_file2))
141 |         if not silent:
142 |             print("Mapping unknown and category values...")
143 |         for col in train_x.columns:
144 |             if col not in [target_col]:
145 |                 if data_all[col].dtype == np.dtype('object'):
146 |                     s = np.unique(data_all[col].fillna(na_fill_value).values)
147 |                     mappings[col] = pd.Series([x[0] for x in enumerate(s)], index=s)
148 |                     train_x[col] = train_x[col].map(mappings[col]).fillna(na_fill_value)
149 |                 else:
150 |                     train_x[col] = train_x[col].fillna(na_fill_value)
151 |         del data_all
152 |     train_y = train_x[target_col]
153 |     del train_x[target_col]
154 | 
155 |     extra_fit_args = dict()
156 |     if weight_col is not None:
157 |         extra_fit_args['sample_weight'] = train_x[weight_col].values
158 |         del train_x[weight_col]
159 | 
160 |     if 0 < bootstrap < 1.0:
161 |         if bootstrap_seed is not None:
162 |             if not silent:
163 |                 print("Setting bootstrap seed to %d" % bootstrap_seed)
164 |             np.random.seed(bootstrap_seed)
165 |             random.seed(bootstrap_seed)
166 |         bootstrap_len = int(math.floor(bootstrap * len(train_x)))
167 |         bootstrap_ix = random.sample(range(len(train_x)), bootstrap_len)
168 |         train_x = train_x.iloc[bootstrap_ix]
169 |         train_x.reset_index()
170 |         train_y = train_y.iloc[bootstrap_ix]
171 |         train_y.reset_index()
172 | 
173 |     x_cols = 
train_x.columns 174 | feat_importance_fun = lambda fitted_model: fitted_model.feature_importances_ 175 | predict = lambda fitted_model, pred_x: fitted_model.predict(pred_x) 176 | staged_predict = lambda fitted_model, pred_x: [predict(fitted_model, pred_x)] 177 | 178 | model = None 179 | if load_model and os.path.exists(model_file): 180 | if not silent: 181 | print("Loading model %s" % model_file) 182 | model = load_model_bin(model_file=model_file) 183 | 184 | if model_type == "RandomForestRegressor": 185 | if model is None: 186 | model = RandomForestRegressor(**fit_args) 187 | model.fit(X=train_x, y=train_y, **extra_fit_args) 188 | predict = lambda fitted_model, pred_x: continuous_predict(model=fitted_model, x=pred_x) 189 | 190 | elif model_type == "RandomForestClassifier": 191 | if model is None: 192 | model = RandomForestClassifier(**fit_args) 193 | model.fit(X=train_x, y=train_y, **extra_fit_args) 194 | predict = lambda fitted_model, pred_x: pred_proba(model=fitted_model, x=pred_x) 195 | staged_predict = lambda fitted_model, pred_x: [predict(fitted_model, pred_x)] 196 | 197 | elif model_type == "ExtraTreesRegressor": 198 | if model is None: 199 | model = ExtraTreesRegressor(**fit_args) 200 | model.fit(X=train_x, y=train_y, **extra_fit_args) 201 | predict = lambda fitted_model, pred_x: continuous_predict(model=fitted_model, x=pred_x) 202 | 203 | elif model_type == "ExtraTreesClassifier": 204 | if model is None: 205 | model = ExtraTreesClassifier(**fit_args) 206 | model.fit(X=train_x, y=train_y, **extra_fit_args) 207 | predict = lambda fitted_model, pred_x: pred_proba(model=fitted_model, x=pred_x) 208 | staged_predict = lambda fitted_model, pred_x: [predict(fitted_model, pred_x)] 209 | 210 | elif model_type == "GradientBoostingRegressor": 211 | if model is None: 212 | model = GradientBoostingRegressor(**fit_args) 213 | model.fit(X=train_x, y=train_y, **extra_fit_args) 214 | elif load_type == "fit_more": 215 | model.warm_start = True 216 | model.n_estimators += fit_args['n_estimators'] 217 | model.fit(X=train_x, y=train_y) 218 | predict = lambda fitted_model, pred_x: continuous_predict(model=fitted_model, x=pred_x) 219 | staged_predict = lambda fitted_model, pred_x: staged_pred_continuous(model=fitted_model, x=pred_x) 220 | if load_type == "pred_at" and fit_args['n_estimators'] < model.n_estimators: 221 | if not silent: 222 | print("Predict using %d trees" % fit_args['n_estimators']) 223 | predict = lambda fitted_model, pred_x: staged_pred_continuous_at_n(model=fitted_model, x=pred_x, 224 | n=fit_args['n_estimators']) 225 | elif model_type == "GradientBoostingClassifier": 226 | if model is None: 227 | model = GradientBoostingClassifier(**fit_args) 228 | model.fit(X=train_x, y=train_y, **extra_fit_args) 229 | elif load_type == "fit_more": 230 | model.warm_start = True 231 | model.n_estimators += fit_args['n_estimators'] 232 | model.fit(X=train_x, y=train_y) 233 | staged_predict = lambda fitted_model, pred_x: staged_pred_proba(model=fitted_model, x=pred_x) 234 | predict = lambda fitted_model, pred_x: pred_proba(model=fitted_model, x=pred_x) 235 | if load_type == "pred_at" and fit_args['n_estimators'] < model.n_estimators: 236 | if not silent: 237 | print("Predict using %d trees" % fit_args['n_estimators']) 238 | predict = lambda fitted_model, pred_x: staged_pred_proba_at_n(model=fitted_model, x=pred_x, 239 | n=fit_args['n_estimators']) 240 | elif model_type == "LogisticRegression": 241 | if model is None: 242 | model = LogisticRegression(**fit_args) 243 | model.fit(X=train_x, y=train_y) 244 
| predict = lambda fitted_model, pred_x: pred_proba(model=fitted_model, x=pred_x) 245 | staged_predict = lambda fitted_model, pred_x: [predict(fitted_model, pred_x)] 246 | feat_importance_fun = lambda fitted_model: None 247 | 248 | elif model_type == "SVC": 249 | if model is None: 250 | model = sklearn.svm.SVC(**fit_args) 251 | model.fit(X=train_x, y=train_y) 252 | predict = lambda fitted_model, pred_x: pred_proba(model=fitted_model, x=pred_x) 253 | staged_predict = lambda fitted_model, pred_x: [predict(fitted_model, pred_x)] 254 | feat_importance_fun = lambda fitted_model: None 255 | 256 | elif model_type == "Pipeline": 257 | if model is None: 258 | model = Pipeline([ 259 | ('pre_process', get_class(fit_args['pre_process']['name'])(**fit_args['pre_process']['args'])), 260 | ('model', get_class(fit_args['model']['name'])(**fit_args['model']['args'])) 261 | ]) 262 | model.fit(X=train_x, y=train_y) 263 | predict = lambda fitted_model, pred_x: pred_proba(model=fitted_model, x=pred_x) 264 | staged_predict = lambda fitted_model, pred_x: [predict(fitted_model, pred_x)] 265 | feat_importance_fun = lambda fitted_model: None 266 | 267 | if not silent: 268 | print("Saving model %s" % model_file) 269 | save_model_bin(model=model, model_file=model_file) 270 | 271 | if not silent: 272 | stop = timeit.default_timer() 273 | print("Train time: %d s" % (stop - start)) 274 | 275 | del train_x, train_y 276 | 277 | start_pred = timeit.default_timer() 278 | predict_test(feat_importance_fun=feat_importance_fun, 279 | mappings=mappings, 280 | model=model, 281 | na_fill_value=na_fill_value, 282 | predict=predict, 283 | silent=silent, 284 | staged_predict=staged_predict, 285 | target_col=target_col, 286 | test_data_file=test_data_file, 287 | test_metric=test_metric, 288 | test_pred_file=test_pred_file, 289 | x_cols=x_cols, 290 | metric_type=metric_type, 291 | weight_col=weight_col) 292 | if not silent: 293 | stop = timeit.default_timer() 294 | print("Predict time: %d s" % (stop - start_pred)) 295 | 296 | if test_data_file2 is not None: 297 | start_pred = timeit.default_timer() 298 | predict_test(feat_importance_fun=lambda fitted_model: None, 299 | mappings=mappings, 300 | model=model, 301 | na_fill_value=na_fill_value, 302 | predict=predict, 303 | silent=silent, 304 | staged_predict=staged_predict, 305 | target_col=target_col, 306 | test_data_file=test_data_file2, 307 | test_metric=test_metric, 308 | test_pred_file=test_pred_file2, 309 | x_cols=x_cols, 310 | metric_type=metric_type, 311 | weight_col=weight_col) 312 | if not silent: 313 | stop = timeit.default_timer() 314 | print("Predict2 time: %d s" % (stop - start_pred)) 315 | 316 | if not silent: 317 | stop = timeit.default_timer() 318 | print("Total time: %d s" % (stop - start)) 319 | 320 | 321 | def staged_pred_proba(model, x): 322 | for pred in model.staged_predict_proba(x): 323 | yield prob_pred(pred) 324 | 325 | 326 | def staged_pred_proba_at_n(model, x, n): 327 | return nth(staged_pred_proba(model=model, x=x), n) 328 | 329 | 330 | def pred_proba(model, x): 331 | return prob_pred(model.predict_proba(X=x)) 332 | 333 | 334 | def prob_pred(pred): 335 | return pred 336 | 337 | 338 | def staged_pred_continuous(model, x): 339 | for pred in model.staged_predict(x): 340 | yield to_2dim(pred) 341 | 342 | 343 | def staged_pred_continuous_at_n(model, x, n): 344 | return nth(staged_pred_continuous(model=model, x=x), n) 345 | 346 | 347 | def continuous_predict(model, x): 348 | return to_2dim(model.predict(X=x)) 349 | 350 | 351 | def to_2dim(array_val): 352 | return 
np.array(array_val, ndmin=2).transpose() 353 | 354 | 355 | def nth(iterable, n): 356 | return next(itertools.islice(iterable, n, None)) 357 | 358 | 359 | def avg_eval_metric(eval_metric, test_y, prediction, metric_type): 360 | if prediction.shape[1] == 1: 361 | return eval_metric(test_y, prediction[:, 0]) 362 | elif prediction.shape[1] == 2: 363 | return eval_metric(test_y, prediction[:, 1]) 364 | else: 365 | metric_val = 0.0 366 | metric_count = 0.0 367 | if metric_type == "cumulative": 368 | cur_pred = np.zeros(prediction.shape[0]) 369 | for c in range(prediction.shape[1] - 1): 370 | # noinspection PyTypeChecker 371 | cur_actual = np.array(np.array(test_y) <= c).astype(int) 372 | cur_pred += prediction[:, c] 373 | metric_val += eval_metric(cur_actual, cur_pred) 374 | metric_count += 1.0 375 | else: 376 | for c in range(prediction.shape[1]): 377 | # noinspection PyTypeChecker 378 | cur_actual = np.array(np.array(test_y) == c).astype(int) 379 | metric_val += eval_metric(cur_actual, prediction[:, c]) 380 | metric_count += 1.0 381 | if metric_type == "sum": 382 | metric_count = 1.0 383 | return metric_val / metric_count 384 | 385 | 386 | def print_stages(test_y, stage_predictions, test_metric, metric_type, test_weight): 387 | if hasattr(ml_metrics, test_metric): 388 | eval_metric = getattr(ml_metrics, test_metric) 389 | else: 390 | eval_metric = getattr(sk_metrics, test_metric) 391 | if test_weight is not None: 392 | metric_args = inspect.getargspec(eval_metric)[0] 393 | if 'weight' in metric_args: 394 | eval_metric_orig = eval_metric 395 | eval_metric = lambda act, pred: eval_metric_orig(act, pred, test_weight) 396 | count = 0 397 | iters = [] 398 | loss = [] 399 | count_factor = 50 400 | for prediction in stage_predictions: 401 | count += 1 402 | if count in [1, 10, 50] or count % count_factor == 0: 403 | iters.append(count) 404 | loss.append(avg_eval_metric(eval_metric, test_y, prediction, metric_type=metric_type)) 405 | if count > 1000: 406 | count_factor = 500 407 | elif count > 500: 408 | count_factor = 200 409 | elif count > 250: 410 | count_factor = 100 411 | loss_df = pd.DataFrame({'Iteration': iters, 'Loss': loss}) 412 | loss_df.rename(columns={'Loss': test_metric}, inplace=True) 413 | pd.set_option('max_columns', len(loss_df.columns)) 414 | pd.set_option('max_rows', len(loss_df)) 415 | print("Loss:") 416 | print(loss_df) 417 | 418 | 419 | def load_model_bin(model_file): 420 | with gzip.open(model_file, 'rb') as model_file: 421 | return pkl.load(model_file) 422 | 423 | 424 | def save_model_bin(model, model_file): 425 | with gzip.open(model_file, 'wb') as model_file: 426 | pkl.dump(model, model_file, pkl.HIGHEST_PROTOCOL) 427 | 428 | 429 | if __name__ == "__main__": 430 | 431 | parser = argparse.ArgumentParser(description='Train and predict data using some sklearn algorithms.', 432 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 433 | 434 | parser.add_argument('-train_data_file', 435 | # default='../data/output-py/gbr_01/gbr_01_k_1_tr.csv', 436 | required=True, 437 | type=str, 438 | help='CSV with training data.') 439 | 440 | parser.add_argument('-test_data_file', 441 | # default='../data/output-py/gbr_01/gbr_01_k_1_test.csv', 442 | required=True, 443 | type=str, 444 | help='CSV with testing data.') 445 | 446 | parser.add_argument('-test_data_file2', 447 | default=None, 448 | type=str, 449 | help='CSV with testing data.') 450 | 451 | parser.add_argument('-test_pred_file', 452 | # default='../data/output-py/gbr_01/gbr_01_k_1_test_pred.csv', 453 | required=True, 454 | 
type=str, 455 | help='Path to output testing predictions.') 456 | 457 | parser.add_argument('-test_pred_file2', 458 | default=None, 459 | type=str, 460 | help='Path to output testing predictions.') 461 | 462 | parser.add_argument('-test_metric', 463 | # default='normalized_weighted_gini', 464 | required=True, 465 | type=str, 466 | help='Metric to compute on test set. Any metric on ml_metrics or sklearn.metrics') 467 | 468 | parser.add_argument('-target_col', 469 | # default='target', 470 | required=True, 471 | type=str, 472 | help='Name of target variable.') 473 | 474 | parser.add_argument('-weight_col', 475 | # default='weight', 476 | default=None, 477 | type=str, 478 | help='Name of weight column.') 479 | 480 | parser.add_argument('-metric_type', 481 | default='auto', 482 | type=str, 483 | help='Type of metric to evaluate.', 484 | choices=[ 485 | "auto", 486 | "cumulative", 487 | "sum"]) 488 | 489 | parser.add_argument('-model_type', 490 | # default='GradientBoostingRegressor', 491 | required=True, 492 | type=str, 493 | help='Type of model to fit.', 494 | choices=["RandomForestRegressor", 495 | "RandomForestClassifier", 496 | "ExtraTreesRegressor", 497 | "ExtraTreesClassifier", 498 | "GradientBoostingRegressor", 499 | "GradientBoostingClassifier", 500 | "LogisticRegression", 501 | "SVC", 502 | "Pipeline"]) 503 | 504 | parser.add_argument('-model_file', 505 | # default='../data/output-py/gbr_01/gbr_01_k_1_tr.csv.pkl', 506 | required=True, 507 | type=str, 508 | help='File to save the model to.') 509 | 510 | parser.add_argument('-na_fill_value', 511 | default=-20000, 512 | type=int, 513 | help='Value to fill in NAs.') 514 | 515 | parser.add_argument('-skip_mapping', 516 | # default=True, 517 | default=False, 518 | action='store_true', 519 | help='Skip na filling and category mapping.') 520 | 521 | parser.add_argument('-fit_args', 522 | # default='{\"n_estimators\": 10, \"learning_rate\": 0.001, \"loss\": \"ls\", ' 523 | # '\"max_features\": 5, \"max_depth\": 7, \"random_state\": 788954, ' 524 | # '\"subsample\": 1, \"verbose\": 50}', 525 | required=True, 526 | type=str, 527 | help='String in dictionary form of fit params.') 528 | 529 | parser.add_argument('-silent', 530 | default=False, 531 | action='store_true', 532 | help="Don't print execution information.") 533 | 534 | parser.add_argument('-train_filter', 535 | default='{}', 536 | type=str, 537 | help="String in dictionary form of filters applied to the training data.") 538 | 539 | parser.add_argument('-load_model', 540 | default=False, 541 | action='store_true', 542 | help="Load saved model if it exists.") 543 | 544 | parser.add_argument('-load_type', 545 | default='fit_more', 546 | type=str, 547 | help='Type of model loading.', 548 | choices=[ 549 | "fit_more", 550 | "pred_at"]) 551 | 552 | parser.add_argument('-bootstrap', 553 | default=0, 554 | type=float, 555 | help="Do bootstrap sampling.") 556 | 557 | parser.add_argument('-bootstrap_seed', 558 | default=None, 559 | type=int, 560 | help='Bootstrap seed.') 561 | 562 | args = vars(parser.parse_args()) 563 | 564 | args['train_filter'] = ast.literal_eval(args['train_filter']) 565 | args['fit_args'] = ast.literal_eval(args['fit_args']) 566 | for key in args['fit_args']: 567 | if isinstance(args['fit_args'][key], six.string_types): 568 | if args['fit_args'][key] in args: 569 | args['fit_args'][key] = args[args['fit_args'][key]] 570 | 571 | if not args['silent']: 572 | print(args) 573 | 574 | with warnings.catch_warnings(): 575 | warnings.filterwarnings("ignore", category=Warning) 576 | train_and_predict(**args) 577 |
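One pattern worth noting from train_scikit.py above: rather than refitting a boosted model once per iteration count, print_stages walks the estimator's staged prediction generator and scores only sampled iterations. Below is a minimal, self-contained sketch of that idea on synthetic data (plain scikit-learn; the dataset and parameter values are illustrative, not the script's):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss

# synthetic stand-in for the competition data
X, y = make_classification(n_samples=2000, random_state=0)
X_tr, y_tr, X_te, y_te = X[:1500], y[:1500], X[1500:], y[1500:]

model = GradientBoostingClassifier(n_estimators=200, random_state=0)
model.fit(X_tr, y_tr)

# staged_predict_proba yields one probability matrix per boosting round,
# so the whole loss curve costs a single pass over the test set
for i, proba in enumerate(model.staged_predict_proba(X_te), start=1):
    if i in (1, 10, 50) or i % 50 == 0:  # same sampling idea as print_stages
        print(i, log_loss(y_te, proba[:, 1]))

Because it is a generator, each staged probability matrix is scored and discarded before the next boosting round is materialized, so memory stays flat.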
-------------------------------------------------------------------------------- /avito-context-click-py/train_xgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | import xgboost as xgb 4 | import os 5 | 6 | if __name__ == "__main__": 7 | 8 | parser = argparse.ArgumentParser(description='train xgb model', 9 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 10 | 11 | parser.add_argument( 12 | '-train', 13 | default="/home/lucas/ml-r-tb/contest/avito-context-click/avito-context-click-r/" 14 | "../data/output-r/data.val.tr.full.libsvm.head#data.val.tr.full.cache", 15 | type=str, 16 | help='LibSVM file with training data') 17 | parser.add_argument( 18 | '-test', 19 | default="/home/lucas/ml-r-tb/contest/avito-context-click/avito-context-click-r/" 20 | "../data/output-r/data.val.tr.full.libsvm.head#data.val.tr.full.cache", 21 | type=str, 22 | help='LibSVM file with test data') 23 | parser.add_argument( 24 | '-pred', 25 | default="/home/lucas/ml-r-tb/contest/avito-context-click/avito-context-click-r/" 26 | "../data/output-r/data.val.tt.full.pred", 27 | type=str, 28 | help='Prediction path') 29 | parser.add_argument( 30 | '-epoch', 31 | default=4, 32 | type=int, 33 | help='Epochs') 34 | 35 | args = parser.parse_args() 36 | 37 | pred_file = args.pred 38 | 39 | xg_params = { 40 | 'objective': 'binary:logistic', 41 | 'eta': 0.2, 42 | 'max_depth': 10, 43 | 'eval_metric': 'logloss', 44 | 'silent': 1, 45 | 'nthread': 6, 46 | 'gamma': 0.8, 47 | 'min_child_weight': 4, 48 | 'colsample_bytree': 0.7, 49 | 'colsample_bylevel': 0.8 50 | } 51 | num_round = 75 52 | xg_data_tr = xgb.DMatrix(args.train) 53 | xg_data_tst = xgb.DMatrix(args.test) 54 | 55 | for e in range(args.epoch): 56 | print("processing iteration %d" % e) 57 | xg_params['seed'] = 3015 + (10*e) 58 | model = xgb.train( 59 | params=list(xg_params.items()), 60 | dtrain=xg_data_tr, 61 | num_boost_round=num_round, 62 | evals=[(xg_data_tst, 'val')]  # watchlist scored with eval_metric each round 63 | ) 64 | pred_list = model.predict(xg_data_tst) 65 | pred_list = [round(x, 6) for x in pred_list] 66 | preds = pd.DataFrame(pred_list, columns=['IsClick']) 67 | preds.to_csv(os.path.splitext(pred_file)[0] + '.epoch' + str(e) + '.csv', index=False) 68 | -------------------------------------------------------------------------------- /avito-context-click-py/train_xgb_dtry.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sys 4 | import getopt 5 | from sklearn import feature_extraction 6 | from sklearn.cross_validation import StratifiedKFold 7 | from sklearn.preprocessing import LabelEncoder 8 | #sys.path.append('/Users/ef/xgboost/wrapper') 9 | import xgboost as xgb 10 | import random 11 | from sklearn.metrics import log_loss 12 | import os 13 | 14 | def load_train_data(path): 15 | df = pd.read_csv(path) 16 | y = df['IsClick'].values 17 | ids = df['ID'].values 18 | df.drop(['IsClick','ID'], axis=1, inplace=True) 19 | #df.drop(['AdTitleWordLikeli'], axis=1, inplace=True) 20 | #df.drop(['AdIDRareWordCount'], axis=1, inplace=True) 21 | #df.drop(['IPIDlikeli'], axis=1, inplace=True) 22 | #df.drop(['IPIDUserAgentOSIDlikeli'], axis=1, inplace=True) 23 | #df.drop(['UserAdViewTotalCount', 'UserAdViewUniqueCount','UserAdViewTotalCount2', 'UserAdViewUniqueCount2'], axis=1, inplace=True) 24 | #df.drop(['LocationUserUniqueCount', 'CategoryUserUniqueCount'], axis=1, inplace=True) 25 | #df.drop(['AdPosition1Count', 'AdPosition7Count'], axis=1, inplace=True) 26 |
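# The commented-out drops here and below look like leftovers from manual
# feature-selection runs; uncommenting a line excludes that feature group
# before training.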
#df.drop(['UserAgentIDlikeli', 'UserAgentOSIDlikeli', 'UserDeviceIDlikeli', 'UserAgentFamilyIDlikeli'], axis=1, inplace=True) 27 | #df.drop(['AdCategoryPriceDeviation'], axis=1, inplace=True) 28 | X = df.values.copy().astype(np.float32) 29 | #np.random.seed(seed=2015) 30 | #np.random.shuffle(X) 31 | return X, y, ids 32 | 33 | def load_test_data(path): 34 | df = pd.read_csv(path) 35 | if 'IsClick' in df.columns.values: 36 | df.drop(['IsClick'], axis=1, inplace=True) 37 | ids = df['ID'].values 38 | df.drop(['ID'], axis=1, inplace=True) 39 | #df.drop(['AdTitleWordLikeli'], axis=1, inplace=True) 40 | #df.drop(['AdIDRareWordCount'], axis=1, inplace=True) 41 | #df.drop(['IPIDlikeli'], axis=1, inplace=True) 42 | #df.drop(['IPIDUserAgentOSIDlikeli'], axis=1, inplace=True) 43 | #df.drop(['UserAdViewTotalCount', 'UserAdViewUniqueCount','UserAdViewTotalCount2', 'UserAdViewUniqueCount2'], axis=1, inplace=True) 44 | #df.drop(['LocationUserUniqueCount', 'CategoryUserUniqueCount'], axis=1, inplace=True) 45 | #df.drop(['AdPosition1Count', 'AdPosition7Count'], axis=1, inplace=True) 46 | #df.drop(['UserAgentIDlikeli', 'UserAgentOSIDlikeli', 'UserDeviceIDlikeli', 'UserAgentFamilyIDlikeli'], axis=1, inplace=True) 47 | #df.drop(['AdCategoryPriceDeviation'], axis=1, inplace=True) 48 | X = df.values.copy().astype(np.float32) 49 | return X, ids 50 | 51 | opts, args = getopt.getopt(sys.argv[1:], "t:v:p:e:", ["train=", "test=", "pred=", "epoch="]) 52 | opts = {x[0]:x[1] for x in opts} 53 | train_file = opts['--train'] 54 | test_file = opts['--test'] 55 | pred_file = opts['--pred'] 56 | epoch = int(opts['--epoch']) 57 | 58 | X, y, ids_train = load_train_data(train_file) 59 | X_test, ids_test = load_test_data(test_file) 60 | num_features = X.shape[1] 61 | 62 | param = {} 63 | param['objective'] = 'binary:logistic' 64 | param['eta'] = 0.2 #0.1 65 | param['max_depth'] = 10 66 | param['eval_metric'] = 'logloss' 67 | param['silent'] = 1 68 | param['nthread'] = 6 69 | param['gamma'] = 0.8 70 | param['min_child_weight'] = 4 #10 71 | #param['subsample'] = 0.8 72 | param['colsample_bytree'] = 0.7 73 | param['colsample_bylevel'] = 0.8 74 | num_round = 75 #85 75 | 76 | for e in range(epoch): 77 | print "processing iteration", e 78 | param['seed'] = 3015 + (10*e) 79 | plst = list(param.items()) 80 | 81 | index_shuffle = [i for i in range(X.shape[0])] 82 | random.shuffle(index_shuffle) 83 | xgmat_train = xgb.DMatrix( X[index_shuffle,:], label=y[index_shuffle], missing = -1.0) 84 | bst = xgb.train( plst, xgmat_train, num_round ); 85 | #fscore = [ (v,k) for k,v in bst.get_fscore().iteritems() ] 86 | #fscore.sort(reverse=True) 87 | #print fscore 88 | xgmat_test = xgb.DMatrix( X_test, missing = -1.0 ) 89 | pred_list = bst.predict( xgmat_test ) 90 | pred_list = [round(x, 5) for x in pred_list] 91 | preds = pd.DataFrame(pred_list, columns=['IsClick']) 92 | preds['ID'] = ids_test 93 | preds.to_csv(os.path.splitext(pred_file)[0] + '.epoch' + str(e) + '.csv', index=False) 94 | -------------------------------------------------------------------------------- /avito-context-click-py/util_rpython.py: -------------------------------------------------------------------------------- 1 | from sklearn.calibration import CalibratedClassifierCV 2 | from sklearn.cross_validation import KFold 3 | import numpy as np 4 | 5 | 6 | def calibrate_probs(y_val, prob_val, prob_test, n_folds=2, method='isotonic', random_state=5968): 7 | """ Calling from R: 8 | 9 | suppressMessages(library("rPython")) # Load RPython 10 | python.load("path/to/util_rpython.py") 
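# Optional args (n_folds=2, method='isotonic', random_state=5968) keep
# the Python defaults from the signature above unless passed explicitly.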
11 | 12 | data.pred.calib <- python.call('calibrate_probs', 13 | y_val=y_val, # Actual values from validation 14 | prob_val=pred_val, # Predicted values from validation 15 | prob_test=pred_test) # Predicted values from test 16 | 17 | # data.pred.calib will be a list, so to get the calibrated predictions for each value we do: 18 | calib_pred_val = data.pred.calib$val 19 | calib_pred_test = data.pred.calib$test 20 | 21 | """ 22 | 23 | y_val = np.asarray(y_val, dtype=float) 24 | prob_val = np.asarray(prob_val, dtype=float).reshape((-1, 1)) 25 | prob_test = np.asarray(prob_test, dtype=float).reshape((-1, 1)) 26 | 27 | prob_clb_val = np.zeros(len(y_val)) 28 | prob_clb_test = np.zeros(len(prob_test)) 29 | 30 | kf_val_full = KFold(len(y_val), n_folds=n_folds, random_state=random_state) 31 | 32 | for ix_train, ix_test in kf_val_full: 33 | kf_val_inner = KFold(len(ix_train), n_folds=n_folds, random_state=random_state) 34 | clf = CalibratedClassifierCV(method=method, cv=kf_val_inner) 35 | clf.fit(prob_val[ix_train], y_val[ix_train]) 36 | prob_clb_val[ix_test] = clf.predict_proba(prob_val[ix_test])[:, 1] 37 | prob_clb_test += clf.predict_proba(prob_test)[:, 1]/n_folds 38 | 39 | return {'val': list(prob_clb_val), 'test': list(prob_clb_test)} 40 | -------------------------------------------------------------------------------- /avito-context-click-r/.Rapp.history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/avito-context-click-r/.Rapp.history -------------------------------------------------------------------------------- /avito-context-click-r/.Rprofile: -------------------------------------------------------------------------------- 1 | 2 | source("_fn.base.R") 3 | -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/graphics-r3/empty.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/avito-context-click-r/.Rproj.user/4B3CD3A5/graphics-r3/empty.png -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/pcs/files-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "path" : "~/ml-r-tb/contest/avito-context-click/avito-context-click-r", 3 | "sortOrder" : [ 4 | { 5 | "ascending" : true, 6 | "columnIndex" : 2 7 | } 8 | ] 9 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/pcs/source-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "activeTab" : -1 3 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/pcs/windowlayoutstate.pper: -------------------------------------------------------------------------------- 1 | { 2 | "left" : { 3 | "panelheight" : 582, 4 | "splitterpos" : 261, 5 | "topwindowstate" : "HIDE", 6 | "windowheight" : 653 7 | }, 8 | "right" : { 9 | "panelheight" : 582, 10 | "splitterpos" : 391, 11 | "topwindowstate" : "NORMAL", 12 | "windowheight" : 653 13 | } 14 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/pcs/workbench-pane.pper: 
-------------------------------------------------------------------------------- 1 | { 2 | "TabSet1" : 2, 3 | "TabSet2" : 1 4 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/persistent-state: -------------------------------------------------------------------------------- 1 | abend="1" 2 | active-client-id="ceb8a635-f5e6-4d93-be42-1e0ba4cb32c2" 3 | -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/saved_source_markers: -------------------------------------------------------------------------------- 1 | {"active_set":"","sets":[]} -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/19AE70ED: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/23B66537: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/266CD89: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/3B4F6947: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/6A3DD511: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/6EC2D3AD: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/6F75496B: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/9EF9E6CB: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/A7A18FB5: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/BBE2842F: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/C2BB45F6: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/D970D594: -------------------------------------------------------------------------------- 1 | { 2 | } 
-------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/F8AE1A87: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/FB96E70: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/prop/INDEX: -------------------------------------------------------------------------------- 1 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Fdata.build.R="C2BB45F6" 2 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Fdata.combine.R="FB96E70" 3 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.fm.01.R="9EF9E6CB" 4 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.fm.02.R="23B66537" 5 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.fm.03.R="F8AE1A87" 6 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.fm.04.R="A7A18FB5" 7 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.fm.05.R="6EC2D3AD" 8 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.ftrl.04.R="266CD89" 9 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.ftrl.05.R="BBE2842F" 10 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.ftrl.06.R="19AE70ED" 11 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.xgb.03.R="3B4F6947" 12 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l1.xgb.05.R="6F75496B" 13 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.l2.xgb.02.R="6A3DD511" 14 | ~%2Fml-r-tb%2Fcontest%2Favito-context-click%2Favito-context-click-r%2Ftrain.zens.R="D970D594" 15 | -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/s-8FDFA111/lock_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/avito-context-click-r/.Rproj.user/4B3CD3A5/sdb/s-8FDFA111/lock_file -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/73D1DC80/persistent-state: -------------------------------------------------------------------------------- 1 | build-last-errors="[]" 2 | build-last-errors-base-dir="" 3 | build-last-outputs="[]" 4 | compile_pdf_state="{\"errors\":[],\"output\":\"\",\"running\":false,\"tab_visible\":false,\"target_file\":\"\"}" 5 | console_procs="[]" 6 | files.monitored-path="" 7 | find-in-files-state="{\"handle\":\"\",\"input\":\"\",\"path\":\"\",\"regex\":false,\"results\":{\"file\":[],\"line\":[],\"lineValue\":[],\"matchOff\":[],\"matchOn\":[]},\"running\":false}" 8 | imageDirtyState="0" 9 | saveActionState="0" 10 | -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/73D1DC80/saved_source_markers: -------------------------------------------------------------------------------- 1 | {"active_set":"","sets":[]} 
-------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/pcs/files-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "path" : "~/Documents/eclipse/AvitoContext2015/final_model/avito-context-click-r", 3 | "sortOrder" : [ 4 | { 5 | "ascending" : true, 6 | "columnIndex" : 2 7 | } 8 | ] 9 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/pcs/source-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "activeTab" : -1 3 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/pcs/windowlayoutstate.pper: -------------------------------------------------------------------------------- 1 | { 2 | "left" : { 3 | "panelheight" : 737, 4 | "splitterpos" : 310, 5 | "topwindowstate" : "MAXIMIZE", 6 | "windowheight" : 775 7 | }, 8 | "right" : { 9 | "panelheight" : 737, 10 | "splitterpos" : 465, 11 | "topwindowstate" : "HIDE", 12 | "windowheight" : 775 13 | } 14 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/pcs/workbench-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "TabSet1" : 0, 3 | "TabSet2" : 0 4 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/186F7A30: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/2191579E: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/30E2EF33: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/31E02146: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/396ED20: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/3ACE6FF1: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/51638EA1: -------------------------------------------------------------------------------- 1 | { 2 | "tempName" : "Untitled1" 3 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/5C8CFDEE: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/61214431: 
-------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/6D0737A6: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/84994D68: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/B4D42750: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/B9BF59A7: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/BD7C1F23: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/C3A0AE51: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/D0B7BE74: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/DC31BA58: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/E0E39D19: -------------------------------------------------------------------------------- 1 | { 2 | "tempName" : "Untitled1" 3 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/F66A3FFA: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/FC1B5EBA: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /avito-context-click-r/.Rproj.user/D01F76BA/sdb/prop/INDEX: -------------------------------------------------------------------------------- 1 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2F_fn.base.R="B4D42750" 2 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2F_utils.R="396ED20" 3 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Fdata.build.R="C3A0AE51" 4 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Fdata.build.dtry.R="DC31BA58" 5 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Fdata.build.tree.R="51638EA1" 6 | 
~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Fdata.combine.R="BD7C1F23" 7 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Fmain.R="E0E39D19" 8 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.fm.01.R="5C8CFDEE" 9 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.fm.02.R="3ACE6FF1" 10 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.fm.03.R="2191579E" 11 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.fm.04.R="F66A3FFA" 12 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.fm.05.R="D0B7BE74" 13 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.ftrl.04.R="31E02146" 14 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.ftrl.05.R="84994D68" 15 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.ftrl.06.R="6D0737A6" 16 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.xgb.03.R="B9BF59A7" 17 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l1.xgb.05.R="61214431" 18 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.l2.xgb.02.R="30E2EF33" 19 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.xgb.dtry.R="FC1B5EBA" 20 | ~%2FDocuments%2Feclipse%2FAvitoContext2015%2Ffinal_model%2Favito-context-click-r%2Ftrain.zens.R="186F7A30" 21 | -------------------------------------------------------------------------------- /avito-context-click-r/_fn.base.cpp: -------------------------------------------------------------------------------- 1 | #include <Rcpp.h> 2 | #include <cmath> 3 | 4 | #define MIN(a, b) ((a < b) ? a : b) 5 | #define MAX(a, b) ((a > b) ? a : b) 6 | #define BOUND(x, xmin, xmax) (MAX(xmin, MIN(xmax, x))) 7 | #define LOG_LOSS(act, pred) (-(act * log(pred) + (1 - act) * log(1 - pred))) 8 | 9 | using namespace Rcpp; 10 | 11 | 12 | double fn_opt_base(NumericMatrix x, NumericVector y, NumericVector params, int type) { 13 | 14 | int rows = x.nrow(); 15 | int cols = x.ncol(); 16 | 17 | int offset_params = 0; 18 | double bias = 0.0; 19 | if (params.size() > cols) { 20 | offset_params = 1; 21 | bias = params[0]; 22 | } 23 | 24 | double pred_max = 1.0 - 1e-15; 25 | double pred_min = 1e-15; 26 | 27 | double total_err = 0.0; 28 | for (int r = 0; r < rows; ++r) { 29 | 30 | double y_pred = (type==2)?
1.0 : 0.0; 31 | for (int c = 0; c < cols; ++c) { 32 | if (type==2) { 33 | y_pred *= pow(x(r,c), params[c+offset_params]); 34 | } else { 35 | y_pred += x(r,c)*params[c+offset_params]; 36 | } 37 | } 38 | y_pred += bias; 39 | y_pred = BOUND(y_pred, pred_min, pred_max); 40 | total_err += LOG_LOSS(y[r], y_pred); 41 | } 42 | 43 | return total_err/(double)rows; 44 | } 45 | 46 | 47 | // [[Rcpp::export]] 48 | double fn_opt_gm(NumericMatrix x, NumericVector y, NumericVector params) { 49 | return fn_opt_base(x, y, params, 2); 50 | } 51 | 52 | // [[Rcpp::export]] 53 | double fn_opt_am(NumericMatrix x, NumericVector y, NumericVector params) { 54 | return fn_opt_base(x, y, params, 1); 55 | } 56 | -------------------------------------------------------------------------------- /avito-context-click-r/_utils.R: -------------------------------------------------------------------------------- 1 | options(scipen=999) 2 | 3 | fn.stats <- function(data.all, flist.col, target.col) { 4 | setnames(data.all, target.col, "target") 5 | data.stats <- c() 6 | for (f in flist.col) { 7 | setnames(data.all, f, "feature") 8 | stats.row <- c(length(unique(data.all[,feature])), 9 | length(unique(data.all[!is.na(target),feature])), 10 | length(unique(data.all[is.na(target),feature])), 11 | max(data.all[,feature]), 12 | max(data.all[!is.na(target),feature]), 13 | max(data.all[is.na(target),feature]), 14 | min(data.all[,feature]), 15 | min(data.all[!is.na(target),feature]), 16 | min(data.all[is.na(target),feature]), 17 | mean(data.all[,feature]), 18 | mean(data.all[!is.na(target),feature]), 19 | mean(data.all[is.na(target),feature]), 20 | median(data.all[,feature]), 21 | median(data.all[!is.na(target),feature]), 22 | median(data.all[is.na(target),feature]), 23 | sd(data.all[,feature]), 24 | sd(data.all[!is.na(target),feature]), 25 | sd(data.all[is.na(target),feature])) 26 | data.stats <- rbind(data.stats, stats.row) 27 | setnames(data.all, "feature", f) 28 | } 29 | colnames(data.stats) <- c("unique","unique_tr","unique_test", 30 | "max","max_tr","max_test", 31 | "min","min_tr","min_test", 32 | "mean","mean_tr","mean_test", 33 | "median", "median_tr", "median_test", 34 | "sd", "sd_tr", "sd_test") 35 | data.stats <- as.data.table(data.stats) 36 | data.stats[, feature := flist.col] 37 | setnames(data.all, "target", target.col) 38 | return (data.stats) 39 | } 40 | 41 | fn.multilogloss <- function(data.actual, data.predicted) { 42 | actual <- as.matrix(data.actual) 43 | predicted <- as.matrix(data.predicted) 44 | probs <- rowSums(actual*predicted) 45 | probs[which(probs>0.999999)] <- 0.999999 46 | probs[which(probs<0.000001)] <- 0.000001 47 | return(-(1/nrow(actual))*sum(log(probs))) 48 | } 49 | 50 | fn.logloss <- function(actual, predicted, pred.min=0.000001, pred.max=0.999999) { 51 | predicted[which(predicted > pred.max)] <- pred.max 52 | predicted[which(predicted < pred.min)] <- pred.min 53 | error <- sum(-actual*log(predicted) - (1-actual)*log(1-predicted))/length(actual) 54 | return (error) 55 | } 56 | 57 | fn.mcrmse <- function(actual, predicted) { 58 | if (is.vector(predicted) & is.vector(actual)) { 59 | ix <- which(!is.na(actual)) 60 | nsamples <- length(ix) 61 | return (sqrt(sum((actual[ix] - predicted[ix])^2)/nsamples)) 62 | } 63 | if (ncol(actual) != ncol(predicted)) return (NULL) 64 | if (nrow(actual) != nrow(predicted)) return (NULL) 65 | ix <- which(!is.na(actual[,1])) 66 | nsamples <- length(ix) 67 | error <- 0 68 | #cat("Errors by targets:") 69 | errors <- c() 70 | for (i in 1:ncol(actual)) { 71 | error.col <- 
sqrt(sum((actual[ix,i] - predicted[ix,i])^2)/nsamples) 72 | errors <- c(errors, error.col) 73 | error <- error + error.col 74 | #cat(colnames(actual)[i],":",error.col,";") 75 | } 76 | #cat("\n") 77 | errors <- c(errors, error/ncol(actual)) 78 | return (errors) 79 | } 80 | 81 | fn.memory.usage <- function() { 82 | return (sum(sort( sapply(ls(globalenv()),function(x){object.size(get(x))})))) 83 | } 84 | 85 | fn.write.batches.csv <- function(data, train.file, col.names, sep, nchunks = 4, continue.chunks=FALSE) { 86 | options(scipen=999) 87 | if (nchunks == 1) { 88 | write.table( 89 | data, 90 | file=train.file, 91 | row.names = F, quote = F, na = "", sep = sep, 92 | append = FALSE, col.names = col.names 93 | ) 94 | } else { 95 | nr <- nrow(data) 96 | ix <- seq(1, nr, round(nr/nchunks)) 97 | if (ix[length(ix)] != nr) { 98 | ix <- c(ix, nr+1) 99 | } else { 100 | ix[length(ix)] <- nr+1 101 | } 102 | gc() 103 | for (i in 1:(length(ix)-1)) { 104 | cat("Processing chunk", i, "...\n") 105 | if (i == 1 & !continue.chunks) { 106 | write.table( 107 | data[ix[i]:(ix[i+1]-1),], 108 | file=train.file, 109 | row.names = F, quote = F, na = "", sep = sep, 110 | append = FALSE, col.names = col.names 111 | ) 112 | } else { 113 | write.table( 114 | data[ix[i]:(ix[i+1]-1),], 115 | file=train.file, 116 | row.names = F, quote = F, na = "", sep = sep, 117 | append = TRUE, col.names = FALSE 118 | ) 119 | } 120 | invisible(gc()) 121 | } 122 | } 123 | } 124 | 125 | fn.optim <- function(y, x) { 126 | 127 | x <- as.matrix(x) 128 | pars0 <- rep(0.0, ncol(x)) 129 | 130 | #error to minimize 131 | fn.loss <- function(pars) { 132 | y.pred <- 1 / (1 + exp(-as.numeric(x %*% pars))) 133 | y.pred <- pmax(y.pred, 10^(-6)) 134 | y.pred <- pmin(y.pred, 1-10^(-6)) 135 | sum(-y*log(y.pred) - (1-y)*log(1-y.pred))/length(y) 136 | } 137 | 138 | cat ("Initial loss:", fn.loss(pars0), "\n") 139 | opt.result <- optim(pars0, 140 | fn.loss, 141 | #method = "Brent", 142 | #method = "L-BFGS-B", 143 | #lower = 0.0, 144 | #upper = 10.0, 145 | control = list(trace = T,maxit=5000)) 146 | return (opt.result$par) 147 | } 148 | -------------------------------------------------------------------------------- /avito-context-click-r/avito-context-click-r.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: No 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /avito-context-click-r/data.build.tree.R: -------------------------------------------------------------------------------- 1 | ############################################################## 2 | ## create tree features data 3 | ############################################################## 4 | cat("Tree data... 
\n") 5 | 6 | fn.register.wk(1) 7 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 8 | 9 | fn.init.worker('data_build/build_tree') 10 | 11 | cat('\nMerging data.all.search.small + data.all.search.cont \n') 12 | data.all.tree <- merge( 13 | data.all.search.small, 14 | data.all.search.cont[ID %in% data.all.search.small$ID], 15 | by="ID") 16 | fn.soar.unload(data.all.search.small, data.all.search.cont) 17 | invisible(gc()) 18 | 19 | cat('\nAdding data.all.search.info\n') 20 | data.all.tree <- merge( 21 | data.all.tree, 22 | data.all.search.info[SearchID %in% unique(data.all.tree$SearchID)], 23 | by="SearchID") 24 | fn.soar.unload(data.all.search.info) 25 | invisible(gc()) 26 | 27 | cat('\nAdding data.all.search.info.cont\n') 28 | data.all.tree <- merge( 29 | data.all.tree, 30 | data.all.search.info.cont[SearchID %in% unique(data.all.tree$SearchID)], 31 | by="SearchID") 32 | fn.soar.unload(data.all.search.info.cont) 33 | invisible(gc()) 34 | 35 | cat('\nAdding data.all.prob.1way\n') 36 | data.all.tree <- merge( 37 | data.all.tree, 38 | data.all.prob.1way, 39 | by="ID") 40 | fn.soar.unload(data.all.prob.1way) 41 | invisible(gc()) 42 | 43 | cat('\nAdding data.all.prob.2way.ad.us\n') 44 | data.all.tree <- merge( 45 | data.all.tree, 46 | data.all.prob.2way.ad.us, 47 | by="ID") 48 | fn.soar.unload(data.all.prob.2way.ad.us) 49 | invisible(gc()) 50 | 51 | cat('\nAdding data.all.prob.2way.ad.srch\n') 52 | data.all.tree <- merge( 53 | data.all.tree, 54 | data.all.prob.2way.ad.srch, 55 | by="ID") 56 | fn.soar.unload(data.all.prob.2way.ad.srch) 57 | invisible(gc()) 58 | 59 | cat('\nAdding data.all.prob.2way.us.srch\n') 60 | data.all.tree <- merge( 61 | data.all.tree, 62 | data.all.prob.2way.us.srch, 63 | by="ID") 64 | fn.soar.unload(data.all.prob.2way.us.srch) 65 | invisible(gc()) 66 | 67 | cat('\nAdding data.all.prob.2way.srch\n') 68 | data.all.tree <- merge( 69 | data.all.tree, 70 | data.all.prob.2way.srch, 71 | by="ID") 72 | fn.soar.unload(data.all.prob.2way.srch) 73 | invisible(gc()) 74 | 75 | cat('\nRemoving unwanted columns\n') 76 | cols.excl <- c( 77 | "UserID", "UserIPID", "UserAgentID", "UserDeviceID", 78 | "AdID", "AdParams", 79 | "SearchLocID" 80 | ) 81 | 82 | for (col.nam in cols.excl) { 83 | data.all.tree[, col.nam := NULL, with=F] 84 | } 85 | invisible(gc()) 86 | 87 | cat('\nSorting columns\n') 88 | setkeyv(data.all.tree, c("SearchDate", "SearchID", "Position")) 89 | invisible(gc()) 90 | 91 | 92 | cat('\nFilling NAs\n') 93 | cols.extra <- c("ID", "SearchID", "SearchType", "IsClick") 94 | cols.in <- sort(setdiff(colnames(data.all.tree), cols.extra)) 95 | 96 | for (col.nam in cols.in) { 97 | if (any(is.na(data.all.tree[[col.nam]]))) { 98 | setnames(data.all.tree, col.nam, 'change_val') 99 | data.all.tree[is.na(change_val), change_val := -1] 100 | setnames(data.all.tree, 'change_val', col.nam) 101 | } 102 | } 103 | 104 | invisible(gc()) 105 | cols.in.tree <- cols.in 106 | cat('\nSaving\n') 107 | setcolorder(data.all.tree, c(cols.extra, cols.in)) 108 | Store(data.all.tree, cols.in) 109 | invisible(gc()) 110 | 111 | 112 | data.l2.pred <- fn.load.ens( 113 | ens.cols = c( 114 | "data.ftrl.04.pred", 115 | "data.ftrl.05.pred", 116 | "data.ftrl.06.pred", 117 | "data.fm.05.pred", 118 | "data.fm.04.pred", 119 | "data.fm.03.pred", 120 | "data.fm.02.pred", 121 | "data.fm.01.pred" 122 | ), print.err = F) 123 | data.l2.all.tree <- merge(data.all.tree, data.l2.pred, 124 | by="ID") 125 | rm(data.l2.pred) 126 | fn.soar.unload(data.all.tree) 127 | 128 | col.num <- sapply(data.l2.all.tree, is.numeric) 
129 | col.num <- names(col.num)[col.num] 130 | col.num <- col.num[(col.num %like% '(^Prob)|(^ftrl)|(^fm)|(^Ratio)|AdHistCTR')] 131 | 132 | for (col.nam in col.num) { 133 | setnames(data.l2.all.tree, col.nam, 'change_val') 134 | data.l2.all.tree[, change_val := round(change_val, digits = 6)] 135 | setnames(data.l2.all.tree, 'change_val', col.nam) 136 | } 137 | 138 | Store(data.l2.all.tree) 139 | invisible(gc()) 140 | 141 | fn.clean.worker() 142 | } 143 | fn.kill.wk() 144 | 145 | ############################################################# 146 | # Probability features - full dataset 147 | ############################################################# 148 | cat("Probability features full data... \n") 149 | 150 | fn.register.wk(1) 151 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 152 | 153 | fn.init.worker('data_build/prob_features_full') 154 | 155 | cols.in.1way <- c( 156 | "AdID", "AdCatID", "AdParams", 157 | "UserID", "UserIPID", "UserAgentID", 158 | "UserAgentOSID", "UserDeviceID", "UserAgentFamilyID", 159 | "SearchLocID", "SearchCatID" 160 | ) 161 | data.all.prob.full.1way <- fn.build.prob.full(cols.in.1way) 162 | Store(data.all.prob.full.1way) 163 | invisible(gc()) 164 | 165 | cols.in.2way <- c( 166 | "AdID", "AdCatID", "AdParams", 167 | "UserID", "UserIPID", "UserAgentID", 168 | "UserAgentOSID", "UserDeviceID", "UserAgentFamilyID", 169 | "SearchLocID", "SearchCatID" 170 | ) 171 | 172 | data.all.prob.full.2way.ad.us <- fn.build.prob.full( 173 | fn.build.interaction(cols.in.2way, c("Ad", "Us"))) 174 | Store(data.all.prob.full.2way.ad.us) 175 | invisible(gc()) 176 | 177 | data.all.prob.full.2way.ad.srch <- fn.build.prob.full( 178 | fn.build.interaction(cols.in.2way, c("Ad", "Search"))) 179 | Store(data.all.prob.full.2way.ad.srch) 180 | invisible(gc()) 181 | 182 | data.all.prob.full.2way.srch <- fn.build.prob.full( 183 | fn.build.interaction(cols.in.2way, c("Search", "Search"))) 184 | invisible(gc()) 185 | Store(data.all.prob.full.2way.srch) 186 | invisible(gc()) 187 | 188 | cols.in.2way.us.srch.1 <- c( 189 | "UserAgentID", "UserAgentOSID", "UserDeviceID", "UserAgentFamilyID", 190 | "SearchLocID", "SearchCatID" 191 | ) 192 | 193 | data.all.prob.full.2way.us.srch.1 <- fn.build.prob.full( 194 | fn.build.interaction(cols.in.2way.us.srch.1, c("Us", "Search"))) 195 | invisible(gc()) 196 | Store(data.all.prob.full.2way.us.srch.1) 197 | invisible(gc()) 198 | 199 | cols.in.2way.us.srch.2 <- c( 200 | "UserID", "UserIPID", 201 | "SearchLocID", "SearchCatID" 202 | ) 203 | 204 | data.all.prob.full.2way.us.srch.2 <- fn.build.prob.full( 205 | fn.build.interaction(cols.in.2way.us.srch.2, c("Us", "Search"))) 206 | invisible(gc()) 207 | Store(data.all.prob.full.2way.us.srch.2) 208 | invisible(gc()) 209 | 210 | 211 | 212 | cat("\nMerging probabilities\n") 213 | data.all.prob.full <- data.all.prob.full.1way 214 | setkey(data.all.prob.full, ID) 215 | 216 | setkey(data.all.prob.full.2way.ad.us, ID) 217 | fn.check.id(data.all.prob.full, 218 | data.all.prob.full.2way.ad.us) 219 | for (col.nam in colnames(data.all.prob.full.2way.ad.us)[-1]) { 220 | data.all.prob.full[ 221 | , col.nam := data.all.prob.full.2way.ad.us[[col.nam]], 222 | with=F] 223 | } 224 | fn.soar.unload(data.all.prob.full.2way.ad.us) 225 | invisible(gc()) 226 | 227 | 228 | 229 | setkey(data.all.prob.full.2way.ad.srch, ID) 230 | fn.check.id(data.all.prob.full, 231 | data.all.prob.full.2way.ad.srch) 232 | for (col.nam in colnames(data.all.prob.full.2way.ad.srch)[-1]) { 233 | data.all.prob.full[ 234 | , col.nam := 
data.all.prob.full.2way.ad.srch[[col.nam]], 235 | with=F] 236 | } 237 | fn.soar.unload(data.all.prob.full.2way.ad.srch) 238 | invisible(gc()) 239 | 240 | 241 | 242 | setkey(data.all.prob.full.2way.us.srch.1, ID) 243 | fn.check.id(data.all.prob.full, 244 | data.all.prob.full.2way.us.srch.1) 245 | for (col.nam in colnames(data.all.prob.full.2way.us.srch.1)[-1]) { 246 | data.all.prob.full[ 247 | , col.nam := data.all.prob.full.2way.us.srch.1[[col.nam]], 248 | with=F] 249 | } 250 | fn.soar.unload(data.all.prob.full.2way.us.srch.1) 251 | invisible(gc()) 252 | 253 | 254 | setkey(data.all.prob.full.2way.us.srch.2, ID) 255 | fn.check.id(data.all.prob.full, 256 | data.all.prob.full.2way.us.srch.2) 257 | for (col.nam in colnames(data.all.prob.full.2way.us.srch.2)[-1]) { 258 | data.all.prob.full[ 259 | , col.nam := data.all.prob.full.2way.us.srch.2[[col.nam]], 260 | with=F] 261 | } 262 | fn.soar.unload(data.all.prob.full.2way.us.srch.2) 263 | invisible(gc()) 264 | 265 | 266 | 267 | setkey(data.all.prob.full.2way.srch, ID) 268 | fn.check.id(data.all.prob.full, 269 | data.all.prob.full.2way.srch) 270 | for (col.nam in colnames(data.all.prob.full.2way.srch)[-1]) { 271 | data.all.prob.full[ 272 | , col.nam := data.all.prob.full.2way.srch[[col.nam]], 273 | with=F] 274 | } 275 | fn.soar.unload(data.all.prob.full.2way.srch) 276 | invisible(gc()) 277 | 278 | 279 | cat("\nSaving dataset csv\n") 280 | setkey(data.all.prob.full, ID) 281 | data.all.prob.full[, ID := NULL] 282 | fn.write.csv.chunk(data=data.all.prob.full, 283 | file=fn.out.file("data.all.prob.full.csv"), 284 | compress=F) 285 | 286 | fn.clean.worker() 287 | } 288 | fn.kill.wk() 289 | 290 | 291 | 292 | ############################################################## 293 | ## full tree model data 294 | ############################################################## 295 | tic() 296 | cat("full tree model data... 
\n") 297 | 298 | 299 | fn.register.wk(1) 300 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 301 | 302 | fn.init.worker('data_build/build_tree_full') 303 | 304 | 305 | cat('\nMerging data.all.search.info + data.all.search.info.cont \n') 306 | data.all.tree.full <- merge(data.all.search.info, 307 | data.all.search.info.cont, 308 | by="SearchID") 309 | fn.soar.unload(data.all.search.info, data.all.search.info.cont) 310 | invisible(gc()) 311 | 312 | setkey(data.all.tree.full, "SearchID") 313 | data.all.tree.full <- data.all.tree.full[J(data.all.search$SearchID)] 314 | if (!all(data.all.tree.full$SearchID == data.all.search$SearchID)) { 315 | stop('SearchIDs do not match') 316 | } 317 | data.all.tree.full[, ID := data.all.search$ID] 318 | 319 | fn.to.data.all.tree.full <- function(data.add) { 320 | fn.check.id(data.all.tree.full, data.add) 321 | for (col.nam in colnames(data.add)) { 322 | if (col.nam %ni% colnames(data.all.tree.full)) { 323 | data.all.tree.full[, col.nam := data.add[[col.nam]], with=F] 324 | } 325 | } 326 | invisible(NULL) 327 | } 328 | 329 | cat('\nAdding data.all.search\n') 330 | fn.to.data.all.tree.full(data.all.search) 331 | fn.soar.unload(data.all.search) 332 | 333 | cat('\nAdding data.all.search.cont\n') 334 | fn.to.data.all.tree.full(data.all.search.cont) 335 | fn.soar.unload(data.all.search.cont) 336 | 337 | setkeyv(data.all.tree.full, c("SearchDate", "SearchID", "Position")) 338 | 339 | # cat('\nRemoving unwanted columns\n') 340 | # cols.excl <- c( 341 | # "UserID", "UserIPID", "UserAgentID", "UserDeviceID", 342 | # "AdID", "AdParams", "SearchLocID" 343 | # ) 344 | # 345 | # for (col.nam in cols.excl) { 346 | # data.all.tree.full[, col.nam := NULL, with=F] 347 | # } 348 | # invisible(gc()) 349 | 350 | cat('\nSorting columns\n') 351 | setkeyv(data.all.tree.full, c("SearchDate", "SearchID", "Position")) 352 | invisible(gc()) 353 | 354 | 355 | cat('\nFilling NAs\n') 356 | cols.extra <- c("ID", "SearchID", "SearchType", "IsClick") 357 | cols.in <- sort(setdiff(colnames(data.all.tree.full), cols.extra)) 358 | 359 | for (col.nam in cols.in) { 360 | if (any(is.na(data.all.tree.full[[col.nam]]))) { 361 | setnames(data.all.tree.full, col.nam, 'change_val') 362 | data.all.tree.full[is.na(change_val), change_val := -1] 363 | setnames(data.all.tree.full, 'change_val', col.nam) 364 | } 365 | } 366 | 367 | invisible(gc()) 368 | 369 | cat('\nSaving\n') 370 | setcolorder(data.all.tree.full, c(cols.extra, cols.in)) 371 | Store(data.all.tree.full) 372 | invisible(gc()) 373 | 374 | setkey(data.all.tree.full, ID) 375 | cat("\nSaving dataset csv\n") 376 | fn.write.csv.chunk(data=data.all.tree.full, 377 | file=fn.out.file("data.all.tree.full.csv"), 378 | compress=F) 379 | 380 | cat('\nMerging data\n') 381 | system( 382 | paste( 383 | "paste -d ',' ", 384 | fn.out.file("data.all.tree.full.csv"), 385 | fn.out.file("data.all.prob.full.csv"), 386 | "> ", fn.out.file("data.all.tree.full.merge.csv") 387 | ) 388 | ) 389 | cat('\nCompressing data\n') 390 | system( 391 | paste( 392 | "pigz -f", fn.out.file("data.all.tree.full.merge.csv") 393 | ) 394 | ) 395 | 396 | cat('\nSaving libsvm data\n') 397 | cols.in.tree.full <- c( 398 | "AdCatID","AdHistCTR","AdID","AdParams","AdPrice", 399 | "AdTitleSZ","CountAdSearch","CountAdSearchCat", 400 | "CountAdSearchLoc","CountAdUsers","CountIPUser", 401 | "CountUserAd","CountUserAdDupT1","CountUserAdDupT3", 402 | "CountUserAdT1","CountUserAdT3","CountUserSearch", 403 | "CountUserSearchCategory","CountUserSearchLocation", 404 | 
"Position","RatioAdPos1","RatioSearchRuss","SearchAdCount", 405 | "SearchAdT1Count","SearchAdT2Count","SearchAdT3Count", 406 | "SearchCatID","SearchDate","SearchLocID","SearchOrdUsrAsc", 407 | "SearchOrdUsrDesc","SearchParamsSZ","SearchQuerySZ", 408 | "SearchRussian","UserAgentFamilyID","UserAgentID", 409 | "UserAgentOSID","UserDeviceID","UserID","UserIPID", 410 | "UserLogged","UserPrevPhoneRequest", 411 | "UserPrevPrevPrevQryDate","UserPrevPrevQryDate", 412 | "UserPrevQryDate","UserPrevVisitReq","UserPrevVisitReqUni", 413 | "UserQryTotalTime","ProbAdID","ProbAdCatID","ProbAdParams", 414 | "ProbUserID","ProbUserIPID","ProbUserAgentID", 415 | "ProbUserAgentOSID","ProbUserDeviceID", 416 | "ProbUserAgentFamilyID","ProbSearchLocID", 417 | "ProbSearchCatID","ProbAdCatIDUserAgentFamilyID", 418 | "ProbAdIDUserAgentFamilyID","ProbAdCatIDUserAgentOSID", 419 | "ProbAdIDUserAgentOSID","ProbAdCatIDUserID","ProbAdIDUserID", 420 | "ProbAdCatIDUserIPID","ProbAdIDUserIPID", 421 | "ProbAdCatIDSearchCatID","ProbAdIDSearchCatID", 422 | "ProbAdCatIDSearchLocID","ProbAdIDSearchLocID", 423 | "ProbSearchCatIDUserAgentFamilyID", 424 | "ProbSearchLocIDUserAgentFamilyID", 425 | "ProbSearchCatIDUserAgentOSID", 426 | "ProbSearchLocIDUserAgentOSID","ProbSearchCatIDUserID", 427 | "ProbSearchLocIDUserID","ProbSearchCatIDUserIPID", 428 | "ProbSearchLocIDUserIPID", 429 | "ProbSearchLocIDSearchCatID") 430 | extra.tr.sel <- "int(row[\"SearchOrdUsrDesc\"]) <= 10 and" # row[\"SearchDate\"]) >= 1431396000 431 | system(paste( 432 | "cd ../avito-context-click-py &&", 433 | "pypy -u convert_csv_to_libsvm.py", 434 | "-input_files ../data/output-r/data.all.tree.full.merge.csv", 435 | "-out_selector '{", 436 | "\"../data/output-r/data.val.tr.full.libsvm\": lambda file, row: ", extra.tr.sel, " row[\"SearchType\"] in [\"hist\", \"tr\"],", 437 | "\"../data/output-r/data.val.tt.full.libsvm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 438 | "\"../data/output-r/data.test.tr.full.libsvm\": lambda file, row: ", extra.tr.sel, " row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 439 | "\"../data/output-r/data.test.tt.full.libsvm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 440 | "}'", 441 | "-weight_builder_dict '{", 442 | "\"../data/output-r/data.val.tr.full.libsvm\": lambda file, row: 1/(float(row[\"SearchOrdUsrDesc\"])),", 443 | "\"../data/output-r/data.test.tr.full.libsvm\": lambda file, row: 1/(float(row[\"SearchOrdUsrDesc\"]))", 444 | "}'", 445 | "-feat_map_file ../data/output-r/data.all.full.fmap", 446 | "-col_out IsClick", 447 | "-col_in_num", paste(cols.in.tree.full, collapse=' '), 448 | "-missing_values '' 'na' 'nan' 'NA' 'NaN' '-1'", 449 | ">> ../data/log/data_build/build_tree_full.log 2>&1")) 450 | 451 | fn.clean.worker() 452 | } 453 | fn.kill.wk() 454 | 455 | 456 | 457 | 458 | 459 | ############################################################## 460 | ## Ensenble cross validation scheme 461 | ############################################################## 462 | tic() 463 | cat("Ensenble cross validation... 
\n") 464 | 465 | data.cv.ens <- fn.cv.ens.folds() 466 | Store(data.cv.ens) 467 | 468 | toc() 469 | -------------------------------------------------------------------------------- /avito-context-click-r/data.combine.R: -------------------------------------------------------------------------------- 1 | # ############################################################# 2 | # # merge lucas and dmitry data 3 | # ############################################################# 4 | # 5 | tic() 6 | cat("Merging datasets... \n") 7 | 8 | fn.register.wk(1) 9 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 10 | 11 | fn.init.worker('data_build/combine_datasets') 12 | 13 | cat("\nLoading datasets\n") 14 | load(fn.rdata.file('data.reduced.all.RData')) 15 | 16 | setkey(data.reduced.all, ID) 17 | setkey(data.l2.all.tree, ID) 18 | 19 | cat("\nCalculating common cols\n") 20 | col.common <- intersect(colnames(data.reduced.all), 21 | colnames(data.l2.all.tree)) 22 | col.common <- unique(c(col.common, "HistCTR", "Price")) 23 | 24 | col.uniq.dtry <- sapply(data.reduced.all, 25 | function(x) length(unique(x))) 26 | col.uniq.dtry <- col.uniq.dtry[!names(col.uniq.dtry) %in% 27 | col.common] 28 | 29 | col.uniq.lucas <- sapply(data.l2.all.tree, 30 | function(x) length(unique(x))) 31 | col.uniq.lucas <- col.uniq.lucas[ 32 | !names(col.uniq.lucas) %in% 33 | col.common] 34 | 35 | # check for length and then for value 36 | cols.match <- list() 37 | for (ix in 1:length(col.uniq.lucas)) { 38 | col.uniq <- col.uniq.lucas[ix] 39 | col.same <- col.uniq.dtry[col.uniq.dtry == col.uniq] 40 | if (length(col.same) >= 1) { 41 | col.lucas.nam <- names(col.uniq) 42 | for (col.dtry.nam in names(col.same)) { 43 | if (all(data.reduced.all[[col.dtry.nam]] == 44 | data.l2.all.tree[[col.lucas.nam]])) { 45 | cols.match[[col.lucas.nam]] <- col.dtry.nam 46 | col.common <- unique(c(col.common, col.dtry.nam)) 47 | } 48 | } 49 | } 50 | } 51 | 52 | cat("\nCopying and cols and saving RData\n") 53 | for (col.nam in setdiff(col.common, "ID")) { 54 | data.reduced.all[, col.nam := NULL, with=F] 55 | } 56 | 57 | data.all.tree.dl <- data.l2.all.tree 58 | fn.soar.unload(data.l2.all.tree) 59 | 60 | setkey(data.reduced.all, ID) 61 | setkey(data.all.tree.dl, ID) 62 | 63 | for (col.nam in setdiff(colnames(data.reduced.all), "ID")) { 64 | data.all.tree.dl[, col.nam := data.reduced.all[[col.nam]], with=F] 65 | data.reduced.all[, col.nam := NULL, with=F] 66 | invisible(gc()) 67 | } 68 | rm(data.reduced.all) 69 | 70 | save(data.all.tree.dl, file=fn.rdata.file('data.all.tree.dl.RData')) 71 | 72 | cat("\nSaving dataset csv\n") 73 | fn.write.csv.chunk(data=data.all.tree.dl, 74 | file=fn.out.file("data.all.tree.dl.csv"), 75 | compress=F) 76 | 77 | cols.extra <- c("ID", "SearchID", "SearchType", "IsClick") 78 | cols.in <- sort(setdiff(colnames(data.all.tree.dl), 79 | c(cols.extra))) 80 | 81 | cols.in.combine <- cols.in 82 | Store(cols.in.combine) 83 | 84 | rm(data.all.tree.dl) 85 | invisible(gc()) 86 | 87 | system(paste( 88 | "cd ../avito-context-click-py &&", 89 | "pypy -u convert_csv_to_libsvm.py", 90 | "-input_files ../data/output-r/data.all.tree.dl.csv", 91 | "-out_selector '{", 92 | "\"../data/output-libsvm/data.val.tr.libsvm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\"],", 93 | "\"../data/output-libsvm/data.val.tt.libsvm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 94 | "\"../data/output-libsvm/data.test.tr.libsvm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 95 | 
"\"../data/output-libsvm/data.test.tt.libsvm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 96 | "}'", 97 | "-col_out IsClick", 98 | "-col_in_num", paste(cols.in.combine, collapse=' '), 99 | "-missing_values '' 'na' 'nan' 'NA' 'NaN' '-1'", 100 | ">> ../data/log/data_build/combine_datasets.log 2>&1")) 101 | 102 | system(paste( 103 | "cd ../avito-context-click-py &&", 104 | "pypy -u convert_csv_to_libsvm.py", 105 | "-input_files ../data/output-r/data.all.tree.dl.csv", 106 | "-out_selector '{", 107 | "\"../data/output-libsvm/data.val.tr.nllh.libsvm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 108 | "\"../data/output-libsvm/data.val.tt.nllh.libsvm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 109 | "\"../data/output-libsvm/data.test.tr.nllh.libsvm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 110 | "\"../data/output-libsvm/data.test.tt.nllh.libsvm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 111 | "}'", 112 | "-feat_map_file ../data/output-libsvm/data.all.nllh.fmap", 113 | "-col_out IsClick", 114 | "-col_in_num", paste(cols.in.combine[!cols.in.combine %like% 'likeli'], 115 | collapse=' '), 116 | "-missing_values '' 'na' 'nan' 'NA' 'NaN' '-1'", 117 | ">> ../data/log/data_build/combine_datasets.log 2>&1")) 118 | 119 | system(paste( 120 | "cd ../avito-context-click-py &&", 121 | "pypy -u convert_csv_to_libsvm.py", 122 | "-input_files ../data/output-r/data.all.tree.dl.csv", 123 | "-out_selector '{", 124 | "\"../data/output-libsvm/data.val.tr.nprob.libsvm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 125 | "\"../data/output-libsvm/data.val.tt.nprob.libsvm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 126 | "\"../data/output-libsvm/data.test.tr.nprob.libsvm\": lambda file, row: row[\"SearchType\"] in [\"tr\", \"val\"],", 127 | "\"../data/output-libsvm/data.test.tt.nprob.libsvm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 128 | "}'", 129 | "-weight_builder_dict '{", 130 | "\"../data/output-libsvm/data.val.tr.nprob.libsvm\": lambda file, row: 1/(float(row[\"SearchOrdUsrDesc\"])-3),", 131 | "\"../data/output-libsvm/data.test.tr.nprob.libsvm\": lambda file, row: 1/(float(row[\"SearchOrdUsrDesc\"])-0)", 132 | "}'", 133 | "-feat_map_file ../data/output-libsvm/data.all.nprob.fmap", 134 | "-col_out IsClick", 135 | "-col_in_num", paste(cols.in.combine[!cols.in.combine %like% '^Prob'], 136 | collapse=' '), 137 | "-missing_values '' 'na' 'nan' 'NA' 'NaN' '-1'", 138 | ">> ../data/log/data_build/combine_datasets.log 2>&1")) 139 | 140 | load(fn.rdata.file('data.full.all.RData')) 141 | setkey(data.full.all, ID) 142 | data.full.all[, ID := NULL] 143 | cat("\nSaving dataset csv\n") 144 | fn.write.csv.chunk(data=data.full.all, 145 | file=fn.out.file("data.dtry.full.all.csv"), 146 | compress=F) 147 | rm(data.full.all) 148 | invisible(gc()) 149 | # 150 | cat('\nMerging data\n') 151 | system( 152 | paste( 153 | "paste -d ',' ", 154 | fn.out.file("data.all.tree.full.csv"), 155 | fn.out.file("data.all.prob.full.csv"), 156 | fn.out.file("data.dtry.full.all.csv"), 157 | "> ", fn.out.file("data.all.tree.full.combine.csv") 158 | ) 159 | ) 160 | #cat('\nCompressing data\n') 161 | #system( 162 | # paste( 163 | # "pigz -f", fn.out.file("data.all.tree.full.combine.csv") 164 | # ) 165 | #) 166 | 167 | cat('\nSaving libsvm data\n') 168 | cols.in.combine.full <- c( 169 | "AdCatID","AdHistCTR","AdID","AdParams","AdPrice", 170 | "AdTitleSZ","CountAdSearch","CountAdSearchCat", 171 | "CountAdSearchLoc","CountAdUsers","CountIPUser", 172 | 
"CountUserAd","CountUserAdDupT1","CountUserAdDupT3", 173 | "CountUserAdT1","CountUserAdT3","CountUserSearch", 174 | "CountUserSearchCategory","CountUserSearchLocation", 175 | "Position","RatioAdPos1","RatioSearchRuss","SearchAdCount", 176 | "SearchAdT1Count","SearchAdT2Count","SearchAdT3Count", 177 | "SearchCatID","SearchDate","SearchLocID","SearchOrdUsrAsc", 178 | "SearchOrdUsrDesc","SearchParamsSZ","SearchQuerySZ", 179 | "SearchRussian","UserAgentFamilyID","UserAgentID", 180 | "UserAgentOSID","UserDeviceID","UserID","UserIPID", 181 | "UserLogged","UserPrevPhoneRequest", 182 | "UserPrevPrevPrevQryDate","UserPrevPrevQryDate", 183 | "UserPrevQryDate","UserPrevVisitReq","UserPrevVisitReqUni", 184 | "UserQryTotalTime", 185 | "ProbAdID","ProbAdCatID","ProbAdParams", 186 | "ProbUserID","ProbUserIPID","ProbUserAgentID", 187 | "ProbUserAgentOSID","ProbUserDeviceID", 188 | "ProbUserAgentFamilyID","ProbSearchLocID", 189 | "ProbSearchCatID","ProbAdCatIDUserAgentFamilyID", 190 | "ProbAdIDUserAgentFamilyID","ProbAdCatIDUserAgentOSID", 191 | "ProbAdIDUserAgentOSID","ProbAdCatIDUserID","ProbAdIDUserID", 192 | "ProbAdCatIDUserIPID","ProbAdIDUserIPID", 193 | "ProbAdCatIDSearchCatID","ProbAdIDSearchCatID", 194 | "ProbAdCatIDSearchLocID","ProbAdIDSearchLocID", 195 | "ProbSearchCatIDUserAgentFamilyID", 196 | "ProbSearchLocIDUserAgentFamilyID", 197 | "ProbSearchCatIDUserAgentOSID", 198 | "ProbSearchLocIDUserAgentOSID","ProbSearchCatIDUserID", 199 | "ProbSearchLocIDUserID","ProbSearchCatIDUserIPID", 200 | "ProbSearchLocIDUserIPID", 201 | 'SearchDayYear', 'SearchPosition2Count', 'SearchPosition6Count', 202 | 'SearchPosition7Count', 'AdPosition1Count', 'AdPosition7Count', 203 | 'SearchParamsCount', 'LocationUserUniqueCount', 'CategoryUserUniqueCount', 204 | 'SearchIDPreviousAge', 'AdParamsSize', 'AdParamsCount', 'UserAdCount', 205 | 'AdCategoryPriceDeviation', 'UserAdViewTotalCount', 'UserAdViewUniqueCount', 206 | 'UserAdCategoryPriceMean', 'UserAdCategoryPriceMedian', 207 | 'UserAdCategoryPriceMin', 'UserAdCategoryPriceMax', 'UserAdViewTotalCount2', 208 | 'UserAdViewUniqueCount2', 'UserAdCategoryPriceMean2', 209 | 'UserAdCategoryPriceMedian2', 'UserAdCategoryPriceMin2', 210 | 'UserAdCategoryPriceMax2' 211 | ) 212 | 213 | extra.tr.sel <- "int(row[\"SearchOrdUsrDesc\"]) <= 7 and" # row[\"SearchDate\"]) >= 1431396000 214 | system(paste( 215 | "cd ../avito-context-click-py &&", 216 | "pypy -u convert_csv_to_libsvm.py", 217 | "-input_files ../data/output-r/data.all.tree.full.combine.csv", 218 | "-out_selector '{", 219 | "\"../data/output-libsvm/data.val.tr.full.libsvm\": lambda file, row: ", extra.tr.sel, " row[\"SearchType\"] in [\"hist\", \"tr\"],", 220 | "\"../data/output-libsvm/data.val.tt.full.libsvm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 221 | "\"../data/output-libsvm/data.test.tr.full.libsvm\": lambda file, row: ", extra.tr.sel, " row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 222 | "\"../data/output-libsvm/data.test.tt.full.libsvm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 223 | "}'", 224 | "-feat_map_file ../data/output-libsvm/data.all.full.fmap", 225 | "-col_out IsClick", 226 | "-col_in_num", paste(unique(cols.in.combine.full), collapse=' '), 227 | "-missing_values '' 'na' 'nan' 'NA' 'NaN' '-1'", 228 | ">> ../data/log/data_build/combine_datasets.log 2>&1")) 229 | 230 | 231 | cat("\nDone!\n") 232 | 233 | fn.clean.worker() 234 | } 235 | fn.kill.wk() 236 | 237 | -------------------------------------------------------------------------------- 
/avito-context-click-r/main.R: -------------------------------------------------------------------------------- 1 | source("_fn.base.R") 2 | source("_utils.R") 3 | source("data.build.R") 4 | source("train.l1.fm.01.R") 5 | source("train.l1.fm.02.R") 6 | source("train.l1.fm.03.R") 7 | source("train.l1.fm.04.R") 8 | source("train.l1.fm.05.R") 9 | source("train.l1.ftrl.04.R") 10 | source("train.l1.ftrl.05.R") 11 | source("train.l1.ftrl.06.R") 12 | 13 | source("data.build.tree.R") 14 | source("data.build.dtry.R") 15 | source("data.combine.R") 16 | 17 | source("train.l1.xgb.03.R") 18 | source("train.l1.xgb.05.R") 19 | source("train.l2.xgb.02.R") 20 | source("train.xgb.dtry.R") 21 | 22 | source("train.zens.R") 23 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.fm.01.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("fm_01/build_fm") 9 | system(paste( 10 | "cd ../avito-context-click-py &&", 11 | "pypy -u convert_csv_to_libffm.py", 12 | "-input_files ../data/output-r/data.all.lr.csv", 13 | "-out_selector '{", 14 | "\"../data/output-libffm/fm_01/data.val.tr.small.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 15 | "\"../data/output-libffm/fm_01/data.tr.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\"],", 16 | "\"../data/output-libffm/fm_01/data.tr.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 17 | "\"../data/output-libffm/fm_01/data.val.tr.small.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 18 | "\"../data/output-libffm/fm_01/data.val.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\"],", 19 | "\"../data/output-libffm/fm_01/data.val.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 20 | "\"../data/output-libffm/fm_01/data.test.tr.small.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\", \"val\"],", 21 | "\"../data/output-libffm/fm_01/data.test.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 22 | "\"../data/output-libffm/fm_01/data.test.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 23 | "}'", 24 | "-col_out IsClick", 25 | "-col_in_cat", 26 | " AdCatID AdHistCTRBin AdID AdParams AdPriceBin AdTitleSZBin", 27 | " Position", 28 | " SearchAdCount SearchAdT1Count SearchAdT2Count SearchAdT3Count", 29 | " SearchCatID SearchLocID SearchParamsSZBin SearchQuerySZBin SearchRussian", 30 | " UserID UserIPID UserPrevQryDateBin UserQryTotalTimeBin", 31 | "-old_format", 32 | ">> ../data/log/fm_01/build_fm.log 2>&1")) 33 | fn.clean.worker() 34 | NULL 35 | } 36 | fn.kill.wk() 37 | 38 | ############################################################# 39 | # train phase 40 | ############################################################# 41 | fn.register.wk(2) 42 | data.fm.01.pred.tmp <- foreach(test.type=c("tr", "val", "test"), .combine=rbind, 43 | .noexport=all.noexport) %dopar% { 44 | 45 | log.name <- paste0("fm_01/fm_01_",test.type) 46 | fn.init.worker(log.name) 47 | 48 | system(paste( 49 | "../fm/fm", 50 | "-k 16 -t 20 -r 0.02 -s 6 -l 0.00001", 51 | paste0("../data/output-libffm/fm_01/data.",test.type,".tt.fm "), 52 | paste0("../data/output-libffm/fm_01/data.",test.type,".tr.fm "), 53 | " >> ", 
paste0("../data/log/",log.name,".log"), " 2>&1")) 54 | 55 | data.pred <- data.table( 56 | ID = data.all.lr.id[SearchType==test.type,ID], 57 | Pred = scan(paste0("../data/output-libffm/fm_01/data.",test.type,".tt.fm.out")) 58 | ) 59 | fn.print.err(data.pred) 60 | 61 | fn.clean.worker() 62 | data.pred 63 | } 64 | fn.kill.wk() 65 | 66 | data.fm.01.pred.tmp <- data.fm.01.pred.tmp[order(ID)] 67 | Store(data.fm.01.pred.tmp) 68 | 69 | data.fm.01.pred <- copy(data.fm.01.pred.tmp) 70 | 71 | fn.print.err(data.fm.01.pred) 72 | # Size Loss 73 | # 1 7888752 0.04148 - tr 74 | # 1 8512834 0.04334 - val 75 | # 1 16401586 0.04244 - all 76 | 77 | Store(data.fm.01.pred) 78 | 79 | # fn.write.submission(data.fm.01.pred, "data.fm.01.pred") 80 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.fm.02.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("fm_02/build_fm") 9 | system(paste( 10 | "cd ../avito-context-click-py &&", 11 | "pypy -u convert_csv_to_libffm.py", 12 | "-input_files ../data/output-r/data.all.lr.csv", 13 | "-out_selector '{", 14 | "\"../data/output-libffm/fm_02/data.tr.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\"],", 15 | "\"../data/output-libffm/fm_02/data.tr.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 16 | "\"../data/output-libffm/fm_02/data.val.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\"],", 17 | "\"../data/output-libffm/fm_02/data.val.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 18 | "\"../data/output-libffm/fm_02/data.test.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 19 | "\"../data/output-libffm/fm_02/data.test.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 20 | "}'", 21 | "-col_out IsClick", 22 | "-col_in_cat", 23 | " AdCatID AdHistCTRBin AdParams AdPriceBin AdTitleSZBin", 24 | " Position SearchAdT1Count", 25 | "-old_format", 26 | ">> ../data/log/fm_02/build_fm.log 2>&1")) 27 | fn.clean.worker() 28 | NULL 29 | } 30 | fn.kill.wk() 31 | 32 | ############################################################# 33 | # train phase 34 | ############################################################# 35 | fn.register.wk(1) # , "tr", "test" 36 | data.fm.02.pred.tmp <- foreach(test.type=c("val", "tr", "test"), .combine=rbind, 37 | .noexport=all.noexport) %dopar% { 38 | 39 | log.name <- paste0("fm_02/fm_02_",test.type) 40 | fn.init.worker(log.name) 41 | 42 | system(paste( 43 | "../fm/fm", 44 | "-k 12 -t 5 -r 0.015 -s 6 -l 0.00001", 45 | paste0("../data/output-libffm/fm_02/data.",test.type,".tt.fm "), 46 | paste0("../data/output-libffm/fm_02/data.",test.type,".tr.fm "), 47 | " >> ", paste0("../data/log/",log.name,".log"), " 2>&1")) 48 | 49 | data.pred <- data.table( 50 | ID = data.all.lr.id[SearchType==test.type,ID], 51 | Pred = scan(paste0("../data/output-libffm/fm_02/data.",test.type,".tt.fm.out")) 52 | ) 53 | fn.print.err(data.pred) 54 | 55 | fn.clean.worker() 56 | data.pred 57 | } 58 | fn.kill.wk() 59 | 60 | data.fm.02.pred.tmp <- data.fm.02.pred.tmp[order(ID)] 61 | Store(data.fm.02.pred.tmp) 62 | 63 | data.fm.02.pred <- copy(data.fm.02.pred.tmp) 64 | 65 | fn.print.err(data.fm.02.pred) 66 | # Size Loss 67 | # 1 7888752 
0.04474 - tr 68 | # 1 8512834 0.04682 - val 69 | # 1 16401586 0.04582 - all 70 | 71 | Store(data.fm.02.pred) 72 | 73 | # fn.write.submission(data.fm.02.pred, "data.fm.02.pred") 74 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.fm.03.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("fm_03/build_fm") 9 | system(paste( 10 | "cd ../avito-context-click-py &&", 11 | "pypy -u convert_csv_to_libffm.py", 12 | "-input_files ../data/output-r/data.all.lr.csv", 13 | "-out_selector '{", 14 | "\"../data/output-libffm/fm_03/data.tr.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\"],", 15 | "\"../data/output-libffm/fm_03/data.tr.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 16 | "\"../data/output-libffm/fm_03/data.val.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\"],", 17 | "\"../data/output-libffm/fm_03/data.val.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 18 | "\"../data/output-libffm/fm_03/data.test.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 19 | "\"../data/output-libffm/fm_03/data.test.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 20 | "}'", 21 | "-col_out IsClick", 22 | "-col_in_cat", 23 | " SearchAdCount SearchAdT1Count SearchAdT2Count SearchAdT3Count ", 24 | " SearchCatID SearchLocID", 25 | " SearchParamsSZBin SearchQuerySZBin SearchRussian ", 26 | " Position", 27 | "-old_format", 28 | ">> ../data/log/fm_03/build_fm.log 2>&1")) 29 | fn.clean.worker() 30 | NULL 31 | } 32 | fn.kill.wk() 33 | 34 | ############################################################# 35 | # train phase 36 | ############################################################# 37 | fn.register.wk(1) 38 | data.fm.03.pred.tmp <- foreach(test.type=c("val", "tr", "test"), .combine=rbind, 39 | .noexport=all.noexport) %dopar% { 40 | 41 | log.name <- paste0("fm_03/fm_03_",test.type) 42 | fn.init.worker(log.name) 43 | 44 | system(paste( 45 | "../fm/fm", 46 | "-k 12 -t 3 -r 0.008 -s 12 -l 0.00001", 47 | paste0("../data/output-libffm/fm_03/data.",test.type,".tt.fm "), 48 | paste0("../data/output-libffm/fm_03/data.",test.type,".tr.fm "), 49 | " >> ", paste0("../data/log/",log.name,".log"), " 2>&1")) 50 | 51 | data.pred <- data.table( 52 | ID = data.all.lr.id[SearchType==test.type,ID], 53 | Pred = scan(paste0("../data/output-libffm/fm_03/data.",test.type,".tt.fm.out")) 54 | ) 55 | fn.print.err(data.pred) 56 | 57 | fn.clean.worker() 58 | data.pred 59 | } 60 | fn.kill.wk() 61 | 62 | data.fm.03.pred.tmp <- data.fm.03.pred.tmp[order(ID)] 63 | Store(data.fm.03.pred.tmp) 64 | 65 | data.fm.03.pred <- copy(data.fm.03.pred.tmp) 66 | 67 | fn.print.err(data.fm.03.pred) 68 | # Size Loss 69 | # 1 7888752 0.04507 - tr 70 | # 1 8512834 0.04713 - val 71 | # 1 16401586 0.04614 - all 72 | 73 | Store(data.fm.03.pred) 74 | 75 | # fn.write.submission(data.fm.03.pred, "data.fm.03.pred") 76 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.fm.04.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | 
############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("fm_04/build_fm") 9 | system(paste( 10 | "cd ../avito-context-click-py &&", 11 | "pypy -u convert_csv_to_libffm.py", 12 | "-input_files ../data/output-r/data.all.lr.csv", 13 | "-out_selector '{", 14 | "\"../data/output-libffm/fm_04/data.tr.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\"],", 15 | "\"../data/output-libffm/fm_04/data.tr.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 16 | "\"../data/output-libffm/fm_04/data.val.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\"],", 17 | "\"../data/output-libffm/fm_04/data.val.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 18 | "\"../data/output-libffm/fm_04/data.test.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 19 | "\"../data/output-libffm/fm_04/data.test.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 20 | "}'", 21 | "-col_out IsClick", 22 | "-col_in_cat", 23 | " UserAgentFamilyID UserAgentID UserAgentOSID UserDeviceID ", 24 | " UserPrevPhoneRequest ", 25 | " UserPrevPrevPrevQryDateBin UserPrevPrevQryDateBin ", 26 | " UserPrevQryDateBin UserPrevVisitReq UserPrevVisitReqUni ", 27 | " UserQryTotalTimeBin", 28 | " Position", 29 | "-old_format", 30 | ">> ../data/log/fm_04/build_fm.log 2>&1")) 31 | fn.clean.worker() 32 | NULL 33 | } 34 | fn.kill.wk() 35 | 36 | ############################################################# 37 | # train phase 38 | ############################################################# 39 | fn.register.wk(1) 40 | data.fm.04.pred.tmp <- foreach(test.type=c("val", "tr", "test"), .combine=rbind, 41 | .noexport=all.noexport) %dopar% { 42 | 43 | log.name <- paste0("fm_04/fm_04_",test.type) 44 | fn.init.worker(log.name) 45 | 46 | system(paste( 47 | "../fm/fm", 48 | "-k 12 -t 5 -r 0.004 -s 12 -l 0.00001", 49 | paste0("../data/output-libffm/fm_04/data.",test.type,".tt.fm "), 50 | paste0("../data/output-libffm/fm_04/data.",test.type,".tr.fm "), 51 | " >> ", paste0("../data/log/",log.name,".log"), " 2>&1")) 52 | 53 | data.pred <- data.table( 54 | ID = data.all.lr.id[SearchType==test.type,ID], 55 | Pred = scan(paste0("../data/output-libffm/fm_04/data.",test.type,".tt.fm.out")) 56 | ) 57 | fn.print.err(data.pred) 58 | 59 | fn.clean.worker() 60 | data.pred 61 | } 62 | fn.kill.wk() 63 | 64 | data.fm.04.pred.tmp <- data.fm.04.pred.tmp[order(ID)] 65 | Store(data.fm.04.pred.tmp) 66 | 67 | data.fm.04.pred <- copy(data.fm.04.pred.tmp) 68 | 69 | fn.print.err(data.fm.04.pred) 70 | # Size Loss 71 | # 1 7888752 0.04888 - tr 72 | # 1 8512834 0.05135 - val 73 | # 1 16401586 0.05017 - all 74 | 75 | Store(data.fm.04.pred) 76 | 77 | # fn.write.submission(data.fm.04.pred, "data.fm.04.pred") 78 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.fm.05.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("fm_05/build_fm") 9 | system(paste( 10 | "cd ../avito-context-click-py &&", 11 | "pypy -u convert_csv_to_libffm.py", 12 | "-input_files ../data/output-r/data.all.lr.csv", 13 | "-out_selector '{", 14 | 
"\"../data/output-libffm/fm_05/data.tr.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\"],", 15 | "\"../data/output-libffm/fm_05/data.tr.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"tr\"],", 16 | "\"../data/output-libffm/fm_05/data.val.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\"],", 17 | "\"../data/output-libffm/fm_05/data.val.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"val\"],", 18 | "\"../data/output-libffm/fm_05/data.test.tr.fm\": lambda file, row: row[\"SearchType\"] in [\"hist\", \"tr\", \"val\"],", 19 | "\"../data/output-libffm/fm_05/data.test.tt.fm\": lambda file, row: row[\"SearchType\"] in [\"test\"]", 20 | "}'", 21 | "-col_out IsClick", 22 | "-col_in_cat", 23 | " CountAdSearchBin CountAdSearchCatBin CountAdSearchLocBin ", 24 | " CountAdUsersBin CountIPUserBin CountUserAdBin ", 25 | " CountUserAdDupT1Bin CountUserAdDupT3Bin CountUserAdT1Bin ", 26 | " CountUserAdT3Bin CountUserSearchBin CountUserSearchCategoryBin ", 27 | " CountUserSearchLocationBin ", 28 | " RatioAdPos1Bin RatioSearchRussBin ", 29 | " Position", 30 | "-old_format", 31 | ">> ../data/log/fm_05/build_fm.log 2>&1")) 32 | fn.clean.worker() 33 | NULL 34 | } 35 | fn.kill.wk() 36 | 37 | ############################################################# 38 | # train phase 39 | ############################################################# 40 | fn.register.wk(1) # 41 | data.fm.05.pred.tmp <- foreach(test.type=c("val" , "tr", "test"), .combine=rbind, 42 | .noexport=all.noexport) %dopar% { 43 | 44 | log.name <- paste0("fm_05/fm_05_",test.type) 45 | fn.init.worker(log.name) 46 | 47 | system(paste( 48 | "../fm/fm", 49 | "-k 12 -t 5 -r 0.004 -s 12 -l 0.00001", 50 | paste0("../data/output-libffm/fm_05/data.",test.type,".tt.fm "), 51 | paste0("../data/output-libffm/fm_05/data.",test.type,".tr.fm "), 52 | " >> ", paste0("../data/log/",log.name,".log"), " 2>&1")) 53 | 54 | data.pred <- data.table( 55 | ID = data.all.lr.id[SearchType==test.type,ID], 56 | Pred = scan(paste0("../data/output-libffm/fm_05/data.",test.type,".tt.fm.out")) 57 | ) 58 | fn.print.err(data.pred) 59 | 60 | fn.clean.worker() 61 | data.pred 62 | } 63 | fn.kill.wk() 64 | 65 | data.fm.05.pred.tmp <- data.fm.05.pred.tmp[order(ID)] 66 | Store(data.fm.05.pred.tmp) 67 | 68 | data.fm.05.pred <- copy(data.fm.05.pred.tmp) 69 | 70 | fn.print.err(data.fm.05.pred) 71 | # Size Loss 72 | # 1 7888752 0.04992 - tr 73 | # 1 8512834 0.04812 - val 74 | # 1 16401586 0.04905 - all 75 | 76 | Store(data.fm.05.pred) 77 | 78 | # fn.write.submission(data.fm.05.pred, "data.fm.05.pred") 79 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.ftrl.04.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("ftrl_04/build_csv") 9 | 10 | cat("\nLoading data...\n") 11 | data.all.cur <- data.all.lr 12 | # fn.soar.unload(data.all.lr) 13 | 14 | cols.extra <- c("ID", "SearchID", "SearchType", "IsClick", "SearchDate") 15 | cols.in <- setdiff(colnames(data.all.cur), cols.extra) 16 | 17 | # data.all.cur[, SearchType:=NULL] 18 | 19 | for (jx in 1:2) { 20 | fold.name <- paste0("ftrl_04_", jx) 21 | 22 | if (jx == 1) { 23 | setkeyv(data.all.cur, c("SearchDate", "SearchID", "Position")) 24 | } else { 25 | 
setkeyv(data.all.cur, c("Position", "SearchCatID", "SearchRussian", 26 | "SearchDate", "SearchID")) 27 | } 28 | data.search.type <- data.all.cur$SearchType 29 | for (test.type in c("tr", "val", "test")) { 30 | cat("\nTest type", test.type, jx, "...\n") 31 | 32 | data.fold <- fn.create.data.fold(fold.name, test.type) 33 | data.fold$writedir <- fn.py.file(data.fold$basename) 34 | dir.create(data.fold$writedir, showWarnings = F, recursive = T) 35 | 36 | data.fold$col.out <- "IsClick" 37 | data.fold$cols.in <- cols.in 38 | 39 | tr.type <- fn.lr.tr.type(test.type) 40 | cat("\ntr.type", paste(tr.type, collapse=", "), "...\n") 41 | 42 | data.fold$tr.idx <- which(data.search.type %in% unique(tr.type)) 43 | cat("\ntr.idx", length(data.fold$tr.idx), "...\n") 44 | 45 | data.fold$test.idx <- which(data.search.type %in% unique(test.type)) 46 | cat("\ntest.idx", length(data.fold$test.idx), "...\n") 47 | data.fold$test.pred.file <- fn.file.data.fold(data.fold, "eval.pred") 48 | 49 | data.all.cur[, IsTestRow:=0] 50 | data.all.cur[data.fold$test.idx, IsTestRow:=1] 51 | all.ix <- sort(unique(c(data.fold$tr.idx, data.fold$test.idx))) 52 | cat("\nall.ix", length(all.ix), "...\n") 53 | 54 | cat("\nSaving", test.type, jx, "csv...\n") 55 | data.fold$all.file <- fn.file.data.fold(data.fold, "all.csv") 56 | fn.write.csv.chunk( 57 | data=data.all.cur, subset=all.ix, 58 | file=data.fold$all.file, row.names = F, compress = F 59 | ) 60 | data.all.cur[, IsTestRow:=0] 61 | 62 | cat("\nSaving", test.type, "data.fold...\n") 63 | fn.save.data.fold(data.fold) 64 | } 65 | } 66 | NULL 67 | } 68 | fn.kill.wk() 69 | 70 | ############################################################# 71 | # train phase 72 | ############################################################# 73 | train.grid = expand.grid( 74 | test.type=c("tr", "val", "test"), 75 | jx=1:2, 76 | stringsAsFactors=F 77 | ) 78 | 79 | fn.register.wk(nrow(train.grid)) 80 | data.ftrl.04.pred.tmp <- foreach( 81 | r=1:nrow(train.grid), .combine=rbind, .noexport=all.noexport) %dopar% { 82 | 83 | test.type <- train.grid$test.type[r] 84 | jx <- train.grid$jx[r] 85 | 86 | fn.init.fold.worker(paste0("ftrl_04_", jx), test.type) 87 | # fn.clean.worker() 88 | 89 | epochs <- 1 90 | system(paste( 91 | "cd ../avito-context-click-py && pypy -u train_ftrl.py", 92 | "-train_file", data.fold$all.file, 93 | # "-train_model_file {TRAIN_FILE}.pklz", 94 | "-test_pred_file", data.fold$test.pred.file, 95 | "-test_pred_extra_cols ID -test_pred_col Pred", 96 | "-col_out IsClick", 97 | "-col_in_cat", 98 | " AdCatID AdHistCTRBin AdID AdParams AdPriceBin AdTitleSZBin ", 99 | " Position ", 100 | " SearchAdCount SearchAdT1Count SearchAdT2Count SearchAdT3Count ", 101 | " SearchCatID SearchLocID", 102 | " SearchParamsSZBin SearchQuerySZBin SearchRussian ", 103 | " UserAgentFamilyID UserAgentID UserAgentOSID UserDeviceID ", 104 | " UserID UserIPID UserPrevPhoneRequest ", 105 | " UserPrevPrevPrevQryDateBin UserPrevPrevQryDateBin ", 106 | " UserPrevQryDateBin UserPrevVisitReq UserPrevVisitReqUni ", 107 | "-train_is_test_col IsTestRow", 108 | "-bits 27 -alpha 0.07 -beta 1.0 -l1 0.01 -l2 1. 
-dropout 0", 109 | "-two_way 'Ad Us' 'Us Search' 'Ad Search' 'Ad Pos' 'Us Pos' 'Pos Search'", 110 | "-seed 7 -epochs", epochs, 111 | # "-load_model", 112 | " >> ", paste0("../data/log/", data.fold$logname, ".log"), " 2>&1" 113 | )) 114 | 115 | data.fold$test.pred <- fread(paste(data.fold$test.pred.file, epochs, sep=".")) 116 | fn.print.err(data.fold$test.pred) 117 | 118 | fn.clean.worker() 119 | 120 | data.fold$test.pred 121 | } 122 | fn.kill.wk() 123 | 124 | data.ftrl.04.pred.tmp <- data.ftrl.04.pred.tmp[order(ID)] 125 | Store(data.ftrl.04.pred.tmp) 126 | 127 | data.ftrl.04.pred <- data.ftrl.04.pred.tmp[ 128 | ,list( 129 | Pred=sum(Pred)/.N 130 | ), by="ID" 131 | ] 132 | 133 | fn.print.err(data.ftrl.04.pred) 134 | # Size Loss 135 | # 1 7888752 0.04148 - tr 136 | # 1 8512834 0.04335 - val 137 | # 1 16401586 0.04245 - all 138 | 139 | 140 | Store(data.ftrl.04.pred) 141 | # 142 | # # fn.write.submission(data.ftrl.04.pred, "data.ftrl.04.pred") 143 | 144 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.ftrl.05.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("ftrl_05/build_csv") 9 | 10 | cat("\nLoading data...\n") 11 | data.all.cur <- data.all.lr 12 | # fn.soar.unload(data.all.lr) 13 | 14 | cols.extra <- c("ID", "SearchID", "SearchType", "IsClick", "SearchDate") 15 | cols.in <- setdiff(colnames(data.all.cur), cols.extra) 16 | 17 | # data.all.cur[, SearchType:=NULL] 18 | 19 | for (jx in 1) { # :2 20 | fold.name <- paste0("ftrl_05_", jx) 21 | 22 | if (jx == 1) { 23 | setkeyv(data.all.cur, c("SearchDate", "SearchID", "Position")) 24 | } else { 25 | setkeyv(data.all.cur, c("Position", "SearchCatID", "SearchRussian", 26 | "SearchDate", "SearchID")) 27 | } 28 | data.search.type <- data.all.cur$SearchType 29 | for (test.type in c("val", "test")) { # "tr", 30 | cat("\nTest type", test.type, jx, "...\n") 31 | 32 | data.fold <- fn.create.data.fold(fold.name, test.type) 33 | data.fold$writedir <- fn.py.file(data.fold$basename) 34 | dir.create(data.fold$writedir, showWarnings = F, recursive = T) 35 | 36 | data.fold$col.out <- "IsClick" 37 | data.fold$cols.in <- cols.in 38 | 39 | tr.type <- fn.lr.tr.type(test.type) 40 | cat("\ntr.type", paste(tr.type, collapse=", "), "...\n") 41 | 42 | data.fold$tr.idx <- which(data.search.type %in% unique(tr.type)) 43 | cat("\ntr.idx", length(data.fold$tr.idx), "...\n") 44 | 45 | data.fold$test.idx <- which(data.search.type %in% unique(test.type)) 46 | cat("\ntest.idx", length(data.fold$test.idx), "...\n") 47 | data.fold$test.pred.file <- fn.file.data.fold(data.fold, "eval.pred") 48 | 49 | data.all.cur[, IsTestRow:=0] 50 | data.all.cur[data.fold$test.idx, IsTestRow:=1] 51 | all.ix <- sort(unique(c(data.fold$tr.idx, data.fold$test.idx))) 52 | cat("\nall.ix", length(all.ix), "...\n") 53 | 54 | cat("\nSaving", test.type, jx, "csv...\n") 55 | data.fold$all.file <- fn.file.data.fold(data.fold, "all.csv") 56 | fn.write.csv.chunk( 57 | data=data.all.cur, subset=all.ix, 58 | file=data.fold$all.file, row.names = F, compress = F 59 | ) 60 | data.all.cur[, IsTestRow:=0] 61 | 62 | cat("\nSaving", test.type, "data.fold...\n") 63 | fn.save.data.fold(data.fold) 64 | } 65 | } 66 | NULL 67 | } 68 | fn.kill.wk() 69 | 70 | 
############################################################# 71 | # train phase 72 | ############################################################# 73 | train.grid = expand.grid( 74 | test.type=c("tr", "val", "test"), 75 | jx=1:2, 76 | stringsAsFactors=F 77 | ) 78 | 79 | fn.register.wk(nrow(train.grid)) 80 | data.ftrl.05.pred.tmp <- foreach( 81 | r=1:nrow(train.grid), .combine=rbind, .noexport=all.noexport) %dopar% { 82 | 83 | test.type <- train.grid$test.type[r] 84 | jx <- train.grid$jx[r] 85 | 86 | fn.init.fold.worker(paste0("ftrl_05_", jx), test.type) 87 | # fn.clean.worker() 88 | 89 | epochs <- 2 90 | system(paste( 91 | "cd ../avito-context-click-py && pypy -u train_ftrl.py", 92 | "-train_file", data.fold$all.file, 93 | # "-train_model_file {TRAIN_FILE}.pklz", 94 | "-test_pred_file", data.fold$test.pred.file, 95 | "-test_pred_extra_cols ID -test_pred_col Pred", 96 | "-col_out IsClick", 97 | "-col_in_cat", 98 | " AdCatID AdHistCTRBin AdID AdParams AdPriceBin AdTitleSZBin ", 99 | " CountAdSearchBin CountAdSearchCatBin CountAdSearchLocBin ", 100 | " CountAdUsersBin CountIPUserBin CountUserAdBin ", 101 | " CountUserAdDupT1Bin CountUserAdDupT3Bin CountUserAdT1Bin ", 102 | " CountUserAdT3Bin CountUserSearchBin CountUserSearchCategoryBin ", 103 | " CountUserSearchLocationBin ", 104 | " Position ", 105 | " RatioAdPos1Bin RatioSearchRussBin ", 106 | " SearchAdCount SearchAdT1Count SearchAdT2Count SearchAdT3Count ", 107 | " SearchCatID SearchLocID SearchOrdUsrAsc SearchOrdUsrDesc ", 108 | " SearchParamsSZBin SearchQuerySZBin SearchRussian ", 109 | " UserAgentFamilyID UserAgentID UserAgentOSID UserDeviceID ", 110 | " UserID UserIPID UserLogged UserPrevPhoneRequest ", 111 | " UserPrevPrevPrevQryDateBin UserPrevPrevQryDateBin ", 112 | " UserPrevQryDateBin UserPrevVisitReq UserPrevVisitReqUni ", 113 | " UserQryTotalTimeBin", 114 | "-train_is_test_col IsTestRow", 115 | "-bits 27 -alpha .008 -beta .1 -l1 0.1 -l2 0.15 -dropout 0", 116 | "-two_way 'Ad Us' 'Us Search' 'Ad Search' 'Ad Pos' 'Us Pos' 'Pos Search'", 117 | "-seed 7 -epochs", epochs, 118 | # "-load_model", 119 | " >> ", paste0("../data/log/", data.fold$logname, ".log"), " 2>&1" 120 | )) 121 | 122 | data.fold$test.pred <- fread(paste(data.fold$test.pred.file, epochs, sep=".")) 123 | fn.print.err(data.fold$test.pred) 124 | 125 | fn.clean.worker() 126 | 127 | data.fold$test.pred 128 | } 129 | fn.kill.wk() 130 | 131 | data.ftrl.05.pred.tmp <- data.ftrl.05.pred.tmp[order(ID)] 132 | Store(data.ftrl.05.pred.tmp) 133 | 134 | data.ftrl.05.pred <- data.ftrl.05.pred.tmp[ 135 | ,list( 136 | Pred=sum(Pred)/.N 137 | ), by="ID" 138 | ] 139 | 140 | fn.print.err(data.ftrl.05.pred) 141 | # Size Loss 142 | # 1 7888752 0.04122 - tr 143 | # 1 8512834 0.04314 - val 144 | # 1 16401586 0.04222 - all 145 | 146 | Store(data.ftrl.05.pred) 147 | # 148 | # # fn.write.submission(data.ftrl.05.pred, "data.ftrl.05.pred") 149 | 150 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.ftrl.06.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # save csv to disk data 3 | ############################################################# 4 | 5 | fn.register.wk(1) 6 | tmp <- foreach(tmp=1, .noexport=all.noexport) %dopar% { 7 | 8 | fn.init.worker("ftrl_06/build_csv") 9 | 10 | cat("\nLoading data...\n") 11 | data.all.cur <- data.all.lr 12 | # fn.soar.unload(data.all.lr) 13 | 14 | cols.extra <- c("ID", "SearchID", 
"SearchType", "IsClick", "SearchDate") 15 | cols.in <- setdiff(colnames(data.all.cur), cols.extra) 16 | 17 | # data.all.cur[, SearchType:=NULL] 18 | 19 | for (jx in 1:2) { 20 | fold.name <- paste0("ftrl_06_", jx) 21 | 22 | if (jx == 1) { 23 | setkeyv(data.all.cur, c("SearchDate", "SearchID", "Position")) 24 | } else { 25 | setkeyv(data.all.cur, c("Position", "SearchCatID", "SearchRussian", 26 | "SearchDate", "SearchID")) 27 | } 28 | data.search.type <- data.all.cur$SearchType 29 | for (test.type in c("tr", "val", "test")) { 30 | cat("\nTest type", test.type, jx, "...\n") 31 | 32 | data.fold <- fn.create.data.fold(fold.name, test.type) 33 | data.fold$writedir <- fn.py.file(data.fold$basename) 34 | dir.create(data.fold$writedir, showWarnings = F, recursive = T) 35 | 36 | data.fold$col.out <- "IsClick" 37 | data.fold$cols.in <- cols.in 38 | 39 | tr.type <- fn.lr.tr.type(test.type) 40 | cat("\ntr.type", paste(tr.type, collapse=", "), "...\n") 41 | 42 | data.fold$tr.idx <- which(data.search.type %in% unique(tr.type)) 43 | cat("\ntr.idx", length(data.fold$tr.idx), "...\n") 44 | 45 | data.fold$test.idx <- which(data.search.type %in% unique(test.type)) 46 | cat("\ntest.idx", length(data.fold$test.idx), "...\n") 47 | data.fold$test.pred.file <- fn.file.data.fold(data.fold, "eval.pred") 48 | 49 | data.all.cur[, IsTestRow:=0] 50 | data.all.cur[data.fold$test.idx, IsTestRow:=1] 51 | all.ix <- sort(unique(c(data.fold$tr.idx, data.fold$test.idx))) 52 | cat("\nall.ix", length(all.ix), "...\n") 53 | 54 | cat("\nSaving", test.type, jx, "csv...\n") 55 | data.fold$all.file <- fn.file.data.fold(data.fold, "all.csv") 56 | fn.write.csv.chunk( 57 | data=data.all.cur, subset=all.ix, 58 | file=data.fold$all.file, row.names = F, compress = F 59 | ) 60 | data.all.cur[, IsTestRow:=0] 61 | 62 | cat("\nSaving", test.type, "data.fold...\n") 63 | fn.save.data.fold(data.fold) 64 | } 65 | } 66 | NULL 67 | } 68 | fn.kill.wk() 69 | 70 | ############################################################# 71 | # train phase 72 | ############################################################# 73 | train.grid = expand.grid( 74 | test.type=c("tr", "val", "test"), 75 | jx=1:2, 76 | stringsAsFactors=F 77 | ) 78 | 79 | fn.register.wk(nrow(train.grid)) 80 | data.ftrl.06.pred.tmp <- foreach( 81 | r=1:nrow(train.grid), .combine=rbind, .noexport=all.noexport) %dopar% { 82 | 83 | test.type <- train.grid$test.type[r] 84 | jx <- train.grid$jx[r] 85 | 86 | fn.init.fold.worker(paste0("ftrl_06_", jx), test.type) 87 | # fn.clean.worker() 88 | 89 | epochs <- 2 90 | system(paste( 91 | "cd ../avito-context-click-py && pypy -u train_ftrl.py", 92 | "-train_file", data.fold$all.file, 93 | "-test_pred_file", data.fold$test.pred.file, 94 | "-test_pred_extra_cols ID -test_pred_col Pred", 95 | "-col_out IsClick", 96 | "-col_in_cat", 97 | " AdCatID AdHistCTRBin AdID AdParams AdPriceBin AdTitleSZBin ", 98 | " CountAdSearchBin CountAdSearchCatBin CountAdSearchLocBin ", 99 | " CountAdUsersBin CountIPUserBin CountUserAdBin ", 100 | " CountUserAdDupT1Bin CountUserAdDupT3Bin CountUserAdT1Bin ", 101 | " CountUserAdT3Bin CountUserSearchBin CountUserSearchCategoryBin ", 102 | " CountUserSearchLocationBin ", 103 | " Position ", 104 | " RatioAdPos1Bin RatioSearchRussBin ", 105 | " SearchAdCount SearchAdT1Count SearchAdT2Count SearchAdT3Count ", 106 | " SearchCatID SearchLocID SearchOrdUsrAsc SearchOrdUsrDesc ", 107 | " SearchParamsSZBin SearchQuerySZBin SearchRussian ", 108 | " UserAgentFamilyID UserAgentID UserAgentOSID UserDeviceID ", 109 | " UserID UserIPID UserLogged 
UserPrevPhoneRequest ", 110 | " UserPrevPrevPrevQryDateBin UserPrevPrevQryDateBin ", 111 | " UserPrevQryDateBin UserPrevVisitReq UserPrevVisitReqUni ", 112 | " UserQryTotalTimeBin", 113 | "-train_is_test_col IsTestRow", 114 | "-bits 27 -alpha 0.07 -beta 1.0 -l1 0.01 -l2 1. -dropout 0", 115 | "-two_way 'AdID SearchCatID' 'AdID UserID' 'AdCatID SearchCatID'", 116 | " 'AdID SearchLocID' 'SearchCatID UserID' 'AdCatID UserID'", 117 | " 'SearchLocID UserID' 'AdID Pos' 'AdCatID Pos' 'SearchCatID Pos'", 118 | " 'SearchLocID Pos' 'UserID Pos' 'SearchRussian Pos'", 119 | " 'SearchAdT1 AdID' 'SearchAdT1 AdCatID' 'SearchAdT1 Pos'", 120 | " 'AdID UserAgentOSID' 'AdID UserAgentFamilyID' 'AdCatID AdPriceBin'", 121 | " 'AdPriceBin UserID' ", 122 | "-seed 7 -epochs", epochs, 123 | # "-load_model", 124 | " >> ", paste0("../data/log/", data.fold$logname, ".log"), " 2>&1" 125 | )) 126 | 127 | data.fold$test.pred <- fread(paste(data.fold$test.pred.file, epochs, sep=".")) 128 | fn.print.err(data.fold$test.pred) 129 | 130 | fn.clean.worker() 131 | 132 | data.fold$test.pred 133 | } 134 | fn.kill.wk() 135 | 136 | data.ftrl.06.pred.tmp <- data.ftrl.06.pred.tmp[order(ID)] 137 | Store(data.ftrl.06.pred.tmp) 138 | 139 | data.ftrl.06.pred <- data.ftrl.06.pred.tmp[ 140 | ,list( 141 | Pred=sum(Pred)/.N 142 | ), by="ID" 143 | ] 144 | 145 | fn.print.err(data.ftrl.06.pred) 146 | # Size Loss 147 | # 1 7888752 0.04163 - tr 148 | # 1 8512834 0.04359 - val 149 | # 1 16401586 0.04265 - all 150 | 151 | Store(data.ftrl.06.pred) 152 | # 153 | # # fn.write.submission(data.ftrl.06.pred, "data.ftrl.06.pred") 154 | 155 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.xgb.03.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # train model 3 | ############################################################# 4 | 5 | fn.register.wk(1, seed=5471887) # 5471887 6 | data.xgb.03.pred.tmp <- foreach( 7 | test.type=c("test"), .combine=rbind, .noexport=all.noexport) %dopar% { 8 | 9 | fn.init.new.fold.worker("xgb_03", paste0(test.type)) 10 | # fn.clean.worker() 11 | 12 | 13 | tr.type <- fn.lr.tr.type(test.type) 14 | # tr.type <- fn.tree.tr.type(test.type) 15 | 16 | data.fold$tr.idx <- which(data.all.tree$SearchType %in% tr.type) 17 | data.fold$test.idx <- which(!data.all.tree$SearchType %in% tr.type) 18 | data.fold$val.idx <- data.fold$test.idx[!is.na( 19 | data.all.tree$IsClick[data.fold$test.idx])] 20 | 21 | cols.extra <- c("ID", "SearchID", "SearchType", "IsClick") 22 | cols.in <- sort(setdiff(colnames(data.all.tree), 23 | c(cols.extra))) 24 | 25 | cat("\n\nTr size:", length(data.fold$tr.idx), 26 | ", Val size:", length(data.fold$val.idx), 27 | ", Test size:", length(data.fold$test.idx), 28 | "...\n") 29 | 30 | data.tr <- fn.xgb.matrix( 31 | data=data.all.tree, subset=data.fold$tr.idx, col.in=cols.in) 32 | 33 | if (length(data.fold$val.idx) > 0) { 34 | data.val <- fn.xgb.matrix( 35 | data=data.all.tree, subset=data.fold$val.idx, col.in=cols.in) 36 | data.watch = list(val=data.val) 37 | } else { 38 | data.watch = list(tr=data.tr) 39 | } 40 | 41 | 42 | data.test <- fn.xgb.matrix( 43 | data=data.all.tree, subset=data.fold$test.idx, col.in=cols.in) 44 | 45 | data.fold$test.pred <- data.table( 46 | ID = data.all.tree$ID[data.fold$test.idx], 47 | Pred = 0.0, 48 | n = 0 49 | ) 50 | 51 | avg.ix <- which(data.fold$test.pred$ID > 0) 52 | if (length(avg.ix) == 0) { 53 | avg.ix <- 
1:nrow(data.fold$test.pred) 54 | } 55 | # print(length(avg.ix)) 56 | 57 | fn.soar.unload(data.all.tree) 58 | 59 | data.fold$params = list( 60 | objective = "binary:logistic", 61 | eval_metric = "logloss", 62 | nthread = 12, 63 | eta = 0.2, 64 | max_depth = 10, 65 | gamma = 0.8, 66 | colsample_bytree = 0.7, 67 | colsample_bylevel = 0.8 68 | ) 69 | 70 | data.fold$nrounds <- 75 71 | 72 | cat("\nParams:\n") 73 | print(data.fold$params) 74 | 75 | n.models <- 20 76 | for (ix in 1:n.models) { 77 | 78 | cat("\n\nTraining ", ix, "...\n") 79 | 80 | set.seed(ix + 89475560) 81 | 82 | suppressMessages(library("xgboost")) 83 | model = xgb.train( 84 | data = data.tr, 85 | watchlist=data.watch, 86 | params = data.fold$params, 87 | nrounds = data.fold$nrounds, 88 | verbose = 1) 89 | 90 | 91 | ntreelimit <- data.fold$nrounds 92 | try.pred <- T 93 | 94 | while (try.pred) { 95 | pred.cur <- predict(model, data.test, ntreelimit=ntreelimit) 96 | pred.cur.avg <- mean(pred.cur[avg.ix]) 97 | 98 | cat("\nCurrent prediction avg of", length(avg.ix), 99 | "instances:", pred.cur.avg, "\n") 100 | if (test.type == "val" || 101 | (pred.cur.avg >= 0.008 && pred.cur.avg <= 0.012)) { 102 | 103 | try.pred <- F 104 | # data.fold$test.pred[ , Pred := (Pred*n + pred.cur)/(n+1)] 105 | # data.fold$test.pred[, n := n+1] 106 | 107 | data.fold$test.pred[ , Pred := ((Pred^n)*pred.cur)^(1/(n+1))] 108 | data.fold$test.pred[, n := n+1] 109 | 110 | fn.save.data.fold(data.fold) 111 | cat("\nPrediction with", ntreelimit ,"trees included\n") 112 | } else { 113 | cat("\nPrediction with", ntreelimit ,"trees discarded\n") 114 | ntreelimit <- ntreelimit - 5 115 | try.pred <- ntreelimit >= 60 116 | } 117 | } 118 | 119 | cat("\nPrediction status using", data.fold$test.pred$n[1], "iteration(s) :\n") 120 | fn.print.err(data.fold$test.pred) 121 | 122 | set.seed(Sys.time()) 123 | rm(pred.cur, pred.cur.avg) 124 | invisible(gc()) 125 | 126 | } 127 | 128 | # data.fold$importance <- xgb.importance( 129 | # feature_names=cols.in, model=model) 130 | # 131 | # cat("\n\nFeature importance:\n") 132 | # print(data.fold$importance) 133 | 134 | fn.clean.worker() 135 | 136 | data.fold$test.pred 137 | 138 | } 139 | fn.kill.wk() 140 | 141 | data.xgb.03.pred.tmp <- data.xgb.03.pred.tmp[ 142 | order(ID),list(Pred=mean(Pred)), by="ID"] 143 | Store(data.xgb.03.pred.tmp) 144 | 145 | data.xgb.03.pred <- copy(data.xgb.03.pred.tmp) 146 | 147 | ############################################################# 148 | # save data 149 | ############################################################# 150 | 151 | fn.print.err(data.xgb.03.pred) 152 | # Size Loss 153 | # 1 8512834 0.04256 154 | 155 | Store(data.xgb.03.pred) # 0.04086 156 | 157 | cat('Test avg:', mean(data.xgb.03.pred[ID > 0]$Pred), "\n") 158 | # Test avg: 0.008966203 159 | 160 | # fn.write.submission(data.xgb.03.pred, "data.xgb.03.pred") 161 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l1.xgb.05.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # train model 3 | ############################################################# 4 | 5 | wk.seed <- 5471887 6 | fn.register.wk(1, seed=wk.seed) 7 | data.l1.xgb.05.pred.tmp <- foreach( 8 | test.type=c("test"), .combine=rbind, .noexport=all.noexport) %dopar% { 9 | 10 | fn.init.new.fold.worker("l1_xgb_05", paste0(test.type, "2")) 11 | # fn.clean.worker() 12 | 13 | cat("\n\nSeed:", wk.seed, "...\n") 14 | 15 | 16 | 
data.tr <- xgb.DMatrix(fn.libsvm.file( 17 | paste0("data.", test.type, ".tr.full.libsvm"))) 18 | data.test <- xgb.DMatrix(fn.libsvm.file( 19 | paste0("data.", test.type, ".tt.full.libsvm"))) 20 | 21 | eval_metric = "logloss" 22 | data.watch = list(val=data.test) 23 | if (test.type == "val") { 24 | eval_metric <- function(preds, dtrain) { 25 | labels <- as.numeric(getinfo(dtrain, "label")) 26 | preds <- as.numeric(preds) 27 | err <- round(fn.log.loss(actual=labels, pred=preds), digits=5) 28 | return(list(metric = "logloss", value = err)) 29 | } 30 | } 31 | 32 | data.fold$test.pred <- data.table( 33 | ID = sort(data.all.search.small[SearchType == test.type, ID]), 34 | Pred = 0.0, 35 | n = 0 36 | ) 37 | 38 | fn.soar.unload(data.all.search.small) 39 | 40 | # Num rounds 66 41 | # Eta 0,5 42 | # Maxdepth 10 43 | # Colsample 0,375 44 | # Minchildweight 10 45 | 46 | data.fold$params = list( 47 | objective = "binary:logistic", 48 | eval_metric = eval_metric, 49 | nthread = 6, 50 | eta = 0.18, 51 | max_depth = 10, 52 | gamma = 0.8, 53 | colsample_bytree = 0.7, 54 | min_child_weight = 5, 55 | colsample_bylevel = 0.8 56 | ) 57 | 58 | data.fold$nrounds <- 75 59 | 60 | cat("\nParams:\n") 61 | print(data.fold$params) 62 | 63 | n.models <- 10 64 | for (ix in 1:n.models) { 65 | 66 | cat("\n\nTraining ", ix, "of", n.models,"...\n") 67 | 68 | set.seed(ix + 89475560) 69 | 70 | model = xgb.train( 71 | data = data.tr, 72 | watchlist=data.watch, 73 | params = data.fold$params, 74 | nrounds = data.fold$nrounds, 75 | verbose = 1) 76 | 77 | 78 | ntreelimit <- data.fold$nrounds 79 | try.pred <- T 80 | 81 | while (try.pred) { 82 | pred.cur <- xgboost::predict(model, data.test, ntreelimit=ntreelimit) 83 | pred.cur.avg <- mean(pred.cur) 84 | 85 | cat("\nCurrent prediction avg of", length(pred.cur), 86 | "instances:", pred.cur.avg, "\n") 87 | if (test.type == "val" || 88 | (pred.cur.avg >= 0.006 && pred.cur.avg <= 0.016)) { 89 | 90 | try.pred <- F 91 | data.fold$test.pred[ , Pred := (Pred*n + pred.cur)/(n+1)] 92 | data.fold$test.pred[, n := n+1] 93 | 94 | 95 | fn.save.data.fold(data.fold) 96 | cat("\nPrediction with", ntreelimit ,"trees included\n") 97 | } else { 98 | cat("\nPrediction with", ntreelimit ,"trees discarded\n") 99 | ntreelimit <- ntreelimit - 5 100 | try.pred <- ntreelimit >= 60 101 | } 102 | } 103 | 104 | cat("\nPrediction status using", data.fold$test.pred$n[1], "iteration(s) :\n") 105 | fn.print.err(data.fold$test.pred) 106 | 107 | set.seed(Sys.time()) 108 | rm(pred.cur, pred.cur.avg) 109 | invisible(gc()) 110 | 111 | } 112 | 113 | # cat("\n\nFeature importance:\n") 114 | # data.fold$importance <- xgb.importance( 115 | # feature_names=cols.in.combine, model=model) 116 | # print(data.fold$importance) 117 | 118 | fn.clean.worker() 119 | 120 | data.fold$test.pred 121 | 122 | } 123 | fn.kill.wk() 124 | 125 | data.l1.xgb.05.pred.tmp <- data.l1.xgb.05.pred.tmp[order(ID)] 126 | Store(data.l1.xgb.05.pred.tmp) 127 | 128 | for (ix in "2") { 129 | test.type <- "test" 130 | cat("\nLoading",test.type, ix,"...\n") 131 | 132 | fn.init.fold.worker("l1_xgb_05", paste0(test.type, ix), no.log=T) 133 | pred.nam <- paste("data.l1.xgb.05.pred", test.type, ix, sep=".") 134 | assign(pred.nam, data.fold$test.pred[order(ID)]) 135 | cat("Saving",pred.nam,"...\n") 136 | Store(list=pred.nam) 137 | } 138 | 139 | data.l1.xgb.05.pred.tmp <- rbind( 140 | data.l1.xgb.05.pred.test.2 141 | )[order(ID), list(Pred=sum(Pred*n)/sum(n)), by="ID"] 142 | 143 | 144 | data.l1.xgb.05.pred <- copy(data.l1.xgb.05.pred.tmp) 145 | 146 | 
############################################################# 147 | # save data 148 | ############################################################# 149 | 150 | # fn.print.err(data.l1.xgb.05.pred) 151 | 152 | 153 | Store(data.l1.xgb.05.pred) # 0.04076 154 | 155 | cat('Test avg:', mean(data.l1.xgb.05.pred[ID > 0]$Pred), "\n") 156 | # Test avg: 0.007848369 157 | 158 | # fn.write.submission(data.l1.xgb.05.pred, "data.l1.xgb.05.pred") 159 | 160 | 161 | -------------------------------------------------------------------------------- /avito-context-click-r/train.l2.xgb.02.R: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # train model 3 | ############################################################# 4 | 5 | wk.seed <- 5471887 + 19117 6 | fn.register.wk(1, seed=wk.seed) 7 | data.l2.xgb.02.pred.tmp <- foreach( 8 | test.type=c("test"), .combine=rbind, .noexport=all.noexport) %dopar% { 9 | 10 | fn.init.new.fold.worker("l2_xgb_02", paste0(test.type, "1b")) 11 | # fn.clean.worker() 12 | 13 | cat("\n\nSeed:", wk.seed, "...\n") 14 | 15 | data.tr <- xgb.DMatrix(fn.libsvm.file(paste0("data.", test.type, ".tr.libsvm"))) 16 | data.test <- xgb.DMatrix(fn.libsvm.file(paste0("data.", test.type, ".tt.libsvm"))) 17 | 18 | eval_metric = "logloss" 19 | data.watch = list(val=data.test) 20 | if (test.type == "val") { 21 | eval_metric <- function(preds, dtrain) { 22 | labels <- as.numeric(getinfo(dtrain, "label")) 23 | preds <- as.numeric(preds) 24 | err <- round(fn.log.loss(actual=labels, pred=preds), digits=5) 25 | return(list(metric = "logloss", value = err)) 26 | } 27 | } 28 | 29 | data.fold$test.pred <- data.table( 30 | ID = sort(data.all.search.small[SearchType == test.type, ID]), 31 | Pred = 0.0, 32 | n = 0 33 | ) 34 | 35 | fn.soar.unload(data.all.search.small) 36 | 37 | 38 | data.fold$params = list( 39 | objective = "binary:logistic", 40 | eval_metric = eval_metric, 41 | nthread = 6, 42 | eta = 0.18, 43 | max_depth = 10, 44 | gamma = 0.8, 45 | colsample_bytree = 0.7, 46 | colsample_bylevel = 0.8 47 | ) 48 | 49 | data.fold$nrounds <- 75 50 | 51 | cat("\nParams:\n") 52 | print(data.fold$params) 53 | 54 | n.models <- 20 55 | for (ix in 1:n.models) { 56 | 57 | cat("\n\nTraining ", ix, "of", n.models,"...\n") 58 | 59 | set.seed(ix + 89475560) 60 | 61 | model = xgb.train( 62 | data = data.tr, 63 | watchlist=data.watch, 64 | params = data.fold$params, 65 | nrounds = data.fold$nrounds, 66 | verbose = 1) 67 | 68 | 69 | ntreelimit <- data.fold$nrounds 70 | try.pred <- T 71 | 72 | while (try.pred) { 73 | pred.cur <- xgboost::predict(model, data.test, ntreelimit=ntreelimit) 74 | pred.cur.avg <- mean(pred.cur) 75 | 76 | cat("\nCurrent prediction avg of", length(pred.cur), 77 | "instances:", pred.cur.avg, "\n") 78 | if (test.type == "val" || 79 | (pred.cur.avg >= 0.006 && pred.cur.avg <= 0.012)) { 80 | 81 | try.pred <- F 82 | data.fold$test.pred[ , Pred := (Pred*n + pred.cur)/(n+1)] 83 | data.fold$test.pred[, n := n+1] 84 | 85 | 86 | fn.save.data.fold(data.fold) 87 | cat("\nPrediction with", ntreelimit ,"trees included\n") 88 | } else { 89 | cat("\nPrediction with", ntreelimit ,"trees discarded\n") 90 | ntreelimit <- ntreelimit - 5 91 | try.pred <- ntreelimit >= 60 92 | } 93 | } 94 | 95 | cat("\nPrediction status using", data.fold$test.pred$n[1], "iteration(s) :\n") 96 | fn.print.err(data.fold$test.pred) 97 | 98 | set.seed(Sys.time()) 99 | rm(pred.cur, pred.cur.avg) 100 | invisible(gc()) 101 | 102 | } 103 | 104 | # 
cat("\n\nFeature importance:\n") 105 | # data.fold$importance <- xgb.importance( 106 | # feature_names=cols.in.combine, model=model) 107 | # print(data.fold$importance) 108 | 109 | fn.clean.worker() 110 | 111 | data.fold$test.pred 112 | 113 | } 114 | fn.kill.wk() 115 | 116 | data.l2.xgb.02.pred.tmp <- data.l2.xgb.02.pred.tmp[order(ID)] 117 | Store(data.l2.xgb.02.pred.tmp) 118 | 119 | data.l2.xgb.02.pred <- copy(data.l2.xgb.02.pred.tmp) 120 | 121 | ############################################################# 122 | # save data 123 | ############################################################# 124 | 125 | fn.print.err(data.l2.xgb.02.pred) 126 | # Size Loss 127 | # 1 8512834 0.04155 128 | 129 | Store(data.l2.xgb.02.pred) # 0.04043 130 | 131 | cat('Test avg:', mean(data.l2.xgb.02.pred[ID > 0]$Pred), "\n") 132 | # Test avg: 0.008131078 133 | 134 | # fn.write.submission(data.l2.xgb.02.pred, "data.l2.xgb.02.pred") 135 | 136 | data.l2.xgb.02.pred.calib <- fn.calibrate.prob.wk(data.l2.xgb.02.pred) 137 | fn.print.err(data.l2.xgb.02.pred.calib) 138 | 139 | Store(data.l2.xgb.02.pred.calib) 140 | 141 | -------------------------------------------------------------------------------- /avito-context-click-r/train.xgb.dtry.R: -------------------------------------------------------------------------------- 1 | load(fn.rdata.file("data.reduced.all.RData")) 2 | flist <- setdiff(colnames(data.reduced.all), c("SearchType", "SearchDayYear")) 3 | 4 | write.table(as.data.frame(data.reduced.all[SearchType==1][,flist,with=F]), 5 | file = fn.out.file("train.xgb.csv"), 6 | quote = F, 7 | sep = ",", 8 | row.names = F, 9 | col.names = T) 10 | 11 | write.table(as.data.frame(data.reduced.all[SearchType==2][,flist,with=F]), 12 | file = fn.out.file("val.xgb.csv"), 13 | quote = F, 14 | sep = ",", 15 | row.names = F, 16 | col.names = T) 17 | 18 | write.table(as.data.frame(data.reduced.all[SearchType==3][,flist,with=F]), 19 | file = fn.out.file("test.xgb.csv"), 20 | quote = F, 21 | sep = ",", 22 | row.names = F, 23 | col.names = T) 24 | 25 | write.table(as.data.frame(data.reduced.all[SearchType==1 | SearchType==2][,flist,with=F]), 26 | file = fn.out.file("train.val.xgb.csv"), 27 | quote = F, 28 | sep = ",", 29 | row.names = F, 30 | col.names = T) 31 | 32 | n <- nrow(data.reduced.all[SearchType==1]) 33 | set.seed(23243) 34 | ix.train <- sample(c(1:n), 0.2*n) 35 | n <- nrow(data.reduced.all[SearchType==2]) 36 | set.seed(102903) 37 | ix.val <- sample(c(1:n), 0.2*n) 38 | 39 | write.table(as.data.frame(data.reduced.all[SearchType==1][ix.train][,flist,with=F]), 40 | file = fn.out.file("train.part.xgb.csv"), 41 | quote = F, 42 | sep = ",", 43 | row.names = F, 44 | col.names = T) 45 | 46 | write.table(as.data.frame(data.reduced.all[SearchType==2][ix.val][,flist,with=F]), 47 | file = fn.out.file("val.part.xgb.csv"), 48 | quote = F, 49 | sep = ",", 50 | row.names = F, 51 | col.names = T) 52 | val.part.actual <- data.reduced.all[SearchType==2][ix.val][,list(ID,IsClick)] 53 | setkey(val.part.actual, ID) 54 | rm(data.reduced.all) 55 | gc() 56 | 57 | 58 | ### lb part ### 59 | system(paste("cd ../avito-context-click-py && python train_xgb_dtry.py", 60 | "--train", fn.out.file("train.val.xgb.csv"), 61 | "--test", fn.out.file("test.xgb.csv"), 62 | "--pred", fn.py.file("test.pred.xgb.csv"), 63 | "--epoch", 15, 64 | ">> ../data/log/xgb.dtry.log 2>&1")) 65 | 66 | test.pred <- list() 67 | for (i in c(0:14)) { 68 | test.pred[[length(test.pred)+1]] <- fread(fn.py.file(paste0("test.pred.xgb.epoch",i,".csv"))) 69 | 
cat(mean(test.pred[[length(test.pred)]]$IsClick), "...\n") 70 | } 71 | test.pred <- rbindlist(test.pred, use.names=T, fill=F) 72 | test.pred <- test.pred[,list(IsClick = round(mean(IsClick), 6)), by="ID"] 73 | 74 | write.table(as.data.frame(test.pred), 75 | file = fn.submission.file("dtry.xgb9__0.0409xx"), 76 | quote = F, 77 | sep = ",", 78 | row.names = F, 79 | col.names = T) 80 | 81 | 82 | ### cv part ### 83 | #system(paste("cd ../avito-context-click-py && python train_xgb_dtry.py", 84 | # "--train", fn.out.file("train.part.xgb.csv"), 85 | # "--test", fn.out.file("val.part.xgb.csv"), 86 | # "--pred", fn.py.file("val.part.pred.xgb.csv"), 87 | # "--epoch", 15, 88 | # ">> ../data/log/xgb.dtry.log 2>&1")) 89 | 90 | #val.part.pred <- list() 91 | #for (i in c(0:4)) { 92 | # val.part.pred[[length(val.part.pred)+1]] <- fread(fn.py.file(paste0("val.part.pred.xgb.epoch",i,".csv"))) 93 | # cat(mean(val.part.pred[[length(val.part.pred)]]$IsClick), "...\n") 94 | #} 95 | #val.part.pred <- rbindlist(val.part.pred, use.names=T, fill=F) 96 | #val.part.pred <- val.part.pred[,list(IsClick = round(mean(IsClick), 6)), by="ID"] 97 | #setkey(val.part.pred, ID) 98 | #fn.logloss(val.part.actual$IsClick, val.part.pred$IsClick) 99 | -------------------------------------------------------------------------------- /avito-context-click-r/train.zens.R: -------------------------------------------------------------------------------- 1 | ############################################################## 2 | ## csvs 3 | ############################################################## 4 | 5 | 6 | data.sub.dtry.0409xx <- fread(fn.submission.file( 7 | "dtry.xgb9__0.0409xx"))[order(ID)] 8 | 9 | data.l1.xgb.03.pred <- data.l1.xgb.03.pred[ID > 0] 10 | 11 | data.sub.ens.0404x <- merge(data.l1.xgb.03.pred, 12 | data.sub.dtry.0409xx, 13 | suffixes=c(".l",".d"), 14 | by="ID") 15 | setkey(data.sub.ens.0404x, ID) 16 | data.sub.ens.0404x[, IsClick := IsClick.l^0.6 * IsClick.d^0.4 ] 17 | 18 | data.l2.xgb.02.pred.val <- data.l2.xgb.02.pred[ID < 0] 19 | data.l2.xgb.02.pred <- data.l2.xgb.02.pred.calib[ID > 0] 20 | setkey(data.l2.xgb.02.pred, ID) # 0.04043 21 | 22 | 23 | data.l1.xgb.05.pred <- data.l1.xgb.05.pred[ID > 0] 24 | setkey(data.l1.xgb.05.pred, ID) # 0.04076 25 | 26 | 27 | 28 | data.sub.ens <- data.sub.ens.0404x[, list(ID)] 29 | 30 | data.sub.ens[, Pred.l2 := data.l2.xgb.02.pred$Pred] 31 | data.sub.ens[, Pred.Ens1 := 32 | data.sub.ens.0404x$IsClick^0.4 33 | * Pred.l2^0.6] 34 | data.sub.ens[, Pred.Ens2 := 35 | Pred.Ens1^0.9 36 | * data.l1.xgb.05.pred$Pred^0.1] 37 | 38 | data.sub.ens[, Pred := Pred.Ens2] 39 | data.sub.ens[, Pred := Pred.Ens2*1.1] 40 | 41 | cat('Test avg:', mean(data.sub.ens$Pred), "\n") 42 | # Test avg: 0.007941962 43 | 44 | fn.write.submission(data.sub.ens, "data.sub.ens", mean.adj=T) 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /data/input/empty.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/input/empty.csv -------------------------------------------------------------------------------- /data/log/data_build/empty.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/log/data_build/empty.log -------------------------------------------------------------------------------- 
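Note on train.zens.R above: the final submission blends model predictions with weighted geometric means, p1^w * p2^(1-w), rather than arithmetic averages. A small self-contained R sketch of that blend under toy data; the log.loss helper below is an assumed stand-in for the repo's fn.log.loss/fn.logloss (defined in _fn.base.R/_utils.R, not shown here), and the weights are illustrative:

log.loss <- function(actual, pred, eps=1e-15) {
  # clip predictions away from exact 0/1 so log() stays finite
  pred <- pmin(pmax(pred, eps), 1 - eps)
  -mean(actual * log(pred) + (1 - actual) * log(1 - pred))
}
geo.blend <- function(p1, p2, w) p1^w * p2^(1 - w)  # weighted geometric mean
set.seed(1)
actual <- rbinom(1000, 1, 0.01)  # clicks are rare, roughly 1% positives
p1 <- actual * 0.02 + runif(1000, 0, 0.02)  # two noisy toy "models"
p2 <- actual * 0.02 + runif(1000, 0, 0.02)
for (w in c(0, 0.4, 0.6, 1)) {
  cat("w =", w, " logloss =", log.loss(actual, geo.blend(p1, p2, w)), "\n")
}

By the AM-GM inequality a geometric mean of probabilities is never above the arithmetic mean, so this blend systematically lowers the predicted CTR; that is consistent with the final Pred := Pred.Ens2*1.1 rescaling and the mean.adj=T flag passed to fn.write.submission in train.zens.R.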
/data/output-libffm/empty.libffm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/output-libffm/empty.libffm -------------------------------------------------------------------------------- /data/output-libsvm/empty.libsvm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/output-libsvm/empty.libsvm -------------------------------------------------------------------------------- /data/output-py/empty.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/output-py/empty.csv -------------------------------------------------------------------------------- /data/output-r/empty.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/output-r/empty.csv -------------------------------------------------------------------------------- /data/rdata/empty.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/rdata/empty.csv -------------------------------------------------------------------------------- /data/submission/empty.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/data/submission/empty.csv -------------------------------------------------------------------------------- /data/template/zens_nn.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | # The "&train" syntax lets us refer back to this object as "*train" elsewhere in the yaml file 3 | dataset: &train !obj:train_pylearn.CSVDataset { 4 | path: '${TRAIN_DATA_FILE}', 5 | }, 6 | # Here we specify the model to train as being an MLP 7 | model: !obj:pylearn2.models.mlp.MLP { 8 | batch_size: 100000, 9 | layers : [ 10 | # We use two hidden layers with rectified linear activations 11 | !obj:pylearn2.models.mlp.RectifiedLinear { 12 | layer_name: 'h0', 13 | dim: 100, 14 | irange: .05, 15 | # Rather than using weight decay, we constrain the norms of the weight vectors 16 | # max_col_norm: 2., 17 | }, 18 | !obj:pylearn2.models.mlp.Softmax { 19 | layer_name: 'y', 20 | init_bias_target_marginals: *train, 21 | # Initialize the weights to all 0s 22 | irange: .0, 23 | n_classes: 2, 24 | } 25 | ], 26 | nvis: ${N_FEATURES}, 27 | }, 28 | # We train using SGD and momentum 29 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 30 | learning_rate: .05, 31 | train_iteration_mode: 'even_shuffled_sequential', 32 | monitor_iteration_mode : 'even_shuffled_sequential', 33 | learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum { 34 | init_momentum: .05, 35 | nesterov_momentum: True, 36 | }, 37 | # We monitor how well we're doing during training on a validation set 38 | monitoring_dataset: 39 | { 40 | 'train' : *train, 41 | 'valid' : !obj:train_pylearn.CSVDataset { 42 | path: '${VAL_DATA_FILE}', 43 | } 44 | }, 45 | # We 
stop when validation set classification error hasn't decreased for 10 epochs 46 | termination_criterion: !obj:pylearn2.termination_criteria.MonitorBased { 47 | channel_name: 'valid_objective', 48 | prop_decrease: 0., 49 | N: 10 50 | }, 51 | cost: !obj:pylearn2.costs.mlp.dropout.Dropout { 52 | default_input_include_prob: .5, 53 | default_input_scale: 2., 54 | }, 55 | }, 56 | # We save the model whenever we improve on the validation set classification error 57 | extensions: [ 58 | !obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 59 | channel_name: 'valid_y_misclass', 60 | save_path: '${MODEL_FILE}' 61 | }, 62 | ], 63 | save_freq: 0, 64 | } 65 | -------------------------------------------------------------------------------- /fm/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diefimov/avito_context_click_2015/717bfbb6508aed73a8293336ff9ce677a7c23f0a/fm/.DS_Store -------------------------------------------------------------------------------- /fm/Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++-4.9.2 2 | CXXFLAGS = -Wall -Wno-format -Wconversion -O3 -fPIC -std=c++0x -fopenmp 3 | MAIN = fm 4 | FILES = common.cpp timer.cpp 5 | SRCS = $(FILES:%.cpp=src/%.cpp) 6 | HEADERS = $(FILES:%.cpp=src/%.h) 7 | 8 | all: $(MAIN) 9 | 10 | fm: src/train.cpp $(SRCS) $(HEADERS) 11 | $(CXX) $(CXXFLAGS) -o $@ $< $(SRCS) 12 | 13 | clean: 14 | rm -f $(MAIN) 15 | -------------------------------------------------------------------------------- /fm/README: -------------------------------------------------------------------------------- 1 | Data Format 2 | =========== 3 | The input of this factorization machine solver consists of a label vector (y) 4 | and a binary sparse matrix (X). The input format is: 5 | 6 |