├── 001_christine.py
├── 001_jasmine.py
├── 001_madeline.py
├── 001_philippine.py
├── 001_sylvine.py
├── 002_albert.py
├── 002_dilbert.py
├── 002_fabert.py
├── 002_robert.py
├── 002_volkert.py
├── 003_alexis.py
├── 003_dionis.py
├── 003_grigoris.py
├── 003_jannis.py
├── 003_wallis.py
├── 004_evita.py
├── 004_flora.py
├── 004_helena.py
├── 004_tania.py
├── 004_yolanda.py
├── DeepFeedNet.py
├── FeedForwardNet.py
├── LICENSE
├── README.md
└── RegDeepNet.py


/001_christine.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | 
 4 | import numpy as np
 5 | import sklearn.cross_validation
 6 | 
 7 | import autosklearn
 8 | import autosklearn.data
 9 | import autosklearn.data.data_manager
10 | import autosklearn.models.evaluator
11 | from ParamSklearn.classification import ParamSklearnClassifier
12 | 
13 | 
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('input')
16 | parser.add_argument('output')
17 | args = parser.parse_args()
18 | 
19 | input = args.input
20 | dataset = 'christine'
21 | output = args.output
22 | 
23 | D = autosklearn.data.data_manager.DataManager(dataset, input)
24 | X = D.data['X_train']
25 | y = D.data['Y_train']
26 | X_valid = D.data['X_valid']
27 | X_test = D.data['X_test']
28 | 
29 | weights = np.array([1.0])  # single ensemble member, so one weight
30 | 
31 | # Choosing the single best model without feature selection by RFE (but by
32 | # select percentile classification which is in the auto-sklearn pipeline) seems
33 | # to work best here. Values are strings; the prediction loop below casts them.
34 | configurations = [
35 |     {'balancing:strategy': 'none',
36 |      'classifier': 'libsvm_svc',
37 |      'imputation:strategy': 'median',
38 |      'libsvm_svc:C': '5.06888516101',
39 |      'libsvm_svc:class_weight': 'None',
40 |      'libsvm_svc:gamma': '0.0870955322069',
41 |      'libsvm_svc:kernel': 'rbf',
42 |      'libsvm_svc:max_iter': '-1.0',
43 |      'libsvm_svc:shrinking': 'False',
44 |      'libsvm_svc:tol': '2.62849564978e-05',
45 |      'preprocessor': 'select_percentile_classification',
46 |      'rescaling:strategy': 'min/max',
47 |      'select_percentile_classification:percentile': '36.4058569521',
48 |      'select_percentile_classification:score_func': 'f_classif'}
49 | ]
50 | 
51 | classifiers = []
52 | predictions_valid = []
53 | predictions_test = []
54 | 
55 | # Make predictions and weight them
56 | for weight, configuration in zip(weights, configurations):
57 |     for param in configuration:
58 |         try:
59 |             configuration[param] = int(configuration[param])
60 |         except Exception:
61 |             try:
62 |                 configuration[param] = float(configuration[param])
63 |             except Exception:
64 |                 pass
65 | 
66 |     classifier = ParamSklearnClassifier(configuration, 1)
67 |     classifiers.append(classifier)
68 |     try:
69 |         classifier.fit(X.copy(), y.copy())
70 |         predictions_valid.append(
71 |             classifier.predict_proba(X_valid.copy()) * weight)
72 |         predictions_test.append(
73 |             classifier.predict_proba(X_test.copy()) * weight)
74 |     except Exception as e:
75 |         print e
76 |         print configuration
77 | 
78 | # Output the predictions
79 | for name, predictions in [('valid', predictions_valid),
80 |                           ('test', predictions_test)]:
81 |     predictions = np.array(predictions)
82 |     predictions = np.sum(predictions, axis=0)
83 |     predictions = predictions[:, 1].reshape((-1, 1))
84 | 
85 |     filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
86 |     np.savetxt(filepath, predictions, delimiter=' ')


--------------------------------------------------------------------------------
/001_jasmine.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | import numpy as np
  5 | 
  6 | import autosklearn
  7 | import autosklearn.data
  8 | import autosklearn.data.data_manager
  9 | import autosklearn.models.evaluator
 10 | from ParamSklearn.classification import ParamSklearnClassifier
 11 | 
 12 | parser = argparse.ArgumentParser()
 13 | parser.add_argument('input')
 14 | parser.add_argument('output')
 15 | args = parser.parse_args()
 16 | 
 17 | input = args.input
 18 | dataset = 'jasmine'
 19 | output = args.output
 20 | 
 21 | D = autosklearn.data.data_manager.DataManager(dataset, input)
 22 | X = D.data['X_train']
 23 | y = D.data['Y_train']
 24 | X_valid = D.data['X_valid']
 25 | X_test = D.data['X_test']
 26 | 
 27 | # Subset of features found with RFE. Feature with least importance in sklearn
 28 | #  RF removed. Afterwards, trained RF on remaining features with 5CV. In the
 29 | # end, choose feature set with lowest error
 30 | features = [6, 8, 10, 12, 16, 18, 20, 21, 22, 25, 26, 33, 37, 38, 39, 40, 42,
 31 |             44, 46, 47, 52, 55, 56, 58, 62, 77, 78, 79, 82, 85, 91, 92, 94, 96,
 32 |             101, 104, 106, 108, 110, 119, 122, 125, 130, 131, 133, 137, 139,
 33 |             140, 141]
 34 | 
 35 | X = X[:, features]
 36 | X_valid = X_valid[:, features]
 37 | X_test = X_test[:, features]
 38 | 
 39 | # Ensemble Selection weights, one per entry of `configurations` (same order)
 40 | weights = np.array([0.140000, 0.120000, 0.080000, 0.060000, 0.040000, 0.040000,
 41 |                     0.040000, 0.040000, 0.040000, 0.040000, 0.020000, 0.020000,
 42 |                     0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
 43 |                     0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
 44 |                     0.020000, 0.020000, 0.020000, 0.020000])
 45 | 
 46 | # Ensemble members found by SMAC (string values, cast in the loop below)
 47 | configurations = [
 48 |     {'balancing:strategy': 'weighting',
 49 |      'classifier': 'random_forest',
 50 |      'imputation:strategy': 'median',
 51 |      'preprocessor': 'select_percentile_classification',
 52 |      'random_forest:bootstrap': 'True',
 53 |      'random_forest:criterion': 'gini',
 54 |      'random_forest:max_depth': 'None',
 55 |      'random_forest:max_features': '1.58545644982',
 56 |      'random_forest:max_leaf_nodes': 'None',
 57 |      'random_forest:min_samples_leaf': '3.0',
 58 |      'random_forest:min_samples_split': '2.0',
 59 |      'random_forest:n_estimators': '100.0',
 60 |      'rescaling:strategy': 'min/max',
 61 |      'select_percentile_classification:percentile': '39.9235093683',
 62 |      'select_percentile_classification:score_func': 'f_classif'},
 63 |     {'balancing:strategy': 'weighting',
 64 |      'classifier': 'random_forest',
 65 |      'imputation:strategy': 'most_frequent',
 66 |      'preprocessor': 'select_rates',
 67 |      'random_forest:bootstrap': 'False',
 68 |      'random_forest:criterion': 'entropy',
 69 |      'random_forest:max_depth': 'None',
 70 |      'random_forest:max_features': '0.6715305958',
 71 |      'random_forest:max_leaf_nodes': 'None',
 72 |      'random_forest:min_samples_leaf': '4.0',
 73 |      'random_forest:min_samples_split': '3.0',
 74 |      'random_forest:n_estimators': '100.0',
 75 |      'rescaling:strategy': 'standard',
 76 |      'select_rates:alpha': '0.486873466534',
 77 |      'select_rates:mode': 'fwe',
 78 |      'select_rates:score_func': 'f_classif'},
 79 |     {'balancing:strategy': 'weighting',
 80 |      'classifier': 'random_forest',
 81 |      'imputation:strategy': 'mean',
 82 |      'preprocessor': 'select_percentile_classification',
 83 |      'random_forest:bootstrap': 'False',
 84 |      'random_forest:criterion': 'gini',
 85 |      'random_forest:max_depth': 'None',
 86 |      'random_forest:max_features': '1.82773631717',
 87 |      'random_forest:max_leaf_nodes': 'None',
 88 |      'random_forest:min_samples_leaf': '2.0',
 89 |      'random_forest:min_samples_split': '3.0',
 90 |      'random_forest:n_estimators': '100.0',
 91 |      'rescaling:strategy': 'min/max',
 92 |      'select_percentile_classification:percentile': '50.0',
 93 |      'select_percentile_classification:score_func': 'chi2'},
 94 |     {'balancing:strategy': 'none',
 95 |      'classifier': 'random_forest',
 96 |      'fast_ica:algorithm': 'deflation',
 97 |      'fast_ica:fun': 'logcosh',
 98 |      'fast_ica:n_components': '832.0',
 99 |      'fast_ica:whiten': 'False',
100 |      'imputation:strategy': 'median',
101 |      'preprocessor': 'fast_ica',
102 |      'random_forest:bootstrap': 'False',
103 |      'random_forest:criterion': 'gini',
104 |      'random_forest:max_depth': 'None',
105 |      'random_forest:max_features': '2.93148979051',
106 |      'random_forest:max_leaf_nodes': 'None',
107 |      'random_forest:min_samples_leaf': '5.0',
108 |      'random_forest:min_samples_split': '7.0',
109 |      'random_forest:n_estimators': '100.0',
110 |      'rescaling:strategy': 'min/max'},
111 |     {'balancing:strategy': 'weighting',
112 |      'classifier': 'random_forest',
113 |      'imputation:strategy': 'mean',
114 |      'preprocessor': 'select_percentile_classification',
115 |      'random_forest:bootstrap': 'False',
116 |      'random_forest:criterion': 'entropy',
117 |      'random_forest:max_depth': 'None',
118 |      'random_forest:max_features': '1.79654377812',
119 |      'random_forest:max_leaf_nodes': 'None',
120 |      'random_forest:min_samples_leaf': '1.0',
121 |      'random_forest:min_samples_split': '6.0',
122 |      'random_forest:n_estimators': '100.0',
123 |      'rescaling:strategy': 'min/max',
124 |      'select_percentile_classification:percentile': '50.0',
125 |      'select_percentile_classification:score_func': 'chi2'},
126 |     {'balancing:strategy': 'weighting',
127 |      'classifier': 'extra_trees',
128 |      'extra_trees:bootstrap': 'False',
129 |      'extra_trees:criterion': 'entropy',
130 |      'extra_trees:max_depth': 'None',
131 |      'extra_trees:max_features': '1.81061189332',
132 |      'extra_trees:min_samples_leaf': '1.0',
133 |      'extra_trees:min_samples_split': '3.0',
134 |      'extra_trees:n_estimators': '100.0',
135 |      'imputation:strategy': 'mean',
136 |      'preprocessor': 'select_rates',
137 |      'rescaling:strategy': 'none',
138 |      'select_rates:alpha': '0.201722721361',
139 |      'select_rates:mode': 'fwe',
140 |      'select_rates:score_func': 'f_classif'},
141 |     {'balancing:strategy': 'weighting',
142 |      'classifier': 'extra_trees',
143 |      'extra_trees:bootstrap': 'False',
144 |      'extra_trees:criterion': 'gini',
145 |      'extra_trees:max_depth': 'None',
146 |      'extra_trees:max_features': '1.76442905847',
147 |      'extra_trees:min_samples_leaf': '4.0',
148 |      'extra_trees:min_samples_split': '6.0',
149 |      'extra_trees:n_estimators': '100.0',
150 |      'imputation:strategy': 'mean',
151 |      'preprocessor': 'select_rates',
152 |      'rescaling:strategy': 'min/max',
153 |      'select_rates:alpha': '0.113572172949',
154 |      'select_rates:mode': 'fwe',
155 |      'select_rates:score_func': 'f_classif'},
156 |     {'balancing:strategy': 'weighting',
157 |      'classifier': 'random_forest',
158 |      'imputation:strategy': 'median',
159 |      'preprocessor': 'select_rates',
160 |      'random_forest:bootstrap': 'False',
161 |      'random_forest:criterion': 'entropy',
162 |      'random_forest:max_depth': 'None',
163 |      'random_forest:max_features': '2.87832643035',
164 |      'random_forest:max_leaf_nodes': 'None',
165 |      'random_forest:min_samples_leaf': '1.0',
166 |      'random_forest:min_samples_split': '19.0',
167 |      'random_forest:n_estimators': '100.0',
168 |      'rescaling:strategy': 'min/max',
169 |      'select_rates:alpha': '0.110716868617',
170 |      'select_rates:mode': 'fwe',
171 |      'select_rates:score_func': 'f_classif'},
172 |     {'balancing:strategy': 'weighting',
173 |      'classifier': 'extra_trees',
174 |      'extra_trees:bootstrap': 'True',
175 |      'extra_trees:criterion': 'entropy',
176 |      'extra_trees:max_depth': 'None',
177 |      'extra_trees:max_features': '3.23138088334',
178 |      'extra_trees:min_samples_leaf': '3.0',
179 |      'extra_trees:min_samples_split': '6.0',
180 |      'extra_trees:n_estimators': '100.0',
181 |      'imputation:strategy': 'mean',
182 |      'preprocessor': 'select_percentile_classification',
183 |      'rescaling:strategy': 'min/max',
184 |      'select_percentile_classification:percentile': '45.1994111355',
185 |      'select_percentile_classification:score_func': 'chi2'},
186 |     {'balancing:strategy': 'none',
187 |      'classifier': 'random_forest',
188 |      'fast_ica:algorithm': 'deflation',
189 |      'fast_ica:fun': 'logcosh',
190 |      'fast_ica:n_components': '509.0',
191 |      'fast_ica:whiten': 'True',
192 |      'imputation:strategy': 'mean',
193 |      'preprocessor': 'fast_ica',
194 |      'random_forest:bootstrap': 'False',
195 |      'random_forest:criterion': 'entropy',
196 |      'random_forest:max_depth': 'None',
197 |      'random_forest:max_features': '2.2727882732',
198 |      'random_forest:max_leaf_nodes': 'None',
199 |      'random_forest:min_samples_leaf': '2.0',
200 |      'random_forest:min_samples_split': '12.0',
201 |      'random_forest:n_estimators': '100.0',
202 |      'rescaling:strategy': 'min/max'},
203 |     {'balancing:strategy': 'weighting',
204 |      'classifier': 'random_forest',
205 |      'imputation:strategy': 'median',
206 |      'preprocessor': 'select_percentile_classification',
207 |      'random_forest:bootstrap': 'False',
208 |      'random_forest:criterion': 'entropy',
209 |      'random_forest:max_depth': 'None',
210 |      'random_forest:max_features': '2.32162402484',
211 |      'random_forest:max_leaf_nodes': 'None',
212 |      'random_forest:min_samples_leaf': '1.0',
213 |      'random_forest:min_samples_split': '12.0',
214 |      'random_forest:n_estimators': '100.0',
215 |      'rescaling:strategy': 'min/max',
216 |      'select_percentile_classification:percentile': '41.8671636453',
217 |      'select_percentile_classification:score_func': 'f_classif'},
218 |     {'balancing:strategy': 'weighting',
219 |      'classifier': 'random_forest',
220 |      'fast_ica:algorithm': 'deflation',
221 |      'fast_ica:fun': 'logcosh',
222 |      'fast_ica:n_components': '690.0',
223 |      'fast_ica:whiten': 'True',
224 |      'imputation:strategy': 'mean',
225 |      'preprocessor': 'fast_ica',
226 |      'random_forest:bootstrap': 'False',
227 |      'random_forest:criterion': 'entropy',
228 |      'random_forest:max_depth': 'None',
229 |      'random_forest:max_features': '2.3355464987',
230 |      'random_forest:max_leaf_nodes': 'None',
231 |      'random_forest:min_samples_leaf': '2.0',
232 |      'random_forest:min_samples_split': '11.0',
233 |      'random_forest:n_estimators': '100.0',
234 |      'rescaling:strategy': 'min/max'},
235 |     {'balancing:strategy': 'weighting',
236 |      'classifier': 'random_forest',
237 |      'imputation:strategy': 'median',
238 |      'preprocessor': 'select_rates',
239 |      'random_forest:bootstrap': 'True',
240 |      'random_forest:criterion': 'entropy',
241 |      'random_forest:max_depth': 'None',
242 |      'random_forest:max_features': '4.2700093411',
243 |      'random_forest:max_leaf_nodes': 'None',
244 |      'random_forest:min_samples_leaf': '4.0',
245 |      'random_forest:min_samples_split': '11.0',
246 |      'random_forest:n_estimators': '100.0',
247 |      'rescaling:strategy': 'min/max',
248 |      'select_rates:alpha': '0.294021193269',
249 |      'select_rates:mode': 'fwe',
250 |      'select_rates:score_func': 'f_classif'},
251 |     {'balancing:strategy': 'weighting',
252 |      'classifier': 'random_forest',
253 |      'fast_ica:algorithm': 'deflation',
254 |      'fast_ica:fun': 'logcosh',
255 |      'fast_ica:n_components': '613.0',
256 |      'fast_ica:whiten': 'True',
257 |      'imputation:strategy': 'median',
258 |      'preprocessor': 'fast_ica',
259 |      'random_forest:bootstrap': 'False',
260 |      'random_forest:criterion': 'entropy',
261 |      'random_forest:max_depth': 'None',
262 |      'random_forest:max_features': '1.8000767552',
263 |      'random_forest:max_leaf_nodes': 'None',
264 |      'random_forest:min_samples_leaf': '2.0',
265 |      'random_forest:min_samples_split': '7.0',
266 |      'random_forest:n_estimators': '100.0',
267 |      'rescaling:strategy': 'min/max'},
268 |     {'balancing:strategy': 'none',
269 |      'classifier': 'random_forest',
270 |      'fast_ica:algorithm': 'deflation',
271 |      'fast_ica:fun': 'logcosh',
272 |      'fast_ica:n_components': '661.0',
273 |      'fast_ica:whiten': 'False',
274 |      'imputation:strategy': 'mean',
275 |      'preprocessor': 'fast_ica',
276 |      'random_forest:bootstrap': 'False',
277 |      'random_forest:criterion': 'entropy',
278 |      'random_forest:max_depth': 'None',
279 |      'random_forest:max_features': '2.23424202393',
280 |      'random_forest:max_leaf_nodes': 'None',
281 |      'random_forest:min_samples_leaf': '3.0',
282 |      'random_forest:min_samples_split': '10.0',
283 |      'random_forest:n_estimators': '100.0',
284 |      'rescaling:strategy': 'min/max'},
285 |     {'balancing:strategy': 'none',
286 |      'classifier': 'random_forest',
287 |      'fast_ica:algorithm': 'deflation',
288 |      'fast_ica:fun': 'logcosh',
289 |      'fast_ica:n_components': '606.0',
290 |      'fast_ica:whiten': 'True',
291 |      'imputation:strategy': 'median',
292 |      'preprocessor': 'fast_ica',
293 |      'random_forest:bootstrap': 'False',
294 |      'random_forest:criterion': 'entropy',
295 |      'random_forest:max_depth': 'None',
296 |      'random_forest:max_features': '1.82743208676',
297 |      'random_forest:max_leaf_nodes': 'None',
298 |      'random_forest:min_samples_leaf': '3.0',
299 |      'random_forest:min_samples_split': '11.0',
300 |      'random_forest:n_estimators': '100.0',
301 |      'rescaling:strategy': 'min/max'},
302 |     {'balancing:strategy': 'weighting',
303 |      'classifier': 'extra_trees',
304 |      'extra_trees:bootstrap': 'True',
305 |      'extra_trees:criterion': 'gini',
306 |      'extra_trees:max_depth': 'None',
307 |      'extra_trees:max_features': '4.32850858484',
308 |      'extra_trees:min_samples_leaf': '3.0',
309 |      'extra_trees:min_samples_split': '5.0',
310 |      'extra_trees:n_estimators': '100.0',
311 |      'imputation:strategy': 'mean',
312 |      'preprocessor': 'select_rates',
313 |      'rescaling:strategy': 'min/max',
314 |      'select_rates:alpha': '0.118453703147',
315 |      'select_rates:mode': 'fpr',
316 |      'select_rates:score_func': 'f_classif'},
317 |     {'balancing:strategy': 'weighting',
318 |      'classifier': 'random_forest',
319 |      'fast_ica:algorithm': 'deflation',
320 |      'fast_ica:fun': 'logcosh',
321 |      'fast_ica:n_components': '1098.0',
322 |      'fast_ica:whiten': 'True',
323 |      'imputation:strategy': 'most_frequent',
324 |      'preprocessor': 'fast_ica',
325 |      'random_forest:bootstrap': 'False',
326 |      'random_forest:criterion': 'entropy',
327 |      'random_forest:max_depth': 'None',
328 |      'random_forest:max_features': '4.83031750621',
329 |      'random_forest:max_leaf_nodes': 'None',
330 |      'random_forest:min_samples_leaf': '1.0',
331 |      'random_forest:min_samples_split': '15.0',
332 |      'random_forest:n_estimators': '100.0',
333 |      'rescaling:strategy': 'min/max'},
334 |     {'balancing:strategy': 'weighting',
335 |      'classifier': 'random_forest',
336 |      'imputation:strategy': 'median',
337 |      'preprocessor': 'select_rates',
338 |      'random_forest:bootstrap': 'False',
339 |      'random_forest:criterion': 'gini',
340 |      'random_forest:max_depth': 'None',
341 |      'random_forest:max_features': '3.52038352463',
342 |      'random_forest:max_leaf_nodes': 'None',
343 |      'random_forest:min_samples_leaf': '4.0',
344 |      'random_forest:min_samples_split': '4.0',
345 |      'random_forest:n_estimators': '100.0',
346 |      'rescaling:strategy': 'standard',
347 |      'select_rates:alpha': '0.441859738474',
348 |      'select_rates:mode': 'fpr',
349 |      'select_rates:score_func': 'f_classif'},
350 |     {'balancing:strategy': 'none',
351 |      'classifier': 'random_forest',
352 |      'fast_ica:algorithm': 'deflation',
353 |      'fast_ica:fun': 'logcosh',
354 |      'fast_ica:n_components': '743.0',
355 |      'fast_ica:whiten': 'False',
356 |      'imputation:strategy': 'median',
357 |      'preprocessor': 'fast_ica',
358 |      'random_forest:bootstrap': 'False',
359 |      'random_forest:criterion': 'entropy',
360 |      'random_forest:max_depth': 'None',
361 |      'random_forest:max_features': '2.37406180812',
362 |      'random_forest:max_leaf_nodes': 'None',
363 |      'random_forest:min_samples_leaf': '2.0',
364 |      'random_forest:min_samples_split': '17.0',
365 |      'random_forest:n_estimators': '100.0',
366 |      'rescaling:strategy': 'min/max'},
367 |     {'balancing:strategy': 'none',
368 |      'classifier': 'random_forest',
369 |      'fast_ica:algorithm': 'deflation',
370 |      'fast_ica:fun': 'logcosh',
371 |      'fast_ica:n_components': '531.0',
372 |      'fast_ica:whiten': 'True',
373 |      'imputation:strategy': 'mean',
374 |      'preprocessor': 'fast_ica',
375 |      'random_forest:bootstrap': 'False',
376 |      'random_forest:criterion': 'entropy',
377 |      'random_forest:max_depth': 'None',
378 |      'random_forest:max_features': '2.38993786345',
379 |      'random_forest:max_leaf_nodes': 'None',
380 |      'random_forest:min_samples_leaf': '4.0',
381 |      'random_forest:min_samples_split': '16.0',
382 |      'random_forest:n_estimators': '100.0',
383 |      'rescaling:strategy': 'min/max'},
384 |     {'balancing:strategy': 'weighting',
385 |      'classifier': 'extra_trees',
386 |      'extra_trees:bootstrap': 'False',
387 |      'extra_trees:criterion': 'entropy',
388 |      'extra_trees:max_depth': 'None',
389 |      'extra_trees:max_features': '1.60284209578',
390 |      'extra_trees:min_samples_leaf': '4.0',
391 |      'extra_trees:min_samples_split': '10.0',
392 |      'extra_trees:n_estimators': '100.0',
393 |      'imputation:strategy': 'most_frequent',
394 |      'preprocessor': 'select_rates',
395 |      'rescaling:strategy': 'min/max',
396 |      'select_rates:alpha': '0.486662334462',
397 |      'select_rates:mode': 'fwe',
398 |      'select_rates:score_func': 'chi2'},
399 |     {'balancing:strategy': 'weighting',
400 |      'classifier': 'random_forest',
401 |      'fast_ica:algorithm': 'deflation',
402 |      'fast_ica:fun': 'logcosh',
403 |      'fast_ica:n_components': '1082.0',
404 |      'fast_ica:whiten': 'False',
405 |      'imputation:strategy': 'median',
406 |      'preprocessor': 'fast_ica',
407 |      'random_forest:bootstrap': 'False',
408 |      'random_forest:criterion': 'entropy',
409 |      'random_forest:max_depth': 'None',
410 |      'random_forest:max_features': '1.47545539014',
411 |      'random_forest:max_leaf_nodes': 'None',
412 |      'random_forest:min_samples_leaf': '2.0',
413 |      'random_forest:min_samples_split': '15.0',
414 |      'random_forest:n_estimators': '100.0',
415 |      'rescaling:strategy': 'min/max'},
416 |     {'balancing:strategy': 'weighting',
417 |      'classifier': 'random_forest',
418 |      'fast_ica:algorithm': 'deflation',
419 |      'fast_ica:fun': 'logcosh',
420 |      'fast_ica:n_components': '985.0',
421 |      'fast_ica:whiten': 'True',
422 |      'imputation:strategy': 'most_frequent',
423 |      'preprocessor': 'fast_ica',
424 |      'random_forest:bootstrap': 'False',
425 |      'random_forest:criterion': 'gini',
426 |      'random_forest:max_depth': 'None',
427 |      'random_forest:max_features': '3.87640604363',
428 |      'random_forest:max_leaf_nodes': 'None',
429 |      'random_forest:min_samples_leaf': '2.0',
430 |      'random_forest:min_samples_split': '11.0',
431 |      'random_forest:n_estimators': '100.0',
432 |      'rescaling:strategy': 'min/max'},
433 |     {'balancing:strategy': 'weighting',
434 |      'classifier': 'gradient_boosting',
435 |      'gradient_boosting:learning_rate': '0.236639577539',
436 |      'gradient_boosting:max_depth': '5.0',
437 |      'gradient_boosting:max_features': '1.94802938969',
438 |      'gradient_boosting:min_samples_leaf': '3.0',
439 |      'gradient_boosting:min_samples_split': '4.0',
440 |      'gradient_boosting:n_estimators': '100.0',
441 |      'gradient_boosting:subsample': '0.499388145134',
442 |      'imputation:strategy': 'most_frequent',
443 |      'preprocessor': 'select_rates',
444 |      'rescaling:strategy': 'min/max',
445 |      'select_rates:alpha': '0.078631031495',
446 |      'select_rates:mode': 'fwe',
447 |      'select_rates:score_func': 'f_classif'},
448 |     {'balancing:strategy': 'none',
449 |      'classifier': 'random_forest',
450 |      'imputation:strategy': 'mean',
451 |      'preprocessor': 'select_percentile_classification',
452 |      'random_forest:bootstrap': 'False',
453 |      'random_forest:criterion': 'gini',
454 |      'random_forest:max_depth': 'None',
455 |      'random_forest:max_features': '2.89271865035',
456 |      'random_forest:max_leaf_nodes': 'None',
457 |      'random_forest:min_samples_leaf': '9.0',
458 |      'random_forest:min_samples_split': '2.0',
459 |      'random_forest:n_estimators': '100.0',
460 |      'rescaling:strategy': 'min/max',
461 |      'select_percentile_classification:percentile': '58.6633457276',
462 |      'select_percentile_classification:score_func': 'chi2'},
463 |     {'balancing:strategy': 'none',
464 |      'classifier': 'random_forest',
465 |      'fast_ica:algorithm': 'deflation',
466 |      'fast_ica:fun': 'logcosh',
467 |      'fast_ica:n_components': '1299.0',
468 |      'fast_ica:whiten': 'False',
469 |      'imputation:strategy': 'mean',
470 |      'preprocessor': 'fast_ica',
471 |      'random_forest:bootstrap': 'False',
472 |      'random_forest:criterion': 'entropy',
473 |      'random_forest:max_depth': 'None',
474 |      'random_forest:max_features': '4.38103060363',
475 |      'random_forest:max_leaf_nodes': 'None',
476 |      'random_forest:min_samples_leaf': '3.0',
477 |      'random_forest:min_samples_split': '2.0',
478 |      'random_forest:n_estimators': '100.0',
479 |      'rescaling:strategy': 'min/max'},
480 |     {'balancing:strategy': 'none',
481 |      'classifier': 'random_forest',
482 |      'fast_ica:algorithm': 'deflation',
483 |      'fast_ica:fun': 'logcosh',
484 |      'fast_ica:n_components': '1653.0',
485 |      'fast_ica:whiten': 'True',
486 |      'imputation:strategy': 'median',
487 |      'preprocessor': 'fast_ica',
488 |      'random_forest:bootstrap': 'False',
489 |      'random_forest:criterion': 'entropy',
490 |      'random_forest:max_depth': 'None',
491 |      'random_forest:max_features': '2.58731902957',
492 |      'random_forest:max_leaf_nodes': 'None',
493 |      'random_forest:min_samples_leaf': '8.0',
494 |      'random_forest:min_samples_split': '19.0',
495 |      'random_forest:n_estimators': '100.0',
496 |      'rescaling:strategy': 'min/max'},
497 | ]
498 | 
499 | classifiers = []
500 | predictions_valid = []
501 | predictions_test = []
502 | 
503 | # Make predictions and weight them
504 | for weight, configuration in zip(weights, configurations):
505 |     for param in configuration:
506 |         try:
507 |             configuration[param] = int(configuration[param])
508 |         except Exception:
509 |             try:
510 |                 configuration[param] = float(configuration[param])
511 |             except Exception:
512 |                 pass
513 | 
514 |     classifier = ParamSklearnClassifier(configuration, 1)
515 |     classifiers.append(classifier)
516 |     try:
517 |         classifier.fit(X.copy(), y.copy())
518 |         predictions_valid.append(classifier.predict_proba(X_valid.copy()) * weight)
519 |         predictions_test.append(classifier.predict_proba(X_test.copy()) * weight)
520 |     except Exception as e:
521 |         print e
522 |         print configuration
523 | 
524 | # Output the predictions
525 | for name, predictions in [('valid', predictions_valid),
526 |                           ('test', predictions_test)]:
527 |     predictions = np.array(predictions)
528 |     predictions = np.sum(predictions, axis=0)
529 |     predictions = predictions[:, 1].reshape((-1, 1))
530 | 
531 |     filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
532 |     np.savetxt(filepath, predictions, delimiter=' ')


--------------------------------------------------------------------------------
/001_madeline.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | import numpy as np
  5 | 
  6 | import autosklearn
  7 | import autosklearn.data
  8 | import autosklearn.data.data_manager
  9 | import autosklearn.models.evaluator
 10 | from ParamSklearn.classification import ParamSklearnClassifier
 11 | 
 12 | 
 13 | parser = argparse.ArgumentParser()
 14 | parser.add_argument('input')  # passed to DataManager as second argument
 15 | parser.add_argument('output')
 16 | args = parser.parse_args()
 17 | 
 18 | input = args.input  # NOTE(review): shadows the builtin input()
 19 | dataset = 'madeline'
 20 | output = args.output
 21 | # Load the dataset and pick the train, validation and test entries
 22 | D = autosklearn.data.data_manager.DataManager(dataset, input)
 23 | X = D.data['X_train']
 24 | y = D.data['Y_train']
 25 | X_valid = D.data['X_valid']
 26 | X_test = D.data['X_test']
 27 | 
 28 | # Subset of features found with RFE. Feature with least importance in sklearn
 29 | # RF removed. Afterwards, trained RF on remaining features with 5CV. In the
 30 | # end, choose feature set with lowest error
 31 | features = [52, 70, 74, 83, 85, 135, 162, 183, 184, 185, 191, 197, 232, 237,
 32 |             239, 252]
 33 | 
 34 | X = X[:, features]
 35 | X_valid = X_valid[:, features]
 36 | X_test = X_test[:, features]
 37 | 
 38 | # Ensemble Selection weights of the ensemble members (27 values, sum to 1)
 39 | weights = np.array([0.100000, 0.080000, 0.080000, 0.060000, 0.060000,
 40 |                     0.060000, 0.060000, 0.040000, 0.040000, 0.040000,
 41 |                     0.040000, 0.040000, 0.020000, 0.020000, 0.020000,
 42 |                     0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
 43 |                     0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
 44 |                     0.020000, 0.020000])
 45 | 
 46 | # Ensemble members found by SMAC
 47 | configurations = [
 48 |     {'balancing:strategy': 'weighting',
 49 |      'classifier': 'k_nearest_neighbors',
 50 |      'imputation:strategy': 'median',
 51 |      'k_nearest_neighbors:algorithm': 'auto',
 52 |      'k_nearest_neighbors:leaf_size': '30.0',
 53 |      'k_nearest_neighbors:n_neighbors': '4.0',
 54 |      'k_nearest_neighbors:p': '1.0',
 55 |      'k_nearest_neighbors:weights': 'distance',
 56 |      'preprocessor': 'select_rates',
 57 |      'rescaling:strategy': 'standard',
 58 |      'select_rates:alpha': '0.124513266268',
 59 |      'select_rates:mode': 'fdr',
 60 |      'select_rates:score_func': 'f_classif'},
 61 |     {'balancing:strategy': 'weighting',
 62 |      'classifier': 'qda',
 63 |      'imputation:strategy': 'mean',
 64 |      'kitchen_sinks:gamma': '0.802981892271',
 65 |      'kitchen_sinks:n_components': '704.0',
 66 |      'preprocessor': 'kitchen_sinks',
 67 |      'qda:reg_param': '7.66537661987',
 68 |      'qda:tol': '0.000779904033875',
 69 |      'rescaling:strategy': 'standard'},
 70 |     {'balancing:strategy': 'none',
 71 |      'classifier': 'qda',
 72 |      'imputation:strategy': 'mean',
 73 |      'kitchen_sinks:gamma': '0.658527701661',
 74 |      'kitchen_sinks:n_components': '499.0',
 75 |      'preprocessor': 'kitchen_sinks',
 76 |      'qda:reg_param': '4.13193776587',
 77 |      'qda:tol': '0.0026677961139',
 78 |      'rescaling:strategy': 'standard'},
 79 |     {'balancing:strategy': 'none',
 80 |      'classifier': 'qda',
 81 |      'imputation:strategy': 'mean',
 82 |      'kitchen_sinks:gamma': '0.658527701661',
 83 |      'kitchen_sinks:n_components': '498.0',
 84 |      'preprocessor': 'kitchen_sinks',
 85 |      'qda:reg_param': '7.39545021165',
 86 |      'qda:tol': '0.00116251661342',
 87 |      'rescaling:strategy': 'standard'},
 88 |     {'balancing:strategy': 'none',
 89 |      'classifier': 'qda',
 90 |      'imputation:strategy': 'mean',
 91 |      'kitchen_sinks:gamma': '0.758771699267',
 92 |      'kitchen_sinks:n_components': '794.0',
 93 |      'preprocessor': 'kitchen_sinks',
 94 |      'qda:reg_param': '4.57263430441',
 95 |      'qda:tol': '0.00284918317943',
 96 |      'rescaling:strategy': 'standard'},
 97 |     {'balancing:strategy': 'none',
 98 |      'classifier': 'k_nearest_neighbors',
 99 |      'imputation:strategy': 'most_frequent',
100 |      'k_nearest_neighbors:algorithm': 'auto',
101 |      'k_nearest_neighbors:leaf_size': '30.0',
102 |      'k_nearest_neighbors:n_neighbors': '5.0',
103 |      'k_nearest_neighbors:p': '1.0',
104 |      'k_nearest_neighbors:weights': 'distance',
105 |      'preprocessor': 'select_rates',
106 |      'rescaling:strategy': 'min/max',
107 |      'select_rates:alpha': '0.0683198728939',
108 |      'select_rates:mode': 'fdr',
109 |      'select_rates:score_func': 'f_classif'},
110 |     {'balancing:strategy': 'none',
111 |      'classifier': 'qda',
112 |      'imputation:strategy': 'mean',
113 |      'kitchen_sinks:gamma': '0.773869494191',
114 |      'kitchen_sinks:n_components': '608.0',
115 |      'preprocessor': 'kitchen_sinks',
116 |      'qda:reg_param': '5.34388968302',
117 |      'qda:tol': '0.000118437687463',
118 |      'rescaling:strategy': 'standard'},
119 |     {'balancing:strategy': 'weighting',
120 |      'classifier': 'k_nearest_neighbors',
121 |      'imputation:strategy': 'mean',
122 |      'k_nearest_neighbors:algorithm': 'auto',
123 |      'k_nearest_neighbors:leaf_size': '30.0',
124 |      'k_nearest_neighbors:n_neighbors': '4.0',
125 |      'k_nearest_neighbors:p': '1.0',
126 |      'k_nearest_neighbors:weights': 'distance',
127 |      'preprocessor': 'select_rates',
128 |      'rescaling:strategy': 'min/max',
129 |      'select_rates:alpha': '0.0953909302386',
130 |      'select_rates:mode': 'fdr',
131 |      'select_rates:score_func': 'chi2'},
132 |     {'balancing:strategy': 'none',
133 |      'classifier': 'qda',
134 |      'imputation:strategy': 'mean',
135 |      'kitchen_sinks:gamma': '0.722743897655',
136 |      'kitchen_sinks:n_components': '952.0',
137 |      'preprocessor': 'kitchen_sinks',
138 |      'qda:reg_param': '3.61200930387',
139 |      'qda:tol': '0.000911935213882',
140 |      'rescaling:strategy': 'standard'},
141 |     {'balancing:strategy': 'weighting',
142 |      'classifier': 'k_nearest_neighbors',
143 |      'imputation:strategy': 'most_frequent',
144 |      'k_nearest_neighbors:algorithm': 'auto',
145 |      'k_nearest_neighbors:leaf_size': '30.0',
146 |      'k_nearest_neighbors:n_neighbors': '3.0',
147 |      'k_nearest_neighbors:p': '2.0',
148 |      'k_nearest_neighbors:weights': 'distance',
149 |      'preprocessor': 'select_rates',
150 |      'rescaling:strategy': 'standard',
151 |      'select_rates:alpha': '0.12499749257',
152 |      'select_rates:mode': 'fdr',
153 |      'select_rates:score_func': 'f_classif'},
154 |     {'balancing:strategy': 'none',
155 |      'classifier': 'qda',
156 |      'imputation:strategy': 'most_frequent',
157 |      'kitchen_sinks:gamma': '0.521009778754',
158 |      'kitchen_sinks:n_components': '581.0',
159 |      'preprocessor': 'kitchen_sinks',
160 |      'qda:reg_param': '0.570532656005',
161 |      'qda:tol': '0.00759604479274',
162 |      'rescaling:strategy': 'standard'},
163 |     {'balancing:strategy': 'none',
164 |      'classifier': 'qda',
165 |      'imputation:strategy': 'median',
166 |      'kitchen_sinks:gamma': '0.736334496442',
167 |      'kitchen_sinks:n_components': '590.0',
168 |      'preprocessor': 'kitchen_sinks',
169 |      'qda:reg_param': '8.78913455152',
170 |      'qda:tol': '0.0417125881025',
171 |      'rescaling:strategy': 'standard'},
172 |     {'balancing:strategy': 'weighting',
173 |      'classifier': 'k_nearest_neighbors',
174 |      'imputation:strategy': 'median',
175 |      'k_nearest_neighbors:algorithm': 'auto',
176 |      'k_nearest_neighbors:leaf_size': '30.0',
177 |      'k_nearest_neighbors:n_neighbors': '10.0',
178 |      'k_nearest_neighbors:p': '2.0',
179 |      'k_nearest_neighbors:weights': 'distance',
180 |      'preprocessor': 'select_rates',
181 |      'rescaling:strategy': 'min/max',
182 |      'select_rates:alpha': '0.065583595323',
183 |      'select_rates:mode': 'fdr',
184 |      'select_rates:score_func': 'f_classif'},
185 |     {'balancing:strategy': 'none',
186 |      'classifier': 'qda',
187 |      'imputation:strategy': 'mean',
188 |      'kitchen_sinks:gamma': '0.725282605688',
189 |      'kitchen_sinks:n_components': '591.0',
190 |      'preprocessor': 'kitchen_sinks',
191 |      'qda:reg_param': '4.32023431675',
192 |      'qda:tol': '2.95483713232e-05',
193 |      'rescaling:strategy': 'standard'},
194 |     {'balancing:strategy': 'none',
195 |      'classifier': 'qda',
196 |      'imputation:strategy': 'mean',
197 |      'kitchen_sinks:gamma': '0.686955501206',
198 |      'kitchen_sinks:n_components': '646.0',
199 |      'preprocessor': 'kitchen_sinks',
200 |      'qda:reg_param': '9.58493774318',
201 |      'qda:tol': '0.00612419830773',
202 |      'rescaling:strategy': 'standard'},
203 |     {'balancing:strategy': 'none',
204 |      'classifier': 'k_nearest_neighbors',
205 |      'imputation:strategy': 'median',
206 |      'k_nearest_neighbors:algorithm': 'auto',
207 |      'k_nearest_neighbors:leaf_size': '30.0',
208 |      'k_nearest_neighbors:n_neighbors': '6.0',
209 |      'k_nearest_neighbors:p': '2.0',
210 |      'k_nearest_neighbors:weights': 'distance',
211 |      'preprocessor': 'select_rates',
212 |      'rescaling:strategy': 'min/max',
213 |      'select_rates:alpha': '0.276130352686',
214 |      'select_rates:mode': 'fdr',
215 |      'select_rates:score_func': 'f_classif'},
216 |     {'balancing:strategy': 'none',
217 |      'classifier': 'qda',
218 |      'imputation:strategy': 'most_frequent',
219 |      'kitchen_sinks:gamma': '0.549862378472',
220 |      'kitchen_sinks:n_components': '591.0',
221 |      'preprocessor': 'kitchen_sinks',
222 |      'qda:reg_param': '1.11536443906',
223 |      'qda:tol': '4.98941924261e-05',
224 |      'rescaling:strategy': 'standard'},
225 |     {'balancing:strategy': 'none',
226 |      'classifier': 'qda',
227 |      'imputation:strategy': 'median',
228 |      'kitchen_sinks:gamma': '0.551878628115',
229 |      'kitchen_sinks:n_components': '913.0',
230 |      'preprocessor': 'kitchen_sinks',
231 |      'qda:reg_param': '2.80643663684',
232 |      'qda:tol': '0.0030955537468',
233 |      'rescaling:strategy': 'standard'},
234 |     {'balancing:strategy': 'none',
235 |      'classifier': 'qda',
236 |      'imputation:strategy': 'mean',
237 |      'kitchen_sinks:gamma': '0.797948222068',
238 |      'kitchen_sinks:n_components': '856.0',
239 |      'preprocessor': 'kitchen_sinks',
240 |      'qda:reg_param': '0.753439507859',
241 |      'qda:tol': '0.000179635997544',
242 |      'rescaling:strategy': 'standard'},
243 |     {'balancing:strategy': 'weighting',
244 |      'classifier': 'k_nearest_neighbors',
245 |      'imputation:strategy': 'median',
246 |      'k_nearest_neighbors:algorithm': 'auto',
247 |      'k_nearest_neighbors:leaf_size': '30.0',
248 |      'k_nearest_neighbors:n_neighbors': '6.0',
249 |      'k_nearest_neighbors:p': '2.0',
250 |      'k_nearest_neighbors:weights': 'distance',
251 |      'preprocessor': 'select_rates',
252 |      'rescaling:strategy': 'standard',
253 |      'select_rates:alpha': '0.121674691962',
254 |      'select_rates:mode': 'fdr',
255 |      'select_rates:score_func': 'f_classif'},
256 |     {'balancing:strategy': 'none',
257 |      'classifier': 'qda',
258 |      'imputation:strategy': 'median',
259 |      'kitchen_sinks:gamma': '0.870787144807',
260 |      'kitchen_sinks:n_components': '591.0',
261 |      'preprocessor': 'kitchen_sinks',
262 |      'qda:reg_param': '3.25265485261',
263 |      'qda:tol': '0.000232802336471',
264 |      'rescaling:strategy': 'standard'},
265 |     {'balancing:strategy': 'none',
266 |      'classifier': 'qda',
267 |      'imputation:strategy': 'mean',
268 |      'kitchen_sinks:gamma': '0.725282605688',
269 |      'kitchen_sinks:n_components': '469.0',
270 |      'preprocessor': 'kitchen_sinks',
271 |      'qda:reg_param': '4.32023431675',
272 |      'qda:tol': '6.11461737038e-05',
273 |      'rescaling:strategy': 'standard'},
274 |     {'balancing:strategy': 'none',
275 |      'classifier': 'qda',
276 |      'imputation:strategy': 'mean',
277 |      'kitchen_sinks:gamma': '0.742290491524',
278 |      'kitchen_sinks:n_components': '699.0',
279 |      'preprocessor': 'kitchen_sinks',
280 |      'qda:reg_param': '1.80605719583',
281 |      'qda:tol': '0.00759903394814',
282 |      'rescaling:strategy': 'standard'},
283 |     {'balancing:strategy': 'weighting',
284 |      'classifier': 'k_nearest_neighbors',
285 |      'imputation:strategy': 'mean',
286 |      'k_nearest_neighbors:algorithm': 'auto',
287 |      'k_nearest_neighbors:leaf_size': '30.0',
288 |      'k_nearest_neighbors:n_neighbors': '4.0',
289 |      'k_nearest_neighbors:p': '2.0',
290 |      'k_nearest_neighbors:weights': 'distance',
291 |      'preprocessor': 'select_rates',
292 |      'rescaling:strategy': 'min/max',
293 |      'select_rates:alpha': '0.0556366440458',
294 |      'select_rates:mode': 'fdr',
295 |      'select_rates:score_func': 'f_classif'},
296 |     {'balancing:strategy': 'none',
297 |      'classifier': 'qda',
298 |      'imputation:strategy': 'mean',
299 |      'kitchen_sinks:gamma': '0.69436212216',
300 |      'kitchen_sinks:n_components': '477.0',
301 |      'preprocessor': 'kitchen_sinks',
302 |      'qda:reg_param': '7.19343875838',
303 |      'qda:tol': '0.00130430743783',
304 |      'rescaling:strategy': 'standard'},
305 |     {'balancing:strategy': 'weighting',
306 |      'classifier': 'k_nearest_neighbors',
307 |      'imputation:strategy': 'median',
308 |      'k_nearest_neighbors:algorithm': 'auto',
309 |      'k_nearest_neighbors:leaf_size': '30.0',
310 |      'k_nearest_neighbors:n_neighbors': '8.0',
311 |      'k_nearest_neighbors:p': '1.0',
312 |      'k_nearest_neighbors:weights': 'distance',
313 |      'preprocessor': 'select_rates',
314 |      'rescaling:strategy': 'standard',
315 |      'select_rates:alpha': '0.0962781949808',
316 |      'select_rates:mode': 'fdr',
317 |      'select_rates:score_func': 'f_classif'},
318 |     {'balancing:strategy': 'none',
319 |      'classifier': 'qda',
320 |      'imputation:strategy': 'mean',
321 |      'kitchen_sinks:gamma': '0.680526800011',
322 |      'kitchen_sinks:n_components': '627.0',
323 |      'preprocessor': 'kitchen_sinks',
324 |      'qda:reg_param': '3.3758872613',
325 |      'qda:tol': '0.0025551077682',
326 |      'rescaling:strategy': 'standard'},
327 | ]
328 | 
329 | classifiers = []
330 | predictions_valid = []
331 | predictions_test = []
332 | 
333 | # Make predictions and weight them
334 | for weight, configuration in zip(weights, configurations):
335 |     for param in configuration:
336 |         try:
337 |             configuration[param] = int(configuration[param])
338 |         except Exception:
339 |             try:
340 |                 configuration[param] = float(configuration[param])
341 |             except Exception:
342 |                 pass
343 | 
344 |     classifier = ParamSklearnClassifier(configuration, 1)
345 |     classifiers.append(classifier)
346 |     try:
347 |         classifier.fit(X.copy(), y.copy())
348 |         predictions_valid.append(
349 |             classifier.predict_proba(X_valid.copy()) * weight)
350 |         predictions_test.append(
351 |             classifier.predict_proba(X_test.copy()) * weight)
352 |     except Exception as e:
353 |         print e
354 |         print configuration
355 | 
356 | # Output the predictions
357 | for name, predictions in [('valid', predictions_valid),
358 |                           ('test', predictions_test)]:
359 |     predictions = np.array(predictions)
360 |     predictions = np.sum(predictions, axis=0)
361 |     predictions = predictions[:, 1].reshape((-1, 1))
362 | 
363 |     filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
364 |     np.savetxt(filepath, predictions, delimiter=' ')


--------------------------------------------------------------------------------
/001_philippine.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | import numpy as np
  5 | 
  6 | import autosklearn
  7 | import autosklearn.data
  8 | import autosklearn.data.data_manager
  9 | import autosklearn.models.evaluator
 10 | from ParamSklearn.classification import ParamSklearnClassifier
 11 | 
 12 | 
 13 | parser = argparse.ArgumentParser()
 14 | parser.add_argument('input')
 15 | parser.add_argument('output')
 16 | args = parser.parse_args()
 17 | 
 18 | input = args.input
 19 | dataset = 'philippine'
 20 | output = args.output
 21 | 
 22 | D = autosklearn.data.data_manager.DataManager(dataset, input)
 23 | X = D.data['X_train']
 24 | y = D.data['Y_train']
 25 | X_valid = D.data['X_valid']
 26 | X_test = D.data['X_test']
 27 | 
 28 | # Subset of features found with RFE: the feature with the least importance in
 29 | # an sklearn RF is removed and an RF is retrained on the remaining features
 30 | # with 5-fold CV. In the end, the feature set with the lowest error is chosen.
 31 | features = [33, 89, 140, 168, 178, 271]
 32 | 
 33 | X = X[:, features]
 34 | X_valid = X_valid[:, features]
 35 | X_test = X_test[:, features]
 36 | 
 37 | # Weights of the ensemble members as determined by Ensemble Selection
 38 | weights = np.array([0.100000, 0.080000, 0.080000, 0.060000, 0.040000,
 39 |                     0.040000, 0.040000, 0.040000, 0.040000, 0.040000,
 40 |                     0.040000, 0.020000, 0.020000, 0.020000, 0.020000,
 41 |                     0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
 42 |                     0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
 43 |                     0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
 44 |                     0.020000])
 45 | 
 46 | # Ensemble members found by SMAC
 47 | configurations = [
 48 |     {'adaboost:algorithm': 'SAMME.R',
 49 |      'adaboost:learning_rate': '0.243038132773',
 50 |      'adaboost:max_depth': '9.0',
 51 |      'adaboost:n_estimators': '475.0',
 52 |      'balancing:strategy': 'none',
 53 |      'classifier': 'adaboost',
 54 |      'feature_agglomeration:affinity': 'cosine',
 55 |      'feature_agglomeration:linkage': 'complete',
 56 |      'feature_agglomeration:n_clusters': '287.0',
 57 |      'imputation:strategy': 'most_frequent',
 58 |      'preprocessor': 'feature_agglomeration',
 59 |      'rescaling:strategy': 'none',
 60 |     },
 61 |     {
 62 |         'adaboost:algorithm': 'SAMME.R',
 63 |         'adaboost:learning_rate': '0.246430392425',
 64 |         'adaboost:max_depth': '9.0',
 65 |         'adaboost:n_estimators': '436.0',
 66 |         'balancing:strategy': 'weighting',
 67 |         'classifier': 'adaboost',
 68 |         'feature_agglomeration:affinity': 'manhattan',
 69 |         'feature_agglomeration:linkage': 'average',
 70 |         'feature_agglomeration:n_clusters': '156.0',
 71 |         'imputation:strategy': 'median',
 72 |         'preprocessor': 'feature_agglomeration',
 73 |         'rescaling:strategy': 'standard',
 74 |     },
 75 |     {
 76 |         'adaboost:algorithm': 'SAMME.R',
 77 |         'adaboost:learning_rate': '0.205679811363',
 78 |         'adaboost:max_depth': '9.0',
 79 |         'adaboost:n_estimators': '485.0',
 80 |         'balancing:strategy': 'none',
 81 |         'classifier': 'adaboost',
 82 |         'feature_agglomeration:affinity': 'euclidean',
 83 |         'feature_agglomeration:linkage': 'complete',
 84 |         'feature_agglomeration:n_clusters': '79.0',
 85 |         'imputation:strategy': 'most_frequent',
 86 |         'preprocessor': 'feature_agglomeration',
 87 |         'rescaling:strategy': 'min/max',
 88 |     },
 89 |     {
 90 |         'adaboost:algorithm': 'SAMME.R',
 91 |         'adaboost:learning_rate': '0.250841964136',
 92 |         'adaboost:max_depth': '10.0',
 93 |         'adaboost:n_estimators': '479.0',
 94 |         'balancing:strategy': 'none',
 95 |         'classifier': 'adaboost',
 96 |         'feature_agglomeration:affinity': 'euclidean',
 97 |         'feature_agglomeration:linkage': 'average',
 98 |         'feature_agglomeration:n_clusters': '352.0',
 99 |         'imputation:strategy': 'median',
100 |         'preprocessor': 'feature_agglomeration',
101 |         'rescaling:strategy': 'none',
102 |     },
103 |     {
104 |         'adaboost:algorithm': 'SAMME.R',
105 |         'adaboost:learning_rate': '0.329040651125',
106 |         'adaboost:max_depth': '10.0',
107 |         'adaboost:n_estimators': '493.0',
108 |         'balancing:strategy': 'weighting',
109 |         'classifier': 'adaboost',
110 |         'feature_agglomeration:affinity': 'manhattan',
111 |         'feature_agglomeration:linkage': 'average',
112 |         'feature_agglomeration:n_clusters': '268.0',
113 |         'imputation:strategy': 'most_frequent',
114 |         'preprocessor': 'feature_agglomeration',
115 |         'rescaling:strategy': 'min/max',
116 |     },
117 |     {
118 |         'adaboost:algorithm': 'SAMME.R',
119 |         'adaboost:learning_rate': '0.376704790019',
120 |         'adaboost:max_depth': '10.0',
121 |         'adaboost:n_estimators': '400.0',
122 |         'balancing:strategy': 'weighting',
123 |         'classifier': 'adaboost',
124 |         'feature_agglomeration:affinity': 'euclidean',
125 |         'feature_agglomeration:linkage': 'ward',
126 |         'feature_agglomeration:n_clusters': '344.0',
127 |         'imputation:strategy': 'median',
128 |         'preprocessor': 'feature_agglomeration',
129 |         'rescaling:strategy': 'min/max',
130 |     },
131 |     {
132 |         'adaboost:algorithm': 'SAMME.R',
133 |         'adaboost:learning_rate': '0.483824181899',
134 |         'adaboost:max_depth': '9.0',
135 |         'adaboost:n_estimators': '479.0',
136 |         'balancing:strategy': 'weighting',
137 |         'classifier': 'adaboost',
138 |         'feature_agglomeration:affinity': 'cosine',
139 |         'feature_agglomeration:linkage': 'average',
140 |         'feature_agglomeration:n_clusters': '310.0',
141 |         'imputation:strategy': 'most_frequent',
142 |         'preprocessor': 'feature_agglomeration',
143 |         'rescaling:strategy': 'min/max',
144 |     },
145 |     {
146 |         'adaboost:algorithm': 'SAMME.R',
147 |         'adaboost:learning_rate': '0.246430392425',
148 |         'adaboost:max_depth': '9.0',
149 |         'adaboost:n_estimators': '494.0',
150 |         'balancing:strategy': 'weighting',
151 |         'classifier': 'adaboost',
152 |         'feature_agglomeration:affinity': 'cosine',
153 |         'feature_agglomeration:linkage': 'average',
154 |         'feature_agglomeration:n_clusters': '156.0',
155 |         'imputation:strategy': 'median',
156 |         'preprocessor': 'feature_agglomeration',
157 |         'rescaling:strategy': 'min/max',
158 |     },
159 |     {
160 |         'adaboost:algorithm': 'SAMME.R',
161 |         'adaboost:learning_rate': '0.319596208353',
162 |         'adaboost:max_depth': '10.0',
163 |         'adaboost:n_estimators': '446.0',
164 |         'balancing:strategy': 'weighting',
165 |         'classifier': 'adaboost',
166 |         'feature_agglomeration:affinity': 'euclidean',
167 |         'feature_agglomeration:linkage': 'complete',
168 |         'feature_agglomeration:n_clusters': '65.0',
169 |         'imputation:strategy': 'mean',
170 |         'preprocessor': 'feature_agglomeration',
171 |         'rescaling:strategy': 'min/max',
172 |     },
173 |     {
174 |         'adaboost:algorithm': 'SAMME.R',
175 |         'adaboost:learning_rate': '0.208071429428',
176 |         'adaboost:max_depth': '9.0',
177 |         'adaboost:n_estimators': '487.0',
178 |         'balancing:strategy': 'weighting',
179 |         'classifier': 'adaboost',
180 |         'feature_agglomeration:affinity': 'cosine',
181 |         'feature_agglomeration:linkage': 'complete',
182 |         'feature_agglomeration:n_clusters': '219.0',
183 |         'imputation:strategy': 'most_frequent',
184 |         'preprocessor': 'feature_agglomeration',
185 |         'rescaling:strategy': 'none',
186 |     },
187 |     {
188 |         'adaboost:algorithm': 'SAMME.R',
189 |         'adaboost:learning_rate': '0.362379903949',
190 |         'adaboost:max_depth': '10.0',
191 |         'adaboost:n_estimators': '389.0',
192 |         'balancing:strategy': 'none',
193 |         'classifier': 'adaboost',
194 |         'feature_agglomeration:affinity': 'cosine',
195 |         'feature_agglomeration:linkage': 'complete',
196 |         'feature_agglomeration:n_clusters': '123.0',
197 |         'imputation:strategy': 'most_frequent',
198 |         'preprocessor': 'feature_agglomeration',
199 |         'rescaling:strategy': 'min/max',
200 |     },
201 |     {
202 |         'adaboost:algorithm': 'SAMME.R',
203 |         'adaboost:learning_rate': '0.468508930474',
204 |         'adaboost:max_depth': '10.0',
205 |         'adaboost:n_estimators': '477.0',
206 |         'balancing:strategy': 'weighting',
207 |         'classifier': 'adaboost',
208 |         'feature_agglomeration:affinity': 'euclidean',
209 |         'feature_agglomeration:linkage': 'average',
210 |         'feature_agglomeration:n_clusters': '244.0',
211 |         'imputation:strategy': 'median',
212 |         'preprocessor': 'feature_agglomeration',
213 |         'rescaling:strategy': 'min/max',
214 |     },
215 |     {
216 |         'adaboost:algorithm': 'SAMME.R',
217 |         'adaboost:learning_rate': '0.284273806405',
218 |         'adaboost:max_depth': '9.0',
219 |         'adaboost:n_estimators': '483.0',
220 |         'balancing:strategy': 'none',
221 |         'classifier': 'adaboost',
222 |         'feature_agglomeration:affinity': 'cosine',
223 |         'feature_agglomeration:linkage': 'complete',
224 |         'feature_agglomeration:n_clusters': '174.0',
225 |         'imputation:strategy': 'median',
226 |         'preprocessor': 'feature_agglomeration',
227 |         'rescaling:strategy': 'min/max',
228 |     },
229 |     {
230 |         'adaboost:algorithm': 'SAMME.R',
231 |         'adaboost:learning_rate': '0.2635286978',
232 |         'adaboost:max_depth': '10.0',
233 |         'adaboost:n_estimators': '482.0',
234 |         'balancing:strategy': 'none',
235 |         'classifier': 'adaboost',
236 |         'feature_agglomeration:affinity': 'manhattan',
237 |         'feature_agglomeration:linkage': 'average',
238 |         'feature_agglomeration:n_clusters': '118.0',
239 |         'imputation:strategy': 'most_frequent',
240 |         'preprocessor': 'feature_agglomeration',
241 |         'rescaling:strategy': 'min/max',
242 |     },
243 |     {
244 |         'adaboost:algorithm': 'SAMME.R',
245 |         'adaboost:learning_rate': '0.326966274076',
246 |         'adaboost:max_depth': '10.0',
247 |         'adaboost:n_estimators': '494.0',
248 |         'balancing:strategy': 'none',
249 |         'classifier': 'adaboost',
250 |         'feature_agglomeration:affinity': 'euclidean',
251 |         'feature_agglomeration:linkage': 'average',
252 |         'feature_agglomeration:n_clusters': '87.0',
253 |         'imputation:strategy': 'most_frequent',
254 |         'preprocessor': 'feature_agglomeration',
255 |         'rescaling:strategy': 'min/max',
256 |     },
257 |     {
258 |         'adaboost:algorithm': 'SAMME.R',
259 |         'adaboost:learning_rate': '0.239427049389',
260 |         'adaboost:max_depth': '9.0',
261 |         'adaboost:n_estimators': '393.0',
262 |         'balancing:strategy': 'none',
263 |         'classifier': 'adaboost',
264 |         'feature_agglomeration:affinity': 'euclidean',
265 |         'feature_agglomeration:linkage': 'complete',
266 |         'feature_agglomeration:n_clusters': '331.0',
267 |         'imputation:strategy': 'most_frequent',
268 |         'preprocessor': 'feature_agglomeration',
269 |         'rescaling:strategy': 'min/max',
270 |     },
271 |     {
272 |         'adaboost:algorithm': 'SAMME.R',
273 |         'adaboost:learning_rate': '0.272345990341',
274 |         'adaboost:max_depth': '10.0',
275 |         'adaboost:n_estimators': '478.0',
276 |         'balancing:strategy': 'none',
277 |         'classifier': 'adaboost',
278 |         'feature_agglomeration:affinity': 'manhattan',
279 |         'feature_agglomeration:linkage': 'average',
280 |         'feature_agglomeration:n_clusters': '20.0',
281 |         'imputation:strategy': 'most_frequent',
282 |         'preprocessor': 'feature_agglomeration',
283 |         'rescaling:strategy': 'standard',
284 |     },
285 |     {
286 |         'adaboost:algorithm': 'SAMME.R',
287 |         'adaboost:learning_rate': '0.36300772469',
288 |         'adaboost:max_depth': '10.0',
289 |         'adaboost:n_estimators': '430.0',
290 |         'balancing:strategy': 'weighting',
291 |         'classifier': 'adaboost',
292 |         'feature_agglomeration:affinity': 'euclidean',
293 |         'feature_agglomeration:linkage': 'complete',
294 |         'feature_agglomeration:n_clusters': '88.0',
295 |         'imputation:strategy': 'median',
296 |         'preprocessor': 'feature_agglomeration',
297 |         'rescaling:strategy': 'min/max',
298 |     },
299 |     {
300 |         'adaboost:algorithm': 'SAMME.R',
301 |         'adaboost:learning_rate': '0.29318612753',
302 |         'adaboost:max_depth': '10.0',
303 |         'adaboost:n_estimators': '418.0',
304 |         'balancing:strategy': 'weighting',
305 |         'classifier': 'adaboost',
306 |         'feature_agglomeration:affinity': 'cosine',
307 |         'feature_agglomeration:linkage': 'complete',
308 |         'feature_agglomeration:n_clusters': '220.0',
309 |         'imputation:strategy': 'median',
310 |         'preprocessor': 'feature_agglomeration',
311 |         'rescaling:strategy': 'standard',
312 |     },
313 |     {
314 |         'adaboost:algorithm': 'SAMME.R',
315 |         'adaboost:learning_rate': '0.315769388471',
316 |         'adaboost:max_depth': '10.0',
317 |         'adaboost:n_estimators': '494.0',
318 |         'balancing:strategy': 'none',
319 |         'classifier': 'adaboost',
320 |         'feature_agglomeration:affinity': 'euclidean',
321 |         'feature_agglomeration:linkage': 'average',
322 |         'feature_agglomeration:n_clusters': '270.0',
323 |         'imputation:strategy': 'median',
324 |         'preprocessor': 'feature_agglomeration',
325 |         'rescaling:strategy': 'min/max',
326 |     },
327 |     {
328 |         'adaboost:algorithm': 'SAMME.R',
329 |         'adaboost:learning_rate': '0.295544282435',
330 |         'adaboost:max_depth': '9.0',
331 |         'adaboost:n_estimators': '478.0',
332 |         'balancing:strategy': 'none',
333 |         'classifier': 'adaboost',
334 |         'feature_agglomeration:affinity': 'euclidean',
335 |         'feature_agglomeration:linkage': 'average',
336 |         'feature_agglomeration:n_clusters': '195.0',
337 |         'imputation:strategy': 'most_frequent',
338 |         'preprocessor': 'feature_agglomeration',
339 |         'rescaling:strategy': 'min/max',
340 |     },
341 |     {
342 |         'adaboost:algorithm': 'SAMME.R',
343 |         'adaboost:learning_rate': '0.298219714131',
344 |         'adaboost:max_depth': '9.0',
345 |         'adaboost:n_estimators': '473.0',
346 |         'balancing:strategy': 'none',
347 |         'classifier': 'adaboost',
348 |         'feature_agglomeration:affinity': 'euclidean',
349 |         'feature_agglomeration:linkage': 'average',
350 |         'feature_agglomeration:n_clusters': '39.0',
351 |         'imputation:strategy': 'mean',
352 |         'preprocessor': 'feature_agglomeration',
353 |         'rescaling:strategy': 'standard',
354 |     },
355 |     {
356 |         'adaboost:algorithm': 'SAMME.R',
357 |         'adaboost:learning_rate': '0.370877623224',
358 |         'adaboost:max_depth': '10.0',
359 |         'adaboost:n_estimators': '382.0',
360 |         'balancing:strategy': 'none',
361 |         'classifier': 'adaboost',
362 |         'feature_agglomeration:affinity': 'euclidean',
363 |         'feature_agglomeration:linkage': 'average',
364 |         'feature_agglomeration:n_clusters': '331.0',
365 |         'imputation:strategy': 'most_frequent',
366 |         'preprocessor': 'feature_agglomeration',
367 |         'rescaling:strategy': 'min/max',
368 |     },
369 |     {
370 |         'adaboost:algorithm': 'SAMME.R',
371 |         'adaboost:learning_rate': '0.339058617161',
372 |         'adaboost:max_depth': '10.0',
373 |         'adaboost:n_estimators': '466.0',
374 |         'balancing:strategy': 'none',
375 |         'classifier': 'adaboost',
376 |         'feature_agglomeration:affinity': 'manhattan',
377 |         'feature_agglomeration:linkage': 'complete',
378 |         'feature_agglomeration:n_clusters': '38.0',
379 |         'imputation:strategy': 'most_frequent',
380 |         'preprocessor': 'feature_agglomeration',
381 |         'rescaling:strategy': 'standard',
382 |     },
383 |     {
384 |         'adaboost:algorithm': 'SAMME.R',
385 |         'adaboost:learning_rate': '0.272345990341',
386 |         'adaboost:max_depth': '10.0',
387 |         'adaboost:n_estimators': '478.0',
388 |         'balancing:strategy': 'weighting',
389 |         'classifier': 'adaboost',
390 |         'feature_agglomeration:affinity': 'cosine',
391 |         'feature_agglomeration:linkage': 'average',
392 |         'feature_agglomeration:n_clusters': '68.0',
393 |         'imputation:strategy': 'most_frequent',
394 |         'preprocessor': 'feature_agglomeration',
395 |         'rescaling:strategy': 'none',
396 |     },
397 |     {
398 |         'adaboost:algorithm': 'SAMME.R',
399 |         'adaboost:learning_rate': '0.268568387674',
400 |         'adaboost:max_depth': '10.0',
401 |         'adaboost:n_estimators': '499.0',
402 |         'balancing:strategy': 'none',
403 |         'classifier': 'adaboost',
404 |         'feature_agglomeration:affinity': 'manhattan',
405 |         'feature_agglomeration:linkage': 'average',
406 |         'feature_agglomeration:n_clusters': '78.0',
407 |         'imputation:strategy': 'most_frequent',
408 |         'preprocessor': 'feature_agglomeration',
409 |         'rescaling:strategy': 'standard',
410 |     },
411 |     {
412 |         'adaboost:algorithm': 'SAMME.R',
413 |         'adaboost:learning_rate': '0.286357615604',
414 |         'adaboost:max_depth': '9.0',
415 |         'adaboost:n_estimators': '490.0',
416 |         'balancing:strategy': 'weighting',
417 |         'classifier': 'adaboost',
418 |         'feature_agglomeration:affinity': 'euclidean',
419 |         'feature_agglomeration:linkage': 'ward',
420 |         'feature_agglomeration:n_clusters': '220.0',
421 |         'imputation:strategy': 'median',
422 |         'preprocessor': 'feature_agglomeration',
423 |         'rescaling:strategy': 'min/max',
424 |     },
425 |     {
426 |         'adaboost:algorithm': 'SAMME.R',
427 |         'adaboost:learning_rate': '0.377112372612',
428 |         'adaboost:max_depth': '10.0',
429 |         'adaboost:n_estimators': '458.0',
430 |         'balancing:strategy': 'weighting',
431 |         'classifier': 'adaboost',
432 |         'feature_agglomeration:affinity': 'euclidean',
433 |         'feature_agglomeration:linkage': 'ward',
434 |         'feature_agglomeration:n_clusters': '125.0',
435 |         'imputation:strategy': 'most_frequent',
436 |         'preprocessor': 'feature_agglomeration',
437 |         'rescaling:strategy': 'min/max',
438 |     },
439 |     {
440 |         'adaboost:algorithm': 'SAMME.R',
441 |         'adaboost:learning_rate': '0.400954561452',
442 |         'adaboost:max_depth': '10.0',
443 |         'adaboost:n_estimators': '408.0',
444 |         'balancing:strategy': 'none',
445 |         'classifier': 'adaboost',
446 |         'feature_agglomeration:affinity': 'euclidean',
447 |         'feature_agglomeration:linkage': 'average',
448 |         'feature_agglomeration:n_clusters': '345.0',
449 |         'imputation:strategy': 'median',
450 |         'preprocessor': 'feature_agglomeration',
451 |         'rescaling:strategy': 'min/max',
452 |     },
453 |     {
454 |         'adaboost:algorithm': 'SAMME.R',
455 |         'adaboost:learning_rate': '0.196044249482',
456 |         'adaboost:max_depth': '9.0',
457 |         'adaboost:n_estimators': '494.0',
458 |         'balancing:strategy': 'none',
459 |         'classifier': 'adaboost',
460 |         'feature_agglomeration:affinity': 'manhattan',
461 |         'feature_agglomeration:linkage': 'average',
462 |         'feature_agglomeration:n_clusters': '182.0',
463 |         'imputation:strategy': 'median',
464 |         'preprocessor': 'feature_agglomeration',
465 |         'rescaling:strategy': 'min/max',
466 |     },
467 |     {
468 |         'adaboost:algorithm': 'SAMME.R',
469 |         'adaboost:learning_rate': '0.312315129765',
470 |         'adaboost:max_depth': '10.0',
471 |         'adaboost:n_estimators': '442.0',
472 |         'balancing:strategy': 'weighting',
473 |         'classifier': 'adaboost',
474 |         'feature_agglomeration:affinity': 'manhattan',
475 |         'feature_agglomeration:linkage': 'complete',
476 |         'feature_agglomeration:n_clusters': '347.0',
477 |         'imputation:strategy': 'median',
478 |         'preprocessor': 'feature_agglomeration',
479 |         'rescaling:strategy': 'none'}
480 | ]
481 | 
482 | classifiers = []
483 | predictions_valid = []
484 | predictions_test = []
485 | 
486 | # Make predictions and weight them
487 | for weight, configuration in zip(weights, configurations):
488 |     for param in configuration:
489 |         try:
490 |             configuration[param] = int(configuration[param])
491 |         except Exception:
492 |             try:
493 |                 configuration[param] = float(configuration[param])
494 |             except Exception:
495 |                 pass
496 | 
497 |     classifier = ParamSklearnClassifier(configuration, 1)
498 |     classifiers.append(classifier)
499 |     try:
500 |         classifier.fit(X.copy(), y.copy())
501 |         predictions_valid.append(
502 |             classifier.predict_proba(X_valid.copy()) * weight)
503 |         predictions_test.append(
504 |             classifier.predict_proba(X_test.copy()) * weight)
505 |     except Exception as e:
506 |         print e
507 |         print configuration
508 | 
509 | # Output the predictions
510 | for name, predictions in [('valid', predictions_valid),
511 |                           ('test', predictions_test)]:
512 |     predictions = np.array(predictions)
513 |     predictions = np.sum(predictions, axis=0)
514 |     predictions = predictions[:, 1].reshape((-1, 1))
515 | 
516 |     filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
517 |     np.savetxt(filepath, predictions, delimiter=' ')


--------------------------------------------------------------------------------
/001_sylvine.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | import numpy as np
  5 | 
  6 | import autosklearn
  7 | import autosklearn.data
  8 | import autosklearn.data.data_manager
  9 | import autosklearn.models.evaluator
 10 | from ParamSklearn.classification import ParamSklearnClassifier
 11 | 
 12 | 
 13 | parser = argparse.ArgumentParser()
 14 | parser.add_argument('input')
 15 | parser.add_argument('output')
 16 | args = parser.parse_args()
 17 | 
 18 | input = args.input
 19 | dataset = 'sylvine'
 20 | output = args.output
 21 | 
 22 | D = autosklearn.data.data_manager.DataManager(dataset, input)
 23 | X = D.data['X_train']
 24 | y = D.data['Y_train']
 25 | X_valid = D.data['X_valid']
 26 | X_test = D.data['X_test']
 27 | 
 28 | # Subset of features found with RFE. Feature with least importance in sklearn
 29 | # RF removed. Afterwards, trained RF on remaining features with 5CV. In the
 30 | # end, choose feature set with lowest error
 31 | features = [6, 8, 9, 14]
 32 | 
 33 | X = X[:, features]
 34 | X_valid = X_valid[:, features]
 35 | X_test = X_test[:, features]
 36 | 
 37 | # Weights found by Ensemble Selection (sum: 1.0), one per configurations entry
 38 | weights = np.array([0.420000, 0.360000, 0.060000, 0.040000, 0.040000,
 39 |                     0.040000, 0.020000, 0.020000])
 40 | 
 41 | # Ensemble members found by SMAC; values are strings, parsed before fitting
 42 | configurations = [
 43 |     {'balancing:strategy': 'none',
 44 |      'classifier': 'qda',
 45 |      'imputation:strategy': 'median',
 46 |      'kitchen_sinks:gamma': '1.92120672046',
 47 |      'kitchen_sinks:n_components': '716.0',
 48 |      'preprocessor': 'kitchen_sinks',
 49 |      'qda:reg_param': '1.58062868571',
 50 |      'qda:tol': '0.0247837474409',
 51 |      'rescaling:strategy': 'standard', },
 52 |     {'balancing:strategy': 'none',
 53 |      'classifier': 'qda',
 54 |      'imputation:strategy': 'most_frequent',
 55 |      'kitchen_sinks:gamma': '1.61329137115',
 56 |      'kitchen_sinks:n_components': '500.0',
 57 |      'preprocessor': 'kitchen_sinks',
 58 |      'qda:reg_param': '5.45636866541',
 59 |      'qda:tol': '5.69425859943e-05',
 60 |      'rescaling:strategy': 'min/max', },
 61 |     {'balancing:strategy': 'weighting',
 62 |      'classifier': 'qda',
 63 |      'imputation:strategy': 'most_frequent',
 64 |      'kitchen_sinks:gamma': '1.95127135806',
 65 |      'kitchen_sinks:n_components': '564.0',
 66 |      'preprocessor': 'kitchen_sinks',
 67 |      'qda:reg_param': '0.512205857283',
 68 |      'qda:tol': '0.000168304749916',
 69 |      'rescaling:strategy': 'standard', },
 70 |     {'balancing:strategy': 'weighting',
 71 |      'classifier': 'qda',
 72 |      'imputation:strategy': 'median',
 73 |      'kitchen_sinks:gamma': '1.8592926955',
 74 |      'kitchen_sinks:n_components': '539.0',
 75 |      'preprocessor': 'kitchen_sinks',
 76 |      'qda:reg_param': '7.384724657',
 77 |      'qda:tol': '0.0200780040497',
 78 |      'rescaling:strategy': 'standard', },
 79 |     {'balancing:strategy': 'none',
 80 |      'classifier': 'qda',
 81 |      'imputation:strategy': 'median',
 82 |      'kitchen_sinks:gamma': '0.968569589575',
 83 |      'kitchen_sinks:n_components': '528.0',
 84 |      'preprocessor': 'kitchen_sinks',
 85 |      'qda:reg_param': '5.73540397488',
 86 |      'qda:tol': '0.00632432527713',
 87 |      'rescaling:strategy': 'min/max', },
 88 |     {'balancing:strategy': 'weighting',
 89 |      'classifier': 'qda',
 90 |      'imputation:strategy': 'most_frequent',
 91 |      'kitchen_sinks:gamma': '1.7159380388',
 92 |      'kitchen_sinks:n_components': '586.0',
 93 |      'preprocessor': 'kitchen_sinks',
 94 |      'qda:reg_param': '4.84995966137',
 95 |      'qda:tol': '0.0143521983037',
 96 |      'rescaling:strategy': 'standard', },
 97 |     {'balancing:strategy': 'weighting',  # only member using nystroem_sampler
 98 |      'classifier': 'qda',
 99 |      'imputation:strategy': 'median',
100 |      'nystroem_sampler:gamma': '3.79316084659',
101 |      'nystroem_sampler:kernel': 'rbf',
102 |      'nystroem_sampler:n_components': '516.0',
103 |      'preprocessor': 'nystroem_sampler',
104 |      'qda:reg_param': '9.63571710058',
105 |      'qda:tol': '0.00901955088569',
106 |      'rescaling:strategy': 'min/max', },
107 |     {'balancing:strategy': 'weighting',
108 |      'classifier': 'qda',
109 |      'imputation:strategy': 'most_frequent',
110 |      'kitchen_sinks:gamma': '1.85336603609',
111 |      'kitchen_sinks:n_components': '509.0',
112 |      'preprocessor': 'kitchen_sinks',
113 |      'qda:reg_param': '8.57076337966',
114 |      'qda:tol': '0.000361249119707',
115 |      'rescaling:strategy': 'standard'}
116 | ]
117 | 
118 | classifiers = []
119 | predictions_valid = []
120 | predictions_test = []
121 | 
122 | # Make predictions and weight them
123 | for weight, configuration in zip(weights, configurations):
124 |     for param in configuration:
125 |         try:
126 |             configuration[param] = int(configuration[param])
127 |         except Exception:
128 |             try:
129 |                 configuration[param] = float(configuration[param])
130 |             except Exception:
131 |                 pass
132 | 
133 |     classifier = ParamSklearnClassifier(configuration, 1)
134 |     classifiers.append(classifier)
135 |     try:
136 |         classifier.fit(X.copy(), y.copy())
137 |         predictions_valid.append(
138 |             classifier.predict_proba(X_valid.copy()) * weight)
139 |         predictions_test.append(
140 |             classifier.predict_proba(X_test.copy()) * weight)
141 |     except Exception as e:
142 |         print e
143 |         print configuration
144 | 
145 | # Output the predictions
146 | for name, predictions in [('valid', predictions_valid),
147 |                           ('test', predictions_test)]:
148 |     predictions = np.array(predictions)
149 |     predictions = np.sum(predictions, axis=0)
150 |     predictions = predictions[:,1].reshape((-1, 1))
151 | 
152 |     filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
153 |     np.savetxt(filepath, predictions, delimiter=' ')


--------------------------------------------------------------------------------
/002_albert.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | 
 4 | import numpy as np
 5 | from sklearn.cross_validation import StratifiedKFold
 6 | 
 7 | import autosklearn
 8 | import autosklearn.data
 9 | import autosklearn.data.competition_data_manager
10 | from autosklearn.evaluation.util import calculate_score
11 | from ParamSklearn.classification import ParamSklearnClassifier
12 | 
13 | 
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('input')
16 | parser.add_argument('output')
17 | args = parser.parse_args()
18 | 
19 | input = args.input
20 | dataset = 'albert'
21 | output = args.output
22 | 
23 | path = os.path.join(input, dataset)
24 | D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
25 | X = D.data['X_train']
26 | y = D.data['Y_train']
27 | X_valid = D.data['X_valid']
28 | X_test = D.data['X_test']
29 | 
30 | # Ensemble as (weight, classifier) pairs; replace this array by a new ensemble
31 | choices = \
32 |     [(1.0, ParamSklearnClassifier(configuration={
33 |         'balancing:strategy': 'weighting',
34 |         'classifier:__choice__': 'sgd',
35 |         'classifier:sgd:loss': 'hinge',
36 |         'classifier:sgd:penalty': 'l2',
37 |         'classifier:sgd:alpha': 0.0001,
38 |         'classifier:sgd:fit_intercept': True,  # NOTE(review): bool, not 'True'
39 |         'classifier:sgd:n_iter': 5,
40 |         'classifier:sgd:learning_rate': 'optimal',
41 |         'classifier:sgd:eta0': 0.01,
42 |         'classifier:sgd:average': True,  # NOTE(review): bool, not 'True'
43 |         'imputation:strategy': 'mean',
44 |         'one_hot_encoding:use_minimum_fraction': 'True',
45 |         'one_hot_encoding:minimum_fraction': 0.1,
46 |         'preprocessor:__choice__': 'no_preprocessing',
47 |         'rescaling:__choice__': 'min/max'}))]
48 | 
49 | classifiers = []
50 | targets = []
51 | predictions = []
52 | predictions_valid = []
53 | predictions_test = []
54 | 
55 | # Make predictions and weight them
56 | iteration = 0
57 | for weight, classifier in choices:
58 |     iteration += 1
59 |     print dataset, "Iteration %d/%d" % (iteration, len(choices))
60 |     classifiers.append(classifier)
61 |     try:
62 |         classifier.fit(X.copy(), y.copy())
63 |         predictions_valid.append(
64 |             classifier.predict_proba(X_valid.copy()) * weight)
65 |         predictions_test.append(
66 |             classifier.predict_proba(X_test.copy()) * weight)
67 |     except Exception as e:
68 |         print e
69 |         print classifier.configuration
70 | 
71 | # Output the predictions
72 | for name, predictions in [('valid', predictions_valid),
73 |                           ('test', predictions_test)]:
74 |     predictions = np.array(predictions)
75 |     predictions = np.sum(predictions, axis=0)
76 |     predictions = predictions[:, 1].reshape((-1, 1))
77 | 
78 |     filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
79 |     np.savetxt(filepath, predictions, delimiter=' ')
80 | 


--------------------------------------------------------------------------------
/002_dilbert.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | import numpy as np
  5 | from sklearn.cross_validation import StratifiedKFold
  6 | 
  7 | import autosklearn
  8 | import autosklearn.data
  9 | import autosklearn.data.competition_data_manager
 10 | from autosklearn.evaluation.util import calculate_score
 11 | from ParamSklearn.classification import ParamSklearnClassifier
 12 | 
 13 | 
 14 | parser = argparse.ArgumentParser()
 15 | parser.add_argument('input')
 16 | parser.add_argument('output')
 17 | args = parser.parse_args()
 18 | 
 19 | input = args.input
 20 | dataset = 'dilbert'
 21 | output = args.output
 22 | 
 23 | path = os.path.join(input, dataset)
 24 | D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
 25 | X = D.data['X_train']
 26 | y = D.data['Y_train']
 27 | X_valid = D.data['X_valid']
 28 | X_test = D.data['X_test']
 29 | 
 30 | # Ensemble as (weight, classifier) pairs; replace this array by a new ensemble
 31 | choices = \
 32 |     [(0.220000, ParamSklearnClassifier(
 33 |         configuration={
 34 |             'balancing:strategy': 'weighting',
 35 |             'classifier:__choice__': 'passive_aggressive',
 36 |             'classifier:passive_aggressive:C': 0.0022574783522003694,
 37 |             'classifier:passive_aggressive:fit_intercept': 'True',
 38 |             'classifier:passive_aggressive:loss': 'hinge',
 39 |             'classifier:passive_aggressive:n_iter': 119,
 40 |             'imputation:strategy': 'most_frequent',
 41 |             'one_hot_encoding:minimum_fraction': 0.1898871876010834,
 42 |             'one_hot_encoding:use_minimum_fraction': 'True',
 43 |             'preprocessor:__choice__': 'gem',
 44 |             'preprocessor:gem:N': 20,
 45 |             'preprocessor:gem:precond': 0.27540716190663134,
 46 |             'rescaling:__choice__': 'min/max'})),
 47 |      (0.160000, ParamSklearnClassifier(
 48 |         configuration={
 49 |             'balancing:strategy': 'none',
 50 |             'classifier:__choice__': 'passive_aggressive',
 51 |             'classifier:passive_aggressive:C': 8.011168723835382,
 52 |             'classifier:passive_aggressive:fit_intercept': 'True',
 53 |             'classifier:passive_aggressive:loss': 'hinge',
 54 |             'classifier:passive_aggressive:n_iter': 20,
 55 |             'imputation:strategy': 'median',
 56 |             'one_hot_encoding:minimum_fraction': 0.020771877142610626,
 57 |             'one_hot_encoding:use_minimum_fraction': 'True',
 58 |             'preprocessor:__choice__': 'gem',
 59 |             'preprocessor:gem:N': 16,
 60 |             'preprocessor:gem:precond': 0.035878450355803344,
 61 |             'rescaling:__choice__': 'min/max'})),
 62 |      (0.160000, ParamSklearnClassifier(
 63 |          configuration={
 64 |              'balancing:strategy': 'none',
 65 |              'classifier:__choice__': 'passive_aggressive',
 66 |              'classifier:passive_aggressive:C': 0.00010934133255683256,
 67 |              'classifier:passive_aggressive:fit_intercept': 'True',
 68 |              'classifier:passive_aggressive:loss': 'hinge',
 69 |              'classifier:passive_aggressive:n_iter': 235,
 70 |              'imputation:strategy': 'mean',
 71 |              'one_hot_encoding:minimum_fraction': 0.022038507512545786,
 72 |              'one_hot_encoding:use_minimum_fraction': 'True',
 73 |              'preprocessor:__choice__': 'gem',
 74 |              'preprocessor:gem:N': 17,
 75 |              'preprocessor:gem:precond': 0.02104468261583234,
 76 |              'rescaling:__choice__': 'min/max'})),
 77 |      (0.140000, ParamSklearnClassifier(
 78 |          configuration={
 79 |              'balancing:strategy': 'none',
 80 |              'classifier:__choice__': 'passive_aggressive',
 81 |              'classifier:passive_aggressive:C': 8.011168723835382,
 82 |              'classifier:passive_aggressive:fit_intercept': 'True',
 83 |              'classifier:passive_aggressive:loss': 'hinge',
 84 |              'classifier:passive_aggressive:n_iter': 20,
 85 |              'imputation:strategy': 'mean',
 86 |              'one_hot_encoding:minimum_fraction': 0.020771877142610626,
 87 |              'one_hot_encoding:use_minimum_fraction': 'True',
 88 |              'preprocessor:__choice__': 'gem',
 89 |              'preprocessor:gem:N': 16,
 90 |              'preprocessor:gem:precond': 0.047677121638912856,
 91 |              'rescaling:__choice__': 'min/max'})),
 92 |      (0.140000, ParamSklearnClassifier(
 93 |          configuration={
 94 |              'balancing:strategy': 'none',
 95 |              'classifier:__choice__': 'passive_aggressive',
 96 |              'classifier:passive_aggressive:C': 8.011168723835382,
 97 |              'classifier:passive_aggressive:fit_intercept': 'True',
 98 |              'classifier:passive_aggressive:loss': 'squared_hinge',
 99 |              'classifier:passive_aggressive:n_iter': 301,
100 |              'imputation:strategy': 'median',
101 |              'one_hot_encoding:minimum_fraction': 0.028040769173853935,
102 |              'one_hot_encoding:use_minimum_fraction': 'True',
103 |              'preprocessor:__choice__': 'gem',
104 |              'preprocessor:gem:N': 20,
105 |              'preprocessor:gem:precond': 0.047677121638912856,
106 |              'rescaling:__choice__': 'min/max'})),
107 |      (0.120000, ParamSklearnClassifier(
108 |          configuration={
109 |              'balancing:strategy': 'none',
110 |              'classifier:__choice__': 'passive_aggressive',
111 |              'classifier:passive_aggressive:C': 0.00010934133255683256,
112 |              'classifier:passive_aggressive:fit_intercept': 'True',
113 |              'classifier:passive_aggressive:loss': 'hinge',
114 |              'classifier:passive_aggressive:n_iter': 235,
115 |              'imputation:strategy': 'mean',
116 |              'one_hot_encoding:minimum_fraction': 0.041303833357502165,
117 |              'one_hot_encoding:use_minimum_fraction': 'True',
118 |              'preprocessor:__choice__': 'gem',
119 |              'preprocessor:gem:N': 18,
120 |              'preprocessor:gem:precond': 0.09599232591423834,
121 |              'rescaling:__choice__': 'min/max'})),
122 |      (0.040000, ParamSklearnClassifier(  # the only liblinear_svc member
123 |          configuration={
124 |              'balancing:strategy': 'none',
125 |              'classifier:__choice__': 'liblinear_svc',
126 |              'classifier:liblinear_svc:C': 37.176582995422606,
127 |              'classifier:liblinear_svc:dual': 'False',
128 |              'classifier:liblinear_svc:fit_intercept': 'True',
129 |              'classifier:liblinear_svc:intercept_scaling': 1,
130 |              'classifier:liblinear_svc:loss': 'squared_hinge',
131 |              'classifier:liblinear_svc:multi_class': 'ovr',
132 |              'classifier:liblinear_svc:penalty': 'l2',
133 |              'classifier:liblinear_svc:tol': 0.00016373824508657717,
134 |              'imputation:strategy': 'median',
135 |              'one_hot_encoding:minimum_fraction': 0.0008207509562933506,
136 |              'one_hot_encoding:use_minimum_fraction': 'True',
137 |              'preprocessor:__choice__': 'gem',
138 |              'preprocessor:gem:N': 15,
139 |              'preprocessor:gem:precond': 0.1010713117945701,
140 |              'rescaling:__choice__': 'min/max'})),
141 |      (0.020000, ParamSklearnClassifier(
142 |          configuration={
143 |              'balancing:strategy': 'none',
144 |              'classifier:__choice__': 'passive_aggressive',
145 |              'classifier:passive_aggressive:C': 8.011168723835382,
146 |              'classifier:passive_aggressive:fit_intercept': 'True',
147 |              'classifier:passive_aggressive:loss': 'squared_hinge',
148 |              'classifier:passive_aggressive:n_iter': 20,
149 |              'imputation:strategy': 'median',
150 |              'one_hot_encoding:minimum_fraction': 0.028040769173853935,
151 |              'one_hot_encoding:use_minimum_fraction': 'True',
152 |              'preprocessor:__choice__': 'gem',
153 |              'preprocessor:gem:N': 20,
154 |              'preprocessor:gem:precond': 0.047677121638912856,
155 |              'rescaling:__choice__': 'min/max'}))
156 |     ]
157 | 
158 | classifiers = []
159 | targets = []
160 | predictions = []
161 | predictions_valid = []
162 | predictions_test = []
163 | 
164 | # Make predictions and weight them
165 | iteration = 0
166 | for weight, classifier in choices:
167 |     iteration += 1
168 |     print dataset, "Iteration %d/%d" % (iteration, len(choices))
169 | 
170 |     classifiers.append(classifier)
171 |     try:
172 |         classifier.fit(X.copy(), y.copy())
173 |         predictions_valid.append(
174 |             classifier.predict_proba(X_valid.copy()) * weight)
175 |         predictions_test.append(
176 |             classifier.predict_proba(X_test.copy()) * weight)
177 |     except Exception as e:
178 |         print e
179 |         print classifier
180 | 
181 | # Output the predictions
182 | for name, predictions in [('valid', predictions_valid),
183 |                           ('test', predictions_test)]:
184 |     predictions = np.array(predictions)
185 |     predictions = np.sum(predictions, axis=0)
186 | 
187 |     filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
188 |     np.savetxt(filepath, predictions, delimiter=' ')
189 | 


--------------------------------------------------------------------------------
/002_fabert.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | import numpy as np
  5 | from sklearn.cross_validation import StratifiedKFold
  6 | 
  7 | import autosklearn
  8 | import autosklearn.data
  9 | import autosklearn.data.competition_data_manager
 10 | from autosklearn.evaluation.util import calculate_score
 11 | from ParamSklearn.classification import ParamSklearnClassifier
 12 | 
 13 | 
 14 | parser = argparse.ArgumentParser()
 15 | parser.add_argument('input')
 16 | parser.add_argument('output')
 17 | args = parser.parse_args()
 18 | 
 19 | input = args.input
 20 | dataset = 'fabert'
 21 | output = args.output
 22 | 
 23 | path = os.path.join(input, dataset)
 24 | D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
 25 | X = D.data['X_train']
 26 | y = D.data['Y_train']
 27 | X_valid = D.data['X_valid']
 28 | X_test = D.data['X_test']
 29 | 
 30 | # Ensemble as (weight, classifier) pairs; replace this array by a new ensemble
 31 | choices = \
 32 |     [(0.580000, ParamSklearnClassifier(
 33 |         configuration={
 34 |             'balancing:strategy': 'weighting',
 35 |             'classifier:__choice__': 'extra_trees',
 36 |             'classifier:extra_trees:bootstrap': 'True',
 37 |             'classifier:extra_trees:criterion': 'gini',
 38 |             'classifier:extra_trees:max_depth': 'None',
 39 |             'classifier:extra_trees:max_features': 1.4927328322706173,
 40 |             'classifier:extra_trees:min_samples_leaf': 1,
 41 |             'classifier:extra_trees:min_samples_split': 5,
 42 |             'classifier:extra_trees:min_weight_fraction_leaf': 0.0,
 43 |             'classifier:extra_trees:n_estimators': 100,
 44 |             'imputation:strategy': 'mean',
 45 |             'one_hot_encoding:use_minimum_fraction': 'False',
 46 |             'preprocessor:__choice__': 'select_rates',
 47 |             'preprocessor:select_rates:alpha': 0.4308279694614349,
 48 |             'preprocessor:select_rates:mode': 'fwe',
 49 |             'preprocessor:select_rates:score_func': 'f_classif',
 50 |             'rescaling:__choice__': 'min/max'})),
 51 |      (0.200000, ParamSklearnClassifier(
 52 |         configuration={
 53 |             'balancing:strategy': 'none',
 54 |             'classifier:__choice__': 'sgd',
 55 |             'classifier:sgd:alpha': 5.707045187542232e-06,
 56 |             'classifier:sgd:average': 'True',
 57 |             'classifier:sgd:eta0': 0.059208215107360226,
 58 |             'classifier:sgd:fit_intercept': 'True',
 59 |             'classifier:sgd:l1_ratio': 0.5696965689983325,
 60 |             'classifier:sgd:learning_rate': 'constant',
 61 |             'classifier:sgd:loss': 'log',
 62 |             'classifier:sgd:n_iter': 809,
 63 |             'classifier:sgd:penalty': 'elasticnet',
 64 |             'imputation:strategy': 'median',
 65 |             'one_hot_encoding:minimum_fraction': 0.45801169150718357,
 66 |             'one_hot_encoding:use_minimum_fraction': 'True',
 67 |             'preprocessor:__choice__': 'liblinear_svc_preprocessor',
 68 |             'preprocessor:liblinear_svc_preprocessor:C': 9.102297055334894,
 69 |             'preprocessor:liblinear_svc_preprocessor:dual': 'False',
 70 |             'preprocessor:liblinear_svc_preprocessor:fit_intercept': 'True',
 71 |             'preprocessor:liblinear_svc_preprocessor:intercept_scaling': 1,
 72 |             'preprocessor:liblinear_svc_preprocessor:loss': 'squared_hinge',
 73 |             'preprocessor:liblinear_svc_preprocessor:multi_class': 'ovr',
 74 |             'preprocessor:liblinear_svc_preprocessor:penalty': 'l1',
 75 |             'preprocessor:liblinear_svc_preprocessor:tol': 9.129411357422978e-05,
 76 |             'rescaling:__choice__': 'normalize'})),
 77 |      (0.060000, ParamSklearnClassifier(
 78 |          configuration={
 79 |              'balancing:strategy': 'weighting',
 80 |              'classifier:__choice__': 'sgd',
 81 |              'classifier:sgd:alpha': 3.104241273548187e-05,
 82 |              'classifier:sgd:average': 'False',
 83 |              'classifier:sgd:eta0': 0.050396014246875294,
 84 |              'classifier:sgd:fit_intercept': 'True',
 85 |              'classifier:sgd:l1_ratio': 0.7121576951214108,
 86 |              'classifier:sgd:learning_rate': 'optimal',
 87 |              'classifier:sgd:loss': 'log',
 88 |              'classifier:sgd:n_iter': 649,
 89 |              'classifier:sgd:penalty': 'elasticnet',
 90 |              'imputation:strategy': 'mean',
 91 |              'one_hot_encoding:use_minimum_fraction': 'False',
 92 |              'preprocessor:__choice__': 'no_preprocessing',
 93 |              'rescaling:__choice__': 'min/max'})),
 94 |      (0.060000, ParamSklearnClassifier(
 95 |          configuration={
 96 |              'balancing:strategy': 'none',
 97 |              'classifier:__choice__': 'passive_aggressive',
 98 |              'classifier:passive_aggressive:C': 0.023003251414120036,
 99 |              'classifier:passive_aggressive:fit_intercept': 'True',
100 |              'classifier:passive_aggressive:loss': 'hinge',
101 |              'classifier:passive_aggressive:n_iter': 57,
102 |              'imputation:strategy': 'most_frequent',
103 |              'one_hot_encoding:minimum_fraction': 0.012167961375954476,
104 |              'one_hot_encoding:use_minimum_fraction': 'True',
105 |              'preprocessor:__choice__': 'liblinear_svc_preprocessor',
106 |              'preprocessor:liblinear_svc_preprocessor:C': 0.07417606253933476,
107 |              'preprocessor:liblinear_svc_preprocessor:dual': 'False',
108 |              'preprocessor:liblinear_svc_preprocessor:fit_intercept': 'True',
109 |              'preprocessor:liblinear_svc_preprocessor:intercept_scaling': 1,
110 |              'preprocessor:liblinear_svc_preprocessor:loss': 'squared_hinge',
111 |              'preprocessor:liblinear_svc_preprocessor:multi_class': 'ovr',
112 |              'preprocessor:liblinear_svc_preprocessor:penalty': 'l1',
113 |              'preprocessor:liblinear_svc_preprocessor:tol': 0.0009557179607902859,
114 |              'rescaling:__choice__': 'none'})),
115 |      (0.040000, ParamSklearnClassifier(
116 |          configuration={
117 |              'balancing:strategy': 'none',
118 |              'classifier:__choice__': 'liblinear_svc',
119 |              'classifier:liblinear_svc:C': 491.8319475226706,
120 |              'classifier:liblinear_svc:dual': 'False',
121 |              'classifier:liblinear_svc:fit_intercept': 'True',
122 |              'classifier:liblinear_svc:intercept_scaling': 1,
123 |              'classifier:liblinear_svc:loss': 'squared_hinge',
124 |              'classifier:liblinear_svc:multi_class': 'ovr',
125 |              'classifier:liblinear_svc:penalty': 'l2',
126 |              'classifier:liblinear_svc:tol': 0.0008252238346618138,
127 |              'imputation:strategy': 'most_frequent',
128 |              'one_hot_encoding:minimum_fraction': 0.00028396835704950287,
129 |              'one_hot_encoding:use_minimum_fraction': 'True',
130 |              'preprocessor:__choice__': 'liblinear_svc_preprocessor',
131 |              'preprocessor:liblinear_svc_preprocessor:C': 0.11029125786578071,
132 |              'preprocessor:liblinear_svc_preprocessor:dual': 'False',
133 |              'preprocessor:liblinear_svc_preprocessor:fit_intercept': 'True',
134 |              'preprocessor:liblinear_svc_preprocessor:intercept_scaling': 1,
135 |              'preprocessor:liblinear_svc_preprocessor:loss': 'squared_hinge',
136 |              'preprocessor:liblinear_svc_preprocessor:multi_class': 'ovr',
137 |              'preprocessor:liblinear_svc_preprocessor:penalty': 'l1',
138 |              'preprocessor:liblinear_svc_preprocessor:tol': 0.0003417183512181233,
139 |              'rescaling:__choice__': 'min/max'})),
140 |      (0.040000, ParamSklearnClassifier(
141 |          configuration={
142 |              'balancing:strategy': 'weighting',
143 |              'classifier:__choice__': 'sgd',
144 |              'classifier:sgd:alpha': 2.618489922233997e-06,
145 |              'classifier:sgd:average': 'False',
146 |              'classifier:sgd:eta0': 0.0785971926323006,
147 |              'classifier:sgd:fit_intercept': 'True',
148 |              'classifier:sgd:l1_ratio': 0.1596938886542899,
149 |              'classifier:sgd:learning_rate': 'constant',
150 |              'classifier:sgd:loss': 'hinge',
151 |              'classifier:sgd:n_iter': 509,
152 |              'classifier:sgd:penalty': 'elasticnet',
153 |              'imputation:strategy': 'mean',
154 |              'one_hot_encoding:use_minimum_fraction': 'False',
155 |              'preprocessor:__choice__': 'select_rates',
156 |              'preprocessor:select_rates:alpha': 0.25578392394574817,
157 |              'preprocessor:select_rates:mode': 'fpr',
158 |              'preprocessor:select_rates:score_func': 'chi2',
159 |              'rescaling:__choice__': 'min/max'})),
160 |      (0.020000, ParamSklearnClassifier(
161 |          configuration={
162 |              'balancing:strategy': 'weighting',
163 |              'classifier:__choice__': 'extra_trees',
164 |              'classifier:extra_trees:bootstrap': 'False',
165 |              'classifier:extra_trees:criterion': 'gini',
166 |              'classifier:extra_trees:max_depth': 'None',
167 |              'classifier:extra_trees:max_features': 2.1694048668692454,
168 |              'classifier:extra_trees:min_samples_leaf': 1,
169 |              'classifier:extra_trees:min_samples_split': 8,
170 |              'classifier:extra_trees:min_weight_fraction_leaf': 0.0,
171 |              'classifier:extra_trees:n_estimators': 100,
172 |              'imputation:strategy': 'median',
173 |              'one_hot_encoding:minimum_fraction': 0.23760831456778012,
174 |              'one_hot_encoding:use_minimum_fraction': 'True',
175 |              'preprocessor:__choice__': 'no_preprocessing',
176 |              'rescaling:__choice__': 'standardize'})), ]
177 | 
178 | classifiers = []
179 | targets = []
180 | predictions = []
181 | predictions_valid = []
182 | predictions_test = []
183 | 
184 | # Make predictions and weight them
185 | iteration = 0
186 | for weight, classifier in choices:
187 |     iteration += 1
188 |     print dataset, "Iteration %d/%d" % (iteration, len(choices))
189 | 
190 |     classifiers.append(classifier)
191 |     try:
192 |         classifier.fit(X.copy(), y.copy())
193 |         predictions_valid.append(
194 |             classifier.predict_proba(X_valid.copy()) * weight)
195 |         predictions_test.append(
196 |             classifier.predict_proba(X_test.copy()) * weight)
197 |     except Exception as e:
198 |         print e
199 |         print classifier
200 | 
201 | # Output the predictions
202 | for name, predictions in [('valid', predictions_valid),
203 |                           ('test', predictions_test)]:
204 |     predictions = np.array(predictions)
205 |     predictions = np.sum(predictions, axis=0)
206 | 
207 |     filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
208 |     np.savetxt(filepath, predictions, delimiter=' ')
209 | 


--------------------------------------------------------------------------------
/002_volkert.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | import numpy as np
  5 | from sklearn.cross_validation import StratifiedKFold
  6 | 
  7 | import autosklearn
  8 | import autosklearn.data
  9 | import autosklearn.data.competition_data_manager
 10 | from autosklearn.evaluation.util import calculate_score
 11 | from ParamSklearn.classification import ParamSklearnClassifier
 12 | 
 13 | 
 14 | parser = argparse.ArgumentParser()
 15 | parser.add_argument('input')
 16 | parser.add_argument('output')
 17 | args = parser.parse_args()
 18 | 
 19 | input = args.input
 20 | dataset = 'volkert'
 21 | output = args.output
 22 | 
 23 | path = os.path.join(input, dataset)
 24 | D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
 25 | X = D.data['X_train']
 26 | y = D.data['Y_train']
 27 | X_valid = D.data['X_valid']
 28 | X_test = D.data['X_test']
 29 | 
 30 | # Ensemble as (weight, classifier) pairs; replace this list by a new ensemble
 31 | choices = \
 32 |     [(0.480000, ParamSklearnClassifier(configuration={
 33 |         'balancing:strategy': 'none',
 34 |         'classifier:__choice__': 'random_forest',
 35 |         'classifier:random_forest:bootstrap': 'True',
 36 |         'classifier:random_forest:criterion': 'entropy',
 37 |         'classifier:random_forest:max_depth': 'None',
 38 |         'classifier:random_forest:max_features': 4.885151102990943,
 39 |         'classifier:random_forest:max_leaf_nodes': 'None',
 40 |         'classifier:random_forest:min_samples_leaf': 2,
 41 |         'classifier:random_forest:min_samples_split': 2,
 42 |         'classifier:random_forest:min_weight_fraction_leaf': 0.0,
 43 |         'classifier:random_forest:n_estimators': 100,
 44 |         'imputation:strategy': 'median',
 45 |         'one_hot_encoding:minimum_fraction': 0.059297498551361,
 46 |         'one_hot_encoding:use_minimum_fraction': 'True',
 47 |         'preprocessor:__choice__': 'gem',
 48 |         'preprocessor:gem:N': 13,
 49 |         'preprocessor:gem:precond': 0.31299029323203487,
 50 |         'rescaling:__choice__': 'min/max'})),
 51 |      (0.300000, ParamSklearnClassifier(
 52 |         configuration={
 53 |             'balancing:strategy': 'none',
 54 |             'classifier:__choice__': 'random_forest',
 55 |             'classifier:random_forest:bootstrap': 'False',
 56 |             'classifier:random_forest:criterion': 'entropy',
 57 |             'classifier:random_forest:max_depth': 'None',
 58 |             'classifier:random_forest:max_features': 4.908992016092793,
 59 |             'classifier:random_forest:max_leaf_nodes': 'None',
 60 |             'classifier:random_forest:min_samples_leaf': 2,
 61 |             'classifier:random_forest:min_samples_split': 6,
 62 |             'classifier:random_forest:min_weight_fraction_leaf': 0.0,
 63 |             'classifier:random_forest:n_estimators': 100,
 64 |             'imputation:strategy': 'mean',
 65 |             'one_hot_encoding:minimum_fraction': 0.009349768412523697,
 66 |             'one_hot_encoding:use_minimum_fraction': 'True',
 67 |             'preprocessor:__choice__': 'fast_ica',
 68 |             'preprocessor:fast_ica:algorithm': 'deflation',
 69 |             'preprocessor:fast_ica:fun': 'exp',
 70 |             'preprocessor:fast_ica:whiten': 'False',
 71 |             'rescaling:__choice__': 'none'})),
 72 |      (0.180000,
 73 |         ParamSklearnClassifier(
 74 |             configuration={
 75 |                 'balancing:strategy': 'weighting',
 76 |                 'classifier:__choice__': 'libsvm_svc',
 77 |                 'classifier:libsvm_svc:C': 445.91825904609124,
 78 |                 'classifier:libsvm_svc:gamma': 0.03873498413280048,
 79 |                 'classifier:libsvm_svc:kernel': 'rbf',
 80 |                 'classifier:libsvm_svc:max_iter': -1,
 81 |                 'classifier:libsvm_svc:shrinking': 'True',
 82 |                 'classifier:libsvm_svc:tol': 0.0008078719040695308,
 83 |                 'imputation:strategy': 'median',
 84 |                 'one_hot_encoding:use_minimum_fraction': 'False',
 85 |                 'preprocessor:__choice__': 'pca',
 86 |                 'preprocessor:pca:keep_variance': 0.7596970304901425,
 87 |                 'preprocessor:pca:whiten': 'True',
 88 |                 'rescaling:__choice__': 'standardize'})),
 89 |      (0.040000, ParamSklearnClassifier(
 90 |          configuration={
 91 |              'balancing:strategy': 'none',
 92 |              'classifier:__choice__': 'random_forest',
 93 |              'classifier:random_forest:bootstrap': 'False',
 94 |              'classifier:random_forest:criterion': 'entropy',
 95 |              'classifier:random_forest:max_depth': 'None',
 96 |              'classifier:random_forest:max_features': 3.5340547102377364,
 97 |              'classifier:random_forest:max_leaf_nodes': 'None',
 98 |              'classifier:random_forest:min_samples_leaf': 2,
 99 |              'classifier:random_forest:min_samples_split': 6,
100 |              'classifier:random_forest:min_weight_fraction_leaf': 0.0,
101 |              'classifier:random_forest:n_estimators': 100,
102 |              'imputation:strategy': 'mean',
103 |              'one_hot_encoding:minimum_fraction': 0.008518947433195237,
104 |              'one_hot_encoding:use_minimum_fraction': 'True',
105 |              'preprocessor:__choice__': 'fast_ica',
106 |              'preprocessor:fast_ica:algorithm': 'deflation',
107 |              'preprocessor:fast_ica:fun': 'cube',
108 |              'preprocessor:fast_ica:whiten': 'False',
109 |              'rescaling:__choice__': 'none'})), ]
110 | 
111 | classifiers = []
112 | targets = []
113 | predictions = []
114 | predictions_valid = []
115 | predictions_test = []
116 | 
117 | # Make predictions and weight them
118 | iteration = 0
119 | for weight, classifier in choices:
120 |     iteration += 1
121 |     print dataset, "Iteration %d/%d" % (iteration, len(choices))
122 | 
123 |     classifiers.append(classifier)
124 |     try:
125 |         classifier.fit(X.copy(), y.copy())
126 |         predictions_valid.append(
127 |             classifier.predict_proba(X_valid.copy()) * weight)
128 |         predictions_test.append(
129 |             classifier.predict_proba(X_test.copy()) * weight)
130 |     except Exception as e:
131 |         print e
132 |         print classifier
133 | 
134 | # Output the predictions
135 | for name, predictions in [('valid', predictions_valid),
136 |                           ('test', predictions_test)]:
137 |     predictions = np.array(predictions)
138 |     predictions = np.sum(predictions, axis=0)
139 | 
140 |     filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
141 |     np.savetxt(filepath, predictions, delimiter=' ')
142 | 


--------------------------------------------------------------------------------
/003_dionis.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | from joblib import Parallel, delayed
  5 | import numpy as np
  6 | 
  7 | import autosklearn
  8 | import autosklearn.data
  9 | import autosklearn.data.competition_data_manager
 10 | from autosklearn.pipeline.classification import SimpleClassificationPipeline
 11 | 
 12 | parser = argparse.ArgumentParser()
 13 | parser.add_argument('input')
 14 | parser.add_argument('output')
 15 | args = parser.parse_args()
 16 | 
 17 | input = args.input
 18 | dataset = 'dionis'
 19 | output = args.output
 20 | 
 21 | path = os.path.join(input, dataset)
 22 | D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
 23 | X = D.data['X_train']
 24 | y = D.data['Y_train']
 25 | X_valid = D.data['X_valid']
 26 | X_test = D.data['X_test']
 27 | 
 28 | # Ensemble as (weight, estimator) pairs; replace this list by a new ensemble
 29 | choices = \
 30 |     [(0.520000, SimpleClassificationPipeline(configuration={
 31 |         'balancing:strategy': 'none',
 32 |         'classifier:__choice__': 'qda',
 33 |         'classifier:qda:reg_param': 7.017044041208607,
 34 |         'imputation:strategy': 'most_frequent',
 35 |         'one_hot_encoding:use_minimum_fraction': 'False',
 36 |         'preprocessor:__choice__': 'no_preprocessing',
 37 |         'rescaling:__choice__': 'normalize'})),
 38 |      (0.360000, SimpleClassificationPipeline(configuration={
 39 |          'balancing:strategy': 'none',
 40 |          'classifier:__choice__': 'qda',
 41 |          'classifier:qda:reg_param': 0.5,
 42 |          'imputation:strategy': 'most_frequent',
 43 |          'one_hot_encoding:use_minimum_fraction': 'False',
 44 |          'preprocessor:__choice__': 'select_rates',
 45 |          'preprocessor:select_rates:alpha': 0.1,
 46 |          'preprocessor:select_rates:mode': 'fpr',
 47 |          'preprocessor:select_rates:score_func': 'chi2',
 48 |          'rescaling:__choice__': 'min/max'})),
 49 |      (0.020000, SimpleClassificationPipeline(configuration={
 50 |          'balancing:strategy': 'none',
 51 |          'classifier:__choice__': 'k_nearest_neighbors',
 52 |          'classifier:k_nearest_neighbors:n_neighbors': 53,
 53 |          'classifier:k_nearest_neighbors:p': 2,
 54 |          'classifier:k_nearest_neighbors:weights': 'uniform',
 55 |          'imputation:strategy': 'most_frequent',
 56 |          'one_hot_encoding:minimum_fraction': 0.004107223932117523,
 57 |          'one_hot_encoding:use_minimum_fraction': 'True',
 58 |          'preprocessor:__choice__': 'select_rates',
 59 |          'preprocessor:select_rates:alpha': 0.06365705922416094,
 60 |          'preprocessor:select_rates:mode': 'fpr',
 61 |          'preprocessor:select_rates:score_func': 'f_classif',
 62 |          'rescaling:__choice__': 'min/max'})),
 63 |      (0.020000, SimpleClassificationPipeline(configuration={
 64 |          'balancing:strategy': 'weighting',
 65 |          'classifier:__choice__': 'liblinear_svc',
 66 |          'classifier:liblinear_svc:C': 1288.9425457179896,
 67 |          'classifier:liblinear_svc:dual': 'False',
 68 |          'classifier:liblinear_svc:fit_intercept': 'True',
 69 |          'classifier:liblinear_svc:intercept_scaling': 1,
 70 |          'classifier:liblinear_svc:loss': 'squared_hinge',
 71 |          'classifier:liblinear_svc:multi_class': 'ovr',
 72 |          'classifier:liblinear_svc:penalty': 'l2',
 73 |          'classifier:liblinear_svc:tol': 6.852190351970404e-05,
 74 |          'imputation:strategy': 'most_frequent',
 75 |          'one_hot_encoding:minimum_fraction': 0.016322736180045382,
 76 |          'one_hot_encoding:use_minimum_fraction': 'True',
 77 |          'preprocessor:__choice__': 'select_rates',
 78 |          'preprocessor:select_rates:alpha': 0.48582026589548283,
 79 |          'preprocessor:select_rates:mode': 'fpr',
 80 |          'preprocessor:select_rates:score_func': 'chi2',
 81 |          'rescaling:__choice__': 'min/max'})),
 82 |      (0.020000, SimpleClassificationPipeline(configuration={
 83 |          'balancing:strategy': 'weighting',
 84 |          'classifier:__choice__': 'extra_trees',
 85 |          'classifier:extra_trees:bootstrap': 'False',
 86 |          'classifier:extra_trees:criterion': 'gini',
 87 |          'classifier:extra_trees:max_depth': 'None',
 88 |          'classifier:extra_trees:max_features': 0.6872563090086077,
 89 |          'classifier:extra_trees:min_samples_leaf': 9,
 90 |          'classifier:extra_trees:min_samples_split': 8,
 91 |          'classifier:extra_trees:min_weight_fraction_leaf': 0.0,
 92 |          'classifier:extra_trees:n_estimators': 100,
 93 |          'imputation:strategy': 'median',
 94 |          'one_hot_encoding:minimum_fraction': 0.00048281479349728755,
 95 |          'one_hot_encoding:use_minimum_fraction': 'True',
 96 |          'preprocessor:__choice__': 'feature_agglomeration',
 97 |          'preprocessor:feature_agglomeration:affinity': 'manhattan',
 98 |          'preprocessor:feature_agglomeration:linkage': 'average',
 99 |          'preprocessor:feature_agglomeration:n_clusters': 170,
100 |          'preprocessor:feature_agglomeration:pooling_func': 'mean',
101 |          'rescaling:__choice__': 'normalize'})),
102 |      (0.020000, SimpleClassificationPipeline(configuration={
103 |          'balancing:strategy': 'weighting',
104 |          'classifier:__choice__': 'liblinear_svc',
105 |          'classifier:liblinear_svc:C': 737.3354222113379,
106 |          'classifier:liblinear_svc:dual': 'False',
107 |          'classifier:liblinear_svc:fit_intercept': 'True',
108 |          'classifier:liblinear_svc:intercept_scaling': 1,
109 |          'classifier:liblinear_svc:loss': 'squared_hinge',
110 |          'classifier:liblinear_svc:multi_class': 'ovr',
111 |          'classifier:liblinear_svc:penalty': 'l2',
112 |          'classifier:liblinear_svc:tol': 0.029993063054990464,
113 |          'imputation:strategy': 'median',
114 |          'one_hot_encoding:minimum_fraction': 0.0007084092083452885,
115 |          'one_hot_encoding:use_minimum_fraction': 'True',
116 |          'preprocessor:__choice__': 'select_rates',
117 |          'preprocessor:select_rates:alpha': 0.28020088992913833,
118 |          'preprocessor:select_rates:mode': 'fdr',
119 |          'preprocessor:select_rates:score_func': 'f_classif',
120 |          'rescaling:__choice__': 'standardize'})),
121 |      (0.020000, SimpleClassificationPipeline(configuration={
122 |          'balancing:strategy': 'none',
123 |          'classifier:__choice__': 'k_nearest_neighbors',
124 |          'classifier:k_nearest_neighbors:n_neighbors': 1,
125 |          'classifier:k_nearest_neighbors:p': 2,
126 |          'classifier:k_nearest_neighbors:weights': 'uniform',
127 |          'imputation:strategy': 'median',
128 |          'one_hot_encoding:minimum_fraction': 0.015690633649222446,
129 |          'one_hot_encoding:use_minimum_fraction': 'True',
130 |          'preprocessor:__choice__': 'no_preprocessing',
131 |          'rescaling:__choice__': 'min/max'})),
132 |      (0.020000, SimpleClassificationPipeline(configuration={
133 |          'balancing:strategy': 'weighting',
134 |          'classifier:__choice__': 'extra_trees',
135 |          'classifier:extra_trees:bootstrap': 'False',
136 |          'classifier:extra_trees:criterion': 'gini',
137 |          'classifier:extra_trees:max_depth': 'None',
138 |          'classifier:extra_trees:max_features': 1.0,
139 |          'classifier:extra_trees:min_samples_leaf': 10,
140 |          'classifier:extra_trees:min_samples_split': 2,
141 |          'classifier:extra_trees:min_weight_fraction_leaf': 0.0,
142 |          'classifier:extra_trees:n_estimators': 100,
143 |          'imputation:strategy': 'median',
144 |          'one_hot_encoding:minimum_fraction': 0.01,
145 |          'one_hot_encoding:use_minimum_fraction': 'True',
146 |          'preprocessor:__choice__': 'select_rates',
147 |          'preprocessor:select_rates:alpha': 0.1,
148 |          'preprocessor:select_rates:mode': 'fpr',
149 |          'preprocessor:select_rates:score_func': 'chi2',
150 |          'rescaling:__choice__': 'none'})),
151 |      ]
152 | 
153 | targets = []
154 | predictions = []
155 | predictions_valid = []
156 | predictions_test = []
157 | 
158 | 
159 | def fit_and_predict(estimator, weight, X, y):
160 |     try:
161 |         estimator.fit(X.copy(), y.copy())
162 |         pv = estimator.predict_proba(X_valid.copy()) * weight
163 |         pt = estimator.predict_proba(X_test.copy()) * weight
164 |     except Exception as e:
165 |         print(e)
166 |         print(estimator.configuration)
167 |         pv = None
168 |         pt = None
169 |     return pv, pt
170 | 
171 | 
172 | # Make predictions and weight them
173 | all_predictions = Parallel(n_jobs=-1)(delayed(fit_and_predict) \
174 |                                           (estimator, weight, X, y) for
175 |                                       weight, estimator in choices)
176 | for pv, pt in all_predictions:
177 |     predictions_valid.append(pv)
178 |     predictions_test.append(pt)
179 | 
180 | # Output the predictions
181 | for name, predictions in [('valid', predictions_valid),
182 |                           ('test', predictions_test)]:
183 |     predictions = np.array(predictions)
184 |     predictions = np.sum(predictions, axis=0).astype(np.float32)
185 | 
186 |     filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
187 |     np.savetxt(filepath, predictions, delimiter=' ', fmt='%.4e')
188 | 


--------------------------------------------------------------------------------
/003_grigoris.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | from joblib import Parallel, delayed
  5 | import numpy as np
  6 | 
  7 | import autosklearn
  8 | import autosklearn.data
  9 | import autosklearn.data.competition_data_manager
 10 | from autosklearn.pipeline.classification import SimpleClassificationPipeline
 11 | 
 12 | parser = argparse.ArgumentParser()
 13 | parser.add_argument('input')
 14 | parser.add_argument('output')
 15 | args = parser.parse_args()
 16 | 
 17 | input = args.input
 18 | dataset = 'grigoris'
 19 | output = args.output
 20 | 
 21 | path = os.path.join(input, dataset)
 22 | D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
 23 | X = D.data['X_train']
 24 | y = D.data['Y_train']
 25 | X_valid = D.data['X_valid']
 26 | X_test = D.data['X_test']
 27 | 
 28 | # Ensemble as (weight, estimator) pairs; replace this list by a new ensemble
 29 | choices = \
 30 |     [(0.720000, SimpleClassificationPipeline(configuration={
 31 |         'balancing:strategy': 'none',
 32 |         'classifier:__choice__': 'liblinear_svc',
 33 |         'classifier:liblinear_svc:C': 0.0665747065156058,
 34 |         'classifier:liblinear_svc:dual': 'False',
 35 |         'classifier:liblinear_svc:fit_intercept': 'True',
 36 |         'classifier:liblinear_svc:intercept_scaling': 1,
 37 |         'classifier:liblinear_svc:loss': 'squared_hinge',
 38 |         'classifier:liblinear_svc:multi_class': 'ovr',
 39 |         'classifier:liblinear_svc:penalty': 'l2',
 40 |         'classifier:liblinear_svc:tol': 0.002362381246384099,
 41 |         'imputation:strategy': 'mean',
 42 |         'one_hot_encoding:minimum_fraction': 0.0972585384393519,
 43 |         'one_hot_encoding:use_minimum_fraction': 'True',
 44 |         'preprocessor:__choice__': 'no_preprocessing',
 45 |         'rescaling:__choice__': 'normalize'})),
 46 |      (0.100000, SimpleClassificationPipeline(configuration={
 47 |          'balancing:strategy': 'weighting',
 48 |          'classifier:__choice__': 'liblinear_svc',
 49 |          'classifier:liblinear_svc:C': 7.705276414124367,
 50 |          'classifier:liblinear_svc:dual': 'False',
 51 |          'classifier:liblinear_svc:fit_intercept': 'True',
 52 |          'classifier:liblinear_svc:intercept_scaling': 1,
 53 |          'classifier:liblinear_svc:loss': 'squared_hinge',
 54 |          'classifier:liblinear_svc:multi_class': 'ovr',
 55 |          'classifier:liblinear_svc:penalty': 'l2',
 56 |          'classifier:liblinear_svc:tol': 0.028951969755081776,
 57 |          'imputation:strategy': 'most_frequent',
 58 |          'one_hot_encoding:use_minimum_fraction': 'False',
 59 |          'preprocessor:__choice__': 'no_preprocessing',
 60 |          'rescaling:__choice__': 'normalize'})),
 61 |      (0.080000, SimpleClassificationPipeline(configuration={
 62 |          'balancing:strategy': 'weighting',
 63 |          'classifier:__choice__': 'liblinear_svc',
 64 |          'classifier:liblinear_svc:C': 1.0,
 65 |          'classifier:liblinear_svc:dual': 'False',
 66 |          'classifier:liblinear_svc:fit_intercept': 'True',
 67 |          'classifier:liblinear_svc:intercept_scaling': 1,
 68 |          'classifier:liblinear_svc:loss': 'squared_hinge',
 69 |          'classifier:liblinear_svc:multi_class': 'ovr',
 70 |          'classifier:liblinear_svc:penalty': 'l2',
 71 |          'classifier:liblinear_svc:tol': 0.0001,
 72 |          'imputation:strategy': 'median',
 73 |          'one_hot_encoding:minimum_fraction': 0.0033856971814438443,
 74 |          'one_hot_encoding:use_minimum_fraction': 'True',
 75 |          'preprocessor:__choice__': 'no_preprocessing',
 76 |          'rescaling:__choice__': 'normalize'})),
 77 |      (0.080000, SimpleClassificationPipeline(configuration={
 78 |          'balancing:strategy': 'weighting',
 79 |          'classifier:__choice__': 'liblinear_svc',
 80 |          'classifier:liblinear_svc:C': 0.2598769185905466,
 81 |          'classifier:liblinear_svc:dual': 'False',
 82 |          'classifier:liblinear_svc:fit_intercept': 'True',
 83 |          'classifier:liblinear_svc:intercept_scaling': 1,
 84 |          'classifier:liblinear_svc:loss': 'squared_hinge',
 85 |          'classifier:liblinear_svc:multi_class': 'ovr',
 86 |          'classifier:liblinear_svc:penalty': 'l2',
 87 |          'classifier:liblinear_svc:tol': 0.001007160236770467,
 88 |          'imputation:strategy': 'median',
 89 |          'one_hot_encoding:minimum_fraction': 0.019059927375795167,
 90 |          'one_hot_encoding:use_minimum_fraction': 'True',
 91 |          'preprocessor:__choice__': 'no_preprocessing',
 92 |          'rescaling:__choice__': 'normalize'})),
 93 |      (0.020000, SimpleClassificationPipeline(configuration={
 94 |          'balancing:strategy': 'weighting',
 95 |          'classifier:__choice__': 'liblinear_svc',
 96 |          'classifier:liblinear_svc:C': 0.6849477125990308,
 97 |          'classifier:liblinear_svc:dual': 'False',
 98 |          'classifier:liblinear_svc:fit_intercept': 'True',
 99 |          'classifier:liblinear_svc:intercept_scaling': 1,
100 |          'classifier:liblinear_svc:loss': 'squared_hinge',
101 |          'classifier:liblinear_svc:multi_class': 'ovr',
102 |          'classifier:liblinear_svc:penalty': 'l2',
103 |          'classifier:liblinear_svc:tol': 1.2676147487949745e-05,
104 |          'imputation:strategy': 'mean',
105 |          'one_hot_encoding:minimum_fraction': 0.003803817610653382,
106 |          'one_hot_encoding:use_minimum_fraction': 'True',
107 |          'preprocessor:__choice__': 'no_preprocessing',
108 |          'rescaling:__choice__': 'normalize'})),
109 |      ]
110 | 
111 | targets = []
112 | predictions = []
113 | predictions_valid = []
114 | predictions_test = []
115 | 
116 | 
117 | def fit_and_predict(estimator, weight, X, y):
118 |     try:
119 |         estimator.fit(X.copy(), y.copy())
120 |         pv = estimator.predict_proba(X_valid.copy()) * weight
121 |         pt = estimator.predict_proba(X_test.copy()) * weight
122 |     except Exception as e:
123 |         print(e)
124 |         print(estimator.configuration)
125 |         pv = None
126 |         pt = None
127 |     return pv, pt
128 | 
129 | 
130 | # Make predictions and weight them
131 | all_predictions = Parallel(n_jobs=-1)(delayed(fit_and_predict) \
132 |                                           (estimator, weight, X, y) for
133 |                                       weight, estimator in choices)
134 | for pv, pt in all_predictions:
135 |     predictions_valid.append(pv)
136 |     predictions_test.append(pt)
137 | 
138 | # Output the predictions
139 | for name, predictions in [('valid', predictions_valid),
140 |                           ('test', predictions_test)]:
141 |     predictions = np.array(predictions)
142 |     predictions = np.sum(predictions, axis=0).astype(np.float32)
143 | 
144 |     filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
145 |     np.savetxt(filepath, predictions, delimiter=' ', fmt='%.4e')
146 | 


--------------------------------------------------------------------------------
/003_wallis.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | import numpy as np
  5 | 
  6 | import autosklearn
  7 | import autosklearn.data
  8 | import autosklearn.data.competition_data_manager
  9 | from autosklearn.pipeline.classification import SimpleClassificationPipeline
 10 | 
 11 | 
 12 | parser = argparse.ArgumentParser()
 13 | parser.add_argument('input')
 14 | parser.add_argument('output')
 15 | args = parser.parse_args()
 16 | 
 17 | input = args.input
 18 | dataset = 'wallis'
 19 | output = args.output
 20 | 
 21 | path = os.path.join(input, dataset)
 22 | D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
 23 | X = D.data['X_train']
 24 | y = D.data['Y_train']
 25 | X_valid = D.data['X_valid']
 26 | X_test = D.data['X_test']
 27 | 
 28 | # Ensemble as (weight, classifier) pairs; replace this list by a new ensemble
 29 | choices = \
 30 |     [(0.580000, SimpleClassificationPipeline(configuration={
 31 |         'balancing:strategy': 'weighting',
 32 |         'classifier:__choice__': 'passive_aggressive',
 33 |         'classifier:passive_aggressive:C': 0.0006373873391108438,
 34 |         'classifier:passive_aggressive:fit_intercept': 'True',
 35 |         'classifier:passive_aggressive:loss': 'squared_hinge',
 36 |         'classifier:passive_aggressive:n_iter': 18,
 37 |         'imputation:strategy': 'median',
 38 |         'one_hot_encoding:use_minimum_fraction': 'False',
 39 |         'preprocessor:__choice__': 'no_preprocessing',
 40 |         'rescaling:__choice__': 'normalize'})),
 41 |      (0.200000, SimpleClassificationPipeline(configuration={
 42 |          'balancing:strategy': 'weighting',
 43 |          'classifier:__choice__': 'passive_aggressive',
 44 |          'classifier:passive_aggressive:C': 0.000465329983806252,
 45 |          'classifier:passive_aggressive:fit_intercept': 'True',
 46 |          'classifier:passive_aggressive:loss': 'squared_hinge',
 47 |          'classifier:passive_aggressive:n_iter': 34,
 48 |          'imputation:strategy': 'median',
 49 |          'one_hot_encoding:use_minimum_fraction': 'False',
 50 |          'preprocessor:__choice__': 'kernel_pca',
 51 |          'preprocessor:kernel_pca:kernel': 'cosine',
 52 |          'preprocessor:kernel_pca:n_components': 1351,
 53 |          'rescaling:__choice__': 'normalize'})),
 54 |      (0.180000, SimpleClassificationPipeline(configuration={
 55 |          'balancing:strategy': 'none',
 56 |          'classifier:__choice__': 'liblinear_svc',
 57 |          'classifier:liblinear_svc:C': 0.7416809477859192,
 58 |          'classifier:liblinear_svc:dual': 'False',
 59 |          'classifier:liblinear_svc:fit_intercept': 'True',
 60 |          'classifier:liblinear_svc:intercept_scaling': 1,
 61 |          'classifier:liblinear_svc:loss': 'squared_hinge',
 62 |          'classifier:liblinear_svc:multi_class': 'ovr',
 63 |          'classifier:liblinear_svc:penalty': 'l2',
 64 |          'classifier:liblinear_svc:tol': 0.0048882934000166346,
 65 |          'imputation:strategy': 'most_frequent',
 66 |          'one_hot_encoding:use_minimum_fraction': 'False',
 67 |          'preprocessor:__choice__': 'select_percentile_classification',
 68 |          'preprocessor:select_percentile_classification:percentile': 19.775149789978155,
 69 |          'preprocessor:select_percentile_classification:score_func': 'chi2',
 70 |          'rescaling:__choice__': 'normalize'})),
 71 |      (0.020000, SimpleClassificationPipeline(configuration={
 72 |          'balancing:strategy': 'none',
 73 |          'classifier:__choice__': 'liblinear_svc',
 74 |          'classifier:liblinear_svc:C': 0.4010081266689033,
 75 |          'classifier:liblinear_svc:dual': 'False',
 76 |          'classifier:liblinear_svc:fit_intercept': 'True',
 77 |          'classifier:liblinear_svc:intercept_scaling': 1,
 78 |          'classifier:liblinear_svc:loss': 'squared_hinge',
 79 |          'classifier:liblinear_svc:multi_class': 'ovr',
 80 |          'classifier:liblinear_svc:penalty': 'l2',
 81 |          'classifier:liblinear_svc:tol': 0.003197120920655818,
 82 |          'imputation:strategy': 'mean',
 83 |          'one_hot_encoding:minimum_fraction': 0.0002497904559463802,
 84 |          'one_hot_encoding:use_minimum_fraction': 'True',
 85 |          'preprocessor:__choice__': 'no_preprocessing',
 86 |          'rescaling:__choice__': 'normalize'})),
 87 |      (0.020000, SimpleClassificationPipeline(configuration={
 88 |          'balancing:strategy': 'none',
 89 |          'classifier:__choice__': 'liblinear_svc',
 90 |          'classifier:liblinear_svc:C': 0.7444178979935873,
 91 |          'classifier:liblinear_svc:dual': 'False',
 92 |          'classifier:liblinear_svc:fit_intercept': 'True',
 93 |          'classifier:liblinear_svc:intercept_scaling': 1,
 94 |          'classifier:liblinear_svc:loss': 'squared_hinge',
 95 |          'classifier:liblinear_svc:multi_class': 'ovr',
 96 |          'classifier:liblinear_svc:penalty': 'l2',
 97 |          'classifier:liblinear_svc:tol': 0.00359411438055,
 98 |          'imputation:strategy': 'mean',
 99 |          'one_hot_encoding:minimum_fraction': 0.0018636449908690695,
100 |          'one_hot_encoding:use_minimum_fraction': 'True',
101 |          'preprocessor:__choice__': 'nystroem_sampler',
102 |          'preprocessor:nystroem_sampler:kernel': 'cosine',
103 |          'preprocessor:nystroem_sampler:n_components': 5183,
104 |          'rescaling:__choice__': 'normalize'})),
105 |      ]
106 | 
# Accumulators; the weighted predictions of every ensemble member end up in
# predictions_valid / predictions_test.
targets, predictions = [], []
predictions_valid, predictions_test = [], []

# Fit each ensemble member in turn and collect its weighted class
# probabilities for the validation and the test data.
iteration = 0
for iteration, (weight, classifier) in enumerate(choices, 1):
    print(dataset, "Iteration %d/%d" % (iteration, len(choices)))
    try:
        classifier.fit(X.copy(), y.copy())
        weighted_valid = classifier.predict_proba(X_valid.copy()) * weight
        predictions_valid.append(weighted_valid)
        weighted_test = classifier.predict_proba(X_test.copy()) * weight
        predictions_test.append(weighted_test)
    except Exception as e:
        # Best effort: report the error together with the configuration that
        # caused it and continue with the remaining members.
        print(e)
        print(classifier.configuration)
126 | 
# Sum the weighted member predictions of each data split and write the
# result to '<dataset>_<split>_000.predict' inside the output directory.
for name, predictions in (('valid', predictions_valid),
                          ('test', predictions_test)):
    ensemble = np.array(predictions).sum(axis=0)
    predictions = ensemble.astype(np.float32)

    filename = '%s_%s_000.predict' % (dataset, name)
    filepath = os.path.join(output, filename)
    np.savetxt(filepath, predictions, fmt='%.4e', delimiter=' ')
135 | 


--------------------------------------------------------------------------------
/004_evita.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | from joblib import Parallel, delayed
  5 | import numpy as np
  6 | 
  7 | import autosklearn
  8 | import autosklearn.data
  9 | import autosklearn.data.competition_data_manager
 10 | from autosklearn.pipeline.classification import SimpleClassificationPipeline
 11 | 
 12 | parser = argparse.ArgumentParser()
 13 | parser.add_argument('input')
 14 | parser.add_argument('output')
 15 | args = parser.parse_args()
 16 | 
 17 | input = args.input
 18 | dataset = 'evita'
 19 | output = args.output
 20 | 
 21 | path = os.path.join(input, dataset)
 22 | D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
 23 | X = D.data['X_train']
 24 | y = D.data['Y_train']
 25 | X_valid = D.data['X_valid']
 26 | X_test = D.data['X_test']
 27 | 
# Replace the following array by a new ensemble.
# Each entry is a (weight, pipeline) tuple: the pipeline is a
# SimpleClassificationPipeline built from a fixed configuration dict, and the
# weight is the factor its predicted probabilities are multiplied with before
# the members are summed up. The 14 weights below add up to 1.0.
choices = \
    [(0.320000, SimpleClassificationPipeline(configuration={
        'balancing:strategy': 'weighting',
        'classifier:__choice__': 'xgradient_boosting',
        'classifier:xgradient_boosting:base_score': 0.5,
        'classifier:xgradient_boosting:colsample_bylevel': 1,
        'classifier:xgradient_boosting:colsample_bytree': 1,
        'classifier:xgradient_boosting:gamma': 0,
        'classifier:xgradient_boosting:learning_rate': 0.083957576764175909,
        'classifier:xgradient_boosting:max_delta_step': 0,
        'classifier:xgradient_boosting:max_depth': 9,
        'classifier:xgradient_boosting:min_child_weight': 1,
        'classifier:xgradient_boosting:n_estimators': 207,
        'classifier:xgradient_boosting:reg_alpha': 0,
        'classifier:xgradient_boosting:reg_lambda': 1,
        'classifier:xgradient_boosting:scale_pos_weight': 1,
        'classifier:xgradient_boosting:subsample': 0.79041547139233681,
        'imputation:strategy': 'median',
        'one_hot_encoding:use_minimum_fraction': 'False',
        'preprocessor:__choice__': 'select_rates',
        'preprocessor:select_rates:alpha': 0.033271689466917775,
        'preprocessor:select_rates:mode': 'fdr',
        'preprocessor:select_rates:score_func': 'chi2',
        'rescaling:__choice__': 'none'})),
     (0.140000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'none',
         'classifier:__choice__': 'extra_trees',
         'classifier:extra_trees:bootstrap': 'False',
         'classifier:extra_trees:criterion': 'gini',
         'classifier:extra_trees:max_depth': 'None',
         'classifier:extra_trees:max_features': 1.0,
         'classifier:extra_trees:min_samples_leaf': 1,
         'classifier:extra_trees:min_samples_split': 2,
         'classifier:extra_trees:min_weight_fraction_leaf': 0.0,
         'classifier:extra_trees:n_estimators': 100,
         'imputation:strategy': 'most_frequent',
         'one_hot_encoding:use_minimum_fraction': 'False',
         'preprocessor:__choice__': 'select_rates',
         'preprocessor:select_rates:alpha': 0.10000000000000001,
         'preprocessor:select_rates:mode': 'fdr',
         'preprocessor:select_rates:score_func': 'chi2',
         'rescaling:__choice__': 'none'})),
     (0.100000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'none',
         'classifier:__choice__': 'random_forest',
         'classifier:random_forest:bootstrap': 'False',
         'classifier:random_forest:criterion': 'gini',
         'classifier:random_forest:max_depth': 'None',
         'classifier:random_forest:max_features': 3.904721926856924,
         'classifier:random_forest:max_leaf_nodes': 'None',
         'classifier:random_forest:min_samples_leaf': 2,
         'classifier:random_forest:min_samples_split': 7,
         'classifier:random_forest:min_weight_fraction_leaf': 0.0,
         'classifier:random_forest:n_estimators': 100,
         'imputation:strategy': 'most_frequent',
         'one_hot_encoding:minimum_fraction': 0.036176664478653142,
         'one_hot_encoding:use_minimum_fraction': 'True',
         'preprocessor:__choice__': 'select_percentile_classification',
         'preprocessor:select_percentile_classification:percentile': 91.78175624881186,
         'preprocessor:select_percentile_classification:score_func': 'chi2',
         'rescaling:__choice__': 'none'})),
     (0.080000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'none',
         'classifier:__choice__': 'random_forest',
         'classifier:random_forest:bootstrap': 'True',
         'classifier:random_forest:criterion': 'gini',
         'classifier:random_forest:max_depth': 'None',
         'classifier:random_forest:max_features': 1.0,
         'classifier:random_forest:max_leaf_nodes': 'None',
         'classifier:random_forest:min_samples_leaf': 1,
         'classifier:random_forest:min_samples_split': 2,
         'classifier:random_forest:min_weight_fraction_leaf': 0.0,
         'classifier:random_forest:n_estimators': 100,
         'imputation:strategy': 'median',
         'one_hot_encoding:use_minimum_fraction': 'False',
         'preprocessor:__choice__': 'select_rates',
         'preprocessor:select_rates:alpha': 0.18915206967606921,
         'preprocessor:select_rates:mode': 'fpr',
         'preprocessor:select_rates:score_func': 'chi2',
         'rescaling:__choice__': 'standardize'})),
     (0.080000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'none',
         'classifier:__choice__': 'extra_trees',
         'classifier:extra_trees:bootstrap': 'False',
         'classifier:extra_trees:criterion': 'gini',
         'classifier:extra_trees:max_depth': 'None',
         'classifier:extra_trees:max_features': 0.59875097583441961,
         'classifier:extra_trees:min_samples_leaf': 1,
         'classifier:extra_trees:min_samples_split': 2,
         'classifier:extra_trees:min_weight_fraction_leaf': 0.0,
         'classifier:extra_trees:n_estimators': 100,
         'imputation:strategy': 'mean',
         'one_hot_encoding:use_minimum_fraction': 'False',
         'preprocessor:__choice__': 'select_rates',
         'preprocessor:select_rates:alpha': 0.13663946292601112,
         'preprocessor:select_rates:mode': 'fpr',
         'preprocessor:select_rates:score_func': 'chi2',
         'rescaling:__choice__': 'standardize'})),
     (0.060000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'weighting',
         'classifier:__choice__': 'random_forest',
         'classifier:random_forest:bootstrap': 'True',
         'classifier:random_forest:criterion': 'gini',
         'classifier:random_forest:max_depth': 'None',
         'classifier:random_forest:max_features': 1.0,
         'classifier:random_forest:max_leaf_nodes': 'None',
         'classifier:random_forest:min_samples_leaf': 1,
         'classifier:random_forest:min_samples_split': 2,
         'classifier:random_forest:min_weight_fraction_leaf': 0.0,
         'classifier:random_forest:n_estimators': 100,
         'imputation:strategy': 'median',
         'one_hot_encoding:use_minimum_fraction': 'False',
         'preprocessor:__choice__': 'select_rates',
         'preprocessor:select_rates:alpha': 0.10000000000000001,
         'preprocessor:select_rates:mode': 'fpr',
         'preprocessor:select_rates:score_func': 'chi2',
         'rescaling:__choice__': 'none'})),
     (0.040000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'none',
         'classifier:__choice__': 'extra_trees',
         'classifier:extra_trees:bootstrap': 'False',
         'classifier:extra_trees:criterion': 'gini',
         'classifier:extra_trees:max_depth': 'None',
         'classifier:extra_trees:max_features': 2.4071018354857294,
         'classifier:extra_trees:min_samples_leaf': 2,
         'classifier:extra_trees:min_samples_split': 9,
         'classifier:extra_trees:min_weight_fraction_leaf': 0.0,
         'classifier:extra_trees:n_estimators': 100,
         'imputation:strategy': 'median',
         'one_hot_encoding:use_minimum_fraction': 'False',
         'preprocessor:__choice__': 'select_rates',
         'preprocessor:select_rates:alpha': 0.34844304591109215,
         'preprocessor:select_rates:mode': 'fpr',
         'preprocessor:select_rates:score_func': 'chi2',
         'rescaling:__choice__': 'none'})),
     (0.040000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'weighting',
         'classifier:__choice__': 'random_forest',
         'classifier:random_forest:bootstrap': 'False',
         'classifier:random_forest:criterion': 'gini',
         'classifier:random_forest:max_depth': 'None',
         'classifier:random_forest:max_features': 2.3037777871550227,
         'classifier:random_forest:max_leaf_nodes': 'None',
         'classifier:random_forest:min_samples_leaf': 1,
         'classifier:random_forest:min_samples_split': 6,
         'classifier:random_forest:min_weight_fraction_leaf': 0.0,
         'classifier:random_forest:n_estimators': 100,
         'imputation:strategy': 'mean',
         'one_hot_encoding:use_minimum_fraction': 'False',
         'preprocessor:__choice__': 'no_preprocessing',
         'rescaling:__choice__': 'standardize'})),
     (0.040000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'weighting',
         'classifier:__choice__': 'random_forest',
         'classifier:random_forest:bootstrap': 'False',
         'classifier:random_forest:criterion': 'entropy',
         'classifier:random_forest:max_depth': 'None',
         'classifier:random_forest:max_features': 3.9417933307381925,
         'classifier:random_forest:max_leaf_nodes': 'None',
         'classifier:random_forest:min_samples_leaf': 2,
         'classifier:random_forest:min_samples_split': 3,
         'classifier:random_forest:min_weight_fraction_leaf': 0.0,
         'classifier:random_forest:n_estimators': 100,
         'imputation:strategy': 'median',
         'one_hot_encoding:minimum_fraction': 0.076515481895064422,
         'one_hot_encoding:use_minimum_fraction': 'True',
         'preprocessor:__choice__': 'select_rates',
         'preprocessor:select_rates:alpha': 0.39998541946519961,
         'preprocessor:select_rates:mode': 'fpr',
         'preprocessor:select_rates:score_func': 'chi2',
         'rescaling:__choice__': 'standardize'})),
     (0.020000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'weighting',
         'classifier:__choice__': 'extra_trees',
         'classifier:extra_trees:bootstrap': 'True',
         'classifier:extra_trees:criterion': 'gini',
         'classifier:extra_trees:max_depth': 'None',
         'classifier:extra_trees:max_features': 2.6560184696178109,
         'classifier:extra_trees:min_samples_leaf': 1,
         'classifier:extra_trees:min_samples_split': 9,
         'classifier:extra_trees:min_weight_fraction_leaf': 0.0,
         'classifier:extra_trees:n_estimators': 100,
         'imputation:strategy': 'most_frequent',
         'one_hot_encoding:use_minimum_fraction': 'False',
         'preprocessor:__choice__': 'select_rates',
         'preprocessor:select_rates:alpha': 0.49576705570976692,
         'preprocessor:select_rates:mode': 'fdr',
         'preprocessor:select_rates:score_func': 'chi2',
         'rescaling:__choice__': 'none'})),
     (0.020000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'weighting',
         'classifier:__choice__': 'extra_trees',
         'classifier:extra_trees:bootstrap': 'True',
         'classifier:extra_trees:criterion': 'gini',
         'classifier:extra_trees:max_depth': 'None',
         'classifier:extra_trees:max_features': 2.8762254807814838,
         'classifier:extra_trees:min_samples_leaf': 7,
         'classifier:extra_trees:min_samples_split': 7,
         'classifier:extra_trees:min_weight_fraction_leaf': 0.0,
         'classifier:extra_trees:n_estimators': 100,
         'imputation:strategy': 'mean',
         'one_hot_encoding:minimum_fraction': 0.00037525617209727315,
         'one_hot_encoding:use_minimum_fraction': 'True',
         'preprocessor:__choice__': 'select_rates',
         'preprocessor:select_rates:alpha': 0.36323622954313295,
         'preprocessor:select_rates:mode': 'fpr',
         'preprocessor:select_rates:score_func': 'chi2',
         'rescaling:__choice__': 'min/max'})),
     (0.020000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'weighting',
         'classifier:__choice__': 'random_forest',
         'classifier:random_forest:bootstrap': 'False',
         'classifier:random_forest:criterion': 'gini',
         'classifier:random_forest:max_depth': 'None',
         'classifier:random_forest:max_features': 4.7911724862642,
         'classifier:random_forest:max_leaf_nodes': 'None',
         'classifier:random_forest:min_samples_leaf': 1,
         'classifier:random_forest:min_samples_split': 11,
         'classifier:random_forest:min_weight_fraction_leaf': 0.0,
         'classifier:random_forest:n_estimators': 100,
         'imputation:strategy': 'median',
         'one_hot_encoding:use_minimum_fraction': 'False',
         'preprocessor:__choice__': 'select_rates',
         'preprocessor:select_rates:alpha': 0.47510655107871991,
         'preprocessor:select_rates:mode': 'fdr',
         'preprocessor:select_rates:score_func': 'chi2',
         'rescaling:__choice__': 'standardize'})),
     (0.020000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'none',
         'classifier:__choice__': 'random_forest',
         'classifier:random_forest:bootstrap': 'False',
         'classifier:random_forest:criterion': 'entropy',
         'classifier:random_forest:max_depth': 'None',
         'classifier:random_forest:max_features': 4.9237570615905248,
         'classifier:random_forest:max_leaf_nodes': 'None',
         'classifier:random_forest:min_samples_leaf': 13,
         'classifier:random_forest:min_samples_split': 15,
         'classifier:random_forest:min_weight_fraction_leaf': 0.0,
         'classifier:random_forest:n_estimators': 100,
         'imputation:strategy': 'most_frequent',
         'one_hot_encoding:minimum_fraction': 0.00028264986304734767,
         'one_hot_encoding:use_minimum_fraction': 'True',
         'preprocessor:__choice__': 'select_rates',
         'preprocessor:select_rates:alpha': 0.27910583898194102,
         'preprocessor:select_rates:mode': 'fdr',
         'preprocessor:select_rates:score_func': 'chi2',
         'rescaling:__choice__': 'none'})),
     (0.020000, SimpleClassificationPipeline(configuration={
         'balancing:strategy': 'weighting',
         'classifier:__choice__': 'random_forest',
         'classifier:random_forest:bootstrap': 'False',
         'classifier:random_forest:criterion': 'entropy',
         'classifier:random_forest:max_depth': 'None',
         'classifier:random_forest:max_features': 3.0988613659452917,
         'classifier:random_forest:max_leaf_nodes': 'None',
         'classifier:random_forest:min_samples_leaf': 3,
         'classifier:random_forest:min_samples_split': 3,
         'classifier:random_forest:min_weight_fraction_leaf': 0.0,
         'classifier:random_forest:n_estimators': 100,
         'imputation:strategy': 'most_frequent',
         'one_hot_encoding:use_minimum_fraction': 'False',
         'preprocessor:__choice__': 'no_preprocessing',
         'rescaling:__choice__': 'none'})),
     ]
293 | 
# Accumulators; the weighted predictions of every ensemble member are
# appended to predictions_valid / predictions_test further down.
targets, predictions = [], []
predictions_valid, predictions_test = [], []
298 | 
299 | 
def fit_and_predict(estimator, weight, X, y):
    """Fit one ensemble member and return its weighted predictions.

    The estimator is fitted on copies of ``X`` and ``y``. Its
    ``predict_proba`` output for copies of the module-level ``X_valid`` and
    ``X_test`` is then multiplied by ``weight``.

    Returns a ``(valid, test)`` tuple of weighted predictions. If fitting or
    predicting raises, the exception and the estimator's configuration are
    printed and ``(None, None)`` is returned instead.
    """
    try:
        estimator.fit(X.copy(), y.copy())
        valid_proba = estimator.predict_proba(X_valid.copy())
        weighted_valid = valid_proba * weight
        test_proba = estimator.predict_proba(X_test.copy())
        weighted_test = test_proba * weight
    except Exception as e:
        # Best effort: report what failed and signal it with a pair of Nones.
        print(e)
        print(estimator.configuration)
        return None, None
    return weighted_valid, weighted_test
311 | 
312 | 
# Make predictions and weight them: every ensemble member is fitted in its
# own joblib job (n_jobs=-1 lets joblib use all available CPUs).
all_predictions = Parallel(n_jobs=-1)(
    delayed(fit_and_predict)(estimator, weight, X, y)
    for weight, estimator in choices)
for pv, pt in all_predictions:
    # fit_and_predict() returns (None, None) for a member whose fit or
    # prediction raised. Such members are skipped: a None entry would make
    # the np.sum() aggregation of the collected predictions fail for the
    # whole ensemble. The weights of the remaining members are used as they
    # are, i.e. they are not renormalised.
    if pv is None or pt is None:
        continue
    predictions_valid.append(pv)
    predictions_test.append(pt)

# Without a single successful member there is nothing to aggregate; fail with
# a clear message instead of an obscure error in the output step.
if not predictions_valid:
    raise RuntimeError('No ensemble member could be fitted for dataset %s'
                       % dataset)
320 | 
# Sum the weighted member predictions of each data split, keep column 1 only
# and write it to '<dataset>_<split>_000.predict' inside the output directory.
for name, predictions in (('valid', predictions_valid),
                          ('test', predictions_test)):
    summed = np.array(predictions).sum(axis=0).astype(np.float32)
    # Column 1 as a single-column 2-D array (presumably the probability of
    # the positive class -- verify against the pipeline's class order).
    predictions = summed[:, 1].reshape((-1, 1))

    filename = '%s_%s_000.predict' % (dataset, name)
    filepath = os.path.join(output, filename)
    np.savetxt(filepath, predictions, fmt='%.4e', delimiter=' ')
330 | 


--------------------------------------------------------------------------------
/004_flora.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | from joblib import Parallel, delayed
  5 | import numpy as np
  6 | 
  7 | import autosklearn
  8 | import autosklearn.data
  9 | import autosklearn.data.competition_data_manager
 10 | from autosklearn.pipeline.regression import SimpleRegressionPipeline
 11 | 
 12 | parser = argparse.ArgumentParser()
 13 | parser.add_argument('input')
 14 | parser.add_argument('output')
 15 | args = parser.parse_args()
 16 | 
 17 | input = args.input
 18 | dataset = 'flora'
 19 | output = args.output
 20 | 
 21 | path = os.path.join(input, dataset)
 22 | D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
 23 | X = D.data['X_train']
 24 | y = D.data['Y_train']
 25 | X_valid = D.data['X_valid']
 26 | X_test = D.data['X_test']
 27 | 
 28 | # Replace the following array by a new ensemble
 29 | choices = \
 30 |     [(0.220000, SimpleRegressionPipeline(configuration={
 31 |         'imputation:strategy': 'most_frequent',
 32 |         'one_hot_encoding:use_minimum_fraction': 'False',
 33 |         'preprocessor:__choice__': 'no_preprocessing',
 34 |         'regressor:__choice__': 'xgradient_boosting',
 35 |         'regressor:xgradient_boosting:base_score': 0.5,
 36 |         'regressor:xgradient_boosting:colsample_bylevel': 1,
 37 |         'regressor:xgradient_boosting:colsample_bytree': 1,
 38 |         'regressor:xgradient_boosting:gamma': 0,
 39 |         'regressor:xgradient_boosting:learning_rate': 0.056838908807173093,
 40 |         'regressor:xgradient_boosting:max_delta_step': 0,
 41 |         'regressor:xgradient_boosting:max_depth': 8,
 42 |         'regressor:xgradient_boosting:min_child_weight': 16,
 43 |         'regressor:xgradient_boosting:n_estimators': 178,
 44 |         'regressor:xgradient_boosting:reg_alpha': 0,
 45 |         'regressor:xgradient_boosting:reg_lambda': 1,
 46 |         'regressor:xgradient_boosting:scale_pos_weight': 1,
 47 |         'regressor:xgradient_boosting:subsample': 0.70026686345272005,
 48 |         'rescaling:__choice__': 'none'})),
 49 |      (0.160000, SimpleRegressionPipeline(configuration={
 50 |          'imputation:strategy': 'mean',
 51 |          'one_hot_encoding:minimum_fraction': 0.028721299365033225,
 52 |          'one_hot_encoding:use_minimum_fraction': 'True',
 53 |          'preprocessor:__choice__': 'no_preprocessing',
 54 |          'regressor:__choice__': 'xgradient_boosting',
 55 |          'regressor:xgradient_boosting:base_score': 0.5,
 56 |          'regressor:xgradient_boosting:colsample_bylevel': 1,
 57 |          'regressor:xgradient_boosting:colsample_bytree': 1,
 58 |          'regressor:xgradient_boosting:gamma': 0,
 59 |          'regressor:xgradient_boosting:learning_rate': 0.10000000000000002,
 60 |          'regressor:xgradient_boosting:max_delta_step': 0,
 61 |          'regressor:xgradient_boosting:max_depth': 6,
 62 |          'regressor:xgradient_boosting:min_child_weight': 13,
 63 |          'regressor:xgradient_boosting:n_estimators': 100,
 64 |          'regressor:xgradient_boosting:reg_alpha': 0,
 65 |          'regressor:xgradient_boosting:reg_lambda': 1,
 66 |          'regressor:xgradient_boosting:scale_pos_weight': 1,
 67 |          'regressor:xgradient_boosting:subsample': 1.0,
 68 |          'rescaling:__choice__': 'none'})),
 69 |      (0.120000, SimpleRegressionPipeline(configuration={
 70 |          'imputation:strategy': 'median',
 71 |          'one_hot_encoding:minimum_fraction': 0.00076890296310299397,
 72 |          'one_hot_encoding:use_minimum_fraction': 'True',
 73 |          'preprocessor:__choice__': 'no_preprocessing',
 74 |          'regressor:__choice__': 'xgradient_boosting',
 75 |          'regressor:xgradient_boosting:base_score': 0.5,
 76 |          'regressor:xgradient_boosting:colsample_bylevel': 1,
 77 |          'regressor:xgradient_boosting:colsample_bytree': 1,
 78 |          'regressor:xgradient_boosting:gamma': 0,
 79 |          'regressor:xgradient_boosting:learning_rate': 0.10000000000000002,
 80 |          'regressor:xgradient_boosting:max_delta_step': 0,
 81 |          'regressor:xgradient_boosting:max_depth': 8,
 82 |          'regressor:xgradient_boosting:min_child_weight': 1,
 83 |          'regressor:xgradient_boosting:n_estimators': 100,
 84 |          'regressor:xgradient_boosting:reg_alpha': 0,
 85 |          'regressor:xgradient_boosting:reg_lambda': 1,
 86 |          'regressor:xgradient_boosting:scale_pos_weight': 1,
 87 |          'regressor:xgradient_boosting:subsample': 1.0,
 88 |          'rescaling:__choice__': 'none'})),
 89 |      (0.080000, SimpleRegressionPipeline(configuration={
 90 |          'imputation:strategy': 'most_frequent',
 91 |          'one_hot_encoding:use_minimum_fraction': 'False',
 92 |          'preprocessor:__choice__': 'no_preprocessing',
 93 |          'regressor:__choice__': 'xgradient_boosting',
 94 |          'regressor:xgradient_boosting:base_score': 0.5,
 95 |          'regressor:xgradient_boosting:colsample_bylevel': 1,
 96 |          'regressor:xgradient_boosting:colsample_bytree': 1,
 97 |          'regressor:xgradient_boosting:gamma': 0,
 98 |          'regressor:xgradient_boosting:learning_rate': 0.10000000000000002,
 99 |          'regressor:xgradient_boosting:max_delta_step': 0,
100 |          'regressor:xgradient_boosting:max_depth': 7,
101 |          'regressor:xgradient_boosting:min_child_weight': 1,
102 |          'regressor:xgradient_boosting:n_estimators': 100,
103 |          'regressor:xgradient_boosting:reg_alpha': 0,
104 |          'regressor:xgradient_boosting:reg_lambda': 1,
105 |          'regressor:xgradient_boosting:scale_pos_weight': 1,
106 |          'regressor:xgradient_boosting:subsample': 1.0,
107 |          'rescaling:__choice__': 'none'})),
108 |      (0.080000, SimpleRegressionPipeline(configuration={
109 |          'imputation:strategy': 'median',
110 |          'one_hot_encoding:minimum_fraction': 0.0023636879664826662,
111 |          'one_hot_encoding:use_minimum_fraction': 'True',
112 |          'preprocessor:__choice__': 'no_preprocessing',
113 |          'regressor:__choice__': 'liblinear_svr',
114 |          'regressor:liblinear_svr:C': 1756.3281019761341,
115 |          'regressor:liblinear_svr:dual': 'False',
116 |          'regressor:liblinear_svr:epsilon': 0.12958135960591446,
117 |          'regressor:liblinear_svr:fit_intercept': 'True',
118 |          'regressor:liblinear_svr:intercept_scaling': 1,
119 |          'regressor:liblinear_svr:loss': 'squared_epsilon_insensitive',
120 |          'regressor:liblinear_svr:tol': 6.7973376271281637e-05,
121 |          'rescaling:__choice__': 'none'})),
122 |      (0.060000, SimpleRegressionPipeline(configuration={
123 |          'imputation:strategy': 'mean',
124 |          'one_hot_encoding:minimum_fraction': 0.0078832566242014457,
125 |          'one_hot_encoding:use_minimum_fraction': 'True',
126 |          'preprocessor:__choice__': 'kernel_pca',
127 |          'preprocessor:kernel_pca:coef0': 0.830468268944067,
128 |          'preprocessor:kernel_pca:kernel': 'sigmoid',
129 |          'preprocessor:kernel_pca:n_components': 1297,
130 |          'regressor:__choice__': 'sgd',
131 |          'regressor:sgd:alpha': 7.1922597888891864e-06,
132 |          'regressor:sgd:average': 'True',
133 |          'regressor:sgd:epsilon': 0.002325854486140731,
134 |          'regressor:sgd:eta0': 0.09745049410405518,
135 |          'regressor:sgd:fit_intercept': 'True',
136 |          'regressor:sgd:learning_rate': 'invscaling',
137 |          'regressor:sgd:loss': 'squared_epsilon_insensitive',
138 |          'regressor:sgd:n_iter': 56,
139 |          'regressor:sgd:penalty': 'l1',
140 |          'regressor:sgd:power_t': 0.2820868931235419,
141 |          'rescaling:__choice__': 'standardize'})),
142 |      (0.040000, SimpleRegressionPipeline(configuration={
143 |          'imputation:strategy': 'median',
144 |          'one_hot_encoding:use_minimum_fraction': 'False',
145 |          'preprocessor:__choice__': 'no_preprocessing',
146 |          'regressor:__choice__': 'xgradient_boosting',
147 |          'regressor:xgradient_boosting:base_score': 0.5,
148 |          'regressor:xgradient_boosting:colsample_bylevel': 1,
149 |          'regressor:xgradient_boosting:colsample_bytree': 1,
150 |          'regressor:xgradient_boosting:gamma': 0,
151 |          'regressor:xgradient_boosting:learning_rate': 0.39354372832974382,
152 |          'regressor:xgradient_boosting:max_delta_step': 0,
153 |          'regressor:xgradient_boosting:max_depth': 3,
154 |          'regressor:xgradient_boosting:min_child_weight': 19,
155 |          'regressor:xgradient_boosting:n_estimators': 73,
156 |          'regressor:xgradient_boosting:reg_alpha': 0,
157 |          'regressor:xgradient_boosting:reg_lambda': 1,
158 |          'regressor:xgradient_boosting:scale_pos_weight': 1,
159 |          'regressor:xgradient_boosting:subsample': 0.51160818820515941,
160 |          'rescaling:__choice__': 'standardize'})),
161 |      (0.040000, SimpleRegressionPipeline(configuration={
162 |          'imputation:strategy': 'most_frequent',
163 |          'one_hot_encoding:minimum_fraction': 0.0001292396238727452,
164 |          'one_hot_encoding:use_minimum_fraction': 'True',
165 |          'preprocessor:__choice__': 'no_preprocessing',
166 |          'regressor:__choice__': 'xgradient_boosting',
167 |          'regressor:xgradient_boosting:base_score': 0.5,
168 |          'regressor:xgradient_boosting:colsample_bylevel': 1,
169 |          'regressor:xgradient_boosting:colsample_bytree': 1,
170 |          'regressor:xgradient_boosting:gamma': 0,
171 |          'regressor:xgradient_boosting:learning_rate': 0.10000000000000002,
172 |          'regressor:xgradient_boosting:max_delta_step': 0,
173 |          'regressor:xgradient_boosting:max_depth': 5,
174 |          'regressor:xgradient_boosting:min_child_weight': 1,
175 |          'regressor:xgradient_boosting:n_estimators': 100,
176 |          'regressor:xgradient_boosting:reg_alpha': 0,
177 |          'regressor:xgradient_boosting:reg_lambda': 1,
178 |          'regressor:xgradient_boosting:scale_pos_weight': 1,
179 |          'regressor:xgradient_boosting:subsample': 1.0,
180 |          'rescaling:__choice__': 'none'})),
181 |      (0.040000, SimpleRegressionPipeline(configuration={
182 |          'imputation:strategy': 'median',
183 |          'one_hot_encoding:minimum_fraction': 0.0010042712846593592,
184 |          'one_hot_encoding:use_minimum_fraction': 'True',
185 |          'preprocessor:__choice__': 'extra_trees_preproc_for_regression',
186 |          'preprocessor:extra_trees_preproc_for_regression:bootstrap': 'False',
187 |          'preprocessor:extra_trees_preproc_for_regression:criterion': 'mse',
188 |          'preprocessor:extra_trees_preproc_for_regression:max_depth': 'None',
189 |          'preprocessor:extra_trees_preproc_for_regression:max_features': 4.4366238138449141,
190 |          'preprocessor:extra_trees_preproc_for_regression:min_samples_leaf': 5,
191 |          'preprocessor:extra_trees_preproc_for_regression:min_samples_split': 2,
192 |          'preprocessor:extra_trees_preproc_for_regression:min_weight_fraction_leaf': 0.0,
193 |          'preprocessor:extra_trees_preproc_for_regression:n_estimators': 100,
194 |          'regressor:__choice__': 'xgradient_boosting',
195 |          'regressor:xgradient_boosting:base_score': 0.5,
196 |          'regressor:xgradient_boosting:colsample_bylevel': 1,
197 |          'regressor:xgradient_boosting:colsample_bytree': 1,
198 |          'regressor:xgradient_boosting:gamma': 0,
199 |          'regressor:xgradient_boosting:learning_rate': 0.24786184996967336,
200 |          'regressor:xgradient_boosting:max_delta_step': 0,
201 |          'regressor:xgradient_boosting:max_depth': 4,
202 |          'regressor:xgradient_boosting:min_child_weight': 12,
203 |          'regressor:xgradient_boosting:n_estimators': 487,
204 |          'regressor:xgradient_boosting:reg_alpha': 0,
205 |          'regressor:xgradient_boosting:reg_lambda': 1,
206 |          'regressor:xgradient_boosting:scale_pos_weight': 1,
207 |          'regressor:xgradient_boosting:subsample': 0.51768561001523961,
208 |          'rescaling:__choice__': 'standardize'})),
209 |      (0.040000, SimpleRegressionPipeline(configuration={
210 |          'imputation:strategy': 'most_frequent',
211 |          'one_hot_encoding:use_minimum_fraction': 'False',
212 |          'preprocessor:__choice__': 'no_preprocessing',
213 |          'regressor:__choice__': 'xgradient_boosting',
214 |          'regressor:xgradient_boosting:base_score': 0.5,
215 |          'regressor:xgradient_boosting:colsample_bylevel': 1,
216 |          'regressor:xgradient_boosting:colsample_bytree': 1,
217 |          'regressor:xgradient_boosting:gamma': 0,
218 |          'regressor:xgradient_boosting:learning_rate': 0.056838908807173093,
219 |          'regressor:xgradient_boosting:max_delta_step': 0,
220 |          'regressor:xgradient_boosting:max_depth': 6,
221 |          'regressor:xgradient_boosting:min_child_weight': 20,
222 |          'regressor:xgradient_boosting:n_estimators': 178,
223 |          'regressor:xgradient_boosting:reg_alpha': 0,
224 |          'regressor:xgradient_boosting:reg_lambda': 1,
225 |          'regressor:xgradient_boosting:scale_pos_weight': 1,
226 |          'regressor:xgradient_boosting:subsample': 0.81655152788480145,
227 |          'rescaling:__choice__': 'none'})),
228 |      (0.020000, SimpleRegressionPipeline(configuration={
229 |          'imputation:strategy': 'median',
230 |          'one_hot_encoding:use_minimum_fraction': 'False',
231 |          'preprocessor:__choice__': 'truncatedSVD',
232 |          'preprocessor:truncatedSVD:target_dim': 222,
233 |          'regressor:__choice__': 'xgradient_boosting',
234 |          'regressor:xgradient_boosting:base_score': 0.5,
235 |          'regressor:xgradient_boosting:colsample_bylevel': 1,
236 |          'regressor:xgradient_boosting:colsample_bytree': 1,
237 |          'regressor:xgradient_boosting:gamma': 0,
238 |          'regressor:xgradient_boosting:learning_rate': 0.10000000000000002,
239 |          'regressor:xgradient_boosting:max_delta_step': 0,
240 |          'regressor:xgradient_boosting:max_depth': 3,
241 |          'regressor:xgradient_boosting:min_child_weight': 1,
242 |          'regressor:xgradient_boosting:n_estimators': 100,
243 |          'regressor:xgradient_boosting:reg_alpha': 0,
244 |          'regressor:xgradient_boosting:reg_lambda': 1,
245 |          'regressor:xgradient_boosting:scale_pos_weight': 1,
246 |          'regressor:xgradient_boosting:subsample': 1.0,
247 |          'rescaling:__choice__': 'none'})),
248 |      (0.020000, SimpleRegressionPipeline(configuration={
249 |          'imputation:strategy': 'most_frequent',
250 |          'one_hot_encoding:use_minimum_fraction': 'False',
251 |          'preprocessor:__choice__': 'truncatedSVD',
252 |          'preprocessor:truncatedSVD:target_dim': 156,
253 |          'regressor:__choice__': 'decision_tree',
254 |          'regressor:decision_tree:criterion': 'mse',
255 |          'regressor:decision_tree:max_depth': 1.4573346058635357,
256 |          'regressor:decision_tree:max_features': 1.0,
257 |          'regressor:decision_tree:max_leaf_nodes': 'None',
258 |          'regressor:decision_tree:min_samples_leaf': 17,
259 |          'regressor:decision_tree:min_samples_split': 8,
260 |          'regressor:decision_tree:min_weight_fraction_leaf': 0.0,
261 |          'regressor:decision_tree:splitter': 'best',
262 |          'rescaling:__choice__': 'normalize'})),
263 |      (0.020000, SimpleRegressionPipeline(configuration={
264 |          'imputation:strategy': 'mean',
265 |          'one_hot_encoding:use_minimum_fraction': 'False',
266 |          'preprocessor:__choice__': 'no_preprocessing',
267 |          'regressor:__choice__': 'xgradient_boosting',
268 |          'regressor:xgradient_boosting:base_score': 0.5,
269 |          'regressor:xgradient_boosting:colsample_bylevel': 1,
270 |          'regressor:xgradient_boosting:colsample_bytree': 1,
271 |          'regressor:xgradient_boosting:gamma': 0,
272 |          'regressor:xgradient_boosting:learning_rate': 0.10000000000000002,
273 |          'regressor:xgradient_boosting:max_delta_step': 0,
274 |          'regressor:xgradient_boosting:max_depth': 5,
275 |          'regressor:xgradient_boosting:min_child_weight': 13,
276 |          'regressor:xgradient_boosting:n_estimators': 100,
277 |          'regressor:xgradient_boosting:reg_alpha': 0,
278 |          'regressor:xgradient_boosting:reg_lambda': 1,
279 |          'regressor:xgradient_boosting:scale_pos_weight': 1,
280 |          'regressor:xgradient_boosting:subsample': 1.0,
281 |          'rescaling:__choice__': 'none'})),
282 |      (0.020000, SimpleRegressionPipeline(configuration={
283 |          'imputation:strategy': 'median',
284 |          'one_hot_encoding:minimum_fraction': 0.0030893906804030156,
285 |          'one_hot_encoding:use_minimum_fraction': 'True',
286 |          'preprocessor:__choice__': 'truncatedSVD',
287 |          'preprocessor:truncatedSVD:target_dim': 67,
288 |          'regressor:__choice__': 'k_nearest_neighbors',
289 |          'regressor:k_nearest_neighbors:n_neighbors': 29,
290 |          'regressor:k_nearest_neighbors:p': 2,
291 |          'regressor:k_nearest_neighbors:weights': 'distance',
292 |          'rescaling:__choice__': 'normalize'})),
293 |      (0.020000, SimpleRegressionPipeline(configuration={
294 |          'imputation:strategy': 'most_frequent',
295 |          'one_hot_encoding:minimum_fraction': 0.0027171559129851464,
296 |          'one_hot_encoding:use_minimum_fraction': 'True',
297 |          'preprocessor:__choice__': 'truncatedSVD',
298 |          'preprocessor:truncatedSVD:target_dim': 35,
299 |          'regressor:__choice__': 'liblinear_svr',
300 |          'regressor:liblinear_svr:C': 0.0485964760119761,
301 |          'regressor:liblinear_svr:dual': 'False',
302 |          'regressor:liblinear_svr:epsilon': 0.01333919934708307,
303 |          'regressor:liblinear_svr:fit_intercept': 'True',
304 |          'regressor:liblinear_svr:intercept_scaling': 1,
305 |          'regressor:liblinear_svr:loss': 'squared_epsilon_insensitive',
306 |          'regressor:liblinear_svr:tol': 0.030573671793931671,
307 |          'rescaling:__choice__': 'min/max'})),
308 |      (0.020000, SimpleRegressionPipeline(configuration={
309 |          'imputation:strategy': 'mean',
310 |          'one_hot_encoding:use_minimum_fraction': 'False',
311 |          'preprocessor:__choice__': 'no_preprocessing',
312 |          'regressor:__choice__': 'decision_tree',
313 |          'regressor:decision_tree:criterion': 'mse',
314 |          'regressor:decision_tree:max_depth': 0.031442410091469419,
315 |          'regressor:decision_tree:max_features': 1.0,
316 |          'regressor:decision_tree:max_leaf_nodes': 'None',
317 |          'regressor:decision_tree:min_samples_leaf': 15,
318 |          'regressor:decision_tree:min_samples_split': 10,
319 |          'regressor:decision_tree:min_weight_fraction_leaf': 0.0,
320 |          'regressor:decision_tree:splitter': 'best',
321 |          'rescaling:__choice__': 'normalize'})),
322 |      ]
323 | 
# Module-level accumulators, each bound to its own empty list.
# predictions_valid / predictions_test receive one weighted prediction per
# ensemble member further down; targets is not referenced again in the
# remainder of this script.
targets, predictions = [], []
predictions_valid, predictions_test = [], []
328 | 
329 | 
def fit_and_predict(estimator, weight, X, y):
    """Fit one ensemble member and return its weighted predictions.

    The estimator is fitted on copies of ``X`` and ``y`` and then asked to
    predict the module-level ``X_valid`` and ``X_test`` sets (each passed as
    a copy).  Every prediction is multiplied by ``weight``.

    Returns a ``(valid, test)`` tuple.  If any step of the fit/predict
    sequence raises, the exception and ``estimator.configuration`` are
    printed and ``(None, None)`` is returned instead of propagating.
    """
    def weighted_prediction(features):
        # Predict on a copy and scale by this member's ensemble weight.
        return estimator.predict(features.copy()) * weight

    try:
        estimator.fit(X.copy(), y.copy())
        return weighted_prediction(X_valid), weighted_prediction(X_test)
    except Exception as error:
        print(error)
        print(estimator.configuration)
        return None, None
341 | 
342 | 
# Fit every ensemble member in parallel.  Each job returns its predictions for
# the validation and test sets, already multiplied by the member's weight, or
# ``(None, None)`` when fit_and_predict caught an exception for that member.
all_predictions = Parallel(n_jobs=-1)(
    delayed(fit_and_predict)(estimator, weight, X, y)
    for weight, estimator in choices)

# A ``None`` entry cannot be summed with real prediction arrays, so a single
# failed member used to abort the whole run in np.sum below.  Keep only the
# members that succeeded and track how much ensemble weight they carry.
total_weight = 0.0
surviving_weight = 0.0
for (weight, _), (pv, pt) in zip(choices, all_predictions):
    total_weight += weight
    if pv is None or pt is None:
        continue
    surviving_weight += weight
    predictions_valid.append(pv)
    predictions_test.append(pt)

if not predictions_valid:
    raise RuntimeError('No ensemble member produced predictions for %s'
                       % dataset)

# Only rescale when a member was dropped, so the output of a fully successful
# run is numerically identical to before.  Rescaling restores the original
# total weight, keeping the blend a weighted average of the survivors.
n_failed = len(choices) - len(predictions_valid)
rescale = None
if n_failed and surviving_weight > 0:
    rescale = total_weight / surviving_weight
    print('%d of %d ensemble members failed; rescaling the remaining '
          'predictions by %f' % (n_failed, len(choices), rescale))

# Output the predictions
for name, member_predictions in [('valid', predictions_valid),
                                 ('test', predictions_test)]:
    # Members are pre-weighted, so the ensemble is their element-wise sum.
    ensemble = np.sum(np.array(member_predictions), axis=0)
    if rescale is not None:
        ensemble = ensemble * rescale
    ensemble = ensemble.astype(np.float32)

    filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
    np.savetxt(filepath, ensemble, delimiter=' ', fmt='%.4e')
359 | 


--------------------------------------------------------------------------------
/004_tania.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | from joblib import Parallel, delayed
  5 | import numpy as np
  6 | 
  7 | import autosklearn
  8 | import autosklearn.data
  9 | import autosklearn.data.competition_data_manager
 10 | from autosklearn.pipeline.classification import SimpleClassificationPipeline
 11 | 
 12 | parser = argparse.ArgumentParser()
 13 | parser.add_argument('input')
 14 | parser.add_argument('output')
 15 | args = parser.parse_args()
 16 | 
 17 | input = args.input
 18 | dataset = 'tania'
 19 | output = args.output
 20 | 
 21 | path = os.path.join(input, dataset)
 22 | D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
 23 | X = D.data['X_train']
 24 | y = D.data['Y_train']
 25 | X_valid = D.data['X_valid']
 26 | X_test = D.data['X_test']
 27 | 
 28 | # Use this commit from the Lasagne master branch:
 29 | # 24c9ed2ffc25504c3b0df4598afb1e63fdd59eee
 30 | # https://github.com/Lasagne/Lasagne/commit/24c9ed2ffc25504c3b0df4598afb1e63fdd59eee
 31 | # Copy the file DeepFeedNet into autosklearn.pipeline.components.classification
 32 | # Copy the file FeedForwardNet into autosklearn.pipeline.implementations
 33 | 
 34 | choices = \
 35 |     [(0.220000, SimpleClassificationPipeline(configuration={
 36 |         'balancing:strategy': 'none',
 37 |         'classifier:DeepFeedNet:activation': 'relu',
 38 |         'classifier:DeepFeedNet:batch_size': 1526,
 39 |         'classifier:DeepFeedNet:dropout_layer_1': 0.07375877191623954,
 40 |         'classifier:DeepFeedNet:dropout_layer_2': 0.25061726159515596,
 41 |         'classifier:DeepFeedNet:dropout_output': 0.44276742232825533,
 42 |         'classifier:DeepFeedNet:lambda2': 0.00559189810319557,
 43 |         'classifier:DeepFeedNet:learning_rate': 0.01,
 44 |         'classifier:DeepFeedNet:num_layers': 'd',
 45 |         'classifier:DeepFeedNet:num_units_layer_1': 3512,
 46 |         'classifier:DeepFeedNet:num_units_layer_2': 2456,
 47 |         'classifier:DeepFeedNet:number_updates': 1019,
 48 |         'classifier:DeepFeedNet:solver': 'smorm3s',
 49 |         'classifier:DeepFeedNet:std_layer_1': 0.0031572295374762784,
 50 |         'classifier:DeepFeedNet:std_layer_2': 0.024102151721155526,
 51 |         'classifier:__choice__': 'DeepFeedNet',
 52 |         'imputation:strategy': 'median',
 53 |         'one_hot_encoding:use_minimum_fraction': 'False',
 54 |         'preprocessor:truncatedSVD:target_dim': 169,
 55 |         'preprocessor:__choice__': 'truncatedSVD',
 56 |         'rescaling:__choice__': 'normalize'})),
 57 |      (0.180000, SimpleClassificationPipeline(configuration={
 58 |          'balancing:strategy': 'weighting',
 59 |          'classifier:__choice__': 'sgd',
 60 |          'classifier:sgd:alpha': 1e-06,
 61 |          'classifier:sgd:average': 'False',
 62 |          'classifier:sgd:eta0': 1e-07,
 63 |          'classifier:sgd:fit_intercept': 'True',
 64 |          'classifier:sgd:learning_rate': 'optimal',
 65 |          'classifier:sgd:loss': 'log',
 66 |          'classifier:sgd:n_iter': 5,
 67 |          'classifier:sgd:penalty': 'l2',
 68 |          'imputation:strategy': 'mean',
 69 |          'one_hot_encoding:use_minimum_fraction': 'False',
 70 |          'preprocessor:__choice__': 'no_preprocessing',
 71 |          'rescaling:__choice__': 'normalize'})),
 72 |      (0.140000, SimpleClassificationPipeline(configuration={
 73 |          'balancing:strategy': 'none',
 74 |          'classifier:DeepFeedNet:activation': 'relu',
 75 |          'classifier:DeepFeedNet:batch_size': 1526,
 76 |          'classifier:DeepFeedNet:dropout_layer_1': 0.07375877191623954,
 77 |          'classifier:DeepFeedNet:dropout_layer_2': 0.25061726159515596,
 78 |          'classifier:DeepFeedNet:dropout_output': 0.5318548466903714,
 79 |          'classifier:DeepFeedNet:lambda2': 0.00559189810319557,
 80 |          'classifier:DeepFeedNet:learning_rate': 0.01,
 81 |          'classifier:DeepFeedNet:num_layers': 'd',
 82 |          'classifier:DeepFeedNet:num_units_layer_1': 3512,
 83 |          'classifier:DeepFeedNet:num_units_layer_2': 2456,
 84 |          'classifier:DeepFeedNet:number_updates': 942,
 85 |          'classifier:DeepFeedNet:solver': 'smorm3s',
 86 |          'classifier:DeepFeedNet:std_layer_1': 0.0031572295374762784,
 87 |          'classifier:DeepFeedNet:std_layer_2': 0.024102151721155526,
 88 |          'classifier:__choice__': 'DeepFeedNet',
 89 |          'imputation:strategy': 'median',
 90 |          'one_hot_encoding:use_minimum_fraction': 'False',
 91 |          'preprocessor:truncatedSVD:target_dim': 169,
 92 |          'preprocessor:__choice__': 'truncatedSVD',
 93 |          'rescaling:__choice__': 'normalize'})),
 94 |      (0.100000, SimpleClassificationPipeline(configuration={
 95 |          'balancing:strategy': 'weighting',
 96 |          'classifier:DeepFeedNet:activation': 'relu',
 97 |          'classifier:DeepFeedNet:batch_size': 1526,
 98 |          'classifier:DeepFeedNet:dropout_layer_1': 0.07375877191623954,
 99 |          'classifier:DeepFeedNet:dropout_layer_2': 0.25061726159515596,
100 |          'classifier:DeepFeedNet:dropout_output': 0.5318548466903714,
101 |          'classifier:DeepFeedNet:lambda2': 0.00559189810319557,
102 |          'classifier:DeepFeedNet:learning_rate': 0.01,
103 |          'classifier:DeepFeedNet:num_layers': 'd',
104 |          'classifier:DeepFeedNet:num_units_layer_1': 2825,
105 |          'classifier:DeepFeedNet:num_units_layer_2': 2456,
106 |          'classifier:DeepFeedNet:number_updates': 942,
107 |          'classifier:DeepFeedNet:solver': 'smorm3s',
108 |          'classifier:DeepFeedNet:std_layer_1': 0.0031572295374762784,
109 |          'classifier:DeepFeedNet:std_layer_2': 0.024102151721155526,
110 |          'classifier:__choice__': 'DeepFeedNet',
111 |          'imputation:strategy': 'median',
112 |          'one_hot_encoding:use_minimum_fraction': 'False',
113 |          'preprocessor:truncatedSVD:target_dim': 169,
114 |          'preprocessor:__choice__': 'truncatedSVD',
115 |          'rescaling:__choice__': 'normalize'})),
116 |      (0.080000, SimpleClassificationPipeline(configuration={
117 |          'balancing:strategy': 'none',
118 |          'classifier:DeepFeedNet:activation': 'relu',
119 |          'classifier:DeepFeedNet:batch_size': 1526,
120 |          'classifier:DeepFeedNet:dropout_layer_1': 0.07375877191623954,
121 |          'classifier:DeepFeedNet:dropout_layer_2': 0.25061726159515596,
122 |          'classifier:DeepFeedNet:dropout_output': 0.6315030660705527,
123 |          'classifier:DeepFeedNet:lambda2': 0.00559189810319557,
124 |          'classifier:DeepFeedNet:learning_rate': 0.01,
125 |          'classifier:DeepFeedNet:num_layers': 'd',
126 |          'classifier:DeepFeedNet:num_units_layer_1': 3512,
127 |          'classifier:DeepFeedNet:num_units_layer_2': 2456,
128 |          'classifier:DeepFeedNet:number_updates': 942,
129 |          'classifier:DeepFeedNet:solver': 'smorm3s',
130 |          'classifier:DeepFeedNet:std_layer_1': 0.0031572295374762784,
131 |          'classifier:DeepFeedNet:std_layer_2': 0.024102151721155526,
132 |          'classifier:__choice__': 'DeepFeedNet',
133 |          'imputation:strategy': 'median',
134 |          'one_hot_encoding:use_minimum_fraction': 'False',
135 |          'preprocessor:truncatedSVD:target_dim': 169,
136 |          'preprocessor:__choice__': 'truncatedSVD',
137 |          'rescaling:__choice__': 'normalize'})),
138 |      (0.080000, SimpleClassificationPipeline(configuration={
139 |          'balancing:strategy': 'none',
140 |          'classifier:DeepFeedNet:activation': 'relu',
141 |          'classifier:DeepFeedNet:batch_size': 2124,
142 |          'classifier:DeepFeedNet:dropout_layer_1': 0.01360549061849139,
143 |          'classifier:DeepFeedNet:dropout_output': 0.2644391773986185,
144 |          'classifier:DeepFeedNet:lambda2': 0.004871660362477711,
145 |          'classifier:DeepFeedNet:learning_rate': 0.01,
146 |          'classifier:DeepFeedNet:num_layers': 'c',
147 |          'classifier:DeepFeedNet:num_units_layer_1': 2812,
148 |          'classifier:DeepFeedNet:number_updates': 2710,
149 |          'classifier:DeepFeedNet:solver': 'smorm3s',
150 |          'classifier:DeepFeedNet:std_layer_1': 0.09316319189582598,
151 |          'classifier:__choice__': 'DeepFeedNet',
152 |          'imputation:strategy': 'median',
153 |          'one_hot_encoding:use_minimum_fraction': 'False',
154 |          'preprocessor:truncatedSVD:target_dim': 186,
155 |          'preprocessor:__choice__': 'truncatedSVD',
156 |          'rescaling:__choice__': 'normalize'})),
157 |      (0.040000, SimpleClassificationPipeline(configuration={
158 |          'balancing:strategy': 'weighting',
159 |          'classifier:DeepFeedNet:activation': 'relu',
160 |          'classifier:DeepFeedNet:batch_size': 1867,
161 |          'classifier:DeepFeedNet:dropout_layer_1': 0.01908790794742743,
162 |          'classifier:DeepFeedNet:dropout_output': 0.3448188758299382,
163 |          'classifier:DeepFeedNet:lambda2': 0.0007755741149255707,
164 |          'classifier:DeepFeedNet:learning_rate': 0.01,
165 |          'classifier:DeepFeedNet:num_layers': 'c',
166 |          'classifier:DeepFeedNet:num_units_layer_1': 3665,
167 |          'classifier:DeepFeedNet:number_updates': 2512,
168 |          'classifier:DeepFeedNet:solver': 'smorm3s',
169 |          'classifier:DeepFeedNet:std_layer_1': 0.0024468150980905207,
170 |          'classifier:__choice__': 'DeepFeedNet',
171 |          'imputation:strategy': 'most_frequent',
172 |          'one_hot_encoding:minimum_fraction': 0.05266063283992454,
173 |          'one_hot_encoding:use_minimum_fraction': 'True',
174 |          'preprocessor:truncatedSVD:target_dim': 166,
175 |          'preprocessor:__choice__': 'truncatedSVD',
176 |          'rescaling:__choice__': 'normalize'})),
177 |      (0.040000, SimpleClassificationPipeline(configuration={
178 |          'balancing:strategy': 'weighting',
179 |          'classifier:DeepFeedNet:activation': 'relu',
180 |          'classifier:DeepFeedNet:batch_size': 2281,
181 |          'classifier:DeepFeedNet:dropout_layer_1': 0.09094796094063819,
182 |          'classifier:DeepFeedNet:dropout_output': 0.4958339054016198,
183 |          'classifier:DeepFeedNet:lambda2': 1.805699319151882e-05,
184 |          'classifier:DeepFeedNet:learning_rate': 0.001,
185 |          'classifier:DeepFeedNet:num_layers': 'c',
186 |          'classifier:DeepFeedNet:num_units_layer_1': 2651,
187 |          'classifier:DeepFeedNet:number_updates': 3403,
188 |          'classifier:DeepFeedNet:solver': 'smorm3s',
189 |          'classifier:DeepFeedNet:std_layer_1': 0.007630682901621406,
190 |          'classifier:__choice__': 'DeepFeedNet',
191 |          'imputation:strategy': 'mean',
192 |          'one_hot_encoding:use_minimum_fraction': 'False',
193 |          'preprocessor:truncatedSVD:target_dim': 197,
194 |          'preprocessor:__choice__': 'truncatedSVD',
195 |          'rescaling:__choice__': 'none'})),
196 |      (0.040000, SimpleClassificationPipeline(configuration={
197 |          'balancing:strategy': 'none',
198 |          'classifier:DeepFeedNet:activation': 'relu',
199 |          'classifier:DeepFeedNet:batch_size': 2086,
200 |          'classifier:DeepFeedNet:dropout_layer_1': 0.1030823826758656,
201 |          'classifier:DeepFeedNet:dropout_output': 0.22142344211272239,
202 |          'classifier:DeepFeedNet:lambda2': 3.4109499881542005e-06,
203 |          'classifier:DeepFeedNet:learning_rate': 0.01,
204 |          'classifier:DeepFeedNet:num_layers': 'c',
205 |          'classifier:DeepFeedNet:num_units_layer_1': 3317,
206 |          'classifier:DeepFeedNet:number_updates': 711,
207 |          'classifier:DeepFeedNet:solver': 'smorm3s',
208 |          'classifier:DeepFeedNet:std_layer_1': 0.0012484056182083289,
209 |          'classifier:__choice__': 'DeepFeedNet',
210 |          'imputation:strategy': 'most_frequent',
211 |          'one_hot_encoding:minimum_fraction': 0.030925614928477674,
212 |          'one_hot_encoding:use_minimum_fraction': 'True',
213 |          'preprocessor:truncatedSVD:target_dim': 159,
214 |          'preprocessor:__choice__': 'truncatedSVD',
215 |          'rescaling:__choice__': 'min/max'})),
216 |      (0.020000, SimpleClassificationPipeline(configuration={
217 |          'balancing:strategy': 'weighting',
218 |          'classifier:DeepFeedNet:activation': 'relu',
219 |          'classifier:DeepFeedNet:batch_size': 1336,
220 |          'classifier:DeepFeedNet:dropout_layer_1': 0.0331786272132608,
221 |          'classifier:DeepFeedNet:dropout_output': 0.3783990976694647,
222 |          'classifier:DeepFeedNet:lambda2': 0.006318427713029419,
223 |          'classifier:DeepFeedNet:learning_rate': 0.01,
224 |          'classifier:DeepFeedNet:num_layers': 'c',
225 |          'classifier:DeepFeedNet:num_units_layer_1': 2491,
226 |          'classifier:DeepFeedNet:number_updates': 3437,
227 |          'classifier:DeepFeedNet:solver': 'smorm3s',
228 |          'classifier:DeepFeedNet:std_layer_1': 0.09522419264016894,
229 |          'classifier:__choice__': 'DeepFeedNet',
230 |          'imputation:strategy': 'median',
231 |          'one_hot_encoding:minimum_fraction': 0.03562984523180951,
232 |          'one_hot_encoding:use_minimum_fraction': 'True',
233 |          'preprocessor:truncatedSVD:target_dim': 189,
234 |          'preprocessor:__choice__': 'truncatedSVD',
235 |          'rescaling:__choice__': 'normalize'})),
236 |      (0.020000, SimpleClassificationPipeline(configuration={
237 |          'balancing:strategy': 'none',
238 |          'classifier:DeepFeedNet:activation': 'relu',
239 |          'classifier:DeepFeedNet:batch_size': 1967,
240 |          'classifier:DeepFeedNet:dropout_layer_1': 0.06971989322917795,
241 |          'classifier:DeepFeedNet:dropout_output': 0.14345632673233852,
242 |          'classifier:DeepFeedNet:lambda2': 0.0008778987660283575,
243 |          'classifier:DeepFeedNet:learning_rate': 0.01,
244 |          'classifier:DeepFeedNet:num_layers': 'c',
245 |          'classifier:DeepFeedNet:num_units_layer_1': 3587,
246 |          'classifier:DeepFeedNet:number_updates': 3182,
247 |          'classifier:DeepFeedNet:solver': 'smorm3s',
248 |          'classifier:DeepFeedNet:std_layer_1': 0.0015311970092555642,
249 |          'classifier:__choice__': 'DeepFeedNet',
250 |          'imputation:strategy': 'median',
251 |          'one_hot_encoding:use_minimum_fraction': 'False',
252 |          'preprocessor:truncatedSVD:target_dim': 135,
253 |          'preprocessor:__choice__': 'truncatedSVD',
254 |          'rescaling:__choice__': 'normalize'})),
255 |      (0.020000, SimpleClassificationPipeline(configuration={
256 |          'balancing:strategy': 'weighting',
257 |          'classifier:DeepFeedNet:activation': 'relu',
258 |          'classifier:DeepFeedNet:batch_size': 1882,
259 |          'classifier:DeepFeedNet:dropout_layer_1': 0.007184660164183019,
260 |          'classifier:DeepFeedNet:dropout_output': 0.35789769788034004,
261 |          'classifier:DeepFeedNet:lambda2': 0.008162829194808478,
262 |          'classifier:DeepFeedNet:learning_rate': 0.01,
263 |          'classifier:DeepFeedNet:num_layers': 'c',
264 |          'classifier:DeepFeedNet:num_units_layer_1': 3376,
265 |          'classifier:DeepFeedNet:number_updates': 2868,
266 |          'classifier:DeepFeedNet:solver': 'smorm3s',
267 |          'classifier:DeepFeedNet:std_layer_1': 0.0010604662105437909,
268 |          'classifier:__choice__': 'DeepFeedNet',
269 |          'imputation:strategy': 'median',
270 |          'one_hot_encoding:use_minimum_fraction': 'False',
271 |          'preprocessor:truncatedSVD:target_dim': 199,
272 |          'preprocessor:__choice__': 'truncatedSVD',
273 |          'rescaling:__choice__': 'normalize'})),
274 |      (0.020000, SimpleClassificationPipeline(configuration={
275 |          'balancing:strategy': 'none',
276 |          'classifier:DeepFeedNet:activation': 'relu',
277 |          'classifier:DeepFeedNet:batch_size': 2086,
278 |          'classifier:DeepFeedNet:dropout_layer_1': 0.15565773821145037,
279 |          'classifier:DeepFeedNet:dropout_output': 0.22142344211272239,
280 |          'classifier:DeepFeedNet:lambda2': 1.7925329564209397e-06,
281 |          'classifier:DeepFeedNet:learning_rate': 0.01,
282 |          'classifier:DeepFeedNet:num_layers': 'c',
283 |          'classifier:DeepFeedNet:num_units_layer_1': 3317,
284 |          'classifier:DeepFeedNet:number_updates': 711,
285 |          'classifier:DeepFeedNet:solver': 'smorm3s',
286 |          'classifier:DeepFeedNet:std_layer_1': 0.0012484056182083289,
287 |          'classifier:__choice__': 'DeepFeedNet',
288 |          'imputation:strategy': 'most_frequent',
289 |          'one_hot_encoding:minimum_fraction': 0.030925614928477674,
290 |          'one_hot_encoding:use_minimum_fraction': 'True',
291 |          'preprocessor:truncatedSVD:target_dim': 159,
292 |          'preprocessor:__choice__': 'truncatedSVD',
293 |          'rescaling:__choice__': 'min/max'})),
294 |      ]
295 | 
# Module-level accumulators, each bound to its own empty list.
# predictions_valid / predictions_test receive one weighted prediction per
# ensemble member further down; targets is not referenced again in the
# remainder of this script.
targets, predictions = [], []
predictions_valid, predictions_test = [], []
300 | 
301 | 
def fit_and_predict(estimator, weight, X, y):
    """Fit one ensemble member and return its weighted class probabilities.

    The estimator is fitted on copies of ``X`` and ``y`` and then asked for
    ``predict_proba`` on the module-level ``X_valid`` and ``X_test`` sets
    (each passed as a copy).  Every result is multiplied by ``weight``.

    Returns a ``(valid, test)`` tuple.  If any step of the fit/predict
    sequence raises, the exception and ``estimator.configuration`` are
    printed and ``(None, None)`` is returned instead of propagating.
    """
    def weighted_probabilities(features):
        # Predict on a copy and scale by this member's ensemble weight.
        return estimator.predict_proba(features.copy()) * weight

    try:
        estimator.fit(X.copy(), y.copy())
        return weighted_probabilities(X_valid), weighted_probabilities(X_test)
    except Exception as error:
        print(error)
        print(estimator.configuration)
        return None, None
313 | 
314 | 
# Fit every ensemble member in parallel.  Each job returns its class
# probabilities for the validation and test sets, already multiplied by the
# member's weight, or ``(None, None)`` when fit_and_predict caught an
# exception for that member.
all_predictions = Parallel(n_jobs=-1)(
    delayed(fit_and_predict)(estimator, weight, X, y)
    for weight, estimator in choices)

# A ``None`` entry cannot be summed with real prediction arrays, so a single
# failed member used to abort the whole run in np.sum below.  Keep only the
# members that succeeded and track how much ensemble weight they carry.
total_weight = 0.0
surviving_weight = 0.0
for (weight, _), (pv, pt) in zip(choices, all_predictions):
    total_weight += weight
    if pv is None or pt is None:
        continue
    surviving_weight += weight
    predictions_valid.append(pv)
    predictions_test.append(pt)

if not predictions_valid:
    raise RuntimeError('No ensemble member produced predictions for %s'
                       % dataset)

# Only rescale when a member was dropped, so the output of a fully successful
# run is numerically identical to before.  Rescaling restores the original
# total weight, keeping the blend a weighted average of the survivors.
n_failed = len(choices) - len(predictions_valid)
rescale = None
if n_failed and surviving_weight > 0:
    rescale = total_weight / surviving_weight
    print('%d of %d ensemble members failed; rescaling the remaining '
          'predictions by %f' % (n_failed, len(choices), rescale))

# Output the predictions
for name, member_predictions in [('valid', predictions_valid),
                                 ('test', predictions_test)]:
    # Members are pre-weighted, so the ensemble is their element-wise sum.
    ensemble = np.sum(np.array(member_predictions), axis=0)
    if rescale is not None:
        ensemble = ensemble * rescale
    ensemble = ensemble.astype(np.float32)

    filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
    np.savetxt(filepath, ensemble, delimiter=' ', fmt='%.4e')
331 | 


--------------------------------------------------------------------------------
/004_yolanda.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | from joblib import Parallel, delayed
  5 | import numpy as np
  6 | 
  7 | import autosklearn
  8 | import autosklearn.data
  9 | import autosklearn.data.competition_data_manager
 10 | from autosklearn.pipeline.regression import SimpleRegressionPipeline
 11 | 
 12 | parser = argparse.ArgumentParser()
 13 | parser.add_argument('input')
 14 | parser.add_argument('output')
 15 | args = parser.parse_args()
 16 | 
 17 | input = args.input
 18 | dataset = 'yolanda'
 19 | output = args.output
 20 | 
 21 | path = os.path.join(input, dataset)
 22 | D = autosklearn.data.competition_data_manager.CompetitionDataManager(path)
 23 | X = D.data['X_train']
 24 | y = D.data['Y_train']
 25 | X_valid = D.data['X_valid']
 26 | X_test = D.data['X_test']
 27 | 
 28 | # Use this commit from the Lasagne master branch:
 29 | # 24c9ed2ffc25504c3b0df4598afb1e63fdd59eee
 30 | # https://github.com/Lasagne/Lasagne/commit/24c9ed2ffc25504c3b0df4598afb1e63fdd59eee
 31 | # Copy the file RegDeepNet into autosklearn.pipeline.components.regression
 32 | # Copy the file FeedForwardNet into autosklearn.pipeline.implementations
 33 | 
 34 | choices = \
 35 |     [(0.360000, SimpleRegressionPipeline(configuration={
 36 |         'imputation:strategy': 'mean',
 37 |         'one_hot_encoding:minimum_fraction': 0.049682918006307676,
 38 |         'one_hot_encoding:use_minimum_fraction': 'True',
 39 |         'preprocessor:__choice__': 'no_preprocessing',
 40 |         'regressor:RegDeepNet:activation': 'tanh',
 41 |         'regressor:RegDeepNet:batch_size': 1865,
 42 |         'regressor:RegDeepNet:dropout_layer_1': 0.017462492577406473,
 43 |         'regressor:RegDeepNet:dropout_layer_2': 0.048354205627225436,
 44 |         'regressor:RegDeepNet:dropout_output': 0.00962149073006804,
 45 |         'regressor:RegDeepNet:lambda2': 1.0282444549550921e-05,
 46 |         'regressor:RegDeepNet:learning_rate': 0.001,
 47 |         'regressor:RegDeepNet:num_layers': 'd',
 48 |         'regressor:RegDeepNet:num_units_layer_1': 2615,
 49 |         'regressor:RegDeepNet:num_units_layer_2': 252,
 50 |         'regressor:RegDeepNet:number_updates': 3225,
 51 |         'regressor:RegDeepNet:solver': 'smorm3s',
 52 |         'regressor:RegDeepNet:std_layer_1': 0.006861129306844183,
 53 |         'regressor:RegDeepNet:std_layer_2': 0.002395977520245193,
 54 |         'regressor:__choice__': 'RegDeepNet',
 55 |         'rescaling:__choice__': 'standardize'})),
 56 |      (0.320000, SimpleRegressionPipeline(configuration={
 57 |          'imputation:strategy': 'mean',
 58 |          'one_hot_encoding:minimum_fraction': 0.05112532429613385,
 59 |          'one_hot_encoding:use_minimum_fraction': 'True',
 60 |          'preprocessor:__choice__': 'no_preprocessing',
 61 |          'regressor:RegDeepNet:activation': 'sigmoid',
 62 |          'regressor:RegDeepNet:batch_size': 1840,
 63 |          'regressor:RegDeepNet:dropout_layer_1': 0.15186663743978646,
 64 |          'regressor:RegDeepNet:dropout_layer_2': 0.11387781420379316,
 65 |          'regressor:RegDeepNet:dropout_layer_3': 0.19220971946536616,
 66 |          'regressor:RegDeepNet:dropout_output': 0.5509953660515314,
 67 |          'regressor:RegDeepNet:lambda2': 2.3655442216865217e-06,
 68 |          'regressor:RegDeepNet:learning_rate': 0.1,
 69 |          'regressor:RegDeepNet:num_layers': 'e',
 70 |          'regressor:RegDeepNet:num_units_layer_1': 173,
 71 |          'regressor:RegDeepNet:num_units_layer_2': 690,
 72 |          'regressor:RegDeepNet:num_units_layer_3': 2761,
 73 |          'regressor:RegDeepNet:number_updates': 4173,
 74 |          'regressor:RegDeepNet:solver': 'smorm3s',
 75 |          'regressor:RegDeepNet:std_layer_1': 0.006483588902887654,
 76 |          'regressor:RegDeepNet:std_layer_2': 0.006696161430555593,
 77 |          'regressor:RegDeepNet:std_layer_3': 0.0030798462419321746,
 78 |          'regressor:__choice__': 'RegDeepNet',
 79 |          'rescaling:__choice__': 'standardize'})),
 80 |      (0.160000, SimpleRegressionPipeline(configuration={
 81 |          'imputation:strategy': 'mean',
 82 |          'one_hot_encoding:minimum_fraction': 0.00044746581915706805,
 83 |          'one_hot_encoding:use_minimum_fraction': 'True',
 84 |          'preprocessor:__choice__': 'no_preprocessing',
 85 |          'regressor:RegDeepNet:activation': 'tanh',
 86 |          'regressor:RegDeepNet:batch_size': 1867,
 87 |          'regressor:RegDeepNet:dropout_layer_1': 0.0044842379741719856,
 88 |          'regressor:RegDeepNet:dropout_output': 0.029970881815609602,
 89 |          'regressor:RegDeepNet:lambda2': 3.922344043854585e-05,
 90 |          'regressor:RegDeepNet:learning_rate': 0.001,
 91 |          'regressor:RegDeepNet:num_layers': 'c',
 92 |          'regressor:RegDeepNet:num_units_layer_1': 2775,
 93 |          'regressor:RegDeepNet:number_updates': 4672,
 94 |          'regressor:RegDeepNet:solver': 'smorm3s',
 95 |          'regressor:RegDeepNet:std_layer_1': 0.0011091871005401157,
 96 |          'regressor:__choice__': 'RegDeepNet',
 97 |          'rescaling:__choice__': 'standardize'})),
 98 |      (0.100000, SimpleRegressionPipeline(configuration={
 99 |          'imputation:strategy': 'mean',
100 |          'one_hot_encoding:minimum_fraction': 0.0006151267694526832,
101 |          'one_hot_encoding:use_minimum_fraction': 'True',
102 |          'preprocessor:__choice__': 'no_preprocessing',
103 |          'regressor:RegDeepNet:activation': 'tanh',
104 |          'regressor:RegDeepNet:batch_size': 1293,
105 |          'regressor:RegDeepNet:dropout_layer_1': 0.024322298790122678,
106 |          'regressor:RegDeepNet:dropout_layer_2': 0.4831886801640319,
107 |          'regressor:RegDeepNet:dropout_layer_3': 0.7303058944461246,
108 |          'regressor:RegDeepNet:dropout_output': 0.43112081941910074,
109 |          'regressor:RegDeepNet:lambda2': 4.561723820100022e-06,
110 |          'regressor:RegDeepNet:learning_rate': 0.001,
111 |          'regressor:RegDeepNet:num_layers': 'e',
112 |          'regressor:RegDeepNet:num_units_layer_1': 2999,
113 |          'regressor:RegDeepNet:num_units_layer_2': 1630,
114 |          'regressor:RegDeepNet:num_units_layer_3': 897,
115 |          'regressor:RegDeepNet:number_updates': 4471,
116 |          'regressor:RegDeepNet:solver': 'smorm3s',
117 |          'regressor:RegDeepNet:std_layer_1': 0.0013646791717249367,
118 |          'regressor:RegDeepNet:std_layer_2': 0.012431732856634247,
119 |          'regressor:RegDeepNet:std_layer_3': 0.002351992156794049,
120 |          'regressor:__choice__': 'RegDeepNet',
121 |          'rescaling:__choice__': 'standardize'})),
122 |      (0.060000, SimpleRegressionPipeline(configuration={
123 |          'imputation:strategy': 'mean',
124 |          'one_hot_encoding:minimum_fraction': 0.006283026157824821,
125 |          'one_hot_encoding:use_minimum_fraction': 'True',
126 |          'preprocessor:__choice__': 'no_preprocessing',
127 |          'regressor:RegDeepNet:activation': 'tanh',
128 |          'regressor:RegDeepNet:batch_size': 1802,
129 |          'regressor:RegDeepNet:dropout_layer_1': 0.01257793094940521,
130 |          'regressor:RegDeepNet:dropout_output': 0.023821950297696383,
131 |          'regressor:RegDeepNet:lambda2': 8.078248563082777e-05,
132 |          'regressor:RegDeepNet:learning_rate': 0.001,
133 |          'regressor:RegDeepNet:num_layers': 'c',
134 |          'regressor:RegDeepNet:num_units_layer_1': 3293,
135 |          'regressor:RegDeepNet:number_updates': 4842,
136 |          'regressor:RegDeepNet:solver': 'smorm3s',
137 |          'regressor:RegDeepNet:std_layer_1': 0.001130906938022124,
138 |          'regressor:__choice__': 'RegDeepNet',
139 |          'rescaling:__choice__': 'standardize'})),
140 |      ]
141 | 
# Accumulators used by the ensemble script.  In the code that follows only
# ``predictions_valid`` and ``predictions_test`` are filled (one entry per
# ensemble member); ``targets`` is not touched again and ``predictions`` is
# rebound as the loop variable of the output loop further down.
targets = []
predictions = []
predictions_valid = []
predictions_test = []
146 | 
147 | 
def fit_and_predict(estimator, weight, X, y):
    """Fit one ensemble member and return its weighted predictions.

    The estimator is trained on copies of ``X`` and ``y`` and then asked to
    predict on copies of the module-level ``X_valid`` and ``X_test`` arrays.
    Both prediction arrays are multiplied by ``weight`` before being returned.

    Any exception raised while fitting or predicting is printed together with
    the estimator's configuration, and ``(None, None)`` is returned instead of
    propagating the error.

    :param estimator: object exposing ``fit``, ``predict`` and
        ``configuration``.
    :param weight: scalar ensemble weight applied to the predictions.
    :param X: training features (must support ``copy()``).
    :param y: training targets (must support ``copy()``).
    :return: tuple ``(weighted_valid_predictions, weighted_test_predictions)``
        or ``(None, None)`` on failure.
    """
    try:
        estimator.fit(X.copy(), y.copy())
        weighted_valid = estimator.predict(X_valid.copy()) * weight
        weighted_test = estimator.predict(X_test.copy()) * weight
    except Exception as error:
        # Best effort: report the failing configuration and signal the
        # failure to the caller through a pair of ``None`` values.
        print(error)
        print(estimator.configuration)
        return None, None
    return weighted_valid, weighted_test
159 | 
160 | 
# Fit every (weight, estimator) pair of the ensemble in parallel.  Each job
# returns the already weighted validation and test predictions.
all_predictions = Parallel(n_jobs=-1)(
    delayed(fit_and_predict)(estimator, weight, X, y)
    for weight, estimator in choices)
predictions_valid.extend([pv for pv, _ in all_predictions])
predictions_test.extend([pt for _, pt in all_predictions])

# Sum the weighted member predictions of each split and write them to
# '<dataset>_<split>_000.predict' inside the output directory.
for name, predictions in (('valid', predictions_valid),
                          ('test', predictions_test)):
    stacked = np.array(predictions)
    predictions = np.sum(stacked, axis=0).astype(np.float32)

    filepath = os.path.join(output, '%s_%s_000.predict' % (dataset, name))
    np.savetxt(filepath, predictions, delimiter=' ', fmt='%.4e')
177 | 


--------------------------------------------------------------------------------
/DeepFeedNet.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy.sparse as sp
  3 | 
  4 | from ConfigSpace.configuration_space import ConfigurationSpace
  5 | from ConfigSpace.conditions import EqualsCondition, InCondition
  6 | from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \
  7 |     UniformIntegerHyperparameter, CategoricalHyperparameter, Constant
  8 | 
  9 | from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm
 10 | from autosklearn.pipeline.constants import *
 11 | 
 12 | 
 13 | class DeepFeedNet(AutoSklearnClassificationAlgorithm):
 14 | 
 15 |     def __init__(self, number_updates, batch_size, num_layers, num_units_layer_1,
 16 |                  dropout_layer_1, dropout_output, std_layer_1,
 17 |                  learning_rate, solver, lambda2, activation,
 18 |                  num_units_layer_2=10, num_units_layer_3=10, num_units_layer_4=10,
 19 |                  dropout_layer_2=0.5, dropout_layer_3=0.5, dropout_layer_4=0.5,
 20 |                  std_layer_2=0.005, std_layer_3=0.005, std_layer_4=0.005,
 21 |                  momentum=0.99, beta1=0.9, beta2=0.9, rho=0.95,
 22 |                  lr_policy='fixed', gamma=0.01, power=1.0, epoch_step=2,
 23 |                  random_state=None):
 24 |         self.number_updates = number_updates
 25 |         self.batch_size = batch_size
 26 |         # Hacky implementation of condition on number of layers
 27 |         self.num_layers = ord(num_layers) - ord('a')
 28 |         self.dropout_output = dropout_output
 29 |         self.learning_rate = learning_rate
 30 |         self.lr_policy = lr_policy
 31 |         self.lambda2 = lambda2
 32 |         self.momentum = momentum
 33 |         # Added 1-beta due to change in config space
 34 |         self.beta1 = 1-beta1
 35 |         self.beta2 = 1-beta2
 36 |         self.rho = rho
 37 |         self.solver = solver
 38 |         self.activation = activation
 39 |         self.gamma = gamma
 40 |         self.power = power
 41 |         self.epoch_step = epoch_step
 42 | 
 43 |         # Empty features and shape
 44 |         self.n_features = None
 45 |         self.input_shape = None
 46 |         self.m_issparse = False
 47 |         self.m_isbinary = False
 48 |         self.m_ismultilabel = False
 49 | 
 50 |         # To avoid eval call. Could be done with **karws
 51 |         args = locals()
 52 | 
 53 |         self.num_units_per_layer = []
 54 |         self.dropout_per_layer = []
 55 |         self.std_per_layer = []
 56 |         for i in range(1, self.num_layers):
 57 |             self.num_units_per_layer.append(int(args.get("num_units_layer_" + str(i))))
 58 |             self.dropout_per_layer.append(float(args.get("dropout_layer_" + str(i))))
 59 |             self.std_per_layer.append(float(args.get("std_layer_" + str(i))))
 60 |         self.estimator = None
 61 | 
 62 |     def _prefit(self, X, y):
 63 |         self.batch_size = int(self.batch_size)
 64 |         self.n_features = X.shape[1]
 65 |         self.input_shape = (self.batch_size, self.n_features)
 66 | 
 67 |         assert len(self.num_units_per_layer) == self.num_layers - 1,\
 68 |             "Number of created layers is different than actual layers"
 69 |         assert len(self.dropout_per_layer) == self.num_layers - 1,\
 70 |             "Number of created layers is different than actual layers"
 71 | 
 72 |         # TODO: Better if statement
 73 |         if len(y.shape) == 2 and y.shape[1] > 1:  # Multilabel
 74 |             self.m_ismultilabel = True
 75 |             self.num_output_units = y.shape[1]
 76 |         else:
 77 |             number_classes = len(np.unique(y.astype(int)))
 78 |             if number_classes == 2:  # Make it binary
 79 |                 self.m_isbinary = True
 80 |                 self.num_output_units = 1
 81 |                 if len(y.shape) == 1:
 82 |                     y = y[:, np.newaxis]
 83 |             else:
 84 |                 self.num_output_units = number_classes
 85 | 
 86 |         self.m_issparse = sp.issparse(X)
 87 | 
 88 |         return X, y
 89 | 
 90 |     def fit(self, X, y):
 91 | 
 92 |         Xf, yf = self._prefit(X, y)
 93 | 
 94 |         epoch = (self.number_updates * self.batch_size)//X.shape[0]
 95 |         number_epochs = min(max(2, epoch), 50)  # Capping of epochs
 96 | 
 97 |         from ...implementations import FeedForwardNet
 98 |         self.estimator = FeedForwardNet.FeedForwardNet(batch_size=self.batch_size,
 99 |                                                        input_shape=self.input_shape,
100 |                                                        num_layers=self.num_layers,
101 |                                                        num_units_per_layer=self.num_units_per_layer,
102 |                                                        dropout_per_layer=self.dropout_per_layer,
103 |                                                        std_per_layer=self.std_per_layer,
104 |                                                        num_output_units=self.num_output_units,
105 |                                                        dropout_output=self.dropout_output,
106 |                                                        learning_rate=self.learning_rate,
107 |                                                        lr_policy=self.lr_policy,
108 |                                                        lambda2=self.lambda2,
109 |                                                        momentum=self.momentum,
110 |                                                        beta1=self.beta1,
111 |                                                        beta2=self.beta2,
112 |                                                        rho=self.rho,
113 |                                                        solver=self.solver,
114 |                                                        activation=self.activation,
115 |                                                        num_epochs=number_epochs,
116 |                                                        gamma=self.gamma,
117 |                                                        power=self.power,
118 |                                                        epoch_step=self.epoch_step,
119 |                                                        is_sparse=self.m_issparse,
120 |                                                        is_binary=self.m_isbinary,
121 |                                                        is_multilabel=self.m_ismultilabel)
122 |         self.estimator.fit(Xf, yf)
123 |         return self
124 | 
125 |     def predict(self, X):
126 |         if self.estimator is None:
127 |             raise NotImplementedError
128 |         return self.estimator.predict(X, self.m_issparse)
129 | 
130 |     def predict_proba(self, X):
131 |         if self.estimator is None:
132 |             raise NotImplementedError()
133 |         return self.estimator.predict_proba(X, self.m_issparse)
134 | 
135 |     @staticmethod
136 |     def get_properties(dataset_properties=None):
137 |         return {'shortname': 'feed_nn',
138 |                 'name': 'Feed Forward Neural Network',
139 |                 'handles_regression': False,
140 |                 'handles_classification': True,
141 |                 'handles_multiclass': True,
142 |                 'handles_multilabel': True,
143 |                 'is_deterministic': True,
144 |                 'input': (DENSE, SPARSE, UNSIGNED_DATA),
145 |                 'output': (PREDICTIONS,)}
146 | 
147 |     @staticmethod
148 |     def get_hyperparameter_search_space(dataset_properties=None):
149 |         # GPUTRACK: Based on http://svail.github.io/rnn_perf/
150 |         # We make batch size and number of units multiples of 64
151 | 
152 |         # Hacky way to condition layers params based on the number of layers
153 |         # GPUTRACK: Reduced number of layers
154 |         # 'c'=1, 'd'=2, 'e'=3 ,'f'=4 + output_layer
155 |         # layer_choices = [chr(i) for i in xrange(ord('c'), ord('e'))]
156 | 
157 |         layer_choices = ['c', 'd', 'e']
158 | 
159 |         batch_size = UniformIntegerHyperparameter("batch_size",
160 |                                                   32, 4096,
161 |                                                   log=True,
162 |                                                   default=32)
163 | 
164 |         number_updates = UniformIntegerHyperparameter("number_updates",
165 |                                                       200, 3500,
166 |                                                       log=True,
167 |                                                       default=200)
168 | 
169 |         num_layers = CategoricalHyperparameter("num_layers",
170 |                                                choices=layer_choices,
171 |                                                default='c')
172 | 
173 |         num_units_layer_1 = UniformIntegerHyperparameter("num_units_layer_1",
174 |                                                          64, 4096,
175 |                                                          log=True,
176 |                                                          default=256)
177 | 
178 |         num_units_layer_2 = UniformIntegerHyperparameter("num_units_layer_2",
179 |                                                          64, 4096,
180 |                                                          log=True,
181 |                                                          default=128)
182 | 
183 |         num_units_layer_3 = UniformIntegerHyperparameter("num_units_layer_3",
184 |                                                          64, 4096,
185 |                                                          log=True,
186 |                                                          default=128)
187 | 
188 |         dropout_layer_1 = UniformFloatHyperparameter("dropout_layer_1",
189 |                                                      0.0, 0.99,
190 |                                                      default=0.5)
191 | 
192 |         dropout_layer_2 = UniformFloatHyperparameter("dropout_layer_2",
193 |                                                      0.0, 0.99,
194 |                                                      default=0.5)
195 | 
196 |         dropout_layer_3 = UniformFloatHyperparameter("dropout_layer_3",
197 |                                                      0.0, 0.99,
198 |                                                      default=0.5)
199 | 
200 |         dropout_output = UniformFloatHyperparameter("dropout_output",
201 |                                                     0.0, 0.99,
202 |                                                     default=0.5)
203 | 
204 |         lr = CategoricalHyperparameter("learning_rate",
205 |                                        choices=[1e-1, 1e-2, 1e-3, 1e-4],
206 |                                        default=1e-2)
207 | 
208 |         l2 = UniformFloatHyperparameter("lambda2", 1e-6, 1e-2, log=True,
209 |                                         default=1e-3)
210 | 
211 |         std_layer_1 = UniformFloatHyperparameter("std_layer_1", 0.001, 0.1,
212 |                                                  log=True,
213 |                                                  default=0.005)
214 | 
215 |         std_layer_2 = UniformFloatHyperparameter("std_layer_2", 0.001, 0.1,
216 |                                                  log=True,
217 |                                                  default=0.005)
218 | 
219 |         std_layer_3 = UniformFloatHyperparameter("std_layer_3", 0.001, 0.1,
220 |                                                  log=True,
221 |                                                  default=0.005)
222 | 
223 |         # Using Tobias' adam
224 |         solver = Constant(name="solver", value="smorm3s")
225 |         non_linearities = CategoricalHyperparameter(name='activation',
226 |                                                     choices=['relu', 'tanh'],
227 |                                                     default='relu')
228 | 
229 |         cs = ConfigurationSpace()
230 |         # cs.add_hyperparameter(number_epochs)
231 |         cs.add_hyperparameter(number_updates)
232 |         cs.add_hyperparameter(batch_size)
233 |         cs.add_hyperparameter(num_layers)
234 |         cs.add_hyperparameter(num_units_layer_1)
235 |         cs.add_hyperparameter(num_units_layer_2)
236 |         cs.add_hyperparameter(num_units_layer_3)
237 |         cs.add_hyperparameter(dropout_layer_1)
238 |         cs.add_hyperparameter(dropout_layer_2)
239 |         cs.add_hyperparameter(dropout_layer_3)
240 |         cs.add_hyperparameter(dropout_output)
241 |         cs.add_hyperparameter(std_layer_1)
242 |         cs.add_hyperparameter(std_layer_2)
243 |         cs.add_hyperparameter(std_layer_3)
244 |         cs.add_hyperparameter(lr)
245 |         cs.add_hyperparameter(l2)
246 |         cs.add_hyperparameter(solver)
247 |         cs.add_hyperparameter(non_linearities)
248 | 
249 |         layer_2_condition = InCondition(num_units_layer_2, num_layers,
250 |                                         ['d', 'e'])
251 |         layer_3_condition = InCondition(num_units_layer_3, num_layers,
252 |                                         ['e'])
253 |         cs.add_condition(layer_2_condition)
254 |         cs.add_condition(layer_3_condition)
255 | 
256 |         # Condition dropout parameter on layer choice
257 |         dropout_2_condition = InCondition(dropout_layer_2, num_layers,
258 |                                           ['d', 'e'])
259 |         dropout_3_condition = InCondition(dropout_layer_3, num_layers,
260 |                                           ['e'])
261 |         cs.add_condition(dropout_2_condition)
262 |         cs.add_condition(dropout_3_condition)
263 | 
264 |         # Condition std parameter on layer choice
265 |         std_2_condition = InCondition(std_layer_2, num_layers, ['d', 'e'])
266 |         std_3_condition = InCondition(std_layer_3, num_layers, ['e'])
267 |         cs.add_condition(std_2_condition)
268 |         cs.add_condition(std_3_condition)
269 | 
270 |         return cs
271 | 


--------------------------------------------------------------------------------
/FeedForwardNet.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Created on Jul 22, 2015
  3 | Modified on Apr 21, 2016
  4 | 
  5 | @author: Aaron Klein
  6 | @modified: Hector Mendoza
  7 | """
  8 | import numpy as np
  9 | import theano
 10 | import theano.tensor as T
 11 | import theano.sparse as S
 12 | import lasagne
 13 | 
# When true, FeedForwardNet.__init__ prints progress information while it
# builds the network and compiles the theano functions.
DEBUG = True
 15 | 
 16 | 
 17 | def sharedX(X, dtype=theano.config.floatX, name=None):
 18 |     return theano.shared(np.asarray(X, dtype=dtype), name=name)
 19 | 
 20 | 
 21 | def smorms3(cost, params, learning_rate=1e-3, eps=1e-16, gather=False):
 22 |     updates = []
 23 |     optim_params = []
 24 |     grads = T.grad(cost, params)
 25 | 
 26 |     for p, grad in zip(params, grads):
 27 |         mem = sharedX(p.get_value() * 0. + 1.)
 28 |         g = sharedX(p.get_value() * 0.)
 29 |         g2 = sharedX(p.get_value() * 0.)
 30 |         if gather:
 31 |             optim_params.append(mem)
 32 |             optim_params.append(g)
 33 |             optim_params.append(g2)
 34 | 
 35 |         r_t = 1. / (mem + 1)
 36 |         g_t = (1 - r_t) * g + r_t * grad
 37 |         g2_t = (1 - r_t) * g2 + r_t * grad**2
 38 |         p_t = p - grad * T.minimum(learning_rate, g_t * g_t / (g2_t + eps)) / \
 39 |                   (T.sqrt(g2_t + eps) + eps)
 40 |         mem_t = 1 + mem * (1 - g_t * g_t / (g2_t + eps))
 41 | 
 42 |         updates.append((g, g_t))
 43 |         updates.append((g2, g2_t))
 44 |         updates.append((p, p_t))
 45 |         updates.append((mem, mem_t))
 46 | 
 47 |     return updates
 48 | 
 49 | 
 50 | def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
 51 |     assert inputs.shape[0] == targets.shape[0],\
 52 |            "The number of training points is not the same"
 53 |     if shuffle:
 54 |         indices = np.arange(inputs.shape[0])
 55 |         np.random.shuffle(indices)
 56 |     for start_idx in range(0, inputs.shape[0] - batchsize + 1, batchsize):
 57 |         if shuffle:
 58 |             excerpt = indices[start_idx:start_idx + batchsize]
 59 |         else:
 60 |             excerpt = slice(start_idx, start_idx + batchsize)
 61 |         yield inputs[excerpt], targets[excerpt]
 62 | 
 63 | 
 64 | class FeedForwardNet(object):
 65 |     def __init__(self, input_shape=(100, 28*28),
 66 |                  batch_size=100, num_layers=4, num_units_per_layer=(10, 10, 10),
 67 |                  dropout_per_layer=(0.5, 0.5, 0.5), std_per_layer=(0.005, 0.005, 0.005),
 68 |                  num_output_units=2, dropout_output=0.5, learning_rate=0.01,
 69 |                  lambda2=1e-4, momentum=0.9, beta1=0.9, beta2=0.9,
 70 |                  rho=0.95, solver="adam", num_epochs=2, activation='relu',
 71 |                  lr_policy="fixed", gamma=0.01, power=1.0, epoch_step=1,
 72 |                  is_sparse=False, is_binary=False, is_regression=False, is_multilabel=False):
 73 | 
 74 |         self.batch_size = batch_size
 75 |         self.input_shape = input_shape
 76 |         self.num_layers = num_layers
 77 |         self.num_units_per_layer = num_units_per_layer
 78 |         self.dropout_per_layer = np.asarray(dropout_per_layer, dtype=theano.config.floatX)
 79 |         self.num_output_units = num_output_units
 80 |         self.dropout_output = T.cast(dropout_output, dtype=theano.config.floatX)
 81 |         self.std_per_layer = np.asarray(std_per_layer, dtype=theano.config.floatX)
 82 |         self.momentum = T.cast(momentum, dtype=theano.config.floatX)
 83 |         self.learning_rate = np.asarray(learning_rate, dtype=theano.config.floatX)
 84 |         self.lambda2 = T.cast(lambda2, dtype=theano.config.floatX)
 85 |         self.beta1 = T.cast(beta1, dtype=theano.config.floatX)
 86 |         self.beta2 = T.cast(beta2, dtype=theano.config.floatX)
 87 |         self.rho = T.cast(rho, dtype=theano.config.floatX)
 88 |         # self.number_updates = number_updates
 89 |         self.num_epochs = num_epochs
 90 |         self.lr_policy = lr_policy
 91 |         self.gamma = np.asarray(gamma, dtype=theano.config.floatX)
 92 |         if power > 1.0:
 93 |             print('hyperparameter must be between 0 and 1')
 94 |             self.power = np.asarray(1.0, dtype=theano.config.floatX)
 95 |         else:
 96 |             self.power = np.asarray(power, dtype=theano.config.floatX)
 97 |         self.epoch_step = np.asarray(epoch_step, dtype=theano.config.floatX)
 98 |         self.is_binary = is_binary
 99 |         self.is_regression = is_regression
100 |         self.is_multilabel = is_multilabel
101 |         self.is_sparse = is_sparse
102 |         self.solver = solver
103 |         self.activation = activation
104 | 
105 |         if is_sparse:
106 |             input_var = S.csr_matrix('inputs', dtype=theano.config.floatX)
107 |         else:
108 |             input_var = T.matrix('inputs')
109 | 
110 |         if self.is_binary or self.is_multilabel or self.is_regression:
111 |             target_var = T.matrix('targets')
112 |         else:
113 |             target_var = T.ivector('targets')
114 | 
115 |         if DEBUG:
116 |             if self.is_binary:
117 |                 print("... using binary loss")
118 |             if self.is_multilabel:
119 |                 print("... using multilabel prediction")
120 |             if self.is_regression:
121 |                 print("... using regression loss")
122 |             print("... building network")
123 |             print input_shape
124 |             print("... with number of epochs")
125 |             print(num_epochs)
126 | 
127 |         self.network = lasagne.layers.InputLayer(shape=input_shape,
128 |                                                  input_var=input_var)
129 | 
130 |         # Choose hidden activation function
131 |         if self.is_binary or self.is_multilabel or self.is_regression:
132 |             activation_function = self.binary_activation.get(self.activation,
133 |                                                              lasagne.nonlinearities.tanh)
134 |         else:
135 |             activation_function = self.multiclass_activation.get(self.activation,
136 |                                                                  lasagne.nonlinearities.rectify)
137 | 
138 |         # Define each layer
139 |         for i in range(num_layers - 1):
140 |             self.network = lasagne.layers.DenseLayer(
141 |                  lasagne.layers.dropout(self.network,
142 |                                         p=self.dropout_per_layer[i]),
143 |                  num_units=self.num_units_per_layer[i],
144 |                  W=lasagne.init.GlorotNormal(gain=1.0),
145 |                  b=lasagne.init.Constant(val=0.0),
146 |                  nonlinearity=activation_function)
147 | 
148 |         # Define output layer and nonlinearity of last layer
149 |         if self.is_regression:
150 |             output_activation = lasagne.nonlinearities.linear
151 |         elif self.is_binary or self.is_multilabel:
152 |             output_activation = lasagne.nonlinearities.sigmoid
153 |         else:
154 |             output_activation = lasagne.nonlinearities.softmax
155 | 
156 |         self.network = lasagne.layers.DenseLayer(
157 |                  lasagne.layers.dropout(self.network,
158 |                                         p=self.dropout_output),
159 |                  num_units=self.num_output_units,
160 |                  W=lasagne.init.GlorotNormal(),
161 |                  b=lasagne.init.Constant(),
162 |                  nonlinearity=output_activation)
163 | 
164 |         prediction = lasagne.layers.get_output(self.network)
165 | 
166 |         if self.is_regression:
167 |             loss_function = lasagne.objectives.squared_error
168 |         elif self.is_binary or self.is_multilabel:
169 |             loss_function = lasagne.objectives.binary_crossentropy
170 |         else:
171 |             loss_function = lasagne.objectives.categorical_crossentropy
172 | 
173 |         loss = loss_function(prediction, target_var)
174 | 
175 |         # Aggregate loss mean function with l2 Regularization on all layers' params
176 |         if self.is_binary or self.is_multilabel:
177 |             loss = T.sum(loss, dtype=theano.config.floatX)
178 |         else:
179 |             loss = T.mean(loss, dtype=theano.config.floatX)
180 |         l2_penalty = self.lambda2 * lasagne.regularization.regularize_network_params(
181 |             self.network, lasagne.regularization.l2)
182 |         loss += l2_penalty
183 |         params = lasagne.layers.get_all_params(self.network, trainable=True)
184 | 
185 |         # Create the symbolic scalar lr for loss & updates function
186 |         lr_scalar = T.scalar('lr', dtype=theano.config.floatX)
187 | 
188 |         if solver == "nesterov":
189 |             updates = lasagne.updates.nesterov_momentum(loss, params,
190 |                                                         learning_rate=lr_scalar,
191 |                                                         momentum=self.momentum)
192 |         elif solver == "adam":
193 |             updates = lasagne.updates.adam(loss, params,
194 |                                            learning_rate=lr_scalar,
195 |                                            beta1=self.beta1, beta2=self.beta2)
196 |         elif solver == "adadelta":
197 |             updates = lasagne.updates.adadelta(loss, params,
198 |                                                learning_rate=lr_scalar,
199 |                                                rho=self.rho)
200 |         elif solver == "adagrad":
201 |             updates = lasagne.updates.adagrad(loss, params,
202 |                                               learning_rate=lr_scalar)
203 |         elif solver == "sgd":
204 |             updates = lasagne.updates.sgd(loss, params,
205 |                                           learning_rate=lr_scalar)
206 |         elif solver == "momentum":
207 |             updates = lasagne.updates.momentum(loss, params,
208 |                                                learning_rate=lr_scalar,
209 |                                                momentum=self.momentum)
210 |         elif solver == "smorm3s":
211 |             updates = smorms3(loss, params,
212 |                               learning_rate=lr_scalar)
213 |         else:
214 |             updates = lasagne.updates.sgd(loss, params,
215 |                                           learning_rate=lr_scalar)
216 | 
217 |         if DEBUG:
218 |             print("... compiling theano functions")
219 |         self.train_fn = theano.function([input_var, target_var, lr_scalar],
220 |                                         loss,
221 |                                         updates=updates,
222 |                                         allow_input_downcast=True,
223 |                                         profile=False,
224 |                                         on_unused_input='warn',
225 |                                         name='train_fn')
226 |         if DEBUG:
227 |             print('... compiling update function')
228 |         self.update_function = self._policy_function()
229 | 
230 |     def _policy_function(self):
231 |         epoch, gm, powr, step = T.scalars('epoch', 'gm', 'powr', 'step')
232 |         if self.lr_policy == 'inv':
233 |             decay = T.power(1.0+gm*epoch, -powr)
234 |         elif self.lr_policy == 'exp':
235 |             decay = gm ** epoch
236 |         elif self.lr_policy == 'step':
237 |             decay = T.switch(T.eq(T.mod_check(epoch, step), 0.0),
238 |                              T.power(gm, T.floor_div(epoch, step)),
239 |                              1.0)
240 |         elif self.lr_policy == 'fixed':
241 |             decay = T.constant(1.0, name='fixed', dtype=theano.config.floatX)
242 | 
243 |         return theano.function([gm, epoch, powr, step],
244 |                                decay,
245 |                                allow_input_downcast=True,
246 |                                on_unused_input='ignore',
247 |                                name='update_fn')
248 | 
249 |     def fit(self, X, y):
250 |         if self.batch_size > X.shape[0]:
251 |             self.batch_size = X.shape[0]
252 |             print('One update per epoch batch size')
253 | 
254 |         if self.is_sparse:
255 |             X = X.astype(np.float32)
256 |         else:
257 |             try:
258 |                 X = np.asarray(X, dtype=theano.config.floatX)
259 |                 y = np.asarray(y, dtype=theano.config.floatX)
260 |             except Exception as E:
261 |                 print('Fit casting error: %s' % E)
262 | 
263 |         for epoch in range(self.num_epochs):
264 |             train_err = 0
265 |             train_batches = 0
266 |             for inputs, targets in iterate_minibatches(X, y, self.batch_size, shuffle=True):
267 |                 train_err += self.train_fn(inputs, targets, self.learning_rate)
268 |                 train_batches += 1
269 |             decay = self.update_function(self.gamma, epoch+1.0,
270 |                                          self.power, self.epoch_step)
271 |             self.learning_rate *= decay
272 |             print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
273 |         return self
274 | 
275 |     def predict(self, X, is_sparse=False):
276 |         predictions = self.predict_proba(X, is_sparse)
277 |         if self.is_multilabel:
278 |             return np.round(predictions)
279 |         elif self.is_regression:
280 |             return predictions
281 |         else:
282 |             return np.argmax(predictions, axis=1)
283 | 
284 |     def predict_proba(self, X, is_sparse=False):
285 |         if is_sparse:
286 |             X = X.astype(np.float32)
287 |             X = S.as_sparse_or_tensor_variable(X)
288 |         else:
289 |             try:
290 |                 X = np.asarray(X, dtype=theano.config.floatX)
291 |             except Exception as E:
292 |                 print('Prediction casting error: %s' % E)
293 | 
294 |         predictions = lasagne.layers.get_output(self.network, X, deterministic=True).eval()
295 |         if self.is_binary:
296 |             return np.append(1.0 - predictions, predictions, axis=1)
297 |         else:
298 |             return predictions
299 | 
    # TODO: Maybe create a utility module for constants
    # Class-level lookup tables mapping an activation name to a Lasagne
    # nonlinearity. Where each table is consulted is not visible in this
    # part of the file -- presumably selected by task type; verify against
    # the constructor.
    multiclass_activation = {
        'softmax': lasagne.nonlinearities.softmax,
        'relu': lasagne.nonlinearities.rectify,
        'leaky': lasagne.nonlinearities.leaky_rectify,
        'very_leaky': lasagne.nonlinearities.very_leaky_rectify,
        'elu': lasagne.nonlinearities.elu,
        'softplus': lasagne.nonlinearities.softplus,
        'linear': lasagne.nonlinearities.linear,
        'scaledTanh': lasagne.nonlinearities.ScaledTanH(scale_in=2./3.,
                                                        scale_out=1.7159)
    }

    # NOTE(review): the tanh entry below is keyed 'tahn', not 'tanh', so a
    # lookup of 'tanh' in this table will not find it -- confirm whether the
    # spelling is intentional before relying on either name.
    binary_activation = {
        'sigmoid': lasagne.nonlinearities.sigmoid,
        'softplus': lasagne.nonlinearities.softplus,
        'tahn': lasagne.nonlinearities.tanh,
        'scaledTanh': lasagne.nonlinearities.ScaledTanH(scale_in=2./3.,
                                                        scale_out=1.7159),
        'elu': lasagne.nonlinearities.elu,
        'relu': lasagne.nonlinearities.rectify,
    }
322 | 
323 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015, automl
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 
25 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Code to reproduce tweakathon submissions of the team aad_freiburg for the Chalearn Automatic Machine Learning Challenge 2015
 2 | 
 3 | # Final 1
 4 | 
 5 | Code runs with ParamSklearn and auto-sklearn as in http://aad.informatik.uni-freiburg.de/downloads/automl_competition_2015_001.zip
 6 | 
 7 | # Final 2
 8 | 
 9 | Code runs with ParamSklearn and auto-sklearn as in http://aad.informatik.uni-freiburg.de/downloads/automl_competition_2015_002.zip
10 | 
11 | # Final 3
12 | 
13 | Code runs with auto-sklearn as in http://aad.informatik.uni-freiburg.de/downloads/automl_competition_2015_003.zip
14 | To obtain predictions for alexis, one has to install Lasagne and Theano.
15 | 
16 | # Final 4
17 | 
18 | Code runs with development branch of auto-sklearn as of commit b76a2b8e51856f7a2f9db53082b6d0f1cb23ed5a (https://github.com/automl/auto-sklearn/commit/b76a2b8e51856f7a2f9db53082b6d0f1cb23ed5a).
19 | To obtain predictions for datasets 4 and 5, one needs Lasagne and Theano, as noted in the respective Python files.
20 | 


--------------------------------------------------------------------------------
/RegDeepNet.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy.sparse as sp
  3 | 
  4 | from ConfigSpace.configuration_space import ConfigurationSpace
  5 | from ConfigSpace.conditions import EqualsCondition, InCondition
  6 | from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \
  7 |     UniformIntegerHyperparameter, CategoricalHyperparameter, Constant
  8 | 
  9 | from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm
 10 | from autosklearn.pipeline.constants import *
 11 | 
 12 | 
 13 | class RegDeepNet(AutoSklearnRegressionAlgorithm):
 14 | 
 15 |     def __init__(self, number_updates, batch_size, num_layers, num_units_layer_1,
 16 |                  dropout_layer_1, dropout_output, std_layer_1,
 17 |                  learning_rate, solver, lambda2, activation,
 18 |                  num_units_layer_2=10, num_units_layer_3=10, num_units_layer_4=10,
 19 |                  num_units_layer_5=10, num_units_layer_6=10,
 20 |                  dropout_layer_2=0.5, dropout_layer_3=0.5, dropout_layer_4=0.5,
 21 |                  dropout_layer_5=0.5, dropout_layer_6=0.5,
 22 |                  std_layer_2=0.005, std_layer_3=0.005, std_layer_4=0.005,
 23 |                  std_layer_5=0.005, std_layer_6=0.005,
 24 |                  momentum=0.99, beta1=0.9, beta2=0.9, rho=0.95,
 25 |                  lr_policy='fixed', gamma=0.01, power=1.0, epoch_step=2,
 26 |                  random_state=None):
 27 |         self.number_updates = number_updates
 28 |         self.batch_size = batch_size
 29 |         # Hacky implementation of condition on number of layers
 30 |         self.num_layers = ord(num_layers) - ord('a')
 31 |         self.dropout_output = dropout_output
 32 |         self.learning_rate = learning_rate
 33 |         self.lr_policy = lr_policy
 34 |         self.lambda2 = lambda2
 35 |         self.momentum = momentum
 36 |         self.beta1 = 1-beta1
 37 |         self.beta2 = 1-beta2
 38 |         self.rho = rho
 39 |         self.solver = solver
 40 |         self.activation = activation
 41 |         self.gamma = gamma
 42 |         self.power = power
 43 |         self.epoch_step = epoch_step
 44 | 
 45 |         # Empty features and shape
 46 |         self.n_features = None
 47 |         self.input_shape = None
 48 |         self.m_issparse = False
 49 |         self.m_isregression = True
 50 | 
 51 |         # To avoid eval call. Could be done with **karws
 52 |         args = locals()
 53 | 
 54 |         self.num_units_per_layer = []
 55 |         self.dropout_per_layer = []
 56 |         self.std_per_layer = []
 57 |         for i in range(1, self.num_layers):
 58 |             self.num_units_per_layer.append(int(args.get("num_units_layer_" + str(i))))
 59 |             self.dropout_per_layer.append(float(args.get("dropout_layer_" + str(i))))
 60 |             self.std_per_layer.append(float(args.get("std_layer_" + str(i))))
 61 |         self.estimator = None
 62 | 
 63 |     def _prefit(self, X, y):
 64 |         self.batch_size = int(self.batch_size)
 65 |         self.n_features = X.shape[1]
 66 |         self.input_shape = (self.batch_size, self.n_features)
 67 | 
 68 |         assert len(self.num_units_per_layer) == self.num_layers - 1,\
 69 |             "Number of created layers is different than actual layers"
 70 |         assert len(self.dropout_per_layer) == self.num_layers - 1,\
 71 |             "Number of created layers is different than actual layers"
 72 | 
 73 |         self.num_output_units = 1  # Regression
 74 |         # Normalize the output - Suggestion on 24.04
 75 |         self.mean_y = np.mean(y)
 76 |         self.std_y = np.std(y)
 77 |         y = (y - self.mean_y) / self.std_y
 78 |         if len(y.shape) == 1:
 79 |             y = y[:, np.newaxis]
 80 | 
 81 |         self.m_issparse = sp.issparse(X)
 82 | 
 83 |         return X, y
 84 | 
 85 |     def fit(self, X, y):
 86 | 
 87 |         Xf, yf = self._prefit(X, y)
 88 | 
 89 |         epoch = (self.number_updates * self.batch_size)//X.shape[0]
 90 |         number_epochs = min(max(2, epoch), 50)  # Cap the max number of possible epochs
 91 | 
 92 |         from ...implementations import FeedForwardNet
 93 |         self.estimator = FeedForwardNet.FeedForwardNet(batch_size=self.batch_size,
 94 |                                                        input_shape=self.input_shape,
 95 |                                                        num_layers=self.num_layers,
 96 |                                                        num_units_per_layer=self.num_units_per_layer,
 97 |                                                        dropout_per_layer=self.dropout_per_layer,
 98 |                                                        std_per_layer=self.std_per_layer,
 99 |                                                        num_output_units=self.num_output_units,
100 |                                                        dropout_output=self.dropout_output,
101 |                                                        learning_rate=self.learning_rate,
102 |                                                        lr_policy=self.lr_policy,
103 |                                                        lambda2=self.lambda2,
104 |                                                        momentum=self.momentum,
105 |                                                        beta1=self.beta1,
106 |                                                        beta2=self.beta2,
107 |                                                        rho=self.rho,
108 |                                                        solver=self.solver,
109 |                                                        activation=self.activation,
110 |                                                        num_epochs=number_epochs,
111 |                                                        gamma=self.gamma,
112 |                                                        power=self.power,
113 |                                                        epoch_step=self.epoch_step,
114 |                                                        is_sparse=self.m_issparse,
115 |                                                        is_binary=False,
116 |                                                        is_regression=self.m_isregression)
117 |         self.estimator.fit(Xf, yf)
118 |         return self
119 | 
120 |     def predict(self, X):
121 |         if self.estimator is None:
122 |             raise NotImplementedError
123 |         preds = self.estimator.predict(X, self. m_issparse)
124 |         return preds * self.std_y + self.mean_y
125 | 
126 |     def predict_proba(self, X):
127 |         if self.estimator is None:
128 |             raise NotImplementedError()
129 |         return self.estimator.predict_proba(X, self.m_issparse)
130 | 
131 |     @staticmethod
132 |     def get_properties(dataset_properties=None):
133 |         return {'shortname': 'feed_nn',
134 |                 'name': 'Feed Forward Neural Network',
135 |                 'handles_regression': True,
136 |                 'handles_classification': False,
137 |                 'handles_multiclass': False,
138 |                 'handles_multilabel': False,
139 |                 'is_deterministic': True,
140 |                 'input': (DENSE, SPARSE, UNSIGNED_DATA),
141 |                 'output': (PREDICTIONS,)}
142 | 
143 |     @staticmethod
144 |     def get_hyperparameter_search_space(dataset_properties=None):
145 |         # GPUTRACK: Based on http://svail.github.io/rnn_perf/
146 |         # We make batch size and number of units multiples of 64
147 | 
148 |         # Hacky way to condition layers params based on the number of layers
149 |         # GPUTRACK: Reduced number of layers
150 |         # 'c'=1, 'd'=2, 'e'=3 ,'f'=4 + output_layer
151 |         # layer_choices = [chr(i) for i in xrange(ord('c'), ord('e'))]
152 | 
153 |         layer_choices = ['c', 'd', 'e']
154 | 
155 |         batch_size = UniformIntegerHyperparameter("batch_size",
156 |                                                   64, 2048,
157 |                                                   default=550)
158 | 
159 |         number_updates = UniformIntegerHyperparameter("number_updates",
160 |                                                       200, 5500,
161 |                                                       log=True,
162 |                                                       default=512)
163 | 
164 |         num_layers = CategoricalHyperparameter("num_layers",
165 |                                                choices=layer_choices,
166 |                                                default='c')
167 | 
168 |         num_units_layer_1 = UniformIntegerHyperparameter("num_units_layer_1",
169 |                                                          64, 4096,
170 |                                                          default=128)
171 | 
172 |         num_units_layer_2 = UniformIntegerHyperparameter("num_units_layer_2",
173 |                                                          64, 4096,
174 |                                                          default=128)
175 |         num_units_layer_3 = UniformIntegerHyperparameter("num_units_layer_3",
176 |                                                          64, 4096,
177 |                                                          log=True,
178 |                                                          default=128)
179 | 
180 |         dropout_layer_1 = UniformFloatHyperparameter("dropout_layer_1",
181 |                                                      0.0, 0.99,
182 |                                                      default=0.5)
183 | 
184 |         dropout_layer_2 = UniformFloatHyperparameter("dropout_layer_2",
185 |                                                      0.0, 0.99,
186 |                                                      default=0.5)
187 | 
188 |         dropout_layer_3 = UniformFloatHyperparameter("dropout_layer_3",
189 |                                                      0.0, 0.99,
190 |                                                      default=0.5)
191 | 
192 |         dropout_output = UniformFloatHyperparameter("dropout_output",
193 |                                                     0.0, 0.99,
194 |                                                     default=0.5)
195 | 
196 |         lr = CategoricalHyperparameter("learning_rate",
197 |                                        choices=[1e-1, 1e-2, 1e-3, 1e-4],
198 |                                        default=1e-2)
199 | 
200 |         l2 = UniformFloatHyperparameter("lambda2", 1e-6, 1e-2, log=True,
201 |                                         default=1e-3)
202 | 
203 |         std_layer_1 = UniformFloatHyperparameter("std_layer_1", 0.001, 0.1,
204 |                                                  log=True,
205 |                                                  default=0.005)
206 | 
207 |         std_layer_2 = UniformFloatHyperparameter("std_layer_2", 0.001, 0.1,
208 |                                                  log=True,
209 |                                                  default=0.005)
210 | 
211 |         std_layer_3 = UniformFloatHyperparameter("std_layer_3", 0.001, 0.1,
212 |                                                  log=True,
213 |                                                  default=0.005)
214 | 
215 |         # Using Tobias' adam
216 |         solver = Constant(name="solver", value="smorm3s")
217 | 
218 |         non_linearities = CategoricalHyperparameter(name='activation',
219 |                                                     choices=['tanh', 'scaledTanh', 'sigmoid'],
220 |                                                     default='tanh')
221 | 
222 |         cs = ConfigurationSpace()
223 |         # cs.add_hyperparameter(number_epochs)
224 |         cs.add_hyperparameter(number_updates)
225 |         cs.add_hyperparameter(batch_size)
226 |         cs.add_hyperparameter(num_layers)
227 |         cs.add_hyperparameter(num_units_layer_1)
228 |         cs.add_hyperparameter(num_units_layer_2)
229 |         cs.add_hyperparameter(num_units_layer_3)
230 |         cs.add_hyperparameter(dropout_layer_1)
231 |         cs.add_hyperparameter(dropout_layer_2)
232 |         cs.add_hyperparameter(dropout_layer_3)
233 |         cs.add_hyperparameter(dropout_output)
234 |         cs.add_hyperparameter(std_layer_1)
235 |         cs.add_hyperparameter(std_layer_2)
236 |         cs.add_hyperparameter(std_layer_3)
237 |         cs.add_hyperparameter(lr)
238 |         cs.add_hyperparameter(l2)
239 |         cs.add_hyperparameter(solver)
240 |         cs.add_hyperparameter(non_linearities)
241 | 
242 |         layer_2_condition = InCondition(num_units_layer_2, num_layers,
243 |                                         ['d', 'e'])
244 |         layer_3_condition = InCondition(num_units_layer_3, num_layers,
245 |                                         ['e'])
246 |         cs.add_condition(layer_2_condition)
247 |         cs.add_condition(layer_3_condition)
248 | 
249 |         # Condition dropout parameter on layer choice
250 |         dropout_2_condition = InCondition(dropout_layer_2, num_layers,
251 |                                           ['d', 'e'])
252 |         dropout_3_condition = InCondition(dropout_layer_3, num_layers,
253 |                                           ['e'])
254 |         cs.add_condition(dropout_2_condition)
255 |         cs.add_condition(dropout_3_condition)
256 | 
257 |         # Condition std parameter on layer choice
258 |         std_2_condition = InCondition(std_layer_2, num_layers, ['d', 'e'])
259 |         std_3_condition = InCondition(std_layer_3, num_layers, ['e'])
260 |         cs.add_condition(std_2_condition)
261 |         cs.add_condition(std_3_condition)
262 | 
263 |         return cs
264 | 


--------------------------------------------------------------------------------