├── .gitignore
└── otto
    └── hyperopt_xgboost.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

--------------------------------------------------------------------------------
/otto/hyperopt_xgboost.py:
--------------------------------------------------------------------------------
# train_test_split moved out of sklearn.cross_validation (removed in
# scikit-learn 0.20); import it from model_selection instead.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing

import numpy as np
import pandas as pd

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import sys
# The path to the XGBoost Python wrapper goes here (only needed when xgboost
# is not installed as a package)
sys.path.append('C:\\Users\\Amine\\Documents\\GitHub\\xgboost\\wrapper')
import xgboost as xgb


def load_train():
    """Load the Otto training set and return (features, integer labels)."""
    train = pd.read_csv('../data/train.csv')
    labels = train.target.values
    lbl_enc = preprocessing.LabelEncoder()
    labels = lbl_enc.fit_transform(labels)
    train = train.drop('id', axis=1)
    train = train.drop('target', axis=1)
    return train.values, labels.astype('int32')


def load_test():
    """Load the Otto test set features (without the id column)."""
    test = pd.read_csv('../data/test.csv')
    test = test.drop('id', axis=1)
    return test.values


def write_submission(preds, output):
    """Write class probabilities in the sampleSubmission.csv format."""
    sample = pd.read_csv('../data/sampleSubmission.csv')
    preds = pd.DataFrame(
        preds, index=sample.id.values, columns=sample.columns[1:])
    preds.to_csv(output, index_label='id')


def score(params):
    """Train XGBoost with one parameter set and return the validation
    log loss in the dict format hyperopt expects."""
    print("Training with params:")
    print(params)
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    # hp.quniform samples floats; XGBoost expects max_depth as an integer.
    params['max_depth'] = int(params['max_depth'])
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    # watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    model = xgb.train(params, dtrain, num_round)
    predictions = model.predict(dvalid).reshape((X_test.shape[0], 9))
    score = log_loss(y_test, predictions)
    print("\tScore {0}\n\n".format(score))
    return {'loss': score, 'status': STATUS_OK}


def optimize(trials):
    space = {
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth': hp.quniform('max_depth', 1, 13, 1),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'num_class': 9,
        'eval_metric': 'mlogloss',
        'objective': 'multi:softprob',
        'nthread': 6,
        'silent': 1
    }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

    print(best)
    return best


X, y = load_train()
print("Splitting data into train and valid ...\n\n")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234)

# Trials object where the history of the search will be stored
trials = Trials()

optimize(trials)
--------------------------------------------------------------------------------
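The script stops after printing the best hyperparameters; load_test and write_submission are defined but never called. The snippet below is a minimal sketch, not part of the original repository, of how the tuned parameters could be used to retrain on the full training set and produce a submission file. The function name train_and_submit and the output path '../data/submission.csv' are hypothetical; best_params is assumed to be the dict returned by fmin (i.e. the return value of optimize(trials) above), with the fixed settings from the search space added back in.

# Hypothetical follow-up (editor's sketch, not in the original script):
# retrain with the tuned parameters and write a Kaggle submission file.
def train_and_submit(best_params, output='../data/submission.csv'):
    params = dict(best_params)
    # fmin returns floats for quniform parameters; cast the integer ones.
    num_round = int(params.pop('n_estimators'))
    params['max_depth'] = int(params['max_depth'])
    # Fixed settings mirrored from the search space in optimize().
    params.update({'num_class': 9,
                   'eval_metric': 'mlogloss',
                   'objective': 'multi:softprob'})

    X, y = load_train()
    dtrain = xgb.DMatrix(X, label=y)
    dtest = xgb.DMatrix(load_test())

    model = xgb.train(params, dtrain, num_round)
    preds = model.predict(dtest).reshape((dtest.num_row(), 9))
    write_submission(preds, output)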