├── .gitignore
└── otto
    └── hyperopt_xgboost.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

--------------------------------------------------------------------------------
/otto/hyperopt_xgboost.py:
--------------------------------------------------------------------------------
# train_test_split moved out of sklearn.cross_validation (removed in
# scikit-learn 0.20); import it from model_selection instead.
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing

import numpy as np
import pandas as pd

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import sys
# The path to the XGBoost Python wrapper goes here (only needed when xgboost
# is not installed as a package)
sys.path.append('C:\\Users\\Amine\\Documents\\GitHub\\xgboost\\wrapper')
import xgboost as xgb


def load_train():
    """Load the Otto training set and return (features, integer labels)."""
    train = pd.read_csv('../data/train.csv')
    labels = train.target.values
    lbl_enc = preprocessing.LabelEncoder()
    labels = lbl_enc.fit_transform(labels)
    train = train.drop('id', axis=1)
    train = train.drop('target', axis=1)
    return train.values, labels.astype('int32')


def load_test():
    """Load the Otto test set features (without the id column)."""
    test = pd.read_csv('../data/test.csv')
    test = test.drop('id', axis=1)
    return test.values


def write_submission(preds, output):
    """Write class probabilities in the sampleSubmission.csv format."""
    sample = pd.read_csv('../data/sampleSubmission.csv')
    preds = pd.DataFrame(
        preds, index=sample.id.values, columns=sample.columns[1:])
    preds.to_csv(output, index_label='id')


def score(params):
    """Train XGBoost with one parameter set and return the validation
    log loss in the dict format hyperopt expects."""
    print("Training with params:")
    print(params)
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    # hp.quniform samples floats; XGBoost expects max_depth as an integer.
    params['max_depth'] = int(params['max_depth'])
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    # watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    model = xgb.train(params, dtrain, num_round)
    predictions = model.predict(dvalid).reshape((X_test.shape[0], 9))
    score = log_loss(y_test, predictions)
    print("\tScore {0}\n\n".format(score))
    return {'loss': score, 'status': STATUS_OK}


def optimize(trials):
    space = {
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth': hp.quniform('max_depth', 1, 13, 1),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'num_class': 9,
        'eval_metric': 'mlogloss',
        'objective': 'multi:softprob',
        'nthread': 6,
        'silent': 1
    }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

    print(best)
    return best


X, y = load_train()
print("Splitting data into train and valid ...\n\n")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234)

# Trials object where the history of the search will be stored
trials = Trials()

optimize(trials)
--------------------------------------------------------------------------------
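The script stops after printing the best hyperparameters; load_test and write_submission are defined but never called. The snippet below is a minimal sketch, not part of the original repository, of how the tuned parameters could be used to retrain on the full training set and produce a submission file. The function name train_and_submit and the output path '../data/submission.csv' are hypothetical; best_params is assumed to be the dict returned by fmin (i.e. the return value of optimize(trials) above), with the fixed settings from the search space added back in.

# Hypothetical follow-up (editor's sketch, not in the original script):
# retrain with the tuned parameters and write a Kaggle submission file.
def train_and_submit(best_params, output='../data/submission.csv'):
    params = dict(best_params)
    # fmin returns floats for quniform parameters; cast the integer ones.
    num_round = int(params.pop('n_estimators'))
    params['max_depth'] = int(params['max_depth'])
    # Fixed settings mirrored from the search space in optimize().
    params.update({'num_class': 9,
                   'eval_metric': 'mlogloss',
                   'objective': 'multi:softprob'})

    X, y = load_train()
    dtrain = xgb.DMatrix(X, label=y)
    dtest = xgb.DMatrix(load_test())

    model = xgb.train(params, dtrain, num_round)
    preds = model.predict(dtest).reshape((dtest.num_row(), 9))
    write_submission(preds, output)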