├── README.md
├── blend.py
└── load_data.py

/README.md:
--------------------------------------------------------------------------------
kaggle_pbr
==========

My best submission to the Kaggle competition "Predicting a Biological Response", ranked 17th out of 711 teams.

--------------------------------------------------------------------------------
/blend.py:
--------------------------------------------------------------------------------
"""Kaggle competition: Predicting a Biological Response.

Blending {RandomForests, ExtraTrees, GradientBoosting} + stretching to
[0,1]. The blending scheme is related to the idea Jose H. Solorzano
presented here:
http://www.kaggle.com/c/bioresponse/forums/t/1889/question-about-the-process-of-ensemble-learning/10950#post10950
'''You can try this: In one of the 5 folds, train the models, then use
the results of the models as 'variables' in logistic regression over
the validation data of that fold'''. Or at least this is the
implementation of my understanding of that idea :-)

The predictions are saved in submission.csv. The code below created my
best submission to the competition:
- public score (25%): 0.43464
- private score (75%): 0.37751
- final rank on the private leaderboard: 17th out of 711 teams :-)

Note: if you increase the number of estimators of the classifiers,
e.g. n_estimators=1000, you get a better score/rank on the private
test set.

Copyright 2012, Emanuele Olivetti.
BSD license, 3 clauses.
"""

from __future__ import division
import numpy as np
import load_data
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression


def logloss(attempt, actual, epsilon=1.0e-15):
    """Logloss, i.e. the score of the bioresponse competition."""
    attempt = np.clip(attempt, epsilon, 1.0 - epsilon)
    return -np.mean(actual * np.log(attempt) +
                    (1.0 - actual) * np.log(1.0 - attempt))


if __name__ == '__main__':

    np.random.seed(0)  # seed to shuffle the train set

    n_folds = 10
    verbose = True
    shuffle = False

    X, y, X_submission = load_data.load()

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]

    print "Creating train and test sets for blending."
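
    # dataset_blend_train: out-of-fold predictions on the train set,
    # one column per base classifier. These columns become the input
    # features of the level-1 (blending) logistic regression below.
    # dataset_blend_test: predictions on the submission set with the
    # same column layout, averaged over the folds.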
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print j, clf
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print
    print "Blending."
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    print "Saving Results."
    tmp = np.vstack([range(1, len(y_submission)+1), y_submission]).T
    np.savetxt(fname='submission.csv', X=tmp, fmt='%d,%0.9f',
               header='MoleculeId,PredictedProbability', comments='')

--------------------------------------------------------------------------------
/load_data.py:
--------------------------------------------------------------------------------
"""
Functions to load the dataset.
"""

import numpy as np


def read_data(file_name):
    """This function is adapted from:
    https://github.com/benhamner/BioResponse/blob/master/Benchmarks/csv_io.py
    """
    f = open(file_name)
    # skip header
    f.readline()
    samples = []
    for line in f:
        line = line.strip().split(",")
        sample = [float(x) for x in line]
        samples.append(sample)
    return samples


def load():
    """Convenience function to load all data as numpy arrays."""
    print "Loading data..."
    train = read_data("data/train.csv")
    y_train = np.array([x[0] for x in train])
    X_train = np.array([x[1:] for x in train])
    X_test = np.array(read_data("data/test.csv"))
    return X_train, y_train, X_test


if __name__ == '__main__':

    X_train, y_train, X_test = load()

--------------------------------------------------------------------------------
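
Note: the scripts above target Python 2 and the 2012-era scikit-learn API
(sklearn.cross_validation was removed in later releases). As a reference, here
is a minimal sketch of the same blending scheme against the current API,
assuming Python 3 and scikit-learn >= 0.18, where StratifiedKFold moved to
sklearn.model_selection. The function name blend and its signature are
illustrative, not part of this repository:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold


def blend(X, y, X_submission, clfs, n_folds=10):
    # Same scheme as blend.py: collect out-of-fold predictions of each
    # base classifier, then fit a logistic regression on top of them.
    skf = StratifiedKFold(n_splits=n_folds, shuffle=False)
    blend_train = np.zeros((X.shape[0], len(clfs)))
    blend_test = np.zeros((X_submission.shape[0], len(clfs)))
    for j, clf in enumerate(clfs):
        test_j = np.zeros((X_submission.shape[0], n_folds))
        for i, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            clf.fit(X[train_idx], y[train_idx])
            # out-of-fold predictions become level-1 training features
            blend_train[test_idx, j] = clf.predict_proba(X[test_idx])[:, 1]
            test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        # average the per-fold predictions on the submission set
        blend_test[:, j] = test_j.mean(axis=1)
    stacker = LogisticRegression()
    stacker.fit(blend_train, y)
    return stacker.predict_proba(blend_test)[:, 1]

# Example use, mirroring blend.py (the linear stretch to [0,1] would
# then be applied to the returned probabilities):
# clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1), ...]
# y_submission = blend(X, y, X_submission, clfs)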