├── README.md
├── blend.py
└── load_data.py

/README.md:
--------------------------------------------------------------------------------
kaggle_pbr
==========

My best submission to the Kaggle competition "Predicting a Biological Response", ranked 17th out of 711 teams.

--------------------------------------------------------------------------------
/blend.py:
--------------------------------------------------------------------------------
"""Kaggle competition: Predicting a Biological Response.

Blending {RandomForests, ExtraTrees, GradientBoosting} + stretching to
[0,1]. The blending scheme is related to the idea Jose H. Solorzano
presented here:
http://www.kaggle.com/c/bioresponse/forums/t/1889/question-about-the-process-of-ensemble-learning/10950#post10950
'''You can try this: In one of the 5 folds, train the models, then use
the results of the models as 'variables' in logistic regression over
the validation data of that fold'''. Or at least this is the
implementation of my understanding of that idea :-)

The predictions are saved in submission.csv. The code below created my
best submission to the competition:
- public score (25%): 0.43464
- private score (75%): 0.37751
- final rank on the private leaderboard: 17th out of 711 teams :-)

Note: if you increase the number of estimators of the classifiers,
e.g. n_estimators=1000, you get a better score/rank on the private
test set.

Copyright 2012, Emanuele Olivetti.
BSD license, 3 clauses.
"""

from __future__ import division
import numpy as np
import load_data
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression


def logloss(attempt, actual, epsilon=1.0e-15):
    """Logloss, i.e. the score of the bioresponse competition."""
    attempt = np.clip(attempt, epsilon, 1.0 - epsilon)
    return -np.mean(actual * np.log(attempt) +
                    (1.0 - actual) * np.log(1.0 - attempt))


if __name__ == '__main__':

    np.random.seed(0)  # seed to shuffle the train set

    n_folds = 10
    verbose = True
    shuffle = False

    X, y, X_submission = load_data.load()

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]

    print "Creating train and test sets for blending."
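
    # dataset_blend_train: out-of-fold predictions on the train set,
    # one column per base classifier. These columns become the input
    # features of the level-1 (blending) logistic regression below.
    # dataset_blend_test: predictions on the submission set with the
    # same column layout, averaged over the folds.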
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print j, clf
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print
    print "Blending."
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    print "Saving Results."
    tmp = np.vstack([range(1, len(y_submission)+1), y_submission]).T
    np.savetxt(fname='submission.csv', X=tmp, fmt='%d,%0.9f',
               header='MoleculeId,PredictedProbability', comments='')

--------------------------------------------------------------------------------
/load_data.py:
--------------------------------------------------------------------------------
"""
Functions to load the dataset.
"""

import numpy as np


def read_data(file_name):
    """This function is adapted from:
    https://github.com/benhamner/BioResponse/blob/master/Benchmarks/csv_io.py
    """
    f = open(file_name)
    # skip header
    f.readline()
    samples = []
    for line in f:
        line = line.strip().split(",")
        sample = [float(x) for x in line]
        samples.append(sample)
    return samples


def load():
    """Convenience function to load all data as numpy arrays."""
    print "Loading data..."
    train = read_data("data/train.csv")
    y_train = np.array([x[0] for x in train])
    X_train = np.array([x[1:] for x in train])
    X_test = np.array(read_data("data/test.csv"))
    return X_train, y_train, X_test


if __name__ == '__main__':

    X_train, y_train, X_test = load()

--------------------------------------------------------------------------------
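
Note: the scripts above target Python 2 and the 2012-era scikit-learn API
(sklearn.cross_validation was removed in later releases). As a reference, here
is a minimal sketch of the same blending scheme against the current API,
assuming Python 3 and scikit-learn >= 0.18, where StratifiedKFold moved to
sklearn.model_selection. The function name blend and its signature are
illustrative, not part of this repository:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold


def blend(X, y, X_submission, clfs, n_folds=10):
    # Same scheme as blend.py: collect out-of-fold predictions of each
    # base classifier, then fit a logistic regression on top of them.
    skf = StratifiedKFold(n_splits=n_folds, shuffle=False)
    blend_train = np.zeros((X.shape[0], len(clfs)))
    blend_test = np.zeros((X_submission.shape[0], len(clfs)))
    for j, clf in enumerate(clfs):
        test_j = np.zeros((X_submission.shape[0], n_folds))
        for i, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            clf.fit(X[train_idx], y[train_idx])
            # out-of-fold predictions become level-1 training features
            blend_train[test_idx, j] = clf.predict_proba(X[test_idx])[:, 1]
            test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        # average the per-fold predictions on the submission set
        blend_test[:, j] = test_j.mean(axis=1)
    stacker = LogisticRegression()
    stacker.fit(blend_train, y)
    return stacker.predict_proba(blend_test)[:, 1]

# Example use, mirroring blend.py (the linear stretch to [0,1] would
# then be applied to the returned probabilities):
# clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1), ...]
# y_submission = blend(X, y, X_submission, clfs)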