├── README.md
└── xgb-bias-variance.py


/README.md:
--------------------------------------------------------------------------------
# Bias Variance Decompositions using XGBoost

This repository contains experiments for the Nvidia devblog post "Bias Variance Decompositions using XGBoost".

## Dependencies
```bash
pip install xgboost distributed
```

## Running experiments
These experiments are set up to run on a [distributed cluster](http://distributed.dask.org/en/latest/client.html#). They can also be run on a local machine by replacing the following line, although the runs may be time-consuming:
```python
# client = Client('127.0.0.1:8786')
client = Client()
```

To run all experiments:
```bash
python xgb-bias-variance.py
```
Images will be written to `images/`.

## Creating your own experiment
Add your own function based on this template:
```python
def experiment_gbm_subsample(client):
    subsample_range = np.linspace(0.1, 1.0)
    models = [xgb.XGBRegressor(max_depth=15, reg_lambda=0.01, subsample=subsample)
              for subsample in subsample_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Gradient Boosting", "Subsample",
                    subsample_range, results)
```
Any iterable of scikit-learn-compatible models will work.
--------------------------------------------------------------------------------
/xgb-bias-variance.py:
--------------------------------------------------------------------------------
import copy
import os
import re
from itertools import repeat

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from distributed import Client

plots_directory = "images"
plt.style.use("seaborn")


def generate_rosenbrock(n, variance=0):
    """Sample n points from the Rosenbrock function, with additive Gaussian noise."""
    X = np.random.random((n, 2))
    X[:, 0] = (X[:, 0] - 0.5) * 4
    X[:, 1] = ((X[:, 1] - 0.5) * 4) + 1
    y = np.zeros(n)
    for i in range(n):
        y[i] = 100 * np.square(X[i, 1] - (X[i, 0] * X[i, 0])) + np.square(1 - X[i, 0])
    noise = np.random.normal(scale=np.sqrt(variance), size=n)
    y = y + noise
    return X, y, noise


def expected_bias_squared(expected_predictions, labels):
    """Mean over instances of (E[prediction] - label)^2."""
    bias_squared = np.square(expected_predictions - labels)
    return np.average(bias_squared)


def expected_variance(predictions, expected_predictions):
    """Mean over instances of Var[prediction] = E[prediction^2] - E[prediction]^2."""
    squared_expected_predictions = np.square(expected_predictions)
    expected_squared_predictions = np.average(np.square(predictions), axis=0)
    return np.average(expected_squared_predictions - squared_expected_predictions)


def test_expected_bias():
    expected_predictions = np.asarray([1.5, 2.0])
    labels = np.asarray([1.0, 1.5])
    assert expected_bias_squared(expected_predictions, labels) == 0.25


def test_expected_variance():
    predictions = np.asarray([[1.0, 1.0, 1.0],
                              [3.0, 3.0, 3.0]])
    expected_predictions = np.average(predictions, axis=0)
    assert (expected_predictions == [2.0, 2.0, 2.0]).all()
    assert expected_variance(predictions, expected_predictions) == 1.0


def expected_mse(predictions, labels):
    """Mean over instances of the expected squared error across models."""
    preds = np.asarray(predictions)
    num_instances = len(labels)
    expected_mse_per_instance = np.zeros(num_instances)
    for i in range(num_instances):
        diff = labels[i] - preds[:, i]
        expected_mse_per_instance[i] = np.average(np.square(diff))
    return np.average(expected_mse_per_instance)


def test_expected_mse():
    predictions = np.asarray([[0.5, -0.5],
                              [1.5, 3.5]])
    labels = np.asarray([1.0, 1.5])
    assert expected_mse(predictions, labels) == 2.125


def test_unbiased_model():
    predictions = np.asarray([[0.5, 1.0],
                              [1.5, 2.0]])
    expected_predictions = np.average(predictions, axis=0)
    labels = np.asarray([1.0, 1.5])
    assert expected_bias_squared(expected_predictions, labels) == 0.0
    assert expected_mse(predictions, labels) == expected_variance(predictions,
                                                                  expected_predictions)


def test_biased_model():
    predictions = np.asarray([[1.5, 2.0],
                              [1.5, 2.0]])
    expected_predictions = np.average(predictions, axis=0)
    labels = np.asarray([1.0, 1.5])
    assert expected_bias_squared(expected_predictions, labels) == 0.25
    assert expected_mse(predictions, labels) == expected_bias_squared(expected_predictions,
                                                                      labels)
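

# With noiseless labels the decomposition is exact: expected MSE equals
# bias^2 + variance for any set of predictions. This extra test is an
# illustrative sanity check of that identity; the numbers are arbitrary.
def test_decomposition_identity():
    predictions = np.asarray([[0.5, -0.5],
                              [1.5, 3.5]])
    expected_predictions = np.average(predictions, axis=0)
    labels = np.asarray([2.0, 1.0])
    total = expected_mse(predictions, labels)
    decomposed = (expected_bias_squared(expected_predictions, labels)
                  + expected_variance(predictions, expected_predictions))
    assert np.isclose(total, decomposed)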


def plot_3d(X, y, name):
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the 3d projection)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X[:, 0], X[:, 1], y, c=y, cmap='coolwarm')
    plt.savefig(os.path.join(plots_directory, name))
    plt.show()


def run_on_worker(base_model, generator, n=1000, n_test=10000, label_variance=1000.0,
                  num_models=100):
    """Train num_models copies of base_model, each on a fresh training set, and
    measure the bias/variance/MSE decomposition on a common test set."""
    X_test, y_test, test_noise = generator(n_test, label_variance)
    models = []
    for i in range(num_models):
        X, y, noise = generator(n, label_variance)
        model = copy.deepcopy(base_model)
        models.append(model.fit(X, y))

    preds = []
    for model in models:
        pred = model.predict(X_test)
        preds.append(pred.astype(np.double))

    expected_predictions = np.average(preds, axis=0)

    # Bias is measured against the noiseless labels; MSE against the noisy ones.
    bias_squared = expected_bias_squared(expected_predictions, y_test - test_noise)
    variance = expected_variance(preds, expected_predictions)
    mse = expected_mse(preds, y_test)
    return {"bias^2": bias_squared, "mse": mse, "variance": variance,
            "irreducible_error": label_variance}


def plot_experiment(title, x_label, plot_x, results):
    """Stacked area plot of irreducible error, variance and bias^2 against plot_x."""
    labels = ["irreducible_error", "variance", "bias^2"]
    df = pd.DataFrame()
    for label in labels:
        df[label] = [res[label] for res in results]
    df[x_label] = plot_x
    df = df.set_index(x_label)
    df.plot.area()
    plt.title(title)
    plt.ylabel("MSE")
    plt.xlim(np.min(plot_x), np.max(plot_x))
    title = re.sub(' -', '', title)
    snake_title = re.sub(' ', '_', title + ' ' + x_label).lower()
    plt.savefig(os.path.join(plots_directory, snake_title + '.png'))
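

# Minimal usage sketch: run_on_worker can also be called directly, without a
# dask cluster, to evaluate a single configuration (handy for debugging).
# This helper is illustrative only; num_models is reduced from the default of
# 100 to keep the run short.
def run_single_local_example():
    result = run_on_worker(xgb.XGBRegressor(n_estimators=50),
                           generate_rosenbrock, num_models=10)
    print(result)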


def experiment_gbm_rounds(client):
    n_estimators_range = range(20, 100, 5)
    models = [xgb.XGBRegressor(n_estimators=n_estimators)
              for n_estimators in n_estimators_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Gradient Boosting", "Boosting Rounds",
                    n_estimators_range, results)


def experiment_rf_num_trees(client):
    n_estimators_range = range(1, 50)
    models = [xgb.XGBRFRegressor(max_depth=12, n_estimators=n_estimators)
              for n_estimators in n_estimators_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Random Forest", "Number of trees",
                    n_estimators_range, results)


def experiment_rf_training_examples(client):
    training_examples_range = range(10, 1000, 10)
    model = xgb.XGBRFRegressor(max_depth=12, reg_lambda=0.01)
    futures = client.map(run_on_worker, repeat(model), repeat(generate_rosenbrock),
                         training_examples_range)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Random Forest", "Training examples",
                    training_examples_range, results)


def experiment_rf_max_depth(client):
    max_depth_range = range(1, 15)
    models = [xgb.XGBRFRegressor(max_depth=max_depth, reg_lambda=0.01)
              for max_depth in max_depth_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Random Forest", "Max Depth",
                    max_depth_range, results)


def experiment_rf_lambda(client):
    reg_lambda_range = np.linspace(0.001, 1.0)
    models = [xgb.XGBRFRegressor(max_depth=15, reg_lambda=reg_lambda)
              for reg_lambda in reg_lambda_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Random Forest", "Lambda (L2 penalty)",
                    reg_lambda_range, results)


def experiment_gbm_lambda(client):
    reg_lambda_range = np.linspace(0.001, 20.0)
    models = [xgb.XGBRegressor(max_depth=15, reg_lambda=reg_lambda)
              for reg_lambda in reg_lambda_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Gradient Boosting", "Lambda (L2 penalty)",
                    reg_lambda_range, results)


def experiment_gbm_subsample(client):
    subsample_range = np.linspace(0.1, 1.0)
    models = [xgb.XGBRegressor(max_depth=15, reg_lambda=0.01, subsample=subsample)
              for subsample in subsample_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Gradient Boosting", "Subsample",
                    subsample_range, results)


def experiment_gbm_learning_rate(client):
    learning_rate_range = np.linspace(0.1, 1.0)
    models = [xgb.XGBRegressor(learning_rate=learning_rate)
              for learning_rate in learning_rate_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Gradient Boosting", "Learning rate",
                    learning_rate_range, results)


if __name__ == '__main__':
    # Connect to an existing dask scheduler; use Client() for a local cluster.
    client = Client('127.0.0.1:8786')
    if not os.path.exists(plots_directory):
        os.makedirs(plots_directory)
    all_experiments = [exp for exp in dir() if 'experiment_' in exp]
    for exp in all_experiments:
        print("Running {} ...".format(exp))
        globals()[exp](client)
--------------------------------------------------------------------------------