├── README.md
└── xgb-bias-variance.py


/README.md:
--------------------------------------------------------------------------------
# Bias Variance Decompositions using XGBoost

This repository contains experiments for the Nvidia devblog post "Bias Variance Decompositions using XGBoost".

## Dependencies
```bash
pip install xgboost distributed
```

## Running experiments
These experiments are set up to run on a [distributed cluster](http://distributed.dask.org/en/latest/client.html#). They can also be run on a local machine by replacing the following line, although the runs may be time-consuming:
```python
# client = Client('127.0.0.1:8786')
client = Client()
```

To run all experiments:
```bash
python xgb-bias-variance.py
```
Images will be written to `images/`.

## Creating your own experiment
Add your own function based on this template:
```python
def experiment_gbm_subsample(client):
    subsample_range = np.linspace(0.1, 1.0)
    models = [xgb.XGBRegressor(max_depth=15, reg_lambda=0.01, subsample=subsample)
              for subsample in subsample_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Gradient Boosting", "Subsample",
                    subsample_range, results)
```
Any iterable of scikit-learn-compatible models will work.
--------------------------------------------------------------------------------
/xgb-bias-variance.py:
--------------------------------------------------------------------------------
import copy
import os
import re
from itertools import repeat

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from distributed import Client

plots_directory = "images"
plt.style.use("seaborn")


def generate_rosenbrock(n, variance=0):
    """Sample n points from the Rosenbrock function, with additive Gaussian noise."""
    X = np.random.random((n, 2))
    X[:, 0] = (X[:, 0] - 0.5) * 4
    X[:, 1] = ((X[:, 1] - 0.5) * 4) + 1
    y = np.zeros(n)
    for i in range(n):
        y[i] = 100 * np.square(X[i, 1] - (X[i, 0] * X[i, 0])) + np.square(1 - X[i, 0])
    noise = np.random.normal(scale=np.sqrt(variance), size=n)
    y = y + noise
    return X, y, noise


def expected_bias_squared(expected_predictions, labels):
    """Mean over instances of (E[prediction] - label)^2."""
    bias_squared = np.square(expected_predictions - labels)
    return np.average(bias_squared)


def expected_variance(predictions, expected_predictions):
    """Mean over instances of Var[prediction] = E[prediction^2] - E[prediction]^2."""
    squared_expected_predictions = np.square(expected_predictions)
    expected_squared_predictions = np.average(np.square(predictions), axis=0)
    return np.average(expected_squared_predictions - squared_expected_predictions)


def test_expected_bias():
    expected_predictions = np.asarray([1.5, 2.0])
    labels = np.asarray([1.0, 1.5])
    assert expected_bias_squared(expected_predictions, labels) == 0.25


def test_expected_variance():
    predictions = np.asarray([[1.0, 1.0, 1.0],
                              [3.0, 3.0, 3.0]])
    expected_predictions = np.average(predictions, axis=0)
    assert (expected_predictions == [2.0, 2.0, 2.0]).all()
    assert expected_variance(predictions, expected_predictions) == 1.0


def expected_mse(predictions, labels):
    """Mean over instances of the expected squared error across models."""
    preds = np.asarray(predictions)
    num_instances = len(labels)
    expected_mse_per_instance = np.zeros(num_instances)
    for i in range(num_instances):
        diff = labels[i] - preds[:, i]
        expected_mse_per_instance[i] = np.average(np.square(diff))
    return np.average(expected_mse_per_instance)


def test_expected_mse():
    predictions = np.asarray([[0.5, -0.5],
                              [1.5, 3.5]])
    labels = np.asarray([1.0, 1.5])
    assert expected_mse(predictions, labels) == 2.125


def test_unbiased_model():
    predictions = np.asarray([[0.5, 1.0],
                              [1.5, 2.0]])
    expected_predictions = np.average(predictions, axis=0)
    labels = np.asarray([1.0, 1.5])
    assert expected_bias_squared(expected_predictions, labels) == 0.0
    assert expected_mse(predictions, labels) == expected_variance(predictions,
                                                                  expected_predictions)


def test_biased_model():
    predictions = np.asarray([[1.5, 2.0],
                              [1.5, 2.0]])
    expected_predictions = np.average(predictions, axis=0)
    labels = np.asarray([1.0, 1.5])
    assert expected_bias_squared(expected_predictions, labels) == 0.25
    assert expected_mse(predictions, labels) == expected_bias_squared(expected_predictions,
                                                                      labels)
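

# With noiseless labels the decomposition is exact: expected MSE equals
# bias^2 + variance for any set of predictions. This extra test is an
# illustrative sanity check of that identity; the numbers are arbitrary.
def test_decomposition_identity():
    predictions = np.asarray([[0.5, -0.5],
                              [1.5, 3.5]])
    expected_predictions = np.average(predictions, axis=0)
    labels = np.asarray([2.0, 1.0])
    total = expected_mse(predictions, labels)
    decomposed = (expected_bias_squared(expected_predictions, labels)
                  + expected_variance(predictions, expected_predictions))
    assert np.isclose(total, decomposed)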


def plot_3d(X, y, name):
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the 3d projection)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X[:, 0], X[:, 1], y, c=y, cmap='coolwarm')
    plt.savefig(os.path.join(plots_directory, name))
    plt.show()


def run_on_worker(base_model, generator, n=1000, n_test=10000, label_variance=1000.0,
                  num_models=100):
    """Train num_models copies of base_model, each on a fresh training set, and
    measure the bias/variance/MSE decomposition on a common test set."""
    X_test, y_test, test_noise = generator(n_test, label_variance)
    models = []
    for i in range(num_models):
        X, y, noise = generator(n, label_variance)
        model = copy.deepcopy(base_model)
        models.append(model.fit(X, y))

    preds = []
    for model in models:
        pred = model.predict(X_test)
        preds.append(pred.astype(np.double))

    expected_predictions = np.average(preds, axis=0)

    # Bias is measured against the noiseless labels; MSE against the noisy ones.
    bias_squared = expected_bias_squared(expected_predictions, y_test - test_noise)
    variance = expected_variance(preds, expected_predictions)
    mse = expected_mse(preds, y_test)
    return {"bias^2": bias_squared, "mse": mse, "variance": variance,
            "irreducible_error": label_variance}


def plot_experiment(title, x_label, plot_x, results):
    """Stacked area plot of irreducible error, variance and bias^2 against plot_x."""
    labels = ["irreducible_error", "variance", "bias^2"]
    df = pd.DataFrame()
    for label in labels:
        df[label] = [res[label] for res in results]
    df[x_label] = plot_x
    df = df.set_index(x_label)
    df.plot.area()
    plt.title(title)
    plt.ylabel("MSE")
    plt.xlim(np.min(plot_x), np.max(plot_x))
    title = re.sub(' -', '', title)
    snake_title = re.sub(' ', '_', title + ' ' + x_label).lower()
    plt.savefig(os.path.join(plots_directory, snake_title + '.png'))
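

# Minimal usage sketch: run_on_worker can also be called directly, without a
# dask cluster, to evaluate a single configuration (handy for debugging).
# This helper is illustrative only; num_models is reduced from the default of
# 100 to keep the run short.
def run_single_local_example():
    result = run_on_worker(xgb.XGBRegressor(n_estimators=50),
                           generate_rosenbrock, num_models=10)
    print(result)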


def experiment_gbm_rounds(client):
    n_estimators_range = range(20, 100, 5)
    models = [xgb.XGBRegressor(n_estimators=n_estimators)
              for n_estimators in n_estimators_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Gradient Boosting", "Boosting Rounds",
                    n_estimators_range, results)


def experiment_rf_num_trees(client):
    n_estimators_range = range(1, 50)
    models = [xgb.XGBRFRegressor(max_depth=12, n_estimators=n_estimators)
              for n_estimators in n_estimators_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Random Forest", "Number of trees",
                    n_estimators_range, results)


def experiment_rf_training_examples(client):
    training_examples_range = range(10, 1000, 10)
    model = xgb.XGBRFRegressor(max_depth=12, reg_lambda=0.01)
    futures = client.map(run_on_worker, repeat(model), repeat(generate_rosenbrock),
                         training_examples_range)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Random Forest", "Training examples",
                    training_examples_range, results)


def experiment_rf_max_depth(client):
    max_depth_range = range(1, 15)
    models = [xgb.XGBRFRegressor(max_depth=max_depth, reg_lambda=0.01)
              for max_depth in max_depth_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Random Forest", "Max Depth",
                    max_depth_range, results)


def experiment_rf_lambda(client):
    reg_lambda_range = np.linspace(0.001, 1.0)
    models = [xgb.XGBRFRegressor(max_depth=15, reg_lambda=reg_lambda)
              for reg_lambda in reg_lambda_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Random Forest", "Lambda (L2 penalty)",
                    reg_lambda_range, results)


def experiment_gbm_lambda(client):
    reg_lambda_range = np.linspace(0.001, 20.0)
    models = [xgb.XGBRegressor(max_depth=15, reg_lambda=reg_lambda)
              for reg_lambda in reg_lambda_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Gradient Boosting", "Lambda (L2 penalty)",
                    reg_lambda_range, results)


def experiment_gbm_subsample(client):
    subsample_range = np.linspace(0.1, 1.0)
    models = [xgb.XGBRegressor(max_depth=15, reg_lambda=0.01, subsample=subsample)
              for subsample in subsample_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Gradient Boosting", "Subsample",
                    subsample_range, results)


def experiment_gbm_learning_rate(client):
    learning_rate_range = np.linspace(0.1, 1.0)
    models = [xgb.XGBRegressor(learning_rate=learning_rate)
              for learning_rate in learning_rate_range]
    futures = client.map(run_on_worker, models, generator=generate_rosenbrock)
    results = client.gather(futures)
    plot_experiment("Bias Variance Decomposition - Gradient Boosting", "Learning rate",
                    learning_rate_range, results)


if __name__ == '__main__':
    # Connect to an existing dask scheduler; use Client() for a local cluster.
    client = Client('127.0.0.1:8786')
    if not os.path.exists(plots_directory):
        os.makedirs(plots_directory)
    all_experiments = [exp for exp in dir() if 'experiment_' in exp]
    for exp in all_experiments:
        print("Running {} ...".format(exp))
        globals()[exp](client)
--------------------------------------------------------------------------------