├── .gitignore
├── README.md
├── analyze_results.py
├── do_tensorflow.py
├── do_xgboost.py
├── plots
│   ├── .gitkeep
│   ├── by_hour.png
│   ├── exc_times.csv
│   ├── execution_time.png
│   ├── results_table.txt
│   └── roc.png
├── preprocess_data.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
/outputs
/*.csv
/*.npz
/*.pickle
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Gradient Boosting in TensorFlow vs XGBoost

TensorFlow 1.4 includes a Gradient Boosting implementation, aptly named
TensorFlow Boosted Trees (TFBT). This repo contains the benchmarking code
that I used to compare it to [XGBoost](https://github.com/dmlc/xgboost).

For more background, have a look at [the article](https://nicolovaligi.com/gradient-boosting-tensorflow-xgboost.html).
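
The gist of the API difference, condensed from `do_xgboost.py` and
`do_tensorflow.py` below (a sketch, not runnable on its own: `X_train`,
`y_train`, `feature_columns`, and `train_input_fn` stand in for the data
loading those scripts do):

```
# XGBoost: a single scikit-learn style estimator.
from xgboost import XGBClassifier

bst = XGBClassifier(n_estimators=50, max_depth=6, learning_rate=0.1,
                    objective='binary:logistic')
bst.fit(X_train, y_train)

# TFBT (in tf.contrib as of TF 1.4): configured through a LearnerConfig
# proto plus an estimator built on feature columns.
from tensorflow.contrib.boosted_trees.estimator_batch.estimator import \
    GradientBoostedDecisionTreeClassifier
from tensorflow.contrib.boosted_trees.proto import learner_pb2

learner_config = learner_pb2.LearnerConfig()
learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
learner_config.constraints.max_tree_depth = 6
estimator = GradientBoostedDecisionTreeClassifier(
    learner_config=learner_config, num_trees=50, examples_per_layer=5000,
    n_classes=2, feature_columns=feature_columns)
estimator.fit(input_fn=train_input_fn)
```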

## Getting started

```
# Prepare the python environment
virtualenv env
source env/bin/activate
pip install -r requirements.txt

# Download the dataset
wget http://stat-computing.org/dataexpo/2009/{2006,2007}.csv.bz2
bunzip2 {2006,2007}.csv.bz2

# Prepare the dataset
python preprocess_data.py
```
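
As a quick sanity check, the generated `airlines_data.npz` should hold a
100k-row training set sampled from 2006, plus 100k-row test and validation
sets split from the 2007 sample, with 7 feature columns each:

```
python -c "import numpy as np; d = np.load('airlines_data.npz'); \
print(d['X_train'].shape); print(d['X_test'].shape)"
# expected: (100000, 7) on both lines
```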

## Running the experiments

Train and run xgboost:

```
python do_xgboost.py
```

Train and run TensorFlow:

```
python do_tensorflow.py
```

Draw nice plots:

```
python analyze_results.py
```

## Timing results

```
./do_xgboost.py --num_trees=50  42.06s user 1.82s system 1727% cpu 2.540 total

./do_tensorflow.py --num_trees=50 --examples_per_layer=1000  124.12s user 27.50s system 374% cpu 40.456 total
./do_tensorflow.py --num_trees=50 --examples_per_layer=5000  659.74s user 188.80s system 356% cpu 3:58.30 total
```
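
These lines are the output of a shell `time` builtin (the format matches
zsh's): `total` is wall-clock time, `user` and `system` are CPU time summed
over all cores, and the `cpu` column is the resulting parallelism; 1727%
means roughly 17 cores were kept busy. To time a run yourself:

```
time ./do_xgboost.py --num_trees=50
```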
--------------------------------------------------------------------------------
/analyze_results.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""
Draw pretty plots.
"""

import os

import pandas as pd
import numpy as np
from sklearn import metrics
from matplotlib import pyplot as plt
import seaborn as sns


def plot_by_hour():
    """Plot likelihood of delays vs. scheduled departure time."""
    df = pd.read_csv('2006.csv')
    df = df.dropna(subset=['DepDelay'])
    # A flight counts as delayed if it left more than 15 minutes late.
    df['IsDelayed'] = df.DepDelay > 15
    # Keep only the hour of the scheduled departure time (hhmm format).
    df.CRSDepTime = df.CRSDepTime // 100

    sns.set()
    f, ax = plt.subplots(figsize=(5, 3.75))
    data = df.groupby('CRSDepTime').IsDelayed.mean()
    data.plot(ax=ax, kind='bar', color=sns.color_palette()[0])
    ax.set_xlabel('Departure hour')
    ax.set_ylabel('')
    yticks = ax.get_yticks()
    ax.set_yticklabels(['{:.0f}%'.format(x * 100) for x in yticks])
    ax.set_title('Delayed flights by departure hour')
    ax.set_xlim((-0.5, 24))

    plt.tight_layout()
    f.savefig('plots/by_hour.png', bbox_inches='tight')


def plot_timings():
    df = pd.read_csv('plots/exc_times.csv', index_col='Model')

    sns.set()
    f, ax = plt.subplots(figsize=(5, 3.5))
    df.plot(ax=ax, kind='bar', rot=0)
    ax.set_title('Training time')
    ax.set_xlabel('')
    ax.set_ylabel('Time [s]')
    plt.tight_layout()
    f.savefig('plots/execution_time.png', bbox_inches='tight')


if __name__ == '__main__':
    data = np.load('airlines_data.npz')

    experiments = {
        'pred_xgb_t050_d06.npy': {'label': 'XGBoost (50 trees)'},

        'tf_t050_d06_ex01000/pred_tf.npy': {'label': 'TensorFlow (1k ex/layer)'},
        'tf_t050_d06_ex05000/pred_tf.npy': {'label': 'TensorFlow (5k ex/layer)'},
    }

    plot_curves = []
    exp_metrics = []

    for pred_path, exp in experiments.items():
        y_prob = np.load(os.path.join('outputs', pred_path))
        false_pos_rate, true_pos_rate, _ = metrics.roc_curve(data['y_test'], y_prob)
        plot_curves.append(
            [false_pos_rate, true_pos_rate, exp['label']])
        exp_metrics.append({
            'Model': exp['label'],
            'AUC score': 100 * metrics.roc_auc_score(data['y_test'], y_prob),
        })

    # Do the actual plotting
    sns.set()
    f, ax = plt.subplots(figsize=(5, 3.5))

    for fpr, tpr, label in plot_curves:
        ax.plot(fpr, tpr, label=label)

    ax.legend()
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    ax.set_title('ROC')
    plt.tight_layout()
    f.savefig('plots/roc.png', bbox_inches='tight')

    # Score table
    metrics_df = pd.DataFrame(exp_metrics)
    with open('plots/results_table.txt', 'w') as f:
        metrics_df.to_string(
            f, index=False,
            columns=['Model', 'AUC score'],
            float_format=lambda x: '{:0.1f}'.format(x))

    # Other plots
    plot_timings()
    plot_by_hour()
--------------------------------------------------------------------------------
/do_tensorflow.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""
Train a gradient boosting classifier on the airlines dataset using
TensorFlow's Boosted Trees.

References:

https://developers.googleblog.com/2017/09/introducing-tensorflow-datasets.html
https://www.tensorflow.org/programmers_guide/datasets#consuming_numpy_arrays
"""

import argparse
import os
import sys

import numpy as np
import tensorflow as tf
from tensorflow.contrib.boosted_trees.estimator_batch.estimator import GradientBoostedDecisionTreeClassifier
from tensorflow.contrib.boosted_trees.proto import learner_pb2
from tensorflow.contrib.layers.python.layers import feature_column
from tensorflow.contrib.learn import learn_runner

FLAGS = None


def _get_tfbt(output_dir, feature_cols):
    """Configures TF Boosted Trees estimator based on flags."""
    learner_config = learner_pb2.LearnerConfig()

    learner_config.learning_rate_tuner.fixed.learning_rate = FLAGS.learning_rate
    learner_config.regularization.l1 = 0.0
    # Set the regularization per instance so that the total over one batch of
    # examples equals the l2 flag (e.g. with the defaults --l2=1.0 and
    # --batch_size=10000, each example contributes 1e-4).
    learner_config.regularization.l2 = FLAGS.l2 / FLAGS.batch_size
    learner_config.constraints.max_tree_depth = FLAGS.depth
    learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER

    run_config = tf.contrib.learn.RunConfig(save_checkpoints_secs=30)

    # Create a TF Boosted Trees classification estimator.
    estimator = GradientBoostedDecisionTreeClassifier(
        learner_config=learner_config,
        examples_per_layer=FLAGS.examples_per_layer,
        n_classes=2,
        num_trees=FLAGS.num_trees,
        feature_columns=feature_cols,
        model_dir=output_dir,
        config=run_config,
        center_bias=False)
    return estimator


def _matrix_to_dict(matrix, col_names):
    """Split a 2D feature matrix into a dict of (n, 1) column arrays."""
    return {
        feat_name: matrix[:, feat_idx, np.newaxis]
        for feat_idx, feat_name in enumerate(col_names)}


def _make_input_fn(which_set):
    data = np.load('airlines_data.npz')
    feature_names = data['feature_names']

    feature_columns = [feature_column.real_valued_column(k)
                       for k in feature_names]

    if which_set == 'train':
        return feature_columns, tf.estimator.inputs.numpy_input_fn(
            x=_matrix_to_dict(data['X_train'], feature_names),
            y=data['y_train'],
            # Use the flag here so that the l2 normalization in _get_tfbt
            # stays consistent with the actual batch size.
            batch_size=FLAGS.batch_size,
            num_epochs=None,
            shuffle=True)
    elif which_set == 'test':
        return feature_columns, tf.estimator.inputs.numpy_input_fn(
            x=_matrix_to_dict(data['X_test'], feature_names),
            y=data['y_test'],
            num_epochs=1,
            shuffle=False)
    else:
        raise ValueError('Unknown dataset split: {}'.format(which_set))


def _make_experiment_fn(output_dir):
    feature_columns, train_input_fn = _make_input_fn('train')
    _, test_input_fn = _make_input_fn('test')

    return tf.contrib.learn.Experiment(
        estimator=_get_tfbt(output_dir, feature_columns),
        train_input_fn=train_input_fn,
        eval_input_fn=test_input_fn,
        train_steps=None,
        eval_metrics=None,
        eval_steps=None,  # Run through the test data once
    )


def main(unused_argv):
    learn_runner.run(
        experiment_fn=_make_experiment_fn,
        output_dir=FLAGS.output_dir,
        schedule='train_and_evaluate')

    # Run inference on the test dataset
    feature_columns, test_input_fn = _make_input_fn('test')

    estimator = _get_tfbt(FLAGS.output_dir, feature_columns)
    results = estimator.predict(input_fn=test_input_fn)

    # predict() yields one dict per example; 'probabilities' holds
    # [P(on time), P(delayed)], so index 1 is the positive class.
    y_predict = np.array([r['probabilities'][1] for r in results])
    np.save(os.path.join(FLAGS.output_dir, 'pred_tf.npy'), y_predict)


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--batch_size",
        type=int,
        default=10000,
        help="The batch size for reading data.")
    parser.add_argument(
        "--depth",
        type=int,
        default=6,
        help="Maximum depth of weak learners.")
    parser.add_argument(
        "--l2",
        type=float,
        default=1.0,
        help="l2 regularization per batch.")
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=0.1,
        help="Learning rate (shrinkage weight) with which each new tree is added.")
    parser.add_argument(
        "--examples_per_layer",
        type=int,
        default=5000,
        help="Number of examples to accumulate stats for per layer.")
    parser.add_argument(
        "--num_trees",
        type=int,
        default=10,
        help="Number of trees to grow before stopping.")

    FLAGS, unparsed = parser.parse_known_args()

    FLAGS.output_dir = 'outputs/tf_t{:03d}_d{:02d}_ex{:05d}'.format(
        FLAGS.num_trees, FLAGS.depth, FLAGS.examples_per_layer)

    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
--------------------------------------------------------------------------------
/do_xgboost.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""
Train a gradient boosting classifier on the airlines dataset using
XGBoost's python API.
"""

import argparse

import numpy as np
from xgboost import XGBClassifier

FLAGS = None


def train_and_predict(X_train, y_train, X_test, y_test, **kwargs):
    """Run training and evaluation using xgboost."""

    bst = XGBClassifier(
        max_depth=FLAGS.depth,
        learning_rate=FLAGS.learning_rate,
        n_estimators=FLAGS.num_trees,
        silent=False,
        objective='binary:logistic',
        nthread=-1,
        seed=42,
    )
    bst.fit(X_train, y_train)
    # Probability of the positive class (delayed flight).
    y_pred = bst.predict_proba(X_test)[:, 1]

    # Save predictions
    np.save(
        'outputs/pred_xgb_t{:03d}_d{:02d}.npy'.format(FLAGS.num_trees, FLAGS.depth),
        y_pred)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--num_trees",
        type=int,
        default=10,
        help="Number of trees to grow before stopping.")
    parser.add_argument(
        "--depth",
        type=int,
        default=6,
        help="Maximum depth of weak learners.")
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=0.1,
        help="Learning rate (shrinkage weight) with which each new tree is added.")

    FLAGS = parser.parse_args()

    data = np.load('airlines_data.npz')
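    # np.load returns an NpzFile, which behaves like a mapping, so it can be
    # unpacked straight into keyword arguments; keys that train_and_predict
    # does not name explicitly (X_val, y_val, feature_names) are absorbed by
    # **kwargs.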
    train_and_predict(**data)
--------------------------------------------------------------------------------
/plots/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicolov/gradient_boosting_tensorflow_xgboost/628e7cd5ecd141a4c148e8b725ee7b708e1ae804/plots/.gitkeep
--------------------------------------------------------------------------------
/plots/by_hour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicolov/gradient_boosting_tensorflow_xgboost/628e7cd5ecd141a4c148e8b725ee7b708e1ae804/plots/by_hour.png
--------------------------------------------------------------------------------
/plots/exc_times.csv:
--------------------------------------------------------------------------------
Model,Wall time,Total time
XGB,2.54,42.06
TF (1k ex/l),40.46,124.12
TF (5k ex/l),238.30,659.74
--------------------------------------------------------------------------------
/plots/execution_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicolov/gradient_boosting_tensorflow_xgboost/628e7cd5ecd141a4c148e8b725ee7b708e1ae804/plots/execution_time.png
--------------------------------------------------------------------------------
/plots/results_table.txt:
--------------------------------------------------------------------------------
                   Model  AUC score
      XGBoost (50 trees)       67.7
TensorFlow (1k ex/layer)       62.2
TensorFlow (5k ex/layer)       66.2
--------------------------------------------------------------------------------
/plots/roc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicolov/gradient_boosting_tensorflow_xgboost/628e7cd5ecd141a4c148e8b725ee7b708e1ae804/plots/roc.png
--------------------------------------------------------------------------------
/preprocess_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""
Prepare datasets for training and testing.
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Subset of the columns to use
cols = ['Month', 'DayOfWeek', 'Distance',
        'DepDelay', 'CRSDepTime', 'UniqueCarrier', 'Origin', 'Dest']
categorical_cols = ['UniqueCarrier', 'Origin', 'Dest']


def _get_df_from_file(file_name, n):
    df = pd.read_csv(file_name, usecols=cols)
    df = df.dropna(subset=cols)
    # Keep `n` samples
    df = df.sample(n=n, random_state=42)

    # Create binary labels from the delay column, and delete it from the
    # training data
    labels = df.DepDelay > 15
    del df['DepDelay']

    # Discard minutes in departure times (CRSDepTime is in hhmm format)
    df.CRSDepTime = df.CRSDepTime // 100
    return df, labels


if __name__ == '__main__':
    df2006, y2006 = _get_df_from_file('2006.csv', 100*1000)
    df2007, y2007 = _get_df_from_file('2007.csv', 200*1000)

    # xgboost wants numbers for categorical variables. Fit the encoders on
    # both years so train and test share the same integer mapping.
    for col in categorical_cols:
        lenc = LabelEncoder().fit(pd.concat([df2006[col], df2007[col]]))
        df2006[col] = lenc.transform(df2006[col])
        df2007[col] = lenc.transform(df2007[col])

    # Get test/validation sets from 2007 data
    X_val, X_test, y_val, y_test = train_test_split(
        df2007, y2007, test_size=0.5, random_state=43)

    data = dict(
        feature_names=df2006.columns,
        X_train=df2006, y_train=y2006,
        X_test=X_test, y_test=y_test,
        X_val=X_val, y_val=y_val,
    )
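
    # Note: np.savez below coerces the DataFrames, Series and column Index
    # into plain ndarrays, which is the format the training scripts expect.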
    for k, v in data.items():
        print(k, v.shape)

    np.savez('airlines_data.npz', **data)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
backports.functools-lru-cache==1.4
backports.shutil-get-terminal-size==1.0.0
backports.weakref==1.0.post1
bleach==1.5.0
cycler==0.10.0
decorator==4.1.2
enum34==1.1.6
feather-format==0.4.0
funcsigs==1.0.2
futures==3.1.1
html5lib==0.9999999
ipython==5.5.0
ipython-genutils==0.2.0
Markdown==2.6.9
matplotlib==2.1.0
mock==2.0.0
numpy==1.13.3
pandas==0.21.0
pathlib2==2.3.0
pbr==3.1.1
pexpect==4.2.1
pickleshare==0.7.4
prompt-toolkit==1.0.15
protobuf==3.4.0
ptyprocess==0.5.2
pyarrow==0.7.1
Pygments==2.2.0
pyparsing==2.2.0
PySide==1.2.4
python-dateutil==2.6.1
pytz==2017.3
scandir==1.6
scikit-learn==0.19.1
scipy==1.0.0
seaborn==0.8.1
simplegeneric==0.8.1
six==1.11.0
subprocess32==3.2.7
tensorflow==1.4.0
tensorflow-tensorboard==0.4.0rc2
traitlets==4.3.2
wcwidth==0.1.7
Werkzeug==0.12.2
xgboost==0.6a2
--------------------------------------------------------------------------------