├── .gitignore
├── README.md
├── analyze_results.py
├── do_tensorflow.py
├── do_xgboost.py
├── plots
│   ├── .gitkeep
│   ├── by_hour.png
│   ├── exc_times.csv
│   ├── execution_time.png
│   ├── results_table.txt
│   └── roc.png
├── preprocess_data.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
/outputs
/*.csv
/*.npz
/*.pickle
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Gradient Boosting in TensorFlow vs XGBoost

TensorFlow 1.4 includes a Gradient Boosting implementation, aptly named
TensorFlow Boosted Trees (TFBT). This repo contains the benchmarking code
that I used to compare it to [XGBoost](https://github.com/dmlc/xgboost).

For more background, have a look at [the article](https://nicolovaligi.com/gradient-boosting-tensorflow-xgboost.html).
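
The gist of the API difference, condensed from `do_xgboost.py` and
`do_tensorflow.py` below (a sketch, not runnable on its own: `X_train`,
`y_train`, `feature_columns`, and `train_input_fn` stand in for the data
loading those scripts do):

```
# XGBoost: a single scikit-learn style estimator.
from xgboost import XGBClassifier

bst = XGBClassifier(n_estimators=50, max_depth=6, learning_rate=0.1,
                    objective='binary:logistic')
bst.fit(X_train, y_train)

# TFBT (in tf.contrib as of TF 1.4): configured through a LearnerConfig
# proto plus an estimator built on feature columns.
from tensorflow.contrib.boosted_trees.estimator_batch.estimator import \
    GradientBoostedDecisionTreeClassifier
from tensorflow.contrib.boosted_trees.proto import learner_pb2

learner_config = learner_pb2.LearnerConfig()
learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
learner_config.constraints.max_tree_depth = 6
estimator = GradientBoostedDecisionTreeClassifier(
    learner_config=learner_config, num_trees=50, examples_per_layer=5000,
    n_classes=2, feature_columns=feature_columns)
estimator.fit(input_fn=train_input_fn)
```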

## Getting started

```
# Prepare the python environment
virtualenv env
source env/bin/activate
pip install -r requirements.txt

# Download the dataset
wget http://stat-computing.org/dataexpo/2009/{2006,2007}.csv.bz2
bunzip2 {2006,2007}.csv.bz2

# Prepare the dataset
python preprocess_data.py
```
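
As a quick sanity check, the generated `airlines_data.npz` should hold a
100k-row training set sampled from 2006, plus 100k-row test and validation
sets split from the 2007 sample, with 7 feature columns each:

```
python -c "import numpy as np; d = np.load('airlines_data.npz'); \
print(d['X_train'].shape); print(d['X_test'].shape)"
# expected: (100000, 7) on both lines
```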

## Running the experiments

Train and run xgboost:

```
python do_xgboost.py
```

Train and run TensorFlow:

```
python do_tensorflow.py
```

Draw nice plots:

```
python analyze_results.py
```

## Timing results

```
./do_xgboost.py --num_trees=50  42.06s user 1.82s system 1727% cpu 2.540 total

./do_tensorflow.py --num_trees=50 --examples_per_layer=1000  124.12s user 27.50s system 374% cpu 40.456 total
./do_tensorflow.py --num_trees=50 --examples_per_layer=5000  659.74s user 188.80s system 356% cpu 3:58.30 total
```
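
These lines are the output of a shell `time` builtin (the format matches
zsh's): `total` is wall-clock time, `user` and `system` are CPU time summed
over all cores, and the `cpu` column is the resulting parallelism; 1727%
means roughly 17 cores were kept busy. To time a run yourself:

```
time ./do_xgboost.py --num_trees=50
```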
--------------------------------------------------------------------------------
/analyze_results.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""
Draw pretty plots.
"""

import os

import pandas as pd
import numpy as np
from sklearn import metrics
from matplotlib import pyplot as plt
import seaborn as sns


def plot_by_hour():
    """Plot likelihood of delays vs. scheduled departure time."""
    df = pd.read_csv('2006.csv')
    df = df.dropna(subset=['DepDelay'])
    # A flight counts as delayed if it left more than 15 minutes late.
    df['IsDelayed'] = df.DepDelay > 15
    # Keep only the hour of the scheduled departure time (hhmm format).
    df.CRSDepTime = df.CRSDepTime // 100

    sns.set()
    f, ax = plt.subplots(figsize=(5, 3.75))
    data = df.groupby('CRSDepTime').IsDelayed.mean()
    data.plot(ax=ax, kind='bar', color=sns.color_palette()[0])
    ax.set_xlabel('Departure hour')
    ax.set_ylabel('')
    yticks = ax.get_yticks()
    ax.set_yticklabels(['{:.0f}%'.format(x * 100) for x in yticks])
    ax.set_title('Delayed flights by departure hour')
    ax.set_xlim((-0.5, 24))

    plt.tight_layout()
    f.savefig('plots/by_hour.png', bbox_inches='tight')


def plot_timings():
    df = pd.read_csv('plots/exc_times.csv', index_col='Model')

    sns.set()
    f, ax = plt.subplots(figsize=(5, 3.5))
    df.plot(ax=ax, kind='bar', rot=0)
    ax.set_title('Training time')
    ax.set_xlabel('')
    ax.set_ylabel('Time [s]')
    plt.tight_layout()
    f.savefig('plots/execution_time.png', bbox_inches='tight')


if __name__ == '__main__':
    data = np.load('airlines_data.npz')

    experiments = {
        'pred_xgb_t050_d06.npy': {'label': 'XGBoost (50 trees)'},

        'tf_t050_d06_ex01000/pred_tf.npy': {'label': 'TensorFlow (1k ex/layer)'},
        'tf_t050_d06_ex05000/pred_tf.npy': {'label': 'TensorFlow (5k ex/layer)'},
    }

    plot_curves = []
    exp_metrics = []

    for pred_path, exp in experiments.items():
        y_prob = np.load(os.path.join('outputs', pred_path))
        false_pos_rate, true_pos_rate, _ = metrics.roc_curve(data['y_test'], y_prob)
        plot_curves.append(
            [false_pos_rate, true_pos_rate, exp['label']])
        exp_metrics.append({
            'Model': exp['label'],
            'AUC score': 100 * metrics.roc_auc_score(data['y_test'], y_prob),
        })

    # Do the actual plotting
    sns.set()
    f, ax = plt.subplots(figsize=(5, 3.5))

    for fpr, tpr, label in plot_curves:
        ax.plot(fpr, tpr, label=label)

    ax.legend()
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    ax.set_title('ROC')
    plt.tight_layout()
    f.savefig('plots/roc.png', bbox_inches='tight')

    # Score table
    metrics_df = pd.DataFrame(exp_metrics)
    with open('plots/results_table.txt', 'w') as f:
        metrics_df.to_string(
            f, index=False,
            columns=['Model', 'AUC score'],
            float_format=lambda x: '{:0.1f}'.format(x))

    # Other plots
    plot_timings()
    plot_by_hour()
--------------------------------------------------------------------------------
/do_tensorflow.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""
Train a gradient boosting classifier on the airlines dataset using
TensorFlow's Boosted Trees.

References:

https://developers.googleblog.com/2017/09/introducing-tensorflow-datasets.html
https://www.tensorflow.org/programmers_guide/datasets#consuming_numpy_arrays
"""

import argparse
import os
import sys

import numpy as np
import tensorflow as tf
from tensorflow.contrib.boosted_trees.estimator_batch.estimator import GradientBoostedDecisionTreeClassifier
from tensorflow.contrib.boosted_trees.proto import learner_pb2
from tensorflow.contrib.layers.python.layers import feature_column
from tensorflow.contrib.learn import learn_runner

FLAGS = None


def _get_tfbt(output_dir, feature_cols):
    """Configures TF Boosted Trees estimator based on flags."""
    learner_config = learner_pb2.LearnerConfig()

    learner_config.learning_rate_tuner.fixed.learning_rate = FLAGS.learning_rate
    learner_config.regularization.l1 = 0.0
    # Set the regularization per instance so that the total over one batch of
    # examples equals the l2 flag (e.g. with the defaults --l2=1.0 and
    # --batch_size=10000, each example contributes 1e-4).
    learner_config.regularization.l2 = FLAGS.l2 / FLAGS.batch_size
    learner_config.constraints.max_tree_depth = FLAGS.depth
    learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER

    run_config = tf.contrib.learn.RunConfig(save_checkpoints_secs=30)

    # Create a TF Boosted Trees classification estimator.
    estimator = GradientBoostedDecisionTreeClassifier(
        learner_config=learner_config,
        examples_per_layer=FLAGS.examples_per_layer,
        n_classes=2,
        num_trees=FLAGS.num_trees,
        feature_columns=feature_cols,
        model_dir=output_dir,
        config=run_config,
        center_bias=False)
    return estimator


def _matrix_to_dict(matrix, col_names):
    """Split a 2D feature matrix into a dict of (n, 1) column arrays."""
    return {
        feat_name: matrix[:, feat_idx, np.newaxis]
        for feat_idx, feat_name in enumerate(col_names)}


def _make_input_fn(which_set):
    data = np.load('airlines_data.npz')
    feature_names = data['feature_names']

    feature_columns = [feature_column.real_valued_column(k)
                       for k in feature_names]

    if which_set == 'train':
        return feature_columns, tf.estimator.inputs.numpy_input_fn(
            x=_matrix_to_dict(data['X_train'], feature_names),
            y=data['y_train'],
            # Use the flag here so that the l2 normalization in _get_tfbt
            # stays consistent with the actual batch size.
            batch_size=FLAGS.batch_size,
            num_epochs=None,
            shuffle=True)
    elif which_set == 'test':
        return feature_columns, tf.estimator.inputs.numpy_input_fn(
            x=_matrix_to_dict(data['X_test'], feature_names),
            y=data['y_test'],
            num_epochs=1,
            shuffle=False)
    else:
        raise ValueError('Unknown dataset split: {}'.format(which_set))


def _make_experiment_fn(output_dir):
    feature_columns, train_input_fn = _make_input_fn('train')
    _, test_input_fn = _make_input_fn('test')

    return tf.contrib.learn.Experiment(
        estimator=_get_tfbt(output_dir, feature_columns),
        train_input_fn=train_input_fn,
        eval_input_fn=test_input_fn,
        train_steps=None,
        eval_metrics=None,
        eval_steps=None,  # Run through the test data once
    )


def main(unused_argv):
    learn_runner.run(
        experiment_fn=_make_experiment_fn,
        output_dir=FLAGS.output_dir,
        schedule='train_and_evaluate')

    # Run inference on the test dataset
    feature_columns, test_input_fn = _make_input_fn('test')

    estimator = _get_tfbt(FLAGS.output_dir, feature_columns)
    results = estimator.predict(input_fn=test_input_fn)

    # predict() yields one dict per example; 'probabilities' holds
    # [P(on time), P(delayed)], so index 1 is the positive class.
    y_predict = np.array([r['probabilities'][1] for r in results])
    np.save(os.path.join(FLAGS.output_dir, 'pred_tf.npy'), y_predict)


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--batch_size",
        type=int,
        default=10000,
        help="The batch size for reading data.")
    parser.add_argument(
        "--depth",
        type=int,
        default=6,
        help="Maximum depth of weak learners.")
    parser.add_argument(
        "--l2",
        type=float,
        default=1.0,
        help="l2 regularization per batch.")
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=0.1,
        help="Learning rate (shrinkage weight) with which each new tree is added.")
    parser.add_argument(
        "--examples_per_layer",
        type=int,
        default=5000,
        help="Number of examples to accumulate stats for per layer.")
    parser.add_argument(
        "--num_trees",
        type=int,
        default=10,
        help="Number of trees to grow before stopping.")

    FLAGS, unparsed = parser.parse_known_args()

    FLAGS.output_dir = 'outputs/tf_t{:03d}_d{:02d}_ex{:05d}'.format(
        FLAGS.num_trees, FLAGS.depth, FLAGS.examples_per_layer)

    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
--------------------------------------------------------------------------------
/do_xgboost.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""
Train a gradient boosting classifier on the airlines dataset using
XGBoost's python API.
"""

import argparse

import numpy as np
from xgboost import XGBClassifier

FLAGS = None


def train_and_predict(X_train, y_train, X_test, y_test, **kwargs):
    """Run training and evaluation using xgboost."""

    bst = XGBClassifier(
        max_depth=FLAGS.depth,
        learning_rate=FLAGS.learning_rate,
        n_estimators=FLAGS.num_trees,
        silent=False,
        objective='binary:logistic',
        nthread=-1,
        seed=42,
    )
    bst.fit(X_train, y_train)
    # Probability of the positive class (delayed flight).
    y_pred = bst.predict_proba(X_test)[:, 1]

    # Save predictions
    np.save(
        'outputs/pred_xgb_t{:03d}_d{:02d}.npy'.format(FLAGS.num_trees, FLAGS.depth),
        y_pred)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--num_trees",
        type=int,
        default=10,
        help="Number of trees to grow before stopping.")
    parser.add_argument(
        "--depth",
        type=int,
        default=6,
        help="Maximum depth of weak learners.")
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=0.1,
        help="Learning rate (shrinkage weight) with which each new tree is added.")

    FLAGS = parser.parse_args()

    data = np.load('airlines_data.npz')
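    # np.load returns an NpzFile, which behaves like a mapping, so it can be
    # unpacked straight into keyword arguments; keys that train_and_predict
    # does not name explicitly (X_val, y_val, feature_names) are absorbed by
    # **kwargs.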
    train_and_predict(**data)
--------------------------------------------------------------------------------
/plots/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicolov/gradient_boosting_tensorflow_xgboost/628e7cd5ecd141a4c148e8b725ee7b708e1ae804/plots/.gitkeep
--------------------------------------------------------------------------------
/plots/by_hour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicolov/gradient_boosting_tensorflow_xgboost/628e7cd5ecd141a4c148e8b725ee7b708e1ae804/plots/by_hour.png
--------------------------------------------------------------------------------
/plots/exc_times.csv:
--------------------------------------------------------------------------------
Model,Wall time,Total time
XGB,2.54,42.06
TF (1k ex/l),40.46,124.12
TF (5k ex/l),238.30,659.74
--------------------------------------------------------------------------------
/plots/execution_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicolov/gradient_boosting_tensorflow_xgboost/628e7cd5ecd141a4c148e8b725ee7b708e1ae804/plots/execution_time.png
--------------------------------------------------------------------------------
/plots/results_table.txt:
--------------------------------------------------------------------------------
                   Model  AUC score
      XGBoost (50 trees)       67.7
TensorFlow (1k ex/layer)       62.2
TensorFlow (5k ex/layer)       66.2
--------------------------------------------------------------------------------
/plots/roc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicolov/gradient_boosting_tensorflow_xgboost/628e7cd5ecd141a4c148e8b725ee7b708e1ae804/plots/roc.png
--------------------------------------------------------------------------------
/preprocess_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

"""
Prepare datasets for training and testing.
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Subset of the columns to use
cols = ['Month', 'DayOfWeek', 'Distance',
        'DepDelay', 'CRSDepTime', 'UniqueCarrier', 'Origin', 'Dest']
categorical_cols = ['UniqueCarrier', 'Origin', 'Dest']


def _get_df_from_file(file_name, n):
    df = pd.read_csv(file_name, usecols=cols)
    df = df.dropna(subset=cols)
    # Keep `n` samples
    df = df.sample(n=n, random_state=42)

    # Create binary labels from the delay column, and delete it from the
    # training data
    labels = df.DepDelay > 15
    del df['DepDelay']

    # Discard minutes in departure times (CRSDepTime is in hhmm format)
    df.CRSDepTime = df.CRSDepTime // 100
    return df, labels


if __name__ == '__main__':
    df2006, y2006 = _get_df_from_file('2006.csv', 100*1000)
    df2007, y2007 = _get_df_from_file('2007.csv', 200*1000)

    # xgboost wants numbers for categorical variables. Fit the encoders on
    # both years so train and test share the same integer mapping.
    for col in categorical_cols:
        lenc = LabelEncoder().fit(pd.concat([df2006[col], df2007[col]]))
        df2006[col] = lenc.transform(df2006[col])
        df2007[col] = lenc.transform(df2007[col])

    # Get test/validation sets from 2007 data
    X_val, X_test, y_val, y_test = train_test_split(
        df2007, y2007, test_size=0.5, random_state=43)

    data = dict(
        feature_names=df2006.columns,
        X_train=df2006, y_train=y2006,
        X_test=X_test, y_test=y_test,
        X_val=X_val, y_val=y_val,
    )
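
    # Note: np.savez below coerces the DataFrames, Series and column Index
    # into plain ndarrays, which is the format the training scripts expect.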
    for k, v in data.items():
        print(k, v.shape)

    np.savez('airlines_data.npz', **data)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
backports.functools-lru-cache==1.4
backports.shutil-get-terminal-size==1.0.0
backports.weakref==1.0.post1
bleach==1.5.0
cycler==0.10.0
decorator==4.1.2
enum34==1.1.6
feather-format==0.4.0
funcsigs==1.0.2
futures==3.1.1
html5lib==0.9999999
ipython==5.5.0
ipython-genutils==0.2.0
Markdown==2.6.9
matplotlib==2.1.0
mock==2.0.0
numpy==1.13.3
pandas==0.21.0
pathlib2==2.3.0
pbr==3.1.1
pexpect==4.2.1
pickleshare==0.7.4
prompt-toolkit==1.0.15
protobuf==3.4.0
ptyprocess==0.5.2
pyarrow==0.7.1
Pygments==2.2.0
pyparsing==2.2.0
PySide==1.2.4
python-dateutil==2.6.1
pytz==2017.3
scandir==1.6
scikit-learn==0.19.1
scipy==1.0.0
seaborn==0.8.1
simplegeneric==0.8.1
six==1.11.0
subprocess32==3.2.7
tensorflow==1.4.0
tensorflow-tensorboard==0.4.0rc2
traitlets==4.3.2
wcwidth==0.1.7
Werkzeug==0.12.2
xgboost==0.6a2
--------------------------------------------------------------------------------