├── .DS_Store ├── .gitattributes ├── Chapter01 ├── .ipynb_checkpoints │ └── Gradient_Boosting_in_Machine_Learning-checkpoint.ipynb ├── Gradient_Boosting_in_Machine_Learning.ipynb ├── bike_rentals.csv └── bike_rentals_cleaned.csv ├── Chapter02 ├── .DS_Store ├── .ipynb_checkpoints │ └── Decision_Trees_In_Depth-checkpoint.ipynb ├── 1st Degree Polynomial - High Bias.png ├── 3rd Degree Polynomial - Balanced.png ├── 8th Degree Polynomial - High Variance.png ├── Decision_Trees_In_Depth.ipynb ├── Random_Points.png ├── bike_rentals_cleaned.csv ├── census_cleaned.csv ├── census_tree.png └── heart_disease.csv ├── Chapter03 ├── .ipynb_checkpoints │ └── Random_Forest_Baggging-checkpoint.ipynb ├── Random_Forest_Baggging.ipynb ├── Random_Forest_Bike_Rentals.png ├── Random_Forest_Warm_Start.png ├── XGBoost_v_Random_Forest.png ├── bike_rentals_cleaned.csv └── census_cleaned.csv ├── Chapter04 ├── .DS_Store ├── .ipynb_checkpoints │ └── Gradient_Boosting-checkpoint.ipynb ├── Gradient Boosting learning_rate 30 trees.png ├── Gradient Boosting learning_rate 300 trees.png ├── Gradient Boosting learning_rate 3000 trees.png ├── Gradient_Boosting.ipynb ├── bike_rentals_cleaned.csv ├── exoplanets.csv └── exoplanets.csv.zip ├── Chapter05 ├── .DS_Store ├── .gitattributes ├── .ipynb_checkpoints │ └── Advanced_XGBoost_Unveiled-checkpoint.ipynb ├── Advanced_XGBoost_Unveiled.ipynb ├── atlas-higgs-challenge-2014-v2.csv.gz └── higgs.model ├── Chapter06 ├── .DS_Store ├── .ipynb_checkpoints │ └── XGBoost_Hyperparameters-checkpoint.ipynb ├── XGBoost_Hyperparameters.ipynb └── heart_disease.csv ├── Chapter07 ├── .DS_Store ├── .ipynb_checkpoints │ └── Discovering_Exoplanets-checkpoint.ipynb ├── Discovering_Exoplanets.ipynb ├── Light Plot 0.png ├── Light Plot 1.png ├── Light Plot 37.png ├── Light Plot 38.png ├── Light Plot 39.png ├── exoplanets.csv └── exoplanets.csv.zip ├── Chapter08 ├── .DS_Store ├── .ipynb_checkpoints │ └── Alternative_Base_Learners-checkpoint.ipynb ├── Alternative_Base_Learners.ipynb └── census_cleaned.csv ├── Chapter09 ├── .DS_Store ├── .ipynb_checkpoints │ └── Kaggle_Winners-checkpoint.ipynb ├── Kaggle_Winners.ipynb ├── cab_rides.csv └── weather.csv ├── Chapter10 ├── .DS_Store ├── .ipynb_checkpoints │ ├── XGBoost_Model_Deployment-Copy1-checkpoint.ipynb │ ├── XGBoost_Model_Deployment-Copy2-checkpoint.ipynb │ └── XGBoost_Model_Deployment-checkpoint.ipynb ├── XGBoost_Model_Deployment-Copy1.ipynb ├── XGBoost_Model_Deployment-Copy2.ipynb ├── XGBoost_Model_Deployment.ipynb └── student-por.csv ├── LICENSE └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.csv filter=lfs diff=lfs merge=lfs -text 2 | *.gz filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /Chapter01/bike_rentals.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c37fd915a375aabaf026c3b5efac46dd7490a817c1a56b6e9b8ca83d2537e261 3 | size 68796 4 | -------------------------------------------------------------------------------- /Chapter01/bike_rentals_cleaned.csv: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:048ee648708e302171ba1d42cf64c536274fbd059210f38da03c699a3386d7af 3 | size 50793 4 | -------------------------------------------------------------------------------- /Chapter02/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter02/.DS_Store -------------------------------------------------------------------------------- /Chapter02/.ipynb_checkpoints/Decision_Trees_In_Depth-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 29, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Import pandas and numpy\n", 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "# Import warnings\n", 14 | "import warnings\n", 15 | "warnings.filterwarnings('ignore')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 30, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# Load dataset 'census_cleaned.csv'\n", 25 | "df_census = pd.read_csv('census_cleaned.csv')\n", 26 | "\n", 27 | "# Split data into X and y\n", 28 | "X = df_census.iloc[:,:-1]\n", 29 | "y = df_census.iloc[:,-1]\n", 30 | "\n", 31 | "# Import train_test_split\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "\n", 34 | "# Split data into train and test sets\n", 35 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 31, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "0.8131679154894976" 47 | ] 48 | }, 49 | "execution_count": 31, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "# Import Decision Tree classifier\n", 56 | "from sklearn.tree import DecisionTreeClassifier\n", 57 | "\n", 58 | "# Import accuracy_score\n", 59 | "from sklearn.metrics import accuracy_score\n", 60 | "\n", 61 | "# Initialize classification model\n", 62 | "clf = DecisionTreeClassifier(random_state=2)\n", 63 | "\n", 64 | "# Fit model on training data\n", 65 | "clf.fit(X_train, y_train)\n", 66 | "\n", 67 | "# Make predictions for test data\n", 68 | "y_pred = clf.predict(X_test)\n", 69 | "\n", 70 | "# Calculate accuracy\n", 71 | "accuracy_score(y_pred, y_test)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 32, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# Download bike_rentals_cleaned dataset\n", 81 | "df_bikes = pd.read_csv('bike_rentals_cleaned.csv')\n", 82 | "\n", 83 | "# split data into X and y\n", 84 | "X_bikes = df_bikes.iloc[:,:-1]\n", 85 | "y_bikes = df_bikes.iloc[:,-1]\n", 86 | "\n", 87 | "# Import Linear Regression\n", 88 | "from sklearn.linear_model import LinearRegression\n", 89 | "\n", 90 | "# Split data into train and test sets\n", 91 | "X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 33, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "# Import Decision Tree Regressor\n", 101 | "from sklearn.tree import DecisionTreeRegressor\n", 102 | "\n", 103 | "# Import cross_val_score\n", 
104 | "from sklearn.model_selection import cross_val_score" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 34, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "RMSE mean: 1233.36\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "# Initialize Decision Tree Regressor\n", 122 | "reg = DecisionTreeRegressor(random_state=2)\n", 123 | "\n", 124 | "# Obtain scores of cross-validation using mean squared error\n", 125 | "scores = cross_val_score(reg, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=5)\n", 126 | "\n", 127 | "# Take square root of the scores\n", 128 | "rmse = np.sqrt(-scores)\n", 129 | "\n", 130 | "# Display mean score\n", 131 | "print('RMSE mean: %0.2f' % (rmse.mean()))" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 35, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "0.0" 143 | ] 144 | }, 145 | "execution_count": 35, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "reg = DecisionTreeRegressor()\n", 152 | "reg.fit(X_train, y_train)\n", 153 | "y_pred = reg.predict(X_train)\n", 154 | "from sklearn.metrics import mean_squared_error\n", 155 | "reg_mse = mean_squared_error(y_train, y_pred)\n", 156 | "reg_rmse = np.sqrt(reg_mse)\n", 157 | "reg_rmse" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 36, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "RMSE mean: 1205.97\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "# Initialize Decision Tree Regressor\n", 175 | "reg = DecisionTreeRegressor(random_state=2, max_depth=6)\n", 176 | "\n", 177 | "# Obtain scores of cross-validation using mean squared error\n", 178 | "scores = cross_val_score(reg, X_bikes, y_bikes, scoring='neg_mean_squared_error', cv=5)\n", 179 | "\n", 180 | "# Take square root of the scores\n", 181 | "rmse = np.sqrt(-scores)\n", 182 | "\n", 183 | "# Display mean score\n", 184 | "print('RMSE mean: %0.2f' % (rmse.mean()))" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 37, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "Best params: {'max_depth': 6}\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "# Import GridSearchCV\n", 202 | "from sklearn.model_selection import GridSearchCV\n", 203 | "\n", 204 | "# Choose max_depth hyperparameters\n", 205 | "params = {'max_depth':[None,2,3,4,6,8,10,20]}\n", 206 | "\n", 207 | "# Initialize regression model as reg\n", 208 | "reg = DecisionTreeRegressor(random_state=2)\n", 209 | "\n", 210 | "# Initialize GridSearchCV as grid_reg\n", 211 | "grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)\n", 212 | "\n", 213 | "# Fit grid_reg on X_train and y_train\n", 214 | "grid_reg.fit(X_train, y_train)\n", 215 | "\n", 216 | "# Extract best parameters\n", 217 | "best_params = grid_reg.best_params_\n", 218 | "\n", 219 | "# Print best hyperparameters\n", 220 | "print(\"Best params:\", best_params)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 38, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "Training score: 951.398\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "# Compute best score\n", 238 | 
"best_score = np.sqrt(-grid_reg.best_score_)\n", 239 | "\n", 240 | "# Print best score\n", 241 | "print(\"Training score: {:.3f}\".format(best_score))" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 39, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "Test score: 864.670\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "# Extract best model\n", 259 | "best_model = grid_reg.best_estimator_\n", 260 | "\n", 261 | "# Predict test set labels\n", 262 | "y_pred = best_model.predict(X_test)\n", 263 | "\n", 264 | "# Import mean_squared_error from sklearn.metrics as MSE \n", 265 | "from sklearn.metrics import mean_squared_error\n", 266 | "\n", 267 | "# Compute rmse_test\n", 268 | "rmse_test = mean_squared_error(y_test, y_pred)**0.5\n", 269 | "\n", 270 | "# Print rmse_test\n", 271 | "print('Test score: {:.3f}'.format(rmse_test))" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 40, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "def grid_search(params, reg=DecisionTreeRegressor(random_state=2)):\n", 281 | "\n", 282 | " # Instantiate GridSearchCV as grid_reg\n", 283 | " grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)\n", 284 | " \n", 285 | " # Fit grid_reg on X_train and y_train\n", 286 | " grid_reg.fit(X_train, y_train)\n", 287 | "\n", 288 | " # Extract best params\n", 289 | " best_params = grid_reg.best_params_\n", 290 | "\n", 291 | " # Print best params\n", 292 | " print(\"Best params:\", best_params)\n", 293 | " \n", 294 | " # Compute best score\n", 295 | " best_score = np.sqrt(-grid_reg.best_score_)\n", 296 | "\n", 297 | " # Print best score\n", 298 | " print(\"Training score: {:.3f}\".format(best_score))\n", 299 | "\n", 300 | " # Predict test set labels\n", 301 | " y_pred = grid_reg.predict(X_test)\n", 302 | "\n", 303 | " # Compute rmse_test\n", 304 | " rmse_test = mean_squared_error(y_test, y_pred)**0.5\n", 305 | "\n", 306 | " # Print rmse_test\n", 307 | " print('Test score: {:.3f}'.format(rmse_test))" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 41, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "(548, 12)" 319 | ] 320 | }, 321 | "execution_count": 41, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "X_train.shape" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 42, 333 | "metadata": {}, 334 | "outputs": [ 335 | { 336 | "name": "stdout", 337 | "output_type": "stream", 338 | "text": [ 339 | "Best params: {'min_samples_leaf': 8}\n", 340 | "Training score: 896.083\n", 341 | "Test score: 855.620\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "grid_search(params={'min_samples_leaf':[1,2,4,6,8,10,20,30]})" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 15, 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | "Best params: {'max_depth': 6, 'min_samples_leaf': 2}\n", 359 | "Training score: 870.396\n", 360 | "Test score: 913.000\n" 361 | ] 362 | } 363 | ], 364 | "source": [ 365 | "grid_search(params={'max_depth':[None,2,3,4,6,8,10,20],'min_samples_leaf':[1,2,4,6,8,10,20,30]})" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 16, 371 | "metadata": { 372 | "scrolled": true 373 | }, 374 | "outputs": [ 375 | { 376 | 
"name": "stdout", 377 | "output_type": "stream", 378 | "text": [ 379 | "Best params: {'max_depth': 9, 'min_samples_leaf': 7}\n", 380 | "Training score: 888.905\n", 381 | "Test score: 878.538\n" 382 | ] 383 | } 384 | ], 385 | "source": [ 386 | "grid_search(params={'max_depth':[5,6,7,8,9],'min_samples_leaf':[3,5,7,9]})" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "# Case Study - Heart Disease" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 17, 399 | "metadata": {}, 400 | "outputs": [ 401 | { 402 | "data": { 403 | "text/html": [ 404 | "
\n", 405 | "\n", 418 | "\n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | "
agesexcptrestbpscholfbsrestecgthalachexangoldpeakslopecathaltarget
063131452331015002.30011
137121302500118703.50021
241011302040017201.42021
356111202360117800.82021
457001203540116310.62021
\n", 526 | "
" 527 | ], 528 | "text/plain": [ 529 | " age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \\\n", 530 | "0 63 1 3 145 233 1 0 150 0 2.3 0 \n", 531 | "1 37 1 2 130 250 0 1 187 0 3.5 0 \n", 532 | "2 41 0 1 130 204 0 0 172 0 1.4 2 \n", 533 | "3 56 1 1 120 236 0 1 178 0 0.8 2 \n", 534 | "4 57 0 0 120 354 0 1 163 1 0.6 2 \n", 535 | "\n", 536 | " ca thal target \n", 537 | "0 0 1 1 \n", 538 | "1 0 2 1 \n", 539 | "2 0 2 1 \n", 540 | "3 0 2 1 \n", 541 | "4 0 2 1 " 542 | ] 543 | }, 544 | "execution_count": 17, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "# Upload heart.csv to dataFrame\n", 551 | "df_heart = pd.read_csv('heart_disease.csv')\n", 552 | "\n", 553 | "# Show first five rows\n", 554 | "df_heart.head()" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 18, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "# split data into X and y\n", 564 | "X = df_heart.iloc[:,:-1]\n", 565 | "y = df_heart.iloc[:,-1]\n", 566 | "\n", 567 | "# Split data into train and test sets\n", 568 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 19, 574 | "metadata": {}, 575 | "outputs": [ 576 | { 577 | "name": "stdout", 578 | "output_type": "stream", 579 | "text": [ 580 | "Accuracy: [0.74 0.85 0.77 0.73 0.7 ]\n", 581 | "Accuracy mean: 0.76\n" 582 | ] 583 | } 584 | ], 585 | "source": [ 586 | "# Initialize Decision Tree Classifier\n", 587 | "model = DecisionTreeClassifier(random_state=2)\n", 588 | "\n", 589 | "# Obtain scores of cross-validation\n", 590 | "scores = cross_val_score(model, X, y, cv=5)\n", 591 | "\n", 592 | "# Display accuracy\n", 593 | "print('Accuracy:', np.round(scores, 2))\n", 594 | "\n", 595 | "# Display mean accuracy\n", 596 | "print('Accuracy mean: %0.2f' % (scores.mean()))" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 20, 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [ 605 | "# Import RandomizedSearchCV\n", 606 | "from sklearn.model_selection import RandomizedSearchCV\n", 607 | "\n", 608 | "def randomized_search_clf(params, runs=20, clf=DecisionTreeClassifier(random_state=2)):\n", 609 | "\n", 610 | " # Instantiate GridSearchCV as grid_reg\n", 611 | " rand_clf = RandomizedSearchCV(clf, params, n_iter=runs, \n", 612 | " cv=5, n_jobs=-1, random_state=2)\n", 613 | " \n", 614 | " # Fit grid_reg on X_train and y_train\n", 615 | " rand_clf.fit(X_train, y_train)\n", 616 | "\n", 617 | " # Extract best estimator\n", 618 | " best_model = rand_clf.best_estimator_\n", 619 | " \n", 620 | " # Extract best score\n", 621 | " best_score = rand_clf.best_score_\n", 622 | "\n", 623 | " # Print best score\n", 624 | " print(\"Training score: {:.3f}\".format(best_score))\n", 625 | "\n", 626 | " # Predict test set labels\n", 627 | " y_pred = best_model.predict(X_test)\n", 628 | "\n", 629 | " # Compute accuracy\n", 630 | " accuracy = accuracy_score(y_test, y_pred)\n", 631 | "\n", 632 | " # Print accuracy\n", 633 | " print('Test score: {:.3f}'.format(accuracy))\n", 634 | " \n", 635 | " # Return best model\n", 636 | " return best_model" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 21, 642 | "metadata": {}, 643 | "outputs": [ 644 | { 645 | "name": "stdout", 646 | "output_type": "stream", 647 | "text": [ 648 | "Training score: 0.798\n", 649 | "Test score: 0.855\n" 650 | ] 651 | }, 652 | { 653 | "data": { 654 | "text/plain": [ 655 | 
"DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=0.8,\n", 656 | " max_leaf_nodes=45, min_samples_leaf=0.04,\n", 657 | " min_samples_split=10, min_weight_fraction_leaf=0.05,\n", 658 | " random_state=2)" 659 | ] 660 | }, 661 | "execution_count": 21, 662 | "metadata": {}, 663 | "output_type": "execute_result" 664 | } 665 | ], 666 | "source": [ 667 | "randomized_search_clf(params={'criterion':['entropy', 'gini'],\n", 668 | " 'splitter':['random', 'best'],\n", 669 | " 'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01],\n", 670 | " 'min_samples_split':[2, 3, 4, 5, 6, 8, 10],\n", 671 | " 'min_samples_leaf':[1, 0.01, 0.02, 0.03, 0.04],\n", 672 | " 'min_impurity_decrease':[0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],\n", 673 | " 'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None],\n", 674 | " 'max_features':['auto', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],\n", 675 | " 'max_depth':[None, 2,4,6,8],\n", 676 | " 'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]\n", 677 | " })" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 22, 683 | "metadata": {}, 684 | "outputs": [ 685 | { 686 | "name": "stdout", 687 | "output_type": "stream", 688 | "text": [ 689 | "Training score: 0.802\n", 690 | "Test score: 0.868\n" 691 | ] 692 | }, 693 | { 694 | "data": { 695 | "text/plain": [ 696 | "DecisionTreeClassifier(max_depth=7, max_features=0.78, max_leaf_nodes=45,\n", 697 | " min_samples_leaf=0.045, min_samples_split=9,\n", 698 | " min_weight_fraction_leaf=0.06, random_state=2)" 699 | ] 700 | }, 701 | "execution_count": 22, 702 | "metadata": {}, 703 | "output_type": "execute_result" 704 | } 705 | ], 706 | "source": [ 707 | "randomized_search_clf(params={'max_depth':[None, 6, 7],\n", 708 | "'max_features':['auto', 0.78],\n", 709 | "'max_leaf_nodes':[45, None],\n", 710 | "'min_samples_leaf':[1, 0.035, 0.04, 0.045, 0.05],\n", 711 | "'min_samples_split':[2, 9, 10],\n", 712 | "'min_weight_fraction_leaf': [0.0, 0.05, 0.06, 0.07],\n", 713 | "},\n", 714 | "runs=100)" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": 23, 720 | "metadata": {}, 721 | "outputs": [ 722 | { 723 | "name": "stdout", 724 | "output_type": "stream", 725 | "text": [ 726 | "Accuracy: [0.82 0.9 0.8 0.8 0.78]\n", 727 | "Accuracy mean: 0.82\n" 728 | ] 729 | } 730 | ], 731 | "source": [ 732 | "# Initialize Decision Tree Classifier\n", 733 | "model = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,\n", 734 | " max_features=0.78, max_leaf_nodes=45,\n", 735 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 736 | " min_samples_leaf=0.045, min_samples_split=9,\n", 737 | " min_weight_fraction_leaf=0.06, presort=False, random_state=2,\n", 738 | " splitter='best')\n", 739 | "\n", 740 | "# Obtain scores of cross-validation\n", 741 | "scores = cross_val_score(model, X, y, cv=5)\n", 742 | "\n", 743 | "# Display accuracy\n", 744 | "print('Accuracy:', np.round(scores, 2))\n", 745 | "\n", 746 | "# Display mean accuracy\n", 747 | "print('Accuracy mean: %0.2f' % (scores.mean()))" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 24, 753 | "metadata": {}, 754 | "outputs": [ 755 | { 756 | "data": { 757 | "text/plain": [ 758 | "DecisionTreeClassifier(max_depth=7, max_features=0.78, max_leaf_nodes=45,\n", 759 | " min_samples_leaf=0.045, min_samples_split=9,\n", 760 | " min_weight_fraction_leaf=0.06, presort=False,\n", 761 | " random_state=2)" 762 | ] 763 | }, 764 | "execution_count": 24, 765 | "metadata": {}, 766 
| "output_type": "execute_result" 767 | } 768 | ], 769 | "source": [ 770 | "best_clf = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,\n", 771 | " max_features=0.78, max_leaf_nodes=45,\n", 772 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 773 | " min_samples_leaf=0.045, min_samples_split=9,\n", 774 | " min_weight_fraction_leaf=0.06, presort=False,\n", 775 | " random_state=2, splitter='best')\n", 776 | "best_clf.fit(X, y)" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 25, 782 | "metadata": {}, 783 | "outputs": [ 784 | { 785 | "data": { 786 | "text/plain": [ 787 | "array([0.04826754, 0.04081653, 0.48409586, 0.00568635, 0. ,\n", 788 | " 0. , 0. , 0.00859483, 0. , 0.02690379,\n", 789 | " 0. , 0.18069065, 0.20494446])" 790 | ] 791 | }, 792 | "execution_count": 25, 793 | "metadata": {}, 794 | "output_type": "execute_result" 795 | } 796 | ], 797 | "source": [ 798 | "best_clf.feature_importances_" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": 26, 804 | "metadata": {}, 805 | "outputs": [ 806 | { 807 | "data": { 808 | "text/plain": [ 809 | "[('cp', 0.4840958610240171),\n", 810 | " ('thal', 0.20494445570568706),\n", 811 | " ('ca', 0.18069065321397942)]" 812 | ] 813 | }, 814 | "execution_count": 26, 815 | "metadata": {}, 816 | "output_type": "execute_result" 817 | } 818 | ], 819 | "source": [ 820 | "# Zip columns and feature_importances_ into dict\n", 821 | "feature_dict = dict(zip(X.columns, best_clf.feature_importances_))\n", 822 | "\n", 823 | "# Import operator\n", 824 | "import operator\n", 825 | "\n", 826 | "# Sort dict by values (as list of tuples)\n", 827 | "sorted(feature_dict.items(), key=operator.itemgetter(1), reverse=True)[0:3]" 828 | ] 829 | } 830 | ], 831 | "metadata": { 832 | "kernelspec": { 833 | "display_name": "Python 3", 834 | "language": "python", 835 | "name": "python3" 836 | }, 837 | "language_info": { 838 | "codemirror_mode": { 839 | "name": "ipython", 840 | "version": 3 841 | }, 842 | "file_extension": ".py", 843 | "mimetype": "text/x-python", 844 | "name": "python", 845 | "nbconvert_exporter": "python", 846 | "pygments_lexer": "ipython3", 847 | "version": "3.7.7" 848 | } 849 | }, 850 | "nbformat": 4, 851 | "nbformat_minor": 2 852 | } 853 | -------------------------------------------------------------------------------- /Chapter02/1st Degree Polynomial - High Bias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter02/1st Degree Polynomial - High Bias.png -------------------------------------------------------------------------------- /Chapter02/3rd Degree Polynomial - Balanced.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter02/3rd Degree Polynomial - Balanced.png -------------------------------------------------------------------------------- /Chapter02/8th Degree Polynomial - High Variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter02/8th Degree Polynomial - High Variance.png 
-------------------------------------------------------------------------------- /Chapter02/Decision_Trees_In_Depth.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Import pandas and numpy\n", 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "# Import warnings\n", 14 | "import warnings\n", 15 | "warnings.filterwarnings('ignore')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# Load dataset 'census_cleaned.csv'\n", 25 | "df_census = pd.read_csv('census_cleaned.csv')\n", 26 | "\n", 27 | "# Split data into X and y\n", 28 | "X = df_census.iloc[:,:-1]\n", 29 | "y = df_census.iloc[:,-1]\n", 30 | "\n", 31 | "# Import train_test_split\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "\n", 34 | "# Split data into train and test sets\n", 35 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "0.8131679154894976" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "# Import Decision Tree classifier\n", 56 | "from sklearn.tree import DecisionTreeClassifier\n", 57 | "\n", 58 | "# Import accuracy_score\n", 59 | "from sklearn.metrics import accuracy_score\n", 60 | "\n", 61 | "# Initialize classification model\n", 62 | "clf = DecisionTreeClassifier(random_state=2)\n", 63 | "\n", 64 | "# Fit model on training data\n", 65 | "clf.fit(X_train, y_train)\n", 66 | "\n", 67 | "# Make predictions for test data\n", 68 | "y_pred = clf.predict(X_test)\n", 69 | "\n", 70 | "# Calculate accuracy\n", 71 | "accuracy_score(y_pred, y_test)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# Download bike_rentals_cleaned dataset\n", 81 | "df_bikes = pd.read_csv('bike_rentals_cleaned.csv')\n", 82 | "\n", 83 | "# Split data into X and y\n", 84 | "X_bikes = df_bikes.iloc[:,:-1]\n", 85 | "y_bikes = df_bikes.iloc[:,-1]\n", 86 | "\n", 87 | "# Import Linear Regression\n", 88 | "from sklearn.linear_model import LinearRegression\n", 89 | "\n", 90 | "# Split data into train and test sets\n", 91 | "X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "# Import Decision Tree Regressor\n", 101 | "from sklearn.tree import DecisionTreeRegressor\n", 102 | "\n", 103 | "# Import cross_val_score\n", 104 | "from sklearn.model_selection import cross_val_score" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 6, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "RMSE mean: 1233.36\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "# Initialize Decision Tree Regressor\n", 122 | "reg = DecisionTreeRegressor(random_state=2)\n", 123 | "\n", 124 | "# Obtain scores of cross-validation using mean squared error\n", 125 | "scores = cross_val_score(reg, X_bikes, y_bikes, scoring='neg_mean_squared_error', 
cv=5)\n", 126 | "\n", 127 | "# Take square root of the scores\n", 128 | "rmse = np.sqrt(-scores)\n", 129 | "\n", 130 | "# Display mean score\n", 131 | "print('RMSE mean: %0.2f' % (rmse.mean()))" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "0.0" 143 | ] 144 | }, 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "# Initialize and score DecisionTreeRegressor on training set\n", 152 | "reg = DecisionTreeRegressor()\n", 153 | "reg.fit(X_train, y_train)\n", 154 | "y_pred = reg.predict(X_train)\n", 155 | "from sklearn.metrics import mean_squared_error\n", 156 | "reg_mse = mean_squared_error(y_train, y_pred)\n", 157 | "reg_rmse = np.sqrt(reg_mse)\n", 158 | "reg_rmse" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 9, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "Best params: {'max_depth': 6}\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "# Import GridSearchCV\n", 176 | "from sklearn.model_selection import GridSearchCV\n", 177 | "\n", 178 | "# Choose max_depth hyperparameters\n", 179 | "params = {'max_depth':[None,2,3,4,6,8,10,20]}\n", 180 | "\n", 181 | "# Initialize regression model as reg\n", 182 | "reg = DecisionTreeRegressor(random_state=2)\n", 183 | "\n", 184 | "# Initialize GridSearchCV as grid_reg\n", 185 | "grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)\n", 186 | "\n", 187 | "# Fit grid_reg on X_train and y_train\n", 188 | "grid_reg.fit(X_train, y_train)\n", 189 | "\n", 190 | "# Extract best parameters\n", 191 | "best_params = grid_reg.best_params_\n", 192 | "\n", 193 | "# Print best hyperparameters\n", 194 | "print(\"Best params:\", best_params)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 10, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "Training score: 951.398\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "# Compute best score\n", 212 | "best_score = np.sqrt(-grid_reg.best_score_)\n", 213 | "\n", 214 | "# Print best score\n", 215 | "print(\"Training score: {:.3f}\".format(best_score))" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 11, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "Test score: 864.670\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "# Extract best model\n", 233 | "best_model = grid_reg.best_estimator_\n", 234 | "\n", 235 | "# Predict test set labels\n", 236 | "y_pred = best_model.predict(X_test)\n", 237 | "\n", 238 | "# Import mean_squared_error from sklearn.metrics as MSE \n", 239 | "from sklearn.metrics import mean_squared_error\n", 240 | "\n", 241 | "# Compute rmse_test\n", 242 | "rmse_test = mean_squared_error(y_test, y_pred)**0.5\n", 243 | "\n", 244 | "# Print rmse_test\n", 245 | "print('Test score: {:.3f}'.format(rmse_test))" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 12, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# Create grid_search function\n", 255 | "def grid_search(params, reg=DecisionTreeRegressor(random_state=2)):\n", 256 | "\n", 257 | " # Instantiate GridSearchCV as grid_reg\n", 258 | " grid_reg = 
GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)\n", 259 | " \n", 260 | " # Fit grid_reg on X_train and y_train\n", 261 | " grid_reg.fit(X_train, y_train)\n", 262 | "\n", 263 | " # Extract best params\n", 264 | " best_params = grid_reg.best_params_\n", 265 | "\n", 266 | " # Print best params\n", 267 | " print(\"Best params:\", best_params)\n", 268 | " \n", 269 | " # Compute best score\n", 270 | " best_score = np.sqrt(-grid_reg.best_score_)\n", 271 | "\n", 272 | " # Print best score\n", 273 | " print(\"Training score: {:.3f}\".format(best_score))\n", 274 | "\n", 275 | " # Predict test set labels\n", 276 | " y_pred = grid_reg.predict(X_test)\n", 277 | "\n", 278 | " # Compute rmse_test\n", 279 | " rmse_test = mean_squared_error(y_test, y_pred)**0.5\n", 280 | "\n", 281 | " # Print rmse_test\n", 282 | " print('Test score: {:.3f}'.format(rmse_test))" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 13, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | "(548, 12)" 294 | ] 295 | }, 296 | "execution_count": 13, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "X_train.shape" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 14, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "Best params: {'min_samples_leaf': 8}\n", 315 | "Training score: 896.083\n", 316 | "Test score: 855.620\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "grid_search(params={'min_samples_leaf':[1,2,4,6,8,10,20,30]})" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 15, 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 | "name": "stdout", 331 | "output_type": "stream", 332 | "text": [ 333 | "Best params: {'max_depth': 6, 'min_samples_leaf': 2}\n", 334 | "Training score: 870.396\n", 335 | "Test score: 913.000\n" 336 | ] 337 | } 338 | ], 339 | "source": [ 340 | "grid_search(params={'max_depth':[None,2,3,4,6,8,10,20],'min_samples_leaf':[1,2,4,6,8,10,20,30]})" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 16, 346 | "metadata": { 347 | "scrolled": true 348 | }, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "Best params: {'max_depth': 9, 'min_samples_leaf': 7}\n", 355 | "Training score: 888.905\n", 356 | "Test score: 878.538\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "grid_search(params={'max_depth':[5,6,7,8,9],'min_samples_leaf':[3,5,7,9]})" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "# Case Study - Heart Disease" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 17, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/html": [ 379 | "
\n", 380 | "\n", 393 | "\n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | "
agesexcptrestbpscholfbsrestecgthalachexangoldpeakslopecathaltarget
063131452331015002.30011
137121302500118703.50021
241011302040017201.42021
356111202360117800.82021
457001203540116310.62021
\n", 501 | "
" 502 | ], 503 | "text/plain": [ 504 | " age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \\\n", 505 | "0 63 1 3 145 233 1 0 150 0 2.3 0 \n", 506 | "1 37 1 2 130 250 0 1 187 0 3.5 0 \n", 507 | "2 41 0 1 130 204 0 0 172 0 1.4 2 \n", 508 | "3 56 1 1 120 236 0 1 178 0 0.8 2 \n", 509 | "4 57 0 0 120 354 0 1 163 1 0.6 2 \n", 510 | "\n", 511 | " ca thal target \n", 512 | "0 0 1 1 \n", 513 | "1 0 2 1 \n", 514 | "2 0 2 1 \n", 515 | "3 0 2 1 \n", 516 | "4 0 2 1 " 517 | ] 518 | }, 519 | "execution_count": 17, 520 | "metadata": {}, 521 | "output_type": "execute_result" 522 | } 523 | ], 524 | "source": [ 525 | "# Upload heart.csv to dataFrame\n", 526 | "df_heart = pd.read_csv('heart_disease.csv')\n", 527 | "\n", 528 | "# Show first five rows\n", 529 | "df_heart.head()" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 18, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "# split data into X and y\n", 539 | "X = df_heart.iloc[:,:-1]\n", 540 | "y = df_heart.iloc[:,-1]\n", 541 | "\n", 542 | "# Split data into train and test sets\n", 543 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 19, 549 | "metadata": {}, 550 | "outputs": [ 551 | { 552 | "name": "stdout", 553 | "output_type": "stream", 554 | "text": [ 555 | "Accuracy: [0.74 0.85 0.77 0.73 0.7 ]\n", 556 | "Accuracy mean: 0.76\n" 557 | ] 558 | } 559 | ], 560 | "source": [ 561 | "# Initialize Decision Tree Classifier\n", 562 | "model = DecisionTreeClassifier(random_state=2)\n", 563 | "\n", 564 | "# Obtain scores of cross-validation\n", 565 | "scores = cross_val_score(model, X, y, cv=5)\n", 566 | "\n", 567 | "# Display accuracy\n", 568 | "print('Accuracy:', np.round(scores, 2))\n", 569 | "\n", 570 | "# Display mean accuracy\n", 571 | "print('Accuracy mean: %0.2f' % (scores.mean()))" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 20, 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [ 580 | "# Import RandomizedSearchCV\n", 581 | "from sklearn.model_selection import RandomizedSearchCV\n", 582 | "\n", 583 | "def randomized_search_clf(params, runs=20, clf=DecisionTreeClassifier(random_state=2)):\n", 584 | "\n", 585 | " # Instantiate GridSearchCV as grid_reg\n", 586 | " rand_clf = RandomizedSearchCV(clf, params, n_iter=runs, \n", 587 | " cv=5, n_jobs=-1, random_state=2)\n", 588 | " \n", 589 | " # Fit grid_reg on X_train and y_train\n", 590 | " rand_clf.fit(X_train, y_train)\n", 591 | "\n", 592 | " # Extract best estimator\n", 593 | " best_model = rand_clf.best_estimator_\n", 594 | " \n", 595 | " # Extract best score\n", 596 | " best_score = rand_clf.best_score_\n", 597 | "\n", 598 | " # Print best score\n", 599 | " print(\"Training score: {:.3f}\".format(best_score))\n", 600 | "\n", 601 | " # Predict test set labels\n", 602 | " y_pred = best_model.predict(X_test)\n", 603 | "\n", 604 | " # Compute accuracy\n", 605 | " accuracy = accuracy_score(y_test, y_pred)\n", 606 | "\n", 607 | " # Print accuracy\n", 608 | " print('Test score: {:.3f}'.format(accuracy))\n", 609 | " \n", 610 | " # Return best model\n", 611 | " return best_model" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 21, 617 | "metadata": {}, 618 | "outputs": [ 619 | { 620 | "name": "stdout", 621 | "output_type": "stream", 622 | "text": [ 623 | "Training score: 0.798\n", 624 | "Test score: 0.855\n" 625 | ] 626 | }, 627 | { 628 | "data": { 629 | "text/plain": [ 630 | 
"DecisionTreeClassifier(criterion='entropy', max_depth=8, max_features=0.8,\n", 631 | " max_leaf_nodes=45, min_samples_leaf=0.04,\n", 632 | " min_samples_split=10, min_weight_fraction_leaf=0.05,\n", 633 | " random_state=2)" 634 | ] 635 | }, 636 | "execution_count": 21, 637 | "metadata": {}, 638 | "output_type": "execute_result" 639 | } 640 | ], 641 | "source": [ 642 | "randomized_search_clf(params={'criterion':['entropy', 'gini'],\n", 643 | " 'splitter':['random', 'best'],\n", 644 | " 'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01],\n", 645 | " 'min_samples_split':[2, 3, 4, 5, 6, 8, 10],\n", 646 | " 'min_samples_leaf':[1, 0.01, 0.02, 0.03, 0.04],\n", 647 | " 'min_impurity_decrease':[0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],\n", 648 | " 'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None],\n", 649 | " 'max_features':['auto', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],\n", 650 | " 'max_depth':[None, 2,4,6,8],\n", 651 | " 'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]\n", 652 | " })" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 22, 658 | "metadata": {}, 659 | "outputs": [ 660 | { 661 | "name": "stdout", 662 | "output_type": "stream", 663 | "text": [ 664 | "Training score: 0.802\n", 665 | "Test score: 0.868\n" 666 | ] 667 | }, 668 | { 669 | "data": { 670 | "text/plain": [ 671 | "DecisionTreeClassifier(max_depth=7, max_features=0.78, max_leaf_nodes=45,\n", 672 | " min_samples_leaf=0.045, min_samples_split=9,\n", 673 | " min_weight_fraction_leaf=0.06, random_state=2)" 674 | ] 675 | }, 676 | "execution_count": 22, 677 | "metadata": {}, 678 | "output_type": "execute_result" 679 | } 680 | ], 681 | "source": [ 682 | "randomized_search_clf(params={'max_depth':[None, 6, 7],\n", 683 | "'max_features':['auto', 0.78],\n", 684 | "'max_leaf_nodes':[45, None],\n", 685 | "'min_samples_leaf':[1, 0.035, 0.04, 0.045, 0.05],\n", 686 | "'min_samples_split':[2, 9, 10],\n", 687 | "'min_weight_fraction_leaf': [0.0, 0.05, 0.06, 0.07],\n", 688 | "},\n", 689 | "runs=100)" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 23, 695 | "metadata": {}, 696 | "outputs": [ 697 | { 698 | "name": "stdout", 699 | "output_type": "stream", 700 | "text": [ 701 | "Accuracy: [0.82 0.9 0.8 0.8 0.78]\n", 702 | "Accuracy mean: 0.82\n" 703 | ] 704 | } 705 | ], 706 | "source": [ 707 | "# Initialize Decision Tree Classifier\n", 708 | "model = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,\n", 709 | " max_features=0.78, max_leaf_nodes=45,\n", 710 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 711 | " min_samples_leaf=0.045, min_samples_split=9,\n", 712 | " min_weight_fraction_leaf=0.06, presort=False, random_state=2,\n", 713 | " splitter='best')\n", 714 | "\n", 715 | "# Obtain scores of cross-validation\n", 716 | "scores = cross_val_score(model, X, y, cv=5)\n", 717 | "\n", 718 | "# Display accuracy\n", 719 | "print('Accuracy:', np.round(scores, 2))\n", 720 | "\n", 721 | "# Display mean accuracy\n", 722 | "print('Accuracy mean: %0.2f' % (scores.mean()))" 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": 24, 728 | "metadata": {}, 729 | "outputs": [ 730 | { 731 | "data": { 732 | "text/plain": [ 733 | "DecisionTreeClassifier(max_depth=7, max_features=0.78, max_leaf_nodes=45,\n", 734 | " min_samples_leaf=0.045, min_samples_split=9,\n", 735 | " min_weight_fraction_leaf=0.06, presort=False,\n", 736 | " random_state=2)" 737 | ] 738 | }, 739 | "execution_count": 24, 740 | "metadata": {}, 741 
| "output_type": "execute_result" 742 | } 743 | ], 744 | "source": [ 745 | "best_clf = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,\n", 746 | " max_features=0.78, max_leaf_nodes=45,\n", 747 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 748 | " min_samples_leaf=0.045, min_samples_split=9,\n", 749 | " min_weight_fraction_leaf=0.06, presort=False,\n", 750 | " random_state=2, splitter='best')\n", 751 | "best_clf.fit(X, y)" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 25, 757 | "metadata": {}, 758 | "outputs": [ 759 | { 760 | "data": { 761 | "text/plain": [ 762 | "array([0.04826754, 0.04081653, 0.48409586, 0.00568635, 0. ,\n", 763 | " 0. , 0. , 0.00859483, 0. , 0.02690379,\n", 764 | " 0. , 0.18069065, 0.20494446])" 765 | ] 766 | }, 767 | "execution_count": 25, 768 | "metadata": {}, 769 | "output_type": "execute_result" 770 | } 771 | ], 772 | "source": [ 773 | "best_clf.feature_importances_" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": 26, 779 | "metadata": {}, 780 | "outputs": [ 781 | { 782 | "data": { 783 | "text/plain": [ 784 | "[('cp', 0.4840958610240171),\n", 785 | " ('thal', 0.20494445570568706),\n", 786 | " ('ca', 0.18069065321397942)]" 787 | ] 788 | }, 789 | "execution_count": 26, 790 | "metadata": {}, 791 | "output_type": "execute_result" 792 | } 793 | ], 794 | "source": [ 795 | "# Zip columns and feature_importances_ into dict\n", 796 | "feature_dict = dict(zip(X.columns, best_clf.feature_importances_))\n", 797 | "\n", 798 | "# Import operator\n", 799 | "import operator\n", 800 | "\n", 801 | "# Sort dict by values (as list of tuples)\n", 802 | "sorted(feature_dict.items(), key=operator.itemgetter(1), reverse=True)[0:3]" 803 | ] 804 | } 805 | ], 806 | "metadata": { 807 | "kernelspec": { 808 | "display_name": "Python 3", 809 | "language": "python", 810 | "name": "python3" 811 | }, 812 | "language_info": { 813 | "codemirror_mode": { 814 | "name": "ipython", 815 | "version": 3 816 | }, 817 | "file_extension": ".py", 818 | "mimetype": "text/x-python", 819 | "name": "python", 820 | "nbconvert_exporter": "python", 821 | "pygments_lexer": "ipython3", 822 | "version": "3.7.7" 823 | } 824 | }, 825 | "nbformat": 4, 826 | "nbformat_minor": 2 827 | } 828 | -------------------------------------------------------------------------------- /Chapter02/Random_Points.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter02/Random_Points.png -------------------------------------------------------------------------------- /Chapter02/bike_rentals_cleaned.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:76a1beb36c78dc0c974d96583476bddae8538e31a9e316745b9f54c3cb54094e 3 | size 52255 4 | -------------------------------------------------------------------------------- /Chapter02/census_cleaned.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6836a2159748b9d132c8c403faf0da5219db6486162fdbf8317de7ee8a319080 3 | size 6311513 4 | -------------------------------------------------------------------------------- /Chapter02/census_tree.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter02/census_tree.png -------------------------------------------------------------------------------- /Chapter02/heart_disease.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7c3014365675306819510a49ff289efbec1d1a6a666a2dc7652f1547b383d859 3 | size 11328 4 | -------------------------------------------------------------------------------- /Chapter03/Random_Forest_Bike_Rentals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter03/Random_Forest_Bike_Rentals.png -------------------------------------------------------------------------------- /Chapter03/Random_Forest_Warm_Start.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter03/Random_Forest_Warm_Start.png -------------------------------------------------------------------------------- /Chapter03/XGBoost_v_Random_Forest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter03/XGBoost_v_Random_Forest.png -------------------------------------------------------------------------------- /Chapter03/bike_rentals_cleaned.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:76a1beb36c78dc0c974d96583476bddae8538e31a9e316745b9f54c3cb54094e 3 | size 52255 4 | -------------------------------------------------------------------------------- /Chapter03/census_cleaned.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6836a2159748b9d132c8c403faf0da5219db6486162fdbf8317de7ee8a319080 3 | size 6311513 4 | -------------------------------------------------------------------------------- /Chapter04/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter04/.DS_Store -------------------------------------------------------------------------------- /Chapter04/.ipynb_checkpoints/Gradient_Boosting-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Import pandas and numpy\n", 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "# Silence warnings\n", 14 | "import warnings\n", 15 | "warnings.filterwarnings('ignore')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/html": [ 26 | "
\n", 27 | "\n", 40 | "\n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | "
instantseasonyrmnthholidayweekdayworkingdayweathersittempatemphumwindspeedcnt
011.00.01.00.06.00.020.3441670.3636250.8058330.160446985
121.00.01.00.00.00.020.3634780.3537390.6960870.248539801
231.00.01.00.01.01.010.1963640.1894050.4372730.2483091349
341.00.01.00.02.01.010.2000000.2121220.5904350.1602961562
451.00.01.00.03.01.010.2269570.2292700.4369570.1869001600
\n", 142 | "
" 143 | ], 144 | "text/plain": [ 145 | " instant season yr mnth holiday weekday workingday weathersit \\\n", 146 | "0 1 1.0 0.0 1.0 0.0 6.0 0.0 2 \n", 147 | "1 2 1.0 0.0 1.0 0.0 0.0 0.0 2 \n", 148 | "2 3 1.0 0.0 1.0 0.0 1.0 1.0 1 \n", 149 | "3 4 1.0 0.0 1.0 0.0 2.0 1.0 1 \n", 150 | "4 5 1.0 0.0 1.0 0.0 3.0 1.0 1 \n", 151 | "\n", 152 | " temp atemp hum windspeed cnt \n", 153 | "0 0.344167 0.363625 0.805833 0.160446 985 \n", 154 | "1 0.363478 0.353739 0.696087 0.248539 801 \n", 155 | "2 0.196364 0.189405 0.437273 0.248309 1349 \n", 156 | "3 0.200000 0.212122 0.590435 0.160296 1562 \n", 157 | "4 0.226957 0.229270 0.436957 0.186900 1600 " 158 | ] 159 | }, 160 | "execution_count": 2, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "df_bikes = pd.read_csv('bike_rentals_cleaned.csv')\n", 167 | "df_bikes.head()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 3, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# Split data into X and y\n", 177 | "X_bikes = df_bikes.iloc[:,:-1]\n", 178 | "y_bikes = df_bikes.iloc[:,-1]\n", 179 | "\n", 180 | "# Import train_test_split\n", 181 | "from sklearn.model_selection import train_test_split\n", 182 | "\n", 183 | "# Split data into train and test sets\n", 184 | "X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 43, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,\n", 196 | " max_features=None, max_leaf_nodes=None,\n", 197 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 198 | " min_samples_leaf=1, min_samples_split=2,\n", 199 | " min_weight_fraction_leaf=0.0, presort='deprecated',\n", 200 | " random_state=2, splitter='best')" 201 | ] 202 | }, 203 | "execution_count": 43, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "# Import Decision Tree Regressor\n", 210 | "from sklearn.tree import DecisionTreeRegressor\n", 211 | "\n", 212 | "# Initialize Decision Tree Regressor\n", 213 | "tree_1 = DecisionTreeRegressor(max_depth=2, random_state=2)\n", 214 | "\n", 215 | "# Fit tree to training data\n", 216 | "tree_1.fit(X_train, y_train)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 44, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "# Make predictions on training set\n", 226 | "y_train_pred = tree_1.predict(X_train)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 45, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,\n", 238 | " max_features=None, max_leaf_nodes=None,\n", 239 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 240 | " min_samples_leaf=1, min_samples_split=2,\n", 241 | " min_weight_fraction_leaf=0.0, presort='deprecated',\n", 242 | " random_state=2, splitter='best')" 243 | ] 244 | }, 245 | "execution_count": 45, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "# Compute residuals\n", 252 | "y2_train = y_train - y_train_pred\n", 253 | "\n", 254 | "# Initialize Decision Tree Regressor\n", 255 | "tree_2 = DecisionTreeRegressor(max_depth=2, random_state=2)\n", 256 | "\n", 257 | "# Fit tree to training data\n", 258 | 
"tree_2.fit(X_train, y2_train)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 50, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,\n", 270 | " max_features=None, max_leaf_nodes=None,\n", 271 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 272 | " min_samples_leaf=1, min_samples_split=2,\n", 273 | " min_weight_fraction_leaf=0.0, presort='deprecated',\n", 274 | " random_state=2, splitter='best')" 275 | ] 276 | }, 277 | "execution_count": 50, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "# Make predictions on training set\n", 284 | "y2_train_pred = tree_2.predict(X_train)\n", 285 | "\n", 286 | "# Compute residuals\n", 287 | "y3_train = y2_train - y2_train_pred\n", 288 | "\n", 289 | "# Initialize Decision Tree Regressor\n", 290 | "tree_3 = DecisionTreeRegressor(max_depth=2, random_state=2)\n", 291 | "\n", 292 | "# Fit tree to training data\n", 293 | "tree_3.fit(X_train, y3_train)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 56, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "911.0479538776444" 305 | ] 306 | }, 307 | "execution_count": 56, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "y1_pred = tree_1.predict(X_test)\n", 314 | "\n", 315 | "y2_pred = tree_2.predict(X_test)\n", 316 | "\n", 317 | "y3_pred = tree_3.predict(X_test)\n", 318 | "\n", 319 | "y_pred = y1_pred + y2_pred + y3_pred\n", 320 | "\n", 321 | "# Import mean_squared_error \n", 322 | "from sklearn.metrics import mean_squared_error as MSE\n", 323 | "\n", 324 | "# Compute root mean squared error (rmse)\n", 325 | "MSE(y_test, y_pred)**0.5\n" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 14, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "from sklearn.ensemble import GradientBoostingRegressor" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 57, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "data": { 344 | "text/plain": [ 345 | "911.0479538776439" 346 | ] 347 | }, 348 | "execution_count": 57, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "gbr = GradientBoostingRegressor(max_depth=2, n_estimators=3, random_state=2, learning_rate=1.0)\n", 355 | "\n", 356 | "gbr.fit(X_train, y_train)\n", 357 | "\n", 358 | "# Predict test data\n", 359 | "y_pred = gbr.predict(X_test)\n", 360 | "\n", 361 | "# Compute root mean squared error (rmse)\n", 362 | "MSE(y_test, y_pred)**0.5" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 58, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "857.1072323426944" 374 | ] 375 | }, 376 | "execution_count": 58, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "gbr = GradientBoostingRegressor(max_depth=2, n_estimators=30, random_state=2, learning_rate=1.0)\n", 383 | "gbr.fit(X_train, y_train)\n", 384 | "y_pred = gbr.predict(X_test)\n", 385 | "MSE(y_test, y_pred)**0.5" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 59, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/plain": [ 396 | "936.3617413678853" 397 | ] 398 | }, 399 | "execution_count": 59, 400 | "metadata": 
{}, 401 | "output_type": "execute_result" 402 | } 403 | ], 404 | "source": [ 405 | "gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2, learning_rate=1.0)\n", 406 | "gbr.fit(X_train, y_train)\n", 407 | "y_pred = gbr.predict(X_test)\n", 408 | "MSE(y_test, y_pred)**0.5" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 60, 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "653.7456840231495" 420 | ] 421 | }, 422 | "execution_count": 60, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2)\n", 429 | "gbr.fit(X_train, y_train)\n", 430 | "y_pred = gbr.predict(X_test)\n", 431 | "MSE(y_test, y_pred)**0.5" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 111, 437 | "metadata": {}, 438 | "outputs": [ 439 | { 440 | "name": "stdout", 441 | "output_type": "stream", 442 | "text": [ 443 | "Learning Rate: 0.001 , Score: 1633.0261400367258\n", 444 | "Learning Rate: 0.01 , Score: 831.5430182728547\n", 445 | "Learning Rate: 0.05 , Score: 685.0192988749717\n", 446 | "Learning Rate: 0.1 , Score: 653.7456840231495\n", 447 | "Learning Rate: 0.15 , Score: 687.666134269379\n", 448 | "Learning Rate: 0.2 , Score: 664.312804425697\n", 449 | "Learning Rate: 0.3 , Score: 689.4190385930236\n", 450 | "Learning Rate: 0.5 , Score: 693.8856905068778\n", 451 | "Learning Rate: 1.0 , Score: 936.3617413678853\n" 452 | ] 453 | } 454 | ], 455 | "source": [ 456 | "learning_rate_values = [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1.0]\n", 457 | "for value in learning_rate_values:\n", 458 | " gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2, learning_rate=value)\n", 459 | " gbr.fit(X_train, y_train)\n", 460 | " y_pred = gbr.predict(X_test)\n", 461 | " rmse = MSE(y_test, y_pred)**0.5\n", 462 | " print('Learning Rate:', value, ', Score:', rmse)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 120, 468 | "metadata": {}, 469 | "outputs": [ 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "Max Depth: None , Score: 867.9366621617327\n", 475 | "Max Depth: 1 , Score: 707.8261886858736\n", 476 | "Max Depth: 2 , Score: 653.7456840231495\n", 477 | "Max Depth: 3 , Score: 646.4045923317708\n", 478 | "Max Depth: 4 , Score: 663.048387855927\n" 479 | ] 480 | } 481 | ], 482 | "source": [ 483 | "depths = [None, 1, 2, 3, 4]\n", 484 | "for depth in depths:\n", 485 | " gbr = GradientBoostingRegressor(max_depth=depth, n_estimators=300, random_state=2)\n", 486 | " gbr.fit(X_train, y_train)\n", 487 | " y_pred = gbr.predict(X_test)\n", 488 | " rmse = MSE(y_test, y_pred)**0.5\n", 489 | " print('Max Depth:', depth, ', Score:', rmse)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 125, 495 | "metadata": {}, 496 | "outputs": [ 497 | { 498 | "name": "stdout", 499 | "output_type": "stream", 500 | "text": [ 501 | "Subsample: 1 , Score: 646.4045923317708\n", 502 | "Subsample: 0.9 , Score: 620.1819001443569\n", 503 | "Subsample: 0.8 , Score: 617.2355650565677\n", 504 | "Subsample: 0.7 , Score: 612.9879156983139\n", 505 | "Subsample: 0.6 , Score: 622.6385116402317\n", 506 | "Subsample: 0.5 , Score: 626.9974073227554\n" 507 | ] 508 | } 509 | ], 510 | "source": [ 511 | "samples = [1, 0.9, 0.8, 0.7, 0.6, 0.5]\n", 512 | "for sample in samples:\n", 513 | " gbr = 
GradientBoostingRegressor(max_depth=3, n_estimators=300, subsample=sample, random_state=2)\n", 514 | " gbr.fit(X_train, y_train)\n", 515 | " y_pred = gbr.predict(X_test)\n", 516 | " rmse = MSE(y_test, y_pred)**0.5\n", 517 | " print('Subsample:', sample, ', Score:', rmse)" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 134, 523 | "metadata": {}, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "Best params: {'learning_rate': 0.05, 'n_estimators': 300, 'subsample': 0.65}\n", 530 | "Training score: 636.200\n", 531 | "Test set score: 625.985\n" 532 | ] 533 | } 534 | ], 535 | "source": [ 536 | "params={'subsample':[0.65, 0.7, 0.75],\n", 537 | " 'n_estimators':[300, 500, 1000],\n", 538 | " 'learning_rate':[0.05, 0.075, 0.1]\n", 539 | " }\n", 540 | "\n", 541 | "reg = GradientBoostingRegressor(max_depth=3, random_state=2)\n", 542 | "\n", 543 | "# Import RandomizedSearchCV\n", 544 | "from sklearn.model_selection import GridSearchCV\n", 545 | "\n", 546 | "\n", 547 | "# Instantiate RandomizedSearchCV as grid_reg\n", 548 | "grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', \n", 549 | " cv=5, n_jobs=-1)\n", 550 | "\n", 551 | "# Fit grid_reg on X_train and y_train\n", 552 | "grid_reg.fit(X_train, y_train)\n", 553 | "\n", 554 | "# Extract best estimator\n", 555 | "best_model = grid_reg.best_estimator_\n", 556 | "\n", 557 | "# Extract best params\n", 558 | "best_params = grid_reg.best_params_\n", 559 | "\n", 560 | "# Print best params\n", 561 | "print(\"Best params:\", best_params)\n", 562 | "\n", 563 | "# Compute best score\n", 564 | "best_score = np.sqrt(-rand_reg.best_score_)\n", 565 | "\n", 566 | "# Print best score\n", 567 | "print(\"Training score: {:.3f}\".format(best_score))\n", 568 | "\n", 569 | "# Predict test set labels\n", 570 | "y_pred = best_model.predict(X_test)\n", 571 | "\n", 572 | "# Import mean_squared_error from sklearn.metrics as MSE \n", 573 | "from sklearn.metrics import mean_squared_error as MSE\n", 574 | "\n", 575 | "# Compute rmse_test\n", 576 | "rmse_test = MSE(y_test, y_pred)**0.5\n", 577 | "\n", 578 | "# Print rmse_test\n", 579 | "print('Test set score: {:.3f}'.format(rmse_test))" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 136, 585 | "metadata": {}, 586 | "outputs": [ 587 | { 588 | "data": { 589 | "text/plain": [ 590 | "625.9849010532475" 591 | ] 592 | }, 593 | "execution_count": 136, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "gbr = GradientBoostingRegressor(max_depth=3, n_estimators=300, subsample=0.65, learning_rate=0.05, random_state=2)\n", 600 | "gbr.fit(X_train, y_train)\n", 601 | "y_pred = gbr.predict(X_test)\n", 602 | "MSE(y_test, y_pred)**0.5" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "gbr = GradientBoostingRegressor(max_depth=3, n_estimators=1000, subsample=0.65, learning_rate=0.05, random_state=2)\n", 612 | "gbr.fit(X_train, y_train)\n", 613 | "y_pred = gbr.predict(X_test)\n", 614 | "MSE(y_test, y_pred)**0.5" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 151, 620 | "metadata": {}, 621 | "outputs": [ 622 | { 623 | "data": { 624 | "text/plain": [ 625 | "602.6566164753407" 626 | ] 627 | }, 628 | "execution_count": 151, 629 | "metadata": {}, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "gbr = 
GradientBoostingRegressor(max_depth=3, n_estimators=1000, subsample=0.75, learning_rate=0.05, random_state=2)\n", 635 | "gbr.fit(X_train, y_train)\n", 636 | "y_pred = gbr.predict(X_test)\n", 637 | "MSE(y_test, y_pred)**0.5" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 155, 643 | "metadata": {}, 644 | "outputs": [ 645 | { 646 | "data": { 647 | "text/plain": [ 648 | "601.4279752102285" 649 | ] 650 | }, 651 | "execution_count": 155, 652 | "metadata": {}, 653 | "output_type": "execute_result" 654 | } 655 | ], 656 | "source": [ 657 | "gbr = GradientBoostingRegressor(max_depth=3, n_estimators=1200, subsample=0.75, learning_rate=0.05, random_state=2)\n", 658 | "gbr.fit(X_train, y_train)\n", 659 | "y_pred = gbr.predict(X_test)\n", 660 | "MSE(y_test, y_pred)**0.5" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": 159, 666 | "metadata": {}, 667 | "outputs": [ 668 | { 669 | "data": { 670 | "text/plain": [ 671 | "596.9544588974487" 672 | ] 673 | }, 674 | "execution_count": 159, 675 | "metadata": {}, 676 | "output_type": "execute_result" 677 | } 678 | ], 679 | "source": [ 680 | "gbr = GradientBoostingRegressor(max_depth=3, n_estimators=1600, subsample=0.75, learning_rate=0.02, random_state=2)\n", 681 | "gbr.fit(X_train, y_train)\n", 682 | "y_pred = gbr.predict(X_test)\n", 683 | "MSE(y_test, y_pred)**0.5" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": 186, 689 | "metadata": {}, 690 | "outputs": [ 691 | { 692 | "data": { 693 | "text/plain": [ 694 | "596.3871480845066" 695 | ] 696 | }, 697 | "execution_count": 186, 698 | "metadata": {}, 699 | "output_type": "execute_result" 700 | } 701 | ], 702 | "source": [ 703 | "gbr = GradientBoostingRegressor(max_depth=3, n_estimators=1600, subsample=0.7, learning_rate=0.02, random_state=2)\n", 704 | "gbr.fit(X_train, y_train)\n", 705 | "y_pred = gbr.predict(X_test)\n", 706 | "MSE(y_test, y_pred)**0.5" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": 195, 712 | "metadata": {}, 713 | "outputs": [ 714 | { 715 | "name": "stdout", 716 | "output_type": "stream", 717 | "text": [ 718 | "[17:49:02] WARNING: /usr/local/miniconda/conda-bld/xgboost_1566327371504/work/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n" 719 | ] 720 | }, 721 | { 722 | "data": { 723 | "text/plain": [ 724 | "584.6690439795171" 725 | ] 726 | }, 727 | "execution_count": 195, 728 | "metadata": {}, 729 | "output_type": "execute_result" 730 | } 731 | ], 732 | "source": [ 733 | "# Import XGBRegressor\n", 734 | "from xgboost import XGBRegressor\n", 735 | "\n", 736 | "# Instantiate the XGBRegressor, xg_reg\n", 737 | "xg_reg = XGBRegressor(max_depth=3, n_estimators=1600, eta=0.02, subsample=0.75, random_state=2)\n", 738 | "\n", 739 | "# Fit xg_reg to training set\n", 740 | "xg_reg.fit(X_train, y_train)\n", 741 | "\n", 742 | "# Predict labels of test set, y_pred\n", 743 | "y_pred = xg_reg.predict(X_test)\n", 744 | "\n", 745 | "# Compute root mean squared error (rmse)\n", 746 | "MSE(y_test, y_pred)**0.5" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": null, 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 231, 759 | "metadata": {}, 760 | "outputs": [ 761 | { 762 | "data": { 763 | "text/html": [ 764 | "
\n", 765 | "\n", 778 | "\n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | "
LABELFLUX.1FLUX.2FLUX.3FLUX.4FLUX.5FLUX.6FLUX.7FLUX.8FLUX.9...FLUX.3188FLUX.3189FLUX.3190FLUX.3191FLUX.3192FLUX.3193FLUX.3194FLUX.3195FLUX.3196FLUX.3197
0293.8583.8120.10-26.98-39.56-124.71-135.18-96.27-79.89...-78.07-102.15-102.1525.1348.5792.5439.3261.425.08-39.54
12-38.88-33.83-58.54-40.09-79.31-72.81-86.55-85.33-83.97...-3.28-32.21-32.21-24.89-4.860.76-11.706.4616.0019.93
22532.64535.92513.73496.92456.45466.00464.50486.39436.56...-71.6913.3113.31-29.89-20.885.06-11.80-28.91-70.02-96.67
32326.52347.39302.35298.13317.74312.70322.33311.31312.42...5.71-3.73-3.7330.0520.03-12.67-8.77-17.31-17.3513.98
42-1107.21-1112.59-1118.95-1095.10-1057.55-1034.48-998.34-1022.71-989.57...-594.37-401.66-401.66-357.24-443.76-438.54-399.71-384.65-411.79-510.54
\n", 928 | "

5 rows × 3198 columns

\n", 929 | "
" 930 | ], 931 | "text/plain": [ 932 | " LABEL FLUX.1 FLUX.2 FLUX.3 FLUX.4 FLUX.5 FLUX.6 FLUX.7 \\\n", 933 | "0 2 93.85 83.81 20.10 -26.98 -39.56 -124.71 -135.18 \n", 934 | "1 2 -38.88 -33.83 -58.54 -40.09 -79.31 -72.81 -86.55 \n", 935 | "2 2 532.64 535.92 513.73 496.92 456.45 466.00 464.50 \n", 936 | "3 2 326.52 347.39 302.35 298.13 317.74 312.70 322.33 \n", 937 | "4 2 -1107.21 -1112.59 -1118.95 -1095.10 -1057.55 -1034.48 -998.34 \n", 938 | "\n", 939 | " FLUX.8 FLUX.9 ... FLUX.3188 FLUX.3189 FLUX.3190 FLUX.3191 \\\n", 940 | "0 -96.27 -79.89 ... -78.07 -102.15 -102.15 25.13 \n", 941 | "1 -85.33 -83.97 ... -3.28 -32.21 -32.21 -24.89 \n", 942 | "2 486.39 436.56 ... -71.69 13.31 13.31 -29.89 \n", 943 | "3 311.31 312.42 ... 5.71 -3.73 -3.73 30.05 \n", 944 | "4 -1022.71 -989.57 ... -594.37 -401.66 -401.66 -357.24 \n", 945 | "\n", 946 | " FLUX.3192 FLUX.3193 FLUX.3194 FLUX.3195 FLUX.3196 FLUX.3197 \n", 947 | "0 48.57 92.54 39.32 61.42 5.08 -39.54 \n", 948 | "1 -4.86 0.76 -11.70 6.46 16.00 19.93 \n", 949 | "2 -20.88 5.06 -11.80 -28.91 -70.02 -96.67 \n", 950 | "3 20.03 -12.67 -8.77 -17.31 -17.35 13.98 \n", 951 | "4 -443.76 -438.54 -399.71 -384.65 -411.79 -510.54 \n", 952 | "\n", 953 | "[5 rows x 3198 columns]" 954 | ] 955 | }, 956 | "execution_count": 231, 957 | "metadata": {}, 958 | "output_type": "execute_result" 959 | } 960 | ], 961 | "source": [ 962 | "df = pd.read_csv('exoplanets.csv')\n", 963 | "df.head()" 964 | ] 965 | }, 966 | { 967 | "cell_type": "code", 968 | "execution_count": 232, 969 | "metadata": {}, 970 | "outputs": [ 971 | { 972 | "name": "stdout", 973 | "output_type": "stream", 974 | "text": [ 975 | "\n", 976 | "RangeIndex: 5087 entries, 0 to 5086\n", 977 | "Columns: 3198 entries, LABEL to FLUX.3197\n", 978 | "dtypes: float64(3197), int64(1)\n", 979 | "memory usage: 124.1 MB\n" 980 | ] 981 | } 982 | ], 983 | "source": [ 984 | "df.info()" 985 | ] 986 | }, 987 | { 988 | "cell_type": "code", 989 | "execution_count": 233, 990 | "metadata": {}, 991 | "outputs": [ 992 | { 993 | "data": { 994 | "text/plain": [ 995 | "0" 996 | ] 997 | }, 998 | "execution_count": 233, 999 | "metadata": {}, 1000 | "output_type": "execute_result" 1001 | } 1002 | ], 1003 | "source": [ 1004 | "df.isnull().sum().sum()" 1005 | ] 1006 | }, 1007 | { 1008 | "cell_type": "code", 1009 | "execution_count": 203, 1010 | "metadata": {}, 1011 | "outputs": [], 1012 | "source": [ 1013 | "# Split data into X and y\n", 1014 | "X = df.iloc[:,1:]\n", 1015 | "y = df.iloc[:,0]\n", 1016 | "\n", 1017 | "# Split data into train and test sets\n", 1018 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)" 1019 | ] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": 213, 1024 | "metadata": {}, 1025 | "outputs": [], 1026 | "source": [ 1027 | "from sklearn.ensemble import GradientBoostingClassifier\n", 1028 | "# Import XGBRegressor\n", 1029 | "from xgboost import XGBClassifier\n", 1030 | "\n", 1031 | "# Import accuracy_score\n", 1032 | "from sklearn.metrics import accuracy_score" 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "execution_count": 228, 1038 | "metadata": {}, 1039 | "outputs": [ 1040 | { 1041 | "data": { 1042 | "text/plain": [ 1043 | "1585509739.918818" 1044 | ] 1045 | }, 1046 | "execution_count": 228, 1047 | "metadata": {}, 1048 | "output_type": "execute_result" 1049 | } 1050 | ], 1051 | "source": [ 1052 | "import time\n", 1053 | "time.time()" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "execution_count": 227, 1059 | "metadata": {}, 1060 | 
"outputs": [ 1061 | { 1062 | "name": "stdout", 1063 | "output_type": "stream", 1064 | "text": [ 1065 | "\n", 1066 | "RangeIndex: 5087 entries, 0 to 5086\n", 1067 | "Columns: 3198 entries, LABEL to FLUX.3197\n", 1068 | "dtypes: float64(3197), int64(1)\n", 1069 | "memory usage: 124.1 MB\n", 1070 | "\n", 1071 | "Run Time: 0.0525362491607666 seconds.\n" 1072 | ] 1073 | } 1074 | ], 1075 | "source": [ 1076 | "import time\n", 1077 | "start = time.time()\n", 1078 | "\n", 1079 | "df.info()\n", 1080 | "\n", 1081 | "end = time.time()\n", 1082 | "elapsed = end - start\n", 1083 | "\n", 1084 | "print('\\nRun Time: ' + str(elapsed) + ' seconds.')" 1085 | ] 1086 | }, 1087 | { 1088 | "cell_type": "code", 1089 | "execution_count": 230, 1090 | "metadata": {}, 1091 | "outputs": [ 1092 | { 1093 | "name": "stdout", 1094 | "output_type": "stream", 1095 | "text": [ 1096 | "Score: 0.9874213836477987\n", 1097 | "\n", 1098 | "Run Time: 317.6318619251251 seconds.\n" 1099 | ] 1100 | } 1101 | ], 1102 | "source": [ 1103 | "start = time.time()\n", 1104 | "\n", 1105 | "gbr = GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=2)\n", 1106 | "gbr.fit(X_train, y_train)\n", 1107 | "y_pred = gbr.predict(X_test)\n", 1108 | "score = accuracy_score(y_pred, y_test)\n", 1109 | "print('Score: ' + str(score))\n", 1110 | "\n", 1111 | "end = time.time()\n", 1112 | "elapsed = end - start\n", 1113 | "\n", 1114 | "print('Run Time: ' + str(elapsed) + ' seconds')" 1115 | ] 1116 | }, 1117 | { 1118 | "cell_type": "code", 1119 | "execution_count": 229, 1120 | "metadata": {}, 1121 | "outputs": [ 1122 | { 1123 | "name": "stdout", 1124 | "output_type": "stream", 1125 | "text": [ 1126 | "Score: 0.9913522012578616\n", 1127 | "\n", 1128 | "Run Time: 118.90568995475769 seconds.\n" 1129 | ] 1130 | } 1131 | ], 1132 | "source": [ 1133 | "start = time.time()\n", 1134 | "\n", 1135 | "# Instantiate the XGBRegressor, xg_reg\n", 1136 | "xg_reg = XGBClassifier(n_estimators=100, max_depth=2, random_state=2)\n", 1137 | "\n", 1138 | "# Fit xg_reg to training set\n", 1139 | "xg_reg.fit(X_train, y_train)\n", 1140 | "\n", 1141 | "# Predict labels of test set, y_pred\n", 1142 | "y_pred = xg_reg.predict(X_test)\n", 1143 | "\n", 1144 | "score = accuracy_score(y_pred, y_test)\n", 1145 | "\n", 1146 | "print('Score: ' + str(score))\n", 1147 | "\n", 1148 | "end = time.time()\n", 1149 | "elapsed = end - start\n", 1150 | "\n", 1151 | "print('Run Time: ' + str(elapsed) + ' seconds')" 1152 | ] 1153 | } 1154 | ], 1155 | "metadata": { 1156 | "kernelspec": { 1157 | "display_name": "Python 3", 1158 | "language": "python", 1159 | "name": "python3" 1160 | }, 1161 | "language_info": { 1162 | "codemirror_mode": { 1163 | "name": "ipython", 1164 | "version": 3 1165 | }, 1166 | "file_extension": ".py", 1167 | "mimetype": "text/x-python", 1168 | "name": "python", 1169 | "nbconvert_exporter": "python", 1170 | "pygments_lexer": "ipython3", 1171 | "version": "3.6.10" 1172 | } 1173 | }, 1174 | "nbformat": 4, 1175 | "nbformat_minor": 2 1176 | } 1177 | -------------------------------------------------------------------------------- /Chapter04/Gradient Boosting learning_rate 30 trees.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter04/Gradient Boosting learning_rate 30 trees.png -------------------------------------------------------------------------------- /Chapter04/Gradient Boosting 
learning_rate 300 trees.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter04/Gradient Boosting learning_rate 300 trees.png -------------------------------------------------------------------------------- /Chapter04/Gradient Boosting learning_rate 3000 trees.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter04/Gradient Boosting learning_rate 3000 trees.png -------------------------------------------------------------------------------- /Chapter04/Gradient_Boosting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Import pandas and numpy\n", 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "# Silence warnings\n", 14 | "import warnings\n", 15 | "warnings.filterwarnings('ignore')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/html": [ 26 | "
\n", 27 | "\n", 40 | "\n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | "
instantseasonyrmnthholidayweekdayworkingdayweathersittempatemphumwindspeedcnt
011.00.01.00.06.00.020.3441670.3636250.8058330.160446985
121.00.01.00.00.00.020.3634780.3537390.6960870.248539801
231.00.01.00.01.01.010.1963640.1894050.4372730.2483091349
341.00.01.00.02.01.010.2000000.2121220.5904350.1602961562
451.00.01.00.03.01.010.2269570.2292700.4369570.1869001600
\n", 142 | "
" 143 | ], 144 | "text/plain": [ 145 | " instant season yr mnth holiday weekday workingday weathersit \\\n", 146 | "0 1 1.0 0.0 1.0 0.0 6.0 0.0 2 \n", 147 | "1 2 1.0 0.0 1.0 0.0 0.0 0.0 2 \n", 148 | "2 3 1.0 0.0 1.0 0.0 1.0 1.0 1 \n", 149 | "3 4 1.0 0.0 1.0 0.0 2.0 1.0 1 \n", 150 | "4 5 1.0 0.0 1.0 0.0 3.0 1.0 1 \n", 151 | "\n", 152 | " temp atemp hum windspeed cnt \n", 153 | "0 0.344167 0.363625 0.805833 0.160446 985 \n", 154 | "1 0.363478 0.353739 0.696087 0.248539 801 \n", 155 | "2 0.196364 0.189405 0.437273 0.248309 1349 \n", 156 | "3 0.200000 0.212122 0.590435 0.160296 1562 \n", 157 | "4 0.226957 0.229270 0.436957 0.186900 1600 " 158 | ] 159 | }, 160 | "execution_count": 2, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "df_bikes = pd.read_csv('bike_rentals_cleaned.csv')\n", 167 | "df_bikes.head()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 3, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# Split data into X and y\n", 177 | "X_bikes = df_bikes.iloc[:,:-1]\n", 178 | "y_bikes = df_bikes.iloc[:,-1]\n", 179 | "\n", 180 | "# Import train_test_split\n", 181 | "from sklearn.model_selection import train_test_split\n", 182 | "\n", 183 | "# Split data into train and test sets\n", 184 | "X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 4, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "DecisionTreeRegressor(max_depth=2, random_state=2)" 196 | ] 197 | }, 198 | "execution_count": 4, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "# Import Decision Tree Regressor\n", 205 | "from sklearn.tree import DecisionTreeRegressor\n", 206 | "\n", 207 | "# Initialize Decision Tree Regressor\n", 208 | "tree_1 = DecisionTreeRegressor(max_depth=2, random_state=2)\n", 209 | "\n", 210 | "# Fit tree to training data\n", 211 | "tree_1.fit(X_train, y_train)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 5, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# Make predictions on training set\n", 221 | "y_train_pred = tree_1.predict(X_train)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 6, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "DecisionTreeRegressor(max_depth=2, random_state=2)" 233 | ] 234 | }, 235 | "execution_count": 6, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "# Compute residuals\n", 242 | "y2_train = y_train - y_train_pred\n", 243 | "\n", 244 | "# Initialize Decision Tree Regressor\n", 245 | "tree_2 = DecisionTreeRegressor(max_depth=2, random_state=2)\n", 246 | "\n", 247 | "# Fit tree to training data\n", 248 | "tree_2.fit(X_train, y2_train)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 7, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "data": { 258 | "text/plain": [ 259 | "DecisionTreeRegressor(max_depth=2, random_state=2)" 260 | ] 261 | }, 262 | "execution_count": 7, 263 | "metadata": {}, 264 | "output_type": "execute_result" 265 | } 266 | ], 267 | "source": [ 268 | "# Make predictions on training set\n", 269 | "y2_train_pred = tree_2.predict(X_train)\n", 270 | "\n", 271 | "# Compute residuals\n", 272 | "y3_train = y2_train - y2_train_pred\n", 273 | "\n", 274 | "# 
Initialize Decision Tree Regressor\n", 275 | "tree_3 = DecisionTreeRegressor(max_depth=2, random_state=2)\n", 276 | "\n", 277 | "# Fit tree to training data\n", 278 | "tree_3.fit(X_train, y3_train)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 8, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/plain": [ 289 | "911.0479538776444" 290 | ] 291 | }, 292 | "execution_count": 8, 293 | "metadata": {}, 294 | "output_type": "execute_result" 295 | } 296 | ], 297 | "source": [ 298 | "y1_pred = tree_1.predict(X_test)\n", 299 | "\n", 300 | "y2_pred = tree_2.predict(X_test)\n", 301 | "\n", 302 | "y3_pred = tree_3.predict(X_test)\n", 303 | "\n", 304 | "y_pred = y1_pred + y2_pred + y3_pred\n", 305 | "\n", 306 | "# Import mean_squared_error \n", 307 | "from sklearn.metrics import mean_squared_error as MSE\n", 308 | "\n", 309 | "# Compute root mean squared error (rmse)\n", 310 | "MSE(y_test, y_pred)**0.5\n" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 9, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "from sklearn.ensemble import GradientBoostingRegressor" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 10, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "text/plain": [ 330 | "911.0479538776439" 331 | ] 332 | }, 333 | "execution_count": 10, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "gbr = GradientBoostingRegressor(max_depth=2, n_estimators=3, random_state=2, learning_rate=1.0)\n", 340 | "\n", 341 | "gbr.fit(X_train, y_train)\n", 342 | "\n", 343 | "# Predict test data\n", 344 | "y_pred = gbr.predict(X_test)\n", 345 | "\n", 346 | "# Compute root mean squared error (rmse)\n", 347 | "MSE(y_test, y_pred)**0.5" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 11, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "857.1072323426944" 359 | ] 360 | }, 361 | "execution_count": 11, 362 | "metadata": {}, 363 | "output_type": "execute_result" 364 | } 365 | ], 366 | "source": [ 367 | "gbr = GradientBoostingRegressor(max_depth=2, n_estimators=30, random_state=2, learning_rate=1.0)\n", 368 | "gbr.fit(X_train, y_train)\n", 369 | "y_pred = gbr.predict(X_test)\n", 370 | "MSE(y_test, y_pred)**0.5" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 12, 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "data": { 380 | "text/plain": [ 381 | "936.3617413678853" 382 | ] 383 | }, 384 | "execution_count": 12, 385 | "metadata": {}, 386 | "output_type": "execute_result" 387 | } 388 | ], 389 | "source": [ 390 | "gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2, learning_rate=1.0)\n", 391 | "gbr.fit(X_train, y_train)\n", 392 | "y_pred = gbr.predict(X_test)\n", 393 | "MSE(y_test, y_pred)**0.5" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 13, 399 | "metadata": {}, 400 | "outputs": [ 401 | { 402 | "data": { 403 | "text/plain": [ 404 | "653.7456840231495" 405 | ] 406 | }, 407 | "execution_count": 13, 408 | "metadata": {}, 409 | "output_type": "execute_result" 410 | } 411 | ], 412 | "source": [ 413 | "gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2)\n", 414 | "gbr.fit(X_train, y_train)\n", 415 | "y_pred = gbr.predict(X_test)\n", 416 | "MSE(y_test, y_pred)**0.5" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | 
"execution_count": 14, 422 | "metadata": {}, 423 | "outputs": [ 424 | { 425 | "name": "stdout", 426 | "output_type": "stream", 427 | "text": [ 428 | "Learning Rate: 0.001 , Score: 1633.0261400367258\n", 429 | "Learning Rate: 0.01 , Score: 831.5430182728547\n", 430 | "Learning Rate: 0.05 , Score: 685.0192988749717\n", 431 | "Learning Rate: 0.1 , Score: 653.7456840231495\n", 432 | "Learning Rate: 0.15 , Score: 687.666134269379\n", 433 | "Learning Rate: 0.2 , Score: 664.312804425697\n", 434 | "Learning Rate: 0.3 , Score: 689.4190385930236\n", 435 | "Learning Rate: 0.5 , Score: 693.8856905068778\n", 436 | "Learning Rate: 1.0 , Score: 936.3617413678853\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "learning_rate_values = [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1.0]\n", 442 | "for value in learning_rate_values:\n", 443 | " gbr = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=2, learning_rate=value)\n", 444 | " gbr.fit(X_train, y_train)\n", 445 | " y_pred = gbr.predict(X_test)\n", 446 | " rmse = MSE(y_test, y_pred)**0.5\n", 447 | " print('Learning Rate:', value, ', Score:', rmse)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 15, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "name": "stdout", 457 | "output_type": "stream", 458 | "text": [ 459 | "Max Depth: None , Score: 869.2783041945797\n", 460 | "Max Depth: 1 , Score: 707.8261886858736\n", 461 | "Max Depth: 2 , Score: 653.7456840231495\n", 462 | "Max Depth: 3 , Score: 646.4045923317708\n", 463 | "Max Depth: 4 , Score: 663.048387855927\n" 464 | ] 465 | } 466 | ], 467 | "source": [ 468 | "depths = [None, 1, 2, 3, 4]\n", 469 | "for depth in depths:\n", 470 | " gbr = GradientBoostingRegressor(max_depth=depth, n_estimators=300, random_state=2)\n", 471 | " gbr.fit(X_train, y_train)\n", 472 | " y_pred = gbr.predict(X_test)\n", 473 | " rmse = MSE(y_test, y_pred)**0.5\n", 474 | " print('Max Depth:', depth, ', Score:', rmse)" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 16, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "Subsample: 1 , Score: 646.4045923317708\n", 487 | "Subsample: 0.9 , Score: 620.1819001443569\n", 488 | "Subsample: 0.8 , Score: 617.2355650565677\n", 489 | "Subsample: 0.7 , Score: 612.9879156983139\n", 490 | "Subsample: 0.6 , Score: 622.6385116402317\n", 491 | "Subsample: 0.5 , Score: 626.9974073227554\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "samples = [1, 0.9, 0.8, 0.7, 0.6, 0.5]\n", 497 | "for sample in samples:\n", 498 | " gbr = GradientBoostingRegressor(max_depth=3, n_estimators=300, subsample=sample, random_state=2)\n", 499 | " gbr.fit(X_train, y_train)\n", 500 | " y_pred = gbr.predict(X_test)\n", 501 | " rmse = MSE(y_test, y_pred)**0.5\n", 502 | " print('Subsample:', sample, ', Score:', rmse)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 17, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "Best params: {'subsample': 0.65, 'n_estimators': 300, 'learning_rate': 0.05}\n", 515 | "Training score: 636.200\n", 516 | "Test set score: 625.985\n" 517 | ] 518 | } 519 | ], 520 | "source": [ 521 | "params={'subsample':[0.65, 0.7, 0.75],\n", 522 | " 'n_estimators':[300, 500, 1000],\n", 523 | " 'learning_rate':[0.05, 0.075, 0.1]\n", 524 | " }\n", 525 | "\n", 526 | "# Import RandomizedSearchCV\n", 527 | "from sklearn.model_selection import 
RandomizedSearchCV\n", 528 | "\n", 529 | "gbr = GradientBoostingRegressor(max_depth=3, random_state=2)\n", 530 | "\n", 531 | "\n", 532 | "# Instantiate RandomizedSearchCV as rand_reg\n", 533 | "rand_reg = RandomizedSearchCV(gbr, params, n_iter=10, scoring='neg_mean_squared_error', \n", 534 | " cv=5, n_jobs=-1, random_state=2)\n", 535 | "\n", 536 | "# Fit grid_reg on X_train and y_train\n", 537 | "rand_reg.fit(X_train, y_train)\n", 538 | "\n", 539 | "# Extract best estimator\n", 540 | "best_model = rand_reg.best_estimator_\n", 541 | "\n", 542 | "# Extract best params\n", 543 | "best_params = rand_reg.best_params_\n", 544 | "\n", 545 | "# Print best params\n", 546 | "print(\"Best params:\", best_params)\n", 547 | "\n", 548 | "# Compute best score\n", 549 | "best_score = np.sqrt(-rand_reg.best_score_)\n", 550 | "\n", 551 | "# Print best score\n", 552 | "print(\"Training score: {:.3f}\".format(best_score))\n", 553 | "\n", 554 | "# Predict test set labels\n", 555 | "y_pred = best_model.predict(X_test)\n", 556 | "\n", 557 | "# Compute rmse_test\n", 558 | "rmse_test = MSE(y_test, y_pred)**0.5\n", 559 | "\n", 560 | "# Print rmse_test\n", 561 | "print('Test set score: {:.3f}'.format(rmse_test))" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 18, 567 | "metadata": {}, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "596.9544588974487" 573 | ] 574 | }, 575 | "execution_count": 18, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "gbr = GradientBoostingRegressor(max_depth=3, n_estimators=1600, subsample=0.75, learning_rate=0.02, random_state=2)\n", 582 | "gbr.fit(X_train, y_train)\n", 583 | "y_pred = gbr.predict(X_test)\n", 584 | "MSE(y_test, y_pred)**0.5" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 19, 590 | "metadata": {}, 591 | "outputs": [ 592 | { 593 | "data": { 594 | "text/plain": [ 595 | "584.339544309016" 596 | ] 597 | }, 598 | "execution_count": 19, 599 | "metadata": {}, 600 | "output_type": "execute_result" 601 | } 602 | ], 603 | "source": [ 604 | "# Import XGBRegressor\n", 605 | "from xgboost import XGBRegressor\n", 606 | "\n", 607 | "# Instantiate the XGBRegressor, xg_reg\n", 608 | "xg_reg = XGBRegressor(max_depth=3, n_estimators=1600, eta=0.02, subsample=0.75, random_state=2)\n", 609 | "\n", 610 | "# Fit xg_reg to training set\n", 611 | "xg_reg.fit(X_train, y_train)\n", 612 | "\n", 613 | "# Predict labels of test set, y_pred\n", 614 | "y_pred = xg_reg.predict(X_test)\n", 615 | "\n", 616 | "# Compute root mean squared error (rmse)\n", 617 | "MSE(y_test, y_pred)**0.5" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 20, 623 | "metadata": {}, 624 | "outputs": [ 625 | { 626 | "data": { 627 | "text/html": [ 628 | "
\n", 629 | "\n", 642 | "\n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | "
LABELFLUX.1FLUX.2FLUX.3FLUX.4FLUX.5FLUX.6FLUX.7FLUX.8FLUX.9...FLUX.3188FLUX.3189FLUX.3190FLUX.3191FLUX.3192FLUX.3193FLUX.3194FLUX.3195FLUX.3196FLUX.3197
0293.8583.8120.10-26.98-39.56-124.71-135.18-96.27-79.89...-78.07-102.15-102.1525.1348.5792.5439.3261.425.08-39.54
12-38.88-33.83-58.54-40.09-79.31-72.81-86.55-85.33-83.97...-3.28-32.21-32.21-24.89-4.860.76-11.706.4616.0019.93
22532.64535.92513.73496.92456.45466.00464.50486.39436.56...-71.6913.3113.31-29.89-20.885.06-11.80-28.91-70.02-96.67
32326.52347.39302.35298.13317.74312.70322.33311.31312.42...5.71-3.73-3.7330.0520.03-12.67-8.77-17.31-17.3513.98
42-1107.21-1112.59-1118.95-1095.10-1057.55-1034.48-998.34-1022.71-989.57...-594.37-401.66-401.66-357.24-443.76-438.54-399.71-384.65-411.79-510.54
\n", 792 | "

5 rows × 3198 columns

\n", 793 | "
" 794 | ], 795 | "text/plain": [ 796 | " LABEL FLUX.1 FLUX.2 FLUX.3 FLUX.4 FLUX.5 FLUX.6 FLUX.7 \\\n", 797 | "0 2 93.85 83.81 20.10 -26.98 -39.56 -124.71 -135.18 \n", 798 | "1 2 -38.88 -33.83 -58.54 -40.09 -79.31 -72.81 -86.55 \n", 799 | "2 2 532.64 535.92 513.73 496.92 456.45 466.00 464.50 \n", 800 | "3 2 326.52 347.39 302.35 298.13 317.74 312.70 322.33 \n", 801 | "4 2 -1107.21 -1112.59 -1118.95 -1095.10 -1057.55 -1034.48 -998.34 \n", 802 | "\n", 803 | " FLUX.8 FLUX.9 ... FLUX.3188 FLUX.3189 FLUX.3190 FLUX.3191 \\\n", 804 | "0 -96.27 -79.89 ... -78.07 -102.15 -102.15 25.13 \n", 805 | "1 -85.33 -83.97 ... -3.28 -32.21 -32.21 -24.89 \n", 806 | "2 486.39 436.56 ... -71.69 13.31 13.31 -29.89 \n", 807 | "3 311.31 312.42 ... 5.71 -3.73 -3.73 30.05 \n", 808 | "4 -1022.71 -989.57 ... -594.37 -401.66 -401.66 -357.24 \n", 809 | "\n", 810 | " FLUX.3192 FLUX.3193 FLUX.3194 FLUX.3195 FLUX.3196 FLUX.3197 \n", 811 | "0 48.57 92.54 39.32 61.42 5.08 -39.54 \n", 812 | "1 -4.86 0.76 -11.70 6.46 16.00 19.93 \n", 813 | "2 -20.88 5.06 -11.80 -28.91 -70.02 -96.67 \n", 814 | "3 20.03 -12.67 -8.77 -17.31 -17.35 13.98 \n", 815 | "4 -443.76 -438.54 -399.71 -384.65 -411.79 -510.54 \n", 816 | "\n", 817 | "[5 rows x 3198 columns]" 818 | ] 819 | }, 820 | "execution_count": 20, 821 | "metadata": {}, 822 | "output_type": "execute_result" 823 | } 824 | ], 825 | "source": [ 826 | "df = pd.read_csv('exoplanets.csv')\n", 827 | "df.head()" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 21, 833 | "metadata": {}, 834 | "outputs": [ 835 | { 836 | "name": "stdout", 837 | "output_type": "stream", 838 | "text": [ 839 | "\n", 840 | "RangeIndex: 5087 entries, 0 to 5086\n", 841 | "Columns: 3198 entries, LABEL to FLUX.3197\n", 842 | "dtypes: float64(3197), int64(1)\n", 843 | "memory usage: 124.1 MB\n" 844 | ] 845 | } 846 | ], 847 | "source": [ 848 | "df.info()" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": 22, 854 | "metadata": {}, 855 | "outputs": [ 856 | { 857 | "data": { 858 | "text/plain": [ 859 | "0" 860 | ] 861 | }, 862 | "execution_count": 22, 863 | "metadata": {}, 864 | "output_type": "execute_result" 865 | } 866 | ], 867 | "source": [ 868 | "df.isnull().sum().sum()" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": 23, 874 | "metadata": {}, 875 | "outputs": [], 876 | "source": [ 877 | "# Split data into X and y\n", 878 | "X = df.iloc[:,1:]\n", 879 | "y = df.iloc[:,0]\n", 880 | "\n", 881 | "# Split data into train and test sets\n", 882 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)" 883 | ] 884 | }, 885 | { 886 | "cell_type": "code", 887 | "execution_count": 24, 888 | "metadata": {}, 889 | "outputs": [], 890 | "source": [ 891 | "from sklearn.ensemble import GradientBoostingClassifier\n", 892 | "# Import XGBRegressor\n", 893 | "from xgboost import XGBClassifier\n", 894 | "\n", 895 | "# Import accuracy_score\n", 896 | "from sklearn.metrics import accuracy_score" 897 | ] 898 | }, 899 | { 900 | "cell_type": "code", 901 | "execution_count": 25, 902 | "metadata": {}, 903 | "outputs": [ 904 | { 905 | "name": "stdout", 906 | "output_type": "stream", 907 | "text": [ 908 | "\n", 909 | "RangeIndex: 5087 entries, 0 to 5086\n", 910 | "Columns: 3198 entries, LABEL to FLUX.3197\n", 911 | "dtypes: float64(3197), int64(1)\n", 912 | "memory usage: 124.1 MB\n", 913 | "\n", 914 | "Run Time: 0.02074909210205078 seconds.\n" 915 | ] 916 | } 917 | ], 918 | "source": [ 919 | "import time\n", 920 | "start = time.time()\n", 921 | "\n", 
922 | "df.info()\n", 923 | "\n", 924 | "end = time.time()\n", 925 | "elapsed = end - start\n", 926 | "\n", 927 | "print('\\nRun Time: ' + str(elapsed) + ' seconds.')" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": 26, 933 | "metadata": {}, 934 | "outputs": [ 935 | { 936 | "name": "stdout", 937 | "output_type": "stream", 938 | "text": [ 939 | "Score: 0.9874213836477987\n", 940 | "Run Time: 205.1609354019165 seconds\n" 941 | ] 942 | } 943 | ], 944 | "source": [ 945 | "start = time.time()\n", 946 | "\n", 947 | "gbr = GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=2)\n", 948 | "gbr.fit(X_train, y_train)\n", 949 | "y_pred = gbr.predict(X_test)\n", 950 | "score = accuracy_score(y_pred, y_test)\n", 951 | "print('Score: ' + str(score))\n", 952 | "\n", 953 | "end = time.time()\n", 954 | "elapsed = end - start\n", 955 | "\n", 956 | "print('Run Time: ' + str(elapsed) + ' seconds')" 957 | ] 958 | }, 959 | { 960 | "cell_type": "code", 961 | "execution_count": 27, 962 | "metadata": {}, 963 | "outputs": [ 964 | { 965 | "name": "stdout", 966 | "output_type": "stream", 967 | "text": [ 968 | "Score: 0.9913522012578616\n", 969 | "Run Time: 9.091089010238647 seconds\n" 970 | ] 971 | } 972 | ], 973 | "source": [ 974 | "start = time.time()\n", 975 | "\n", 976 | "# Instantiate the XGBRegressor, xg_reg\n", 977 | "xg_reg = XGBClassifier(n_estimators=100, max_depth=2, random_state=2)\n", 978 | "\n", 979 | "# Fit xg_reg to training set\n", 980 | "xg_reg.fit(X_train, y_train)\n", 981 | "\n", 982 | "# Predict labels of test set, y_pred\n", 983 | "y_pred = xg_reg.predict(X_test)\n", 984 | "\n", 985 | "score = accuracy_score(y_pred, y_test)\n", 986 | "\n", 987 | "print('Score: ' + str(score))\n", 988 | "\n", 989 | "end = time.time()\n", 990 | "elapsed = end - start\n", 991 | "\n", 992 | "print('Run Time: ' + str(elapsed) + ' seconds')" 993 | ] 994 | } 995 | ], 996 | "metadata": { 997 | "kernelspec": { 998 | "display_name": "Python 3", 999 | "language": "python", 1000 | "name": "python3" 1001 | }, 1002 | "language_info": { 1003 | "codemirror_mode": { 1004 | "name": "ipython", 1005 | "version": 3 1006 | }, 1007 | "file_extension": ".py", 1008 | "mimetype": "text/x-python", 1009 | "name": "python", 1010 | "nbconvert_exporter": "python", 1011 | "pygments_lexer": "ipython3", 1012 | "version": "3.7.7" 1013 | } 1014 | }, 1015 | "nbformat": 4, 1016 | "nbformat_minor": 2 1017 | } 1018 | -------------------------------------------------------------------------------- /Chapter04/bike_rentals_cleaned.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:76a1beb36c78dc0c974d96583476bddae8538e31a9e316745b9f54c3cb54094e 3 | size 52255 4 | -------------------------------------------------------------------------------- /Chapter04/exoplanets.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6f04167636d2fcdce44af6fd5cbef359d8af033ce8457d8c8e917fdb822e044c 3 | size 262223348 4 | -------------------------------------------------------------------------------- /Chapter04/exoplanets.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter04/exoplanets.csv.zip 
-------------------------------------------------------------------------------- /Chapter05/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter05/.DS_Store -------------------------------------------------------------------------------- /Chapter05/.gitattributes: -------------------------------------------------------------------------------- 1 | test.csv filter=lfs diff=lfs merge=lfs -text 2 | Ch.5.ipynb filter=lfs diff=lfs merge=lfs -text 3 | higgs.model filter=lfs diff=lfs merge=lfs -text 4 | random_submission.csv filter=lfs diff=lfs merge=lfs -text 5 | random_submission.zip filter=lfs diff=lfs merge=lfs -text 6 | test.zip filter=lfs diff=lfs merge=lfs -text 7 | training.csv filter=lfs diff=lfs merge=lfs -text 8 | training.zip filter=lfs diff=lfs merge=lfs -text 9 | Advanced_XGBoost_Unveiled.ipynb filter=lfs diff=lfs merge=lfs -text 10 | -------------------------------------------------------------------------------- /Chapter05/Advanced_XGBoost_Unveiled.ipynb: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:46d5d2436996e152e6fc3df009167359688b49be3f107ece27be2ee238085474 3 | size 38645 4 | -------------------------------------------------------------------------------- /Chapter05/atlas-higgs-challenge-2014-v2.csv.gz: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f370a6c17b2c8f552fb4620385cf8667f9943a5b1afb3f7c6ead84510d04a8dc 3 | size 65630848 4 | -------------------------------------------------------------------------------- /Chapter05/higgs.model: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e6f37790477f843eacabcd27af6b64c98f0fb9f9c741cb8a999fdf0321a804dd 3 | size 537627 4 | -------------------------------------------------------------------------------- /Chapter06/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter06/.DS_Store -------------------------------------------------------------------------------- /Chapter06/XGBoost_Hyperparameters.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [ 10 | { 11 | "data": { 12 | "text/html": [ 13 | "
\n", 14 | "\n", 27 | "\n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | "
agesexcptrestbpscholfbsrestecgthalachexangoldpeakslopecathaltarget
063131452331015002.30011
137121302500118703.50021
241011302040017201.42021
356111202360117800.82021
457001203540116310.62021
\n", 135 | "
" 136 | ], 137 | "text/plain": [ 138 | " age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \\\n", 139 | "0 63 1 3 145 233 1 0 150 0 2.3 0 \n", 140 | "1 37 1 2 130 250 0 1 187 0 3.5 0 \n", 141 | "2 41 0 1 130 204 0 0 172 0 1.4 2 \n", 142 | "3 56 1 1 120 236 0 1 178 0 0.8 2 \n", 143 | "4 57 0 0 120 354 0 1 163 1 0.6 2 \n", 144 | "\n", 145 | " ca thal target \n", 146 | "0 0 1 1 \n", 147 | "1 0 2 1 \n", 148 | "2 0 2 1 \n", 149 | "3 0 2 1 \n", 150 | "4 0 2 1 " 151 | ] 152 | }, 153 | "execution_count": 1, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "import pandas as pd\n", 160 | "df = pd.read_csv('heart_disease.csv')\n", 161 | "df.head()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 2, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "\n", 174 | "RangeIndex: 303 entries, 0 to 302\n", 175 | "Data columns (total 14 columns):\n", 176 | " # Column Non-Null Count Dtype \n", 177 | "--- ------ -------------- ----- \n", 178 | " 0 age 303 non-null int64 \n", 179 | " 1 sex 303 non-null int64 \n", 180 | " 2 cp 303 non-null int64 \n", 181 | " 3 trestbps 303 non-null int64 \n", 182 | " 4 chol 303 non-null int64 \n", 183 | " 5 fbs 303 non-null int64 \n", 184 | " 6 restecg 303 non-null int64 \n", 185 | " 7 thalach 303 non-null int64 \n", 186 | " 8 exang 303 non-null int64 \n", 187 | " 9 oldpeak 303 non-null float64\n", 188 | " 10 slope 303 non-null int64 \n", 189 | " 11 ca 303 non-null int64 \n", 190 | " 12 thal 303 non-null int64 \n", 191 | " 13 target 303 non-null int64 \n", 192 | "dtypes: float64(1), int64(13)\n", 193 | "memory usage: 33.3 KB\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "df.info()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 3, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "from xgboost import XGBClassifier\n", 208 | "from sklearn.metrics import accuracy_score" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 4, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "# Split data into X and y\n", 218 | "X = df.iloc[:, :-1]\n", 219 | "y = df.iloc[:, -1]" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 5, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "model = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=2)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 6, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "name": "stdout", 238 | "output_type": "stream", 239 | "text": [ 240 | "Accuracy: [0.84 0.85 0.82 0.8 0.77]\n", 241 | "Accuracy mean: 0.81\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "# Import cross_val_score\n", 247 | "from sklearn.model_selection import cross_val_score\n", 248 | "\n", 249 | "# Import numpy\n", 250 | "import numpy as np\n", 251 | "\n", 252 | "# Obtain scores of cross-validation\n", 253 | "scores = cross_val_score(model, X, y, cv=5)\n", 254 | "\n", 255 | "# Display accuracy\n", 256 | "print('Accuracy:', np.round(scores, 2))\n", 257 | "\n", 258 | "# Display mean accuracy\n", 259 | "print('Accuracy mean: %0.2f' % (scores.mean()))" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 7, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "# Import GridSearchCV\n", 269 | "from sklearn.model_selection import StratifiedKFold\n", 270 | "\n", 271 | 
"kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 8, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "Accuracy: [0.72 0.82 0.75 0.8 0.82]\n", 284 | "Accuracy mean: 0.78\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "# Obtain scores of cross-validation\n", 290 | "scores = cross_val_score(model, X, y, cv=kfold)\n", 291 | "\n", 292 | "# Display accuracy\n", 293 | "print('Accuracy:', np.round(scores, 2))\n", 294 | "\n", 295 | "# Display mean accuracy\n", 296 | "print('Accuracy mean: %0.2f' % (scores.mean()))" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 9, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "# Import GridSearchCV\n", 306 | "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold\n", 307 | "\n", 308 | "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 10, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "def grid_search(params, random=False): \n", 318 | " \n", 319 | " xgb = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=2)\n", 320 | " \n", 321 | " kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)\n", 322 | " \n", 323 | " if random:\n", 324 | " grid = RandomizedSearchCV(xgb, params, cv=kfold, n_iter=20, n_jobs=-1, random_state=2)\n", 325 | " else:\n", 326 | " # Instantiate GridSearchCV as grid_reg\n", 327 | " grid = GridSearchCV(xgb, params, cv=kfold, n_jobs=-1)\n", 328 | " \n", 329 | " # Fit grid_reg on X_train and y_train\n", 330 | " grid.fit(X, y)\n", 331 | "\n", 332 | " # Extract best params\n", 333 | " best_params = grid.best_params_\n", 334 | "\n", 335 | " # Print best params\n", 336 | " print(\"Best params:\", best_params)\n", 337 | " \n", 338 | " # Compute best score\n", 339 | " best_score = grid.best_score_\n", 340 | "\n", 341 | " # Print best score\n", 342 | " print(\"Best score: {:.5f}\".format(best_score))" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 11, 348 | "metadata": {}, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "Best params: {'n_estimators': 100}\n", 355 | "Best score: 0.78235\n" 356 | ] 357 | } 358 | ], 359 | "source": [ 360 | "grid_search(params={'n_estimators':[100, 200, 400, 800]})" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 12, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "name": "stdout", 370 | "output_type": "stream", 371 | "text": [ 372 | "Best params: {'learning_rate': 0.05}\n", 373 | "Best score: 0.79585\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "grid_search(params={'learning_rate':[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]})" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 13, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "Best params: {'max_depth': 2}\n", 391 | "Best score: 0.79902\n" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "grid_search(params={'max_depth':[2, 3, 5, 6, 8]})" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 14, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "Best 
params: {'gamma': 0.5}\n", 409 | "Best score: 0.79574\n" 410 | ] 411 | } 412 | ], 413 | "source": [ 414 | "grid_search(params={'gamma':[0, 0.01, 0.1, 0.5, 1, 2]})" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 15, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "name": "stdout", 424 | "output_type": "stream", 425 | "text": [ 426 | "Best params: {'min_child_weight': 5}\n", 427 | "Best score: 0.81219\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "grid_search(params={'min_child_weight':[1, 2, 3, 4, 5]})" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 16, 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "name": "stdout", 442 | "output_type": "stream", 443 | "text": [ 444 | "Best params: {'subsample': 0.8}\n", 445 | "Best score: 0.79579\n" 446 | ] 447 | } 448 | ], 449 | "source": [ 450 | "grid_search(params={'subsample':[0.5, 0.7, 0.8, 0.9, 1]})" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 17, 456 | "metadata": {}, 457 | "outputs": [ 458 | { 459 | "name": "stdout", 460 | "output_type": "stream", 461 | "text": [ 462 | "Best params: {'colsample_bytree': 0.7}\n", 463 | "Best score: 0.79902\n" 464 | ] 465 | } 466 | ], 467 | "source": [ 468 | "grid_search(params={'colsample_bytree':[0.5, 0.7, 0.8, 0.9, 1]})" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 18, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "# Import train_test_split\n", 478 | "from sklearn.model_selection import train_test_split\n", 479 | "\n", 480 | "# Split data into train and test sets\n", 481 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 19, 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "name": "stdout", 491 | "output_type": "stream", 492 | "text": [ 493 | "[0]\tvalidation_0-error:0.15790\n", 494 | "[1]\tvalidation_0-error:0.10526\n", 495 | "[2]\tvalidation_0-error:0.11842\n", 496 | "[3]\tvalidation_0-error:0.13158\n", 497 | "[4]\tvalidation_0-error:0.11842\n", 498 | "[5]\tvalidation_0-error:0.14474\n", 499 | "[6]\tvalidation_0-error:0.14474\n", 500 | "[7]\tvalidation_0-error:0.14474\n", 501 | "[8]\tvalidation_0-error:0.14474\n", 502 | "[9]\tvalidation_0-error:0.14474\n", 503 | "[10]\tvalidation_0-error:0.14474\n", 504 | "[11]\tvalidation_0-error:0.15790\n", 505 | "[12]\tvalidation_0-error:0.15790\n", 506 | "[13]\tvalidation_0-error:0.17105\n", 507 | "[14]\tvalidation_0-error:0.17105\n", 508 | "[15]\tvalidation_0-error:0.17105\n", 509 | "[16]\tvalidation_0-error:0.15790\n", 510 | "[17]\tvalidation_0-error:0.17105\n", 511 | "[18]\tvalidation_0-error:0.15790\n", 512 | "[19]\tvalidation_0-error:0.17105\n", 513 | "[20]\tvalidation_0-error:0.17105\n", 514 | "[21]\tvalidation_0-error:0.17105\n", 515 | "[22]\tvalidation_0-error:0.18421\n", 516 | "[23]\tvalidation_0-error:0.18421\n", 517 | "[24]\tvalidation_0-error:0.17105\n", 518 | "[25]\tvalidation_0-error:0.18421\n", 519 | "[26]\tvalidation_0-error:0.18421\n", 520 | "[27]\tvalidation_0-error:0.18421\n", 521 | "[28]\tvalidation_0-error:0.18421\n", 522 | "[29]\tvalidation_0-error:0.18421\n", 523 | "[30]\tvalidation_0-error:0.18421\n", 524 | "[31]\tvalidation_0-error:0.18421\n", 525 | "[32]\tvalidation_0-error:0.18421\n", 526 | "[33]\tvalidation_0-error:0.18421\n", 527 | "[34]\tvalidation_0-error:0.18421\n", 528 | "[35]\tvalidation_0-error:0.18421\n", 529 | "[36]\tvalidation_0-error:0.18421\n", 530 | 
"[37]\tvalidation_0-error:0.18421\n", 531 | "[38]\tvalidation_0-error:0.18421\n", 532 | "[39]\tvalidation_0-error:0.18421\n", 533 | "[40]\tvalidation_0-error:0.18421\n", 534 | "[41]\tvalidation_0-error:0.18421\n", 535 | "[42]\tvalidation_0-error:0.18421\n", 536 | "[43]\tvalidation_0-error:0.17105\n", 537 | "[44]\tvalidation_0-error:0.18421\n", 538 | "[45]\tvalidation_0-error:0.17105\n", 539 | "[46]\tvalidation_0-error:0.18421\n", 540 | "[47]\tvalidation_0-error:0.18421\n", 541 | "[48]\tvalidation_0-error:0.17105\n", 542 | "[49]\tvalidation_0-error:0.15790\n", 543 | "[50]\tvalidation_0-error:0.17105\n", 544 | "[51]\tvalidation_0-error:0.17105\n", 545 | "[52]\tvalidation_0-error:0.15790\n", 546 | "[53]\tvalidation_0-error:0.17105\n", 547 | "[54]\tvalidation_0-error:0.17105\n", 548 | "[55]\tvalidation_0-error:0.17105\n", 549 | "[56]\tvalidation_0-error:0.17105\n", 550 | "[57]\tvalidation_0-error:0.17105\n", 551 | "[58]\tvalidation_0-error:0.17105\n", 552 | "[59]\tvalidation_0-error:0.17105\n", 553 | "[60]\tvalidation_0-error:0.17105\n", 554 | "[61]\tvalidation_0-error:0.17105\n", 555 | "[62]\tvalidation_0-error:0.17105\n", 556 | "[63]\tvalidation_0-error:0.17105\n", 557 | "[64]\tvalidation_0-error:0.17105\n", 558 | "[65]\tvalidation_0-error:0.17105\n", 559 | "[66]\tvalidation_0-error:0.18421\n", 560 | "[67]\tvalidation_0-error:0.18421\n", 561 | "[68]\tvalidation_0-error:0.18421\n", 562 | "[69]\tvalidation_0-error:0.18421\n", 563 | "[70]\tvalidation_0-error:0.18421\n", 564 | "[71]\tvalidation_0-error:0.18421\n", 565 | "[72]\tvalidation_0-error:0.18421\n", 566 | "[73]\tvalidation_0-error:0.18421\n", 567 | "[74]\tvalidation_0-error:0.17105\n", 568 | "[75]\tvalidation_0-error:0.18421\n", 569 | "[76]\tvalidation_0-error:0.17105\n", 570 | "[77]\tvalidation_0-error:0.18421\n", 571 | "[78]\tvalidation_0-error:0.15790\n", 572 | "[79]\tvalidation_0-error:0.17105\n", 573 | "[80]\tvalidation_0-error:0.15790\n", 574 | "[81]\tvalidation_0-error:0.15790\n", 575 | "[82]\tvalidation_0-error:0.15790\n", 576 | "[83]\tvalidation_0-error:0.15790\n", 577 | "[84]\tvalidation_0-error:0.15790\n", 578 | "[85]\tvalidation_0-error:0.15790\n", 579 | "[86]\tvalidation_0-error:0.15790\n", 580 | "[87]\tvalidation_0-error:0.15790\n", 581 | "[88]\tvalidation_0-error:0.15790\n", 582 | "[89]\tvalidation_0-error:0.15790\n", 583 | "[90]\tvalidation_0-error:0.15790\n", 584 | "[91]\tvalidation_0-error:0.15790\n", 585 | "[92]\tvalidation_0-error:0.15790\n", 586 | "[93]\tvalidation_0-error:0.17105\n", 587 | "[94]\tvalidation_0-error:0.17105\n", 588 | "[95]\tvalidation_0-error:0.17105\n", 589 | "[96]\tvalidation_0-error:0.17105\n", 590 | "[97]\tvalidation_0-error:0.17105\n", 591 | "[98]\tvalidation_0-error:0.17105\n", 592 | "[99]\tvalidation_0-error:0.17105\n", 593 | "Accuracy: 82.89%\n" 594 | ] 595 | } 596 | ], 597 | "source": [ 598 | "model = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=2)\n", 599 | "eval_set = [(X_test, y_test)]\n", 600 | "eval_metric='error'\n", 601 | "model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set)\n", 602 | "# make predictions for test data\n", 603 | "y_pred = model.predict(X_test)\n", 604 | "# evaluate predictions\n", 605 | "accuracy = accuracy_score(y_test, y_pred)\n", 606 | "print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 20, 612 | "metadata": {}, 613 | "outputs": [ 614 | { 615 | "name": "stdout", 616 | "output_type": "stream", 617 | "text": [ 618 | 
"[0]\tvalidation_0-error:0.15790\n", 619 | "Will train until validation_0-error hasn't improved in 10 rounds.\n", 620 | "[1]\tvalidation_0-error:0.10526\n", 621 | "[2]\tvalidation_0-error:0.11842\n", 622 | "[3]\tvalidation_0-error:0.13158\n", 623 | "[4]\tvalidation_0-error:0.11842\n", 624 | "[5]\tvalidation_0-error:0.14474\n", 625 | "[6]\tvalidation_0-error:0.14474\n", 626 | "[7]\tvalidation_0-error:0.14474\n", 627 | "[8]\tvalidation_0-error:0.14474\n", 628 | "[9]\tvalidation_0-error:0.14474\n", 629 | "[10]\tvalidation_0-error:0.14474\n", 630 | "[11]\tvalidation_0-error:0.15790\n", 631 | "Stopping. Best iteration:\n", 632 | "[1]\tvalidation_0-error:0.10526\n", 633 | "\n", 634 | "Accuracy: 89.47%\n" 635 | ] 636 | } 637 | ], 638 | "source": [ 639 | "model = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=2)\n", 640 | "eval_set = [(X_test, y_test)]\n", 641 | "eval_metric=\"error\"\n", 642 | "model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, early_stopping_rounds=10, verbose=True)\n", 643 | "y_pred = model.predict(X_test)\n", 644 | "accuracy = accuracy_score(y_test, y_pred)\n", 645 | "print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 21, 651 | "metadata": {}, 652 | "outputs": [ 653 | { 654 | "name": "stdout", 655 | "output_type": "stream", 656 | "text": [ 657 | "[0]\tvalidation_0-error:0.15790\n", 658 | "Will train until validation_0-error hasn't improved in 100 rounds.\n", 659 | "[1]\tvalidation_0-error:0.10526\n", 660 | "[2]\tvalidation_0-error:0.11842\n", 661 | "[3]\tvalidation_0-error:0.13158\n", 662 | "[4]\tvalidation_0-error:0.11842\n", 663 | "[5]\tvalidation_0-error:0.14474\n", 664 | "[6]\tvalidation_0-error:0.14474\n", 665 | "[7]\tvalidation_0-error:0.14474\n", 666 | "[8]\tvalidation_0-error:0.14474\n", 667 | "[9]\tvalidation_0-error:0.14474\n", 668 | "[10]\tvalidation_0-error:0.14474\n", 669 | "[11]\tvalidation_0-error:0.15790\n", 670 | "[12]\tvalidation_0-error:0.15790\n", 671 | "[13]\tvalidation_0-error:0.17105\n", 672 | "[14]\tvalidation_0-error:0.17105\n", 673 | "[15]\tvalidation_0-error:0.17105\n", 674 | "[16]\tvalidation_0-error:0.15790\n", 675 | "[17]\tvalidation_0-error:0.17105\n", 676 | "[18]\tvalidation_0-error:0.15790\n", 677 | "[19]\tvalidation_0-error:0.17105\n", 678 | "[20]\tvalidation_0-error:0.17105\n", 679 | "[21]\tvalidation_0-error:0.17105\n", 680 | "[22]\tvalidation_0-error:0.18421\n", 681 | "[23]\tvalidation_0-error:0.18421\n", 682 | "[24]\tvalidation_0-error:0.17105\n", 683 | "[25]\tvalidation_0-error:0.18421\n", 684 | "[26]\tvalidation_0-error:0.18421\n", 685 | "[27]\tvalidation_0-error:0.18421\n", 686 | "[28]\tvalidation_0-error:0.18421\n", 687 | "[29]\tvalidation_0-error:0.18421\n", 688 | "[30]\tvalidation_0-error:0.18421\n", 689 | "[31]\tvalidation_0-error:0.18421\n", 690 | "[32]\tvalidation_0-error:0.18421\n", 691 | "[33]\tvalidation_0-error:0.18421\n", 692 | "[34]\tvalidation_0-error:0.18421\n", 693 | "[35]\tvalidation_0-error:0.18421\n", 694 | "[36]\tvalidation_0-error:0.18421\n", 695 | "[37]\tvalidation_0-error:0.18421\n", 696 | "[38]\tvalidation_0-error:0.18421\n", 697 | "[39]\tvalidation_0-error:0.18421\n", 698 | "[40]\tvalidation_0-error:0.18421\n", 699 | "[41]\tvalidation_0-error:0.18421\n", 700 | "[42]\tvalidation_0-error:0.18421\n", 701 | "[43]\tvalidation_0-error:0.17105\n", 702 | "[44]\tvalidation_0-error:0.18421\n", 703 | "[45]\tvalidation_0-error:0.17105\n", 704 | "[46]\tvalidation_0-error:0.18421\n", 705 | 
"[47]\tvalidation_0-error:0.18421\n", 706 | "[48]\tvalidation_0-error:0.17105\n", 707 | "[49]\tvalidation_0-error:0.15790\n", 708 | "[50]\tvalidation_0-error:0.17105\n", 709 | "[51]\tvalidation_0-error:0.17105\n", 710 | "[52]\tvalidation_0-error:0.15790\n", 711 | "[53]\tvalidation_0-error:0.17105\n", 712 | "[54]\tvalidation_0-error:0.17105\n", 713 | "[55]\tvalidation_0-error:0.17105\n", 714 | "[56]\tvalidation_0-error:0.17105\n", 715 | "[57]\tvalidation_0-error:0.17105\n", 716 | "[58]\tvalidation_0-error:0.17105\n", 717 | "[59]\tvalidation_0-error:0.17105\n", 718 | "[60]\tvalidation_0-error:0.17105\n", 719 | "[61]\tvalidation_0-error:0.17105\n", 720 | "[62]\tvalidation_0-error:0.17105\n", 721 | "[63]\tvalidation_0-error:0.17105\n", 722 | "[64]\tvalidation_0-error:0.17105\n", 723 | "[65]\tvalidation_0-error:0.17105\n", 724 | "[66]\tvalidation_0-error:0.18421\n", 725 | "[67]\tvalidation_0-error:0.18421\n", 726 | "[68]\tvalidation_0-error:0.18421\n", 727 | "[69]\tvalidation_0-error:0.18421\n", 728 | "[70]\tvalidation_0-error:0.18421\n", 729 | "[71]\tvalidation_0-error:0.18421\n", 730 | "[72]\tvalidation_0-error:0.18421\n", 731 | "[73]\tvalidation_0-error:0.18421\n", 732 | "[74]\tvalidation_0-error:0.17105\n", 733 | "[75]\tvalidation_0-error:0.18421\n", 734 | "[76]\tvalidation_0-error:0.17105\n", 735 | "[77]\tvalidation_0-error:0.18421\n", 736 | "[78]\tvalidation_0-error:0.15790\n", 737 | "[79]\tvalidation_0-error:0.17105\n", 738 | "[80]\tvalidation_0-error:0.15790\n", 739 | "[81]\tvalidation_0-error:0.15790\n", 740 | "[82]\tvalidation_0-error:0.15790\n", 741 | "[83]\tvalidation_0-error:0.15790\n", 742 | "[84]\tvalidation_0-error:0.15790\n", 743 | "[85]\tvalidation_0-error:0.15790\n", 744 | "[86]\tvalidation_0-error:0.15790\n", 745 | "[87]\tvalidation_0-error:0.15790\n", 746 | "[88]\tvalidation_0-error:0.15790\n", 747 | "[89]\tvalidation_0-error:0.15790\n", 748 | "[90]\tvalidation_0-error:0.15790\n", 749 | "[91]\tvalidation_0-error:0.15790\n", 750 | "[92]\tvalidation_0-error:0.15790\n", 751 | "[93]\tvalidation_0-error:0.17105\n", 752 | "[94]\tvalidation_0-error:0.17105\n", 753 | "[95]\tvalidation_0-error:0.17105\n", 754 | "[96]\tvalidation_0-error:0.17105\n", 755 | "[97]\tvalidation_0-error:0.17105\n", 756 | "[98]\tvalidation_0-error:0.17105\n", 757 | "[99]\tvalidation_0-error:0.17105\n", 758 | "[100]\tvalidation_0-error:0.17105\n", 759 | "[101]\tvalidation_0-error:0.17105\n", 760 | "Stopping. 
Best iteration:\n", 761 | "[1]\tvalidation_0-error:0.10526\n", 762 | "\n", 763 | "Accuracy: 89.47%\n" 764 | ] 765 | } 766 | ], 767 | "source": [ 768 | "model = XGBClassifier(random_state=2, n_estimators=5000)\n", 769 | "eval_set = [(X_test, y_test)]\n", 770 | "eval_metric=\"error\"\n", 771 | "model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, early_stopping_rounds=100)\n", 772 | "y_pred = model.predict(X_test)\n", 773 | "accuracy = accuracy_score(y_test, y_pred)\n", 774 | "print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 22, 780 | "metadata": {}, 781 | "outputs": [ 782 | { 783 | "name": "stdout", 784 | "output_type": "stream", 785 | "text": [ 786 | "Best params: {'n_estimators': 50}\n", 787 | "Best score: 0.78907\n" 788 | ] 789 | } 790 | ], 791 | "source": [ 792 | "grid_search(params={'n_estimators':[2, 25, 50, 75, 100]})" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": 23, 798 | "metadata": {}, 799 | "outputs": [ 800 | { 801 | "name": "stdout", 802 | "output_type": "stream", 803 | "text": [ 804 | "Best params: {'max_depth': 1, 'n_estimators': 50}\n", 805 | "Best score: 0.83869\n" 806 | ] 807 | } 808 | ], 809 | "source": [ 810 | "grid_search(params={'max_depth':[1, 2, 3, 4, 6, 7, 8], \n", 811 | " 'n_estimators':[50]})" 812 | ] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": 24, 817 | "metadata": {}, 818 | "outputs": [ 819 | { 820 | "name": "stdout", 821 | "output_type": "stream", 822 | "text": [ 823 | "Best params: {'max_depth': 1, 'n_estimators': 50}\n", 824 | "Best score: 0.83869\n" 825 | ] 826 | } 827 | ], 828 | "source": [ 829 | "grid_search(params={'max_depth':[1, 2, 3, 4, 6, 7, 8], \n", 830 | " 'n_estimators':[2, 50, 100]})" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 35, 836 | "metadata": {}, 837 | "outputs": [ 838 | { 839 | "name": "stdout", 840 | "output_type": "stream", 841 | "text": [ 842 | "Best params: {'learning_rate': 0.3, 'max_depth': 1, 'n_estimators': 50}\n", 843 | "Best score: 0.83869\n" 844 | ] 845 | } 846 | ], 847 | "source": [ 848 | "grid_search(params={'learning_rate':[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5], \n", 849 | " 'max_depth':[1], \n", 850 | " 'n_estimators':[50]})" 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": 26, 856 | "metadata": {}, 857 | "outputs": [ 858 | { 859 | "name": "stdout", 860 | "output_type": "stream", 861 | "text": [ 862 | "Best params: {'max_depth': 1, 'min_child_weight': 1, 'n_estimators': 50}\n", 863 | "Best score: 0.83869\n" 864 | ] 865 | } 866 | ], 867 | "source": [ 868 | "grid_search(params={'min_child_weight':[1, 2, 3, 4, 5], \n", 869 | " 'max_depth':[1], \n", 870 | " 'n_estimators':[50]})" 871 | ] 872 | }, 873 | { 874 | "cell_type": "code", 875 | "execution_count": 27, 876 | "metadata": {}, 877 | "outputs": [ 878 | { 879 | "name": "stdout", 880 | "output_type": "stream", 881 | "text": [ 882 | "Best params: {'max_depth': 1, 'n_estimators': 50, 'subsample': 1}\n", 883 | "Best score: 0.83869\n" 884 | ] 885 | } 886 | ], 887 | "source": [ 888 | "grid_search(params={'subsample':[0.5, 0.6, 0.7, 0.8, 0.9, 1],\n", 889 | " 'max_depth':[1], \n", 890 | " 'n_estimators':[50]})" 891 | ] 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": 28, 896 | "metadata": {}, 897 | "outputs": [ 898 | { 899 | "name": "stdout", 900 | "output_type": "stream", 901 | "text": [ 902 | "Best params: {'learning_rate': 0.5, 'max_depth': 2, 
'min_child_weight': 4, 'n_estimators': 2, 'subsample': 0.9}\n", 903 | "Best score: 0.81224\n" 904 | ] 905 | } 906 | ], 907 | "source": [ 908 | "grid_search(params={'subsample':[0.5, 0.6, 0.7, 0.8, 0.9, 1], \n", 909 | " 'min_child_weight':[1, 2, 3, 4, 5], \n", 910 | " 'learning_rate':[0.1, 0.2, 0.3, 0.4, 0.5], \n", 911 | " 'max_depth':[1, 2, 3, 4, 5], \n", 912 | " 'n_estimators':[2]})" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": 29, 918 | "metadata": {}, 919 | "outputs": [ 920 | { 921 | "name": "stdout", 922 | "output_type": "stream", 923 | "text": [ 924 | "Best params: {'subsample': 0.6, 'n_estimators': 25, 'min_child_weight': 4, 'max_depth': 4, 'learning_rate': 0.5}\n", 925 | "Best score: 0.82208\n" 926 | ] 927 | } 928 | ], 929 | "source": [ 930 | "grid_search(params={'subsample':[0.5, 0.6, 0.7, 0.8, 0.9, 1], \n", 931 | " 'min_child_weight':[1, 2, 3, 4, 5], \n", 932 | " 'learning_rate':[0.1, 0.2, 0.3, 0.4, 0.5], \n", 933 | " 'max_depth':[1, 2, 3, 4, 5, None], \n", 934 | " 'n_estimators':[2, 25, 50, 75, 100]}, random=True)" 935 | ] 936 | }, 937 | { 938 | "cell_type": "code", 939 | "execution_count": 30, 940 | "metadata": {}, 941 | "outputs": [ 942 | { 943 | "name": "stdout", 944 | "output_type": "stream", 945 | "text": [ 946 | "Best params: {'colsample_bytree': 1, 'max_depth': 1, 'n_estimators': 50}\n", 947 | "Best score: 0.83869\n" 948 | ] 949 | } 950 | ], 951 | "source": [ 952 | "grid_search(params={'colsample_bytree':[0.5, 0.6, 0.7, 0.8, 0.9, 1],\n", 953 | " 'max_depth':[1], \n", 954 | " 'n_estimators':[50]})" 955 | ] 956 | }, 957 | { 958 | "cell_type": "code", 959 | "execution_count": 31, 960 | "metadata": {}, 961 | "outputs": [ 962 | { 963 | "name": "stdout", 964 | "output_type": "stream", 965 | "text": [ 966 | "Best params: {'colsample_bylevel': 1, 'max_depth': 1, 'n_estimators': 50}\n", 967 | "Best score: 0.83869\n" 968 | ] 969 | } 970 | ], 971 | "source": [ 972 | "grid_search(params={'colsample_bylevel':[0.5, 0.6, 0.7, 0.8, 0.9, 1],\n", 973 | " 'max_depth':[1], \n", 974 | " 'n_estimators':[50]})" 975 | ] 976 | }, 977 | { 978 | "cell_type": "code", 979 | "execution_count": 32, 980 | "metadata": {}, 981 | "outputs": [ 982 | { 983 | "name": "stdout", 984 | "output_type": "stream", 985 | "text": [ 986 | "Best params: {'colsample_bylevel': 0.9, 'colsample_bynode': 0.5, 'colsample_bytree': 0.8, 'max_depth': 1, 'n_estimators': 50}\n", 987 | "Best score: 0.84852\n" 988 | ] 989 | } 990 | ], 991 | "source": [ 992 | "grid_search(params={'colsample_bynode':[0.5, 0.6, 0.7, 0.8, 0.9, 1], \n", 993 | " 'colsample_bylevel':[0.5, 0.6, 0.7, 0.8, 0.9, 1], \n", 994 | " 'colsample_bytree':[0.5, 0.6, 0.7, 0.8, 0.9, 1], \n", 995 | " 'max_depth':[1], \n", 996 | " 'n_estimators':[50]})" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": 33, 1002 | "metadata": {}, 1003 | "outputs": [ 1004 | { 1005 | "name": "stdout", 1006 | "output_type": "stream", 1007 | "text": [ 1008 | "Best params: {'colsample_bylevel': 0.9, 'colsample_bynode': 0.5, 'colsample_bytree': 0.8, 'gamma': 0, 'max_depth': 1, 'n_estimators': 50}\n", 1009 | "Best score: 0.84852\n" 1010 | ] 1011 | } 1012 | ], 1013 | "source": [ 1014 | "grid_search(params={'gamma':[0, 0.01, 0.05, 0.1, 0.5, 1, 2, 3], \n", 1015 | " 'colsample_bylevel':[0.9], \n", 1016 | " 'colsample_bytree':[0.8], \n", 1017 | " 'colsample_bynode':[0.5], \n", 1018 | " 'max_depth':[1], \n", 1019 | " 'n_estimators':[50]})" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "code", 1024 | "execution_count": null, 1025 | 
"metadata": {}, 1026 | "outputs": [], 1027 | "source": [] 1028 | } 1029 | ], 1030 | "metadata": { 1031 | "kernelspec": { 1032 | "display_name": "Python 3", 1033 | "language": "python", 1034 | "name": "python3" 1035 | }, 1036 | "language_info": { 1037 | "codemirror_mode": { 1038 | "name": "ipython", 1039 | "version": 3 1040 | }, 1041 | "file_extension": ".py", 1042 | "mimetype": "text/x-python", 1043 | "name": "python", 1044 | "nbconvert_exporter": "python", 1045 | "pygments_lexer": "ipython3", 1046 | "version": "3.7.7" 1047 | } 1048 | }, 1049 | "nbformat": 4, 1050 | "nbformat_minor": 4 1051 | } 1052 | -------------------------------------------------------------------------------- /Chapter06/heart_disease.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7c3014365675306819510a49ff289efbec1d1a6a666a2dc7652f1547b383d859 3 | size 11328 4 | -------------------------------------------------------------------------------- /Chapter07/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter07/.DS_Store -------------------------------------------------------------------------------- /Chapter07/.ipynb_checkpoints/Discovering_Exoplanets-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /Chapter07/Light Plot 0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter07/Light Plot 0.png -------------------------------------------------------------------------------- /Chapter07/Light Plot 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter07/Light Plot 1.png -------------------------------------------------------------------------------- /Chapter07/Light Plot 37.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter07/Light Plot 37.png -------------------------------------------------------------------------------- /Chapter07/Light Plot 38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter07/Light Plot 38.png -------------------------------------------------------------------------------- /Chapter07/Light Plot 39.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter07/Light Plot 39.png -------------------------------------------------------------------------------- 
/Chapter07/exoplanets.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6f04167636d2fcdce44af6fd5cbef359d8af033ce8457d8c8e917fdb822e044c 3 | size 262223348 4 | -------------------------------------------------------------------------------- /Chapter07/exoplanets.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter07/exoplanets.csv.zip -------------------------------------------------------------------------------- /Chapter08/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter08/.DS_Store -------------------------------------------------------------------------------- /Chapter08/.ipynb_checkpoints/Alternative_Base_Learners-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from sklearn.datasets import load_boston\n", 11 | "from sklearn.model_selection import cross_val_score\n", 12 | "from xgboost import XGBRegressor\n", 13 | "from sklearn.linear_model import LinearRegression\n", 14 | "from sklearn.linear_model import Lasso\n", 15 | "from sklearn.linear_model import Ridge\n", 16 | "from sklearn.model_selection import GridSearchCV\n", 17 | "from sklearn.model_selection import KFold\n", 18 | "from sklearn.metrics import mean_squared_error as MSE\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "import numpy as np\n", 21 | "import seaborn as sns\n", 22 | "sns.set()" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "X, y = load_boston(return_X_y=True)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "kfold = KFold(n_splits=5, shuffle=True, random_state=2)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "def regression_model(model):\n", 50 | " # Obtain scores of cross-validation using 10 splits and mean squared error\n", 51 | " scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kfold)\n", 52 | "\n", 53 | " # Take square root of the scores\n", 54 | " rmse = (-scores)**0.5\n", 55 | "\n", 56 | " # Return mean score\n", 57 | " return round(rmse.mean(), 2)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 5, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "6.07" 69 | ] 70 | }, 71 | "execution_count": 5, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "regression_model(XGBRegressor(booster='gblinear', objective='reg:squarederror'))" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 6, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "4.78" 89 | ] 90 | }, 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | 
"source": [ 97 | "regression_model(LinearRegression())" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 7, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "5.25" 109 | ] 110 | }, 111 | "execution_count": 7, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "regression_model(Lasso())" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 8, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "4.79" 129 | ] 130 | }, 131 | "execution_count": 8, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "regression_model(Ridge())" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 9, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "3.08" 149 | ] 150 | }, 151 | "execution_count": 9, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror'))" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 10, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "def grid_search(params, reg=XGBRegressor(booster='gblinear', objective='reg:squarederror')):\n", 167 | "\n", 168 | " # Instantiate GridSearchCV as grid_reg\n", 169 | " grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=kfold)\n", 170 | " \n", 171 | " # Fit grid_reg on X_train and y_train\n", 172 | " grid_reg.fit(X, y)\n", 173 | "\n", 174 | " # Extract best params\n", 175 | " best_params = grid_reg.best_params_\n", 176 | "\n", 177 | " # Print best params\n", 178 | " print(\"Best params:\", best_params)\n", 179 | " \n", 180 | " # Compute best score\n", 181 | " best_score = np.sqrt(-grid_reg.best_score_)\n", 182 | "\n", 183 | " # Print best score\n", 184 | " print(\"Training score: {:.2f}\".format(best_score))" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 11, 190 | "metadata": { 191 | "scrolled": true 192 | }, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "Best params: {'reg_alpha': 0.01}\n", 199 | "Training score: 6.12\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "grid_search(params={'reg_alpha':[0.01, 0.1, 0.5, 1, 2, 4]})" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 12, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "Best params: {'reg_lambda': 0.01}\n", 217 | "Training score: 5.95\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "grid_search(params={'reg_lambda':[0.01, 0.1, 0.5, 1, 2, 4]})" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 13, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "Best params: {'reg_lambda': 0.01}\n", 235 | "Training score: 5.95\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "grid_search(params={'reg_lambda':[0.01, 0.05, 0.15, 0.2]})" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 14, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "Best params: {'feature_selector': 'shuffle'}\n", 253 | "Training score: 6.19\n" 254 | ] 255 | 
} 256 | ], 257 | "source": [ 258 | "grid_search(params={'feature_selector':['shuffle']})" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 15, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "name": "stdout", 268 | "output_type": "stream", 269 | "text": [ 270 | "Best params: {'feature_selector': 'greedy', 'updater': 'coord_descent'}\n", 271 | "Training score: 5.67\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "grid_search(params={'feature_selector':['random', 'greedy', 'thrifty'], 'updater':['coord_descent'] })" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 16, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "Best params: {'feature_selector': 'greedy', 'top_k': 12, 'updater': 'coord_descent'}\n", 289 | "Training score: 5.67\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "grid_search(params={'feature_selector':['greedy', 'thrifty'], 'updater':['coord_descent'], 'top_k':[2, 4, 6, 8, 10, 12]})" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 24, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "Best params: {'feature_selector': 'greedy', 'learning_rate': 0.3, 'updater': 'coord_descent'}\n", 307 | "Training score: 5.55\n" 308 | ] 309 | } 310 | ], 311 | "source": [ 312 | "grid_search(params={'feature_selector':['greedy'], 'updater':['coord_descent'], \n", 313 | " 'learning_rate':[0.3]})" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 17, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "3.08" 325 | ] 326 | }, 327 | "execution_count": 17, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "regression_model(XGBRegressor(booster='dart', objective='reg:squarederror'))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 26, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/plain": [ 344 | "3.08" 345 | ] 346 | }, 347 | "execution_count": 26, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', sample_type='weighted', \n", 354 | " normalize_type='forest'))" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 43, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/plain": [ 365 | "3.07" 366 | ] 367 | }, 368 | "execution_count": 43, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', sample_type='weighted', \n", 375 | " rate_drop=0.001))" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 41, 381 | "metadata": {}, 382 | "outputs": [ 383 | { 384 | "data": { 385 | "text/plain": [ 386 | "3.09" 387 | ] 388 | }, 389 | "execution_count": 41, 390 | "metadata": {}, 391 | "output_type": "execute_result" 392 | } 393 | ], 394 | "source": [ 395 | "regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', sample_type='weighted', \n", 396 | " one_drop=1))" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [] 405 | }, 406 | { 407 | "cell_type": "code", 408 | 
"execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [] 412 | } 413 | ], 414 | "metadata": { 415 | "kernelspec": { 416 | "display_name": "Python 3", 417 | "language": "python", 418 | "name": "python3" 419 | }, 420 | "language_info": { 421 | "codemirror_mode": { 422 | "name": "ipython", 423 | "version": 3 424 | }, 425 | "file_extension": ".py", 426 | "mimetype": "text/x-python", 427 | "name": "python", 428 | "nbconvert_exporter": "python", 429 | "pygments_lexer": "ipython3", 430 | "version": "3.7.6" 431 | } 432 | }, 433 | "nbformat": 4, 434 | "nbformat_minor": 4 435 | } 436 | -------------------------------------------------------------------------------- /Chapter08/Alternative_Base_Learners.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "from sklearn.datasets import load_diabetes\n", 12 | "from sklearn.model_selection import cross_val_score\n", 13 | "from xgboost import XGBRegressor, XGBClassifier, XGBRFRegressor, XGBRFClassifier\n", 14 | "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", 15 | "from sklearn.linear_model import LinearRegression, LogisticRegression\n", 16 | "from sklearn.linear_model import Lasso, Ridge\n", 17 | "from sklearn.model_selection import GridSearchCV\n", 18 | "from sklearn.model_selection import KFold\n", 19 | "from sklearn.metrics import mean_squared_error as MSE" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "X, y = load_diabetes(return_X_y=True)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "kfold = KFold(n_splits=5, shuffle=True, random_state=2)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def regression_model(model):\n", 47 | " # Obtain scores of cross-validation using 10 splits and mean squared error\n", 48 | " scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kfold)\n", 49 | "\n", 50 | " # Take square root of the scores\n", 51 | " rmse = (-scores)**0.5\n", 52 | "\n", 53 | " # Return mean score\n", 54 | " return rmse.mean()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 5, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "55.503630186113604" 66 | ] 67 | }, 68 | "execution_count": 5, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "regression_model(XGBRegressor(booster='gblinear', objective='reg:squarederror'))" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 6, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "55.50927267834351" 86 | ] 87 | }, 88 | "execution_count": 6, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "regression_model(LinearRegression())" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "62.64900771743497" 106 | ] 107 | }, 108 | "execution_count": 7, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 
112 | ], 113 | "source": [ 114 | "regression_model(Lasso())" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 8, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "58.83525077919004" 126 | ] 127 | }, 128 | "execution_count": 8, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "regression_model(Ridge())" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 9, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "65.96608419624594" 146 | ] 147 | }, 148 | "execution_count": 9, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror'))" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 10, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "def grid_search(params, reg=XGBRegressor(booster='gblinear', objective='reg:squarederror')):\n", 164 | "\n", 165 | " # Instantiate GridSearchCV as grid_reg\n", 166 | " grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=kfold)\n", 167 | " \n", 168 | " # Fit grid_reg on X_train and y_train\n", 169 | " grid_reg.fit(X, y)\n", 170 | "\n", 171 | " # Extract best params\n", 172 | " best_params = grid_reg.best_params_\n", 173 | "\n", 174 | " # Print best params\n", 175 | " print(\"Best params:\", best_params)\n", 176 | " \n", 177 | " # Compute best score\n", 178 | " best_score = np.sqrt(-grid_reg.best_score_)\n", 179 | "\n", 180 | " # Print best score\n", 181 | " print(\"Best score:\", best_score)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 11, 187 | "metadata": { 188 | "scrolled": true 189 | }, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "Best params: {'reg_alpha': 0.001}\n", 196 | "Best score: 55.49491862176835\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "grid_search(params={'reg_alpha':[0.001, 0.01, 0.1, 0.5, 1, 5]})" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 12, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "Best params: {'reg_lambda': 0.001}\n", 214 | "Best score: 56.17163548052951\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "grid_search(params={'reg_lambda':[0.001, 0.01, 0.1, 0.5, 1, 5]})" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 13, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "Best params: {'feature_selector': 'shuffle'}\n", 232 | "Best score: 55.527107408614704\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "grid_search(params={'feature_selector':['shuffle']})" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 14, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "Best params: {'feature_selector': 'thrifty', 'updater': 'coord_descent'}\n", 250 | "Best score: 55.48798105805444\n" 251 | ] 252 | } 253 | ], 254 | "source": [ 255 | "grid_search(params={'feature_selector':['random', 'greedy', 'thrifty'], 'updater':['coord_descent'] })" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 15, 261 | "metadata": {}, 262 | "outputs": 
[ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "Best params: {'feature_selector': 'thrifty', 'top_k': 3, 'updater': 'coord_descent'}\n", 268 | "Best score: 55.478623763746256\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "grid_search(params={'feature_selector':['greedy', 'thrifty'], 'updater':['coord_descent'], 'top_k':[3, 5, 7, 9]})" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 16, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "X = np.arange(1,100)\n", 283 | "np.random.seed(2) \n", 284 | "y = []\n", 285 | "for i in X:\n", 286 | " y.append(i*np.random.uniform(-0.2, 0.2))\n", 287 | "y = np.array(y)\n", 288 | "X = X.reshape(X.shape[0], 1)\n", 289 | "y = y.reshape(y.shape[0], 1)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 17, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "6.214946302686011" 301 | ] 302 | }, 303 | "execution_count": 17, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "regression_model(XGBRegressor(booster='gblinear', objective='reg:squarederror'))" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 18, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/plain": [ 320 | "9.37235946501318" 321 | ] 322 | }, 323 | "execution_count": 18, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror'))" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 19, 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "data": { 339 | "text/plain": [ 340 | "6.214962315808842" 341 | ] 342 | }, 343 | "execution_count": 19, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "regression_model(LinearRegression())" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 20, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "X, y = load_diabetes(return_X_y=True)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 21, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "data": { 368 | "text/plain": [ 369 | "65.96444746130739" 370 | ] 371 | }, 372 | "execution_count": 21, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | } 376 | ], 377 | "source": [ 378 | "regression_model(XGBRegressor(booster='dart', objective='reg:squarederror'))" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 22, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "df_census = pd.read_csv('census_cleaned.csv')\n", 388 | "X_census = df_census.iloc[:, :-1]\n", 389 | "y_census = df_census.iloc[:, -1]" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 23, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "def classification_model(model):\n", 399 | " # Obtain scores of cross-validation using 10 splits and mean squared error\n", 400 | " scores = cross_val_score(model, X_census, y_census, scoring='accuracy', cv=kfold)\n", 401 | "\n", 402 | " # Return mean score\n", 403 | " return scores.mean()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 24, 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "data": { 413 | "text/plain": [ 414 | "0.8701208195968675" 415 | ] 416 | 
}, 417 | "execution_count": 24, 418 | "metadata": {}, 419 | "output_type": "execute_result" 420 | } 421 | ], 422 | "source": [ 423 | "classification_model(XGBClassifier(booster='gbtree'))" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 25, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/plain": [ 434 | "0.8701208195968675" 435 | ] 436 | }, 437 | "execution_count": 25, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "classification_model(XGBClassifier(booster='dart'))" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 26, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "data": { 453 | "text/plain": [ 454 | "0.8500354281042902" 455 | ] 456 | }, 457 | "execution_count": 26, 458 | "metadata": {}, 459 | "output_type": "execute_result" 460 | } 461 | ], 462 | "source": [ 463 | "classification_model(XGBClassifier(booster='gblinear'))" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 27, 469 | "metadata": {}, 470 | "outputs": [ 471 | { 472 | "data": { 473 | "text/plain": [ 474 | "0.8008968643699182" 475 | ] 476 | }, 477 | "execution_count": 27, 478 | "metadata": {}, 479 | "output_type": "execute_result" 480 | } 481 | ], 482 | "source": [ 483 | "classification_model(LogisticRegression(max_iter=1000))" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 28, 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "data": { 493 | "text/plain": [ 494 | "0.8718714338474818" 495 | ] 496 | }, 497 | "execution_count": 28, 498 | "metadata": {}, 499 | "output_type": "execute_result" 500 | } 501 | ], 502 | "source": [ 503 | "classification_model(XGBClassifier(booster='dart', one_drop=1))" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 29, 509 | "metadata": {}, 510 | "outputs": [ 511 | { 512 | "data": { 513 | "text/plain": [ 514 | "65.96444746130739" 515 | ] 516 | }, 517 | "execution_count": 29, 518 | "metadata": {}, 519 | "output_type": "execute_result" 520 | } 521 | ], 522 | "source": [ 523 | "regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', sample_type='weighted'))" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 30, 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "data": { 533 | "text/plain": [ 534 | "65.96444746130739" 535 | ] 536 | }, 537 | "execution_count": 30, 538 | "metadata": {}, 539 | "output_type": "execute_result" 540 | } 541 | ], 542 | "source": [ 543 | "regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', normalize_type='forest'))" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 31, 549 | "metadata": {}, 550 | "outputs": [ 551 | { 552 | "data": { 553 | "text/plain": [ 554 | "61.81275131335009" 555 | ] 556 | }, 557 | "execution_count": 31, 558 | "metadata": {}, 559 | "output_type": "execute_result" 560 | } 561 | ], 562 | "source": [ 563 | "regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=1))" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": 32, 569 | "metadata": {}, 570 | "outputs": [ 571 | { 572 | "name": "stdout", 573 | "output_type": "stream", 574 | "text": [ 575 | "Best params: {'rate_drop': 0.2}\n", 576 | "Best score: 61.07249602732062\n" 577 | ] 578 | } 579 | ], 580 | "source": [ 581 | "grid_search(params={'rate_drop':[0.01, 0.1, 0.2, 0.4]}, \n", 582 | " reg=XGBRegressor(booster='dart', 
objective='reg:squarederror', one_drop=1))" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 33, 588 | "metadata": {}, 589 | "outputs": [ 590 | { 591 | "name": "stdout", 592 | "output_type": "stream", 593 | "text": [ 594 | "Best params: {'skip_drop': 0.1}\n", 595 | "Best score: 62.879753748627635\n" 596 | ] 597 | } 598 | ], 599 | "source": [ 600 | "grid_search(params={'skip_drop': [0.01, 0.1, 0.2, 0.4]}, \n", 601 | " reg=XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=1))" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 34, 607 | "metadata": {}, 608 | "outputs": [ 609 | { 610 | "data": { 611 | "text/plain": [ 612 | "65.96604877151103" 613 | ] 614 | }, 615 | "execution_count": 34, 616 | "metadata": {}, 617 | "output_type": "execute_result" 618 | } 619 | ], 620 | "source": [ 621 | "regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=25))" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 35, 627 | "metadata": {}, 628 | "outputs": [ 629 | { 630 | "data": { 631 | "text/plain": [ 632 | "65.96445649315855" 633 | ] 634 | }, 635 | "execution_count": 35, 636 | "metadata": {}, 637 | "output_type": "execute_result" 638 | } 639 | ], 640 | "source": [ 641 | "regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=5))" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 36, 647 | "metadata": {}, 648 | "outputs": [ 649 | { 650 | "data": { 651 | "text/plain": [ 652 | "59.447250741400595" 653 | ] 654 | }, 655 | "execution_count": 36, 656 | "metadata": {}, 657 | "output_type": "execute_result" 658 | } 659 | ], 660 | "source": [ 661 | "regression_model(XGBRFRegressor(objective='reg:squarederror'))" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 37, 667 | "metadata": {}, 668 | "outputs": [ 669 | { 670 | "data": { 671 | "text/plain": [ 672 | "59.46563031802505" 673 | ] 674 | }, 675 | "execution_count": 37, 676 | "metadata": {}, 677 | "output_type": "execute_result" 678 | } 679 | ], 680 | "source": [ 681 | "regression_model(RandomForestRegressor())" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 38, 687 | "metadata": {}, 688 | "outputs": [ 689 | { 690 | "data": { 691 | "text/plain": [ 692 | "0.856085650471878" 693 | ] 694 | }, 695 | "execution_count": 38, 696 | "metadata": {}, 697 | "output_type": "execute_result" 698 | } 699 | ], 700 | "source": [ 701 | "classification_model(XGBRFClassifier())" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 39, 707 | "metadata": { 708 | "scrolled": true 709 | }, 710 | "outputs": [ 711 | { 712 | "data": { 713 | "text/plain": [ 714 | "0.8555328202034789" 715 | ] 716 | }, 717 | "execution_count": 39, 718 | "metadata": {}, 719 | "output_type": "execute_result" 720 | } 721 | ], 722 | "source": [ 723 | "classification_model(RandomForestClassifier())" 724 | ] 725 | } 726 | ], 727 | "metadata": { 728 | "kernelspec": { 729 | "display_name": "Python 3", 730 | "language": "python", 731 | "name": "python3" 732 | }, 733 | "language_info": { 734 | "codemirror_mode": { 735 | "name": "ipython", 736 | "version": 3 737 | }, 738 | "file_extension": ".py", 739 | "mimetype": "text/x-python", 740 | "name": "python", 741 | "nbconvert_exporter": "python", 742 | "pygments_lexer": "ipython3", 743 | "version": "3.7.7" 744 | } 745 | }, 746 | "nbformat": 4, 747 | "nbformat_minor": 4 748 | } 749 | 
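Note: Alternative_Base_Learners.ipynb above tunes the gblinear booster one piece at a time (reg_alpha, reg_lambda, feature_selector, updater, top_k) on the diabetes data. A minimal sketch (not a cell from the repository) that combines those tuned settings into one cross-validated regressor, assuming the same load_diabetes data and the 5-fold KFold with random_state=2 used in the notebook:

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor

X, y = load_diabetes(return_X_y=True)
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

# gblinear with the coordinate-descent updater, thrifty feature selector,
# and the top_k / reg_alpha values explored in the notebook above.
linear_booster = XGBRegressor(booster='gblinear', objective='reg:squarederror',
                              updater='coord_descent', feature_selector='thrifty',
                              top_k=3, reg_alpha=0.001)

scores = cross_val_score(linear_booster, X, y, scoring='neg_mean_squared_error', cv=kfold)
rmse = np.sqrt(-scores)
print('RMSE mean: %0.2f' % rmse.mean())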
-------------------------------------------------------------------------------- /Chapter08/census_cleaned.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6836a2159748b9d132c8c403faf0da5219db6486162fdbf8317de7ee8a319080 3 | size 6311513 4 | -------------------------------------------------------------------------------- /Chapter09/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter09/.DS_Store -------------------------------------------------------------------------------- /Chapter09/cab_rides.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:873b7eb90a31222791ff15bf728acdc7e8033a3308a822828eb8fd2f3b4c0947 3 | size 88761783 4 | -------------------------------------------------------------------------------- /Chapter09/weather.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:617b395adb1e00bd00bd44cf6449d28457bdb0a6d1a84e91afa41c1c2fa0661b 3 | size 349993 4 | -------------------------------------------------------------------------------- /Chapter10/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/fe11b99506559598ba3432bc7a2678d7996e724c/Chapter10/.DS_Store -------------------------------------------------------------------------------- /Chapter10/.ipynb_checkpoints/XGBoost_Model_Deployment-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
" 56 | ], 57 | "text/plain": [ 58 | " school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3\n", 59 | "0 GP;\"F\";18;\"U\";\"GT3\";\"A\";4;4;\"at_home\";\"teacher... \n", 60 | "1 GP;\"F\";17;\"U\";\"GT3\";\"T\";1;1;\"at_home\";\"other\";... \n", 61 | "2 GP;\"F\";15;\"U\";\"LE3\";\"T\";1;1;\"at_home\";\"other\";... \n", 62 | "3 GP;\"F\";15;\"U\";\"GT3\";\"T\";4;2;\"health\";\"services... \n", 63 | "4 GP;\"F\";16;\"U\";\"GT3\";\"T\";3;3;\"other\";\"other\";\"h... " 64 | ] 65 | }, 66 | "execution_count": 1, 67 | "metadata": {}, 68 | "output_type": "execute_result" 69 | } 70 | ], 71 | "source": [ 72 | "import pandas as pd\n", 73 | "import warnings\n", 74 | "warnings.filterwarnings('ignore')\n", 75 | "df = pd.read_csv('student-por.csv')\n", 76 | "df.head()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/html": [ 87 | "
" 253 | ], 254 | "text/plain": [ 255 | " school sex age address famsize Pstatus Medu Fedu Mjob Fjob \\\n", 256 | "0 GP Null 18 U GT3 A 4 4 at_home teacher \n", 257 | "1 GP F Null U GT3 T 1 1 at_home other \n", 258 | "2 GP F 15 U LE3 T 1 1 at_home other \n", 259 | "3 GP F 15 U GT3 T 4 2 health services \n", 260 | "4 GP F 16 U GT3 T 3 3 other other \n", 261 | "\n", 262 | " ... famrel freetime goout Dalc Walc health absences G1 G2 G3 \n", 263 | "0 ... 4 3 4 1 1 3 4 0 11 11 \n", 264 | "1 ... 5 3 3 1 1 3 2 9 11 11 \n", 265 | "2 ... 4 3 2 2 3 3 6 12 13 12 \n", 266 | "3 ... 3 2 2 1 1 5 0 14 14 14 \n", 267 | "4 ... 4 3 2 1 2 5 0 11 13 13 \n", 268 | "\n", 269 | "[5 rows x 33 columns]" 270 | ] 271 | }, 272 | "execution_count": 4, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "df = pd.read_csv('student-por.csv', sep=';')\n", 279 | "df.head()" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 3, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "school 0\n", 291 | "sex 0\n", 292 | "age 0\n", 293 | "address 0\n", 294 | "famsize 0\n", 295 | "Pstatus 0\n", 296 | "Medu 0\n", 297 | "Fedu 0\n", 298 | "Mjob 0\n", 299 | "Fjob 0\n", 300 | "reason 0\n", 301 | "guardian 0\n", 302 | "traveltime 0\n", 303 | "studytime 0\n", 304 | "failures 0\n", 305 | "schoolsup 0\n", 306 | "famsup 0\n", 307 | "paid 0\n", 308 | "activities 0\n", 309 | "nursery 0\n", 310 | "higher 0\n", 311 | "internet 0\n", 312 | "romantic 0\n", 313 | "famrel 0\n", 314 | "freetime 0\n", 315 | "goout 0\n", 316 | "Dalc 0\n", 317 | "Walc 0\n", 318 | "health 0\n", 319 | "absences 0\n", 320 | "G1 0\n", 321 | "G2 0\n", 322 | "G3 0\n", 323 | "dtype: int64" 324 | ] 325 | }, 326 | "execution_count": 3, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "df.isnull().sum()" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [] 341 | } 342 | ], 343 | "metadata": { 344 | "kernelspec": { 345 | "display_name": "Python 3", 346 | "language": "python", 347 | "name": "python3" 348 | }, 349 | "language_info": { 350 | "codemirror_mode": { 351 | "name": "ipython", 352 | "version": 3 353 | }, 354 | "file_extension": ".py", 355 | "mimetype": "text/x-python", 356 | "name": "python", 357 | "nbconvert_exporter": "python", 358 | "pygments_lexer": "ipython3", 359 | "version": "3.7.6" 360 | } 361 | }, 362 | "nbformat": 4, 363 | "nbformat_minor": 4 364 | } 365 | -------------------------------------------------------------------------------- /Chapter10/student-por.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1fe27670d582a1d638b7f200acb4186ad62164219ac0269e2662097b188b3cce 3 | size 93216 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished 
to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | ## Machine Learning Summit 2025 4 | **Bridging Theory and Practice: ML Solutions for Today’s Challenges** 5 | 6 | 3 days, 20+ experts, and 25+ tech sessions and talks covering critical aspects of: 7 | - **Agentic and Generative AI** 8 | - **Applied Machine Learning in the Real World** 9 | - **ML Engineering and Optimization** 10 | 11 | 👉 [Book your ticket now >>](https://packt.link/mlsumgh) 12 | 13 | --- 14 | 15 | ## Join Our Newsletters 📬 16 | 17 | ### DataPro 18 | *The future of AI is unfolding. Don’t fall behind.* 19 | 20 |

21 | 22 | Stay ahead with [**DataPro**](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes), the free weekly newsletter for data scientists, AI/ML researchers, and data engineers. 23 | From trending tools like **PyTorch**, **scikit-learn**, **XGBoost**, and **BentoML** to hands-on insights on **database optimization** and real-world **ML workflows**, you’ll get what matters, fast. 24 | 25 | > Stay sharp with [DataPro](https://landing.packtpub.com/subscribe-datapronewsletter/?link_from_packtlink=yes). Join **115K+ data professionals** who never miss a beat. 26 | 27 | --- 28 | 29 | ### BIPro 30 | *Business runs on data. Make sure yours tells the right story.* 31 | 32 |

33 | 34 | [**BIPro**](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes) is your free weekly newsletter for BI professionals, analysts, and data leaders. 35 | Get practical tips on **dashboarding**, **data visualization**, and **analytics strategy** with tools like **Power BI**, **Tableau**, **Looker**, **SQL**, and **dbt**. 36 | 37 | > Get smarter with [BIPro](https://landing.packtpub.com/subscribe-bipro-newsletter/?link_from_packtlink=yes). Trusted by **35K+ BI professionals**, see what you’re missing. 38 | 39 | 40 | 41 | 42 | # Hands-On Gradient Boosting with XGBoost and scikit-learn 43 | 44 | Hands-On Gradient Boosting with XGBoost and scikit-learn 45 | 46 | This is the code repository for [Hands-On Gradient Boosting with XGBoost and scikit-learn](https://www.packtpub.com/product/hands-on-gradient-boosting-with-xgboost-and-scikit-learn/9781839218354), published by Packt. 47 | 48 | **Perform accessible machine learning and extreme gradient boosting with Python** 49 | 50 | ## What is this book about? 51 | XGBoost is an industry-proven, open-source software library that provides a gradient boosting framework for scaling billions of data points quickly and efficiently. 52 | 53 | This book covers the following exciting features: 54 | * Build gradient boosting models from scratch 55 | * Develop XGBoost regressors and classifiers with accuracy and speed 56 | * Analyze variance and bias in terms of fine-tuning XGBoost hyperparameters 57 | * Automatically correct missing values and scale imbalanced data 58 | * Apply alternative base learners like dart, linear models, and XGBoost random forests 59 | 60 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/10DigitISBN) today! 61 | 62 | https://www.packtpub.com/ 64 | 65 | 66 | ## Instructions and Navigations 67 | All of the code is organized into folders. For example, Chapter02. 68 | 69 | The code will look like the following: 70 | ``` 71 | cross_val(LogisticRegression()) 72 | ``` 73 | 74 | **Following is what you need for this book:** 75 | This book is for data science professionals and enthusiasts, data analysts, and developers who want to build fast and accurate machine learning models that scale with big data. Proficiency in Python, along with a basic understanding of linear algebra, will help you to get the most out of this book. 76 | 77 | With the following software and hardware list you can run all code files present in the book (Chapters 1-10). 78 | 79 | ### Software and Hardware List 80 | 81 | | Chapter | Software required | OS required | 82 | | -------- | ------------------------------------| -----------------------------------| 83 | | 1 | Anaconda: Jupyter Notebook / sklearn 0.23 | Windows, Mac OS X, and Linux (Any) | 84 | | 2 | Anaconda: Python 3.7 | Windows, Mac OS X, and Linux (Any) | 85 | | 3 | xgboost 1.2 | Windows, Mac OS X, and Linux (Any) | 86 | 87 | 88 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://static.packt-cdn.com/downloads/9781839218354_ColorImages.pdf). 
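The `cross_val(LogisticRegression())` snippet under Instructions and Navigations above calls a small cross-validation helper built in the book's notebooks rather than a scikit-learn function. A minimal sketch is given below; the heart_disease.csv dataset (included in the Chapter02 and Chapter06 folders), the 5-fold setting, and the printed format are assumptions for illustration, not the book's exact code.

```
# Hedged sketch of a cross_val helper like the one quoted above.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Assumed dataset: heart_disease.csv with the target in the last column.
df = pd.read_csv('heart_disease.csv')
X, y = df.iloc[:, :-1], df.iloc[:, -1]

def cross_val(classifier, cv=5):
    """Print and return the mean cross-validated accuracy of a classifier."""
    scores = cross_val_score(classifier, X, y, cv=cv)
    print(f'Accuracy: {scores.mean():.3f}')
    return scores.mean()

# max_iter raised only to avoid convergence warnings on unscaled data.
cross_val(LogisticRegression(max_iter=1000))
```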
89 | 90 | 91 | ### Related products 92 | * Hands-On Machine Learning with scikit-learn and Scientific Python Toolkits [[Packt]](https://www.packtpub.com/product/hands-on-machine-learning-with-scikit-learn-and-scientific-python-toolkits/9781838826048) [[Amazon]](https://www.amazon.com/dp/1838826041) 93 | 94 | * Mastering Machine Learning Algorithms- Second Edition [[Packt]](https://www.packtpub.com/product/mastering-machine-learning-algorithms-second-edition/9781838820299) [[Amazon]](https://www.amazon.com/dp/1838820299) 95 | 96 | ## Get to Know the Author 97 | **Corey Wade** 98 | M.S. Mathematics, M.F.A. Writing and Consciousness, is the founder and director of Berkeley Coding Academy, where he teaches machine learning and AI to teens from all over the world. Additionally, Corey chairs the Math Department at the Independent Study Program of Berkeley High School, where he teaches programming and advanced math. His additional experience includes teaching natural language processing with Hello World, developing data science curricula with Pathstream, and publishing original statistics (3NG) and machine learning articles with Towards Data Science, Springboard, and Medium. Corey is co-author of the Python Workshop, also published by Packt. 99 | 100 | ### Suggestions and Feedback 101 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions. 102 | ### Download a free PDF 103 | 104 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
105 |

https://packt.link/free-ebook/9781839218354

--------------------------------------------------------------------------------