├── Data ├── Diabetes.csv ├── bikesharing_test.csv ├── bikesharing_train.csv ├── daily-total-female-births.csv └── iris_all.csv ├── Notebooks ├── 01-XGBoost_BikeRental_Data_Preparation.ipynb ├── 02-XGBoost_Regression_BikeRental.ipynb ├── 03-XGBoost_Binary_Classification_Diabetes_Dataset.ipynb ├── 04-XGBoost_Course_Prepare_Iris_Dataset.ipynb ├── 05-XGBoost_Course_Multiclass_Classification_Iris_Dataset.ipynb ├── 06-XGBoost-TimeSeries.ipynb ├── 07-XGBoost_Feature_Importance_Selection_Diabetes_Dataset.ipynb ├── 08-XGBoost_Hyperparameter_Tuning_Diabetes_Dataset.ipynb ├── 09-AWS_XGBoost_Train_Host_Predict.ipynb └── 10-AWS_XGBoost_Invoke_Endpoint_Predict.ipynb └── README.md /Data/Diabetes.csv: -------------------------------------------------------------------------------- 1 | Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome 2 | 6,148.0,72,35,,33.6,0.627,50.0,1 3 | 1,85.0,66,29,,26.6,0.35100000000000003,31.0,0 4 | 8,183.0,64,0,,23.3,0.672,32.0,1 5 | 1,89.0,66,23,94.0,28.1,0.16699999999999998,21.0,0 6 | 0,137.0,40,35,168.0,43.1,2.2880000000000003,33.0,1 7 | 5,116.0,74,0,,25.6,0.201,30.0,0 8 | 3,78.0,50,32,88.0,31.0,0.248,26.0,1 9 | 10,115.0,0,0,,35.3,0.134,29.0,0 10 | 2,197.0,70,45,543.0,30.5,0.158,53.0,1 11 | 8,125.0,96,0,,,0.23199999999999998,54.0,1 12 | 4,110.0,92,0,,37.6,0.191,30.0,0 13 | 10,168.0,74,0,,38.0,0.537,34.0,1 14 | 10,139.0,80,0,,27.1,1.4409999999999998,57.0,0 15 | 1,189.0,60,23,846.0,30.1,0.39799999999999996,59.0,1 16 | 5,166.0,72,19,175.0,25.8,0.5870000000000001,51.0,1 17 | 7,100.0,0,0,,30.0,0.484,32.0,1 18 | 0,118.0,84,47,230.0,45.8,0.551,31.0,1 19 | 7,107.0,74,0,,29.6,0.254,31.0,1 20 | 1,103.0,30,38,83.0,43.3,0.183,33.0,0 21 | 1,115.0,70,30,96.0,34.6,0.529,32.0,1 22 | 3,126.0,88,41,235.0,39.3,0.7040000000000001,27.0,0 23 | 8,99.0,84,0,,35.4,0.38799999999999996,50.0,0 24 | 7,196.0,90,0,,39.8,0.451,41.0,1 25 | 9,119.0,80,35,,29.0,0.263,29.0,1 26 | 11,143.0,94,33,146.0,36.6,0.254,51.0,1 27 | 
10,125.0,70,26,115.0,31.1,0.205,41.0,1 28 | 7,147.0,76,0,,39.4,0.257,43.0,1 29 | 1,97.0,66,15,140.0,23.2,0.48700000000000004,22.0,0 30 | 13,145.0,82,19,110.0,22.2,0.245,57.0,0 31 | 5,117.0,92,0,,34.1,0.337,38.0,0 32 | 5,109.0,75,26,,36.0,0.546,60.0,0 33 | 3,158.0,76,36,245.0,31.6,0.851,28.0,1 34 | 3,88.0,58,11,54.0,24.8,0.267,22.0,0 35 | 6,92.0,92,0,,19.9,0.188,28.0,0 36 | 10,122.0,78,31,,27.6,0.512,45.0,0 37 | 4,103.0,60,33,192.0,24.0,0.966,33.0,0 38 | 11,138.0,76,0,,33.2,0.42,35.0,0 39 | 9,102.0,76,37,,32.9,0.665,46.0,1 40 | 2,90.0,68,42,,38.2,0.503,27.0,1 41 | 4,111.0,72,47,207.0,37.1,1.39,56.0,1 42 | 3,180.0,64,25,70.0,34.0,0.271,26.0,0 43 | 7,133.0,84,0,,40.2,0.696,37.0,0 44 | 7,106.0,92,18,,22.7,0.235,48.0,0 45 | 9,171.0,110,24,240.0,45.4,0.721,54.0,1 46 | 7,159.0,64,0,,27.4,0.294,40.0,0 47 | 0,180.0,66,39,,42.0,1.893,25.0,1 48 | 1,146.0,56,0,,29.7,0.564,29.0,0 49 | 2,71.0,70,27,,28.0,0.586,22.0,0 50 | 7,103.0,66,32,,39.1,0.344,31.0,1 51 | 7,105.0,0,0,,,0.305,24.0,0 52 | 1,103.0,80,11,82.0,19.4,0.491,22.0,0 53 | 1,101.0,50,15,36.0,24.2,0.526,26.0,0 54 | 5,88.0,66,21,23.0,24.4,0.342,30.0,0 55 | 8,176.0,90,34,300.0,33.7,0.467,58.0,1 56 | 7,150.0,66,42,342.0,34.7,0.718,42.0,0 57 | 1,73.0,50,10,,23.0,0.248,21.0,0 58 | 7,187.0,68,39,304.0,37.7,0.254,41.0,1 59 | 0,100.0,88,60,110.0,46.8,0.9620000000000001,31.0,0 60 | 0,146.0,82,0,,40.5,1.781,44.0,0 61 | 0,105.0,64,41,142.0,41.5,0.17300000000000001,22.0,0 62 | 2,84.0,0,0,,,0.304,21.0,0 63 | 8,133.0,72,0,,32.9,0.27,39.0,1 64 | 5,44.0,62,0,,25.0,0.5870000000000001,36.0,0 65 | 2,141.0,58,34,128.0,25.4,0.6990000000000001,24.0,0 66 | 7,114.0,66,0,,32.8,0.258,42.0,1 67 | 5,99.0,74,27,,29.0,0.203,32.0,0 68 | 0,109.0,88,30,,32.5,0.855,38.0,1 69 | 2,109.0,92,0,,42.7,0.845,54.0,0 70 | 1,95.0,66,13,38.0,19.6,0.33399999999999996,25.0,0 71 | 4,146.0,85,27,100.0,28.9,0.18899999999999997,27.0,0 72 | 2,100.0,66,20,90.0,32.9,0.867,28.0,1 73 | 5,139.0,64,35,140.0,28.6,0.41100000000000003,26.0,0 74 | 13,126.0,90,0,,43.4,0.583,42.0,1 
75 | 4,129.0,86,20,270.0,35.1,0.231,23.0,0 76 | 1,79.0,75,30,,32.0,0.396,22.0,0 77 | 1,,48,20,,24.7,0.14,22.0,0 78 | 7,62.0,78,0,,32.6,0.391,41.0,0 79 | 5,95.0,72,33,,37.7,0.37,27.0,0 80 | 0,131.0,0,0,,43.2,0.27,26.0,1 81 | 2,112.0,66,22,,25.0,0.307,24.0,0 82 | 3,113.0,44,13,,22.4,0.14,22.0,0 83 | 2,74.0,0,0,,,0.102,22.0,0 84 | 7,83.0,78,26,71.0,29.3,0.767,36.0,0 85 | 0,101.0,65,28,,24.6,0.237,22.0,0 86 | 5,137.0,108,0,,48.8,0.22699999999999998,37.0,1 87 | 2,110.0,74,29,125.0,32.4,0.698,27.0,0 88 | 13,106.0,72,54,,36.6,0.17800000000000002,45.0,0 89 | 2,100.0,68,25,71.0,38.5,0.324,26.0,0 90 | 15,136.0,70,32,110.0,37.1,0.153,43.0,1 91 | 1,107.0,68,19,,26.5,0.165,24.0,0 92 | 1,80.0,55,0,,19.1,0.258,21.0,0 93 | 4,123.0,80,15,176.0,32.0,0.44299999999999995,34.0,0 94 | 7,81.0,78,40,48.0,46.7,0.261,42.0,0 95 | 4,134.0,72,0,,23.8,0.27699999999999997,60.0,1 96 | 2,142.0,82,18,64.0,24.7,0.7609999999999999,21.0,0 97 | 6,144.0,72,27,228.0,33.9,0.255,40.0,0 98 | 2,92.0,62,28,,31.6,0.13,24.0,0 99 | 1,71.0,48,18,76.0,20.4,0.32299999999999995,22.0,0 100 | 6,93.0,50,30,64.0,28.7,0.35600000000000004,23.0,0 101 | 1,122.0,90,51,220.0,49.7,0.325,31.0,1 102 | 1,163.0,72,0,,39.0,1.222,33.0,1 103 | 1,151.0,60,0,,26.1,0.179,22.0,0 104 | 0,125.0,96,0,,22.5,0.262,21.0,0 105 | 1,81.0,72,18,40.0,26.6,0.28300000000000003,24.0,0 106 | 2,85.0,65,0,,39.6,0.93,27.0,0 107 | 1,126.0,56,29,152.0,28.7,0.8009999999999999,21.0,0 108 | 1,96.0,122,0,,22.4,0.207,27.0,0 109 | 4,144.0,58,28,140.0,29.5,0.287,37.0,0 110 | 3,83.0,58,31,18.0,34.3,0.336,25.0,0 111 | 0,95.0,85,25,36.0,37.4,0.247,24.0,1 112 | 3,171.0,72,33,135.0,33.3,0.19899999999999998,24.0,1 113 | 8,155.0,62,26,495.0,34.0,0.5429999999999999,46.0,1 114 | 1,89.0,76,34,37.0,31.2,0.192,23.0,0 115 | 4,76.0,62,0,,34.0,0.391,25.0,0 116 | 7,160.0,54,32,175.0,30.5,0.588,39.0,1 117 | 4,146.0,92,0,,31.2,0.539,61.0,1 118 | 5,124.0,74,0,,34.0,0.22,38.0,1 119 | 5,78.0,48,0,,33.7,0.654,25.0,0 120 | 4,97.0,60,23,,28.2,0.44299999999999995,22.0,0 121 | 
4,99.0,76,15,51.0,23.2,0.223,21.0,0 122 | 0,162.0,76,56,100.0,53.2,0.759,25.0,1 123 | 6,111.0,64,39,,34.2,0.26,24.0,0 124 | 2,107.0,74,30,100.0,33.6,0.40399999999999997,23.0,0 125 | 5,132.0,80,0,,26.8,0.18600000000000003,69.0,0 126 | 0,113.0,76,0,,33.3,0.278,23.0,1 127 | 1,88.0,30,42,99.0,55.0,0.496,26.0,1 128 | 3,120.0,70,30,135.0,42.9,0.452,30.0,0 129 | 1,118.0,58,36,94.0,33.3,0.261,23.0,0 130 | 1,117.0,88,24,145.0,34.5,0.40299999999999997,40.0,1 131 | 0,105.0,84,0,,27.9,0.741,62.0,1 132 | 4,173.0,70,14,168.0,29.7,0.361,33.0,1 133 | 9,122.0,56,0,,33.3,1.114,33.0,1 134 | 3,170.0,64,37,225.0,34.5,0.35600000000000004,30.0,1 135 | 8,84.0,74,31,,38.3,0.457,39.0,0 136 | 2,96.0,68,13,49.0,21.1,0.647,26.0,0 137 | 2,125.0,60,20,140.0,33.8,0.08800000000000001,31.0,0 138 | 0,100.0,70,26,50.0,30.8,0.597,21.0,0 139 | 0,93.0,60,25,92.0,28.7,0.532,22.0,0 140 | 0,129.0,80,0,,31.2,0.703,29.0,0 141 | 5,105.0,72,29,325.0,36.9,0.159,28.0,0 142 | 3,128.0,78,0,,21.1,0.268,55.0,0 143 | 5,106.0,82,30,,39.5,0.28600000000000003,38.0,0 144 | 2,108.0,52,26,63.0,32.5,0.318,22.0,0 145 | 10,108.0,66,0,,32.4,0.272,42.0,1 146 | 4,154.0,62,31,284.0,32.8,0.237,23.0,0 147 | 0,102.0,75,23,,,0.5720000000000001,21.0,0 148 | 9,57.0,80,37,,32.8,0.096,41.0,0 149 | 2,106.0,64,35,119.0,30.5,1.4,34.0,0 150 | 5,147.0,78,0,,33.7,0.218,65.0,0 151 | 2,90.0,70,17,,27.3,0.085,22.0,0 152 | 1,136.0,74,50,204.0,37.4,0.39899999999999997,24.0,0 153 | 4,114.0,65,0,,21.9,0.43200000000000005,37.0,0 154 | 9,156.0,86,28,155.0,34.3,1.189,42.0,1 155 | 1,153.0,82,42,485.0,40.6,0.687,23.0,0 156 | 8,188.0,78,0,,47.9,0.13699999999999998,43.0,1 157 | 7,152.0,88,44,,50.0,0.337,36.0,1 158 | 2,99.0,52,15,94.0,24.6,0.637,21.0,0 159 | 1,109.0,56,21,135.0,25.2,0.833,23.0,0 160 | 2,88.0,74,19,53.0,29.0,0.22899999999999998,22.0,0 161 | 17,163.0,72,41,114.0,40.9,0.8170000000000001,47.0,1 162 | 4,151.0,90,38,,29.7,0.294,36.0,0 163 | 7,102.0,74,40,105.0,37.2,0.204,45.0,0 164 | 0,114.0,80,34,285.0,44.2,0.16699999999999998,27.0,0 165 | 
2,100.0,64,23,,29.7,0.368,21.0,0 166 | 0,131.0,88,0,,31.6,0.743,32.0,1 167 | 6,104.0,74,18,156.0,29.9,0.722,41.0,1 168 | 3,148.0,66,25,,32.5,0.256,22.0,0 169 | 4,120.0,68,0,,29.6,0.7090000000000001,34.0,0 170 | 4,110.0,66,0,,31.9,0.47100000000000003,29.0,0 171 | 3,111.0,90,12,78.0,28.4,0.495,29.0,0 172 | 6,102.0,82,0,,30.8,0.18,36.0,1 173 | 6,134.0,70,23,130.0,35.4,0.542,29.0,1 174 | 2,87.0,0,23,,28.9,0.773,25.0,0 175 | 1,79.0,60,42,48.0,43.5,0.6779999999999999,23.0,0 176 | 2,75.0,64,24,55.0,29.7,0.37,33.0,0 177 | 8,179.0,72,42,130.0,32.7,0.7190000000000001,36.0,1 178 | 6,85.0,78,0,,31.2,0.382,42.0,0 179 | 0,129.0,110,46,130.0,67.1,0.319,26.0,1 180 | 5,143.0,78,0,,45.0,0.19,47.0,0 181 | 5,130.0,82,0,,39.1,0.956,37.0,1 182 | 6,87.0,80,0,,23.2,0.084,32.0,0 183 | 0,119.0,64,18,92.0,34.9,0.725,23.0,0 184 | 1,,74,20,23.0,27.7,0.299,21.0,0 185 | 5,73.0,60,0,,26.8,0.268,27.0,0 186 | 4,141.0,74,0,,27.6,0.244,40.0,0 187 | 7,194.0,68,28,,35.9,0.745,41.0,1 188 | 8,181.0,68,36,495.0,30.1,0.615,60.0,1 189 | 1,128.0,98,41,58.0,32.0,1.321,33.0,1 190 | 8,109.0,76,39,114.0,27.9,0.64,31.0,1 191 | 5,139.0,80,35,160.0,31.6,0.361,25.0,1 192 | 3,111.0,62,0,,22.6,0.142,21.0,0 193 | 9,123.0,70,44,94.0,33.1,0.374,40.0,0 194 | 7,159.0,66,0,,30.4,0.38299999999999995,36.0,1 195 | 11,135.0,0,0,,52.3,0.578,40.0,1 196 | 8,85.0,55,20,,24.4,0.136,42.0,0 197 | 5,158.0,84,41,210.0,39.4,0.395,29.0,1 198 | 1,105.0,58,0,,24.3,0.187,21.0,0 199 | 3,107.0,62,13,48.0,22.9,0.6779999999999999,23.0,1 200 | 4,109.0,64,44,99.0,34.8,0.905,26.0,1 201 | 4,148.0,60,27,318.0,30.9,0.15,29.0,1 202 | 0,113.0,80,16,,31.0,0.8740000000000001,21.0,0 203 | 1,138.0,82,0,,40.1,0.23600000000000002,28.0,0 204 | 0,108.0,68,20,,27.3,0.787,32.0,0 205 | 2,99.0,70,16,44.0,20.4,0.235,27.0,0 206 | 6,103.0,72,32,190.0,37.7,0.324,55.0,0 207 | 5,111.0,72,28,,23.9,0.40700000000000003,27.0,0 208 | 8,196.0,76,29,280.0,37.5,0.605,57.0,1 209 | 5,162.0,104,0,,37.7,0.151,52.0,1 210 | 1,96.0,64,27,87.0,33.2,0.289,21.0,0 211 | 
7,184.0,84,33,,35.5,0.355,41.0,1 212 | 2,81.0,60,22,,27.7,0.29,25.0,0 213 | 0,147.0,85,54,,42.8,0.375,24.0,0 214 | 7,179.0,95,31,,34.2,0.16399999999999998,60.0,0 215 | 0,140.0,65,26,130.0,42.6,0.431,24.0,1 216 | 9,112.0,82,32,175.0,34.2,0.26,36.0,1 217 | 12,151.0,70,40,271.0,41.8,0.742,38.0,1 218 | 5,109.0,62,41,129.0,35.8,0.514,25.0,1 219 | 6,125.0,68,30,120.0,30.0,0.46399999999999997,32.0,0 220 | 5,85.0,74,22,,29.0,1.224,32.0,1 221 | 5,112.0,66,0,,37.8,0.261,41.0,1 222 | 0,177.0,60,29,478.0,34.6,1.072,21.0,1 223 | 2,158.0,90,0,,31.6,0.805,66.0,1 224 | 7,119.0,0,0,,25.2,0.209,37.0,0 225 | 7,142.0,60,33,190.0,28.8,0.687,61.0,0 226 | 1,100.0,66,15,56.0,23.6,0.6659999999999999,26.0,0 227 | 1,87.0,78,27,32.0,34.6,0.10099999999999999,22.0,0 228 | 0,101.0,76,0,,35.7,0.198,26.0,0 229 | 3,162.0,52,38,,37.2,0.652,24.0,1 230 | 4,197.0,70,39,744.0,36.7,2.329,31.0,0 231 | 0,117.0,80,31,53.0,45.2,0.08900000000000001,24.0,0 232 | 4,142.0,86,0,,44.0,0.645,22.0,1 233 | 6,134.0,80,37,370.0,46.2,0.23800000000000002,46.0,1 234 | 1,79.0,80,25,37.0,25.4,0.583,22.0,0 235 | 4,122.0,68,0,,35.0,0.39399999999999996,29.0,0 236 | 3,74.0,68,28,45.0,29.7,0.293,23.0,0 237 | 4,171.0,72,0,,43.6,0.479,26.0,1 238 | 7,181.0,84,21,192.0,35.9,0.586,51.0,1 239 | 0,179.0,90,27,,44.1,0.6859999999999999,23.0,1 240 | 9,164.0,84,21,,30.8,0.831,32.0,1 241 | 0,104.0,76,0,,18.4,0.5820000000000001,27.0,0 242 | 1,91.0,64,24,,29.2,0.192,21.0,0 243 | 4,91.0,70,32,88.0,33.1,0.446,22.0,0 244 | 3,139.0,54,0,,25.6,0.402,22.0,1 245 | 6,119.0,50,22,176.0,27.1,1.318,33.0,1 246 | 2,146.0,76,35,194.0,38.2,0.32899999999999996,29.0,0 247 | 9,184.0,85,15,,30.0,1.213,49.0,1 248 | 10,122.0,68,0,,31.2,0.258,41.0,0 249 | 0,165.0,90,33,680.0,52.3,0.42700000000000005,23.0,0 250 | 9,124.0,70,33,402.0,35.4,0.282,34.0,0 251 | 1,111.0,86,19,,30.1,0.14300000000000002,23.0,0 252 | 9,106.0,52,0,,31.2,0.38,42.0,0 253 | 2,129.0,84,0,,28.0,0.284,27.0,0 254 | 2,90.0,80,14,55.0,24.4,0.249,24.0,0 255 | 
0,86.0,68,32,,35.8,0.23800000000000002,25.0,0 256 | 12,92.0,62,7,258.0,27.6,0.9259999999999999,44.0,1 257 | 1,113.0,64,35,,33.6,0.5429999999999999,21.0,1 258 | 3,111.0,56,39,,30.1,0.557,30.0,0 259 | 2,114.0,68,22,,28.7,0.092,25.0,0 260 | 1,193.0,50,16,375.0,25.9,0.655,24.0,0 261 | 11,155.0,76,28,150.0,33.3,1.3530000000000002,51.0,1 262 | 3,191.0,68,15,130.0,30.9,0.299,34.0,0 263 | 3,141.0,0,0,,30.0,0.7609999999999999,27.0,1 264 | 4,95.0,70,32,,32.1,0.612,24.0,0 265 | 3,142.0,80,15,,32.4,0.2,63.0,0 266 | 4,123.0,62,0,,32.0,0.226,35.0,1 267 | 5,96.0,74,18,67.0,33.6,0.997,43.0,0 268 | 0,138.0,0,0,,36.3,0.9329999999999999,25.0,1 269 | 2,128.0,64,42,,40.0,1.101,24.0,0 270 | 0,102.0,52,0,,25.1,0.078,21.0,0 271 | 2,146.0,0,0,,27.5,0.24,28.0,1 272 | 10,101.0,86,37,,45.6,1.136,38.0,1 273 | 2,108.0,62,32,56.0,25.2,0.128,21.0,0 274 | 3,122.0,78,0,,23.0,0.254,40.0,0 275 | 1,71.0,78,50,45.0,33.2,0.42200000000000004,21.0,0 276 | 13,106.0,70,0,,34.2,0.251,52.0,0 277 | 2,100.0,70,52,57.0,40.5,0.677,25.0,0 278 | 7,106.0,60,24,,26.5,0.29600000000000004,29.0,1 279 | 0,104.0,64,23,116.0,27.8,0.45399999999999996,23.0,0 280 | 5,114.0,74,0,,24.9,0.7440000000000001,57.0,0 281 | 2,108.0,62,10,278.0,25.3,0.8809999999999999,22.0,0 282 | 0,146.0,70,0,,37.9,0.33399999999999996,28.0,1 283 | 10,129.0,76,28,122.0,35.9,0.28,39.0,0 284 | 7,133.0,88,15,155.0,32.4,0.262,37.0,0 285 | 7,161.0,86,0,,30.4,0.165,47.0,1 286 | 2,108.0,80,0,,27.0,0.259,52.0,1 287 | 7,136.0,74,26,135.0,26.0,0.647,51.0,0 288 | 5,155.0,84,44,545.0,38.7,0.619,34.0,0 289 | 1,119.0,86,39,220.0,45.6,0.8079999999999999,29.0,1 290 | 4,96.0,56,17,49.0,20.8,0.34,26.0,0 291 | 5,108.0,72,43,75.0,36.1,0.263,33.0,0 292 | 0,78.0,88,29,40.0,36.9,0.434,21.0,0 293 | 0,107.0,62,30,74.0,36.6,0.757,25.0,1 294 | 2,128.0,78,37,182.0,43.3,1.224,31.0,1 295 | 1,128.0,48,45,194.0,40.5,0.613,24.0,1 296 | 0,161.0,50,0,,21.9,0.254,65.0,0 297 | 6,151.0,62,31,120.0,35.5,0.6920000000000001,28.0,0 298 | 2,146.0,70,38,360.0,28.0,0.337,29.0,1 299 | 
0,126.0,84,29,215.0,30.7,0.52,24.0,0 300 | 14,100.0,78,25,184.0,36.6,0.41200000000000003,46.0,1 301 | 8,112.0,72,0,,23.6,0.84,58.0,0 302 | 0,167.0,0,0,,32.3,0.8390000000000001,30.0,1 303 | 2,144.0,58,33,135.0,31.6,0.42200000000000004,25.0,1 304 | 5,77.0,82,41,42.0,35.8,0.156,35.0,0 305 | 5,115.0,98,0,,52.9,0.209,28.0,1 306 | 3,150.0,76,0,,21.0,0.207,37.0,0 307 | 2,120.0,76,37,105.0,39.7,0.215,29.0,0 308 | 10,161.0,68,23,132.0,25.5,0.326,47.0,1 309 | 0,137.0,68,14,148.0,24.8,0.14300000000000002,21.0,0 310 | 0,128.0,68,19,180.0,30.5,1.391,25.0,1 311 | 2,124.0,68,28,205.0,32.9,0.875,30.0,1 312 | 6,80.0,66,30,,26.2,0.313,41.0,0 313 | 0,106.0,70,37,148.0,39.4,0.605,22.0,0 314 | 2,155.0,74,17,96.0,26.6,0.433,27.0,1 315 | 3,113.0,50,10,85.0,29.5,0.626,25.0,0 316 | 7,109.0,80,31,,35.9,1.127,43.0,1 317 | 2,112.0,68,22,94.0,34.1,0.315,26.0,0 318 | 3,99.0,80,11,64.0,19.3,0.284,30.0,0 319 | 3,182.0,74,0,,30.5,0.345,29.0,1 320 | 3,115.0,66,39,140.0,38.1,0.15,28.0,0 321 | 6,194.0,78,0,,23.5,0.129,59.0,1 322 | 4,129.0,60,12,231.0,27.5,0.527,31.0,0 323 | 3,112.0,74,30,,31.6,0.19699999999999998,25.0,1 324 | 0,124.0,70,20,,27.4,0.254,36.0,1 325 | 13,152.0,90,33,29.0,26.8,0.731,43.0,1 326 | 2,112.0,75,32,,35.7,0.14800000000000002,21.0,0 327 | 1,157.0,72,21,168.0,25.6,0.12300000000000001,24.0,0 328 | 1,122.0,64,32,156.0,35.1,0.6920000000000001,30.0,1 329 | 10,179.0,70,0,,35.1,0.2,37.0,0 330 | 2,102.0,86,36,120.0,45.5,0.127,23.0,1 331 | 6,105.0,70,32,68.0,30.8,0.122,37.0,0 332 | 8,118.0,72,19,,23.1,1.476,46.0,0 333 | 2,87.0,58,16,52.0,32.7,0.166,25.0,0 334 | 1,180.0,0,0,,43.3,0.282,41.0,1 335 | 12,106.0,80,0,,23.6,0.13699999999999998,44.0,0 336 | 1,95.0,60,18,58.0,23.9,0.26,22.0,0 337 | 0,165.0,76,43,255.0,47.9,0.259,26.0,0 338 | 0,117.0,0,0,,33.8,0.932,44.0,0 339 | 5,115.0,76,0,,31.2,0.34299999999999997,44.0,1 340 | 9,152.0,78,34,171.0,34.2,0.893,33.0,1 341 | 7,178.0,84,0,,39.9,0.331,41.0,1 342 | 1,130.0,70,13,105.0,25.9,0.47200000000000003,22.0,0 343 | 
1,95.0,74,21,73.0,25.9,0.6729999999999999,36.0,0 344 | 1,,68,35,,32.0,0.389,22.0,0 345 | 5,122.0,86,0,,34.7,0.29,33.0,0 346 | 8,95.0,72,0,,36.8,0.485,57.0,0 347 | 8,126.0,88,36,108.0,38.5,0.349,49.0,0 348 | 1,139.0,46,19,83.0,28.7,0.654,22.0,0 349 | 3,116.0,0,0,,23.5,0.187,23.0,0 350 | 3,99.0,62,19,74.0,21.8,0.27899999999999997,26.0,0 351 | 5,,80,32,,41.0,0.34600000000000003,37.0,1 352 | 4,92.0,80,0,,42.2,0.237,29.0,0 353 | 4,137.0,84,0,,31.2,0.252,30.0,0 354 | 3,61.0,82,28,,34.4,0.243,46.0,0 355 | 1,90.0,62,12,43.0,27.2,0.58,24.0,0 356 | 3,90.0,78,0,,42.7,0.5589999999999999,21.0,0 357 | 9,165.0,88,0,,30.4,0.302,49.0,1 358 | 1,125.0,50,40,167.0,33.3,0.9620000000000001,28.0,1 359 | 13,129.0,0,30,,39.9,0.569,44.0,1 360 | 12,88.0,74,40,54.0,35.3,0.37799999999999995,48.0,0 361 | 1,196.0,76,36,249.0,36.5,0.875,29.0,1 362 | 5,189.0,64,33,325.0,31.2,0.583,29.0,1 363 | 5,158.0,70,0,,29.8,0.207,63.0,0 364 | 5,103.0,108,37,,39.2,0.305,65.0,0 365 | 4,146.0,78,0,,38.5,0.52,67.0,1 366 | 4,147.0,74,25,293.0,34.9,0.385,30.0,0 367 | 5,99.0,54,28,83.0,34.0,0.499,30.0,0 368 | 6,124.0,72,0,,27.6,0.368,29.0,1 369 | 0,101.0,64,17,,21.0,0.252,21.0,0 370 | 3,81.0,86,16,66.0,27.5,0.306,22.0,0 371 | 1,133.0,102,28,140.0,32.8,0.23399999999999999,45.0,1 372 | 3,173.0,82,48,465.0,38.4,2.137,25.0,1 373 | 0,118.0,64,23,89.0,,1.7309999999999999,21.0,0 374 | 0,84.0,64,22,66.0,35.8,0.545,21.0,0 375 | 2,105.0,58,40,94.0,34.9,0.225,25.0,0 376 | 2,122.0,52,43,158.0,36.2,0.816,28.0,0 377 | 12,140.0,82,43,325.0,39.2,0.528,58.0,1 378 | 0,98.0,82,15,84.0,25.2,0.299,22.0,0 379 | 1,87.0,60,37,75.0,37.2,0.509,22.0,0 380 | 4,156.0,75,0,,48.3,0.23800000000000002,32.0,1 381 | 0,93.0,100,39,72.0,43.4,1.021,35.0,0 382 | 1,107.0,72,30,82.0,30.8,0.821,24.0,0 383 | 0,105.0,68,22,,20.0,0.23600000000000002,22.0,0 384 | 1,109.0,60,8,182.0,25.4,0.9470000000000001,21.0,0 385 | 1,90.0,62,18,59.0,25.1,1.268,25.0,0 386 | 1,125.0,70,24,110.0,24.3,0.221,25.0,0 387 | 1,119.0,54,13,50.0,22.3,0.205,24.0,0 388 | 
5,116.0,74,29,,32.3,0.66,35.0,1 389 | 8,105.0,100,36,,43.3,0.239,45.0,1 390 | 5,144.0,82,26,285.0,32.0,0.452,58.0,1 391 | 3,100.0,68,23,81.0,31.6,0.9490000000000001,28.0,0 392 | 1,100.0,66,29,196.0,32.0,0.444,42.0,0 393 | 5,166.0,76,0,,45.7,0.34,27.0,1 394 | 1,131.0,64,14,415.0,23.7,0.389,21.0,0 395 | 4,116.0,72,12,87.0,22.1,0.46299999999999997,37.0,0 396 | 4,158.0,78,0,,32.9,0.8029999999999999,31.0,1 397 | 2,127.0,58,24,275.0,27.7,1.6,25.0,0 398 | 3,96.0,56,34,115.0,24.7,0.9440000000000001,39.0,0 399 | 0,131.0,66,40,,34.3,0.196,22.0,1 400 | 3,82.0,70,0,,21.1,0.389,25.0,0 401 | 3,193.0,70,31,,34.9,0.24100000000000002,25.0,1 402 | 4,95.0,64,0,,32.0,0.161,31.0,1 403 | 6,137.0,61,0,,24.2,0.151,55.0,0 404 | 5,136.0,84,41,88.0,35.0,0.28600000000000003,35.0,1 405 | 9,72.0,78,25,,31.6,0.28,38.0,0 406 | 5,168.0,64,0,,32.9,0.135,41.0,1 407 | 2,123.0,48,32,165.0,42.1,0.52,26.0,0 408 | 4,115.0,72,0,,28.9,0.376,46.0,1 409 | 0,101.0,62,0,,21.9,0.336,25.0,0 410 | 8,197.0,74,0,,25.9,1.1909999999999998,39.0,1 411 | 1,172.0,68,49,579.0,42.4,0.7020000000000001,28.0,1 412 | 6,102.0,90,39,,35.7,0.674,28.0,0 413 | 1,112.0,72,30,176.0,34.4,0.528,25.0,0 414 | 1,143.0,84,23,310.0,42.4,1.0759999999999998,22.0,0 415 | 1,143.0,74,22,61.0,26.2,0.256,21.0,0 416 | 0,138.0,60,35,167.0,34.6,0.534,21.0,1 417 | 3,173.0,84,33,474.0,35.7,0.258,22.0,1 418 | 1,97.0,68,21,,27.2,1.095,22.0,0 419 | 4,144.0,82,32,,38.5,0.5539999999999999,37.0,1 420 | 1,83.0,68,0,,18.2,0.624,27.0,0 421 | 3,129.0,64,29,115.0,26.4,0.21899999999999997,28.0,1 422 | 1,119.0,88,41,170.0,45.3,0.507,26.0,0 423 | 2,94.0,68,18,76.0,26.0,0.561,21.0,0 424 | 0,102.0,64,46,78.0,40.6,0.496,21.0,0 425 | 2,115.0,64,22,,30.8,0.42100000000000004,21.0,0 426 | 8,151.0,78,32,210.0,42.9,0.516,36.0,1 427 | 4,184.0,78,39,277.0,37.0,0.264,31.0,1 428 | 0,94.0,0,0,,,0.256,25.0,0 429 | 1,181.0,64,30,180.0,34.1,0.32799999999999996,38.0,1 430 | 0,135.0,94,46,145.0,40.6,0.284,26.0,0 431 | 1,95.0,82,25,180.0,35.0,0.233,43.0,1 432 | 
2,99.0,0,0,,22.2,0.10800000000000001,23.0,0 433 | 3,89.0,74,16,85.0,30.4,0.551,38.0,0 434 | 1,80.0,74,11,60.0,30.0,0.527,22.0,0 435 | 2,139.0,75,0,,25.6,0.16699999999999998,29.0,0 436 | 1,90.0,68,8,,24.5,1.138,36.0,0 437 | 0,141.0,0,0,,42.4,0.205,29.0,1 438 | 12,140.0,85,33,,37.4,0.244,41.0,0 439 | 5,147.0,75,0,,29.9,0.434,28.0,0 440 | 1,97.0,70,15,,18.2,0.147,21.0,0 441 | 6,107.0,88,0,,36.8,0.727,31.0,0 442 | 0,189.0,104,25,,34.3,0.435,41.0,1 443 | 2,83.0,66,23,50.0,32.2,0.49700000000000005,22.0,0 444 | 4,117.0,64,27,120.0,33.2,0.23,24.0,0 445 | 8,108.0,70,0,,30.5,0.955,33.0,1 446 | 4,117.0,62,12,,29.7,0.38,30.0,1 447 | 0,180.0,78,63,14.0,59.4,2.42,25.0,1 448 | 1,100.0,72,12,70.0,25.3,0.6579999999999999,28.0,0 449 | 0,95.0,80,45,92.0,36.5,0.33,26.0,0 450 | 0,104.0,64,37,64.0,33.6,0.51,22.0,1 451 | 0,120.0,74,18,63.0,30.5,0.285,26.0,0 452 | 1,82.0,64,13,95.0,21.2,0.415,23.0,0 453 | 2,134.0,70,0,,28.9,0.542,23.0,1 454 | 0,91.0,68,32,210.0,39.9,0.381,25.0,0 455 | 2,119.0,0,0,,19.6,0.8320000000000001,72.0,0 456 | 2,100.0,54,28,105.0,37.8,0.498,24.0,0 457 | 14,175.0,62,30,,33.6,0.212,38.0,1 458 | 1,135.0,54,0,,26.7,0.687,62.0,0 459 | 5,86.0,68,28,71.0,30.2,0.364,24.0,0 460 | 10,148.0,84,48,237.0,37.6,1.001,51.0,1 461 | 9,134.0,74,33,60.0,25.9,0.46,81.0,0 462 | 9,120.0,72,22,56.0,20.8,0.733,48.0,0 463 | 1,71.0,62,0,,21.8,0.41600000000000004,26.0,0 464 | 8,74.0,70,40,49.0,35.3,0.705,39.0,0 465 | 5,88.0,78,30,,27.6,0.258,37.0,0 466 | 10,115.0,98,0,,24.0,1.022,34.0,0 467 | 0,124.0,56,13,105.0,21.8,0.452,21.0,0 468 | 0,74.0,52,10,36.0,27.8,0.26899999999999996,22.0,0 469 | 0,97.0,64,36,100.0,36.8,0.6,25.0,0 470 | 8,120.0,0,0,,30.0,0.183,38.0,1 471 | 6,154.0,78,41,140.0,46.1,0.5710000000000001,27.0,0 472 | 1,144.0,82,40,,41.3,0.607,28.0,0 473 | 0,137.0,70,38,,33.2,0.17,22.0,0 474 | 0,119.0,66,27,,38.8,0.259,22.0,0 475 | 7,136.0,90,0,,29.9,0.21,50.0,0 476 | 4,114.0,64,0,,28.9,0.126,24.0,0 477 | 0,137.0,84,27,,27.3,0.231,59.0,0 478 | 2,105.0,80,45,191.0,33.7,0.711,29.0,1 479 | 
7,114.0,76,17,110.0,23.8,0.466,31.0,0 480 | 8,126.0,74,38,75.0,25.9,0.162,39.0,0 481 | 4,132.0,86,31,,28.0,0.419,63.0,0 482 | 3,158.0,70,30,328.0,35.5,0.344,35.0,1 483 | 0,123.0,88,37,,35.2,0.19699999999999998,29.0,0 484 | 4,85.0,58,22,49.0,27.8,0.306,28.0,0 485 | 0,84.0,82,31,125.0,38.2,0.233,23.0,0 486 | 0,145.0,0,0,,44.2,0.63,31.0,1 487 | 0,135.0,68,42,250.0,42.3,0.365,24.0,1 488 | 1,139.0,62,41,480.0,40.7,0.536,21.0,0 489 | 0,173.0,78,32,265.0,46.5,1.159,58.0,0 490 | 4,99.0,72,17,,25.6,0.294,28.0,0 491 | 8,194.0,80,0,,26.1,0.551,67.0,0 492 | 2,83.0,65,28,66.0,36.8,0.629,24.0,0 493 | 2,89.0,90,30,,33.5,0.292,42.0,0 494 | 4,99.0,68,38,,32.8,0.145,33.0,0 495 | 4,125.0,70,18,122.0,28.9,1.1440000000000001,45.0,1 496 | 3,80.0,0,0,,,0.174,22.0,0 497 | 6,166.0,74,0,,26.6,0.304,66.0,0 498 | 5,110.0,68,0,,26.0,0.292,30.0,0 499 | 2,81.0,72,15,76.0,30.1,0.547,25.0,0 500 | 7,195.0,70,33,145.0,25.1,0.163,55.0,1 501 | 6,154.0,74,32,193.0,29.3,0.8390000000000001,39.0,0 502 | 2,117.0,90,19,71.0,25.2,0.313,21.0,0 503 | 3,84.0,72,32,,37.2,0.267,28.0,0 504 | 6,,68,41,,39.0,0.727,41.0,1 505 | 7,94.0,64,25,79.0,33.3,0.738,41.0,0 506 | 3,96.0,78,39,,37.3,0.23800000000000002,40.0,0 507 | 10,75.0,82,0,,33.3,0.263,38.0,0 508 | 0,180.0,90,26,90.0,36.5,0.314,35.0,1 509 | 1,130.0,60,23,170.0,28.6,0.6920000000000001,21.0,0 510 | 2,84.0,50,23,76.0,30.4,0.968,21.0,0 511 | 8,120.0,78,0,,25.0,0.409,64.0,0 512 | 12,84.0,72,31,,29.7,0.297,46.0,1 513 | 0,139.0,62,17,210.0,22.1,0.207,21.0,0 514 | 9,91.0,68,0,,24.2,0.2,58.0,0 515 | 2,91.0,62,0,,27.3,0.525,22.0,0 516 | 3,99.0,54,19,86.0,25.6,0.154,24.0,0 517 | 3,163.0,70,18,105.0,31.6,0.268,28.0,1 518 | 9,145.0,88,34,165.0,30.3,0.7709999999999999,53.0,1 519 | 7,125.0,86,0,,37.6,0.304,51.0,0 520 | 13,76.0,60,0,,32.8,0.18,41.0,0 521 | 6,129.0,90,7,326.0,19.6,0.5820000000000001,60.0,0 522 | 2,68.0,70,32,66.0,25.0,0.187,25.0,0 523 | 3,124.0,80,33,130.0,33.2,0.305,26.0,0 524 | 6,114.0,0,0,,,0.18899999999999997,26.0,0 525 | 9,130.0,70,0,,34.2,0.652,45.0,1 
526 | 3,125.0,58,0,,31.6,0.151,24.0,0 527 | 3,87.0,60,18,,21.8,0.444,21.0,0 528 | 1,97.0,64,19,82.0,18.2,0.299,21.0,0 529 | 3,116.0,74,15,105.0,26.3,0.107,24.0,0 530 | 0,117.0,66,31,188.0,30.8,0.493,22.0,0 531 | 0,111.0,65,0,,24.6,0.66,31.0,0 532 | 2,122.0,60,18,106.0,29.8,0.7170000000000001,22.0,0 533 | 0,107.0,76,0,,45.3,0.6859999999999999,24.0,0 534 | 1,86.0,66,52,65.0,41.3,0.917,29.0,0 535 | 6,91.0,0,0,,29.8,0.501,31.0,0 536 | 1,77.0,56,30,56.0,33.3,1.251,24.0,0 537 | 4,132.0,0,0,,32.9,0.302,23.0,1 538 | 0,105.0,90,0,,29.6,0.19699999999999998,46.0,0 539 | 0,57.0,60,0,,21.7,0.735,67.0,0 540 | 0,127.0,80,37,210.0,36.3,0.804,23.0,0 541 | 3,129.0,92,49,155.0,36.4,0.968,32.0,1 542 | 8,100.0,74,40,215.0,39.4,0.6609999999999999,43.0,1 543 | 3,128.0,72,25,190.0,32.4,0.5489999999999999,27.0,1 544 | 10,90.0,85,32,,34.9,0.825,56.0,1 545 | 4,84.0,90,23,56.0,39.5,0.159,25.0,0 546 | 1,88.0,78,29,76.0,32.0,0.365,29.0,0 547 | 8,186.0,90,35,225.0,34.5,0.423,37.0,1 548 | 5,187.0,76,27,207.0,43.6,1.034,53.0,1 549 | 4,131.0,68,21,166.0,33.1,0.16,28.0,0 550 | 1,164.0,82,43,67.0,32.8,0.341,50.0,0 551 | 4,189.0,110,31,,28.5,0.68,37.0,0 552 | 1,116.0,70,28,,27.4,0.204,21.0,0 553 | 3,84.0,68,30,106.0,31.9,0.591,25.0,0 554 | 6,114.0,88,0,,27.8,0.247,66.0,0 555 | 1,88.0,62,24,44.0,29.9,0.42200000000000004,23.0,0 556 | 1,84.0,64,23,115.0,36.9,0.47100000000000003,28.0,0 557 | 7,124.0,70,33,215.0,25.5,0.161,37.0,0 558 | 1,97.0,70,40,,38.1,0.218,30.0,0 559 | 8,110.0,76,0,,27.8,0.237,58.0,0 560 | 11,103.0,68,40,,46.2,0.126,42.0,0 561 | 11,85.0,74,0,,30.1,0.3,35.0,0 562 | 6,125.0,76,0,,33.8,0.121,54.0,1 563 | 0,198.0,66,32,274.0,41.3,0.502,28.0,1 564 | 1,87.0,68,34,77.0,37.6,0.401,24.0,0 565 | 6,99.0,60,19,54.0,26.9,0.49700000000000005,32.0,0 566 | 0,91.0,80,0,,32.4,0.601,27.0,0 567 | 2,95.0,54,14,88.0,26.1,0.748,22.0,0 568 | 1,99.0,72,30,18.0,38.6,0.41200000000000003,21.0,0 569 | 6,92.0,62,32,126.0,32.0,0.085,46.0,0 570 | 4,154.0,72,29,126.0,31.3,0.33799999999999997,37.0,0 571 | 
0,121.0,66,30,165.0,34.3,0.203,33.0,1 572 | 3,78.0,70,0,,32.5,0.27,39.0,0 573 | 2,130.0,96,0,,22.6,0.268,21.0,0 574 | 3,111.0,58,31,44.0,29.5,0.43,22.0,0 575 | 2,98.0,60,17,120.0,34.7,0.198,22.0,0 576 | 1,143.0,86,30,330.0,30.1,0.892,23.0,0 577 | 1,119.0,44,47,63.0,35.5,0.28,25.0,0 578 | 6,108.0,44,20,130.0,24.0,0.813,35.0,0 579 | 2,118.0,80,0,,42.9,0.693,21.0,1 580 | 10,133.0,68,0,,27.0,0.245,36.0,0 581 | 2,197.0,70,99,,34.7,0.575,62.0,1 582 | 0,151.0,90,46,,42.1,0.371,21.0,1 583 | 6,109.0,60,27,,25.0,0.20600000000000002,27.0,0 584 | 12,121.0,78,17,,26.5,0.259,62.0,0 585 | 8,100.0,76,0,,38.7,0.19,42.0,0 586 | 8,124.0,76,24,600.0,28.7,0.687,52.0,1 587 | 1,93.0,56,11,,22.5,0.41700000000000004,22.0,0 588 | 8,143.0,66,0,,34.9,0.129,41.0,1 589 | 6,103.0,66,0,,24.3,0.249,29.0,0 590 | 3,176.0,86,27,156.0,33.3,1.1540000000000001,52.0,1 591 | 0,73.0,0,0,,21.1,0.342,25.0,0 592 | 11,111.0,84,40,,46.8,0.925,45.0,1 593 | 2,112.0,78,50,140.0,39.4,0.175,24.0,0 594 | 3,132.0,80,0,,34.4,0.402,44.0,1 595 | 2,82.0,52,22,115.0,28.5,1.699,25.0,0 596 | 6,123.0,72,45,230.0,33.6,0.733,34.0,0 597 | 0,188.0,82,14,185.0,32.0,0.682,22.0,1 598 | 0,67.0,76,0,,45.3,0.19399999999999998,46.0,0 599 | 1,89.0,24,19,25.0,27.8,0.5589999999999999,21.0,0 600 | 1,173.0,74,0,,36.8,0.08800000000000001,38.0,1 601 | 1,109.0,38,18,120.0,23.1,0.40700000000000003,26.0,0 602 | 1,108.0,88,19,,27.1,0.4,24.0,0 603 | 6,96.0,0,0,,23.7,0.19,28.0,0 604 | 1,124.0,74,36,,27.8,0.1,30.0,0 605 | 7,150.0,78,29,126.0,35.2,0.6920000000000001,54.0,1 606 | 4,183.0,0,0,,28.4,0.212,36.0,1 607 | 1,124.0,60,32,,35.8,0.514,21.0,0 608 | 1,181.0,78,42,293.0,40.0,1.258,22.0,1 609 | 1,92.0,62,25,41.0,19.5,0.48200000000000004,25.0,0 610 | 0,152.0,82,39,272.0,41.5,0.27,27.0,0 611 | 1,111.0,62,13,182.0,24.0,0.138,23.0,0 612 | 3,106.0,54,21,158.0,30.9,0.292,24.0,0 613 | 3,174.0,58,22,194.0,32.9,0.593,36.0,1 614 | 7,168.0,88,42,321.0,38.2,0.787,40.0,1 615 | 6,105.0,80,28,,32.5,0.878,26.0,0 616 | 11,138.0,74,26,144.0,36.1,0.557,50.0,1 617 | 
3,106.0,72,0,,25.8,0.207,27.0,0 618 | 6,117.0,96,0,,28.7,0.157,30.0,0 619 | 2,68.0,62,13,15.0,20.1,0.257,23.0,0 620 | 9,112.0,82,24,,28.2,1.2819999999999998,50.0,1 621 | 0,119.0,0,0,,32.4,0.141,24.0,1 622 | 2,112.0,86,42,160.0,38.4,0.24600000000000002,28.0,0 623 | 2,92.0,76,20,,24.2,1.6980000000000002,28.0,0 624 | 6,183.0,94,0,,40.8,1.4609999999999999,45.0,0 625 | 0,94.0,70,27,115.0,43.5,0.34700000000000003,21.0,0 626 | 2,108.0,64,0,,30.8,0.158,21.0,0 627 | 4,90.0,88,47,54.0,37.7,0.36200000000000004,29.0,0 628 | 0,125.0,68,0,,24.7,0.20600000000000002,21.0,0 629 | 0,132.0,78,0,,32.4,0.39299999999999996,21.0,0 630 | 5,128.0,80,0,,34.6,0.14400000000000002,45.0,0 631 | 4,94.0,65,22,,24.7,0.14800000000000002,21.0,0 632 | 7,114.0,64,0,,27.4,0.732,34.0,1 633 | 0,102.0,78,40,90.0,34.5,0.23800000000000002,24.0,0 634 | 2,111.0,60,0,,26.2,0.34299999999999997,23.0,0 635 | 1,128.0,82,17,183.0,27.5,0.115,22.0,0 636 | 10,92.0,62,0,,25.9,0.16699999999999998,31.0,0 637 | 13,104.0,72,0,,31.2,0.465,38.0,1 638 | 5,104.0,74,0,,28.8,0.153,48.0,0 639 | 2,94.0,76,18,66.0,31.6,0.649,23.0,0 640 | 7,97.0,76,32,91.0,40.9,0.871,32.0,1 641 | 1,100.0,74,12,46.0,19.5,0.149,28.0,0 642 | 0,102.0,86,17,105.0,29.3,0.695,27.0,0 643 | 4,128.0,70,0,,34.3,0.303,24.0,0 644 | 6,147.0,80,0,,29.5,0.17800000000000002,50.0,1 645 | 4,90.0,0,0,,28.0,0.61,31.0,0 646 | 3,103.0,72,30,152.0,27.6,0.73,27.0,0 647 | 2,157.0,74,35,440.0,39.4,0.134,30.0,0 648 | 1,167.0,74,17,144.0,23.4,0.447,33.0,1 649 | 0,179.0,50,36,159.0,37.8,0.455,22.0,1 650 | 11,136.0,84,35,130.0,28.3,0.26,42.0,1 651 | 0,107.0,60,25,,26.4,0.133,23.0,0 652 | 1,91.0,54,25,100.0,25.2,0.23399999999999999,23.0,0 653 | 1,117.0,60,23,106.0,33.8,0.466,27.0,0 654 | 5,123.0,74,40,77.0,34.1,0.26899999999999996,28.0,0 655 | 2,120.0,54,0,,26.8,0.455,27.0,0 656 | 1,106.0,70,28,135.0,34.2,0.142,22.0,0 657 | 2,155.0,52,27,540.0,38.7,0.24,25.0,1 658 | 2,101.0,58,35,90.0,21.8,0.155,22.0,0 659 | 1,120.0,80,48,200.0,38.9,1.162,41.0,0 660 | 
11,127.0,106,0,,39.0,0.19,51.0,0 661 | 3,80.0,82,31,70.0,34.2,1.2919999999999998,27.0,1 662 | 10,162.0,84,0,,27.7,0.182,54.0,0 663 | 1,199.0,76,43,,42.9,1.3940000000000001,22.0,1 664 | 8,167.0,106,46,231.0,37.6,0.165,43.0,1 665 | 9,145.0,80,46,130.0,37.9,0.637,40.0,1 666 | 6,115.0,60,39,,33.7,0.245,40.0,1 667 | 1,112.0,80,45,132.0,34.8,0.217,24.0,0 668 | 4,145.0,82,18,,32.5,0.235,70.0,1 669 | 10,111.0,70,27,,27.5,0.141,40.0,1 670 | 6,98.0,58,33,190.0,34.0,0.43,43.0,0 671 | 9,154.0,78,30,100.0,30.9,0.16399999999999998,45.0,0 672 | 6,165.0,68,26,168.0,33.6,0.631,49.0,0 673 | 1,99.0,58,10,,25.4,0.551,21.0,0 674 | 10,68.0,106,23,49.0,35.5,0.285,47.0,0 675 | 3,123.0,100,35,240.0,57.3,0.88,22.0,0 676 | 8,91.0,82,0,,35.6,0.5870000000000001,68.0,0 677 | 6,195.0,70,0,,30.9,0.32799999999999996,31.0,1 678 | 9,156.0,86,0,,24.8,0.23,53.0,1 679 | 0,93.0,60,0,,35.3,0.263,25.0,0 680 | 3,121.0,52,0,,36.0,0.127,25.0,1 681 | 2,101.0,58,17,265.0,24.2,0.614,23.0,0 682 | 2,56.0,56,28,45.0,24.2,0.332,22.0,0 683 | 0,162.0,76,36,,49.6,0.364,26.0,1 684 | 0,95.0,64,39,105.0,44.6,0.366,22.0,0 685 | 4,125.0,80,0,,32.3,0.536,27.0,1 686 | 5,136.0,82,0,,,0.64,69.0,0 687 | 2,129.0,74,26,205.0,33.2,0.591,25.0,0 688 | 3,130.0,64,0,,23.1,0.314,22.0,0 689 | 1,107.0,50,19,,28.3,0.18100000000000002,29.0,0 690 | 1,140.0,74,26,180.0,24.1,0.828,23.0,0 691 | 1,144.0,82,46,180.0,46.1,0.335,46.0,1 692 | 8,107.0,80,0,,24.6,0.856,34.0,0 693 | 13,158.0,114,0,,42.3,0.257,44.0,1 694 | 2,121.0,70,32,95.0,39.1,0.8859999999999999,23.0,0 695 | 7,129.0,68,49,125.0,38.5,0.439,43.0,1 696 | 2,90.0,60,0,,23.5,0.191,25.0,0 697 | 7,142.0,90,24,480.0,30.4,0.128,43.0,1 698 | 3,169.0,74,19,125.0,29.9,0.268,31.0,1 699 | 0,99.0,0,0,,25.0,0.253,22.0,0 700 | 4,127.0,88,11,155.0,34.5,0.598,28.0,0 701 | 4,118.0,70,0,,44.5,0.904,26.0,0 702 | 2,122.0,76,27,200.0,35.9,0.483,26.0,0 703 | 6,125.0,78,31,,27.6,0.565,49.0,1 704 | 1,168.0,88,29,,35.0,0.905,52.0,1 705 | 2,129.0,0,0,,38.5,0.304,41.0,0 706 | 
4,110.0,76,20,100.0,28.4,0.11800000000000001,27.0,0 707 | 6,80.0,80,36,,39.8,0.177,28.0,0 708 | 10,115.0,0,0,,,0.261,30.0,1 709 | 2,127.0,46,21,335.0,34.4,0.17600000000000002,22.0,0 710 | 9,164.0,78,0,,32.8,0.14800000000000002,45.0,1 711 | 2,93.0,64,32,160.0,38.0,0.674,23.0,1 712 | 3,158.0,64,13,387.0,31.2,0.295,24.0,0 713 | 5,126.0,78,27,22.0,29.6,0.439,40.0,0 714 | 10,129.0,62,36,,41.2,0.441,38.0,1 715 | 0,134.0,58,20,291.0,26.4,0.35200000000000004,21.0,0 716 | 3,102.0,74,0,,29.5,0.121,32.0,0 717 | 7,187.0,50,33,392.0,33.9,0.826,34.0,1 718 | 3,173.0,78,39,185.0,33.8,0.97,31.0,1 719 | 10,94.0,72,18,,23.1,0.595,56.0,0 720 | 1,108.0,60,46,178.0,35.5,0.415,24.0,0 721 | 5,97.0,76,27,,35.6,0.37799999999999995,52.0,1 722 | 4,83.0,86,19,,29.3,0.317,34.0,0 723 | 1,114.0,66,36,200.0,38.1,0.289,21.0,0 724 | 1,149.0,68,29,127.0,29.3,0.349,42.0,1 725 | 5,117.0,86,30,105.0,39.1,0.251,42.0,0 726 | 1,111.0,94,0,,32.8,0.265,45.0,0 727 | 4,112.0,78,40,,39.4,0.23600000000000002,38.0,0 728 | 1,116.0,78,29,180.0,36.1,0.496,25.0,0 729 | 0,141.0,84,26,,32.4,0.433,22.0,0 730 | 2,175.0,88,0,,22.9,0.326,22.0,0 731 | 2,92.0,52,0,,30.1,0.141,22.0,0 732 | 3,130.0,78,23,79.0,28.4,0.32299999999999995,34.0,1 733 | 8,120.0,86,0,,28.4,0.259,22.0,1 734 | 2,174.0,88,37,120.0,44.5,0.6459999999999999,24.0,1 735 | 2,106.0,56,27,165.0,29.0,0.426,22.0,0 736 | 2,105.0,75,0,,23.3,0.56,53.0,0 737 | 4,95.0,60,32,,35.4,0.284,28.0,0 738 | 0,126.0,86,27,120.0,27.4,0.515,21.0,0 739 | 8,65.0,72,23,,32.0,0.6,42.0,0 740 | 2,99.0,60,17,160.0,36.6,0.45299999999999996,21.0,0 741 | 1,102.0,74,0,,39.5,0.293,42.0,1 742 | 11,120.0,80,37,150.0,42.3,0.785,48.0,1 743 | 3,102.0,44,20,94.0,30.8,0.4,26.0,0 744 | 1,109.0,58,18,116.0,28.5,0.21899999999999997,22.0,0 745 | 9,140.0,94,0,,32.7,0.7340000000000001,45.0,1 746 | 13,153.0,88,37,140.0,40.6,1.1740000000000002,39.0,0 747 | 12,100.0,84,33,105.0,30.0,0.488,46.0,0 748 | 1,147.0,94,41,,49.3,0.358,27.0,1 749 | 1,81.0,74,41,57.0,46.3,1.0959999999999999,32.0,0 750 | 
3,187.0,70,22,200.0,36.4,0.408,36.0,1 751 | 6,162.0,62,0,,24.3,0.17800000000000002,50.0,1 752 | 4,136.0,70,0,,31.2,1.182,22.0,1 753 | 1,121.0,78,39,74.0,39.0,0.261,28.0,0 754 | 3,108.0,62,24,,26.0,0.223,25.0,0 755 | 0,181.0,88,44,510.0,43.3,0.222,26.0,1 756 | 8,154.0,78,32,,32.4,0.44299999999999995,45.0,1 757 | 1,128.0,88,39,110.0,36.5,1.057,37.0,1 758 | 7,137.0,90,41,,32.0,0.391,39.0,0 759 | 0,123.0,72,0,,36.3,0.258,52.0,1 760 | 1,106.0,76,0,,37.5,0.19699999999999998,26.0,0 761 | 6,190.0,92,0,,35.5,0.278,66.0,1 762 | 2,88.0,58,26,16.0,28.4,0.7659999999999999,22.0,0 763 | 9,170.0,74,31,,44.0,0.40299999999999997,43.0,1 764 | 9,89.0,62,0,,22.5,0.142,33.0,0 765 | 10,101.0,76,48,180.0,32.9,0.171,63.0,0 766 | 2,122.0,70,27,,36.8,0.34,27.0,0 767 | 5,121.0,72,23,112.0,26.2,0.245,30.0,0 768 | 1,126.0,60,0,,30.1,0.349,47.0,1 769 | 1,93.0,70,31,,30.4,0.315,23.0,0 770 | -------------------------------------------------------------------------------- /Data/daily-total-female-births.csv: -------------------------------------------------------------------------------- 1 | "Date","Births" 2 | "1959-01-01",35 3 | "1959-01-02",32 4 | "1959-01-03",30 5 | "1959-01-04",31 6 | "1959-01-05",44 7 | "1959-01-06",29 8 | "1959-01-07",45 9 | "1959-01-08",43 10 | "1959-01-09",38 11 | "1959-01-10",27 12 | "1959-01-11",38 13 | "1959-01-12",33 14 | "1959-01-13",55 15 | "1959-01-14",47 16 | "1959-01-15",45 17 | "1959-01-16",37 18 | "1959-01-17",50 19 | "1959-01-18",43 20 | "1959-01-19",41 21 | "1959-01-20",52 22 | "1959-01-21",34 23 | "1959-01-22",53 24 | "1959-01-23",39 25 | "1959-01-24",32 26 | "1959-01-25",37 27 | "1959-01-26",43 28 | "1959-01-27",39 29 | "1959-01-28",35 30 | "1959-01-29",44 31 | "1959-01-30",38 32 | "1959-01-31",24 33 | "1959-02-01",23 34 | "1959-02-02",31 35 | "1959-02-03",44 36 | "1959-02-04",38 37 | "1959-02-05",50 38 | "1959-02-06",38 39 | "1959-02-07",51 40 | "1959-02-08",31 41 | "1959-02-09",31 42 | "1959-02-10",51 43 | "1959-02-11",36 44 | "1959-02-12",45 45 | 
"1959-02-13",51 46 | "1959-02-14",34 47 | "1959-02-15",52 48 | "1959-02-16",47 49 | "1959-02-17",45 50 | "1959-02-18",46 51 | "1959-02-19",39 52 | "1959-02-20",48 53 | "1959-02-21",37 54 | "1959-02-22",35 55 | "1959-02-23",52 56 | "1959-02-24",42 57 | "1959-02-25",45 58 | "1959-02-26",39 59 | "1959-02-27",37 60 | "1959-02-28",30 61 | "1959-03-01",35 62 | "1959-03-02",28 63 | "1959-03-03",45 64 | "1959-03-04",34 65 | "1959-03-05",36 66 | "1959-03-06",50 67 | "1959-03-07",44 68 | "1959-03-08",39 69 | "1959-03-09",32 70 | "1959-03-10",39 71 | "1959-03-11",45 72 | "1959-03-12",43 73 | "1959-03-13",39 74 | "1959-03-14",31 75 | "1959-03-15",27 76 | "1959-03-16",30 77 | "1959-03-17",42 78 | "1959-03-18",46 79 | "1959-03-19",41 80 | "1959-03-20",36 81 | "1959-03-21",45 82 | "1959-03-22",46 83 | "1959-03-23",43 84 | "1959-03-24",38 85 | "1959-03-25",34 86 | "1959-03-26",35 87 | "1959-03-27",56 88 | "1959-03-28",36 89 | "1959-03-29",32 90 | "1959-03-30",50 91 | "1959-03-31",41 92 | "1959-04-01",39 93 | "1959-04-02",41 94 | "1959-04-03",47 95 | "1959-04-04",34 96 | "1959-04-05",36 97 | "1959-04-06",33 98 | "1959-04-07",35 99 | "1959-04-08",38 100 | "1959-04-09",38 101 | "1959-04-10",34 102 | "1959-04-11",53 103 | "1959-04-12",34 104 | "1959-04-13",34 105 | "1959-04-14",38 106 | "1959-04-15",35 107 | "1959-04-16",32 108 | "1959-04-17",42 109 | "1959-04-18",34 110 | "1959-04-19",46 111 | "1959-04-20",30 112 | "1959-04-21",46 113 | "1959-04-22",45 114 | "1959-04-23",54 115 | "1959-04-24",34 116 | "1959-04-25",37 117 | "1959-04-26",35 118 | "1959-04-27",40 119 | "1959-04-28",42 120 | "1959-04-29",58 121 | "1959-04-30",51 122 | "1959-05-01",32 123 | "1959-05-02",35 124 | "1959-05-03",38 125 | "1959-05-04",33 126 | "1959-05-05",39 127 | "1959-05-06",47 128 | "1959-05-07",38 129 | "1959-05-08",52 130 | "1959-05-09",30 131 | "1959-05-10",34 132 | "1959-05-11",40 133 | "1959-05-12",35 134 | "1959-05-13",42 135 | "1959-05-14",41 136 | "1959-05-15",42 137 | "1959-05-16",38 138 | 
"1959-05-17",24 139 | "1959-05-18",34 140 | "1959-05-19",43 141 | "1959-05-20",36 142 | "1959-05-21",55 143 | "1959-05-22",41 144 | "1959-05-23",45 145 | "1959-05-24",41 146 | "1959-05-25",37 147 | "1959-05-26",43 148 | "1959-05-27",39 149 | "1959-05-28",33 150 | "1959-05-29",43 151 | "1959-05-30",40 152 | "1959-05-31",38 153 | "1959-06-01",45 154 | "1959-06-02",46 155 | "1959-06-03",34 156 | "1959-06-04",35 157 | "1959-06-05",48 158 | "1959-06-06",51 159 | "1959-06-07",36 160 | "1959-06-08",33 161 | "1959-06-09",46 162 | "1959-06-10",42 163 | "1959-06-11",48 164 | "1959-06-12",34 165 | "1959-06-13",41 166 | "1959-06-14",35 167 | "1959-06-15",40 168 | "1959-06-16",34 169 | "1959-06-17",30 170 | "1959-06-18",36 171 | "1959-06-19",40 172 | "1959-06-20",39 173 | "1959-06-21",45 174 | "1959-06-22",38 175 | "1959-06-23",47 176 | "1959-06-24",33 177 | "1959-06-25",30 178 | "1959-06-26",42 179 | "1959-06-27",43 180 | "1959-06-28",41 181 | "1959-06-29",41 182 | "1959-06-30",59 183 | "1959-07-01",43 184 | "1959-07-02",45 185 | "1959-07-03",38 186 | "1959-07-04",37 187 | "1959-07-05",45 188 | "1959-07-06",42 189 | "1959-07-07",57 190 | "1959-07-08",46 191 | "1959-07-09",51 192 | "1959-07-10",41 193 | "1959-07-11",47 194 | "1959-07-12",26 195 | "1959-07-13",35 196 | "1959-07-14",44 197 | "1959-07-15",41 198 | "1959-07-16",42 199 | "1959-07-17",36 200 | "1959-07-18",45 201 | "1959-07-19",45 202 | "1959-07-20",45 203 | "1959-07-21",47 204 | "1959-07-22",38 205 | "1959-07-23",42 206 | "1959-07-24",35 207 | "1959-07-25",36 208 | "1959-07-26",39 209 | "1959-07-27",45 210 | "1959-07-28",43 211 | "1959-07-29",47 212 | "1959-07-30",36 213 | "1959-07-31",41 214 | "1959-08-01",50 215 | "1959-08-02",39 216 | "1959-08-03",41 217 | "1959-08-04",46 218 | "1959-08-05",64 219 | "1959-08-06",45 220 | "1959-08-07",34 221 | "1959-08-08",38 222 | "1959-08-09",44 223 | "1959-08-10",48 224 | "1959-08-11",46 225 | "1959-08-12",44 226 | "1959-08-13",37 227 | "1959-08-14",39 228 | "1959-08-15",44 229 
| "1959-08-16",45 230 | "1959-08-17",33 231 | "1959-08-18",44 232 | "1959-08-19",38 233 | "1959-08-20",46 234 | "1959-08-21",46 235 | "1959-08-22",40 236 | "1959-08-23",39 237 | "1959-08-24",44 238 | "1959-08-25",48 239 | "1959-08-26",50 240 | "1959-08-27",41 241 | "1959-08-28",42 242 | "1959-08-29",51 243 | "1959-08-30",41 244 | "1959-08-31",44 245 | "1959-09-01",38 246 | "1959-09-02",68 247 | "1959-09-03",40 248 | "1959-09-04",42 249 | "1959-09-05",51 250 | "1959-09-06",44 251 | "1959-09-07",45 252 | "1959-09-08",36 253 | "1959-09-09",57 254 | "1959-09-10",44 255 | "1959-09-11",42 256 | "1959-09-12",53 257 | "1959-09-13",42 258 | "1959-09-14",34 259 | "1959-09-15",40 260 | "1959-09-16",56 261 | "1959-09-17",44 262 | "1959-09-18",53 263 | "1959-09-19",55 264 | "1959-09-20",39 265 | "1959-09-21",59 266 | "1959-09-22",55 267 | "1959-09-23",73 268 | "1959-09-24",55 269 | "1959-09-25",44 270 | "1959-09-26",43 271 | "1959-09-27",40 272 | "1959-09-28",47 273 | "1959-09-29",51 274 | "1959-09-30",56 275 | "1959-10-01",49 276 | "1959-10-02",54 277 | "1959-10-03",56 278 | "1959-10-04",47 279 | "1959-10-05",44 280 | "1959-10-06",43 281 | "1959-10-07",42 282 | "1959-10-08",45 283 | "1959-10-09",50 284 | "1959-10-10",48 285 | "1959-10-11",43 286 | "1959-10-12",40 287 | "1959-10-13",59 288 | "1959-10-14",41 289 | "1959-10-15",42 290 | "1959-10-16",51 291 | "1959-10-17",49 292 | "1959-10-18",45 293 | "1959-10-19",43 294 | "1959-10-20",42 295 | "1959-10-21",38 296 | "1959-10-22",47 297 | "1959-10-23",38 298 | "1959-10-24",36 299 | "1959-10-25",42 300 | "1959-10-26",35 301 | "1959-10-27",28 302 | "1959-10-28",44 303 | "1959-10-29",36 304 | "1959-10-30",45 305 | "1959-10-31",46 306 | "1959-11-01",48 307 | "1959-11-02",49 308 | "1959-11-03",43 309 | "1959-11-04",42 310 | "1959-11-05",59 311 | "1959-11-06",45 312 | "1959-11-07",52 313 | "1959-11-08",46 314 | "1959-11-09",42 315 | "1959-11-10",40 316 | "1959-11-11",40 317 | "1959-11-12",45 318 | "1959-11-13",35 319 | "1959-11-14",35 
320 | "1959-11-15",40 321 | "1959-11-16",39 322 | "1959-11-17",33 323 | "1959-11-18",42 324 | "1959-11-19",47 325 | "1959-11-20",51 326 | "1959-11-21",44 327 | "1959-11-22",40 328 | "1959-11-23",57 329 | "1959-11-24",49 330 | "1959-11-25",45 331 | "1959-11-26",49 332 | "1959-11-27",51 333 | "1959-11-28",46 334 | "1959-11-29",44 335 | "1959-11-30",52 336 | "1959-12-01",45 337 | "1959-12-02",32 338 | "1959-12-03",46 339 | "1959-12-04",41 340 | "1959-12-05",34 341 | "1959-12-06",33 342 | "1959-12-07",36 343 | "1959-12-08",49 344 | "1959-12-09",43 345 | "1959-12-10",43 346 | "1959-12-11",34 347 | "1959-12-12",39 348 | "1959-12-13",35 349 | "1959-12-14",52 350 | "1959-12-15",47 351 | "1959-12-16",52 352 | "1959-12-17",39 353 | "1959-12-18",40 354 | "1959-12-19",42 355 | "1959-12-20",42 356 | "1959-12-21",53 357 | "1959-12-22",39 358 | "1959-12-23",40 359 | "1959-12-24",38 360 | "1959-12-25",44 361 | "1959-12-26",34 362 | "1959-12-27",37 363 | "1959-12-28",52 364 | "1959-12-29",48 365 | "1959-12-30",55 366 | "1959-12-31",50 -------------------------------------------------------------------------------- /Data/iris_all.csv: -------------------------------------------------------------------------------- 1 | sepal_length,sepal_width,petal_length,petal_width,class 2 | 5.1,3.5,1.4,0.2,Iris-setosa 3 | 4.9,3.0,1.4,0.2,Iris-setosa 4 | 4.7,3.2,1.3,0.2,Iris-setosa 5 | 4.6,3.1,1.5,0.2,Iris-setosa 6 | 5.0,3.6,1.4,0.2,Iris-setosa 7 | 5.4,3.9,1.7,0.4,Iris-setosa 8 | 4.6,3.4,1.4,0.3,Iris-setosa 9 | 5.0,3.4,1.5,0.2,Iris-setosa 10 | 4.4,2.9,1.4,0.2,Iris-setosa 11 | 4.9,3.1,1.5,0.1,Iris-setosa 12 | 5.4,3.7,1.5,0.2,Iris-setosa 13 | 4.8,3.4,1.6,0.2,Iris-setosa 14 | 4.8,3.0,1.4,0.1,Iris-setosa 15 | 4.3,3.0,1.1,0.1,Iris-setosa 16 | 5.8,4.0,1.2,0.2,Iris-setosa 17 | 5.7,4.4,1.5,0.4,Iris-setosa 18 | 5.4,3.9,1.3,0.4,Iris-setosa 19 | 5.1,3.5,1.4,0.3,Iris-setosa 20 | 5.7,3.8,1.7,0.3,Iris-setosa 21 | 5.1,3.8,1.5,0.3,Iris-setosa 22 | 5.4,3.4,1.7,0.2,Iris-setosa 23 | 5.1,3.7,1.5,0.4,Iris-setosa 24 | 
4.6,3.6,1.0,0.2,Iris-setosa 25 | 5.1,3.3,1.7,0.5,Iris-setosa 26 | 4.8,3.4,1.9,0.2,Iris-setosa 27 | 5.0,3.0,1.6,0.2,Iris-setosa 28 | 5.0,3.4,1.6,0.4,Iris-setosa 29 | 5.2,3.5,1.5,0.2,Iris-setosa 30 | 5.2,3.4,1.4,0.2,Iris-setosa 31 | 4.7,3.2,1.6,0.2,Iris-setosa 32 | 4.8,3.1,1.6,0.2,Iris-setosa 33 | 5.4,3.4,1.5,0.4,Iris-setosa 34 | 5.2,4.1,1.5,0.1,Iris-setosa 35 | 5.5,4.2,1.4,0.2,Iris-setosa 36 | 4.9,3.1,1.5,0.1,Iris-setosa 37 | 5.0,3.2,1.2,0.2,Iris-setosa 38 | 5.5,3.5,1.3,0.2,Iris-setosa 39 | 4.9,3.1,1.5,0.1,Iris-setosa 40 | 4.4,3.0,1.3,0.2,Iris-setosa 41 | 5.1,3.4,1.5,0.2,Iris-setosa 42 | 5.0,3.5,1.3,0.3,Iris-setosa 43 | 4.5,2.3,1.3,0.3,Iris-setosa 44 | 4.4,3.2,1.3,0.2,Iris-setosa 45 | 5.0,3.5,1.6,0.6,Iris-setosa 46 | 5.1,3.8,1.9,0.4,Iris-setosa 47 | 4.8,3.0,1.4,0.3,Iris-setosa 48 | 5.1,3.8,1.6,0.2,Iris-setosa 49 | 4.6,3.2,1.4,0.2,Iris-setosa 50 | 5.3,3.7,1.5,0.2,Iris-setosa 51 | 5.0,3.3,1.4,0.2,Iris-setosa 52 | 7.0,3.2,4.7,1.4,Iris-versicolor 53 | 6.4,3.2,4.5,1.5,Iris-versicolor 54 | 6.9,3.1,4.9,1.5,Iris-versicolor 55 | 5.5,2.3,4.0,1.3,Iris-versicolor 56 | 6.5,2.8,4.6,1.5,Iris-versicolor 57 | 5.7,2.8,4.5,1.3,Iris-versicolor 58 | 6.3,3.3,4.7,1.6,Iris-versicolor 59 | 4.9,2.4,3.3,1.0,Iris-versicolor 60 | 6.6,2.9,4.6,1.3,Iris-versicolor 61 | 5.2,2.7,3.9,1.4,Iris-versicolor 62 | 5.0,2.0,3.5,1.0,Iris-versicolor 63 | 5.9,3.0,4.2,1.5,Iris-versicolor 64 | 6.0,2.2,4.0,1.0,Iris-versicolor 65 | 6.1,2.9,4.7,1.4,Iris-versicolor 66 | 5.6,2.9,3.6,1.3,Iris-versicolor 67 | 6.7,3.1,4.4,1.4,Iris-versicolor 68 | 5.6,3.0,4.5,1.5,Iris-versicolor 69 | 5.8,2.7,4.1,1.0,Iris-versicolor 70 | 6.2,2.2,4.5,1.5,Iris-versicolor 71 | 5.6,2.5,3.9,1.1,Iris-versicolor 72 | 5.9,3.2,4.8,1.8,Iris-versicolor 73 | 6.1,2.8,4.0,1.3,Iris-versicolor 74 | 6.3,2.5,4.9,1.5,Iris-versicolor 75 | 6.1,2.8,4.7,1.2,Iris-versicolor 76 | 6.4,2.9,4.3,1.3,Iris-versicolor 77 | 6.6,3.0,4.4,1.4,Iris-versicolor 78 | 6.8,2.8,4.8,1.4,Iris-versicolor 79 | 6.7,3.0,5.0,1.7,Iris-versicolor 80 | 6.0,2.9,4.5,1.5,Iris-versicolor 81 | 
5.7,2.6,3.5,1.0,Iris-versicolor 82 | 5.5,2.4,3.8,1.1,Iris-versicolor 83 | 5.5,2.4,3.7,1.0,Iris-versicolor 84 | 5.8,2.7,3.9,1.2,Iris-versicolor 85 | 6.0,2.7,5.1,1.6,Iris-versicolor 86 | 5.4,3.0,4.5,1.5,Iris-versicolor 87 | 6.0,3.4,4.5,1.6,Iris-versicolor 88 | 6.7,3.1,4.7,1.5,Iris-versicolor 89 | 6.3,2.3,4.4,1.3,Iris-versicolor 90 | 5.6,3.0,4.1,1.3,Iris-versicolor 91 | 5.5,2.5,4.0,1.3,Iris-versicolor 92 | 5.5,2.6,4.4,1.2,Iris-versicolor 93 | 6.1,3.0,4.6,1.4,Iris-versicolor 94 | 5.8,2.6,4.0,1.2,Iris-versicolor 95 | 5.0,2.3,3.3,1.0,Iris-versicolor 96 | 5.6,2.7,4.2,1.3,Iris-versicolor 97 | 5.7,3.0,4.2,1.2,Iris-versicolor 98 | 5.7,2.9,4.2,1.3,Iris-versicolor 99 | 6.2,2.9,4.3,1.3,Iris-versicolor 100 | 5.1,2.5,3.0,1.1,Iris-versicolor 101 | 5.7,2.8,4.1,1.3,Iris-versicolor 102 | 6.3,3.3,6.0,2.5,Iris-virginica 103 | 5.8,2.7,5.1,1.9,Iris-virginica 104 | 7.1,3.0,5.9,2.1,Iris-virginica 105 | 6.3,2.9,5.6,1.8,Iris-virginica 106 | 6.5,3.0,5.8,2.2,Iris-virginica 107 | 7.6,3.0,6.6,2.1,Iris-virginica 108 | 4.9,2.5,4.5,1.7,Iris-virginica 109 | 7.3,2.9,6.3,1.8,Iris-virginica 110 | 6.7,2.5,5.8,1.8,Iris-virginica 111 | 7.2,3.6,6.1,2.5,Iris-virginica 112 | 6.5,3.2,5.1,2.0,Iris-virginica 113 | 6.4,2.7,5.3,1.9,Iris-virginica 114 | 6.8,3.0,5.5,2.1,Iris-virginica 115 | 5.7,2.5,5.0,2.0,Iris-virginica 116 | 5.8,2.8,5.1,2.4,Iris-virginica 117 | 6.4,3.2,5.3,2.3,Iris-virginica 118 | 6.5,3.0,5.5,1.8,Iris-virginica 119 | 7.7,3.8,6.7,2.2,Iris-virginica 120 | 7.7,2.6,6.9,2.3,Iris-virginica 121 | 6.0,2.2,5.0,1.5,Iris-virginica 122 | 6.9,3.2,5.7,2.3,Iris-virginica 123 | 5.6,2.8,4.9,2.0,Iris-virginica 124 | 7.7,2.8,6.7,2.0,Iris-virginica 125 | 6.3,2.7,4.9,1.8,Iris-virginica 126 | 6.7,3.3,5.7,2.1,Iris-virginica 127 | 7.2,3.2,6.0,1.8,Iris-virginica 128 | 6.2,2.8,4.8,1.8,Iris-virginica 129 | 6.1,3.0,4.9,1.8,Iris-virginica 130 | 6.4,2.8,5.6,2.1,Iris-virginica 131 | 7.2,3.0,5.8,1.6,Iris-virginica 132 | 7.4,2.8,6.1,1.9,Iris-virginica 133 | 7.9,3.8,6.4,2.0,Iris-virginica 134 | 6.4,2.8,5.6,2.2,Iris-virginica 135 
| 6.3,2.8,5.1,1.5,Iris-virginica 136 | 6.1,2.6,5.6,1.4,Iris-virginica 137 | 7.7,3.0,6.1,2.3,Iris-virginica 138 | 6.3,3.4,5.6,2.4,Iris-virginica 139 | 6.4,3.1,5.5,1.8,Iris-virginica 140 | 6.0,3.0,4.8,1.8,Iris-virginica 141 | 6.9,3.1,5.4,2.1,Iris-virginica 142 | 6.7,3.1,5.6,2.4,Iris-virginica 143 | 6.9,3.1,5.1,2.3,Iris-virginica 144 | 5.8,2.7,5.1,1.9,Iris-virginica 145 | 6.8,3.2,5.9,2.3,Iris-virginica 146 | 6.7,3.3,5.7,2.5,Iris-virginica 147 | 6.7,3.0,5.2,2.3,Iris-virginica 148 | 6.3,2.5,5.0,1.9,Iris-virginica 149 | 6.5,3.0,5.2,2.0,Iris-virginica 150 | 6.2,3.4,5.4,2.3,Iris-virginica 151 | 5.9,3.0,5.1,1.8,Iris-virginica -------------------------------------------------------------------------------- /Notebooks/01-XGBoost_BikeRental_Data_Preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "from pandas.plotting import register_matplotlib_converters\n", 14 | "register_matplotlib_converters()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "

Kaggle Bike Sharing Demand Dataset

\n", 22 | "\n", 23 | "Modified 'count' to log1p(count) for training\n", 24 | "\n", 25 | "A log transform can be used when the target represents a count (that is, a non-negative value)\n", 26 | "\n", 27 | "The model now predicts log1p(count). We need to convert it back to the actual count using expm1(predicted_target)\n", 28 | "\n", 29 | "Reference:\n", 30 | "https://www.kaggle.com/apapiu/predicting-bike-sharing-with-xgboost by Alexandru Papiu\n", 31 | "\n", 32 | "To download the dataset, sign in and download it from this link:\n", 33 | "https://www.kaggle.com/c/bike-sharing-demand/data
\n", 34 | "\n", 35 | "\n", 36 | "Input Features: ['season', 'holiday', 'workingday', 'weather', 'temp',\n", 37 | " 'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek','hour']
\n", 38 | "Target Feature: [log1p('count')]
\n", 39 | "Objective: You are provided hourly rental data spanning two years. For this competition, the training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month. You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period (Ref: Kaggle.com)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# Example\n", 49 | "# Converts to log1p(count)\n", 50 | "# Print original count back using expm1\n", 51 | "print('Test log and exp')\n", 52 | "test_count = 100\n", 53 | "print('original value', test_count)\n", 54 | "x = np.log1p(test_count) # log (x+1)\n", 55 | "print('log1p', x)\n", 56 | "print('expm1', np.expm1(x)) # exp(x) - 1" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "columns = ['count', 'season', 'holiday', 'workingday', 'weather', 'temp',\n", 66 | " 'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek','hour']" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "df = pd.read_csv('../Data/bikesharing_train.csv', parse_dates=['datetime'],index_col=0)\n", 76 | "df_test = pd.read_csv('../Data/bikesharing_test.csv', parse_dates=['datetime'],index_col=0)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# We need to convert datetime to numeric for training.\n", 86 | "# Let's extract key features into separate numeric columns\n", 87 | "def add_features(df):\n", 88 | " df['year'] = df.index.year\n", 89 | " df['month'] = df.index.month\n", 90 | " df['day'] = df.index.day\n", 91 | " df['dayofweek'] = df.index.dayofweek\n", 92 | " df['hour'] = 
df.index.hour" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "add_features(df)\n", 102 | "add_features(df_test)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "plt.plot(df['2011']['count'],label='2011')\n", 112 | "plt.plot(df['2012']['count'],label='2012')\n", 113 | "plt.xticks(fontsize=14, rotation=45)\n", 114 | "plt.xlabel('Date')\n", 115 | "plt.ylabel('Rental Count')\n", 116 | "plt.title('2011 and 2012 Rentals (Year to Year)')\n", 117 | "plt.legend()\n", 118 | "plt.show()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "plt.plot(df['2011']['count'].map(np.log1p),label='2011')\n", 128 | "plt.plot(df['2012']['count'].map(np.log1p),label='2012')\n", 129 | "plt.xticks(fontsize=14, rotation=45)\n", 130 | "plt.xlabel('Date')\n", 131 | "plt.ylabel('Log(Rental Count)')\n", 132 | "plt.title('2011 and 2012 Rentals (Year to Year)')\n", 133 | "plt.legend()\n", 134 | "plt.show()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "plt.boxplot([df['count']], labels=['count'])\n", 144 | "plt.title('Box Plot - Count')\n", 145 | "plt.ylabel('Target')\n", 146 | "plt.grid(True)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "# Let's see how the data distribution changes with log1p\n", 156 | "# Evenly distributed\n", 157 | "plt.boxplot([df['count'].map(np.log1p)], labels=['log1p(count)'])\n", 158 | "plt.title('Box Plot - log1p(Count)')\n", 159 | "plt.ylabel('Target')\n", 160 | "plt.grid(True)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | 
"metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "df[\"count\"] = df[\"count\"].map(np.log1p)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "df.head()" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "df_test.head()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "df.dtypes" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "# Save all data\n", 206 | "df.to_csv('../Data/bike_all.csv',index=True,index_label='datetime',columns=columns)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "## Training and Validation Set\n", 214 | "### Target Variable as first column followed by input features\n", 215 | "### Training, Validation files do not have a column header" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "# Training = 70% of the data\n", 225 | "# Validation = 30% of the data\n", 226 | "# Randomize the datset\n", 227 | "np.random.seed(5)\n", 228 | "l = list(df.index)\n", 229 | "np.random.shuffle(l)\n", 230 | "df = df.loc[l]" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "rows = df.shape[0]\n", 240 | "train = int(.7 * rows)\n", 241 | "test = rows-train" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "rows, train, test" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 
null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "columns" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "# Write Training Set\n", 269 | "df.iloc[:train].to_csv('../Data/bike_train.csv'\n", 270 | " ,index=False,header=False\n", 271 | " ,columns=columns)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "# Write Validation Set\n", 281 | "df.iloc[train:].to_csv('../Data/bike_validation.csv'\n", 282 | " ,index=False,header=False\n", 283 | " ,columns=columns)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "# Test Data has only input features\n", 293 | "df_test.to_csv('../Data/bike_test.csv',index=True,index_label='datetime')" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "print(','.join(columns))" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "# Write Column List\n", 312 | "with open('../Data/bike_train_column_list.txt','w') as f:\n", 313 | " f.write(','.join(columns))" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [] 322 | } 323 | ], 324 | "metadata": { 325 | "kernelspec": { 326 | "display_name": "Python 3", 327 | "language": "python", 328 | "name": "python3" 329 | }, 330 | "language_info": { 331 | "codemirror_mode": { 332 | "name": "ipython", 333 | "version": 3 334 | }, 335 | "file_extension": ".py", 336 | "mimetype": "text/x-python", 337 | "name": "python", 338 | "nbconvert_exporter": "python", 339 | "pygments_lexer": "ipython3", 340 | 
"version": "3.7.6" 341 | } 342 | }, 343 | "nbformat": 4, 344 | "nbformat_minor": 1 345 | } 346 | -------------------------------------------------------------------------------- /Notebooks/02-XGBoost_Regression_BikeRental.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Train a model with bike rental data using XGBoost algorithm\n", 8 | "### Training log1p(count) dataset\n", 9 | "### Model is trained with XGBoost installed in notebook instance\n", 10 | "### In the later examples, we will train using SageMaker's XGBoost algorithm" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Install xgboost in notebook instance.\n", 20 | "#### Command to install xgboost\n", 21 | "# !pip install xgboost==0.90" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import sys\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", 35 | "\n", 36 | "# XGBoost \n", 37 | "import xgboost as xgb\n", 38 | "\n", 39 | "import matplotlib.pyplot as plt\n", 40 | "\n", 41 | "from pandas.plotting import register_matplotlib_converters\n", 42 | "register_matplotlib_converters()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "

Kaggle Bike Sharing Demand Dataset

\n", 50 | "\n", 51 | "Modified 'count' to log1p(count) for training\n", 52 | "\n", 53 | "A log transform can be used when the target represents a count (that is, a non-negative value)\n", 54 | "\n", 55 | "The model now predicts log1p(count). We need to convert it back to the actual count using expm1(predicted_target)\n", 56 | "\n", 57 | "Reference:\n", 58 | "https://www.kaggle.com/apapiu/predicting-bike-sharing-with-xgboost by Alexandru Papiu\n", 59 | "\n", 60 | "To download the dataset, sign in and download it from this link:\n", 61 | "https://www.kaggle.com/c/bike-sharing-demand/data
\n", 62 | "\n", 63 | "\n", 64 | "Input Features: ['season', 'holiday', 'workingday', 'weather', 'temp',\n", 65 | " 'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek','hour']
\n", 66 | "Target Feature: [log1p('count')]
\n", 67 | "Objective: You are provided hourly rental data spanning two years. For this competition, the training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month. You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period (Ref: Kaggle.com)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "column_list_file = '../Data/bike_train_column_list.txt'\n", 77 | "train_file = '../Data/bike_train.csv'\n", 78 | "validation_file = '../Data/bike_validation.csv'\n", 79 | "test_file = '../Data/bike_test.csv'" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "columns = ''\n", 89 | "with open(column_list_file,'r') as f:\n", 90 | " columns = f.read().split(',')" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "columns" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# Specify the column names as the file does not have column header\n", 109 | "df_train = pd.read_csv(train_file,names=columns)\n", 110 | "df_validation = pd.read_csv(validation_file,names=columns)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "df_train.head()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "df_validation.head()" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "X_train = 
df_train.iloc[:,1:] # Features: 1st column onwards \n", 138 | "y_train = df_train.iloc[:,0].ravel() # Target: 0th column\n", 139 | "\n", 140 | "X_validation = df_validation.iloc[:,1:]\n", 141 | "y_validation = df_validation.iloc[:,0].ravel()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# XGBoost Training Parameter Reference: \n", 151 | "# https://github.com/dmlc/xgboost/blob/master/doc/parameter.md\n", 152 | "#regressor = xgb.XGBRegressor(max_depth=5,eta=0.1,subsample=0.7,num_round=150)\n", 153 | "regressor = xgb.XGBRegressor(max_depth=5,n_estimators=150)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "regressor" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "regressor.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_validation, y_validation)])" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "df_train['count'].describe()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "eval_result = regressor.evals_result()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "training_rounds = range(len(eval_result['validation_0']['rmse']))" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "plt.scatter(x=training_rounds,y=eval_result['validation_0']['rmse'],label='Training Error')\n", 208 | 
"plt.scatter(x=training_rounds,y=eval_result['validation_1']['rmse'],label='Validation Error')\n", 209 | "plt.grid(True)\n", 210 | "plt.xlabel('Iteration')\n", 211 | "plt.ylabel('RMSE')\n", 212 | "plt.title('Training Vs Validation Error')\n", 213 | "plt.legend()\n", 214 | "plt.show()" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "xgb.plot_importance(regressor)\n", 224 | "plt.show()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# Updated - Changed to validation dataset\n", 234 | "# Compare actual vs predicted performance with dataset not seen by the model before\n", 235 | "df = pd.read_csv(validation_file,names=columns)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "df.head()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "X_test = df.iloc[:,1:]\n", 254 | "print(X_test[:5])" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "result = regressor.predict(X_test)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "result[:5]" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "df.head()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "df['count_predicted'] = result" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | 
"metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "df.head()" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "# Negative Values are predicted\n", 309 | "df['count_predicted'].describe()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "df[df['count_predicted'] < 0]" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "def adjust_count(x):\n", 328 | " if x < 0:\n", 329 | " return 0\n", 330 | " else:\n", 331 | " return x" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "df['count_predicted'] = df['count_predicted'].map(adjust_count)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "df[df['count_predicted'] < 0]" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "df['count'] = df['count'].map(np.expm1)\n", 359 | "df['count_predicted'] = df['count_predicted'].map(np.expm1)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "# Actual Vs Predicted\n", 369 | "plt.plot(df['count'], label='Actual')\n", 370 | "plt.plot(df['count_predicted'],label='Predicted')\n", 371 | "plt.xlabel('Sample')\n", 372 | "plt.ylabel('Count')\n", 373 | "plt.xlim([100,150])\n", 374 | "plt.title('Validation Dataset - Predicted Vs. 
Actual')\n", 375 | "plt.legend()\n", 376 | "plt.show()" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "# Over-prediction and under-prediction need to be balanced\n", 386 | "# Validation Data Residuals\n", 387 | "residuals = (df['count'] - df['count_predicted'])\n", 388 | "\n", 389 | "plt.hist(residuals)\n", 390 | "plt.grid(True)\n", 391 | "plt.xlabel('Actual - Predicted')\n", 392 | "plt.ylabel('Count')\n", 393 | "plt.title('Residuals Distribution')\n", 394 | "plt.axvline(color='r')\n", 395 | "plt.show()" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "value_counts = (residuals > 0).value_counts(sort=False)\n", 405 | "print(' Under Estimation: {0:.2f}'.format(value_counts[True]/len(residuals)))\n", 406 | "print(' Over Estimation: {0:.2f}'.format(value_counts[False]/len(residuals)))" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "import sklearn.metrics as metrics\n", 416 | "print(\"RMSE: {0:.2f}\".format(metrics.mean_squared_error(df['count'],\n", 417 | " df['count_predicted'])**.5))" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "# Metric Used By Kaggle\n", 427 | "def compute_rmsle(y_true, y_pred):\n", 428 | " if type(y_true) != np.ndarray:\n", 429 | " y_true = np.array(y_true)\n", 430 | " \n", 431 | " if type(y_pred) != np.ndarray:\n", 432 | " y_pred = np.array(y_pred)\n", 433 | " \n", 434 | " return(np.average((np.log1p(y_pred) - np.log1p(y_true))**2)**.5)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "print(\"RMSLE: 
{0:.2f}\".format(compute_rmsle(df['count'],df['count_predicted'])))" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "# Prepare Data for Submission to Kaggle\n", 453 | "df_test = pd.read_csv(test_file,parse_dates=['datetime'])" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "df_test.head()" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "X_test = df_test.iloc[:,1:] # Exclude datetime for prediction" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "X_test.head()" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "result = regressor.predict(X_test)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "result[:5]" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "np.expm1(result)" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "# Convert result to actual count\n", 517 | "df_test[\"count\"] = np.expm1(result)" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "df_test.head()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "df_test[df_test[\"count\"] < 0]" 536 | ] 537 | }, 
538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "df_test[['datetime','count']].to_csv('../Data/predicted_count.csv',index=False)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [] 553 | } 554 | ], 555 | "metadata": { 556 | "kernelspec": { 557 | "display_name": "Python 3", 558 | "language": "python", 559 | "name": "python3" 560 | }, 561 | "language_info": { 562 | "codemirror_mode": { 563 | "name": "ipython", 564 | "version": 3 565 | }, 566 | "file_extension": ".py", 567 | "mimetype": "text/x-python", 568 | "name": "python", 569 | "nbconvert_exporter": "python", 570 | "pygments_lexer": "ipython3", 571 | "version": "3.7.6" 572 | } 573 | }, 574 | "nbformat": 4, 575 | "nbformat_minor": 2 576 | } 577 | -------------------------------------------------------------------------------- /Notebooks/03-XGBoost_Binary_Classification_Diabetes_Dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "nbpresent": { 7 | "id": "782a07bf-08de-4030-88e1-6731c4ac956e" 8 | } 9 | }, 10 | "source": [ 11 | "## Diabetes dataset \n", 12 | "### Predict if a person is at risk of developing diabetes\n", 13 | "\n", 14 | "### This Dataset is Freely Available\n", 15 | "\n", 16 | "### Overview:\n", 17 | "The data was collected and made available by the \"National Institute of Diabetes and Digestive and Kidney Diseases\" as part of the Pima Indians Diabetes Database. \n", 18 | "\n", 19 | "`Diabetes.csv` is available [from Kaggle](https://www.kaggle.com/uciml/pima-indians-diabetes-database). 
We have several questions: which information is most correlated with a positive diagnosis and, if we can only ask a patient two questions, which should we ask and how would we estimate their risk of being diagnosed?\n", 20 | "\n", 21 | "++++++++++++++++++++++++++++++++++++\n", 22 | "\n", 23 | "The following features have been provided to help us predict whether a person is diabetic or not:\n", 24 | "* **Pregnancies:** Number of times pregnant\n", 25 | "* **Glucose:** Plasma glucose concentration over 2 hours in an oral glucose tolerance test\n", 26 | "* **BloodPressure:** Diastolic blood pressure (mm Hg)\n", 27 | "* **SkinThickness:** Triceps skin fold thickness (mm)\n", 28 | "* **Insulin:** 2-Hour serum insulin (mu U/ml)\n", 29 | "* **BMI:** Body mass index (weight in kg/(height in m)^2)\n", 30 | "* **DiabetesPedigreeFunction:** Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)\n", 31 | "* **Age:** Age (years)\n", 32 | "* **Outcome:** Class variable (0 if non-diabetic, 1 if diabetic)\n", 33 | "\n", 34 | "### Binary Classification problem - XGBoost" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": { 41 | "nbpresent": { 42 | "id": "6c6a8672-d428-410a-82fa-7f587c9ef2ae" 43 | } 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# Install xgboost in notebook instance.\n", 48 | "#### Command to install xgboost\n", 49 | "#!pip install xgboost==0.90" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "nbpresent": { 57 | "id": "652b58d4-3b75-405f-9f11-24d0cd1f9656" 58 | } 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "import sys\n", 63 | "import numpy as np\n", 64 | "import pandas as pd\n", 65 | "import matplotlib.pyplot as plt\n", 66 | "import itertools\n", 67 | "\n", 68 | "import xgboost as xgb\n", 69 | "from sklearn.metrics import classification_report, confusion_matrix" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | 
"execution_count": null, 75 | "metadata": { 76 | "nbpresent": { 77 | "id": "a3946273-d086-4564-b0f1-6adc225191c3" 78 | } 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "data = pd.read_csv(\"../Data/Diabetes.csv\")" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "data.describe()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "data.info()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "## only keep rows where non of the columns has 0 value (except the first and last columns)\n", 110 | "data = data[~(data[data.columns[1:-1]] == 0).any(axis=1)]\n", 111 | "data.reset_index(inplace=True, drop = True)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "### Dealing with Missing Values" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# using isnull() function \n", 128 | "# print(data.isnull().any().sum())\n", 129 | "print(data.isnull().sum())\n", 130 | "#data.isnull()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "data.drop(columns=['Insulin'], inplace = True)\n", 140 | "data.reset_index(inplace=True, drop = True)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "### Replace missing values in each column with the mean or median of that column\n", 150 | "#data.fillna(data.mean())\n", 151 | "data.fillna(data.median(), inplace=True)\n", 152 | "\n", 153 | "### Drop all rows that contain missing values?\n", 154 | "#data = 
data.dropna()\n", 155 | "#data.reset_index(inplace=True, drop = True)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "### Split Data" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "# Training = 70% of the data\n", 172 | "# Validation = 30% of the data\n", 173 | "# Randomize the dataset\n", 174 | "np.random.seed(5)\n", 175 | "l = list(data.index)\n", 176 | "np.random.shuffle(l)\n", 177 | "data = data.iloc[l]\n", 178 | "data.reset_index(inplace=True, drop = True)\n", 179 | "data" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "rows = data.shape[0]\n", 189 | "train = int(.7 * rows)\n", 190 | "test = rows - train" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "rows, train, test" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "# Training Set\n", 209 | "df_train = data[:train]\n", 210 | "#df_train" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "# Validation Set\n", 220 | "df_validation = data[train:]\n", 221 | "#df_validation" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "nbpresent": { 229 | "id": "a195ae30-1962-4427-859b-73a013dc10d6" 230 | } 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "df_train.head()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "377 * 8" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 
null, 249 | "metadata": { 250 | "nbpresent": { 251 | "id": "e30e8aeb-1ca2-4851-bc2d-1bdee29ab1cf" 252 | } 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "df_validation.head()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "nbpresent": { 264 | "id": "3b240613-803d-4fa9-93cf-53ef68df7b93" 265 | } 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "X_train = df_train.iloc[:,:-1] # Features: all columns except the last\n", 270 | "y_train = df_train.iloc[:,-1].ravel() # Target: last column\n", 271 | "\n", 272 | "X_validation = df_validation.iloc[:,:-1]\n", 273 | "y_validation = df_validation.iloc[:,-1].ravel()" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "y_validation.shape" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "nbpresent": { 290 | "id": "9edc89e7-45d3-4350-9eb4-3e0938c3c55e" 291 | } 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "# Launch a classifier\n", 296 | "# XGBoost Training Parameter Reference: \n", 297 | "# https://xgboost.readthedocs.io/en/latest/parameter.html\n", 298 | "classifier = xgb.XGBClassifier (objective=\"binary:logistic\")" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": { 305 | "nbpresent": { 306 | "id": "348296fb-8c9b-4598-ad2e-d1fe8e10f76a" 307 | } 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "classifier" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": { 318 | "nbpresent": { 319 | "id": "9839d7ce-e791-4d93-bc5f-28604ffde022" 320 | } 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "classifier.fit(X_train,\n", 325 | " y_train, \n", 326 | " eval_set = [(X_train, y_train), (X_validation, y_validation)], \n", 327 | " eval_metric=['logloss'],\n", 328 | " early_stopping_rounds=20)" 329 
| ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": { 335 | "nbpresent": { 336 | "id": "e08f22c1-4346-4e2d-96a2-9974ed5c59ff" 337 | } 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "eval_result = classifier.evals_result()" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "nbpresent": { 349 | "id": "092776c3-a611-4f40-91e2-664b3b99d05e" 350 | } 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "training_rounds = range(len(eval_result['validation_0']['logloss']))" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": { 361 | "nbpresent": { 362 | "id": "2e9af3f7-fb85-4c52-83d5-ff9cae457294" 363 | } 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "print(training_rounds)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": { 374 | "nbpresent": { 375 | "id": "5e71239a-e321-43ba-ac2c-993b57b3be3a" 376 | } 377 | }, 378 | "outputs": [], 379 | "source": [ 380 | "plt.scatter(x=training_rounds,y=eval_result['validation_0']['logloss'],label='Training Error')\n", 381 | "plt.scatter(x=training_rounds,y=eval_result['validation_1']['logloss'],label='Validation Error')\n", 382 | "plt.grid(True)\n", 383 | "plt.xlabel('Iteration')\n", 384 | "plt.ylabel('LogLoss')\n", 385 | "plt.title('Training Vs Validation Error')\n", 386 | "plt.legend()\n", 387 | "plt.show()" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "#### Notice:\n", 395 | "* Model is not generalising well, low train error but high validation error\n", 396 | "* Model has high variance!" 
397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": { 403 | "nbpresent": { 404 | "id": "f144f315-6d38-429e-8c17-06c17a446198" 405 | } 406 | }, 407 | "outputs": [], 408 | "source": [ 409 | "xgb.plot_importance(classifier)\n", 410 | "plt.show()" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": { 416 | "nbpresent": { 417 | "id": "3312675d-307c-4eff-b835-34f0e7f57924" 418 | } 419 | }, 420 | "source": [ 421 | "#### Predict the Validation Set" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "nbpresent": { 429 | "id": "9b5cb70d-6069-4511-810e-fd17e72667dd" 430 | } 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "X_test = df_validation.iloc[:,:-1]" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": { 441 | "nbpresent": { 442 | "id": "f611c852-50e3-4a1a-9134-c1c6e82ad780" 443 | } 444 | }, 445 | "outputs": [], 446 | "source": [ 447 | "result = classifier.predict(X_test)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "result[:5]" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": { 463 | "nbpresent": { 464 | "id": "2c573c2b-4143-4e01-b107-e6b871ce0249" 465 | } 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "df_validation['predicted_class'] = result" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": { 476 | "nbpresent": { 477 | "id": "5ad0fa04-6896-46b5-bc23-40d61480d7ca" 478 | } 479 | }, 480 | "outputs": [], 481 | "source": [ 482 | "df_validation.head()" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "## Binary Classifier Metrics" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | 
"metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "# Reference: https://scikit-learn.org/stable/modules/model_evaluation.html\n", 499 | "# Explicitly stating labels. Diabetic=1, Normal=0\n", 500 | "def true_positive(y_true, y_pred): \n", 501 | " return confusion_matrix(y_true, y_pred,labels=[1,0])[0, 0]\n", 502 | "\n", 503 | "def true_negative(y_true, y_pred): \n", 504 | " return confusion_matrix(y_true,y_pred,labels=[1,0])[1, 1]\n", 505 | "\n", 506 | "def false_positive(y_true, y_pred): \n", 507 | " return confusion_matrix(y_true, y_pred,labels=[1,0])[1, 0]\n", 508 | "\n", 509 | "def false_negative(y_true, y_pred): \n", 510 | " return confusion_matrix(y_true, y_pred,labels=[1,0])[0, 1]" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": null, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "# Compute Binary Classifier Metrics\n", 520 | "# Returns a dictionary {\"MetricName\":Value,...}\n", 521 | "\n", 522 | "def binary_classifier_metrics(y_true, y_pred):\n", 523 | " metrics = {}\n", 524 | "\n", 525 | " # References: \n", 526 | " # https://docs.aws.amazon.com/machine-learning/latest/dg/binary-classification.html\n", 527 | " # https://en.wikipedia.org/wiki/Confusion_matrix\n", 528 | " \n", 529 | " # Definition:\n", 530 | " # true positive = tp = how many samples were correctly classified as positive (count)\n", 531 | " # true negative = tn = how many samples were correctly classified as negative (count)\n", 532 | " # false positive = fp = how many negative samples were mis-classified as positive (count)\n", 533 | " # false negative = fn = how many positive samples were mis-classified as negative (count)\n", 534 | " \n", 535 | " # positive = number of positive samples (count)\n", 536 | " # = true positive + false negative\n", 537 | " # negative = number of negative samples (count)\n", 538 | " # = true negative + false positive\n", 539 | " \n", 540 | " tp = true_positive(y_true, y_pred)\n", 541 | " tn = 
true_negative(y_true, y_pred)\n", 542 | " fp = false_positive(y_true, y_pred)\n", 543 | " fn = false_negative(y_true, y_pred)\n", 544 | " \n", 545 | " positive = tp + fn\n", 546 | " negative = tn + fp\n", 547 | " \n", 548 | " metrics['TruePositive'] = tp\n", 549 | " metrics['TrueNegative'] = tn\n", 550 | " metrics['FalsePositive'] = fp\n", 551 | " metrics['FalseNegative'] = fn\n", 552 | " \n", 553 | " metrics['Positive'] = positive\n", 554 | " metrics['Negative'] = negative\n", 555 | " \n", 556 | " # True Positive Rate (TPR, Recall) = true positive/positive\n", 557 | " # How many positives were correctly classified? (fraction)\n", 558 | " # Recall value closer to 1 is better. closer to 0 is worse\n", 559 | " if tp == 0:\n", 560 | " recall = 0\n", 561 | " else:\n", 562 | " recall = tp/positive\n", 563 | " \n", 564 | " metrics['Recall'] = recall\n", 565 | " \n", 566 | " # True Negative Rate = True Negative/negative\n", 567 | " # How many negatives were correctly classified? (fraction)\n", 568 | " # True Negative Rate value closer to 1 is better. closer to 0 is worse\n", 569 | " if tn == 0:\n", 570 | " tnr = 0\n", 571 | " else:\n", 572 | " tnr = tn/(negative)\n", 573 | " metrics['TrueNegativeRate'] = tnr\n", 574 | " \n", 575 | " # Precision = True Positive/(True Positive + False Positive)\n", 576 | " # How many positives classified by the algorithm are really positives? (fraction)\n", 577 | " # Precision value closer to 1 is better. closer to 0 is worse\n", 578 | " if tp == 0:\n", 579 | " precision = 0\n", 580 | " else:\n", 581 | " precision = tp/(tp + fp)\n", 582 | " metrics['Precision'] = precision\n", 583 | " \n", 584 | " # Accuracy = (True Positive + True Negative)/(total positive + total negative)\n", 585 | " # How many positives and negatives were correctly classified? (fraction)\n", 586 | " # Accuracy value closer to 1 is better. 
closer to 0 is worse\n", 587 | " accuracy = (tp + tn)/(positive + negative)\n", 588 | " metrics['Accuracy'] = accuracy\n", 589 | " \n", 590 | " # False Positive Rate (FPR, False Alarm) = False Positive/(total negative)\n", 591 | " # How many negatives were mis-classified as positives (fraction)\n", 592 | " # False Positive Rate value closer to 0 is better. closer to 1 is worse\n", 593 | " if fp == 0:\n", 594 | " fpr = 0\n", 595 | " else:\n", 596 | " fpr = fp/(negative)\n", 597 | " metrics['FalsePositiveRate'] = fpr\n", 598 | " \n", 599 | " # False Negative Rate (FNR, Misses) = False Negative/(total Positive)\n", 600 | " # How many positives were mis-classified as negative (fraction)\n", 601 | " # False Negative Rate value closer to 0 is better. closer to 1 is worse\n", 602 | " fnr = fn/(positive)\n", 603 | " metrics['FalseNegativeRate'] = fnr\n", 604 | " \n", 605 | " # F1 Score = harmonic mean of Precision and Recall\n", 606 | " # F1 Score closer to 1 is better. Closer to 0 is worse.\n", 607 | " if precision == 0 or recall == 0:\n", 608 | " f1 = 0\n", 609 | " else: \n", 610 | " f1 = 2*precision*recall/(precision+recall)\n", 611 | "\n", 612 | " metrics['F1'] = f1\n", 613 | " \n", 614 | " return metrics" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": null, 620 | "metadata": {}, 621 | "outputs": [], 622 | "source": [ 623 | "# Reference: \n", 624 | "# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n", 625 | "def plot_confusion_matrix(cm, classes,\n", 626 | " normalize=False,\n", 627 | " title='Confusion matrix',\n", 628 | " cmap=plt.cm.Blues):\n", 629 | " \"\"\"\n", 630 | " This function prints and plots the confusion matrix.\n", 631 | " Normalization can be applied by setting `normalize=True`.\n", 632 | " \"\"\"\n", 633 | " if normalize:\n", 634 | " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", 635 | " #print(\"Normalized confusion matrix\")\n", 636 | " #else:\n", 637 | " # 
print('Confusion matrix, without normalization')\n", 638 | "\n", 639 | " #print(cm)\n", 640 | "\n", 641 | " plt.imshow(cm, interpolation='nearest', cmap=cmap)\n", 642 | " plt.title(title)\n", 643 | " plt.colorbar()\n", 644 | " tick_marks = np.arange(len(classes))\n", 645 | " plt.xticks(tick_marks, classes, rotation=45)\n", 646 | " plt.yticks(tick_marks, classes)\n", 647 | "\n", 648 | " fmt = '.2f' if normalize else 'd'\n", 649 | " thresh = cm.max() / 2.\n", 650 | " for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n", 651 | " plt.text(j, i, format(cm[i, j], fmt),\n", 652 | " horizontalalignment=\"center\",\n", 653 | " color=\"white\" if cm[i, j] > thresh else \"black\")\n", 654 | "\n", 655 | " plt.ylabel('True label')\n", 656 | " plt.xlabel('Predicted label')\n", 657 | " plt.tight_layout()" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": null, 663 | "metadata": {}, 664 | "outputs": [], 665 | "source": [ 666 | "# Compute confusion matrix\n", 667 | "cnf_matrix = confusion_matrix(df_validation['Outcome'], df_validation['predicted_class'],labels=[1,0])" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "# Plot confusion matrix\n", 677 | "plt.figure()\n", 678 | "plot_confusion_matrix(cnf_matrix, classes=['Diabetic','Normal'],\n", 679 | " title='Confusion Matrix')" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": null, 685 | "metadata": {}, 686 | "outputs": [], 687 | "source": [ 688 | "# Plot confusion matrix\n", 689 | "plt.figure()\n", 690 | "plot_confusion_matrix(cnf_matrix, classes=['Diabetic','Normal'],\n", 691 | " title='Confusion Matrix - Fraction', normalize=True)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": null, 697 | "metadata": {}, 698 | "outputs": [], 699 | "source": [ 700 | "metrics = [binary_classifier_metrics(df_validation['Outcome'], 
df_validation['predicted_class'])]\n", 701 | "df_metrics=pd.DataFrame.from_dict(metrics)\n", 702 | "df_metrics.index = ['Model']" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "df_metrics" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "metadata": {}, 718 | "outputs": [], 719 | "source": [ 720 | "print('Counts')\n", 721 | "print(df_metrics[['TruePositive',\n", 722 | " 'FalseNegative',\n", 723 | " 'FalsePositive',\n", 724 | " 'TrueNegative',]].round(2))\n", 725 | "print()\n", 726 | "print('Fractions')\n", 727 | "print(df_metrics[['Recall',\n", 728 | " 'FalseNegativeRate',\n", 729 | " 'FalsePositiveRate',\n", 730 | " 'TrueNegativeRate',]].round(2))\n", 731 | "print()\n", 732 | "\n", 733 | "print(df_metrics[['Precision',\n", 734 | " 'Accuracy',\n", 735 | " 'F1']].round(2))" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [ 744 | "print(classification_report(\n", 745 | " df_validation['Outcome'],\n", 746 | " df_validation['predicted_class'],\n", 747 | " labels=[1,0],\n", 748 | " target_names=['Diabetic','Normal']))" 749 | ] 750 | }, 751 | { 752 | "cell_type": "markdown", 753 | "metadata": {}, 754 | "source": [ 755 | "#### Model Performance not Good Enough?\n", 756 | "#### Debug your Data before you debug your Model!" 
757 | ] 758 | }, 759 | { 760 | "cell_type": "code", 761 | "execution_count": null, 762 | "metadata": {}, 763 | "outputs": [], 764 | "source": [] 765 | } 766 | ], 767 | "metadata": { 768 | "kernelspec": { 769 | "display_name": "Python 3", 770 | "language": "python", 771 | "name": "python3" 772 | }, 773 | "language_info": { 774 | "codemirror_mode": { 775 | "name": "ipython", 776 | "version": 3 777 | }, 778 | "file_extension": ".py", 779 | "mimetype": "text/x-python", 780 | "name": "python", 781 | "nbconvert_exporter": "python", 782 | "pygments_lexer": "ipython3", 783 | "version": "3.7.6" 784 | } 785 | }, 786 | "nbformat": 4, 787 | "nbformat_minor": 2 788 | } 789 | -------------------------------------------------------------------------------- /Notebooks/04-XGBoost_Course_Prepare_Iris_Dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "from sklearn import preprocessing" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "

Iris Classification Dataset

\n", 20 | "\n", 21 | "Input Features:
\n", 22 | "sepal_length,sepal_width,petal_length,petal_width
\n", 23 | "\n", 24 | "Target:
\n", 25 | "Iris plant class
\n", 26 | "\n", 27 | "Objective: Predict iris plant class for a given sepal_length,sepal_width,petal_length,petal_width
\n", 28 | "

Data source: https://archive.ics.uci.edu/ml/datasets/iris

" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "columns = ['encoded_class','sepal_length','sepal_width','petal_length','petal_width']" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Encode Class Labels to integers\n", 47 | "le = preprocessing.LabelEncoder()\n", 48 | "le.fit(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "le.classes_" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "df = pd.read_csv('../Data/iris_all.csv')" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "df['class'].value_counts()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "df.head()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "df.tail()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "le.transform(df['class'])[-5:]" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# Convert Classes to numeric value\n", 112 | "df['encoded_class'] = le.transform(df['class'])" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "df.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 
null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "df.tail()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "# Visualize\n", 140 | "setosa = df['class'] == 'Iris-setosa'\n", 141 | "versicolor = df['class'] == 'Iris-versicolor'\n", 142 | "virginica = df['class'] == 'Iris-virginica'" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "plt.scatter(df[setosa].sepal_length,y=df[setosa].sepal_width, label='setosa',color='g')\n", 152 | "plt.scatter(df[versicolor].sepal_length,y=df[versicolor].sepal_width, label='versicolor',color='r')\n", 153 | "plt.scatter(df[virginica].sepal_length,y=df[virginica].sepal_width, label='virginica',color='b')\n", 154 | "plt.xlabel('length')\n", 155 | "plt.ylabel('width')\n", 156 | "plt.title('Sepal')\n", 157 | "plt.grid(True)\n", 158 | "plt.legend()\n", 159 | "plt.show()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "plt.scatter(df[setosa].petal_length,y=df[setosa].petal_width, label='setosa',color='g')\n", 169 | "plt.scatter(df[versicolor].petal_length,y=df[versicolor].petal_width, label='versicolor',color='r')\n", 170 | "plt.scatter(df[virginica].petal_length,y=df[virginica].petal_width, label='virginica',color='b')\n", 171 | "plt.xlabel('length')\n", 172 | "plt.ylabel('width')\n", 173 | "plt.title('Petal')\n", 174 | "plt.grid(True)\n", 175 | "plt.legend()\n", 176 | "plt.show()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "plt.scatter(df[setosa].petal_length,y=df[setosa].sepal_length, label='setosa',color='g')\n", 186 | "plt.scatter(df[versicolor].petal_length,y=df[versicolor].sepal_length, 
label='versicolor',color='r')\n", 187 | "plt.scatter(df[virginica].petal_length,y=df[virginica].sepal_length, label='virginica',color='b')\n", 188 | "plt.xlabel('petal length')\n", 189 | "plt.ylabel('sepal length')\n", 190 | "plt.title('Petal-Sepal')\n", 191 | "plt.grid(True)\n", 192 | "plt.legend()\n", 193 | "plt.show()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "## Training and Validation Set\n", 201 | "### Target Variable as first column followed by input features:\n", 202 | "encoded_class,sepal_length,sepal_width,petal_length,petal_width\n", 203 | "### Training, Validation files do not have a column header" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "# Training = 70% of the data\n", 213 | "# Validation = 30% of the data\n", 214 | "# Randomize the dataset\n", 215 | "np.random.seed(5)\n", 216 | "l = list(df.index)\n", 217 | "np.random.shuffle(l)\n", 218 | "df = df.iloc[l]" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "rows = df.shape[0]\n", 228 | "train = int(.7 * rows)\n", 229 | "test = rows-train" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "rows, train, test" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "# Write Training Set\n", 248 | "df[:train].to_csv('../Data/iris_train.csv'\n", 249 | " ,index=False,header=False\n", 250 | " ,columns=columns)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "# Write Validation Set\n", 260 | "df[train:].to_csv('../Data/iris_validation.csv'\n", 261 | " 
,index=False,header=False\n", 262 | " ,columns=columns)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "# Write Column List\n", 272 | "with open('../Data/iris_train_column_list.txt','w') as f:\n", 273 | " f.write(','.join(columns))" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.7.6" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 1 305 | } 306 | -------------------------------------------------------------------------------- /Notebooks/05-XGBoost_Course_Multiclass_Classification_Iris_Dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "nbpresent": { 7 | "id": "782a07bf-08de-4030-88e1-6731c4ac956e" 8 | } 9 | }, 10 | "source": [ 11 | "## Train a model with Iris data using XGBoost algorithm\n", 12 | "### Model is trained with XGBoost installed in notebook instance\n", 13 | "### In the later examples, we will train using SageMaker's XGBoost algorithm" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "nbpresent": { 21 | "id": "6c6a8672-d428-410a-82fa-7f587c9ef2ae" 22 | } 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# Install xgboost in notebook instance.\n", 27 | "#### Command to install xgboost\n", 28 | "# !pip install xgboost==0.90" 29 | ] 30 
| }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "nbpresent": { 36 | "id": "652b58d4-3b75-405f-9f11-24d0cd1f9656" 37 | } 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "import sys\n", 42 | "import numpy as np\n", 43 | "import pandas as pd\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "import itertools\n", 46 | "import xgboost as xgb\n", 47 | "\n", 48 | "from sklearn import preprocessing\n", 49 | "from sklearn.metrics import classification_report, confusion_matrix" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "nbpresent": { 57 | "id": "a3946273-d086-4564-b0f1-6adc225191c3" 58 | } 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "column_list_file = '../Data/iris_train_column_list.txt'\n", 63 | "train_file = '../Data/iris_train.csv'\n", 64 | "validation_file = '../Data/iris_validation.csv'" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "nbpresent": { 72 | "id": "7c803d6c-74cc-40d2-ab48-747ff4346c22" 73 | } 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "columns = ''\n", 78 | "with open(column_list_file,'r') as f:\n", 79 | " columns = f.read().split(',')" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "nbpresent": { 87 | "id": "630dde8d-44b9-415d-8876-4e873407d0fc" 88 | } 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "columns" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Encode Class Labels to integers\n", 102 | "# Labeled Classes\n", 103 | "labels=[0,1,2]\n", 104 | "classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']\n", 105 | "le = preprocessing.LabelEncoder()\n", 106 | "le.fit(classes)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "nbpresent": { 114 | "id": 
"d6ff2283-cb13-468f-b0cc-0aefeab7b57f" 115 | } 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "# Specify the column names as the file does not have column header\n", 120 | "df_train = pd.read_csv(train_file,names=columns)\n", 121 | "df_validation = pd.read_csv(validation_file,names=columns)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "nbpresent": { 129 | "id": "a195ae30-1962-4427-859b-73a013dc10d6" 130 | } 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "df_train.head()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "nbpresent": { 142 | "id": "e30e8aeb-1ca2-4851-bc2d-1bdee29ab1cf" 143 | } 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "df_validation.head()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "nbpresent": { 155 | "id": "3b240613-803d-4fa9-93cf-53ef68df7b93" 156 | } 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "X_train = df_train.iloc[:,1:] # Features: 1st column onwards \n", 161 | "y_train = df_train.iloc[:,0].ravel() # Target: 0th column\n", 162 | "\n", 163 | "X_validation = df_validation.iloc[:,1:]\n", 164 | "y_validation = df_validation.iloc[:,0].ravel()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "nbpresent": { 172 | "id": "9edc89e7-45d3-4350-9eb4-3e0938c3c55e" 173 | } 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "# Launch a classifier\n", 178 | "# XGBoost Training Parameter Reference: \n", 179 | "# https://xgboost.readthedocs.io/en/latest/parameter.html\n", 180 | "\n", 181 | "classifier = xgb.XGBClassifier(objective=\"multi:softmax\",\n", 182 | " num_class=3,\n", 183 | " n_estimators=100, use_label_encoder=False)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "nbpresent": { 191 | "id": 
"348296fb-8c9b-4598-ad2e-d1fe8e10f76a" 192 | } 193 | }, 194 | "outputs": [], 195 | "source": [ 196 | "classifier" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "nbpresent": { 204 | "id": "9839d7ce-e791-4d93-bc5f-28604ffde022" 205 | } 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "classifier.fit(X_train,\n", 210 | " y_train,\n", 211 | " eval_set = [(X_train, y_train), (X_validation, y_validation)],\n", 212 | " eval_metric=['mlogloss'],\n", 213 | " early_stopping_rounds=10)\n", 214 | "\n", 215 | "# early_stopping_rounds - needs to be passed in as a hyperparameter in SageMaker XGBoost implementation\n", 216 | "# \"The model trains until the validation score stops improving. \n", 217 | "# Validation error needs to decrease at least every early_stopping_rounds to continue training.\n", 218 | "# Amazon SageMaker hosting uses the best model for inference.\"" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "nbpresent": { 226 | "id": "e08f22c1-4346-4e2d-96a2-9974ed5c59ff" 227 | } 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "eval_result = classifier.evals_result()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "nbpresent": { 239 | "id": "092776c3-a611-4f40-91e2-664b3b99d05e" 240 | } 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "training_rounds = range(len(eval_result['validation_0']['mlogloss']))" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "nbpresent": { 252 | "id": "2e9af3f7-fb85-4c52-83d5-ff9cae457294" 253 | } 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "print(training_rounds)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "nbpresent": { 265 | "id": "5e71239a-e321-43ba-ac2c-993b57b3be3a" 266 | } 267 | }, 268 | "outputs": [], 
269 | "source": [ 270 | "plt.scatter(x=training_rounds,y=eval_result['validation_0']['mlogloss'],label='Training Error')\n", 271 | "plt.scatter(x=training_rounds,y=eval_result['validation_1']['mlogloss'],label='Validation Error')\n", 272 | "plt.grid(True)\n", 273 | "plt.xlabel('Iteration')\n", 274 | "plt.ylabel('LogLoss')\n", 275 | "plt.title('Training Vs Validation Error')\n", 276 | "plt.legend()\n", 277 | "plt.show()" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "nbpresent": { 285 | "id": "f144f315-6d38-429e-8c17-06c17a446198" 286 | } 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "xgb.plot_importance(classifier)\n", 291 | "plt.show()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "nbpresent": { 299 | "id": "3312675d-307c-4eff-b835-34f0e7f57924" 300 | } 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "df = pd.read_csv(validation_file,names=columns)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "nbpresent": { 312 | "id": "afad019f-88df-4893-bb3d-b7f2b7db214b" 313 | } 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "df.head()" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "nbpresent": { 325 | "id": "9b5cb70d-6069-4511-810e-fd17e72667dd" 326 | } 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "X_test = df.iloc[:,1:]\n", 331 | "print(X_test[:5])" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": { 338 | "nbpresent": { 339 | "id": "f611c852-50e3-4a1a-9134-c1c6e82ad780" 340 | } 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "result = classifier.predict(X_test)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "result[:5]" 354 | ] 355 | }, 
356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "nbpresent": { 361 | "id": "2c573c2b-4143-4e01-b107-e6b871ce0249" 362 | } 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "df['predicted_class'] = result #le.inverse_transform(result)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "nbpresent": { 374 | "id": "5ad0fa04-6896-46b5-bc23-40d61480d7ca" 375 | } 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "df.head()" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "# Compare performance of Actual and Model 1 Prediction\n", 389 | "plt.figure()\n", 390 | "plt.scatter(df.index,df['encoded_class'],label='Actual')\n", 391 | "plt.scatter(df.index,df['predicted_class'],label='Predicted',marker='^')\n", 392 | "plt.legend(loc=4)\n", 393 | "plt.yticks([0,1,2])\n", 394 | "plt.xlabel('Sample')\n", 395 | "plt.ylabel('Class')\n", 396 | "plt.show()" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "

Confusion Matrix

\n", 404 | "A confusion matrix is a table that summarizes the performance of a classification model.

" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "# Reference: \n", 414 | "# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n", 415 | "def plot_confusion_matrix(cm, classes,\n", 416 | " normalize=False,\n", 417 | " title='Confusion matrix',\n", 418 | " cmap=plt.cm.Blues):\n", 419 | " \"\"\"\n", 420 | " This function prints and plots the confusion matrix.\n", 421 | " Normalization can be applied by setting `normalize=True`.\n", 422 | " \"\"\"\n", 423 | " if normalize:\n", 424 | " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", 425 | " #print(\"Normalized confusion matrix\")\n", 426 | " #else:\n", 427 | " # print('Confusion matrix, without normalization')\n", 428 | "\n", 429 | " #print(cm)\n", 430 | "\n", 431 | " plt.imshow(cm, interpolation='nearest', cmap=cmap)\n", 432 | " plt.title(title)\n", 433 | " plt.colorbar()\n", 434 | " tick_marks = np.arange(len(classes))\n", 435 | " plt.xticks(tick_marks, classes, rotation=45)\n", 436 | " plt.yticks(tick_marks, classes)\n", 437 | "\n", 438 | " fmt = '.2f' if normalize else 'd'\n", 439 | " thresh = cm.max() / 2.\n", 440 | " for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n", 441 | " plt.text(j, i, format(cm[i, j], fmt),\n", 442 | " horizontalalignment=\"center\",\n", 443 | " color=\"white\" if cm[i, j] > thresh else \"black\")\n", 444 | "\n", 445 | " plt.ylabel('True label')\n", 446 | " plt.xlabel('Predicted label')\n", 447 | " plt.tight_layout()" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "# Compute confusion matrix\n", 457 | "cnf_matrix = confusion_matrix(df['encoded_class'],\n", 458 | " df['predicted_class'],labels=labels)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | 
"outputs": [], 466 | "source": [ 467 | "cnf_matrix" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "# Plot confusion matrix\n", 477 | "plt.figure()\n", 478 | "plot_confusion_matrix(cnf_matrix, classes=classes,\n", 479 | " title='Confusion matrix - Count')" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "# Plot confusion matrix\n", 489 | "plt.figure()\n", 490 | "plot_confusion_matrix(cnf_matrix, classes=classes,\n", 491 | " title='Confusion matrix - Count',normalize=True)" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "print(classification_report(\n", 501 | " df['encoded_class'],\n", 502 | " df['predicted_class'],\n", 503 | " labels=labels,\n", 504 | " target_names=classes))" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "### Well Done!" 
512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [] 520 | } 521 | ], 522 | "metadata": { 523 | "kernelspec": { 524 | "display_name": "Python 3", 525 | "language": "python", 526 | "name": "python3" 527 | }, 528 | "language_info": { 529 | "codemirror_mode": { 530 | "name": "ipython", 531 | "version": 3 532 | }, 533 | "file_extension": ".py", 534 | "mimetype": "text/x-python", 535 | "name": "python", 536 | "nbconvert_exporter": "python", 537 | "pygments_lexer": "ipython3", 538 | "version": "3.7.6" 539 | } 540 | }, 541 | "nbformat": 4, 542 | "nbformat_minor": 2 543 | } 544 | -------------------------------------------------------------------------------- /Notebooks/06-XGBoost-TimeSeries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "More here:\n", 8 | "https://machinelearningmastery.com/xgboost-for-time-series-forecasting/\n", 9 | " \n", 10 | "And here:\n", 11 | "https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# forecast monthly births with xgboost\n", 21 | "from numpy import asarray\n", 22 | "from pandas import read_csv\n", 23 | "from pandas import DataFrame\n", 24 | "from pandas import concat\n", 25 | "from sklearn.metrics import mean_absolute_error\n", 26 | "from xgboost import XGBRegressor\n", 27 | "from matplotlib import pyplot" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# transform a time series dataset into a supervised learning dataset\n", 37 | "def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):\n", 38 | " n_vars = 1 if type(data) is list else 
data.shape[1]\n", 39 | " df = DataFrame(data)\n", 40 | " cols = list()\n", 41 | " # input sequence (t-n, ... t-1)\n", 42 | " for i in range(n_in, 0, -1):\n", 43 | " cols.append(df.shift(i))\n", 44 | " # forecast sequence (t, t+1, ... t+n)\n", 45 | " for i in range(0, n_out):\n", 46 | " cols.append(df.shift(-i))\n", 47 | " # put it all together\n", 48 | " agg = concat(cols, axis=1)\n", 49 | " # drop rows with NaN values\n", 50 | " if dropnan:\n", 51 | " agg.dropna(inplace=True)\n", 52 | " return agg.values\n", 53 | "\n", 54 | "# split a univariate dataset into train/test sets\n", 55 | "def train_test_split(data, n_test):\n", 56 | " return data[:-n_test, :], data[-n_test:, :]\n", 57 | "\n", 58 | "# fit an xgboost model and make a one step prediction\n", 59 | "def xgboost_forecast(train, testX):\n", 60 | " # transform list into array\n", 61 | " train = asarray(train)\n", 62 | " # split into input and output columns\n", 63 | " trainX, trainy = train[:, :-1], train[:, -1]\n", 64 | " # fit model\n", 65 | " model = XGBRegressor(objective='reg:squarederror', n_estimators=1000)\n", 66 | " model.fit(trainX, trainy)\n", 67 | " # make a one-step prediction\n", 68 | " yhat = model.predict(asarray([testX]))\n", 69 | " return yhat[0]\n", 70 | "\n", 71 | "# walk-forward validation for univariate data\n", 72 | "def walk_forward_validation(data, n_test):\n", 73 | " predictions = list()\n", 74 | " # split dataset\n", 75 | " train, test = train_test_split(data, n_test)\n", 76 | " # seed history with training dataset\n", 77 | " history = [x for x in train]\n", 78 | " # step over each time-step in the test set\n", 79 | " for i in range(len(test)):\n", 80 | " # split test row into input and output columns\n", 81 | " testX, testy = test[i, :-1], test[i, -1]\n", 82 | " # fit model on history and make a prediction\n", 83 | " yhat = xgboost_forecast(history, testX)\n", 84 | " # store forecast in list of predictions\n", 85 | " predictions.append(yhat)\n", 86 | " # add actual observation to 
history for the next loop\n", 87 | " history.append(test[i])\n", 88 | " # summarize progress\n", 89 | " print('>expected=%.1f, predicted=%.1f' % (testy, yhat))\n", 90 | " # estimate prediction error\n", 91 | " error = mean_absolute_error(test[:, -1], predictions)\n", 92 | " return error, test[:, -1], predictions" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# load the dataset\n", 102 | "# Daily total female births in California, 1959\n", 103 | "# Source: Time Series Data Library (citing: Newton (1988))\n", 104 | " \n", 105 | "series = read_csv('../Data/daily-total-female-births.csv', header=0, index_col=0)\n", 106 | "values = series.values\n", 107 | "# transform the time series data into supervised learning\n", 108 | "data = series_to_supervised(values, n_in=6)\n", 109 | "# evaluate\n", 110 | "mae, y, yhat = walk_forward_validation(data, 12)\n", 111 | "print('MAE: %.3f' % mae)\n", 112 | "# plot expected vs preducted\n", 113 | "pyplot.plot(y, label='Expected')\n", 114 | "pyplot.plot(yhat, label='Predicted')\n", 115 | "pyplot.legend()\n", 116 | "pyplot.show()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "Python 3", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.7.6" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 4 148 | } 149 | -------------------------------------------------------------------------------- 
/Notebooks/07-XGBoost_Feature_Importance_Selection_Diabetes_Dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "nbpresent": { 7 | "id": "782a07bf-08de-4030-88e1-6731c4ac956e" 8 | } 9 | }, 10 | "source": [ 11 | "## Diabetes dataset \n", 12 | "### Predict if a person is at risk of developing diabetes\n", 13 | "\n", 14 | "### This Dataset is Freely Available\n", 15 | "\n", 16 | "### Overview:\n", 17 | "The data was collected and made available by the \"National Institute of Diabetes and Digestive and Kidney Diseases\" as part of the Pima Indians Diabetes Database. \n", 18 | "\n", 19 | "`Diabetes.csv` is available [from Kaggle](https://www.kaggle.com/uciml/pima-indians-diabetes-database). We have several questions - what information is more correlated with a positive diagnosis, and if we can only ask two questions to a patient, what should we ask and how would we give them a risk of being diagnosed.\n", 20 | "\n", 21 | "++++++++++++++++++++++++++++++++++++\n", 22 | "\n", 23 | "The following features have been provided to help us predict whether a person is diabetic or not:\n", 24 | "* **Pregnancies:** Number of times pregnant\n", 25 | "* **Glucose:** Plasma glucose concentration over 2 hours in an oral glucose tolerance test\n", 26 | "* **BloodPressure:** Diastolic blood pressure (mm Hg)\n", 27 | "* **SkinThickness:** Triceps skin fold thickness (mm)\n", 28 | "* **Insulin:** 2-Hour serum insulin (mu U/ml)\n", 29 | "* **BMI:** Body mass index (weight in kg/(height in m)^2)\n", 30 | "* **DiabetesPedigreeFunction:** Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)\n", 31 | "* **Age:** Age (years)\n", 32 | "* **Outcome:** Class variable (0 if non-diabetic, 1 if diabetic)\n", 33 | "\n", 34 | "### Binary Classification problem - XGBoost" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | 
"execution_count": null, 40 | "metadata": { 41 | "nbpresent": { 42 | "id": "6c6a8672-d428-410a-82fa-7f587c9ef2ae" 43 | } 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# Install xgboost in notebook instance.\n", 48 | "#### Command to install xgboost\n", 49 | "#!pip install xgboost==0.90" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "nbpresent": { 57 | "id": "652b58d4-3b75-405f-9f11-24d0cd1f9656" 58 | } 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "import numpy as np\n", 63 | "import pandas as pd\n", 64 | "import matplotlib.pyplot as plt\n", 65 | "\n", 66 | "import xgboost as xgb\n", 67 | "\n", 68 | "\n", 69 | "from sklearn.model_selection import train_test_split\n", 70 | "from xgboost import plot_importance\n", 71 | "\n", 72 | "from sklearn.metrics import accuracy_score\n", 73 | "from sklearn.feature_selection import SelectFromModel\n", 74 | "\n", 75 | "import warnings\n", 76 | "warnings.filterwarnings('ignore')" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "nbpresent": { 84 | "id": "a3946273-d086-4564-b0f1-6adc225191c3" 85 | } 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "data = pd.read_csv(\"../Data/Diabetes.csv\")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "data.describe()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "data.info()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "## only keep rows where non of the columns has 0 value (except the first and last columns)\n", 117 | "data = data[~(data[data.columns[1:-1]] == 0).any(axis=1)]\n", 118 | "data.reset_index(inplace=True, drop = True)" 119 | ] 120 | }, 121 | { 122 | "cell_type": 
"markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "### Dealing with Missing Values" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# using isnull() function \n", 135 | "# print(data.isnull().any().sum())\n", 136 | "print(data.isnull().sum())\n", 137 | "#data.isnull()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "data.drop(columns=['Insulin'], inplace = True)\n", 147 | "data.reset_index(inplace=True, drop = True)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "### Replace missing values in each column with the mean or median of that column\n", 157 | "#data.fillna(data.mean())\n", 158 | "data.fillna(data.median(), inplace=True)\n", 159 | "\n", 160 | "### Drop all rows that contain missing values?\n", 161 | "#data = data.dropna()\n", 162 | "#data.reset_index(inplace=True, drop = True)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "### Split Data" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "X = data.iloc[:,:-1] # Features: all columns excep last\n", 179 | "y = data.iloc[:,-1].ravel() # Target: last column\n", 180 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "### Launch XGBoost classifier" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "nbpresent": { 195 | "id": "9edc89e7-45d3-4350-9eb4-3e0938c3c55e" 196 | } 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "# Launch a 
classifier\n", 201 | "# XGBoost Training Parameter Reference: \n", 202 | "# https://xgboost.readthedocs.io/en/latest/parameter.html\n", 203 | "classifier = xgb.XGBClassifier (objective=\"binary:logistic\")" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "nbpresent": { 211 | "id": "348296fb-8c9b-4598-ad2e-d1fe8e10f76a" 212 | } 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "classifier" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "nbpresent": { 224 | "id": "9839d7ce-e791-4d93-bc5f-28604ffde022" 225 | } 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "classifier.fit(X_train,\n", 230 | " y_train, \n", 231 | " eval_metric=['logloss'])" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### Plot Feature Importance" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "nbpresent": { 246 | "id": "e08f22c1-4346-4e2d-96a2-9974ed5c59ff" 247 | } 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "# plot feature importance\n", 252 | "plot_importance(classifier)\n", 253 | "plt.show()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "nbpresent": { 260 | "id": "3312675d-307c-4eff-b835-34f0e7f57924" 261 | } 262 | }, 263 | "source": [ 264 | "### Feature Selection using Feature Importance\n", 265 | "* Feature importance scores can be used for feature selection in scikit-learn.\n", 266 | "* This is done using the SelectFromModel class that takes a model and can transform a dataset into a subset with selected features.\n", 267 | "* This class can take a pre-trained model, such as one trained on the entire training dataset. \n", 268 | "* It can then use a threshold to decide which features to select. 
\n", 269 | "* This threshold is used when you call the transform() method on the SelectFromModel instance to consistently select the same features on the training dataset and the test dataset.\n" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "nbpresent": { 277 | "id": "9b5cb70d-6069-4511-810e-fd17e72667dd" 278 | } 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "# fit model on all training data\n", 283 | "model = xgb.XGBClassifier(objective=\"binary:logistic\", use_label_encoder =False)\n", 284 | "model.fit(X_train, y_train, eval_metric=['logloss'])\n", 285 | "# make predictions for test data and evaluate\n", 286 | "y_pred = model.predict(X_test)\n", 287 | "predictions = [round(value) for value in y_pred]\n", 288 | "accuracy = accuracy_score(y_test, predictions)\n", 289 | "print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))\n", 290 | "# Fit model using each importance as a threshold\n", 291 | "thresholds = np.sort(model.feature_importances_)\n", 292 | "for thresh in thresholds:\n", 293 | " # select features using threshold\n", 294 | " selection = SelectFromModel(model, threshold=thresh, prefit=True)\n", 295 | " select_X_train = selection.transform(X_train)\n", 296 | " # train model\n", 297 | " selection_model = xgb.XGBClassifier(objective=\"binary:logistic\", use_label_encoder =False)\n", 298 | " selection_model.fit(select_X_train, y_train, eval_metric=['logloss'])\n", 299 | " # eval model\n", 300 | " select_X_test = selection.transform(X_test)\n", 301 | " y_pred = selection_model.predict(select_X_test)\n", 302 | " predictions = [round(value) for value in y_pred]\n", 303 | " accuracy = accuracy_score(y_test, predictions)\n", 304 | " print(\"Thresh=%.3f, n=%d, Accuracy: %.2f%%\" % (thresh, select_X_train.shape[1], accuracy*100.0))" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": { 310 | "nbpresent": { 311 | "id": "f611c852-50e3-4a1a-9134-c1c6e82ad780" 312 | } 313 | }, 314 | 
"source": [ 315 | "You can see that the performance of the model generally decreases with the number of selected features." 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [] 324 | } 325 | ], 326 | "metadata": { 327 | "kernelspec": { 328 | "display_name": "Python 3", 329 | "language": "python", 330 | "name": "python3" 331 | }, 332 | "language_info": { 333 | "codemirror_mode": { 334 | "name": "ipython", 335 | "version": 3 336 | }, 337 | "file_extension": ".py", 338 | "mimetype": "text/x-python", 339 | "name": "python", 340 | "nbconvert_exporter": "python", 341 | "pygments_lexer": "ipython3", 342 | "version": "3.7.6" 343 | } 344 | }, 345 | "nbformat": 4, 346 | "nbformat_minor": 2 347 | } 348 | -------------------------------------------------------------------------------- /Notebooks/08-XGBoost_Hyperparameter_Tuning_Diabetes_Dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "nbpresent": { 7 | "id": "782a07bf-08de-4030-88e1-6731c4ac956e" 8 | } 9 | }, 10 | "source": [ 11 | "## XGBoost Hyperparameter Tuning\n", 12 | "This is a nice tutorial\n", 13 | "https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "nbpresent": { 21 | "id": "6c6a8672-d428-410a-82fa-7f587c9ef2ae" 22 | } 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# Install xgboost in notebook instance.\n", 27 | "#### Command to install xgboost\n", 28 | "#!pip install xgboost==0.90" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "nbpresent": { 36 | "id": "652b58d4-3b75-405f-9f11-24d0cd1f9656" 37 | } 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "import numpy as np\n", 42 | "import pandas 
as pd\n", 43 | "import matplotlib.pyplot as plt\n", 44 | "\n", 45 | "import xgboost as xgb\n", 46 | "\n", 47 | "from sklearn.model_selection import train_test_split\n", 48 | "\n", 49 | "#from sklearn import cross_validation, metrics #Additional scklearn functions\n", 50 | "from sklearn.model_selection import GridSearchCV #Performing grid search\n", 51 | "\n", 52 | "import warnings\n", 53 | "warnings.filterwarnings('ignore')" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "nbpresent": { 61 | "id": "a3946273-d086-4564-b0f1-6adc225191c3" 62 | } 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "data = pd.read_csv(\"../Data/Diabetes.csv\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "data.describe()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "data.info()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "## only keep rows where non of the columns has 0 value (except the first and last columns)\n", 94 | "data = data[~(data[data.columns[1:-1]] == 0).any(axis=1)]\n", 95 | "data.reset_index(inplace=True, drop = True)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Dealing with Missing Values" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# using isnull() function \n", 112 | "# print(data.isnull().any().sum())\n", 113 | "print(data.isnull().sum())\n", 114 | "#data.isnull()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "data.drop(columns=['Insulin'], inplace = True)\n", 124 | 
"data.reset_index(inplace=True, drop = True)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "### Replace missing values in each column with the mean or median of that column\n", 134 | "#data.fillna(data.mean())\n", 135 | "data.fillna(data.median(), inplace=True)\n", 136 | "\n", 137 | "### Drop all rows that contain missing values?\n", 138 | "#data = data.dropna()\n", 139 | "#data.reset_index(inplace=True, drop = True)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "### Split Data" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "X = data.iloc[:,:-1] # Features: all columns excep last\n", 156 | "y = data.iloc[:,-1].ravel() # Target: last column\n", 157 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "### Useful Function\n", 165 | "* This function will help us create XGBoost models and perform cross-validation. \n", 166 | "* The best part is that you can take this function as it is and use it later for your own models." 
167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "nbpresent": { 174 | "id": "9edc89e7-45d3-4350-9eb4-3e0938c3c55e" 175 | } 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50, target='Outcome'):\n", 180 | "    \"\"\"Optionally tune n_estimators with xgb.cv, fit alg on dtrain[predictors] vs dtrain[target], then print train accuracy/AUC and plot feature importances.\"\"\"\n", 181 | "    from sklearn import metrics  # local import: the notebook's import cell has the sklearn metrics import commented out\n", 182 | "    if useTrainCV:\n", 183 | "        xgb_param = alg.get_xgb_params()\n", 184 | "        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)\n", 185 | "        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,\n", 186 | "            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)\n", 187 | "        alg.set_params(n_estimators=cvresult.shape[0])  # one cvresult row per boosting round kept by xgb.cv\n", 188 | "    \n", 189 | "    #Fit the algorithm on the data\n", 190 | "    alg.fit(dtrain[predictors], dtrain[target],eval_metric='auc')\n", 191 | "    #Predict training set:\n", 192 | "    dtrain_predictions = alg.predict(dtrain[predictors])\n", 193 | "    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]\n", 194 | "    \n", 195 | "    #Print model report:\n", 196 | "    print (\"\\nModel Report\")\n", 197 | "    print (\"Accuracy : %.4g\" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))\n", 198 | "    print (\"AUC Score (Train): %f\" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))\n", 199 | "    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)\n", 200 | "    feat_imp.plot(kind='bar', title='Feature Importances')\n", 201 | "    plt.ylabel('Feature Importance Score')" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "### Tune max_depth and min_child_weight" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "nbpresent": { 216 | "id": "e08f22c1-4346-4e2d-96a2-9974ed5c59ff" 217 | } 218 | }, 219 | "outputs": [], 220 | "source": [ 221 |
"%%time\n", 222 | "param_test2 = {\n", 223 | " 'max_depth':[4,5,6],\n", 224 | " 'min_child_weight':[4,5,6]\n", 225 | "}\n", 226 | "gsearch2 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,\n", 227 | " min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,\n", 228 | " objective= 'binary:logistic',eval_metric ='logloss', nthread=4, scale_pos_weight=1,seed=27), \n", 229 | " param_grid = param_test2, scoring='roc_auc',n_jobs=4, cv=5, return_train_score=True)  # iid=False dropped: GridSearchCV no longer accepts iid (removed in scikit-learn 0.24)\n", 230 | "\n", 231 | "gsearch2.fit(X_train ,y_train)\n", 232 | "\n", 233 | "print(gsearch2.best_params_)\n", 234 | "print(gsearch2.best_score_)\n", 235 | "#print(gsearch2.cv_results_)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "dir(gsearch2)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "Python 3", 258 | "language": "python", 259 | "name": "python3" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython3", 271 | "version": "3.7.6" 272 | } 273 | }, 274 | "nbformat": 4, 275 | "nbformat_minor": 2 276 | } 277 | -------------------------------------------------------------------------------- /Notebooks/09-AWS_XGBoost_Train_Host_Predict.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SageMaker's XGBoost Built-in Algorithm on AWS" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 |
"metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Install SageMaker and boto if you don't already have them\n", 17 | "#!pip install --upgrade sagemaker\n", 18 | "#!pip install --upgrade boto3" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Remember these four steps:\n", 26 | "1) Upload Train and Validation files to S3\n", 27 | "\n", 28 | "2) Specify Algorithm and Hyperparameters\n", 29 | "\n", 30 | "3) Configure type of server and number of servers to use for Training\n", 31 | "\n", 32 | "4) Create a real-time Endpoint for interactive use case" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Import required libraries" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# Import required libraries\n", 49 | "import numpy as np\n", 50 | "import pandas as pd\n", 51 | "\n", 52 | "import boto3\n", 53 | "import re\n", 54 | "\n", 55 | "import sagemaker\n", 56 | "from sagemaker import get_execution_role\n", 57 | "# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "sagemaker.__version__" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "sagemaker.__version__" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## Upload Data to S3" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Make sure you specify your own bucket name\n", 92 | "bucket_name = 'aws-ml-test-nsadawi'\n", 93 | "\n", 94 | "training_folder = r'bikerental/training/'\n", 95 | "validation_folder = 
r'bikerental/validation/'\n", 96 | "test_folder = r'bikerental/test/'\n", 97 | "\n", 98 | "s3_model_output_location = r's3://{0}/bikerental/model'.format(bucket_name)\n", 99 | "s3_training_file_location = r's3://{0}/{1}'.format(bucket_name,training_folder)\n", 100 | "s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name,validation_folder)\n", 101 | "s3_test_file_location = r's3://{0}/{1}'.format(bucket_name,test_folder)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "print(s3_model_output_location)\n", 111 | "print(s3_training_file_location)\n", 112 | "print(s3_validation_file_location)\n", 113 | "print(s3_test_file_location)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# Write and Reading from S3 is just as easy\n", 123 | "# files are referred as objects in S3. \n", 124 | "# file name is referred as key name in S3\n", 125 | "\n", 126 | "# File stored in S3 is automatically replicated across 3 different availability zones \n", 127 | "# in the region where the bucket was created.\n", 128 | "\n", 129 | "# http://boto3.readthedocs.io/en/latest/guide/s3.html\n", 130 | "def write_to_s3(filename, bucket, key):\n", 131 | " with open(filename,'rb') as f: # Read in binary mode\n", 132 | " return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "write_to_s3('../Data/bike_train.csv', \n", 142 | " bucket_name,\n", 143 | " training_folder + 'bike_train.csv')\n", 144 | "\n", 145 | "write_to_s3('../Data/bike_validation.csv',\n", 146 | " bucket_name,\n", 147 | " validation_folder + 'bike_validation.csv')\n", 148 | "\n", 149 | "write_to_s3('../Data/bike_test.csv',\n", 150 | " 
bucket_name,\n", 151 | " test_folder + 'bike_test.csv')" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Training Algorithm Docker Image\n", 159 | "### SageMaker maintains a separate image for algorithm and region\n", 160 | "https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# Establish a session with AWS\n", 170 | "sess = sagemaker.Session()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "#### Important to use an IAM Role\n", 178 | "https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "try:\n", 188 | " role = sagemaker.get_execution_role()\n", 189 | "except ValueError:\n", 190 | " iam = boto3.client('iam')\n", 191 | " #arn:aws:iam::479320215787:role/service-role/AmazonSageMaker-ExecutionRole-20210306T134306\n", 192 | " role = iam.get_role(RoleName='AmazonSageMaker-ExecutionRole-20210306T134306')['Role']['Arn']" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "# This role contains the permissions needed to train, deploy models\n", 202 | "# SageMaker Service is trusted to assume this role\n", 203 | "print(role)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "# https://sagemaker.readthedocs.io/en/stable/api/utility/image_uris.html#sagemaker.image_uris.retrieve\n", 213 | "\n", 214 | "# SDK 2 uses image_uris.retrieve the container image location\n", 215 | "\n", 216 | "# Use XGBoost 1.2 version \n", 217 | 
"container = sagemaker.image_uris.retrieve(\"xgboost\",sess.boto_region_name,version=\"1.2-1\")\n", 218 | "\n", 219 | "print (f'Using XGBoost Container {container}')" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "## Build Model" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "# Configure the training job\n", 236 | "# Specify type and number of instances to use\n", 237 | "# S3 location where final artifacts need to be stored\n", 238 | "\n", 239 | "# Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html\n", 240 | "\n", 241 | "# SDK 2.x version does not require train prefix for instance count and type\n", 242 | "estimator = sagemaker.estimator.Estimator(\n", 243 | " container,\n", 244 | " role,\n", 245 | " instance_count=1,\n", 246 | " instance_type='ml.m4.xlarge',\n", 247 | " output_path=s3_model_output_location,\n", 248 | " sagemaker_session=sess,\n", 249 | " base_job_name = 'xgboost-bikerental-v1')" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "# Specify hyper parameters that appropriate for the training algorithm\n", 259 | "# XGBoost Training Parameter Reference\n", 260 | "# https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst#learning-task-parameters\n", 261 | "\n", 262 | "# TODO: objective xgboost has deprecated reg:linear. 
use reg:squarederror instead\n", 263 | "estimator.set_hyperparameters(max_depth=5,\n", 264 | " objective=\"reg:squarederror\",\n", 265 | " eta=0.1,\n", 266 | " num_round=150)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "estimator.hyperparameters()" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": { 281 | "collapsed": true 282 | }, 283 | "source": [ 284 | "### Specify Training Data Location and Optionally, Validation Data Location" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "# content type can be libsvm or csv for XGBoost\n", 294 | "training_input_config = sagemaker.session.TrainingInput(\n", 295 | " s3_data=s3_training_file_location,\n", 296 | " content_type='csv',\n", 297 | " s3_data_type='S3Prefix')\n", 298 | "\n", 299 | "validation_input_config = sagemaker.session.TrainingInput(\n", 300 | " s3_data=s3_validation_file_location,\n", 301 | " content_type='csv',\n", 302 | " s3_data_type='S3Prefix'\n", 303 | ")\n", 304 | "\n", 305 | "data_channels = {'train': training_input_config, 'validation': validation_input_config}" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "print(training_input_config.config)\n", 315 | "print(validation_input_config.config)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "### Train the model (takes a few minutes)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "%%time\n", 332 | "# XGBoost supports \"train\", \"validation\" channels\n", 333 | "# Reference: Supported channels by algorithm\n", 334 | "# 
https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html\n", 335 | "estimator.fit(data_channels)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "## Deploy Model (takes a few minutes)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "%%time\n", 352 | "# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html\n", 353 | "predictor = estimator.deploy(initial_instance_count=1,\n", 354 | " instance_type='ml.m4.xlarge',\n", 355 | " endpoint_name = 'xgboost-bikerental-v2')" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": { 361 | "collapsed": true 362 | }, 363 | "source": [ 364 | "## Make Predictions" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "# SDK 2.0 serializers\n", 374 | "from sagemaker.serializers import CSVSerializer" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "predictor.serializer = CSVSerializer()" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "predictor.predict([[3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3]])" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": { 398 | "collapsed": true 399 | }, 400 | "source": [ 401 | "## Summary" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "1. Ensure Training, Test and Validation data are in S3 Bucket\n", 409 | "2. Select Algorithm Container Registry Path - Path varies by region\n", 410 | "3. Configure Estimator for training - Specify Algorithm container, instance count, instance type, model output location\n", 411 | "4. 
Specify algorithm specific hyper parameters\n", 412 | "5. Train model\n", 413 | "6. Deploy model - Specify instance count, instance type and endpoint name\n", 414 | "7. Make Predictions" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "## What if the Endpoint is Already Up and Running?" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [] 430 | } 431 | ], 432 | "metadata": { 433 | "kernelspec": { 434 | "display_name": "Python 3", 435 | "language": "python", 436 | "name": "python3" 437 | }, 438 | "language_info": { 439 | "codemirror_mode": { 440 | "name": "ipython", 441 | "version": 3 442 | }, 443 | "file_extension": ".py", 444 | "mimetype": "text/x-python", 445 | "name": "python", 446 | "nbconvert_exporter": "python", 447 | "pygments_lexer": "ipython3", 448 | "version": "3.7.6" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 2 453 | } 454 | -------------------------------------------------------------------------------- /Notebooks/10-AWS_XGBoost_Invoke_Endpoint_Predict.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

XGBoost Cloud Prediction Invocation Template

\n", 8 | "

Invoke SageMaker Prediction Service

" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import sys\n", 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "import math\n", 22 | "import os\n", 23 | "\n", 24 | "import boto3\n", 25 | "import re\n", 26 | "from sagemaker import get_execution_role\n", 27 | "import sagemaker\n", 28 | "\n", 29 | "# SDK 2 serializers and deserializers\n", 30 | "from sagemaker.serializers import CSVSerializer\n", 31 | "from sagemaker.deserializers import JSONDeserializer" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "This Stackoverflow answer is useful:\n", 39 | "https://stackoverflow.com/a/51086736" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# SDK 2\n", 49 | "# RealTimePredictor renamed to Predictor\n", 50 | "# https://sagemaker.readthedocs.io/en/stable/v2.html\n", 51 | "\n", 52 | "# Create a predictor and point to an existing endpoint\n", 53 | "endpoint_name = 'xgboost-bikerental-v2'\n", 54 | "predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "predictor.serializer = CSVSerializer()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "df_all = pd.read_csv('../Data/bike_test.csv')" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "df_all.head()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "df_all.columns[1:]" 91 | ] 92 | }, 93 | { 94 | "cell_type": 
"code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# Need to pass an array to the prediction\n", 100 | "# can pass a numpy array or a list of values [[19,1],[20,1]]\n", 101 | "arr_test = df_all[df_all.columns[1:]].values" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "type(arr_test)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "arr_test.shape" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "arr_test[:5]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "result = predictor.predict(arr_test[:2])" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "result" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "arr_test.shape" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "### Split the input data into chunks\n", 163 | "There are thousands of rows in this data set for which need inference. 
\n", 164 | "When communicating over internet, it is a good idea to split the data into chunks to prevent payload and timeout error" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "# For large number of predictions, we can split the input data and\n", 174 | "# Query the prediction service.\n", 175 | "# array_split is convenient to specify how many splits are needed\n", 176 | "predictions = []\n", 177 | "for arr in np.array_split(arr_test,10):\n", 178 | " result = predictor.predict(arr)\n", 179 | " result = result.decode(\"utf-8\")\n", 180 | " result = result.split(',')\n", 181 | " print (arr.shape)\n", 182 | " predictions += [float(r) for r in result]" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "len(predictions)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "np.expm1(predictions)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "df_all['count'] = np.expm1(predictions)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "df_all.head()" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "df_all[['datetime','count']].to_csv('../Data/predicted_count_cloud.csv',index=False)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "# Delete Endpoint to prevent unnecessary charges\n", 237 | "predictor.delete_endpoint()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | 
"execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "predictions" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "Python 3", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.7.6" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 1 278 | } 279 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # This repo is for my XGBoost training course with OReilly and Pearson 2 | 3 | Some example Python configurations and code snippets are included here .. they are explained in detail during the course 4 | --------------------------------------------------------------------------------