├── Data
    ├── Diabetes.csv
    ├── bikesharing_test.csv
    ├── bikesharing_train.csv
    ├── daily-total-female-births.csv
    └── iris_all.csv
├── Notebooks
    ├── 01-XGBoost_BikeRental_Data_Preparation.ipynb
    ├── 02-XGBoost_Regression_BikeRental.ipynb
    ├── 03-XGBoost_Binary_Classification_Diabetes_Dataset.ipynb
    ├── 04-XGBoost_Course_Prepare_Iris_Dataset.ipynb
    ├── 05-XGBoost_Course_Multiclass_Classification_Iris_Dataset.ipynb
    ├── 06-XGBoost-TimeSeries.ipynb
    ├── 07-XGBoost_Feature_Importance_Selection_Diabetes_Dataset.ipynb
    ├── 08-XGBoost_Hyperparameter_Tuning_Diabetes_Dataset.ipynb
    ├── 09-AWS_XGBoost_Train_Host_Predict.ipynb
    └── 10-AWS_XGBoost_Invoke_Endpoint_Predict.ipynb
└── README.md


/Data/Diabetes.csv:
--------------------------------------------------------------------------------
  1 | Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
  2 | 6,148.0,72,35,,33.6,0.627,50.0,1
  3 | 1,85.0,66,29,,26.6,0.35100000000000003,31.0,0
  4 | 8,183.0,64,0,,23.3,0.672,32.0,1
  5 | 1,89.0,66,23,94.0,28.1,0.16699999999999998,21.0,0
  6 | 0,137.0,40,35,168.0,43.1,2.2880000000000003,33.0,1
  7 | 5,116.0,74,0,,25.6,0.201,30.0,0
  8 | 3,78.0,50,32,88.0,31.0,0.248,26.0,1
  9 | 10,115.0,0,0,,35.3,0.134,29.0,0
 10 | 2,197.0,70,45,543.0,30.5,0.158,53.0,1
 11 | 8,125.0,96,0,,,0.23199999999999998,54.0,1
 12 | 4,110.0,92,0,,37.6,0.191,30.0,0
 13 | 10,168.0,74,0,,38.0,0.537,34.0,1
 14 | 10,139.0,80,0,,27.1,1.4409999999999998,57.0,0
 15 | 1,189.0,60,23,846.0,30.1,0.39799999999999996,59.0,1
 16 | 5,166.0,72,19,175.0,25.8,0.5870000000000001,51.0,1
 17 | 7,100.0,0,0,,30.0,0.484,32.0,1
 18 | 0,118.0,84,47,230.0,45.8,0.551,31.0,1
 19 | 7,107.0,74,0,,29.6,0.254,31.0,1
 20 | 1,103.0,30,38,83.0,43.3,0.183,33.0,0
 21 | 1,115.0,70,30,96.0,34.6,0.529,32.0,1
 22 | 3,126.0,88,41,235.0,39.3,0.7040000000000001,27.0,0
 23 | 8,99.0,84,0,,35.4,0.38799999999999996,50.0,0
 24 | 7,196.0,90,0,,39.8,0.451,41.0,1
 25 | 9,119.0,80,35,,29.0,0.263,29.0,1
 26 | 11,143.0,94,33,146.0,36.6,0.254,51.0,1
 27 | 10,125.0,70,26,115.0,31.1,0.205,41.0,1
 28 | 7,147.0,76,0,,39.4,0.257,43.0,1
 29 | 1,97.0,66,15,140.0,23.2,0.48700000000000004,22.0,0
 30 | 13,145.0,82,19,110.0,22.2,0.245,57.0,0
 31 | 5,117.0,92,0,,34.1,0.337,38.0,0
 32 | 5,109.0,75,26,,36.0,0.546,60.0,0
 33 | 3,158.0,76,36,245.0,31.6,0.851,28.0,1
 34 | 3,88.0,58,11,54.0,24.8,0.267,22.0,0
 35 | 6,92.0,92,0,,19.9,0.188,28.0,0
 36 | 10,122.0,78,31,,27.6,0.512,45.0,0
 37 | 4,103.0,60,33,192.0,24.0,0.966,33.0,0
 38 | 11,138.0,76,0,,33.2,0.42,35.0,0
 39 | 9,102.0,76,37,,32.9,0.665,46.0,1
 40 | 2,90.0,68,42,,38.2,0.503,27.0,1
 41 | 4,111.0,72,47,207.0,37.1,1.39,56.0,1
 42 | 3,180.0,64,25,70.0,34.0,0.271,26.0,0
 43 | 7,133.0,84,0,,40.2,0.696,37.0,0
 44 | 7,106.0,92,18,,22.7,0.235,48.0,0
 45 | 9,171.0,110,24,240.0,45.4,0.721,54.0,1
 46 | 7,159.0,64,0,,27.4,0.294,40.0,0
 47 | 0,180.0,66,39,,42.0,1.893,25.0,1
 48 | 1,146.0,56,0,,29.7,0.564,29.0,0
 49 | 2,71.0,70,27,,28.0,0.586,22.0,0
 50 | 7,103.0,66,32,,39.1,0.344,31.0,1
 51 | 7,105.0,0,0,,,0.305,24.0,0
 52 | 1,103.0,80,11,82.0,19.4,0.491,22.0,0
 53 | 1,101.0,50,15,36.0,24.2,0.526,26.0,0
 54 | 5,88.0,66,21,23.0,24.4,0.342,30.0,0
 55 | 8,176.0,90,34,300.0,33.7,0.467,58.0,1
 56 | 7,150.0,66,42,342.0,34.7,0.718,42.0,0
 57 | 1,73.0,50,10,,23.0,0.248,21.0,0
 58 | 7,187.0,68,39,304.0,37.7,0.254,41.0,1
 59 | 0,100.0,88,60,110.0,46.8,0.9620000000000001,31.0,0
 60 | 0,146.0,82,0,,40.5,1.781,44.0,0
 61 | 0,105.0,64,41,142.0,41.5,0.17300000000000001,22.0,0
 62 | 2,84.0,0,0,,,0.304,21.0,0
 63 | 8,133.0,72,0,,32.9,0.27,39.0,1
 64 | 5,44.0,62,0,,25.0,0.5870000000000001,36.0,0
 65 | 2,141.0,58,34,128.0,25.4,0.6990000000000001,24.0,0
 66 | 7,114.0,66,0,,32.8,0.258,42.0,1
 67 | 5,99.0,74,27,,29.0,0.203,32.0,0
 68 | 0,109.0,88,30,,32.5,0.855,38.0,1
 69 | 2,109.0,92,0,,42.7,0.845,54.0,0
 70 | 1,95.0,66,13,38.0,19.6,0.33399999999999996,25.0,0
 71 | 4,146.0,85,27,100.0,28.9,0.18899999999999997,27.0,0
 72 | 2,100.0,66,20,90.0,32.9,0.867,28.0,1
 73 | 5,139.0,64,35,140.0,28.6,0.41100000000000003,26.0,0
 74 | 13,126.0,90,0,,43.4,0.583,42.0,1
 75 | 4,129.0,86,20,270.0,35.1,0.231,23.0,0
 76 | 1,79.0,75,30,,32.0,0.396,22.0,0
 77 | 1,,48,20,,24.7,0.14,22.0,0
 78 | 7,62.0,78,0,,32.6,0.391,41.0,0
 79 | 5,95.0,72,33,,37.7,0.37,27.0,0
 80 | 0,131.0,0,0,,43.2,0.27,26.0,1
 81 | 2,112.0,66,22,,25.0,0.307,24.0,0
 82 | 3,113.0,44,13,,22.4,0.14,22.0,0
 83 | 2,74.0,0,0,,,0.102,22.0,0
 84 | 7,83.0,78,26,71.0,29.3,0.767,36.0,0
 85 | 0,101.0,65,28,,24.6,0.237,22.0,0
 86 | 5,137.0,108,0,,48.8,0.22699999999999998,37.0,1
 87 | 2,110.0,74,29,125.0,32.4,0.698,27.0,0
 88 | 13,106.0,72,54,,36.6,0.17800000000000002,45.0,0
 89 | 2,100.0,68,25,71.0,38.5,0.324,26.0,0
 90 | 15,136.0,70,32,110.0,37.1,0.153,43.0,1
 91 | 1,107.0,68,19,,26.5,0.165,24.0,0
 92 | 1,80.0,55,0,,19.1,0.258,21.0,0
 93 | 4,123.0,80,15,176.0,32.0,0.44299999999999995,34.0,0
 94 | 7,81.0,78,40,48.0,46.7,0.261,42.0,0
 95 | 4,134.0,72,0,,23.8,0.27699999999999997,60.0,1
 96 | 2,142.0,82,18,64.0,24.7,0.7609999999999999,21.0,0
 97 | 6,144.0,72,27,228.0,33.9,0.255,40.0,0
 98 | 2,92.0,62,28,,31.6,0.13,24.0,0
 99 | 1,71.0,48,18,76.0,20.4,0.32299999999999995,22.0,0
100 | 6,93.0,50,30,64.0,28.7,0.35600000000000004,23.0,0
101 | 1,122.0,90,51,220.0,49.7,0.325,31.0,1
102 | 1,163.0,72,0,,39.0,1.222,33.0,1
103 | 1,151.0,60,0,,26.1,0.179,22.0,0
104 | 0,125.0,96,0,,22.5,0.262,21.0,0
105 | 1,81.0,72,18,40.0,26.6,0.28300000000000003,24.0,0
106 | 2,85.0,65,0,,39.6,0.93,27.0,0
107 | 1,126.0,56,29,152.0,28.7,0.8009999999999999,21.0,0
108 | 1,96.0,122,0,,22.4,0.207,27.0,0
109 | 4,144.0,58,28,140.0,29.5,0.287,37.0,0
110 | 3,83.0,58,31,18.0,34.3,0.336,25.0,0
111 | 0,95.0,85,25,36.0,37.4,0.247,24.0,1
112 | 3,171.0,72,33,135.0,33.3,0.19899999999999998,24.0,1
113 | 8,155.0,62,26,495.0,34.0,0.5429999999999999,46.0,1
114 | 1,89.0,76,34,37.0,31.2,0.192,23.0,0
115 | 4,76.0,62,0,,34.0,0.391,25.0,0
116 | 7,160.0,54,32,175.0,30.5,0.588,39.0,1
117 | 4,146.0,92,0,,31.2,0.539,61.0,1
118 | 5,124.0,74,0,,34.0,0.22,38.0,1
119 | 5,78.0,48,0,,33.7,0.654,25.0,0
120 | 4,97.0,60,23,,28.2,0.44299999999999995,22.0,0
121 | 4,99.0,76,15,51.0,23.2,0.223,21.0,0
122 | 0,162.0,76,56,100.0,53.2,0.759,25.0,1
123 | 6,111.0,64,39,,34.2,0.26,24.0,0
124 | 2,107.0,74,30,100.0,33.6,0.40399999999999997,23.0,0
125 | 5,132.0,80,0,,26.8,0.18600000000000003,69.0,0
126 | 0,113.0,76,0,,33.3,0.278,23.0,1
127 | 1,88.0,30,42,99.0,55.0,0.496,26.0,1
128 | 3,120.0,70,30,135.0,42.9,0.452,30.0,0
129 | 1,118.0,58,36,94.0,33.3,0.261,23.0,0
130 | 1,117.0,88,24,145.0,34.5,0.40299999999999997,40.0,1
131 | 0,105.0,84,0,,27.9,0.741,62.0,1
132 | 4,173.0,70,14,168.0,29.7,0.361,33.0,1
133 | 9,122.0,56,0,,33.3,1.114,33.0,1
134 | 3,170.0,64,37,225.0,34.5,0.35600000000000004,30.0,1
135 | 8,84.0,74,31,,38.3,0.457,39.0,0
136 | 2,96.0,68,13,49.0,21.1,0.647,26.0,0
137 | 2,125.0,60,20,140.0,33.8,0.08800000000000001,31.0,0
138 | 0,100.0,70,26,50.0,30.8,0.597,21.0,0
139 | 0,93.0,60,25,92.0,28.7,0.532,22.0,0
140 | 0,129.0,80,0,,31.2,0.703,29.0,0
141 | 5,105.0,72,29,325.0,36.9,0.159,28.0,0
142 | 3,128.0,78,0,,21.1,0.268,55.0,0
143 | 5,106.0,82,30,,39.5,0.28600000000000003,38.0,0
144 | 2,108.0,52,26,63.0,32.5,0.318,22.0,0
145 | 10,108.0,66,0,,32.4,0.272,42.0,1
146 | 4,154.0,62,31,284.0,32.8,0.237,23.0,0
147 | 0,102.0,75,23,,,0.5720000000000001,21.0,0
148 | 9,57.0,80,37,,32.8,0.096,41.0,0
149 | 2,106.0,64,35,119.0,30.5,1.4,34.0,0
150 | 5,147.0,78,0,,33.7,0.218,65.0,0
151 | 2,90.0,70,17,,27.3,0.085,22.0,0
152 | 1,136.0,74,50,204.0,37.4,0.39899999999999997,24.0,0
153 | 4,114.0,65,0,,21.9,0.43200000000000005,37.0,0
154 | 9,156.0,86,28,155.0,34.3,1.189,42.0,1
155 | 1,153.0,82,42,485.0,40.6,0.687,23.0,0
156 | 8,188.0,78,0,,47.9,0.13699999999999998,43.0,1
157 | 7,152.0,88,44,,50.0,0.337,36.0,1
158 | 2,99.0,52,15,94.0,24.6,0.637,21.0,0
159 | 1,109.0,56,21,135.0,25.2,0.833,23.0,0
160 | 2,88.0,74,19,53.0,29.0,0.22899999999999998,22.0,0
161 | 17,163.0,72,41,114.0,40.9,0.8170000000000001,47.0,1
162 | 4,151.0,90,38,,29.7,0.294,36.0,0
163 | 7,102.0,74,40,105.0,37.2,0.204,45.0,0
164 | 0,114.0,80,34,285.0,44.2,0.16699999999999998,27.0,0
165 | 2,100.0,64,23,,29.7,0.368,21.0,0
166 | 0,131.0,88,0,,31.6,0.743,32.0,1
167 | 6,104.0,74,18,156.0,29.9,0.722,41.0,1
168 | 3,148.0,66,25,,32.5,0.256,22.0,0
169 | 4,120.0,68,0,,29.6,0.7090000000000001,34.0,0
170 | 4,110.0,66,0,,31.9,0.47100000000000003,29.0,0
171 | 3,111.0,90,12,78.0,28.4,0.495,29.0,0
172 | 6,102.0,82,0,,30.8,0.18,36.0,1
173 | 6,134.0,70,23,130.0,35.4,0.542,29.0,1
174 | 2,87.0,0,23,,28.9,0.773,25.0,0
175 | 1,79.0,60,42,48.0,43.5,0.6779999999999999,23.0,0
176 | 2,75.0,64,24,55.0,29.7,0.37,33.0,0
177 | 8,179.0,72,42,130.0,32.7,0.7190000000000001,36.0,1
178 | 6,85.0,78,0,,31.2,0.382,42.0,0
179 | 0,129.0,110,46,130.0,67.1,0.319,26.0,1
180 | 5,143.0,78,0,,45.0,0.19,47.0,0
181 | 5,130.0,82,0,,39.1,0.956,37.0,1
182 | 6,87.0,80,0,,23.2,0.084,32.0,0
183 | 0,119.0,64,18,92.0,34.9,0.725,23.0,0
184 | 1,,74,20,23.0,27.7,0.299,21.0,0
185 | 5,73.0,60,0,,26.8,0.268,27.0,0
186 | 4,141.0,74,0,,27.6,0.244,40.0,0
187 | 7,194.0,68,28,,35.9,0.745,41.0,1
188 | 8,181.0,68,36,495.0,30.1,0.615,60.0,1
189 | 1,128.0,98,41,58.0,32.0,1.321,33.0,1
190 | 8,109.0,76,39,114.0,27.9,0.64,31.0,1
191 | 5,139.0,80,35,160.0,31.6,0.361,25.0,1
192 | 3,111.0,62,0,,22.6,0.142,21.0,0
193 | 9,123.0,70,44,94.0,33.1,0.374,40.0,0
194 | 7,159.0,66,0,,30.4,0.38299999999999995,36.0,1
195 | 11,135.0,0,0,,52.3,0.578,40.0,1
196 | 8,85.0,55,20,,24.4,0.136,42.0,0
197 | 5,158.0,84,41,210.0,39.4,0.395,29.0,1
198 | 1,105.0,58,0,,24.3,0.187,21.0,0
199 | 3,107.0,62,13,48.0,22.9,0.6779999999999999,23.0,1
200 | 4,109.0,64,44,99.0,34.8,0.905,26.0,1
201 | 4,148.0,60,27,318.0,30.9,0.15,29.0,1
202 | 0,113.0,80,16,,31.0,0.8740000000000001,21.0,0
203 | 1,138.0,82,0,,40.1,0.23600000000000002,28.0,0
204 | 0,108.0,68,20,,27.3,0.787,32.0,0
205 | 2,99.0,70,16,44.0,20.4,0.235,27.0,0
206 | 6,103.0,72,32,190.0,37.7,0.324,55.0,0
207 | 5,111.0,72,28,,23.9,0.40700000000000003,27.0,0
208 | 8,196.0,76,29,280.0,37.5,0.605,57.0,1
209 | 5,162.0,104,0,,37.7,0.151,52.0,1
210 | 1,96.0,64,27,87.0,33.2,0.289,21.0,0
211 | 7,184.0,84,33,,35.5,0.355,41.0,1
212 | 2,81.0,60,22,,27.7,0.29,25.0,0
213 | 0,147.0,85,54,,42.8,0.375,24.0,0
214 | 7,179.0,95,31,,34.2,0.16399999999999998,60.0,0
215 | 0,140.0,65,26,130.0,42.6,0.431,24.0,1
216 | 9,112.0,82,32,175.0,34.2,0.26,36.0,1
217 | 12,151.0,70,40,271.0,41.8,0.742,38.0,1
218 | 5,109.0,62,41,129.0,35.8,0.514,25.0,1
219 | 6,125.0,68,30,120.0,30.0,0.46399999999999997,32.0,0
220 | 5,85.0,74,22,,29.0,1.224,32.0,1
221 | 5,112.0,66,0,,37.8,0.261,41.0,1
222 | 0,177.0,60,29,478.0,34.6,1.072,21.0,1
223 | 2,158.0,90,0,,31.6,0.805,66.0,1
224 | 7,119.0,0,0,,25.2,0.209,37.0,0
225 | 7,142.0,60,33,190.0,28.8,0.687,61.0,0
226 | 1,100.0,66,15,56.0,23.6,0.6659999999999999,26.0,0
227 | 1,87.0,78,27,32.0,34.6,0.10099999999999999,22.0,0
228 | 0,101.0,76,0,,35.7,0.198,26.0,0
229 | 3,162.0,52,38,,37.2,0.652,24.0,1
230 | 4,197.0,70,39,744.0,36.7,2.329,31.0,0
231 | 0,117.0,80,31,53.0,45.2,0.08900000000000001,24.0,0
232 | 4,142.0,86,0,,44.0,0.645,22.0,1
233 | 6,134.0,80,37,370.0,46.2,0.23800000000000002,46.0,1
234 | 1,79.0,80,25,37.0,25.4,0.583,22.0,0
235 | 4,122.0,68,0,,35.0,0.39399999999999996,29.0,0
236 | 3,74.0,68,28,45.0,29.7,0.293,23.0,0
237 | 4,171.0,72,0,,43.6,0.479,26.0,1
238 | 7,181.0,84,21,192.0,35.9,0.586,51.0,1
239 | 0,179.0,90,27,,44.1,0.6859999999999999,23.0,1
240 | 9,164.0,84,21,,30.8,0.831,32.0,1
241 | 0,104.0,76,0,,18.4,0.5820000000000001,27.0,0
242 | 1,91.0,64,24,,29.2,0.192,21.0,0
243 | 4,91.0,70,32,88.0,33.1,0.446,22.0,0
244 | 3,139.0,54,0,,25.6,0.402,22.0,1
245 | 6,119.0,50,22,176.0,27.1,1.318,33.0,1
246 | 2,146.0,76,35,194.0,38.2,0.32899999999999996,29.0,0
247 | 9,184.0,85,15,,30.0,1.213,49.0,1
248 | 10,122.0,68,0,,31.2,0.258,41.0,0
249 | 0,165.0,90,33,680.0,52.3,0.42700000000000005,23.0,0
250 | 9,124.0,70,33,402.0,35.4,0.282,34.0,0
251 | 1,111.0,86,19,,30.1,0.14300000000000002,23.0,0
252 | 9,106.0,52,0,,31.2,0.38,42.0,0
253 | 2,129.0,84,0,,28.0,0.284,27.0,0
254 | 2,90.0,80,14,55.0,24.4,0.249,24.0,0
255 | 0,86.0,68,32,,35.8,0.23800000000000002,25.0,0
256 | 12,92.0,62,7,258.0,27.6,0.9259999999999999,44.0,1
257 | 1,113.0,64,35,,33.6,0.5429999999999999,21.0,1
258 | 3,111.0,56,39,,30.1,0.557,30.0,0
259 | 2,114.0,68,22,,28.7,0.092,25.0,0
260 | 1,193.0,50,16,375.0,25.9,0.655,24.0,0
261 | 11,155.0,76,28,150.0,33.3,1.3530000000000002,51.0,1
262 | 3,191.0,68,15,130.0,30.9,0.299,34.0,0
263 | 3,141.0,0,0,,30.0,0.7609999999999999,27.0,1
264 | 4,95.0,70,32,,32.1,0.612,24.0,0
265 | 3,142.0,80,15,,32.4,0.2,63.0,0
266 | 4,123.0,62,0,,32.0,0.226,35.0,1
267 | 5,96.0,74,18,67.0,33.6,0.997,43.0,0
268 | 0,138.0,0,0,,36.3,0.9329999999999999,25.0,1
269 | 2,128.0,64,42,,40.0,1.101,24.0,0
270 | 0,102.0,52,0,,25.1,0.078,21.0,0
271 | 2,146.0,0,0,,27.5,0.24,28.0,1
272 | 10,101.0,86,37,,45.6,1.136,38.0,1
273 | 2,108.0,62,32,56.0,25.2,0.128,21.0,0
274 | 3,122.0,78,0,,23.0,0.254,40.0,0
275 | 1,71.0,78,50,45.0,33.2,0.42200000000000004,21.0,0
276 | 13,106.0,70,0,,34.2,0.251,52.0,0
277 | 2,100.0,70,52,57.0,40.5,0.677,25.0,0
278 | 7,106.0,60,24,,26.5,0.29600000000000004,29.0,1
279 | 0,104.0,64,23,116.0,27.8,0.45399999999999996,23.0,0
280 | 5,114.0,74,0,,24.9,0.7440000000000001,57.0,0
281 | 2,108.0,62,10,278.0,25.3,0.8809999999999999,22.0,0
282 | 0,146.0,70,0,,37.9,0.33399999999999996,28.0,1
283 | 10,129.0,76,28,122.0,35.9,0.28,39.0,0
284 | 7,133.0,88,15,155.0,32.4,0.262,37.0,0
285 | 7,161.0,86,0,,30.4,0.165,47.0,1
286 | 2,108.0,80,0,,27.0,0.259,52.0,1
287 | 7,136.0,74,26,135.0,26.0,0.647,51.0,0
288 | 5,155.0,84,44,545.0,38.7,0.619,34.0,0
289 | 1,119.0,86,39,220.0,45.6,0.8079999999999999,29.0,1
290 | 4,96.0,56,17,49.0,20.8,0.34,26.0,0
291 | 5,108.0,72,43,75.0,36.1,0.263,33.0,0
292 | 0,78.0,88,29,40.0,36.9,0.434,21.0,0
293 | 0,107.0,62,30,74.0,36.6,0.757,25.0,1
294 | 2,128.0,78,37,182.0,43.3,1.224,31.0,1
295 | 1,128.0,48,45,194.0,40.5,0.613,24.0,1
296 | 0,161.0,50,0,,21.9,0.254,65.0,0
297 | 6,151.0,62,31,120.0,35.5,0.6920000000000001,28.0,0
298 | 2,146.0,70,38,360.0,28.0,0.337,29.0,1
299 | 0,126.0,84,29,215.0,30.7,0.52,24.0,0
300 | 14,100.0,78,25,184.0,36.6,0.41200000000000003,46.0,1
301 | 8,112.0,72,0,,23.6,0.84,58.0,0
302 | 0,167.0,0,0,,32.3,0.8390000000000001,30.0,1
303 | 2,144.0,58,33,135.0,31.6,0.42200000000000004,25.0,1
304 | 5,77.0,82,41,42.0,35.8,0.156,35.0,0
305 | 5,115.0,98,0,,52.9,0.209,28.0,1
306 | 3,150.0,76,0,,21.0,0.207,37.0,0
307 | 2,120.0,76,37,105.0,39.7,0.215,29.0,0
308 | 10,161.0,68,23,132.0,25.5,0.326,47.0,1
309 | 0,137.0,68,14,148.0,24.8,0.14300000000000002,21.0,0
310 | 0,128.0,68,19,180.0,30.5,1.391,25.0,1
311 | 2,124.0,68,28,205.0,32.9,0.875,30.0,1
312 | 6,80.0,66,30,,26.2,0.313,41.0,0
313 | 0,106.0,70,37,148.0,39.4,0.605,22.0,0
314 | 2,155.0,74,17,96.0,26.6,0.433,27.0,1
315 | 3,113.0,50,10,85.0,29.5,0.626,25.0,0
316 | 7,109.0,80,31,,35.9,1.127,43.0,1
317 | 2,112.0,68,22,94.0,34.1,0.315,26.0,0
318 | 3,99.0,80,11,64.0,19.3,0.284,30.0,0
319 | 3,182.0,74,0,,30.5,0.345,29.0,1
320 | 3,115.0,66,39,140.0,38.1,0.15,28.0,0
321 | 6,194.0,78,0,,23.5,0.129,59.0,1
322 | 4,129.0,60,12,231.0,27.5,0.527,31.0,0
323 | 3,112.0,74,30,,31.6,0.19699999999999998,25.0,1
324 | 0,124.0,70,20,,27.4,0.254,36.0,1
325 | 13,152.0,90,33,29.0,26.8,0.731,43.0,1
326 | 2,112.0,75,32,,35.7,0.14800000000000002,21.0,0
327 | 1,157.0,72,21,168.0,25.6,0.12300000000000001,24.0,0
328 | 1,122.0,64,32,156.0,35.1,0.6920000000000001,30.0,1
329 | 10,179.0,70,0,,35.1,0.2,37.0,0
330 | 2,102.0,86,36,120.0,45.5,0.127,23.0,1
331 | 6,105.0,70,32,68.0,30.8,0.122,37.0,0
332 | 8,118.0,72,19,,23.1,1.476,46.0,0
333 | 2,87.0,58,16,52.0,32.7,0.166,25.0,0
334 | 1,180.0,0,0,,43.3,0.282,41.0,1
335 | 12,106.0,80,0,,23.6,0.13699999999999998,44.0,0
336 | 1,95.0,60,18,58.0,23.9,0.26,22.0,0
337 | 0,165.0,76,43,255.0,47.9,0.259,26.0,0
338 | 0,117.0,0,0,,33.8,0.932,44.0,0
339 | 5,115.0,76,0,,31.2,0.34299999999999997,44.0,1
340 | 9,152.0,78,34,171.0,34.2,0.893,33.0,1
341 | 7,178.0,84,0,,39.9,0.331,41.0,1
342 | 1,130.0,70,13,105.0,25.9,0.47200000000000003,22.0,0
343 | 1,95.0,74,21,73.0,25.9,0.6729999999999999,36.0,0
344 | 1,,68,35,,32.0,0.389,22.0,0
345 | 5,122.0,86,0,,34.7,0.29,33.0,0
346 | 8,95.0,72,0,,36.8,0.485,57.0,0
347 | 8,126.0,88,36,108.0,38.5,0.349,49.0,0
348 | 1,139.0,46,19,83.0,28.7,0.654,22.0,0
349 | 3,116.0,0,0,,23.5,0.187,23.0,0
350 | 3,99.0,62,19,74.0,21.8,0.27899999999999997,26.0,0
351 | 5,,80,32,,41.0,0.34600000000000003,37.0,1
352 | 4,92.0,80,0,,42.2,0.237,29.0,0
353 | 4,137.0,84,0,,31.2,0.252,30.0,0
354 | 3,61.0,82,28,,34.4,0.243,46.0,0
355 | 1,90.0,62,12,43.0,27.2,0.58,24.0,0
356 | 3,90.0,78,0,,42.7,0.5589999999999999,21.0,0
357 | 9,165.0,88,0,,30.4,0.302,49.0,1
358 | 1,125.0,50,40,167.0,33.3,0.9620000000000001,28.0,1
359 | 13,129.0,0,30,,39.9,0.569,44.0,1
360 | 12,88.0,74,40,54.0,35.3,0.37799999999999995,48.0,0
361 | 1,196.0,76,36,249.0,36.5,0.875,29.0,1
362 | 5,189.0,64,33,325.0,31.2,0.583,29.0,1
363 | 5,158.0,70,0,,29.8,0.207,63.0,0
364 | 5,103.0,108,37,,39.2,0.305,65.0,0
365 | 4,146.0,78,0,,38.5,0.52,67.0,1
366 | 4,147.0,74,25,293.0,34.9,0.385,30.0,0
367 | 5,99.0,54,28,83.0,34.0,0.499,30.0,0
368 | 6,124.0,72,0,,27.6,0.368,29.0,1
369 | 0,101.0,64,17,,21.0,0.252,21.0,0
370 | 3,81.0,86,16,66.0,27.5,0.306,22.0,0
371 | 1,133.0,102,28,140.0,32.8,0.23399999999999999,45.0,1
372 | 3,173.0,82,48,465.0,38.4,2.137,25.0,1
373 | 0,118.0,64,23,89.0,,1.7309999999999999,21.0,0
374 | 0,84.0,64,22,66.0,35.8,0.545,21.0,0
375 | 2,105.0,58,40,94.0,34.9,0.225,25.0,0
376 | 2,122.0,52,43,158.0,36.2,0.816,28.0,0
377 | 12,140.0,82,43,325.0,39.2,0.528,58.0,1
378 | 0,98.0,82,15,84.0,25.2,0.299,22.0,0
379 | 1,87.0,60,37,75.0,37.2,0.509,22.0,0
380 | 4,156.0,75,0,,48.3,0.23800000000000002,32.0,1
381 | 0,93.0,100,39,72.0,43.4,1.021,35.0,0
382 | 1,107.0,72,30,82.0,30.8,0.821,24.0,0
383 | 0,105.0,68,22,,20.0,0.23600000000000002,22.0,0
384 | 1,109.0,60,8,182.0,25.4,0.9470000000000001,21.0,0
385 | 1,90.0,62,18,59.0,25.1,1.268,25.0,0
386 | 1,125.0,70,24,110.0,24.3,0.221,25.0,0
387 | 1,119.0,54,13,50.0,22.3,0.205,24.0,0
388 | 5,116.0,74,29,,32.3,0.66,35.0,1
389 | 8,105.0,100,36,,43.3,0.239,45.0,1
390 | 5,144.0,82,26,285.0,32.0,0.452,58.0,1
391 | 3,100.0,68,23,81.0,31.6,0.9490000000000001,28.0,0
392 | 1,100.0,66,29,196.0,32.0,0.444,42.0,0
393 | 5,166.0,76,0,,45.7,0.34,27.0,1
394 | 1,131.0,64,14,415.0,23.7,0.389,21.0,0
395 | 4,116.0,72,12,87.0,22.1,0.46299999999999997,37.0,0
396 | 4,158.0,78,0,,32.9,0.8029999999999999,31.0,1
397 | 2,127.0,58,24,275.0,27.7,1.6,25.0,0
398 | 3,96.0,56,34,115.0,24.7,0.9440000000000001,39.0,0
399 | 0,131.0,66,40,,34.3,0.196,22.0,1
400 | 3,82.0,70,0,,21.1,0.389,25.0,0
401 | 3,193.0,70,31,,34.9,0.24100000000000002,25.0,1
402 | 4,95.0,64,0,,32.0,0.161,31.0,1
403 | 6,137.0,61,0,,24.2,0.151,55.0,0
404 | 5,136.0,84,41,88.0,35.0,0.28600000000000003,35.0,1
405 | 9,72.0,78,25,,31.6,0.28,38.0,0
406 | 5,168.0,64,0,,32.9,0.135,41.0,1
407 | 2,123.0,48,32,165.0,42.1,0.52,26.0,0
408 | 4,115.0,72,0,,28.9,0.376,46.0,1
409 | 0,101.0,62,0,,21.9,0.336,25.0,0
410 | 8,197.0,74,0,,25.9,1.1909999999999998,39.0,1
411 | 1,172.0,68,49,579.0,42.4,0.7020000000000001,28.0,1
412 | 6,102.0,90,39,,35.7,0.674,28.0,0
413 | 1,112.0,72,30,176.0,34.4,0.528,25.0,0
414 | 1,143.0,84,23,310.0,42.4,1.0759999999999998,22.0,0
415 | 1,143.0,74,22,61.0,26.2,0.256,21.0,0
416 | 0,138.0,60,35,167.0,34.6,0.534,21.0,1
417 | 3,173.0,84,33,474.0,35.7,0.258,22.0,1
418 | 1,97.0,68,21,,27.2,1.095,22.0,0
419 | 4,144.0,82,32,,38.5,0.5539999999999999,37.0,1
420 | 1,83.0,68,0,,18.2,0.624,27.0,0
421 | 3,129.0,64,29,115.0,26.4,0.21899999999999997,28.0,1
422 | 1,119.0,88,41,170.0,45.3,0.507,26.0,0
423 | 2,94.0,68,18,76.0,26.0,0.561,21.0,0
424 | 0,102.0,64,46,78.0,40.6,0.496,21.0,0
425 | 2,115.0,64,22,,30.8,0.42100000000000004,21.0,0
426 | 8,151.0,78,32,210.0,42.9,0.516,36.0,1
427 | 4,184.0,78,39,277.0,37.0,0.264,31.0,1
428 | 0,94.0,0,0,,,0.256,25.0,0
429 | 1,181.0,64,30,180.0,34.1,0.32799999999999996,38.0,1
430 | 0,135.0,94,46,145.0,40.6,0.284,26.0,0
431 | 1,95.0,82,25,180.0,35.0,0.233,43.0,1
432 | 2,99.0,0,0,,22.2,0.10800000000000001,23.0,0
433 | 3,89.0,74,16,85.0,30.4,0.551,38.0,0
434 | 1,80.0,74,11,60.0,30.0,0.527,22.0,0
435 | 2,139.0,75,0,,25.6,0.16699999999999998,29.0,0
436 | 1,90.0,68,8,,24.5,1.138,36.0,0
437 | 0,141.0,0,0,,42.4,0.205,29.0,1
438 | 12,140.0,85,33,,37.4,0.244,41.0,0
439 | 5,147.0,75,0,,29.9,0.434,28.0,0
440 | 1,97.0,70,15,,18.2,0.147,21.0,0
441 | 6,107.0,88,0,,36.8,0.727,31.0,0
442 | 0,189.0,104,25,,34.3,0.435,41.0,1
443 | 2,83.0,66,23,50.0,32.2,0.49700000000000005,22.0,0
444 | 4,117.0,64,27,120.0,33.2,0.23,24.0,0
445 | 8,108.0,70,0,,30.5,0.955,33.0,1
446 | 4,117.0,62,12,,29.7,0.38,30.0,1
447 | 0,180.0,78,63,14.0,59.4,2.42,25.0,1
448 | 1,100.0,72,12,70.0,25.3,0.6579999999999999,28.0,0
449 | 0,95.0,80,45,92.0,36.5,0.33,26.0,0
450 | 0,104.0,64,37,64.0,33.6,0.51,22.0,1
451 | 0,120.0,74,18,63.0,30.5,0.285,26.0,0
452 | 1,82.0,64,13,95.0,21.2,0.415,23.0,0
453 | 2,134.0,70,0,,28.9,0.542,23.0,1
454 | 0,91.0,68,32,210.0,39.9,0.381,25.0,0
455 | 2,119.0,0,0,,19.6,0.8320000000000001,72.0,0
456 | 2,100.0,54,28,105.0,37.8,0.498,24.0,0
457 | 14,175.0,62,30,,33.6,0.212,38.0,1
458 | 1,135.0,54,0,,26.7,0.687,62.0,0
459 | 5,86.0,68,28,71.0,30.2,0.364,24.0,0
460 | 10,148.0,84,48,237.0,37.6,1.001,51.0,1
461 | 9,134.0,74,33,60.0,25.9,0.46,81.0,0
462 | 9,120.0,72,22,56.0,20.8,0.733,48.0,0
463 | 1,71.0,62,0,,21.8,0.41600000000000004,26.0,0
464 | 8,74.0,70,40,49.0,35.3,0.705,39.0,0
465 | 5,88.0,78,30,,27.6,0.258,37.0,0
466 | 10,115.0,98,0,,24.0,1.022,34.0,0
467 | 0,124.0,56,13,105.0,21.8,0.452,21.0,0
468 | 0,74.0,52,10,36.0,27.8,0.26899999999999996,22.0,0
469 | 0,97.0,64,36,100.0,36.8,0.6,25.0,0
470 | 8,120.0,0,0,,30.0,0.183,38.0,1
471 | 6,154.0,78,41,140.0,46.1,0.5710000000000001,27.0,0
472 | 1,144.0,82,40,,41.3,0.607,28.0,0
473 | 0,137.0,70,38,,33.2,0.17,22.0,0
474 | 0,119.0,66,27,,38.8,0.259,22.0,0
475 | 7,136.0,90,0,,29.9,0.21,50.0,0
476 | 4,114.0,64,0,,28.9,0.126,24.0,0
477 | 0,137.0,84,27,,27.3,0.231,59.0,0
478 | 2,105.0,80,45,191.0,33.7,0.711,29.0,1
479 | 7,114.0,76,17,110.0,23.8,0.466,31.0,0
480 | 8,126.0,74,38,75.0,25.9,0.162,39.0,0
481 | 4,132.0,86,31,,28.0,0.419,63.0,0
482 | 3,158.0,70,30,328.0,35.5,0.344,35.0,1
483 | 0,123.0,88,37,,35.2,0.19699999999999998,29.0,0
484 | 4,85.0,58,22,49.0,27.8,0.306,28.0,0
485 | 0,84.0,82,31,125.0,38.2,0.233,23.0,0
486 | 0,145.0,0,0,,44.2,0.63,31.0,1
487 | 0,135.0,68,42,250.0,42.3,0.365,24.0,1
488 | 1,139.0,62,41,480.0,40.7,0.536,21.0,0
489 | 0,173.0,78,32,265.0,46.5,1.159,58.0,0
490 | 4,99.0,72,17,,25.6,0.294,28.0,0
491 | 8,194.0,80,0,,26.1,0.551,67.0,0
492 | 2,83.0,65,28,66.0,36.8,0.629,24.0,0
493 | 2,89.0,90,30,,33.5,0.292,42.0,0
494 | 4,99.0,68,38,,32.8,0.145,33.0,0
495 | 4,125.0,70,18,122.0,28.9,1.1440000000000001,45.0,1
496 | 3,80.0,0,0,,,0.174,22.0,0
497 | 6,166.0,74,0,,26.6,0.304,66.0,0
498 | 5,110.0,68,0,,26.0,0.292,30.0,0
499 | 2,81.0,72,15,76.0,30.1,0.547,25.0,0
500 | 7,195.0,70,33,145.0,25.1,0.163,55.0,1
501 | 6,154.0,74,32,193.0,29.3,0.8390000000000001,39.0,0
502 | 2,117.0,90,19,71.0,25.2,0.313,21.0,0
503 | 3,84.0,72,32,,37.2,0.267,28.0,0
504 | 6,,68,41,,39.0,0.727,41.0,1
505 | 7,94.0,64,25,79.0,33.3,0.738,41.0,0
506 | 3,96.0,78,39,,37.3,0.23800000000000002,40.0,0
507 | 10,75.0,82,0,,33.3,0.263,38.0,0
508 | 0,180.0,90,26,90.0,36.5,0.314,35.0,1
509 | 1,130.0,60,23,170.0,28.6,0.6920000000000001,21.0,0
510 | 2,84.0,50,23,76.0,30.4,0.968,21.0,0
511 | 8,120.0,78,0,,25.0,0.409,64.0,0
512 | 12,84.0,72,31,,29.7,0.297,46.0,1
513 | 0,139.0,62,17,210.0,22.1,0.207,21.0,0
514 | 9,91.0,68,0,,24.2,0.2,58.0,0
515 | 2,91.0,62,0,,27.3,0.525,22.0,0
516 | 3,99.0,54,19,86.0,25.6,0.154,24.0,0
517 | 3,163.0,70,18,105.0,31.6,0.268,28.0,1
518 | 9,145.0,88,34,165.0,30.3,0.7709999999999999,53.0,1
519 | 7,125.0,86,0,,37.6,0.304,51.0,0
520 | 13,76.0,60,0,,32.8,0.18,41.0,0
521 | 6,129.0,90,7,326.0,19.6,0.5820000000000001,60.0,0
522 | 2,68.0,70,32,66.0,25.0,0.187,25.0,0
523 | 3,124.0,80,33,130.0,33.2,0.305,26.0,0
524 | 6,114.0,0,0,,,0.18899999999999997,26.0,0
525 | 9,130.0,70,0,,34.2,0.652,45.0,1
526 | 3,125.0,58,0,,31.6,0.151,24.0,0
527 | 3,87.0,60,18,,21.8,0.444,21.0,0
528 | 1,97.0,64,19,82.0,18.2,0.299,21.0,0
529 | 3,116.0,74,15,105.0,26.3,0.107,24.0,0
530 | 0,117.0,66,31,188.0,30.8,0.493,22.0,0
531 | 0,111.0,65,0,,24.6,0.66,31.0,0
532 | 2,122.0,60,18,106.0,29.8,0.7170000000000001,22.0,0
533 | 0,107.0,76,0,,45.3,0.6859999999999999,24.0,0
534 | 1,86.0,66,52,65.0,41.3,0.917,29.0,0
535 | 6,91.0,0,0,,29.8,0.501,31.0,0
536 | 1,77.0,56,30,56.0,33.3,1.251,24.0,0
537 | 4,132.0,0,0,,32.9,0.302,23.0,1
538 | 0,105.0,90,0,,29.6,0.19699999999999998,46.0,0
539 | 0,57.0,60,0,,21.7,0.735,67.0,0
540 | 0,127.0,80,37,210.0,36.3,0.804,23.0,0
541 | 3,129.0,92,49,155.0,36.4,0.968,32.0,1
542 | 8,100.0,74,40,215.0,39.4,0.6609999999999999,43.0,1
543 | 3,128.0,72,25,190.0,32.4,0.5489999999999999,27.0,1
544 | 10,90.0,85,32,,34.9,0.825,56.0,1
545 | 4,84.0,90,23,56.0,39.5,0.159,25.0,0
546 | 1,88.0,78,29,76.0,32.0,0.365,29.0,0
547 | 8,186.0,90,35,225.0,34.5,0.423,37.0,1
548 | 5,187.0,76,27,207.0,43.6,1.034,53.0,1
549 | 4,131.0,68,21,166.0,33.1,0.16,28.0,0
550 | 1,164.0,82,43,67.0,32.8,0.341,50.0,0
551 | 4,189.0,110,31,,28.5,0.68,37.0,0
552 | 1,116.0,70,28,,27.4,0.204,21.0,0
553 | 3,84.0,68,30,106.0,31.9,0.591,25.0,0
554 | 6,114.0,88,0,,27.8,0.247,66.0,0
555 | 1,88.0,62,24,44.0,29.9,0.42200000000000004,23.0,0
556 | 1,84.0,64,23,115.0,36.9,0.47100000000000003,28.0,0
557 | 7,124.0,70,33,215.0,25.5,0.161,37.0,0
558 | 1,97.0,70,40,,38.1,0.218,30.0,0
559 | 8,110.0,76,0,,27.8,0.237,58.0,0
560 | 11,103.0,68,40,,46.2,0.126,42.0,0
561 | 11,85.0,74,0,,30.1,0.3,35.0,0
562 | 6,125.0,76,0,,33.8,0.121,54.0,1
563 | 0,198.0,66,32,274.0,41.3,0.502,28.0,1
564 | 1,87.0,68,34,77.0,37.6,0.401,24.0,0
565 | 6,99.0,60,19,54.0,26.9,0.49700000000000005,32.0,0
566 | 0,91.0,80,0,,32.4,0.601,27.0,0
567 | 2,95.0,54,14,88.0,26.1,0.748,22.0,0
568 | 1,99.0,72,30,18.0,38.6,0.41200000000000003,21.0,0
569 | 6,92.0,62,32,126.0,32.0,0.085,46.0,0
570 | 4,154.0,72,29,126.0,31.3,0.33799999999999997,37.0,0
571 | 0,121.0,66,30,165.0,34.3,0.203,33.0,1
572 | 3,78.0,70,0,,32.5,0.27,39.0,0
573 | 2,130.0,96,0,,22.6,0.268,21.0,0
574 | 3,111.0,58,31,44.0,29.5,0.43,22.0,0
575 | 2,98.0,60,17,120.0,34.7,0.198,22.0,0
576 | 1,143.0,86,30,330.0,30.1,0.892,23.0,0
577 | 1,119.0,44,47,63.0,35.5,0.28,25.0,0
578 | 6,108.0,44,20,130.0,24.0,0.813,35.0,0
579 | 2,118.0,80,0,,42.9,0.693,21.0,1
580 | 10,133.0,68,0,,27.0,0.245,36.0,0
581 | 2,197.0,70,99,,34.7,0.575,62.0,1
582 | 0,151.0,90,46,,42.1,0.371,21.0,1
583 | 6,109.0,60,27,,25.0,0.20600000000000002,27.0,0
584 | 12,121.0,78,17,,26.5,0.259,62.0,0
585 | 8,100.0,76,0,,38.7,0.19,42.0,0
586 | 8,124.0,76,24,600.0,28.7,0.687,52.0,1
587 | 1,93.0,56,11,,22.5,0.41700000000000004,22.0,0
588 | 8,143.0,66,0,,34.9,0.129,41.0,1
589 | 6,103.0,66,0,,24.3,0.249,29.0,0
590 | 3,176.0,86,27,156.0,33.3,1.1540000000000001,52.0,1
591 | 0,73.0,0,0,,21.1,0.342,25.0,0
592 | 11,111.0,84,40,,46.8,0.925,45.0,1
593 | 2,112.0,78,50,140.0,39.4,0.175,24.0,0
594 | 3,132.0,80,0,,34.4,0.402,44.0,1
595 | 2,82.0,52,22,115.0,28.5,1.699,25.0,0
596 | 6,123.0,72,45,230.0,33.6,0.733,34.0,0
597 | 0,188.0,82,14,185.0,32.0,0.682,22.0,1
598 | 0,67.0,76,0,,45.3,0.19399999999999998,46.0,0
599 | 1,89.0,24,19,25.0,27.8,0.5589999999999999,21.0,0
600 | 1,173.0,74,0,,36.8,0.08800000000000001,38.0,1
601 | 1,109.0,38,18,120.0,23.1,0.40700000000000003,26.0,0
602 | 1,108.0,88,19,,27.1,0.4,24.0,0
603 | 6,96.0,0,0,,23.7,0.19,28.0,0
604 | 1,124.0,74,36,,27.8,0.1,30.0,0
605 | 7,150.0,78,29,126.0,35.2,0.6920000000000001,54.0,1
606 | 4,183.0,0,0,,28.4,0.212,36.0,1
607 | 1,124.0,60,32,,35.8,0.514,21.0,0
608 | 1,181.0,78,42,293.0,40.0,1.258,22.0,1
609 | 1,92.0,62,25,41.0,19.5,0.48200000000000004,25.0,0
610 | 0,152.0,82,39,272.0,41.5,0.27,27.0,0
611 | 1,111.0,62,13,182.0,24.0,0.138,23.0,0
612 | 3,106.0,54,21,158.0,30.9,0.292,24.0,0
613 | 3,174.0,58,22,194.0,32.9,0.593,36.0,1
614 | 7,168.0,88,42,321.0,38.2,0.787,40.0,1
615 | 6,105.0,80,28,,32.5,0.878,26.0,0
616 | 11,138.0,74,26,144.0,36.1,0.557,50.0,1
617 | 3,106.0,72,0,,25.8,0.207,27.0,0
618 | 6,117.0,96,0,,28.7,0.157,30.0,0
619 | 2,68.0,62,13,15.0,20.1,0.257,23.0,0
620 | 9,112.0,82,24,,28.2,1.2819999999999998,50.0,1
621 | 0,119.0,0,0,,32.4,0.141,24.0,1
622 | 2,112.0,86,42,160.0,38.4,0.24600000000000002,28.0,0
623 | 2,92.0,76,20,,24.2,1.6980000000000002,28.0,0
624 | 6,183.0,94,0,,40.8,1.4609999999999999,45.0,0
625 | 0,94.0,70,27,115.0,43.5,0.34700000000000003,21.0,0
626 | 2,108.0,64,0,,30.8,0.158,21.0,0
627 | 4,90.0,88,47,54.0,37.7,0.36200000000000004,29.0,0
628 | 0,125.0,68,0,,24.7,0.20600000000000002,21.0,0
629 | 0,132.0,78,0,,32.4,0.39299999999999996,21.0,0
630 | 5,128.0,80,0,,34.6,0.14400000000000002,45.0,0
631 | 4,94.0,65,22,,24.7,0.14800000000000002,21.0,0
632 | 7,114.0,64,0,,27.4,0.732,34.0,1
633 | 0,102.0,78,40,90.0,34.5,0.23800000000000002,24.0,0
634 | 2,111.0,60,0,,26.2,0.34299999999999997,23.0,0
635 | 1,128.0,82,17,183.0,27.5,0.115,22.0,0
636 | 10,92.0,62,0,,25.9,0.16699999999999998,31.0,0
637 | 13,104.0,72,0,,31.2,0.465,38.0,1
638 | 5,104.0,74,0,,28.8,0.153,48.0,0
639 | 2,94.0,76,18,66.0,31.6,0.649,23.0,0
640 | 7,97.0,76,32,91.0,40.9,0.871,32.0,1
641 | 1,100.0,74,12,46.0,19.5,0.149,28.0,0
642 | 0,102.0,86,17,105.0,29.3,0.695,27.0,0
643 | 4,128.0,70,0,,34.3,0.303,24.0,0
644 | 6,147.0,80,0,,29.5,0.17800000000000002,50.0,1
645 | 4,90.0,0,0,,28.0,0.61,31.0,0
646 | 3,103.0,72,30,152.0,27.6,0.73,27.0,0
647 | 2,157.0,74,35,440.0,39.4,0.134,30.0,0
648 | 1,167.0,74,17,144.0,23.4,0.447,33.0,1
649 | 0,179.0,50,36,159.0,37.8,0.455,22.0,1
650 | 11,136.0,84,35,130.0,28.3,0.26,42.0,1
651 | 0,107.0,60,25,,26.4,0.133,23.0,0
652 | 1,91.0,54,25,100.0,25.2,0.23399999999999999,23.0,0
653 | 1,117.0,60,23,106.0,33.8,0.466,27.0,0
654 | 5,123.0,74,40,77.0,34.1,0.26899999999999996,28.0,0
655 | 2,120.0,54,0,,26.8,0.455,27.0,0
656 | 1,106.0,70,28,135.0,34.2,0.142,22.0,0
657 | 2,155.0,52,27,540.0,38.7,0.24,25.0,1
658 | 2,101.0,58,35,90.0,21.8,0.155,22.0,0
659 | 1,120.0,80,48,200.0,38.9,1.162,41.0,0
660 | 11,127.0,106,0,,39.0,0.19,51.0,0
661 | 3,80.0,82,31,70.0,34.2,1.2919999999999998,27.0,1
662 | 10,162.0,84,0,,27.7,0.182,54.0,0
663 | 1,199.0,76,43,,42.9,1.3940000000000001,22.0,1
664 | 8,167.0,106,46,231.0,37.6,0.165,43.0,1
665 | 9,145.0,80,46,130.0,37.9,0.637,40.0,1
666 | 6,115.0,60,39,,33.7,0.245,40.0,1
667 | 1,112.0,80,45,132.0,34.8,0.217,24.0,0
668 | 4,145.0,82,18,,32.5,0.235,70.0,1
669 | 10,111.0,70,27,,27.5,0.141,40.0,1
670 | 6,98.0,58,33,190.0,34.0,0.43,43.0,0
671 | 9,154.0,78,30,100.0,30.9,0.16399999999999998,45.0,0
672 | 6,165.0,68,26,168.0,33.6,0.631,49.0,0
673 | 1,99.0,58,10,,25.4,0.551,21.0,0
674 | 10,68.0,106,23,49.0,35.5,0.285,47.0,0
675 | 3,123.0,100,35,240.0,57.3,0.88,22.0,0
676 | 8,91.0,82,0,,35.6,0.5870000000000001,68.0,0
677 | 6,195.0,70,0,,30.9,0.32799999999999996,31.0,1
678 | 9,156.0,86,0,,24.8,0.23,53.0,1
679 | 0,93.0,60,0,,35.3,0.263,25.0,0
680 | 3,121.0,52,0,,36.0,0.127,25.0,1
681 | 2,101.0,58,17,265.0,24.2,0.614,23.0,0
682 | 2,56.0,56,28,45.0,24.2,0.332,22.0,0
683 | 0,162.0,76,36,,49.6,0.364,26.0,1
684 | 0,95.0,64,39,105.0,44.6,0.366,22.0,0
685 | 4,125.0,80,0,,32.3,0.536,27.0,1
686 | 5,136.0,82,0,,,0.64,69.0,0
687 | 2,129.0,74,26,205.0,33.2,0.591,25.0,0
688 | 3,130.0,64,0,,23.1,0.314,22.0,0
689 | 1,107.0,50,19,,28.3,0.18100000000000002,29.0,0
690 | 1,140.0,74,26,180.0,24.1,0.828,23.0,0
691 | 1,144.0,82,46,180.0,46.1,0.335,46.0,1
692 | 8,107.0,80,0,,24.6,0.856,34.0,0
693 | 13,158.0,114,0,,42.3,0.257,44.0,1
694 | 2,121.0,70,32,95.0,39.1,0.8859999999999999,23.0,0
695 | 7,129.0,68,49,125.0,38.5,0.439,43.0,1
696 | 2,90.0,60,0,,23.5,0.191,25.0,0
697 | 7,142.0,90,24,480.0,30.4,0.128,43.0,1
698 | 3,169.0,74,19,125.0,29.9,0.268,31.0,1
699 | 0,99.0,0,0,,25.0,0.253,22.0,0
700 | 4,127.0,88,11,155.0,34.5,0.598,28.0,0
701 | 4,118.0,70,0,,44.5,0.904,26.0,0
702 | 2,122.0,76,27,200.0,35.9,0.483,26.0,0
703 | 6,125.0,78,31,,27.6,0.565,49.0,1
704 | 1,168.0,88,29,,35.0,0.905,52.0,1
705 | 2,129.0,0,0,,38.5,0.304,41.0,0
706 | 4,110.0,76,20,100.0,28.4,0.11800000000000001,27.0,0
707 | 6,80.0,80,36,,39.8,0.177,28.0,0
708 | 10,115.0,0,0,,,0.261,30.0,1
709 | 2,127.0,46,21,335.0,34.4,0.17600000000000002,22.0,0
710 | 9,164.0,78,0,,32.8,0.14800000000000002,45.0,1
711 | 2,93.0,64,32,160.0,38.0,0.674,23.0,1
712 | 3,158.0,64,13,387.0,31.2,0.295,24.0,0
713 | 5,126.0,78,27,22.0,29.6,0.439,40.0,0
714 | 10,129.0,62,36,,41.2,0.441,38.0,1
715 | 0,134.0,58,20,291.0,26.4,0.35200000000000004,21.0,0
716 | 3,102.0,74,0,,29.5,0.121,32.0,0
717 | 7,187.0,50,33,392.0,33.9,0.826,34.0,1
718 | 3,173.0,78,39,185.0,33.8,0.97,31.0,1
719 | 10,94.0,72,18,,23.1,0.595,56.0,0
720 | 1,108.0,60,46,178.0,35.5,0.415,24.0,0
721 | 5,97.0,76,27,,35.6,0.37799999999999995,52.0,1
722 | 4,83.0,86,19,,29.3,0.317,34.0,0
723 | 1,114.0,66,36,200.0,38.1,0.289,21.0,0
724 | 1,149.0,68,29,127.0,29.3,0.349,42.0,1
725 | 5,117.0,86,30,105.0,39.1,0.251,42.0,0
726 | 1,111.0,94,0,,32.8,0.265,45.0,0
727 | 4,112.0,78,40,,39.4,0.23600000000000002,38.0,0
728 | 1,116.0,78,29,180.0,36.1,0.496,25.0,0
729 | 0,141.0,84,26,,32.4,0.433,22.0,0
730 | 2,175.0,88,0,,22.9,0.326,22.0,0
731 | 2,92.0,52,0,,30.1,0.141,22.0,0
732 | 3,130.0,78,23,79.0,28.4,0.32299999999999995,34.0,1
733 | 8,120.0,86,0,,28.4,0.259,22.0,1
734 | 2,174.0,88,37,120.0,44.5,0.6459999999999999,24.0,1
735 | 2,106.0,56,27,165.0,29.0,0.426,22.0,0
736 | 2,105.0,75,0,,23.3,0.56,53.0,0
737 | 4,95.0,60,32,,35.4,0.284,28.0,0
738 | 0,126.0,86,27,120.0,27.4,0.515,21.0,0
739 | 8,65.0,72,23,,32.0,0.6,42.0,0
740 | 2,99.0,60,17,160.0,36.6,0.45299999999999996,21.0,0
741 | 1,102.0,74,0,,39.5,0.293,42.0,1
742 | 11,120.0,80,37,150.0,42.3,0.785,48.0,1
743 | 3,102.0,44,20,94.0,30.8,0.4,26.0,0
744 | 1,109.0,58,18,116.0,28.5,0.21899999999999997,22.0,0
745 | 9,140.0,94,0,,32.7,0.7340000000000001,45.0,1
746 | 13,153.0,88,37,140.0,40.6,1.1740000000000002,39.0,0
747 | 12,100.0,84,33,105.0,30.0,0.488,46.0,0
748 | 1,147.0,94,41,,49.3,0.358,27.0,1
749 | 1,81.0,74,41,57.0,46.3,1.0959999999999999,32.0,0
750 | 3,187.0,70,22,200.0,36.4,0.408,36.0,1
751 | 6,162.0,62,0,,24.3,0.17800000000000002,50.0,1
752 | 4,136.0,70,0,,31.2,1.182,22.0,1
753 | 1,121.0,78,39,74.0,39.0,0.261,28.0,0
754 | 3,108.0,62,24,,26.0,0.223,25.0,0
755 | 0,181.0,88,44,510.0,43.3,0.222,26.0,1
756 | 8,154.0,78,32,,32.4,0.44299999999999995,45.0,1
757 | 1,128.0,88,39,110.0,36.5,1.057,37.0,1
758 | 7,137.0,90,41,,32.0,0.391,39.0,0
759 | 0,123.0,72,0,,36.3,0.258,52.0,1
760 | 1,106.0,76,0,,37.5,0.19699999999999998,26.0,0
761 | 6,190.0,92,0,,35.5,0.278,66.0,1
762 | 2,88.0,58,26,16.0,28.4,0.7659999999999999,22.0,0
763 | 9,170.0,74,31,,44.0,0.40299999999999997,43.0,1
764 | 9,89.0,62,0,,22.5,0.142,33.0,0
765 | 10,101.0,76,48,180.0,32.9,0.171,63.0,0
766 | 2,122.0,70,27,,36.8,0.34,27.0,0
767 | 5,121.0,72,23,112.0,26.2,0.245,30.0,0
768 | 1,126.0,60,0,,30.1,0.349,47.0,1
769 | 1,93.0,70,31,,30.4,0.315,23.0,0
770 | 


--------------------------------------------------------------------------------
/Data/daily-total-female-births.csv:
--------------------------------------------------------------------------------
  1 | "Date","Births"
  2 | "1959-01-01",35
  3 | "1959-01-02",32
  4 | "1959-01-03",30
  5 | "1959-01-04",31
  6 | "1959-01-05",44
  7 | "1959-01-06",29
  8 | "1959-01-07",45
  9 | "1959-01-08",43
 10 | "1959-01-09",38
 11 | "1959-01-10",27
 12 | "1959-01-11",38
 13 | "1959-01-12",33
 14 | "1959-01-13",55
 15 | "1959-01-14",47
 16 | "1959-01-15",45
 17 | "1959-01-16",37
 18 | "1959-01-17",50
 19 | "1959-01-18",43
 20 | "1959-01-19",41
 21 | "1959-01-20",52
 22 | "1959-01-21",34
 23 | "1959-01-22",53
 24 | "1959-01-23",39
 25 | "1959-01-24",32
 26 | "1959-01-25",37
 27 | "1959-01-26",43
 28 | "1959-01-27",39
 29 | "1959-01-28",35
 30 | "1959-01-29",44
 31 | "1959-01-30",38
 32 | "1959-01-31",24
 33 | "1959-02-01",23
 34 | "1959-02-02",31
 35 | "1959-02-03",44
 36 | "1959-02-04",38
 37 | "1959-02-05",50
 38 | "1959-02-06",38
 39 | "1959-02-07",51
 40 | "1959-02-08",31
 41 | "1959-02-09",31
 42 | "1959-02-10",51
 43 | "1959-02-11",36
 44 | "1959-02-12",45
 45 | "1959-02-13",51
 46 | "1959-02-14",34
 47 | "1959-02-15",52
 48 | "1959-02-16",47
 49 | "1959-02-17",45
 50 | "1959-02-18",46
 51 | "1959-02-19",39
 52 | "1959-02-20",48
 53 | "1959-02-21",37
 54 | "1959-02-22",35
 55 | "1959-02-23",52
 56 | "1959-02-24",42
 57 | "1959-02-25",45
 58 | "1959-02-26",39
 59 | "1959-02-27",37
 60 | "1959-02-28",30
 61 | "1959-03-01",35
 62 | "1959-03-02",28
 63 | "1959-03-03",45
 64 | "1959-03-04",34
 65 | "1959-03-05",36
 66 | "1959-03-06",50
 67 | "1959-03-07",44
 68 | "1959-03-08",39
 69 | "1959-03-09",32
 70 | "1959-03-10",39
 71 | "1959-03-11",45
 72 | "1959-03-12",43
 73 | "1959-03-13",39
 74 | "1959-03-14",31
 75 | "1959-03-15",27
 76 | "1959-03-16",30
 77 | "1959-03-17",42
 78 | "1959-03-18",46
 79 | "1959-03-19",41
 80 | "1959-03-20",36
 81 | "1959-03-21",45
 82 | "1959-03-22",46
 83 | "1959-03-23",43
 84 | "1959-03-24",38
 85 | "1959-03-25",34
 86 | "1959-03-26",35
 87 | "1959-03-27",56
 88 | "1959-03-28",36
 89 | "1959-03-29",32
 90 | "1959-03-30",50
 91 | "1959-03-31",41
 92 | "1959-04-01",39
 93 | "1959-04-02",41
 94 | "1959-04-03",47
 95 | "1959-04-04",34
 96 | "1959-04-05",36
 97 | "1959-04-06",33
 98 | "1959-04-07",35
 99 | "1959-04-08",38
100 | "1959-04-09",38
101 | "1959-04-10",34
102 | "1959-04-11",53
103 | "1959-04-12",34
104 | "1959-04-13",34
105 | "1959-04-14",38
106 | "1959-04-15",35
107 | "1959-04-16",32
108 | "1959-04-17",42
109 | "1959-04-18",34
110 | "1959-04-19",46
111 | "1959-04-20",30
112 | "1959-04-21",46
113 | "1959-04-22",45
114 | "1959-04-23",54
115 | "1959-04-24",34
116 | "1959-04-25",37
117 | "1959-04-26",35
118 | "1959-04-27",40
119 | "1959-04-28",42
120 | "1959-04-29",58
121 | "1959-04-30",51
122 | "1959-05-01",32
123 | "1959-05-02",35
124 | "1959-05-03",38
125 | "1959-05-04",33
126 | "1959-05-05",39
127 | "1959-05-06",47
128 | "1959-05-07",38
129 | "1959-05-08",52
130 | "1959-05-09",30
131 | "1959-05-10",34
132 | "1959-05-11",40
133 | "1959-05-12",35
134 | "1959-05-13",42
135 | "1959-05-14",41
136 | "1959-05-15",42
137 | "1959-05-16",38
138 | "1959-05-17",24
139 | "1959-05-18",34
140 | "1959-05-19",43
141 | "1959-05-20",36
142 | "1959-05-21",55
143 | "1959-05-22",41
144 | "1959-05-23",45
145 | "1959-05-24",41
146 | "1959-05-25",37
147 | "1959-05-26",43
148 | "1959-05-27",39
149 | "1959-05-28",33
150 | "1959-05-29",43
151 | "1959-05-30",40
152 | "1959-05-31",38
153 | "1959-06-01",45
154 | "1959-06-02",46
155 | "1959-06-03",34
156 | "1959-06-04",35
157 | "1959-06-05",48
158 | "1959-06-06",51
159 | "1959-06-07",36
160 | "1959-06-08",33
161 | "1959-06-09",46
162 | "1959-06-10",42
163 | "1959-06-11",48
164 | "1959-06-12",34
165 | "1959-06-13",41
166 | "1959-06-14",35
167 | "1959-06-15",40
168 | "1959-06-16",34
169 | "1959-06-17",30
170 | "1959-06-18",36
171 | "1959-06-19",40
172 | "1959-06-20",39
173 | "1959-06-21",45
174 | "1959-06-22",38
175 | "1959-06-23",47
176 | "1959-06-24",33
177 | "1959-06-25",30
178 | "1959-06-26",42
179 | "1959-06-27",43
180 | "1959-06-28",41
181 | "1959-06-29",41
182 | "1959-06-30",59
183 | "1959-07-01",43
184 | "1959-07-02",45
185 | "1959-07-03",38
186 | "1959-07-04",37
187 | "1959-07-05",45
188 | "1959-07-06",42
189 | "1959-07-07",57
190 | "1959-07-08",46
191 | "1959-07-09",51
192 | "1959-07-10",41
193 | "1959-07-11",47
194 | "1959-07-12",26
195 | "1959-07-13",35
196 | "1959-07-14",44
197 | "1959-07-15",41
198 | "1959-07-16",42
199 | "1959-07-17",36
200 | "1959-07-18",45
201 | "1959-07-19",45
202 | "1959-07-20",45
203 | "1959-07-21",47
204 | "1959-07-22",38
205 | "1959-07-23",42
206 | "1959-07-24",35
207 | "1959-07-25",36
208 | "1959-07-26",39
209 | "1959-07-27",45
210 | "1959-07-28",43
211 | "1959-07-29",47
212 | "1959-07-30",36
213 | "1959-07-31",41
214 | "1959-08-01",50
215 | "1959-08-02",39
216 | "1959-08-03",41
217 | "1959-08-04",46
218 | "1959-08-05",64
219 | "1959-08-06",45
220 | "1959-08-07",34
221 | "1959-08-08",38
222 | "1959-08-09",44
223 | "1959-08-10",48
224 | "1959-08-11",46
225 | "1959-08-12",44
226 | "1959-08-13",37
227 | "1959-08-14",39
228 | "1959-08-15",44
229 | "1959-08-16",45
230 | "1959-08-17",33
231 | "1959-08-18",44
232 | "1959-08-19",38
233 | "1959-08-20",46
234 | "1959-08-21",46
235 | "1959-08-22",40
236 | "1959-08-23",39
237 | "1959-08-24",44
238 | "1959-08-25",48
239 | "1959-08-26",50
240 | "1959-08-27",41
241 | "1959-08-28",42
242 | "1959-08-29",51
243 | "1959-08-30",41
244 | "1959-08-31",44
245 | "1959-09-01",38
246 | "1959-09-02",68
247 | "1959-09-03",40
248 | "1959-09-04",42
249 | "1959-09-05",51
250 | "1959-09-06",44
251 | "1959-09-07",45
252 | "1959-09-08",36
253 | "1959-09-09",57
254 | "1959-09-10",44
255 | "1959-09-11",42
256 | "1959-09-12",53
257 | "1959-09-13",42
258 | "1959-09-14",34
259 | "1959-09-15",40
260 | "1959-09-16",56
261 | "1959-09-17",44
262 | "1959-09-18",53
263 | "1959-09-19",55
264 | "1959-09-20",39
265 | "1959-09-21",59
266 | "1959-09-22",55
267 | "1959-09-23",73
268 | "1959-09-24",55
269 | "1959-09-25",44
270 | "1959-09-26",43
271 | "1959-09-27",40
272 | "1959-09-28",47
273 | "1959-09-29",51
274 | "1959-09-30",56
275 | "1959-10-01",49
276 | "1959-10-02",54
277 | "1959-10-03",56
278 | "1959-10-04",47
279 | "1959-10-05",44
280 | "1959-10-06",43
281 | "1959-10-07",42
282 | "1959-10-08",45
283 | "1959-10-09",50
284 | "1959-10-10",48
285 | "1959-10-11",43
286 | "1959-10-12",40
287 | "1959-10-13",59
288 | "1959-10-14",41
289 | "1959-10-15",42
290 | "1959-10-16",51
291 | "1959-10-17",49
292 | "1959-10-18",45
293 | "1959-10-19",43
294 | "1959-10-20",42
295 | "1959-10-21",38
296 | "1959-10-22",47
297 | "1959-10-23",38
298 | "1959-10-24",36
299 | "1959-10-25",42
300 | "1959-10-26",35
301 | "1959-10-27",28
302 | "1959-10-28",44
303 | "1959-10-29",36
304 | "1959-10-30",45
305 | "1959-10-31",46
306 | "1959-11-01",48
307 | "1959-11-02",49
308 | "1959-11-03",43
309 | "1959-11-04",42
310 | "1959-11-05",59
311 | "1959-11-06",45
312 | "1959-11-07",52
313 | "1959-11-08",46
314 | "1959-11-09",42
315 | "1959-11-10",40
316 | "1959-11-11",40
317 | "1959-11-12",45
318 | "1959-11-13",35
319 | "1959-11-14",35
320 | "1959-11-15",40
321 | "1959-11-16",39
322 | "1959-11-17",33
323 | "1959-11-18",42
324 | "1959-11-19",47
325 | "1959-11-20",51
326 | "1959-11-21",44
327 | "1959-11-22",40
328 | "1959-11-23",57
329 | "1959-11-24",49
330 | "1959-11-25",45
331 | "1959-11-26",49
332 | "1959-11-27",51
333 | "1959-11-28",46
334 | "1959-11-29",44
335 | "1959-11-30",52
336 | "1959-12-01",45
337 | "1959-12-02",32
338 | "1959-12-03",46
339 | "1959-12-04",41
340 | "1959-12-05",34
341 | "1959-12-06",33
342 | "1959-12-07",36
343 | "1959-12-08",49
344 | "1959-12-09",43
345 | "1959-12-10",43
346 | "1959-12-11",34
347 | "1959-12-12",39
348 | "1959-12-13",35
349 | "1959-12-14",52
350 | "1959-12-15",47
351 | "1959-12-16",52
352 | "1959-12-17",39
353 | "1959-12-18",40
354 | "1959-12-19",42
355 | "1959-12-20",42
356 | "1959-12-21",53
357 | "1959-12-22",39
358 | "1959-12-23",40
359 | "1959-12-24",38
360 | "1959-12-25",44
361 | "1959-12-26",34
362 | "1959-12-27",37
363 | "1959-12-28",52
364 | "1959-12-29",48
365 | "1959-12-30",55
366 | "1959-12-31",50


--------------------------------------------------------------------------------
/Data/iris_all.csv:
--------------------------------------------------------------------------------
  1 | sepal_length,sepal_width,petal_length,petal_width,class
  2 | 5.1,3.5,1.4,0.2,Iris-setosa
  3 | 4.9,3.0,1.4,0.2,Iris-setosa
  4 | 4.7,3.2,1.3,0.2,Iris-setosa
  5 | 4.6,3.1,1.5,0.2,Iris-setosa
  6 | 5.0,3.6,1.4,0.2,Iris-setosa
  7 | 5.4,3.9,1.7,0.4,Iris-setosa
  8 | 4.6,3.4,1.4,0.3,Iris-setosa
  9 | 5.0,3.4,1.5,0.2,Iris-setosa
 10 | 4.4,2.9,1.4,0.2,Iris-setosa
 11 | 4.9,3.1,1.5,0.1,Iris-setosa
 12 | 5.4,3.7,1.5,0.2,Iris-setosa
 13 | 4.8,3.4,1.6,0.2,Iris-setosa
 14 | 4.8,3.0,1.4,0.1,Iris-setosa
 15 | 4.3,3.0,1.1,0.1,Iris-setosa
 16 | 5.8,4.0,1.2,0.2,Iris-setosa
 17 | 5.7,4.4,1.5,0.4,Iris-setosa
 18 | 5.4,3.9,1.3,0.4,Iris-setosa
 19 | 5.1,3.5,1.4,0.3,Iris-setosa
 20 | 5.7,3.8,1.7,0.3,Iris-setosa
 21 | 5.1,3.8,1.5,0.3,Iris-setosa
 22 | 5.4,3.4,1.7,0.2,Iris-setosa
 23 | 5.1,3.7,1.5,0.4,Iris-setosa
 24 | 4.6,3.6,1.0,0.2,Iris-setosa
 25 | 5.1,3.3,1.7,0.5,Iris-setosa
 26 | 4.8,3.4,1.9,0.2,Iris-setosa
 27 | 5.0,3.0,1.6,0.2,Iris-setosa
 28 | 5.0,3.4,1.6,0.4,Iris-setosa
 29 | 5.2,3.5,1.5,0.2,Iris-setosa
 30 | 5.2,3.4,1.4,0.2,Iris-setosa
 31 | 4.7,3.2,1.6,0.2,Iris-setosa
 32 | 4.8,3.1,1.6,0.2,Iris-setosa
 33 | 5.4,3.4,1.5,0.4,Iris-setosa
 34 | 5.2,4.1,1.5,0.1,Iris-setosa
 35 | 5.5,4.2,1.4,0.2,Iris-setosa
 36 | 4.9,3.1,1.5,0.1,Iris-setosa
 37 | 5.0,3.2,1.2,0.2,Iris-setosa
 38 | 5.5,3.5,1.3,0.2,Iris-setosa
 39 | 4.9,3.1,1.5,0.1,Iris-setosa
 40 | 4.4,3.0,1.3,0.2,Iris-setosa
 41 | 5.1,3.4,1.5,0.2,Iris-setosa
 42 | 5.0,3.5,1.3,0.3,Iris-setosa
 43 | 4.5,2.3,1.3,0.3,Iris-setosa
 44 | 4.4,3.2,1.3,0.2,Iris-setosa
 45 | 5.0,3.5,1.6,0.6,Iris-setosa
 46 | 5.1,3.8,1.9,0.4,Iris-setosa
 47 | 4.8,3.0,1.4,0.3,Iris-setosa
 48 | 5.1,3.8,1.6,0.2,Iris-setosa
 49 | 4.6,3.2,1.4,0.2,Iris-setosa
 50 | 5.3,3.7,1.5,0.2,Iris-setosa
 51 | 5.0,3.3,1.4,0.2,Iris-setosa
 52 | 7.0,3.2,4.7,1.4,Iris-versicolor
 53 | 6.4,3.2,4.5,1.5,Iris-versicolor
 54 | 6.9,3.1,4.9,1.5,Iris-versicolor
 55 | 5.5,2.3,4.0,1.3,Iris-versicolor
 56 | 6.5,2.8,4.6,1.5,Iris-versicolor
 57 | 5.7,2.8,4.5,1.3,Iris-versicolor
 58 | 6.3,3.3,4.7,1.6,Iris-versicolor
 59 | 4.9,2.4,3.3,1.0,Iris-versicolor
 60 | 6.6,2.9,4.6,1.3,Iris-versicolor
 61 | 5.2,2.7,3.9,1.4,Iris-versicolor
 62 | 5.0,2.0,3.5,1.0,Iris-versicolor
 63 | 5.9,3.0,4.2,1.5,Iris-versicolor
 64 | 6.0,2.2,4.0,1.0,Iris-versicolor
 65 | 6.1,2.9,4.7,1.4,Iris-versicolor
 66 | 5.6,2.9,3.6,1.3,Iris-versicolor
 67 | 6.7,3.1,4.4,1.4,Iris-versicolor
 68 | 5.6,3.0,4.5,1.5,Iris-versicolor
 69 | 5.8,2.7,4.1,1.0,Iris-versicolor
 70 | 6.2,2.2,4.5,1.5,Iris-versicolor
 71 | 5.6,2.5,3.9,1.1,Iris-versicolor
 72 | 5.9,3.2,4.8,1.8,Iris-versicolor
 73 | 6.1,2.8,4.0,1.3,Iris-versicolor
 74 | 6.3,2.5,4.9,1.5,Iris-versicolor
 75 | 6.1,2.8,4.7,1.2,Iris-versicolor
 76 | 6.4,2.9,4.3,1.3,Iris-versicolor
 77 | 6.6,3.0,4.4,1.4,Iris-versicolor
 78 | 6.8,2.8,4.8,1.4,Iris-versicolor
 79 | 6.7,3.0,5.0,1.7,Iris-versicolor
 80 | 6.0,2.9,4.5,1.5,Iris-versicolor
 81 | 5.7,2.6,3.5,1.0,Iris-versicolor
 82 | 5.5,2.4,3.8,1.1,Iris-versicolor
 83 | 5.5,2.4,3.7,1.0,Iris-versicolor
 84 | 5.8,2.7,3.9,1.2,Iris-versicolor
 85 | 6.0,2.7,5.1,1.6,Iris-versicolor
 86 | 5.4,3.0,4.5,1.5,Iris-versicolor
 87 | 6.0,3.4,4.5,1.6,Iris-versicolor
 88 | 6.7,3.1,4.7,1.5,Iris-versicolor
 89 | 6.3,2.3,4.4,1.3,Iris-versicolor
 90 | 5.6,3.0,4.1,1.3,Iris-versicolor
 91 | 5.5,2.5,4.0,1.3,Iris-versicolor
 92 | 5.5,2.6,4.4,1.2,Iris-versicolor
 93 | 6.1,3.0,4.6,1.4,Iris-versicolor
 94 | 5.8,2.6,4.0,1.2,Iris-versicolor
 95 | 5.0,2.3,3.3,1.0,Iris-versicolor
 96 | 5.6,2.7,4.2,1.3,Iris-versicolor
 97 | 5.7,3.0,4.2,1.2,Iris-versicolor
 98 | 5.7,2.9,4.2,1.3,Iris-versicolor
 99 | 6.2,2.9,4.3,1.3,Iris-versicolor
100 | 5.1,2.5,3.0,1.1,Iris-versicolor
101 | 5.7,2.8,4.1,1.3,Iris-versicolor
102 | 6.3,3.3,6.0,2.5,Iris-virginica
103 | 5.8,2.7,5.1,1.9,Iris-virginica
104 | 7.1,3.0,5.9,2.1,Iris-virginica
105 | 6.3,2.9,5.6,1.8,Iris-virginica
106 | 6.5,3.0,5.8,2.2,Iris-virginica
107 | 7.6,3.0,6.6,2.1,Iris-virginica
108 | 4.9,2.5,4.5,1.7,Iris-virginica
109 | 7.3,2.9,6.3,1.8,Iris-virginica
110 | 6.7,2.5,5.8,1.8,Iris-virginica
111 | 7.2,3.6,6.1,2.5,Iris-virginica
112 | 6.5,3.2,5.1,2.0,Iris-virginica
113 | 6.4,2.7,5.3,1.9,Iris-virginica
114 | 6.8,3.0,5.5,2.1,Iris-virginica
115 | 5.7,2.5,5.0,2.0,Iris-virginica
116 | 5.8,2.8,5.1,2.4,Iris-virginica
117 | 6.4,3.2,5.3,2.3,Iris-virginica
118 | 6.5,3.0,5.5,1.8,Iris-virginica
119 | 7.7,3.8,6.7,2.2,Iris-virginica
120 | 7.7,2.6,6.9,2.3,Iris-virginica
121 | 6.0,2.2,5.0,1.5,Iris-virginica
122 | 6.9,3.2,5.7,2.3,Iris-virginica
123 | 5.6,2.8,4.9,2.0,Iris-virginica
124 | 7.7,2.8,6.7,2.0,Iris-virginica
125 | 6.3,2.7,4.9,1.8,Iris-virginica
126 | 6.7,3.3,5.7,2.1,Iris-virginica
127 | 7.2,3.2,6.0,1.8,Iris-virginica
128 | 6.2,2.8,4.8,1.8,Iris-virginica
129 | 6.1,3.0,4.9,1.8,Iris-virginica
130 | 6.4,2.8,5.6,2.1,Iris-virginica
131 | 7.2,3.0,5.8,1.6,Iris-virginica
132 | 7.4,2.8,6.1,1.9,Iris-virginica
133 | 7.9,3.8,6.4,2.0,Iris-virginica
134 | 6.4,2.8,5.6,2.2,Iris-virginica
135 | 6.3,2.8,5.1,1.5,Iris-virginica
136 | 6.1,2.6,5.6,1.4,Iris-virginica
137 | 7.7,3.0,6.1,2.3,Iris-virginica
138 | 6.3,3.4,5.6,2.4,Iris-virginica
139 | 6.4,3.1,5.5,1.8,Iris-virginica
140 | 6.0,3.0,4.8,1.8,Iris-virginica
141 | 6.9,3.1,5.4,2.1,Iris-virginica
142 | 6.7,3.1,5.6,2.4,Iris-virginica
143 | 6.9,3.1,5.1,2.3,Iris-virginica
144 | 5.8,2.7,5.1,1.9,Iris-virginica
145 | 6.8,3.2,5.9,2.3,Iris-virginica
146 | 6.7,3.3,5.7,2.5,Iris-virginica
147 | 6.7,3.0,5.2,2.3,Iris-virginica
148 | 6.3,2.5,5.0,1.9,Iris-virginica
149 | 6.5,3.0,5.2,2.0,Iris-virginica
150 | 6.2,3.4,5.4,2.3,Iris-virginica
151 | 5.9,3.0,5.1,1.8,Iris-virginica


--------------------------------------------------------------------------------
/Notebooks/01-XGBoost_BikeRental_Data_Preparation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n",
 10 |     "import pandas as pd\n",
 11 |     "import matplotlib.pyplot as plt\n",
 12 |     "\n",
 13 |     "from pandas.plotting import register_matplotlib_converters\n",
 14 |     "register_matplotlib_converters()"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "<h2>Kaggle Bike Sharing Demand Dataset</h2>\n",
 22 |     "\n",
 23 |     "Modified 'count' to log1p(count) for training\n",
 24 |     "\n",
 25 |     "A log transform can be used when the target represents a count (that is, a non-negative value)\n",
 26 |     "\n",
 27 |     "The model now predicts log1p(count). We need to convert each prediction back to an actual count using expm1(predicted_target)\n",
 28 |     "\n",
 29 |     "Reference:\n",
 30 |     "https://www.kaggle.com/apapiu/predicting-bike-sharing-with-xgboost by Alexandru Papiu\n",
 31 |     "\n",
 32 |     "To download dataset, sign-in and download from this link:\n",
 33 |     "https://www.kaggle.com/c/bike-sharing-demand/data <br>\n",
 34 |     "\n",
 35 |     "\n",
 36 |     "Input Features: ['season', 'holiday', 'workingday', 'weather', 'temp',\n",
 37 |     "       'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek','hour']<br>\n",
 38 |     "Target Feature: [<b>log1p('count')</b>]<br>\n",
 39 |     "Objective: <q>You are provided hourly rental data spanning two years. For this competition, the training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month. You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period (Ref: Kaggle.com)</q>"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "# Example\n",
 49 |     "# Converts to log1p(count)\n",
 50 |     "# Print original count back using expm1\n",
 51 |     "print('Test log and exp')\n",
 52 |     "test_count = 100\n",
 53 |     "print('original value', test_count)\n",
 54 |     "x = np.log1p(test_count) # log (x+1)\n",
 55 |     "print('log1p', x)\n",
 56 |     "print('expm1', np.expm1(x)) # exp(x) - 1"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": null,
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "columns = ['count', 'season', 'holiday', 'workingday', 'weather', 'temp',\n",
 66 |     "       'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek','hour']"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": null,
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "df = pd.read_csv('../Data/bikesharing_train.csv', parse_dates=['datetime'],index_col=0)\n",
 76 |     "df_test = pd.read_csv('../Data/bikesharing_test.csv', parse_dates=['datetime'],index_col=0)"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "# We need to convert datetime to numeric for training.\n",
 86 |     "# Let's extract key features into separate numeric columns\n",
 87 |     "def add_features(df):\n",
 88 |     "    df['year'] = df.index.year\n",
 89 |     "    df['month'] = df.index.month\n",
 90 |     "    df['day'] = df.index.day\n",
 91 |     "    df['dayofweek'] = df.index.dayofweek\n",
 92 |     "    df['hour'] = df.index.hour"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "add_features(df)\n",
102 |     "add_features(df_test)"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "plt.plot(df['2011']['count'],label='2011')\n",
112 |     "plt.plot(df['2012']['count'],label='2012')\n",
113 |     "plt.xticks(fontsize=14, rotation=45)\n",
114 |     "plt.xlabel('Date')\n",
115 |     "plt.ylabel('Rental Count')\n",
116 |     "plt.title('2011 and 2012 Rentals (Year to Year)')\n",
117 |     "plt.legend()\n",
118 |     "plt.show()"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "plt.plot(df['2011']['count'].map(np.log1p),label='2011')\n",
128 |     "plt.plot(df['2012']['count'].map(np.log1p),label='2012')\n",
129 |     "plt.xticks(fontsize=14, rotation=45)\n",
130 |     "plt.xlabel('Date')\n",
131 |     "plt.ylabel('Log(Rental Count)')\n",
132 |     "plt.title('2011 and 2012 Rentals (Year to Year)')\n",
133 |     "plt.legend()\n",
134 |     "plt.show()"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "plt.boxplot([df['count']], labels=['count'])\n",
144 |     "plt.title('Box Plot - Count')\n",
145 |     "plt.ylabel('Target')\n",
146 |     "plt.grid(True)"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "# Let's see how the data distribution changes with log1p\n",
156 |     "# Evenly distributed\n",
157 |     "plt.boxplot([df['count'].map(np.log1p)], labels=['log1p(count)'])\n",
158 |     "plt.title('Box Plot - log1p(Count)')\n",
159 |     "plt.ylabel('Target')\n",
160 |     "plt.grid(True)"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "metadata": {},
167 |    "outputs": [],
168 |    "source": [
169 |     "df[\"count\"] = df[\"count\"].map(np.log1p)"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": null,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "df.head()"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": null,
184 |    "metadata": {},
185 |    "outputs": [],
186 |    "source": [
187 |     "df_test.head()"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": null,
193 |    "metadata": {},
194 |    "outputs": [],
195 |    "source": [
196 |     "df.dtypes"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": null,
202 |    "metadata": {},
203 |    "outputs": [],
204 |    "source": [
205 |     "# Save all data\n",
206 |     "df.to_csv('../Data/bike_all.csv',index=True,index_label='datetime',columns=columns)"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "metadata": {},
212 |    "source": [
213 |     "## Training and Validation Set\n",
214 |     "### Target Variable as first column followed by input features\n",
215 |     "### Training, Validation files do not have a column header"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": null,
221 |    "metadata": {},
222 |    "outputs": [],
223 |    "source": [
224 |     "# Training = 70% of the data\n",
225 |     "# Validation = 30% of the data\n",
226 |     "# Randomize the dataset\n",
227 |     "np.random.seed(5)\n",
228 |     "l = list(df.index)\n",
229 |     "np.random.shuffle(l)\n",
230 |     "df = df.loc[l]"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": null,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": [
239 |     "rows = df.shape[0]\n",
240 |     "train = int(.7 * rows)\n",
241 |     "test = rows-train"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": null,
247 |    "metadata": {},
248 |    "outputs": [],
249 |    "source": [
250 |     "rows, train, test"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": null,
256 |    "metadata": {},
257 |    "outputs": [],
258 |    "source": [
259 |     "columns"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": null,
265 |    "metadata": {},
266 |    "outputs": [],
267 |    "source": [
268 |     "# Write Training Set\n",
269 |     "df.iloc[:train].to_csv('../Data/bike_train.csv'\n",
270 |     "                          ,index=False,header=False\n",
271 |     "                          ,columns=columns)"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": null,
277 |    "metadata": {},
278 |    "outputs": [],
279 |    "source": [
280 |     "# Write Validation Set\n",
281 |     "df.iloc[train:].to_csv('../Data/bike_validation.csv'\n",
282 |     "                          ,index=False,header=False\n",
283 |     "                          ,columns=columns)"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": null,
289 |    "metadata": {},
290 |    "outputs": [],
291 |    "source": [
292 |     "# Test Data has only input features\n",
293 |     "df_test.to_csv('../Data/bike_test.csv',index=True,index_label='datetime')"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "metadata": {},
300 |    "outputs": [],
301 |    "source": [
302 |     "print(','.join(columns))"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": null,
308 |    "metadata": {},
309 |    "outputs": [],
310 |    "source": [
311 |     "# Write Column List\n",
312 |     "with open('../Data/bike_train_column_list.txt','w') as f:\n",
313 |     "    f.write(','.join(columns))"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": null,
319 |    "metadata": {},
320 |    "outputs": [],
321 |    "source": []
322 |   }
323 |  ],
324 |  "metadata": {
325 |   "kernelspec": {
326 |    "display_name": "Python 3",
327 |    "language": "python",
328 |    "name": "python3"
329 |   },
330 |   "language_info": {
331 |    "codemirror_mode": {
332 |     "name": "ipython",
333 |     "version": 3
334 |    },
335 |    "file_extension": ".py",
336 |    "mimetype": "text/x-python",
337 |    "name": "python",
338 |    "nbconvert_exporter": "python",
339 |    "pygments_lexer": "ipython3",
340 |    "version": "3.7.6"
341 |   }
342 |  },
343 |  "nbformat": 4,
344 |  "nbformat_minor": 1
345 | }
346 | 


--------------------------------------------------------------------------------
/Notebooks/02-XGBoost_Regression_BikeRental.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Train a model with bike rental data using XGBoost algorithm\n",
  8 |     "### Training log1p(count) dataset\n",
  9 |     "###  Model is trained with XGBoost installed in notebook instance\n",
 10 |     "###  In the later examples, we will train using SageMaker's XGBoost algorithm"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "# Install xgboost in notebook instance.\n",
 20 |     "#### Command to install xgboost\n",
 21 |     "# !pip install xgboost==0.90"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import sys\n",
 31 |     "import numpy as np\n",
 32 |     "import pandas as pd\n",
 33 |     "import matplotlib.pyplot as plt\n",
 34 |     "from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
 35 |     "\n",
 36 |     "# XGBoost \n",
 37 |     "import xgboost as xgb\n",
 38 |     "\n",
 39 |     "import matplotlib.pyplot as plt\n",
 40 |     "\n",
 41 |     "from pandas.plotting import register_matplotlib_converters\n",
 42 |     "register_matplotlib_converters()"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "<h2>Kaggle Bike Sharing Demand Dataset</h2>\n",
 50 |     "\n",
 51 |     "Modified 'count' to log1p(count) for training\n",
 52 |     "\n",
 53 |     "A log transform can be used when the target represents a count (that is, a non-negative value)\n",
 54 |     "\n",
 55 |     "The model now predicts log1p(count). We need to convert each prediction back to an actual count using expm1(predicted_target)\n",
 56 |     "\n",
 57 |     "Reference:\n",
 58 |     "https://www.kaggle.com/apapiu/predicting-bike-sharing-with-xgboost by Alexandru Papiu\n",
 59 |     "\n",
 60 |     "To download dataset, sign-in and download from this link:\n",
 61 |     "https://www.kaggle.com/c/bike-sharing-demand/data <br>\n",
 62 |     "\n",
 63 |     "\n",
 64 |     "Input Features: ['season', 'holiday', 'workingday', 'weather', 'temp',\n",
 65 |     "       'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek','hour']<br>\n",
 66 |     "Target Feature: [<b>log1p('count')</b>]<br>\n",
 67 |     "Objective: <q>You are provided hourly rental data spanning two years. For this competition, the training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month. You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period (Ref: Kaggle.com)</q>"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "column_list_file = '../Data/bike_train_column_list.txt'\n",
 77 |     "train_file = '../Data/bike_train.csv'\n",
 78 |     "validation_file = '../Data/bike_validation.csv'\n",
 79 |     "test_file = '../Data/bike_test.csv'"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "metadata": {},
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "columns = ''\n",
 89 |     "with open(column_list_file,'r') as f:\n",
 90 |     "    columns = f.read().split(',')"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": null,
 96 |    "metadata": {},
 97 |    "outputs": [],
 98 |    "source": [
 99 |     "columns"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": null,
105 |    "metadata": {},
106 |    "outputs": [],
107 |    "source": [
108 |     "# Specify the column names as the file does not have column header\n",
109 |     "df_train = pd.read_csv(train_file,names=columns)\n",
110 |     "df_validation = pd.read_csv(validation_file,names=columns)"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "df_train.head()"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "df_validation.head()"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {},
135 |    "outputs": [],
136 |    "source": [
137 |     "X_train = df_train.iloc[:,1:] # Features: 1st column onwards \n",
138 |     "y_train = df_train.iloc[:,0].ravel() # Target: 0th column\n",
139 |     "\n",
140 |     "X_validation = df_validation.iloc[:,1:]\n",
141 |     "y_validation = df_validation.iloc[:,0].ravel()"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "# XGBoost Training Parameter Reference: \n",
151 |     "#   https://xgboost.readthedocs.io/en/latest/parameter.html\n",
152 |     "#regressor = xgb.XGBRegressor(max_depth=5,eta=0.1,subsample=0.7,num_round=150)\n",
153 |     "regressor = xgb.XGBRegressor(max_depth=5,n_estimators=150)"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "regressor"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": null,
168 |    "metadata": {},
169 |    "outputs": [],
170 |    "source": [
171 |     "regressor.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_validation, y_validation)])"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": null,
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "df_train['count'].describe()"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": null,
186 |    "metadata": {},
187 |    "outputs": [],
188 |    "source": [
189 |     "eval_result = regressor.evals_result()"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": null,
195 |    "metadata": {},
196 |    "outputs": [],
197 |    "source": [
198 |     "training_rounds = range(len(eval_result['validation_0']['rmse']))"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": null,
204 |    "metadata": {},
205 |    "outputs": [],
206 |    "source": [
207 |     "plt.scatter(x=training_rounds,y=eval_result['validation_0']['rmse'],label='Training Error')\n",
208 |     "plt.scatter(x=training_rounds,y=eval_result['validation_1']['rmse'],label='Validation Error')\n",
209 |     "plt.grid(True)\n",
210 |     "plt.xlabel('Iteration')\n",
211 |     "plt.ylabel('RMSE')\n",
212 |     "plt.title('Training Vs Validation Error')\n",
213 |     "plt.legend()\n",
214 |     "plt.show()"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "xgb.plot_importance(regressor)\n",
224 |     "plt.show()"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": null,
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "# Updated - Changed to validation dataset\n",
234 |     "# Compare actual vs predicted performance with dataset not seen by the model before\n",
235 |     "df = pd.read_csv(validation_file,names=columns)"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": null,
241 |    "metadata": {},
242 |    "outputs": [],
243 |    "source": [
244 |     "df.head()"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": null,
250 |    "metadata": {},
251 |    "outputs": [],
252 |    "source": [
253 |     "X_test = df.iloc[:,1:]\n",
254 |     "print(X_test[:5])"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": null,
260 |    "metadata": {},
261 |    "outputs": [],
262 |    "source": [
263 |     "result = regressor.predict(X_test)"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "code",
268 |    "execution_count": null,
269 |    "metadata": {},
270 |    "outputs": [],
271 |    "source": [
272 |     "result[:5]"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": null,
278 |    "metadata": {},
279 |    "outputs": [],
280 |    "source": [
281 |     "df.head()"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "code",
286 |    "execution_count": null,
287 |    "metadata": {},
288 |    "outputs": [],
289 |    "source": [
290 |     "df['count_predicted'] = result"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "code",
295 |    "execution_count": null,
296 |    "metadata": {},
297 |    "outputs": [],
298 |    "source": [
299 |     "df.head()"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "code",
304 |    "execution_count": null,
305 |    "metadata": {},
306 |    "outputs": [],
307 |    "source": [
308 |     "# Negative Values are predicted\n",
309 |     "df['count_predicted'].describe()"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": null,
315 |    "metadata": {},
316 |    "outputs": [],
317 |    "source": [
318 |     "df[df['count_predicted'] < 0]"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": null,
324 |    "metadata": {},
325 |    "outputs": [],
326 |    "source": [
327 |     "def adjust_count(x):\n",
328 |     "    if x < 0:\n",
329 |     "        return 0\n",
330 |     "    else:\n",
331 |     "        return x"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": null,
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "df['count_predicted'] = df['count_predicted'].map(adjust_count)"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": null,
346 |    "metadata": {},
347 |    "outputs": [],
348 |    "source": [
349 |     "df[df['count_predicted'] < 0]"
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "code",
354 |    "execution_count": null,
355 |    "metadata": {},
356 |    "outputs": [],
357 |    "source": [
358 |     "df['count'] = df['count'].map(np.expm1)\n",
359 |     "df['count_predicted'] = df['count_predicted'].map(np.expm1)"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": null,
365 |    "metadata": {},
366 |    "outputs": [],
367 |    "source": [
368 |     "# Actual Vs Predicted\n",
369 |     "plt.plot(df['count'], label='Actual')\n",
370 |     "plt.plot(df['count_predicted'],label='Predicted')\n",
371 |     "plt.xlabel('Sample')\n",
372 |     "plt.ylabel('Count')\n",
373 |     "plt.xlim([100,150])\n",
374 |     "plt.title('Validation Dataset - Predicted Vs. Actual')\n",
375 |     "plt.legend()\n",
376 |     "plt.show()"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "code",
381 |    "execution_count": null,
382 |    "metadata": {},
383 |    "outputs": [],
384 |    "source": [
385 |     "# Over prediction and Under Prediction needs to be balanced\n",
386 |     "# Validation Data Residuals\n",
387 |     "residuals = (df['count'] - df['count_predicted'])\n",
388 |     "\n",
389 |     "plt.hist(residuals)\n",
390 |     "plt.grid(True)\n",
391 |     "plt.xlabel('Actual - Predicted')\n",
392 |     "plt.ylabel('Count')\n",
393 |     "plt.title('Residuals Distribution')\n",
394 |     "plt.axvline(color='r')\n",
395 |     "plt.show()"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": null,
401 |    "metadata": {},
402 |    "outputs": [],
403 |    "source": [
404 |     "value_counts = (residuals > 0).value_counts(sort=False)\n",
405 |     "print(' Under Estimation: {0:.2f}'.format(value_counts[True]/len(residuals)))\n",
406 |     "print(' Over  Estimation: {0:.2f}'.format(value_counts[False]/len(residuals)))"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": null,
412 |    "metadata": {},
413 |    "outputs": [],
414 |    "source": [
415 |     "import sklearn.metrics as metrics\n",
416 |     "print(\"RMSE: {0:.2f}\".format(metrics.mean_squared_error(df['count'],\n",
417 |     "                                                    df['count_predicted'])**.5))"
418 |    ]
419 |   },
420 |   {
421 |    "cell_type": "code",
422 |    "execution_count": null,
423 |    "metadata": {},
424 |    "outputs": [],
425 |    "source": [
426 |     "# Metric Used By Kaggle\n",
427 |     "def compute_rmsle(y_true, y_pred):\n",
428 |     "    if type(y_true) != np.ndarray:\n",
429 |     "        y_true = np.array(y_true)\n",
430 |     "        \n",
431 |     "    if type(y_pred) != np.ndarray:\n",
432 |     "        y_pred = np.array(y_pred)\n",
433 |     "     \n",
434 |     "    return(np.average((np.log1p(y_pred) - np.log1p(y_true))**2)**.5)"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "code",
439 |    "execution_count": null,
440 |    "metadata": {},
441 |    "outputs": [],
442 |    "source": [
443 |     "print(\"RMSLE: {0:.2f}\".format(compute_rmsle(df['count'],df['count_predicted'])))"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "code",
448 |    "execution_count": null,
449 |    "metadata": {},
450 |    "outputs": [],
451 |    "source": [
452 |     "# Prepare Data for Submission to Kaggle\n",
453 |     "df_test = pd.read_csv(test_file,parse_dates=['datetime'])"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "code",
458 |    "execution_count": null,
459 |    "metadata": {},
460 |    "outputs": [],
461 |    "source": [
462 |     "df_test.head()"
463 |    ]
464 |   },
465 |   {
466 |    "cell_type": "code",
467 |    "execution_count": null,
468 |    "metadata": {},
469 |    "outputs": [],
470 |    "source": [
471 |     "X_test =  df_test.iloc[:,1:] # Exclude datetime for prediction"
472 |    ]
473 |   },
474 |   {
475 |    "cell_type": "code",
476 |    "execution_count": null,
477 |    "metadata": {},
478 |    "outputs": [],
479 |    "source": [
480 |     "X_test.head()"
481 |    ]
482 |   },
483 |   {
484 |    "cell_type": "code",
485 |    "execution_count": null,
486 |    "metadata": {},
487 |    "outputs": [],
488 |    "source": [
489 |     "result = regressor.predict(X_test)"
490 |    ]
491 |   },
492 |   {
493 |    "cell_type": "code",
494 |    "execution_count": null,
495 |    "metadata": {},
496 |    "outputs": [],
497 |    "source": [
498 |     "result[:5]"
499 |    ]
500 |   },
501 |   {
502 |    "cell_type": "code",
503 |    "execution_count": null,
504 |    "metadata": {},
505 |    "outputs": [],
506 |    "source": [
507 |     "np.expm1(result)"
508 |    ]
509 |   },
510 |   {
511 |    "cell_type": "code",
512 |    "execution_count": null,
513 |    "metadata": {},
514 |    "outputs": [],
515 |    "source": [
516 |     "# Convert result to actual count\n",
517 |     "df_test[\"count\"] = np.expm1(result)"
518 |    ]
519 |   },
520 |   {
521 |    "cell_type": "code",
522 |    "execution_count": null,
523 |    "metadata": {},
524 |    "outputs": [],
525 |    "source": [
526 |     "df_test.head()"
527 |    ]
528 |   },
529 |   {
530 |    "cell_type": "code",
531 |    "execution_count": null,
532 |    "metadata": {},
533 |    "outputs": [],
534 |    "source": [
535 |     "df_test[df_test[\"count\"] < 0]"
536 |    ]
537 |   },
538 |   {
539 |    "cell_type": "code",
540 |    "execution_count": null,
541 |    "metadata": {},
542 |    "outputs": [],
543 |    "source": [
544 |     "df_test[['datetime','count']].to_csv('../Data/predicted_count.csv',index=False)"
545 |    ]
546 |   },
547 |   {
548 |    "cell_type": "code",
549 |    "execution_count": null,
550 |    "metadata": {},
551 |    "outputs": [],
552 |    "source": []
553 |   }
554 |  ],
555 |  "metadata": {
556 |   "kernelspec": {
557 |    "display_name": "Python 3",
558 |    "language": "python",
559 |    "name": "python3"
560 |   },
561 |   "language_info": {
562 |    "codemirror_mode": {
563 |     "name": "ipython",
564 |     "version": 3
565 |    },
566 |    "file_extension": ".py",
567 |    "mimetype": "text/x-python",
568 |    "name": "python",
569 |    "nbconvert_exporter": "python",
570 |    "pygments_lexer": "ipython3",
571 |    "version": "3.7.6"
572 |   }
573 |  },
574 |  "nbformat": 4,
575 |  "nbformat_minor": 2
576 | }
577 | 


--------------------------------------------------------------------------------
/Notebooks/03-XGBoost_Binary_Classification_Diabetes_Dataset.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "nbpresent": {
  7 |      "id": "782a07bf-08de-4030-88e1-6731c4ac956e"
  8 |     }
  9 |    },
 10 |    "source": [
 11 |     "## Diabetes dataset \n",
 12 |     "### Predict if a person is at risk of developing diabetes\n",
 13 |     "\n",
 14 |     "### This Dataset is Freely Available\n",
 15 |     "\n",
 16 |     "### Overview:\n",
 17 |     "The data was collected and made available by the \"National Institute of Diabetes and Digestive and Kidney Diseases\" as part of the Pima Indians Diabetes Database. \n",
 18 |     "\n",
 19 |     "`Diabetes.csv` is available [from Kaggle](https://www.kaggle.com/uciml/pima-indians-diabetes-database). We want to answer several questions: which features are most strongly correlated with a positive diagnosis, and, if we could ask a patient only two questions, which two should we ask and how would we estimate their risk of being diagnosed?\n",
 20 |     "\n",
 21 |     "++++++++++++++++++++++++++++++++++++\n",
 22 |     "\n",
 23 |     "The following features have been provided to help us predict whether a person is diabetic or not:\n",
 24 |     "* **Pregnancies:**  Number of times pregnant\n",
 25 |     "* **Glucose:** Plasma glucose concentration over 2 hours in an oral glucose tolerance test\n",
 26 |     "* **BloodPressure:** Diastolic blood pressure (mm Hg)\n",
 27 |     "* **SkinThickness:** Triceps skin fold thickness (mm)\n",
 28 |     "* **Insulin:** 2-Hour serum insulin (mu U/ml)\n",
 29 |     "* **BMI:** Body mass index (weight in kg/(height in m)<sup>2</sup>)\n",
 30 |     "* **DiabetesPedigreeFunction:** Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)\n",
 31 |     "* **Age:** Age (years)\n",
 32 |     "* **Outcome:** Class variable (0 if non-diabetic, 1 if diabetic)\n",
 33 |     "\n",
 34 |     "### Binary Classification problem - XGBoost"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {
 41 |     "nbpresent": {
 42 |      "id": "6c6a8672-d428-410a-82fa-7f587c9ef2ae"
 43 |     }
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "# Install xgboost in notebook instance.\n",
 48 |     "#### Command to install xgboost\n",
 49 |     "#!pip install xgboost==0.90"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {
 56 |     "nbpresent": {
 57 |      "id": "652b58d4-3b75-405f-9f11-24d0cd1f9656"
 58 |     }
 59 |    },
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "import sys\n",
 63 |     "import numpy as np\n",
 64 |     "import pandas as pd\n",
 65 |     "import matplotlib.pyplot as plt\n",
 66 |     "import itertools\n",
 67 |     "\n",
 68 |     "import xgboost as xgb\n",
 69 |     "from sklearn.metrics import classification_report, confusion_matrix"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": null,
 75 |    "metadata": {
 76 |     "nbpresent": {
 77 |      "id": "a3946273-d086-4564-b0f1-6adc225191c3"
 78 |     }
 79 |    },
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "data = pd.read_csv(\"../Data/Diabetes.csv\")"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "data.describe()"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "data.info()"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": null,
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": [
109 |     "## Only keep rows where none of the columns has a 0 value (except the first and last columns)\n",
110 |     "data = data[~(data[data.columns[1:-1]] == 0).any(axis=1)]\n",
111 |     "data.reset_index(inplace=True, drop = True)"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "### Dealing with Missing Values"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "# using isnull() function  \n",
128 |     "# print(data.isnull().any().sum())\n",
129 |     "print(data.isnull().sum())\n",
130 |     "#data.isnull()"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": null,
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "data.drop(columns=['Insulin'], inplace = True)\n",
140 |     "data.reset_index(inplace=True, drop = True)"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "metadata": {},
147 |    "outputs": [],
148 |    "source": [
149 |     "### Replace missing values in each column with the mean or median of that column\n",
150 |     "#data.fillna(data.mean())\n",
151 |     "data.fillna(data.median(), inplace=True)\n",
152 |     "\n",
153 |     "### Drop all rows that contain missing values?\n",
154 |     "#data = data.dropna()\n",
155 |     "#data.reset_index(inplace=True, drop = True)"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "markdown",
160 |    "metadata": {},
161 |    "source": [
162 |     "### Split Data"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": null,
168 |    "metadata": {},
169 |    "outputs": [],
170 |    "source": [
171 |     "# Training = 70% of the data\n",
172 |     "# Validation = 30% of the data\n",
173 |     "# Randomize the dataset\n",
174 |     "np.random.seed(5)\n",
175 |     "l = list(data.index)\n",
176 |     "np.random.shuffle(l)\n",
177 |     "data = data.iloc[l]\n",
178 |     "data.reset_index(inplace=True, drop = True)\n",
179 |     "data"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": null,
185 |    "metadata": {},
186 |    "outputs": [],
187 |    "source": [
188 |     "rows = data.shape[0]\n",
189 |     "train = int(.7 * rows)\n",
190 |     "test = rows - train"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "metadata": {},
197 |    "outputs": [],
198 |    "source": [
199 |     "rows, train, test"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": null,
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": [
208 |     "# Training Set\n",
209 |     "df_train = data[:train]\n",
210 |     "#df_train"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": null,
216 |    "metadata": {},
217 |    "outputs": [],
218 |    "source": [
219 |     "# Validation Set\n",
220 |     "df_validation = data[train:]\n",
221 |     "#df_validation"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": null,
227 |    "metadata": {
228 |     "nbpresent": {
229 |      "id": "a195ae30-1962-4427-859b-73a013dc10d6"
230 |     }
231 |    },
232 |    "outputs": [],
233 |    "source": [
234 |     "df_train.head()"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": null,
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "377 * 8"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {
250 |     "nbpresent": {
251 |      "id": "e30e8aeb-1ca2-4851-bc2d-1bdee29ab1cf"
252 |     }
253 |    },
254 |    "outputs": [],
255 |    "source": [
256 |     "df_validation.head()"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": null,
262 |    "metadata": {
263 |     "nbpresent": {
264 |      "id": "3b240613-803d-4fa9-93cf-53ef68df7b93"
265 |     }
266 |    },
267 |    "outputs": [],
268 |    "source": [
269 |     "X_train = df_train.iloc[:,:-1] # Features: all columns except the last\n",
270 |     "y_train = df_train.iloc[:,-1].ravel() # Target: last column\n",
271 |     "\n",
272 |     "X_validation = df_validation.iloc[:,:-1]\n",
273 |     "y_validation = df_validation.iloc[:,-1].ravel()"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": null,
279 |    "metadata": {},
280 |    "outputs": [],
281 |    "source": [
282 |     "y_validation.shape"
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "code",
287 |    "execution_count": null,
288 |    "metadata": {
289 |     "nbpresent": {
290 |      "id": "9edc89e7-45d3-4350-9eb4-3e0938c3c55e"
291 |     }
292 |    },
293 |    "outputs": [],
294 |    "source": [
295 |     "# Launch a classifier\n",
296 |     "# XGBoost Training Parameter Reference: \n",
297 |     "#   https://xgboost.readthedocs.io/en/latest/parameter.html\n",
298 |     "classifier = xgb.XGBClassifier (objective=\"binary:logistic\")"
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "code",
303 |    "execution_count": null,
304 |    "metadata": {
305 |     "nbpresent": {
306 |      "id": "348296fb-8c9b-4598-ad2e-d1fe8e10f76a"
307 |     }
308 |    },
309 |    "outputs": [],
310 |    "source": [
311 |     "classifier"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": null,
317 |    "metadata": {
318 |     "nbpresent": {
319 |      "id": "9839d7ce-e791-4d93-bc5f-28604ffde022"
320 |     }
321 |    },
322 |    "outputs": [],
323 |    "source": [
324 |     "classifier.fit(X_train,\n",
325 |     "               y_train, \n",
326 |     "               eval_set = [(X_train, y_train), (X_validation, y_validation)], \n",
327 |     "               eval_metric=['logloss'],\n",
328 |     "               early_stopping_rounds=20)"
329 |    ]
330 |   },
331 |   {
332 |    "cell_type": "code",
333 |    "execution_count": null,
334 |    "metadata": {
335 |     "nbpresent": {
336 |      "id": "e08f22c1-4346-4e2d-96a2-9974ed5c59ff"
337 |     }
338 |    },
339 |    "outputs": [],
340 |    "source": [
341 |     "eval_result = classifier.evals_result()"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": null,
347 |    "metadata": {
348 |     "nbpresent": {
349 |      "id": "092776c3-a611-4f40-91e2-664b3b99d05e"
350 |     }
351 |    },
352 |    "outputs": [],
353 |    "source": [
354 |     "training_rounds = range(len(eval_result['validation_0']['logloss']))"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "code",
359 |    "execution_count": null,
360 |    "metadata": {
361 |     "nbpresent": {
362 |      "id": "2e9af3f7-fb85-4c52-83d5-ff9cae457294"
363 |     }
364 |    },
365 |    "outputs": [],
366 |    "source": [
367 |     "print(training_rounds)"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "code",
372 |    "execution_count": null,
373 |    "metadata": {
374 |     "nbpresent": {
375 |      "id": "5e71239a-e321-43ba-ac2c-993b57b3be3a"
376 |     }
377 |    },
378 |    "outputs": [],
379 |    "source": [
380 |     "plt.scatter(x=training_rounds,y=eval_result['validation_0']['logloss'],label='Training Error')\n",
381 |     "plt.scatter(x=training_rounds,y=eval_result['validation_1']['logloss'],label='Validation Error')\n",
382 |     "plt.grid(True)\n",
383 |     "plt.xlabel('Iteration')\n",
384 |     "plt.ylabel('LogLoss')\n",
385 |     "plt.title('Training Vs Validation Error')\n",
386 |     "plt.legend()\n",
387 |     "plt.show()"
388 |    ]
389 |   },
390 |   {
391 |    "cell_type": "markdown",
392 |    "metadata": {},
393 |    "source": [
394 |     "#### Notice:\n",
395 |     "* Model is not generalising well, low train error but high validation error\n",
396 |     "* Model has high variance!"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "code",
401 |    "execution_count": null,
402 |    "metadata": {
403 |     "nbpresent": {
404 |      "id": "f144f315-6d38-429e-8c17-06c17a446198"
405 |     }
406 |    },
407 |    "outputs": [],
408 |    "source": [
409 |     "xgb.plot_importance(classifier)\n",
410 |     "plt.show()"
411 |    ]
412 |   },
413 |   {
414 |    "cell_type": "markdown",
415 |    "metadata": {
416 |     "nbpresent": {
417 |      "id": "3312675d-307c-4eff-b835-34f0e7f57924"
418 |     }
419 |    },
420 |    "source": [
421 |     "#### Predict the Validation Set"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": null,
427 |    "metadata": {
428 |     "nbpresent": {
429 |      "id": "9b5cb70d-6069-4511-810e-fd17e72667dd"
430 |     }
431 |    },
432 |    "outputs": [],
433 |    "source": [
434 |     "X_test = df_validation.iloc[:,:-1]"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "code",
439 |    "execution_count": null,
440 |    "metadata": {
441 |     "nbpresent": {
442 |      "id": "f611c852-50e3-4a1a-9134-c1c6e82ad780"
443 |     }
444 |    },
445 |    "outputs": [],
446 |    "source": [
447 |     "result = classifier.predict(X_test)"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "code",
452 |    "execution_count": null,
453 |    "metadata": {},
454 |    "outputs": [],
455 |    "source": [
456 |     "result[:5]"
457 |    ]
458 |   },
459 |   {
460 |    "cell_type": "code",
461 |    "execution_count": null,
462 |    "metadata": {
463 |     "nbpresent": {
464 |      "id": "2c573c2b-4143-4e01-b107-e6b871ce0249"
465 |     }
466 |    },
467 |    "outputs": [],
468 |    "source": [
469 |     "df_validation['predicted_class'] = result"
470 |    ]
471 |   },
472 |   {
473 |    "cell_type": "code",
474 |    "execution_count": null,
475 |    "metadata": {
476 |     "nbpresent": {
477 |      "id": "5ad0fa04-6896-46b5-bc23-40d61480d7ca"
478 |     }
479 |    },
480 |    "outputs": [],
481 |    "source": [
482 |     "df_validation.head()"
483 |    ]
484 |   },
485 |   {
486 |    "cell_type": "markdown",
487 |    "metadata": {},
488 |    "source": [
489 |     "## Binary Classifier Metrics"
490 |    ]
491 |   },
492 |   {
493 |    "cell_type": "code",
494 |    "execution_count": null,
495 |    "metadata": {},
496 |    "outputs": [],
497 |    "source": [
498 |     "# Reference: https://scikit-learn.org/stable/modules/model_evaluation.html\n",
499 |     "# Explicitly stating labels. Diabetic=1, Non-diabetic=0\n",
500 |     "def true_positive(y_true, y_pred): \n",
501 |     "    return confusion_matrix(y_true, y_pred,labels=[1,0])[0, 0]\n",
502 |     "\n",
503 |     "def true_negative(y_true, y_pred): \n",
504 |     "    return confusion_matrix(y_true,y_pred,labels=[1,0])[1, 1]\n",
505 |     "\n",
506 |     "def false_positive(y_true, y_pred): \n",
507 |     "    return confusion_matrix(y_true, y_pred,labels=[1,0])[1, 0]\n",
508 |     "\n",
509 |     "def false_negative(y_true, y_pred): \n",
510 |     "    return confusion_matrix(y_true, y_pred,labels=[1,0])[0, 1]"
511 |    ]
512 |   },
513 |   {
514 |    "cell_type": "code",
515 |    "execution_count": null,
516 |    "metadata": {},
517 |    "outputs": [],
518 |    "source": [
519 |     "# Compute Binary Classifier Metrics\n",
520 |     "# Returns a dictionary {\"MetricName\":Value,...}\n",
521 |     "\n",
522 |     "def binary_classifier_metrics(y_true, y_pred):\n",
523 |     "    metrics = {}\n",
524 |     "\n",
525 |     "    # References: \n",
526 |     "    #  https://docs.aws.amazon.com/machine-learning/latest/dg/binary-classification.html\n",
527 |     "    #  https://en.wikipedia.org/wiki/Confusion_matrix\n",
528 |     "    \n",
529 |     "    # Definition:\n",
530 |     "    # true positive = tp = how many samples were correctly classified as positive (count)\n",
531 |     "    # true negative = tn = how many samples were correctly classified as negative (count)\n",
532 |     "    # false positive = fp = how many negative samples were mis-classified as positive (count)\n",
533 |     "    # false_negative = fn = how many positive samples were mis-classified as negative (count)\n",
534 |     "    \n",
535 |     "    # positive = number of positive samples (count)\n",
536 |     "    #          = true positive + false negative\n",
537 |     "    # negative = number of negative samples (count)\n",
538 |     "    #          = true negative + false positive\n",
539 |     "    \n",
540 |     "    tp = true_positive(y_true, y_pred)\n",
541 |     "    tn = true_negative(y_true, y_pred)\n",
542 |     "    fp = false_positive(y_true, y_pred)\n",
543 |     "    fn = false_negative(y_true, y_pred)\n",
544 |     "    \n",
545 |     "    positive = tp + fn\n",
546 |     "    negative = tn + fp\n",
547 |     "    \n",
548 |     "    metrics['TruePositive'] = tp\n",
549 |     "    metrics['TrueNegative'] = tn\n",
550 |     "    metrics['FalsePositive'] = fp\n",
551 |     "    metrics['FalseNegative'] = fn\n",
552 |     "    \n",
553 |     "    metrics['Positive'] = positive\n",
554 |     "    metrics['Negative'] = negative\n",
555 |     "    \n",
556 |     "    # True Positive Rate (TPR, Recall) = true positive/positive\n",
557 |     "    # How many positives were correctly classified? (fraction)\n",
558 |     "    # Recall value closer to 1 is better. closer to 0 is worse\n",
559 |     "    if tp == 0:\n",
560 |     "        recall = 0\n",
561 |     "    else:\n",
562 |     "        recall = tp/positive\n",
563 |     "        \n",
564 |     "    metrics['Recall'] = recall\n",
565 |     "    \n",
566 |     "    # True Negative Rate = True Negative/negative\n",
567 |     "    # How many negatives were correctly classified? (fraction)\n",
568 |     "    # True Negative Rate value closer to 1 is better. closer to 0 is worse\n",
569 |     "    if tn == 0:\n",
570 |     "        tnr = 0\n",
571 |     "    else:\n",
572 |     "        tnr = tn/(negative)\n",
573 |     "    metrics['TrueNegativeRate'] = tnr\n",
574 |     "    \n",
575 |     "    # Precision = True Positive/(True Positive + False Positive)\n",
576 |     "    # How many positives classified by the algorithm are really positives? (fraction)\n",
577 |     "    # Precision value closer to 1 is better. closer to 0 is worse\n",
578 |     "    if tp == 0:\n",
579 |     "        precision = 0\n",
580 |     "    else:\n",
581 |     "        precision = tp/(tp + fp)\n",
582 |     "    metrics['Precision'] = precision\n",
583 |     "    \n",
584 |     "    # Accuracy = (True Positive + True Negative)/(total positive + total negative)\n",
585 |     "    # How many positives and negatives were correctly classified? (fraction)\n",
586 |     "    # Accuracy value closer to 1 is better. closer to 0 is worse\n",
587 |     "    accuracy = (tp + tn)/(positive + negative)\n",
588 |     "    metrics['Accuracy'] = accuracy\n",
589 |     "    \n",
590 |     "    # False Positive Rate (FPR, False Alarm) = False Positive/(total negative)\n",
591 |     "    # How many negatives were mis-classified as positives (fraction)\n",
592 |     "    # False Positive Rate value closer to 0 is better. closer to 1 is worse\n",
593 |     "    if fp == 0:\n",
594 |     "        fpr = 0\n",
595 |     "    else:\n",
596 |     "        fpr = fp/(negative)\n",
597 |     "    metrics['FalsePositiveRate'] = fpr\n",
598 |     "    \n",
599 |     "    # False Negative Rate (FNR, Misses) = False Negative/(total Positive)\n",
600 |     "    # How many positives were mis-classified as negative (fraction)\n",
601 |     "    # False Negative Rate value closer to 0 is better. closer to 1 is worse\n",
602 |     "    fnr = fn/(positive)\n",
603 |     "    metrics['FalseNegativeRate'] = fnr\n",
604 |     "    \n",
605 |     "    # F1 Score = harmonic mean of Precision and Recall\n",
606 |     "    # F1 Score closer to 1 is better. Closer to 0 is worse.\n",
607 |     "    if precision == 0 or recall == 0:\n",
608 |     "        f1 = 0\n",
609 |     "    else:        \n",
610 |     "        f1 = 2*precision*recall/(precision+recall)\n",
611 |     "\n",
612 |     "    metrics['F1'] = f1\n",
613 |     "    \n",
614 |     "    return metrics"
615 |    ]
616 |   },
617 |   {
618 |    "cell_type": "code",
619 |    "execution_count": null,
620 |    "metadata": {},
621 |    "outputs": [],
622 |    "source": [
623 |     "# Reference: \n",
624 |     "# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n",
625 |     "def plot_confusion_matrix(cm, classes,\n",
626 |     "                          normalize=False,\n",
627 |     "                          title='Confusion matrix',\n",
628 |     "                          cmap=plt.cm.Blues):\n",
629 |     "    \"\"\"\n",
630 |     "    This function plots the confusion matrix.\n",
631 |     "    Normalization can be applied by setting `normalize=True`.\n",
632 |     "    \"\"\"\n",
633 |     "    if normalize:\n",
634 |     "        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
635 |     "        #print(\"Normalized confusion matrix\")\n",
636 |     "    #else:\n",
637 |     "    #    print('Confusion matrix, without normalization')\n",
638 |     "\n",
639 |     "    #print(cm)\n",
640 |     "\n",
641 |     "    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
642 |     "    plt.title(title)\n",
643 |     "    plt.colorbar()\n",
644 |     "    tick_marks = np.arange(len(classes))\n",
645 |     "    plt.xticks(tick_marks, classes, rotation=45)\n",
646 |     "    plt.yticks(tick_marks, classes)\n",
647 |     "\n",
648 |     "    fmt = '.2f' if normalize else 'd'\n",
649 |     "    thresh = cm.max() / 2.\n",
650 |     "    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
651 |     "        plt.text(j, i, format(cm[i, j], fmt),\n",
652 |     "                 horizontalalignment=\"center\",\n",
653 |     "                 color=\"white\" if cm[i, j] > thresh else \"black\")\n",
654 |     "\n",
655 |     "    plt.ylabel('True label')\n",
656 |     "    plt.xlabel('Predicted label')\n",
657 |     "    plt.tight_layout()"
658 |    ]
659 |   },
660 |   {
661 |    "cell_type": "code",
662 |    "execution_count": null,
663 |    "metadata": {},
664 |    "outputs": [],
665 |    "source": [
666 |     "# Compute confusion matrix\n",
667 |     "cnf_matrix = confusion_matrix(df_validation['Outcome'], df_validation['predicted_class'],labels=[1,0])"
668 |    ]
669 |   },
670 |   {
671 |    "cell_type": "code",
672 |    "execution_count": null,
673 |    "metadata": {},
674 |    "outputs": [],
675 |    "source": [
676 |     "# Plot confusion matrix\n",
677 |     "plt.figure()\n",
678 |     "plot_confusion_matrix(cnf_matrix, classes=['Diabetic','Normal'],\n",
679 |     "                      title='Confusion Matrix')"
680 |    ]
681 |   },
682 |   {
683 |    "cell_type": "code",
684 |    "execution_count": null,
685 |    "metadata": {},
686 |    "outputs": [],
687 |    "source": [
688 |     "# Plot confusion matrix\n",
689 |     "plt.figure()\n",
690 |     "plot_confusion_matrix(cnf_matrix, classes=['Diabetic','Normal'],\n",
691 |     "                      title='Confusion Matrix - Fraction', normalize=True)"
692 |    ]
693 |   },
694 |   {
695 |    "cell_type": "code",
696 |    "execution_count": null,
697 |    "metadata": {},
698 |    "outputs": [],
699 |    "source": [
700 |     "metrics = [binary_classifier_metrics(df_validation['Outcome'], df_validation['predicted_class'])]\n",
701 |     "df_metrics=pd.DataFrame.from_dict(metrics)\n",
702 |     "df_metrics.index = ['Model']"
703 |    ]
704 |   },
705 |   {
706 |    "cell_type": "code",
707 |    "execution_count": null,
708 |    "metadata": {},
709 |    "outputs": [],
710 |    "source": [
711 |     "df_metrics"
712 |    ]
713 |   },
714 |   {
715 |    "cell_type": "code",
716 |    "execution_count": null,
717 |    "metadata": {},
718 |    "outputs": [],
719 |    "source": [
720 |     "print('Counts')\n",
721 |     "print(df_metrics[['TruePositive',\n",
722 |     "                  'FalseNegative',\n",
723 |     "                  'FalsePositive',\n",
724 |     "                  'TrueNegative',]].round(2))\n",
725 |     "print()\n",
726 |     "print('Fractions')\n",
727 |     "print(df_metrics[['Recall',\n",
728 |     "                  'FalseNegativeRate',\n",
729 |     "                  'FalsePositiveRate',\n",
730 |     "                  'TrueNegativeRate',]].round(2))\n",
731 |     "print()\n",
732 |     "\n",
733 |     "print(df_metrics[['Precision',\n",
734 |     "                  'Accuracy',\n",
735 |     "                  'F1']].round(2))"
736 |    ]
737 |   },
738 |   {
739 |    "cell_type": "code",
740 |    "execution_count": null,
741 |    "metadata": {},
742 |    "outputs": [],
743 |    "source": [
744 |     "print(classification_report(\n",
745 |     "    df_validation['Outcome'],\n",
746 |     "    df_validation['predicted_class'],\n",
747 |     "    labels=[1,0],\n",
748 |     "    target_names=['Diabetic','Normal']))"
749 |    ]
750 |   },
751 |   {
752 |    "cell_type": "markdown",
753 |    "metadata": {},
754 |    "source": [
755 |     "#### Model Performance not Good Enough?\n",
756 |     "#### Debug your Data before you debug your Model!"
757 |    ]
758 |   },
759 |   {
760 |    "cell_type": "code",
761 |    "execution_count": null,
762 |    "metadata": {},
763 |    "outputs": [],
764 |    "source": []
765 |   }
766 |  ],
767 |  "metadata": {
768 |   "kernelspec": {
769 |    "display_name": "Python 3",
770 |    "language": "python",
771 |    "name": "python3"
772 |   },
773 |   "language_info": {
774 |    "codemirror_mode": {
775 |     "name": "ipython",
776 |     "version": 3
777 |    },
778 |    "file_extension": ".py",
779 |    "mimetype": "text/x-python",
780 |    "name": "python",
781 |    "nbconvert_exporter": "python",
782 |    "pygments_lexer": "ipython3",
783 |    "version": "3.7.6"
784 |   }
785 |  },
786 |  "nbformat": 4,
787 |  "nbformat_minor": 2
788 | }
789 | 


--------------------------------------------------------------------------------
/Notebooks/04-XGBoost_Course_Prepare_Iris_Dataset.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n",
 10 |     "import pandas as pd\n",
 11 |     "import matplotlib.pyplot as plt\n",
 12 |     "from sklearn import preprocessing"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "markdown",
 17 |    "metadata": {},
 18 |    "source": [
 19 |     "<h2>Iris Classification Dataset</h2>\n",
 20 |     "\n",
 21 |     "Input Features:<br>\n",
 22 |     "sepal_length,sepal_width,petal_length,petal_width<br>\n",
 23 |     "\n",
 24 |     "Target:<br>\n",
 25 |     "Iris plant class<br>\n",
 26 |     "\n",
 27 |     "Objective: Predict iris plant class for a given sepal_length,sepal_width,petal_length,petal_width<br>\n",
 28 |     "<h4>Data source: https://archive.ics.uci.edu/ml/datasets/iris</h4>"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": null,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "columns = ['encoded_class','sepal_length','sepal_width','petal_length','petal_width']"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "# Encode Class Labels to integers\n",
 47 |     "le = preprocessing.LabelEncoder()\n",
 48 |     "le.fit(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": null,
 54 |    "metadata": {},
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "le.classes_"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": null,
 63 |    "metadata": {},
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "df = pd.read_csv('../Data/iris_all.csv')"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": null,
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "df['class'].value_counts()"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "df.head()"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "df.tail()"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": null,
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "le.transform(df['class'])[-5:]"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "# Convert Classes to numeric value\n",
112 |     "df['encoded_class'] = le.transform(df['class'])"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": null,
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": [
121 |     "df.head()"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "df.tail()"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": null,
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "# Visualize\n",
140 |     "setosa = df['class'] == 'Iris-setosa'\n",
141 |     "versicolor = df['class'] == 'Iris-versicolor'\n",
142 |     "virginica = df['class'] == 'Iris-virginica'"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": [
151 |     "plt.scatter(df[setosa].sepal_length,y=df[setosa].sepal_width, label='setosa',color='g')\n",
152 |     "plt.scatter(df[versicolor].sepal_length,y=df[versicolor].sepal_width, label='versicolor',color='r')\n",
153 |     "plt.scatter(df[virginica].sepal_length,y=df[virginica].sepal_width, label='virginica',color='b')\n",
154 |     "plt.xlabel('length')\n",
155 |     "plt.ylabel('width')\n",
156 |     "plt.title('Sepal')\n",
157 |     "plt.grid(True)\n",
158 |     "plt.legend()\n",
159 |     "plt.show()"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": null,
165 |    "metadata": {},
166 |    "outputs": [],
167 |    "source": [
168 |     "plt.scatter(df[setosa].petal_length,y=df[setosa].petal_width, label='setosa',color='g')\n",
169 |     "plt.scatter(df[versicolor].petal_length,y=df[versicolor].petal_width, label='versicolor',color='r')\n",
170 |     "plt.scatter(df[virginica].petal_length,y=df[virginica].petal_width, label='virginica',color='b')\n",
171 |     "plt.xlabel('length')\n",
172 |     "plt.ylabel('width')\n",
173 |     "plt.title('Petal')\n",
174 |     "plt.grid(True)\n",
175 |     "plt.legend()\n",
176 |     "plt.show()"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": null,
182 |    "metadata": {},
183 |    "outputs": [],
184 |    "source": [
185 |     "plt.scatter(df[setosa].petal_length,y=df[setosa].sepal_length, label='setosa',color='g')\n",
186 |     "plt.scatter(df[versicolor].petal_length,y=df[versicolor].sepal_length, label='versicolor',color='r')\n",
187 |     "plt.scatter(df[virginica].petal_length,y=df[virginica].sepal_length, label='virginica',color='b')\n",
188 |     "plt.xlabel('petal length')\n",
189 |     "plt.ylabel('sepal length')\n",
190 |     "plt.title('Petal-Sepal')\n",
191 |     "plt.grid(True)\n",
192 |     "plt.legend()\n",
193 |     "plt.show()"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "markdown",
198 |    "metadata": {},
199 |    "source": [
200 |     "## Training and Validation Set\n",
201 |     "### Target Variable as first column followed by input features:\n",
202 |     "encoded_class,sepal_length,sepal_width,petal_length,petal_width\n",
203 |     "### Training, Validation files do not have a column header"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": null,
209 |    "metadata": {},
210 |    "outputs": [],
211 |    "source": [
212 |     "# Training = 70% of the data\n",
213 |     "# Validation = 30% of the data\n",
214 |     "# Randomize the dataset\n",
215 |     "np.random.seed(5)\n",
216 |     "l = list(df.index)\n",
217 |     "np.random.shuffle(l)\n",
218 |     "df = df.iloc[l]"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": null,
224 |    "metadata": {},
225 |    "outputs": [],
226 |    "source": [
227 |     "rows = df.shape[0]\n",
228 |     "train = int(.7 * rows)\n",
229 |     "test = rows-train"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": null,
235 |    "metadata": {},
236 |    "outputs": [],
237 |    "source": [
238 |     "rows, train, test"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": null,
244 |    "metadata": {},
245 |    "outputs": [],
246 |    "source": [
247 |     "# Write Training Set\n",
248 |     "df[:train].to_csv('../Data/iris_train.csv'\n",
249 |     "                          ,index=False,header=False\n",
250 |     "                          ,columns=columns)"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": null,
256 |    "metadata": {},
257 |    "outputs": [],
258 |    "source": [
259 |     "# Write Validation Set\n",
260 |     "df[train:].to_csv('../Data/iris_validation.csv'\n",
261 |     "                          ,index=False,header=False\n",
262 |     "                          ,columns=columns)"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": null,
268 |    "metadata": {},
269 |    "outputs": [],
270 |    "source": [
271 |     "# Write Column List\n",
272 |     "with open('../Data/iris_train_column_list.txt','w') as f:\n",
273 |     "    f.write(','.join(columns))"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": null,
279 |    "metadata": {},
280 |    "outputs": [],
281 |    "source": []
282 |   }
283 |  ],
284 |  "metadata": {
285 |   "kernelspec": {
286 |    "display_name": "Python 3",
287 |    "language": "python",
288 |    "name": "python3"
289 |   },
290 |   "language_info": {
291 |    "codemirror_mode": {
292 |     "name": "ipython",
293 |     "version": 3
294 |    },
295 |    "file_extension": ".py",
296 |    "mimetype": "text/x-python",
297 |    "name": "python",
298 |    "nbconvert_exporter": "python",
299 |    "pygments_lexer": "ipython3",
300 |    "version": "3.7.6"
301 |   }
302 |  },
303 |  "nbformat": 4,
304 |  "nbformat_minor": 1
305 | }
306 | 


--------------------------------------------------------------------------------
/Notebooks/05-XGBoost_Course_Multiclass_Classification_Iris_Dataset.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "nbpresent": {
  7 |      "id": "782a07bf-08de-4030-88e1-6731c4ac956e"
  8 |     }
  9 |    },
 10 |    "source": [
 11 |     "## Train a model with Iris data using XGBoost algorithm\n",
 12 |     "###  Model is trained with XGBoost installed in notebook instance\n",
 13 |     "###  In the later examples, we will train using SageMaker's XGBoost algorithm"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": null,
 19 |    "metadata": {
 20 |     "nbpresent": {
 21 |      "id": "6c6a8672-d428-410a-82fa-7f587c9ef2ae"
 22 |     }
 23 |    },
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "# Install xgboost in notebook instance.\n",
 27 |     "#### Command to install xgboost\n",
 28 |     "# !pip install xgboost==0.90"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": null,
 34 |    "metadata": {
 35 |     "nbpresent": {
 36 |      "id": "652b58d4-3b75-405f-9f11-24d0cd1f9656"
 37 |     }
 38 |    },
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "import sys\n",
 42 |     "import numpy as np\n",
 43 |     "import pandas as pd\n",
 44 |     "import matplotlib.pyplot as plt\n",
 45 |     "import itertools\n",
 46 |     "import xgboost as xgb\n",
 47 |     "\n",
 48 |     "from sklearn import preprocessing\n",
 49 |     "from sklearn.metrics import classification_report, confusion_matrix"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {
 56 |     "nbpresent": {
 57 |      "id": "a3946273-d086-4564-b0f1-6adc225191c3"
 58 |     }
 59 |    },
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "column_list_file = '../Data/iris_train_column_list.txt'\n",
 63 |     "train_file = '../Data/iris_train.csv'\n",
 64 |     "validation_file = '../Data/iris_validation.csv'"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": null,
 70 |    "metadata": {
 71 |     "nbpresent": {
 72 |      "id": "7c803d6c-74cc-40d2-ab48-747ff4346c22"
 73 |     }
 74 |    },
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "columns = ''\n",
 78 |     "with open(column_list_file,'r') as f:\n",
 79 |     "    columns = f.read().split(',')"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "metadata": {
 86 |     "nbpresent": {
 87 |      "id": "630dde8d-44b9-415d-8876-4e873407d0fc"
 88 |     }
 89 |    },
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "columns"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "# Encode Class Labels to integers\n",
102 |     "# Labeled Classes\n",
103 |     "labels=[0,1,2]\n",
104 |     "classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']\n",
105 |     "le = preprocessing.LabelEncoder()\n",
106 |     "le.fit(classes)"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": null,
112 |    "metadata": {
113 |     "nbpresent": {
114 |      "id": "d6ff2283-cb13-468f-b0cc-0aefeab7b57f"
115 |     }
116 |    },
117 |    "outputs": [],
118 |    "source": [
119 |     "# Specify the column names as the file does not have column header\n",
120 |     "df_train = pd.read_csv(train_file,names=columns)\n",
121 |     "df_validation = pd.read_csv(validation_file,names=columns)"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {
128 |     "nbpresent": {
129 |      "id": "a195ae30-1962-4427-859b-73a013dc10d6"
130 |     }
131 |    },
132 |    "outputs": [],
133 |    "source": [
134 |     "df_train.head()"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {
141 |     "nbpresent": {
142 |      "id": "e30e8aeb-1ca2-4851-bc2d-1bdee29ab1cf"
143 |     }
144 |    },
145 |    "outputs": [],
146 |    "source": [
147 |     "df_validation.head()"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {
154 |     "nbpresent": {
155 |      "id": "3b240613-803d-4fa9-93cf-53ef68df7b93"
156 |     }
157 |    },
158 |    "outputs": [],
159 |    "source": [
160 |     "X_train = df_train.iloc[:,1:] # Features: 1st column onwards \n",
161 |     "y_train = df_train.iloc[:,0].ravel() # Target: 0th column\n",
162 |     "\n",
163 |     "X_validation = df_validation.iloc[:,1:]\n",
164 |     "y_validation = df_validation.iloc[:,0].ravel()"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {
171 |     "nbpresent": {
172 |      "id": "9edc89e7-45d3-4350-9eb4-3e0938c3c55e"
173 |     }
174 |    },
175 |    "outputs": [],
176 |    "source": [
177 |     "# Launch a classifier\n",
178 |     "# XGBoost Training Parameter Reference: \n",
179 |     "#   https://xgboost.readthedocs.io/en/latest/parameter.html\n",
180 |     "\n",
181 |     "classifier = xgb.XGBClassifier(objective=\"multi:softmax\",\n",
182 |     "                               num_class=3,\n",
183 |     "                               n_estimators=100, use_label_encoder=False)"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "metadata": {
190 |     "nbpresent": {
191 |      "id": "348296fb-8c9b-4598-ad2e-d1fe8e10f76a"
192 |     }
193 |    },
194 |    "outputs": [],
195 |    "source": [
196 |     "classifier"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": null,
202 |    "metadata": {
203 |     "nbpresent": {
204 |      "id": "9839d7ce-e791-4d93-bc5f-28604ffde022"
205 |     }
206 |    },
207 |    "outputs": [],
208 |    "source": [
209 |     "classifier.fit(X_train,\n",
210 |     "               y_train,\n",
211 |     "               eval_set = [(X_train, y_train), (X_validation, y_validation)],\n",
212 |     "               eval_metric=['mlogloss'],\n",
213 |     "               early_stopping_rounds=10)\n",
214 |     "\n",
215 |     "# early_stopping_rounds - needs to be passed in as a hyperparameter in SageMaker XGBoost implementation\n",
216 |     "# \"The model trains until the validation score stops improving. \n",
217 |     "# Validation error needs to decrease at least every early_stopping_rounds to continue training.\n",
218 |     "# Amazon SageMaker hosting uses the best model for inference.\""
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": null,
224 |    "metadata": {
225 |     "nbpresent": {
226 |      "id": "e08f22c1-4346-4e2d-96a2-9974ed5c59ff"
227 |     }
228 |    },
229 |    "outputs": [],
230 |    "source": [
231 |     "eval_result = classifier.evals_result()"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": null,
237 |    "metadata": {
238 |     "nbpresent": {
239 |      "id": "092776c3-a611-4f40-91e2-664b3b99d05e"
240 |     }
241 |    },
242 |    "outputs": [],
243 |    "source": [
244 |     "training_rounds = range(len(eval_result['validation_0']['mlogloss']))"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": null,
250 |    "metadata": {
251 |     "nbpresent": {
252 |      "id": "2e9af3f7-fb85-4c52-83d5-ff9cae457294"
253 |     }
254 |    },
255 |    "outputs": [],
256 |    "source": [
257 |     "print(training_rounds)"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": null,
263 |    "metadata": {
264 |     "nbpresent": {
265 |      "id": "5e71239a-e321-43ba-ac2c-993b57b3be3a"
266 |     }
267 |    },
268 |    "outputs": [],
269 |    "source": [
270 |     "plt.scatter(x=training_rounds,y=eval_result['validation_0']['mlogloss'],label='Training Error')\n",
271 |     "plt.scatter(x=training_rounds,y=eval_result['validation_1']['mlogloss'],label='Validation Error')\n",
272 |     "plt.grid(True)\n",
273 |     "plt.xlabel('Iteration')\n",
274 |     "plt.ylabel('LogLoss')\n",
275 |     "plt.title('Training Vs Validation Error')\n",
276 |     "plt.legend()\n",
277 |     "plt.show()"
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": null,
283 |    "metadata": {
284 |     "nbpresent": {
285 |      "id": "f144f315-6d38-429e-8c17-06c17a446198"
286 |     }
287 |    },
288 |    "outputs": [],
289 |    "source": [
290 |     "xgb.plot_importance(classifier)\n",
291 |     "plt.show()"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": null,
297 |    "metadata": {
298 |     "nbpresent": {
299 |      "id": "3312675d-307c-4eff-b835-34f0e7f57924"
300 |     }
301 |    },
302 |    "outputs": [],
303 |    "source": [
304 |     "df = pd.read_csv(validation_file,names=columns)"
305 |    ]
306 |   },
307 |   {
308 |    "cell_type": "code",
309 |    "execution_count": null,
310 |    "metadata": {
311 |     "nbpresent": {
312 |      "id": "afad019f-88df-4893-bb3d-b7f2b7db214b"
313 |     }
314 |    },
315 |    "outputs": [],
316 |    "source": [
317 |     "df.head()"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": null,
323 |    "metadata": {
324 |     "nbpresent": {
325 |      "id": "9b5cb70d-6069-4511-810e-fd17e72667dd"
326 |     }
327 |    },
328 |    "outputs": [],
329 |    "source": [
330 |     "X_test = df.iloc[:,1:]\n",
331 |     "print(X_test[:5])"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": null,
337 |    "metadata": {
338 |     "nbpresent": {
339 |      "id": "f611c852-50e3-4a1a-9134-c1c6e82ad780"
340 |     }
341 |    },
342 |    "outputs": [],
343 |    "source": [
344 |     "result = classifier.predict(X_test)"
345 |    ]
346 |   },
347 |   {
348 |    "cell_type": "code",
349 |    "execution_count": null,
350 |    "metadata": {},
351 |    "outputs": [],
352 |    "source": [
353 |     "result[:5]"
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "code",
358 |    "execution_count": null,
359 |    "metadata": {
360 |     "nbpresent": {
361 |      "id": "2c573c2b-4143-4e01-b107-e6b871ce0249"
362 |     }
363 |    },
364 |    "outputs": [],
365 |    "source": [
366 |     "df['predicted_class'] = result #le.inverse_transform(result)"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": null,
372 |    "metadata": {
373 |     "nbpresent": {
374 |      "id": "5ad0fa04-6896-46b5-bc23-40d61480d7ca"
375 |     }
376 |    },
377 |    "outputs": [],
378 |    "source": [
379 |     "df.head()"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "code",
384 |    "execution_count": null,
385 |    "metadata": {},
386 |    "outputs": [],
387 |    "source": [
388 |     "# Compare performance of Actual and Model 1 Prediction\n",
389 |     "plt.figure()\n",
390 |     "plt.scatter(df.index,df['encoded_class'],label='Actual')\n",
391 |     "plt.scatter(df.index,df['predicted_class'],label='Predicted',marker='^')\n",
392 |     "plt.legend(loc=4)\n",
393 |     "plt.yticks([0,1,2])\n",
394 |     "plt.xlabel('Sample')\n",
395 |     "plt.ylabel('Class')\n",
396 |     "plt.show()"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "markdown",
401 |    "metadata": {},
402 |    "source": [
403 |     "<h2>Confusion Matrix</h2>\n",
404 |     "A confusion matrix is a table that summarizes the performance of a classification model.<br><br>"
405 |    ]
406 |   },
407 |   {
408 |    "cell_type": "code",
409 |    "execution_count": null,
410 |    "metadata": {},
411 |    "outputs": [],
412 |    "source": [
413 |     "# Reference: \n",
414 |     "# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n",
415 |     "def plot_confusion_matrix(cm, classes,\n",
416 |     "                          normalize=False,\n",
417 |     "                          title='Confusion matrix',\n",
418 |     "                          cmap=plt.cm.Blues):\n",
419 |     "    \"\"\"\n",
420 |     "    This function plots the confusion matrix.\n",
421 |     "    Normalization can be applied by setting `normalize=True`.\n",
422 |     "    \"\"\"\n",
423 |     "    if normalize:\n",
424 |     "        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
425 |     "        #print(\"Normalized confusion matrix\")\n",
426 |     "    #else:\n",
427 |     "    #    print('Confusion matrix, without normalization')\n",
428 |     "\n",
429 |     "    #print(cm)\n",
430 |     "\n",
431 |     "    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
432 |     "    plt.title(title)\n",
433 |     "    plt.colorbar()\n",
434 |     "    tick_marks = np.arange(len(classes))\n",
435 |     "    plt.xticks(tick_marks, classes, rotation=45)\n",
436 |     "    plt.yticks(tick_marks, classes)\n",
437 |     "\n",
438 |     "    fmt = '.2f' if normalize else 'd'\n",
439 |     "    thresh = cm.max() / 2.\n",
440 |     "    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
441 |     "        plt.text(j, i, format(cm[i, j], fmt),\n",
442 |     "                 horizontalalignment=\"center\",\n",
443 |     "                 color=\"white\" if cm[i, j] > thresh else \"black\")\n",
444 |     "\n",
445 |     "    plt.ylabel('True label')\n",
446 |     "    plt.xlabel('Predicted label')\n",
447 |     "    plt.tight_layout()"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "code",
452 |    "execution_count": null,
453 |    "metadata": {},
454 |    "outputs": [],
455 |    "source": [
456 |     "# Compute confusion matrix\n",
457 |     "cnf_matrix = confusion_matrix(df['encoded_class'],\n",
458 |     "                              df['predicted_class'],labels=labels)"
459 |    ]
460 |   },
461 |   {
462 |    "cell_type": "code",
463 |    "execution_count": null,
464 |    "metadata": {},
465 |    "outputs": [],
466 |    "source": [
467 |     "cnf_matrix"
468 |    ]
469 |   },
470 |   {
471 |    "cell_type": "code",
472 |    "execution_count": null,
473 |    "metadata": {},
474 |    "outputs": [],
475 |    "source": [
476 |     "# Plot confusion matrix\n",
477 |     "plt.figure()\n",
478 |     "plot_confusion_matrix(cnf_matrix, classes=classes,\n",
479 |     "                      title='Confusion matrix - Count')"
480 |    ]
481 |   },
482 |   {
483 |    "cell_type": "code",
484 |    "execution_count": null,
485 |    "metadata": {},
486 |    "outputs": [],
487 |    "source": [
488 |     "# Plot confusion matrix\n",
489 |     "plt.figure()\n",
490 |     "plot_confusion_matrix(cnf_matrix, classes=classes,\n",
491 |     "                      title='Confusion matrix - Fraction',normalize=True)"
492 |    ]
493 |   },
494 |   {
495 |    "cell_type": "code",
496 |    "execution_count": null,
497 |    "metadata": {},
498 |    "outputs": [],
499 |    "source": [
500 |     "print(classification_report(\n",
501 |     "    df['encoded_class'],\n",
502 |     "    df['predicted_class'],\n",
503 |     "    labels=labels,\n",
504 |     "    target_names=classes))"
505 |    ]
506 |   },
507 |   {
508 |    "cell_type": "markdown",
509 |    "metadata": {},
510 |    "source": [
511 |     "### Well Done!"
512 |    ]
513 |   },
514 |   {
515 |    "cell_type": "code",
516 |    "execution_count": null,
517 |    "metadata": {},
518 |    "outputs": [],
519 |    "source": []
520 |   }
521 |  ],
522 |  "metadata": {
523 |   "kernelspec": {
524 |    "display_name": "Python 3",
525 |    "language": "python",
526 |    "name": "python3"
527 |   },
528 |   "language_info": {
529 |    "codemirror_mode": {
530 |     "name": "ipython",
531 |     "version": 3
532 |    },
533 |    "file_extension": ".py",
534 |    "mimetype": "text/x-python",
535 |    "name": "python",
536 |    "nbconvert_exporter": "python",
537 |    "pygments_lexer": "ipython3",
538 |    "version": "3.7.6"
539 |   }
540 |  },
541 |  "nbformat": 4,
542 |  "nbformat_minor": 2
543 | }
544 | 


--------------------------------------------------------------------------------
/Notebooks/06-XGBoost-TimeSeries.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "More here:\n",
  8 |     "https://machinelearningmastery.com/xgboost-for-time-series-forecasting/\n",
  9 |     "    \n",
 10 |     "And here:\n",
 11 |     "https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# forecast daily births with xgboost\n",
 21 |     "from numpy import asarray\n",
 22 |     "from pandas import read_csv\n",
 23 |     "from pandas import DataFrame\n",
 24 |     "from pandas import concat\n",
 25 |     "from sklearn.metrics import mean_absolute_error\n",
 26 |     "from xgboost import XGBRegressor\n",
 27 |     "from matplotlib import pyplot"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "# transform a time series dataset into a supervised learning dataset\n",
 37 |     "def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):\n",
 38 |     "    n_vars = 1 if type(data) is list else data.shape[1]\n",
 39 |     "    df = DataFrame(data)\n",
 40 |     "    cols = list()\n",
 41 |     "    # input sequence (t-n, ... t-1)\n",
 42 |     "    for i in range(n_in, 0, -1):\n",
 43 |     "        cols.append(df.shift(i))\n",
 44 |     "    # forecast sequence (t, t+1, ... t+n)\n",
 45 |     "    for i in range(0, n_out):\n",
 46 |     "        cols.append(df.shift(-i))\n",
 47 |     "    # put it all together\n",
 48 |     "    agg = concat(cols, axis=1)\n",
 49 |     "    # drop rows with NaN values\n",
 50 |     "    if dropnan:\n",
 51 |     "        agg.dropna(inplace=True)\n",
 52 |     "    return agg.values\n",
 53 |     "\n",
 54 |     "# split a univariate dataset into train/test sets\n",
 55 |     "def train_test_split(data, n_test):\n",
 56 |     "    return data[:-n_test, :], data[-n_test:, :]\n",
 57 |     "\n",
 58 |     "# fit an xgboost model and make a one step prediction\n",
 59 |     "def xgboost_forecast(train, testX):\n",
 60 |     "    # transform list into array\n",
 61 |     "    train = asarray(train)\n",
 62 |     "    # split into input and output columns\n",
 63 |     "    trainX, trainy = train[:, :-1], train[:, -1]\n",
 64 |     "    # fit model\n",
 65 |     "    model = XGBRegressor(objective='reg:squarederror', n_estimators=1000)\n",
 66 |     "    model.fit(trainX, trainy)\n",
 67 |     "    # make a one-step prediction\n",
 68 |     "    yhat = model.predict(asarray([testX]))\n",
 69 |     "    return yhat[0]\n",
 70 |     "\n",
 71 |     "# walk-forward validation for univariate data\n",
 72 |     "def walk_forward_validation(data, n_test):\n",
 73 |     "    predictions = list()\n",
 74 |     "    # split dataset\n",
 75 |     "    train, test = train_test_split(data, n_test)\n",
 76 |     "    # seed history with training dataset\n",
 77 |     "    history = [x for x in train]\n",
 78 |     "    # step over each time-step in the test set\n",
 79 |     "    for i in range(len(test)):\n",
 80 |     "        # split test row into input and output columns\n",
 81 |     "        testX, testy = test[i, :-1], test[i, -1]\n",
 82 |     "        # fit model on history and make a prediction\n",
 83 |     "        yhat = xgboost_forecast(history, testX)\n",
 84 |     "        # store forecast in list of predictions\n",
 85 |     "        predictions.append(yhat)\n",
 86 |     "        # add actual observation to history for the next loop\n",
 87 |     "        history.append(test[i])\n",
 88 |     "        # summarize progress\n",
 89 |     "        print('>expected=%.1f, predicted=%.1f' % (testy, yhat))\n",
 90 |     "    # estimate prediction error\n",
 91 |     "    error = mean_absolute_error(test[:, -1], predictions)\n",
 92 |     "    return error, test[:, -1], predictions"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "# load the dataset\n",
102 |     "# Daily total female births in California, 1959\n",
103 |     "# Source: Time Series Data Library (citing: Newton (1988))\n",
104 |     "    \n",
105 |     "series = read_csv('../Data/daily-total-female-births.csv', header=0, index_col=0)\n",
106 |     "values = series.values\n",
107 |     "# transform the time series data into supervised learning\n",
108 |     "data = series_to_supervised(values, n_in=6)\n",
109 |     "# evaluate\n",
110 |     "mae, y, yhat = walk_forward_validation(data, 12)\n",
111 |     "print('MAE: %.3f' % mae)\n",
112 |     "# plot expected vs predicted\n",
113 |     "pyplot.plot(y, label='Expected')\n",
114 |     "pyplot.plot(yhat, label='Predicted')\n",
115 |     "pyplot.legend()\n",
116 |     "pyplot.show()"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": []
125 |   }
126 |  ],
127 |  "metadata": {
128 |   "kernelspec": {
129 |    "display_name": "Python 3",
130 |    "language": "python",
131 |    "name": "python3"
132 |   },
133 |   "language_info": {
134 |    "codemirror_mode": {
135 |     "name": "ipython",
136 |     "version": 3
137 |    },
138 |    "file_extension": ".py",
139 |    "mimetype": "text/x-python",
140 |    "name": "python",
141 |    "nbconvert_exporter": "python",
142 |    "pygments_lexer": "ipython3",
143 |    "version": "3.7.6"
144 |   }
145 |  },
146 |  "nbformat": 4,
147 |  "nbformat_minor": 4
148 | }
149 | 


--------------------------------------------------------------------------------
/Notebooks/07-XGBoost_Feature_Importance_Selection_Diabetes_Dataset.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "nbpresent": {
  7 |      "id": "782a07bf-08de-4030-88e1-6731c4ac956e"
  8 |     }
  9 |    },
 10 |    "source": [
 11 |     "## Diabetes dataset \n",
 12 |     "### Predict if a person is at risk of developing diabetes\n",
 13 |     "\n",
 14 |     "### This Dataset is Freely Available\n",
 15 |     "\n",
 16 |     "### Overview:\n",
 17 |     "The data was collected and made available by the \"National Institute of Diabetes and Digestive and Kidney Diseases\" as part of the Pima Indians Diabetes Database. \n",
 18 |     "\n",
 19 |     "`Diabetes.csv` is available [from Kaggle](https://www.kaggle.com/uciml/pima-indians-diabetes-database). We have several questions: which information is most correlated with a positive diagnosis, and, if we could ask a patient only two questions, what should we ask and how would we estimate their risk of being diagnosed?\n",
 20 |     "\n",
 21 |     "++++++++++++++++++++++++++++++++++++\n",
 22 |     "\n",
 23 |     "The following features have been provided to help us predict whether a person is diabetic or not:\n",
 24 |     "* **Pregnancies:**  Number of times pregnant\n",
 25 |     "* **Glucose:** Plasma glucose concentration over 2 hours in an oral glucose tolerance test\n",
 26 |     "* **BloodPressure:** Diastolic blood pressure (mm Hg)\n",
 27 |     "* **SkinThickness:** Triceps skin fold thickness (mm)\n",
 28 |     "* **Insulin:** 2-Hour serum insulin (mu U/ml)\n",
 29 |     "* **BMI:** Body mass index (weight in kg / (height in m)^2)\n",
 30 |     "* **DiabetesPedigreeFunction:** Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)\n",
 31 |     "* **Age:** Age (years)\n",
 32 |     "* **Outcome:** Class variable (0 if non-diabetic, 1 if diabetic)\n",
 33 |     "\n",
 34 |     "### Binary Classification problem - XGBoost"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {
 41 |     "nbpresent": {
 42 |      "id": "6c6a8672-d428-410a-82fa-7f587c9ef2ae"
 43 |     }
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "# Install xgboost in notebook instance.\n",
 48 |     "#### Command to install xgboost\n",
 49 |     "#!pip install xgboost==0.90"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {
 56 |     "nbpresent": {
 57 |      "id": "652b58d4-3b75-405f-9f11-24d0cd1f9656"
 58 |     }
 59 |    },
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "import numpy as np\n",
 63 |     "import pandas as pd\n",
 64 |     "import matplotlib.pyplot as plt\n",
 65 |     "\n",
 66 |     "import xgboost as xgb\n",
 67 |     "\n",
 68 |     "\n",
 69 |     "from sklearn.model_selection import train_test_split\n",
 70 |     "from xgboost import plot_importance\n",
 71 |     "\n",
 72 |     "from sklearn.metrics import accuracy_score\n",
 73 |     "from sklearn.feature_selection import SelectFromModel\n",
 74 |     "\n",
 75 |     "import warnings\n",
 76 |     "warnings.filterwarnings('ignore')"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {
 83 |     "nbpresent": {
 84 |      "id": "a3946273-d086-4564-b0f1-6adc225191c3"
 85 |     }
 86 |    },
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "data = pd.read_csv(\"../Data/Diabetes.csv\")"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": null,
 95 |    "metadata": {},
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "data.describe()"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "data.info()"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": null,
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "## only keep rows where none of the columns has a 0 value (except the first and last columns)\n",
117 |     "data = data[~(data[data.columns[1:-1]] == 0).any(axis=1)]\n",
118 |     "data.reset_index(inplace=True, drop = True)"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "markdown",
123 |    "metadata": {},
124 |    "source": [
125 |     "### Dealing with Missing Values"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": [
134 |     "# using isnull() function  \n",
135 |     "# print(data.isnull().any().sum())\n",
136 |     "print(data.isnull().sum())\n",
137 |     "#data.isnull()"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "metadata": {},
144 |    "outputs": [],
145 |    "source": [
146 |     "data.drop(columns=['Insulin'], inplace = True)\n",
147 |     "data.reset_index(inplace=True, drop = True)"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "### Replace missing values in each column with the mean or median of that column\n",
157 |     "#data.fillna(data.mean())\n",
158 |     "data.fillna(data.median(), inplace=True)\n",
159 |     "\n",
160 |     "### Drop all rows that contain missing values?\n",
161 |     "#data = data.dropna()\n",
162 |     "#data.reset_index(inplace=True, drop = True)"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "markdown",
167 |    "metadata": {},
168 |    "source": [
169 |     "### Split Data"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": null,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "X = data.iloc[:,:-1] # Features: all columns except last\n",
179 |     "y = data.iloc[:,-1].ravel() # Target: last column\n",
180 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "markdown",
185 |    "metadata": {},
186 |    "source": [
187 |     "### Launch XGBoost classifier"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": null,
193 |    "metadata": {
194 |     "nbpresent": {
195 |      "id": "9edc89e7-45d3-4350-9eb4-3e0938c3c55e"
196 |     }
197 |    },
198 |    "outputs": [],
199 |    "source": [
200 |     "# Launch a classifier\n",
201 |     "# XGBoost Training Parameter Reference: \n",
202 |     "#   https://xgboost.readthedocs.io/en/latest/parameter.html\n",
203 |     "classifier = xgb.XGBClassifier (objective=\"binary:logistic\")"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": null,
209 |    "metadata": {
210 |     "nbpresent": {
211 |      "id": "348296fb-8c9b-4598-ad2e-d1fe8e10f76a"
212 |     }
213 |    },
214 |    "outputs": [],
215 |    "source": [
216 |     "classifier"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": null,
222 |    "metadata": {
223 |     "nbpresent": {
224 |      "id": "9839d7ce-e791-4d93-bc5f-28604ffde022"
225 |     }
226 |    },
227 |    "outputs": [],
228 |    "source": [
229 |     "classifier.fit(X_train,\n",
230 |     "               y_train, \n",
231 |     "               eval_metric=['logloss'])"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "markdown",
236 |    "metadata": {},
237 |    "source": [
238 |     "### Plot Feature Importance"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": null,
244 |    "metadata": {
245 |     "nbpresent": {
246 |      "id": "e08f22c1-4346-4e2d-96a2-9974ed5c59ff"
247 |     }
248 |    },
249 |    "outputs": [],
250 |    "source": [
251 |     "# plot feature importance\n",
252 |     "plot_importance(classifier)\n",
253 |     "plt.show()"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "markdown",
258 |    "metadata": {
259 |     "nbpresent": {
260 |      "id": "3312675d-307c-4eff-b835-34f0e7f57924"
261 |     }
262 |    },
263 |    "source": [
264 |     "### Feature Selection using Feature Importance\n",
265 |     "* Feature importance scores can be used for feature selection in scikit-learn.\n",
266 |     "* This is done using the SelectFromModel class that takes a model and can transform a dataset into a subset with selected features.\n",
267 |     "* This class can take a pre-trained model, such as one trained on the entire training dataset. \n",
268 |     "* It can then use a threshold to decide which features to select. \n",
269 |     "* This threshold is used when you call the transform() method on the SelectFromModel instance to consistently select the same features on the training dataset and the test dataset.\n"
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "code",
274 |    "execution_count": null,
275 |    "metadata": {
276 |     "nbpresent": {
277 |      "id": "9b5cb70d-6069-4511-810e-fd17e72667dd"
278 |     }
279 |    },
280 |    "outputs": [],
281 |    "source": [
282 |     "# fit model on all training data\n",
283 |     "model = xgb.XGBClassifier(objective=\"binary:logistic\", use_label_encoder =False)\n",
284 |     "model.fit(X_train, y_train, eval_metric=['logloss'])\n",
285 |     "# make predictions for test data and evaluate\n",
286 |     "y_pred = model.predict(X_test)\n",
287 |     "predictions = [round(value) for value in y_pred]\n",
288 |     "accuracy = accuracy_score(y_test, predictions)\n",
289 |     "print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))\n",
290 |     "# Fit model using each importance as a threshold\n",
291 |     "thresholds = np.sort(model.feature_importances_)\n",
292 |     "for thresh in thresholds:\n",
293 |     "    # select features using threshold\n",
294 |     "    selection = SelectFromModel(model, threshold=thresh, prefit=True)\n",
295 |     "    select_X_train = selection.transform(X_train)\n",
296 |     "    # train model\n",
297 |     "    selection_model = xgb.XGBClassifier(objective=\"binary:logistic\", use_label_encoder =False)\n",
298 |     "    selection_model.fit(select_X_train, y_train, eval_metric=['logloss'])\n",
299 |     "    # eval model\n",
300 |     "    select_X_test = selection.transform(X_test)\n",
301 |     "    y_pred = selection_model.predict(select_X_test)\n",
302 |     "    predictions = [round(value) for value in y_pred]\n",
303 |     "    accuracy = accuracy_score(y_test, predictions)\n",
304 |     "    print(\"Thresh=%.3f, n=%d, Accuracy: %.2f%%\" % (thresh, select_X_train.shape[1], accuracy*100.0))"
305 |    ]
306 |   },
307 |   {
308 |    "cell_type": "markdown",
309 |    "metadata": {
310 |     "nbpresent": {
311 |      "id": "f611c852-50e3-4a1a-9134-c1c6e82ad780"
312 |     }
313 |    },
314 |    "source": [
315 |     "You can see that the performance of the model generally decreases as the number of selected features is reduced."
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": null,
321 |    "metadata": {},
322 |    "outputs": [],
323 |    "source": []
324 |   }
325 |  ],
326 |  "metadata": {
327 |   "kernelspec": {
328 |    "display_name": "Python 3",
329 |    "language": "python",
330 |    "name": "python3"
331 |   },
332 |   "language_info": {
333 |    "codemirror_mode": {
334 |     "name": "ipython",
335 |     "version": 3
336 |    },
337 |    "file_extension": ".py",
338 |    "mimetype": "text/x-python",
339 |    "name": "python",
340 |    "nbconvert_exporter": "python",
341 |    "pygments_lexer": "ipython3",
342 |    "version": "3.7.6"
343 |   }
344 |  },
345 |  "nbformat": 4,
346 |  "nbformat_minor": 2
347 | }
348 | 


--------------------------------------------------------------------------------
/Notebooks/08-XGBoost_Hyperparameter_Tuning_Diabetes_Dataset.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "nbpresent": {
  7 |      "id": "782a07bf-08de-4030-88e1-6731c4ac956e"
  8 |     }
  9 |    },
 10 |    "source": [
 11 |     "## XGBoost Hyperparameter Tuning\n",
 12 |     "This is a nice tutorial\n",
 13 |     "https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": null,
 19 |    "metadata": {
 20 |     "nbpresent": {
 21 |      "id": "6c6a8672-d428-410a-82fa-7f587c9ef2ae"
 22 |     }
 23 |    },
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "# Install xgboost in notebook instance.\n",
 27 |     "#### Command to install xgboost\n",
 28 |     "#!pip install xgboost==0.90"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": null,
 34 |    "metadata": {
 35 |     "nbpresent": {
 36 |      "id": "652b58d4-3b75-405f-9f11-24d0cd1f9656"
 37 |     }
 38 |    },
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "import numpy as np\n",
 42 |     "import pandas as pd\n",
 43 |     "import matplotlib.pyplot as plt\n",
 44 |     "\n",
 45 |     "import xgboost as xgb\n",
 46 |     "\n",
 47 |     "from sklearn.model_selection import train_test_split\n",
 48 |     "\n",
 49 |     "#from sklearn import cross_validation, metrics   #Additional sklearn functions\n",
 50 |     "from sklearn.model_selection import GridSearchCV   #Performing grid search\n",
 51 |     "\n",
 52 |     "import warnings\n",
 53 |     "warnings.filterwarnings('ignore')"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "metadata": {
 60 |     "nbpresent": {
 61 |      "id": "a3946273-d086-4564-b0f1-6adc225191c3"
 62 |     }
 63 |    },
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "data = pd.read_csv(\"../Data/Diabetes.csv\")"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": null,
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "data.describe()"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "data.info()"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "## only keep rows where none of the columns has a 0 value (except the first and last columns)\n",
 94 |     "data = data[~(data[data.columns[1:-1]] == 0).any(axis=1)]\n",
 95 |     "data.reset_index(inplace=True, drop = True)"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "markdown",
100 |    "metadata": {},
101 |    "source": [
102 |     "### Dealing with Missing Values"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "# using isnull() function  \n",
112 |     "# print(data.isnull().any().sum())\n",
113 |     "print(data.isnull().sum())\n",
114 |     "#data.isnull()"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": null,
120 |    "metadata": {},
121 |    "outputs": [],
122 |    "source": [
123 |     "data.drop(columns=['Insulin'], inplace = True)\n",
124 |     "data.reset_index(inplace=True, drop = True)"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": null,
130 |    "metadata": {},
131 |    "outputs": [],
132 |    "source": [
133 |     "### Replace missing values in each column with the mean or median of that column\n",
134 |     "#data.fillna(data.mean())\n",
135 |     "data.fillna(data.median(), inplace=True)\n",
136 |     "\n",
137 |     "### Drop all rows that contain missing values?\n",
138 |     "#data = data.dropna()\n",
139 |     "#data.reset_index(inplace=True, drop = True)"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "markdown",
144 |    "metadata": {},
145 |    "source": [
146 |     "### Split Data"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "X = data.iloc[:,:-1] # Features: all columns except last\n",
156 |     "y = data.iloc[:,-1].ravel() # Target: last column\n",
157 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "markdown",
162 |    "metadata": {},
163 |    "source": [
164 |     "### Useful Function\n",
165 |     "* This function will help us create XGBoost models and perform cross-validation. \n",
166 |     "* The best part is that you can take this function as it is and use it later for your own models."
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {
173 |     "nbpresent": {
174 |      "id": "9edc89e7-45d3-4350-9eb4-3e0938c3c55e"
175 |     }
176 |    },
177 |    "outputs": [],
178 |    "source": [
179 |     "def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):    \n",
180 |     "    if useTrainCV:\n",
181 |     "        xgb_param = alg.get_xgb_params()\n",
182 |     "        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)\n",
183 |     "        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,\n",
184 |     "            metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)\n",
185 |     "        alg.set_params(n_estimators=cvresult.shape[0])\n",
186 |     "    \n",
187 |     "    #Fit the algorithm on the data\n",
188 |     "    alg.fit(dtrain[predictors], dtrain['Disbursed'],eval_metric='auc')\n",
189 |     "        \n",
190 |     "    #Predict training set:\n",
191 |     "    dtrain_predictions = alg.predict(dtrain[predictors])\n",
192 |     "    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]\n",
193 |     "        \n",
194 |     "    #Print model report:\n",
195 |     "    print (\"\\nModel Report\")\n",
196 |     "    print (\"Accuracy : %.4g\" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))\n",
197 |     "    print (\"AUC Score (Train): %f\" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))\n",
198 |     "                    \n",
199 |     "    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)\n",
200 |     "    feat_imp.plot(kind='bar', title='Feature Importances')\n",
201 |     "    plt.ylabel('Feature Importance Score')"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "markdown",
206 |    "metadata": {},
207 |    "source": [
208 |     "### Tune max_depth and min_child_weight"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": null,
214 |    "metadata": {
215 |     "nbpresent": {
216 |      "id": "e08f22c1-4346-4e2d-96a2-9974ed5c59ff"
217 |     }
218 |    },
219 |    "outputs": [],
220 |    "source": [
221 |     "%%time\n",
222 |     "param_test2 = {\n",
223 |     " 'max_depth':[4,5,6],\n",
224 |     " 'min_child_weight':[4,5,6]\n",
225 |     "}\n",
226 |     "gsearch2 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,\n",
227 |     " min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,\n",
228 |     " objective= 'binary:logistic',eval_metric ='logloss', nthread=4, scale_pos_weight=1,seed=27), \n",
229 |     " param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5, return_train_score=True)\n",
230 |     "\n",
231 |     "gsearch2.fit(X_train ,y_train)\n",
232 |     "\n",
233 |     "print(gsearch2.best_params_)\n",
234 |     "print(gsearch2.best_score_)\n",
235 |     "#print(gsearch2.cv_results_)"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": null,
241 |    "metadata": {},
242 |    "outputs": [],
243 |    "source": [
244 |     "dir(gsearch2)"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": null,
250 |    "metadata": {},
251 |    "outputs": [],
252 |    "source": []
253 |   }
254 |  ],
255 |  "metadata": {
256 |   "kernelspec": {
257 |    "display_name": "Python 3",
258 |    "language": "python",
259 |    "name": "python3"
260 |   },
261 |   "language_info": {
262 |    "codemirror_mode": {
263 |     "name": "ipython",
264 |     "version": 3
265 |    },
266 |    "file_extension": ".py",
267 |    "mimetype": "text/x-python",
268 |    "name": "python",
269 |    "nbconvert_exporter": "python",
270 |    "pygments_lexer": "ipython3",
271 |    "version": "3.7.6"
272 |   }
273 |  },
274 |  "nbformat": 4,
275 |  "nbformat_minor": 2
276 | }
277 | 


--------------------------------------------------------------------------------
/Notebooks/09-AWS_XGBoost_Train_Host_Predict.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# SageMaker's XGBoost Built-in Algorithm on AWS"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "# Install SageMaker and boto if you don't already have them\n",
 17 |     "#!pip install --upgrade sagemaker\n",
 18 |     "#!pip install --upgrade boto3"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "## Remember these four steps:\n",
 26 |     "1) Upload Train and Validation files to S3\n",
 27 |     "\n",
 28 |     "2) Specify Algorithm and Hyperparameters\n",
 29 |     "\n",
 30 |     "3) Configure type of server and number of servers to use for Training\n",
 31 |     "\n",
 32 |     "4) Create a real-time Endpoint for interactive use cases"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "markdown",
 37 |    "metadata": {},
 38 |    "source": [
 39 |     "## Import required libraries"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "# Import required libraries\n",
 49 |     "import numpy as np\n",
 50 |     "import pandas as pd\n",
 51 |     "\n",
 52 |     "import boto3\n",
 53 |     "import re\n",
 54 |     "\n",
 55 |     "import sagemaker\n",
 56 |     "from sagemaker import get_execution_role\n",
 57 |     "# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": null,
 63 |    "metadata": {},
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "sagemaker.__version__"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": null,
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "sagemaker.__version__"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "markdown",
 80 |    "metadata": {},
 81 |    "source": [
 82 |     "## Upload Data to S3"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "# Make sure you specify your own bucket name\n",
 92 |     "bucket_name = 'aws-ml-test-nsadawi'\n",
 93 |     "\n",
 94 |     "training_folder = r'bikerental/training/'\n",
 95 |     "validation_folder = r'bikerental/validation/'\n",
 96 |     "test_folder = r'bikerental/test/'\n",
 97 |     "\n",
 98 |     "s3_model_output_location = r's3://{0}/bikerental/model'.format(bucket_name)\n",
 99 |     "s3_training_file_location = r's3://{0}/{1}'.format(bucket_name,training_folder)\n",
100 |     "s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name,validation_folder)\n",
101 |     "s3_test_file_location = r's3://{0}/{1}'.format(bucket_name,test_folder)"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "print(s3_model_output_location)\n",
111 |     "print(s3_training_file_location)\n",
112 |     "print(s3_validation_file_location)\n",
113 |     "print(s3_test_file_location)"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": null,
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "# Writing to and reading from S3 is just as easy\n",
123 |     "# files are referred to as objects in S3.  \n",
124 |     "# a file name is referred to as a key name in S3\n",
125 |     "\n",
126 |     "# Files stored in S3 are automatically replicated across 3 different availability zones \n",
127 |     "# in the region where the bucket was created.\n",
128 |     "\n",
129 |     "# http://boto3.readthedocs.io/en/latest/guide/s3.html\n",
130 |     "def write_to_s3(filename, bucket, key):\n",
131 |     "    with open(filename,'rb') as f: # Read in binary mode\n",
132 |     "        return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "write_to_s3('../Data/bike_train.csv', \n",
142 |     "            bucket_name,\n",
143 |     "            training_folder + 'bike_train.csv')\n",
144 |     "\n",
145 |     "write_to_s3('../Data/bike_validation.csv',\n",
146 |     "            bucket_name,\n",
147 |     "            validation_folder + 'bike_validation.csv')\n",
148 |     "\n",
149 |     "write_to_s3('../Data/bike_test.csv',\n",
150 |     "            bucket_name,\n",
151 |     "            test_folder + 'bike_test.csv')"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "markdown",
156 |    "metadata": {},
157 |    "source": [
158 |     "## Training Algorithm Docker Image\n",
159 |     "### SageMaker maintains a separate image for each algorithm and region\n",
160 |     "https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "metadata": {},
167 |    "outputs": [],
168 |    "source": [
169 |     "# Establish a session with AWS\n",
170 |     "sess = sagemaker.Session()"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "markdown",
175 |    "metadata": {},
176 |    "source": [
177 |     "#### Important to use an IAM Role\n",
178 |     "https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": null,
184 |    "metadata": {},
185 |    "outputs": [],
186 |    "source": [
187 |     "try:\n",
188 |     "    role = sagemaker.get_execution_role()\n",
189 |     "except ValueError:\n",
190 |     "    iam = boto3.client('iam')\n",
191 |     "    #arn:aws:iam::479320215787:role/service-role/AmazonSageMaker-ExecutionRole-20210306T134306\n",
192 |     "    role = iam.get_role(RoleName='AmazonSageMaker-ExecutionRole-20210306T134306')['Role']['Arn']"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": null,
198 |    "metadata": {},
199 |    "outputs": [],
200 |    "source": [
201 |     "# This role contains the permissions needed to train, deploy models\n",
202 |     "# SageMaker Service is trusted to assume this role\n",
203 |     "print(role)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": null,
209 |    "metadata": {},
210 |    "outputs": [],
211 |    "source": [
212 |     "# https://sagemaker.readthedocs.io/en/stable/api/utility/image_uris.html#sagemaker.image_uris.retrieve\n",
213 |     "\n",
214 |     "# SDK 2 uses image_uris.retrieve to look up the container image location\n",
215 |     "\n",
216 |     "# Use XGBoost 1.2 version \n",
217 |     "container = sagemaker.image_uris.retrieve(\"xgboost\",sess.boto_region_name,version=\"1.2-1\")\n",
218 |     "\n",
219 |     "print (f'Using XGBoost Container {container}')"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "markdown",
224 |    "metadata": {},
225 |    "source": [
226 |     "## Build Model"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": null,
232 |    "metadata": {},
233 |    "outputs": [],
234 |    "source": [
235 |     "# Configure the training job\n",
236 |     "# Specify type and number of instances to use\n",
237 |     "# S3 location where final artifacts need to be stored\n",
238 |     "\n",
239 |     "#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html\n",
240 |     "\n",
241 |     "# SDK 2.x version does not require train prefix for instance count and type\n",
242 |     "estimator = sagemaker.estimator.Estimator(\n",
243 |     "    container,\n",
244 |     "    role,\n",
245 |     "    instance_count=1,\n",
246 |     "    instance_type='ml.m4.xlarge',\n",
247 |     "    output_path=s3_model_output_location,\n",
248 |     "    sagemaker_session=sess,\n",
249 |     "    base_job_name = 'xgboost-bikerental-v1')"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": null,
255 |    "metadata": {},
256 |    "outputs": [],
257 |    "source": [
258 |     "# Specify hyperparameters that are appropriate for the training algorithm\n",
259 |     "# XGBoost Training Parameter Reference\n",
260 |     "#  https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst#learning-task-parameters\n",
261 |     "\n",
262 |     "# Note: XGBoost has deprecated the reg:linear objective; use reg:squarederror instead\n",
263 |     "estimator.set_hyperparameters(max_depth=5,\n",
264 |     "                              objective=\"reg:squarederror\",\n",
265 |     "                              eta=0.1,\n",
266 |     "                              num_round=150)"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "estimator.hyperparameters()"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "markdown",
280 |    "metadata": {
281 |     "collapsed": true
282 |    },
283 |    "source": [
284 |     "### Specify Training Data Location and Optionally, Validation Data Location"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": null,
290 |    "metadata": {},
291 |    "outputs": [],
292 |    "source": [
293 |     "# content type can be libsvm or csv for XGBoost\n",
294 |     "training_input_config = sagemaker.session.TrainingInput(\n",
295 |     "    s3_data=s3_training_file_location,\n",
296 |     "    content_type='csv',\n",
297 |     "    s3_data_type='S3Prefix')\n",
298 |     "\n",
299 |     "validation_input_config = sagemaker.session.TrainingInput(\n",
300 |     "    s3_data=s3_validation_file_location,\n",
301 |     "    content_type='csv',\n",
302 |     "    s3_data_type='S3Prefix'\n",
303 |     ")\n",
304 |     "\n",
305 |     "data_channels = {'train': training_input_config, 'validation': validation_input_config}"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": null,
311 |    "metadata": {},
312 |    "outputs": [],
313 |    "source": [
314 |     "print(training_input_config.config)\n",
315 |     "print(validation_input_config.config)"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "markdown",
320 |    "metadata": {},
321 |    "source": [
322 |     "### Train the model (takes a few minutes)"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": null,
328 |    "metadata": {},
329 |    "outputs": [],
330 |    "source": [
331 |     "%%time\n",
332 |     "# XGBoost supports \"train\", \"validation\" channels\n",
333 |     "# Reference: Supported channels by algorithm\n",
334 |     "#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html\n",
335 |     "estimator.fit(data_channels)"
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "markdown",
340 |    "metadata": {},
341 |    "source": [
342 |     "## Deploy Model (takes a few minutes)"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "code",
347 |    "execution_count": null,
348 |    "metadata": {},
349 |    "outputs": [],
350 |    "source": [
351 |     "%%time\n",
352 |     "# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html\n",
353 |     "predictor = estimator.deploy(initial_instance_count=1,\n",
354 |     "                             instance_type='ml.m4.xlarge',\n",
355 |     "                             endpoint_name = 'xgboost-bikerental-v2')"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "markdown",
360 |    "metadata": {
361 |     "collapsed": true
362 |    },
363 |    "source": [
364 |     "## Make Predictions"
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "code",
369 |    "execution_count": null,
370 |    "metadata": {},
371 |    "outputs": [],
372 |    "source": [
373 |     "# SDK 2.0 serializers\n",
374 |     "from sagemaker.serializers import CSVSerializer"
375 |    ]
376 |   },
377 |   {
378 |    "cell_type": "code",
379 |    "execution_count": null,
380 |    "metadata": {},
381 |    "outputs": [],
382 |    "source": [
383 |     "predictor.serializer = CSVSerializer()"
384 |    ]
385 |   },
386 |   {
387 |    "cell_type": "code",
388 |    "execution_count": null,
389 |    "metadata": {},
390 |    "outputs": [],
391 |    "source": [
392 |     "predictor.predict([[3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3]])"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "markdown",
397 |    "metadata": {
398 |     "collapsed": true
399 |    },
400 |    "source": [
401 |     "## Summary"
402 |    ]
403 |   },
404 |   {
405 |    "cell_type": "markdown",
406 |    "metadata": {},
407 |    "source": [
408 |     "1. Ensure Training, Test and Validation data are in S3 Bucket\n",
409 |     "2. Select Algorithm Container Registry Path - Path varies by region\n",
410 |     "3. Configure Estimator for training - Specify Algorithm container, instance count, instance type, model output location\n",
411 |     "4. Specify algorithm specific hyper parameters\n",
412 |     "5. Train model\n",
413 |     "6. Deploy model - Specify instance count, instance type and endpoint name\n",
414 |     "7. Make Predictions"
415 |    ]
416 |   },
417 |   {
418 |    "cell_type": "markdown",
419 |    "metadata": {},
420 |    "source": [
421 |     "## What if the Endpoint is Already Up and Running?"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": null,
427 |    "metadata": {},
428 |    "outputs": [],
429 |    "source": []
430 |   }
431 |  ],
432 |  "metadata": {
433 |   "kernelspec": {
434 |    "display_name": "Python 3",
435 |    "language": "python",
436 |    "name": "python3"
437 |   },
438 |   "language_info": {
439 |    "codemirror_mode": {
440 |     "name": "ipython",
441 |     "version": 3
442 |    },
443 |    "file_extension": ".py",
444 |    "mimetype": "text/x-python",
445 |    "name": "python",
446 |    "nbconvert_exporter": "python",
447 |    "pygments_lexer": "ipython3",
448 |    "version": "3.7.6"
449 |   }
450 |  },
451 |  "nbformat": 4,
452 |  "nbformat_minor": 2
453 | }
454 | 


--------------------------------------------------------------------------------
/Notebooks/10-AWS_XGBoost_Invoke_Endpoint_Predict.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "<h1>XGBoost Cloud Prediction Invocation Template</h1>\n",
  8 |     "<h4>Invoke SageMaker Prediction Service</h4>"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "import sys\n",
 18 |     "import numpy as np\n",
 19 |     "import pandas as pd\n",
 20 |     "import matplotlib.pyplot as plt\n",
 21 |     "import math\n",
 22 |     "import os\n",
 23 |     "\n",
 24 |     "import boto3\n",
 25 |     "import re\n",
 26 |     "from sagemaker import get_execution_role\n",
 27 |     "import sagemaker\n",
 28 |     "\n",
 29 |     "# SDK 2 serializers and deserializers\n",
 30 |     "from sagemaker.serializers import CSVSerializer\n",
 31 |     "from sagemaker.deserializers import JSONDeserializer"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "markdown",
 36 |    "metadata": {},
 37 |    "source": [
 38 |     "This Stackoverflow answer is useful:\n",
 39 |     "https://stackoverflow.com/a/51086736"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "# SDK 2\n",
 49 |     "# RealTimePredictor renamed to Predictor\n",
 50 |     "# https://sagemaker.readthedocs.io/en/stable/v2.html\n",
 51 |     "\n",
 52 |     "# Create a predictor and point to an existing endpoint\n",
 53 |     "endpoint_name = 'xgboost-bikerental-v2'\n",
 54 |     "predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": null,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "predictor.serializer = CSVSerializer()"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": null,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "df_all = pd.read_csv('../Data/bike_test.csv')"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "df_all.head()"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "df_all.columns[1:]"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": null,
 96 |    "metadata": {},
 97 |    "outputs": [],
 98 |    "source": [
 99 |     "# Need to pass an array to the prediction\n",
100 |     "# can pass a numpy array or a list of values [[19,1],[20,1]]\n",
101 |     "arr_test = df_all[df_all.columns[1:]].values"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "type(arr_test)"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "arr_test.shape"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "arr_test[:5]"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {},
135 |    "outputs": [],
136 |    "source": [
137 |     "result = predictor.predict(arr_test[:2])"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "metadata": {},
144 |    "outputs": [],
145 |    "source": [
146 |     "result"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "arr_test.shape"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "markdown",
160 |    "metadata": {},
161 |    "source": [
162 |     "### Split the input data into chunks\n",
163 |     "There are thousands of rows in this data set that need inference.  \n",
164 |     "When communicating over the internet, it is a good idea to split the data into chunks to prevent payload-size and timeout errors"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": [
173 |     "# For a large number of predictions, we can split the input data and\n",
174 |     "# query the prediction service one chunk at a time.\n",
175 |     "# array_split is convenient to specify how many splits are needed\n",
176 |     "predictions = []\n",
177 |     "for arr in np.array_split(arr_test,10):\n",
178 |     "    result = predictor.predict(arr)\n",
179 |     "    result = result.decode(\"utf-8\")\n",
180 |     "    result = result.split(',')\n",
181 |     "    print (arr.shape)\n",
182 |     "    predictions += [float(r) for r in result]"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": null,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "len(predictions)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "np.expm1(predictions)"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "df_all['count'] = np.expm1(predictions)"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": null,
215 |    "metadata": {},
216 |    "outputs": [],
217 |    "source": [
218 |     "df_all.head()"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": null,
224 |    "metadata": {},
225 |    "outputs": [],
226 |    "source": [
227 |     "df_all[['datetime','count']].to_csv('../Data/predicted_count_cloud.csv',index=False)"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": null,
233 |    "metadata": {},
234 |    "outputs": [],
235 |    "source": [
236 |     "# Delete Endpoint to prevent unnecessary charges\n",
237 |     "predictor.delete_endpoint()"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": null,
243 |    "metadata": {},
244 |    "outputs": [],
245 |    "source": [
246 |     "predictions"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": null,
252 |    "metadata": {},
253 |    "outputs": [],
254 |    "source": []
255 |   }
256 |  ],
257 |  "metadata": {
258 |   "kernelspec": {
259 |    "display_name": "Python 3",
260 |    "language": "python",
261 |    "name": "python3"
262 |   },
263 |   "language_info": {
264 |    "codemirror_mode": {
265 |     "name": "ipython",
266 |     "version": 3
267 |    },
268 |    "file_extension": ".py",
269 |    "mimetype": "text/x-python",
270 |    "name": "python",
271 |    "nbconvert_exporter": "python",
272 |    "pygments_lexer": "ipython3",
273 |    "version": "3.7.6"
274 |   }
275 |  },
276 |  "nbformat": 4,
277 |  "nbformat_minor": 1
278 | }
279 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # This repo is for my XGBoost training course with O'Reilly and Pearson
2 | 
3 | Some example Python configurations and code snippets are included here; they are explained in detail during the course.
4 | 


--------------------------------------------------------------------------------