├── .gitignore ├── LICENSE ├── README.md ├── data └── titanic │ ├── example_gender_submission.csv │ ├── processed │ ├── X_test.feather │ ├── X_train.feather │ ├── y_test.feather │ └── y_train.feather │ ├── submission_data.csv │ └── train.csv ├── machine_learning ├── __init__.py ├── decision_tree.py ├── gradient_boosted_decision_tree.py ├── knn.py ├── linear_regression.py ├── logistic_regression.py ├── neural_network.py ├── random_forest.py └── tree.py └── notebooks ├── decision_tree.ipynb ├── gradient_boosted_decision_tree.ipynb ├── knn.ipynb ├── linear_regression.ipynb ├── logistic_regression.ipynb ├── neural_network.ipynb ├── random_forest.ipynb └── sklearn_titanic_example.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # data volume 132 | data/mnist 133 | 134 | # ide 135 | .vscode/ 136 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Simon Ward-Jones 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine learning 2 | 3 | > Machine learning algorithm implementations and explanations 4 | 5 | ## Notebooks 6 | 7 | The notebooks contains derivations and explanations of each method followed by implementations and example usage on well know data sets 8 | 9 | ## machine_learning 10 | 11 | This contains the source code for each of the implementations (these are the same as in each notebook) 12 | 13 | ## Key algorithms covered 14 | 15 | - Linear regression 16 | - Logistic regression 17 | - Knn 18 | - Decision tree 19 | - Random forest 20 | - Gradient boosted decision tree 21 | - Neural network 22 | -------------------------------------------------------------------------------- /data/titanic/example_gender_submission.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,1 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,1 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,1 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,1 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 
| 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,1 368 | 1258,0 369 | 1259,1 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /data/titanic/processed/X_test.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonwardjones/machine_learning/1e92865bfe152acaf0df2df8f11a5f51833389a9/data/titanic/processed/X_test.feather -------------------------------------------------------------------------------- /data/titanic/processed/X_train.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonwardjones/machine_learning/1e92865bfe152acaf0df2df8f11a5f51833389a9/data/titanic/processed/X_train.feather -------------------------------------------------------------------------------- /data/titanic/processed/y_test.feather: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonwardjones/machine_learning/1e92865bfe152acaf0df2df8f11a5f51833389a9/data/titanic/processed/y_test.feather -------------------------------------------------------------------------------- /data/titanic/processed/y_train.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonwardjones/machine_learning/1e92865bfe152acaf0df2df8f11a5f51833389a9/data/titanic/processed/y_train.feather -------------------------------------------------------------------------------- /data/titanic/submission_data.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 2 | 892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q 3 | 893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S 4 | 894,2,"Myles, Mr. Thomas Francis",male,62,0,0,240276,9.6875,,Q 5 | 895,3,"Wirz, Mr. Albert",male,27,0,0,315154,8.6625,,S 6 | 896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22,1,1,3101298,12.2875,,S 7 | 897,3,"Svensson, Mr. Johan Cervin",male,14,0,0,7538,9.225,,S 8 | 898,3,"Connolly, Miss. Kate",female,30,0,0,330972,7.6292,,Q 9 | 899,2,"Caldwell, Mr. Albert Francis",male,26,1,1,248738,29,,S 10 | 900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18,0,0,2657,7.2292,,C 11 | 901,3,"Davies, Mr. John Samuel",male,21,2,0,A/4 48871,24.15,,S 12 | 902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S 13 | 903,1,"Jones, Mr. Charles Cresson",male,46,0,0,694,26,,S 14 | 904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23,1,0,21228,82.2667,B45,S 15 | 905,2,"Howard, Mr. Benjamin",male,63,1,0,24065,26,,S 16 | 906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance Toogood)",female,47,1,0,W.E.P. 5734,61.175,E31,S 17 | 907,2,"del Carlo, Mrs. Sebastiano (Argenia Genovesi)",female,24,1,0,SC/PARIS 2167,27.7208,,C 18 | 908,2,"Keane, Mr. Daniel",male,35,0,0,233734,12.35,,Q 19 | 909,3,"Assaf, Mr. Gerios",male,21,0,0,2692,7.225,,C 20 | 910,3,"Ilmakangas, Miss. Ida Livija",female,27,1,0,STON/O2. 3101270,7.925,,S 21 | 911,3,"Assaf Khalil, Mrs. Mariana (Miriam"")""",female,45,0,0,2696,7.225,,C 22 | 912,1,"Rothschild, Mr. Martin",male,55,1,0,PC 17603,59.4,,C 23 | 913,3,"Olsen, Master. Artur Karl",male,9,0,1,C 17368,3.1708,,S 24 | 914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S 25 | 915,1,"Williams, Mr. Richard Norris II",male,21,0,1,PC 17597,61.3792,,C 26 | 916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48,1,3,PC 17608,262.375,B57 B59 B63 B66,C 27 | 917,3,"Robins, Mr. Alexander A",male,50,1,0,A/5. 3337,14.5,,S 28 | 918,1,"Ostby, Miss. Helene Ragnhild",female,22,0,1,113509,61.9792,B36,C 29 | 919,3,"Daher, Mr. Shedid",male,22.5,0,0,2698,7.225,,C 30 | 920,1,"Brady, Mr. John Bertram",male,41,0,0,113054,30.5,A21,S 31 | 921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C 32 | 922,2,"Louch, Mr. Charles Alexander",male,50,1,0,SC/AH 3085,26,,S 33 | 923,2,"Jefferys, Mr. Clifford Thomas",male,24,2,0,C.A. 31029,31.5,,S 34 | 924,3,"Dean, Mrs. Bertram (Eva Georgetta Light)",female,33,1,2,C.A. 2315,20.575,,S 35 | 925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.45,,S 36 | 926,1,"Mock, Mr. Philipp Edmund",male,30,1,0,13236,57.75,C78,C 37 | 927,3,"Katavelas, Mr. Vassilios (Catavelas Vassilios"")""",male,18.5,0,0,2682,7.2292,,C 38 | 928,3,"Roth, Miss. 
Sarah A",female,,0,0,342712,8.05,,S 39 | 929,3,"Cacic, Miss. Manda",female,21,0,0,315087,8.6625,,S 40 | 930,3,"Sap, Mr. Julius",male,25,0,0,345768,9.5,,S 41 | 931,3,"Hee, Mr. Ling",male,,0,0,1601,56.4958,,S 42 | 932,3,"Karun, Mr. Franz",male,39,0,1,349256,13.4167,,C 43 | 933,1,"Franklin, Mr. Thomas Parham",male,,0,0,113778,26.55,D34,S 44 | 934,3,"Goldsmith, Mr. Nathan",male,41,0,0,SOTON/O.Q. 3101263,7.85,,S 45 | 935,2,"Corbett, Mrs. Walter H (Irene Colvin)",female,30,0,0,237249,13,,S 46 | 936,1,"Kimball, Mrs. Edwin Nelson Jr (Gertrude Parsons)",female,45,1,0,11753,52.5542,D19,S 47 | 937,3,"Peltomaki, Mr. Nikolai Johannes",male,25,0,0,STON/O 2. 3101291,7.925,,S 48 | 938,1,"Chevre, Mr. Paul Romaine",male,45,0,0,PC 17594,29.7,A9,C 49 | 939,3,"Shaughnessy, Mr. Patrick",male,,0,0,370374,7.75,,Q 50 | 940,1,"Bucknell, Mrs. William Robert (Emma Eliza Ward)",female,60,0,0,11813,76.2917,D15,C 51 | 941,3,"Coutts, Mrs. William (Winnie Minnie"" Treanor)""",female,36,0,2,C.A. 37671,15.9,,S 52 | 942,1,"Smith, Mr. Lucien Philip",male,24,1,0,13695,60,C31,S 53 | 943,2,"Pulbaum, Mr. Franz",male,27,0,0,SC/PARIS 2168,15.0333,,C 54 | 944,2,"Hocking, Miss. Ellen Nellie""""",female,20,2,1,29105,23,,S 55 | 945,1,"Fortune, Miss. Ethel Flora",female,28,3,2,19950,263,C23 C25 C27,S 56 | 946,2,"Mangiavacchi, Mr. Serafino Emilio",male,,0,0,SC/A.3 2861,15.5792,,C 57 | 947,3,"Rice, Master. Albert",male,10,4,1,382652,29.125,,Q 58 | 948,3,"Cor, Mr. Bartol",male,35,0,0,349230,7.8958,,S 59 | 949,3,"Abelseth, Mr. Olaus Jorgensen",male,25,0,0,348122,7.65,F G63,S 60 | 950,3,"Davison, Mr. Thomas Henry",male,,1,0,386525,16.1,,S 61 | 951,1,"Chaudanson, Miss. Victorine",female,36,0,0,PC 17608,262.375,B61,C 62 | 952,3,"Dika, Mr. Mirko",male,17,0,0,349232,7.8958,,S 63 | 953,2,"McCrae, Mr. Arthur Gordon",male,32,0,0,237216,13.5,,S 64 | 954,3,"Bjorklund, Mr. Ernst Herbert",male,18,0,0,347090,7.75,,S 65 | 955,3,"Bradley, Miss. Bridget Delia",female,22,0,0,334914,7.725,,Q 66 | 956,1,"Ryerson, Master. John Borie",male,13,2,2,PC 17608,262.375,B57 B59 B63 B66,C 67 | 957,2,"Corey, Mrs. Percy C (Mary Phyllis Elizabeth Miller)",female,,0,0,F.C.C. 13534,21,,S 68 | 958,3,"Burns, Miss. Mary Delia",female,18,0,0,330963,7.8792,,Q 69 | 959,1,"Moore, Mr. Clarence Bloomfield",male,47,0,0,113796,42.4,,S 70 | 960,1,"Tucker, Mr. Gilbert Milligan Jr",male,31,0,0,2543,28.5375,C53,C 71 | 961,1,"Fortune, Mrs. Mark (Mary McDougald)",female,60,1,4,19950,263,C23 C25 C27,S 72 | 962,3,"Mulvihill, Miss. Bertha E",female,24,0,0,382653,7.75,,Q 73 | 963,3,"Minkoff, Mr. Lazar",male,21,0,0,349211,7.8958,,S 74 | 964,3,"Nieminen, Miss. Manta Josefina",female,29,0,0,3101297,7.925,,S 75 | 965,1,"Ovies y Rodriguez, Mr. Servando",male,28.5,0,0,PC 17562,27.7208,D43,C 76 | 966,1,"Geiger, Miss. Amalie",female,35,0,0,113503,211.5,C130,C 77 | 967,1,"Keeping, Mr. Edwin",male,32.5,0,0,113503,211.5,C132,C 78 | 968,3,"Miles, Mr. Frank",male,,0,0,359306,8.05,,S 79 | 969,1,"Cornell, Mrs. Robert Clifford (Malvina Helen Lamson)",female,55,2,0,11770,25.7,C101,S 80 | 970,2,"Aldworth, Mr. Charles Augustus",male,30,0,0,248744,13,,S 81 | 971,3,"Doyle, Miss. Elizabeth",female,24,0,0,368702,7.75,,Q 82 | 972,3,"Boulos, Master. Akar",male,6,1,1,2678,15.2458,,C 83 | 973,1,"Straus, Mr. Isidor",male,67,1,0,PC 17483,221.7792,C55 C57,S 84 | 974,1,"Case, Mr. Howard Brown",male,49,0,0,19924,26,,S 85 | 975,3,"Demetri, Mr. Marinko",male,,0,0,349238,7.8958,,S 86 | 976,2,"Lamb, Mr. John Joseph",male,,0,0,240261,10.7083,,Q 87 | 977,3,"Khalil, Mr. Betros",male,,1,0,2660,14.4542,,C 88 | 978,3,"Barry, Miss. 
Julia",female,27,0,0,330844,7.8792,,Q 89 | 979,3,"Badman, Miss. Emily Louisa",female,18,0,0,A/4 31416,8.05,,S 90 | 980,3,"O'Donoghue, Ms. Bridget",female,,0,0,364856,7.75,,Q 91 | 981,2,"Wells, Master. Ralph Lester",male,2,1,1,29103,23,,S 92 | 982,3,"Dyker, Mrs. Adolf Fredrik (Anna Elisabeth Judith Andersson)",female,22,1,0,347072,13.9,,S 93 | 983,3,"Pedersen, Mr. Olaf",male,,0,0,345498,7.775,,S 94 | 984,1,"Davidson, Mrs. Thornton (Orian Hays)",female,27,1,2,F.C. 12750,52,B71,S 95 | 985,3,"Guest, Mr. Robert",male,,0,0,376563,8.05,,S 96 | 986,1,"Birnbaum, Mr. Jakob",male,25,0,0,13905,26,,C 97 | 987,3,"Tenglin, Mr. Gunnar Isidor",male,25,0,0,350033,7.7958,,S 98 | 988,1,"Cavendish, Mrs. Tyrell William (Julia Florence Siegel)",female,76,1,0,19877,78.85,C46,S 99 | 989,3,"Makinen, Mr. Kalle Edvard",male,29,0,0,STON/O 2. 3101268,7.925,,S 100 | 990,3,"Braf, Miss. Elin Ester Maria",female,20,0,0,347471,7.8542,,S 101 | 991,3,"Nancarrow, Mr. William Henry",male,33,0,0,A./5. 3338,8.05,,S 102 | 992,1,"Stengel, Mrs. Charles Emil Henry (Annie May Morris)",female,43,1,0,11778,55.4417,C116,C 103 | 993,2,"Weisz, Mr. Leopold",male,27,1,0,228414,26,,S 104 | 994,3,"Foley, Mr. William",male,,0,0,365235,7.75,,Q 105 | 995,3,"Johansson Palmquist, Mr. Oskar Leander",male,26,0,0,347070,7.775,,S 106 | 996,3,"Thomas, Mrs. Alexander (Thamine Thelma"")""",female,16,1,1,2625,8.5167,,C 107 | 997,3,"Holthen, Mr. Johan Martin",male,28,0,0,C 4001,22.525,,S 108 | 998,3,"Buckley, Mr. Daniel",male,21,0,0,330920,7.8208,,Q 109 | 999,3,"Ryan, Mr. Edward",male,,0,0,383162,7.75,,Q 110 | 1000,3,"Willer, Mr. Aaron (Abi Weller"")""",male,,0,0,3410,8.7125,,S 111 | 1001,2,"Swane, Mr. George",male,18.5,0,0,248734,13,F,S 112 | 1002,2,"Stanton, Mr. Samuel Ward",male,41,0,0,237734,15.0458,,C 113 | 1003,3,"Shine, Miss. Ellen Natalia",female,,0,0,330968,7.7792,,Q 114 | 1004,1,"Evans, Miss. Edith Corse",female,36,0,0,PC 17531,31.6792,A29,C 115 | 1005,3,"Buckley, Miss. Katherine",female,18.5,0,0,329944,7.2833,,Q 116 | 1006,1,"Straus, Mrs. Isidor (Rosalie Ida Blun)",female,63,1,0,PC 17483,221.7792,C55 C57,S 117 | 1007,3,"Chronopoulos, Mr. Demetrios",male,18,1,0,2680,14.4542,,C 118 | 1008,3,"Thomas, Mr. John",male,,0,0,2681,6.4375,,C 119 | 1009,3,"Sandstrom, Miss. Beatrice Irene",female,1,1,1,PP 9549,16.7,G6,S 120 | 1010,1,"Beattie, Mr. Thomson",male,36,0,0,13050,75.2417,C6,C 121 | 1011,2,"Chapman, Mrs. John Henry (Sara Elizabeth Lawry)",female,29,1,0,SC/AH 29037,26,,S 122 | 1012,2,"Watt, Miss. Bertha J",female,12,0,0,C.A. 33595,15.75,,S 123 | 1013,3,"Kiernan, Mr. John",male,,1,0,367227,7.75,,Q 124 | 1014,1,"Schabert, Mrs. Paul (Emma Mock)",female,35,1,0,13236,57.75,C28,C 125 | 1015,3,"Carver, Mr. Alfred John",male,28,0,0,392095,7.25,,S 126 | 1016,3,"Kennedy, Mr. John",male,,0,0,368783,7.75,,Q 127 | 1017,3,"Cribb, Miss. Laura Alice",female,17,0,1,371362,16.1,,S 128 | 1018,3,"Brobeck, Mr. Karl Rudolf",male,22,0,0,350045,7.7958,,S 129 | 1019,3,"McCoy, Miss. Alicia",female,,2,0,367226,23.25,,Q 130 | 1020,2,"Bowenur, Mr. Solomon",male,42,0,0,211535,13,,S 131 | 1021,3,"Petersen, Mr. Marius",male,24,0,0,342441,8.05,,S 132 | 1022,3,"Spinner, Mr. Henry John",male,32,0,0,STON/OQ. 369943,8.05,,S 133 | 1023,1,"Gracie, Col. Archibald IV",male,53,0,0,113780,28.5,C51,C 134 | 1024,3,"Lefebre, Mrs. Frank (Frances)",female,,0,4,4133,25.4667,,S 135 | 1025,3,"Thomas, Mr. Charles P",male,,1,0,2621,6.4375,,C 136 | 1026,3,"Dintcheff, Mr. Valtcho",male,43,0,0,349226,7.8958,,S 137 | 1027,3,"Carlsson, Mr. 
Carl Robert",male,24,0,0,350409,7.8542,,S 138 | 1028,3,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C 139 | 1029,2,"Schmidt, Mr. August",male,26,0,0,248659,13,,S 140 | 1030,3,"Drapkin, Miss. Jennie",female,23,0,0,SOTON/OQ 392083,8.05,,S 141 | 1031,3,"Goodwin, Mr. Charles Frederick",male,40,1,6,CA 2144,46.9,,S 142 | 1032,3,"Goodwin, Miss. Jessie Allis",female,10,5,2,CA 2144,46.9,,S 143 | 1033,1,"Daniels, Miss. Sarah",female,33,0,0,113781,151.55,,S 144 | 1034,1,"Ryerson, Mr. Arthur Larned",male,61,1,3,PC 17608,262.375,B57 B59 B63 B66,C 145 | 1035,2,"Beauchamp, Mr. Henry James",male,28,0,0,244358,26,,S 146 | 1036,1,"Lindeberg-Lind, Mr. Erik Gustaf (Mr Edward Lingrey"")""",male,42,0,0,17475,26.55,,S 147 | 1037,3,"Vander Planke, Mr. Julius",male,31,3,0,345763,18,,S 148 | 1038,1,"Hilliard, Mr. Herbert Henry",male,,0,0,17463,51.8625,E46,S 149 | 1039,3,"Davies, Mr. Evan",male,22,0,0,SC/A4 23568,8.05,,S 150 | 1040,1,"Crafton, Mr. John Bertram",male,,0,0,113791,26.55,,S 151 | 1041,2,"Lahtinen, Rev. William",male,30,1,1,250651,26,,S 152 | 1042,1,"Earnshaw, Mrs. Boulton (Olive Potter)",female,23,0,1,11767,83.1583,C54,C 153 | 1043,3,"Matinoff, Mr. Nicola",male,,0,0,349255,7.8958,,C 154 | 1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S 155 | 1045,3,"Klasen, Mrs. (Hulda Kristina Eugenia Lofqvist)",female,36,0,2,350405,12.1833,,S 156 | 1046,3,"Asplund, Master. Filip Oscar",male,13,4,2,347077,31.3875,,S 157 | 1047,3,"Duquemin, Mr. Joseph",male,24,0,0,S.O./P.P. 752,7.55,,S 158 | 1048,1,"Bird, Miss. Ellen",female,29,0,0,PC 17483,221.7792,C97,S 159 | 1049,3,"Lundin, Miss. Olga Elida",female,23,0,0,347469,7.8542,,S 160 | 1050,1,"Borebank, Mr. John James",male,42,0,0,110489,26.55,D22,S 161 | 1051,3,"Peacock, Mrs. Benjamin (Edith Nile)",female,26,0,2,SOTON/O.Q. 3101315,13.775,,S 162 | 1052,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q 163 | 1053,3,"Touma, Master. Georges Youssef",male,7,1,1,2650,15.2458,,C 164 | 1054,2,"Wright, Miss. Marion",female,26,0,0,220844,13.5,,S 165 | 1055,3,"Pearce, Mr. Ernest",male,,0,0,343271,7,,S 166 | 1056,2,"Peruschitz, Rev. Joseph Maria",male,41,0,0,237393,13,,S 167 | 1057,3,"Kink-Heilmann, Mrs. Anton (Luise Heilmann)",female,26,1,1,315153,22.025,,S 168 | 1058,1,"Brandeis, Mr. Emil",male,48,0,0,PC 17591,50.4958,B10,C 169 | 1059,3,"Ford, Mr. Edward Watson",male,18,2,2,W./C. 6608,34.375,,S 170 | 1060,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genevieve Fosdick)",female,,0,0,17770,27.7208,,C 171 | 1061,3,"Hellstrom, Miss. Hilda Maria",female,22,0,0,7548,8.9625,,S 172 | 1062,3,"Lithman, Mr. Simon",male,,0,0,S.O./P.P. 251,7.55,,S 173 | 1063,3,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,,C 174 | 1064,3,"Dyker, Mr. Adolf Fredrik",male,23,1,0,347072,13.9,,S 175 | 1065,3,"Torfa, Mr. Assad",male,,0,0,2673,7.2292,,C 176 | 1066,3,"Asplund, Mr. Carl Oscar Vilhelm Gustafsson",male,40,1,5,347077,31.3875,,S 177 | 1067,2,"Brown, Miss. Edith Eileen",female,15,0,2,29750,39,,S 178 | 1068,2,"Sincock, Miss. Maude",female,20,0,0,C.A. 33112,36.75,,S 179 | 1069,1,"Stengel, Mr. Charles Emil Henry",male,54,1,0,11778,55.4417,C116,C 180 | 1070,2,"Becker, Mrs. Allen Oliver (Nellie E Baumgardner)",female,36,0,3,230136,39,F4,S 181 | 1071,1,"Compton, Mrs. Alexander Taylor (Mary Eliza Ingersoll)",female,64,0,2,PC 17756,83.1583,E45,C 182 | 1072,2,"McCrie, Mr. James Matthew",male,30,0,0,233478,13,,S 183 | 1073,1,"Compton, Mr. Alexander Taylor Jr",male,37,1,1,PC 17756,83.1583,E52,C 184 | 1074,1,"Marvin, Mrs. 
Daniel Warner (Mary Graham Carmichael Farquarson)",female,18,1,0,113773,53.1,D30,S 185 | 1075,3,"Lane, Mr. Patrick",male,,0,0,7935,7.75,,Q 186 | 1076,1,"Douglas, Mrs. Frederick Charles (Mary Helene Baxter)",female,27,1,1,PC 17558,247.5208,B58 B60,C 187 | 1077,2,"Maybery, Mr. Frank Hubert",male,40,0,0,239059,16,,S 188 | 1078,2,"Phillips, Miss. Alice Frances Louisa",female,21,0,1,S.O./P.P. 2,21,,S 189 | 1079,3,"Davies, Mr. Joseph",male,17,2,0,A/4 48873,8.05,,S 190 | 1080,3,"Sage, Miss. Ada",female,,8,2,CA. 2343,69.55,,S 191 | 1081,2,"Veal, Mr. James",male,40,0,0,28221,13,,S 192 | 1082,2,"Angle, Mr. William A",male,34,1,0,226875,26,,S 193 | 1083,1,"Salomon, Mr. Abraham L",male,,0,0,111163,26,,S 194 | 1084,3,"van Billiard, Master. Walter John",male,11.5,1,1,A/5. 851,14.5,,S 195 | 1085,2,"Lingane, Mr. John",male,61,0,0,235509,12.35,,Q 196 | 1086,2,"Drew, Master. Marshall Brines",male,8,0,2,28220,32.5,,S 197 | 1087,3,"Karlsson, Mr. Julius Konrad Eugen",male,33,0,0,347465,7.8542,,S 198 | 1088,1,"Spedden, Master. Robert Douglas",male,6,0,2,16966,134.5,E34,C 199 | 1089,3,"Nilsson, Miss. Berta Olivia",female,18,0,0,347066,7.775,,S 200 | 1090,2,"Baimbrigge, Mr. Charles Robert",male,23,0,0,C.A. 31030,10.5,,S 201 | 1091,3,"Rasmussen, Mrs. (Lena Jacobsen Solvang)",female,,0,0,65305,8.1125,,S 202 | 1092,3,"Murphy, Miss. Nora",female,,0,0,36568,15.5,,Q 203 | 1093,3,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.33,0,2,347080,14.4,,S 204 | 1094,1,"Astor, Col. John Jacob",male,47,1,0,PC 17757,227.525,C62 C64,C 205 | 1095,2,"Quick, Miss. Winifred Vera",female,8,1,1,26360,26,,S 206 | 1096,2,"Andrew, Mr. Frank Thomas",male,25,0,0,C.A. 34050,10.5,,S 207 | 1097,1,"Omont, Mr. Alfred Fernand",male,,0,0,F.C. 12998,25.7417,,C 208 | 1098,3,"McGowan, Miss. Katherine",female,35,0,0,9232,7.75,,Q 209 | 1099,2,"Collett, Mr. Sidney C Stuart",male,24,0,0,28034,10.5,,S 210 | 1100,1,"Rosenbaum, Miss. Edith Louise",female,33,0,0,PC 17613,27.7208,A11,C 211 | 1101,3,"Delalic, Mr. Redjo",male,25,0,0,349250,7.8958,,S 212 | 1102,3,"Andersen, Mr. Albert Karvin",male,32,0,0,C 4001,22.525,,S 213 | 1103,3,"Finoli, Mr. Luigi",male,,0,0,SOTON/O.Q. 3101308,7.05,,S 214 | 1104,2,"Deacon, Mr. Percy William",male,17,0,0,S.O.C. 14879,73.5,,S 215 | 1105,2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60,1,0,24065,26,,S 216 | 1106,3,"Andersson, Miss. Ida Augusta Margareta",female,38,4,2,347091,7.775,,S 217 | 1107,1,"Head, Mr. Christopher",male,42,0,0,113038,42.5,B11,S 218 | 1108,3,"Mahon, Miss. Bridget Delia",female,,0,0,330924,7.8792,,Q 219 | 1109,1,"Wick, Mr. George Dennick",male,57,1,1,36928,164.8667,,S 220 | 1110,1,"Widener, Mrs. George Dunton (Eleanor Elkins)",female,50,1,1,113503,211.5,C80,C 221 | 1111,3,"Thomson, Mr. Alexander Morrison",male,,0,0,32302,8.05,,S 222 | 1112,2,"Duran y More, Miss. Florentina",female,30,1,0,SC/PARIS 2148,13.8583,,C 223 | 1113,3,"Reynolds, Mr. Harold J",male,21,0,0,342684,8.05,,S 224 | 1114,2,"Cook, Mrs. (Selena Rogers)",female,22,0,0,W./C. 14266,10.5,F33,S 225 | 1115,3,"Karlsson, Mr. Einar Gervasius",male,21,0,0,350053,7.7958,,S 226 | 1116,1,"Candee, Mrs. Edward (Helen Churchill Hungerford)",female,53,0,0,PC 17606,27.4458,,C 227 | 1117,3,"Moubarek, Mrs. George (Omine Amenia"" Alexander)""",female,,0,2,2661,15.2458,,C 228 | 1118,3,"Asplund, Mr. Johan Charles",male,23,0,0,350054,7.7958,,S 229 | 1119,3,"McNeill, Miss. Bridget",female,,0,0,370368,7.75,,Q 230 | 1120,3,"Everett, Mr. Thomas James",male,40.5,0,0,C.A. 6212,15.1,,S 231 | 1121,2,"Hocking, Mr. 
Samuel James Metcalfe",male,36,0,0,242963,13,,S 232 | 1122,2,"Sweet, Mr. George Frederick",male,14,0,0,220845,65,,S 233 | 1123,1,"Willard, Miss. Constance",female,21,0,0,113795,26.55,,S 234 | 1124,3,"Wiklund, Mr. Karl Johan",male,21,1,0,3101266,6.4958,,S 235 | 1125,3,"Linehan, Mr. Michael",male,,0,0,330971,7.8792,,Q 236 | 1126,1,"Cumings, Mr. John Bradley",male,39,1,0,PC 17599,71.2833,C85,C 237 | 1127,3,"Vendel, Mr. Olof Edvin",male,20,0,0,350416,7.8542,,S 238 | 1128,1,"Warren, Mr. Frank Manley",male,64,1,0,110813,75.25,D37,C 239 | 1129,3,"Baccos, Mr. Raffull",male,20,0,0,2679,7.225,,C 240 | 1130,2,"Hiltunen, Miss. Marta",female,18,1,1,250650,13,,S 241 | 1131,1,"Douglas, Mrs. Walter Donald (Mahala Dutton)",female,48,1,0,PC 17761,106.425,C86,C 242 | 1132,1,"Lindstrom, Mrs. Carl Johan (Sigrid Posse)",female,55,0,0,112377,27.7208,,C 243 | 1133,2,"Christy, Mrs. (Alice Frances)",female,45,0,2,237789,30,,S 244 | 1134,1,"Spedden, Mr. Frederic Oakley",male,45,1,1,16966,134.5,E34,C 245 | 1135,3,"Hyman, Mr. Abraham",male,,0,0,3470,7.8875,,S 246 | 1136,3,"Johnston, Master. William Arthur Willie""""",male,,1,2,W./C. 6607,23.45,,S 247 | 1137,1,"Kenyon, Mr. Frederick R",male,41,1,0,17464,51.8625,D21,S 248 | 1138,2,"Karnes, Mrs. J Frank (Claire Bennett)",female,22,0,0,F.C.C. 13534,21,,S 249 | 1139,2,"Drew, Mr. James Vivian",male,42,1,1,28220,32.5,,S 250 | 1140,2,"Hold, Mrs. Stephen (Annie Margaret Hill)",female,29,1,0,26707,26,,S 251 | 1141,3,"Khalil, Mrs. Betros (Zahie Maria"" Elias)""",female,,1,0,2660,14.4542,,C 252 | 1142,2,"West, Miss. Barbara J",female,0.92,1,2,C.A. 34651,27.75,,S 253 | 1143,3,"Abrahamsson, Mr. Abraham August Johannes",male,20,0,0,SOTON/O2 3101284,7.925,,S 254 | 1144,1,"Clark, Mr. Walter Miller",male,27,1,0,13508,136.7792,C89,C 255 | 1145,3,"Salander, Mr. Karl Johan",male,24,0,0,7266,9.325,,S 256 | 1146,3,"Wenzel, Mr. Linhart",male,32.5,0,0,345775,9.5,,S 257 | 1147,3,"MacKay, Mr. George William",male,,0,0,C.A. 42795,7.55,,S 258 | 1148,3,"Mahon, Mr. John",male,,0,0,AQ/4 3130,7.75,,Q 259 | 1149,3,"Niklasson, Mr. Samuel",male,28,0,0,363611,8.05,,S 260 | 1150,2,"Bentham, Miss. Lilian W",female,19,0,0,28404,13,,S 261 | 1151,3,"Midtsjo, Mr. Karl Albert",male,21,0,0,345501,7.775,,S 262 | 1152,3,"de Messemaeker, Mr. Guillaume Joseph",male,36.5,1,0,345572,17.4,,S 263 | 1153,3,"Nilsson, Mr. August Ferdinand",male,21,0,0,350410,7.8542,,S 264 | 1154,2,"Wells, Mrs. Arthur Henry (Addie"" Dart Trevaskis)""",female,29,0,2,29103,23,,S 265 | 1155,3,"Klasen, Miss. Gertrud Emilia",female,1,1,1,350405,12.1833,,S 266 | 1156,2,"Portaluppi, Mr. Emilio Ilario Giuseppe",male,30,0,0,C.A. 34644,12.7375,,C 267 | 1157,3,"Lyntakoff, Mr. Stanko",male,,0,0,349235,7.8958,,S 268 | 1158,1,"Chisholm, Mr. Roderick Robert Crispin",male,,0,0,112051,0,,S 269 | 1159,3,"Warren, Mr. Charles William",male,,0,0,C.A. 49867,7.55,,S 270 | 1160,3,"Howard, Miss. May Elizabeth",female,,0,0,A. 2. 39186,8.05,,S 271 | 1161,3,"Pokrnic, Mr. Mate",male,17,0,0,315095,8.6625,,S 272 | 1162,1,"McCaffry, Mr. Thomas Francis",male,46,0,0,13050,75.2417,C6,C 273 | 1163,3,"Fox, Mr. Patrick",male,,0,0,368573,7.75,,Q 274 | 1164,1,"Clark, Mrs. Walter Miller (Virginia McDowell)",female,26,1,0,13508,136.7792,C89,C 275 | 1165,3,"Lennon, Miss. Mary",female,,1,0,370371,15.5,,Q 276 | 1166,3,"Saade, Mr. Jean Nassr",male,,0,0,2676,7.225,,C 277 | 1167,2,"Bryhl, Miss. Dagmar Jenny Ingeborg ",female,20,1,0,236853,26,,S 278 | 1168,2,"Parker, Mr. Clifford Richard",male,28,0,0,SC 14888,10.5,,S 279 | 1169,2,"Faunthorpe, Mr. 
Harry",male,40,1,0,2926,26,,S 280 | 1170,2,"Ware, Mr. John James",male,30,1,0,CA 31352,21,,S 281 | 1171,2,"Oxenham, Mr. Percy Thomas",male,22,0,0,W./C. 14260,10.5,,S 282 | 1172,3,"Oreskovic, Miss. Jelka",female,23,0,0,315085,8.6625,,S 283 | 1173,3,"Peacock, Master. Alfred Edward",male,0.75,1,1,SOTON/O.Q. 3101315,13.775,,S 284 | 1174,3,"Fleming, Miss. Honora",female,,0,0,364859,7.75,,Q 285 | 1175,3,"Touma, Miss. Maria Youssef",female,9,1,1,2650,15.2458,,C 286 | 1176,3,"Rosblom, Miss. Salli Helena",female,2,1,1,370129,20.2125,,S 287 | 1177,3,"Dennis, Mr. William",male,36,0,0,A/5 21175,7.25,,S 288 | 1178,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S 289 | 1179,1,"Snyder, Mr. John Pillsbury",male,24,1,0,21228,82.2667,B45,S 290 | 1180,3,"Mardirosian, Mr. Sarkis",male,,0,0,2655,7.2292,F E46,C 291 | 1181,3,"Ford, Mr. Arthur",male,,0,0,A/5 1478,8.05,,S 292 | 1182,1,"Rheims, Mr. George Alexander Lucien",male,,0,0,PC 17607,39.6,,S 293 | 1183,3,"Daly, Miss. Margaret Marcella Maggie""""",female,30,0,0,382650,6.95,,Q 294 | 1184,3,"Nasr, Mr. Mustafa",male,,0,0,2652,7.2292,,C 295 | 1185,1,"Dodge, Dr. Washington",male,53,1,1,33638,81.8583,A34,S 296 | 1186,3,"Wittevrongel, Mr. Camille",male,36,0,0,345771,9.5,,S 297 | 1187,3,"Angheloff, Mr. Minko",male,26,0,0,349202,7.8958,,S 298 | 1188,2,"Laroche, Miss. Louise",female,1,1,2,SC/Paris 2123,41.5792,,C 299 | 1189,3,"Samaan, Mr. Hanna",male,,2,0,2662,21.6792,,C 300 | 1190,1,"Loring, Mr. Joseph Holland",male,30,0,0,113801,45.5,,S 301 | 1191,3,"Johansson, Mr. Nils",male,29,0,0,347467,7.8542,,S 302 | 1192,3,"Olsson, Mr. Oscar Wilhelm",male,32,0,0,347079,7.775,,S 303 | 1193,2,"Malachard, Mr. Noel",male,,0,0,237735,15.0458,D,C 304 | 1194,2,"Phillips, Mr. Escott Robert",male,43,0,1,S.O./P.P. 2,21,,S 305 | 1195,3,"Pokrnic, Mr. Tome",male,24,0,0,315092,8.6625,,S 306 | 1196,3,"McCarthy, Miss. Catherine Katie""""",female,,0,0,383123,7.75,,Q 307 | 1197,1,"Crosby, Mrs. Edward Gifford (Catherine Elizabeth Halstead)",female,64,1,1,112901,26.55,B26,S 308 | 1198,1,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S 309 | 1199,3,"Aks, Master. Philip Frank",male,0.83,0,1,392091,9.35,,S 310 | 1200,1,"Hays, Mr. Charles Melville",male,55,1,1,12749,93.5,B69,S 311 | 1201,3,"Hansen, Mrs. Claus Peter (Jennie L Howard)",female,45,1,0,350026,14.1083,,S 312 | 1202,3,"Cacic, Mr. Jego Grga",male,18,0,0,315091,8.6625,,S 313 | 1203,3,"Vartanian, Mr. David",male,22,0,0,2658,7.225,,C 314 | 1204,3,"Sadowitz, Mr. Harry",male,,0,0,LP 1588,7.575,,S 315 | 1205,3,"Carr, Miss. Jeannie",female,37,0,0,368364,7.75,,Q 316 | 1206,1,"White, Mrs. John Stuart (Ella Holmes)",female,55,0,0,PC 17760,135.6333,C32,C 317 | 1207,3,"Hagardon, Miss. Kate",female,17,0,0,AQ/3. 30631,7.7333,,Q 318 | 1208,1,"Spencer, Mr. William Augustus",male,57,1,0,PC 17569,146.5208,B78,C 319 | 1209,2,"Rogers, Mr. Reginald Harry",male,19,0,0,28004,10.5,,S 320 | 1210,3,"Jonsson, Mr. Nils Hilding",male,27,0,0,350408,7.8542,,S 321 | 1211,2,"Jefferys, Mr. Ernest Wilfred",male,22,2,0,C.A. 31029,31.5,,S 322 | 1212,3,"Andersson, Mr. Johan Samuel",male,26,0,0,347075,7.775,,S 323 | 1213,3,"Krekorian, Mr. Neshan",male,25,0,0,2654,7.2292,F E57,C 324 | 1214,2,"Nesson, Mr. Israel",male,26,0,0,244368,13,F2,S 325 | 1215,1,"Rowe, Mr. Alfred G",male,33,0,0,113790,26.55,,S 326 | 1216,1,"Kreuchen, Miss. Emilie",female,39,0,0,24160,211.3375,,S 327 | 1217,3,"Assam, Mr. Ali",male,23,0,0,SOTON/O.Q. 3101309,7.05,,S 328 | 1218,2,"Becker, Miss. 
Ruth Elizabeth",female,12,2,1,230136,39,F4,S 329 | 1219,1,"Rosenshine, Mr. George (Mr George Thorne"")""",male,46,0,0,PC 17585,79.2,,C 330 | 1220,2,"Clarke, Mr. Charles Valentine",male,29,1,0,2003,26,,S 331 | 1221,2,"Enander, Mr. Ingvar",male,21,0,0,236854,13,,S 332 | 1222,2,"Davies, Mrs. John Morgan (Elizabeth Agnes Mary White) ",female,48,0,2,C.A. 33112,36.75,,S 333 | 1223,1,"Dulles, Mr. William Crothers",male,39,0,0,PC 17580,29.7,A18,C 334 | 1224,3,"Thomas, Mr. Tannous",male,,0,0,2684,7.225,,C 335 | 1225,3,"Nakid, Mrs. Said (Waika Mary"" Mowad)""",female,19,1,1,2653,15.7417,,C 336 | 1226,3,"Cor, Mr. Ivan",male,27,0,0,349229,7.8958,,S 337 | 1227,1,"Maguire, Mr. John Edward",male,30,0,0,110469,26,C106,S 338 | 1228,2,"de Brito, Mr. Jose Joaquim",male,32,0,0,244360,13,,S 339 | 1229,3,"Elias, Mr. Joseph",male,39,0,2,2675,7.2292,,C 340 | 1230,2,"Denbury, Mr. Herbert",male,25,0,0,C.A. 31029,31.5,,S 341 | 1231,3,"Betros, Master. Seman",male,,0,0,2622,7.2292,,C 342 | 1232,2,"Fillbrook, Mr. Joseph Charles",male,18,0,0,C.A. 15185,10.5,,S 343 | 1233,3,"Lundstrom, Mr. Thure Edvin",male,32,0,0,350403,7.5792,,S 344 | 1234,3,"Sage, Mr. John George",male,,1,9,CA. 2343,69.55,,S 345 | 1235,1,"Cardeza, Mrs. James Warburton Martinez (Charlotte Wardle Drake)",female,58,0,1,PC 17755,512.3292,B51 B53 B55,C 346 | 1236,3,"van Billiard, Master. James William",male,,1,1,A/5. 851,14.5,,S 347 | 1237,3,"Abelseth, Miss. Karen Marie",female,16,0,0,348125,7.65,,S 348 | 1238,2,"Botsford, Mr. William Hull",male,26,0,0,237670,13,,S 349 | 1239,3,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38,0,0,2688,7.2292,,C 350 | 1240,2,"Giles, Mr. Ralph",male,24,0,0,248726,13.5,,S 351 | 1241,2,"Walcroft, Miss. Nellie",female,31,0,0,F.C.C. 13528,21,,S 352 | 1242,1,"Greenfield, Mrs. Leo David (Blanche Strouse)",female,45,0,1,PC 17759,63.3583,D10 D12,C 353 | 1243,2,"Stokes, Mr. Philip Joseph",male,25,0,0,F.C.C. 13540,10.5,,S 354 | 1244,2,"Dibden, Mr. William",male,18,0,0,S.O.C. 14879,73.5,,S 355 | 1245,2,"Herman, Mr. Samuel",male,49,1,2,220845,65,,S 356 | 1246,3,"Dean, Miss. Elizabeth Gladys Millvina""""",female,0.17,1,2,C.A. 2315,20.575,,S 357 | 1247,1,"Julian, Mr. Henry Forbes",male,50,0,0,113044,26,E60,S 358 | 1248,1,"Brown, Mrs. John Murray (Caroline Lane Lamson)",female,59,2,0,11769,51.4792,C101,S 359 | 1249,3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S 360 | 1250,3,"O'Keefe, Mr. Patrick",male,,0,0,368402,7.75,,Q 361 | 1251,3,"Lindell, Mrs. Edvard Bengtsson (Elin Gerda Persson)",female,30,1,0,349910,15.55,,S 362 | 1252,3,"Sage, Master. William Henry",male,14.5,8,2,CA. 2343,69.55,,S 363 | 1253,2,"Mallet, Mrs. Albert (Antoinette Magnin)",female,24,1,1,S.C./PARIS 2079,37.0042,,C 364 | 1254,2,"Ware, Mrs. John James (Florence Louise Long)",female,31,0,0,CA 31352,21,,S 365 | 1255,3,"Strilic, Mr. Ivan",male,27,0,0,315083,8.6625,,S 366 | 1256,1,"Harder, Mrs. George Achilles (Dorothy Annan)",female,25,1,0,11765,55.4417,E50,C 367 | 1257,3,"Sage, Mrs. John (Annie Bullen)",female,,1,9,CA. 2343,69.55,,S 368 | 1258,3,"Caram, Mr. Joseph",male,,1,0,2689,14.4583,,C 369 | 1259,3,"Riihivouri, Miss. Susanna Juhantytar Sanni""""",female,22,0,0,3101295,39.6875,,S 370 | 1260,1,"Gibson, Mrs. Leonard (Pauline C Boeson)",female,45,0,1,112378,59.4,,C 371 | 1261,2,"Pallas y Castello, Mr. Emilio",male,29,0,0,SC/PARIS 2147,13.8583,,C 372 | 1262,2,"Giles, Mr. Edgar",male,21,1,0,28133,11.5,,S 373 | 1263,1,"Wilson, Miss. Helen Alice",female,31,0,0,16966,134.5,E39 E41,C 374 | 1264,1,"Ismay, Mr. 
Joseph Bruce",male,49,0,0,112058,0,B52 B54 B56,S 375 | 1265,2,"Harbeck, Mr. William H",male,44,0,0,248746,13,,S 376 | 1266,1,"Dodge, Mrs. Washington (Ruth Vidaver)",female,54,1,1,33638,81.8583,A34,S 377 | 1267,1,"Bowen, Miss. Grace Scott",female,45,0,0,PC 17608,262.375,,C 378 | 1268,3,"Kink, Miss. Maria",female,22,2,0,315152,8.6625,,S 379 | 1269,2,"Cotterill, Mr. Henry Harry""""",male,21,0,0,29107,11.5,,S 380 | 1270,1,"Hipkins, Mr. William Edward",male,55,0,0,680,50,C39,S 381 | 1271,3,"Asplund, Master. Carl Edgar",male,5,4,2,347077,31.3875,,S 382 | 1272,3,"O'Connor, Mr. Patrick",male,,0,0,366713,7.75,,Q 383 | 1273,3,"Foley, Mr. Joseph",male,26,0,0,330910,7.8792,,Q 384 | 1274,3,"Risien, Mrs. Samuel (Emma)",female,,0,0,364498,14.5,,S 385 | 1275,3,"McNamee, Mrs. Neal (Eileen O'Leary)",female,19,1,0,376566,16.1,,S 386 | 1276,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S 387 | 1277,2,"Herman, Miss. Kate",female,24,1,2,220845,65,,S 388 | 1278,3,"Aronsson, Mr. Ernst Axel Algot",male,24,0,0,349911,7.775,,S 389 | 1279,2,"Ashby, Mr. John",male,57,0,0,244346,13,,S 390 | 1280,3,"Canavan, Mr. Patrick",male,21,0,0,364858,7.75,,Q 391 | 1281,3,"Palsson, Master. Paul Folke",male,6,3,1,349909,21.075,,S 392 | 1282,1,"Payne, Mr. Vivian Ponsonby",male,23,0,0,12749,93.5,B24,S 393 | 1283,1,"Lines, Mrs. Ernest H (Elizabeth Lindsey James)",female,51,0,1,PC 17592,39.4,D28,S 394 | 1284,3,"Abbott, Master. Eugene Joseph",male,13,0,2,C.A. 2673,20.25,,S 395 | 1285,2,"Gilbert, Mr. William",male,47,0,0,C.A. 30769,10.5,,S 396 | 1286,3,"Kink-Heilmann, Mr. Anton",male,29,3,1,315153,22.025,,S 397 | 1287,1,"Smith, Mrs. Lucien Philip (Mary Eloise Hughes)",female,18,1,0,13695,60,C31,S 398 | 1288,3,"Colbert, Mr. Patrick",male,24,0,0,371109,7.25,,Q 399 | 1289,1,"Frolicher-Stehli, Mrs. Maxmillian (Margaretha Emerentia Stehli)",female,48,1,1,13567,79.2,B41,C 400 | 1290,3,"Larsson-Rondberg, Mr. Edvard A",male,22,0,0,347065,7.775,,S 401 | 1291,3,"Conlon, Mr. Thomas Henry",male,31,0,0,21332,7.7333,,Q 402 | 1292,1,"Bonnell, Miss. Caroline",female,30,0,0,36928,164.8667,C7,S 403 | 1293,2,"Gale, Mr. Harry",male,38,1,0,28664,21,,S 404 | 1294,1,"Gibson, Miss. Dorothy Winifred",female,22,0,1,112378,59.4,,C 405 | 1295,1,"Carrau, Mr. Jose Pedro",male,17,0,0,113059,47.1,,S 406 | 1296,1,"Frauenthal, Mr. Isaac Gerald",male,43,1,0,17765,27.7208,D40,C 407 | 1297,2,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,20,0,0,SC/PARIS 2166,13.8625,D38,C 408 | 1298,2,"Ware, Mr. William Jeffery",male,23,1,0,28666,10.5,,S 409 | 1299,1,"Widener, Mr. George Dunton",male,50,1,1,113503,211.5,C80,C 410 | 1300,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q 411 | 1301,3,"Peacock, Miss. Treasteall",female,3,1,1,SOTON/O.Q. 3101315,13.775,,S 412 | 1302,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.75,,Q 413 | 1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37,1,0,19928,90,C78,Q 414 | 1304,3,"Henriksson, Miss. Jenny Lovisa",female,28,0,0,347086,7.775,,S 415 | 1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S 416 | 1306,1,"Oliva y Ocana, Dona. Fermina",female,39,0,0,PC 17758,108.9,C105,C 417 | 1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S 418 | 1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S 419 | 1309,3,"Peter, Master. 
Michael J",male,,1,1,2668,22.3583,,C 420 | -------------------------------------------------------------------------------- /machine_learning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonwardjones/machine_learning/1e92865bfe152acaf0df2df8f11a5f51833389a9/machine_learning/__init__.py -------------------------------------------------------------------------------- /machine_learning/decision_tree.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | 5 | from .tree import TreeNode 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger(__file__) 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | class DecisionTree(): 13 | 14 | def __init__(self, 15 | max_depth=2, 16 | min_samples_split=2, 17 | min_samples_leaf=1, 18 | n_classes=2, 19 | max_features=None, 20 | impurity='gini', 21 | is_classifier=True): 22 | """Decision tree model 23 | 24 | Parameters: 25 | ---------- 26 | max_depth: int 27 | The maximum depth allowed when "growing" a tree 28 | min_samples_split: int 29 | The minimum number of samples required to allow a split at a 30 | node 31 | min_samples_leaf: int 32 | The minimum number of samples allowed in a leaf. A split 33 | candidate leading to less samples in a node than the 34 | min_samples_leaf will be rejected 35 | n_classes: int, optional, default 2 36 | Number of classes in a classification setting. Ignored when 37 | self.is_classifier = False 38 | max_features: int, optional, default None 39 | If set to 'sqrt' then only a random subset of features are 40 | used to split at each node, the number of features used in 41 | this case is sqrt(n_features). 42 | Else all the features are considered when splitting at each 43 | node 44 | impurity: str, optional, default 'gini' 45 | The impurity measure to use when splitting at each node. 46 | I have currently only implemented two 47 | 'gini' - Uses the gini impurity (for classification) 48 | 'mse' - Uses the mean square error - equal to variance (for 49 | regression) 50 | is_classifier: bool, optional, default True 51 | Is the model used as part of a classification problem 52 | or a regression problem. Should be set to True if 53 | classification, False if regression 54 | """ 55 | self.max_depth = max_depth 56 | self.min_samples_split = min_samples_split 57 | self.min_samples_leaf = min_samples_leaf 58 | self.n_classes = n_classes 59 | self.max_features = max_features 60 | self.impurity = impurity 61 | self.is_classifier = is_classifier 62 | 63 | self.is_fitted = False 64 | self.tree = None 65 | 66 | def fit(self, X, y): 67 | """Fits the decision tree model 68 | 69 | The tree is fitted by instantiaing a root TreeNode instance and 70 | then calling the recursive_split method. This iteratively grows 71 | the tree by finding the best split to reduce the impurity the 72 | most. 
73 | 74 | Parameters: 75 | ---------- 76 | X: numpy.ndarray 77 | Training data, shape (m samples, n features) 78 | y: numpy.ndarray 79 | Target values, shape (m samples, 1) 80 | If classifier with n_classes the values are assumed to be in 81 | 0, ..., n-1 82 | """ 83 | y_shape = (X.shape[0], 1) 84 | data = np.concatenate((X, y.reshape(y_shape)), axis=1) 85 | self.tree = TreeNode( 86 | data=data, 87 | max_depth=self.max_depth, 88 | min_samples_split=self.min_samples_split, 89 | min_samples_leaf=self.min_samples_leaf, 90 | n_classes=self.n_classes, 91 | max_features=self.max_features, 92 | impurity=self.impurity, 93 | is_classifier=self.is_classifier) 94 | self.tree.recursive_split() 95 | self.is_fitted = True 96 | 97 | def predict(self, data): 98 | """Predicts target values or class labels for classification 99 | 100 | Predicts target values/class for each row in data by walking the 101 | tree and returning the leaf node value for regression or the 102 | class with the largest predicted probability for classification 103 | 104 | Parameters: 105 | ---------- 106 | data: numpy.ndarray 107 | The input data with shape (m samples, n features) 108 | 109 | Returns: 110 | ------- 111 | numpy.ndarray: 112 | Predicted target values or class labels for classification 113 | """ 114 | if not self.is_fitted: 115 | raise Exception('Decision tree not fitted') 116 | return self.tree.predict(data) 117 | 118 | def predict_proba(self, data): 119 | """Predicts class probabilities for input data 120 | 121 | Predicts class probabilities for each row in data by walking the 122 | tree and returning the leaf node class probabilities 123 | 124 | Parameters: 125 | ---------- 126 | data: numpy.ndarray 127 | The input data with shape (m samples, n features) 128 | 129 | Returns: 130 | ------- 131 | numpy.ndarray: 132 | Predicted sample class probabilities, 133 | shape (m samples, n classes) 134 | """ 135 | if not self.is_fitted: 136 | raise Exception('Decision tree not fitted') 137 | return self.tree.predict_proba(data) 138 | 139 | def render(self, feature_names): 140 | """Returns Digraph visualizing the decision tree (if fitted) 141 | 142 | Parameters: 143 | ---------- 144 | feature_names: list[str] 145 | List of feature names 146 | 147 | Returns: 148 | ------- 149 | graphviz.Digraph: 150 | dot for tree diagram visual 151 | """ 152 | if not self.is_fitted: 153 | print('Decision tree not fitted') 154 | else: 155 | return self.tree.dot(feature_names=feature_names) 156 | -------------------------------------------------------------------------------- /machine_learning/gradient_boosted_decision_tree.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | from scipy.special import expit, logsumexp 5 | 6 | from .decision_tree import DecisionTree 7 | 8 | logging.basicConfig() 9 | logger = logging.getLogger(__file__) 10 | logger.setLevel(logging.INFO) 11 | 12 | 13 | class GradientBoostedDecisionTree(): 14 | 15 | def __init__(self, 16 | max_depth=2, 17 | min_samples_split=2, 18 | min_samples_leaf=1, 19 | n_classes=2, 20 | max_features=None, 21 | is_classifier=True, 22 | n_trees=10, 23 | learning_rate=0.1): 24 | """Gradient boosted decision tree model 25 | 26 | The trees are grown sequentially and fitted to the negative 27 | gradient of the cost function with respect to the raw predicted 28 | values at the previous stage. 
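As a concrete illustration of the quantity each new tree is fitted to (a sketch only, mirroring the `negative_gradient` method defined later in this class; for two-class log loss the negative gradient is the residual between the labels and the current predicted probabilities):

```python
import numpy as np
from scipy.special import expit  # sigmoid

y = np.array([1, 0, 1, 1])             # binary targets
raw = np.array([0.2, -0.4, 1.3, 0.0])  # current raw (logit) predictions
pseudo_residual = y - expit(raw)       # negative gradient of the log loss w.r.t. raw
# the next tree is fitted to pseudo_residual as a regression target, and its
# learning-rate-scaled predictions are added to raw to form the next stage
```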
29 | 30 | Note I use the term raw_predictions as raw predicted values 31 | must be transformed to find the probability estimates in the 32 | case of classification. 33 | 34 | In practice these gradients are equal to the residual. 35 | 36 | The raw predictions for a stage are made by adding the new delta 37 | model (multiplied by the learning rate) to the raw predictions 38 | from the previous stage 39 | 40 | Parameters: 41 | ---------- 42 | max_depth: int 43 | The maximum depth allowed when "growing" a tree 44 | min_samples_split: int 45 | The minimum number of samples required to allow a split at a 46 | node 47 | min_samples_leaf: int 48 | The minimum number of samples allowed in a leaf. A split 49 | candidate leading to less samples in a node than the 50 | min_samples_leaf will be rejected 51 | n_classes: int, optional, default 2 52 | Number of classes in a classification setting. Ignored when 53 | self.is_classifier = False 54 | max_features: int, optional, default None 55 | If set to 'sqrt' then only a random subset of features are 56 | used to split at each node, the number of features used in 57 | this case is sqrt(n_features). 58 | Else all the features are considered when splitting at each 59 | node 60 | is_classifier: bool, optional, default True 61 | Is the model used as part of a classification problem 62 | or a regression problem. Should be set to True if 63 | classification, False if regression 64 | n_trees: int, optional, default 10 65 | Number of trees, equivalently gradient steps 66 | learning_rate: float, optional, default 0.05 67 | The learning rate parameter controlling the gradient descent 68 | step size 69 | """ 70 | self.max_depth = max_depth 71 | self.min_samples_split = min_samples_split 72 | self.min_samples_leaf = min_samples_leaf 73 | self.n_classes = n_classes 74 | self.max_features = max_features 75 | self.is_classifier = is_classifier 76 | 77 | self.n_trees = n_trees 78 | self.learning_rate = learning_rate 79 | self.is_fitted = False 80 | np.random.seed(1) 81 | self.trees_to_fit = 1 if n_classes <= 2 else n_classes 82 | self.trees = [ 83 | [None for _ in range(self.trees_to_fit)] 84 | for _ in range(self.n_trees)] 85 | # trees has shape (n_trees, n_classes) 86 | 87 | def predict_delta_model(self, X, stage=0): 88 | """Calculate the delta model for a stage 89 | 90 | This function returns the estimate of the negative gradient. 91 | These raw predictions are the delta models f_{stage + 1} 92 | 93 | Parameters: 94 | ---------- 95 | X: numpy.ndarray 96 | Sample data, shape (m samples, n features) 97 | stage: int, optional, default 0 98 | What correction step are we predicting 99 | 100 | Returns: 101 | ------- 102 | numpy.ndarray: 103 | gradient_step, shape (X.shape[0], n_classes) 104 | if n_classes > 2 else shape (m samples, 1) 105 | """ 106 | class_gradient_step = [] 107 | for class_k, model in enumerate(self.trees[stage]): 108 | k_gradient_step = model.predict(X).reshape(-1) 109 | class_gradient_step.append(k_gradient_step) 110 | gradient_step = np.stack(class_gradient_step, axis=-1) 111 | return gradient_step 112 | 113 | def predict_raw_stages(self, X, n_stages=None): 114 | """Predictions for input X 115 | 116 | The predictions are given by the transformed sum of initial 117 | model and delta models. Note no transformation is required for 118 | regression. 119 | 120 | If n_stages specified stop at that stage. 
The delta model is 121 | multiplied by the learning rate before being added to the 122 | raw predictions 123 | 124 | Parameters: 125 | ---------- 126 | X: numpy.ndarray 127 | Sample data, shape (m samples, n features) 128 | n_stages: int, optional, default None 129 | If given, return the predictions after n_stages stages 130 | 131 | Returns: 132 | ------- 133 | numpy.ndarray: 134 | predictions, shape (X.shape[0], n_classes) 135 | if n_classes > 2 else shape (m samples, 1) 136 | """ 137 | if not n_stages: 138 | n_stages = self.n_trees 139 | if n_stages not in list(range(1, self.n_trees + 1)): 140 | raise Exception('n_stages must be between 1 and n_trees') 141 | raw_predictions = self.f_0_prediction(X) 142 | for stage in range(n_stages): 143 | stage_gradient_step = self.predict_delta_model(X, stage) 144 | raw_predictions += self.learning_rate * stage_gradient_step 145 | return self.convert_raw_predictions(raw_predictions) 146 | 147 | def predict(self, X): 148 | """Predicts target values or class labels for classification 149 | 150 | Parameters: 151 | ---------- 152 | X: numpy.ndarray 153 | Sample data, shape (m samples, n features) 154 | 155 | Returns: 156 | ------- 157 | numpy.ndarray: 158 | Predicted target values or class labels for classification 159 | """ 160 | if not self.is_classifier: 161 | return self.predict_raw_stages(X) 162 | else: 163 | return np.argmax(self.predict_proba(X), axis=-1) 164 | 165 | def predict_proba(self, X): 166 | """Predicts class probabilities for input data 167 | 168 | Parameters: 169 | ---------- 170 | X: numpy.ndarray 171 | Sample data, shape (m samples, n features) 172 | 173 | Returns: 174 | ------- 175 | numpy.ndarray: 176 | Predicted sample class probabilities, 177 | shape (m samples, n classes) 178 | if n_classes > 2 else shape (m samples, 1) 179 | """ 180 | if not self.is_classifier: 181 | raise Exception('Not a classifier') 182 | if self.n_classes == 2: 183 | prob_class_one = self.predict_raw_stages(X) 184 | return np.stack([1-prob_class_one, prob_class_one], axis=-1) 185 | if self.n_classes > 2: 186 | return self.predict_raw_stages(X) 187 | 188 | def convert_raw_predictions(self, raw_predictions): 189 | """Convert raw_predictions to probability if classifier 190 | 191 | This uses the sigmoid if there are two classes - in which case we 192 | model the logit. The softmax function is used when there are more 193 | than two classes.
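A small worked example of the two transformations just described (a sketch only, using the same scipy helpers imported at the top of this file):

```python
import numpy as np
from scipy.special import expit, logsumexp

# two-class case: the raw prediction models the logit of class 1
raw_binary = np.array([[0.7], [-1.2]])
print(expit(raw_binary))  # probability of class 1 for each sample

# multi-class case: numerically stable softmax over the raw scores
raw_multi = np.array([[1.2, -0.3, 0.1]])
probs = np.exp(raw_multi - logsumexp(raw_multi, axis=1)[:, None])
print(probs, probs.sum(axis=1))  # each row sums to 1
```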
194 | 195 | Parameters: 196 | ---------- 197 | raw_predictions: numpy.ndarray 198 | Raw predictions, shape (m samples, n classes) 199 | 200 | Returns: 201 | ------- 202 | numpy.ndarray: 203 | target values or class probabilities for classification 204 | """ 205 | if not self.is_classifier: 206 | return raw_predictions 207 | if self.is_classifier and self.n_classes == 2: 208 | return expit(raw_predictions) 209 | if self.is_classifier and self.n_classes > 2: 210 | return np.exp( 211 | raw_predictions - logsumexp(raw_predictions, axis=1)[:, None]) 212 | 213 | def f_0_prediction(self, X): 214 | """Return initial raw_predictions for X 215 | 216 | Parameters: 217 | ---------- 218 | X: numpy.ndarray 219 | Training data, shape (m samples, n features) 220 | 221 | Returns: 222 | ------- 223 | numpy.ndarray: 224 | raw_predictions, shape (m samples, n classes) 225 | if n_classes > 2 else shape (m samples, 1) 226 | """ 227 | n = X.shape[0] 228 | if not self.is_classifier: 229 | return self.regression_f_0_tree.predict(X).reshape(n, 1) 230 | if self.is_classifier and self.n_classes == 2: 231 | return np.repeat(self.f_0, n).reshape(n, 1) 232 | if self.is_classifier and self.n_classes > 2: 233 | return np.repeat(self.f_0, n, axis=0) 234 | 235 | def init_f_0(self, X, y): 236 | """Fit initial prediction model 237 | 238 | For regression this is simple fitting a first tree to the target 239 | values. 240 | 241 | For classification when we model the logit (in two class 242 | scenario) we use the logit of the average probability in the 243 | training data. 244 | For the multi class case, where we model the log of each class 245 | probability as an additive model, we initialise the raw values 246 | as the log of the observed probability of that class. 247 | 248 | Parameters: 249 | ---------- 250 | X: numpy.ndarray 251 | Training data, shape (m samples, n features) 252 | y: numpy.ndarray 253 | Target values, shape (m samples, 1) 254 | If classifier with n_classes the values are assumed to be in 255 | 0, ..., n-1 256 | """ 257 | y = y.reshape(-1) 258 | if not self.is_classifier: 259 | self.regression_f_0_tree = self.get_tree() 260 | self.regression_f_0_tree.fit(X, y) 261 | if self.is_classifier and self.n_classes == 2: 262 | self.f_0 = np.log(y.sum() / (y.shape[0] - y.sum())) 263 | if self.is_classifier and self.n_classes > 2: 264 | self.f_0 = np.log( 265 | np.bincount(y, minlength=self.n_classes) / y.shape[0])[None, :] 266 | 267 | def get_tree(self): 268 | """Helper to return decision tree to be fitted 269 | 270 | Returns: 271 | ------- 272 | DecisionTree: 273 | Regression tree 274 | """ 275 | return DecisionTree( 276 | max_depth=self.max_depth, 277 | min_samples_split=self.min_samples_split, 278 | min_samples_leaf=self.min_samples_leaf, 279 | n_classes=self.n_classes, 280 | max_features=self.max_features, 281 | impurity='mse', 282 | is_classifier=False) 283 | 284 | def fit(self, X, y): 285 | """Fit the gradient boosted decision tree 286 | 287 | For each stage fit a tree to the negative gradient (for that 288 | class), then update the raw predictions using the learning rate 289 | and delta model. 
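In symbols, writing F_m for the raw predictions after stage m, h_m for the delta model fitted at stage m and eta for the learning rate, each stage performs F_{m+1}(x) = F_m(x) + eta * h_m(x). A minimal end-to-end usage sketch of the class (an editorial illustration, not part of the source file; it assumes the package is importable as `machine_learning` and uses made-up toy data):

```python
import numpy as np

from machine_learning.gradient_boosted_decision_tree import GradientBoostedDecisionTree  # assumed path

X = np.array([[22.0, 0.0], [38.0, 1.0], [4.0, 1.0],
              [35.0, 0.0], [54.0, 0.0], [2.0, 1.0]])
y = np.array([0, 1, 1, 1, 0, 1])  # class labels in 0, ..., n_classes - 1

gbdt = GradientBoostedDecisionTree(n_trees=5, learning_rate=0.1,
                                   max_depth=2, n_classes=2, is_classifier=True)
gbdt.fit(X, y)

print(gbdt.predict(X))        # predicted class labels
print(gbdt.predict_proba(X))  # class probabilities
```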
290 | 291 | Parameters: 292 | ---------- 293 | X: numpy.ndarray 294 | Training data, shape (m samples, n features) 295 | y: numpy.ndarray 296 | Target values, shape (m samples, 1) 297 | If classifier with n_classes the values are assumed to be in 298 | 0, ..., n-1 299 | """ 300 | if self.is_classifier: 301 | y = y.astype(int) 302 | self.init_f_0(X, y) 303 | prev_stage_raw_predictions = self.f_0_prediction(X) 304 | for stage in range(self.n_trees): 305 | negative_gradient = self.negative_gradient( 306 | y, prev_stage_raw_predictions) 307 | self.fit_stage(X, negative_gradient, stage=stage) 308 | delta_model = self.predict_delta_model(X, stage=stage) 309 | prev_stage_raw_predictions = prev_stage_raw_predictions + \ 310 | (self.learning_rate * delta_model) 311 | 312 | def fit_stage(self, X, negative_gradient, stage=0): 313 | """Fit a given stage 314 | 315 | For regression this is just fitting a single tree to the 316 | gradient. For classification we fit one tree for each class ( 317 | unless there are only two classes when we can use just one) 318 | 319 | Parameters: 320 | ---------- 321 | X: numpy.ndarray 322 | Training data, shape (m samples, n features) 323 | negative_gradient: numpy.ndarray 324 | dL_dY^hat, shape (m samples, n features) 325 | stage: int, optional, default 0 326 | stage to fit 327 | """ 328 | logger.info(f'Fitting stage {stage}') 329 | trees_to_fit = 1 if self.n_classes <= 2 else self.n_classes 330 | for class_k in range(trees_to_fit): 331 | target = negative_gradient[:, class_k] 332 | tree = self.get_tree() 333 | tree.fit(X, target) 334 | self.trees[stage][class_k] = tree 335 | 336 | def negative_gradient(self, y, prev_stage_raw_predictions): 337 | """Gradient of the loss function with res 338 | 339 | Parameters: 340 | ---------- 341 | y: numpy.ndarray 342 | Target values, shape (m samples, 1) 343 | If classifier with n_classes the values are assumed to be in 344 | 0, ..., n-1 345 | prev_stage_raw_predictions: numpy.ndarray 346 | raw_predictions, shape 347 | 348 | Returns: 349 | ------- 350 | numpy.ndarray: 351 | negative gradient, shape (m samples, n classes) 352 | if n_classes > 2 else shape (m samples, 1) 353 | """ 354 | if self.is_classifier and self.n_classes > 2: 355 | y = np.eye(self.n_classes)[y.reshape(-1)] 356 | else: 357 | y = y.reshape(y.shape[0], 1) 358 | return y - self.convert_raw_predictions(prev_stage_raw_predictions) 359 | 360 | def render(self, stage, class_k, feature_names): 361 | """Returns Digraph visualizing one of the decision trees 362 | 363 | Parameters: 364 | ---------- 365 | stage: [type] 366 | Stage to get tree from 367 | class_k: [type] 368 | tree for class class_k 369 | feature_names: [type] 370 | Feature names 371 | 372 | Returns: 373 | ------- 374 | graphviz.Digraph: 375 | dot for tree diagram visual 376 | """ 377 | return self.trees[stage][class_k].render(feature_names) 378 | -------------------------------------------------------------------------------- /machine_learning/knn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Knn(): 5 | 6 | def __init__(self, k=3, save_history=False, tolerance=0.001): 7 | """knn model 8 | 9 | Parameters: 10 | ---------- 11 | k: int, optional, default 3 12 | number of clusters 13 | save_history: bool, optional, default False 14 | Whether to save intermediate steps, for analysis and 15 | visualisation - see notebook for example 16 | tolerance: float, optional, default 0.001 17 | Stopping tolerance for change in centorids 18 | """ 19 
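        # A minimal usage sketch (the data below is made up for
        # illustration and is not part of this repository):
        #
        #     X = np.vstack([np.random.normal(0, 1, size=(50, 2)),
        #                    np.random.normal(5, 1, size=(50, 2))])
        #     model = Knn(k=2, save_history=True)
        #     model.fit(X, max_updates=20)
        #     model.cluster_labels   # cluster index per sample, shape (100,)
        #     model.centroids        # centroid coordinates, shape (2, 2)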
| self.k = k 20 | self.save_history = save_history 21 | self.centroids = [None for _ in range(k)] 22 | self.iteration = 0 23 | self.tolerance = tolerance 24 | 25 | def update_centroids(self, X): 26 | """Update the centroids as the mean of clusters 27 | 28 | The new centroids are calculated as the mean of the clusters 29 | based on the current cluster assignment (self.cluster_labels). 30 | After the update the new assignment is done to update the 31 | labels 32 | 33 | Parameters: 34 | ---------- 35 | X: numpy.ndarray 36 | Input data to cluster, shape (m samples, n features) 37 | 38 | Returns: 39 | ------- 40 | float: 41 | Sum of the euclidean distance change in each of the 42 | centroids after updating 43 | """ 44 | new_centroids = [] 45 | for i in range(self.k): 46 | new_centroid = X[self.cluster_labels == i, :].mean(axis=0) 47 | new_centroids.append(new_centroid) 48 | new_centroids = np.stack(new_centroids, axis=0) 49 | self.iteration += 1 50 | distance_change = self.dist(self.centroids, new_centroids).sum() 51 | self.centroids = new_centroids 52 | new_cluster_labels = self.assign_clusters(X) 53 | self.cluster_labels = new_cluster_labels 54 | if self.save_history: 55 | self.centroid_history.append(self.centroids) 56 | self.cluster_labels_history.append(self.cluster_labels) 57 | return distance_change 58 | 59 | def fit(self, X, max_updates=10): 60 | """Fit the knn model 61 | 62 | Fitting the model updates the centroids and cluster labels 63 | iteratively util the centroids no longer change or the max 64 | number of iterations is reached 65 | 66 | Parameters: 67 | ---------- 68 | X: numpy.ndarray 69 | Input data to cluster, shape (m samples, n features) 70 | max_updates: int, optional, default 10 71 | Maximum number of iterations permitted 72 | """ 73 | self.initalise_centroids(X) 74 | distance_change = 10**6 75 | while self.iteration < max_updates and not distance_change < self.tolerance: 76 | distance_change = self.update_centroids(X) 77 | print(f'Finished at iteration {self.iteration}') 78 | 79 | def initalise_centroids(self, X): 80 | """Sets initial centorids randomly and assigns cluster labels 81 | 82 | The centroids are chosen randomly based on observed range of 83 | values in X 84 | 85 | Parameters: 86 | ---------- 87 | X: numpy.ndarray 88 | Input data to cluster, shape (m samples, n features) 89 | """ 90 | X_mins = X.min(axis=0) 91 | X_maxs = X.max(axis=0) 92 | self.centroids = np.stack( 93 | [np.random.uniform(xi_min, xi_max, self.k) 94 | for xi_min, xi_max in zip(X_mins, X_maxs)], 95 | axis=-1) 96 | self.cluster_labels = self.assign_clusters(X) 97 | if self.save_history: 98 | self.centroid_history = [self.centroids] 99 | self.cluster_labels_history = [self.cluster_labels] 100 | 101 | def dist(self, a, b, axis=1): 102 | """Euclidean distance function 103 | 104 | Parameters: 105 | ---------- 106 | a: numpy.ndarray 107 | samples, shape (m sample, n features) 108 | b: numpy.ndarray 109 | centroid, shape (n_features,) 110 | axis: int, optional, default 1 111 | Set to 1 to sum along rows 112 | 113 | Returns: 114 | ------- 115 | numpy.ndarray: 116 | Distance between each sample and centroid 117 | """ 118 | return np.linalg.norm(a - b, axis=axis) 119 | 120 | def assign_clusters(self, X): 121 | """Assigns each sample of X to its nearest cluster centroid 122 | 123 | Parameters: 124 | ---------- 125 | X: numpy.ndarray 126 | Input data to cluster, shape (m samples, n features) 127 | 128 | Returns: 129 | ------- 130 | numpy.ndarray: 131 | Cluster label for each sample, shape (m samples, 1) 132 
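        As a sketch (distances illustrative): with k = 2 centroids and the
        stacked distance matrix [[1.2, 0.4], [0.3, 2.0]] (one row per
        sample, one column per centroid), np.argmin along axis 1 gives
        cluster labels [1, 0].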
| """ 133 | distances = [] 134 | for centroid in self.centroids: 135 | centorid_distances = self.dist(X, centroid) 136 | distances.append(centorid_distances) 137 | all_distaces = np.stack(distances, axis=1) 138 | cluster_labels = np.argmin(all_distaces, axis=1) 139 | return cluster_labels 140 | -------------------------------------------------------------------------------- /machine_learning/linear_regression.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | 5 | 6 | class LinearRegression(): 7 | 8 | def __init__(self, learning_rate=0.05): 9 | """ 10 | Linear regression model 11 | 12 | Parameters: 13 | ---------- 14 | learning_rate: float, optional, default 0.05 15 | The learning rate parameter controlling the gradient descent 16 | step size 17 | """ 18 | self.learning_rate = learning_rate 19 | print('Creating linear model instance') 20 | 21 | def __repr__(self): 22 | return ( 23 | f'') 25 | 26 | def fit(self, X, y, n_iter=1000): 27 | """ 28 | Fit the linear regression model 29 | 30 | Updates the weights with n_iter iterations of batch gradient 31 | descent updates 32 | 33 | Parameters: 34 | ---------- 35 | X: numpy.ndarray 36 | Training data, shape (m samples, (n - 1) features + 1) 37 | Note the first column of X is expected to be ones (to allow 38 | for the bias to be included in beta) 39 | y: numpy.ndarray 40 | Target values, shape (m samples, 1) 41 | n_iter: int, optional, default 1000 42 | Number of batch gradient descent steps 43 | """ 44 | m, n = X.shape 45 | print(f'fitting with m={m} samples with n={n-1} features\n') 46 | self.beta = np.zeros(shape=(n, 1)) 47 | self.costs = [] 48 | self.betas = [self.beta] 49 | for iteration in range(n_iter): 50 | y_pred = self.predict(X) 51 | cost = self.cost(y, y_pred) 52 | self.costs.append(cost[0][0]) 53 | gradient = self.gradient(y, y_pred, X) 54 | self.beta = self.beta - ( 55 | self.learning_rate * gradient) 56 | self.betas.append(self.beta) 57 | 58 | def cost(self, y, y_pred): 59 | """ 60 | Mean square error cost function 61 | 62 | Parameters: 63 | ---------- 64 | y: numpy.ndarray 65 | True target values, shape (m samples, 1) 66 | y_pred: numpy.ndarray 67 | Predicted y values, shape (m samples, 1) 68 | 69 | Returns: 70 | ------- 71 | float: 72 | mean square error value 73 | """ 74 | m = y.shape[0] 75 | cost = (1 / (2 * m)) * (y - y_pred).T @ (y - y_pred) 76 | return cost 77 | 78 | def gradient(self, y, y_pred, X): 79 | """ 80 | Calculates the gradient of the cost function 81 | 82 | Parameters: 83 | ---------- 84 | y: numpy.ndarray 85 | Predicted y values, shape (m samples, 1) 86 | y_pred: numpy.ndarray 87 | True target values, shape (m samples, 1) 88 | X: numpy.ndarray 89 | Training data, shape (m samples, (n - 1) features + 1) 90 | Note the first column of X is expected to be ones (to allow 91 | for the bias to be included in beta) 92 | 93 | Returns: 94 | ------- 95 | numpy.ndarray: 96 | Derivate of mean square error cost function with respect to 97 | the weights beta, shape (n features, 1) 98 | """ 99 | m = X.shape[0] 100 | gradient = (1 / m) * X.T @ (y_pred - y) 101 | return gradient 102 | 103 | def predict(self, X): 104 | """ 105 | Predict the target values from sample X feature values 106 | 107 | Parameters: 108 | ---------- 109 | X: numpy.ndarray 110 | Training data, shape (m samples, (n - 1) features + 1) 111 | Note the first column of X is expected to be ones (to allow 112 | for the bias to be included in beta) 113 | 114 | Returns: 115 | ------- 116 | 
numpy.ndarray: 117 | Target value predictions, shape (m samples, 1) 118 | """ 119 | y_pred = X @ self.beta 120 | return y_pred 121 | -------------------------------------------------------------------------------- /machine_learning/logistic_regression.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | 5 | 6 | class LogisticRegression(): 7 | 8 | def __init__(self, learning_rate=0.05): 9 | """ 10 | Logistic regression model 11 | 12 | Parameters: 13 | ---------- 14 | learning_rate: float, optional, default 0.05 15 | The learning rate parameter controlling the gradient descent 16 | step size 17 | """ 18 | self.learning_rate = learning_rate 19 | print('Creating logistic model instance') 20 | 21 | def __repr__(self): 22 | return ( 23 | f'') 25 | 26 | def fit(self, X, y, n_iter=1000): 27 | """ 28 | Fit the logistic regression model 29 | 30 | Updates the weights with n_iter iterations of batch gradient 31 | descent updates 32 | 33 | Parameters: 34 | ---------- 35 | X: numpy.ndarray 36 | Training data, shape (m samples, (n - 1) features + 1) 37 | Note the first column of X is expected to be ones (to allow 38 | for the bias to be included in beta) 39 | y: numpy.ndarray 40 | Target values - class label {0, 1}, shape (m samples, 1) 41 | n_iter: int, optional, default 1000 42 | Number of batch gradient descent steps 43 | """ 44 | m, n = X.shape 45 | print(f'fitting with m={m} samples with n={n-1} features\n') 46 | self.beta = np.zeros(shape=(n, 1)) 47 | self.costs = [] 48 | self.betas = [self.beta] 49 | for iteration in range(n_iter): 50 | y_pred = self.predict_proba(X) 51 | cost = (-1 / m) * ( 52 | (y.T @ np.log(y_pred)) + 53 | ((np.ones(shape=y.shape) - y).T @ np.log( 54 | np.ones(shape=y_pred.shape) - y_pred)) 55 | ) 56 | self.costs.append(cost[0][0]) 57 | gradient = (1 / m) * X.T @ (y_pred - y) 58 | self.beta = self.beta - ( 59 | self.learning_rate * gradient) 60 | self.betas.append(self.beta) 61 | 62 | def predict_proba(self, X): 63 | """ 64 | Predicted probability values for class 1 65 | 66 | Note this is calculated as the sigmoid of the linear combination 67 | of the feature values and the weights. 
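        In symbols: p = sigmoid(X @ beta) = 1 / (1 + e^(-X @ beta)), giving
        values in (0, 1). As an illustrative example, a sample whose linear
        combination X @ beta equals 2.0 yields p of roughly 0.88.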
68 | 69 | Parameters: 70 | ---------- 71 | X: numpy.ndarray 72 | Training data, shape (m samples, (n - 1) features + 1) 73 | Note the first column of X is expected to be ones (to allow 74 | for the bias to be included in beta) 75 | 76 | Returns: 77 | ------- 78 | numpy.ndarray: 79 | Predicted probability of samples being in class 1 80 | """ 81 | y_pred = self.sigmoid(X @ self.beta) 82 | return y_pred 83 | 84 | def predict(self, X, descision_prob=0.5): 85 | """ 86 | Predict the class values from sample X feature values 87 | 88 | Parameters: 89 | ---------- 90 | X: numpy.ndarray 91 | Training data, shape (m samples, (n - 1) features + 1) 92 | Note the first column of X is expected to be ones (to allow 93 | for the bias to be included in beta) 94 | 95 | Returns: 96 | ------- 97 | numpy.ndarray: 98 | Prediceted class values, shape (m samples, 1) 99 | """ 100 | y_pred = self.sigmoid(X @ self.beta) 101 | return (y_pred > descision_prob) * 1 102 | 103 | def sigmoid(self, x): 104 | """ 105 | Sigmoid function 106 | 107 | f(x) = 1 / (1 + e^(-x)) 108 | 109 | Parameters: 110 | ---------- 111 | x: numpy.ndarray 112 | 113 | Returns: 114 | ------- 115 | numpy.ndarray: 116 | sigmoid of x, values in (0, 1) 117 | """ 118 | return 1 / (1 + np.exp(-x)) 119 | -------------------------------------------------------------------------------- /machine_learning/neural_network.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | 5 | logging.basicConfig() 6 | logger = logging.getLogger(__file__) 7 | logger.setLevel(logging.INFO) 8 | 9 | 10 | class NeuralNetwork(): 11 | 12 | def __init__(self, 13 | layer_sizes=[5, 10, 1], 14 | is_classifier=True, 15 | learning_rate=0.1): 16 | """Neural network model 17 | 18 | Parameters: 19 | ---------- 20 | layer_sizes: list, optional, default [5, 10, 1] 21 | Number of nodes in each layer (including input and output) 22 | is_classifier: bool, optional, default True 23 | Is the model used as part of a classification problem 24 | or a regression problem. Should be set to True if 25 | classification, False if regression 26 | learning_rate: float, optional, default 0.05 27 | The learning rate parameter controlling the gradient descent 28 | step size 29 | """ 30 | self.layer_sizes = layer_sizes # n^0, ..., n^L 31 | self.is_classifier = is_classifier 32 | self.learning_rate = learning_rate 33 | self.n_L = layer_sizes[-1] # n^L 34 | self.n_layers = len(layer_sizes) - 1 # L 35 | self.initialise_weights() 36 | 37 | def initialise_weights(self): 38 | """Initialise the weights and biases 39 | 40 | weights are initialized as small random numbers, biases as zero 41 | """ 42 | self.weight_matrices = [ 43 | np.random.normal(loc=0.0, scale=1.0, size=(n_l, n_l_minus_1)) 44 | for n_l, n_l_minus_1 in zip(self.layer_sizes[1:], self.layer_sizes) 45 | ] 46 | self.betas = [np.zeros(shape=(n_l, 1)) for n_l in self.layer_sizes[1:]] 47 | 48 | def feed_forward(self, X): 49 | """Feed X forward through the network 50 | 51 | For each layer the net input is calculated as the product of the 52 | weight matrix and the activations of the previous layer plus the 53 | biases. 
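        In matrix form (shapes as used in this implementation):

            Z^l = W^l A^(l-1) + B^l

        with W^l of shape (n^l, n^(l-1)), A^(l-1) of shape (n^(l-1), m) and
        B^l of shape (n^l, m), so each column corresponds to one sample.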
54 | 55 | The output activation is then calculated by applying the 56 | activation function to the net input 57 | 58 | Parameters: 59 | ---------- 60 | X: numpy.ndarray 61 | Training data, shape (m samples, n features) 62 | 63 | Returns: 64 | ------- 65 | numpy.ndarray: 66 | final layer activations, shape (n^L, m) 67 | """ 68 | m = X.shape[0] 69 | layer_activations = [X.T] 70 | for layer in range(self.n_layers): 71 | A_layer_minus_1 = layer_activations[-1] 72 | beta = self.betas[layer] 73 | B = np.repeat(beta, m, axis=-1) 74 | Z = self.weight_matrices[layer] @ A_layer_minus_1 + B 75 | A = self.activation_function(Z, layer=layer) 76 | layer_activations.append(A) 77 | self.log_layer(layer, A_layer_minus_1, beta, B, Z, A) 78 | self.layer_activations = layer_activations 79 | return layer_activations[-1] 80 | 81 | def back_propagation(self, X, Y): 82 | """Update the weights and biases through back propagation 83 | 84 | Parameters: 85 | ---------- 86 | X: numpy.ndarray 87 | Training data, shape (m samples, n features) 88 | Y: numpy.ndarray 89 | Target values, shape (n_classes, m samples) 90 | """ 91 | assert X.shape[0] == Y.shape[1] 92 | final_layer_error = self.layer_activations[-1] - Y 93 | D_plus_1 = final_layer_error 94 | # errors represent D matrices in notebook explanation 95 | errors = [D_plus_1] 96 | for layer in range(self.n_layers - 2, -1, -1): 97 | logger.debug(f'Calculating D_{layer + 1}') 98 | A = self.layer_activations[layer + 1] 99 | self.log_back_prop_layer(layer, A, D_plus_1) 100 | D = (self.weight_matrices[layer + 1].T @ D_plus_1) * \ 101 | A * (1 - A) 102 | D_plus_1 = D 103 | errors.insert(0, D) 104 | self.errors = errors 105 | self.update_weights() 106 | 107 | def update_weights(self): 108 | """Update the weights and biases using gradient 109 | 110 | The weights and biases are updated by calculating the parital 111 | derivatives and then stepping the weights in the direction 112 | of the negative gradient. The step size is governed by the 113 | learing rate 114 | """ 115 | for layer in range(self.n_layers): 116 | m = self.errors[0].shape[1] 117 | d_L_d_W = (1 / m) * self.errors[layer] @ \ 118 | self.layer_activations[layer].T 119 | d_L_d_beta = (1 / m) * self.errors[layer].sum(axis=1)[:, None] 120 | self.weight_matrices[layer] = self.weight_matrices[layer] - \ 121 | self.learning_rate * d_L_d_W 122 | if layer == 0: 123 | self.d_L_d_Ws.append(d_L_d_W.sum()) 124 | self.betas[layer] = self.betas[layer] - \ 125 | self.learning_rate * d_L_d_beta 126 | 127 | def log_layer(self, layer, A_layer_minus_1, beta, B, Z, A): 128 | """Utility function to group logging 129 | 130 | Parameters: 131 | ---------- 132 | layer: int 133 | The layer being logged (note python uses 0 index) so the 134 | layer is actually layer + 1 135 | A_layer_minus_1: numpy.ndarray, shape (n^{l-1},m) 136 | Previous layer activations for each sample 137 | beta: numpy.ndarray, shape (n^{l}, 1) 138 | Layer biases 139 | B: numpy.ndarray, shape (n^{l}, m) 140 | Repeated layer biases for ease of matrix operations 141 | Z: numpy.ndarray, shape (n^{l}, m) 142 | Net input for each sample 143 | A: numpy.ndarray, shape (n^{l}, m) 144 | Output activation for each sample 145 | """ 146 | logger.debug( 147 | f'A_layer_minus_1 i.e. 
A_{layer} ' 148 | f'has shape {A_layer_minus_1.shape}') 149 | logger.debug(f'beta_{layer + 1} has shape {beta.shape}') 150 | logger.debug(f'B_{layer + 1} has shape {B.shape}') 151 | logger.debug(f'Z_{layer + 1} has shape {Z.shape}') 152 | logger.debug(f'A_{layer + 1} has shape {A.shape}') 153 | 154 | def log_back_prop_layer(self, layer, A, D_plus_1): 155 | """Utility for logging back propagation 156 | 157 | Parameters: 158 | ---------- 159 | layer: int 160 | The layer being logged (note python uses 0 index) so the 161 | layer is actually layer + 1 162 | A: numpy.ndarray, shape (n^{l}, m) 163 | Output activation for each sample 164 | D_plus_1: numpy.ndarray, shape (n^{l+1}, m) 165 | Error in the next layer 166 | """ 167 | logger.debug( 168 | f'A_{layer + 1} has shape {A.shape}') 169 | logger.debug( 170 | f'W_{layer + 2} has shape ' 171 | f'{self.weight_matrices[layer + 1].shape}') 172 | logger.debug( 173 | f'D_{layer + 2} has shape {D_plus_1.shape}') 174 | 175 | def activation_function(self, Z, layer): 176 | """Activation function 177 | 178 | The activation function is the sigmoid for nodes except the 179 | output layer. For the final layer the identify function is used 180 | for regression and for multiclass classification the softmax 181 | function is used 182 | 183 | Parameters: 184 | ---------- 185 | Z: numpy.ndarray, shape (n^{l}, m) 186 | Net input for each sample 187 | layer: int 188 | The layer being logged (note python uses 0 index) so the 189 | layer is actually layer + 1 190 | 191 | Returns: 192 | ------- 193 | numpy.ndarray: 194 | Output activation for each sample, shape (n^{l}, m) 195 | """ 196 | if layer == (self.n_layers - 1): 197 | if not self.is_classifier: 198 | return Z 199 | if self.is_classifier and self.n_L >= 2: 200 | return np.exp(Z - logsumexp(Z, axis=0)[None, :]) 201 | return expit(Z) 202 | 203 | def cost(self, Y): 204 | """Cost function 205 | 206 | Parameters: 207 | ---------- 208 | Y: numpy.ndarray 209 | Target values, shape (n_classes, m samples) 210 | """ 211 | if self.is_classifier and self.n_L == 1: 212 | cost = (-1 / m) * ( 213 | Y * np.log(self.layer_activations[-1]) + 214 | (1 - Y) * np.log(1 - self.layer_activations[-1]) 215 | ).sum() 216 | if self.is_classifier and self.n_L > 1: 217 | cost = (-1 / m) * \ 218 | (Y * np.log(self.layer_activations[-1])).sum() 219 | if not self.is_classifier: 220 | cost = (1 / (2 * m)) * \ 221 | ((Y - self.layer_activations[-1]) ** 2).sum() 222 | logger.debug(f'cost = {cost}') 223 | self.costs.append(cost) 224 | 225 | def fit(self, X, Y, epochs=100): 226 | """Fits the neural network with training data 227 | 228 | The fitting is done via multiple epochs of gradient descent. 229 | Each iteration has a feed forward step and a back propagation 230 | step. 231 | 232 | Note Y is one hot encoded if necessary. 
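        As a sketch of that encoding (labels illustrative): with n^L = 3
        classes and labels y = [0, 2, 1], np.eye(3)[:, y] gives

            [[1, 0, 0],
             [0, 0, 1],
             [0, 1, 0]]

        i.e. column j is the one-hot vector for sample j.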
233 | 234 | Parameters: 235 | ---------- 236 | X: numpy.ndarray 237 | Training data, shape (m samples, n features) 238 | Y: numpy.ndarray 239 | Target values, shape (m samples, 1) 240 | epochs: int, optional, default 100 241 | Number of iterations of gradient descent 242 | """ 243 | if self.n_L > 1: 244 | if Y.shape[0] != self.n_L: 245 | print('One hot encoding Y') 246 | Y = np.eye(self.n_L)[:, Y.reshape(-1).astype(int)] 247 | self.costs = [] 248 | self.d_L_d_Ws = [] 249 | for epoch in range(epochs): 250 | self.feed_forward(X) 251 | self.cost(Y) 252 | self.back_propagation(X, Y) 253 | 254 | def predict(self, X): 255 | """Predicts target values or class labels by forward propagation 256 | 257 | Parameters: 258 | ---------- 259 | X: numpy.ndarray 260 | Training data, shape (m samples, n features) 261 | Returns: 262 | ------- 263 | numpy.ndarray: 264 | Predicted target values or class labels for classification, 265 | Shape is (n^L, m samples) 266 | """ 267 | A_L = self.feed_forward(X) 268 | if not self.is_classifier: 269 | return A_L 270 | if self.is_classifier and self.n_L == 1: 271 | return np.round(A_L).astype(int) 272 | if self.is_classifier and self.n_L > 1: 273 | return np.argmax(A_L, axis=0) 274 | 275 | def predict_proba(self, X): 276 | """Predicts class probabilities for input data 277 | 278 | Parameters: 279 | ---------- 280 | X: numpy.ndarray 281 | Sample data, shape (m samples, n features) 282 | 283 | Returns: 284 | ------- 285 | numpy.ndarray: 286 | Predicted sample class probabilities, 287 | shape (n classes, m samples) 288 | if n_classes > 2 else shape (1, m samples) 289 | """ 290 | A_L = self.feed_forward(X) 291 | if not self.is_classifier: 292 | raise Exception('Must be a classifier') 293 | return A_L 294 | -------------------------------------------------------------------------------- /machine_learning/random_forest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | 5 | from .decision_tree import DecisionTree 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger(__file__) 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | class RandomForest(): 13 | 14 | def __init__(self, 15 | max_depth=2, 16 | min_samples_split=2, 17 | min_samples_leaf=1, 18 | n_classes=2, 19 | max_features='sqrt', 20 | impurity='gini', 21 | is_classifier=True, 22 | n_trees=10, 23 | bootstrap=True): 24 | """Random forest model 25 | 26 | Parameters: 27 | ---------- 28 | max_depth: int 29 | The maximum depth allowed when "growing" a tree 30 | min_samples_split: int 31 | The minimum number of samples required to allow a split at 32 | a the node 33 | min_samples_leaf: int 34 | The minimum number of samples allowed in a leaf. A split 35 | candidate leading to less samples in a node than the 36 | min_samples_leaf will be rejected 37 | n_classes: int, optional, default 2 38 | Number of classes in a classification setting. Ignored when 39 | self.is_classifier = False 40 | max_features: int, optional, default None 41 | If set to 'sqrt' then only a random subset of features are 42 | used to split at each node, the number of features used in 43 | this case is sqrt(n_features). 44 | Else all the features are considered when splitting at each 45 | node 46 | impurity: str, optional, default 'gini' 47 | The impurity measure to use when splitting at each node. 
48 | I have currently only implemented two 49 | 'gini' - Uses the gini impurity (for classification) 50 | 'mse' - Uses the mean square error - equal to variance (for 51 | regression) 52 | is_classifier: bool, optional, default True 53 | Is the model used as part of a classification problem 54 | or a regression problem. Should be set to True if 55 | classification, False if regression 56 | n_trees: int, optional, default 10 57 | Number of trees in the forest 58 | bootstrap: bool, optional, default True 59 | Whether to bootstrap the data when fitting the trees 60 | """ 61 | self.max_depth = max_depth 62 | self.min_samples_split = min_samples_split 63 | self.min_samples_leaf = min_samples_leaf 64 | self.n_classes = n_classes 65 | self.max_features = max_features 66 | self.impurity = impurity 67 | self.is_classifier = is_classifier 68 | 69 | self.n_trees = n_trees 70 | self.bootstrap = bootstrap 71 | self.is_fitted = False 72 | self.trees = [] 73 | np.random.seed(1) 74 | 75 | def fit(self, X, y): 76 | """Fit the random forest model 77 | 78 | This method fits n_trees trees on the data with bootstrap 79 | samples. A random subset of the features is used at each split. 80 | 81 | 82 | Parameters: 83 | ---------- 84 | X: numpy.ndarray 85 | Training data, shape (m samples, n features) 86 | y: numpy.ndarray 87 | Target values, shape (m samples, 1) 88 | If classifier with n_classes the values are assumed to be in 89 | 0, ..., n-1 90 | """ 91 | y_shape = (X.shape[0], 1) 92 | data = np.concatenate((X, y.reshape(y_shape)), axis=1) 93 | for i, data in enumerate(self._samples(data)): 94 | tree = DecisionTree( 95 | max_depth=self.max_depth, 96 | min_samples_split=self.min_samples_split, 97 | min_samples_leaf=self.min_samples_leaf, 98 | n_classes=self.n_classes, 99 | max_features=self.max_features, 100 | impurity=self.impurity, 101 | is_classifier=self.is_classifier) 102 | logger.info(f'Fitting tree {i}') 103 | tree.fit(X, y) 104 | self.trees.append(tree) 105 | self.is_fitted = True 106 | 107 | def _samples(self, data): 108 | """Bootstrap sample generator 109 | 110 | Parameters: 111 | ---------- 112 | data: numpy.ndarray 113 | The input data with shape (m samples, n features + 1 target) 114 | Note the last column of the data are the target values 115 | 116 | Yields: 117 | numpy.ndarray: Bootstrap sample of data 118 | """ 119 | n_rows = data.shape[0] 120 | for _ in range(self.n_trees): 121 | if not self.bootstrap: 122 | yield data 123 | else: 124 | random_rows = np.random.choice(np.arange(n_rows), 125 | size=n_rows, 126 | replace=True) 127 | yield data[random_rows, :] 128 | 129 | def predict_proba(self, data): 130 | """Predicts class probabilities for input data 131 | 132 | The class probability predictions from each tree are averaged to 133 | provide the overall class prediction probabilities 134 | 135 | Parameters: 136 | ---------- 137 | data: numpy.ndarray 138 | The input data with shape (m samples, n features) 139 | 140 | Returns: 141 | ------- 142 | numpy.ndarray: 143 | Predicted sample class probabilities, 144 | shape (m samples, n classes) 145 | """ 146 | if not self.is_fitted: 147 | raise Exception('Forest not fitted') 148 | # samples, classes, trees 149 | return np.stack(list(tree.predict_proba(data) for tree in self.trees), 150 | axis=-1).sum(axis=-1) / self.n_trees 151 | 152 | def predict(self, data): 153 | """Predicts target values or class labels for classification 154 | 155 | Parameters: 156 | ---------- 157 | data: numpy.ndarray 158 | The input data with shape (m samples, n features) 159 | 160 
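        A rough usage sketch (the names X_train, y_train and X_test are
        assumed for illustration, they are not defined in this module):

            forest = RandomForest(n_trees=25, max_depth=3, n_classes=2)
            forest.fit(X_train, y_train)
            labels = forest.predict(X_test)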
| Returns: 161 | ------- 162 | numpy.ndarray: 163 | Predicted target values or class labels for classification 164 | """ 165 | if self.is_classifier: 166 | return np.argmax(self.predict_proba(data), axis=-1) 167 | else: 168 | return np.stack( 169 | list(tree.predict(data) for tree in self.trees), 170 | axis=-1).mean(axis=-1) 171 | 172 | def render(self, tree_id, feature_names): 173 | """Returns Digraph visualizing one of the decision trees 174 | 175 | Parameters: 176 | ---------- 177 | tree_id: [type] 178 | tree index to display 179 | feature_names: [type] 180 | Feature names 181 | 182 | Returns: 183 | ------- 184 | graphviz.Digraph: 185 | dot for tree diagram visual 186 | """ 187 | return self.trees[tree_id].render(feature_names) 188 | -------------------------------------------------------------------------------- /machine_learning/tree.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | 4 | import numpy as np 5 | from graphviz import Digraph 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger(__file__) 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | class TreeNode(): 13 | 14 | count = itertools.count() 15 | 16 | def __init__(self, 17 | data, 18 | max_depth, 19 | min_samples_split, 20 | min_samples_leaf, 21 | n_classes=2, 22 | max_features=None, 23 | depth=0, 24 | impurity='gini', 25 | is_classifier=True): 26 | """ 27 | A single node in a decision tree 28 | 29 | After recursive splitting of the input data, a given node 30 | represents one split of the tree if it is not a leaf node. The 31 | leaf node stores the training samples in that leaf to be used 32 | for prediction. 33 | The splitting nodes record the feature to split on as attribute 34 | self.best_feature_index and the splitting value as attribute 35 | self.best_feature_split_val 36 | 37 | Parameters: 38 | ---------- 39 | data: numpy.ndarray 40 | The input data with shape (m samples, n features + 1 target) 41 | Note the last column of the data are the target values 42 | max_depth: int 43 | The maximum depth allowed when "growing" a tree 44 | min_samples_split: int 45 | The minimum number of samples required to allow a split at 46 | a the node 47 | min_samples_leaf: int 48 | The minimum number of samples allowed in a leaf. A split 49 | candidate leading to less samples in a node than the 50 | min_samples_leaf will be rejected 51 | n_classes: int, optional, default 2 52 | Number of classes in a classification setting. Ignored when 53 | self.is_classifier = False 54 | max_features: int, optional, default None 55 | If set to 'sqrt' then only a random subset of features are 56 | used to split at the node, the number of features used in 57 | this case is sqrt(n_features). 58 | Else all the features are considered when splitting at this 59 | node 60 | depth: int, optional, default 0 61 | The depth of the node in the tree 62 | impurity: str, optional, default 'gini' 63 | The impurity measure to use when splitting at the node. 64 | I have currently only implemented two 65 | 'gini' - Uses the gini impurity (for classification) 66 | 'mse' - Uses the mean square error - equal to variance (for 67 | regression) 68 | is_classifier: bool, optional, default True 69 | Is the tree node used as part of a classification problem 70 | or a regression problem. 
Should be set to True if 71 | classification, False if regression 72 | """ 73 | self.data = data 74 | self.max_depth = max_depth 75 | self.min_samples_split = min_samples_split 76 | self.min_samples_leaf = min_samples_leaf 77 | self.n_classes = n_classes 78 | self.max_features = max_features 79 | self.depth = depth 80 | self.impurity = impurity 81 | self.is_classifier = is_classifier 82 | 83 | self.data_shape = data.shape 84 | self.split_attempted = False 85 | self.best_split_impurity = None 86 | self.best_feature_index = None 87 | self.best_feature_split_val = None 88 | self.is_leaf = False 89 | self.node_impurity = self.calculate_impurity([data[:, -1]]) 90 | self.value = self._init_value(data) 91 | self.id = str(next(self.count)) 92 | 93 | def __repr__(self): 94 | return ( 95 | f'') 102 | 103 | @property 104 | def is_root(self): 105 | return self.depth == 0 106 | 107 | def info(self): 108 | return dict( 109 | data_shape=self.data_shape, 110 | n_classes=self.n_classes, 111 | depth=self.depth, 112 | min_samples_split=self.min_samples_split, 113 | min_samples_leaf=self.min_samples_leaf, 114 | node_impurity=self.node_impurity, 115 | split_attempted=self.split_attempted, 116 | best_split_impurity=self.best_split_impurity, 117 | best_feature_index=self.best_feature_index, 118 | best_feature_split_val=self.best_feature_split_val, 119 | is_root=self.is_root) 120 | 121 | def _init_value(self, data): 122 | """ 123 | Returns the terminal node value based on the input data 124 | 125 | For a classifier this is the class_counts. 126 | For a regressor this is the average y value. 127 | 128 | Note this value can be access at a splitting node to see what 129 | the prediction would have been at that level of the tree 130 | 131 | Parameters: 132 | ---------- 133 | data: numpy.ndarray 134 | The input data with shape (m samples, n features + 1 target) 135 | Note the last column of the data are the target values 136 | 137 | Returns: 138 | ------- 139 | numpy.ndarray or float: 140 | Class counts if classifier, else mean of target values 141 | """ 142 | if self.is_classifier: 143 | return np.bincount( 144 | data[:, -1].astype(int), 145 | minlength=self.n_classes) 146 | else: 147 | return np.mean(data[:, -1]) 148 | 149 | def split(self, feature_index, feature_split_val, only_y=True): 150 | """ 151 | Splits self.data on feature with index feature_index using 152 | feature_split_val. 
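        As a small illustration of the rule described below (values made
        up): if the column at feature_index is [1.0, 5.0, 3.0, 7.0] and
        feature_split_val is 3.0, rows 0 and 2 fall in the left output and
        rows 1 and 3 fall in the right output.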
153 | 154 | Each sample is included in left output if the feature value for 155 | the sample is less than or equal to the feature_split_val else 156 | it is included in the right output 157 | 158 | Parameters: 159 | ---------- 160 | feature_index: int 161 | Index of the feature (column) in self.data 162 | feature_split_val: float 163 | Feature value to use when splitting data 164 | only_y: bool, optional, default True 165 | Return only the y values in left and right - this is used 166 | when checking candidate split purity increase 167 | 168 | Returns: 169 | ------- 170 | (numpy.ndarray, numpy.ndarray): 171 | left and right splits of self.data 172 | """ 173 | assert feature_index in range(self.data.shape[1]) 174 | if only_y: 175 | select = -1 176 | else: 177 | select = slice(None) 178 | left_mask = self.data[:, feature_index] <= feature_split_val 179 | right_mask = ~ left_mask 180 | left = self.data[left_mask, select] 181 | right = self.data[right_mask, select] 182 | logger.debug( 183 | f'Splitting on feature_index {feature_index} with ' 184 | f'feature_split_val = {feature_split_val} creates left ' 185 | f'with shape {left.shape} and right with ' 186 | f'shape {right.shape}') 187 | return left, right 188 | 189 | def gini_impurity(self, groups): 190 | """ 191 | Calculate the Gini impurity for groups of values 192 | 193 | The impurity returned is the weighted average of the impurity 194 | of the groups. 195 | 196 | You can think of gini impurity as the probability of incorrectly 197 | predicting a random sample from a group if the prediction was 198 | made based purely on the distribution of class labels in the 199 | group 200 | 201 | 202 | Parameters: 203 | ---------- 204 | groups: tuple 205 | The groups tuple is made up of arrays of values. It is 206 | often called with groups = (left, right) to find the purity 207 | of the candidate split 208 | 209 | Returns: 210 | ------- 211 | float: 212 | Gini impurity 213 | """ 214 | gini = 0 215 | total_samples = sum(group.shape[0] for group in groups) 216 | for i, group in enumerate(groups): 217 | group = group.astype(int) 218 | class_counts = np.bincount(group, minlength=self.n_classes) 219 | group_size = class_counts.sum() 220 | class_probs = class_counts / group_size 221 | unique_classes = np.count_nonzero(class_counts) 222 | group_gini = (class_probs * (1 - class_probs)).sum() 223 | gini += group_gini * (group_size / total_samples) 224 | logger.debug( 225 | f'Group {i} has size {group.shape[0]} with ' 226 | f'{unique_classes} unique classes ' 227 | f'with Gini index {group_gini:.3}') 228 | return gini 229 | 230 | def mean_square_impurity(self, groups): 231 | """ 232 | Calculates the mean square error impurity 233 | 234 | The mse impurity is the weighted average of the group variances 235 | 236 | Parameters: 237 | ---------- 238 | groups: tuple 239 | The groups tuple is made up of arrays of values. 
It is 240 | often called with groups = (left, right) to find the purity 241 | of the candidate split 242 | 243 | Returns: 244 | ------- 245 | float: 246 | Mean square error impurity 247 | """ 248 | mean_square_error = 0 249 | total_samples = sum(group.shape[0] for group in groups) 250 | for i, group in enumerate(groups): 251 | group_size = group.shape[0] 252 | group_mean = np.mean(group) 253 | group_mean_square_error = np.mean((group - group_mean) ** 2) 254 | mean_square_error += group_mean_square_error * \ 255 | (group_size / total_samples) 256 | logger.debug( 257 | f'Group {i} has size {group.shape[0]} with ' 258 | f'with MSE impurity {group_mean_square_error:.3}') 259 | logger.debug(f'MSE candidate {mean_square_error}') 260 | return mean_square_error 261 | 262 | def calculate_impurity(self, groups): 263 | """ 264 | Calculates impurity based on self.impurity setting 265 | 266 | Parameters: 267 | ---------- 268 | groups: tuple 269 | The groups tuple is made up of arrays of values. It is 270 | often called with groups = (left, right) to find the purity 271 | of the candidate split 272 | 273 | Returns: 274 | ------- 275 | float: 276 | Mean square error of groups if self.impurity = 'mse' 277 | Gini impurity of groups if self.impurity = 'mse' 278 | """ 279 | if self.impurity == 'gini': 280 | return self.gini_impurity(groups) 281 | elif self.impurity == 'mse': 282 | return self.mean_square_impurity(groups) 283 | 284 | def check_split(self, feature_index, feature_split_val): 285 | """ 286 | Updates best split if candidate split is better 287 | 288 | Splits the data in groups using self.split. Checks min samples 289 | leaf condition after split. Calculates impurity of the split 290 | then if impurity is less than best split already found and less 291 | than the current node impurity the best_feature_index, the 292 | best_feature_split_val and the best_split_impurity values are 293 | updated. 294 | 295 | Parameters: 296 | ---------- 297 | feature_index: int 298 | Index of the feature (column) in self.data 299 | feature_split_val: float 300 | Feature value to use when splitting data 301 | """ 302 | groups = self.split(feature_index, feature_split_val) 303 | if any(len(group) < self.min_samples_leaf for group in groups): 304 | logger.debug( 305 | f"Can't split node on feature {feature_index} with split " 306 | f"val {feature_split_val} due to min_samples_leaf condition") 307 | return None 308 | split_impurity = self.calculate_impurity(groups) 309 | best_current_impurity = ( 310 | 10**10 if self.best_split_impurity is None 311 | else self.best_split_impurity) 312 | if ((split_impurity < best_current_impurity) and 313 | (split_impurity < self.node_impurity)): 314 | logger.debug( 315 | f'Found new best split with feature_split_val=' 316 | f'{feature_split_val} for feature_index = {feature_index} ' 317 | f'and split_impurity = {split_impurity:.2f}') 318 | self.best_feature_index = feature_index 319 | self.best_feature_split_val = feature_split_val 320 | self.best_split_impurity = split_impurity 321 | 322 | def find_best_split(self): 323 | """ 324 | Finds best split at the node 325 | 326 | Loops through each feature and each unique value of that feature 327 | checking for the best candidate split (i.e. the split that 328 | reduces the impurity the most) 329 | 330 | The function first checks if we have reached the max depth or if 331 | self.data < self.min_samples_split. 
In either case no further 332 | split is allowed and the function returns 333 | 334 | All features are considered unless self.max_features == 'sqrt' 335 | in which case a random subset of features are used of size 336 | sqrt(n_features) 337 | """ 338 | if self.depth == self.max_depth: 339 | return 340 | if self.data.shape[0] < self.min_samples_split: 341 | logger.info(f"{self} can't split as samples < min_samples_split") 342 | return None 343 | if self.node_impurity == 0: 344 | logger.info(f"Can't improve as node pure") 345 | return None 346 | n_features = self.data.shape[1] - 1 347 | all_feature_indices = np.arange(n_features) 348 | if self.max_features == 'sqrt': 349 | features_to_check = np.random.choice( 350 | all_feature_indices, 351 | size=np.sqrt(n_features).astype(int)) 352 | else: 353 | features_to_check = all_feature_indices 354 | logger.info(f'Checking features {features_to_check}') 355 | for feature_index in features_to_check: 356 | for feature_split_val in np.unique(self.data[:, feature_index]): 357 | self.check_split(feature_index, feature_split_val) 358 | self.split_attempted = True 359 | 360 | def recursive_split(self): 361 | """ 362 | Recursively grows tree by splitting to reduce impurity the most 363 | 364 | The function finds the best split using the find_best_split 365 | method. If there was a split found two nodes are created - left 366 | and right. Finally the recursive_split method is called on each 367 | of the new nodes. 368 | 369 | Note the depth of the children node is incremented, otherwise 370 | the node settings such as min_samples_split are passed to the 371 | children nodes 372 | """ 373 | self.find_best_split() 374 | if self.best_feature_index is not None: 375 | logger.info(f'Splitting tree on feature_index ' 376 | f'{self.best_feature_index} and feature_split_val ' 377 | f'{self.best_feature_split_val:.2f}') 378 | left, right = self.split( 379 | feature_index=self.best_feature_index, 380 | feature_split_val=self.best_feature_split_val, 381 | only_y=False) 382 | del self.data 383 | self.left = TreeNode( 384 | data=left, 385 | max_depth=self.max_depth, 386 | min_samples_split=self.min_samples_split, 387 | min_samples_leaf=self.min_samples_leaf, 388 | n_classes=self.n_classes, 389 | max_features=self.max_features, 390 | depth=self.depth + 1, 391 | impurity=self.impurity, 392 | is_classifier=self.is_classifier) 393 | self.right = TreeNode( 394 | data=right, 395 | max_depth=self.max_depth, 396 | min_samples_split=self.min_samples_split, 397 | min_samples_leaf=self.min_samples_leaf, 398 | n_classes=self.n_classes, 399 | max_features=self.max_features, 400 | depth=self.depth + 1, 401 | impurity=self.impurity, 402 | is_classifier=self.is_classifier) 403 | self.left.recursive_split() 404 | self.right.recursive_split() 405 | else: 406 | logger.info('Reached max depth or no splits reduce impurity') 407 | self.is_leaf = True 408 | 409 | def walk_depth_first(self, only_leaves=True): 410 | """ 411 | Generator traversing of all nodes below and including this node 412 | 413 | Depth first so visiting children before siblings 414 | 415 | Parameters: 416 | ---------- 417 | only_leaves: bool, optional, default True 418 | Only return leaf nodes 419 | 420 | Yields: 421 | TreeNode: each node in tree 422 | """ 423 | if self.is_leaf: 424 | yield self 425 | else: 426 | if not only_leaves: 427 | yield self 428 | for node in (self.left, self.right): 429 | yield from node.walk_depth_first(only_leaves) 430 | 431 | def walk_breadth_first(self, layer=None): 432 | """ 433 | Generator 
traversing of all nodes below and including this node 434 | 435 | Breadth first so visiting siblings before children 436 | 437 | Parameters: 438 | ---------- 439 | only_leaves: bool, optional, default True 440 | Only return leaf nodes 441 | 442 | Yields: 443 | TreeNode: each node in tree 444 | """ 445 | if layer is None: 446 | layer = [self] 447 | for node in layer: 448 | yield node 449 | new_layer = [ 450 | child 451 | for node_children in [[node.left, node.right] 452 | for node in layer if not node.is_leaf] 453 | for child in node_children] 454 | if new_layer: 455 | yield from self.walk_breadth_first(new_layer) 456 | 457 | def print_tree(self): 458 | """ 459 | prints ascii representation of tree below this node 460 | """ 461 | for node in self.walk_depth_first(only_leaves=False): 462 | print('--' * node.depth + str(node)) 463 | 464 | def predict_row_proba(self, row): 465 | """ 466 | Predicts class probabilities for input row by walking the tree 467 | and returning the leaf node class probabilities 468 | 469 | Parameters: 470 | ---------- 471 | row: numpy.ndarray 472 | Input row, shape (n features,) 473 | 474 | Returns: 475 | ------- 476 | numpy.ndarray: 477 | Class probabilities, shape (n classes, ) 478 | """ 479 | if self.is_leaf: 480 | group_size = self.value.sum() 481 | class_probs = self.value / group_size 482 | return class_probs 483 | elif row[self.best_feature_index] <= self.best_feature_split_val: 484 | return self.left.predict_row_proba(row) 485 | else: 486 | return self.right.predict_row_proba(row) 487 | 488 | def predict_proba(self, data): 489 | """Predicts class probabilities for input data 490 | 491 | Predicts class probabilities for each row in data by walking the 492 | tree and returning the leaf node class probabilities 493 | 494 | Parameters: 495 | ---------- 496 | data: numpy.ndarray 497 | The input data with shape (m samples, n features) 498 | 499 | Returns: 500 | ------- 501 | numpy.ndarray: 502 | Predicted sample class probabilities, 503 | shape (m samples, n classes) 504 | """ 505 | if not self.is_classifier: 506 | raise Exception('Not a classifier') 507 | if len(data.shape) == 2: 508 | return np.stack([self.predict_row_proba(row) 509 | for row in data]) 510 | else: 511 | return self.predict_row_proba(data) 512 | 513 | def predict_regressor_row(self, row): 514 | """ 515 | Predicts target value for input row by walking the tree 516 | and returning the leaf node value 517 | 518 | Parameters: 519 | ---------- 520 | row: numpy.ndarray 521 | Input row, shape (n features,) 522 | 523 | Returns: 524 | ------- 525 | float: 526 | Predicted target value 527 | """ 528 | if self.is_leaf: 529 | return self.value 530 | elif row[self.best_feature_index] <= self.best_feature_split_val: 531 | return self.left.predict_regressor_row(row) 532 | else: 533 | return self.right.predict_regressor_row(row) 534 | 535 | def predict_regressor(self, data): 536 | """ 537 | Predicts target values for each row in data by walking the 538 | tree and returning the leaf node values 539 | 540 | Parameters: 541 | ---------- 542 | data: numpy.ndarray 543 | The input data with shape (m samples, n features) 544 | 545 | Returns: 546 | ------- 547 | numpy.ndarray: 548 | Predicted target values, shape (m samples, 1) 549 | """ 550 | if len(data.shape) == 2: 551 | return np.stack([self.predict_regressor_row(row) 552 | for row in data]) 553 | else: 554 | return self.predict_regressor_row(data) 555 | 556 | def predict(self, data): 557 | """Predicts target values or class labels for classification 558 | 559 | 
Predicts target values/class for each row in data by walking the 560 | tree and returning the leaf node value for regression or the 561 | class with the largest predicted probability for classification 562 | 563 | Parameters: 564 | ---------- 565 | data: numpy.ndarray 566 | The input data with shape (m samples, n features) 567 | 568 | Returns: 569 | ------- 570 | numpy.ndarray: 571 | Predicted target values or class labels for classification 572 | """ 573 | if self.is_classifier: 574 | return np.argmax(self.predict_proba(data), axis=-1) 575 | else: 576 | return self.predict_regressor(data) 577 | 578 | def dot(self, 579 | feature_names, 580 | samples=True, 581 | impurity=True, 582 | value=True): 583 | """ 584 | Returns Digraph visualizing the tree below this node 585 | 586 | Parameters: 587 | ---------- 588 | feature_names: list[str] 589 | List of feature names 590 | samples: bool, optional, default True 591 | Whether to display the number of samples on this node 592 | impurity: bool, optional, default True 593 | Whether to display the impurity value on this node 594 | value: bool, optional, default True 595 | Whether to dispaly the value on this node 596 | 597 | Returns: 598 | ------- 599 | graphviz.Digraph: 600 | dot for tree diagram visual 601 | """ 602 | dot = Digraph( 603 | comment='Decsion Tree', 604 | node_attr=dict(shape="rectangle", 605 | style="rounded", 606 | fillcolor="#028d35")) 607 | for i, node in enumerate(self.walk_breadth_first()): 608 | label = "" 609 | if not node.is_leaf: 610 | label += ( 611 | f'{feature_names[node.best_feature_index]} <= ' 612 | f'{node.best_feature_split_val}\n') 613 | dot.edge(node.id, node.left.id) 614 | dot.edge(node.id, node.right.id) 615 | if samples: 616 | label += f'Samples = {node.data_shape[0]}\n' 617 | if impurity: 618 | label += f'Impurity = {node.node_impurity:.2f}\n' 619 | if value: 620 | if self.is_classifier: 621 | label += f'Class counts = {str(node.value)}\n' 622 | else: 623 | label += f'Average y = {node.value:.2f}\n' 624 | dot.node(name=node.id, label=label) 625 | return dot 626 | -------------------------------------------------------------------------------- /notebooks/linear_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Machine Learning Implementation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "heading_collapsed": true 14 | }, 15 | "source": [ 16 | "## Imports" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": { 23 | "hidden": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import json\n", 28 | "\n", 29 | "import numpy as np\n", 30 | "import pandas as pd\n", 31 | "import plotly.offline as py\n", 32 | "from plotly import graph_objects as go" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "heading_collapsed": true 39 | }, 40 | "source": [ 41 | "## Linear regression" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": { 47 | "heading_collapsed": true, 48 | "hidden": true 49 | }, 50 | "source": [ 51 | "### The maths" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "hidden": true 58 | }, 59 | "source": [ 60 | "The linear model (or line of best fit in 2D) aims to describe the continuous y vairable a.k.a the target variable (e.g. house prices) as a linear combination of features (e.g. 
square footage / number of bedrooms) the features are also refered to as the design matrix." 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "hidden": true 67 | }, 68 | "source": [ 69 | "$$\n", 70 | "\\begin{align}\n", 71 | "\\hat{y}&=\\beta_0x_0+\\cdots+\\beta_nx_n\\quad &n\\in \\mathbb{N}, x_o = 1 \\\\\n", 72 | "\\hat{y}&=\\sum^{n}_{i=0}\\beta_ix_i \\\\\n", 73 | "\\hat{y}&=\\mathbf{\\boldsymbol{\\beta}^Tx}\\quad&\\boldsymbol{\\beta},\\mathbf{x}\\in\\mathbb{R}^{(n+1)\\times1}\\\\\n", 74 | "\\hat{y}&=g(\\boldsymbol{\\beta}^T\\mathbf{x})\n", 75 | "\\end{align}\n", 76 | "$$" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "hidden": true 83 | }, 84 | "source": [ 85 | "where g, the activation function, is the identidy in linear regression \n", 86 | "\n", 87 | "We define the cost function as half of the mean square error:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "hidden": true 94 | }, 95 | "source": [ 96 | "$$\n", 97 | "\\begin{align}\n", 98 | "J(\\boldsymbol{\\beta})\n", 99 | "&= \\frac{1}{2m}\\sum^{m}_{j=1}\\left(\n", 100 | "y^j-\\hat{y}^j\n", 101 | "\\right)^2,\\quad m\\in \\mathbb{N} \\text{ is the number of training samples}\\\\\n", 102 | "&= \\frac{1}{2m}\\sum^{m}_{j=1}\\left(\n", 103 | "y^j-g(\\boldsymbol{\\beta}^T\\mathbf{x}^j)\n", 104 | "\\right)^2\n", 105 | "\\end{align}\n", 106 | "$$" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "hidden": true 113 | }, 114 | "source": [ 115 | "We need to differentiate the cost function i.e. find the gradient" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "hidden": true 122 | }, 123 | "source": [ 124 | "$$\n", 125 | "\\begin{align}\n", 126 | "\\frac{\\partial J}{\\partial\\beta_k}\\left(\\boldsymbol{\\beta}\\right) &= \\frac{\\partial}{\\partial\\beta_k}\\left(\n", 127 | "\\frac{1}{2m}\\sum^{m}_{j=1}\\left(\n", 128 | "y^j-g(\\boldsymbol{\\beta}^T\\mathbf{x}^j)\\right)^2\n", 129 | "\\right)\\\\\n", 130 | "&= \\frac{\\partial}{\\partial\\beta_k}\\left(\n", 131 | "\\frac{1}{2m}\\sum^{m}_{j=1}\n", 132 | "\\left(\n", 133 | "y^j-\\sum^{n}_{i=0}\\beta_ix_i^j\n", 134 | "\\right)^2\n", 135 | "\\right)\\\\\n", 136 | "&=\n", 137 | "\\frac{1}{m}\\sum^{m}_{j=1}\n", 138 | "\\left(\n", 139 | "y^j-\\sum^{n}_{i=0}\\beta_ix_i^j\n", 140 | "\\right)(-x^j_k)\\\\\n", 141 | "\\end{align}\n", 142 | "$$" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": { 148 | "hidden": true 149 | }, 150 | "source": [ 151 | "hence" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "hidden": true 158 | }, 159 | "source": [ 160 | "$$\n", 161 | "\\nabla_{\\boldsymbol{\\beta}} J\n", 162 | "=\n", 163 | "\\begin{bmatrix}\n", 164 | " \\frac{\\partial J}{\\partial\\beta_1} \\\\\n", 165 | " \\vdots \\\\\n", 166 | " \\frac{\\partial J}{\\partial\\beta_n}\n", 167 | "\\end{bmatrix}\n", 168 | "=\n", 169 | "\\begin{bmatrix}\n", 170 | " -\\frac{1}{m}\\sum^{m}_{j=1}\n", 171 | " \\left(y^j-\\sum^{n}_{i=0}\\beta_ix_i^j\\right)x^j_1\\\\\n", 172 | " \\vdots \\\\\n", 173 | " -\\frac{1}{m}\\sum^{m}_{j=1}\n", 174 | " \\left(y^j-\\sum^{n}_{i=0}\\beta_ix_i^j\\right)x^j_n\\\\\n", 175 | "\\end{bmatrix}\n", 176 | "$$" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": { 182 | "hidden": true 183 | }, 184 | "source": [ 185 | "Define the design matrix and column representation of y. 
Here each row of X and y are training examples hence there are m rows" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "hidden": true 192 | }, 193 | "source": [ 194 | "$$\n", 195 | "\\mathbf{X}\\in\\mathbb{R}^{m\\times (n+1)},\n", 196 | "\\quad \\mathbf{y}\\in\\mathbb{R}^{m\\times 1},\n", 197 | "\\quad \\boldsymbol{\\beta}\\in\\mathbb{R}^{(n+1)\\times1}\n", 198 | "$$" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "hidden": true 205 | }, 206 | "source": [ 207 | "$$\n", 208 | "\\mathbf{X}=\\begin{bmatrix}\n", 209 | " 1 & x_1^1 & x_2^1 & \\dots & x_n^1 \\\\\n", 210 | " 1 & x_1^2 & x_2^2 & \\dots & x_n^2 \\\\\n", 211 | " \\vdots & \\vdots & \\vdots & \\ddots & \\vdots \\\\\n", 212 | " 1 & x_1^m & x_2^m & \\dots & x_n^m \\\\\n", 213 | "\\end{bmatrix}\\quad\n", 214 | "\\mathbf{y}=\\begin{bmatrix}\n", 215 | " y_1\\\\y_2\\\\\\vdots\\\\y_m\n", 216 | "\\end{bmatrix}\\quad\n", 217 | "\\boldsymbol{\\beta} = \\begin{bmatrix}\n", 218 | " \\beta_0\\\\\\beta_1\\\\\\vdots\\\\\\beta_n\n", 219 | "\\end{bmatrix}\n", 220 | "$$" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "hidden": true 227 | }, 228 | "source": [ 229 | "$$\n", 230 | "\\begin{align}\n", 231 | "\\nabla_{\\boldsymbol{\\beta}} J\n", 232 | "&=\n", 233 | "\\begin{bmatrix}\n", 234 | " -\\frac{1}{m}\\sum^{m}_{j=1}\n", 235 | " \\left(y^j-\\sum^{n}_{i=0}\\beta_ix_i^j\\right)x^j_1\\\\\n", 236 | " \\vdots \\\\\n", 237 | " -\\frac{1}{m}\\sum^{m}_{j=1}\n", 238 | " \\left(y^j-\\sum^{n}_{i=0}\\beta_ix_i^j\\right)x^j_n\\\\\n", 239 | "\\end{bmatrix}\n", 240 | "=-\\frac{1}{m}\n", 241 | "\\begin{bmatrix}\n", 242 | " \\sum^{m}_{j=1}y^jx^j_1\\\\\n", 243 | " \\vdots \\\\\n", 244 | " \\sum^{m}_{j=1}y^jx^j_n\\\\\n", 245 | "\\end{bmatrix}+\n", 246 | "\\frac{1}{m}\n", 247 | "\\begin{bmatrix}\n", 248 | " \\sum^{m}_{j=1}\\sum^{n}_{i=0}\\beta_ix_i^jx^j_1\\\\\n", 249 | " \\vdots \\\\\n", 250 | " \\sum^{m}_{j=1}\\sum^{n}_{i=0}\\beta_ix_i^jx^j_n\n", 251 | "\\end{bmatrix}\\\\\n", 252 | "\\end{align}\n", 253 | "$$" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "hidden": true 260 | }, 261 | "source": [ 262 | "so" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": { 268 | "hidden": true 269 | }, 270 | "source": [ 271 | "$$\n", 272 | "\\begin{align}\n", 273 | "\\nabla_{\\boldsymbol{\\beta}} J\n", 274 | "&=\\frac{1}{m}\\left(\n", 275 | "\\mathbf{X}^T\\mathbf{X}\\mathbf{\\boldsymbol{\\beta}}-\\mathbf{X}^T\\mathbf{y}\n", 276 | "\\right)\\\\\n", 277 | "&=\\frac{1}{m}\\mathbf{X}^T\\left(\n", 278 | "\\mathbf{X}\\mathbf{\\boldsymbol{\\beta}}-\\mathbf{y}\n", 279 | "\\right)\\\\\n", 280 | "&=\\frac{1}{m}\\mathbf{X}^T\\left(\n", 281 | "\\mathbf{\\hat{y}}-\\mathbf{y}\n", 282 | "\\right)\n", 283 | "\\end{align}\n", 284 | "$$" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "hidden": true 291 | }, 292 | "source": [ 293 | "where" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "hidden": true 300 | }, 301 | "source": [ 302 | "$$\n", 303 | "\\mathbf{\\hat{y}} = \\mathbf{X}\\mathbf{\\boldsymbol{\\beta}}\n", 304 | "$$" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": { 310 | "hidden": true 311 | }, 312 | "source": [ 313 | "We could have derived the same thing using matrix calculus - noting the following:" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": { 319 | "hidden": true 320 | }, 321 | "source": [ 322 | "$$\n", 323 
| "\\begin{align}\n", 324 | "J(\\boldsymbol{\\beta}) &= \\frac{1}{2m}\\sum^{m}_{j=1}\\left(\n", 325 | "y^j-g(\\boldsymbol{\\beta}^T\\mathbf{x}^j)\n", 326 | "\\right)^2\\\\\n", 327 | "&= \\frac{1}{2m}\\left(\n", 328 | "\\mathbf{y}-\\mathbf{\\hat{y}}\n", 329 | "\\right)^T\n", 330 | "\\left(\n", 331 | "\\mathbf{y}-\\mathbf{\\hat{y}}\n", 332 | "\\right)\\\\\n", 333 | "&= \\frac{1}{2m}\\left(\n", 334 | "\\mathbf{y}-\\mathbf{X}\\boldsymbol{\\beta}\n", 335 | "\\right)^T\n", 336 | "\\left(\n", 337 | "\\mathbf{y}-\\mathbf{X}\\boldsymbol{\\beta}\n", 338 | "\\right)\\\\\n", 339 | "&= \\frac{1}{2m}\\left(\n", 340 | "\\mathbf{y}^T\\mathbf{y}\n", 341 | "-\\boldsymbol{\\beta}^T\\mathbf{X}^T\\mathbf{y}\n", 342 | "-\\mathbf{y}^T\\mathbf{X}\\boldsymbol{\\beta}\n", 343 | "+\\boldsymbol{\\beta}^T\\mathbf{X}^T\\mathbf{X}\\boldsymbol{\\beta}\n", 344 | "\\right)\\\\\n", 345 | "\\end{align}\n", 346 | "$$" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": { 352 | "hidden": true 353 | }, 354 | "source": [ 355 | "and" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": { 361 | "hidden": true 362 | }, 363 | "source": [ 364 | "$$\n", 365 | "\\frac{\\partial}{\\partial\\mathbf{\\boldsymbol{\\beta}}}\n", 366 | "\\left(\n", 367 | "A^T\\boldsymbol{\\beta}\n", 368 | "\\right) = A,\\quad \\forall A\\in\\mathbb{R}^{(n+1)\\times1}\\\\\n", 369 | "$$" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": { 375 | "hidden": true 376 | }, 377 | "source": [ 378 | "and" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": { 384 | "hidden": true 385 | }, 386 | "source": [ 387 | "$$\n", 388 | "\\frac{\\partial}{\\partial\\mathbf{\\boldsymbol{\\beta}}}\n", 389 | "\\left(\n", 390 | "\\boldsymbol{\\beta}^TA\\boldsymbol{\\beta}\n", 391 | "\\right) = 2A\\boldsymbol{\\beta},\\quad \\forall A\\in\\mathbb{R}^{m\\times (n+1)}\\\\\n", 392 | "$$" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": { 398 | "hidden": true 399 | }, 400 | "source": [ 401 | "so" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": { 407 | "hidden": true 408 | }, 409 | "source": [ 410 | "$$\n", 411 | "\\nabla_{\\boldsymbol{\\beta}}J=\\frac{1}{m}\\left(\n", 412 | "\\mathbf{X}^T\\mathbf{X}\\mathbf{\\boldsymbol{\\beta}}-\\mathbf{X}^T\\mathbf{y}\n", 413 | "\\right)$$" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": { 419 | "heading_collapsed": true, 420 | "hidden": true 421 | }, 422 | "source": [ 423 | "### Make fake data" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 4, 429 | "metadata": { 430 | "hidden": true 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "m = 100\n", 435 | "x0 = np.ones(shape=(m, 1))\n", 436 | "x1 = np.linspace(0, 10, m).reshape(-1, 1)\n", 437 | "X = np.column_stack((x0, x1))\n", 438 | "\n", 439 | "# let y = 0.5 * x + 1 + epsilon\n", 440 | "epsilon = np.random.normal(scale=0.5, size=(m, 1))\n", 441 | "y = x1 + 1 + epsilon" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 5, 447 | "metadata": { 448 | "hidden": true 449 | }, 450 | "outputs": [ 451 | { 452 | "data": { 453 | "application/vnd.jupyter.widget-view+json": { 454 | "model_id": "ffbc482ea5db405283d37d93d5d30c7f", 455 | "version_major": 2, 456 | "version_minor": 0 457 | }, 458 | "text/plain": [ 459 | "FigureWidget({\n", 460 | " 'data': [{'mode': 'markers',\n", 461 | " 'name': 'linear data + noise',\n", 462 | " 'ty…" 463 | ] 464 | }, 465 | "metadata": {}, 466 | "output_type": 
"display_data" 467 | } 468 | ], 469 | "source": [ 470 | "fig = go.FigureWidget()\n", 471 | "fig = fig.add_scatter(\n", 472 | " x=X[:,1],\n", 473 | " y=y[:,0],\n", 474 | " mode='markers',\n", 475 | " name='linear data + noise')\n", 476 | "fig.layout.title = 'Fake linear data with noise'\n", 477 | "fig.layout.xaxis.title = 'x1'\n", 478 | "fig.layout.yaxis.title = 'y'\n", 479 | "fig" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": { 485 | "heading_collapsed": true, 486 | "hidden": true 487 | }, 488 | "source": [ 489 | "### Linear regression class" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 6, 495 | "metadata": { 496 | "hidden": true 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "class LinearRegression():\n", 501 | "\n", 502 | " def __init__(self, learning_rate=0.05):\n", 503 | " \"\"\" \n", 504 | " Linear regression model\n", 505 | "\n", 506 | " Parameters:\n", 507 | " ----------\n", 508 | " learning_rate: float, optional, default 0.05\n", 509 | " The learning rate parameter controlling the gradient descent\n", 510 | " step size\n", 511 | " \"\"\"\n", 512 | " self.learning_rate = learning_rate\n", 513 | " print('Creating linear model instance')\n", 514 | "\n", 515 | " def __repr__(self):\n", 516 | " return (\n", 517 | " f'')\n", 519 | " \n", 520 | "\n", 521 | " \n", 522 | " def fit(self, X, y, n_iter=1000):\n", 523 | " \"\"\" \n", 524 | " Fit the linear regression model\n", 525 | "\n", 526 | " Updates the weights with n_iter iterations of batch gradient\n", 527 | " descent updates\n", 528 | "\n", 529 | " Parameters:\n", 530 | " ----------\n", 531 | " X: numpy.ndarray\n", 532 | " Training data, shape (m samples, (n - 1) features + 1)\n", 533 | " Note the first column of X is expected to be ones (to allow \n", 534 | " for the bias to be included in beta)\n", 535 | " y: numpy.ndarray\n", 536 | " Target values, shape (m samples, 1)\n", 537 | " n_iter: int, optional, default 1000\n", 538 | " Number of batch gradient descent steps\n", 539 | " \"\"\" \n", 540 | " m, n = X.shape\n", 541 | " print(f'fitting with m={m} samples with n={n-1} features\\n')\n", 542 | " self.beta = np.zeros(shape=(n, 1))\n", 543 | " self.costs = []\n", 544 | " self.betas = [self.beta]\n", 545 | " for iteration in range(n_iter):\n", 546 | " y_pred = self.predict(X)\n", 547 | " cost = self.cost(y, y_pred)\n", 548 | " self.costs.append(cost[0][0])\n", 549 | " gradient = self.gradient(y, y_pred, X)\n", 550 | " self.beta = self.beta - (\n", 551 | " self.learning_rate * gradient)\n", 552 | " self.betas.append(self.beta)\n", 553 | "\n", 554 | " def cost(self, y, y_pred):\n", 555 | " \"\"\" \n", 556 | " Mean square error cost function\n", 557 | "\n", 558 | " Parameters:\n", 559 | " ----------\n", 560 | " y: numpy.ndarray\n", 561 | " True target values, shape (m samples, 1)\n", 562 | " y_pred: numpy.ndarray\n", 563 | " Predicted y values, shape (m samples, 1)\n", 564 | "\n", 565 | " Returns:\n", 566 | " -------\n", 567 | " float:\n", 568 | " mean square error value\n", 569 | " \"\"\"\n", 570 | " m = y.shape[0]\n", 571 | " cost = (1 / (2 * m)) * (y - y_pred).T @ (y - y_pred)\n", 572 | " return cost\n", 573 | "\n", 574 | " def gradient(self, y, y_pred, X):\n", 575 | " \"\"\" \n", 576 | " Calculates the gradient of the cost function\n", 577 | "\n", 578 | " Parameters:\n", 579 | " ----------\n", 580 | " y: numpy.ndarray\n", 581 | " Predicted y values, shape (m samples, 1)\n", 582 | " y_pred: numpy.ndarray\n", 583 | " True target values, shape (m samples, 1)\n", 584 | " 
X: numpy.ndarray\n", 585 | " Training data, shape (m samples, (n - 1) features + 1)\n", 586 | " Note the first column of X is expected to be ones (to allow \n", 587 | " for the bias to be included in beta)\n", 588 | "\n", 589 | " Returns:\n", 590 | " -------\n", 591 | " numpy.ndarray:\n", 592 | " Derivate of mean square error cost function with respect to\n", 593 | " the weights beta, shape (n features, 1)\n", 594 | " \"\"\"\n", 595 | " m = X.shape[0]\n", 596 | " gradient = (1 / m) * X.T @ (y_pred - y)\n", 597 | " return gradient\n", 598 | "\n", 599 | " def predict(self, X):\n", 600 | " \"\"\" \n", 601 | " Predict the target values from sample X feature values\n", 602 | "\n", 603 | " Parameters:\n", 604 | " ----------\n", 605 | " X: numpy.ndarray\n", 606 | " Training data, shape (m samples, (n - 1) features + 1)\n", 607 | " Note the first column of X is expected to be ones (to allow \n", 608 | " for the bias to be included in beta)\n", 609 | "\n", 610 | " Returns:\n", 611 | " -------\n", 612 | " numpy.ndarray:\n", 613 | " Target value predictions, shape (m samples, 1)\n", 614 | " \"\"\" \n", 615 | " y_pred = X @ self.beta\n", 616 | " return y_pred\n" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 7, 622 | "metadata": { 623 | "hidden": true 624 | }, 625 | "outputs": [ 626 | { 627 | "name": "stdout", 628 | "output_type": "stream", 629 | "text": [ 630 | "Creating linear model instance\n" 631 | ] 632 | }, 633 | { 634 | "data": { 635 | "text/plain": [ 636 | "" 637 | ] 638 | }, 639 | "execution_count": 7, 640 | "metadata": {}, 641 | "output_type": "execute_result" 642 | } 643 | ], 644 | "source": [ 645 | "linear_regression = LinearRegression()\n", 646 | "linear_regression" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": 8, 652 | "metadata": { 653 | "hidden": true 654 | }, 655 | "outputs": [ 656 | { 657 | "name": "stdout", 658 | "output_type": "stream", 659 | "text": [ 660 | "fitting with m=100 samples with n=1 features\n", 661 | "\n" 662 | ] 663 | } 664 | ], 665 | "source": [ 666 | "linear_regression.fit(X, y)" 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": { 672 | "heading_collapsed": true, 673 | "hidden": true 674 | }, 675 | "source": [ 676 | "### Plot the best fit" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": 10, 682 | "metadata": { 683 | "hidden": true 684 | }, 685 | "outputs": [ 686 | { 687 | "data": { 688 | "application/vnd.jupyter.widget-view+json": { 689 | "model_id": "ffbc482ea5db405283d37d93d5d30c7f", 690 | "version_major": 2, 691 | "version_minor": 0 692 | }, 693 | "text/plain": [ 694 | "FigureWidget({\n", 695 | " 'data': [{'mode': 'markers',\n", 696 | " 'name': 'linear data + noise',\n", 697 | " 'ty…" 698 | ] 699 | }, 700 | "metadata": {}, 701 | "output_type": "display_data" 702 | } 703 | ], 704 | "source": [ 705 | "fig = fig.add_scatter(\n", 706 | " x=X[:,1], \n", 707 | " y=linear_regression.predict(X)[:,0],\n", 708 | " mode='markers',\n", 709 | " name='best fit')\n", 710 | "fig" 711 | ] 712 | }, 713 | { 714 | "cell_type": "markdown", 715 | "metadata": { 716 | "heading_collapsed": true, 717 | "hidden": true 718 | }, 719 | "source": [ 720 | "### Plot the cost function" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 11, 726 | "metadata": { 727 | "hidden": true 728 | }, 729 | "outputs": [], 730 | "source": [ 731 | "def plot_surface(linear_regression):\n", 732 | " cost_fig = go.FigureWidget()\n", 733 | " cost_fig = cost_fig.add_scatter(\n", 
734 | " x=list(range(len(linear_regression.costs))),\n", 735 | " y=linear_regression.costs,\n", 736 | " mode='markers+lines')\n", 737 | " cost_fig.layout.title = 'Cost by iteration'\n", 738 | " return cost_fig" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 12, 744 | "metadata": { 745 | "hidden": true 746 | }, 747 | "outputs": [ 748 | { 749 | "data": { 750 | "application/vnd.jupyter.widget-view+json": { 751 | "model_id": "c6743fff09c14f8cbdbb792626b6b17d", 752 | "version_major": 2, 753 | "version_minor": 0 754 | }, 755 | "text/plain": [ 756 | "FigureWidget({\n", 757 | " 'data': [{'mode': 'markers+lines',\n", 758 | " 'type': 'scatter',\n", 759 | " 'uid': 'd…" 760 | ] 761 | }, 762 | "metadata": {}, 763 | "output_type": "display_data" 764 | } 765 | ], 766 | "source": [ 767 | "cost_fig = plot_surface(linear_regression)\n", 768 | "cost_fig" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": 14, 774 | "metadata": { 775 | "hidden": true 776 | }, 777 | "outputs": [], 778 | "source": [ 779 | "def plot_surface(linear_regression):\n", 780 | " beta0s = [beta[0][0] for beta in linear_regression.betas]\n", 781 | " beta1s = [beta[1][0] for beta in linear_regression.betas]\n", 782 | " beta0_max = max(map(abs, beta0s)) * 1.05\n", 783 | " beta1_max = max(map(abs, beta1s)) * 1.05\n", 784 | "\n", 785 | " gradient_descent_fig = go.FigureWidget()\n", 786 | " gradient_descent_fig = gradient_descent_fig.add_scatter3d(\n", 787 | " x=beta0s,\n", 788 | " y=beta1s,\n", 789 | " z=linear_regression.costs,\n", 790 | " mode='markers+lines',\n", 791 | " marker={'size':3, 'color':'red'})\n", 792 | "\n", 793 | " beta0, beta1 = np.meshgrid(\n", 794 | " np.linspace(-beta0_max, beta0_max, 100),\n", 795 | " np.linspace(-beta1_max, beta1_max, 100))\n", 796 | "\n", 797 | " z = np.diag(\n", 798 | " (1 / (2 * m)) * \\\n", 799 | " (y - (X @ np.column_stack((beta0.ravel(), beta1.ravel())).T)).T @ \\\n", 800 | " (y - (X @ np.column_stack((beta0.ravel(), beta1.ravel())).T))\n", 801 | " ).reshape(beta1.shape)\n", 802 | "\n", 803 | " gradient_descent_fig = gradient_descent_fig.add_surface(\n", 804 | " x=beta0,\n", 805 | " y=beta1,\n", 806 | " z=z,\n", 807 | " opacity=0.8)\n", 808 | " \n", 809 | " gradient_descent_fig.layout.title = 'Cost function surface'\n", 810 | " gradient_descent_fig.layout.scene.xaxis.title = 'beta_0'\n", 811 | " gradient_descent_fig.layout.scene.yaxis.title = 'beta_1'\n", 812 | " gradient_descent_fig.layout.scene.zaxis.title = 'cost' \n", 813 | " # cost = average sum square residuals\n", 814 | " gradient_descent_fig.layout.height = 500\n", 815 | " return gradient_descent_fig" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": 15, 821 | "metadata": { 822 | "hidden": true, 823 | "scrolled": false 824 | }, 825 | "outputs": [ 826 | { 827 | "data": { 828 | "application/vnd.jupyter.widget-view+json": { 829 | "model_id": "2545400b812747cdb6a02def35e944b7", 830 | "version_major": 2, 831 | "version_minor": 0 832 | }, 833 | "text/plain": [ 834 | "FigureWidget({\n", 835 | " 'data': [{'marker': {'color': 'red', 'size': 3},\n", 836 | " 'mode': 'markers+lines',\n", 837 | " …" 838 | ] 839 | }, 840 | "metadata": {}, 841 | "output_type": "display_data" 842 | } 843 | ], 844 | "source": [ 845 | "gradient_descent_fig = plot_surface(linear_regression)\n", 846 | "gradient_descent_fig" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": 16, 852 | "metadata": { 853 | "hidden": true 854 | }, 855 | "outputs": [], 856 | "source": [ 857 
| "# py.plot(gradient_descent_fig, filename='gradient_descent.html')" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": { 863 | "heading_collapsed": true 864 | }, 865 | "source": [ 866 | "## End" 867 | ] 868 | } 869 | ], 870 | "metadata": { 871 | "kernelspec": { 872 | "display_name": "Python 3", 873 | "language": "python", 874 | "name": "python3" 875 | }, 876 | "language_info": { 877 | "codemirror_mode": { 878 | "name": "ipython", 879 | "version": 3 880 | }, 881 | "file_extension": ".py", 882 | "mimetype": "text/x-python", 883 | "name": "python", 884 | "nbconvert_exporter": "python", 885 | "pygments_lexer": "ipython3", 886 | "version": "3.7.6" 887 | }, 888 | "toc": { 889 | "base_numbering": 1, 890 | "nav_menu": {}, 891 | "number_sections": true, 892 | "sideBar": true, 893 | "skip_h1_title": false, 894 | "title_cell": "Table of Contents", 895 | "title_sidebar": "Contents", 896 | "toc_cell": false, 897 | "toc_position": {}, 898 | "toc_section_display": true, 899 | "toc_window_display": false 900 | } 901 | }, 902 | "nbformat": 4, 903 | "nbformat_minor": 2 904 | } 905 | -------------------------------------------------------------------------------- /notebooks/logistic_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Machine Learning Implementation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "heading_collapsed": true 14 | }, 15 | "source": [ 16 | "## Imports" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": { 23 | "hidden": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import json\n", 28 | "\n", 29 | "import numpy as np\n", 30 | "import pandas as pd\n", 31 | "import plotly.offline as py\n", 32 | "from plotly import graph_objects as go" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "heading_collapsed": true 39 | }, 40 | "source": [ 41 | "## Logistic regression" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": { 47 | "heading_collapsed": true, 48 | "hidden": true 49 | }, 50 | "source": [ 51 | "### The maths" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "hidden": true 58 | }, 59 | "source": [ 60 | "The logistic model aims to predict the discrete y variable a.k.a the target variable (e.g. whether something will happen) based on a collection of features. It does this by transforming a linear combination of the features into a curve and fitting this curve to the data." 
61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "hidden": true 67 | }, 68 | "source": [ 69 | "The curve used in logistic regression is the sigmoid function" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "hidden": true 76 | }, 77 | "source": [ 78 | "$$\n", 79 | "\\sigma(x) = \\frac{1}{1+e^{-x}}\n", 80 | "$$" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "hidden": true 87 | }, 88 | "source": [ 89 | "Define y as" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "hidden": true 96 | }, 97 | "source": [ 98 | "$$\n", 99 | "\\begin{align}\n", 100 | "\\hat{y} &= h_{\\boldsymbol{\\beta}}(\\mathbf{x})\\\\\n", 101 | "\\hat{y}&= \\sigma\\left(\\beta_0x_0+\\cdots+\\beta_nx_n\\right)\\quad &n\\in \\mathbb{N},x_0=1 \\\\\n", 102 | "\\hat{y}&=\\sigma\\left(\\sum^{n}_{i=0}\\beta_ix_i\\right) \\\\\n", 103 | "\\hat{y}&=\\sigma\\left(\\mathbf{\\boldsymbol{\\beta}^Tx}\\right)\\quad&\\boldsymbol{\\beta},\\mathbf{x}\\in\\mathbb{R}^{n\\times1}\\\\\n", 104 | "\\hat{y}&=\\sigma\\left(\\boldsymbol{\\beta}^T\\mathbf{x}\\right)\n", 105 | "\\end{align}\n", 106 | "$$" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "hidden": true 113 | }, 114 | "source": [ 115 | "notice" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "hidden": true 122 | }, 123 | "source": [ 124 | "$$\n", 125 | "\\hat{y} = \\frac{1}{1+e^{-\\boldsymbol{\\beta}^T\\mathbf{x}}}\n", 126 | "$$" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": { 132 | "hidden": true 133 | }, 134 | "source": [ 135 | "so" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "hidden": true 142 | }, 143 | "source": [ 144 | "$$\n", 145 | "\\begin{align}\n", 146 | "\\hat{y} + \\hat{y}e^{-\\boldsymbol{\\beta}^T\\mathbf{x}} &= 1\\\\\n", 147 | "\\hat{y}e^{-\\boldsymbol{\\beta}^T\\mathbf{x}} &= 1 - \\hat{y}\\\\\n", 148 | "\\frac{\\hat{y}}{1 - \\hat{y}} &= e^{\\boldsymbol{\\beta}^T\\mathbf{x}}\\\\\n", 149 | "\\ln\\left(\\frac{\\hat{y}}{1 - \\hat{y}}\\right)&=\\boldsymbol{\\beta}^T\\mathbf{x}\n", 150 | "\\end{align}\n", 151 | "$$" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "hidden": true 158 | }, 159 | "source": [ 160 | "This above is the logit form of logistic regression. 
We model the logit as a linear combination of the x variables" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "hidden": true 167 | }, 168 | "source": [ 169 | "We define the cost function as follows for each y and corresponding x" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": { 175 | "hidden": true 176 | }, 177 | "source": [ 178 | "$$\n", 179 | "\\begin{align}\n", 180 | "J(\\mathbf{x})\n", 181 | "&= \\begin{cases}\n", 182 | "-\\log\\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x})\\right) &\\text{if y=1}\\\\\n", 183 | "-\\log\\left(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x})\\right) &\\text{if y=0}\\\\\n", 184 | "\\end{cases}\n", 185 | "\\end{align}\n", 186 | "$$" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": { 192 | "hidden": true 193 | }, 194 | "source": [ 195 | "$$\n", 196 | "\\begin{align}\n", 197 | "J(\\mathbf{x})\n", 198 | "&= -\\frac{1}{m}\\sum_{j=1}^my^j\\log\\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)\\right)\n", 199 | "+(1-y^j)\\log\\left(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)\\right)\\\\\n", 200 | "&= -\\frac{1}{m}\\sum_{j=1}^my^j\\log\\left(\\frac{1}{1+e^{-\\boldsymbol{\\beta}^T\\mathbf{x}}}\\right)\n", 201 | "+(1-y^j)\\log\\left(1-\\frac{1}{1+e^{-\\boldsymbol{\\beta}^T\\mathbf{x}}}\\right)\\\\\n", 202 | "&= -\\frac{1}{m}\\sum_{j=1}^my^j\\log\\left(\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\\right)\n", 203 | "+(1-y^j)\\log\\left(1-\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\\right)\n", 204 | "\\end{align}\n", 205 | "$$" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": { 211 | "hidden": true 212 | }, 213 | "source": [ 214 | "note" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": { 220 | "hidden": true 221 | }, 222 | "source": [ 223 | "$$\n", 224 | "\\begin{align}\n", 225 | "h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)&=\\frac{1}{1+e^{-\\boldsymbol{\\beta}^T\\mathbf{x}^j}}\\\\\n", 226 | "&=\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 227 | "\\end{align}\n", 228 | "$$" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": { 234 | "hidden": true 235 | }, 236 | "source": [ 237 | "so" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": { 243 | "hidden": true 244 | }, 245 | "source": [ 246 | "$$\n", 247 | "\\begin{align}\n", 248 | "\\frac{\\partial h}{\\partial \\beta_k} &= -\\left(1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}\\right)^{-2}e^{-\\sum^{n}_{i=0}\\beta_ix_i} (-x_k^j)\\\\\n", 249 | "&=\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 250 | "\\frac{-e^{-\\sum^{n}_{i=0}\\beta_ix_i} (-x_k^j)}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\\\\\n", 251 | "&=\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 252 | "\\frac{(1-1-e^{-\\sum^{n}_{i=0}\\beta_ix_i})(-x_k^j)}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\\\\\n", 253 | "&=\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 254 | "\\left(\n", 255 | "\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}-\n", 256 | "\\frac{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 257 | "\\right)(-x_k^j)\\\\\n", 258 | "&=\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 259 | "\\left(\n", 260 | "\\frac{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}-\n", 261 | "\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 262 | "\\right)(x_k^j)\\\\\n", 263 | "&=\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 264 | "\\left(\n", 265 | "1-\n", 266 | "\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 267 | "\\right)(x_k^j)\\\\\n", 268 | 
"&=h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j))x_k^j\n", 269 | "\\end{align}\n", 270 | "$$" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "hidden": true 277 | }, 278 | "source": [ 279 | "We need to differentiate the cost function i.e. find the gradient" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": { 285 | "hidden": true 286 | }, 287 | "source": [ 288 | "$$\n", 289 | "\\begin{align}\n", 290 | "\\frac{\\partial J}{\\partial\\beta_k}\\left(\\boldsymbol{\\beta}\\right) \n", 291 | "&=\\frac{\\partial}{\\partial\\beta_k}\\left(\n", 292 | "-\\frac{1}{m}\\sum_{j=1}^my^j\\log\\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)\\right)\n", 293 | "+(1-y^j)\\log\\left(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)\\right)\n", 294 | "\\right)\\\\\n", 295 | "&=-\\frac{1}{m}\\sum_{j=1}^m\\frac{y^j}{h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)}\\frac{\\partial h}{\\partial \\beta_k}\n", 296 | "+\\frac{-(1-y^j)}{1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)}\\frac{\\partial h}{\\partial \\beta_k}\\\\\n", 297 | "&=-\\frac{1}{m}\\sum_{j=1}^m\\frac{y^j}{h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)}\n", 298 | "h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j))x_k^j\n", 299 | "+\\frac{-(1-y^j)}{1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)}\n", 300 | "h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j))x_k^j\\\\\n", 301 | "&=-\\frac{1}{m}\\sum_{j=1}^my^j(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j))x_k^j\n", 302 | "-(1-y^j)\n", 303 | "h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)x_k^j\\\\\n", 304 | "&=\\frac{1}{m}\\sum_{j=1}^m\n", 305 | "\\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)-y^j\\right)x_k^j\n", 306 | "\\end{align}\n", 307 | "$$" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "hidden": true 314 | }, 315 | "source": [ 316 | "hence" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": { 322 | "hidden": true 323 | }, 324 | "source": [ 325 | "$$\n", 326 | "\\nabla_{\\boldsymbol{\\beta}} J\n", 327 | "=\n", 328 | "\\begin{bmatrix}\n", 329 | " \\frac{\\partial J}{\\partial\\beta_1} \\\\\n", 330 | " \\vdots \\\\\n", 331 | " \\frac{\\partial J}{\\partial\\beta_n}\n", 332 | "\\end{bmatrix}\n", 333 | "=\n", 334 | "\\begin{bmatrix}\n", 335 | " \\frac{1}{m}\\sum_{j=1}^m\n", 336 | " \\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)-y^j\\right)x_1^j\\\\\n", 337 | " \\vdots \\\\\n", 338 | " \\frac{1}{m}\\sum_{j=1}^m\n", 339 | " \\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)-y^j\\right)x_n^j\n", 340 | "\\end{bmatrix}\n", 341 | "$$" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": { 347 | "hidden": true 348 | }, 349 | "source": [ 350 | "Define the design matrix and column representation of y. 
Here each row of X and y corresponds to a training example, hence there are m rows" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": { 356 | "hidden": true 357 | }, 358 | "source": [ 359 | "$$\\mathbf{X}\\in\\mathbb{R}^{m\\times n},\n", 360 | "\\quad \\mathbf{y}\\in\\mathbb{R}^{m\\times 1}\n", 361 | "$$" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "hidden": true 368 | }, 369 | "source": [ 370 | "$$\n", 371 | "\\mathbf{X}=\\begin{bmatrix}\n", 372 | "    \\dots & (\\mathbf{x}^1)^T & \\dots\\\\\n", 373 | "    \\dots & (\\mathbf{x}^2)^T & \\dots\\\\\n", 374 | "    \\dots & \\vdots & \\dots\\\\\n", 375 | "    \\dots & (\\mathbf{x}^m)^T & \\dots\n", 376 | "\\end{bmatrix}\\quad\n", 377 | "\\mathbf{y}=\\begin{bmatrix}\n", 378 | "    y_1\\\\y_2\\\\\\vdots\\\\y_m\n", 379 | "\\end{bmatrix}\n", 380 | "$$" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": { 386 | "hidden": true 387 | }, 388 | "source": [ 389 | "$$\n", 390 | "\\begin{align}\n", 391 | "\\nabla_{\\boldsymbol{\\beta}} J\n", 392 | "=\n", 393 | "\\begin{bmatrix}\n", 394 | "    \\frac{1}{m}\\sum_{j=1}^m\n", 395 | "    \\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)-y^j\\right)x_1^j\\\\\n", 396 | "    \\vdots \\\\\n", 397 | "    \\frac{1}{m}\\sum_{j=1}^m\n", 398 | "    \\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)-y^j\\right)x_n^j\n", 399 | "\\end{bmatrix}\n", 400 | "=\n", 401 | "\\frac{1}{m}\n", 402 | "\\begin{bmatrix}\n", 403 | "    \\sum^{m}_{j=1}h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)x^j_1\\\\\n", 404 | "    \\vdots \\\\\n", 405 | "    \\sum^{m}_{j=1}h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)x^j_n\n", 406 | "\\end{bmatrix}\n", 407 | "-\n", 408 | "\\frac{1}{m}\n", 409 | "\\begin{bmatrix}\n", 410 | "    \\sum^{m}_{j=1}y^jx^j_1\\\\\n", 411 | "    \\vdots \\\\\n", 412 | "    \\sum^{m}_{j=1}y^jx^j_n\\\\\n", 413 | "\\end{bmatrix}\n", 414 | "\\end{align}\n", 415 | "$$" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "hidden": true 422 | }, 423 | "source": [ 424 | "$$\n", 425 | "h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j) = \\sigma({\\mathbf{x}^j}^T\\boldsymbol{\\beta})\n", 426 | "$$" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "hidden": true 433 | }, 434 | "source": [ 435 | "so" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": { 441 | "hidden": true 442 | }, 443 | "source": [ 444 | "$$\n", 445 | "\\begin{align}\n", 446 | "\\nabla_{\\boldsymbol{\\beta}} J\n", 447 | "&=\\frac{1}{m}\\left(\n", 448 | "\\mathbf{X}^T\\sigma(\\mathbf{X}\\mathbf{\\boldsymbol{\\beta}})-\\mathbf{X}^T\\mathbf{y}\n", 449 | "\\right)\\\\\n", 450 | "&=\\frac{1}{m}\\mathbf{X}^T\\left(\n", 451 | "\\sigma(\\mathbf{X}\\mathbf{\\boldsymbol{\\beta}})-\\mathbf{y}\n", 452 | "\\right)\\\\\n", 453 | "&=\\frac{1}{m}\\mathbf{X}^T\\left(\n", 454 | "\\mathbf{\\hat{y}}-\\mathbf{y}\n", 455 | "\\right)\n", 456 | "\\end{align}\n", 457 | "$$" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": { 463 | "hidden": true 464 | }, 465 | "source": [ 466 | "where" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": { 472 | "hidden": true 473 | }, 474 | "source": [ 475 | "$$\n", 476 | "\\mathbf{\\hat{y}} = \\sigma(\\mathbf{X}\\mathbf{\\boldsymbol{\\beta}})\n", 477 | "$$" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "hidden": true 484 | }, 485 | "source": [ 486 | "We could have derived the same thing using matrix calculus" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": { 492 | 
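A minimal NumPy sketch (added here for illustration, not a notebook cell) of the vectorised gradient just derived, grad J = (1/m) X^T (sigma(X beta) - y); the function and variable names below are assumptions, not part of the original code.

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def logistic_gradient(X, y, beta):
    # grad J = (1/m) X^T (sigmoid(X beta) - y); X is (m, n), y and X @ beta are (m, 1)
    m = X.shape[0]
    y_hat = sigmoid(X @ beta)             # predicted probabilities
    return (1 / m) * X.T @ (y_hat - y)    # same shape as beta

# one gradient descent step would then be, for an assumed learning rate alpha:
# beta = beta - alpha * logistic_gradient(X, y, beta)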
"heading_collapsed": true, 493 | "hidden": true 494 | }, 495 | "source": [ 496 | "### Example sigmoid" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": { 502 | "hidden": true 503 | }, 504 | "source": [ 505 | "The curve used in logistic regression is the sigmoid function" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": { 511 | "hidden": true 512 | }, 513 | "source": [ 514 | "$$\n", 515 | "\\sigma(x) = \\frac{1}{1+e^{-x}}\n", 516 | "$$" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 3, 522 | "metadata": { 523 | "hidden": true 524 | }, 525 | "outputs": [ 526 | { 527 | "data": { 528 | "application/vnd.jupyter.widget-view+json": { 529 | "model_id": "96e4341cd3ff4793acd7c0d968a8a355", 530 | "version_major": 2, 531 | "version_minor": 0 532 | }, 533 | "text/plain": [ 534 | "FigureWidget({\n", 535 | " 'data': [{'type': 'scatter',\n", 536 | " 'uid': 'fc81bfbe-f9f8-419c-9923-4d127962d5e2',\n", 537 | " …" 538 | ] 539 | }, 540 | "metadata": {}, 541 | "output_type": "display_data" 542 | } 543 | ], 544 | "source": [ 545 | "sigmoid_fig = go.FigureWidget()\n", 546 | "demo_x = np.arange(-10,10,0.1)\n", 547 | "demo_y = 1 / (1 + np.exp(-demo_x))\n", 548 | "sigmoid_fig.add_scatter(\n", 549 | " x=demo_x,\n", 550 | " y=demo_y)\n", 551 | "sigmoid_fig.layout.title = 'Sigmoid Function'\n", 552 | "sigmoid_fig" 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": { 558 | "heading_collapsed": true, 559 | "hidden": true 560 | }, 561 | "source": [ 562 | "### Make fake data" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 4, 568 | "metadata": { 569 | "hidden": true 570 | }, 571 | "outputs": [], 572 | "source": [ 573 | "m = 100\n", 574 | "x0 = np.ones(shape=(m, 1))\n", 575 | "x1 = np.linspace(0, 10, m).reshape(-1, 1)\n", 576 | "X = np.column_stack((x0, x1))\n", 577 | "\n", 578 | "# let y = 0.5 * x + 1 + epsilon\n", 579 | "epsilon = np.random.normal(scale=2, size=(m, 1))\n", 580 | "y = x1 + epsilon\n", 581 | "y = (y > 5).astype(int)" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 6, 587 | "metadata": { 588 | "hidden": true 589 | }, 590 | "outputs": [ 591 | { 592 | "data": { 593 | "application/vnd.jupyter.widget-view+json": { 594 | "model_id": "5e928c2fe5a1497889e2502722076d7a", 595 | "version_major": 2, 596 | "version_minor": 0 597 | }, 598 | "text/plain": [ 599 | "FigureWidget({\n", 600 | " 'data': [{'mode': 'markers',\n", 601 | " 'name': 'linear data + noise',\n", 602 | " 'ty…" 603 | ] 604 | }, 605 | "metadata": {}, 606 | "output_type": "display_data" 607 | } 608 | ], 609 | "source": [ 610 | "fig = go.FigureWidget()\n", 611 | "fig = fig.add_scatter(\n", 612 | " x=X[:,1],\n", 613 | " y=y[:,0],\n", 614 | " mode='markers',\n", 615 | " name='linear data + noise')\n", 616 | "fig" 617 | ] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": { 622 | "heading_collapsed": true, 623 | "hidden": true 624 | }, 625 | "source": [ 626 | "### Logistic regression class" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 7, 632 | "metadata": { 633 | "hidden": true 634 | }, 635 | "outputs": [], 636 | "source": [ 637 | "import json\n", 638 | "\n", 639 | "import numpy as np\n", 640 | "\n", 641 | "\n", 642 | "class LogisticRegression():\n", 643 | "\n", 644 | " def __init__(self, learning_rate=0.05):\n", 645 | " \"\"\" \n", 646 | " Logistic regression model\n", 647 | "\n", 648 | " Parameters:\n", 649 | " ----------\n", 650 | " learning_rate: 
float, optional, default 0.05\n", 651 | "            The learning rate parameter controlling the gradient descent\n", 652 | "            step size\n", 653 | "        \"\"\"\n", 654 | "        self.learning_rate = learning_rate\n", 655 | "        print('Creating logistic model instance')\n", 656 | "\n", 657 | "    def __repr__(self):\n", 658 | "        return (\n", 659 | "            f'<LogisticRegression(learning_rate={self.learning_rate})>')\n", 661 | "\n", 662 | "    def fit(self, X, y, n_iter=1000):\n", 663 | "        \"\"\" \n", 664 | "        Fit the logistic regression model\n", 665 | "\n", 666 | "        Updates the weights with n_iter iterations of batch gradient\n", 667 | "        descent updates\n", 668 | "\n", 669 | "        Parameters:\n", 670 | "        ----------\n", 671 | "        X: numpy.ndarray\n", 672 | "            Training data, shape (m samples, (n - 1) features + 1)\n", 673 | "            Note the first column of X is expected to be ones (to allow \n", 674 | "            for the bias to be included in beta)\n", 675 | "        y: numpy.ndarray\n", 676 | "            Target values - class label {0, 1}, shape (m samples, 1)\n", 677 | "        n_iter: int, optional, default 1000\n", 678 | "            Number of batch gradient descent steps\n", 679 | "        \"\"\"\n", 680 | "        m, n = X.shape\n", 681 | "        print(f'fitting with m={m} samples with n={n-1} features\\n')\n", 682 | "        self.beta = np.zeros(shape=(n, 1))\n", 683 | "        self.costs = []\n", 684 | "        self.betas = [self.beta]\n", 685 | "        for iteration in range(n_iter):\n", 686 | "            y_pred = self.predict_proba(X)\n", 687 | "            cost = (-1 / m) * (\n", 688 | "                (y.T @ np.log(y_pred)) +\n", 689 | "                ((np.ones(shape=y.shape) - y).T @ np.log(\n", 690 | "                    np.ones(shape=y_pred.shape) - y_pred))\n", 691 | "            )\n", 692 | "            self.costs.append(cost[0][0])\n", 693 | "            gradient = (1 / m) * X.T @ (y_pred - y)\n", 694 | "            self.beta = self.beta - (\n", 695 | "                self.learning_rate * gradient)\n", 696 | "            self.betas.append(self.beta)\n", 697 | "\n", 698 | "    def predict_proba(self, X):\n", 699 | "        \"\"\" \n", 700 | "        Predicted probability values for class 1\n", 701 | "\n", 702 | "        Note this is calculated as the sigmoid of the linear combination\n", 703 | "        of the feature values and the weights.\n", 704 | "\n", 705 | "        Parameters:\n", 706 | "        ----------\n", 707 | "        X: numpy.ndarray\n", 708 | "            Training data, shape (m samples, (n - 1) features + 1)\n", 709 | "            Note the first column of X is expected to be ones (to allow \n", 710 | "            for the bias to be included in beta)\n", 711 | "\n", 712 | "        Returns:\n", 713 | "        -------\n", 714 | "        numpy.ndarray:\n", 715 | "            Predicted probability of samples being in class 1\n", 716 | "        \"\"\" \n", 717 | "        y_pred = self.sigmoid(X @ self.beta)\n", 718 | "        return y_pred\n", 719 | "\n", 720 | "    def predict(self, X, decision_prob=0.5):\n", 721 | "        \"\"\" \n", 722 | "        Predict the class values from sample X feature values\n", 723 | "\n", 724 | "        Parameters:\n", 725 | "        ----------\n", 726 | "        X: numpy.ndarray\n", 727 | "            Training data, shape (m samples, (n - 1) features + 1)\n", 728 | "            Note the first column of X is expected to be ones (to allow \n", 729 | "            for the bias to be included in beta)\n", 730 | "\n", 731 | "        Returns:\n", 732 | "        -------\n", 733 | "        numpy.ndarray:\n", 734 | "            Predicted class values, shape (m samples, 1)\n", 735 | "        \"\"\"\n", 736 | "        y_pred = self.sigmoid(X @ self.beta)\n", 737 | "        return (y_pred > decision_prob) * 1\n", 738 | "\n", 739 | "    def sigmoid(self, x):\n", 740 | "        \"\"\" \n", 741 | "        Sigmoid function\n", 742 | "\n", 743 | "        f(x) = 1 / (1 + e^(-x))\n", 744 | "\n", 745 | "        Parameters:\n", 746 | "        ----------\n", 747 | "        x: numpy.ndarray\n", 748 | "\n", 749 | "        Returns:\n", 750 | "        -------\n", 751 | "        numpy.ndarray:\n", 752 | "            sigmoid 
of x, values in (0, 1)\n", 753 | " \"\"\" \n", 754 | " return 1 / (1 + np.exp(-x))\n" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": 8, 760 | "metadata": { 761 | "hidden": true 762 | }, 763 | "outputs": [ 764 | { 765 | "name": "stdout", 766 | "output_type": "stream", 767 | "text": [ 768 | "Creating logistic model instance\n", 769 | "fitting with m=100 samples with n=1 features\n", 770 | "\n" 771 | ] 772 | } 773 | ], 774 | "source": [ 775 | "logistic_regression = LogisticRegression()\n", 776 | "logistic_regression.fit(X, y)" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 9, 782 | "metadata": { 783 | "hidden": true 784 | }, 785 | "outputs": [ 786 | { 787 | "data": { 788 | "text/plain": [ 789 | "array([[0],\n", 790 | " [0],\n", 791 | " [1]])" 792 | ] 793 | }, 794 | "execution_count": 9, 795 | "metadata": {}, 796 | "output_type": "execute_result" 797 | } 798 | ], 799 | "source": [ 800 | "example_X = np.array([[1,1],[1,4],[1,7]])\n", 801 | "logistic_regression.predict(example_X)" 802 | ] 803 | }, 804 | { 805 | "cell_type": "markdown", 806 | "metadata": { 807 | "heading_collapsed": true, 808 | "hidden": true 809 | }, 810 | "source": [ 811 | "### Plot the best fit" 812 | ] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": 10, 817 | "metadata": { 818 | "hidden": true 819 | }, 820 | "outputs": [ 821 | { 822 | "data": { 823 | "application/vnd.jupyter.widget-view+json": { 824 | "model_id": "5e928c2fe5a1497889e2502722076d7a", 825 | "version_major": 2, 826 | "version_minor": 0 827 | }, 828 | "text/plain": [ 829 | "FigureWidget({\n", 830 | " 'data': [{'mode': 'markers',\n", 831 | " 'name': 'linear data + noise',\n", 832 | " 'ty…" 833 | ] 834 | }, 835 | "metadata": {}, 836 | "output_type": "display_data" 837 | } 838 | ], 839 | "source": [ 840 | "fig = fig.add_scatter(\n", 841 | " x=X[:,1], \n", 842 | " y=logistic_regression.predict_proba(X)[:,0],\n", 843 | " mode='markers',\n", 844 | " name='logistic best fit')\n", 845 | "fig" 846 | ] 847 | }, 848 | { 849 | "cell_type": "markdown", 850 | "metadata": { 851 | "heading_collapsed": true, 852 | "hidden": true 853 | }, 854 | "source": [ 855 | "### Plot the cost function" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 11, 861 | "metadata": { 862 | "hidden": true 863 | }, 864 | "outputs": [], 865 | "source": [ 866 | "# Haven't got round to this yet - see linear regression for an example error \n", 867 | "# surface decent." 
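The cell above is left as a TODO in the notebook. A minimal sketch of what it could look like, mirroring the cost-by-iteration plot from the linear regression notebook (it assumes the fitted logistic_regression instance and the plotly graph_objects import from earlier):

# Sketch only - not part of the original notebook.
cost_fig = go.FigureWidget()
cost_fig = cost_fig.add_scatter(
    x=list(range(len(logistic_regression.costs))),
    y=logistic_regression.costs,
    mode='markers+lines')
cost_fig.layout.title = 'Logistic regression cost by iteration'
cost_fig.layout.xaxis.title = 'Iteration'
cost_fig.layout.yaxis.title = 'Cost'
cost_fig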
868 | ] 869 | }, 870 | { 871 | "cell_type": "markdown", 872 | "metadata": { 873 | "heading_collapsed": true 874 | }, 875 | "source": [ 876 | "## Logisitc regression - Titanic example" 877 | ] 878 | }, 879 | { 880 | "cell_type": "markdown", 881 | "metadata": { 882 | "heading_collapsed": true, 883 | "hidden": true 884 | }, 885 | "source": [ 886 | "### Load data" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 12, 892 | "metadata": { 893 | "hidden": true 894 | }, 895 | "outputs": [], 896 | "source": [ 897 | "X_train = pd.read_feather('../data/titanic/processed/X_train.feather')\n", 898 | "X_test = pd.read_feather('../data/titanic/processed/X_test.feather')\n", 899 | "y_train = pd.read_feather('../data/titanic/processed/y_train.feather')\n", 900 | "y_test = pd.read_feather('../data/titanic/processed/y_test.feather')" 901 | ] 902 | }, 903 | { 904 | "cell_type": "markdown", 905 | "metadata": { 906 | "heading_collapsed": true, 907 | "hidden": true 908 | }, 909 | "source": [ 910 | "### Train model" 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": 13, 916 | "metadata": { 917 | "hidden": true 918 | }, 919 | "outputs": [ 920 | { 921 | "name": "stdout", 922 | "output_type": "stream", 923 | "text": [ 924 | "Creating logistic model instance\n" 925 | ] 926 | } 927 | ], 928 | "source": [ 929 | "titanic_logistic_model = LogisticRegression()" 930 | ] 931 | }, 932 | { 933 | "cell_type": "code", 934 | "execution_count": 14, 935 | "metadata": { 936 | "hidden": true 937 | }, 938 | "outputs": [ 939 | { 940 | "name": "stdout", 941 | "output_type": "stream", 942 | "text": [ 943 | "fitting with m=712 samples with n=29 features\n", 944 | "\n" 945 | ] 946 | } 947 | ], 948 | "source": [ 949 | "titanic_logistic_model.fit(X=X_train.values, y=y_train.values, n_iter=4000)" 950 | ] 951 | }, 952 | { 953 | "cell_type": "markdown", 954 | "metadata": { 955 | "heading_collapsed": true, 956 | "hidden": true 957 | }, 958 | "source": [ 959 | "### Plot the cost" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": 16, 965 | "metadata": { 966 | "hidden": true 967 | }, 968 | "outputs": [ 969 | { 970 | "data": { 971 | "application/vnd.jupyter.widget-view+json": { 972 | "model_id": "3c6486736c164288b3a79a20965d1168", 973 | "version_major": 2, 974 | "version_minor": 0 975 | }, 976 | "text/plain": [ 977 | "FigureWidget({\n", 978 | " 'data': [{'type': 'scatter',\n", 979 | " 'uid': 'a3a5dfba-fddd-428e-87a9-4a7f4461bffc',\n", 980 | " …" 981 | ] 982 | }, 983 | "metadata": {}, 984 | "output_type": "display_data" 985 | } 986 | ], 987 | "source": [ 988 | "titanic_cost_fig = go.FigureWidget()\n", 989 | "\n", 990 | "titanic_cost_fig.add_scatter(\n", 991 | " x=list(range(len(titanic_logistic_model.costs))),\n", 992 | " y=titanic_logistic_model.costs,\n", 993 | ")\n", 994 | "\n", 995 | "titanic_cost_fig.layout.title = 'Cost Vs gradient descent iterations'\n", 996 | "titanic_cost_fig.layout.xaxis.title = 'Iterations'\n", 997 | "titanic_cost_fig.layout.yaxis.title = 'Cost'\n", 998 | "titanic_cost_fig" 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "markdown", 1003 | "metadata": { 1004 | "heading_collapsed": true, 1005 | "hidden": true 1006 | }, 1007 | "source": [ 1008 | "### Error analysis" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": 17, 1014 | "metadata": { 1015 | "hidden": true 1016 | }, 1017 | "outputs": [ 1018 | { 1019 | "name": "stdout", 1020 | "output_type": "stream", 1021 | "text": [ 1022 | "Test accuracy is with my 
implementation 79.89%\n" 1023 | ] 1024 | } 1025 | ], 1026 | "source": [ 1027 | "y_pred = titanic_logistic_model.predict(X_test.values)\n", 1028 | "test_accuracy = (y_pred == y_test.values).sum() / len(y_pred)\n", 1029 | "\n", 1030 | "print(f'Test accuracy is with my implementation {test_accuracy:.2%}')" 1031 | ] 1032 | }, 1033 | { 1034 | "cell_type": "markdown", 1035 | "metadata": { 1036 | "heading_collapsed": true 1037 | }, 1038 | "source": [ 1039 | "## End" 1040 | ] 1041 | } 1042 | ], 1043 | "metadata": { 1044 | "kernelspec": { 1045 | "display_name": "Python 3", 1046 | "language": "python", 1047 | "name": "python3" 1048 | }, 1049 | "language_info": { 1050 | "codemirror_mode": { 1051 | "name": "ipython", 1052 | "version": 3 1053 | }, 1054 | "file_extension": ".py", 1055 | "mimetype": "text/x-python", 1056 | "name": "python", 1057 | "nbconvert_exporter": "python", 1058 | "pygments_lexer": "ipython3", 1059 | "version": "3.7.6" 1060 | }, 1061 | "toc": { 1062 | "base_numbering": 1, 1063 | "nav_menu": {}, 1064 | "number_sections": true, 1065 | "sideBar": true, 1066 | "skip_h1_title": false, 1067 | "title_cell": "Table of Contents", 1068 | "title_sidebar": "Contents", 1069 | "toc_cell": false, 1070 | "toc_position": {}, 1071 | "toc_section_display": true, 1072 | "toc_window_display": false 1073 | } 1074 | }, 1075 | "nbformat": 4, 1076 | "nbformat_minor": 2 1077 | } 1078 | --------------------------------------------------------------------------------