├── .gitignore ├── LICENSE ├── README.md ├── data └── titanic │ ├── example_gender_submission.csv │ ├── processed │ ├── X_test.feather │ ├── X_train.feather │ ├── y_test.feather │ └── y_train.feather │ ├── submission_data.csv │ └── train.csv ├── machine_learning ├── __init__.py ├── decision_tree.py ├── gradient_boosted_decision_tree.py ├── knn.py ├── linear_regression.py ├── logistic_regression.py ├── neural_network.py ├── random_forest.py └── tree.py └── notebooks ├── decision_tree.ipynb ├── gradient_boosted_decision_tree.ipynb ├── knn.ipynb ├── linear_regression.ipynb ├── logistic_regression.ipynb ├── neural_network.ipynb ├── random_forest.ipynb └── sklearn_titanic_example.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # data volume 132 | data/mnist 133 | 134 | # ide 135 | .vscode/ 136 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Simon Ward-Jones 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine learning 2 | 3 | > Machine learning algorithm implementations and explanations 4 | 5 | ## Notebooks 6 | 7 | The notebooks contains derivations and explanations of each method followed by implementations and example usage on well know data sets 8 | 9 | ## machine_learning 10 | 11 | This contains the source code for each of the implementations (these are the same as in each notebook) 12 | 13 | ## Key algorithms covered 14 | 15 | - Linear regression 16 | - Logistic regression 17 | - Knn 18 | - Decision tree 19 | - Random forest 20 | - Gradient boosted decision tree 21 | - Neural network 22 | -------------------------------------------------------------------------------- /data/titanic/example_gender_submission.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,1 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,1 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,1 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,1 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 
| 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,1 368 | 1258,0 369 | 1259,1 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /data/titanic/processed/X_test.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonwardjones/machine_learning/1e92865bfe152acaf0df2df8f11a5f51833389a9/data/titanic/processed/X_test.feather -------------------------------------------------------------------------------- /data/titanic/processed/X_train.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonwardjones/machine_learning/1e92865bfe152acaf0df2df8f11a5f51833389a9/data/titanic/processed/X_train.feather -------------------------------------------------------------------------------- /data/titanic/processed/y_test.feather: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonwardjones/machine_learning/1e92865bfe152acaf0df2df8f11a5f51833389a9/data/titanic/processed/y_test.feather -------------------------------------------------------------------------------- /data/titanic/processed/y_train.feather: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonwardjones/machine_learning/1e92865bfe152acaf0df2df8f11a5f51833389a9/data/titanic/processed/y_train.feather -------------------------------------------------------------------------------- /data/titanic/submission_data.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 2 | 892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q 3 | 893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S 4 | 894,2,"Myles, Mr. Thomas Francis",male,62,0,0,240276,9.6875,,Q 5 | 895,3,"Wirz, Mr. Albert",male,27,0,0,315154,8.6625,,S 6 | 896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22,1,1,3101298,12.2875,,S 7 | 897,3,"Svensson, Mr. Johan Cervin",male,14,0,0,7538,9.225,,S 8 | 898,3,"Connolly, Miss. Kate",female,30,0,0,330972,7.6292,,Q 9 | 899,2,"Caldwell, Mr. Albert Francis",male,26,1,1,248738,29,,S 10 | 900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18,0,0,2657,7.2292,,C 11 | 901,3,"Davies, Mr. John Samuel",male,21,2,0,A/4 48871,24.15,,S 12 | 902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S 13 | 903,1,"Jones, Mr. Charles Cresson",male,46,0,0,694,26,,S 14 | 904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23,1,0,21228,82.2667,B45,S 15 | 905,2,"Howard, Mr. Benjamin",male,63,1,0,24065,26,,S 16 | 906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance Toogood)",female,47,1,0,W.E.P. 5734,61.175,E31,S 17 | 907,2,"del Carlo, Mrs. Sebastiano (Argenia Genovesi)",female,24,1,0,SC/PARIS 2167,27.7208,,C 18 | 908,2,"Keane, Mr. Daniel",male,35,0,0,233734,12.35,,Q 19 | 909,3,"Assaf, Mr. Gerios",male,21,0,0,2692,7.225,,C 20 | 910,3,"Ilmakangas, Miss. Ida Livija",female,27,1,0,STON/O2. 3101270,7.925,,S 21 | 911,3,"Assaf Khalil, Mrs. Mariana (Miriam"")""",female,45,0,0,2696,7.225,,C 22 | 912,1,"Rothschild, Mr. Martin",male,55,1,0,PC 17603,59.4,,C 23 | 913,3,"Olsen, Master. Artur Karl",male,9,0,1,C 17368,3.1708,,S 24 | 914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S 25 | 915,1,"Williams, Mr. Richard Norris II",male,21,0,1,PC 17597,61.3792,,C 26 | 916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48,1,3,PC 17608,262.375,B57 B59 B63 B66,C 27 | 917,3,"Robins, Mr. Alexander A",male,50,1,0,A/5. 3337,14.5,,S 28 | 918,1,"Ostby, Miss. Helene Ragnhild",female,22,0,1,113509,61.9792,B36,C 29 | 919,3,"Daher, Mr. Shedid",male,22.5,0,0,2698,7.225,,C 30 | 920,1,"Brady, Mr. John Bertram",male,41,0,0,113054,30.5,A21,S 31 | 921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C 32 | 922,2,"Louch, Mr. Charles Alexander",male,50,1,0,SC/AH 3085,26,,S 33 | 923,2,"Jefferys, Mr. Clifford Thomas",male,24,2,0,C.A. 31029,31.5,,S 34 | 924,3,"Dean, Mrs. Bertram (Eva Georgetta Light)",female,33,1,2,C.A. 2315,20.575,,S 35 | 925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.45,,S 36 | 926,1,"Mock, Mr. Philipp Edmund",male,30,1,0,13236,57.75,C78,C 37 | 927,3,"Katavelas, Mr. Vassilios (Catavelas Vassilios"")""",male,18.5,0,0,2682,7.2292,,C 38 | 928,3,"Roth, Miss. 
Sarah A",female,,0,0,342712,8.05,,S 39 | 929,3,"Cacic, Miss. Manda",female,21,0,0,315087,8.6625,,S 40 | 930,3,"Sap, Mr. Julius",male,25,0,0,345768,9.5,,S 41 | 931,3,"Hee, Mr. Ling",male,,0,0,1601,56.4958,,S 42 | 932,3,"Karun, Mr. Franz",male,39,0,1,349256,13.4167,,C 43 | 933,1,"Franklin, Mr. Thomas Parham",male,,0,0,113778,26.55,D34,S 44 | 934,3,"Goldsmith, Mr. Nathan",male,41,0,0,SOTON/O.Q. 3101263,7.85,,S 45 | 935,2,"Corbett, Mrs. Walter H (Irene Colvin)",female,30,0,0,237249,13,,S 46 | 936,1,"Kimball, Mrs. Edwin Nelson Jr (Gertrude Parsons)",female,45,1,0,11753,52.5542,D19,S 47 | 937,3,"Peltomaki, Mr. Nikolai Johannes",male,25,0,0,STON/O 2. 3101291,7.925,,S 48 | 938,1,"Chevre, Mr. Paul Romaine",male,45,0,0,PC 17594,29.7,A9,C 49 | 939,3,"Shaughnessy, Mr. Patrick",male,,0,0,370374,7.75,,Q 50 | 940,1,"Bucknell, Mrs. William Robert (Emma Eliza Ward)",female,60,0,0,11813,76.2917,D15,C 51 | 941,3,"Coutts, Mrs. William (Winnie Minnie"" Treanor)""",female,36,0,2,C.A. 37671,15.9,,S 52 | 942,1,"Smith, Mr. Lucien Philip",male,24,1,0,13695,60,C31,S 53 | 943,2,"Pulbaum, Mr. Franz",male,27,0,0,SC/PARIS 2168,15.0333,,C 54 | 944,2,"Hocking, Miss. Ellen Nellie""""",female,20,2,1,29105,23,,S 55 | 945,1,"Fortune, Miss. Ethel Flora",female,28,3,2,19950,263,C23 C25 C27,S 56 | 946,2,"Mangiavacchi, Mr. Serafino Emilio",male,,0,0,SC/A.3 2861,15.5792,,C 57 | 947,3,"Rice, Master. Albert",male,10,4,1,382652,29.125,,Q 58 | 948,3,"Cor, Mr. Bartol",male,35,0,0,349230,7.8958,,S 59 | 949,3,"Abelseth, Mr. Olaus Jorgensen",male,25,0,0,348122,7.65,F G63,S 60 | 950,3,"Davison, Mr. Thomas Henry",male,,1,0,386525,16.1,,S 61 | 951,1,"Chaudanson, Miss. Victorine",female,36,0,0,PC 17608,262.375,B61,C 62 | 952,3,"Dika, Mr. Mirko",male,17,0,0,349232,7.8958,,S 63 | 953,2,"McCrae, Mr. Arthur Gordon",male,32,0,0,237216,13.5,,S 64 | 954,3,"Bjorklund, Mr. Ernst Herbert",male,18,0,0,347090,7.75,,S 65 | 955,3,"Bradley, Miss. Bridget Delia",female,22,0,0,334914,7.725,,Q 66 | 956,1,"Ryerson, Master. John Borie",male,13,2,2,PC 17608,262.375,B57 B59 B63 B66,C 67 | 957,2,"Corey, Mrs. Percy C (Mary Phyllis Elizabeth Miller)",female,,0,0,F.C.C. 13534,21,,S 68 | 958,3,"Burns, Miss. Mary Delia",female,18,0,0,330963,7.8792,,Q 69 | 959,1,"Moore, Mr. Clarence Bloomfield",male,47,0,0,113796,42.4,,S 70 | 960,1,"Tucker, Mr. Gilbert Milligan Jr",male,31,0,0,2543,28.5375,C53,C 71 | 961,1,"Fortune, Mrs. Mark (Mary McDougald)",female,60,1,4,19950,263,C23 C25 C27,S 72 | 962,3,"Mulvihill, Miss. Bertha E",female,24,0,0,382653,7.75,,Q 73 | 963,3,"Minkoff, Mr. Lazar",male,21,0,0,349211,7.8958,,S 74 | 964,3,"Nieminen, Miss. Manta Josefina",female,29,0,0,3101297,7.925,,S 75 | 965,1,"Ovies y Rodriguez, Mr. Servando",male,28.5,0,0,PC 17562,27.7208,D43,C 76 | 966,1,"Geiger, Miss. Amalie",female,35,0,0,113503,211.5,C130,C 77 | 967,1,"Keeping, Mr. Edwin",male,32.5,0,0,113503,211.5,C132,C 78 | 968,3,"Miles, Mr. Frank",male,,0,0,359306,8.05,,S 79 | 969,1,"Cornell, Mrs. Robert Clifford (Malvina Helen Lamson)",female,55,2,0,11770,25.7,C101,S 80 | 970,2,"Aldworth, Mr. Charles Augustus",male,30,0,0,248744,13,,S 81 | 971,3,"Doyle, Miss. Elizabeth",female,24,0,0,368702,7.75,,Q 82 | 972,3,"Boulos, Master. Akar",male,6,1,1,2678,15.2458,,C 83 | 973,1,"Straus, Mr. Isidor",male,67,1,0,PC 17483,221.7792,C55 C57,S 84 | 974,1,"Case, Mr. Howard Brown",male,49,0,0,19924,26,,S 85 | 975,3,"Demetri, Mr. Marinko",male,,0,0,349238,7.8958,,S 86 | 976,2,"Lamb, Mr. John Joseph",male,,0,0,240261,10.7083,,Q 87 | 977,3,"Khalil, Mr. Betros",male,,1,0,2660,14.4542,,C 88 | 978,3,"Barry, Miss. 
Julia",female,27,0,0,330844,7.8792,,Q 89 | 979,3,"Badman, Miss. Emily Louisa",female,18,0,0,A/4 31416,8.05,,S 90 | 980,3,"O'Donoghue, Ms. Bridget",female,,0,0,364856,7.75,,Q 91 | 981,2,"Wells, Master. Ralph Lester",male,2,1,1,29103,23,,S 92 | 982,3,"Dyker, Mrs. Adolf Fredrik (Anna Elisabeth Judith Andersson)",female,22,1,0,347072,13.9,,S 93 | 983,3,"Pedersen, Mr. Olaf",male,,0,0,345498,7.775,,S 94 | 984,1,"Davidson, Mrs. Thornton (Orian Hays)",female,27,1,2,F.C. 12750,52,B71,S 95 | 985,3,"Guest, Mr. Robert",male,,0,0,376563,8.05,,S 96 | 986,1,"Birnbaum, Mr. Jakob",male,25,0,0,13905,26,,C 97 | 987,3,"Tenglin, Mr. Gunnar Isidor",male,25,0,0,350033,7.7958,,S 98 | 988,1,"Cavendish, Mrs. Tyrell William (Julia Florence Siegel)",female,76,1,0,19877,78.85,C46,S 99 | 989,3,"Makinen, Mr. Kalle Edvard",male,29,0,0,STON/O 2. 3101268,7.925,,S 100 | 990,3,"Braf, Miss. Elin Ester Maria",female,20,0,0,347471,7.8542,,S 101 | 991,3,"Nancarrow, Mr. William Henry",male,33,0,0,A./5. 3338,8.05,,S 102 | 992,1,"Stengel, Mrs. Charles Emil Henry (Annie May Morris)",female,43,1,0,11778,55.4417,C116,C 103 | 993,2,"Weisz, Mr. Leopold",male,27,1,0,228414,26,,S 104 | 994,3,"Foley, Mr. William",male,,0,0,365235,7.75,,Q 105 | 995,3,"Johansson Palmquist, Mr. Oskar Leander",male,26,0,0,347070,7.775,,S 106 | 996,3,"Thomas, Mrs. Alexander (Thamine Thelma"")""",female,16,1,1,2625,8.5167,,C 107 | 997,3,"Holthen, Mr. Johan Martin",male,28,0,0,C 4001,22.525,,S 108 | 998,3,"Buckley, Mr. Daniel",male,21,0,0,330920,7.8208,,Q 109 | 999,3,"Ryan, Mr. Edward",male,,0,0,383162,7.75,,Q 110 | 1000,3,"Willer, Mr. Aaron (Abi Weller"")""",male,,0,0,3410,8.7125,,S 111 | 1001,2,"Swane, Mr. George",male,18.5,0,0,248734,13,F,S 112 | 1002,2,"Stanton, Mr. Samuel Ward",male,41,0,0,237734,15.0458,,C 113 | 1003,3,"Shine, Miss. Ellen Natalia",female,,0,0,330968,7.7792,,Q 114 | 1004,1,"Evans, Miss. Edith Corse",female,36,0,0,PC 17531,31.6792,A29,C 115 | 1005,3,"Buckley, Miss. Katherine",female,18.5,0,0,329944,7.2833,,Q 116 | 1006,1,"Straus, Mrs. Isidor (Rosalie Ida Blun)",female,63,1,0,PC 17483,221.7792,C55 C57,S 117 | 1007,3,"Chronopoulos, Mr. Demetrios",male,18,1,0,2680,14.4542,,C 118 | 1008,3,"Thomas, Mr. John",male,,0,0,2681,6.4375,,C 119 | 1009,3,"Sandstrom, Miss. Beatrice Irene",female,1,1,1,PP 9549,16.7,G6,S 120 | 1010,1,"Beattie, Mr. Thomson",male,36,0,0,13050,75.2417,C6,C 121 | 1011,2,"Chapman, Mrs. John Henry (Sara Elizabeth Lawry)",female,29,1,0,SC/AH 29037,26,,S 122 | 1012,2,"Watt, Miss. Bertha J",female,12,0,0,C.A. 33595,15.75,,S 123 | 1013,3,"Kiernan, Mr. John",male,,1,0,367227,7.75,,Q 124 | 1014,1,"Schabert, Mrs. Paul (Emma Mock)",female,35,1,0,13236,57.75,C28,C 125 | 1015,3,"Carver, Mr. Alfred John",male,28,0,0,392095,7.25,,S 126 | 1016,3,"Kennedy, Mr. John",male,,0,0,368783,7.75,,Q 127 | 1017,3,"Cribb, Miss. Laura Alice",female,17,0,1,371362,16.1,,S 128 | 1018,3,"Brobeck, Mr. Karl Rudolf",male,22,0,0,350045,7.7958,,S 129 | 1019,3,"McCoy, Miss. Alicia",female,,2,0,367226,23.25,,Q 130 | 1020,2,"Bowenur, Mr. Solomon",male,42,0,0,211535,13,,S 131 | 1021,3,"Petersen, Mr. Marius",male,24,0,0,342441,8.05,,S 132 | 1022,3,"Spinner, Mr. Henry John",male,32,0,0,STON/OQ. 369943,8.05,,S 133 | 1023,1,"Gracie, Col. Archibald IV",male,53,0,0,113780,28.5,C51,C 134 | 1024,3,"Lefebre, Mrs. Frank (Frances)",female,,0,4,4133,25.4667,,S 135 | 1025,3,"Thomas, Mr. Charles P",male,,1,0,2621,6.4375,,C 136 | 1026,3,"Dintcheff, Mr. Valtcho",male,43,0,0,349226,7.8958,,S 137 | 1027,3,"Carlsson, Mr. 
Carl Robert",male,24,0,0,350409,7.8542,,S 138 | 1028,3,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C 139 | 1029,2,"Schmidt, Mr. August",male,26,0,0,248659,13,,S 140 | 1030,3,"Drapkin, Miss. Jennie",female,23,0,0,SOTON/OQ 392083,8.05,,S 141 | 1031,3,"Goodwin, Mr. Charles Frederick",male,40,1,6,CA 2144,46.9,,S 142 | 1032,3,"Goodwin, Miss. Jessie Allis",female,10,5,2,CA 2144,46.9,,S 143 | 1033,1,"Daniels, Miss. Sarah",female,33,0,0,113781,151.55,,S 144 | 1034,1,"Ryerson, Mr. Arthur Larned",male,61,1,3,PC 17608,262.375,B57 B59 B63 B66,C 145 | 1035,2,"Beauchamp, Mr. Henry James",male,28,0,0,244358,26,,S 146 | 1036,1,"Lindeberg-Lind, Mr. Erik Gustaf (Mr Edward Lingrey"")""",male,42,0,0,17475,26.55,,S 147 | 1037,3,"Vander Planke, Mr. Julius",male,31,3,0,345763,18,,S 148 | 1038,1,"Hilliard, Mr. Herbert Henry",male,,0,0,17463,51.8625,E46,S 149 | 1039,3,"Davies, Mr. Evan",male,22,0,0,SC/A4 23568,8.05,,S 150 | 1040,1,"Crafton, Mr. John Bertram",male,,0,0,113791,26.55,,S 151 | 1041,2,"Lahtinen, Rev. William",male,30,1,1,250651,26,,S 152 | 1042,1,"Earnshaw, Mrs. Boulton (Olive Potter)",female,23,0,1,11767,83.1583,C54,C 153 | 1043,3,"Matinoff, Mr. Nicola",male,,0,0,349255,7.8958,,C 154 | 1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S 155 | 1045,3,"Klasen, Mrs. (Hulda Kristina Eugenia Lofqvist)",female,36,0,2,350405,12.1833,,S 156 | 1046,3,"Asplund, Master. Filip Oscar",male,13,4,2,347077,31.3875,,S 157 | 1047,3,"Duquemin, Mr. Joseph",male,24,0,0,S.O./P.P. 752,7.55,,S 158 | 1048,1,"Bird, Miss. Ellen",female,29,0,0,PC 17483,221.7792,C97,S 159 | 1049,3,"Lundin, Miss. Olga Elida",female,23,0,0,347469,7.8542,,S 160 | 1050,1,"Borebank, Mr. John James",male,42,0,0,110489,26.55,D22,S 161 | 1051,3,"Peacock, Mrs. Benjamin (Edith Nile)",female,26,0,2,SOTON/O.Q. 3101315,13.775,,S 162 | 1052,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q 163 | 1053,3,"Touma, Master. Georges Youssef",male,7,1,1,2650,15.2458,,C 164 | 1054,2,"Wright, Miss. Marion",female,26,0,0,220844,13.5,,S 165 | 1055,3,"Pearce, Mr. Ernest",male,,0,0,343271,7,,S 166 | 1056,2,"Peruschitz, Rev. Joseph Maria",male,41,0,0,237393,13,,S 167 | 1057,3,"Kink-Heilmann, Mrs. Anton (Luise Heilmann)",female,26,1,1,315153,22.025,,S 168 | 1058,1,"Brandeis, Mr. Emil",male,48,0,0,PC 17591,50.4958,B10,C 169 | 1059,3,"Ford, Mr. Edward Watson",male,18,2,2,W./C. 6608,34.375,,S 170 | 1060,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genevieve Fosdick)",female,,0,0,17770,27.7208,,C 171 | 1061,3,"Hellstrom, Miss. Hilda Maria",female,22,0,0,7548,8.9625,,S 172 | 1062,3,"Lithman, Mr. Simon",male,,0,0,S.O./P.P. 251,7.55,,S 173 | 1063,3,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,,C 174 | 1064,3,"Dyker, Mr. Adolf Fredrik",male,23,1,0,347072,13.9,,S 175 | 1065,3,"Torfa, Mr. Assad",male,,0,0,2673,7.2292,,C 176 | 1066,3,"Asplund, Mr. Carl Oscar Vilhelm Gustafsson",male,40,1,5,347077,31.3875,,S 177 | 1067,2,"Brown, Miss. Edith Eileen",female,15,0,2,29750,39,,S 178 | 1068,2,"Sincock, Miss. Maude",female,20,0,0,C.A. 33112,36.75,,S 179 | 1069,1,"Stengel, Mr. Charles Emil Henry",male,54,1,0,11778,55.4417,C116,C 180 | 1070,2,"Becker, Mrs. Allen Oliver (Nellie E Baumgardner)",female,36,0,3,230136,39,F4,S 181 | 1071,1,"Compton, Mrs. Alexander Taylor (Mary Eliza Ingersoll)",female,64,0,2,PC 17756,83.1583,E45,C 182 | 1072,2,"McCrie, Mr. James Matthew",male,30,0,0,233478,13,,S 183 | 1073,1,"Compton, Mr. Alexander Taylor Jr",male,37,1,1,PC 17756,83.1583,E52,C 184 | 1074,1,"Marvin, Mrs. 
Daniel Warner (Mary Graham Carmichael Farquarson)",female,18,1,0,113773,53.1,D30,S 185 | 1075,3,"Lane, Mr. Patrick",male,,0,0,7935,7.75,,Q 186 | 1076,1,"Douglas, Mrs. Frederick Charles (Mary Helene Baxter)",female,27,1,1,PC 17558,247.5208,B58 B60,C 187 | 1077,2,"Maybery, Mr. Frank Hubert",male,40,0,0,239059,16,,S 188 | 1078,2,"Phillips, Miss. Alice Frances Louisa",female,21,0,1,S.O./P.P. 2,21,,S 189 | 1079,3,"Davies, Mr. Joseph",male,17,2,0,A/4 48873,8.05,,S 190 | 1080,3,"Sage, Miss. Ada",female,,8,2,CA. 2343,69.55,,S 191 | 1081,2,"Veal, Mr. James",male,40,0,0,28221,13,,S 192 | 1082,2,"Angle, Mr. William A",male,34,1,0,226875,26,,S 193 | 1083,1,"Salomon, Mr. Abraham L",male,,0,0,111163,26,,S 194 | 1084,3,"van Billiard, Master. Walter John",male,11.5,1,1,A/5. 851,14.5,,S 195 | 1085,2,"Lingane, Mr. John",male,61,0,0,235509,12.35,,Q 196 | 1086,2,"Drew, Master. Marshall Brines",male,8,0,2,28220,32.5,,S 197 | 1087,3,"Karlsson, Mr. Julius Konrad Eugen",male,33,0,0,347465,7.8542,,S 198 | 1088,1,"Spedden, Master. Robert Douglas",male,6,0,2,16966,134.5,E34,C 199 | 1089,3,"Nilsson, Miss. Berta Olivia",female,18,0,0,347066,7.775,,S 200 | 1090,2,"Baimbrigge, Mr. Charles Robert",male,23,0,0,C.A. 31030,10.5,,S 201 | 1091,3,"Rasmussen, Mrs. (Lena Jacobsen Solvang)",female,,0,0,65305,8.1125,,S 202 | 1092,3,"Murphy, Miss. Nora",female,,0,0,36568,15.5,,Q 203 | 1093,3,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.33,0,2,347080,14.4,,S 204 | 1094,1,"Astor, Col. John Jacob",male,47,1,0,PC 17757,227.525,C62 C64,C 205 | 1095,2,"Quick, Miss. Winifred Vera",female,8,1,1,26360,26,,S 206 | 1096,2,"Andrew, Mr. Frank Thomas",male,25,0,0,C.A. 34050,10.5,,S 207 | 1097,1,"Omont, Mr. Alfred Fernand",male,,0,0,F.C. 12998,25.7417,,C 208 | 1098,3,"McGowan, Miss. Katherine",female,35,0,0,9232,7.75,,Q 209 | 1099,2,"Collett, Mr. Sidney C Stuart",male,24,0,0,28034,10.5,,S 210 | 1100,1,"Rosenbaum, Miss. Edith Louise",female,33,0,0,PC 17613,27.7208,A11,C 211 | 1101,3,"Delalic, Mr. Redjo",male,25,0,0,349250,7.8958,,S 212 | 1102,3,"Andersen, Mr. Albert Karvin",male,32,0,0,C 4001,22.525,,S 213 | 1103,3,"Finoli, Mr. Luigi",male,,0,0,SOTON/O.Q. 3101308,7.05,,S 214 | 1104,2,"Deacon, Mr. Percy William",male,17,0,0,S.O.C. 14879,73.5,,S 215 | 1105,2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60,1,0,24065,26,,S 216 | 1106,3,"Andersson, Miss. Ida Augusta Margareta",female,38,4,2,347091,7.775,,S 217 | 1107,1,"Head, Mr. Christopher",male,42,0,0,113038,42.5,B11,S 218 | 1108,3,"Mahon, Miss. Bridget Delia",female,,0,0,330924,7.8792,,Q 219 | 1109,1,"Wick, Mr. George Dennick",male,57,1,1,36928,164.8667,,S 220 | 1110,1,"Widener, Mrs. George Dunton (Eleanor Elkins)",female,50,1,1,113503,211.5,C80,C 221 | 1111,3,"Thomson, Mr. Alexander Morrison",male,,0,0,32302,8.05,,S 222 | 1112,2,"Duran y More, Miss. Florentina",female,30,1,0,SC/PARIS 2148,13.8583,,C 223 | 1113,3,"Reynolds, Mr. Harold J",male,21,0,0,342684,8.05,,S 224 | 1114,2,"Cook, Mrs. (Selena Rogers)",female,22,0,0,W./C. 14266,10.5,F33,S 225 | 1115,3,"Karlsson, Mr. Einar Gervasius",male,21,0,0,350053,7.7958,,S 226 | 1116,1,"Candee, Mrs. Edward (Helen Churchill Hungerford)",female,53,0,0,PC 17606,27.4458,,C 227 | 1117,3,"Moubarek, Mrs. George (Omine Amenia"" Alexander)""",female,,0,2,2661,15.2458,,C 228 | 1118,3,"Asplund, Mr. Johan Charles",male,23,0,0,350054,7.7958,,S 229 | 1119,3,"McNeill, Miss. Bridget",female,,0,0,370368,7.75,,Q 230 | 1120,3,"Everett, Mr. Thomas James",male,40.5,0,0,C.A. 6212,15.1,,S 231 | 1121,2,"Hocking, Mr. 
Samuel James Metcalfe",male,36,0,0,242963,13,,S 232 | 1122,2,"Sweet, Mr. George Frederick",male,14,0,0,220845,65,,S 233 | 1123,1,"Willard, Miss. Constance",female,21,0,0,113795,26.55,,S 234 | 1124,3,"Wiklund, Mr. Karl Johan",male,21,1,0,3101266,6.4958,,S 235 | 1125,3,"Linehan, Mr. Michael",male,,0,0,330971,7.8792,,Q 236 | 1126,1,"Cumings, Mr. John Bradley",male,39,1,0,PC 17599,71.2833,C85,C 237 | 1127,3,"Vendel, Mr. Olof Edvin",male,20,0,0,350416,7.8542,,S 238 | 1128,1,"Warren, Mr. Frank Manley",male,64,1,0,110813,75.25,D37,C 239 | 1129,3,"Baccos, Mr. Raffull",male,20,0,0,2679,7.225,,C 240 | 1130,2,"Hiltunen, Miss. Marta",female,18,1,1,250650,13,,S 241 | 1131,1,"Douglas, Mrs. Walter Donald (Mahala Dutton)",female,48,1,0,PC 17761,106.425,C86,C 242 | 1132,1,"Lindstrom, Mrs. Carl Johan (Sigrid Posse)",female,55,0,0,112377,27.7208,,C 243 | 1133,2,"Christy, Mrs. (Alice Frances)",female,45,0,2,237789,30,,S 244 | 1134,1,"Spedden, Mr. Frederic Oakley",male,45,1,1,16966,134.5,E34,C 245 | 1135,3,"Hyman, Mr. Abraham",male,,0,0,3470,7.8875,,S 246 | 1136,3,"Johnston, Master. William Arthur Willie""""",male,,1,2,W./C. 6607,23.45,,S 247 | 1137,1,"Kenyon, Mr. Frederick R",male,41,1,0,17464,51.8625,D21,S 248 | 1138,2,"Karnes, Mrs. J Frank (Claire Bennett)",female,22,0,0,F.C.C. 13534,21,,S 249 | 1139,2,"Drew, Mr. James Vivian",male,42,1,1,28220,32.5,,S 250 | 1140,2,"Hold, Mrs. Stephen (Annie Margaret Hill)",female,29,1,0,26707,26,,S 251 | 1141,3,"Khalil, Mrs. Betros (Zahie Maria"" Elias)""",female,,1,0,2660,14.4542,,C 252 | 1142,2,"West, Miss. Barbara J",female,0.92,1,2,C.A. 34651,27.75,,S 253 | 1143,3,"Abrahamsson, Mr. Abraham August Johannes",male,20,0,0,SOTON/O2 3101284,7.925,,S 254 | 1144,1,"Clark, Mr. Walter Miller",male,27,1,0,13508,136.7792,C89,C 255 | 1145,3,"Salander, Mr. Karl Johan",male,24,0,0,7266,9.325,,S 256 | 1146,3,"Wenzel, Mr. Linhart",male,32.5,0,0,345775,9.5,,S 257 | 1147,3,"MacKay, Mr. George William",male,,0,0,C.A. 42795,7.55,,S 258 | 1148,3,"Mahon, Mr. John",male,,0,0,AQ/4 3130,7.75,,Q 259 | 1149,3,"Niklasson, Mr. Samuel",male,28,0,0,363611,8.05,,S 260 | 1150,2,"Bentham, Miss. Lilian W",female,19,0,0,28404,13,,S 261 | 1151,3,"Midtsjo, Mr. Karl Albert",male,21,0,0,345501,7.775,,S 262 | 1152,3,"de Messemaeker, Mr. Guillaume Joseph",male,36.5,1,0,345572,17.4,,S 263 | 1153,3,"Nilsson, Mr. August Ferdinand",male,21,0,0,350410,7.8542,,S 264 | 1154,2,"Wells, Mrs. Arthur Henry (Addie"" Dart Trevaskis)""",female,29,0,2,29103,23,,S 265 | 1155,3,"Klasen, Miss. Gertrud Emilia",female,1,1,1,350405,12.1833,,S 266 | 1156,2,"Portaluppi, Mr. Emilio Ilario Giuseppe",male,30,0,0,C.A. 34644,12.7375,,C 267 | 1157,3,"Lyntakoff, Mr. Stanko",male,,0,0,349235,7.8958,,S 268 | 1158,1,"Chisholm, Mr. Roderick Robert Crispin",male,,0,0,112051,0,,S 269 | 1159,3,"Warren, Mr. Charles William",male,,0,0,C.A. 49867,7.55,,S 270 | 1160,3,"Howard, Miss. May Elizabeth",female,,0,0,A. 2. 39186,8.05,,S 271 | 1161,3,"Pokrnic, Mr. Mate",male,17,0,0,315095,8.6625,,S 272 | 1162,1,"McCaffry, Mr. Thomas Francis",male,46,0,0,13050,75.2417,C6,C 273 | 1163,3,"Fox, Mr. Patrick",male,,0,0,368573,7.75,,Q 274 | 1164,1,"Clark, Mrs. Walter Miller (Virginia McDowell)",female,26,1,0,13508,136.7792,C89,C 275 | 1165,3,"Lennon, Miss. Mary",female,,1,0,370371,15.5,,Q 276 | 1166,3,"Saade, Mr. Jean Nassr",male,,0,0,2676,7.225,,C 277 | 1167,2,"Bryhl, Miss. Dagmar Jenny Ingeborg ",female,20,1,0,236853,26,,S 278 | 1168,2,"Parker, Mr. Clifford Richard",male,28,0,0,SC 14888,10.5,,S 279 | 1169,2,"Faunthorpe, Mr. 
Harry",male,40,1,0,2926,26,,S 280 | 1170,2,"Ware, Mr. John James",male,30,1,0,CA 31352,21,,S 281 | 1171,2,"Oxenham, Mr. Percy Thomas",male,22,0,0,W./C. 14260,10.5,,S 282 | 1172,3,"Oreskovic, Miss. Jelka",female,23,0,0,315085,8.6625,,S 283 | 1173,3,"Peacock, Master. Alfred Edward",male,0.75,1,1,SOTON/O.Q. 3101315,13.775,,S 284 | 1174,3,"Fleming, Miss. Honora",female,,0,0,364859,7.75,,Q 285 | 1175,3,"Touma, Miss. Maria Youssef",female,9,1,1,2650,15.2458,,C 286 | 1176,3,"Rosblom, Miss. Salli Helena",female,2,1,1,370129,20.2125,,S 287 | 1177,3,"Dennis, Mr. William",male,36,0,0,A/5 21175,7.25,,S 288 | 1178,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S 289 | 1179,1,"Snyder, Mr. John Pillsbury",male,24,1,0,21228,82.2667,B45,S 290 | 1180,3,"Mardirosian, Mr. Sarkis",male,,0,0,2655,7.2292,F E46,C 291 | 1181,3,"Ford, Mr. Arthur",male,,0,0,A/5 1478,8.05,,S 292 | 1182,1,"Rheims, Mr. George Alexander Lucien",male,,0,0,PC 17607,39.6,,S 293 | 1183,3,"Daly, Miss. Margaret Marcella Maggie""""",female,30,0,0,382650,6.95,,Q 294 | 1184,3,"Nasr, Mr. Mustafa",male,,0,0,2652,7.2292,,C 295 | 1185,1,"Dodge, Dr. Washington",male,53,1,1,33638,81.8583,A34,S 296 | 1186,3,"Wittevrongel, Mr. Camille",male,36,0,0,345771,9.5,,S 297 | 1187,3,"Angheloff, Mr. Minko",male,26,0,0,349202,7.8958,,S 298 | 1188,2,"Laroche, Miss. Louise",female,1,1,2,SC/Paris 2123,41.5792,,C 299 | 1189,3,"Samaan, Mr. Hanna",male,,2,0,2662,21.6792,,C 300 | 1190,1,"Loring, Mr. Joseph Holland",male,30,0,0,113801,45.5,,S 301 | 1191,3,"Johansson, Mr. Nils",male,29,0,0,347467,7.8542,,S 302 | 1192,3,"Olsson, Mr. Oscar Wilhelm",male,32,0,0,347079,7.775,,S 303 | 1193,2,"Malachard, Mr. Noel",male,,0,0,237735,15.0458,D,C 304 | 1194,2,"Phillips, Mr. Escott Robert",male,43,0,1,S.O./P.P. 2,21,,S 305 | 1195,3,"Pokrnic, Mr. Tome",male,24,0,0,315092,8.6625,,S 306 | 1196,3,"McCarthy, Miss. Catherine Katie""""",female,,0,0,383123,7.75,,Q 307 | 1197,1,"Crosby, Mrs. Edward Gifford (Catherine Elizabeth Halstead)",female,64,1,1,112901,26.55,B26,S 308 | 1198,1,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S 309 | 1199,3,"Aks, Master. Philip Frank",male,0.83,0,1,392091,9.35,,S 310 | 1200,1,"Hays, Mr. Charles Melville",male,55,1,1,12749,93.5,B69,S 311 | 1201,3,"Hansen, Mrs. Claus Peter (Jennie L Howard)",female,45,1,0,350026,14.1083,,S 312 | 1202,3,"Cacic, Mr. Jego Grga",male,18,0,0,315091,8.6625,,S 313 | 1203,3,"Vartanian, Mr. David",male,22,0,0,2658,7.225,,C 314 | 1204,3,"Sadowitz, Mr. Harry",male,,0,0,LP 1588,7.575,,S 315 | 1205,3,"Carr, Miss. Jeannie",female,37,0,0,368364,7.75,,Q 316 | 1206,1,"White, Mrs. John Stuart (Ella Holmes)",female,55,0,0,PC 17760,135.6333,C32,C 317 | 1207,3,"Hagardon, Miss. Kate",female,17,0,0,AQ/3. 30631,7.7333,,Q 318 | 1208,1,"Spencer, Mr. William Augustus",male,57,1,0,PC 17569,146.5208,B78,C 319 | 1209,2,"Rogers, Mr. Reginald Harry",male,19,0,0,28004,10.5,,S 320 | 1210,3,"Jonsson, Mr. Nils Hilding",male,27,0,0,350408,7.8542,,S 321 | 1211,2,"Jefferys, Mr. Ernest Wilfred",male,22,2,0,C.A. 31029,31.5,,S 322 | 1212,3,"Andersson, Mr. Johan Samuel",male,26,0,0,347075,7.775,,S 323 | 1213,3,"Krekorian, Mr. Neshan",male,25,0,0,2654,7.2292,F E57,C 324 | 1214,2,"Nesson, Mr. Israel",male,26,0,0,244368,13,F2,S 325 | 1215,1,"Rowe, Mr. Alfred G",male,33,0,0,113790,26.55,,S 326 | 1216,1,"Kreuchen, Miss. Emilie",female,39,0,0,24160,211.3375,,S 327 | 1217,3,"Assam, Mr. Ali",male,23,0,0,SOTON/O.Q. 3101309,7.05,,S 328 | 1218,2,"Becker, Miss. 
Ruth Elizabeth",female,12,2,1,230136,39,F4,S 329 | 1219,1,"Rosenshine, Mr. George (Mr George Thorne"")""",male,46,0,0,PC 17585,79.2,,C 330 | 1220,2,"Clarke, Mr. Charles Valentine",male,29,1,0,2003,26,,S 331 | 1221,2,"Enander, Mr. Ingvar",male,21,0,0,236854,13,,S 332 | 1222,2,"Davies, Mrs. John Morgan (Elizabeth Agnes Mary White) ",female,48,0,2,C.A. 33112,36.75,,S 333 | 1223,1,"Dulles, Mr. William Crothers",male,39,0,0,PC 17580,29.7,A18,C 334 | 1224,3,"Thomas, Mr. Tannous",male,,0,0,2684,7.225,,C 335 | 1225,3,"Nakid, Mrs. Said (Waika Mary"" Mowad)""",female,19,1,1,2653,15.7417,,C 336 | 1226,3,"Cor, Mr. Ivan",male,27,0,0,349229,7.8958,,S 337 | 1227,1,"Maguire, Mr. John Edward",male,30,0,0,110469,26,C106,S 338 | 1228,2,"de Brito, Mr. Jose Joaquim",male,32,0,0,244360,13,,S 339 | 1229,3,"Elias, Mr. Joseph",male,39,0,2,2675,7.2292,,C 340 | 1230,2,"Denbury, Mr. Herbert",male,25,0,0,C.A. 31029,31.5,,S 341 | 1231,3,"Betros, Master. Seman",male,,0,0,2622,7.2292,,C 342 | 1232,2,"Fillbrook, Mr. Joseph Charles",male,18,0,0,C.A. 15185,10.5,,S 343 | 1233,3,"Lundstrom, Mr. Thure Edvin",male,32,0,0,350403,7.5792,,S 344 | 1234,3,"Sage, Mr. John George",male,,1,9,CA. 2343,69.55,,S 345 | 1235,1,"Cardeza, Mrs. James Warburton Martinez (Charlotte Wardle Drake)",female,58,0,1,PC 17755,512.3292,B51 B53 B55,C 346 | 1236,3,"van Billiard, Master. James William",male,,1,1,A/5. 851,14.5,,S 347 | 1237,3,"Abelseth, Miss. Karen Marie",female,16,0,0,348125,7.65,,S 348 | 1238,2,"Botsford, Mr. William Hull",male,26,0,0,237670,13,,S 349 | 1239,3,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38,0,0,2688,7.2292,,C 350 | 1240,2,"Giles, Mr. Ralph",male,24,0,0,248726,13.5,,S 351 | 1241,2,"Walcroft, Miss. Nellie",female,31,0,0,F.C.C. 13528,21,,S 352 | 1242,1,"Greenfield, Mrs. Leo David (Blanche Strouse)",female,45,0,1,PC 17759,63.3583,D10 D12,C 353 | 1243,2,"Stokes, Mr. Philip Joseph",male,25,0,0,F.C.C. 13540,10.5,,S 354 | 1244,2,"Dibden, Mr. William",male,18,0,0,S.O.C. 14879,73.5,,S 355 | 1245,2,"Herman, Mr. Samuel",male,49,1,2,220845,65,,S 356 | 1246,3,"Dean, Miss. Elizabeth Gladys Millvina""""",female,0.17,1,2,C.A. 2315,20.575,,S 357 | 1247,1,"Julian, Mr. Henry Forbes",male,50,0,0,113044,26,E60,S 358 | 1248,1,"Brown, Mrs. John Murray (Caroline Lane Lamson)",female,59,2,0,11769,51.4792,C101,S 359 | 1249,3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S 360 | 1250,3,"O'Keefe, Mr. Patrick",male,,0,0,368402,7.75,,Q 361 | 1251,3,"Lindell, Mrs. Edvard Bengtsson (Elin Gerda Persson)",female,30,1,0,349910,15.55,,S 362 | 1252,3,"Sage, Master. William Henry",male,14.5,8,2,CA. 2343,69.55,,S 363 | 1253,2,"Mallet, Mrs. Albert (Antoinette Magnin)",female,24,1,1,S.C./PARIS 2079,37.0042,,C 364 | 1254,2,"Ware, Mrs. John James (Florence Louise Long)",female,31,0,0,CA 31352,21,,S 365 | 1255,3,"Strilic, Mr. Ivan",male,27,0,0,315083,8.6625,,S 366 | 1256,1,"Harder, Mrs. George Achilles (Dorothy Annan)",female,25,1,0,11765,55.4417,E50,C 367 | 1257,3,"Sage, Mrs. John (Annie Bullen)",female,,1,9,CA. 2343,69.55,,S 368 | 1258,3,"Caram, Mr. Joseph",male,,1,0,2689,14.4583,,C 369 | 1259,3,"Riihivouri, Miss. Susanna Juhantytar Sanni""""",female,22,0,0,3101295,39.6875,,S 370 | 1260,1,"Gibson, Mrs. Leonard (Pauline C Boeson)",female,45,0,1,112378,59.4,,C 371 | 1261,2,"Pallas y Castello, Mr. Emilio",male,29,0,0,SC/PARIS 2147,13.8583,,C 372 | 1262,2,"Giles, Mr. Edgar",male,21,1,0,28133,11.5,,S 373 | 1263,1,"Wilson, Miss. Helen Alice",female,31,0,0,16966,134.5,E39 E41,C 374 | 1264,1,"Ismay, Mr. 
Joseph Bruce",male,49,0,0,112058,0,B52 B54 B56,S 375 | 1265,2,"Harbeck, Mr. William H",male,44,0,0,248746,13,,S 376 | 1266,1,"Dodge, Mrs. Washington (Ruth Vidaver)",female,54,1,1,33638,81.8583,A34,S 377 | 1267,1,"Bowen, Miss. Grace Scott",female,45,0,0,PC 17608,262.375,,C 378 | 1268,3,"Kink, Miss. Maria",female,22,2,0,315152,8.6625,,S 379 | 1269,2,"Cotterill, Mr. Henry Harry""""",male,21,0,0,29107,11.5,,S 380 | 1270,1,"Hipkins, Mr. William Edward",male,55,0,0,680,50,C39,S 381 | 1271,3,"Asplund, Master. Carl Edgar",male,5,4,2,347077,31.3875,,S 382 | 1272,3,"O'Connor, Mr. Patrick",male,,0,0,366713,7.75,,Q 383 | 1273,3,"Foley, Mr. Joseph",male,26,0,0,330910,7.8792,,Q 384 | 1274,3,"Risien, Mrs. Samuel (Emma)",female,,0,0,364498,14.5,,S 385 | 1275,3,"McNamee, Mrs. Neal (Eileen O'Leary)",female,19,1,0,376566,16.1,,S 386 | 1276,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S 387 | 1277,2,"Herman, Miss. Kate",female,24,1,2,220845,65,,S 388 | 1278,3,"Aronsson, Mr. Ernst Axel Algot",male,24,0,0,349911,7.775,,S 389 | 1279,2,"Ashby, Mr. John",male,57,0,0,244346,13,,S 390 | 1280,3,"Canavan, Mr. Patrick",male,21,0,0,364858,7.75,,Q 391 | 1281,3,"Palsson, Master. Paul Folke",male,6,3,1,349909,21.075,,S 392 | 1282,1,"Payne, Mr. Vivian Ponsonby",male,23,0,0,12749,93.5,B24,S 393 | 1283,1,"Lines, Mrs. Ernest H (Elizabeth Lindsey James)",female,51,0,1,PC 17592,39.4,D28,S 394 | 1284,3,"Abbott, Master. Eugene Joseph",male,13,0,2,C.A. 2673,20.25,,S 395 | 1285,2,"Gilbert, Mr. William",male,47,0,0,C.A. 30769,10.5,,S 396 | 1286,3,"Kink-Heilmann, Mr. Anton",male,29,3,1,315153,22.025,,S 397 | 1287,1,"Smith, Mrs. Lucien Philip (Mary Eloise Hughes)",female,18,1,0,13695,60,C31,S 398 | 1288,3,"Colbert, Mr. Patrick",male,24,0,0,371109,7.25,,Q 399 | 1289,1,"Frolicher-Stehli, Mrs. Maxmillian (Margaretha Emerentia Stehli)",female,48,1,1,13567,79.2,B41,C 400 | 1290,3,"Larsson-Rondberg, Mr. Edvard A",male,22,0,0,347065,7.775,,S 401 | 1291,3,"Conlon, Mr. Thomas Henry",male,31,0,0,21332,7.7333,,Q 402 | 1292,1,"Bonnell, Miss. Caroline",female,30,0,0,36928,164.8667,C7,S 403 | 1293,2,"Gale, Mr. Harry",male,38,1,0,28664,21,,S 404 | 1294,1,"Gibson, Miss. Dorothy Winifred",female,22,0,1,112378,59.4,,C 405 | 1295,1,"Carrau, Mr. Jose Pedro",male,17,0,0,113059,47.1,,S 406 | 1296,1,"Frauenthal, Mr. Isaac Gerald",male,43,1,0,17765,27.7208,D40,C 407 | 1297,2,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,20,0,0,SC/PARIS 2166,13.8625,D38,C 408 | 1298,2,"Ware, Mr. William Jeffery",male,23,1,0,28666,10.5,,S 409 | 1299,1,"Widener, Mr. George Dunton",male,50,1,1,113503,211.5,C80,C 410 | 1300,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q 411 | 1301,3,"Peacock, Miss. Treasteall",female,3,1,1,SOTON/O.Q. 3101315,13.775,,S 412 | 1302,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.75,,Q 413 | 1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37,1,0,19928,90,C78,Q 414 | 1304,3,"Henriksson, Miss. Jenny Lovisa",female,28,0,0,347086,7.775,,S 415 | 1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S 416 | 1306,1,"Oliva y Ocana, Dona. Fermina",female,39,0,0,PC 17758,108.9,C105,C 417 | 1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S 418 | 1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S 419 | 1309,3,"Peter, Master. 
Michael J",male,,1,1,2668,22.3583,,C 420 | -------------------------------------------------------------------------------- /machine_learning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonwardjones/machine_learning/1e92865bfe152acaf0df2df8f11a5f51833389a9/machine_learning/__init__.py -------------------------------------------------------------------------------- /machine_learning/decision_tree.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | 5 | from .tree import TreeNode 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger(__file__) 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | class DecisionTree(): 13 | 14 | def __init__(self, 15 | max_depth=2, 16 | min_samples_split=2, 17 | min_samples_leaf=1, 18 | n_classes=2, 19 | max_features=None, 20 | impurity='gini', 21 | is_classifier=True): 22 | """Decision tree model 23 | 24 | Parameters: 25 | ---------- 26 | max_depth: int 27 | The maximum depth allowed when "growing" a tree 28 | min_samples_split: int 29 | The minimum number of samples required to allow a split at a 30 | node 31 | min_samples_leaf: int 32 | The minimum number of samples allowed in a leaf. A split 33 | candidate leading to less samples in a node than the 34 | min_samples_leaf will be rejected 35 | n_classes: int, optional, default 2 36 | Number of classes in a classification setting. Ignored when 37 | self.is_classifier = False 38 | max_features: int, optional, default None 39 | If set to 'sqrt' then only a random subset of features are 40 | used to split at each node, the number of features used in 41 | this case is sqrt(n_features). 42 | Else all the features are considered when splitting at each 43 | node 44 | impurity: str, optional, default 'gini' 45 | The impurity measure to use when splitting at each node. 46 | I have currently only implemented two 47 | 'gini' - Uses the gini impurity (for classification) 48 | 'mse' - Uses the mean square error - equal to variance (for 49 | regression) 50 | is_classifier: bool, optional, default True 51 | Is the model used as part of a classification problem 52 | or a regression problem. Should be set to True if 53 | classification, False if regression 54 | """ 55 | self.max_depth = max_depth 56 | self.min_samples_split = min_samples_split 57 | self.min_samples_leaf = min_samples_leaf 58 | self.n_classes = n_classes 59 | self.max_features = max_features 60 | self.impurity = impurity 61 | self.is_classifier = is_classifier 62 | 63 | self.is_fitted = False 64 | self.tree = None 65 | 66 | def fit(self, X, y): 67 | """Fits the decision tree model 68 | 69 | The tree is fitted by instantiaing a root TreeNode instance and 70 | then calling the recursive_split method. This iteratively grows 71 | the tree by finding the best split to reduce the impurity the 72 | most. 
73 | 74 | Parameters: 75 | ---------- 76 | X: numpy.ndarray 77 | Training data, shape (m samples, n features) 78 | y: numpy.ndarray 79 | Target values, shape (m samples, 1) 80 | If classifier with n_classes the values are assumed to be in 81 | 0, ..., n-1 82 | """ 83 | y_shape = (X.shape[0], 1) 84 | data = np.concatenate((X, y.reshape(y_shape)), axis=1) 85 | self.tree = TreeNode( 86 | data=data, 87 | max_depth=self.max_depth, 88 | min_samples_split=self.min_samples_split, 89 | min_samples_leaf=self.min_samples_leaf, 90 | n_classes=self.n_classes, 91 | max_features=self.max_features, 92 | impurity=self.impurity, 93 | is_classifier=self.is_classifier) 94 | self.tree.recursive_split() 95 | self.is_fitted = True 96 | 97 | def predict(self, data): 98 | """Predicts target values or class labels for classification 99 | 100 | Predicts target values/class for each row in data by walking the 101 | tree and returning the leaf node value for regression or the 102 | class with the largest predicted probability for classification 103 | 104 | Parameters: 105 | ---------- 106 | data: numpy.ndarray 107 | The input data with shape (m samples, n features) 108 | 109 | Returns: 110 | ------- 111 | numpy.ndarray: 112 | Predicted target values or class labels for classification 113 | """ 114 | if not self.is_fitted: 115 | raise Exception('Decision tree not fitted') 116 | return self.tree.predict(data) 117 | 118 | def predict_proba(self, data): 119 | """Predicts class probabilities for input data 120 | 121 | Predicts class probabilities for each row in data by walking the 122 | tree and returning the leaf node class probabilities 123 | 124 | Parameters: 125 | ---------- 126 | data: numpy.ndarray 127 | The input data with shape (m samples, n features) 128 | 129 | Returns: 130 | ------- 131 | numpy.ndarray: 132 | Predicted sample class probabilities, 133 | shape (m samples, n classes) 134 | """ 135 | if not self.is_fitted: 136 | raise Exception('Decision tree not fitted') 137 | return self.tree.predict_proba(data) 138 | 139 | def render(self, feature_names): 140 | """Returns Digraph visualizing the decision tree (if fitted) 141 | 142 | Parameters: 143 | ---------- 144 | feature_names: list[str] 145 | List of feature names 146 | 147 | Returns: 148 | ------- 149 | graphviz.Digraph: 150 | dot for tree diagram visual 151 | """ 152 | if not self.is_fitted: 153 | print('Decision tree not fitted') 154 | else: 155 | return self.tree.dot(feature_names=feature_names) 156 | -------------------------------------------------------------------------------- /machine_learning/gradient_boosted_decision_tree.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | from scipy.special import expit, logsumexp 5 | 6 | from .decision_tree import DecisionTree 7 | 8 | logging.basicConfig() 9 | logger = logging.getLogger(__file__) 10 | logger.setLevel(logging.INFO) 11 | 12 | 13 | class GradientBoostedDecisionTree(): 14 | 15 | def __init__(self, 16 | max_depth=2, 17 | min_samples_split=2, 18 | min_samples_leaf=1, 19 | n_classes=2, 20 | max_features=None, 21 | is_classifier=True, 22 | n_trees=10, 23 | learning_rate=0.1): 24 | """Gradient boosted decision tree model 25 | 26 | The trees are grown sequentially and fitted to the negative 27 | gradient of the cost function with respect to the raw predicted 28 | values at the previous stage. 
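As a concrete illustration of the quantity each new tree is fitted to (a sketch only, mirroring the `negative_gradient` method defined later in this class; for two-class log loss the negative gradient is the residual between the labels and the current predicted probabilities):

```python
import numpy as np
from scipy.special import expit  # sigmoid

y = np.array([1, 0, 1, 1])             # binary targets
raw = np.array([0.2, -0.4, 1.3, 0.0])  # current raw (logit) predictions
pseudo_residual = y - expit(raw)       # negative gradient of the log loss w.r.t. raw
# the next tree is fitted to pseudo_residual as a regression target, and its
# learning-rate-scaled predictions are added to raw to form the next stage
```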
29 | 30 | Note I use the term raw_predictions as raw predicted values 31 | must be transformed to find the probability estimates in the 32 | case of classification. 33 | 34 | In practice these gradients are equal to the residual. 35 | 36 | The raw predictions for a stage are made by adding the new delta 37 | model (multiplied by the learning rate) to the raw predictions 38 | from the previous stage 39 | 40 | Parameters: 41 | ---------- 42 | max_depth: int 43 | The maximum depth allowed when "growing" a tree 44 | min_samples_split: int 45 | The minimum number of samples required to allow a split at a 46 | node 47 | min_samples_leaf: int 48 | The minimum number of samples allowed in a leaf. A split 49 | candidate leading to less samples in a node than the 50 | min_samples_leaf will be rejected 51 | n_classes: int, optional, default 2 52 | Number of classes in a classification setting. Ignored when 53 | self.is_classifier = False 54 | max_features: int, optional, default None 55 | If set to 'sqrt' then only a random subset of features are 56 | used to split at each node, the number of features used in 57 | this case is sqrt(n_features). 58 | Else all the features are considered when splitting at each 59 | node 60 | is_classifier: bool, optional, default True 61 | Is the model used as part of a classification problem 62 | or a regression problem. Should be set to True if 63 | classification, False if regression 64 | n_trees: int, optional, default 10 65 | Number of trees, equivalently gradient steps 66 | learning_rate: float, optional, default 0.05 67 | The learning rate parameter controlling the gradient descent 68 | step size 69 | """ 70 | self.max_depth = max_depth 71 | self.min_samples_split = min_samples_split 72 | self.min_samples_leaf = min_samples_leaf 73 | self.n_classes = n_classes 74 | self.max_features = max_features 75 | self.is_classifier = is_classifier 76 | 77 | self.n_trees = n_trees 78 | self.learning_rate = learning_rate 79 | self.is_fitted = False 80 | np.random.seed(1) 81 | self.trees_to_fit = 1 if n_classes <= 2 else n_classes 82 | self.trees = [ 83 | [None for _ in range(self.trees_to_fit)] 84 | for _ in range(self.n_trees)] 85 | # trees has shape (n_trees, n_classes) 86 | 87 | def predict_delta_model(self, X, stage=0): 88 | """Calculate the delta model for a stage 89 | 90 | This function returns the estimate of the negative gradient. 91 | These raw predictions are the delta models f_{stage + 1} 92 | 93 | Parameters: 94 | ---------- 95 | X: numpy.ndarray 96 | Sample data, shape (m samples, n features) 97 | stage: int, optional, default 0 98 | What correction step are we predicting 99 | 100 | Returns: 101 | ------- 102 | numpy.ndarray: 103 | gradient_step, shape (X.shape[0], n_classes) 104 | if n_classes > 2 else shape (m samples, 1) 105 | """ 106 | class_gradient_step = [] 107 | for class_k, model in enumerate(self.trees[stage]): 108 | k_gradient_step = model.predict(X).reshape(-1) 109 | class_gradient_step.append(k_gradient_step) 110 | gradient_step = np.stack(class_gradient_step, axis=-1) 111 | return gradient_step 112 | 113 | def predict_raw_stages(self, X, n_stages=None): 114 | """Predictions for input X 115 | 116 | The predictions are given by the transformed sum of initial 117 | model and delta models. Note no transformation is required for 118 | regression. 119 | 120 | If n_stages specified stop at that stage. 
The delta model is 121 | multiplied by the learning rate before being added to the 122 | raw predictions 123 | 124 | Parameters: 125 | ---------- 126 | X: numpy.ndarray 127 | Sample data, shape (m samples, n features) 128 | n_stages: int, optional, default None 129 | If given, return the predictions after n_stages stages 130 | 131 | Returns: 132 | ------- 133 | numpy.ndarray: 134 | predictions, shape (X.shape[0], n_classes) 135 | if n_classes > 2 else shape (m samples, 1) 136 | """ 137 | if not n_stages: 138 | n_stages = self.n_trees 139 | if n_stages not in list(range(1, self.n_trees + 1)): 140 | raise Exception('n_stages must be between 1 and n_trees') 141 | raw_predictions = self.f_0_prediction(X) 142 | for stage in range(n_stages): 143 | stage_gradient_step = self.predict_delta_model(X, stage) 144 | raw_predictions += self.learning_rate * stage_gradient_step 145 | return self.convert_raw_predictions(raw_predictions) 146 | 147 | def predict(self, X): 148 | """Predicts target values or class labels for classification 149 | 150 | Parameters: 151 | ---------- 152 | X: numpy.ndarray 153 | Sample data, shape (m samples, n features) 154 | 155 | Returns: 156 | ------- 157 | numpy.ndarray: 158 | Predicted target values or class labels for classification 159 | """ 160 | if not self.is_classifier: 161 | return self.predict_raw_stages(X) 162 | else: 163 | return np.argmax(self.predict_proba(X), axis=-1) 164 | 165 | def predict_proba(self, X): 166 | """Predicts class probabilities for input data 167 | 168 | Parameters: 169 | ---------- 170 | X: numpy.ndarray 171 | Sample data, shape (m samples, n features) 172 | 173 | Returns: 174 | ------- 175 | numpy.ndarray: 176 | Predicted sample class probabilities, 177 | shape (m samples, n classes) 178 | if n_classes > 2 else shape (m samples, 1) 179 | """ 180 | if not self.is_classifier: 181 | raise Exception('Not a classifier') 182 | if self.n_classes == 2: 183 | prob_class_one = self.predict_raw_stages(X) 184 | return np.stack([1-prob_class_one, prob_class_one], axis=-1) 185 | if self.n_classes > 2: 186 | return self.predict_raw_stages(X) 187 | 188 | def convert_raw_predictions(self, raw_predictions): 189 | """Convert raw_predictions to probability if classifier 190 | 191 | This uses the sigmoid if there are two classes - in which case we 192 | model the logit. The softmax function is used when there are more 193 | than two classes.
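A small worked example of the two transformations just described (a sketch only, using the same scipy helpers imported at the top of this file):

```python
import numpy as np
from scipy.special import expit, logsumexp

# two-class case: the raw prediction models the logit of class 1
raw_binary = np.array([[0.7], [-1.2]])
print(expit(raw_binary))  # probability of class 1 for each sample

# multi-class case: numerically stable softmax over the raw scores
raw_multi = np.array([[1.2, -0.3, 0.1]])
probs = np.exp(raw_multi - logsumexp(raw_multi, axis=1)[:, None])
print(probs, probs.sum(axis=1))  # each row sums to 1
```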
194 | 195 | Parameters: 196 | ---------- 197 | raw_predictions: numpy.ndarray 198 | Raw predictions, shape (m samples, n classes) 199 | 200 | Returns: 201 | ------- 202 | numpy.ndarray: 203 | target values or class probabilities for classification 204 | """ 205 | if not self.is_classifier: 206 | return raw_predictions 207 | if self.is_classifier and self.n_classes == 2: 208 | return expit(raw_predictions) 209 | if self.is_classifier and self.n_classes > 2: 210 | return np.exp( 211 | raw_predictions - logsumexp(raw_predictions, axis=1)[:, None]) 212 | 213 | def f_0_prediction(self, X): 214 | """Return initial raw_predictions for X 215 | 216 | Parameters: 217 | ---------- 218 | X: numpy.ndarray 219 | Training data, shape (m samples, n features) 220 | 221 | Returns: 222 | ------- 223 | numpy.ndarray: 224 | raw_predictions, shape (m samples, n classes) 225 | if n_classes > 2 else shape (m samples, 1) 226 | """ 227 | n = X.shape[0] 228 | if not self.is_classifier: 229 | return self.regression_f_0_tree.predict(X).reshape(n, 1) 230 | if self.is_classifier and self.n_classes == 2: 231 | return np.repeat(self.f_0, n).reshape(n, 1) 232 | if self.is_classifier and self.n_classes > 2: 233 | return np.repeat(self.f_0, n, axis=0) 234 | 235 | def init_f_0(self, X, y): 236 | """Fit initial prediction model 237 | 238 | For regression this is simple fitting a first tree to the target 239 | values. 240 | 241 | For classification when we model the logit (in two class 242 | scenario) we use the logit of the average probability in the 243 | training data. 244 | For the multi class case, where we model the log of each class 245 | probability as an additive model, we initialise the raw values 246 | as the log of the observed probability of that class. 247 | 248 | Parameters: 249 | ---------- 250 | X: numpy.ndarray 251 | Training data, shape (m samples, n features) 252 | y: numpy.ndarray 253 | Target values, shape (m samples, 1) 254 | If classifier with n_classes the values are assumed to be in 255 | 0, ..., n-1 256 | """ 257 | y = y.reshape(-1) 258 | if not self.is_classifier: 259 | self.regression_f_0_tree = self.get_tree() 260 | self.regression_f_0_tree.fit(X, y) 261 | if self.is_classifier and self.n_classes == 2: 262 | self.f_0 = np.log(y.sum() / (y.shape[0] - y.sum())) 263 | if self.is_classifier and self.n_classes > 2: 264 | self.f_0 = np.log( 265 | np.bincount(y, minlength=self.n_classes) / y.shape[0])[None, :] 266 | 267 | def get_tree(self): 268 | """Helper to return decision tree to be fitted 269 | 270 | Returns: 271 | ------- 272 | DecisionTree: 273 | Regression tree 274 | """ 275 | return DecisionTree( 276 | max_depth=self.max_depth, 277 | min_samples_split=self.min_samples_split, 278 | min_samples_leaf=self.min_samples_leaf, 279 | n_classes=self.n_classes, 280 | max_features=self.max_features, 281 | impurity='mse', 282 | is_classifier=False) 283 | 284 | def fit(self, X, y): 285 | """Fit the gradient boosted decision tree 286 | 287 | For each stage fit a tree to the negative gradient (for that 288 | class), then update the raw predictions using the learning rate 289 | and delta model. 
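In symbols, writing F_m for the raw predictions after stage m, h_m for the delta model fitted at stage m and eta for the learning rate, each stage performs F_{m+1}(x) = F_m(x) + eta * h_m(x). A minimal end-to-end usage sketch of the class (an editorial illustration, not part of the source file; it assumes the package is importable as `machine_learning` and uses made-up toy data):

```python
import numpy as np

from machine_learning.gradient_boosted_decision_tree import GradientBoostedDecisionTree  # assumed path

X = np.array([[22.0, 0.0], [38.0, 1.0], [4.0, 1.0],
              [35.0, 0.0], [54.0, 0.0], [2.0, 1.0]])
y = np.array([0, 1, 1, 1, 0, 1])  # class labels in 0, ..., n_classes - 1

gbdt = GradientBoostedDecisionTree(n_trees=5, learning_rate=0.1,
                                   max_depth=2, n_classes=2, is_classifier=True)
gbdt.fit(X, y)

print(gbdt.predict(X))        # predicted class labels
print(gbdt.predict_proba(X))  # class probabilities
```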
290 | 291 | Parameters: 292 | ---------- 293 | X: numpy.ndarray 294 | Training data, shape (m samples, n features) 295 | y: numpy.ndarray 296 | Target values, shape (m samples, 1) 297 | If classifier with n_classes the values are assumed to be in 298 | 0, ..., n-1 299 | """ 300 | if self.is_classifier: 301 | y = y.astype(int) 302 | self.init_f_0(X, y) 303 | prev_stage_raw_predictions = self.f_0_prediction(X) 304 | for stage in range(self.n_trees): 305 | negative_gradient = self.negative_gradient( 306 | y, prev_stage_raw_predictions) 307 | self.fit_stage(X, negative_gradient, stage=stage) 308 | delta_model = self.predict_delta_model(X, stage=stage) 309 | prev_stage_raw_predictions = prev_stage_raw_predictions + \ 310 | (self.learning_rate * delta_model) 311 | 312 | def fit_stage(self, X, negative_gradient, stage=0): 313 | """Fit a given stage 314 | 315 | For regression this is just fitting a single tree to the 316 | gradient. For classification we fit one tree for each class ( 317 | unless there are only two classes when we can use just one) 318 | 319 | Parameters: 320 | ---------- 321 | X: numpy.ndarray 322 | Training data, shape (m samples, n features) 323 | negative_gradient: numpy.ndarray 324 | dL_dY^hat, shape (m samples, n features) 325 | stage: int, optional, default 0 326 | stage to fit 327 | """ 328 | logger.info(f'Fitting stage {stage}') 329 | trees_to_fit = 1 if self.n_classes <= 2 else self.n_classes 330 | for class_k in range(trees_to_fit): 331 | target = negative_gradient[:, class_k] 332 | tree = self.get_tree() 333 | tree.fit(X, target) 334 | self.trees[stage][class_k] = tree 335 | 336 | def negative_gradient(self, y, prev_stage_raw_predictions): 337 | """Gradient of the loss function with res 338 | 339 | Parameters: 340 | ---------- 341 | y: numpy.ndarray 342 | Target values, shape (m samples, 1) 343 | If classifier with n_classes the values are assumed to be in 344 | 0, ..., n-1 345 | prev_stage_raw_predictions: numpy.ndarray 346 | raw_predictions, shape 347 | 348 | Returns: 349 | ------- 350 | numpy.ndarray: 351 | negative gradient, shape (m samples, n classes) 352 | if n_classes > 2 else shape (m samples, 1) 353 | """ 354 | if self.is_classifier and self.n_classes > 2: 355 | y = np.eye(self.n_classes)[y.reshape(-1)] 356 | else: 357 | y = y.reshape(y.shape[0], 1) 358 | return y - self.convert_raw_predictions(prev_stage_raw_predictions) 359 | 360 | def render(self, stage, class_k, feature_names): 361 | """Returns Digraph visualizing one of the decision trees 362 | 363 | Parameters: 364 | ---------- 365 | stage: [type] 366 | Stage to get tree from 367 | class_k: [type] 368 | tree for class class_k 369 | feature_names: [type] 370 | Feature names 371 | 372 | Returns: 373 | ------- 374 | graphviz.Digraph: 375 | dot for tree diagram visual 376 | """ 377 | return self.trees[stage][class_k].render(feature_names) 378 | -------------------------------------------------------------------------------- /machine_learning/knn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Knn(): 5 | 6 | def __init__(self, k=3, save_history=False, tolerance=0.001): 7 | """knn model 8 | 9 | Parameters: 10 | ---------- 11 | k: int, optional, default 3 12 | number of clusters 13 | save_history: bool, optional, default False 14 | Whether to save intermediate steps, for analysis and 15 | visualisation - see notebook for example 16 | tolerance: float, optional, default 0.001 17 | Stopping tolerance for change in centorids 18 | """ 19 
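        # A minimal usage sketch (the data below is made up for
        # illustration and is not part of this repository):
        #
        #     X = np.vstack([np.random.normal(0, 1, size=(50, 2)),
        #                    np.random.normal(5, 1, size=(50, 2))])
        #     model = Knn(k=2, save_history=True)
        #     model.fit(X, max_updates=20)
        #     model.cluster_labels   # cluster index per sample, shape (100,)
        #     model.centroids        # centroid coordinates, shape (2, 2)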
| self.k = k 20 | self.save_history = save_history 21 | self.centroids = [None for _ in range(k)] 22 | self.iteration = 0 23 | self.tolerance = tolerance 24 | 25 | def update_centroids(self, X): 26 | """Update the centroids as the mean of clusters 27 | 28 | The new centroids are calculated as the mean of the clusters 29 | based on the current cluster assignment (self.cluster_labels). 30 | After the update the new assignment is done to update the 31 | labels 32 | 33 | Parameters: 34 | ---------- 35 | X: numpy.ndarray 36 | Input data to cluster, shape (m samples, n features) 37 | 38 | Returns: 39 | ------- 40 | float: 41 | Sum of the euclidean distance change in each of the 42 | centroids after updating 43 | """ 44 | new_centroids = [] 45 | for i in range(self.k): 46 | new_centroid = X[self.cluster_labels == i, :].mean(axis=0) 47 | new_centroids.append(new_centroid) 48 | new_centroids = np.stack(new_centroids, axis=0) 49 | self.iteration += 1 50 | distance_change = self.dist(self.centroids, new_centroids).sum() 51 | self.centroids = new_centroids 52 | new_cluster_labels = self.assign_clusters(X) 53 | self.cluster_labels = new_cluster_labels 54 | if self.save_history: 55 | self.centroid_history.append(self.centroids) 56 | self.cluster_labels_history.append(self.cluster_labels) 57 | return distance_change 58 | 59 | def fit(self, X, max_updates=10): 60 | """Fit the knn model 61 | 62 | Fitting the model updates the centroids and cluster labels 63 | iteratively util the centroids no longer change or the max 64 | number of iterations is reached 65 | 66 | Parameters: 67 | ---------- 68 | X: numpy.ndarray 69 | Input data to cluster, shape (m samples, n features) 70 | max_updates: int, optional, default 10 71 | Maximum number of iterations permitted 72 | """ 73 | self.initalise_centroids(X) 74 | distance_change = 10**6 75 | while self.iteration < max_updates and not distance_change < self.tolerance: 76 | distance_change = self.update_centroids(X) 77 | print(f'Finished at iteration {self.iteration}') 78 | 79 | def initalise_centroids(self, X): 80 | """Sets initial centorids randomly and assigns cluster labels 81 | 82 | The centroids are chosen randomly based on observed range of 83 | values in X 84 | 85 | Parameters: 86 | ---------- 87 | X: numpy.ndarray 88 | Input data to cluster, shape (m samples, n features) 89 | """ 90 | X_mins = X.min(axis=0) 91 | X_maxs = X.max(axis=0) 92 | self.centroids = np.stack( 93 | [np.random.uniform(xi_min, xi_max, self.k) 94 | for xi_min, xi_max in zip(X_mins, X_maxs)], 95 | axis=-1) 96 | self.cluster_labels = self.assign_clusters(X) 97 | if self.save_history: 98 | self.centroid_history = [self.centroids] 99 | self.cluster_labels_history = [self.cluster_labels] 100 | 101 | def dist(self, a, b, axis=1): 102 | """Euclidean distance function 103 | 104 | Parameters: 105 | ---------- 106 | a: numpy.ndarray 107 | samples, shape (m sample, n features) 108 | b: numpy.ndarray 109 | centroid, shape (n_features,) 110 | axis: int, optional, default 1 111 | Set to 1 to sum along rows 112 | 113 | Returns: 114 | ------- 115 | numpy.ndarray: 116 | Distance between each sample and centroid 117 | """ 118 | return np.linalg.norm(a - b, axis=axis) 119 | 120 | def assign_clusters(self, X): 121 | """Assigns each sample of X to its nearest cluster centroid 122 | 123 | Parameters: 124 | ---------- 125 | X: numpy.ndarray 126 | Input data to cluster, shape (m samples, n features) 127 | 128 | Returns: 129 | ------- 130 | numpy.ndarray: 131 | Cluster label for each sample, shape (m samples, 1) 132 
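        As a sketch (distances illustrative): with k = 2 centroids and the
        stacked distance matrix [[1.2, 0.4], [0.3, 2.0]] (one row per
        sample, one column per centroid), np.argmin along axis 1 gives
        cluster labels [1, 0].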
| """ 133 | distances = [] 134 | for centroid in self.centroids: 135 | centorid_distances = self.dist(X, centroid) 136 | distances.append(centorid_distances) 137 | all_distaces = np.stack(distances, axis=1) 138 | cluster_labels = np.argmin(all_distaces, axis=1) 139 | return cluster_labels 140 | -------------------------------------------------------------------------------- /machine_learning/linear_regression.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | 5 | 6 | class LinearRegression(): 7 | 8 | def __init__(self, learning_rate=0.05): 9 | """ 10 | Linear regression model 11 | 12 | Parameters: 13 | ---------- 14 | learning_rate: float, optional, default 0.05 15 | The learning rate parameter controlling the gradient descent 16 | step size 17 | """ 18 | self.learning_rate = learning_rate 19 | print('Creating linear model instance') 20 | 21 | def __repr__(self): 22 | return ( 23 | f'') 25 | 26 | def fit(self, X, y, n_iter=1000): 27 | """ 28 | Fit the linear regression model 29 | 30 | Updates the weights with n_iter iterations of batch gradient 31 | descent updates 32 | 33 | Parameters: 34 | ---------- 35 | X: numpy.ndarray 36 | Training data, shape (m samples, (n - 1) features + 1) 37 | Note the first column of X is expected to be ones (to allow 38 | for the bias to be included in beta) 39 | y: numpy.ndarray 40 | Target values, shape (m samples, 1) 41 | n_iter: int, optional, default 1000 42 | Number of batch gradient descent steps 43 | """ 44 | m, n = X.shape 45 | print(f'fitting with m={m} samples with n={n-1} features\n') 46 | self.beta = np.zeros(shape=(n, 1)) 47 | self.costs = [] 48 | self.betas = [self.beta] 49 | for iteration in range(n_iter): 50 | y_pred = self.predict(X) 51 | cost = self.cost(y, y_pred) 52 | self.costs.append(cost[0][0]) 53 | gradient = self.gradient(y, y_pred, X) 54 | self.beta = self.beta - ( 55 | self.learning_rate * gradient) 56 | self.betas.append(self.beta) 57 | 58 | def cost(self, y, y_pred): 59 | """ 60 | Mean square error cost function 61 | 62 | Parameters: 63 | ---------- 64 | y: numpy.ndarray 65 | True target values, shape (m samples, 1) 66 | y_pred: numpy.ndarray 67 | Predicted y values, shape (m samples, 1) 68 | 69 | Returns: 70 | ------- 71 | float: 72 | mean square error value 73 | """ 74 | m = y.shape[0] 75 | cost = (1 / (2 * m)) * (y - y_pred).T @ (y - y_pred) 76 | return cost 77 | 78 | def gradient(self, y, y_pred, X): 79 | """ 80 | Calculates the gradient of the cost function 81 | 82 | Parameters: 83 | ---------- 84 | y: numpy.ndarray 85 | Predicted y values, shape (m samples, 1) 86 | y_pred: numpy.ndarray 87 | True target values, shape (m samples, 1) 88 | X: numpy.ndarray 89 | Training data, shape (m samples, (n - 1) features + 1) 90 | Note the first column of X is expected to be ones (to allow 91 | for the bias to be included in beta) 92 | 93 | Returns: 94 | ------- 95 | numpy.ndarray: 96 | Derivate of mean square error cost function with respect to 97 | the weights beta, shape (n features, 1) 98 | """ 99 | m = X.shape[0] 100 | gradient = (1 / m) * X.T @ (y_pred - y) 101 | return gradient 102 | 103 | def predict(self, X): 104 | """ 105 | Predict the target values from sample X feature values 106 | 107 | Parameters: 108 | ---------- 109 | X: numpy.ndarray 110 | Training data, shape (m samples, (n - 1) features + 1) 111 | Note the first column of X is expected to be ones (to allow 112 | for the bias to be included in beta) 113 | 114 | Returns: 115 | ------- 116 | 
numpy.ndarray: 117 | Target value predictions, shape (m samples, 1) 118 | """ 119 | y_pred = X @ self.beta 120 | return y_pred 121 | -------------------------------------------------------------------------------- /machine_learning/logistic_regression.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | 5 | 6 | class LogisticRegression(): 7 | 8 | def __init__(self, learning_rate=0.05): 9 | """ 10 | Logistic regression model 11 | 12 | Parameters: 13 | ---------- 14 | learning_rate: float, optional, default 0.05 15 | The learning rate parameter controlling the gradient descent 16 | step size 17 | """ 18 | self.learning_rate = learning_rate 19 | print('Creating logistic model instance') 20 | 21 | def __repr__(self): 22 | return ( 23 | f'') 25 | 26 | def fit(self, X, y, n_iter=1000): 27 | """ 28 | Fit the logistic regression model 29 | 30 | Updates the weights with n_iter iterations of batch gradient 31 | descent updates 32 | 33 | Parameters: 34 | ---------- 35 | X: numpy.ndarray 36 | Training data, shape (m samples, (n - 1) features + 1) 37 | Note the first column of X is expected to be ones (to allow 38 | for the bias to be included in beta) 39 | y: numpy.ndarray 40 | Target values - class label {0, 1}, shape (m samples, 1) 41 | n_iter: int, optional, default 1000 42 | Number of batch gradient descent steps 43 | """ 44 | m, n = X.shape 45 | print(f'fitting with m={m} samples with n={n-1} features\n') 46 | self.beta = np.zeros(shape=(n, 1)) 47 | self.costs = [] 48 | self.betas = [self.beta] 49 | for iteration in range(n_iter): 50 | y_pred = self.predict_proba(X) 51 | cost = (-1 / m) * ( 52 | (y.T @ np.log(y_pred)) + 53 | ((np.ones(shape=y.shape) - y).T @ np.log( 54 | np.ones(shape=y_pred.shape) - y_pred)) 55 | ) 56 | self.costs.append(cost[0][0]) 57 | gradient = (1 / m) * X.T @ (y_pred - y) 58 | self.beta = self.beta - ( 59 | self.learning_rate * gradient) 60 | self.betas.append(self.beta) 61 | 62 | def predict_proba(self, X): 63 | """ 64 | Predicted probability values for class 1 65 | 66 | Note this is calculated as the sigmoid of the linear combination 67 | of the feature values and the weights. 
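        In symbols: p = sigmoid(X @ beta) = 1 / (1 + e^(-X @ beta)), giving
        values in (0, 1). As an illustrative example, a sample whose linear
        combination X @ beta equals 2.0 yields p of roughly 0.88.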
68 | 69 | Parameters: 70 | ---------- 71 | X: numpy.ndarray 72 | Training data, shape (m samples, (n - 1) features + 1) 73 | Note the first column of X is expected to be ones (to allow 74 | for the bias to be included in beta) 75 | 76 | Returns: 77 | ------- 78 | numpy.ndarray: 79 | Predicted probability of samples being in class 1 80 | """ 81 | y_pred = self.sigmoid(X @ self.beta) 82 | return y_pred 83 | 84 | def predict(self, X, descision_prob=0.5): 85 | """ 86 | Predict the class values from sample X feature values 87 | 88 | Parameters: 89 | ---------- 90 | X: numpy.ndarray 91 | Training data, shape (m samples, (n - 1) features + 1) 92 | Note the first column of X is expected to be ones (to allow 93 | for the bias to be included in beta) 94 | 95 | Returns: 96 | ------- 97 | numpy.ndarray: 98 | Prediceted class values, shape (m samples, 1) 99 | """ 100 | y_pred = self.sigmoid(X @ self.beta) 101 | return (y_pred > descision_prob) * 1 102 | 103 | def sigmoid(self, x): 104 | """ 105 | Sigmoid function 106 | 107 | f(x) = 1 / (1 + e^(-x)) 108 | 109 | Parameters: 110 | ---------- 111 | x: numpy.ndarray 112 | 113 | Returns: 114 | ------- 115 | numpy.ndarray: 116 | sigmoid of x, values in (0, 1) 117 | """ 118 | return 1 / (1 + np.exp(-x)) 119 | -------------------------------------------------------------------------------- /machine_learning/neural_network.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | 5 | logging.basicConfig() 6 | logger = logging.getLogger(__file__) 7 | logger.setLevel(logging.INFO) 8 | 9 | 10 | class NeuralNetwork(): 11 | 12 | def __init__(self, 13 | layer_sizes=[5, 10, 1], 14 | is_classifier=True, 15 | learning_rate=0.1): 16 | """Neural network model 17 | 18 | Parameters: 19 | ---------- 20 | layer_sizes: list, optional, default [5, 10, 1] 21 | Number of nodes in each layer (including input and output) 22 | is_classifier: bool, optional, default True 23 | Is the model used as part of a classification problem 24 | or a regression problem. Should be set to True if 25 | classification, False if regression 26 | learning_rate: float, optional, default 0.05 27 | The learning rate parameter controlling the gradient descent 28 | step size 29 | """ 30 | self.layer_sizes = layer_sizes # n^0, ..., n^L 31 | self.is_classifier = is_classifier 32 | self.learning_rate = learning_rate 33 | self.n_L = layer_sizes[-1] # n^L 34 | self.n_layers = len(layer_sizes) - 1 # L 35 | self.initialise_weights() 36 | 37 | def initialise_weights(self): 38 | """Initialise the weights and biases 39 | 40 | weights are initialized as small random numbers, biases as zero 41 | """ 42 | self.weight_matrices = [ 43 | np.random.normal(loc=0.0, scale=1.0, size=(n_l, n_l_minus_1)) 44 | for n_l, n_l_minus_1 in zip(self.layer_sizes[1:], self.layer_sizes) 45 | ] 46 | self.betas = [np.zeros(shape=(n_l, 1)) for n_l in self.layer_sizes[1:]] 47 | 48 | def feed_forward(self, X): 49 | """Feed X forward through the network 50 | 51 | For each layer the net input is calculated as the product of the 52 | weight matrix and the activations of the previous layer plus the 53 | biases. 
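        In matrix form (shapes as used in this implementation):

            Z^l = W^l A^(l-1) + B^l

        with W^l of shape (n^l, n^(l-1)), A^(l-1) of shape (n^(l-1), m) and
        B^l of shape (n^l, m), so each column corresponds to one sample.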
54 | 55 | The output activation is then calculated by applying the 56 | activation function to the net input 57 | 58 | Parameters: 59 | ---------- 60 | X: numpy.ndarray 61 | Training data, shape (m samples, n features) 62 | 63 | Returns: 64 | ------- 65 | numpy.ndarray: 66 | final layer activations, shape (n^L, m) 67 | """ 68 | m = X.shape[0] 69 | layer_activations = [X.T] 70 | for layer in range(self.n_layers): 71 | A_layer_minus_1 = layer_activations[-1] 72 | beta = self.betas[layer] 73 | B = np.repeat(beta, m, axis=-1) 74 | Z = self.weight_matrices[layer] @ A_layer_minus_1 + B 75 | A = self.activation_function(Z, layer=layer) 76 | layer_activations.append(A) 77 | self.log_layer(layer, A_layer_minus_1, beta, B, Z, A) 78 | self.layer_activations = layer_activations 79 | return layer_activations[-1] 80 | 81 | def back_propagation(self, X, Y): 82 | """Update the weights and biases through back propagation 83 | 84 | Parameters: 85 | ---------- 86 | X: numpy.ndarray 87 | Training data, shape (m samples, n features) 88 | Y: numpy.ndarray 89 | Target values, shape (n_classes, m samples) 90 | """ 91 | assert X.shape[0] == Y.shape[1] 92 | final_layer_error = self.layer_activations[-1] - Y 93 | D_plus_1 = final_layer_error 94 | # errors represent D matrices in notebook explanation 95 | errors = [D_plus_1] 96 | for layer in range(self.n_layers - 2, -1, -1): 97 | logger.debug(f'Calculating D_{layer + 1}') 98 | A = self.layer_activations[layer + 1] 99 | self.log_back_prop_layer(layer, A, D_plus_1) 100 | D = (self.weight_matrices[layer + 1].T @ D_plus_1) * \ 101 | A * (1 - A) 102 | D_plus_1 = D 103 | errors.insert(0, D) 104 | self.errors = errors 105 | self.update_weights() 106 | 107 | def update_weights(self): 108 | """Update the weights and biases using gradient 109 | 110 | The weights and biases are updated by calculating the parital 111 | derivatives and then stepping the weights in the direction 112 | of the negative gradient. The step size is governed by the 113 | learing rate 114 | """ 115 | for layer in range(self.n_layers): 116 | m = self.errors[0].shape[1] 117 | d_L_d_W = (1 / m) * self.errors[layer] @ \ 118 | self.layer_activations[layer].T 119 | d_L_d_beta = (1 / m) * self.errors[layer].sum(axis=1)[:, None] 120 | self.weight_matrices[layer] = self.weight_matrices[layer] - \ 121 | self.learning_rate * d_L_d_W 122 | if layer == 0: 123 | self.d_L_d_Ws.append(d_L_d_W.sum()) 124 | self.betas[layer] = self.betas[layer] - \ 125 | self.learning_rate * d_L_d_beta 126 | 127 | def log_layer(self, layer, A_layer_minus_1, beta, B, Z, A): 128 | """Utility function to group logging 129 | 130 | Parameters: 131 | ---------- 132 | layer: int 133 | The layer being logged (note python uses 0 index) so the 134 | layer is actually layer + 1 135 | A_layer_minus_1: numpy.ndarray, shape (n^{l-1},m) 136 | Previous layer activations for each sample 137 | beta: numpy.ndarray, shape (n^{l}, 1) 138 | Layer biases 139 | B: numpy.ndarray, shape (n^{l}, m) 140 | Repeated layer biases for ease of matrix operations 141 | Z: numpy.ndarray, shape (n^{l}, m) 142 | Net input for each sample 143 | A: numpy.ndarray, shape (n^{l}, m) 144 | Output activation for each sample 145 | """ 146 | logger.debug( 147 | f'A_layer_minus_1 i.e. 
A_{layer} ' 148 | f'has shape {A_layer_minus_1.shape}') 149 | logger.debug(f'beta_{layer + 1} has shape {beta.shape}') 150 | logger.debug(f'B_{layer + 1} has shape {B.shape}') 151 | logger.debug(f'Z_{layer + 1} has shape {Z.shape}') 152 | logger.debug(f'A_{layer + 1} has shape {A.shape}') 153 | 154 | def log_back_prop_layer(self, layer, A, D_plus_1): 155 | """Utility for logging back propagation 156 | 157 | Parameters: 158 | ---------- 159 | layer: int 160 | The layer being logged (note python uses 0 index) so the 161 | layer is actually layer + 1 162 | A: numpy.ndarray, shape (n^{l}, m) 163 | Output activation for each sample 164 | D_plus_1: numpy.ndarray, shape (n^{l+1}, m) 165 | Error in the next layer 166 | """ 167 | logger.debug( 168 | f'A_{layer + 1} has shape {A.shape}') 169 | logger.debug( 170 | f'W_{layer + 2} has shape ' 171 | f'{self.weight_matrices[layer + 1].shape}') 172 | logger.debug( 173 | f'D_{layer + 2} has shape {D_plus_1.shape}') 174 | 175 | def activation_function(self, Z, layer): 176 | """Activation function 177 | 178 | The activation function is the sigmoid for nodes except the 179 | output layer. For the final layer the identify function is used 180 | for regression and for multiclass classification the softmax 181 | function is used 182 | 183 | Parameters: 184 | ---------- 185 | Z: numpy.ndarray, shape (n^{l}, m) 186 | Net input for each sample 187 | layer: int 188 | The layer being logged (note python uses 0 index) so the 189 | layer is actually layer + 1 190 | 191 | Returns: 192 | ------- 193 | numpy.ndarray: 194 | Output activation for each sample, shape (n^{l}, m) 195 | """ 196 | if layer == (self.n_layers - 1): 197 | if not self.is_classifier: 198 | return Z 199 | if self.is_classifier and self.n_L >= 2: 200 | return np.exp(Z - logsumexp(Z, axis=0)[None, :]) 201 | return expit(Z) 202 | 203 | def cost(self, Y): 204 | """Cost function 205 | 206 | Parameters: 207 | ---------- 208 | Y: numpy.ndarray 209 | Target values, shape (n_classes, m samples) 210 | """ 211 | if self.is_classifier and self.n_L == 1: 212 | cost = (-1 / m) * ( 213 | Y * np.log(self.layer_activations[-1]) + 214 | (1 - Y) * np.log(1 - self.layer_activations[-1]) 215 | ).sum() 216 | if self.is_classifier and self.n_L > 1: 217 | cost = (-1 / m) * \ 218 | (Y * np.log(self.layer_activations[-1])).sum() 219 | if not self.is_classifier: 220 | cost = (1 / (2 * m)) * \ 221 | ((Y - self.layer_activations[-1]) ** 2).sum() 222 | logger.debug(f'cost = {cost}') 223 | self.costs.append(cost) 224 | 225 | def fit(self, X, Y, epochs=100): 226 | """Fits the neural network with training data 227 | 228 | The fitting is done via multiple epochs of gradient descent. 229 | Each iteration has a feed forward step and a back propagation 230 | step. 231 | 232 | Note Y is one hot encoded if necessary. 
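        As a sketch of that encoding (labels illustrative): with n^L = 3
        classes and labels y = [0, 2, 1], np.eye(3)[:, y] gives

            [[1, 0, 0],
             [0, 0, 1],
             [0, 1, 0]]

        i.e. column j is the one-hot vector for sample j.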
233 | 234 | Parameters: 235 | ---------- 236 | X: numpy.ndarray 237 | Training data, shape (m samples, n features) 238 | Y: numpy.ndarray 239 | Target values, shape (m samples, 1) 240 | epochs: int, optional, default 100 241 | Number of iterations of gradient descent 242 | """ 243 | if self.n_L > 1: 244 | if Y.shape[0] != self.n_L: 245 | print('One hot encoding Y') 246 | Y = np.eye(self.n_L)[:, Y.reshape(-1).astype(int)] 247 | self.costs = [] 248 | self.d_L_d_Ws = [] 249 | for epoch in range(epochs): 250 | self.feed_forward(X) 251 | self.cost(Y) 252 | self.back_propagation(X, Y) 253 | 254 | def predict(self, X): 255 | """Predicts target values or class labels by forward propagation 256 | 257 | Parameters: 258 | ---------- 259 | X: numpy.ndarray 260 | Training data, shape (m samples, n features) 261 | Returns: 262 | ------- 263 | numpy.ndarray: 264 | Predicted target values or class labels for classification, 265 | Shape is (n^L, m samples) 266 | """ 267 | A_L = self.feed_forward(X) 268 | if not self.is_classifier: 269 | return A_L 270 | if self.is_classifier and self.n_L == 1: 271 | return np.round(A_L).astype(int) 272 | if self.is_classifier and self.n_L > 1: 273 | return np.argmax(A_L, axis=0) 274 | 275 | def predict_proba(self, X): 276 | """Predicts class probabilities for input data 277 | 278 | Parameters: 279 | ---------- 280 | X: numpy.ndarray 281 | Sample data, shape (m samples, n features) 282 | 283 | Returns: 284 | ------- 285 | numpy.ndarray: 286 | Predicted sample class probabilities, 287 | shape (n classes, m samples) 288 | if n_classes > 2 else shape (1, m samples) 289 | """ 290 | A_L = self.feed_forward(X) 291 | if not self.is_classifier: 292 | raise Exception('Must be a classifier') 293 | return A_L 294 | -------------------------------------------------------------------------------- /machine_learning/random_forest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | 5 | from .decision_tree import DecisionTree 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger(__file__) 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | class RandomForest(): 13 | 14 | def __init__(self, 15 | max_depth=2, 16 | min_samples_split=2, 17 | min_samples_leaf=1, 18 | n_classes=2, 19 | max_features='sqrt', 20 | impurity='gini', 21 | is_classifier=True, 22 | n_trees=10, 23 | bootstrap=True): 24 | """Random forest model 25 | 26 | Parameters: 27 | ---------- 28 | max_depth: int 29 | The maximum depth allowed when "growing" a tree 30 | min_samples_split: int 31 | The minimum number of samples required to allow a split at 32 | a the node 33 | min_samples_leaf: int 34 | The minimum number of samples allowed in a leaf. A split 35 | candidate leading to less samples in a node than the 36 | min_samples_leaf will be rejected 37 | n_classes: int, optional, default 2 38 | Number of classes in a classification setting. Ignored when 39 | self.is_classifier = False 40 | max_features: int, optional, default None 41 | If set to 'sqrt' then only a random subset of features are 42 | used to split at each node, the number of features used in 43 | this case is sqrt(n_features). 44 | Else all the features are considered when splitting at each 45 | node 46 | impurity: str, optional, default 'gini' 47 | The impurity measure to use when splitting at each node. 
48 | I have currently only implemented two 49 | 'gini' - Uses the gini impurity (for classification) 50 | 'mse' - Uses the mean square error - equal to variance (for 51 | regression) 52 | is_classifier: bool, optional, default True 53 | Is the model used as part of a classification problem 54 | or a regression problem. Should be set to True if 55 | classification, False if regression 56 | n_trees: int, optional, default 10 57 | Number of trees in the forest 58 | bootstrap: bool, optional, default True 59 | Whether to bootstrap the data when fitting the trees 60 | """ 61 | self.max_depth = max_depth 62 | self.min_samples_split = min_samples_split 63 | self.min_samples_leaf = min_samples_leaf 64 | self.n_classes = n_classes 65 | self.max_features = max_features 66 | self.impurity = impurity 67 | self.is_classifier = is_classifier 68 | 69 | self.n_trees = n_trees 70 | self.bootstrap = bootstrap 71 | self.is_fitted = False 72 | self.trees = [] 73 | np.random.seed(1) 74 | 75 | def fit(self, X, y): 76 | """Fit the random forest model 77 | 78 | This method fits n_trees trees on the data with bootstrap 79 | samples. A random subset of the features is used at each split. 80 | 81 | 82 | Parameters: 83 | ---------- 84 | X: numpy.ndarray 85 | Training data, shape (m samples, n features) 86 | y: numpy.ndarray 87 | Target values, shape (m samples, 1) 88 | If classifier with n_classes the values are assumed to be in 89 | 0, ..., n-1 90 | """ 91 | y_shape = (X.shape[0], 1) 92 | data = np.concatenate((X, y.reshape(y_shape)), axis=1) 93 | for i, data in enumerate(self._samples(data)): 94 | tree = DecisionTree( 95 | max_depth=self.max_depth, 96 | min_samples_split=self.min_samples_split, 97 | min_samples_leaf=self.min_samples_leaf, 98 | n_classes=self.n_classes, 99 | max_features=self.max_features, 100 | impurity=self.impurity, 101 | is_classifier=self.is_classifier) 102 | logger.info(f'Fitting tree {i}') 103 | tree.fit(X, y) 104 | self.trees.append(tree) 105 | self.is_fitted = True 106 | 107 | def _samples(self, data): 108 | """Bootstrap sample generator 109 | 110 | Parameters: 111 | ---------- 112 | data: numpy.ndarray 113 | The input data with shape (m samples, n features + 1 target) 114 | Note the last column of the data are the target values 115 | 116 | Yields: 117 | numpy.ndarray: Bootstrap sample of data 118 | """ 119 | n_rows = data.shape[0] 120 | for _ in range(self.n_trees): 121 | if not self.bootstrap: 122 | yield data 123 | else: 124 | random_rows = np.random.choice(np.arange(n_rows), 125 | size=n_rows, 126 | replace=True) 127 | yield data[random_rows, :] 128 | 129 | def predict_proba(self, data): 130 | """Predicts class probabilities for input data 131 | 132 | The class probability predictions from each tree are averaged to 133 | provide the overall class prediction probabilities 134 | 135 | Parameters: 136 | ---------- 137 | data: numpy.ndarray 138 | The input data with shape (m samples, n features) 139 | 140 | Returns: 141 | ------- 142 | numpy.ndarray: 143 | Predicted sample class probabilities, 144 | shape (m samples, n classes) 145 | """ 146 | if not self.is_fitted: 147 | raise Exception('Forest not fitted') 148 | # samples, classes, trees 149 | return np.stack(list(tree.predict_proba(data) for tree in self.trees), 150 | axis=-1).sum(axis=-1) / self.n_trees 151 | 152 | def predict(self, data): 153 | """Predicts target values or class labels for classification 154 | 155 | Parameters: 156 | ---------- 157 | data: numpy.ndarray 158 | The input data with shape (m samples, n features) 159 | 160 
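        A rough usage sketch (the names X_train, y_train and X_test are
        assumed for illustration, they are not defined in this module):

            forest = RandomForest(n_trees=25, max_depth=3, n_classes=2)
            forest.fit(X_train, y_train)
            labels = forest.predict(X_test)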
| Returns: 161 | ------- 162 | numpy.ndarray: 163 | Predicted target values or class labels for classification 164 | """ 165 | if self.is_classifier: 166 | return np.argmax(self.predict_proba(data), axis=-1) 167 | else: 168 | return np.stack( 169 | list(tree.predict(data) for tree in self.trees), 170 | axis=-1).mean(axis=-1) 171 | 172 | def render(self, tree_id, feature_names): 173 | """Returns Digraph visualizing one of the decision trees 174 | 175 | Parameters: 176 | ---------- 177 | tree_id: [type] 178 | tree index to display 179 | feature_names: [type] 180 | Feature names 181 | 182 | Returns: 183 | ------- 184 | graphviz.Digraph: 185 | dot for tree diagram visual 186 | """ 187 | return self.trees[tree_id].render(feature_names) 188 | -------------------------------------------------------------------------------- /machine_learning/tree.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | 4 | import numpy as np 5 | from graphviz import Digraph 6 | 7 | logging.basicConfig() 8 | logger = logging.getLogger(__file__) 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | class TreeNode(): 13 | 14 | count = itertools.count() 15 | 16 | def __init__(self, 17 | data, 18 | max_depth, 19 | min_samples_split, 20 | min_samples_leaf, 21 | n_classes=2, 22 | max_features=None, 23 | depth=0, 24 | impurity='gini', 25 | is_classifier=True): 26 | """ 27 | A single node in a decision tree 28 | 29 | After recursive splitting of the input data, a given node 30 | represents one split of the tree if it is not a leaf node. The 31 | leaf node stores the training samples in that leaf to be used 32 | for prediction. 33 | The splitting nodes record the feature to split on as attribute 34 | self.best_feature_index and the splitting value as attribute 35 | self.best_feature_split_val 36 | 37 | Parameters: 38 | ---------- 39 | data: numpy.ndarray 40 | The input data with shape (m samples, n features + 1 target) 41 | Note the last column of the data are the target values 42 | max_depth: int 43 | The maximum depth allowed when "growing" a tree 44 | min_samples_split: int 45 | The minimum number of samples required to allow a split at 46 | a the node 47 | min_samples_leaf: int 48 | The minimum number of samples allowed in a leaf. A split 49 | candidate leading to less samples in a node than the 50 | min_samples_leaf will be rejected 51 | n_classes: int, optional, default 2 52 | Number of classes in a classification setting. Ignored when 53 | self.is_classifier = False 54 | max_features: int, optional, default None 55 | If set to 'sqrt' then only a random subset of features are 56 | used to split at the node, the number of features used in 57 | this case is sqrt(n_features). 58 | Else all the features are considered when splitting at this 59 | node 60 | depth: int, optional, default 0 61 | The depth of the node in the tree 62 | impurity: str, optional, default 'gini' 63 | The impurity measure to use when splitting at the node. 64 | I have currently only implemented two 65 | 'gini' - Uses the gini impurity (for classification) 66 | 'mse' - Uses the mean square error - equal to variance (for 67 | regression) 68 | is_classifier: bool, optional, default True 69 | Is the tree node used as part of a classification problem 70 | or a regression problem. 
Should be set to True if 71 | classification, False if regression 72 | """ 73 | self.data = data 74 | self.max_depth = max_depth 75 | self.min_samples_split = min_samples_split 76 | self.min_samples_leaf = min_samples_leaf 77 | self.n_classes = n_classes 78 | self.max_features = max_features 79 | self.depth = depth 80 | self.impurity = impurity 81 | self.is_classifier = is_classifier 82 | 83 | self.data_shape = data.shape 84 | self.split_attempted = False 85 | self.best_split_impurity = None 86 | self.best_feature_index = None 87 | self.best_feature_split_val = None 88 | self.is_leaf = False 89 | self.node_impurity = self.calculate_impurity([data[:, -1]]) 90 | self.value = self._init_value(data) 91 | self.id = str(next(self.count)) 92 | 93 | def __repr__(self): 94 | return ( 95 | f'') 102 | 103 | @property 104 | def is_root(self): 105 | return self.depth == 0 106 | 107 | def info(self): 108 | return dict( 109 | data_shape=self.data_shape, 110 | n_classes=self.n_classes, 111 | depth=self.depth, 112 | min_samples_split=self.min_samples_split, 113 | min_samples_leaf=self.min_samples_leaf, 114 | node_impurity=self.node_impurity, 115 | split_attempted=self.split_attempted, 116 | best_split_impurity=self.best_split_impurity, 117 | best_feature_index=self.best_feature_index, 118 | best_feature_split_val=self.best_feature_split_val, 119 | is_root=self.is_root) 120 | 121 | def _init_value(self, data): 122 | """ 123 | Returns the terminal node value based on the input data 124 | 125 | For a classifier this is the class_counts. 126 | For a regressor this is the average y value. 127 | 128 | Note this value can be access at a splitting node to see what 129 | the prediction would have been at that level of the tree 130 | 131 | Parameters: 132 | ---------- 133 | data: numpy.ndarray 134 | The input data with shape (m samples, n features + 1 target) 135 | Note the last column of the data are the target values 136 | 137 | Returns: 138 | ------- 139 | numpy.ndarray or float: 140 | Class counts if classifier, else mean of target values 141 | """ 142 | if self.is_classifier: 143 | return np.bincount( 144 | data[:, -1].astype(int), 145 | minlength=self.n_classes) 146 | else: 147 | return np.mean(data[:, -1]) 148 | 149 | def split(self, feature_index, feature_split_val, only_y=True): 150 | """ 151 | Splits self.data on feature with index feature_index using 152 | feature_split_val. 
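        As a small illustration of the rule described below (values made
        up): if the column at feature_index is [1.0, 5.0, 3.0, 7.0] and
        feature_split_val is 3.0, rows 0 and 2 fall in the left output and
        rows 1 and 3 fall in the right output.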
153 | 154 | Each sample is included in left output if the feature value for 155 | the sample is less than or equal to the feature_split_val else 156 | it is included in the right output 157 | 158 | Parameters: 159 | ---------- 160 | feature_index: int 161 | Index of the feature (column) in self.data 162 | feature_split_val: float 163 | Feature value to use when splitting data 164 | only_y: bool, optional, default True 165 | Return only the y values in left and right - this is used 166 | when checking candidate split purity increase 167 | 168 | Returns: 169 | ------- 170 | (numpy.ndarray, numpy.ndarray): 171 | left and right splits of self.data 172 | """ 173 | assert feature_index in range(self.data.shape[1]) 174 | if only_y: 175 | select = -1 176 | else: 177 | select = slice(None) 178 | left_mask = self.data[:, feature_index] <= feature_split_val 179 | right_mask = ~ left_mask 180 | left = self.data[left_mask, select] 181 | right = self.data[right_mask, select] 182 | logger.debug( 183 | f'Splitting on feature_index {feature_index} with ' 184 | f'feature_split_val = {feature_split_val} creates left ' 185 | f'with shape {left.shape} and right with ' 186 | f'shape {right.shape}') 187 | return left, right 188 | 189 | def gini_impurity(self, groups): 190 | """ 191 | Calculate the Gini impurity for groups of values 192 | 193 | The impurity returned is the weighted average of the impurity 194 | of the groups. 195 | 196 | You can think of gini impurity as the probability of incorrectly 197 | predicting a random sample from a group if the prediction was 198 | made based purely on the distribution of class labels in the 199 | group 200 | 201 | 202 | Parameters: 203 | ---------- 204 | groups: tuple 205 | The groups tuple is made up of arrays of values. It is 206 | often called with groups = (left, right) to find the purity 207 | of the candidate split 208 | 209 | Returns: 210 | ------- 211 | float: 212 | Gini impurity 213 | """ 214 | gini = 0 215 | total_samples = sum(group.shape[0] for group in groups) 216 | for i, group in enumerate(groups): 217 | group = group.astype(int) 218 | class_counts = np.bincount(group, minlength=self.n_classes) 219 | group_size = class_counts.sum() 220 | class_probs = class_counts / group_size 221 | unique_classes = np.count_nonzero(class_counts) 222 | group_gini = (class_probs * (1 - class_probs)).sum() 223 | gini += group_gini * (group_size / total_samples) 224 | logger.debug( 225 | f'Group {i} has size {group.shape[0]} with ' 226 | f'{unique_classes} unique classes ' 227 | f'with Gini index {group_gini:.3}') 228 | return gini 229 | 230 | def mean_square_impurity(self, groups): 231 | """ 232 | Calculates the mean square error impurity 233 | 234 | The mse impurity is the weighted average of the group variances 235 | 236 | Parameters: 237 | ---------- 238 | groups: tuple 239 | The groups tuple is made up of arrays of values. 
It is 240 | often called with groups = (left, right) to find the purity 241 | of the candidate split 242 | 243 | Returns: 244 | ------- 245 | float: 246 | Mean square error impurity 247 | """ 248 | mean_square_error = 0 249 | total_samples = sum(group.shape[0] for group in groups) 250 | for i, group in enumerate(groups): 251 | group_size = group.shape[0] 252 | group_mean = np.mean(group) 253 | group_mean_square_error = np.mean((group - group_mean) ** 2) 254 | mean_square_error += group_mean_square_error * \ 255 | (group_size / total_samples) 256 | logger.debug( 257 | f'Group {i} has size {group.shape[0]} with ' 258 | f'with MSE impurity {group_mean_square_error:.3}') 259 | logger.debug(f'MSE candidate {mean_square_error}') 260 | return mean_square_error 261 | 262 | def calculate_impurity(self, groups): 263 | """ 264 | Calculates impurity based on self.impurity setting 265 | 266 | Parameters: 267 | ---------- 268 | groups: tuple 269 | The groups tuple is made up of arrays of values. It is 270 | often called with groups = (left, right) to find the purity 271 | of the candidate split 272 | 273 | Returns: 274 | ------- 275 | float: 276 | Mean square error of groups if self.impurity = 'mse' 277 | Gini impurity of groups if self.impurity = 'mse' 278 | """ 279 | if self.impurity == 'gini': 280 | return self.gini_impurity(groups) 281 | elif self.impurity == 'mse': 282 | return self.mean_square_impurity(groups) 283 | 284 | def check_split(self, feature_index, feature_split_val): 285 | """ 286 | Updates best split if candidate split is better 287 | 288 | Splits the data in groups using self.split. Checks min samples 289 | leaf condition after split. Calculates impurity of the split 290 | then if impurity is less than best split already found and less 291 | than the current node impurity the best_feature_index, the 292 | best_feature_split_val and the best_split_impurity values are 293 | updated. 294 | 295 | Parameters: 296 | ---------- 297 | feature_index: int 298 | Index of the feature (column) in self.data 299 | feature_split_val: float 300 | Feature value to use when splitting data 301 | """ 302 | groups = self.split(feature_index, feature_split_val) 303 | if any(len(group) < self.min_samples_leaf for group in groups): 304 | logger.debug( 305 | f"Can't split node on feature {feature_index} with split " 306 | f"val {feature_split_val} due to min_samples_leaf condition") 307 | return None 308 | split_impurity = self.calculate_impurity(groups) 309 | best_current_impurity = ( 310 | 10**10 if self.best_split_impurity is None 311 | else self.best_split_impurity) 312 | if ((split_impurity < best_current_impurity) and 313 | (split_impurity < self.node_impurity)): 314 | logger.debug( 315 | f'Found new best split with feature_split_val=' 316 | f'{feature_split_val} for feature_index = {feature_index} ' 317 | f'and split_impurity = {split_impurity:.2f}') 318 | self.best_feature_index = feature_index 319 | self.best_feature_split_val = feature_split_val 320 | self.best_split_impurity = split_impurity 321 | 322 | def find_best_split(self): 323 | """ 324 | Finds best split at the node 325 | 326 | Loops through each feature and each unique value of that feature 327 | checking for the best candidate split (i.e. the split that 328 | reduces the impurity the most) 329 | 330 | The function first checks if we have reached the max depth or if 331 | self.data < self.min_samples_split. 
In either case no further 332 | split is allowed and the function returns 333 | 334 | All features are considered unless self.max_features == 'sqrt' 335 | in which case a random subset of features are used of size 336 | sqrt(n_features) 337 | """ 338 | if self.depth == self.max_depth: 339 | return 340 | if self.data.shape[0] < self.min_samples_split: 341 | logger.info(f"{self} can't split as samples < min_samples_split") 342 | return None 343 | if self.node_impurity == 0: 344 | logger.info(f"Can't improve as node pure") 345 | return None 346 | n_features = self.data.shape[1] - 1 347 | all_feature_indices = np.arange(n_features) 348 | if self.max_features == 'sqrt': 349 | features_to_check = np.random.choice( 350 | all_feature_indices, 351 | size=np.sqrt(n_features).astype(int)) 352 | else: 353 | features_to_check = all_feature_indices 354 | logger.info(f'Checking features {features_to_check}') 355 | for feature_index in features_to_check: 356 | for feature_split_val in np.unique(self.data[:, feature_index]): 357 | self.check_split(feature_index, feature_split_val) 358 | self.split_attempted = True 359 | 360 | def recursive_split(self): 361 | """ 362 | Recursively grows tree by splitting to reduce impurity the most 363 | 364 | The function finds the best split using the find_best_split 365 | method. If there was a split found two nodes are created - left 366 | and right. Finally the recursive_split method is called on each 367 | of the new nodes. 368 | 369 | Note the depth of the children node is incremented, otherwise 370 | the node settings such as min_samples_split are passed to the 371 | children nodes 372 | """ 373 | self.find_best_split() 374 | if self.best_feature_index is not None: 375 | logger.info(f'Splitting tree on feature_index ' 376 | f'{self.best_feature_index} and feature_split_val ' 377 | f'{self.best_feature_split_val:.2f}') 378 | left, right = self.split( 379 | feature_index=self.best_feature_index, 380 | feature_split_val=self.best_feature_split_val, 381 | only_y=False) 382 | del self.data 383 | self.left = TreeNode( 384 | data=left, 385 | max_depth=self.max_depth, 386 | min_samples_split=self.min_samples_split, 387 | min_samples_leaf=self.min_samples_leaf, 388 | n_classes=self.n_classes, 389 | max_features=self.max_features, 390 | depth=self.depth + 1, 391 | impurity=self.impurity, 392 | is_classifier=self.is_classifier) 393 | self.right = TreeNode( 394 | data=right, 395 | max_depth=self.max_depth, 396 | min_samples_split=self.min_samples_split, 397 | min_samples_leaf=self.min_samples_leaf, 398 | n_classes=self.n_classes, 399 | max_features=self.max_features, 400 | depth=self.depth + 1, 401 | impurity=self.impurity, 402 | is_classifier=self.is_classifier) 403 | self.left.recursive_split() 404 | self.right.recursive_split() 405 | else: 406 | logger.info('Reached max depth or no splits reduce impurity') 407 | self.is_leaf = True 408 | 409 | def walk_depth_first(self, only_leaves=True): 410 | """ 411 | Generator traversing of all nodes below and including this node 412 | 413 | Depth first so visiting children before siblings 414 | 415 | Parameters: 416 | ---------- 417 | only_leaves: bool, optional, default True 418 | Only return leaf nodes 419 | 420 | Yields: 421 | TreeNode: each node in tree 422 | """ 423 | if self.is_leaf: 424 | yield self 425 | else: 426 | if not only_leaves: 427 | yield self 428 | for node in (self.left, self.right): 429 | yield from node.walk_depth_first(only_leaves) 430 | 431 | def walk_breadth_first(self, layer=None): 432 | """ 433 | Generator 
traversing of all nodes below and including this node 434 | 435 | Breadth first so visiting siblings before children 436 | 437 | Parameters: 438 | ---------- 439 | only_leaves: bool, optional, default True 440 | Only return leaf nodes 441 | 442 | Yields: 443 | TreeNode: each node in tree 444 | """ 445 | if layer is None: 446 | layer = [self] 447 | for node in layer: 448 | yield node 449 | new_layer = [ 450 | child 451 | for node_children in [[node.left, node.right] 452 | for node in layer if not node.is_leaf] 453 | for child in node_children] 454 | if new_layer: 455 | yield from self.walk_breadth_first(new_layer) 456 | 457 | def print_tree(self): 458 | """ 459 | prints ascii representation of tree below this node 460 | """ 461 | for node in self.walk_depth_first(only_leaves=False): 462 | print('--' * node.depth + str(node)) 463 | 464 | def predict_row_proba(self, row): 465 | """ 466 | Predicts class probabilities for input row by walking the tree 467 | and returning the leaf node class probabilities 468 | 469 | Parameters: 470 | ---------- 471 | row: numpy.ndarray 472 | Input row, shape (n features,) 473 | 474 | Returns: 475 | ------- 476 | numpy.ndarray: 477 | Class probabilities, shape (n classes, ) 478 | """ 479 | if self.is_leaf: 480 | group_size = self.value.sum() 481 | class_probs = self.value / group_size 482 | return class_probs 483 | elif row[self.best_feature_index] <= self.best_feature_split_val: 484 | return self.left.predict_row_proba(row) 485 | else: 486 | return self.right.predict_row_proba(row) 487 | 488 | def predict_proba(self, data): 489 | """Predicts class probabilities for input data 490 | 491 | Predicts class probabilities for each row in data by walking the 492 | tree and returning the leaf node class probabilities 493 | 494 | Parameters: 495 | ---------- 496 | data: numpy.ndarray 497 | The input data with shape (m samples, n features) 498 | 499 | Returns: 500 | ------- 501 | numpy.ndarray: 502 | Predicted sample class probabilities, 503 | shape (m samples, n classes) 504 | """ 505 | if not self.is_classifier: 506 | raise Exception('Not a classifier') 507 | if len(data.shape) == 2: 508 | return np.stack([self.predict_row_proba(row) 509 | for row in data]) 510 | else: 511 | return self.predict_row_proba(data) 512 | 513 | def predict_regressor_row(self, row): 514 | """ 515 | Predicts target value for input row by walking the tree 516 | and returning the leaf node value 517 | 518 | Parameters: 519 | ---------- 520 | row: numpy.ndarray 521 | Input row, shape (n features,) 522 | 523 | Returns: 524 | ------- 525 | float: 526 | Predicted target value 527 | """ 528 | if self.is_leaf: 529 | return self.value 530 | elif row[self.best_feature_index] <= self.best_feature_split_val: 531 | return self.left.predict_regressor_row(row) 532 | else: 533 | return self.right.predict_regressor_row(row) 534 | 535 | def predict_regressor(self, data): 536 | """ 537 | Predicts target values for each row in data by walking the 538 | tree and returning the leaf node values 539 | 540 | Parameters: 541 | ---------- 542 | data: numpy.ndarray 543 | The input data with shape (m samples, n features) 544 | 545 | Returns: 546 | ------- 547 | numpy.ndarray: 548 | Predicted target values, shape (m samples, 1) 549 | """ 550 | if len(data.shape) == 2: 551 | return np.stack([self.predict_regressor_row(row) 552 | for row in data]) 553 | else: 554 | return self.predict_regressor_row(data) 555 | 556 | def predict(self, data): 557 | """Predicts target values or class labels for classification 558 | 559 | 
Predicts target values/class for each row in data by walking the 560 | tree and returning the leaf node value for regression or the 561 | class with the largest predicted probability for classification 562 | 563 | Parameters: 564 | ---------- 565 | data: numpy.ndarray 566 | The input data with shape (m samples, n features) 567 | 568 | Returns: 569 | ------- 570 | numpy.ndarray: 571 | Predicted target values or class labels for classification 572 | """ 573 | if self.is_classifier: 574 | return np.argmax(self.predict_proba(data), axis=-1) 575 | else: 576 | return self.predict_regressor(data) 577 | 578 | def dot(self, 579 | feature_names, 580 | samples=True, 581 | impurity=True, 582 | value=True): 583 | """ 584 | Returns Digraph visualizing the tree below this node 585 | 586 | Parameters: 587 | ---------- 588 | feature_names: list[str] 589 | List of feature names 590 | samples: bool, optional, default True 591 | Whether to display the number of samples on this node 592 | impurity: bool, optional, default True 593 | Whether to display the impurity value on this node 594 | value: bool, optional, default True 595 | Whether to dispaly the value on this node 596 | 597 | Returns: 598 | ------- 599 | graphviz.Digraph: 600 | dot for tree diagram visual 601 | """ 602 | dot = Digraph( 603 | comment='Decsion Tree', 604 | node_attr=dict(shape="rectangle", 605 | style="rounded", 606 | fillcolor="#028d35")) 607 | for i, node in enumerate(self.walk_breadth_first()): 608 | label = "" 609 | if not node.is_leaf: 610 | label += ( 611 | f'{feature_names[node.best_feature_index]} <= ' 612 | f'{node.best_feature_split_val}\n') 613 | dot.edge(node.id, node.left.id) 614 | dot.edge(node.id, node.right.id) 615 | if samples: 616 | label += f'Samples = {node.data_shape[0]}\n' 617 | if impurity: 618 | label += f'Impurity = {node.node_impurity:.2f}\n' 619 | if value: 620 | if self.is_classifier: 621 | label += f'Class counts = {str(node.value)}\n' 622 | else: 623 | label += f'Average y = {node.value:.2f}\n' 624 | dot.node(name=node.id, label=label) 625 | return dot 626 | -------------------------------------------------------------------------------- /notebooks/linear_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Machine Learning Implementation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "heading_collapsed": true 14 | }, 15 | "source": [ 16 | "## Imports" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": { 23 | "hidden": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import json\n", 28 | "\n", 29 | "import numpy as np\n", 30 | "import pandas as pd\n", 31 | "import plotly.offline as py\n", 32 | "from plotly import graph_objects as go" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "heading_collapsed": true 39 | }, 40 | "source": [ 41 | "## Linear regression" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": { 47 | "heading_collapsed": true, 48 | "hidden": true 49 | }, 50 | "source": [ 51 | "### The maths" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "hidden": true 58 | }, 59 | "source": [ 60 | "The linear model (or line of best fit in 2D) aims to describe the continuous y vairable a.k.a the target variable (e.g. house prices) as a linear combination of features (e.g. 
square footage / number of bedrooms) the features are also refered to as the design matrix." 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "hidden": true 67 | }, 68 | "source": [ 69 | "$$\n", 70 | "\\begin{align}\n", 71 | "\\hat{y}&=\\beta_0x_0+\\cdots+\\beta_nx_n\\quad &n\\in \\mathbb{N}, x_o = 1 \\\\\n", 72 | "\\hat{y}&=\\sum^{n}_{i=0}\\beta_ix_i \\\\\n", 73 | "\\hat{y}&=\\mathbf{\\boldsymbol{\\beta}^Tx}\\quad&\\boldsymbol{\\beta},\\mathbf{x}\\in\\mathbb{R}^{(n+1)\\times1}\\\\\n", 74 | "\\hat{y}&=g(\\boldsymbol{\\beta}^T\\mathbf{x})\n", 75 | "\\end{align}\n", 76 | "$$" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "hidden": true 83 | }, 84 | "source": [ 85 | "where g, the activation function, is the identidy in linear regression \n", 86 | "\n", 87 | "We define the cost function as half of the mean square error:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "hidden": true 94 | }, 95 | "source": [ 96 | "$$\n", 97 | "\\begin{align}\n", 98 | "J(\\boldsymbol{\\beta})\n", 99 | "&= \\frac{1}{2m}\\sum^{m}_{j=1}\\left(\n", 100 | "y^j-\\hat{y}^j\n", 101 | "\\right)^2,\\quad m\\in \\mathbb{N} \\text{ is the number of training samples}\\\\\n", 102 | "&= \\frac{1}{2m}\\sum^{m}_{j=1}\\left(\n", 103 | "y^j-g(\\boldsymbol{\\beta}^T\\mathbf{x}^j)\n", 104 | "\\right)^2\n", 105 | "\\end{align}\n", 106 | "$$" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "hidden": true 113 | }, 114 | "source": [ 115 | "We need to differentiate the cost function i.e. find the gradient" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "hidden": true 122 | }, 123 | "source": [ 124 | "$$\n", 125 | "\\begin{align}\n", 126 | "\\frac{\\partial J}{\\partial\\beta_k}\\left(\\boldsymbol{\\beta}\\right) &= \\frac{\\partial}{\\partial\\beta_k}\\left(\n", 127 | "\\frac{1}{2m}\\sum^{m}_{j=1}\\left(\n", 128 | "y^j-g(\\boldsymbol{\\beta}^T\\mathbf{x}^j)\\right)^2\n", 129 | "\\right)\\\\\n", 130 | "&= \\frac{\\partial}{\\partial\\beta_k}\\left(\n", 131 | "\\frac{1}{2m}\\sum^{m}_{j=1}\n", 132 | "\\left(\n", 133 | "y^j-\\sum^{n}_{i=0}\\beta_ix_i^j\n", 134 | "\\right)^2\n", 135 | "\\right)\\\\\n", 136 | "&=\n", 137 | "\\frac{1}{m}\\sum^{m}_{j=1}\n", 138 | "\\left(\n", 139 | "y^j-\\sum^{n}_{i=0}\\beta_ix_i^j\n", 140 | "\\right)(-x^j_k)\\\\\n", 141 | "\\end{align}\n", 142 | "$$" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": { 148 | "hidden": true 149 | }, 150 | "source": [ 151 | "hence" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "hidden": true 158 | }, 159 | "source": [ 160 | "$$\n", 161 | "\\nabla_{\\boldsymbol{\\beta}} J\n", 162 | "=\n", 163 | "\\begin{bmatrix}\n", 164 | " \\frac{\\partial J}{\\partial\\beta_1} \\\\\n", 165 | " \\vdots \\\\\n", 166 | " \\frac{\\partial J}{\\partial\\beta_n}\n", 167 | "\\end{bmatrix}\n", 168 | "=\n", 169 | "\\begin{bmatrix}\n", 170 | " -\\frac{1}{m}\\sum^{m}_{j=1}\n", 171 | " \\left(y^j-\\sum^{n}_{i=0}\\beta_ix_i^j\\right)x^j_1\\\\\n", 172 | " \\vdots \\\\\n", 173 | " -\\frac{1}{m}\\sum^{m}_{j=1}\n", 174 | " \\left(y^j-\\sum^{n}_{i=0}\\beta_ix_i^j\\right)x^j_n\\\\\n", 175 | "\\end{bmatrix}\n", 176 | "$$" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": { 182 | "hidden": true 183 | }, 184 | "source": [ 185 | "Define the design matrix and column representation of y. 
Here each row of X and y are training examples hence there are m rows" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "hidden": true 192 | }, 193 | "source": [ 194 | "$$\n", 195 | "\\mathbf{X}\\in\\mathbb{R}^{m\\times (n+1)},\n", 196 | "\\quad \\mathbf{y}\\in\\mathbb{R}^{m\\times 1},\n", 197 | "\\quad \\boldsymbol{\\beta}\\in\\mathbb{R}^{(n+1)\\times1}\n", 198 | "$$" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "hidden": true 205 | }, 206 | "source": [ 207 | "$$\n", 208 | "\\mathbf{X}=\\begin{bmatrix}\n", 209 | " 1 & x_1^1 & x_2^1 & \\dots & x_n^1 \\\\\n", 210 | " 1 & x_1^2 & x_2^2 & \\dots & x_n^2 \\\\\n", 211 | " \\vdots & \\vdots & \\vdots & \\ddots & \\vdots \\\\\n", 212 | " 1 & x_1^m & x_2^m & \\dots & x_n^m \\\\\n", 213 | "\\end{bmatrix}\\quad\n", 214 | "\\mathbf{y}=\\begin{bmatrix}\n", 215 | " y_1\\\\y_2\\\\\\vdots\\\\y_m\n", 216 | "\\end{bmatrix}\\quad\n", 217 | "\\boldsymbol{\\beta} = \\begin{bmatrix}\n", 218 | " \\beta_0\\\\\\beta_1\\\\\\vdots\\\\\\beta_n\n", 219 | "\\end{bmatrix}\n", 220 | "$$" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "hidden": true 227 | }, 228 | "source": [ 229 | "$$\n", 230 | "\\begin{align}\n", 231 | "\\nabla_{\\boldsymbol{\\beta}} J\n", 232 | "&=\n", 233 | "\\begin{bmatrix}\n", 234 | " -\\frac{1}{m}\\sum^{m}_{j=1}\n", 235 | " \\left(y^j-\\sum^{n}_{i=0}\\beta_ix_i^j\\right)x^j_1\\\\\n", 236 | " \\vdots \\\\\n", 237 | " -\\frac{1}{m}\\sum^{m}_{j=1}\n", 238 | " \\left(y^j-\\sum^{n}_{i=0}\\beta_ix_i^j\\right)x^j_n\\\\\n", 239 | "\\end{bmatrix}\n", 240 | "=-\\frac{1}{m}\n", 241 | "\\begin{bmatrix}\n", 242 | " \\sum^{m}_{j=1}y^jx^j_1\\\\\n", 243 | " \\vdots \\\\\n", 244 | " \\sum^{m}_{j=1}y^jx^j_n\\\\\n", 245 | "\\end{bmatrix}+\n", 246 | "\\frac{1}{m}\n", 247 | "\\begin{bmatrix}\n", 248 | " \\sum^{m}_{j=1}\\sum^{n}_{i=0}\\beta_ix_i^jx^j_1\\\\\n", 249 | " \\vdots \\\\\n", 250 | " \\sum^{m}_{j=1}\\sum^{n}_{i=0}\\beta_ix_i^jx^j_n\n", 251 | "\\end{bmatrix}\\\\\n", 252 | "\\end{align}\n", 253 | "$$" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "hidden": true 260 | }, 261 | "source": [ 262 | "so" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": { 268 | "hidden": true 269 | }, 270 | "source": [ 271 | "$$\n", 272 | "\\begin{align}\n", 273 | "\\nabla_{\\boldsymbol{\\beta}} J\n", 274 | "&=\\frac{1}{m}\\left(\n", 275 | "\\mathbf{X}^T\\mathbf{X}\\mathbf{\\boldsymbol{\\beta}}-\\mathbf{X}^T\\mathbf{y}\n", 276 | "\\right)\\\\\n", 277 | "&=\\frac{1}{m}\\mathbf{X}^T\\left(\n", 278 | "\\mathbf{X}\\mathbf{\\boldsymbol{\\beta}}-\\mathbf{y}\n", 279 | "\\right)\\\\\n", 280 | "&=\\frac{1}{m}\\mathbf{X}^T\\left(\n", 281 | "\\mathbf{\\hat{y}}-\\mathbf{y}\n", 282 | "\\right)\n", 283 | "\\end{align}\n", 284 | "$$" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "hidden": true 291 | }, 292 | "source": [ 293 | "where" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "hidden": true 300 | }, 301 | "source": [ 302 | "$$\n", 303 | "\\mathbf{\\hat{y}} = \\mathbf{X}\\mathbf{\\boldsymbol{\\beta}}\n", 304 | "$$" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": { 310 | "hidden": true 311 | }, 312 | "source": [ 313 | "We could have derived the same thing using matrix calculus - noting the following:" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": { 319 | "hidden": true 320 | }, 321 | "source": [ 322 | "$$\n", 323 
| "\\begin{align}\n", 324 | "J(\\boldsymbol{\\beta}) &= \\frac{1}{2m}\\sum^{m}_{j=1}\\left(\n", 325 | "y^j-g(\\boldsymbol{\\beta}^T\\mathbf{x}^j)\n", 326 | "\\right)^2\\\\\n", 327 | "&= \\frac{1}{2m}\\left(\n", 328 | "\\mathbf{y}-\\mathbf{\\hat{y}}\n", 329 | "\\right)^T\n", 330 | "\\left(\n", 331 | "\\mathbf{y}-\\mathbf{\\hat{y}}\n", 332 | "\\right)\\\\\n", 333 | "&= \\frac{1}{2m}\\left(\n", 334 | "\\mathbf{y}-\\mathbf{X}\\boldsymbol{\\beta}\n", 335 | "\\right)^T\n", 336 | "\\left(\n", 337 | "\\mathbf{y}-\\mathbf{X}\\boldsymbol{\\beta}\n", 338 | "\\right)\\\\\n", 339 | "&= \\frac{1}{2m}\\left(\n", 340 | "\\mathbf{y}^T\\mathbf{y}\n", 341 | "-\\boldsymbol{\\beta}^T\\mathbf{X}^T\\mathbf{y}\n", 342 | "-\\mathbf{y}^T\\mathbf{X}\\boldsymbol{\\beta}\n", 343 | "+\\boldsymbol{\\beta}^T\\mathbf{X}^T\\mathbf{X}\\boldsymbol{\\beta}\n", 344 | "\\right)\\\\\n", 345 | "\\end{align}\n", 346 | "$$" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": { 352 | "hidden": true 353 | }, 354 | "source": [ 355 | "and" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": { 361 | "hidden": true 362 | }, 363 | "source": [ 364 | "$$\n", 365 | "\\frac{\\partial}{\\partial\\mathbf{\\boldsymbol{\\beta}}}\n", 366 | "\\left(\n", 367 | "A^T\\boldsymbol{\\beta}\n", 368 | "\\right) = A,\\quad \\forall A\\in\\mathbb{R}^{(n+1)\\times1}\\\\\n", 369 | "$$" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": { 375 | "hidden": true 376 | }, 377 | "source": [ 378 | "and" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": { 384 | "hidden": true 385 | }, 386 | "source": [ 387 | "$$\n", 388 | "\\frac{\\partial}{\\partial\\mathbf{\\boldsymbol{\\beta}}}\n", 389 | "\\left(\n", 390 | "\\boldsymbol{\\beta}^TA\\boldsymbol{\\beta}\n", 391 | "\\right) = 2A\\boldsymbol{\\beta},\\quad \\forall A\\in\\mathbb{R}^{m\\times (n+1)}\\\\\n", 392 | "$$" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": { 398 | "hidden": true 399 | }, 400 | "source": [ 401 | "so" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": { 407 | "hidden": true 408 | }, 409 | "source": [ 410 | "$$\n", 411 | "\\nabla_{\\boldsymbol{\\beta}}J=\\frac{1}{m}\\left(\n", 412 | "\\mathbf{X}^T\\mathbf{X}\\mathbf{\\boldsymbol{\\beta}}-\\mathbf{X}^T\\mathbf{y}\n", 413 | "\\right)$$" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": { 419 | "heading_collapsed": true, 420 | "hidden": true 421 | }, 422 | "source": [ 423 | "### Make fake data" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 4, 429 | "metadata": { 430 | "hidden": true 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "m = 100\n", 435 | "x0 = np.ones(shape=(m, 1))\n", 436 | "x1 = np.linspace(0, 10, m).reshape(-1, 1)\n", 437 | "X = np.column_stack((x0, x1))\n", 438 | "\n", 439 | "# let y = 0.5 * x + 1 + epsilon\n", 440 | "epsilon = np.random.normal(scale=0.5, size=(m, 1))\n", 441 | "y = x1 + 1 + epsilon" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 5, 447 | "metadata": { 448 | "hidden": true 449 | }, 450 | "outputs": [ 451 | { 452 | "data": { 453 | "application/vnd.jupyter.widget-view+json": { 454 | "model_id": "ffbc482ea5db405283d37d93d5d30c7f", 455 | "version_major": 2, 456 | "version_minor": 0 457 | }, 458 | "text/plain": [ 459 | "FigureWidget({\n", 460 | " 'data': [{'mode': 'markers',\n", 461 | " 'name': 'linear data + noise',\n", 462 | " 'ty…" 463 | ] 464 | }, 465 | "metadata": {}, 466 | "output_type": 
"display_data" 467 | } 468 | ], 469 | "source": [ 470 | "fig = go.FigureWidget()\n", 471 | "fig = fig.add_scatter(\n", 472 | " x=X[:,1],\n", 473 | " y=y[:,0],\n", 474 | " mode='markers',\n", 475 | " name='linear data + noise')\n", 476 | "fig.layout.title = 'Fake linear data with noise'\n", 477 | "fig.layout.xaxis.title = 'x1'\n", 478 | "fig.layout.yaxis.title = 'y'\n", 479 | "fig" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": { 485 | "heading_collapsed": true, 486 | "hidden": true 487 | }, 488 | "source": [ 489 | "### Linear regression class" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 6, 495 | "metadata": { 496 | "hidden": true 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "class LinearRegression():\n", 501 | "\n", 502 | " def __init__(self, learning_rate=0.05):\n", 503 | " \"\"\" \n", 504 | " Linear regression model\n", 505 | "\n", 506 | " Parameters:\n", 507 | " ----------\n", 508 | " learning_rate: float, optional, default 0.05\n", 509 | " The learning rate parameter controlling the gradient descent\n", 510 | " step size\n", 511 | " \"\"\"\n", 512 | " self.learning_rate = learning_rate\n", 513 | " print('Creating linear model instance')\n", 514 | "\n", 515 | " def __repr__(self):\n", 516 | " return (\n", 517 | " f'')\n", 519 | " \n", 520 | "\n", 521 | " \n", 522 | " def fit(self, X, y, n_iter=1000):\n", 523 | " \"\"\" \n", 524 | " Fit the linear regression model\n", 525 | "\n", 526 | " Updates the weights with n_iter iterations of batch gradient\n", 527 | " descent updates\n", 528 | "\n", 529 | " Parameters:\n", 530 | " ----------\n", 531 | " X: numpy.ndarray\n", 532 | " Training data, shape (m samples, (n - 1) features + 1)\n", 533 | " Note the first column of X is expected to be ones (to allow \n", 534 | " for the bias to be included in beta)\n", 535 | " y: numpy.ndarray\n", 536 | " Target values, shape (m samples, 1)\n", 537 | " n_iter: int, optional, default 1000\n", 538 | " Number of batch gradient descent steps\n", 539 | " \"\"\" \n", 540 | " m, n = X.shape\n", 541 | " print(f'fitting with m={m} samples with n={n-1} features\\n')\n", 542 | " self.beta = np.zeros(shape=(n, 1))\n", 543 | " self.costs = []\n", 544 | " self.betas = [self.beta]\n", 545 | " for iteration in range(n_iter):\n", 546 | " y_pred = self.predict(X)\n", 547 | " cost = self.cost(y, y_pred)\n", 548 | " self.costs.append(cost[0][0])\n", 549 | " gradient = self.gradient(y, y_pred, X)\n", 550 | " self.beta = self.beta - (\n", 551 | " self.learning_rate * gradient)\n", 552 | " self.betas.append(self.beta)\n", 553 | "\n", 554 | " def cost(self, y, y_pred):\n", 555 | " \"\"\" \n", 556 | " Mean square error cost function\n", 557 | "\n", 558 | " Parameters:\n", 559 | " ----------\n", 560 | " y: numpy.ndarray\n", 561 | " True target values, shape (m samples, 1)\n", 562 | " y_pred: numpy.ndarray\n", 563 | " Predicted y values, shape (m samples, 1)\n", 564 | "\n", 565 | " Returns:\n", 566 | " -------\n", 567 | " float:\n", 568 | " mean square error value\n", 569 | " \"\"\"\n", 570 | " m = y.shape[0]\n", 571 | " cost = (1 / (2 * m)) * (y - y_pred).T @ (y - y_pred)\n", 572 | " return cost\n", 573 | "\n", 574 | " def gradient(self, y, y_pred, X):\n", 575 | " \"\"\" \n", 576 | " Calculates the gradient of the cost function\n", 577 | "\n", 578 | " Parameters:\n", 579 | " ----------\n", 580 | " y: numpy.ndarray\n", 581 | " Predicted y values, shape (m samples, 1)\n", 582 | " y_pred: numpy.ndarray\n", 583 | " True target values, shape (m samples, 1)\n", 584 | " 
X: numpy.ndarray\n", 585 | " Training data, shape (m samples, (n - 1) features + 1)\n", 586 | " Note the first column of X is expected to be ones (to allow \n", 587 | " for the bias to be included in beta)\n", 588 | "\n", 589 | " Returns:\n", 590 | " -------\n", 591 | " numpy.ndarray:\n", 592 | " Derivate of mean square error cost function with respect to\n", 593 | " the weights beta, shape (n features, 1)\n", 594 | " \"\"\"\n", 595 | " m = X.shape[0]\n", 596 | " gradient = (1 / m) * X.T @ (y_pred - y)\n", 597 | " return gradient\n", 598 | "\n", 599 | " def predict(self, X):\n", 600 | " \"\"\" \n", 601 | " Predict the target values from sample X feature values\n", 602 | "\n", 603 | " Parameters:\n", 604 | " ----------\n", 605 | " X: numpy.ndarray\n", 606 | " Training data, shape (m samples, (n - 1) features + 1)\n", 607 | " Note the first column of X is expected to be ones (to allow \n", 608 | " for the bias to be included in beta)\n", 609 | "\n", 610 | " Returns:\n", 611 | " -------\n", 612 | " numpy.ndarray:\n", 613 | " Target value predictions, shape (m samples, 1)\n", 614 | " \"\"\" \n", 615 | " y_pred = X @ self.beta\n", 616 | " return y_pred\n" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 7, 622 | "metadata": { 623 | "hidden": true 624 | }, 625 | "outputs": [ 626 | { 627 | "name": "stdout", 628 | "output_type": "stream", 629 | "text": [ 630 | "Creating linear model instance\n" 631 | ] 632 | }, 633 | { 634 | "data": { 635 | "text/plain": [ 636 | "" 637 | ] 638 | }, 639 | "execution_count": 7, 640 | "metadata": {}, 641 | "output_type": "execute_result" 642 | } 643 | ], 644 | "source": [ 645 | "linear_regression = LinearRegression()\n", 646 | "linear_regression" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": 8, 652 | "metadata": { 653 | "hidden": true 654 | }, 655 | "outputs": [ 656 | { 657 | "name": "stdout", 658 | "output_type": "stream", 659 | "text": [ 660 | "fitting with m=100 samples with n=1 features\n", 661 | "\n" 662 | ] 663 | } 664 | ], 665 | "source": [ 666 | "linear_regression.fit(X, y)" 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": { 672 | "heading_collapsed": true, 673 | "hidden": true 674 | }, 675 | "source": [ 676 | "### Plot the best fit" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": 10, 682 | "metadata": { 683 | "hidden": true 684 | }, 685 | "outputs": [ 686 | { 687 | "data": { 688 | "application/vnd.jupyter.widget-view+json": { 689 | "model_id": "ffbc482ea5db405283d37d93d5d30c7f", 690 | "version_major": 2, 691 | "version_minor": 0 692 | }, 693 | "text/plain": [ 694 | "FigureWidget({\n", 695 | " 'data': [{'mode': 'markers',\n", 696 | " 'name': 'linear data + noise',\n", 697 | " 'ty…" 698 | ] 699 | }, 700 | "metadata": {}, 701 | "output_type": "display_data" 702 | } 703 | ], 704 | "source": [ 705 | "fig = fig.add_scatter(\n", 706 | " x=X[:,1], \n", 707 | " y=linear_regression.predict(X)[:,0],\n", 708 | " mode='markers',\n", 709 | " name='best fit')\n", 710 | "fig" 711 | ] 712 | }, 713 | { 714 | "cell_type": "markdown", 715 | "metadata": { 716 | "heading_collapsed": true, 717 | "hidden": true 718 | }, 719 | "source": [ 720 | "### Plot the cost function" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 11, 726 | "metadata": { 727 | "hidden": true 728 | }, 729 | "outputs": [], 730 | "source": [ 731 | "def plot_surface(linear_regression):\n", 732 | " cost_fig = go.FigureWidget()\n", 733 | " cost_fig = cost_fig.add_scatter(\n", 
734 | " x=list(range(len(linear_regression.costs))),\n", 735 | " y=linear_regression.costs,\n", 736 | " mode='markers+lines')\n", 737 | " cost_fig.layout.title = 'Cost by iteration'\n", 738 | " return cost_fig" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 12, 744 | "metadata": { 745 | "hidden": true 746 | }, 747 | "outputs": [ 748 | { 749 | "data": { 750 | "application/vnd.jupyter.widget-view+json": { 751 | "model_id": "c6743fff09c14f8cbdbb792626b6b17d", 752 | "version_major": 2, 753 | "version_minor": 0 754 | }, 755 | "text/plain": [ 756 | "FigureWidget({\n", 757 | " 'data': [{'mode': 'markers+lines',\n", 758 | " 'type': 'scatter',\n", 759 | " 'uid': 'd…" 760 | ] 761 | }, 762 | "metadata": {}, 763 | "output_type": "display_data" 764 | } 765 | ], 766 | "source": [ 767 | "cost_fig = plot_surface(linear_regression)\n", 768 | "cost_fig" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": 14, 774 | "metadata": { 775 | "hidden": true 776 | }, 777 | "outputs": [], 778 | "source": [ 779 | "def plot_surface(linear_regression):\n", 780 | " beta0s = [beta[0][0] for beta in linear_regression.betas]\n", 781 | " beta1s = [beta[1][0] for beta in linear_regression.betas]\n", 782 | " beta0_max = max(map(abs, beta0s)) * 1.05\n", 783 | " beta1_max = max(map(abs, beta1s)) * 1.05\n", 784 | "\n", 785 | " gradient_descent_fig = go.FigureWidget()\n", 786 | " gradient_descent_fig = gradient_descent_fig.add_scatter3d(\n", 787 | " x=beta0s,\n", 788 | " y=beta1s,\n", 789 | " z=linear_regression.costs,\n", 790 | " mode='markers+lines',\n", 791 | " marker={'size':3, 'color':'red'})\n", 792 | "\n", 793 | " beta0, beta1 = np.meshgrid(\n", 794 | " np.linspace(-beta0_max, beta0_max, 100),\n", 795 | " np.linspace(-beta1_max, beta1_max, 100))\n", 796 | "\n", 797 | " z = np.diag(\n", 798 | " (1 / (2 * m)) * \\\n", 799 | " (y - (X @ np.column_stack((beta0.ravel(), beta1.ravel())).T)).T @ \\\n", 800 | " (y - (X @ np.column_stack((beta0.ravel(), beta1.ravel())).T))\n", 801 | " ).reshape(beta1.shape)\n", 802 | "\n", 803 | " gradient_descent_fig = gradient_descent_fig.add_surface(\n", 804 | " x=beta0,\n", 805 | " y=beta1,\n", 806 | " z=z,\n", 807 | " opacity=0.8)\n", 808 | " \n", 809 | " gradient_descent_fig.layout.title = 'Cost function surface'\n", 810 | " gradient_descent_fig.layout.scene.xaxis.title = 'beta_0'\n", 811 | " gradient_descent_fig.layout.scene.yaxis.title = 'beta_1'\n", 812 | " gradient_descent_fig.layout.scene.zaxis.title = 'cost' \n", 813 | " # cost = average sum square residuals\n", 814 | " gradient_descent_fig.layout.height = 500\n", 815 | " return gradient_descent_fig" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": 15, 821 | "metadata": { 822 | "hidden": true, 823 | "scrolled": false 824 | }, 825 | "outputs": [ 826 | { 827 | "data": { 828 | "application/vnd.jupyter.widget-view+json": { 829 | "model_id": "2545400b812747cdb6a02def35e944b7", 830 | "version_major": 2, 831 | "version_minor": 0 832 | }, 833 | "text/plain": [ 834 | "FigureWidget({\n", 835 | " 'data': [{'marker': {'color': 'red', 'size': 3},\n", 836 | " 'mode': 'markers+lines',\n", 837 | " …" 838 | ] 839 | }, 840 | "metadata": {}, 841 | "output_type": "display_data" 842 | } 843 | ], 844 | "source": [ 845 | "gradient_descent_fig = plot_surface(linear_regression)\n", 846 | "gradient_descent_fig" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": 16, 852 | "metadata": { 853 | "hidden": true 854 | }, 855 | "outputs": [], 856 | "source": [ 857 
| "# py.plot(gradient_descent_fig, filename='gradient_descent.html')" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": { 863 | "heading_collapsed": true 864 | }, 865 | "source": [ 866 | "## End" 867 | ] 868 | } 869 | ], 870 | "metadata": { 871 | "kernelspec": { 872 | "display_name": "Python 3", 873 | "language": "python", 874 | "name": "python3" 875 | }, 876 | "language_info": { 877 | "codemirror_mode": { 878 | "name": "ipython", 879 | "version": 3 880 | }, 881 | "file_extension": ".py", 882 | "mimetype": "text/x-python", 883 | "name": "python", 884 | "nbconvert_exporter": "python", 885 | "pygments_lexer": "ipython3", 886 | "version": "3.7.6" 887 | }, 888 | "toc": { 889 | "base_numbering": 1, 890 | "nav_menu": {}, 891 | "number_sections": true, 892 | "sideBar": true, 893 | "skip_h1_title": false, 894 | "title_cell": "Table of Contents", 895 | "title_sidebar": "Contents", 896 | "toc_cell": false, 897 | "toc_position": {}, 898 | "toc_section_display": true, 899 | "toc_window_display": false 900 | } 901 | }, 902 | "nbformat": 4, 903 | "nbformat_minor": 2 904 | } 905 | -------------------------------------------------------------------------------- /notebooks/logistic_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Machine Learning Implementation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "heading_collapsed": true 14 | }, 15 | "source": [ 16 | "## Imports" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": { 23 | "hidden": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import json\n", 28 | "\n", 29 | "import numpy as np\n", 30 | "import pandas as pd\n", 31 | "import plotly.offline as py\n", 32 | "from plotly import graph_objects as go" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "heading_collapsed": true 39 | }, 40 | "source": [ 41 | "## Logistic regression" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": { 47 | "heading_collapsed": true, 48 | "hidden": true 49 | }, 50 | "source": [ 51 | "### The maths" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "hidden": true 58 | }, 59 | "source": [ 60 | "The logistic model aims to predict the discrete y variable a.k.a the target variable (e.g. whether something will happen) based on a collection of features. It does this by transforming a linear combination of the features into a curve and fitting this curve to the data." 
61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "hidden": true 67 | }, 68 | "source": [ 69 | "The curve used in logistic regression is the sigmoid function" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "hidden": true 76 | }, 77 | "source": [ 78 | "$$\n", 79 | "\\sigma(x) = \\frac{1}{1+e^{-x}}\n", 80 | "$$" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "hidden": true 87 | }, 88 | "source": [ 89 | "Define y as" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "hidden": true 96 | }, 97 | "source": [ 98 | "$$\n", 99 | "\\begin{align}\n", 100 | "\\hat{y} &= h_{\\boldsymbol{\\beta}}(\\mathbf{x})\\\\\n", 101 | "\\hat{y}&= \\sigma\\left(\\beta_0x_0+\\cdots+\\beta_nx_n\\right)\\quad &n\\in \\mathbb{N},x_0=1 \\\\\n", 102 | "\\hat{y}&=\\sigma\\left(\\sum^{n}_{i=0}\\beta_ix_i\\right) \\\\\n", 103 | "\\hat{y}&=\\sigma\\left(\\mathbf{\\boldsymbol{\\beta}^Tx}\\right)\\quad&\\boldsymbol{\\beta},\\mathbf{x}\\in\\mathbb{R}^{n\\times1}\\\\\n", 104 | "\\hat{y}&=\\sigma\\left(\\boldsymbol{\\beta}^T\\mathbf{x}\\right)\n", 105 | "\\end{align}\n", 106 | "$$" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "hidden": true 113 | }, 114 | "source": [ 115 | "notice" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "hidden": true 122 | }, 123 | "source": [ 124 | "$$\n", 125 | "\\hat{y} = \\frac{1}{1+e^{-\\boldsymbol{\\beta}^T\\mathbf{x}}}\n", 126 | "$$" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": { 132 | "hidden": true 133 | }, 134 | "source": [ 135 | "so" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "hidden": true 142 | }, 143 | "source": [ 144 | "$$\n", 145 | "\\begin{align}\n", 146 | "\\hat{y} + \\hat{y}e^{-\\boldsymbol{\\beta}^T\\mathbf{x}} &= 1\\\\\n", 147 | "\\hat{y}e^{-\\boldsymbol{\\beta}^T\\mathbf{x}} &= 1 - \\hat{y}\\\\\n", 148 | "\\frac{\\hat{y}}{1 - \\hat{y}} &= e^{\\boldsymbol{\\beta}^T\\mathbf{x}}\\\\\n", 149 | "\\ln\\left(\\frac{\\hat{y}}{1 - \\hat{y}}\\right)&=\\boldsymbol{\\beta}^T\\mathbf{x}\n", 150 | "\\end{align}\n", 151 | "$$" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "hidden": true 158 | }, 159 | "source": [ 160 | "This above is the logit form of logistic regression. 
We model the logit as a linear combination of the x variables" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "hidden": true 167 | }, 168 | "source": [ 169 | "We define the cost function as follows for each y and corresponding x" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": { 175 | "hidden": true 176 | }, 177 | "source": [ 178 | "$$\n", 179 | "\\begin{align}\n", 180 | "J(\\mathbf{x})\n", 181 | "&= \\begin{cases}\n", 182 | "-\\log\\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x})\\right) &\\text{if y=1}\\\\\n", 183 | "-\\log\\left(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x})\\right) &\\text{if y=0}\\\\\n", 184 | "\\end{cases}\n", 185 | "\\end{align}\n", 186 | "$$" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": { 192 | "hidden": true 193 | }, 194 | "source": [ 195 | "$$\n", 196 | "\\begin{align}\n", 197 | "J(\\mathbf{x})\n", 198 | "&= -\\frac{1}{m}\\sum_{j=1}^my^j\\log\\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)\\right)\n", 199 | "+(1-y^j)\\log\\left(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)\\right)\\\\\n", 200 | "&= -\\frac{1}{m}\\sum_{j=1}^my^j\\log\\left(\\frac{1}{1+e^{-\\boldsymbol{\\beta}^T\\mathbf{x}}}\\right)\n", 201 | "+(1-y^j)\\log\\left(1-\\frac{1}{1+e^{-\\boldsymbol{\\beta}^T\\mathbf{x}}}\\right)\\\\\n", 202 | "&= -\\frac{1}{m}\\sum_{j=1}^my^j\\log\\left(\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\\right)\n", 203 | "+(1-y^j)\\log\\left(1-\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\\right)\n", 204 | "\\end{align}\n", 205 | "$$" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": { 211 | "hidden": true 212 | }, 213 | "source": [ 214 | "note" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": { 220 | "hidden": true 221 | }, 222 | "source": [ 223 | "$$\n", 224 | "\\begin{align}\n", 225 | "h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)&=\\frac{1}{1+e^{-\\boldsymbol{\\beta}^T\\mathbf{x}^j}}\\\\\n", 226 | "&=\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 227 | "\\end{align}\n", 228 | "$$" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": { 234 | "hidden": true 235 | }, 236 | "source": [ 237 | "so" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": { 243 | "hidden": true 244 | }, 245 | "source": [ 246 | "$$\n", 247 | "\\begin{align}\n", 248 | "\\frac{\\partial h}{\\partial \\beta_k} &= -\\left(1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}\\right)^{-2}e^{-\\sum^{n}_{i=0}\\beta_ix_i} (-x_k^j)\\\\\n", 249 | "&=\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 250 | "\\frac{-e^{-\\sum^{n}_{i=0}\\beta_ix_i} (-x_k^j)}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\\\\\n", 251 | "&=\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 252 | "\\frac{(1-1-e^{-\\sum^{n}_{i=0}\\beta_ix_i})(-x_k^j)}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\\\\\n", 253 | "&=\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 254 | "\\left(\n", 255 | "\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}-\n", 256 | "\\frac{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 257 | "\\right)(-x_k^j)\\\\\n", 258 | "&=\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 259 | "\\left(\n", 260 | "\\frac{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}-\n", 261 | "\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 262 | "\\right)(x_k^j)\\\\\n", 263 | "&=\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 264 | "\\left(\n", 265 | "1-\n", 266 | "\\frac{1}{1+e^{-\\sum^{n}_{i=0}\\beta_ix_i}}\n", 267 | "\\right)(x_k^j)\\\\\n", 268 | 
"&=h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j))x_k^j\n", 269 | "\\end{align}\n", 270 | "$$" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "hidden": true 277 | }, 278 | "source": [ 279 | "We need to differentiate the cost function i.e. find the gradient" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": { 285 | "hidden": true 286 | }, 287 | "source": [ 288 | "$$\n", 289 | "\\begin{align}\n", 290 | "\\frac{\\partial J}{\\partial\\beta_k}\\left(\\boldsymbol{\\beta}\\right) \n", 291 | "&=\\frac{\\partial}{\\partial\\beta_k}\\left(\n", 292 | "-\\frac{1}{m}\\sum_{j=1}^my^j\\log\\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)\\right)\n", 293 | "+(1-y^j)\\log\\left(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)\\right)\n", 294 | "\\right)\\\\\n", 295 | "&=-\\frac{1}{m}\\sum_{j=1}^m\\frac{y^j}{h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)}\\frac{\\partial h}{\\partial \\beta_k}\n", 296 | "+\\frac{-(1-y^j)}{1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)}\\frac{\\partial h}{\\partial \\beta_k}\\\\\n", 297 | "&=-\\frac{1}{m}\\sum_{j=1}^m\\frac{y^j}{h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)}\n", 298 | "h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j))x_k^j\n", 299 | "+\\frac{-(1-y^j)}{1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)}\n", 300 | "h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j))x_k^j\\\\\n", 301 | "&=-\\frac{1}{m}\\sum_{j=1}^my^j(1-h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j))x_k^j\n", 302 | "-(1-y^j)\n", 303 | "h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)x_k^j\\\\\n", 304 | "&=\\frac{1}{m}\\sum_{j=1}^m\n", 305 | "\\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)-y^j\\right)x_k^j\n", 306 | "\\end{align}\n", 307 | "$$" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "hidden": true 314 | }, 315 | "source": [ 316 | "hence" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": { 322 | "hidden": true 323 | }, 324 | "source": [ 325 | "$$\n", 326 | "\\nabla_{\\boldsymbol{\\beta}} J\n", 327 | "=\n", 328 | "\\begin{bmatrix}\n", 329 | " \\frac{\\partial J}{\\partial\\beta_1} \\\\\n", 330 | " \\vdots \\\\\n", 331 | " \\frac{\\partial J}{\\partial\\beta_n}\n", 332 | "\\end{bmatrix}\n", 333 | "=\n", 334 | "\\begin{bmatrix}\n", 335 | " \\frac{1}{m}\\sum_{j=1}^m\n", 336 | " \\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)-y^j\\right)x_1^j\\\\\n", 337 | " \\vdots \\\\\n", 338 | " \\frac{1}{m}\\sum_{j=1}^m\n", 339 | " \\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)-y^j\\right)x_n^j\n", 340 | "\\end{bmatrix}\n", 341 | "$$" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": { 347 | "hidden": true 348 | }, 349 | "source": [ 350 | "Define the design matrix and column representation of y. 
Here each row of X and y corresponds to a training example, hence there are m rows" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": { 356 | "hidden": true 357 | }, 358 | "source": [ 359 | "$$\\mathbf{X}\\in\\mathbb{R}^{m\\times n},\n", 360 | "\\quad \\mathbf{y}\\in\\mathbb{R}^{m\\times 1}\n", 361 | "$$" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "hidden": true 368 | }, 369 | "source": [ 370 | "$$\n", 371 | "\\mathbf{X}=\\begin{bmatrix}\n", 372 | "    \\dots & (\\mathbf{x}^1)^T & \\dots\\\\\n", 373 | "    \\dots & (\\mathbf{x}^2)^T & \\dots\\\\\n", 374 | "    \\dots & \\vdots & \\dots\\\\\n", 375 | "    \\dots & (\\mathbf{x}^m)^T & \\dots\n", 376 | "\\end{bmatrix}\\quad\n", 377 | "\\mathbf{y}=\\begin{bmatrix}\n", 378 | "    y_1\\\\y_2\\\\\\vdots\\\\y_m\n", 379 | "\\end{bmatrix}\n", 380 | "$$" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": { 386 | "hidden": true 387 | }, 388 | "source": [ 389 | "$$\n", 390 | "\\begin{align}\n", 391 | "\\nabla_{\\boldsymbol{\\beta}} J\n", 392 | "=\n", 393 | "\\begin{bmatrix}\n", 394 | "    \\frac{1}{m}\\sum_{j=1}^m\n", 395 | "    \\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)-y^j\\right)x_1^j\\\\\n", 396 | "    \\vdots \\\\\n", 397 | "    \\frac{1}{m}\\sum_{j=1}^m\n", 398 | "    \\left(h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)-y^j\\right)x_n^j\n", 399 | "\\end{bmatrix}\n", 400 | "=\n", 401 | "\\frac{1}{m}\n", 402 | "\\begin{bmatrix}\n", 403 | "    \\sum^{m}_{j=1}h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)x^j_1\\\\\n", 404 | "    \\vdots \\\\\n", 405 | "    \\sum^{m}_{j=1}h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j)x^j_n\n", 406 | "\\end{bmatrix}\n", 407 | "-\n", 408 | "\\frac{1}{m}\n", 409 | "\\begin{bmatrix}\n", 410 | "    \\sum^{m}_{j=1}y^jx^j_1\\\\\n", 411 | "    \\vdots \\\\\n", 412 | "    \\sum^{m}_{j=1}y^jx^j_n\\\\\n", 413 | "\\end{bmatrix}\n", 414 | "\\end{align}\n", 415 | "$$" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "hidden": true 422 | }, 423 | "source": [ 424 | "$$\n", 425 | "h_{\\boldsymbol{\\beta}}(\\mathbf{x}^j) = \\sigma({\\mathbf{x}^j}^T\\boldsymbol{\\beta})\n", 426 | "$$" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "hidden": true 433 | }, 434 | "source": [ 435 | "so" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": { 441 | "hidden": true 442 | }, 443 | "source": [ 444 | "$$\n", 445 | "\\begin{align}\n", 446 | "\\nabla_{\\boldsymbol{\\beta}} J\n", 447 | "&=\\frac{1}{m}\\left(\n", 448 | "\\mathbf{X}^T\\sigma(\\mathbf{X}\\mathbf{\\boldsymbol{\\beta}})-\\mathbf{X}^T\\mathbf{y}\n", 449 | "\\right)\\\\\n", 450 | "&=\\frac{1}{m}\\mathbf{X}^T\\left(\n", 451 | "\\sigma(\\mathbf{X}\\mathbf{\\boldsymbol{\\beta}})-\\mathbf{y}\n", 452 | "\\right)\\\\\n", 453 | "&=\\frac{1}{m}\\mathbf{X}^T\\left(\n", 454 | "\\mathbf{\\hat{y}}-\\mathbf{y}\n", 455 | "\\right)\n", 456 | "\\end{align}\n", 457 | "$$" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": { 463 | "hidden": true 464 | }, 465 | "source": [ 466 | "where" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": { 472 | "hidden": true 473 | }, 474 | "source": [ 475 | "$$\n", 476 | "\\mathbf{\\hat{y}} = \\sigma(\\mathbf{X}\\mathbf{\\boldsymbol{\\beta}})\n", 477 | "$$" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "hidden": true 484 | }, 485 | "source": [ 486 | "We could have derived the same thing using matrix calculus" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": { 492 | 
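A minimal NumPy sketch (added here for illustration, not a notebook cell) of the vectorised gradient just derived, grad J = (1/m) X^T (sigma(X beta) - y); the function and variable names below are assumptions, not part of the original code.

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def logistic_gradient(X, y, beta):
    # grad J = (1/m) X^T (sigmoid(X beta) - y); X is (m, n), y and X @ beta are (m, 1)
    m = X.shape[0]
    y_hat = sigmoid(X @ beta)             # predicted probabilities
    return (1 / m) * X.T @ (y_hat - y)    # same shape as beta

# one gradient descent step would then be, for an assumed learning rate alpha:
# beta = beta - alpha * logistic_gradient(X, y, beta)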
"heading_collapsed": true, 493 | "hidden": true 494 | }, 495 | "source": [ 496 | "### Example sigmoid" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": { 502 | "hidden": true 503 | }, 504 | "source": [ 505 | "The curve used in logistic regression is the sigmoid function" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": { 511 | "hidden": true 512 | }, 513 | "source": [ 514 | "$$\n", 515 | "\\sigma(x) = \\frac{1}{1+e^{-x}}\n", 516 | "$$" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 3, 522 | "metadata": { 523 | "hidden": true 524 | }, 525 | "outputs": [ 526 | { 527 | "data": { 528 | "application/vnd.jupyter.widget-view+json": { 529 | "model_id": "96e4341cd3ff4793acd7c0d968a8a355", 530 | "version_major": 2, 531 | "version_minor": 0 532 | }, 533 | "text/plain": [ 534 | "FigureWidget({\n", 535 | " 'data': [{'type': 'scatter',\n", 536 | " 'uid': 'fc81bfbe-f9f8-419c-9923-4d127962d5e2',\n", 537 | " …" 538 | ] 539 | }, 540 | "metadata": {}, 541 | "output_type": "display_data" 542 | } 543 | ], 544 | "source": [ 545 | "sigmoid_fig = go.FigureWidget()\n", 546 | "demo_x = np.arange(-10,10,0.1)\n", 547 | "demo_y = 1 / (1 + np.exp(-demo_x))\n", 548 | "sigmoid_fig.add_scatter(\n", 549 | " x=demo_x,\n", 550 | " y=demo_y)\n", 551 | "sigmoid_fig.layout.title = 'Sigmoid Function'\n", 552 | "sigmoid_fig" 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": { 558 | "heading_collapsed": true, 559 | "hidden": true 560 | }, 561 | "source": [ 562 | "### Make fake data" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 4, 568 | "metadata": { 569 | "hidden": true 570 | }, 571 | "outputs": [], 572 | "source": [ 573 | "m = 100\n", 574 | "x0 = np.ones(shape=(m, 1))\n", 575 | "x1 = np.linspace(0, 10, m).reshape(-1, 1)\n", 576 | "X = np.column_stack((x0, x1))\n", 577 | "\n", 578 | "# let y = 0.5 * x + 1 + epsilon\n", 579 | "epsilon = np.random.normal(scale=2, size=(m, 1))\n", 580 | "y = x1 + epsilon\n", 581 | "y = (y > 5).astype(int)" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 6, 587 | "metadata": { 588 | "hidden": true 589 | }, 590 | "outputs": [ 591 | { 592 | "data": { 593 | "application/vnd.jupyter.widget-view+json": { 594 | "model_id": "5e928c2fe5a1497889e2502722076d7a", 595 | "version_major": 2, 596 | "version_minor": 0 597 | }, 598 | "text/plain": [ 599 | "FigureWidget({\n", 600 | " 'data': [{'mode': 'markers',\n", 601 | " 'name': 'linear data + noise',\n", 602 | " 'ty…" 603 | ] 604 | }, 605 | "metadata": {}, 606 | "output_type": "display_data" 607 | } 608 | ], 609 | "source": [ 610 | "fig = go.FigureWidget()\n", 611 | "fig = fig.add_scatter(\n", 612 | " x=X[:,1],\n", 613 | " y=y[:,0],\n", 614 | " mode='markers',\n", 615 | " name='linear data + noise')\n", 616 | "fig" 617 | ] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": { 622 | "heading_collapsed": true, 623 | "hidden": true 624 | }, 625 | "source": [ 626 | "### Logistic regression class" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 7, 632 | "metadata": { 633 | "hidden": true 634 | }, 635 | "outputs": [], 636 | "source": [ 637 | "import json\n", 638 | "\n", 639 | "import numpy as np\n", 640 | "\n", 641 | "\n", 642 | "class LogisticRegression():\n", 643 | "\n", 644 | " def __init__(self, learning_rate=0.05):\n", 645 | " \"\"\" \n", 646 | " Logistic regression model\n", 647 | "\n", 648 | " Parameters:\n", 649 | " ----------\n", 650 | " learning_rate: 
float, optional, default 0.05\n", 651 | "            The learning rate parameter controlling the gradient descent\n", 652 | "            step size\n", 653 | "        \"\"\"\n", 654 | "        self.learning_rate = learning_rate\n", 655 | "        print('Creating logistic model instance')\n", 656 | "\n", 657 | "    def __repr__(self):\n", 658 | "        return (\n", 659 | "            f'<LogisticRegression(learning_rate={self.learning_rate})>')\n", 661 | "\n", 662 | "    def fit(self, X, y, n_iter=1000):\n", 663 | "        \"\"\" \n", 664 | "        Fit the logistic regression model\n", 665 | "\n", 666 | "        Updates the weights with n_iter iterations of batch gradient\n", 667 | "        descent updates\n", 668 | "\n", 669 | "        Parameters:\n", 670 | "        ----------\n", 671 | "        X: numpy.ndarray\n", 672 | "            Training data, shape (m samples, (n - 1) features + 1)\n", 673 | "            Note the first column of X is expected to be ones (to allow \n", 674 | "            for the bias to be included in beta)\n", 675 | "        y: numpy.ndarray\n", 676 | "            Target values - class label {0, 1}, shape (m samples, 1)\n", 677 | "        n_iter: int, optional, default 1000\n", 678 | "            Number of batch gradient descent steps\n", 679 | "        \"\"\"\n", 680 | "        m, n = X.shape\n", 681 | "        print(f'fitting with m={m} samples with n={n-1} features\\n')\n", 682 | "        self.beta = np.zeros(shape=(n, 1))\n", 683 | "        self.costs = []\n", 684 | "        self.betas = [self.beta]\n", 685 | "        for iteration in range(n_iter):\n", 686 | "            y_pred = self.predict_proba(X)\n", 687 | "            cost = (-1 / m) * (\n", 688 | "                (y.T @ np.log(y_pred)) +\n", 689 | "                ((np.ones(shape=y.shape) - y).T @ np.log(\n", 690 | "                    np.ones(shape=y_pred.shape) - y_pred))\n", 691 | "            )\n", 692 | "            self.costs.append(cost[0][0])\n", 693 | "            gradient = (1 / m) * X.T @ (y_pred - y)\n", 694 | "            self.beta = self.beta - (\n", 695 | "                self.learning_rate * gradient)\n", 696 | "            self.betas.append(self.beta)\n", 697 | "\n", 698 | "    def predict_proba(self, X):\n", 699 | "        \"\"\" \n", 700 | "        Predicted probability values for class 1\n", 701 | "\n", 702 | "        Note this is calculated as the sigmoid of the linear combination\n", 703 | "        of the feature values and the weights.\n", 704 | "\n", 705 | "        Parameters:\n", 706 | "        ----------\n", 707 | "        X: numpy.ndarray\n", 708 | "            Training data, shape (m samples, (n - 1) features + 1)\n", 709 | "            Note the first column of X is expected to be ones (to allow \n", 710 | "            for the bias to be included in beta)\n", 711 | "\n", 712 | "        Returns:\n", 713 | "        -------\n", 714 | "        numpy.ndarray:\n", 715 | "            Predicted probability of samples being in class 1\n", 716 | "        \"\"\" \n", 717 | "        y_pred = self.sigmoid(X @ self.beta)\n", 718 | "        return y_pred\n", 719 | "\n", 720 | "    def predict(self, X, decision_prob=0.5):\n", 721 | "        \"\"\" \n", 722 | "        Predict the class values from sample X feature values\n", 723 | "\n", 724 | "        Parameters:\n", 725 | "        ----------\n", 726 | "        X: numpy.ndarray\n", 727 | "            Training data, shape (m samples, (n - 1) features + 1)\n", 728 | "            Note the first column of X is expected to be ones (to allow \n", 729 | "            for the bias to be included in beta)\n", 730 | "\n", 731 | "        Returns:\n", 732 | "        -------\n", 733 | "        numpy.ndarray:\n", 734 | "            Predicted class values, shape (m samples, 1)\n", 735 | "        \"\"\"\n", 736 | "        y_pred = self.sigmoid(X @ self.beta)\n", 737 | "        return (y_pred > decision_prob) * 1\n", 738 | "\n", 739 | "    def sigmoid(self, x):\n", 740 | "        \"\"\" \n", 741 | "        Sigmoid function\n", 742 | "\n", 743 | "        f(x) = 1 / (1 + e^(-x))\n", 744 | "\n", 745 | "        Parameters:\n", 746 | "        ----------\n", 747 | "        x: numpy.ndarray\n", 748 | "\n", 749 | "        Returns:\n", 750 | "        -------\n", 751 | "        numpy.ndarray:\n", 752 | "            sigmoid 
of x, values in (0, 1)\n", 753 | " \"\"\" \n", 754 | " return 1 / (1 + np.exp(-x))\n" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": 8, 760 | "metadata": { 761 | "hidden": true 762 | }, 763 | "outputs": [ 764 | { 765 | "name": "stdout", 766 | "output_type": "stream", 767 | "text": [ 768 | "Creating logistic model instance\n", 769 | "fitting with m=100 samples with n=1 features\n", 770 | "\n" 771 | ] 772 | } 773 | ], 774 | "source": [ 775 | "logistic_regression = LogisticRegression()\n", 776 | "logistic_regression.fit(X, y)" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 9, 782 | "metadata": { 783 | "hidden": true 784 | }, 785 | "outputs": [ 786 | { 787 | "data": { 788 | "text/plain": [ 789 | "array([[0],\n", 790 | " [0],\n", 791 | " [1]])" 792 | ] 793 | }, 794 | "execution_count": 9, 795 | "metadata": {}, 796 | "output_type": "execute_result" 797 | } 798 | ], 799 | "source": [ 800 | "example_X = np.array([[1,1],[1,4],[1,7]])\n", 801 | "logistic_regression.predict(example_X)" 802 | ] 803 | }, 804 | { 805 | "cell_type": "markdown", 806 | "metadata": { 807 | "heading_collapsed": true, 808 | "hidden": true 809 | }, 810 | "source": [ 811 | "### Plot the best fit" 812 | ] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": 10, 817 | "metadata": { 818 | "hidden": true 819 | }, 820 | "outputs": [ 821 | { 822 | "data": { 823 | "application/vnd.jupyter.widget-view+json": { 824 | "model_id": "5e928c2fe5a1497889e2502722076d7a", 825 | "version_major": 2, 826 | "version_minor": 0 827 | }, 828 | "text/plain": [ 829 | "FigureWidget({\n", 830 | " 'data': [{'mode': 'markers',\n", 831 | " 'name': 'linear data + noise',\n", 832 | " 'ty…" 833 | ] 834 | }, 835 | "metadata": {}, 836 | "output_type": "display_data" 837 | } 838 | ], 839 | "source": [ 840 | "fig = fig.add_scatter(\n", 841 | " x=X[:,1], \n", 842 | " y=logistic_regression.predict_proba(X)[:,0],\n", 843 | " mode='markers',\n", 844 | " name='logistic best fit')\n", 845 | "fig" 846 | ] 847 | }, 848 | { 849 | "cell_type": "markdown", 850 | "metadata": { 851 | "heading_collapsed": true, 852 | "hidden": true 853 | }, 854 | "source": [ 855 | "### Plot the cost function" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 11, 861 | "metadata": { 862 | "hidden": true 863 | }, 864 | "outputs": [], 865 | "source": [ 866 | "# Haven't got round to this yet - see linear regression for an example error \n", 867 | "# surface decent." 
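The cell above is left as a TODO in the notebook. A minimal sketch of what it could look like, mirroring the cost-by-iteration plot from the linear regression notebook (it assumes the fitted logistic_regression instance and the plotly graph_objects import from earlier):

# Sketch only - not part of the original notebook.
cost_fig = go.FigureWidget()
cost_fig = cost_fig.add_scatter(
    x=list(range(len(logistic_regression.costs))),
    y=logistic_regression.costs,
    mode='markers+lines')
cost_fig.layout.title = 'Logistic regression cost by iteration'
cost_fig.layout.xaxis.title = 'Iteration'
cost_fig.layout.yaxis.title = 'Cost'
cost_fig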
868 | ] 869 | }, 870 | { 871 | "cell_type": "markdown", 872 | "metadata": { 873 | "heading_collapsed": true 874 | }, 875 | "source": [ 876 | "## Logisitc regression - Titanic example" 877 | ] 878 | }, 879 | { 880 | "cell_type": "markdown", 881 | "metadata": { 882 | "heading_collapsed": true, 883 | "hidden": true 884 | }, 885 | "source": [ 886 | "### Load data" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 12, 892 | "metadata": { 893 | "hidden": true 894 | }, 895 | "outputs": [], 896 | "source": [ 897 | "X_train = pd.read_feather('../data/titanic/processed/X_train.feather')\n", 898 | "X_test = pd.read_feather('../data/titanic/processed/X_test.feather')\n", 899 | "y_train = pd.read_feather('../data/titanic/processed/y_train.feather')\n", 900 | "y_test = pd.read_feather('../data/titanic/processed/y_test.feather')" 901 | ] 902 | }, 903 | { 904 | "cell_type": "markdown", 905 | "metadata": { 906 | "heading_collapsed": true, 907 | "hidden": true 908 | }, 909 | "source": [ 910 | "### Train model" 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": 13, 916 | "metadata": { 917 | "hidden": true 918 | }, 919 | "outputs": [ 920 | { 921 | "name": "stdout", 922 | "output_type": "stream", 923 | "text": [ 924 | "Creating logistic model instance\n" 925 | ] 926 | } 927 | ], 928 | "source": [ 929 | "titanic_logistic_model = LogisticRegression()" 930 | ] 931 | }, 932 | { 933 | "cell_type": "code", 934 | "execution_count": 14, 935 | "metadata": { 936 | "hidden": true 937 | }, 938 | "outputs": [ 939 | { 940 | "name": "stdout", 941 | "output_type": "stream", 942 | "text": [ 943 | "fitting with m=712 samples with n=29 features\n", 944 | "\n" 945 | ] 946 | } 947 | ], 948 | "source": [ 949 | "titanic_logistic_model.fit(X=X_train.values, y=y_train.values, n_iter=4000)" 950 | ] 951 | }, 952 | { 953 | "cell_type": "markdown", 954 | "metadata": { 955 | "heading_collapsed": true, 956 | "hidden": true 957 | }, 958 | "source": [ 959 | "### Plot the cost" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": 16, 965 | "metadata": { 966 | "hidden": true 967 | }, 968 | "outputs": [ 969 | { 970 | "data": { 971 | "application/vnd.jupyter.widget-view+json": { 972 | "model_id": "3c6486736c164288b3a79a20965d1168", 973 | "version_major": 2, 974 | "version_minor": 0 975 | }, 976 | "text/plain": [ 977 | "FigureWidget({\n", 978 | " 'data': [{'type': 'scatter',\n", 979 | " 'uid': 'a3a5dfba-fddd-428e-87a9-4a7f4461bffc',\n", 980 | " …" 981 | ] 982 | }, 983 | "metadata": {}, 984 | "output_type": "display_data" 985 | } 986 | ], 987 | "source": [ 988 | "titanic_cost_fig = go.FigureWidget()\n", 989 | "\n", 990 | "titanic_cost_fig.add_scatter(\n", 991 | " x=list(range(len(titanic_logistic_model.costs))),\n", 992 | " y=titanic_logistic_model.costs,\n", 993 | ")\n", 994 | "\n", 995 | "titanic_cost_fig.layout.title = 'Cost Vs gradient descent iterations'\n", 996 | "titanic_cost_fig.layout.xaxis.title = 'Iterations'\n", 997 | "titanic_cost_fig.layout.yaxis.title = 'Cost'\n", 998 | "titanic_cost_fig" 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "markdown", 1003 | "metadata": { 1004 | "heading_collapsed": true, 1005 | "hidden": true 1006 | }, 1007 | "source": [ 1008 | "### Error analysis" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": 17, 1014 | "metadata": { 1015 | "hidden": true 1016 | }, 1017 | "outputs": [ 1018 | { 1019 | "name": "stdout", 1020 | "output_type": "stream", 1021 | "text": [ 1022 | "Test accuracy is with my 
implementation 79.89%\n" 1023 | ] 1024 | } 1025 | ], 1026 | "source": [ 1027 | "y_pred = titanic_logistic_model.predict(X_test.values)\n", 1028 | "test_accuracy = (y_pred == y_test.values).sum() / len(y_pred)\n", 1029 | "\n", 1030 | "print(f'Test accuracy is with my implementation {test_accuracy:.2%}')" 1031 | ] 1032 | }, 1033 | { 1034 | "cell_type": "markdown", 1035 | "metadata": { 1036 | "heading_collapsed": true 1037 | }, 1038 | "source": [ 1039 | "## End" 1040 | ] 1041 | } 1042 | ], 1043 | "metadata": { 1044 | "kernelspec": { 1045 | "display_name": "Python 3", 1046 | "language": "python", 1047 | "name": "python3" 1048 | }, 1049 | "language_info": { 1050 | "codemirror_mode": { 1051 | "name": "ipython", 1052 | "version": 3 1053 | }, 1054 | "file_extension": ".py", 1055 | "mimetype": "text/x-python", 1056 | "name": "python", 1057 | "nbconvert_exporter": "python", 1058 | "pygments_lexer": "ipython3", 1059 | "version": "3.7.6" 1060 | }, 1061 | "toc": { 1062 | "base_numbering": 1, 1063 | "nav_menu": {}, 1064 | "number_sections": true, 1065 | "sideBar": true, 1066 | "skip_h1_title": false, 1067 | "title_cell": "Table of Contents", 1068 | "title_sidebar": "Contents", 1069 | "toc_cell": false, 1070 | "toc_position": {}, 1071 | "toc_section_display": true, 1072 | "toc_window_display": false 1073 | } 1074 | }, 1075 | "nbformat": 4, 1076 | "nbformat_minor": 2 1077 | } 1078 | --------------------------------------------------------------------------------