├── README.md ├── Neural_Network.py ├── Linear Regression.ipynb └── K-Nearest Neighboor.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # ML Algorithms From Scratch 2 | 3 | In this folder I implement machine learning algorithms from first principles to improve my understanding. 4 | 5 | This will be an ongoing project and I am hoping to cover the following: 6 | 7 | - Linear Regression :heavy_check_mark: 8 | - Polynomial Regression 9 | - Logistic Regression :heavy_check_mark: 10 | - Support Vector Machines :heavy_check_mark: 11 | - K-Nearest Neighbour :heavy_check_mark: 12 | - Naive Bayes 13 | - K-Means 14 | - Neural Network :heavy_check_mark: 15 | -------------------------------------------------------------------------------- /Neural_Network.py: -------------------------------------------------------------------------------- 1 | 2 | # import packages 3 | import numpy as np 4 | 5 | print('This is a neural network from scratch using one hidden layer') 6 | 7 | 8 | # define the layer size for the network 9 | def layer_size(X, Y): 10 | 11 | input_size = X.shape[0] 12 | hidden_size = 3 13 | output_size = Y.shape[0] 14 | 15 | return input_size, hidden_size, output_size 16 | 17 | # initialise the weights and biases randomnly to enable symmetry breaking 18 | def initialise_parameters(input_size, hidden_size, output_size): 19 | 20 | W1 = np.random.randn(hidden_size, input_size) * 0.01 21 | W2 = np.random.randn(output_size, hidden_size) * 0.01 22 | b1 = np.zeros((hidden_size, 1)) 23 | b2 = np.zeros((output_size, 1)) 24 | 25 | # store in dict for access 26 | parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2} 27 | 28 | return parameters 29 | 30 | # sigmoid 31 | def sigmoid(z): 32 | return 1/(1+np.exp(-z)) 33 | 34 | # derivative 35 | def d_sigmoid(z): 36 | return z*(1-z) 37 | 38 | # forward prop 39 | def forward(X, parameters): 40 | 41 | # get parameters from dict 42 | W1 = parameters['W1'] 43 | W2 = parameters['W2'] 44 | 
b1 = parameters['b1'] 45 | b2 = parameters['b2'] 46 | 47 | # calculate the forward using the activation function 48 | Z1 = np.dot(W1,X) + b1 49 | A1 = np.tanh(Z1) 50 | Z2 = np.dot(W2,A1) + b2 51 | A2 = sigmoid(Z2) 52 | 53 | # store the values in a dict 54 | cache = {"Z1": Z1, "A1": A1, "Z2": Z2, "A2": A2} 55 | 56 | return A2, cache 57 | 58 | # cross entropy loss function 59 | def compute_cost(A2, Y): 60 | 61 | cost = (-1/Y.shape[1]) * np.sum(Y*np.log(A2) + (1-Y) * np.log(1-A2)) 62 | cost = float(np.squeeze(cost)) 63 | 64 | return cost 65 | 66 | # backprop 67 | def backprop(parameters, cache, X, Y): 68 | 69 | m = X.shape[1] 70 | 71 | # get parameters from dict in cache 72 | W1 = parameters['W1'] 73 | W2 = parameters['W2'] 74 | A1 = cache['A1'] 75 | A2 = cache['A2'] 76 | 77 | # compute gradients 78 | dZ2 = A2 - Y 79 | dW2 = (1/m) * np.dot(dZ2,A1.T) 80 | db2 = (1/m) * np.sum(dZ2, axis=1,keepdims=True) 81 | dZ1 = np.multiply(np.dot(W2.T, dZ2), 1 - np.power(A1, 2)) 82 | dW1 = (1/m) * np.dot(dZ1,X.T) 83 | db1 = (1/m) * np.sum(dZ1, axis=1,keepdims=True) 84 | 85 | # store grads in dict 86 | grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2} 87 | 88 | return grads 89 | 90 | # update the parameters 91 | def update_params(parameters, grads, alpha = 1): 92 | 93 | # fetch grads, weights and biases from dicts 94 | W1 = parameters['W1'] 95 | b1 = parameters['b1'] 96 | W2 = parameters['W2'] 97 | b2 = parameters['b2'] 98 | dW1 = grads['dW1'] 99 | db1 = grads['db1'] 100 | dW2 = grads['dW2'] 101 | db2 = grads['db2'] 102 | 103 | # update them 104 | W1 -= alpha * dW1 105 | b1 -= alpha * db1 106 | W2 -= alpha * dW2 107 | b2 -= alpha * db2 108 | 109 | # store in dict 110 | parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2} 111 | 112 | return parameters 113 | 114 | # define the model 115 | def model(X, Y, hidden_size, epochs = 1000): 116 | 117 | # initiate layer size 118 | np.random.seed(3) 119 | input_size = layer_size(X, Y)[0] 120 | output_size = layer_size(X, Y)[2] 121 | 122 | 
# intialse the weights 123 | parameters = initialise_parameters(input_size, hidden_size, output_size) 124 | 125 | # train the model 126 | for i in range(0, epochs): 127 | 128 | A2, cache = forward(X, parameters) 129 | cost = compute_cost(A2, Y) 130 | grads = backprop(parameters, cache, X, Y) 131 | parameters = update_params(parameters, grads) 132 | 133 | # Print the cost every 1000 iterations 134 | print ("Cost after iteration %i: %f" %(i, cost)) 135 | 136 | return parameters 137 | 138 | 139 | # training data 140 | X = np.array([[0,0,1,1], 141 | [0,1,1,1], 142 | [1,0,1,1], 143 | [1,1,1,1]]) 144 | 145 | # expected output 146 | Y = np.array([[0],[1],[1],[1]]) 147 | 148 | 149 | # apply the model 150 | parameters = model(X, Y, hidden_size = 4, epochs = 1000) 151 | 152 | 153 | # predict the output 154 | def predict(parameters, X): 155 | 156 | A2, cache = forward(X, parameters) 157 | 158 | return np.round(A2,3) 159 | 160 | 161 | 162 | print('The output is:') 163 | print(predict(parameters,X)) 164 | print('The expected output:') 165 | print(Y) 166 | 167 | 168 | -------------------------------------------------------------------------------- /Linear Regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Linear Regression From First Principles" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## In this notebook we will derive the supervised learning algorithm of linear regression from scratch using its mathematical principles" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 28, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# import packages\n", 24 | "import numpy as np\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "import pandas as pd" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "### Import 
and read the data" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 29, 39 | "metadata": { 40 | "scrolled": true 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "\n", 48 | "RangeIndex: 30 entries, 0 to 29\n", 49 | "Data columns (total 2 columns):\n", 50 | " # Column Non-Null Count Dtype \n", 51 | "--- ------ -------------- ----- \n", 52 | " 0 YearsExperience 30 non-null float64\n", 53 | " 1 Salary 30 non-null float64\n", 54 | "dtypes: float64(2)\n", 55 | "memory usage: 608.0 bytes\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "# import the data\n", 61 | "data = pd.read_csv('Salary_Data.csv')\n", 62 | "data.info()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "### Feature Engineering" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 30, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# set up the data\n", 79 | "X = data['YearsExperience'].values\n", 80 | "y = data['Salary'].values" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "Normalise the data using Z = (x - mu)/sigma to improve performance" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 32, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# normalise the data\n", 97 | "for i in range(len(X)):\n", 98 | " X[i] = (X[i] - np.mean(X)) / np.std(X)\n", 99 | " y[i] = (y[i] - np.mean(y)) / np.std(y)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | " Prep the data for matrix multiplications " 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 34, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "# assign dummy features\n", 116 | "y = np.reshape(y, (30,1))\n", 117 | "X = np.reshape(X,newshape=(-1,1))\n", 118 | "X = np.hstack((np.ones((X.shape[0],1)), X))" 119 | ] 120 | }, 
121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "### Modelling" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 35, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "def linreg(x, y, alpha, epochs):\n", 135 | " \n", 136 | " # intialise the theta values\n", 137 | " m = len(y)\n", 138 | " theta = np.zeros((X.shape[1],1))\n", 139 | " cost = []\n", 140 | " \n", 141 | " for i in range(epochs):\n", 142 | " \n", 143 | " # predicted output\n", 144 | " h = np.dot(x,theta)\n", 145 | " \n", 146 | " # cost function\n", 147 | " J = (1/(2*m)) * np.sum(np.square(h-y))\n", 148 | " \n", 149 | " # update the parameters\n", 150 | " d_theta = (1/m) * np.dot(x.T, h-y)\n", 151 | " theta = theta - alpha*d_theta\n", 152 | " \n", 153 | " cost.append(J)\n", 154 | " \n", 155 | " return theta, cost" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Apply the model and plot the cost function " 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 36, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "# set parameters for the iteration\n", 172 | "alpha = 0.1\n", 173 | "epochs = 21" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 37, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "# apply the model\n", 183 | "theta, cost = linreg(X, y, alpha, epochs)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "Plot the linear regression line " 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 39, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# unpack the X values\n", 200 | "one, a = zip(*X)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 40, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "# create the line, y = mx + c\n", 210 | 
"y_pred = a*theta[1] + theta[0]" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 43, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "image/png": "\n", 221 | "text/plain": [ 222 | "
" 223 | ] 224 | }, 225 | "metadata": { 226 | "needs_background": "light" 227 | }, 228 | "output_type": "display_data" 229 | } 230 | ], 231 | "source": [ 232 | "# plotting the regression line\n", 233 | "plt.scatter(a,y, label='Data Points')\n", 234 | "plt.plot(a,y_pred, color = 'r', label = 'Regression line')\n", 235 | "plt.xlabel('Years Experience (rescaled)', fontsize = 14)\n", 236 | "plt.ylabel('Salary (rescaled)', fontsize = 14)\n", 237 | "plt.xticks(size=14)\n", 238 | "plt.yticks(size=14)\n", 239 | "plt.legend()\n", 240 | "plt.show()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | " We can see that our line fits the data very well " 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "## Model from sklearn" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 74, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# import packages\n", 264 | "from sklearn.model_selection import train_test_split\n", 265 | "from sklearn.linear_model import LinearRegression\n", 266 | "from sklearn import metrics" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 70, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" 278 | ] 279 | }, 280 | "execution_count": 70, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "# set up the data again\n", 287 | "X = data[['YearsExperience']].values\n", 288 | "y = data['Salary'].values\n", 289 | "\n", 290 | "# split into test train \n", 291 | "x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 7)\n", 292 | "\n", 293 | "# train the model\n", 294 | "model = LinearRegression() \n", 295 | "model.fit(x_train, y_train)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 
| "execution_count": 71, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "Intercept: 0.155417965345038\n", 308 | "Coefficient: [1.0057085]\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "# Print out the line values y = mx + c\n", 314 | "print(\"Intercept: \", model.intercept_)\n", 315 | "print(\"Coefficient: \", model.coef_)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 72, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "# use the test data to predict values to test the model\n", 325 | "pred = model.predict(x_test)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 73, 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "data": { 335 | "image/png": "\n", 336 | "text/plain": [ 337 | "
" 338 | ] 339 | }, 340 | "metadata": { 341 | "needs_background": "light" 342 | }, 343 | "output_type": "display_data" 344 | } 345 | ], 346 | "source": [ 347 | "# plotting the regression line\n", 348 | "plt.scatter(x_test,y_test, label='Data Points')\n", 349 | "plt.plot(x_test, pred, 'Red', label = 'Linear Regression Line')\n", 350 | "plt.xlabel('Years Experience (rescaled)', fontsize = 14)\n", 351 | "plt.ylabel('Salary (rescaled)', fontsize = 14)\n", 352 | "plt.xticks(size=14)\n", 353 | "plt.yticks(size=14)\n", 354 | "plt.legend()\n", 355 | "plt.show()" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "kernelspec": { 361 | "display_name": "Python 3 (ipykernel)", 362 | "language": "python", 363 | "name": "python3" 364 | }, 365 | "language_info": { 366 | "codemirror_mode": { 367 | "name": "ipython", 368 | "version": 3 369 | }, 370 | "file_extension": ".py", 371 | "mimetype": "text/x-python", 372 | "name": "python", 373 | "nbconvert_exporter": "python", 374 | "pygments_lexer": "ipython3", 375 | "version": "3.9.7" 376 | } 377 | }, 378 | "nbformat": 4, 379 | "nbformat_minor": 4 380 | } 381 | -------------------------------------------------------------------------------- /K-Nearest Neighboor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# KNN from scratch" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## In this code I implement the KNN algorithm from scratch using the OOP paradigm " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 40, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# import packages\n", 24 | "import numpy as np\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "from collections import Counter\n", 27 | "from sklearn.model_selection import train_test_split\n", 28 | "from sklearn import datasets" 29 | ] 30 | }, 31 | { 32 | 
"cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Model Class" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 90, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# distance between two-dimensional vectors\n", 45 | "def distance(x1, x2):\n", 46 | " return np.sqrt(np.sum((x1 - x2) ** 2))\n", 47 | "\n", 48 | "# build the KNN class\n", 49 | "class model:\n", 50 | " \n", 51 | " # compare to k neareast points\n", 52 | " def __init__(self, k):\n", 53 | " self.k = k\n", 54 | "\n", 55 | " # assign training data \n", 56 | " def fit(self, X, y):\n", 57 | " self.X_train = X\n", 58 | " self.y_train = y\n", 59 | "\n", 60 | " # array of predicted value for each data points \n", 61 | " def predict(self, X):\n", 62 | " preds = [self.predict_class(x) for x in X]\n", 63 | " return np.array(preds)\n", 64 | "\n", 65 | " def predict_class(self, x):\n", 66 | " \n", 67 | " # distances between x and all data points in the training set\n", 68 | " distances = [distance(x, x_train) for x_train in self.X_train]\n", 69 | " \n", 70 | " # Sort by distance and return indices of the first k neighbors with shortest distance \n", 71 | " k_idx_val = np.argsort(distances)[:self.k]\n", 72 | " \n", 73 | " # get the labels of the k nearest neighbor training samples using the index values\n", 74 | " labels = [self.y_train[i] for i in k_idx_val]\n", 75 | " \n", 76 | " # get the mode using the counter function\n", 77 | " mode = Counter(labels).most_common(1)\n", 78 | " \n", 79 | " # return the first mode if the are more than one\n", 80 | " return mode[0][0]\n", 81 | " " 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Testing the model on a dataset " 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 91, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "image/png": "\n", 99 | "text/plain": [ 100 | "
" 101 | ] 102 | }, 103 | "metadata": { 104 | "needs_background": "light" 105 | }, 106 | "output_type": "display_data" 107 | } 108 | ], 109 | "source": [ 110 | "# import the data\n", 111 | "data = datasets.load_breast_cancer()\n", 112 | "\n", 113 | "# assign and separate to train and test\n", 114 | "X, y = data.data, data.target\n", 115 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)\n", 116 | "\n", 117 | "# plot the first two features for visualisation\n", 118 | "plt.figure()\n", 119 | "plt.scatter(X_train[:,0], X_train[:,1], c=y_train)\n", 120 | "plt.xlabel('radius')\n", 121 | "plt.ylabel('texture')\n", 122 | "plt.show()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 92, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "Accuracy of model: 91.22807017543859 %\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "# function for the accuracy of the model\n", 140 | "def accuracy(y_true, preds):\n", 141 | " accuracy = (y_true == preds).mean()\n", 142 | " return accuracy\n", 143 | "\n", 144 | "# apply the model to the data and evaluate the accuracy\n", 145 | "knn = model(k=5)\n", 146 | "knn.fit(X_train, y_train)\n", 147 | "predictions = knn.predict(X_test)\n", 148 | "print(\"Accuracy of model:\", accuracy(y_test, predictions)*100,'%')" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "## Very good accuracy! " 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 93, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "image/png": "\n", 166 | "text/plain": [ 167 | "
" 168 | ] 169 | }, 170 | "metadata": { 171 | "needs_background": "light" 172 | }, 173 | "output_type": "display_data" 174 | } 175 | ], 176 | "source": [ 177 | "# plot the predictions\n", 178 | "plt.figure()\n", 179 | "plt.scatter(X_test[:,0],X_test[:,1],c=predictions)\n", 180 | "plt.xlabel('radius')\n", 181 | "plt.ylabel('texture')\n", 182 | "plt.show()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## The data points look to be in a reasonable location compared to the training examples above." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.7.6" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 4 221 | } 222 | --------------------------------------------------------------------------------