├── LICENSE ├── README.md ├── nnfs-visuals-START.ipynb └── nnfs-visuals.ipynb /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Harrison 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # neural-net-internals-visualized 2 | Visualizing some of the internals of a neural network during training and inference. 3 | 4 | Video tutorial for the code: https://www.youtube.com/watch?v=ChfEO8l-fas 5 | -------------------------------------------------------------------------------- /nnfs-visuals-START.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from zipfile import ZipFile\n", 10 | "import os\n", 11 | "import urllib\n", 12 | "import urllib.request\n", 13 | "\n", 14 | "FILE = 'fashion_mnist_images.zip'\n", 15 | "FOLDER = 'fashion_mnist_images'\n", 16 | "URL = 'https://nnfs.io/datasets/fashion_mnist_images.zip'\n", 17 | "\n", 18 | "\n", 19 | "if not os.path.isfile(FILE):\n", 20 | " print(f'Downloading {URL} and saving as {FILE}...')\n", 21 | " urllib.request.urlretrieve(URL, FILE)\n", 22 | "print('Unzipping images...')\n", 23 | "\n", 24 | "with ZipFile(FILE) as zip_images:\n", 25 | " zip_images.extractall(FOLDER)\n", 26 | "print('Done!')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "'''\n", 36 | "The following code is from the Neural Networks from Scratch book by Harrison Kinsley and Daniel Kukiela.\n", 37 | "\n", 38 | "https://nnfs.io\n", 39 | "'''\n", 40 | "\n", 41 | "import numpy as np\n", 42 | "import os\n", 43 | "import cv2\n", 44 | "from tqdm import tqdm\n", 45 | "import pickle\n", 46 | "import copy\n", 47 | "import pickle\n", 48 | "\n", 49 | "# We'll be saving all the data from this model's training. 
\n", 50 | "train_dict = {}\n", 51 | "\n", 52 | "# Dense layer\n", 53 | "class Layer_Dense:\n", 54 | "\n", 55 | " # Layer initialization\n", 56 | " def __init__(self, n_inputs, n_neurons,\n", 57 | " weight_regularizer_l1=0, weight_regularizer_l2=0,\n", 58 | " bias_regularizer_l1=0, bias_regularizer_l2=0):\n", 59 | " # Initialize weights and biases\n", 60 | " self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)\n", 61 | " self.biases = np.zeros((1, n_neurons))\n", 62 | " # Set regularization strength\n", 63 | " self.weight_regularizer_l1 = weight_regularizer_l1\n", 64 | " self.weight_regularizer_l2 = weight_regularizer_l2\n", 65 | " self.bias_regularizer_l1 = bias_regularizer_l1\n", 66 | " self.bias_regularizer_l2 = bias_regularizer_l2\n", 67 | "\n", 68 | " # Forward pass\n", 69 | " def forward(self, inputs, training):\n", 70 | " # Remember input values\n", 71 | " self.inputs = inputs\n", 72 | " # Calculate output values from inputs, weights and biases\n", 73 | " self.output = np.dot(inputs, self.weights) + self.biases\n", 74 | "\n", 75 | " # Backward pass\n", 76 | " def backward(self, dvalues):\n", 77 | " # Gradients on parameters\n", 78 | " self.dweights = np.dot(self.inputs.T, dvalues)\n", 79 | " self.dbiases = np.sum(dvalues, axis=0, keepdims=True)\n", 80 | "\n", 81 | "\n", 82 | " # Gradients on regularization\n", 83 | " # L1 on weights\n", 84 | " if self.weight_regularizer_l1 > 0:\n", 85 | " dL1 = np.ones_like(self.weights)\n", 86 | " dL1[self.weights < 0] = -1\n", 87 | " self.dweights += self.weight_regularizer_l1 * dL1\n", 88 | " # L2 on weights\n", 89 | " if self.weight_regularizer_l2 > 0:\n", 90 | " self.dweights += 2 * self.weight_regularizer_l2 * \\\n", 91 | " self.weights\n", 92 | " # L1 on biases\n", 93 | " if self.bias_regularizer_l1 > 0:\n", 94 | " dL1 = np.ones_like(self.biases)\n", 95 | " dL1[self.biases < 0] = -1\n", 96 | " self.dbiases += self.bias_regularizer_l1 * dL1\n", 97 | " # L2 on biases\n", 98 | " if self.bias_regularizer_l2 > 0:\n", 99 | " self.dbiases += 2 * self.bias_regularizer_l2 * \\\n", 100 | " self.biases\n", 101 | "\n", 102 | " # Gradient on values\n", 103 | " self.dinputs = np.dot(dvalues, self.weights.T)\n", 104 | "\n", 105 | "\n", 106 | "# Dropout\n", 107 | "class Layer_Dropout:\n", 108 | "\n", 109 | " # Init\n", 110 | " def __init__(self, rate):\n", 111 | " # Store rate, we invert it as for example for dropout\n", 112 | " # of 0.1 we need success rate of 0.9\n", 113 | " self.rate = 1 - rate\n", 114 | "\n", 115 | " # Forward pass\n", 116 | " def forward(self, inputs, training):\n", 117 | " # Save input values\n", 118 | " self.inputs = inputs\n", 119 | "\n", 120 | " # If not in the training mode - return values\n", 121 | " if not training:\n", 122 | " self.output = inputs.copy()\n", 123 | " return\n", 124 | "\n", 125 | " # Generate and save scaled mask\n", 126 | " self.binary_mask = np.random.binomial(1, self.rate,\n", 127 | " size=inputs.shape) / self.rate\n", 128 | " # Apply mask to output values\n", 129 | " self.output = inputs * self.binary_mask\n", 130 | "\n", 131 | "\n", 132 | " # Backward pass\n", 133 | " def backward(self, dvalues):\n", 134 | " # Gradient on values\n", 135 | " self.dinputs = dvalues * self.binary_mask\n", 136 | "\n", 137 | "\n", 138 | "# Input \"layer\"\n", 139 | "class Layer_Input:\n", 140 | "\n", 141 | " # Forward pass\n", 142 | " def forward(self, inputs, training):\n", 143 | " self.output = inputs\n", 144 | "\n", 145 | "\n", 146 | "# ReLU activation\n", 147 | "class Activation_ReLU:\n", 148 | "\n", 149 | " # Forward 
pass\n", 150 | " def forward(self, inputs, training):\n", 151 | " # Remember input values\n", 152 | " self.inputs = inputs\n", 153 | " # Calculate output values from inputs\n", 154 | " self.output = np.maximum(0, inputs)\n", 155 | "\n", 156 | " # Backward pass\n", 157 | " def backward(self, dvalues):\n", 158 | " # Since we need to modify original variable,\n", 159 | " # let's make a copy of values first\n", 160 | " self.dinputs = dvalues.copy()\n", 161 | "\n", 162 | " # Zero gradient where input values were negative\n", 163 | " self.dinputs[self.inputs <= 0] = 0\n", 164 | "\n", 165 | " # Calculate predictions for outputs\n", 166 | " def predictions(self, outputs):\n", 167 | " return outputs\n", 168 | "\n", 169 | "\n", 170 | "# Softmax activation\n", 171 | "class Activation_Softmax:\n", 172 | "\n", 173 | " # Forward pass\n", 174 | " def forward(self, inputs, training):\n", 175 | " # Remember input values\n", 176 | " self.inputs = inputs\n", 177 | "\n", 178 | " # Get unnormalized probabilities\n", 179 | " exp_values = np.exp(inputs - np.max(inputs, axis=1,\n", 180 | " keepdims=True))\n", 181 | "\n", 182 | " # Normalize them for each sample\n", 183 | " probabilities = exp_values / np.sum(exp_values, axis=1,\n", 184 | " keepdims=True)\n", 185 | "\n", 186 | " self.output = probabilities\n", 187 | "\n", 188 | " # Backward pass\n", 189 | " def backward(self, dvalues):\n", 190 | "\n", 191 | " # Create uninitialized array\n", 192 | " self.dinputs = np.empty_like(dvalues)\n", 193 | "\n", 194 | " # Enumerate outputs and gradients\n", 195 | " for index, (single_output, single_dvalues) in \\\n", 196 | " enumerate(zip(self.output, dvalues)):\n", 197 | " # Flatten output array\n", 198 | " single_output = single_output.reshape(-1, 1)\n", 199 | " # Calculate Jacobian matrix of the output\n", 200 | " jacobian_matrix = np.diagflat(single_output) - \\\n", 201 | " np.dot(single_output, single_output.T)\n", 202 | " # Calculate sample-wise gradient\n", 203 | " # and add it to the array of sample gradients\n", 204 | " self.dinputs[index] = np.dot(jacobian_matrix,\n", 205 | " single_dvalues)\n", 206 | "\n", 207 | " # Calculate predictions for outputs\n", 208 | " def predictions(self, outputs):\n", 209 | " return np.argmax(outputs, axis=1)\n", 210 | "\n", 211 | "\n", 212 | "\n", 213 | "# Adam optimizer\n", 214 | "class Optimizer_Adam:\n", 215 | "\n", 216 | " # Initialize optimizer - set settings\n", 217 | " def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7,\n", 218 | " beta_1=0.9, beta_2=0.999):\n", 219 | " self.learning_rate = learning_rate\n", 220 | " self.current_learning_rate = learning_rate\n", 221 | " self.decay = decay\n", 222 | " self.iterations = 0\n", 223 | " self.epsilon = epsilon\n", 224 | " self.beta_1 = beta_1\n", 225 | " self.beta_2 = beta_2\n", 226 | "\n", 227 | "\n", 228 | " # Call once before any parameter updates\n", 229 | " def pre_update_params(self):\n", 230 | " if self.decay:\n", 231 | " self.current_learning_rate = self.learning_rate * \\\n", 232 | " (1. / (1. 
+ self.decay * self.iterations))\n", 233 | "\n", 234 | " # Update parameters\n", 235 | " def update_params(self, layer):\n", 236 | "\n", 237 | " # If layer does not contain cache arrays,\n", 238 | " # create them filled with zeros\n", 239 | " if not hasattr(layer, 'weight_cache'):\n", 240 | " layer.weight_momentums = np.zeros_like(layer.weights)\n", 241 | " layer.weight_cache = np.zeros_like(layer.weights)\n", 242 | " layer.bias_momentums = np.zeros_like(layer.biases)\n", 243 | " layer.bias_cache = np.zeros_like(layer.biases)\n", 244 | "\n", 245 | " # Update momentum with current gradients\n", 246 | " layer.weight_momentums = self.beta_1 * \\\n", 247 | " layer.weight_momentums + \\\n", 248 | " (1 - self.beta_1) * layer.dweights\n", 249 | " layer.bias_momentums = self.beta_1 * \\\n", 250 | " layer.bias_momentums + \\\n", 251 | " (1 - self.beta_1) * layer.dbiases\n", 252 | " # Get corrected momentum\n", 253 | " # self.iteration is 0 at first pass\n", 254 | " # and we need to start with 1 here\n", 255 | " weight_momentums_corrected = layer.weight_momentums / \\\n", 256 | " (1 - self.beta_1 ** (self.iterations + 1))\n", 257 | " bias_momentums_corrected = layer.bias_momentums / \\\n", 258 | " (1 - self.beta_1 ** (self.iterations + 1))\n", 259 | " # Update cache with squared current gradients\n", 260 | " layer.weight_cache = self.beta_2 * layer.weight_cache + \\\n", 261 | " (1 - self.beta_2) * layer.dweights**2\n", 262 | " layer.bias_cache = self.beta_2 * layer.bias_cache + \\\n", 263 | " (1 - self.beta_2) * layer.dbiases**2\n", 264 | " # Get corrected cache\n", 265 | " weight_cache_corrected = layer.weight_cache / \\\n", 266 | " (1 - self.beta_2 ** (self.iterations + 1))\n", 267 | " bias_cache_corrected = layer.bias_cache / \\\n", 268 | " (1 - self.beta_2 ** (self.iterations + 1))\n", 269 | "\n", 270 | " # Vanilla SGD parameter update + normalization\n", 271 | " # with square rooted cache\n", 272 | " layer.weights += -self.current_learning_rate * \\\n", 273 | " weight_momentums_corrected / \\\n", 274 | " (np.sqrt(weight_cache_corrected) +\n", 275 | " self.epsilon)\n", 276 | "\n", 277 | " layer.biases += -self.current_learning_rate * \\\n", 278 | " bias_momentums_corrected / \\\n", 279 | " (np.sqrt(bias_cache_corrected) +\n", 280 | " self.epsilon)\n", 281 | "\n", 282 | " # Call once after any parameter updates\n", 283 | " def post_update_params(self):\n", 284 | " self.iterations += 1\n", 285 | "\n", 286 | "\n", 287 | "# Common loss class\n", 288 | "class Loss:\n", 289 | "\n", 290 | " # Regularization loss calculation\n", 291 | " def regularization_loss(self):\n", 292 | "\n", 293 | " # 0 by default\n", 294 | " regularization_loss = 0\n", 295 | "\n", 296 | " # Calculate regularization loss\n", 297 | " # iterate all trainable layers\n", 298 | " for layer in self.trainable_layers:\n", 299 | "\n", 300 | " # L1 regularization - weights\n", 301 | " # calculate only when factor greater than 0\n", 302 | " if layer.weight_regularizer_l1 > 0:\n", 303 | " regularization_loss += layer.weight_regularizer_l1 * \\\n", 304 | " np.sum(np.abs(layer.weights))\n", 305 | "\n", 306 | " # L2 regularization - weights\n", 307 | " if layer.weight_regularizer_l2 > 0:\n", 308 | " regularization_loss += layer.weight_regularizer_l2 * \\\n", 309 | " np.sum(layer.weights * \\\n", 310 | " layer.weights)\n", 311 | "\n", 312 | " # L1 regularization - biases\n", 313 | " # calculate only when factor greater than 0\n", 314 | " if layer.bias_regularizer_l1 > 0:\n", 315 | " regularization_loss += layer.bias_regularizer_l1 * \\\n", 316 
| " np.sum(np.abs(layer.biases))\n", 317 | "\n", 318 | " # L2 regularization - biases\n", 319 | " if layer.bias_regularizer_l2 > 0:\n", 320 | " regularization_loss += layer.bias_regularizer_l2 * \\\n", 321 | " np.sum(layer.biases * \\\n", 322 | " layer.biases)\n", 323 | "\n", 324 | " return regularization_loss\n", 325 | "\n", 326 | "\n", 327 | " # Set/remember trainable layers\n", 328 | " def remember_trainable_layers(self, trainable_layers):\n", 329 | " self.trainable_layers = trainable_layers\n", 330 | "\n", 331 | " # Calculates the data and regularization losses\n", 332 | " # given model output and ground truth values\n", 333 | " def calculate(self, output, y, *, include_regularization=False):\n", 334 | "\n", 335 | " # Calculate sample losses\n", 336 | " sample_losses = self.forward(output, y)\n", 337 | "\n", 338 | " # Calculate mean loss\n", 339 | " data_loss = np.mean(sample_losses)\n", 340 | "\n", 341 | " # Add accumulated sum of losses and sample count\n", 342 | " self.accumulated_sum += np.sum(sample_losses)\n", 343 | " self.accumulated_count += len(sample_losses)\n", 344 | "\n", 345 | " # If just data loss - return it\n", 346 | " if not include_regularization:\n", 347 | " return data_loss\n", 348 | "\n", 349 | " # Return the data and regularization losses\n", 350 | " return data_loss, self.regularization_loss()\n", 351 | "\n", 352 | " # Calculates accumulated loss\n", 353 | " def calculate_accumulated(self, *, include_regularization=False):\n", 354 | "\n", 355 | " # Calculate mean loss\n", 356 | " data_loss = self.accumulated_sum / self.accumulated_count\n", 357 | "\n", 358 | " # If just data loss - return it\n", 359 | " if not include_regularization:\n", 360 | " return data_loss\n", 361 | "\n", 362 | " # Return the data and regularization losses\n", 363 | " return data_loss, self.regularization_loss()\n", 364 | "\n", 365 | " # Reset variables for accumulated loss\n", 366 | " def new_pass(self):\n", 367 | " self.accumulated_sum = 0\n", 368 | " self.accumulated_count = 0\n", 369 | "\n", 370 | "\n", 371 | "\n", 372 | "# Cross-entropy loss\n", 373 | "class Loss_CategoricalCrossentropy(Loss):\n", 374 | "\n", 375 | " # Forward pass\n", 376 | " def forward(self, y_pred, y_true):\n", 377 | "\n", 378 | " # Number of samples in a batch\n", 379 | " samples = len(y_pred)\n", 380 | "\n", 381 | " # Clip data to prevent division by 0\n", 382 | " # Clip both sides to not drag mean towards any value\n", 383 | " y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)\n", 384 | "\n", 385 | " # Probabilities for target values -\n", 386 | " # only if categorical labels\n", 387 | " if len(y_true.shape) == 1:\n", 388 | " correct_confidences = y_pred_clipped[\n", 389 | " range(samples),\n", 390 | " y_true\n", 391 | " ]\n", 392 | "\n", 393 | " # Mask values - only for one-hot encoded labels\n", 394 | " elif len(y_true.shape) == 2:\n", 395 | " correct_confidences = np.sum(\n", 396 | " y_pred_clipped * y_true,\n", 397 | " axis=1\n", 398 | " )\n", 399 | "\n", 400 | " # Losses\n", 401 | " negative_log_likelihoods = -np.log(correct_confidences)\n", 402 | " return negative_log_likelihoods\n", 403 | "\n", 404 | " # Backward pass\n", 405 | " def backward(self, dvalues, y_true):\n", 406 | "\n", 407 | " # Number of samples\n", 408 | " samples = len(dvalues)\n", 409 | " # Number of labels in every sample\n", 410 | " # We'll use the first sample to count them\n", 411 | " labels = len(dvalues[0])\n", 412 | "\n", 413 | " # If labels are sparse, turn them into one-hot vector\n", 414 | " if len(y_true.shape) == 1:\n", 415 | " 
y_true = np.eye(labels)[y_true]\n", 416 | "\n", 417 | " # Calculate gradient\n", 418 | " self.dinputs = -y_true / dvalues\n", 419 | " # Normalize gradient\n", 420 | " self.dinputs = self.dinputs / samples\n", 421 | "\n", 422 | "# Softmax classifier - combined Softmax activation\n", 423 | "# and cross-entropy loss for faster backward step\n", 424 | "class Activation_Softmax_Loss_CategoricalCrossentropy():\n", 425 | "\n", 426 | " # Backward pass\n", 427 | " def backward(self, dvalues, y_true):\n", 428 | "\n", 429 | " # Number of samples\n", 430 | " samples = len(dvalues)\n", 431 | "\n", 432 | " # If labels are one-hot encoded,\n", 433 | " # turn them into discrete values\n", 434 | " if len(y_true.shape) == 2:\n", 435 | " y_true = np.argmax(y_true, axis=1)\n", 436 | "\n", 437 | " # Copy so we can safely modify\n", 438 | " self.dinputs = dvalues.copy()\n", 439 | " # Calculate gradient\n", 440 | " self.dinputs[range(samples), y_true] -= 1\n", 441 | " # Normalize gradient\n", 442 | " self.dinputs = self.dinputs / samples\n", 443 | "\n", 444 | "\n", 445 | "\n", 446 | "# Common accuracy class\n", 447 | "class Accuracy:\n", 448 | "\n", 449 | " # Calculates an accuracy\n", 450 | " # given predictions and ground truth values\n", 451 | " def calculate(self, predictions, y):\n", 452 | "\n", 453 | " # Get comparison results\n", 454 | " comparisons = self.compare(predictions, y)\n", 455 | "\n", 456 | " # Calculate an accuracy\n", 457 | " accuracy = np.mean(comparisons)\n", 458 | "\n", 459 | " # Add accumulated sum of matching values and sample count\n", 460 | " self.accumulated_sum += np.sum(comparisons)\n", 461 | " self.accumulated_count += len(comparisons)\n", 462 | "\n", 463 | " # Return accuracy\n", 464 | " return accuracy\n", 465 | "\n", 466 | " # Calculates accumulated accuracy\n", 467 | " def calculate_accumulated(self):\n", 468 | "\n", 469 | " # Calculate an accuracy\n", 470 | " accuracy = self.accumulated_sum / self.accumulated_count\n", 471 | "\n", 472 | " # Return the data and regularization losses\n", 473 | " return accuracy\n", 474 | "\n", 475 | " # Reset variables for accumulated accuracy\n", 476 | " def new_pass(self):\n", 477 | " self.accumulated_sum = 0\n", 478 | " self.accumulated_count = 0\n", 479 | "\n", 480 | "\n", 481 | "# Accuracy calculation for classification model\n", 482 | "class Accuracy_Categorical(Accuracy):\n", 483 | "\n", 484 | " def __init__(self, *, binary=False):\n", 485 | " # Binary mode?\n", 486 | " self.binary = binary\n", 487 | "\n", 488 | " # No initialization is needed\n", 489 | " def init(self, y):\n", 490 | " pass\n", 491 | "\n", 492 | " # Compares predictions to the ground truth values\n", 493 | " def compare(self, predictions, y):\n", 494 | " if not self.binary and len(y.shape) == 2:\n", 495 | " y = np.argmax(y, axis=1)\n", 496 | " return predictions == y\n", 497 | "\n", 498 | "\n", 499 | "# Model class\n", 500 | "class Model:\n", 501 | "\n", 502 | " def __init__(self):\n", 503 | " # Create a list of network objects\n", 504 | " self.layers = []\n", 505 | " # Softmax classifier's output object\n", 506 | " self.softmax_classifier_output = None\n", 507 | "\n", 508 | " # Add objects to the model\n", 509 | " def add(self, layer):\n", 510 | " self.layers.append(layer)\n", 511 | "\n", 512 | "\n", 513 | " # Set loss, optimizer and accuracy\n", 514 | " def set(self, *, loss=None, optimizer=None, accuracy=None):\n", 515 | "\n", 516 | " if loss is not None:\n", 517 | " self.loss = loss\n", 518 | "\n", 519 | " if optimizer is not None:\n", 520 | " self.optimizer = 
optimizer\n", 521 | "\n", 522 | " if accuracy is not None:\n", 523 | " self.accuracy = accuracy\n", 524 | "\n", 525 | " # Finalize the model\n", 526 | " def finalize(self):\n", 527 | "\n", 528 | " # Create and set the input layer\n", 529 | " self.input_layer = Layer_Input()\n", 530 | "\n", 531 | " # Count all the objects\n", 532 | " layer_count = len(self.layers)\n", 533 | "\n", 534 | " # Initialize a list containing trainable layers:\n", 535 | " self.trainable_layers = []\n", 536 | "\n", 537 | " # Iterate the objects\n", 538 | " for i in range(layer_count):\n", 539 | "\n", 540 | " # If it's the first layer,\n", 541 | " # the previous layer object is the input layer\n", 542 | " if i == 0:\n", 543 | " self.layers[i].prev = self.input_layer\n", 544 | " self.layers[i].next = self.layers[i+1]\n", 545 | "\n", 546 | " # All layers except for the first and the last\n", 547 | " elif i < layer_count - 1:\n", 548 | " self.layers[i].prev = self.layers[i-1]\n", 549 | " self.layers[i].next = self.layers[i+1]\n", 550 | "\n", 551 | " # The last layer - the next object is the loss\n", 552 | " # Also let's save aside the reference to the last object\n", 553 | " # whose output is the model's output\n", 554 | " else:\n", 555 | " self.layers[i].prev = self.layers[i-1]\n", 556 | " self.layers[i].next = self.loss\n", 557 | " self.output_layer_activation = self.layers[i]\n", 558 | "\n", 559 | "\n", 560 | " # If layer contains an attribute called \"weights\",\n", 561 | " # it's a trainable layer -\n", 562 | " # add it to the list of trainable layers\n", 563 | " # We don't need to check for biases -\n", 564 | " # checking for weights is enough\n", 565 | " if hasattr(self.layers[i], 'weights'):\n", 566 | " self.trainable_layers.append(self.layers[i])\n", 567 | "\n", 568 | " # Update loss object with trainable layers\n", 569 | " if self.loss is not None:\n", 570 | " self.loss.remember_trainable_layers(\n", 571 | " self.trainable_layers\n", 572 | " )\n", 573 | "\n", 574 | " # If output activation is Softmax and\n", 575 | " # loss function is Categorical Cross-Entropy\n", 576 | " # create an object of combined activation\n", 577 | " # and loss function containing\n", 578 | " # faster gradient calculation\n", 579 | " if isinstance(self.layers[-1], Activation_Softmax) and \\\n", 580 | " isinstance(self.loss, Loss_CategoricalCrossentropy):\n", 581 | " # Create an object of combined activation\n", 582 | " # and loss functions\n", 583 | " self.softmax_classifier_output = \\\n", 584 | " Activation_Softmax_Loss_CategoricalCrossentropy()\n", 585 | "\n", 586 | " # Train the model\n", 587 | " def train(self, X, y, *, epochs=1, batch_size=None,\n", 588 | " print_every=1, validation_data=None):\n", 589 | "\n", 590 | " # Initialize accuracy object\n", 591 | " self.accuracy.init(y)\n", 592 | "\n", 593 | " # Default value if batch size is not being set\n", 594 | " train_steps = 1\n", 595 | "\n", 596 | " # Calculate number of steps\n", 597 | " if batch_size is not None:\n", 598 | " train_steps = len(X) // batch_size\n", 599 | " # Dividing rounds down. 
If there are some remaining\n", 600 | " # data but not a full batch, this won't include it\n", 601 | " # Add `1` to include this not full batch\n", 602 | " if train_steps * batch_size < len(X):\n", 603 | " train_steps += 1\n", 604 | "\n", 605 | " \n", 606 | " # Main training loop\n", 607 | " for epoch in range(1, epochs+1):\n", 608 | " train_dict[epoch] = {} # add this\n", 609 | " \n", 610 | " # Print epoch number\n", 611 | " print(f'epoch: {epoch}')\n", 612 | "\n", 613 | " # Reset accumulated values in loss and accuracy objects\n", 614 | " self.loss.new_pass()\n", 615 | " self.accuracy.new_pass()\n", 616 | "\n", 617 | " \n", 618 | " # Iterate over steps\n", 619 | " for step in range(train_steps):\n", 620 | " train_dict[epoch][step] = {} # add this\n", 621 | " # If batch size is not set -\n", 622 | " # train using one step and full dataset\n", 623 | " if batch_size is None:\n", 624 | " batch_X = X\n", 625 | " batch_y = y\n", 626 | "\n", 627 | " # Otherwise slice a batch\n", 628 | " else:\n", 629 | " batch_X = X[step*batch_size:(step+1)*batch_size]\n", 630 | " batch_y = y[step*batch_size:(step+1)*batch_size]\n", 631 | "\n", 632 | " # Perform the forward pass\n", 633 | " output = self.forward(batch_X, training=True)\n", 634 | "\n", 635 | " # Calculate loss\n", 636 | " data_loss, regularization_loss = \\\n", 637 | " self.loss.calculate(output, batch_y,\n", 638 | " include_regularization=True)\n", 639 | " loss = data_loss + regularization_loss\n", 640 | "\n", 641 | " # Get predictions and calculate an accuracy\n", 642 | " predictions = self.output_layer_activation.predictions(\n", 643 | " output)\n", 644 | " accuracy = self.accuracy.calculate(predictions,\n", 645 | " batch_y)\n", 646 | "\n", 647 | " # Perform backward pass\n", 648 | " self.backward(output, batch_y)\n", 649 | "\n", 650 | " # Optimize (update parameters)\n", 651 | " ########################################################\n", 652 | " ### THIS IS WHERE WE SAVE ALL THE DATA FROM TRAINING ###\n", 653 | " ########################################################\n", 654 | " self.optimizer.pre_update_params()\n", 655 | " for n, layer in enumerate(self.trainable_layers): # added enum/n\n", 656 | " self.optimizer.update_params(layer)\n", 657 | " train_dict[epoch][step][n] = {}\n", 658 | " train_dict[epoch][step][n][\"weights\"] = layer.weights.copy()\n", 659 | " train_dict[epoch][step][n][\"biases\"] = layer.biases.copy()\n", 660 | " train_dict[epoch][step][n][\"dweights\"] = layer.dweights.copy()\n", 661 | " train_dict[epoch][step][n][\"dbiases\"] = layer.dbiases.copy()\n", 662 | " train_dict[epoch][step][n][\"weight_momentums\"] = layer.weight_momentums.copy()\n", 663 | " train_dict[epoch][step][n][\"bias_momentums\"] = layer.bias_momentums.copy()\n", 664 | "\n", 665 | " self.optimizer.post_update_params()\n", 666 | "\n", 667 | " # Print a summary\n", 668 | " if not step % print_every or step == train_steps - 1:\n", 669 | " print(f'step: {step}, ' +\n", 670 | " f'acc: {accuracy:.3f}, ' +\n", 671 | " f'loss: {loss:.3f} (' +\n", 672 | " f'data_loss: {data_loss:.3f}, ' +\n", 673 | " f'reg_loss: {regularization_loss:.3f}), ' +\n", 674 | " f'lr: {self.optimizer.current_learning_rate}')\n", 675 | "\n", 676 | " # Get and print epoch loss and accuracy\n", 677 | " epoch_data_loss, epoch_regularization_loss = \\\n", 678 | " self.loss.calculate_accumulated(\n", 679 | " include_regularization=True)\n", 680 | " epoch_loss = epoch_data_loss + epoch_regularization_loss\n", 681 | " epoch_accuracy = self.accuracy.calculate_accumulated()\n", 682 | 
"\n", 683 | " print(f'training, ' +\n", 684 | " f'acc: {epoch_accuracy:.3f}, ' +\n", 685 | " f'loss: {epoch_loss:.3f} (' +\n", 686 | " f'data_loss: {epoch_data_loss:.3f}, ' +\n", 687 | " f'reg_loss: {epoch_regularization_loss:.3f}), ' +\n", 688 | " f'lr: {self.optimizer.current_learning_rate}')\n", 689 | "\n", 690 | " # If there is the validation data\n", 691 | " if validation_data is not None:\n", 692 | "\n", 693 | " # Evaluate the model:\n", 694 | " self.evaluate(*validation_data,\n", 695 | " batch_size=batch_size)\n", 696 | "\n", 697 | " # Evaluates the model using passed-in dataset\n", 698 | " def evaluate(self, X_val, y_val, *, batch_size=None):\n", 699 | "\n", 700 | " # Default value if batch size is not being set\n", 701 | " validation_steps = 1\n", 702 | "\n", 703 | " # Calculate number of steps\n", 704 | " if batch_size is not None:\n", 705 | " validation_steps = len(X_val) // batch_size\n", 706 | " # Dividing rounds down. If there are some remaining\n", 707 | " # data but not a full batch, this won't include it\n", 708 | " # Add `1` to include this not full batch\n", 709 | " if validation_steps * batch_size < len(X_val):\n", 710 | " validation_steps += 1\n", 711 | "\n", 712 | " # Reset accumulated values in loss\n", 713 | " # and accuracy objects\n", 714 | " self.loss.new_pass()\n", 715 | " self.accuracy.new_pass()\n", 716 | "\n", 717 | "\n", 718 | " # Iterate over steps\n", 719 | " for step in range(validation_steps):\n", 720 | "\n", 721 | " # If batch size is not set -\n", 722 | " # train using one step and full dataset\n", 723 | " if batch_size is None:\n", 724 | " batch_X = X_val\n", 725 | " batch_y = y_val\n", 726 | "\n", 727 | " # Otherwise slice a batch\n", 728 | " else:\n", 729 | " batch_X = X_val[\n", 730 | " step*batch_size:(step+1)*batch_size\n", 731 | " ]\n", 732 | " batch_y = y_val[\n", 733 | " step*batch_size:(step+1)*batch_size\n", 734 | " ]\n", 735 | "\n", 736 | " # Perform the forward pass\n", 737 | " output = self.forward(batch_X, training=False)\n", 738 | "\n", 739 | " # Calculate the loss\n", 740 | " self.loss.calculate(output, batch_y)\n", 741 | "\n", 742 | " # Get predictions and calculate an accuracy\n", 743 | " predictions = self.output_layer_activation.predictions(\n", 744 | " output)\n", 745 | " self.accuracy.calculate(predictions, batch_y)\n", 746 | "\n", 747 | " # Get and print validation loss and accuracy\n", 748 | " validation_loss = self.loss.calculate_accumulated()\n", 749 | " validation_accuracy = self.accuracy.calculate_accumulated()\n", 750 | "\n", 751 | " # Print a summary\n", 752 | " print(f'validation, ' +\n", 753 | " f'acc: {validation_accuracy:.3f}, ' +\n", 754 | " f'loss: {validation_loss:.3f}')\n", 755 | "\n", 756 | " # Predicts on the samples\n", 757 | " def predict(self, X, *, batch_size=None):\n", 758 | "\n", 759 | " # Default value if batch size is not being set\n", 760 | " prediction_steps = 1\n", 761 | "\n", 762 | " # Calculate number of steps\n", 763 | " if batch_size is not None:\n", 764 | " prediction_steps = len(X) // batch_size\n", 765 | "\n", 766 | " # Dividing rounds down. 
If there are some remaining\n", 767 | " # data but not a full batch, this won't include it\n", 768 | " # Add `1` to include this not full batch\n", 769 | " if prediction_steps * batch_size < len(X):\n", 770 | " prediction_steps += 1\n", 771 | "\n", 772 | " # Model outputs\n", 773 | " output = []\n", 774 | "\n", 775 | " # Iterate over steps\n", 776 | " for step in range(prediction_steps):\n", 777 | "\n", 778 | " # If batch size is not set -\n", 779 | " # train using one step and full dataset\n", 780 | " if batch_size is None:\n", 781 | " batch_X = X\n", 782 | "\n", 783 | " # Otherwise slice a batch\n", 784 | " else:\n", 785 | " batch_X = X[step*batch_size:(step+1)*batch_size]\n", 786 | "\n", 787 | " # Perform the forward pass\n", 788 | " batch_output = self.forward(batch_X, training=False)\n", 789 | "\n", 790 | " # Append batch prediction to the list of predictions\n", 791 | " output.append(batch_output)\n", 792 | "\n", 793 | " # Stack and return results\n", 794 | " return np.vstack(output)\n", 795 | "\n", 796 | " # Performs forward pass\n", 797 | " def forward(self, X, training):\n", 798 | "\n", 799 | " # Call forward method on the input layer\n", 800 | " # this will set the output property that\n", 801 | " # the first layer in \"prev\" object is expecting\n", 802 | " self.input_layer.forward(X, training)\n", 803 | "\n", 804 | " # Call forward method of every object in a chain\n", 805 | " # Pass output of the previous object as a parameter\n", 806 | " for layer in self.layers:\n", 807 | " layer.forward(layer.prev.output, training)\n", 808 | "\n", 809 | " # \"layer\" is now the last object from the list,\n", 810 | " # return its output\n", 811 | " return layer.output\n", 812 | "\n", 813 | "\n", 814 | " # Performs backward pass\n", 815 | " def backward(self, output, y):\n", 816 | "\n", 817 | " # If softmax classifier\n", 818 | " if self.softmax_classifier_output is not None:\n", 819 | " # First call backward method\n", 820 | " # on the combined activation/loss\n", 821 | " # this will set dinputs property\n", 822 | " self.softmax_classifier_output.backward(output, y)\n", 823 | "\n", 824 | " # Since we'll not call backward method of the last layer\n", 825 | " # which is Softmax activation\n", 826 | " # as we used combined activation/loss\n", 827 | " # object, let's set dinputs in this object\n", 828 | " self.layers[-1].dinputs = \\\n", 829 | " self.softmax_classifier_output.dinputs\n", 830 | "\n", 831 | " # Call backward method going through\n", 832 | " # all the objects but last\n", 833 | " # in reversed order passing dinputs as a parameter\n", 834 | " for layer in reversed(self.layers[:-1]):\n", 835 | " layer.backward(layer.next.dinputs)\n", 836 | "\n", 837 | " return\n", 838 | "\n", 839 | " # First call backward method on the loss\n", 840 | " # this will set dinputs property that the last\n", 841 | " # layer will try to access shortly\n", 842 | " self.loss.backward(output, y)\n", 843 | "\n", 844 | " # Call backward method going through all the objects\n", 845 | " # in reversed order passing dinputs as a parameter\n", 846 | " for layer in reversed(self.layers):\n", 847 | " layer.backward(layer.next.dinputs)\n", 848 | "\n", 849 | " # Retrieves and returns parameters of trainable layers\n", 850 | " def get_parameters(self):\n", 851 | "\n", 852 | " # Create a list for parameters\n", 853 | " parameters = []\n", 854 | "\n", 855 | " # Iterable trainable layers and get their parameters\n", 856 | " for layer in self.trainable_layers:\n", 857 | " parameters.append(layer.get_parameters())\n", 858 | 
"\n", 859 | " # Return a list\n", 860 | " return parameters\n", 861 | "\n", 862 | "\n", 863 | " # Updates the model with new parameters\n", 864 | " def set_parameters(self, parameters):\n", 865 | "\n", 866 | " # Iterate over the parameters and layers\n", 867 | " # and update each layers with each set of the parameters\n", 868 | " for parameter_set, layer in zip(parameters,\n", 869 | " self.trainable_layers):\n", 870 | " layer.set_parameters(*parameter_set)\n", 871 | "\n", 872 | " # Saves the parameters to a file\n", 873 | " def save_parameters(self, path):\n", 874 | "\n", 875 | " # Open a file in the binary-write mode\n", 876 | " # and save parameters into it\n", 877 | " with open(path, 'wb') as f:\n", 878 | " pickle.dump(self.get_parameters(), f)\n", 879 | "\n", 880 | " # Loads the weights and updates a model instance with them\n", 881 | " def load_parameters(self, path):\n", 882 | "\n", 883 | " # Open file in the binary-read mode,\n", 884 | " # load weights and update trainable layers\n", 885 | " with open(path, 'rb') as f:\n", 886 | " self.set_parameters(pickle.load(f))\n", 887 | "\n", 888 | " # Saves the model\n", 889 | " def save(self, path):\n", 890 | "\n", 891 | " # Make a deep copy of current model instance\n", 892 | " model = copy.deepcopy(self)\n", 893 | "\n", 894 | " # Reset accumulated values in loss and accuracy objects\n", 895 | " model.loss.new_pass()\n", 896 | " model.accuracy.new_pass()\n", 897 | "\n", 898 | " # Remove data from the input layer\n", 899 | " # and gradients from the loss object\n", 900 | " model.input_layer.__dict__.pop('output', None)\n", 901 | " model.loss.__dict__.pop('dinputs', None)\n", 902 | "\n", 903 | " # For each layer remove inputs, output and dinputs properties\n", 904 | " for layer in model.layers:\n", 905 | " for property in ['inputs', 'output', 'dinputs',\n", 906 | " 'dweights', 'dbiases']:\n", 907 | " layer.__dict__.pop(property, None)\n", 908 | "\n", 909 | " # Open a file in the binary-write mode and save the model\n", 910 | " with open(path, 'wb') as f:\n", 911 | " pickle.dump(model, f)\n", 912 | "\n", 913 | "\n", 914 | " # Loads and returns a model\n", 915 | " @staticmethod\n", 916 | " def load(path):\n", 917 | "\n", 918 | " # Open file in the binary-read mode, load a model\n", 919 | " with open(path, 'rb') as f:\n", 920 | " model = pickle.load(f)\n", 921 | "\n", 922 | " # Return a model\n", 923 | " return model\n", 924 | "\n", 925 | "\n", 926 | "# Loads a MNIST dataset\n", 927 | "def load_mnist_dataset(dataset, path):\n", 928 | "\n", 929 | " # Scan all the directories and create a list of labels\n", 930 | " labels = os.listdir(os.path.join(path, dataset))\n", 931 | "\n", 932 | " # Create lists for samples and labels\n", 933 | " X = []\n", 934 | " y = []\n", 935 | "\n", 936 | "\n", 937 | " # For each label folder\n", 938 | " for label in labels:\n", 939 | " print(label)\n", 940 | " # And for each image in given folder\n", 941 | " for file in tqdm(os.listdir(os.path.join(path, dataset, label))):\n", 942 | " # Read the image\n", 943 | " image = cv2.imread(\n", 944 | " os.path.join(path, dataset, label, file),\n", 945 | " cv2.IMREAD_UNCHANGED)\n", 946 | "\n", 947 | " # And append it and a label to the lists\n", 948 | " X.append(image)\n", 949 | " y.append(label)\n", 950 | "\n", 951 | " # Convert the data to proper numpy arrays and return\n", 952 | " return np.array(X), np.array(y).astype('uint8')\n", 953 | "\n", 954 | "\n", 955 | "# MNIST dataset (train + test)\n", 956 | "def create_data_mnist(path):\n", 957 | "\n", 958 | " # Load both sets 
separately\n", 959 | " X, y = load_mnist_dataset('train', path)\n", 960 | " X_test, y_test = load_mnist_dataset('test', path)\n", 961 | "\n", 962 | " # And return all the data\n", 963 | " return X, y, X_test, y_test\n", 964 | "\n", 965 | "\n", 966 | "# Create dataset\n", 967 | "X, y, X_test, y_test = create_data_mnist('fashion_mnist_images')\n", 968 | "# Shuffle the training dataset\n", 969 | "keys = np.array(range(X.shape[0]))\n", 970 | "np.random.shuffle(keys)\n", 971 | "X = X[keys]\n", 972 | "y = y[keys]\n", 973 | "\n", 974 | "# Scale and reshape samples\n", 975 | "X = (X.reshape(X.shape[0], -1).astype(np.float32) - 127.5) / 127.5\n", 976 | "X_test = (X_test.reshape(X_test.shape[0], -1).astype(np.float32) -\n", 977 | " 127.5) / 127.5\n" 978 | ] 979 | }, 980 | { 981 | "cell_type": "code", 982 | "execution_count": null, 983 | "metadata": {}, 984 | "outputs": [], 985 | "source": [ 986 | "# Instantiate the model\n", 987 | "model = Model()\n", 988 | "\n", 989 | "# Add layers\n", 990 | "model.add(Layer_Dense(X.shape[1], 32))\n", 991 | "model.add(Activation_ReLU())\n", 992 | "model.add(Layer_Dense(32, 32))\n", 993 | "model.add(Activation_ReLU())\n", 994 | "model.add(Layer_Dense(32, 10))\n", 995 | "model.add(Activation_Softmax())\n", 996 | "\n", 997 | "# Set loss, optimizer and accuracy objects\n", 998 | "model.set(\n", 999 | " loss=Loss_CategoricalCrossentropy(),\n", 1000 | " optimizer=Optimizer_Adam(decay=1e-3),\n", 1001 | " accuracy=Accuracy_Categorical()\n", 1002 | ")\n", 1003 | "\n", 1004 | "# Finalize the model\n", 1005 | "model.finalize()\n", 1006 | "\n", 1007 | "# Train the model\n", 1008 | "model.train(X, y, validation_data=(X_test, y_test),\n", 1009 | " epochs=5, batch_size=128, print_every=100)\n", 1010 | "\n", 1011 | "model.save(\"fashion_mnist.model\")\n", 1012 | "\n", 1013 | "\n", 1014 | "print(\"train_dict\", train_dict)\n", 1015 | "\n", 1016 | "# save train_dict with pickle\n", 1017 | "with open(\"train_dict.pkl\", \"wb\") as f:\n", 1018 | " pickle.dump(train_dict, f)" 1019 | ] 1020 | } 1021 | ], 1022 | "metadata": { 1023 | "kernelspec": { 1024 | "display_name": "Python 3", 1025 | "language": "python", 1026 | "name": "python3" 1027 | }, 1028 | "language_info": { 1029 | "codemirror_mode": { 1030 | "name": "ipython", 1031 | "version": 3 1032 | }, 1033 | "file_extension": ".py", 1034 | "mimetype": "text/x-python", 1035 | "name": "python", 1036 | "nbconvert_exporter": "python", 1037 | "pygments_lexer": "ipython3", 1038 | "version": "3.10.13" 1039 | } 1040 | }, 1041 | "nbformat": 4, 1042 | "nbformat_minor": 2 1043 | } 1044 | --------------------------------------------------------------------------------