├── Perceptron.py ├── main.py ├── CommonFunctions.py ├── training ├── BatchGradientDescent.py ├── StochasticGradientDescent.py ├── Adam.py └── GeneticAlgorithm.py └── README.md /Perceptron.py: -------------------------------------------------------------------------------- 1 | from CommonFunctions import * 2 | 3 | # The structure of this neural net is as follows, where I represents an input node, N a neuron, B a bias and O 4 | # the output. Notice that the bias is treated simply as a node in the previous layer. The bias is extremely important 5 | # for this model, since we will be training it with input values of either 1 or 0, so if we didn't include a bias, 6 | # when calculating the node value we may run into the case where all inputs are 0, leading to the node value being 7 | # sigmoid(0) = 0.5, which is no prediction at all. 8 | # I - 9 | # \ 10 | # I - N - O 11 | # / | 12 | # I - B 13 | # The neural net is technically considered to be feed forward 14 | 15 | 16 | class Perceptron: 17 | 18 | def __init__(self, weights=np.array([]), activation=sigmoid, error=mse): 19 | self.weights = weights 20 | self.er = 1 21 | self.activation = activation 22 | self.error = error 23 | self.error_path = [] 24 | self.avg_diff_path = [] 25 | 26 | def train(self, train_func, *args): 27 | self.weights, self.er = train_func(self.activation, self.error, *args) 28 | 29 | def predict(self, x, display=True): 30 | p_input = np.append(x, 1) # add bias node 31 | neuron_val = np.dot(self.weights, p_input) 32 | p_output = self.activation(neuron_val) 33 | # depending on the activation function the output may be a numpy array, so force it to be a value 34 | if type(p_output) is np.ndarray: 35 | p_output = p_output[0] 36 | prediction = round(p_output) 37 | confidence = max(abs(prediction - p_output), 1 - abs(prediction - p_output)) 38 | 39 | if display: 40 | print("INPUT \n {}".format(x)) 41 | print("PREDICTION \t CONFIDENCE \n {} \t\t {}".format(prediction, confidence)) 42 | print("WEIGHTS \n {}".format(self.weights)) 43 | print("ERROR \n {}".format(self.er)) 44 | 45 | return p_output, prediction, confidence 46 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from training.GeneticAlgorithm import genetic_algorithm 2 | from training.StochasticGradientDescent import sgd 3 | from training.Adam import adam 4 | from training.BatchGradientDescent import bgd 5 | from Perceptron import * 6 | 7 | 8 | if __name__ == "__main__": 9 | x = np.array([[1, 1, 0, 0], 10 | [0, 0, 1, 0], 11 | [1, 0, 0, 1], 12 | [0, 1, 1, 1]]) 13 | y = np.array([0, 0, 1, 1]) 14 | input0 = np.array([0, 1, 1, 0]) 15 | 16 | x1 = np.array([[0], 17 | [1], 18 | [2], 19 | [10]]) 20 | y1 = np.array([5, 7, 9, 25]) 21 | input1 = np.array([100]) 22 | 23 | # Trying to model the first pattern of taking the last element in the list (so weights are [0, 0, 0, 1, 0]) 24 | print("Genetic Algorithm ##################################################") 25 | nn = Perceptron(activation=leaky_relu, error=mse) 26 | nn.train(genetic_algorithm, x, y, 1000, 100, 0.0001) 27 | nn.predict(input0) 28 | 29 | print("Stochastic Gradient Descent ########################################") 30 | nn = Perceptron(activation=sigmoid, error=mae) 31 | nn.train(sgd, x, y, 1000, 1) 32 | nn.predict(input0) 33 | 34 | print("Batch Gradient Descent #############################################") 35 | nn = Perceptron(activation=sigmoid, error=mae) 36 | nn.train(bgd, x, y, 1000, 1) 37 | nn.predict(input0) 38 | 39 | print("Adam Optimizer #####################################################") 40 | nn = Perceptron(activation=sigmoid, error=mse) 41 | nn.train(adam, x, y, 1000, 1, 0) 42 | nn.predict(input0) 43 | 44 | # Trying to model the second pattern y = 2x + 5 (so weights are [2, 5]) 45 | # notice the genetic algorithm cannot do this, since the weights are bound between -1 and 1. 46 | # Any non-momentum gradient descent performs horribly here, and the momentum based algorithms (Adam) perform 47 | # extremely well, often getting exact weights [2, 5] thanks to floating point errors. 48 | print("Adam Optimizer #####################################################") 49 | nn = Perceptron(activation=leaky_relu, error=mse) 50 | nn.train(adam, x1, y1, 1000, 1, 0) 51 | nn.predict(input1) 52 | -------------------------------------------------------------------------------- /CommonFunctions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # Activation Functions ------------------------------------------------------------------------------------------------- 5 | def sigmoid(x, deriv=False): 6 | if deriv: 7 | return sigmoid(x) * (1 - sigmoid(x)) 8 | return 1 / (np.exp(-x) + 1) 9 | 10 | 11 | def tanh(x, deriv=False): 12 | if deriv: 13 | return 1 - tanh(x, False) * tanh(x, False) 14 | return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x)) 15 | 16 | 17 | def relu(x, deriv=False): 18 | if deriv: 19 | if x > 0: 20 | return 1 21 | return 0 22 | return max(0, x) 23 | 24 | 25 | def leaky_relu(x, deriv=False, nslope=0.05): 26 | if deriv: 27 | if x > 0: 28 | return 1 29 | if x < 0: 30 | return nslope 31 | # the function derivative is not defined at x=0, but we set it to zero here to allow it to work 32 | return 0 33 | if x > 0: 34 | return x 35 | return nslope * x 36 | 37 | 38 | def nothing(x, deriv=False): 39 | return x 40 | 41 | 42 | # Error Functions ------------------------------------------------------------------------------------------------------ 43 | # mean squared error 44 | def mse(y, y_hats, deriv=False): 45 | if type(y) is not np.ndarray: 46 | y = np.array([y]) 47 | if type(y_hats) is not np.ndarray: 48 | y_hats = np.array([y_hats]) 49 | n = len(y_hats) 50 | if deriv: 51 | return 2 * np.sum(y_hats - y) / n 52 | return np.sum(np.power((y - y_hats), 2)) / n 53 | 54 | 55 | # mean absolute error 56 | def mae(y, y_hats, deriv=False): 57 | if type(y) is not np.ndarray: 58 | y = np.array([y]) 59 | if type(y_hats) is not np.ndarray: 60 | y_hats = np.array([y_hats]) 61 | n = len(y_hats) 62 | if deriv: 63 | err = np.array([]) 64 | for i in range(n): 65 | if y_hats[i] > y[i]: 66 | err = np.append(err, 1) 67 | elif y[i] > y_hats[i]: 68 | err = np.append(err, -1) 69 | else: 70 | # the derivative isn't defined when the predicted equals the actual, but in our case we don't want to 71 | # change the weights when this happens, so we set the output to 0 72 | err = np.append(err, 0) 73 | return err 74 | return np.sum(abs(y - y_hats)) / n 75 | -------------------------------------------------------------------------------- /training/BatchGradientDescent.py: -------------------------------------------------------------------------------- 1 | from CommonFunctions import * 2 | 3 | 4 | def bgd(activation, error_func, x, y, epochs=1000, learning_rate=0.01): 5 | """ 6 | SUMMARY 7 | Batch Gradient Descent is the default gradient descent algorithm. The idea behind gradient descent is to adjust 8 | the weights in a way that slightly decreases the loss of the model, and calculates this via backpropagation. 9 | It computes the partial derivative of the loss function with respect to each weight (called the gradient), 10 | which is what indicates the direction the weight needs to be adjusted by to slightly decrease the loss. After 11 | each partial derivative is calculated and we have the gradient vector, we use the delta rule to evaluate the 12 | magnitude of the weight adjustment. 13 | PROS 14 | Since we are calculating the full gradient vector, with each weight adjustment, we know we are moving along the 15 | loss function in the perfect way. A common analogy is that we are a ball with no momentum rolling down a hill, 16 | we know that we are always going in the direction which gives us a lower cost. 17 | CONS 18 | Consider what happens when we have a large dataset. Since we compute the partial derivatives for each weight 19 | and only make the adjustments once after going over every data point, the algorithm can be extremely heavy on 20 | memory use. 21 | ARGUMENTS 22 | x: a numpy array, one row for each trial (set of inputs) 23 | y: a numpy array, labels for the inputs x. one label per trial 24 | epochs: integer, number of times we will pass through all training data the weights of the model 25 | learning_rate: float, a relative value which determines how big the weight adjustments are. Too high and the 26 | model won't be able to find the local minimum, too small and the model will take too long to find the 27 | minimum. 28 | activation: function, the type of activation function to be used. (sigmoid works well) 29 | error_func: function, the type of error function to be used. (mean absolute error works well) 30 | RETURN 31 | The function returns two elements, the numpy array of best weights found, as well as the error of the 32 | weights, in that order. 33 | NOTE 34 | The activation function and error functions should be differentiable, because the delta values are dependent 35 | on the derivatives of those functions. 36 | """ 37 | 38 | def calculate_err(): 39 | y_hats = np.array([]) 40 | for u in range(len(x)): 41 | y_hats = np.append(y_hats, activation(np.dot(weights, x[u]))) 42 | return error_func(y, y_hats) 43 | 44 | num_inputs = len(x[0]) 45 | 46 | # we are adding a bias by creating a new node (val=1) in the input layer and treating it as an input 47 | temp = [] 48 | for row in range(len(x)): 49 | temp.append(np.append(x[row], 1)) 50 | x = np.array(temp) 51 | 52 | # we want a vector of length n + 1 (one extra for a bias) for weights, where the values are between -1 and 1 53 | weights = (np.random.rand(1, num_inputs + 1) * 2 - 1)[0] 54 | 55 | # In each epoch, we run the model on every weight for every testing input and output, and adjust each weight once 56 | for epoch in range(epochs): 57 | deltas = np.array([]) 58 | for i in range(len(x)): 59 | input_layer = x[i] 60 | neuron_val = np.dot(weights, input_layer) 61 | y_hat = activation(neuron_val) 62 | # the delta rule, a generalization of the partial derivative of the cost function, thanks to backpropagation 63 | delta = activation(neuron_val, deriv=True) * error_func(y[i], [y_hat], deriv=True) 64 | deltas = np.append(deltas, delta) 65 | # we update the weights only after we have gone through every training data 66 | adjustment = learning_rate * np.dot(deltas, x) 67 | weights -= adjustment 68 | 69 | return weights, calculate_err() 70 | -------------------------------------------------------------------------------- /training/StochasticGradientDescent.py: -------------------------------------------------------------------------------- 1 | from CommonFunctions import * 2 | 3 | 4 | def sgd(activation, error_func, x, y, epochs=1000, learning_rate=0.01): 5 | """ 6 | SUMMARY 7 | Stochastic Gradient Descent is a variant of regular gradient descent (batch gradient descent, or bgd), and it 8 | is effectively the same algorithm, with the exception of when the weights update. Recall that bgd updates all 9 | of the weights at the same time, after the full gradient vector is calculated. What sgd does instead is update 10 | each weight as soon as it's partial derivative is calculated. 11 | PROS 12 | Since the weights are updated very frequently, the loss may fluctuate very heavily, allowing it to possibly 13 | move to a new, lower local minimum. Sgd is very low on memory compared to bgd (imagine storing hundreds of 14 | thousands of gradients in memory before finally adjusting the weight values) 15 | CONS 16 | The update frequency of the weights is also a downside, since it often results in a very choppy path to the 17 | minimum of the cost function. It's sort of like a drunk man trying to walk down a hill, where bgd is a ball 18 | with no momentum; it always adjusts it's direction to be follow the exact path of the hill. 19 | ARGUMENTS 20 | x: a numpy array, one row for each trial (set of inputs) 21 | y: a numpy array, labels for the inputs x. one label per trial 22 | epochs: integer, number of times we will pass through all training data the weights of the model 23 | learning_rate: float, a relative value which determines how big the weight adjustments are. Too high and the 24 | model won't be able to find the local minimum, too small and the model will take too long to find the 25 | minimum. 26 | activation: function, the type of activation function to be used. (sigmoid works well) 27 | error_func: function, the type of error function to be used. (mean absolute error works well) 28 | RETURN 29 | The function returns two elements, the numpy array of best weights found, as well as the error of the 30 | weights, in that order. 31 | NOTE 32 | The activation function and error functions should be differentiable, because the delta values are dependent 33 | on the derivatives of those functions. 34 | """ 35 | 36 | def calculate_err(): 37 | y_hats = np.array([]) 38 | for u in range(len(x)): 39 | y_hats = np.append(y_hats, activation(np.dot(weights, x[u]))) 40 | return error_func(y, y_hats) 41 | 42 | num_inputs = len(x[0]) 43 | 44 | # we are adding a bias by creating a new node (val=1) in the input layer and treating it as an input 45 | temp = [] 46 | for row in range(len(x)): 47 | temp.append(np.append(x[row], 1)) 48 | x = np.array(temp) 49 | 50 | # we want a vector of length n + 1 (one extra for a bias) for weights, where the values are between -1 and 1 51 | weights = (np.random.rand(1, num_inputs + 1) * 2 - 1)[0] 52 | tup = list(zip(x, y)) 53 | # In each epoch, we run the model on every weight for every testing input and output, and adjust each weight once 54 | for epoch in range(epochs): 55 | # since we update every weight as soon as we calculate the gradient, we need to randomize it every epoch 56 | np.random.shuffle(tup) 57 | x = [x1 for x1, _ in tup] 58 | y = [x2 for _, x2 in tup] 59 | for i in range(len(x)): 60 | input_layer = x[i] 61 | neuron_val = np.dot(weights, input_layer) 62 | y_hat = activation(neuron_val) 63 | # the delta rule, a generalization of the partial derivative of the cost function, thanks to backpropagation 64 | delta = activation(neuron_val, deriv=True) * error_func(y[i], y_hat, deriv=True) 65 | for k in range(num_inputs + 1): 66 | adjustment = delta * learning_rate * x[i][k] 67 | # we update each individual weight once per trial 68 | weights[k] -= adjustment 69 | 70 | return weights, calculate_err() 71 | -------------------------------------------------------------------------------- /training/Adam.py: -------------------------------------------------------------------------------- 1 | from CommonFunctions import * 2 | 3 | 4 | def adam(activation, error_func, x, y, epochs=1000, learning_rate=0.001, stop_loss=0.001): 5 | """ 6 | SUMMARY 7 | Using https://arxiv.org/abs/1412.6980 as a reference. 8 | Adam is an optimization of gradient descent. The algorithm is essentially a combination of Stochastic Gradient 9 | Descent with momentum and RMSprop. Adam adjusts the learning rate for each weight (RMSprop), and uses the 10 | moving average of the gradient rather than the gradient itself (momentum). This algorithm can be thought of as 11 | a ball with lots of friction rolling down a hill. It's often the default optimization algorithm used when 12 | training neural networks. 13 | PROS 14 | Adam combines the best of RMSprop and sgd with momentum to have a computationally efficient and scalable 15 | algorithm, with hyperparameters that require no tuning the large majority of the time. 16 | CONS 17 | May suffer weight decay problems (the weights go to zero, and are unable to return). 18 | ARGUMENTS 19 | x: a numpy array, one row for each trial (set of inputs) 20 | y: a numpy array, labels for the inputs x. one label per trial 21 | epochs: integer, number of times we will pass through all training data the weights of the model 22 | learning_rate: float, a relative value which determines how big the weight adjustments are. Too high and the 23 | model won't be able to find the local minimum, too small and the model will take too long to find the 24 | minimum. 25 | stop_loss: float, if every weight is being adjusted by a value is smaller than stop_loss, the current weights 26 | are deemed acceptable, and are returned 27 | activation: function, the type of activation function to be used. (sigmoid works well) 28 | error_func: function, the type of error function to be used. (mean squared error works well) 29 | RETURN 30 | The function returns two elements, the numpy array of best weights found, as well as the error of the 31 | weights, in that order. 32 | NOTE 33 | Typically Adam performs the weight adjustments on a random mini-batch of data (like mini-batch gradient 34 | descent), however, since we manually define the training dataset, we have a very small dataset, so we are able 35 | to treat our entire dataset as a mini-batch. 36 | """ 37 | 38 | num_inputs = len(x[0]) 39 | # we are adding a bias by creating a new node (val=1) in the input layer and treating it as an input 40 | temp = [] 41 | for row in range(len(x)): 42 | temp.append(np.append(x[row], 1)) 43 | x = np.array(temp) 44 | # the two moments, mean and variance 45 | m = np.zeros(num_inputs + 1) 46 | v = np.zeros(num_inputs + 1) 47 | # m moment's exponential decay rate 48 | beta_1 = 0.9 49 | # v moment's exponential decay rate 50 | beta_2 = 0.999 51 | # prevent division by 0 52 | epsilon = 10**-8 53 | 54 | # we want a vector of length n + 1 (one extra for a bias) for weights, where the values are between -1 and 1 55 | weights = (np.random.rand(1, num_inputs + 1) * 2 - 1)[0] 56 | 57 | def calculate_err(): 58 | y_hats = np.array([]) 59 | for k in range(len(x)): 60 | y_hats = np.append(y_hats, activation(np.dot(weights, x[k]))) 61 | return error_func(y, y_hats) 62 | 63 | # In each epoch, we run the model on every weight for every testing input and output, and adjust each weight once 64 | for epoch in range(epochs): 65 | t = epoch + 1 # prevent division by 0 66 | for i in range(len(x)): 67 | input_layer = x[i] 68 | neuron_val = np.dot(weights, input_layer) 69 | y_hat = activation(neuron_val) 70 | 71 | # g is the partial derivative of the cost w.r.t the weight, just like in gradient descent 72 | g = (activation(neuron_val, deriv=True) + epsilon) * error_func(y[i], [y_hat], deriv=True) * input_layer 73 | 74 | # adjust mean and variance such that the decay rate decreases them over time 75 | m = beta_1 * m + (1 - beta_1) * g 76 | v = beta_2 * v + (1 - beta_2) * np.power(g, 2) 77 | 78 | m_hat = m / (1 - np.power(beta_1, t)) 79 | v_hat = v / (1 - np.power(beta_2, t)) 80 | 81 | # final adjustment value, notice epsilon added purely to avoid division by 0 82 | adjustment = learning_rate * m_hat/(np.sqrt(v_hat) + epsilon) 83 | 84 | if all(abs(x) < stop_loss for x in adjustment): 85 | print("EARLY STOP") 86 | return weights, calculate_err() 87 | 88 | weights = weights - adjustment 89 | 90 | return weights, calculate_err() 91 | -------------------------------------------------------------------------------- /training/GeneticAlgorithm.py: -------------------------------------------------------------------------------- 1 | from CommonFunctions import * 2 | import math 3 | import random 4 | from operator import itemgetter 5 | 6 | 7 | def genetic_algorithm(activation, error_func, x, y, generations=100, num_agents=100, stop_error=0.001): 8 | """ 9 | SUMMARY 10 | The genetic algorithm tries to mimic evolution by testing many different weights, ranking them, and making the 11 | weights that scored higher more likely to reproduce, where a variant of their 'genes' (weights) will be passed 12 | down to the next generation. After each generation, the set of weights (agents) is likely going to be slightly 13 | better than the previous agents, thus resulting in an improvement in accuracy over time. 14 | PROS 15 | The algorithm is (typically) unsupervised, meaning it needs no training labels to improve overtime. The 16 | algorithm excels when used on an agent in a video game, since you're able to calculate the fitness of the agent 17 | based on how well it performs in the environment it's in. It also is able to get itself out of local minimums 18 | fairly easily, thanks to random genetic mutations. 19 | CONS 20 | The algorithm is very computationally expensive. 21 | ARGUMENTS 22 | activation: function, the type of activation function to be used. (relu performs well with this model) 23 | error_func: function, the type of function to calculate the error. (mae performs well with this model) 24 | x: a numpy array, one row for each trial (set of inputs) 25 | y: a numpy array, labels for the inputs x. one label per trial 26 | generations: integer, the number of generations to run, where once per generation the agents reproduce 27 | num_agents: int, the number of agents in a generation (one agent is one set of weights for the model) 28 | stop_error: float, if the error of the model is under or equal to this value, the model will be returned as it 29 | is. It's very useful for the genetic algorithm due to the randomized nature of the algorithm; it's possible 30 | to find a well fitting model, but the random mutations alter the model before the accurate model is returned 31 | RETURN 32 | The function returns two elements, the numpy array of best weights found, as well as the error of the 33 | weights, in that order. 34 | NOTE 35 | This algorithm is almost always used for unsupervised training, i.e. in situations where the actual result 36 | you want your model to predict is unknown. This algorithm is common when training an agent to play a video game, 37 | since you aren't able to know what move is the optimal move. However, since this is a learning exercise, we 38 | will use the algorithm in a supervised way, by calculating the 'fitness' of the model directly, using the error 39 | of the prediction. 40 | """ 41 | 42 | def generate_agents(n): 43 | # create random list of weights, it has an extra value for the bias weight 44 | weights = np.empty((0, num_inputs + 1)) 45 | for i in range(n): 46 | new_weight = np.random.rand(1, num_inputs + 1) * 2 - 1 47 | weights = np.append(weights, new_weight, axis=0) 48 | return weights 49 | 50 | def calculate_fitness(current_gen): 51 | loe = np.empty((0, num_inputs + 1)), 52 | for weights in current_gen: 53 | y_hats = np.array([]) 54 | for i in range(len(x)): 55 | input_layer = x[i] 56 | neuron_val = np.dot(weights, input_layer) 57 | y_hat = activation(neuron_val) 58 | y_hats = np.append(y_hats, y_hat) 59 | # the error of the agent 60 | er = error_func(y, y_hats) 61 | if er < stop_error: 62 | print("EARLY STOP") 63 | return weights, er, True 64 | 65 | loe = np.append(loe, er) 66 | # sorts the list of weights in increasing order indexed by the error. the first element in sorted_low is 67 | # the best fitting model, and the last element is the worst fitting model 68 | sorted_tuples = sorted(tuple(zip(loe, current_gen)), key=itemgetter(0)) 69 | sorted_e = np.array([w[0] for w in sorted_tuples]) 70 | sorted_w = np.array([w[1] for w in sorted_tuples]) 71 | return sorted_w, sorted_e, False 72 | 73 | def selection(weights, errors): 74 | num_reproduced = math.ceil(len(weights)*3/4) # we want one fourth of the population to be random, arbitrary % 75 | num_new = num_agents - num_reproduced # the rest of the agents will be generated randomly 76 | 77 | # we want low errors to have a large probability of being selected, so divide errors by 1 78 | error_sum = np.sum(np.divide(1, errors)) 79 | count = 0 80 | probabilities = [] 81 | for i in range(len(weights)): 82 | if i == len(weights) - 1: 83 | # ensure sum of probabilities is equal to 1 84 | num = 1 - count 85 | else: 86 | # divide by the sum of errors so the sum is equal to 1 (without floating-point errors) 87 | num = (1/errors[i])/error_sum 88 | count += num 89 | probabilities.append(num) 90 | 91 | index_choices = np.random.choice(len(weights), num_reproduced, p=probabilities) 92 | weights = weights[index_choices] 93 | 94 | new_weights = [] 95 | # simulate dna swapping during reproduction 96 | for i in range(num_reproduced): 97 | parent1 = random.choice(weights) 98 | parent2 = random.choice(weights) 99 | child = np.array([]) 100 | for index in range(len(parent1)): 101 | rnd = np.random.rand(1) 102 | if rnd > 0.5: 103 | child_weight = parent1[index] 104 | else: 105 | child_weight = parent2[index] 106 | if rnd < 0.1: 107 | # simulate 10% random mutations 108 | child_weight = np.random.rand(1) * 2 - 1 109 | child = np.append(child, child_weight) 110 | new_weights.append(child) 111 | 112 | new_weights = np.array(new_weights) 113 | random_weights = generate_agents(num_new) 114 | new_weights = np.append(new_weights, random_weights, axis=0) 115 | return new_weights 116 | 117 | num_inputs = len(x[0]) 118 | # add a bias node to the input layer, with a value of 1 119 | temp = [] 120 | for row in range(len(x)): 121 | temp.append(np.append(x[row], 1)) 122 | x = np.array(temp) 123 | 124 | # initial list of weights (a generation) 125 | low = generate_agents(num_agents) 126 | 127 | for generation in range(generations): 128 | sorted_weights, sorted_errors, early_quit = calculate_fitness(low) 129 | if early_quit: 130 | return sorted_weights, sorted_errors 131 | low = selection(sorted_weights, sorted_errors) 132 | 133 | best_weight, error, _ = calculate_fitness(low) 134 | return best_weight[0], error[0] 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MiniNeuralNets 2 | A framework for mini neural networks (single node networks, also known as perceptrons), written from scratch in python. The goal of the project is to demystify the workings of a neural network and various training algorithms by providing code written from scratch of the simplest neural network one could have. You can easily train a mini neural network to try to model a linear pattern of your choosing, and swap between training algorithms and cost functions to get a sense of what works, and why. 3 | 4 | ## main.py 5 | This script acts as the controller for the mini neural networks. You're able to instantiate a perceptron with 6 | ```python 7 | nn = Perceptron(activation=sigmoid, error=mse) 8 | ``` 9 | This will store a perceptron in nn, which contains the weights, activation function, error function and accuracy for the model. To train the model, you call `nn.train`, passing the training algorithm of your choice, along with the arguments needed for your chosen training algorithm. As an example, to train a model using stochastic gradient descent on inputs `x`, labels `y`, with 1000 epochs and a learning rate of 0.001, you pass the following parameters: 10 | ```python 11 | nn.train(sgd, x, y, 1000, 0.001) 12 | ``` 13 | Then, to make a prediction of some input `input` using the trained model, you call 14 | ```python 15 | output, prediction, confidence = nn.predict(input) 16 | ``` 17 | where `output` is the actual output of the model, `prediction` is the rounded `output`, and `confidence` is the confidence the model has for it's prediction. 18 | 19 | ## Gradient Descent 20 | There are a few gradient descent variations here, but they all do close to the same thing. Any gradient descent variation uses the gradient of the error function to calculate the direction each weight should be adjusted in order to minimize the error function, and this gradient is calculated by backpropagation. Backpropagation and gradient descent are often used interchangably, however, gradient descent is the actual training algorithm, while backpropagation is a generalization of the computation of the gradient. The idea of the algorithm is to adjust the weights significantly when the error is large, and to make progressively smaller adjustments the closer the error is to 0. The 'gradient' in 'gradient descent' comes from the algorithms use of the gradient of the error function to determine the direction of the adjustment. For each weight, we need to calculate the delta value, which is found by multiplying the derivative of the activation function of the neuron value by the derivative of the error function. Then, to calculate the adjustment amount for that weight, we multiply the delta value by the input value attached to the weight, and we are left with the amount we need to subtract to the weight in order to slightly improve the accuracy of the model. 21 | 22 | ### Batch Gradient Descent (bgd) 23 | The traditional gradient descent (also known as batch gradient descent, or bgd) updates the weights after performing the calculations on the entire training dataset. Batch gradient descent isn't used often, because when you're dealing with large datasets, it takes far too long to make each weight update. 24 | ### Stochastic Gradient Descent (sgd) 25 | Stochastic gradient descent (sgd), is exactly like the traditional batch gradient descent, except the weights are updated as soon as the adjustment is calculated (so weights update once per training element). This allows for a faster time of convergence, and is almost always favoured over the other. The downfall is that since the adjustment is calculated once per training element, the adjustment is really just an approximate adjustment for minimum loss. The end result is what is often described as a drunk man walking down a hill; it makes lots of irregular movements, but gets there in the end. 26 | ### Mini-Batch Gradient Descent (mbgd) 27 | Mini-batch gradient descent is essentially a combination of the previous two. Instead of updating the weights after each calculation like SGD does, mini-batch gradient descent separates the database into mini batches (often of size 32), and performs batch gradient descent on each mini-batch. This allows the model to converge faster than batch gradient descent because weights are updated more frequently, and it also allows for a more accurate gradient approximation since the gradient is calculated with more training samples. Notice when the batch size of mini-batch gradient descent is 1, then it is equivalent to sgd, and when it is sufficiently large, it is equivalent to bgd. 28 | ### Adam Optimizer 29 | Finally, there is Adam, which is a very commonly used optimization of gradient descent. Adam adjusts the learning rate for each weight (like RMSprop), and uses the moving average of the gradient rather than the gradient itself (momentum). This algorithm can be thought of as a ball with lots of friction rolling down a hill. It's often the default optimization algorithm used when training neural networks, because it works in so many differnet scenarios, and is efficient enough to work well with very large datasets. 30 | 31 | We will now train a model using stochastic gradient descent, the sigmoid activation function and the mean absolute error to predict the output of the following pattern, where each row in `x` is a trial, and the desired output of the trial is stored in the corresponding index of `y` 32 | ```python 33 | x = np.array([[1, 1, 0, 0], 34 | [0, 0, 1, 0], 35 | [1, 0, 0, 1], 36 | [0, 1, 1, 1]]) 37 | 38 | y = np.array([0, 0, 1, 1]) 39 | 40 | nn = Perceptron(activation=sigmoid, error=mae) 41 | 42 | # x are the training inputs, y are training labels, 1000 epochs and a learning rate of 1 43 | nn.train(sgd, x, y, 1000, 1) 44 | 45 | input_vals = np.array([0,1,1,0]) 46 | nn.predict(input_vals) 47 | ``` 48 | Notice the output is simply the last element in the input. It's difficult for a perceptron to find more complex patterns, after all, it's only one node. The output is as follows: 49 | ```python 50 | # OUTPUT 51 | INPUT 52 | [0 1 1 0] 53 | PREDICTION CONFIDENCE 54 | 0.0 0.9988789558989664 55 | WEIGHTS 56 | [-1.82548668 -0.30245144 -1.86913931 13.53508116 -4.62078238] 57 | ERROR 58 | 0.0011747806580496268 59 | ``` 60 | So, we have trained a perceptron to model the given pattern, and it has produced the correct result with a confidence of 99.89%, on an input it has never seen before. Notice that each weight corresponds to it's respective input, i.e., the first value in the input pattern has a weight of `-1.825`, the last value in the input pattern has a weight of `13.535`, and the bias has a weight of `-4.621`. From these weights, it's easy to see how the model calculates the result given an input; it essentially ignores the first 3 inputs, making the value of the neuron entirely dependent on the last input value, which is exactly how it is expected to model the pattern. 61 | 62 | ## GeneticAlgorithm.py 63 | The genetic algorithm tries to mimic evolution, by adjusting the weights of a given model until it's found optimal weights. It's typically used in a unsupervised manner, measuring the performance of the model based on the 'score' it obtains in an environment it's being trained in. However, since this is a learning exercise, we slightly modify the algorithm to be supervised, by calculating the score of the model based on how close the output is to the actual answer. There are 5 main steps to this algorithm: 64 | 65 | 1. **Generate Agents:** 66 | We randomly initialize `num_agents` agents in the initial generation. An agent is one set of weights for the model. 67 | 2. **Measure Fitness:** 68 | We find some way to calculate the fitness of each agent. In our case, since we have the desired outputs, we use the error of the model's prediction (using a user-passed error function) to rank the fitnesses of the agents. The fitness is a measure of how accurate the model fits the training inputs. 69 | 3. **Selection:** 70 | We choose some number of agents to reproduce, where the higher the agent's fitness, the more likely they are to be selected for reproduction. 71 | 4. **Reproduction:** 72 | Out of the pool of selected agents, we randomly choose two to reproduce, until we have close to `num_agents` agents (roughly 75%, chosen fairly arbitrarily). The reproduction is the creation of a child, where each weight in the child is chosen to be one of the two parent's weight in that position. We also include a 10% chance for any given weight to be set to a random value. This random mutation allows for new patterns to emerge in case the model gets stuck in a local minimum. 73 | 5. **Create New Generation:** 74 | When we have a number of children that are close to the number of agents we started with (75%, see step 4), we fill the rest of the generation with randomized weights, again to allow new patterns to emerge. 75 | 76 | This process is performed `generations` times, and the assumption of the algorithm is that with each new generation, the average fitness of the agents in the generation will have slightly improved, because the stronger performing weights were more likely to reproduce, passing down their DNA. We will now give an example of the genetic algorithm in action, using it to train a perceptron to model a very simple pattern, where each row in `x` is a trial, and the desired output of the trial is stored in the corresponding index of `y` 77 | 78 | ```python 79 | x = np.array([[0], 80 | [1], 81 | [2]]) 82 | y = np.array([1, 2, 3]) 83 | 84 | nn = Perceptron(activation=relu, error=mae) 85 | 86 | # we will simulate 1000 generations, with each having 100 agents, and return the model early if the error is < 0.002 87 | nn.train(genetic_algorithm, x, y, 1000, 100, 0.002) 88 | 89 | input_vals = np.array([100]) 90 | nn.predict(input_vals) 91 | ``` 92 | Clearly the pattern returns `x+1`, given some `x`, and notice we have chosen to use the `ReLU` activation function. This function takes in some `x` and simply returns the maximum of `x` and `0`. Since we want our output to be an unbouded integer, we are restricted from using activation functions such as `sigmoid` or `tanh`, which is why we choose `ReLU`. Also notice that since `ReLU` returns `max(0,x)`, we aren't able to train the model to predict negative numbers. The output is as follows: 93 | ```python 94 | # OUTPUT 95 | INPUT 96 | [100] 97 | PREDICTION CONFIDENCE 98 | 101.0 0.9762138654645582 99 | WEIGHTS 100 | [0.99977005 0.99920929] 101 | ERROR 102 | 0.001020665568131296 103 | ``` 104 | So, we've successfully trained the perceptron to model the function `x+1` (for relatively small inputs). Consider the weights, noticing that the first weight is the connection between the input and the node value (which is the output, since this is a perceptron), and the second weight is the connection between the bias (value=1) and the node value. It isn't hard to see that if the model was 100% accurate, the weights would be `[1, 1]`, since to calculate the node value, you add the result of the first weight multiplied by the input and the second weight mulitplied by the bias. When both weights are `1`, this simplifies to `node_value = ReLU(input + 1) = input + 1`, which is exactly our desired relationship. 105 | --------------------------------------------------------------------------------