├── brain.py
├── dqn.py
├── testing.py
├── training.py
├── environment.py
└── README.md

--------------------------------------------------------------------------------
/brain.py:
--------------------------------------------------------------------------------
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.optimizers import Adam

# BUILDING THE BRAIN
class Brain(object):

    # BUILDING A FULLY CONNECTED NEURAL NETWORK DIRECTLY INSIDE THE INIT METHOD
    def __init__(self, learning_rate = 0.001, number_actions = 5):
        self.learning_rate = learning_rate

        # BUILDING THE INPUT LAYER COMPOSED OF THE INPUT STATE
        states = Input(shape = (3,))

        # BUILDING THE FIRST FULLY CONNECTED HIDDEN LAYER WITH DROPOUT ACTIVATED
        x = Dense(units = 64, activation = 'sigmoid')(states)
        x = Dropout(rate = 0.1)(x)

        # BUILDING THE SECOND FULLY CONNECTED HIDDEN LAYER WITH DROPOUT ACTIVATED
        y = Dense(units = 32, activation = 'sigmoid')(x)
        y = Dropout(rate = 0.1)(y)

        # BUILDING THE OUTPUT LAYER, FULLY CONNECTED TO THE LAST HIDDEN LAYER
        q_values = Dense(units = number_actions, activation = 'softmax')(y)

        # ASSEMBLING THE FULL ARCHITECTURE INSIDE A MODEL OBJECT
        self.model = Model(inputs = states, outputs = q_values)

        # COMPILING THE MODEL WITH A MEAN-SQUARED ERROR LOSS AND A CHOSEN OPTIMIZER
        self.model.compile(loss = 'mse', optimizer = Adam(lr = learning_rate))

--------------------------------------------------------------------------------
/dqn.py:
--------------------------------------------------------------------------------
# Implementing Deep Q-Learning with Experience Replay

import numpy as np

class DQN(object):

    # INTRODUCING AND INITIALIZING ALL THE PARAMETERS AND VARIABLES OF THE DQN
    def __init__(self, max_memory = 100, discount = 0.9):
        self.memory = list()
        self.max_memory = max_memory
        self.discount = discount

    # MAKING A METHOD THAT BUILDS THE MEMORY IN EXPERIENCE REPLAY
    def remember(self, transition, game_over):
        self.memory.append([transition, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    # MAKING A METHOD THAT BUILDS TWO BATCHES OF INPUTS AND TARGETS
    def get_batch(self, model, batch_size = 10):
        len_memory = len(self.memory)
        num_inputs = self.memory[0][0][0].shape[1]
        num_outputs = model.output_shape[-1]
        inputs = np.zeros((min(len_memory, batch_size), num_inputs))
        targets = np.zeros((min(len_memory, batch_size), num_outputs))
        for i, idx in enumerate(np.random.randint(0, len_memory, size = min(len_memory, batch_size))):
            current_state, action, reward, next_state = self.memory[idx][0]
            game_over = self.memory[idx][1]
            inputs[i] = current_state
            targets[i] = model.predict(current_state)[0]
            Q_sa = np.max(model.predict(next_state)[0])
            if game_over:
                targets[i, action] = reward
            else:
                targets[i, action] = reward + self.discount * Q_sa
        return inputs, targets
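
# ----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original training pipeline): it
# assumes the Brain class from brain.py and the 3-element state vector used in
# training.py, and only sanity-checks remember() and get_batch() on a single
# made-up transition.
if __name__ == "__main__":
    from brain import Brain
    brain = Brain(learning_rate = 0.001, number_actions = 5)
    dqn = DQN(max_memory = 100, discount = 0.9)
    current_state = np.zeros((1, 3))
    next_state = 0.5 * np.ones((1, 3))
    dqn.remember([current_state, 2, 0.1, next_state], game_over = False)
    inputs, targets = dqn.get_batch(brain.model, batch_size = 10)
    print(inputs.shape, targets.shape)  # (1, 3) (1, 5)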
--------------------------------------------------------------------------------
/testing.py:
--------------------------------------------------------------------------------
# Testing the AI

# Installing Keras
# conda install -c conda-forge keras

import os
import numpy as np
import random as rn
from keras.models import load_model

import environment

# Setting seeds for reproducibility
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)
rn.seed(12345)

# SETTING THE PARAMETERS
number_actions = 5
direction_boundary = (number_actions - 1) / 2
temperature_step = 1.5

# BUILDING THE ENVIRONMENT BY SIMPLY CREATING AN OBJECT OF THE ENVIRONMENT CLASS
env = environment.Environment(optimal_temperature = (18.0, 24.0), initial_month = 0, initial_number_users = 20, initial_rate_data = 30)

# LOADING A PRE-TRAINED BRAIN
model = load_model("model.h5")

# CHOOSING THE MODE
train = False

# RUNNING A 1-YEAR SIMULATION IN INFERENCE MODE
env.train = train
current_state, _, _ = env.observe()
for timestep in range(0, 12 * 30 * 24 * 60):
    q_values = model.predict(current_state)
    action = np.argmax(q_values[0])
    if (action - direction_boundary < 0):
        direction = -1
    else:
        direction = 1
    energy_ai = abs(action - direction_boundary) * temperature_step
    next_state, reward, game_over = env.update_env(direction, energy_ai, int(timestep / (30 * 24 * 60)))
    current_state = next_state

# PRINTING THE SIMULATION RESULTS AT THE END OF THE YEAR
print("\n")
print("Total Energy spent with an AI: {:.0f}".format(env.total_energy_ai))
print("Total Energy spent with no AI: {:.0f}".format(env.total_energy_noai))
print("ENERGY SAVED: {:.0f} %".format((env.total_energy_noai - env.total_energy_ai) / env.total_energy_noai * 100))

--------------------------------------------------------------------------------
/training.py:
--------------------------------------------------------------------------------
# Training the AI

# Installing Keras
# conda install -c conda-forge keras

import os
import numpy as np
import random as rn

import environment
import brain
import dqn

# Setting seeds for reproducibility
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)
rn.seed(12345)

# SETTING THE PARAMETERS
epsilon = .3
number_actions = 5
direction_boundary = (number_actions - 1) / 2
number_epochs = 100
max_memory = 3000
batch_size = 512
temperature_step = 1.5

# BUILDING THE ENVIRONMENT BY SIMPLY CREATING AN OBJECT OF THE ENVIRONMENT CLASS
env = environment.Environment(optimal_temperature = (18.0, 24.0), initial_month = 0, initial_number_users = 20, initial_rate_data = 30)

# BUILDING THE BRAIN BY SIMPLY CREATING AN OBJECT OF THE BRAIN CLASS
brain = brain.Brain(learning_rate = 0.00001, number_actions = number_actions)

# BUILDING THE DQN MODEL BY SIMPLY CREATING AN OBJECT OF THE DQN CLASS
dqn = dqn.DQN(max_memory = max_memory, discount = 0.9)

# CHOOSING THE MODE
train = True

# TRAINING THE AI
env.train = train
model = brain.model
early_stopping = True
patience = 10
best_total_reward = -np.inf
patience_count = 0

if (env.train):
    # STARTING THE LOOP OVER ALL THE EPOCHS (1 Epoch = 5 Months)
    for epoch in range(1, number_epochs):
        # INITIALIZING ALL THE VARIABLES OF BOTH THE ENVIRONMENT AND THE TRAINING LOOP
        total_reward = 0
        loss = 0.
        new_month = np.random.randint(0, 12)
        env.reset(new_month = new_month)
        game_over = False
        current_state, _, _ = env.observe()
        timestep = 0
        # STARTING THE LOOP OVER ALL THE TIMESTEPS (1 Timestep = 1 Minute) IN ONE EPOCH
        while ((not game_over) and timestep <= 5 * 30 * 24 * 60):
            # PLAYING THE NEXT ACTION BY EXPLORATION
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, number_actions)
                if (action - direction_boundary < 0):
                    direction = -1
                else:
                    direction = 1
                energy_ai = abs(action - direction_boundary) * temperature_step
            else:
                q_values = model.predict(current_state)
                action = np.argmax(q_values[0])
                if (action - direction_boundary < 0):
                    direction = -1
                else:
                    direction = 1
                energy_ai = abs(action - direction_boundary) * temperature_step
            # UPDATING THE ENVIRONMENT AND REACHING THE NEXT STATE
            next_state, reward, game_over = env.update_env(direction, energy_ai, int(timestep / (30 * 24 * 60)))
            total_reward += reward
            # STORING THIS NEW TRANSITION INTO THE MEMORY
            dqn.remember([current_state, action, reward, next_state], game_over)
            # GATHERING IN TWO SEPARATE BATCHES THE INPUTS AND THE TARGETS
            inputs, targets = dqn.get_batch(model, batch_size = batch_size)
            # COMPUTING THE LOSS OVER THE TWO WHOLE BATCHES OF INPUTS AND TARGETS
            loss += model.train_on_batch(inputs, targets)
            timestep += 1
            current_state = next_state

        # PRINTING THE TRAINING RESULTS FOR EACH EPOCH
        print("\n")
        print("Epoch: {:03d}/{:03d}".format(epoch, number_epochs))
        print("Total Energy spent with an AI: {:.0f}".format(env.total_energy_ai))
        print("Total Energy spent with no AI: {:.0f}".format(env.total_energy_noai))

        # EARLY STOPPING
        if (early_stopping):
            if (total_reward <= best_total_reward):
                patience_count += 1
            elif (total_reward > best_total_reward):
                best_total_reward = total_reward
                patience_count = 0
            if (patience_count >= patience):
                print("Early Stopping")
                break

        # SAVING THE MODEL
        model.save("model.h5")

--------------------------------------------------------------------------------
/environment.py:
--------------------------------------------------------------------------------
# Building the Environment

import numpy as np

# BUILDING THE ENVIRONMENT IN A CLASS
class Environment(object):

    # INTRODUCING AND INITIALIZING ALL THE PARAMETERS AND VARIABLES OF THE ENVIRONMENT
    def __init__(self, optimal_temperature = (18.0, 24.0), initial_month = 0, initial_number_users = 10, initial_rate_data = 60):
        self.monthly_atmospheric_temperatures = [1.0, 5.0, 7.0, 10.0, 11.0, 20.0, 23.0, 24.0, 22.0, 10.0, 5.0, 1.0]
        self.initial_month = initial_month
        self.atmospheric_temperature = self.monthly_atmospheric_temperatures[initial_month]
        self.optimal_temperature = optimal_temperature
        self.min_temperature = -20
        self.max_temperature = 80
        self.min_number_users = 10
        self.max_number_users = 100
        self.max_update_users = 5
        self.min_rate_data = 20
        self.max_rate_data = 300
        self.max_update_data = 10
        self.initial_number_users = initial_number_users
        self.current_number_users = initial_number_users
        self.initial_rate_data = initial_rate_data
        self.current_rate_data = initial_rate_data
        self.intrinsic_temperature = self.atmospheric_temperature + 1.25 * self.current_number_users + 1.25 * self.current_rate_data
        self.temperature_ai = self.intrinsic_temperature
        self.temperature_noai = (self.optimal_temperature[0] + self.optimal_temperature[1]) / 2.0
        self.total_energy_ai = 0.0
        self.total_energy_noai = 0.0
        self.reward = 0.0
        self.game_over = 0
        self.train = 1

    # MAKING A METHOD THAT UPDATES THE ENVIRONMENT RIGHT AFTER THE AI PLAYS AN ACTION
    def update_env(self, direction, energy_ai, month):

        # GETTING THE REWARD

        # Computing the energy spent by the server's cooling system when there is no AI
        energy_noai = 0
        if (self.temperature_noai < self.optimal_temperature[0]):
            energy_noai = self.optimal_temperature[0] - self.temperature_noai
            self.temperature_noai = self.optimal_temperature[0]
        elif (self.temperature_noai > self.optimal_temperature[1]):
            energy_noai = self.temperature_noai - self.optimal_temperature[1]
            self.temperature_noai = self.optimal_temperature[1]
        # Computing the Reward
        self.reward = energy_noai - energy_ai
        # Scaling the Reward
        self.reward = 1e-3 * self.reward

        # GETTING THE NEXT STATE

        # Updating the atmospheric temperature
        self.atmospheric_temperature = self.monthly_atmospheric_temperatures[month]
        # Updating the number of users
        self.current_number_users += np.random.randint(-self.max_update_users, self.max_update_users)
        if (self.current_number_users > self.max_number_users):
            self.current_number_users = self.max_number_users
        elif (self.current_number_users < self.min_number_users):
            self.current_number_users = self.min_number_users
        # Updating the rate of data
        self.current_rate_data += np.random.randint(-self.max_update_data, self.max_update_data)
        if (self.current_rate_data > self.max_rate_data):
            self.current_rate_data = self.max_rate_data
        elif (self.current_rate_data < self.min_rate_data):
            self.current_rate_data = self.min_rate_data
        # Computing the Delta of Intrinsic Temperature
        past_intrinsic_temperature = self.intrinsic_temperature
        self.intrinsic_temperature = self.atmospheric_temperature + 1.25 * self.current_number_users + 1.25 * self.current_rate_data
        delta_intrinsic_temperature = self.intrinsic_temperature - past_intrinsic_temperature
        # Computing the Delta of Temperature caused by the AI
        if (direction == -1):
            delta_temperature_ai = -energy_ai
        elif (direction == 1):
            delta_temperature_ai = energy_ai
        # Updating the new Server's Temperature when there is the AI
        self.temperature_ai += delta_intrinsic_temperature + delta_temperature_ai
        # Updating the new Server's Temperature when there is no AI
        self.temperature_noai += delta_intrinsic_temperature

        # GETTING GAME OVER

        if (self.temperature_ai < self.min_temperature):
            if (self.train == 1):
                self.game_over = 1
            else:
                self.total_energy_ai += self.optimal_temperature[0] - self.temperature_ai
                self.temperature_ai = self.optimal_temperature[0]
        elif (self.temperature_ai > self.max_temperature):
            if (self.train == 1):
                self.game_over = 1
            else:
                self.total_energy_ai += self.temperature_ai - self.optimal_temperature[1]
                self.temperature_ai = self.optimal_temperature[1]

        # UPDATING THE SCORES

        # Updating the Total Energy spent by the AI
        self.total_energy_ai += energy_ai
        # Updating the Total Energy spent by the alternative system when there is no AI
        self.total_energy_noai += energy_noai

        # SCALING THE NEXT STATE
        scaled_temperature_ai = (self.temperature_ai - self.min_temperature) / (self.max_temperature - self.min_temperature)
        scaled_number_users = (self.current_number_users - self.min_number_users) / (self.max_number_users - self.min_number_users)
        scaled_rate_data = (self.current_rate_data - self.min_rate_data) / (self.max_rate_data - self.min_rate_data)
        next_state = np.matrix([scaled_temperature_ai, scaled_number_users, scaled_rate_data])

        # RETURNING THE NEXT STATE, THE REWARD, AND GAME OVER
        return next_state, self.reward, self.game_over

    # MAKING A METHOD THAT RESETS THE ENVIRONMENT
    def reset(self, new_month):
        self.atmospheric_temperature = self.monthly_atmospheric_temperatures[new_month]
        self.initial_month = new_month
        self.current_number_users = self.initial_number_users
        self.current_rate_data = self.initial_rate_data
        self.intrinsic_temperature = self.atmospheric_temperature + 1.25 * self.current_number_users + 1.25 * self.current_rate_data
        self.temperature_ai = self.intrinsic_temperature
        self.temperature_noai = (self.optimal_temperature[0] + self.optimal_temperature[1]) / 2.0
        self.total_energy_ai = 0.0
        self.total_energy_noai = 0.0
        self.reward = 0.0
        self.game_over = 0
        self.train = 1

    # MAKING A METHOD THAT GIVES US AT ANY TIME THE CURRENT STATE, THE REWARD AND GAME OVER
    def observe(self):
        scaled_temperature_ai = (self.temperature_ai - self.min_temperature) / (self.max_temperature - self.min_temperature)
        scaled_number_users = (self.current_number_users - self.min_number_users) / (self.max_number_users - self.min_number_users)
        scaled_rate_data = (self.current_rate_data - self.min_rate_data) / (self.max_rate_data - self.min_rate_data)
        current_state = np.matrix([scaled_temperature_ai, scaled_number_users, scaled_rate_data])
        return current_state, self.reward, self.game_over
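
# ----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original pipeline): it builds the
# environment with the same hypothetical settings as training.py and plays a
# single simulated minute, cooling the server by 1.5 degrees Celsius
# (direction = -1, energy_ai = 1.5) in the first month.
if __name__ == "__main__":
    env = Environment(optimal_temperature = (18.0, 24.0), initial_month = 0, initial_number_users = 20, initial_rate_data = 30)
    current_state, reward, game_over = env.observe()
    next_state, reward, game_over = env.update_env(direction = -1, energy_ai = 1.5, month = 0)
    print(current_state, next_state, reward, game_over)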
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AIPoweredCooling

# Problem to solve

In 2016, DeepMind cut a large part of Google's costs by reducing the Google Data Centre cooling bill by 40% with its DQN AI model (Deep Q-Learning). We will do something very similar: we will build an AI that controls the cooling and heating of a server so that its temperature stays within an optimal range while spending the least possible energy, thereby minimizing costs. Just as DeepMind did, our goal will be to achieve at least 40% energy savings.

# Environment to define

Before we define the states, actions and rewards, we need to explain how the server operates. We will do that in several steps. First, we will list all the environment parameters and variables by which the server is controlled. After that, we will state the essential assumption of the problem, on which our AI will rely to provide a solution. Then we will specify how we will simulate the whole process. Finally, we will explain the overall functioning of the server and how the AI plays its role.

Parameters:

- the average atmospheric temperature over a month
- the optimal range of temperatures of the server, which will be [18°C, 24°C]
- the minimum temperature of the server below which it fails to operate, which will be -20°C
- the maximum temperature of the server above which it fails to operate, which will be 80°C
- the minimum number of users in the server, which will be 10
- the maximum number of users in the server, which will be 100
- the maximum number of users in the server that can go up or down per minute, which will be 5
- the minimum rate of data transmission in the server, which will be 20
- the maximum rate of data transmission in the server, which will be 300
- the maximum rate of data transmission that can go up or down per minute, which will be 10

Variables:

- the temperature of the server at any minute
- the number of users in the server at any minute
- the rate of data transmission at any minute
- the energy spent by the AI on the server (to cool it down or heat it up) at any minute
- the energy spent by the server's integrated cooling system, which automatically brings the server's temperature back to the optimal range whenever the temperature goes outside that range

All these parameters and variables will be part of our server environment and will influence the actions of the AI on the server.

The number of users and the rate of data transmission will fluctuate randomly to simulate a real server. This introduces randomness into the temperature, and the AI has to figure out how much cooling or heating power to transfer to the server so as not to degrade the server's performance while, at the same time, spending the least energy by optimizing its heat transfer.

Defining the states.

The input state s_t at time t is composed of the following three elements:

1. The temperature of the server at time t.
2. The number of users in the server at time t.
3. The rate of data transmission in the server at time t.

Thus the input state is a vector of these three elements. Our future AI will take this vector as input and return the action to play at each time t.

Defining the actions.

The actions are simply the temperature changes that the AI can cause inside the server, in order to heat it up or cool it down. To keep the actions discrete, we consider 5 possible temperature changes from -3°C to +3°C, so that we end up with the following 5 possible actions that the AI can play to regulate the temperature of the server (the snippet after the table shows how this mapping is computed in code):

| Action | Effect on the server's temperature |
|--------|------------------------------------|
| 0 | The AI cools down the server by 3°C |
| 1 | The AI cools down the server by 1.5°C |
| 2 | The AI does not transfer any heat (no temperature change) |
| 3 | The AI heats up the server by 1.5°C |
| 4 | The AI heats up the server by 3°C |
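
The minimal sketch below shows how an action index is decoded into a heating/cooling direction and an energy (the absolute temperature change), mirroring how training.py and testing.py use `direction_boundary` and `temperature_step`; the helper `decode_action` is introduced here only for illustration.

```python
number_actions = 5
direction_boundary = (number_actions - 1) / 2   # = 2.0, the "no change" action
temperature_step = 1.5                          # degrees per unit of distance from the boundary

def decode_action(action):
    direction = -1 if action - direction_boundary < 0 else 1
    energy_ai = abs(action - direction_boundary) * temperature_step
    return direction * energy_ai                # signed temperature change in °C

print([decode_action(a) for a in range(number_actions)])   # [-3.0, -1.5, 0.0, 1.5, 3.0]
```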
Defining the rewards.

The reward at iteration t is the energy the AI saves on the server with respect to the server's integrated cooling system, that is, the difference between the energy the unintelligent cooling system would spend if the AI were deactivated and the energy the AI actually spends on the server:

Reward_t = Energy saved by the AI between t and t + 1 = E_t^noAI - E_t^AI = |ΔT^noAI| - |ΔT^AI|

# AI Solution

From Q-Learning to Deep Q-Learning

Deep Q-Learning consists of combining Q-Learning with an Artificial Neural Network. The inputs are encoded vectors, each one defining a state of the environment. These inputs go into an Artificial Neural Network whose output is the action to play. More precisely, if the game has n possible actions, the output layer of the neural network comprises n output neurons, each corresponding to the Q-value of one action in the current state. The action played is then the one associated with the output neuron that has the highest Q-value (argmax), or the one returned by the softmax method; in our case we will use argmax. And since Q-values are real numbers, this makes our neural network an ANN for regression.

Hence, in each state s_t:

- the prediction is the Q-value Q(s_t, a_t), where a_t is chosen by argmax or softmax
- the target is r_t + γ max_a Q(s_{t+1}, a)
- the loss is the squared Temporal Difference between the target and the prediction

This loss is then backpropagated into the network, and the weights are updated according to how much they contributed to the error.
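
As a concrete illustration, here is a tiny numerical sketch of how one target is built from a single transition; the Q-values, action and reward are made up, and `dqn.py`'s `get_batch` applies the same rule to each transition of a random batch drawn from the replay memory.

```python
import numpy as np

gamma = 0.9                                             # discount factor
q_current = np.array([0.10, 0.40, 0.05, 0.30, 0.15])    # predicted Q(s_t, a) for the 5 actions
q_next    = np.array([0.20, 0.10, 0.50, 0.05, 0.15])    # predicted Q(s_{t+1}, a)
action, reward = 1, 0.2                                  # the transition actually experienced

# Only the entry of the action that was played is changed; the other entries
# stay equal to the prediction, so they contribute nothing to the loss.
target = q_current.copy()
target[action] = reward + gamma * np.max(q_next)         # 0.2 + 0.9 * 0.5 = 0.65

loss = np.sum((target - q_current) ** 2)                 # squared Temporal Difference
print(target, loss)                                      # loss = (0.65 - 0.40)**2 = 0.0625
```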
Experience Replay

So far we have only considered transitions from one state s_t to the next state s_{t+1}. The problem is that s_t is, most of the time, strongly correlated with s_{t+1}, so the network does not learn much from consecutive transitions. Learning improves considerably if, instead of considering only the single previous transition, we keep the last m transitions, where m is a large number. This collection of the last m transitions is what is called Experience Replay. From this Experience Replay memory we then sample random batches of transitions to make our updates.

The Brain

The brain, or more precisely the deep neural network of our AI, will be a fully connected neural network with two hidden layers: the first with 64 neurons and the second with 32 neurons. As a reminder, this neural network takes as input the state of the environment and returns as output the Q-values for each of the 5 actions. This artificial brain will be trained with a Mean Squared Error loss and the Adam optimizer.

# Implementation

Step 1: Building the Environment
1. Step 1-1: Introducing and initializing all the parameters and variables of the environment.
2. Step 1-2: Making a method that updates the environment right after the AI plays an action.
3. Step 1-3: Making a method that resets the environment.
4. Step 1-4: Making a method that gives us, at any time, the current state, the last reward obtained, and whether the game is over.

Step 2: Building the Brain
1. Step 2-1: Building the input layer composed of the input states.
2. Step 2-2: Building the hidden layers, with a chosen number of layers and neurons in each, fully connected to the input layer and to each other.
3. Step 2-3: Building the output layer, fully connected to the last hidden layer.
4. Step 2-4: Assembling the full architecture inside a model object.
5. Step 2-5: Compiling the model with a Mean-Squared Error loss function and a chosen optimizer.

Step 3: Implementing the Deep Reinforcement Learning Algorithm
1. Step 3-1: Introducing and initializing all the parameters and variables of the DQN model.
2. Step 3-2: Making a method that builds the memory in Experience Replay.
3. Step 3-3: Making a method that builds and returns two batches of inputs and targets (of size 10 by default).

Step 4: Training the AI
1. Step 4-1: Building the environment by creating an object of the Environment class built in Step 1.
2. Step 4-2: Building the artificial brain by creating an object of the Brain class built in Step 2.
3. Step 4-3: Building the DQN model by creating an object of the DQN class built in Step 3.
4. Step 4-4: Choosing the training mode.
5. Step 4-5: Starting the training with a for loop over a chosen number of epochs.
6. Step 4-6: During each epoch we repeat the whole Deep Q-Learning process, while also doing some exploration 30% of the time.

Step 5: Testing the AI
1. Step 5-1: Building a new environment by creating an object of the Environment class built in Step 1.
2. Step 5-2: Loading the artificial brain with its pre-trained weights from the previous training.
3. Step 5-3: Choosing the inference mode.
4. Step 5-4: Starting the simulation.
5. Step 5-5: At each iteration (each minute), our AI only plays the action that results from its prediction; no exploration or Deep Q-Learning training happens whatsoever.

--------------------------------------------------------------------------------