├── brain.py
├── dqn.py
├── testing.py
├── training.py
├── environment.py
└── README.md

--------------------------------------------------------------------------------
/brain.py:
--------------------------------------------------------------------------------
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.optimizers import Adam

# BUILDING THE BRAIN
class Brain(object):

    # BUILDING A FULLY CONNECTED NEURAL NETWORK DIRECTLY INSIDE THE INIT METHOD
    def __init__(self, learning_rate = 0.001, number_actions = 5):
        self.learning_rate = learning_rate

        # BUILDING THE INPUT LAYER COMPOSED OF THE INPUT STATE
        states = Input(shape = (3,))

        # BUILDING THE FIRST FULLY CONNECTED HIDDEN LAYER WITH DROPOUT ACTIVATED
        x = Dense(units = 64, activation = 'sigmoid')(states)
        x = Dropout(rate = 0.1)(x)

        # BUILDING THE SECOND FULLY CONNECTED HIDDEN LAYER WITH DROPOUT ACTIVATED
        y = Dense(units = 32, activation = 'sigmoid')(x)
        y = Dropout(rate = 0.1)(y)

        # BUILDING THE OUTPUT LAYER, FULLY CONNECTED TO THE LAST HIDDEN LAYER
        q_values = Dense(units = number_actions, activation = 'softmax')(y)

        # ASSEMBLING THE FULL ARCHITECTURE INSIDE A MODEL OBJECT
        self.model = Model(inputs = states, outputs = q_values)

        # COMPILING THE MODEL WITH A MEAN-SQUARED ERROR LOSS AND A CHOSEN OPTIMIZER
        self.model.compile(loss = 'mse', optimizer = Adam(lr = learning_rate))

--------------------------------------------------------------------------------
/dqn.py:
--------------------------------------------------------------------------------
# Implementing Deep Q-Learning with Experience Replay

import numpy as np

class DQN(object):

    # INTRODUCING AND INITIALIZING ALL THE PARAMETERS AND VARIABLES OF THE DQN
    def __init__(self, max_memory = 100, discount = 0.9):
        self.memory = list()
        self.max_memory = max_memory
        self.discount = discount

    # MAKING A METHOD THAT BUILDS THE MEMORY IN EXPERIENCE REPLAY
    def remember(self, transition, game_over):
        self.memory.append([transition, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    # MAKING A METHOD THAT BUILDS TWO BATCHES OF INPUTS AND TARGETS
    def get_batch(self, model, batch_size = 10):
        len_memory = len(self.memory)
        num_inputs = self.memory[0][0][0].shape[1]
        num_outputs = model.output_shape[-1]
        inputs = np.zeros((min(len_memory, batch_size), num_inputs))
        targets = np.zeros((min(len_memory, batch_size), num_outputs))
        for i, idx in enumerate(np.random.randint(0, len_memory, size = min(len_memory, batch_size))):
            current_state, action, reward, next_state = self.memory[idx][0]
            game_over = self.memory[idx][1]
            inputs[i] = current_state
            targets[i] = model.predict(current_state)[0]
            Q_sa = np.max(model.predict(next_state)[0])
            if game_over:
                targets[i, action] = reward
            else:
                targets[i, action] = reward + self.discount * Q_sa
        return inputs, targets
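
# ----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original training pipeline): it
# assumes the Brain class from brain.py and the 3-element state vector used in
# training.py, and only sanity-checks remember() and get_batch() on a single
# made-up transition.
if __name__ == "__main__":
    from brain import Brain
    brain = Brain(learning_rate = 0.001, number_actions = 5)
    dqn = DQN(max_memory = 100, discount = 0.9)
    current_state = np.zeros((1, 3))
    next_state = 0.5 * np.ones((1, 3))
    dqn.remember([current_state, 2, 0.1, next_state], game_over = False)
    inputs, targets = dqn.get_batch(brain.model, batch_size = 10)
    print(inputs.shape, targets.shape)  # (1, 3) (1, 5)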
--------------------------------------------------------------------------------
/testing.py:
--------------------------------------------------------------------------------
# Testing the AI

# Installing Keras
# conda install -c conda-forge keras

import os
import numpy as np
import random as rn
from keras.models import load_model

import environment

# Setting seeds for reproducibility
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)
rn.seed(12345)

# SETTING THE PARAMETERS
number_actions = 5
direction_boundary = (number_actions - 1) / 2
temperature_step = 1.5

# BUILDING THE ENVIRONMENT BY SIMPLY CREATING AN OBJECT OF THE ENVIRONMENT CLASS
env = environment.Environment(optimal_temperature = (18.0, 24.0), initial_month = 0, initial_number_users = 20, initial_rate_data = 30)

# LOADING A PRE-TRAINED BRAIN
model = load_model("model.h5")

# CHOOSING THE MODE
train = False

# RUNNING A 1-YEAR SIMULATION IN INFERENCE MODE
env.train = train
current_state, _, _ = env.observe()
for timestep in range(0, 12 * 30 * 24 * 60):
    q_values = model.predict(current_state)
    action = np.argmax(q_values[0])
    if (action - direction_boundary < 0):
        direction = -1
    else:
        direction = 1
    energy_ai = abs(action - direction_boundary) * temperature_step
    next_state, reward, game_over = env.update_env(direction, energy_ai, int(timestep / (30 * 24 * 60)))
    current_state = next_state

# PRINTING THE SIMULATION RESULTS AT THE END OF THE YEAR
print("\n")
print("Total Energy spent with an AI: {:.0f}".format(env.total_energy_ai))
print("Total Energy spent with no AI: {:.0f}".format(env.total_energy_noai))
print("ENERGY SAVED: {:.0f} %".format((env.total_energy_noai - env.total_energy_ai) / env.total_energy_noai * 100))

--------------------------------------------------------------------------------
/training.py:
--------------------------------------------------------------------------------
# Training the AI

# Installing Keras
# conda install -c conda-forge keras

import os
import numpy as np
import random as rn

import environment
import brain
import dqn

# Setting seeds for reproducibility
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)
rn.seed(12345)

# SETTING THE PARAMETERS
epsilon = .3
number_actions = 5
direction_boundary = (number_actions - 1) / 2
number_epochs = 100
max_memory = 3000
batch_size = 512
temperature_step = 1.5

# BUILDING THE ENVIRONMENT BY SIMPLY CREATING AN OBJECT OF THE ENVIRONMENT CLASS
env = environment.Environment(optimal_temperature = (18.0, 24.0), initial_month = 0, initial_number_users = 20, initial_rate_data = 30)

# BUILDING THE BRAIN BY SIMPLY CREATING AN OBJECT OF THE BRAIN CLASS
brain = brain.Brain(learning_rate = 0.00001, number_actions = number_actions)

# BUILDING THE DQN MODEL BY SIMPLY CREATING AN OBJECT OF THE DQN CLASS
dqn = dqn.DQN(max_memory = max_memory, discount = 0.9)

# CHOOSING THE MODE
train = True

# TRAINING THE AI
env.train = train
model = brain.model
early_stopping = True
patience = 10
best_total_reward = -np.inf
patience_count = 0

if (env.train):
    # STARTING THE LOOP OVER ALL THE EPOCHS (1 Epoch = 5 Months)
    for epoch in range(1, number_epochs):
        # INITIALIZING ALL THE VARIABLES OF BOTH THE ENVIRONMENT AND THE TRAINING LOOP
        total_reward = 0
        loss = 0.
        new_month = np.random.randint(0, 12)
        env.reset(new_month = new_month)
        game_over = False
        current_state, _, _ = env.observe()
        timestep = 0
        # STARTING THE LOOP OVER ALL THE TIMESTEPS (1 Timestep = 1 Minute) IN ONE EPOCH
        while ((not game_over) and timestep <= 5 * 30 * 24 * 60):
            # PLAYING THE NEXT ACTION BY EXPLORATION
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, number_actions)
                if (action - direction_boundary < 0):
                    direction = -1
                else:
                    direction = 1
                energy_ai = abs(action - direction_boundary) * temperature_step
            else:
                q_values = model.predict(current_state)
                action = np.argmax(q_values[0])
                if (action - direction_boundary < 0):
                    direction = -1
                else:
                    direction = 1
                energy_ai = abs(action - direction_boundary) * temperature_step
            # UPDATING THE ENVIRONMENT AND REACHING THE NEXT STATE
            next_state, reward, game_over = env.update_env(direction, energy_ai, int(timestep / (30 * 24 * 60)))
            total_reward += reward
            # STORING THIS NEW TRANSITION INTO THE MEMORY
            dqn.remember([current_state, action, reward, next_state], game_over)
            # GATHERING IN TWO SEPARATE BATCHES THE INPUTS AND THE TARGETS
            inputs, targets = dqn.get_batch(model, batch_size = batch_size)
            # COMPUTING THE LOSS OVER THE TWO WHOLE BATCHES OF INPUTS AND TARGETS
            loss += model.train_on_batch(inputs, targets)
            timestep += 1
            current_state = next_state

        # PRINTING THE TRAINING RESULTS FOR EACH EPOCH
        print("\n")
        print("Epoch: {:03d}/{:03d}".format(epoch, number_epochs))
        print("Total Energy spent with an AI: {:.0f}".format(env.total_energy_ai))
        print("Total Energy spent with no AI: {:.0f}".format(env.total_energy_noai))

        # EARLY STOPPING
        if (early_stopping):
            if (total_reward <= best_total_reward):
                patience_count += 1
            elif (total_reward > best_total_reward):
                best_total_reward = total_reward
                patience_count = 0
            if (patience_count >= patience):
                print("Early Stopping")
                break

        # SAVING THE MODEL
        model.save("model.h5")

--------------------------------------------------------------------------------
/environment.py:
--------------------------------------------------------------------------------
# Building the Environment

import numpy as np

# BUILDING THE ENVIRONMENT IN A CLASS
class Environment(object):

    # INTRODUCING AND INITIALIZING ALL THE PARAMETERS AND VARIABLES OF THE ENVIRONMENT
    def __init__(self, optimal_temperature = (18.0, 24.0), initial_month = 0, initial_number_users = 10, initial_rate_data = 60):
        self.monthly_atmospheric_temperatures = [1.0, 5.0, 7.0, 10.0, 11.0, 20.0, 23.0, 24.0, 22.0, 10.0, 5.0, 1.0]
        self.initial_month = initial_month
        self.atmospheric_temperature = self.monthly_atmospheric_temperatures[initial_month]
        self.optimal_temperature = optimal_temperature
        self.min_temperature = -20
        self.max_temperature = 80
        self.min_number_users = 10
        self.max_number_users = 100
        self.max_update_users = 5
        self.min_rate_data = 20
        self.max_rate_data = 300
        self.max_update_data = 10
        self.initial_number_users = initial_number_users
        self.current_number_users = initial_number_users
        self.initial_rate_data = initial_rate_data
        self.current_rate_data = initial_rate_data
        self.intrinsic_temperature = self.atmospheric_temperature + 1.25 * self.current_number_users + 1.25 * self.current_rate_data
        self.temperature_ai = self.intrinsic_temperature
        self.temperature_noai = (self.optimal_temperature[0] + self.optimal_temperature[1]) / 2.0
        self.total_energy_ai = 0.0
        self.total_energy_noai = 0.0
        self.reward = 0.0
        self.game_over = 0
        self.train = 1

    # MAKING A METHOD THAT UPDATES THE ENVIRONMENT RIGHT AFTER THE AI PLAYS AN ACTION
    def update_env(self, direction, energy_ai, month):

        # GETTING THE REWARD

        # Computing the energy spent by the server's cooling system when there is no AI
        energy_noai = 0
        if (self.temperature_noai < self.optimal_temperature[0]):
            energy_noai = self.optimal_temperature[0] - self.temperature_noai
            self.temperature_noai = self.optimal_temperature[0]
        elif (self.temperature_noai > self.optimal_temperature[1]):
            energy_noai = self.temperature_noai - self.optimal_temperature[1]
            self.temperature_noai = self.optimal_temperature[1]
        # Computing the Reward
        self.reward = energy_noai - energy_ai
        # Scaling the Reward
        self.reward = 1e-3 * self.reward

        # GETTING THE NEXT STATE

        # Updating the atmospheric temperature
        self.atmospheric_temperature = self.monthly_atmospheric_temperatures[month]
        # Updating the number of users
        self.current_number_users += np.random.randint(-self.max_update_users, self.max_update_users)
        if (self.current_number_users > self.max_number_users):
            self.current_number_users = self.max_number_users
        elif (self.current_number_users < self.min_number_users):
            self.current_number_users = self.min_number_users
        # Updating the rate of data
        self.current_rate_data += np.random.randint(-self.max_update_data, self.max_update_data)
        if (self.current_rate_data > self.max_rate_data):
            self.current_rate_data = self.max_rate_data
        elif (self.current_rate_data < self.min_rate_data):
            self.current_rate_data = self.min_rate_data
        # Computing the Delta of Intrinsic Temperature
        past_intrinsic_temperature = self.intrinsic_temperature
        self.intrinsic_temperature = self.atmospheric_temperature + 1.25 * self.current_number_users + 1.25 * self.current_rate_data
        delta_intrinsic_temperature = self.intrinsic_temperature - past_intrinsic_temperature
        # Computing the Delta of Temperature caused by the AI
        if (direction == -1):
            delta_temperature_ai = -energy_ai
        elif (direction == 1):
            delta_temperature_ai = energy_ai
        # Updating the new Server's Temperature when there is the AI
        self.temperature_ai += delta_intrinsic_temperature + delta_temperature_ai
        # Updating the new Server's Temperature when there is no AI
        self.temperature_noai += delta_intrinsic_temperature

        # GETTING GAME OVER

        if (self.temperature_ai < self.min_temperature):
            if (self.train == 1):
                self.game_over = 1
            else:
                self.total_energy_ai += self.optimal_temperature[0] - self.temperature_ai
                self.temperature_ai = self.optimal_temperature[0]
        elif (self.temperature_ai > self.max_temperature):
            if (self.train == 1):
                self.game_over = 1
            else:
                self.total_energy_ai += self.temperature_ai - self.optimal_temperature[1]
                self.temperature_ai = self.optimal_temperature[1]

        # UPDATING THE SCORES

        # Updating the Total Energy spent by the AI
        self.total_energy_ai += energy_ai
        # Updating the Total Energy spent by the alternative system when there is no AI
        self.total_energy_noai += energy_noai

        # SCALING THE NEXT STATE
        scaled_temperature_ai = (self.temperature_ai - self.min_temperature) / (self.max_temperature - self.min_temperature)
        scaled_number_users = (self.current_number_users - self.min_number_users) / (self.max_number_users - self.min_number_users)
        scaled_rate_data = (self.current_rate_data - self.min_rate_data) / (self.max_rate_data - self.min_rate_data)
        next_state = np.matrix([scaled_temperature_ai, scaled_number_users, scaled_rate_data])

        # RETURNING THE NEXT STATE, THE REWARD, AND GAME OVER
        return next_state, self.reward, self.game_over

    # MAKING A METHOD THAT RESETS THE ENVIRONMENT
    def reset(self, new_month):
        self.atmospheric_temperature = self.monthly_atmospheric_temperatures[new_month]
        self.initial_month = new_month
        self.current_number_users = self.initial_number_users
        self.current_rate_data = self.initial_rate_data
        self.intrinsic_temperature = self.atmospheric_temperature + 1.25 * self.current_number_users + 1.25 * self.current_rate_data
        self.temperature_ai = self.intrinsic_temperature
        self.temperature_noai = (self.optimal_temperature[0] + self.optimal_temperature[1]) / 2.0
        self.total_energy_ai = 0.0
        self.total_energy_noai = 0.0
        self.reward = 0.0
        self.game_over = 0
        self.train = 1

    # MAKING A METHOD THAT GIVES US AT ANY TIME THE CURRENT STATE, THE REWARD AND GAME OVER
    def observe(self):
        scaled_temperature_ai = (self.temperature_ai - self.min_temperature) / (self.max_temperature - self.min_temperature)
        scaled_number_users = (self.current_number_users - self.min_number_users) / (self.max_number_users - self.min_number_users)
        scaled_rate_data = (self.current_rate_data - self.min_rate_data) / (self.max_rate_data - self.min_rate_data)
        current_state = np.matrix([scaled_temperature_ai, scaled_number_users, scaled_rate_data])
        return current_state, self.reward, self.game_over
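
# ----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original pipeline): it builds the
# environment with the same hypothetical settings as training.py and plays a
# single simulated minute, cooling the server by 1.5 degrees Celsius
# (direction = -1, energy_ai = 1.5) in the first month.
if __name__ == "__main__":
    env = Environment(optimal_temperature = (18.0, 24.0), initial_month = 0, initial_number_users = 20, initial_rate_data = 30)
    current_state, reward, game_over = env.observe()
    next_state, reward, game_over = env.update_env(direction = -1, energy_ai = 1.5, month = 0)
    print(current_state, next_state, reward, game_over)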
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AIPoweredCooling

# Problem to solve

In 2016, DeepMind cut a large part of Google's costs by reducing the Google Data Centre cooling bill by 40% with its DQN AI model (Deep Q-Learning). We will do something very similar: we will build an AI that controls the cooling and heating of a server so that its temperature stays within an optimal range while spending the least possible energy, thereby minimizing costs. Just as DeepMind did, our goal will be to achieve at least 40% energy savings.

# Environment to define

Before we define the states, actions and rewards, we need to explain how the server operates. We will do that in several steps. First, we will list all the environment parameters and variables by which the server is controlled. After that, we will state the essential assumption of the problem, on which our AI will rely to provide a solution. Then we will specify how we will simulate the whole process. Finally, we will explain the overall functioning of the server and how the AI plays its role.

Parameters:

- the average atmospheric temperature over a month
- the optimal range of temperatures of the server, which will be [18°C, 24°C]
- the minimum temperature of the server below which it fails to operate, which will be -20°C
- the maximum temperature of the server above which it fails to operate, which will be 80°C
- the minimum number of users in the server, which will be 10
- the maximum number of users in the server, which will be 100
- the maximum number of users in the server that can go up or down per minute, which will be 5
- the minimum rate of data transmission in the server, which will be 20
- the maximum rate of data transmission in the server, which will be 300
- the maximum rate of data transmission that can go up or down per minute, which will be 10

Variables:

- the temperature of the server at any minute
- the number of users in the server at any minute
- the rate of data transmission at any minute
- the energy spent by the AI on the server (to cool it down or heat it up) at any minute
- the energy spent by the server's integrated cooling system, which automatically brings the server's temperature back to the optimal range whenever the temperature goes outside that range

All these parameters and variables will be part of our server environment and will influence the actions of the AI on the server.

The number of users and the rate of data transmission will fluctuate randomly to simulate a real server. This introduces randomness into the temperature, and the AI has to figure out how much cooling or heating power to transfer to the server so as not to degrade the server's performance while, at the same time, spending the least energy by optimizing its heat transfer.

Defining the states.

The input state s_t at time t is composed of the following three elements:

1. The temperature of the server at time t.
2. The number of users in the server at time t.
3. The rate of data transmission in the server at time t.

Thus the input state is a vector of these three elements. Our future AI will take this vector as input and return the action to play at each time t.

Defining the actions.

The actions are simply the temperature changes that the AI can cause inside the server, in order to heat it up or cool it down. To keep the actions discrete, we consider 5 possible temperature changes from -3°C to +3°C, so that we end up with the following 5 possible actions that the AI can play to regulate the temperature of the server (the snippet after the table shows how this mapping is computed in code):

| Action | Effect on the server's temperature |
|--------|------------------------------------|
| 0 | The AI cools down the server by 3°C |
| 1 | The AI cools down the server by 1.5°C |
| 2 | The AI does not transfer any heat (no temperature change) |
| 3 | The AI heats up the server by 1.5°C |
| 4 | The AI heats up the server by 3°C |
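
The minimal sketch below shows how an action index is decoded into a heating/cooling direction and an energy (the absolute temperature change), mirroring how training.py and testing.py use `direction_boundary` and `temperature_step`; the helper `decode_action` is introduced here only for illustration.

```python
number_actions = 5
direction_boundary = (number_actions - 1) / 2   # = 2.0, the "no change" action
temperature_step = 1.5                          # degrees per unit of distance from the boundary

def decode_action(action):
    direction = -1 if action - direction_boundary < 0 else 1
    energy_ai = abs(action - direction_boundary) * temperature_step
    return direction * energy_ai                # signed temperature change in °C

print([decode_action(a) for a in range(number_actions)])   # [-3.0, -1.5, 0.0, 1.5, 3.0]
```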
Defining the rewards.

The reward at iteration t is the energy the AI saves on the server with respect to the server's integrated cooling system, that is, the difference between the energy the unintelligent cooling system would spend if the AI were deactivated and the energy the AI actually spends on the server:

Reward_t = Energy saved by the AI between t and t + 1 = E_t^noAI - E_t^AI = |ΔT^noAI| - |ΔT^AI|

# AI Solution

From Q-Learning to Deep Q-Learning

Deep Q-Learning consists of combining Q-Learning with an Artificial Neural Network. The inputs are encoded vectors, each one defining a state of the environment. These inputs go into an Artificial Neural Network whose output is the action to play. More precisely, if the game has n possible actions, the output layer of the neural network comprises n output neurons, each corresponding to the Q-value of one action in the current state. The action played is then the one associated with the output neuron that has the highest Q-value (argmax), or the one returned by the softmax method; in our case we will use argmax. And since Q-values are real numbers, this makes our neural network an ANN for regression.

Hence, in each state s_t:

- the prediction is the Q-value Q(s_t, a_t), where a_t is chosen by argmax or softmax
- the target is r_t + γ max_a Q(s_{t+1}, a)
- the loss is the squared Temporal Difference between the target and the prediction

This loss is then backpropagated into the network, and the weights are updated according to how much they contributed to the error.
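
As a concrete illustration, here is a tiny numerical sketch of how one target is built from a single transition; the Q-values, action and reward are made up, and `dqn.py`'s `get_batch` applies the same rule to each transition of a random batch drawn from the replay memory.

```python
import numpy as np

gamma = 0.9                                             # discount factor
q_current = np.array([0.10, 0.40, 0.05, 0.30, 0.15])    # predicted Q(s_t, a) for the 5 actions
q_next    = np.array([0.20, 0.10, 0.50, 0.05, 0.15])    # predicted Q(s_{t+1}, a)
action, reward = 1, 0.2                                  # the transition actually experienced

# Only the entry of the action that was played is changed; the other entries
# stay equal to the prediction, so they contribute nothing to the loss.
target = q_current.copy()
target[action] = reward + gamma * np.max(q_next)         # 0.2 + 0.9 * 0.5 = 0.65

loss = np.sum((target - q_current) ** 2)                 # squared Temporal Difference
print(target, loss)                                      # loss = (0.65 - 0.40)**2 = 0.0625
```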
Experience Replay

So far we have only considered transitions from one state s_t to the next state s_{t+1}. The problem is that s_t is, most of the time, strongly correlated with s_{t+1}, so the network does not learn much from consecutive transitions. Learning improves considerably if, instead of considering only the single previous transition, we keep the last m transitions, where m is a large number. This collection of the last m transitions is what is called Experience Replay. From this Experience Replay memory we then sample random batches of transitions to make our updates.

The Brain

The brain, or more precisely the deep neural network of our AI, will be a fully connected neural network with two hidden layers: the first with 64 neurons and the second with 32 neurons. As a reminder, this neural network takes as input the state of the environment and returns as output the Q-values for each of the 5 actions. This artificial brain will be trained with a Mean Squared Error loss and the Adam optimizer.

# Implementation

Step 1: Building the Environment
1. Step 1-1: Introducing and initializing all the parameters and variables of the environment.
2. Step 1-2: Making a method that updates the environment right after the AI plays an action.
3. Step 1-3: Making a method that resets the environment.
4. Step 1-4: Making a method that gives us, at any time, the current state, the last reward obtained, and whether the game is over.

Step 2: Building the Brain
1. Step 2-1: Building the input layer composed of the input states.
2. Step 2-2: Building the hidden layers, with a chosen number of layers and neurons in each, fully connected to the input layer and to each other.
3. Step 2-3: Building the output layer, fully connected to the last hidden layer.
4. Step 2-4: Assembling the full architecture inside a model object.
5. Step 2-5: Compiling the model with a Mean-Squared Error loss function and a chosen optimizer.

Step 3: Implementing the Deep Reinforcement Learning Algorithm
1. Step 3-1: Introducing and initializing all the parameters and variables of the DQN model.
2. Step 3-2: Making a method that builds the memory in Experience Replay.
3. Step 3-3: Making a method that builds and returns two batches of inputs and targets (of size 10 by default).

Step 4: Training the AI
1. Step 4-1: Building the environment by creating an object of the Environment class built in Step 1.
2. Step 4-2: Building the artificial brain by creating an object of the Brain class built in Step 2.
3. Step 4-3: Building the DQN model by creating an object of the DQN class built in Step 3.
4. Step 4-4: Choosing the training mode.
5. Step 4-5: Starting the training with a for loop over a chosen number of epochs.
6. Step 4-6: During each epoch we repeat the whole Deep Q-Learning process, while also doing some exploration 30% of the time.

Step 5: Testing the AI
1. Step 5-1: Building a new environment by creating an object of the Environment class built in Step 1.
2. Step 5-2: Loading the artificial brain with its pre-trained weights from the previous training.
3. Step 5-3: Choosing the inference mode.
4. Step 5-4: Starting the simulation.
5. Step 5-5: At each iteration (each minute), our AI only plays the action that results from its prediction; no exploration or Deep Q-Learning training happens whatsoever.

--------------------------------------------------------------------------------