├── AtariWrapper.py ├── CDQN.py ├── CustomEnvironments ├── __init__.py ├── stochasticGridWorld.py └── stochasticGridWorldOptimal.py ├── DQN.py ├── Experiments └── __init__.py ├── FQF.py ├── Figures ├── Distributions │ └── __init__.py ├── Performance │ └── __init__.py └── __init__.py ├── IQN.py ├── Models ├── CDQN_Model.py ├── CDQN_Model_Atari.py ├── CNN_Atari.py ├── CNN_MinAtar.py ├── DNN_Atari.py ├── DNN_MinAtar.py ├── FQF_Model.py ├── FQF_Model_Atari.py ├── FQF_Model_Bis.py ├── FeedforwardDNN.py ├── IQN_Model.py ├── IQN_Model_Atari.py ├── MonotonicNN.py ├── QR_DQN_Model.py ├── QR_DQN_Model_Atari.py ├── UMDQN_C_Model.py ├── UMDQN_C_Model_Atari.py ├── UMDQN_KL_Model.py ├── UMDQN_KL_Model_Atari.py ├── UMDQN_W_Model.py ├── UMDQN_W_Model_Atari.py └── __init__.py ├── MonteCarloDistributions.py ├── Parameters ├── parameters_CDQN_Atari57.json ├── parameters_CDQN_ClassicControl.json ├── parameters_CDQN_MinAtar.json ├── parameters_CDQN_StochasticGridWorld.json ├── parameters_DQN_Atari57.json ├── parameters_DQN_ClassicControl.json ├── parameters_DQN_MinAtar.json ├── parameters_DQN_StochasticGridWorld.json ├── parameters_FQF_Atari57.json ├── parameters_FQF_ClassicControl.json ├── parameters_FQF_MinAtar.json ├── parameters_FQF_StochasticGridWorld.json ├── parameters_IQN_Atari57.json ├── parameters_IQN_ClassicControl.json ├── parameters_IQN_MinAtar.json ├── parameters_IQN_StochasticGridWorld.json ├── parameters_QR_DQN_Atari57.json ├── parameters_QR_DQN_ClassicControl.json ├── parameters_QR_DQN_MinAtar.json ├── parameters_QR_DQN_StochasticGridWorld.json ├── parameters_UMDQN_C_Atari57.json ├── parameters_UMDQN_C_ClassicControl.json ├── parameters_UMDQN_C_MinAtar.json ├── parameters_UMDQN_C_StochasticGridWorld.json ├── parameters_UMDQN_KL_Atari57.json ├── parameters_UMDQN_KL_ClassicControl.json ├── parameters_UMDQN_KL_MinAtar.json ├── parameters_UMDQN_KL_StochasticGridWorld.json ├── parameters_UMDQN_W_Atari57.json ├── parameters_UMDQN_W_ClassicControl.json ├── parameters_UMDQN_W_MinAtar.json └── parameters_UMDQN_W_StochasticGridWorld.json ├── QR_DQN.py ├── README.md ├── SavedModels └── __init__.py ├── Tensorboard └── __init__.py ├── UMDQN_C.py ├── UMDQN_KL.py ├── UMDQN_W.py ├── main.py ├── replayMemory.py └── requirements.txt /CDQN.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import math 8 | 9 | import numpy as np 10 | 11 | from matplotlib import pyplot as plt 12 | 13 | import torch 14 | import torch.optim as optim 15 | 16 | from replayMemory import ReplayMemory 17 | 18 | from Models.CDQN_Model import CDQN_Model 19 | from Models.CDQN_Model_Atari import CDQN_Model_Atari 20 | 21 | from DQN import DQN 22 | 23 | 24 | 25 | ############################################################################### 26 | ############################### Class CDQN #################################### 27 | ############################################################################### 28 | 29 | class CDQN(DQN): 30 | """ 31 | GOAL: Implementing the Categorical DQN (C51) Deep Reinforcement Learning algorithm. 32 | 33 | VARIABLES: - device: Hardware specification (CPU or GPU). 34 | - gamma: Discount factor of the RL algorithm. 35 | - learningRate: Learning rate of the DL optimizer (ADAM). 
36 | - epsilon: Epsilon value for the DL optimizer (ADAM). 37 | - targetNetworkUpdate: Update frequency of the target network. 38 | - learningUpdatePeriod: Frequency of the learning procedure. 39 | - batchSize: Size of the batch to sample from the replay memory. 40 | - capacity: Capacity of the replay memory. 41 | - replayMemory: Experience Replay memory. 42 | - rewardClipping: Clipping of the RL rewards. 43 | - gradientClipping: Clipping of the training loss. 44 | - optimizer: DL optimizer (ADAM). 45 | - epsilonStart: Initial value of epsilon (Epsilon-Greedy). 46 | - epsilonEnd: Final value of epsilon (Epsilon-Greedy). 47 | - epsilonDecay: Exponential decay of epsilon (Epsilon-Greedy). 48 | - epsilonTest: Test value of epsilon (Epsilon-Greedy). 49 | - epsilonValue: Current value of epsilon (Epsilon-Greedy). 50 | - policyNetwork: Deep Neural Network representing the info used by the RL policy. 51 | - targetNetwork: Deep Neural Network representing the target network. 52 | 53 | METHODS: - __init__: Initialization of the RL algorithm. 54 | - chooseAction: Choose a valid action based on the current state 55 | observed, according to the RL policy learned. 56 | - learning: Execute the RL algorithm learning procedure. 57 | """ 58 | 59 | def __init__(self, observationSpace, actionSpace, environment, 60 | parametersFileName='', reporting=True): 61 | """ 62 | GOAL: Initializing the RL agent based on the CDQN Deep Reinforcement Learning 63 | algorithm, by setting up the algorithm parameters as well as 64 | the Deep Neural Networks. 65 | 66 | INPUTS: - observationSpace: RL observation space. 67 | - actionSpace: RL action space. 68 | - environment: Name of the RL environment. 69 | - parametersFileName: Name of the JSON parameters file. 70 | - reporting: Enable the reporting of the results. 
71 | 72 | OUTPUTS: / 73 | """ 74 | 75 | # Initialization of the DQN parent class 76 | DQN.__init__(self, observationSpace, actionSpace, environment, parametersFileName, False) 77 | 78 | # Setting of the parameters 79 | if parametersFileName == '': 80 | parametersFileName = ''.join(['Parameters/parameters_CDQN_', str(environment), '.json']) 81 | parameters = self.readParameters(parametersFileName) 82 | 83 | # Set the device for DNN computations (CPU or GPU) 84 | self.device = torch.device('cuda:'+str(parameters['GPUNumber']) if torch.cuda.is_available() else 'cpu') 85 | 86 | # Set the general parameters of the RL algorithm 87 | self.gamma = parameters['gamma'] 88 | self.learningRate = parameters['learningRate'] 89 | self.epsilon = parameters['epsilon'] 90 | self.targetUpdatePeriod = parameters['targetUpdatePeriod'] 91 | self.learningUpdatePeriod = parameters['learningUpdatePeriod'] 92 | self.rewardClipping = parameters['rewardClipping'] 93 | self.gradientClipping = parameters['gradientClipping'] 94 | 95 | # Set the Experience Replay mechanism 96 | self.batchSize = parameters['batchSize'] 97 | self.capacity = parameters['capacity'] 98 | self.replayMemory = ReplayMemory(self.capacity) 99 | 100 | # Set the distribution support 101 | self.numberOfAtoms = parameters['numberOfAtoms'] 102 | self.minReturn = parameters['minReturn'] 103 | self.maxReturn = parameters['maxReturn'] 104 | self.support = np.linspace(self.minReturn, self.maxReturn, self.numberOfAtoms) 105 | self.supportTorch = torch.linspace(self.minReturn, self.maxReturn, self.numberOfAtoms).to(self.device) 106 | 107 | # Set the two Deep Neural Networks of the RL algorithm (policy and target) 108 | self.atari = parameters['atari'] 109 | self.minatar = parameters['minatar'] 110 | if self.atari or self.minatar: 111 | self.policyNetwork = CDQN_Model_Atari(observationSpace, actionSpace*self.numberOfAtoms, self.numberOfAtoms, minAtar=self.minatar).to(self.device) 112 | self.targetNetwork = CDQN_Model_Atari(observationSpace, actionSpace*self.numberOfAtoms, self.numberOfAtoms, minAtar=self.minatar).to(self.device) 113 | else: 114 | self.policyNetwork = CDQN_Model(observationSpace, actionSpace*self.numberOfAtoms, parameters['structureDNN'], self.numberOfAtoms).to(self.device) 115 | self.targetNetwork = CDQN_Model(observationSpace, actionSpace*self.numberOfAtoms, parameters['structureDNN'], self.numberOfAtoms).to(self.device) 116 | self.targetNetwork.load_state_dict(self.policyNetwork.state_dict()) 117 | 118 | # Set the Deep Learning optimizer 119 | self.optimizer = optim.Adam(self.policyNetwork.parameters(), lr=self.learningRate, eps=self.epsilon) 120 | 121 | # Set the Epsilon-Greedy exploration technique 122 | self.epsilonStart = parameters['epsilonStart'] 123 | self.epsilonEnd = parameters['epsilonEnd'] 124 | self.epsilonDecay = parameters['epsilonDecay'] 125 | self.epsilonTest = parameters['epsilonTest'] 126 | self.epsilonValue = lambda iteration: self.epsilonEnd + (self.epsilonStart - self.epsilonEnd) * math.exp(-1 * iteration / self.epsilonDecay) 127 | 128 | # Initialization of the experiment folder and tensorboard writer 129 | self.initReporting(parameters, 'CDQN') 130 | 131 | 132 | def chooseAction(self, state, plot=False): 133 | """ 134 | GOAL: Choose a valid RL action from the action space according to the 135 | RL policy as well as the current RL state observed. 136 | 137 | INPUTS: - state: RL state returned by the environment. 138 | - plot: Enable the plotting of the random returns distributions. 
139 | 140 | OUTPUTS: - action: RL action chosen from the action space. 141 | """ 142 | 143 | # Choose the best action based on the RL policy 144 | with torch.no_grad(): 145 | state = torch.from_numpy(state).float().to(self.device).unsqueeze(0) 146 | distribution = self.policyNetwork(state).squeeze(0) 147 | distributionReturn = distribution * self.supportTorch 148 | QValues = distributionReturn.sum(1) 149 | _, action = QValues.max(0) 150 | 151 | # If required, plot the return distribution associated with each action 152 | if plot: 153 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 154 | fig = plt.figure() 155 | ax = fig.add_subplot() 156 | QValues = QValues.cpu().numpy() 157 | for a in range(self.actionSpace): 158 | dist = distribution[a].cpu().numpy() 159 | ax.bar(self.support, dist, label=''.join(['Action ', str(a), ' random return Z']), width=(self.maxReturn-self.minReturn)/self.numberOfAtoms, edgecolor='black', alpha=0.5, color=colors[a]) 160 | ax.axvline(x=QValues[a], linewidth=2, linestyle='--', label=''.join(['Action ', str(a), ' expected return Q']), color=colors[a]) 161 | ax.set_xlabel('Random return') 162 | ax.set_ylabel('Probability Density Function (PDF)') 163 | ax.legend() 164 | plt.show() 165 | 166 | return action.item() 167 | 168 | 169 | def learning(self): 170 | """ 171 | GOAL: Sample a batch of past experiences and learn from it 172 | by updating the Reinforcement Learning policy. 173 | 174 | INPUTS: / 175 | 176 | OUTPUTS: - loss: Loss of the learning procedure. 177 | """ 178 | 179 | # Check that the replay memory is filled enough 180 | if (len(self.replayMemory) >= self.batchSize): 181 | 182 | # Sample a batch of experiences from the replay memory 183 | batch = self.dataLoaderIter.next() 184 | state = batch[0].float().to(self.device) 185 | action = batch[1].long().to(self.device) 186 | reward = batch[2].float().to(self.device) 187 | nextState = batch[3].float().to(self.device) 188 | done = batch[4].float().to(self.device) 189 | 190 | # Computation of the current return distribution 191 | distribution = self.policyNetwork(state) 192 | action = action.unsqueeze(1).unsqueeze(1).expand(self.batchSize, 1, self.numberOfAtoms) 193 | distribution = distribution.gather(1, action).squeeze(1) 194 | 195 | # Computation of the new distribution to be learnt by the policy DNN 196 | with torch.no_grad(): 197 | nextDistribution = self.targetNetwork(nextState) 198 | nextAction = (nextDistribution * self.supportTorch).sum(2).max(1)[1].unsqueeze(1).unsqueeze(1).expand(self.batchSize, 1, self.numberOfAtoms) 199 | nextDistribution = nextDistribution.gather(1, nextAction).squeeze(1) 200 | deltaZ = float(self.maxReturn - self.minReturn) / (self.numberOfAtoms - 1) 201 | tz = reward.view(-1, 1) + (1 - done.view(-1, 1)) * self.gamma * self.supportTorch 202 | tz = tz.clamp(min=self.minReturn, max=self.maxReturn) 203 | b = ((tz - self.minReturn) / deltaZ) 204 | l = b.floor().long() 205 | u = b.ceil().long() 206 | offset = torch.linspace(0, (self.batchSize - 1) * self.numberOfAtoms, self.batchSize).long().unsqueeze(1).expand(self.batchSize, self.numberOfAtoms).to(self.device) 207 | projectedDistribution = torch.zeros(nextDistribution.size()).to(self.device) 208 | projectedDistribution.view(-1).index_add_(0, (l + offset).view(-1), (nextDistribution * (u.float() - b)).view(-1)) 209 | projectedDistribution.view(-1).index_add_(0, (u + offset).view(-1), (nextDistribution * (b - l.float())).view(-1)) 210 | 211 | # Computation of the loss 212 | loss = -(projectedDistribution * 
distribution.log()).sum(1).mean() 213 | 214 | # Computation of the gradients 215 | self.optimizer.zero_grad() 216 | loss.backward() 217 | 218 | # Gradient Clipping 219 | torch.nn.utils.clip_grad_norm_(self.policyNetwork.parameters(), self.gradientClipping) 220 | 221 | # Perform the Deep Neural Network optimization 222 | self.optimizer.step() 223 | 224 | return loss.item() 225 | -------------------------------------------------------------------------------- /CustomEnvironments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/CustomEnvironments/__init__.py -------------------------------------------------------------------------------- /CustomEnvironments/stochasticGridWorld.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import random 8 | import time 9 | 10 | import numpy as np 11 | 12 | from matplotlib import pyplot as plt 13 | 14 | import gym 15 | from gym import spaces 16 | 17 | 18 | 19 | ############################################################################### 20 | ################################ Global variables ############################# 21 | ############################################################################### 22 | 23 | # Default parameters for the environment configuration 24 | size = 7 25 | timeOut = 20 26 | 27 | # Parameters associated with stochasticity 28 | doubleProbability = 0.5 29 | stochasticRewards = True 30 | 31 | 32 | 33 | ############################################################################### 34 | ########################### Class StochasticGridWorld ######################### 35 | ############################################################################### 36 | 37 | class StochasticGridWorld(gym.Env): 38 | """ 39 | GOAL: Implementing a simple RL environment consisting of a 2D grid world 40 | where the agent has to reach a fixed objective while avoiding a trap, 41 | with potentially stochastic transitions and rewards. 42 | 43 | VARIABLES: - observation_space: RL environment observation space. 44 | - action_space: RL environment action space. 45 | - playerPosition: Position of the player (x, y). 46 | - trapPosition: Position of the trap (x, y). 47 | - targetPosition: Position of the target (x, y). 48 | - timeElapsed: Time elapsed. 49 | - state: RL state or observation. 50 | - reward: RL reward signal. 51 | - done: RL termination signal (episode). 52 | - info: Additional RL information. 53 | 54 | METHODS: - __init__: Initialization of the RL environment. 55 | - reset: Resetting of the RL environment. 56 | - step: Update the RL environment according to the agent's action. 57 | - render: Render graphically the current state of the RL environment. 58 | """ 59 | 60 | def __init__(self, size=size): 61 | """ 62 | GOAL: Perform the initialization of the RL environment. 63 | 64 | INPUTS: - size: Size of the square grid world. 
65 | 66 | OUTPUTS: / 67 | """ 68 | 69 | super(StochasticGridWorld, self).__init__() 70 | 71 | # Initialize the random number generator (seeded with the current time) 72 | random.seed(time.time()) 73 | 74 | # Definition of the observation/state and action spaces 75 | self.observation_space = spaces.Box(low=0, high=size-1, shape=(2, 1), dtype=np.uint8) 76 | self.action_space = spaces.Discrete(4) 77 | self.size = size 78 | 79 | # Initialization of the trap and target positions 80 | self.trapPosition = [int(self.size/2), int(self.size/2)] 81 | self.targetPosition = [int(self.size/2), self.size-1] 82 | 83 | # Initialization of the player position 84 | x = int(random.random() * (self.size-1)) 85 | y = int(random.random() * (self.size-1)) 86 | self.playerPosition = [x, y] 87 | while self.playerPosition == self.targetPosition or self.playerPosition == self.trapPosition: 88 | x = int(random.random() * (self.size-1)) 89 | y = int(random.random() * (self.size-1)) 90 | self.playerPosition = [x, y] 91 | 92 | # Initialization of the time elapsed 93 | self.timeElapsed = 0 94 | 95 | # Initialization of the RL variables 96 | self.state = np.array([self.playerPosition[0], self.playerPosition[1]]) 97 | self.reward = 0. 98 | self.done = 0 99 | self.info = {} 100 | 101 | 102 | def reset(self): 103 | """ 104 | GOAL: Perform a reset of the RL environment. 105 | 106 | INPUTS: / 107 | 108 | OUTPUTS: - state: RL state or observation. 109 | """ 110 | 111 | # Reset of the player position and time elapsed 112 | x = int(random.random() * (self.size-1)) 113 | y = int(random.random() * (self.size-1)) 114 | self.playerPosition = [x, y] 115 | while self.playerPosition == self.targetPosition or self.playerPosition == self.trapPosition: 116 | x = int(random.random() * (self.size-1)) 117 | y = int(random.random() * (self.size-1)) 118 | self.playerPosition = [x, y] 119 | self.timeElapsed = 0 120 | 121 | # Reset of the RL variables 122 | self.state = np.array([self.playerPosition[0], self.playerPosition[1]]) 123 | self.reward = 0. 124 | self.done = 0 125 | self.info = {} 126 | 127 | return self.state 128 | 129 | 130 | def step(self, action): 131 | """ 132 | GOAL: Update the RL environment according to the agent's action. 133 | 134 | INPUTS: - action: RL action outputted by the agent. 135 | 136 | OUTPUTS: - state: RL state or observation. 137 | - reward: RL reward signal. 138 | - done: RL termination signal. 139 | - info: Additional RL information.
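A minimal usage sketch (illustrative only, relying solely on the interface defined in this file): the transition is stochastic because the selected move is applied over one or two cells with equal probability, and the reward is drawn from a Gaussian when stochasticRewards is enabled.

            env = StochasticGridWorld(size=7)
            state = env.reset()
            nextState, reward, done, info = env.step(0)   # move right by 1 or 2 cells, clipped at the grid border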
140 | """ 141 | 142 | # Stochasticity associated with the next move of the agent 143 | rand = random.random() 144 | if rand > doubleProbability: 145 | moveRange = 1 146 | else: 147 | moveRange = 2 148 | 149 | # Go right 150 | if action == 0: 151 | self.playerPosition[0] = min(self.playerPosition[0]+moveRange, self.size-1) 152 | # Go down 153 | elif action == 1: 154 | self.playerPosition[1] = max(self.playerPosition[1]-moveRange, 0) 155 | # Go left 156 | elif action == 2: 157 | self.playerPosition[0] = max(self.playerPosition[0]-moveRange, 0) 158 | # Go up 159 | elif action == 3: 160 | self.playerPosition[1] = min(self.playerPosition[1]+moveRange, self.size-1) 161 | # Invalid action 162 | else: 163 | print("Error: invalid action...") 164 | 165 | # Incrementation of the time elapsed 166 | self.timeElapsed += 1 167 | 168 | # Assign the appropriate RL reward 169 | if stochasticRewards: 170 | self.reward = np.random.normal(loc=0.0, scale=0.1) 171 | else: 172 | self.reward = 0.0 173 | if self.playerPosition == self.targetPosition: 174 | if stochasticRewards: 175 | self.reward = np.random.normal(loc=1.0, scale=0.1) 176 | else: 177 | self.reward = 1.0 178 | self.done = 1 179 | elif self.playerPosition == self.trapPosition: 180 | if stochasticRewards: 181 | self.reward = np.random.normal(loc=-1.0, scale=0.1) 182 | else: 183 | self.reward = -1.0 184 | self.done = 1 185 | 186 | # Check if the time elapsed reaches the time limit 187 | if self.timeElapsed >= timeOut: 188 | self.done = 1 189 | 190 | # Update of the RL state 191 | self.state = np.array([self.playerPosition[0], self.playerPosition[1]]) 192 | 193 | # Return of the RL variables 194 | return self.state, self.reward, self.done, self.info 195 | 196 | 197 | def render(self, mode='human'): 198 | """ 199 | GOAL: Render graphically the current state of the RL environment. 200 | 201 | INPUTS: / 202 | 203 | OUTPUTS: / 204 | """ 205 | 206 | fig = plt.figure(figsize=(8, 8)) 207 | ax = fig.gca() 208 | ax.set_xticks(np.arange(0, self.size+1, 1)) 209 | ax.set_yticks(np.arange(0, self.size+1, 1)) 210 | ax.set(xlim=(0, self.size), ylim=(0, self.size)) 211 | plt.scatter(self.playerPosition[0]+0.5, self.playerPosition[1]+0.5, s=100, color='blue') 212 | plt.scatter(self.targetPosition[0]+0.5, self.targetPosition[1]+0.5, s=100, color='green') 213 | plt.scatter(self.trapPosition[0]+0.5, self.trapPosition[1]+0.5, s=100, color='red') 214 | plt.grid() 215 | text = ''.join(['Time elapsed: ', str(self.timeElapsed)]) 216 | plt.text(0, self.size+0.2, text, fontsize=12) 217 | plt.show() 218 | #plt.savefig("Figures/Distributions/StochasticGridWorldState.pdf", format="pdf") 219 | 220 | 221 | def setState(self, state): 222 | """ 223 | GOAL: Reset the RL environment and set a specific initial state. 224 | 225 | INPUTS: - state: Information about the state to set. 226 | 227 | OUTPUTS: - state: RL state of the environment. 
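A minimal sketch (illustrative only), e.g. to inspect the environment from a chosen cell:

            env = StochasticGridWorld()
            state = env.setState([2, 3])   # agent placed at x=2, y=3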
228 | """ 229 | 230 | # Reset of the environment 231 | self.reset() 232 | 233 | # Set the initial state as specified 234 | self.timeElapsed = 0 235 | self.playerPosition = [state[0], state[1]] 236 | self.state = np.array([self.playerPosition[0], self.playerPosition[1]]) 237 | 238 | return self.state 239 | -------------------------------------------------------------------------------- /CustomEnvironments/stochasticGridWorldOptimal.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ##################### Class StochasticGridWorldOptimal ######################## 5 | ############################################################################### 6 | 7 | class StochasticGridWorldOptimal(): 8 | """ 9 | GOAL: Implementing the optimal policy associated with the stochastic grid 10 | world environment. 11 | 12 | VARIABLES: - environment: Stochastic grid world environment. 13 | 14 | METHODS: - __init__: Initialization of the class. - processState: Preprocessing of the RL state. - chooseAction: Choose the optimal RL action. 15 | """ 16 | 17 | def __init__(self, environment): 18 | """ 19 | GOAL: Perform the initialization of the class. 20 | 21 | INPUTS: - environment: Stochastic grid world environment considered. 22 | 23 | OUTPUTS: / 24 | 25 | """ 26 | 27 | # Initialization of important variables 28 | self.environment = environment 29 | self.size = self.environment.size 30 | self.trapPosition = self.environment.trapPosition 31 | self.targetPosition = self.environment.targetPosition 32 | 33 | 34 | def processState(self, state): 35 | """ 36 | GOAL: Potentially process the RL state returned by the environment. 37 | 38 | INPUTS: - state: RL state returned by the environment. 39 | 40 | OUTPUTS: - state: RL state processed. 41 | """ 42 | 43 | return state 44 | 45 | 46 | def chooseAction(self, state, plot=False): 47 | """ 48 | GOAL: Choose the optimal RL action. 49 | 50 | INPUTS: - state: RL state returned by the environment. 51 | - plot: False, because not supported. 52 | 53 | OUTPUTS: - action: RL action selected.
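A minimal sketch (illustrative only) pairing this heuristic agent with the environment defined in stochasticGridWorld.py:

            env = StochasticGridWorld()
            expert = StochasticGridWorldOptimal(env)
            state = env.reset()
            action = expert.chooseAction(expert.processState(state))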
54 | """ 55 | 56 | # Retrieve the coordinates of the agent 57 | x = state[0] 58 | y = state[1] 59 | 60 | # Implementation of the optimal policy 61 | if x == self.targetPosition[0] and y < self.trapPosition[1]: 62 | action = 0 63 | elif x == self.targetPosition[0] and y > self.trapPosition[1]: 64 | action = 3 65 | elif y == self.targetPosition[1] and x < self.targetPosition[0]: 66 | action = 0 67 | elif y == self.targetPosition[1] and x > self.targetPosition[0]: 68 | action = 2 69 | elif (x < self.targetPosition[0] or x > self.targetPosition[0]) and y < (self.targetPosition[1]-1): 70 | action = 3 71 | elif y == (self.targetPosition[1]-1) and y > self.trapPosition[1] and x < self.targetPosition[0]: 72 | action = 0 73 | elif y == (self.targetPosition[1]-1) and y > self.trapPosition[1] and x > self.targetPosition[0]: 74 | action = 2 75 | else: 76 | action = 3 77 | 78 | # Return of the RL action selected 79 | return action 80 | -------------------------------------------------------------------------------- /Experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/Experiments/__init__.py -------------------------------------------------------------------------------- /FQF.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import math 8 | 9 | from matplotlib import pyplot as plt 10 | 11 | import torch 12 | import torch.optim as optim 13 | 14 | from replayMemory import ReplayMemory 15 | 16 | from Models.FQF_Model import FQF_Model 17 | from Models.FQF_Model_Atari import FQF_Model_Atari 18 | from Models.FQF_Model_Bis import FQF_Model_Bis 19 | 20 | from DQN import DQN 21 | 22 | 23 | 24 | ############################################################################### 25 | ################################## Class FQF ################################## 26 | ############################################################################### 27 | 28 | class FQF(DQN): 29 | """ 30 | GOAL: Implementing the FQF Deep Reinforcement Learning algorithm. 31 | 32 | VARIABLES: - device: Hardware specification (CPU or GPU). 33 | - gamma: Discount factor of the RL algorithm. 34 | - learningRate: Learning rate of the DL optimizer (ADAM). 35 | - epsilon: Epsilon value for the DL optimizer (ADAM). 36 | - targetNetworkUpdate: Update frequency of the target network. 37 | - learningUpdatePeriod: Frequency of the learning procedure. 38 | - batchSize: Size of the batch to sample from the replay memory. 39 | - capacity: Capacity of the replay memory. 40 | - replayMemory: Experience Replay memory. 41 | - rewardClipping: Clipping of the RL rewards. 42 | - gradientClipping: Clipping of the training loss. 43 | - optimizer: DL optimizer (ADAM). 44 | - epsilonStart: Initial value of epsilon (Epsilon-Greedy). 45 | - epsilonEnd: Final value of epsilon (Epsilon-Greedy). 46 | - epsilonDecay: Exponential decay of epsilon (Epsilon-Greedy). 47 | - epsilonTest: Test value of epsilon (Epsilon-Greedy). 48 | - epsilonValue: Current value of epsilon (Epsilon-Greedy). 49 | - policyNetwork: Deep Neural Network representing the info used by the RL policy. 
50 | - targetNetwork: Deep Neural Network representing the target network. 51 | 52 | METHODS: - __init__: Initialization of the RL algorithm. 53 | - chooseAction: Choose a valid action based on the current state 54 | observed, according to the RL policy learned. 55 | - learning: Execute the RL algorithm learning procedure. 56 | """ 57 | 58 | def __init__(self, observationSpace, actionSpace, environment, 59 | parametersFileName='', reporting=True): 60 | """ 61 | GOAL: Initializing the RL agent based on the FQF Deep Reinforcement Learning 62 | algorithm, by setting up the algorithm parameters as well as 63 | the Deep Neural Networks. 64 | 65 | INPUTS: - observationSpace: RL observation space. 66 | - actionSpace: RL action space. 67 | - environment: Name of the RL environment. 68 | - parametersFileName: Name of the JSON parameters file. 69 | - reporting: Enable the reporting of the results. 70 | 71 | OUTPUTS: / 72 | """ 73 | 74 | # Initialization of the DQN parent class 75 | DQN.__init__(self, observationSpace, actionSpace, environment, parametersFileName, False) 76 | 77 | # Setting of the parameters 78 | if parametersFileName == '': 79 | parametersFileName = ''.join(['Parameters/parameters_FQF_', str(environment), '.json']) 80 | parameters = self.readParameters(parametersFileName) 81 | 82 | # Set the device for DNN computations (CPU or GPU) 83 | self.device = torch.device('cuda:'+str(parameters['GPUNumber']) if torch.cuda.is_available() else 'cpu') 84 | 85 | # Set the general parameters of the RL algorithm 86 | self.gamma = parameters['gamma'] 87 | self.learningRate = parameters['learningRate'] 88 | self.epsilon = parameters['epsilon'] 89 | self.targetUpdatePeriod = parameters['targetUpdatePeriod'] 90 | self.learningUpdatePeriod = parameters['learningUpdatePeriod'] 91 | self.rewardClipping = parameters['rewardClipping'] 92 | self.gradientClipping = parameters['gradientClipping'] 93 | 94 | # Set the Experience Replay mechanism 95 | self.batchSize = parameters['batchSize'] 96 | self.capacity = parameters['capacity'] 97 | self.replayMemory = ReplayMemory(self.capacity) 98 | 99 | # Set the distribution support 100 | self.N = parameters['N'] 101 | self.K = parameters['K'] 102 | self.NCos = parameters['NCos'] 103 | self.kappa = 1.0 104 | 105 | # Set the two Deep Neural Networks of the RL algorithm (policy and target) 106 | self.atari = parameters['atari'] 107 | self.minatar = parameters['minatar'] 108 | if self.atari or self.minatar: 109 | self.policyNetwork = FQF_Model_Atari(observationSpace, actionSpace, self.NCos, self.device, minAtar=self.minatar).to(self.device) 110 | self.targetNetwork = FQF_Model_Atari(observationSpace, actionSpace, self.NCos, self.device, minAtar=self.minatar).to(self.device) 111 | stateEmbedding = self.policyNetwork.getEmbeddingSize() 112 | else: 113 | self.policyNetwork = FQF_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['stateEmbedding'], self.NCos, self.device).to(self.device) 114 | self.targetNetwork = FQF_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['stateEmbedding'], self.NCos, self.device).to(self.device) 115 | stateEmbedding = parameters['stateEmbedding'] 116 | self.targetNetwork.load_state_dict(self.policyNetwork.state_dict()) 117 | 118 | # Set the Deep Learning optimizer 119 | self.optimizer = optim.Adam(self.policyNetwork.parameters(), lr=self.learningRate, eps=self.epsilon) 120 | 121 | # Set the Fraction Proposal Network of the FQF algorithm + associated parameters 122 | self.fractionProposalNetwork 
= FQF_Model_Bis(stateEmbedding, self.N, self.device).to(self.device) 123 | self.optimizerFPN = optim.RMSprop(self.fractionProposalNetwork.parameters(), lr=0.000000001, alpha=0.95, eps=0.00001) 124 | self.entropyCoefficient = 0.001 125 | 126 | # Set the Epsilon-Greedy exploration technique 127 | self.epsilonStart = parameters['epsilonStart'] 128 | self.epsilonEnd = parameters['epsilonEnd'] 129 | self.epsilonDecay = parameters['epsilonDecay'] 130 | self.epsilonTest = parameters['epsilonTest'] 131 | self.epsilonValue = lambda iteration: self.epsilonEnd + (self.epsilonStart - self.epsilonEnd) * math.exp(-1 * iteration / self.epsilonDecay) 132 | 133 | # Initialization of the experiment folder and tensorboard writer 134 | self.initReporting(parameters, 'FQF') 135 | 136 | 137 | def chooseAction(self, state, plot=False): 138 | """ 139 | GOAL: Choose a valid RL action from the action space according to the 140 | RL policy as well as the current RL state observed. 141 | 142 | INPUTS: - state: RL state returned by the environment. 143 | - plot: Enable the plotting of the random returns distributions. 144 | 145 | OUTPUTS: - action: RL action chosen from the action space. 146 | """ 147 | 148 | # Choose the best action based on the RL policy 149 | with torch.no_grad(): 150 | state = torch.from_numpy(state).float().to(self.device).unsqueeze(0) 151 | stateEmbedding = self.policyNetwork.embedding(state) 152 | _, tausBis, _ = self.fractionProposalNetwork(stateEmbedding) 153 | quantiles = self.policyNetwork(state, tausBis, stateEmbedding) 154 | QValues = quantiles.mean(2) 155 | _, action = QValues.max(1) 156 | 157 | # If required, plot the return distribution associated with each action 158 | if plot: 159 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 160 | fig = plt.figure() 161 | ax = fig.add_subplot() 162 | taus = torch.linspace(0.0, 1.0, 10000).to(self.device) 163 | quantiles = self.policyNetwork(state, taus.unsqueeze(0), stateEmbedding) 164 | QValues = quantiles.mean(2) 165 | taus = taus.cpu().numpy() 166 | quantiles = quantiles.squeeze(0).cpu().numpy() 167 | QValues = QValues.squeeze(0).cpu().numpy() 168 | for a in range(self.actionSpace): 169 | ax.plot(taus, quantiles[a], linestyle='-', label=''.join(['Action ', str(a), ' random return Z']), color=colors[a]) 170 | ax.axhline(y=QValues[a], linewidth=2, linestyle='--', label=''.join(['Action ', str(a), ' expected return Q']), color=colors[a]) 171 | ax.set_xlabel('Quantile fraction') 172 | ax.set_ylabel('Quantile Function (QF)') 173 | ax.legend() 174 | plt.show() 175 | 176 | return action.item() 177 | 178 | 179 | def learning(self): 180 | """ 181 | GOAL: Sample a batch of past experiences and learn from it 182 | by updating the Reinforcement Learning policy. 183 | 184 | INPUTS: / 185 | 186 | OUTPUTS: - loss: Loss of the learning procedure. 
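Note on the quantile fractions (sketch under the standard FQF formulation, which the tensor shapes used below assume): the fraction proposal network provides N+1 increasing fractions taus in [0, 1], their N midpoints tausBis and an entropy term, and the return quantiles are evaluated at the midpoints:

            tausBis[:, i] = (taus[:, i] + taus[:, i+1]) / 2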
187 | """ 188 | 189 | # Check that the replay memory is filled enough 190 | if (len(self.replayMemory) >= self.batchSize): 191 | 192 | # Sample a batch of experiences from the replay memory 193 | batch = self.dataLoaderIter.next() 194 | state = batch[0].float().to(self.device) 195 | action = batch[1].long().to(self.device) 196 | reward = batch[2].float().to(self.device) 197 | nextState = batch[3].float().to(self.device) 198 | done = batch[4].float().to(self.device) 199 | 200 | # Computation of the current return distribution 201 | stateEmbedding = self.policyNetwork.embedding(state) 202 | taus, tausBis, entropy = self.fractionProposalNetwork(stateEmbedding) 203 | quantiles = self.policyNetwork(state, tausBis, stateEmbedding) 204 | actionBis = action.view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.N) 205 | quantiles = quantiles.gather(1, actionBis).squeeze(1) 206 | 207 | # Computation of the Fractional loss for the FPN 208 | with torch.no_grad(): 209 | quantilesBis = self.policyNetwork(state, taus[:, 1:-1], stateEmbedding) 210 | actionBis = action.view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.N-1) 211 | quantilesBis = quantilesBis.gather(1, actionBis).squeeze(1) 212 | gradients1 = quantilesBis - quantiles[:, :-1] 213 | gradients2 = quantilesBis - quantiles[:, 1:] 214 | flag1 = quantilesBis > torch.cat([quantiles[:, :1], quantilesBis[:, :-1]], dim=1) 215 | flag2 = quantilesBis < torch.cat([quantilesBis[:, 1:], quantiles[:, -1:]], dim=1) 216 | gradients = (torch.where(flag1, gradients1, - gradients1) + torch.where(flag2, gradients2, -gradients2)).view(self.batchSize, self.N-1) 217 | fractionalLoss = (gradients * taus[:, 1:-1]).sum(dim=1).mean() 218 | fractionalLoss += self.entropyCoefficient * entropy.mean() 219 | 220 | # Computation of the new distribution to be learnt by the policy DNN 221 | with torch.no_grad(): 222 | nextStateEmbedding = self.targetNetwork.embedding(nextState) 223 | nextQuantiles = self.targetNetwork(nextState, tausBis, nextStateEmbedding) 224 | nextAction = nextQuantiles.mean(2).max(1)[1].view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.N) 225 | nextQuantiles = nextQuantiles.gather(1, nextAction).squeeze(1) 226 | targetQuantiles = reward.unsqueeze(1) + self.gamma * nextQuantiles * (1 - done.unsqueeze(1)) 227 | 228 | # Computation of the quantile huber loss 229 | difference = targetQuantiles.unsqueeze(1) - quantiles.unsqueeze(2) 230 | error = difference.abs() 231 | loss = torch.where(error <= self.kappa, 0.5 * error.pow(2), self.kappa * (error - (0.5 * self.kappa))) 232 | loss = (tausBis.unsqueeze(2) - (difference < 0).float()).abs() * loss/self.kappa 233 | loss = loss.mean(1).sum(1).mean() 234 | 235 | # Update of the Fraction Proposal Network parameters 236 | self.optimizerFPN.zero_grad() 237 | fractionalLoss.backward(retain_graph=True) 238 | self.optimizerFPN.step() 239 | 240 | # Computation of the gradients 241 | self.optimizer.zero_grad() 242 | loss.backward() 243 | 244 | # Gradient Clipping 245 | torch.nn.utils.clip_grad_norm_(self.policyNetwork.parameters(), self.gradientClipping) 246 | 247 | # Perform the Deep Neural Network optimization 248 | self.optimizer.step() 249 | 250 | return loss.item() 251 | -------------------------------------------------------------------------------- /Figures/Distributions/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/Figures/Distributions/__init__.py -------------------------------------------------------------------------------- /Figures/Performance/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/Figures/Performance/__init__.py -------------------------------------------------------------------------------- /Figures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/Figures/__init__.py -------------------------------------------------------------------------------- /IQN.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import math 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from matplotlib import pyplot as plt 13 | 14 | import torch 15 | import torch.optim as optim 16 | 17 | from replayMemory import ReplayMemory 18 | 19 | from Models.IQN_Model import IQN_Model 20 | from Models.IQN_Model_Atari import IQN_Model_Atari 21 | 22 | from DQN import DQN 23 | 24 | 25 | 26 | ############################################################################### 27 | ################################## Class IQN ################################## 28 | ############################################################################### 29 | 30 | class IQN(DQN): 31 | """ 32 | GOAL: Implementing the IQN Deep Reinforcement Learning algorithm. 33 | 34 | VARIABLES: - device: Hardware specification (CPU or GPU). 35 | - gamma: Discount factor of the RL algorithm. 36 | - learningRate: Learning rate of the DL optimizer (ADAM). 37 | - epsilon: Epsilon value for the DL optimizer (ADAM). 38 | - targetNetworkUpdate: Update frequency of the target network. 39 | - learningUpdatePeriod: Frequency of the learning procedure. 40 | - batchSize: Size of the batch to sample from the replay memory. 41 | - capacity: Capacity of the replay memory. 42 | - replayMemory: Experience Replay memory. 43 | - rewardClipping: Clipping of the RL rewards. 44 | - gradientClipping: Clipping of the training loss. 45 | - optimizer: DL optimizer (ADAM). 46 | - epsilonStart: Initial value of epsilon (Epsilon-Greedy). 47 | - epsilonEnd: Final value of epsilon (Epsilon-Greedy). 48 | - epsilonDecay: Exponential decay of epsilon (Epsilon-Greedy). 49 | - epsilonTest: Test value of epsilon (Epsilon-Greedy). 50 | - epsilonValue: Current value of epsilon (Epsilon-Greedy). 51 | - policyNetwork: Deep Neural Network representing the info used by the RL policy. 52 | - targetNetwork: Deep Neural Network representing the target network. 53 | 54 | METHODS: - __init__: Initialization of the RL algorithm. 55 | - chooseAction: Choose a valid action based on the current state 56 | observed, according to the RL policy learned. 57 | - learning: Execute the RL algorithm learning procedure. 
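The epsilon-greedy schedule decays exponentially, as set up in __init__ below:

            epsilonValue(i) = epsilonEnd + (epsilonStart - epsilonEnd) * exp(-i / epsilonDecay)

For illustrative values epsilonStart=1.0, epsilonEnd=0.01 and epsilonDecay=10000 (the actual values come from the JSON parameters files), epsilon is roughly 0.37 after 10000 iterations and 0.14 after 20000 iterations.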
58 | """ 59 | 60 | def __init__(self, observationSpace, actionSpace, environment, 61 | parametersFileName='', reporting=True): 62 | """ 63 | GOAL: Initializing the RL agent based on the IQN Deep Reinforcement Learning 64 | algorithm, by setting up the algorithm parameters as well as 65 | the Deep Neural Networks. 66 | 67 | INPUTS: - observationSpace: RL observation space. 68 | - actionSpace: RL action space. 69 | - environment: Name of the RL environment. 70 | - parametersFileName: Name of the JSON parameters file. 71 | - reporting: Enable the reporting of the results. 72 | 73 | OUTPUTS: / 74 | """ 75 | 76 | # Initialization of the DQN parent class 77 | DQN.__init__(self, observationSpace, actionSpace, environment, parametersFileName, False) 78 | 79 | # Setting of the parameters 80 | if parametersFileName == '': 81 | parametersFileName = ''.join(['Parameters/parameters_IQN_', str(environment), '.json']) 82 | parameters = self.readParameters(parametersFileName) 83 | 84 | # Set the device for DNN computations (CPU or GPU) 85 | self.device = torch.device('cuda:'+str(parameters['GPUNumber']) if torch.cuda.is_available() else 'cpu') 86 | 87 | # Set the general parameters of the RL algorithm 88 | self.gamma = parameters['gamma'] 89 | self.learningRate = parameters['learningRate'] 90 | self.epsilon = parameters['epsilon'] 91 | self.targetUpdatePeriod = parameters['targetUpdatePeriod'] 92 | self.learningUpdatePeriod = parameters['learningUpdatePeriod'] 93 | self.rewardClipping = parameters['rewardClipping'] 94 | self.gradientClipping = parameters['gradientClipping'] 95 | 96 | # Set the Experience Replay mechanism 97 | self.batchSize = parameters['batchSize'] 98 | self.capacity = parameters['capacity'] 99 | self.replayMemory = ReplayMemory(self.capacity) 100 | 101 | # Set the distribution support 102 | self.N = parameters['N'] 103 | self.K = parameters['K'] 104 | self.NCos = parameters['NCos'] 105 | self.kappa = 1.0 106 | 107 | # Set the two Deep Neural Networks of the RL algorithm (policy and target) 108 | self.atari = parameters['atari'] 109 | self.minatar = parameters['minatar'] 110 | if self.atari or self.minatar: 111 | self.policyNetwork = IQN_Model_Atari(observationSpace, actionSpace, self.NCos, self.device, minAtar=self.minatar).to(self.device) 112 | self.targetNetwork = IQN_Model_Atari(observationSpace, actionSpace, self.NCos, self.device, minAtar=self.minatar).to(self.device) 113 | else: 114 | self.policyNetwork = IQN_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['stateEmbedding'], self.NCos, self.device).to(self.device) 115 | self.targetNetwork = IQN_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['stateEmbedding'], self.NCos, self.device).to(self.device) 116 | self.targetNetwork.load_state_dict(self.policyNetwork.state_dict()) 117 | 118 | # Set the Deep Learning optimizer 119 | self.optimizer = optim.Adam(self.policyNetwork.parameters(), lr=self.learningRate, eps=self.epsilon) 120 | 121 | # Set the Epsilon-Greedy exploration technique 122 | self.epsilonStart = parameters['epsilonStart'] 123 | self.epsilonEnd = parameters['epsilonEnd'] 124 | self.epsilonDecay = parameters['epsilonDecay'] 125 | self.epsilonTest = parameters['epsilonTest'] 126 | self.epsilonValue = lambda iteration: self.epsilonEnd + (self.epsilonStart - self.epsilonEnd) * math.exp(-1 * iteration / self.epsilonDecay) 127 | 128 | # Initialization of the experiment folder and tensorboard writer 129 | self.initReporting(parameters, 'IQN') 130 | 131 | 132 | def 
chooseAction(self, state, plot=False): 133 | """ 134 | GOAL: Choose a valid RL action from the action space according to the 135 | RL policy as well as the current RL state observed. 136 | 137 | INPUTS: - state: RL state returned by the environment. 138 | - plot: Enable the plotting of the random returns distributions. 139 | 140 | OUTPUTS: - action: RL action chosen from the action space. 141 | """ 142 | 143 | # Choose the best action based on the RL policy 144 | with torch.no_grad(): 145 | state = torch.from_numpy(state).float().to(self.device).unsqueeze(0) 146 | quantiles, _ = self.policyNetwork(state, self.K) 147 | QValues = quantiles.mean(2) 148 | _, action = QValues.max(1) 149 | 150 | # If required, plot the return distribution associated with each action 151 | if plot: 152 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 153 | fig = plt.figure() 154 | ax = fig.add_subplot() 155 | quantiles, taus = self.policyNetwork(state, 10000, False) 156 | taus = taus[0].squeeze(1).cpu().numpy() 157 | quantiles = quantiles.squeeze(0).cpu().numpy() 158 | QValues = QValues.squeeze(0).cpu().numpy() 159 | for a in range(self.actionSpace): 160 | ax.plot(taus, quantiles[a], linestyle='-', label=''.join(['Action ', str(a), ' random return Z']), color=colors[a]) 161 | ax.axhline(y=QValues[a], linewidth=2, linestyle='--', label=''.join(['Action ', str(a), ' expected return Q']), color=colors[a]) 162 | ax.set_xlabel('Quantile fraction') 163 | ax.set_ylabel('Quantile Function (QF)') 164 | ax.legend() 165 | plt.show() 166 | 167 | return action.item() 168 | 169 | 170 | def learning(self): 171 | """ 172 | GOAL: Sample a batch of past experiences and learn from it 173 | by updating the Reinforcement Learning policy. 174 | 175 | INPUTS: / 176 | 177 | OUTPUTS: - loss: Loss of the learning procedure. 
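The loss computed below is the quantile Huber loss of quantile regression: for each pairwise temporal-difference error delta = targetQuantile - quantile and quantile fraction tau, the elementary term is

            |tau - 1{delta < 0}| * L_kappa(delta) / kappa

where L_kappa is the Huber loss with threshold kappa = 1.0 (self.kappa). The pairwise terms are then reduced over the two quantile dimensions (a mean and a sum) and finally averaged over the batch, as done in the code below.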
178 | """ 179 | 180 | # Check that the replay memory is filled enough 181 | if (len(self.replayMemory) >= self.batchSize): 182 | 183 | # Sample a batch of experiences from the replay memory 184 | batch = self.dataLoaderIter.next() 185 | state = batch[0].float().to(self.device) 186 | action = batch[1].long().to(self.device) 187 | reward = batch[2].float().to(self.device) 188 | nextState = batch[3].float().to(self.device) 189 | done = batch[4].float().to(self.device) 190 | 191 | # Computation of the current return distribution 192 | quantiles, taus = self.policyNetwork(state, self.N) 193 | action = action.view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.N) 194 | quantiles = quantiles.gather(1, action).squeeze(1) 195 | 196 | # Computation of the new distribution to be learnt by the policy DNN 197 | with torch.no_grad(): 198 | nextQuantiles, _ = self.targetNetwork(nextState, self.N) 199 | nextAction = nextQuantiles.mean(2).max(1)[1].view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.N) 200 | nextQuantiles = nextQuantiles.gather(1, nextAction).squeeze(1) 201 | targetQuantiles = reward.unsqueeze(1) + self.gamma * nextQuantiles * (1 - done.unsqueeze(1)) 202 | 203 | # Computation of the loss 204 | difference = targetQuantiles.unsqueeze(1) - quantiles.unsqueeze(2) 205 | error = difference.abs() 206 | loss = torch.where(error <= self.kappa, 0.5 * error.pow(2), self.kappa * (error - (0.5 * self.kappa))) 207 | loss = (taus - (difference < 0).float()).abs() * loss/self.kappa 208 | loss = loss.mean(1).sum(1).mean() 209 | 210 | # Without Huber loss (to be tested) 211 | lossMSE = False 212 | if lossMSE: 213 | difference = targetQuantiles - quantiles 214 | error = difference.pow(2) 215 | loss = error.mean(1).sum() 216 | 217 | # Computation of the gradients 218 | self.optimizer.zero_grad() 219 | loss.backward() 220 | 221 | # Gradient Clipping 222 | torch.nn.utils.clip_grad_norm_(self.policyNetwork.parameters(), self.gradientClipping) 223 | 224 | # Perform the Deep Neural Network optimization 225 | self.optimizer.step() 226 | 227 | return loss.item() 228 | -------------------------------------------------------------------------------- /Models/CDQN_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.FeedforwardDNN import FeedForwardDNN 13 | 14 | 15 | 16 | ############################################################################### 17 | ################################ Class CDQN_Model ############################# 18 | ############################################################################### 19 | 20 | class CDQN_Model(nn.Module): 21 | """ 22 | GOAL: Implementing the DL model for the CDQN distributional RL algorithm. 23 | 24 | VARIABLES: - network: Deep Neural Network. 25 | 26 | METHODS: - __init__: Initialization of the Deep Neural Network. 27 | - forward: Forward pass of the Deep Neural Network. 28 | """ 29 | 30 | def __init__(self, numberOfInputs, numberOfOutputs, structure, numberOfAtoms=51): 31 | """ 32 | GOAL: Defining and initializing the Deep Neural Network. 
33 | 34 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 35 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 36 | - structure: Structure of the Deep Neural Network (hidden layers). 37 | - numberOfAtoms: Number of atoms for the support (see C51 algorithm). 38 | 39 | OUTPUTS: / 40 | """ 41 | 42 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 43 | super(CDQN_Model, self).__init__() 44 | 45 | # Initialization of useful variables 46 | self.numberOfAtoms = numberOfAtoms 47 | self.numberOfActions = int(numberOfOutputs/numberOfAtoms) 48 | 49 | # Initialization of the Deep Neural Network 50 | self.network = FeedForwardDNN(numberOfInputs, numberOfOutputs, structure) 51 | 52 | 53 | def forward(self, x): 54 | """ 55 | GOAL: Implementing the forward pass of the Deep Neural Network. 56 | 57 | INPUTS: - x: Input of the Deep Neural Network. 58 | 59 | OUTPUTS: - y: Output of the Deep Neural Network. 60 | """ 61 | 62 | x = self.network(x) 63 | y = F.softmax(x.view(-1, self.numberOfActions, self.numberOfAtoms), dim=-1) 64 | return y.clamp(min=1e-6) 65 | -------------------------------------------------------------------------------- /Models/CDQN_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.DNN_Atari import DNN_Atari 13 | from Models.DNN_MinAtar import DNN_MinAtar 14 | 15 | 16 | 17 | ############################################################################### 18 | ############################ Class CDQN_Model_Atari ########################### 19 | ############################################################################### 20 | 21 | class CDQN_Model_Atari(nn.Module): 22 | """ 23 | GOAL: Implementing the DL model for the CDQN distributional RL algorithm. 24 | 25 | VARIABLES: - network: Deep Neural Network. 26 | 27 | METHODS: - __init__: Initialization of the Deep Neural Network. 28 | - forward: Forward pass of the Deep Neural Network. 29 | """ 30 | 31 | def __init__(self, numberOfInputs, numberOfOutputs, numberOfAtoms, minAtar=False): 32 | """ 33 | GOAL: Defining and initializing the Deep Neural Network. 34 | 35 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 36 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 37 | - numberOfAtoms: Number of atoms for the support (see C51 algorithm). 38 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 39 | 40 | OUTPUTS: / 41 | """ 42 | 43 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 44 | super(CDQN_Model_Atari, self).__init__() 45 | 46 | # Initialization of useful variables 47 | self.numberOfAtoms = numberOfAtoms 48 | self.numberOfActions = int(numberOfOutputs/numberOfAtoms) 49 | 50 | # Initialization of the Deep Neural Network 51 | if minAtar: 52 | self.network = DNN_MinAtar(numberOfInputs, numberOfOutputs) 53 | else: 54 | self.network = DNN_Atari(numberOfInputs, numberOfOutputs) 55 | 56 | 57 | def forward(self, x): 58 | """ 59 | GOAL: Implementing the forward pass of the Deep Neural Network. 60 | 61 | INPUTS: - x: Input of the Deep Neural Network. 
62 | 63 | OUTPUTS: - y: Output of the Deep Neural Network. 64 | """ 65 | 66 | x = self.network(x) 67 | y = F.softmax(x.view(-1, self.numberOfActions, self.numberOfAtoms), dim=-1) 68 | return y.clamp(min=1e-6) 69 | -------------------------------------------------------------------------------- /Models/CNN_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | 13 | 14 | ############################################################################### 15 | ############################## Class CNN_Atari ################################ 16 | ############################################################################### 17 | 18 | class CNN_Atari(nn.Module): 19 | """ 20 | GOAL: Implementing the CNN part of the DNN designed for the DQN algorithm 21 | to successfully play Atari games. 22 | 23 | VARIABLES: - network: Convolutional Neural Network. 24 | 25 | METHODS: - __init__: Initialization of the Convolutional Neural Network. 26 | - forward: Forward pass of the Convolutional Neural Network. 27 | """ 28 | 29 | def __init__(self, numberOfInputs): 30 | """ 31 | GOAL: Defining and initializing the Convolutional Neural Network. 32 | 33 | INPUTS: - numberOfInputs: Number of inputs of the Convolutional Neural Network. 34 | 35 | OUTPUTS: / 36 | """ 37 | 38 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 39 | super(CNN_Atari, self).__init__() 40 | 41 | # Initialization of the Convolutional Neural Network 42 | self.network = nn.Sequential( 43 | nn.Conv2d(numberOfInputs, 32, kernel_size=8, stride=4), 44 | nn.ReLU(), 45 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 46 | nn.ReLU(), 47 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 48 | nn.ReLU() 49 | ) 50 | 51 | 52 | def getOutputSize(self): 53 | """ 54 | GOAL: Get the size of the Convolutional Neural Network output. 55 | 56 | INPUTS: / 57 | 58 | OUTPUTS: - size: Size of the Convolutional Neural Network. output. 59 | """ 60 | 61 | return self.network(torch.zeros(1, *(4, 84, 84))).view(1, -1).size(1) 62 | 63 | 64 | def forward(self, x): 65 | """ 66 | GOAL: Implementing the forward pass of the Convolutional Neural Network. 67 | 68 | INPUTS: - x: Input of the Convolutional Neural Network. 69 | 70 | OUTPUTS: - y: Output of the Convolutional Neural Network. 
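Worked example: for the standard DQN Atari input of shape (4, 84, 84), the three convolutions produce feature maps of spatial size 20, 9 and 7 ((84-8)/4+1, (20-4)/2+1 and (9-3)/1+1), so the flattened output holds 7 * 7 * 64 = 3136 features, which is the value returned by getOutputSize() above.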
71 | """ 72 | 73 | x = self.network(x) 74 | return x.view(x.size(0), -1) 75 | -------------------------------------------------------------------------------- /Models/CNN_MinAtar.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | 13 | 14 | ############################################################################### 15 | ############################# Class CNN_MinAtar ############################### 16 | ############################################################################### 17 | 18 | class CNN_MinAtar(nn.Module): 19 | """ 20 | GOAL: Implementing the CNN part of the DNN designed for the DQN algorithm 21 | to successfully play Atari games (MinAtar version). 22 | 23 | VARIABLES: - network: Convolutional Neural Network. 24 | 25 | METHODS: - __init__: Initialization of the Convolutional Neural Network. 26 | - forward: Forward pass of the Convolutional Neural Network. 27 | """ 28 | 29 | def __init__(self, numberOfInputs): 30 | """ 31 | GOAL: Defining and initializing the Convolutional Neural Network. 32 | 33 | INPUTS: - numberOfInputs: Number of inputs of the Convolutional Neural Network. 34 | 35 | OUTPUTS: / 36 | """ 37 | 38 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 39 | super(CNN_MinAtar, self).__init__() 40 | 41 | # Initialization of some variables 42 | self.channels = numberOfInputs 43 | self.size = 10 44 | self.filters = 16 45 | self.kernel = 3 46 | self.stride = 1 47 | 48 | # Initialization of the Convolutional Neural Network 49 | self.network = nn.Sequential( 50 | nn.Conv2d(self.channels, self.filters, self.kernel, self.stride), 51 | nn.ReLU() 52 | ) 53 | 54 | 55 | def getOutputSize(self): 56 | """ 57 | GOAL: Get the size of the Convolutional Neural Network output. 58 | 59 | INPUTS: / 60 | 61 | OUTPUTS: - size: Size of the Convolutional Neural Network. output. 62 | """ 63 | 64 | newSize = ((self.size - self.kernel)/self.stride) + 1 65 | return int(newSize * newSize * self.filters) 66 | 67 | 68 | def forward(self, x): 69 | """ 70 | GOAL: Implementing the forward pass of the Convolutional Neural Network. 71 | 72 | INPUTS: - x: Input of the Convolutional Neural Network. 73 | 74 | OUTPUTS: - y: Output of the Convolutional Neural Network. 
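Worked example: for the 10x10 MinAtar frames assumed above (self.size = 10), the single 3x3 convolution with stride 1 yields an 8x8 map with 16 filters, i.e. ((10-3)/1+1)^2 * 16 = 1024 flattened features, matching getOutputSize().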
75 | """ 76 | 77 | x = self.network(x) 78 | return x.view(x.size(0), -1) 79 | -------------------------------------------------------------------------------- /Models/DNN_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | # pylint: disable=E1101 9 | # pylint: disable=E1102 10 | 11 | from Models.FeedforwardDNN import FeedForwardDNN 12 | from Models.CNN_Atari import CNN_Atari 13 | 14 | 15 | 16 | ############################################################################### 17 | ############################## Class DNN_Atari ################################ 18 | ############################################################################### 19 | 20 | class DNN_Atari(nn.Module): 21 | """ 22 | GOAL: Implementing the orignal DNN designed for the DQN algorithm to 23 | succesfully play Atari games. 24 | 25 | VARIABLES: - network: Deep Neural Network. 26 | 27 | METHODS: - __init__: Initialization of the Deep Neural Network. 28 | - forward: Forward pass of the Deep Neural Network. 29 | """ 30 | 31 | def __init__(self, numberOfInputs, numberOfOutputs): 32 | """ 33 | GOAL: Defining and initializing the Deep Neural Network. 34 | 35 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 36 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 37 | 38 | OUTPUTS: / 39 | """ 40 | 41 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 42 | super(DNN_Atari, self).__init__() 43 | 44 | # Initialization of the Deep Neural Network. 45 | CNNOutputSize = CNN_Atari(numberOfInputs).getOutputSize() 46 | self.network = nn.Sequential( 47 | CNN_Atari(numberOfInputs), 48 | FeedForwardDNN(CNNOutputSize, numberOfOutputs, [512]) 49 | ) 50 | 51 | 52 | def forward(self, x): 53 | """ 54 | GOAL: Implementing the forward pass of the Deep Neural Network. 55 | 56 | INPUTS: - x: Input of the Deep Neural Network. 57 | 58 | OUTPUTS: - y: Output of the Deep Neural Network. 59 | """ 60 | 61 | return self.network(x) 62 | -------------------------------------------------------------------------------- /Models/DNN_MinAtar.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | # pylint: disable=E1101 9 | # pylint: disable=E1102 10 | 11 | from Models.FeedforwardDNN import FeedForwardDNN 12 | from Models.CNN_MinAtar import CNN_MinAtar 13 | 14 | 15 | 16 | ############################################################################### 17 | ############################# Class DNN_MinAtar ############################### 18 | ############################################################################### 19 | 20 | class DNN_MinAtar(nn.Module): 21 | """ 22 | GOAL: Implementing the orignal DNN designed for the DQN algorithm to 23 | succesfully play Atari games (MinAtar version). 24 | 25 | VARIABLES: - network: Deep Neural Network. 26 | 27 | METHODS: - __init__: Initialization of the Deep Neural Network. 
28 | - forward: Forward pass of the Deep Neural Network. 29 | """ 30 | 31 | def __init__(self, numberOfInputs, numberOfOutputs): 32 | """ 33 | GOAL: Defining and initializing the Deep Neural Network. 34 | 35 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 36 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 37 | 38 | OUTPUTS: / 39 | """ 40 | 41 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 42 | super(DNN_MinAtar, self).__init__() 43 | 44 | # Initialization of the Deep Neural Network. 45 | CNNOutputSize = CNN_MinAtar(numberOfInputs).getOutputSize() 46 | self.network = nn.Sequential( 47 | CNN_MinAtar(numberOfInputs), 48 | FeedForwardDNN(CNNOutputSize, numberOfOutputs, [128]) 49 | ) 50 | 51 | 52 | def forward(self, x): 53 | """ 54 | GOAL: Implementing the forward pass of the Deep Neural Network. 55 | 56 | INPUTS: - x: Input of the Deep Neural Network. 57 | 58 | OUTPUTS: - y: Output of the Deep Neural Network. 59 | """ 60 | 61 | return self.network(x) 62 | -------------------------------------------------------------------------------- /Models/FQF_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | # pylint: disable=E1101 11 | # pylint: disable=E1102 12 | 13 | from Models.FeedforwardDNN import FeedForwardDNN 14 | 15 | 16 | 17 | ############################################################################### 18 | ############################## Class FQF_Model ################################ 19 | ############################################################################### 20 | 21 | class FQF_Model(nn.Module): 22 | """ 23 | GOAL: Implementing the DL model for the FQF distributional RL algorithm 24 | (Implicit Quantile Network). 25 | 26 | VARIABLES: - network: Deep Neural Network. 27 | 28 | METHODS: - __init__: Initialization of the Deep Neural Network. 29 | - forward: Forward pass of the Deep Neural Network. 30 | """ 31 | 32 | def __init__(self, numberOfInputs, numberOfOutputs, structure, stateEmbedding, NCos=64, device='cpu'): 33 | """ 34 | GOAL: Defining and initializing the Deep Neural Network. 35 | 36 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 37 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 38 | - structure: Structure of the state embedding Deep Neural Network (hidden layers). 39 | - stateEmbedding: Number of values to represent the state. 40 | - Ncos: Number of elements in cosine function. 41 | 42 | OUTPUTS: / 43 | """ 44 | 45 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 46 | super(FQF_Model, self).__init__() 47 | 48 | # Initialization of useful variables 49 | self.device = device 50 | self.NCos = NCos 51 | self.piMultiples = torch.tensor([np.pi*i for i in range(self.NCos)], dtype=torch.float).view(1, 1, self.NCos).to(self.device) 52 | 53 | # Initialization of the Deep Neural Network. 
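        # (Sketch of the layout below, which follows the IQN-style quantile
        #  embedding: the state is embedded by a feedforward DNN, each quantile
        #  fraction tau proposed by the FPN is embedded through cos(i*pi*tau)
        #  for i = 0, ..., NCos-1 followed by a linear layer and a ReLU, both
        #  embeddings are combined by elementwise multiplication, and the final
        #  head outputs one quantile value per action.)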
54 | self.stateEmbedding = FeedForwardDNN(numberOfInputs, stateEmbedding, structure) 55 | self.cosEmbedding = nn.Sequential(nn.Linear(NCos, stateEmbedding), nn.ReLU()) 56 | self.feedForwardDNN = FeedForwardDNN(stateEmbedding, numberOfOutputs, [256]) 57 | 58 | 59 | def embedding(self, x): 60 | """ 61 | GOAL: Implementing the embedding part of the Deep Neural Network. 62 | 63 | INPUTS: - x: Input of the Deep Neural Network. 64 | 65 | OUTPUTS: - y: Embedded input of the Deep Neural Network. 66 | """ 67 | 68 | return self.stateEmbedding(x) 69 | 70 | 71 | def forward(self, x, taus, embedding=None): 72 | """ 73 | GOAL: Implementing the forward pass of the Deep Neural Network. 74 | 75 | INPUTS: - x: Input of the Deep Neural Network. 76 | - taus: Quantiles (generated by the FPN). 77 | - embedding: Embedding of the Deep Neural Network input (state). 78 | 79 | OUTPUTS: - y: Output of the Deep Neural Network. 80 | """ 81 | 82 | # State embedding part of the Deep Neural Network 83 | batchSize = x.size(0) 84 | if embedding == None: 85 | x = self.stateEmbedding(x).unsqueeze(1) 86 | else: 87 | x = embedding.unsqueeze(1) 88 | 89 | # Quantile embedding part of the Deep Neural Network 90 | N = taus.size(1) 91 | cos = torch.cos(taus.unsqueeze(2)*self.piMultiples).view(batchSize*N, self.NCos) 92 | cos = self.cosEmbedding(cos).view(batchSize, N, -1) 93 | 94 | # Multiplication of both state and cos embeddings outputs (combination) 95 | x = (x * cos).view(batchSize, N, -1) 96 | 97 | # Distribution part of the Deep Neural Network 98 | x = self.feedForwardDNN(x) 99 | return x.transpose(1, 2) 100 | -------------------------------------------------------------------------------- /Models/FQF_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | # pylint: disable=E1101 11 | # pylint: disable=E1102 12 | 13 | from Models.FeedforwardDNN import FeedForwardDNN 14 | from Models.CNN_Atari import CNN_Atari 15 | from Models.CNN_MinAtar import CNN_MinAtar 16 | 17 | 18 | 19 | ############################################################################### 20 | ########################### Class FQF_Model_Atari ############################# 21 | ############################################################################### 22 | 23 | class FQF_Model_Atari(nn.Module): 24 | """ 25 | GOAL: Implementing the DL model for the FQF distributional RL algorithm 26 | (Implicit Quantile Network). 27 | 28 | VARIABLES: - network: Deep Neural Network. 29 | 30 | METHODS: - __init__: Initialization of the Deep Neural Network. 31 | - forward: Forward pass of the Deep Neural Network. 32 | """ 33 | 34 | def __init__(self, numberOfInputs, numberOfOutputs, NCos=64, device='cpu', 35 | minAtar=False): 36 | """ 37 | GOAL: Defining and initializing the Deep Neural Network. 38 | 39 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 40 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 41 | - Ncos: Number of elements in cosine function. 42 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 
43 | 44 | OUTPUTS: / 45 | """ 46 | 47 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 48 | super(FQF_Model_Atari, self).__init__() 49 | 50 | # Initialization of useful variables 51 | self.device = device 52 | self.NCos = NCos 53 | self.piMultiples = torch.tensor([np.pi*i for i in range(self.NCos)], dtype=torch.float).view(1, 1, self.NCos).to(self.device) 54 | 55 | # Initialization of the Deep Neural Network. 56 | if minAtar: 57 | self.stateEmbedding = CNN_MinAtar(numberOfInputs) 58 | self.stateEmbeddingSize = CNN_MinAtar(numberOfInputs).getOutputSize() 59 | self.cosEmbedding = nn.Sequential(nn.Linear(NCos, self.stateEmbeddingSize), nn.ReLU()) 60 | self.feedForwardDNN = FeedForwardDNN(self.stateEmbeddingSize, numberOfOutputs, [128]) 61 | else: 62 | self.stateEmbedding = CNN_Atari(numberOfInputs) 63 | self.stateEmbeddingSize = CNN_Atari(numberOfInputs).getOutputSize() 64 | self.cosEmbedding = nn.Sequential(nn.Linear(NCos, self.stateEmbeddingSize), nn.ReLU()) 65 | self.feedForwardDNN = FeedForwardDNN(self.stateEmbeddingSize, numberOfOutputs, [512]) 66 | 67 | 68 | def embedding(self, x): 69 | """ 70 | GOAL: Implementing the embedding part of the Deep Neural Network. 71 | 72 | INPUTS: - x: Input of the Deep Neural Network. 73 | 74 | OUTPUTS: - y: Embedded input of the Deep Neural Network. 75 | """ 76 | 77 | return self.stateEmbedding(x) 78 | 79 | 80 | def getEmbeddingSize(self): 81 | """ 82 | GOAL: Return the size of the state embedding. 83 | 84 | INPUTS: / 85 | 86 | OUTPUTS: - stateEmbeddingSize: Size of the state embedding. 87 | """ 88 | 89 | return self.stateEmbeddingSize 90 | 91 | 92 | def forward(self, x, taus, embedding=None): 93 | """ 94 | GOAL: Implementing the forward pass of the Deep Neural Network. 95 | 96 | INPUTS: - x: Input of the Deep Neural Network. 97 | - taus: Quantiles (generated by the FPN). 98 | - embedding: Embedding of the Deep Neural Network input (state) 99 | 100 | OUTPUTS: - y: Output of the Deep Neural Network. 
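        EXAMPLE: Shape sketch with hypothetical dimensions (4 stacked Atari
                 frames, 6 actions, N = 32 quantile fractions as would be
                 proposed by the FPN):

                     import torch
                     model = FQF_Model_Atari(4, 6)
                     x = torch.zeros(8, 4, 84, 84)
                     taus = torch.rand(8, 32)
                     y = model(x, taus)
                     # y.shape == (8, 6, 32): one quantile value per action
                     # and per proposed quantile fraction.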
101 | """ 102 | 103 | # State embedding part of the Deep Neural Network 104 | batchSize = x.size(0) 105 | if embedding == None: 106 | x = self.stateEmbedding(x).unsqueeze(1) 107 | else: 108 | x = embedding.unsqueeze(1) 109 | 110 | # Quantile embedding part of the Deep Neural Network 111 | N = taus.size(1) 112 | cos = torch.cos(taus.unsqueeze(2)*self.piMultiples).view(batchSize*N, self.NCos) 113 | cos = self.cosEmbedding(cos).view(batchSize, N, -1) 114 | 115 | # Multiplication of both state and cos embeddings outputs (combination) 116 | x = (x * cos).view(batchSize, N, -1) 117 | 118 | # Distribution part of the Deep Neural Network 119 | x = self.feedForwardDNN(x) 120 | return x.transpose(1, 2) 121 | -------------------------------------------------------------------------------- /Models/FQF_Model_Bis.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | 13 | 14 | ############################################################################### 15 | ############################# Class FQF_Model_Bis ############################# 16 | ############################################################################### 17 | 18 | class FQF_Model_Bis(nn.Module): 19 | """ 20 | GOAL: Implementing the DL model for the FQF distributional RL algorithm 21 | (Fraction Proposal Network). 22 | 23 | VARIABLES: - network: Deep Neural Network. 24 | 25 | METHODS: - __init__: Initialization of the Deep Neural Network. 26 | - forward: Forward pass of the Deep Neural Network. 27 | """ 28 | 29 | def __init__(self, numberOfInputs, numberOfOutputs, device='cpu'): 30 | """ 31 | GOAL: Defining and initializing the Deep Neural Network. 32 | 33 | INPUTS: - numberOfInputs: Input shape (state embedding). 34 | - numberOfOutputs: Output shape (number of quantile fractions). 35 | - device: Running device (hardware acceleration). 36 | 37 | OUTPUTS: / 38 | """ 39 | 40 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 41 | super(FQF_Model_Bis, self).__init__() 42 | 43 | # Initialization of useful variables 44 | self.device = device 45 | self.N = numberOfOutputs 46 | 47 | # Initialization of the Deep Neural Network. 48 | self.network = nn.Sequential( 49 | nn.Linear(numberOfInputs, numberOfOutputs), 50 | nn.LogSoftmax(dim=1) 51 | ) 52 | 53 | 54 | def forward(self, x): 55 | """ 56 | GOAL: Implementing the forward pass of the Deep Neural Network. 57 | 58 | INPUTS: - x: Input of the Deep Neural Network. (state embedding). 59 | 60 | OUTPUTS: - taus: Quantile fractions generated. 61 | - tausBis: Quantile fractions generated. 62 | - entropy: Entropy associated with the DNN output. 63 | """ 64 | 65 | # Generation of quantile fractions 66 | out = self.network(x) 67 | taus = torch.cumsum(out.exp(), dim=1) 68 | taus = torch.cat((torch.zeros((out.shape[0], 1)).to(self.device), taus), dim=1) 69 | tausBis = (taus[:, :-1] + taus[:, 1:]).detach() / 2. 
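        # (out contains log-probabilities produced by the LogSoftmax layer, so
        #  out.exp() sums to one per row and the cumulative sum yields increasing
        #  fractions 0 = tau_0 <= tau_1 <= ... <= tau_N = 1. tausBis holds the
        #  midpoints (tau_i + tau_{i+1}) / 2, detached so that evaluating the
        #  quantile network at these points does not backpropagate into the FPN.)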
70 | 71 | # Computation of the associated entropy 72 | entropy = -(out * out.exp()).sum(dim=-1, keepdim=True) 73 | 74 | return taus, tausBis, entropy 75 | -------------------------------------------------------------------------------- /Models/FeedforwardDNN.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | # pylint: disable=E1101 9 | # pylint: disable=E1102 10 | 11 | 12 | 13 | ############################################################################### 14 | ############################ Class FeedForwardDNN ############################# 15 | ############################################################################### 16 | 17 | class FeedForwardDNN(nn.Module): 18 | """ 19 | GOAL: Implementing a classical feedforward DNN using Pytorch. 20 | 21 | VARIABLES: - network: Feedforward DNN. 22 | 23 | METHODS: - __init__: Initialization of the feedforward DNN. 24 | - forward: Forward pass of the feedforward DNN. 25 | """ 26 | 27 | def __init__(self, numberOfInputs, numberOfOutputs, structure): 28 | """ 29 | GOAL: Defining and initializing the feedforward DNN. 30 | 31 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 32 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 33 | - structure: Structure of the feedforward DNN (hidden layers). 34 | 35 | OUTPUTS: / 36 | """ 37 | 38 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 39 | super(FeedForwardDNN, self).__init__() 40 | 41 | # Initialization of the FeedForward DNN 42 | self.network = [] 43 | structure = [numberOfInputs] + structure + [numberOfOutputs] 44 | for inFeature, outFeature in zip(structure, structure[1:]): 45 | self.network.extend([ 46 | nn.Linear(inFeature, outFeature), 47 | nn.ReLU(), 48 | ]) 49 | self.network.pop() 50 | self.network = nn.Sequential(*self.network) 51 | 52 | 53 | def forward(self, x): 54 | """ 55 | GOAL: Implementing the forward pass of the feedforward DNN. 56 | 57 | INPUTS: - x: Input of the feedforward DNN. 58 | 59 | OUTPUTS: - y: Output of the feedforward DNN. 60 | """ 61 | 62 | return self.network(x) 63 | -------------------------------------------------------------------------------- /Models/IQN_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | # pylint: disable=E1101 11 | # pylint: disable=E1102 12 | 13 | from Models.FeedforwardDNN import FeedForwardDNN 14 | 15 | 16 | 17 | ############################################################################### 18 | ############################## Class IQN_Model ################################ 19 | ############################################################################### 20 | 21 | class IQN_Model(nn.Module): 22 | """ 23 | GOAL: Implementing the DL model for the IQN distributional RL algorithm. 24 | 25 | VARIABLES: - network: Deep Neural Network. 
26 | 27 | METHODS: - __init__: Initialization of the Deep Neural Network. 28 | - forward: Forward pass of the Deep Neural Network. 29 | """ 30 | 31 | def __init__(self, numberOfInputs, numberOfOutputs, structure, stateEmbedding, NCos=64, device='cpu'): 32 | """ 33 | GOAL: Defining and initializing the Deep Neural Network. 34 | 35 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 36 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 37 | - structure: Structure of the state embedding Deep Neural Network (hidden layers). 38 | - stateEmbedding: Number of values to represent the state. 39 | - Ncos: Number of elements in cosine function. 40 | 41 | OUTPUTS: / 42 | """ 43 | 44 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 45 | super(IQN_Model, self).__init__() 46 | 47 | # Initialization of useful variables 48 | self.device = device 49 | self.NCos = NCos 50 | self.piMultiples = torch.tensor([np.pi*i for i in range(self.NCos)], dtype=torch.float).view(1, 1, self.NCos).to(self.device) 51 | 52 | # Initialization of the Deep Neural Network 53 | self.stateEmbedding = FeedForwardDNN(numberOfInputs, stateEmbedding, structure) 54 | self.cosEmbedding = nn.Sequential(nn.Linear(NCos, stateEmbedding), nn.ReLU()) 55 | self.feedForwardDNN = FeedForwardDNN(stateEmbedding, numberOfOutputs, [256]) 56 | 57 | 58 | def forward(self, x, N, randomSampling=True): 59 | """ 60 | GOAL: Implementing the forward pass of the Deep Neural Network. 61 | 62 | INPUTS: - x: Input of the Deep Neural Network. 63 | - N: Number of quantiles to generate. 64 | - randomSampling: Boolean specifying whether the quantiles are 65 | sampled randomly or not (default: True). 66 | 67 | OUTPUTS: - y: Output of the Deep Neural Network. 68 | """ 69 | 70 | # State embedding part of the Deep Neural Network 71 | batchSize = x.size(0) 72 | x = self.stateEmbedding(x).unsqueeze(1) 73 | 74 | # Generate a number of quantiles (randomly or not) 75 | if randomSampling: 76 | taus = torch.rand(batchSize, N).to(self.device).unsqueeze(2) 77 | else: 78 | taus = torch.linspace(0.0, 1.0, N).to(self.device) 79 | taus = taus.repeat(batchSize, 1).unsqueeze(2) 80 | 81 | # Quantile embedding part of the Deep Neural Network 82 | cos = torch.cos(taus*self.piMultiples).view(batchSize*N, self.NCos) 83 | cos = self.cosEmbedding(cos).view(batchSize, N, -1) 84 | 85 | # Multiplication of both state and cos embeddings outputs (combination) 86 | x = (x * cos).view(batchSize, N, -1) 87 | 88 | # Distribution part of the Deep Neural Network 89 | x = self.feedForwardDNN(x) 90 | return x.transpose(1, 2), taus 91 | -------------------------------------------------------------------------------- /Models/IQN_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | # pylint: disable=E1101 11 | # pylint: disable=E1102 12 | 13 | from Models.FeedforwardDNN import FeedForwardDNN 14 | from Models.CNN_Atari import CNN_Atari 15 | from Models.CNN_MinAtar import CNN_MinAtar 16 | 17 | 18 | 19 | ############################################################################### 20 | ######################### Class IQN_Model_Atari ############################### 
21 | ############################################################################### 22 | 23 | class IQN_Model_Atari(nn.Module): 24 | """ 25 | GOAL: Implementing the DL model for the IQN distributional RL algorithm. 26 | 27 | VARIABLES: - network: Deep Neural Network. 28 | 29 | METHODS: - __init__: Initialization of the Deep Neural Network. 30 | - forward: Forward pass of the Deep Neural Network. 31 | """ 32 | 33 | def __init__(self, numberOfInputs, numberOfOutputs, NCos=64, device='cpu', 34 | minAtar=False): 35 | """ 36 | GOAL: Defining and initializing the Deep Neural Network. 37 | 38 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 39 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 40 | - Ncos: Number of elements in cosine function. 41 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 42 | 43 | OUTPUTS: / 44 | """ 45 | 46 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 47 | super(IQN_Model_Atari, self).__init__() 48 | 49 | # Initialization of useful variables 50 | self.device = device 51 | self.NCos = NCos 52 | self.piMultiples = torch.tensor([np.pi*i for i in range(self.NCos)], dtype=torch.float).view(1, 1, self.NCos).to(self.device) 53 | 54 | # Initialization of the Deep Neural Network 55 | if minAtar: 56 | self.stateEmbedding = CNN_MinAtar(numberOfInputs) 57 | stateEmbedding = CNN_MinAtar(numberOfInputs).getOutputSize() 58 | self.cosEmbedding = nn.Sequential(nn.Linear(NCos, stateEmbedding), nn.ReLU()) 59 | self.feedForwardDNN = FeedForwardDNN(stateEmbedding, numberOfOutputs, [128]) 60 | else: 61 | self.stateEmbedding = CNN_Atari(numberOfInputs) 62 | stateEmbedding = CNN_Atari(numberOfInputs).getOutputSize() 63 | self.cosEmbedding = nn.Sequential(nn.Linear(NCos, stateEmbedding), nn.ReLU()) 64 | self.feedForwardDNN = FeedForwardDNN(stateEmbedding, numberOfOutputs, [512]) 65 | 66 | 67 | def forward(self, x, N, randomSampling=True): 68 | """ 69 | GOAL: Implementing the forward pass of the Deep Neural Network. 70 | 71 | INPUTS: - x: Input of the Deep Neural Network. 72 | - N: Number of quantiles to generate. 73 | - randomSampling: Boolean specifying whether the quantiles are 74 | sampled randomly or not (default: True). 75 | 76 | OUTPUTS: - y: Output of the Deep Neural Network. 77 | """ 78 | 79 | # State embedding part of the Deep Neural Network 80 | batchSize = x.size(0) 81 | x = self.stateEmbedding(x).unsqueeze(1) 82 | 83 | # Generate a number of quantiles (randomly or not) 84 | if randomSampling: 85 | taus = torch.rand(batchSize, N).to(self.device).unsqueeze(2) 86 | else: 87 | taus = torch.linspace(0.0, 1.0, N).to(self.device) 88 | taus = taus.repeat(batchSize, 1).unsqueeze(2) 89 | 90 | # Quantile embedding part of the Deep Neural Network 91 | cos = torch.cos(taus*self.piMultiples).view(batchSize*N, self.NCos) 92 | cos = self.cosEmbedding(cos).view(batchSize, N, -1) 93 | 94 | # Multiplication of both state and cos embeddings outputs (combination) 95 | x = (x * cos).view(batchSize, N, -1) 96 | 97 | # Distribution part of the Deep Neural Network 98 | x = self.feedForwardDNN(x) 99 | return x.transpose(1, 2), taus 100 | -------------------------------------------------------------------------------- /Models/MonotonicNN.py: -------------------------------------------------------------------------------- 1 | # Credit goes to Antoine Wehenkel for this entire python file. 
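# Summary of the construction used throughout this file (a sketch of the UMNN
# idea of Wehenkel and Louppe): a scalar function that is monotonic in x and
# conditioned on h is parameterised as
#     F(x, h) = s(h) * integral from 0 to x of f(t, h) dt + b(h),
# where the integrand f is a free-form network whose output is made strictly
# positive by the ELU(.) + 1 activation and s(h) = exp(.) > 0, so that F is
# increasing in x by construction. The integral is evaluated numerically by the
# NeuralIntegral / ParallelNeuralIntegral routines imported below.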
2 | 3 | 4 | 5 | import torch 6 | import torch.nn as nn 7 | from UMNN import NeuralIntegral 8 | from UMNN import ParallelNeuralIntegral 9 | import math 10 | 11 | def _flatten(sequence): 12 | flat = [p.contiguous().view(-1) for p in sequence] 13 | return torch.cat(flat) if len(flat) > 0 else torch.tensor([]) 14 | 15 | 16 | class IntegrandNN(nn.Module): 17 | def __init__(self, in_d, hidden_layers, n_out=1): 18 | super(IntegrandNN, self).__init__() 19 | self.net = [] 20 | hs = [in_d] + hidden_layers + [n_out] 21 | for h0, h1 in zip(hs, hs[1:]): 22 | self.net.extend([ 23 | nn.Linear(h0, h1), 24 | nn.ReLU(), 25 | ]) 26 | self.net.pop() 27 | self.net.append(nn.ELU()) 28 | self.net = nn.Sequential(*self.net) 29 | 30 | def forward(self, x, h): 31 | return self.net(torch.cat((x, h), 1)) + 1. 32 | 33 | 34 | class OneDimensionnalNF(nn.Module): 35 | def __init__(self, in_d, hidden_layers, nb_steps=200, n_out=1, dev="cpu"): 36 | super(OneDimensionnalNF, self).__init__() 37 | self.device = dev 38 | self.nb_steps = nb_steps 39 | self.n_out = n_out 40 | self.net = MonotonicNN(in_d, hidden_layers, nb_steps=nb_steps, n_out=n_out, dev=dev) 41 | self.register_buffer("pi", torch.tensor(math.pi)) 42 | 43 | ''' 44 | The forward procedure takes as input x which is the variable for which the integration must be made, h are just other conditionning variables. 45 | It returns the $log(p(x|h; \theta))$. 46 | ''' 47 | def forward(self, x, h): 48 | x0 = torch.zeros(x.shape).to(self.device) 49 | out = self.net.net(h) 50 | offset = out[:, :self.n_out] 51 | scaling = torch.exp(out[:, self.n_out:]) 52 | jac = scaling * self.net.integrand(x, h) 53 | z = scaling*ParallelNeuralIntegral.apply(x0, x, self.net.integrand, _flatten(self.net.integrand.parameters()), h, self.nb_steps) + offset 54 | z.clamp_(-10., 10.) 55 | log_prob_gauss = -.5 * (torch.log(self.pi * 2) + z ** 2) 56 | ll = log_prob_gauss + torch.log(jac + 1e-10) 57 | return ll 58 | 59 | def expectation(self, h, x_func, min=-10, max=10, npts=1000): 60 | # Using first order Euler method . 61 | b_size = h.shape[0] 62 | n_out = self.n_out 63 | dx = (max-min)/(npts - 1) 64 | emb_size = h.shape[1] 65 | 66 | x = torch.arange(min, max+(max-min)/(npts - 1), dx).to(h.device) 67 | npts = x.shape[0] 68 | zero_idx = torch.argmin(x**2).item() 69 | 70 | out = self.net.net(h) 71 | offset = out[:, :self.n_out].unsqueeze(1).expand(b_size, npts, n_out) 72 | scaling = torch.exp(out[:, self.n_out:]).unsqueeze(1).expand(b_size, npts, n_out) 73 | 74 | h_values = h.unsqueeze(1).expand(b_size, npts, emb_size).reshape(-1, emb_size) 75 | x_values = x.unsqueeze(0).expand(b_size, npts).reshape(-1, 1) 76 | 77 | f_values = self.net.integrand(x_values, h_values) 78 | f_values = f_values.reshape(b_size, npts, n_out) * scaling 79 | 80 | z = (dx * f_values.cumsum(1)) 81 | z = (z - z[:, [zero_idx], :].expand(-1, npts, -1)) + offset 82 | log_prob_gauss = -.5 * (torch.log(self.pi * 2) + z ** 2) 83 | ll = log_prob_gauss + torch.log(f_values + 1e-10) 84 | 85 | 86 | expectations = (x_func(x).unsqueeze(0).unsqueeze(2).expand(b_size, npts, n_out) * torch.exp(ll)).sum(1) * dx 87 | 88 | return expectations 89 | 90 | class MonotonicNN(nn.Module): 91 | ''' 92 | in_d : The total number of inputs 93 | hidden_layers : a list a the number of neurons, to be used by a network that compresses the non-monotonic variables and by the integrand net. 
94 | nb_steps : Number of integration steps 95 | n_out : the number of output (each output will be monotonic w.r.t one variable) 96 | ''' 97 | def __init__(self, in_d, hidden_layers, nb_steps=200, n_out=1, dev="cpu"): 98 | super(MonotonicNN, self).__init__() 99 | self.integrand = IntegrandNN(in_d, hidden_layers, n_out) 100 | self.net = [] 101 | hs = [in_d-1] + hidden_layers + [2 * n_out] 102 | for h0, h1 in zip(hs, hs[1:]): 103 | self.net.extend([ 104 | nn.Linear(h0, h1), 105 | nn.ReLU(), 106 | ]) 107 | self.net.pop() 108 | self.net = nn.Sequential(*self.net) 109 | self.device = dev 110 | self.nb_steps = nb_steps 111 | self.n_out = n_out 112 | 113 | ''' 114 | The forward procedure takes as input x which is the variable for which the integration must be made, h are just other conditionning variables. 115 | ''' 116 | def forward(self, x, h, only_derivative=False): 117 | x0 = torch.zeros(x.shape).to(self.device) 118 | out = self.net(h) 119 | offset = out[:, :self.n_out] 120 | scaling = torch.exp(out[:, self.n_out:]) 121 | if only_derivative: 122 | return scaling * self.integrand(x, h) 123 | return scaling*ParallelNeuralIntegral.apply(x0, x, self.integrand, _flatten(self.integrand.parameters()), h, self.nb_steps) + offset 124 | 125 | 126 | 127 | 128 | ''' 129 | The inverse procedure takes as input y which is the variable for which the inverse must be computed, h are just other conditionning variables. 130 | One output per n_out. 131 | y should be a scalar. 132 | ''' 133 | def inverse(self, y, h, min=-10, max=10, nb_iter=10): 134 | idx = (torch.arange(0, self.n_out**2, self.n_out + 1).view(1, -1) + torch.arange(0, (self.n_out**2)*y.shape[0], self.n_out**2).view(-1, 1)).view(-1) 135 | h = h.unsqueeze(1).expand(-1, self.n_out, -1).contiguous().view(y.shape[0]*self.n_out, -1) 136 | 137 | # Old inversion by binary search 138 | x_max = torch.ones(y.shape[0], self.n_out).to(y.device) * max 139 | x_min = torch.ones(y.shape[0], self.n_out).to(y.device) * min 140 | y_max = self.forward(x_max.view(-1, 1), h).view(-1)[idx].view(-1, self.n_out) 141 | y_min = self.forward(x_min.view(-1, 1), h).view(-1)[idx].view(-1, self.n_out) 142 | 143 | for i in range(nb_iter): 144 | x_middle = (x_max + x_min) / 2 145 | y_middle = self.forward(x_middle.view(-1, 1), h).view(-1)[idx].view(-1, self.n_out) 146 | left = (y_middle > y).float() 147 | right = 1 - left 148 | x_max = left * x_middle + right * x_max 149 | x_min = right * x_middle + left * x_min 150 | y_max = left * y_middle + right * y_max 151 | y_min = right * y_middle + left * y_min 152 | return (x_max + x_min) / 2 153 | 154 | def expectation(self, h, x_func, out_deriv, min=-10, max=10, npts=1000): 155 | # Using first order Euler method . 
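        # (Sketch of the scheme: the positive integrand f(x, h) is evaluated on
        #  a uniform grid of npts points over [min, max]; dx * cumsum(f)
        #  approximates the primitive F, re-anchored so that its value at x ~ 0
        #  equals the learned offset; out_deriv supplies the derivative of the
        #  output non-linearity applied to F (e.g. sigmoid'(F) for the CDF
        #  parameterisation), so the returned Riemann sum
        #  sum_x x_func(x) * f(x) * out_deriv(F(x)) * dx estimates E[x_func(X)]
        #  under the implied probability density.)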
156 | b_size = h.shape[0] 157 | n_out = self.n_out 158 | dx = (max-min)/(npts - 1) 159 | emb_size = h.shape[1] 160 | 161 | x = torch.arange(min, max+(max-min)/(npts - 1), dx).to(h.device) 162 | npts = x.shape[0] 163 | zero_idx = torch.argmin(x**2).item() 164 | 165 | out = self.net(h) 166 | offset = out[:, :self.n_out].unsqueeze(1).expand(b_size, npts, n_out) 167 | scaling = torch.exp(out[:, self.n_out:]).unsqueeze(1).expand(b_size, npts, n_out) 168 | 169 | h_values = h.unsqueeze(1).expand(b_size, npts, emb_size).reshape(-1, emb_size) 170 | x_values = x.unsqueeze(0).expand(b_size, npts).reshape(-1, 1) 171 | 172 | f_values = self.integrand(x_values, h_values) 173 | f_values = f_values.reshape(b_size, npts, n_out) * scaling 174 | 175 | F_values = (dx * f_values.cumsum(1)) 176 | F_values = (F_values - F_values[:, [zero_idx], :].expand(-1, npts, -1)) + offset 177 | corrected_F_values = out_deriv(F_values) 178 | 179 | expectations = (x_func(x).unsqueeze(0).unsqueeze(2).expand(b_size, npts, n_out) * f_values * corrected_F_values).sum(1) * dx 180 | 181 | return expectations 182 | -------------------------------------------------------------------------------- /Models/QR_DQN_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | # pylint: disable=E1101 9 | # pylint: disable=E1102 10 | 11 | from Models.FeedforwardDNN import FeedForwardDNN 12 | 13 | 14 | 15 | ############################################################################### 16 | ############################# Class QR_DQN_Model ############################## 17 | ############################################################################### 18 | 19 | class QR_DQN_Model(nn.Module): 20 | """ 21 | GOAL: Implementing the DL model for the QR-DQN distributional RL algorithm. 22 | 23 | VARIABLES: - network: Deep Neural Network. 24 | 25 | METHODS: - __init__: Initialization of the Deep Neural Network. 26 | - forward: Forward pass of the Deep Neural Network. 27 | """ 28 | 29 | def __init__(self, numberOfInputs, numberOfOutputs, structure, numberOfQuantiles=200): 30 | """ 31 | GOAL: Defining and initializing the Deep Neural Network. 32 | 33 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 34 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 35 | - structure: Structure of the Deep Neural Network (hidden layers). 36 | - numberOfQuantiles: Number of quantiles for approximating the distribution. 37 | 38 | OUTPUTS: / 39 | """ 40 | 41 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 42 | super(QR_DQN_Model, self).__init__() 43 | 44 | # Initialization of useful variables 45 | self.numberOfQuantiles = numberOfQuantiles 46 | self.numberOfActions = int(numberOfOutputs/numberOfQuantiles) 47 | 48 | # Initialization of the Deep Neural Network. 49 | self.network = FeedForwardDNN(numberOfInputs, numberOfOutputs, structure) 50 | 51 | 52 | def forward(self, x): 53 | """ 54 | GOAL: Implementing the forward pass of the Deep Neural Network. 55 | 56 | INPUTS: - x: Input of the Deep Neural Network. 57 | 58 | OUTPUTS: - y: Output of the Deep Neural Network. 
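        EXAMPLE: Minimal usage sketch with hypothetical dimensions (state of
                 dimension 4, 2 actions, 200 quantiles); numberOfOutputs must
                 equal numberOfActions * numberOfQuantiles:

                     import torch
                     model = QR_DQN_Model(4, 2*200, [128])
                     y = model(torch.rand(32, 4))
                     # y.shape == (32, 2, 200); the expected Q-values follow
                     # as y.mean(dim=2).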
59 | """ 60 | 61 | x = self.network(x) 62 | return x.view(x.size(0), self.numberOfActions, self.numberOfQuantiles) 63 | -------------------------------------------------------------------------------- /Models/QR_DQN_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | # pylint: disable=E1101 9 | # pylint: disable=E1102 10 | 11 | from Models.DNN_Atari import DNN_Atari 12 | from Models.DNN_MinAtar import DNN_MinAtar 13 | 14 | 15 | 16 | ############################################################################### 17 | ######################## Class QR_DQN_Model_Atari ############################# 18 | ############################################################################### 19 | 20 | class QR_DQN_Model_Atari(nn.Module): 21 | """ 22 | GOAL: Implementing the DL model for the QR-DQN distributional RL algorithm. 23 | 24 | VARIABLES: - network: Deep Neural Network. 25 | 26 | METHODS: - __init__: Initialization of the Deep Neural Network. 27 | - forward: Forward pass of the Deep Neural Network. 28 | """ 29 | 30 | def __init__(self, numberOfInputs, numberOfOutputs, numberOfQuantiles=200, minAtar=False): 31 | """ 32 | GOAL: Defining and initializing the Deep Neural Network. 33 | 34 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 35 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 36 | - numberOfQuantiles: Number of quantiles for approximating the distribution. 37 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 38 | 39 | OUTPUTS: / 40 | """ 41 | 42 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 43 | super(QR_DQN_Model_Atari, self).__init__() 44 | 45 | # Initialization of useful variables 46 | self.numberOfQuantiles = numberOfQuantiles 47 | self.numberOfActions = int(numberOfOutputs/numberOfQuantiles) 48 | 49 | # Initialization of the Deep Neural Network. 50 | if minAtar: 51 | self.network = DNN_MinAtar(numberOfInputs, numberOfOutputs) 52 | else: 53 | self.network = DNN_Atari(numberOfInputs, numberOfOutputs) 54 | 55 | 56 | def forward(self, x): 57 | """ 58 | GOAL: Implementing the forward pass of the Deep Neural Network. 59 | 60 | INPUTS: - x: Input of the Deep Neural Network. 61 | 62 | OUTPUTS: - y: Output of the Deep Neural Network. 
63 | """ 64 | 65 | x = self.network(x) 66 | return x.view(x.size(0), self.numberOfActions, self.numberOfQuantiles) 67 | -------------------------------------------------------------------------------- /Models/UMDQN_C_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.FeedforwardDNN import FeedForwardDNN 13 | from Models.MonotonicNN import MonotonicNN 14 | 15 | 16 | 17 | ############################################################################### 18 | ############################ Class UMDQN_C_Model ############################## 19 | ############################################################################### 20 | 21 | class UMDQN_C_Model(nn.Module): 22 | """ 23 | GOAL: Implementing the DL model for the UMDQN-C distributional RL algorithm. 24 | 25 | VARIABLES: - stateEmbeddingDNN: State embedding part of the Deep Neural Network. 26 | - UMNN: UMNN part of the Deep Neural Network. 27 | 28 | METHODS: - __init__: Initialization of the Deep Neural Network. 29 | - forward: Forward pass of the Deep Neural Network. 30 | - getDerivative: Get the derivative internally computed by the UMNN. 31 | - getExpectation: Get the expectation of the PDF internally computed by the UMNN. 32 | """ 33 | 34 | def __init__(self, numberOfInputs, numberOfOutputs, 35 | structureDNN, structureUMNN, stateEmbedding, 36 | numberOfSteps, device='cpu'): 37 | """ 38 | GOAL: Defining and initializing the Deep Neural Network. 39 | 40 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 41 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 42 | - structureDNN: Structure of the feedforward DNN for state embedding. 43 | - structureUMNN: Structure of the UMNN for distribution representation. 44 | - stateEmbedding: Dimension of the state embedding. 45 | - numberOfSteps: Number of integration steps for the UMNN. 46 | - device: Hardware device (CPU or GPU). 47 | 48 | OUTPUTS: / 49 | """ 50 | 51 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 52 | super(UMDQN_C_Model, self).__init__() 53 | 54 | # Initialization of the Deep Neural Network 55 | self.stateEmbeddingDNN = FeedForwardDNN(numberOfInputs, stateEmbedding, structureDNN) 56 | self.UMNN = MonotonicNN(stateEmbedding+1, structureUMNN, numberOfSteps, numberOfOutputs, device) 57 | 58 | 59 | def forward(self, state, q): 60 | """ 61 | GOAL: Implementing the forward pass of the Deep Neural Network. 62 | 63 | INPUTS: - state: RL state. 64 | - q: Samples of potential returns. 65 | 66 | OUTPUTS: - output: Output of the Deep Neural Network. 
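        EXAMPLE: Shape sketch with hypothetical dimensions (batch of 32 states
                 of dimension 4, 2 actions, 51 return samples per state); q is
                 expected to stack the samples state by state:

                     import torch
                     model = UMDQN_C_Model(4, 2, [128], [64], 32, 50)
                     state = torch.rand(32, 4)
                     q = torch.rand(32 * 51, 1)
                     cdf = model(state, q)
                     # cdf.shape == (32 * 2, 51): row b*2 + a contains the CDF of
                     # the random return Z(s_b, a) evaluated at the 51 samples
                     # associated with state b, with values in [0, 1].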
67 | """ 68 | 69 | # State embedding part of the Deep Neural Network 70 | batchSize = state.size(0) 71 | x = self.stateEmbeddingDNN(state) 72 | x = x.repeat(1, int(len(q)/len(state))).view(-1, x.size(1)) 73 | 74 | # UMNNN part of the Deep Neural Network 75 | x = self.UMNN(q, x) 76 | 77 | # Sigmoid activation function + appropriate format 78 | x = torch.sigmoid(x) 79 | return torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 80 | 81 | 82 | def getDerivative(self, state, q): 83 | """ 84 | GOAL: Get the derivative internally computed by the UMNN. 85 | 86 | INPUTS: - state: RL state. 87 | - q: Samples of potential returns. 88 | 89 | OUTPUTS: - output: Derivative internally computed by the UMNN. 90 | """ 91 | 92 | # State embedding part of the Deep Neural Network 93 | batchSize = state.size(0) 94 | x = self.stateEmbeddingDNN(state) 95 | x = x.repeat(1, int(len(q)/len(state))).view(-1, x.size(1)) 96 | 97 | # Computation of both PDF and CDF 98 | pdf = self.UMNN(q, x, only_derivative=True) 99 | cdf = self.UMNN(q, x, only_derivative=False) 100 | 101 | # Correction of the sigmoid + appropriate format 102 | x = torch.sigmoid(cdf) 103 | x = x * (1 - x) * pdf 104 | return torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 105 | 106 | 107 | def getExpectation(self, state, minReturn, maxReturn, numberOfPoints): 108 | """ 109 | GOAL: Get the expectation of the PDF internally computed by the UMNN. 110 | 111 | INPUTS: - state: RL state. 112 | - minReturn: Minimum return. 113 | - maxReturn: Maximum return. 114 | - numberOfPoints: Number of points for the computations (accuracy). 115 | 116 | OUTPUTS: - expectation: Expectation computed. 117 | """ 118 | 119 | # State embedding part of the Deep Neural Network 120 | state = self.stateEmbeddingDNN(state) 121 | 122 | # Computation of the expectation of the PDF internally computed by the UMNN 123 | expectation = self.UMNN.expectation(state, lambda x: x, lambda x: torch.sigmoid(x)*(1-torch.sigmoid(x)), minReturn, maxReturn, numberOfPoints) 124 | return expectation 125 | 126 | -------------------------------------------------------------------------------- /Models/UMDQN_C_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.DNN_Atari import DNN_Atari 13 | from Models.DNN_MinAtar import DNN_MinAtar 14 | from Models.MonotonicNN import MonotonicNN 15 | 16 | 17 | 18 | ############################################################################### 19 | ########################## Class UMDQN_C_Model_Atari ########################## 20 | ############################################################################### 21 | 22 | class UMDQN_C_Model_Atari(nn.Module): 23 | """ 24 | GOAL: Implementing the DL model for the UMDQN-C distributional RL algorithm. 25 | 26 | VARIABLES: - stateEmbeddingDNN: State embedding part of the Deep Neural Network. 27 | - UMNN: UMNN part of the Deep Neural Network. 28 | 29 | METHODS: - __init__: Initialization of the Deep Neural Network. 30 | - forward: Forward pass of the Deep Neural Network. 31 | - getDerivative: Get the derivative internally computed by the UMNN. 
32 | - getExpectation: Get the expectation of the PDF internally computed by the UMNN. 33 | """ 34 | 35 | def __init__(self, numberOfInputs, numberOfOutputs, 36 | structureUMNN, stateEmbedding, numberOfSteps, 37 | device='cpu', minAtar=False): 38 | """ 39 | GOAL: Defining and initializing the Deep Neural Network. 40 | 41 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 42 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 43 | - structureUMNN: Structure of the UMNN for distribution representation. 44 | - stateEmbedding: Dimension of the state embedding. 45 | - numberOfSteps: Number of integration steps for the UMNN. 46 | - device: Hardware device (CPU or GPU). 47 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 48 | 49 | OUTPUTS: / 50 | """ 51 | 52 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 53 | super(UMDQN_C_Model_Atari, self).__init__() 54 | 55 | # Initialization of the Deep Neural Network 56 | if minAtar: 57 | self.stateEmbeddingDNN = DNN_MinAtar(numberOfInputs, stateEmbedding) 58 | else: 59 | self.stateEmbeddingDNN = DNN_Atari(numberOfInputs, stateEmbedding) 60 | self.UMNN = MonotonicNN(stateEmbedding+1, structureUMNN, numberOfSteps, numberOfOutputs, device) 61 | 62 | 63 | def forward(self, state, q): 64 | """ 65 | GOAL: Implementing the forward pass of the Deep Neural Network. 66 | 67 | INPUTS: - state: RL state. 68 | - q: Samples of potential returns. 69 | 70 | OUTPUTS: - output: Output of the Deep Neural Network. 71 | """ 72 | 73 | # State embedding part of the Deep Neural Network 74 | batchSize = state.size(0) 75 | x = self.stateEmbeddingDNN(state) 76 | x = x.repeat(1, int(len(q)/len(state))).view(-1, x.size(1)) 77 | 78 | # UMNNN part of the Deep Neural Network 79 | x = self.UMNN(q, x) 80 | 81 | # Sigmoid activation function + appropriate format 82 | x = torch.sigmoid(x) 83 | return torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 84 | 85 | 86 | def getDerivative(self, state, q): 87 | """ 88 | GOAL: Get the derivative internally computed by the UMNN. 89 | 90 | INPUTS: - state: RL state. 91 | - q: Samples of potential returns. 92 | 93 | OUTPUTS: - output: Derivative internally computed by the UMNN. 94 | """ 95 | 96 | # State embedding part of the Deep Neural Network 97 | batchSize = state.size(0) 98 | x = self.stateEmbeddingDNN(state) 99 | x = x.repeat(1, int(len(q)/len(state))).view(-1, x.size(1)) 100 | 101 | # Computation of both PDF and CDF 102 | pdf = self.UMNN(q, x, only_derivative=True) 103 | cdf = self.UMNN(q, x, only_derivative=False) 104 | 105 | # Correction of the sigmoid + appropriate format 106 | x = torch.sigmoid(cdf) 107 | x = x * (1 - x) * pdf 108 | return torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 109 | 110 | 111 | def getExpectation(self, state, minReturn, maxReturn, numberOfPoints): 112 | """ 113 | GOAL: Get the expectation of the PDF internally computed by the UMNN. 114 | 115 | INPUTS: - state: RL state. 116 | - minReturn: Minimum return. 117 | - maxReturn: Maximum return. 118 | - numberOfPoints: Number of points for the computations (accuracy). 119 | 120 | OUTPUTS: - expectation: Expectation computed. 
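        EXAMPLE: Sketch of how Q-values are recovered from the learned return
                 distributions (model, state, return bounds and accuracy are
                 hypothetical):

                     QValues = model.getExpectation(state, -10, 10, 1000)
                     # state: batch of Atari observations, shape (32, 4, 84, 84)
                     # QValues.shape == (32, numberOfActions); the greedy action
                     # is QValues.argmax(dim=1).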
121 | """ 122 | 123 | # State embedding part of the Deep Neural Network 124 | state = self.stateEmbeddingDNN(state) 125 | 126 | # Computation of the expectation of the PDF internally computed by the UMNN 127 | expectation = self.UMNN.expectation(state, lambda x: x, lambda x: torch.sigmoid(x)*(1-torch.sigmoid(x)), minReturn, maxReturn, numberOfPoints) 128 | return expectation 129 | 130 | -------------------------------------------------------------------------------- /Models/UMDQN_KL_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.FeedforwardDNN import FeedForwardDNN 13 | from Models.MonotonicNN import OneDimensionnalNF 14 | 15 | 16 | 17 | ############################################################################### 18 | ############################ Class UMDQN_KL_Model ############################# 19 | ############################################################################### 20 | 21 | class UMDQN_KL_Model(nn.Module): 22 | """ 23 | GOAL: Implementing the DL model for the UMDQN-KL distributional RL algorithm. 24 | 25 | VARIABLES: - stateEmbeddingDNN: State embedding part of the Deep Neural Network. 26 | - UMNN: UMNN part of the Deep Neural Network. 27 | 28 | METHODS: - __init__: Initialization of the Deep Neural Network. 29 | - forward: Forward pass of the Deep Neural Network. 30 | - getExpectation: Get the expectation of the PDF approximated by the UMNN. 31 | """ 32 | 33 | def __init__(self, numberOfInputs, numberOfOutputs, 34 | structureDNN, structureUMNN, stateEmbedding, 35 | numberOfSteps, device='cpu'): 36 | """ 37 | GOAL: Defining and initializing the Deep Neural Network. 38 | 39 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 40 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 41 | - structureDNN: Structure of the feedforward DNN for state embedding. 42 | - structureUMNN: Structure of the UMNN for distribution representation. 43 | - stateEmbedding: Dimension of the state embedding. 44 | - numberOfSteps: Number of integration steps for the UMNN. 45 | - device: Hardware device (CPU or GPU). 46 | 47 | OUTPUTS: / 48 | """ 49 | 50 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 51 | super(UMDQN_KL_Model, self).__init__() 52 | 53 | # Initialization of the Deep Neural Network 54 | self.stateEmbeddingDNN = FeedForwardDNN(numberOfInputs, stateEmbedding, structureDNN) 55 | self.UMNN = OneDimensionnalNF(stateEmbedding+1, structureUMNN, numberOfSteps, numberOfOutputs, device) 56 | 57 | 58 | def forward(self, state, q): 59 | """ 60 | GOAL: Implementing the forward pass of the Deep Neural Network. 61 | 62 | INPUTS: - state: RL state. 63 | - q: Samples of potential returns. 64 | 65 | OUTPUTS: - output: Output of the Deep Neural Network. 
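        EXAMPLE: Shape sketch with hypothetical dimensions; the call mirrors
                 UMDQN_C_Model.forward, but the output contains PDF values
                 (the exponential of the log-density computed by the normalising
                 flow, clamped to a minimum of 1e-6) rather than CDF values:

                     pdf = model(state, q)
                     # state: (32, 4), q: (32 * 51, 1)
                     # pdf.shape == (32 * numberOfActions, 51)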
66 | """ 67 | 68 | # State embedding part of the Deep Neural Network 69 | batchSize = state.size(0) 70 | x = self.stateEmbeddingDNN(state) 71 | x = x.repeat(1, int(len(q)/len(state))).view(-1, x.size(1)) 72 | 73 | # UMNN part of the Deep Neural Network 74 | x = self.UMNN(q, x) 75 | 76 | # Formatting of the output and post processing operations 77 | x = torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 78 | x = torch.exp(x) 79 | x = x.clamp(min=1e-6) 80 | 81 | return x 82 | 83 | 84 | def getExpectation(self, state, minReturn, maxReturn, numberOfPoints): 85 | """ 86 | GOAL: Get the expectation of the PDF internally computed by the UMNN. 87 | 88 | INPUTS: - state: RL state. 89 | - minReturn: Minimum return. 90 | - maxReturn: Maximum return. 91 | - numberOfPoints: Number of points for the computations (accuracy). 92 | 93 | OUTPUTS: - expectation: Expectation computed. 94 | """ 95 | 96 | # State embedding part of the Deep Neural Network 97 | state = self.stateEmbeddingDNN(state) 98 | 99 | # Computation of the expectation of the PDF internally computed by the UMNN 100 | expectation = self.UMNN.expectation(state, lambda x: x, minReturn, maxReturn, numberOfPoints) 101 | return expectation 102 | -------------------------------------------------------------------------------- /Models/UMDQN_KL_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.DNN_Atari import DNN_Atari 13 | from Models.DNN_MinAtar import DNN_MinAtar 14 | from Models.MonotonicNN import OneDimensionnalNF 15 | 16 | 17 | 18 | ############################################################################### 19 | ######################### Class UMDQN_KL_Model_Atari ########################## 20 | ############################################################################### 21 | 22 | class UMDQN_KL_Model_Atari(nn.Module): 23 | """ 24 | GOAL: Implementing the DL model for the UMDQN-KL distributional RL algorithm. 25 | 26 | VARIABLES: - stateEmbeddingDNN: State embedding part of the Deep Neural Network. 27 | - UMNN: UMNN part of the Deep Neural Network. 28 | 29 | METHODS: - __init__: Initialization of the Deep Neural Network. 30 | - forward: Forward pass of the Deep Neural Network. 31 | - getExpectation: Get the expectation of the PDF approximated by the UMNN. 32 | """ 33 | 34 | def __init__(self, numberOfInputs, numberOfOutputs, 35 | structureUMNN, stateEmbedding, numberOfSteps, 36 | device='cpu', minAtar=False): 37 | """ 38 | GOAL: Defining and initializing the Deep Neural Network. 39 | 40 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 41 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 42 | - structureUMNN: Structure of the UMNN for distribution representation. 43 | - stateEmbedding: Dimension of the state embedding. 44 | - numberOfSteps: Number of integration steps for the UMNN. 45 | - device: Hardware device (CPU or GPU). 46 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 
47 | 48 | OUTPUTS: / 49 | """ 50 | 51 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 52 | super(UMDQN_KL_Model_Atari, self).__init__() 53 | 54 | # Initialization of the Deep Neural Network 55 | if minAtar: 56 | self.stateEmbeddingDNN = DNN_MinAtar(numberOfInputs, stateEmbedding) 57 | else: 58 | self.stateEmbeddingDNN = DNN_Atari(numberOfInputs, stateEmbedding) 59 | self.UMNN = OneDimensionnalNF(stateEmbedding+1, structureUMNN, numberOfSteps, numberOfOutputs, device) 60 | 61 | 62 | def forward(self, state, q): 63 | """ 64 | GOAL: Implementing the forward pass of the Deep Neural Network. 65 | 66 | INPUTS: - state: RL state. 67 | - q: Samples of potential returns. 68 | 69 | OUTPUTS: - output: Output of the Deep Neural Network. 70 | """ 71 | 72 | # State embedding part of the Deep Neural Network 73 | batchSize = state.size(0) 74 | x = self.stateEmbeddingDNN(state) 75 | x = x.repeat(1, int(len(q)/len(state))).view(-1, x.size(1)) 76 | 77 | # UMNN part of the Deep Neural Network 78 | x = self.UMNN(q, x) 79 | 80 | # Formatting of the output and post processing operations 81 | x = torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 82 | x = torch.exp(x) 83 | x = x.clamp(min=1e-6) 84 | 85 | return x 86 | 87 | 88 | def getExpectation(self, state, minReturn, maxReturn, numberOfPoints): 89 | """ 90 | GOAL: Get the expectation of the PDF internally computed by the UMNN. 91 | 92 | INPUTS: - state: RL state. 93 | - minReturn: Minimum return. 94 | - maxReturn: Maximum return. 95 | - numberOfPoints: Number of points for the computations (accuracy). 96 | 97 | OUTPUTS: - expectation: Expectation computed. 98 | """ 99 | 100 | # State embedding part of the Deep Neural Network 101 | state = self.stateEmbeddingDNN(state) 102 | 103 | # Computation of the expectation of the PDF internally computed by the UMNN 104 | expectation = self.UMNN.expectation(state, lambda x: x, minReturn, maxReturn, numberOfPoints) 105 | return expectation 106 | -------------------------------------------------------------------------------- /Models/UMDQN_W_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.FeedforwardDNN import FeedForwardDNN 13 | from Models.MonotonicNN import MonotonicNN 14 | 15 | 16 | 17 | ############################################################################### 18 | ############################### Class UMDQN_W_Model ########################### 19 | ############################################################################### 20 | 21 | class UMDQN_W_Model(nn.Module): 22 | """ 23 | GOAL: Implementing the DL model for the UMDQN-W distributional RL algorithm. 24 | 25 | VARIABLES: - stateEmbeddingDNN: State embedding part of the Deep Neural Network. 26 | - UMNN: UMNN part of the Deep Neural Network. 27 | 28 | METHODS: - __init__: Initialization of the Deep Neural Network. 29 | - forward: Forward pass of the Deep Neural Network. 
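    EXAMPLE: Shape sketch with hypothetical dimensions; the UMNN here models
             the quantile function (inverse CDF) of the random return, which
             is monotonically increasing in tau by construction:

                 quantiles = model(state, taus)
                 # state: (32, 4), taus: (32 * 51, 1) with values in [0, 1]
                 # quantiles.shape == (32 * numberOfActions, 51)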
30 | """ 31 | 32 | def __init__(self, numberOfInputs, numberOfOutputs, 33 | structureDNN, structureUMNN, stateEmbedding, 34 | numberOfSteps, device='cpu'): 35 | """ 36 | GOAL: Defining and initializing the Deep Neural Network. 37 | 38 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 39 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 40 | - structureDNN: Structure of the feedforward DNN for state embedding. 41 | - structureUMNN: Structure of the UMNN for distribution representation. 42 | - stateEmbedding: Dimension of the state embedding. 43 | - numberOfSteps: Number of integration steps for the UMNN. 44 | - device: Hardware device (CPU or GPU). 45 | 46 | OUTPUTS: / 47 | """ 48 | 49 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 50 | super(UMDQN_W_Model, self).__init__() 51 | 52 | # Initialization of the Deep Neural Network 53 | self.stateEmbeddingDNN = FeedForwardDNN(numberOfInputs, stateEmbedding, structureDNN) 54 | self.UMNN = MonotonicNN(stateEmbedding+1, structureUMNN, numberOfSteps, numberOfOutputs, device) 55 | 56 | 57 | def forward(self, state, taus): 58 | """ 59 | GOAL: Implementing the forward pass of the Deep Neural Network. 60 | 61 | INPUTS: - state: RL state. 62 | - taus: Samples of taus. 63 | 64 | OUTPUTS: - output: Output of the Deep Neural Network. 65 | """ 66 | 67 | # State embedding part of the Deep Neural Network 68 | batchSize = state.size(0) 69 | x = self.stateEmbeddingDNN(state) 70 | x = x.repeat(1, int(len(taus)/len(state))).view(-1, x.size(1)) 71 | 72 | # UMNN part of the Deep Neural Network 73 | x = self.UMNN(taus, x) 74 | 75 | # Appropriate format 76 | return torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 77 | -------------------------------------------------------------------------------- /Models/UMDQN_W_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.DNN_Atari import DNN_Atari 13 | from Models.DNN_MinAtar import DNN_MinAtar 14 | from Models.MonotonicNN import MonotonicNN 15 | 16 | 17 | 18 | ############################################################################### 19 | ########################### Class UMDQN_W_Model_Atari ######################### 20 | ############################################################################### 21 | 22 | class UMDQN_W_Model_Atari(nn.Module): 23 | """ 24 | GOAL: Implementing the DL model for the UMDQN-W distributional RL algorithm. 25 | 26 | VARIABLES: - stateEmbeddingDNN: State embedding part of the Deep Neural Network. 27 | - UMNN: UMNN part of the Deep Neural Network. 28 | 29 | METHODS: - __init__: Initialization of the Deep Neural Network. 30 | - forward: Forward pass of the Deep Neural Network. 31 | """ 32 | 33 | def __init__(self, numberOfInputs, numberOfOutputs, 34 | structureUMNN, stateEmbedding, 35 | numberOfSteps, device='cpu', minAtar=False): 36 | """ 37 | GOAL: Defining and initializing the Deep Neural Network. 38 | 39 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 40 | - numberOfOutputs: Number of outputs of the Deep Neural Network.
41 | - structureUMNN: Structure of the UMNN for distribution representation. 42 | - stateEmbedding: Dimension of the state embedding. 43 | - numberOfSteps: Number of integration steps for the UMNN. 44 | - device: Hardware device (CPU or GPU). 45 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 46 | 47 | OUTPUTS: / 48 | """ 49 | 50 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 51 | super(UMDQN_W_Model_Atari, self).__init__() 52 | 53 | # Initialization of the Deep Neural Network 54 | if minAtar: 55 | self.stateEmbeddingDNN = DNN_MinAtar(numberOfInputs, stateEmbedding) 56 | else: 57 | self.stateEmbeddingDNN = DNN_Atari(numberOfInputs, stateEmbedding) 58 | self.UMNN = MonotonicNN(stateEmbedding+1, structureUMNN, numberOfSteps, numberOfOutputs, device) 59 | 60 | 61 | def forward(self, state, taus): 62 | """ 63 | GOAL: Implementing the forward pass of the Deep Neural Network. 64 | 65 | INPUTS: - state: RL state. 66 | - taus: Samples of taus. 67 | 68 | OUTPUTS: - output: Output of the Deep Neural Network. 69 | """ 70 | 71 | # State embedding part of the Deep Neural Network 72 | batchSize = state.size(0) 73 | x = self.stateEmbeddingDNN(state) 74 | x = x.repeat(1, int(len(taus)/len(state))).view(-1, x.size(1)) 75 | 76 | # UMNN part of the Deep Neural Network 77 | x = self.UMNN(taus, x) 78 | 79 | # Appropriate format 80 | return torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 81 | -------------------------------------------------------------------------------- /Models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/Models/__init__.py -------------------------------------------------------------------------------- /MonteCarloDistributions.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import pandas as pd 8 | 9 | from matplotlib import pyplot as plt 10 | from matplotlib import rc 11 | rc('text', usetex=True) 12 | 13 | 14 | 15 | ############################################################################### 16 | ################################ Global variables ############################# 17 | ############################################################################### 18 | 19 | # Default parameters for the plotting of the distributions 20 | numberOfSamples = 10000000 21 | bins = 1000 22 | density = True 23 | plotRange = (-2.1, 2.1) 24 | histtype = 'step' 25 | 26 | 27 | 28 | ############################################################################### 29 | ####################### Class MonteCarloDistributions ######################### 30 | ############################################################################### 31 | 32 | class MonteCarloDistributions(): 33 | """ 34 | GOAL: Implementing a technique based on Monte Carlo to estimate the true 35 | distribution of the random return associated with an environment and a policy. 36 | 37 | VARIABLES: - environment: Environment analysed. 38 | - policy: Policy analysed. 39 | - gamma: Discount factor. 40 | 41 | METHODS: - __init__: Initialization of the class.
42 | - samplingMonteCarlo: Generate MC samples of the random return. 43 | - plotDistributions: Plot the distributions from the MC samples. 44 | """ 45 | 46 | def __init__(self, environment, policy, gamma): 47 | """ 48 | GOAL: Perform the initialization of the class. 49 | 50 | INPUTS: - environment: Environment analysed. 51 | - policy: Policy analysed. 52 | - gamma: Discount factor. 53 | 54 | OUTPUTS: / 55 | """ 56 | 57 | # Initialization of important variables 58 | self.environment = environment 59 | self.policy = policy 60 | self.gamma = gamma 61 | 62 | 63 | def samplingMonteCarlo(self, initialState, initialAction, numberOfSamples=numberOfSamples): 64 | """ 65 | GOAL: Collect Monte Carlo samples of the random return associated 66 | with the state and action specified. 67 | 68 | INPUTS: - initialState: RL state to start from. 69 | - initialAction: RL action to start from. 70 | 71 | - numberOfSamples: Number of Monte Carlo samples to collect. 72 | 73 | OUTPUTS: - samples: Monte Carlo samples collected. 74 | """ 75 | 76 | # Initialization of the memory storing the MC samples 77 | samples = [] 78 | 79 | # Generation of the MC samples 80 | for _i in range(numberOfSamples): 81 | 82 | # Initialization of some variables 83 | expectedReturn = 0 84 | step = 0 85 | 86 | # Reset of the environment and initialization to the desired state 87 | self.environment.reset() 88 | state = self.environment.setState(initialState) 89 | 90 | # Execution of the action specified 91 | nextState, reward, done, info = self.environment.step(initialAction) 92 | 93 | # Update of the expected return 94 | expectedReturn += (reward * (self.gamma**step)) 95 | step += 1 96 | 97 | # Loop until episode termination 98 | while done == 0: 99 | 100 | # Execute the next action according to the policy selected 101 | state = self.policy.processState(nextState) 102 | policyAction = self.policy.chooseAction(state, plot=False) 103 | nextState, reward, done, info = self.environment.step(policyAction) 104 | 105 | # Update of the expected return 106 | expectedReturn += (reward * (self.gamma**step)) 107 | step += 1 108 | 109 | # Add the MC sample to the memory 110 | samples.append(expectedReturn) 111 | 112 | # Output the MC samples collected 113 | return samples 114 | 115 | 116 | def plotDistributions(self, state, numberOfSamples=numberOfSamples): 117 | """ 118 | GOAL: Plot the PDF, CDF and QF of the random return estimated from 119 | Monte Carlo samples, for each action available in the state specified. 120 | 121 | INPUTS: - state: RL state to start from. 122 | - numberOfSamples: Number of Monte Carlo samples to collect.
123 | 124 | OUTPUTS: / 125 | """ 126 | 127 | # Generation of the Monte Carlo samples 128 | samples = [] 129 | actions = 4 130 | for action in range(actions): 131 | samples.append(self.samplingMonteCarlo(state, action, numberOfSamples)) 132 | 133 | # Initialization of the figure 134 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 135 | fig = plt.figure() 136 | 137 | # Plotting of the PDF of the random return 138 | ax1 = plt.subplot(3, 1, 1) 139 | for action in range(actions): 140 | plt.hist(samples[action], bins=bins, density=density, range=plotRange, histtype=histtype, color=colors[action]) 141 | ax1.set_xlabel('Random return') 142 | ax1.set_ylabel('PDF') 143 | ax1.set(xlim=(-2, 2)) 144 | 145 | # Plotting of the CDF of the random return 146 | ax2 = plt.subplot(3, 1, 2) 147 | for action in range(actions): 148 | plt.hist(samples[action], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color=colors[action]) 149 | ax2.set_xlabel('Random return') 150 | ax2.set_ylabel('CDF') 151 | ax2.set(xlim=(-2, 2)) 152 | 153 | # Plotting of the QF of the random return 154 | ax3 = plt.subplot(3, 1, 3) 155 | CDF0 = plt.hist(samples[0], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 156 | CDF1 = plt.hist(samples[1], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 157 | CDF2 = plt.hist(samples[2], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 158 | CDF3 = plt.hist(samples[3], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 159 | ax3.clear() 160 | ax3.plot(CDF0[0], CDF0[1][1:], color=colors[0]) 161 | ax3.plot(CDF1[0], CDF1[1][1:], color=colors[1]) 162 | ax3.plot(CDF2[0], CDF2[1][1:], color=colors[2]) 163 | ax3.plot(CDF3[0], CDF3[1][1:], color=colors[3]) 164 | ax3.set_xlabel('Quantile fraction') 165 | ax3.set_ylabel('QF') 166 | ax3.set(xlim=(0, 1)) 167 | ax3.legend(['Move right', 'Move down', 'Move left', 'Move up']) 168 | 169 | # Saving of the figure generated 170 | plt.savefig("Figures/Distributions/MonteCarloDistributions.pdf", format='pdf') 171 | 172 | # Generation of the figure for the PDF of the random return 173 | fig = plt.figure(figsize=(10, 4)) 174 | ax1 = plt.subplot(1, 1, 1) 175 | for action in range(actions): 176 | plt.hist(samples[action], bins=bins, density=density, range=plotRange, histtype=histtype, color=colors[action]) 177 | ax1.set_xlabel('Random return') 178 | ax1.set_ylabel('PDF') 179 | ax1.set(xlim=(-0.5, 1.5), ylim=(0, 3.5)) 180 | plt.savefig("Figures/Distributions/MonteCarloDistributionsPDF.pdf", format='pdf') 181 | # Generation of the figure for the CDF of the random return 182 | fig = plt.figure(figsize=(10, 4)) 183 | ax2 = plt.subplot(1, 1, 1) 184 | for action in range(actions): 185 | plt.hist(samples[action], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color=colors[action]) 186 | ax2.set_xlabel('Random return') 187 | ax2.set_ylabel('CDF') 188 | ax2.set(xlim=(-0.5, 1.5), ylim=(-0.1, 1.1)) 189 | plt.savefig("Figures/Distributions/MonteCarloDistributionsCDF.pdf", format='pdf') 190 | # Generation of the figure for the QF of the random return 191 | fig = plt.figure(figsize=(10, 4)) 192 | ax3 = plt.subplot(1, 1, 1) 193 | CDF0 = plt.hist(samples[0], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 194 | CDF1 = plt.hist(samples[1], bins=bins, density=density, 
range=plotRange, histtype=histtype, cumulative=True, color='white') 195 | CDF2 = plt.hist(samples[2], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 196 | CDF3 = plt.hist(samples[3], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 197 | ax3.clear() 198 | ax3.plot(CDF0[0], CDF0[1][1:], color=colors[0]) 199 | ax3.plot(CDF1[0], CDF1[1][1:], color=colors[1]) 200 | ax3.plot(CDF2[0], CDF2[1][1:], color=colors[2]) 201 | ax3.plot(CDF3[0], CDF3[1][1:], color=colors[3]) 202 | ax3.set_xlabel('Quantile fraction') 203 | ax3.set_ylabel('QF') 204 | ax3.set(xlim=(0, 1), ylim=(-0.5, 1.5)) 205 | plt.savefig("Figures/Distributions/MonteCarloDistributionsQF.pdf", format='pdf') 206 | 207 | # Saving of the data into external files 208 | PDF0 = plt.hist(samples[0], bins=bins, density=density, range=plotRange, histtype=histtype, color='white') 209 | PDF1 = plt.hist(samples[1], bins=bins, density=density, range=plotRange, histtype=histtype, color='white') 210 | PDF2 = plt.hist(samples[2], bins=bins, density=density, range=plotRange, histtype=histtype, color='white') 211 | PDF3 = plt.hist(samples[3], bins=bins, density=density, range=plotRange, histtype=histtype, color='white') 212 | CDF0 = plt.hist(samples[0], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 213 | CDF1 = plt.hist(samples[1], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 214 | CDF2 = plt.hist(samples[2], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 215 | CDF3 = plt.hist(samples[3], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 216 | dataPDF = { 217 | 'Action0_x': PDF0[1][1:], 218 | 'Action0_y': PDF0[0], 219 | 'Action1_x': PDF1[1][1:], 220 | 'Action1_y': PDF1[0], 221 | 'Action2_x': PDF2[1][1:], 222 | 'Action2_y': PDF2[0], 223 | 'Action3_x': PDF3[1][1:], 224 | 'Action3_y': PDF3[0], 225 | } 226 | dataCDF = { 227 | 'Action0_x': CDF0[1][1:], 228 | 'Action0_y': CDF0[0], 229 | 'Action1_x': CDF1[1][1:], 230 | 'Action1_y': CDF1[0], 231 | 'Action2_x': CDF2[1][1:], 232 | 'Action2_y': CDF2[0], 233 | 'Action3_x': CDF3[1][1:], 234 | 'Action3_y': CDF3[0], 235 | } 236 | dataQF = { 237 | 'Action0_y': CDF0[1][1:], 238 | 'Action0_x': CDF0[0], 239 | 'Action1_y': CDF1[1][1:], 240 | 'Action1_x': CDF1[0], 241 | 'Action2_y': CDF2[1][1:], 242 | 'Action2_x': CDF2[0], 243 | 'Action3_y': CDF3[1][1:], 244 | 'Action3_x': CDF3[0], 245 | } 246 | dataframePDF = pd.DataFrame(dataPDF) 247 | dataframeCDF = pd.DataFrame(dataCDF) 248 | dataframeQF = pd.DataFrame(dataQF) 249 | dataframePDF.to_csv('Figures/Distributions/MonteCarloPDF.csv') 250 | dataframeCDF.to_csv('Figures/Distributions/MonteCarloCDF.csv') 251 | dataframeQF.to_csv('Figures/Distributions/MonteCarloQF.csv') 252 | -------------------------------------------------------------------------------- /Parameters/parameters_CDQN_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "numberOfAtoms": 51, 20 | "minReturn": -5, 21 | "maxReturn": 5, 22 | 23 | 
"rewardClipping": 1, 24 | "gradientClipping": 1, 25 | 26 | "atari": 1, 27 | "minatar": 0, 28 | 29 | "GPUNumber": 0 30 | } -------------------------------------------------------------------------------- /Parameters/parameters_CDQN_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128, 128], 19 | "numberOfAtoms": 51, 20 | "minReturn": -10, 21 | "maxReturn": 110, 22 | 23 | "rewardClipping": 1000, 24 | "gradientClipping": 1, 25 | 26 | "atari": 0, 27 | "minatar": 0, 28 | 29 | "GPUNumber": 0 30 | } -------------------------------------------------------------------------------- /Parameters/parameters_CDQN_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "numberOfAtoms": 51, 20 | "minReturn": -1, 21 | "maxReturn": 10, 22 | 23 | "rewardClipping": 1, 24 | "gradientClipping": 1, 25 | 26 | "atari": 0, 27 | "minatar": 1, 28 | 29 | "GPUNumber": 0 30 | } -------------------------------------------------------------------------------- /Parameters/parameters_CDQN_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128, 128], 19 | "numberOfAtoms": 51, 20 | "minReturn": -2, 21 | "maxReturn": 2, 22 | 23 | "rewardClipping": 2, 24 | "gradientClipping": 1, 25 | 26 | "atari": 0, 27 | "minatar": 0, 28 | 29 | "GPUNumber": 0 30 | } -------------------------------------------------------------------------------- /Parameters/parameters_DQN_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | 20 | "rewardClipping": 1, 21 | "gradientClipping": 1, 22 | 23 | "atari": 1, 24 | "minatar": 0, 25 | 26 | "GPUNumber": 0 27 | } -------------------------------------------------------------------------------- /Parameters/parameters_DQN_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | "learningRate": 0.0001, 4 | 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 
15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128, 128], 19 | 20 | "rewardClipping": 1000, 21 | "gradientClipping": 1, 22 | 23 | "atari": 0, 24 | "minatar": 0, 25 | 26 | "GPUNumber": 0 27 | } -------------------------------------------------------------------------------- /Parameters/parameters_DQN_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | 20 | "rewardClipping": 1, 21 | "gradientClipping": 1, 22 | 23 | "atari": 0, 24 | "minatar": 1, 25 | 26 | "GPUNumber": 0 27 | } -------------------------------------------------------------------------------- /Parameters/parameters_DQN_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | "learningRate": 0.0001, 4 | 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128, 128], 19 | 20 | "rewardClipping": 2, 21 | "gradientClipping": 1, 22 | 23 | "atari": 0, 24 | "minatar": 0, 25 | 26 | "GPUNumber": 0 27 | } -------------------------------------------------------------------------------- /Parameters/parameters_FQF_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 512, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 1, 25 | "gradientClipping": 1, 26 | 27 | "atari": 1, 28 | "minatar": 0, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_FQF_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 128, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 1000, 25 | "gradientClipping": 1, 26 | 27 | "atari": 0, 28 | "minatar": 0, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_FQF_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 
1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 128, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 1, 25 | "gradientClipping": 1, 26 | 27 | "atari": 0, 28 | "minatar": 1, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_FQF_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 128, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 2, 25 | "gradientClipping": 1, 26 | 27 | "atari": 0, 28 | "minatar": 0, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_IQN_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 512, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 1, 25 | "gradientClipping": 1, 26 | 27 | "atari": 1, 28 | "minatar": 0, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_IQN_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 128, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 1000, 25 | "gradientClipping": 1, 26 | 27 | "atari": 0, 28 | "minatar": 0, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_IQN_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 128, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 1, 25 | "gradientClipping": 1, 26 | 27 | "atari": 0, 28 | "minatar": 1, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_IQN_StochasticGridWorld.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 128, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 2, 25 | "gradientClipping": 1, 26 | 27 | "atari": 0, 28 | "minatar": 0, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_QR_DQN_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "numberOfQuantiles": 200, 20 | 21 | "rewardClipping": 1, 22 | "gradientClipping": 1, 23 | 24 | "atari": 1, 25 | "minatar": 0, 26 | 27 | "GPUNumber": 0 28 | } -------------------------------------------------------------------------------- /Parameters/parameters_QR_DQN_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128, 128], 19 | "numberOfQuantiles": 200, 20 | 21 | "rewardClipping": 1000, 22 | "gradientClipping": 1, 23 | 24 | "atari": 0, 25 | "minatar": 0, 26 | 27 | "GPUNumber": 0 28 | } -------------------------------------------------------------------------------- /Parameters/parameters_QR_DQN_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "numberOfQuantiles": 200, 20 | 21 | "rewardClipping": 1, 22 | "gradientClipping": 1, 23 | 24 | "atari": 0, 25 | "minatar": 1, 26 | 27 | "GPUNumber": 0 28 | } -------------------------------------------------------------------------------- /Parameters/parameters_QR_DQN_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128, 128], 19 | "numberOfQuantiles": 200, 20 | 21 | "rewardClipping": 2, 22 | "gradientClipping": 1, 23 | 24 | "atari": 0, 25 | "minatar": 0, 26 | 27 | "GPUNumber": 0 28 | } 
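The epsilonStart, epsilonEnd and epsilonDecay entries in the parameter files above drive the exponential epsilon-greedy exploration schedule implemented by the agents (see the epsilonValue lambda in QR_DQN.py and UMDQN_KL.py further below). A minimal standalone sketch, assuming the StochasticGridWorld values, evaluating that schedule:

```python
# Standalone sketch (not part of the repository): epsilon-greedy schedule defined by the
# epsilonStart/epsilonEnd/epsilonDecay entries, using the same formula as the epsilonValue
# lambda in QR_DQN.py and UMDQN_KL.py below.
import math

# Values assumed from parameters_QR_DQN_StochasticGridWorld.json.
epsilonStart, epsilonEnd, epsilonDecay = 1.0, 0.01, 10000

def epsilonValue(iteration):
    # Exponential decay from epsilonStart towards epsilonEnd.
    return epsilonEnd + (epsilonStart - epsilonEnd) * math.exp(-iteration / epsilonDecay)

for iteration in [0, 10000, 50000]:
    print(iteration, round(epsilonValue(iteration), 4))
# Expected output: 0 -> 1.0, 10000 -> 0.3742, 50000 -> 0.0167
```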
-------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_C_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 512, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -5, 25 | "maxReturn": 5, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 1, 30 | "gradientClipping": 1, 31 | 32 | "atari": 1, 33 | "minatar": 0, 34 | 35 | "GPUNumber": 0 36 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_C_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -10, 25 | "maxReturn": 110, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 1000, 30 | "gradientClipping": 1, 31 | 32 | "atari": 0, 33 | "minatar": 0, 34 | 35 | "GPUNumber": 0 36 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_C_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -1, 25 | "maxReturn": 10, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 1, 30 | "gradientClipping": 1, 31 | 32 | "atari": 0, 33 | "minatar": 1, 34 | 35 | "GPUNumber": 0 36 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_C_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -2, 25 | "maxReturn": 2, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 2, 30 | "gradientClipping": 1, 31 | 32 | "atari": 0, 33 | "minatar": 0, 34 | 35 | "GPUNumber": 0 36 | } 
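The minReturn, maxReturn and numberOfSamples entries of the UMDQN-C parameter files above define the interval and the number of points on which the learned return distribution is evaluated. A minimal sketch, assuming the StochasticGridWorld values, of the discrete support construction mirroring the linspace calls in UMDQN_KL.py further below:

```python
# Standalone sketch (not part of the repository): discrete support of the random return
# defined by minReturn/maxReturn/numberOfSamples, as built in UMDQN_KL.py below.
import torch

# Values assumed from parameters_UMDQN_C_StochasticGridWorld.json.
minReturn, maxReturn, numberOfSamples = -2.0, 2.0, 200

support = torch.linspace(minReturn, maxReturn, numberOfSamples)   # 200 evenly spaced return values
deltaSupport = (support[1] - support[0]).item()                   # spacing between support points
uniformProba = 1.0 / (maxReturn - minReturn)                      # density of a uniform draw on the interval

print(support.shape, round(deltaSupport, 4), uniformProba)        # torch.Size([200]) 0.0201 0.25
```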
-------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_KL_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 512, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -5, 25 | "maxReturn": 5, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 1, 30 | "gradientClipping": 1, 31 | 32 | "atari": 1, 33 | "minatar": 0, 34 | 35 | "GPUNumber": 0 36 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_KL_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -10, 25 | "maxReturn": 110, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 1000, 30 | "gradientClipping": 1, 31 | 32 | "atari": 0, 33 | "minatar": 0, 34 | 35 | "GPUNumber": 0 36 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_KL_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -1, 25 | "maxReturn": 10, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 1, 30 | "gradientClipping": 1, 31 | 32 | "atari": 0, 33 | "minatar": 1, 34 | 35 | "GPUNumber": 0 36 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_KL_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -2, 25 | "maxReturn": 2, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 2, 30 | "gradientClipping": 1, 31 | 32 | "atari": 0, 33 | "minatar": 0, 34 | 35 | "GPUNumber": 0 36 | } 
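For UMDQN-KL, the PDF values produced by the network on this support are turned into Q-values with a Monte Carlo estimate of the expectation E[Z] = ∫ z p(z) dz under uniform sampling of the support, matching the line `QValues = (pdfs * self.supportTorch).sum(1)/(self.numberOfSamples*self.uniformProba)` in UMDQN_KL.py further below. A self-contained sketch with hypothetical Gaussian PDFs standing in for the network output (illustrative only, not from the repository):

```python
# Standalone sketch (not part of the repository): expectation of the random return estimated
# from PDF evaluations on uniformly spaced support points, as in UMDQN_KL.chooseAction.
import math
import torch

numberOfSamples, minReturn, maxReturn = 200, -2.0, 2.0
support = torch.linspace(minReturn, maxReturn, numberOfSamples)
uniformProba = 1.0 / (maxReturn - minReturn)

# Hypothetical PDFs over the support for two actions: Gaussians with means 0.5 and -0.3.
def gaussianPDF(mean, std):
    return torch.exp(-0.5 * ((support - mean) / std) ** 2) / (std * math.sqrt(2 * math.pi))

pdfs = torch.stack([gaussianPDF(0.5, 0.2), gaussianPDF(-0.3, 0.4)])

# Average of z * p(z) over the samples, divided by the density of the uniform
# distribution they were drawn from: an estimate of E[Z] for each action.
QValues = (pdfs * support).sum(1) / (numberOfSamples * uniformProba)
print(QValues)   # roughly tensor([0.5, -0.3]), i.e. the means of the two distributions
```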
-------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_W_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 512, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | 25 | "rewardClipping": 1, 26 | "gradientClipping": 1, 27 | 28 | "atari": 1, 29 | "minatar": 0, 30 | 31 | "GPUNumber": 0 32 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_W_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | 25 | "rewardClipping": 1000, 26 | "gradientClipping": 1, 27 | 28 | "atari": 0, 29 | "minatar": 0, 30 | 31 | "GPUNumber": 0 32 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_W_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | 25 | "rewardClipping": 1, 26 | "gradientClipping": 1, 27 | 28 | "atari": 0, 29 | "minatar": 1, 30 | 31 | "GPUNumber": 0 32 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_W_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | 25 | "rewardClipping": 2, 26 | "gradientClipping": 1, 27 | 28 | "atari": 0, 29 | "minatar": 0, 30 | 31 | "GPUNumber": 0 32 | } -------------------------------------------------------------------------------- /QR_DQN.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports 
################################### 5 | ############################################################################### 6 | 7 | import math 8 | 9 | from matplotlib import pyplot as plt 10 | 11 | import torch 12 | import torch.optim as optim 13 | 14 | from replayMemory import ReplayMemory 15 | 16 | from Models.QR_DQN_Model import QR_DQN_Model 17 | from Models.QR_DQN_Model_Atari import QR_DQN_Model_Atari 18 | 19 | from DQN import DQN 20 | 21 | 22 | 23 | ############################################################################### 24 | ################################## Class QR_DQN ################################ 25 | ############################################################################### 26 | 27 | class QR_DQN(DQN): 28 | """ 29 | GOAL: Implementing the QR-DQN Deep Reinforcement Learning algorithm. 30 | 31 | VARIABLES: - device: Hardware specification (CPU or GPU). 32 | - gamma: Discount factor of the RL algorithm. 33 | - learningRate: Learning rate of the DL optimizer (ADAM). 34 | - epsilon: Epsilon value for the DL optimizer (ADAM). 35 | - targetNetworkUpdate: Update frequency of the target network. 36 | - learningUpdatePeriod: Frequency of the learning procedure. 37 | - batchSize: Size of the batch to sample from the replay memory. 38 | - capacity: Capacity of the replay memory. 39 | - replayMemory: Experience Replay memory. 40 | - rewardClipping: Clipping of the RL rewards. 41 | - gradientClipping: Clipping of the training loss. 42 | - optimizer: DL optimizer (ADAM). 43 | - epsilonStart: Initial value of epsilon (Epsilon-Greedy). 44 | - epsilonEnd: Final value of epsilon (Epsilon-Greedy). 45 | - epsilonDecay: Exponential decay of epsilon (Epsilon-Greedy). 46 | - epsilonTest: Test value of epsilon (Epsilon-Greedy). 47 | - epsilonValue: Current value of epsilon (Epsilon-Greedy). 48 | - policyNetwork: Deep Neural Network representing the info used by the RL policy. 49 | - targetNetwork: Deep Neural Network representing the target network. 50 | 51 | METHODS: - __init__: Initialization of the RL algorithm. 52 | - chooseAction: Choose a valid action based on the current state 53 | observed, according to the RL policy learned. 54 | - learning: Execute the RL algorithm learning procedure. 55 | """ 56 | 57 | def __init__(self, observationSpace, actionSpace, environment, 58 | parametersFileName='', reporting=True): 59 | """ 60 | GOAL: Initializing the RL agent based on the QR-DQN Deep Reinforcement Learning 61 | algorithm, by setting up the algorithm parameters as well as 62 | the Deep Neural Networks. 63 | 64 | INPUTS: - observationSpace: RL observation space. 65 | - actionSpace: RL action space. 66 | - environment: Name of the RL environment. 67 | - parametersFileName: Name of the JSON parameters file. 68 | - reporting: Enable the reporting of the results. 
69 | 70 | OUTPUTS: / 71 | """ 72 | 73 | # Initialization of the DQN parent class 74 | DQN.__init__(self, observationSpace, actionSpace, environment, parametersFileName, False) 75 | 76 | # Setting of the parameters 77 | if parametersFileName == '': 78 | parametersFileName = ''.join(['Parameters/parameters_QR_DQN_', str(environment), '.json']) 79 | parameters = self.readParameters(parametersFileName) 80 | 81 | # Set the device for DNN computations (CPU or GPU) 82 | self.device = torch.device('cuda:'+str(parameters['GPUNumber']) if torch.cuda.is_available() else 'cpu') 83 | 84 | # Set the general parameters of the RL algorithm 85 | self.gamma = parameters['gamma'] 86 | self.learningRate = parameters['learningRate'] 87 | self.epsilon = parameters['epsilon'] 88 | self.targetUpdatePeriod = parameters['targetUpdatePeriod'] 89 | self.learningUpdatePeriod = parameters['learningUpdatePeriod'] 90 | self.rewardClipping = parameters['rewardClipping'] 91 | self.gradientClipping = parameters['gradientClipping'] 92 | 93 | # Set the Experience Replay mechanism 94 | self.batchSize = parameters['batchSize'] 95 | self.capacity = parameters['capacity'] 96 | self.replayMemory = ReplayMemory(self.capacity) 97 | 98 | # Set the distribution support 99 | self.numberOfQuantiles = parameters['numberOfQuantiles'] 100 | self.quantileProbability = 1./self.numberOfQuantiles 101 | self.tau = ((torch.linspace(0.0, 1.0, self.numberOfQuantiles+1)[:-1] + torch.linspace(0.0, 1.0, self.numberOfQuantiles+1)[1:])/2).to(self.device) 102 | self.kappa = 1.0 103 | 104 | # Set the two Deep Neural Networks of the RL algorithm (policy and target) 105 | self.atari = parameters['atari'] 106 | self.minatar = parameters['minatar'] 107 | if self.atari or self.minatar: 108 | self.policyNetwork = QR_DQN_Model_Atari(observationSpace, actionSpace*self.numberOfQuantiles, self.numberOfQuantiles, minAtar=self.minatar).to(self.device) 109 | self.targetNetwork = QR_DQN_Model_Atari(observationSpace, actionSpace*self.numberOfQuantiles, self.numberOfQuantiles, minAtar=self.minatar).to(self.device) 110 | else: 111 | self.policyNetwork = QR_DQN_Model(observationSpace, actionSpace*self.numberOfQuantiles, parameters['structureDNN'], self.numberOfQuantiles).to(self.device) 112 | self.targetNetwork = QR_DQN_Model(observationSpace, actionSpace*self.numberOfQuantiles, parameters['structureDNN'], self.numberOfQuantiles).to(self.device) 113 | self.targetNetwork.load_state_dict(self.policyNetwork.state_dict()) 114 | 115 | # Set the Deep Learning optimizer 116 | self.optimizer = optim.Adam(self.policyNetwork.parameters(), lr=self.learningRate, eps=self.epsilon) 117 | 118 | # Set the Epsilon-Greedy exploration technique 119 | self.epsilonStart = parameters['epsilonStart'] 120 | self.epsilonEnd = parameters['epsilonEnd'] 121 | self.epsilonDecay = parameters['epsilonDecay'] 122 | self.epsilonTest = parameters['epsilonTest'] 123 | self.epsilonValue = lambda iteration: self.epsilonEnd + (self.epsilonStart - self.epsilonEnd) * math.exp(-1 * iteration / self.epsilonDecay) 124 | 125 | # Initialization of the experiment folder and tensorboard writer 126 | self.initReporting(parameters, 'QR_DQN') 127 | 128 | 129 | def chooseAction(self, state, plot=False): 130 | """ 131 | GOAL: Choose a valid RL action from the action space according to the 132 | RL policy as well as the current RL state observed. 133 | 134 | INPUTS: - state: RL state returned by the environment. 135 | - plot: Enable the plotting of the random returns distributions. 
136 | 137 | OUTPUTS: - action: RL action chosen from the action space. 138 | """ 139 | 140 | # Choose the best action based on the RL policy 141 | with torch.no_grad(): 142 | state = torch.from_numpy(state).float().to(self.device).unsqueeze(0) 143 | quantiles = self.policyNetwork(state).squeeze(0) 144 | QValues = quantiles.mean(1) 145 | _, action = QValues.max(0) 146 | 147 | # If required, plot the return distribution associated with each action 148 | if plot: 149 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 150 | fig = plt.figure() 151 | ax = fig.add_subplot() 152 | tau = self.tau.cpu().numpy() 153 | quantiles = quantiles.cpu().numpy() 154 | QValues = QValues.cpu().numpy() 155 | for a in range(self.actionSpace): 156 | ax.plot(tau, quantiles[a], linestyle='-', label=''.join(['Action ', str(a), ' random return Z']), color=colors[a]) 157 | ax.axhline(y=QValues[a], linewidth=2, linestyle='--', label=''.join(['Action ', str(a), ' expected return Q']), color=colors[a]) 158 | ax.set_xlabel('Quantile fraction') 159 | ax.set_ylabel('Quantile Function (QF)') 160 | ax.legend() 161 | plt.show() 162 | 163 | return action.item() 164 | 165 | 166 | def learning(self): 167 | """ 168 | GOAL: Sample a batch of past experiences and learn from it 169 | by updating the Reinforcement Learning policy. 170 | 171 | INPUTS: / 172 | 173 | OUTPUTS: - loss: Loss of the learning procedure. 174 | """ 175 | 176 | # Check that the replay memory is filled enough 177 | if (len(self.replayMemory) >= self.batchSize): 178 | 179 | # Sample a batch of experiences from the replay memory 180 | batch = self.dataLoaderIter.next() 181 | state = batch[0].float().to(self.device) 182 | action = batch[1].long().to(self.device) 183 | reward = batch[2].float().to(self.device) 184 | nextState = batch[3].float().to(self.device) 185 | done = batch[4].float().to(self.device) 186 | 187 | # Computation of the current return distribution 188 | quantiles = self.policyNetwork(state) 189 | action = action.view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.numberOfQuantiles) 190 | quantiles = quantiles.gather(1, action).squeeze(1) 191 | 192 | # Computation of the new distribution to be learnt by the policy DNN 193 | with torch.no_grad(): 194 | nextQuantiles = self.targetNetwork(nextState) 195 | nextAction = nextQuantiles.mean(2).max(1)[1].view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.numberOfQuantiles) 196 | nextQuantiles = nextQuantiles.gather(1, nextAction).squeeze(1) 197 | targetQuantiles = reward.unsqueeze(1) + self.gamma * nextQuantiles * (1 - done.unsqueeze(1)) 198 | 199 | # Computation of the loss 200 | difference = targetQuantiles.unsqueeze(1) - quantiles.unsqueeze(2) 201 | error = difference.abs() 202 | loss = torch.where(error <= self.kappa, 0.5 * error.pow(2), self.kappa * (error - (0.5 * self.kappa))) 203 | loss = (self.tau - (difference < 0).float()).abs() * loss/self.kappa 204 | loss = loss.mean(1).sum(1).mean() 205 | 206 | # Computation of the gradients 207 | self.optimizer.zero_grad() 208 | loss.backward() 209 | 210 | # Gradient Clipping 211 | torch.nn.utils.clip_grad_norm_(self.policyNetwork.parameters(), self.gradientClipping) 212 | 213 | # Perform the Deep Neural Network optimization 214 | self.optimizer.step() 215 | 216 | return loss.item() 217 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributional Reinforcement Learning with Unconstrained Monotonic 
Neural Networks 2 | Experimental code supporting the results presented in the scientific research paper: 3 | > Thibaut Théate, Antoine Wehenkel, Adrien Bolland, Gilles Louppe and Damien Ernst. "Distributional Reinforcement Learning with Unconstrained Monotonic Neural Networks." (2021). 4 | > [[arxiv]](https://arxiv.org/abs/) 5 | 6 | 7 | 8 | # Dependencies 9 | 10 | The dependencies are listed in the text file "requirements.txt": 11 | * Python 3.7.7 12 | * Pytorch 13 | * Tensorboard 14 | * Gym 15 | * Opencv-python 16 | * Atari-py 17 | * MinAtar 18 | * Umnn 19 | * Numpy 20 | * Pandas 21 | * Matplotlib 22 | * Scipy 23 | * Tqdm 24 | 25 | 26 | 27 | # Usage 28 | 29 | Training and testing a chosen distributional RL algorithm for the control problem of a chosen environment are performed by running the following command: 30 | 31 | ```bash 32 | python main.py -algorithm ALGORITHM -environment ENVIRONMENT 33 | ``` 34 | 35 | with: 36 | * ALGORITHM being the name of the algorithm (by default UMDQN_C), 37 | * ENVIRONMENT being the name of the environment (by default StochasticGridWorld). 38 | 39 | The (distributional) RL algorithms supported are: 40 | * DQN, 41 | * CDQN, 42 | * QR_DQN, 43 | * IQN, 44 | * FQF, 45 | * UMDQN_KL, 46 | * UMDQN_C, 47 | * UMDQN_W. 48 | 49 | The benchmark environments supported are: 50 | * StochasticGridWorld, 51 | * CartPole-v0, 52 | * Acrobot-v1, 53 | * LunarLander-v2, 54 | * MountainCar-v0, 55 | * MinAtar/Asterix-v0, 56 | * MinAtar/Breakout-v0, 57 | * MinAtar/Freeway-v0, 58 | * MinAtar/Seaquest-v0, 59 | * MinAtar/SpaceInvaders-v0, 60 | * PongNoFrameskip-v4, 61 | * BoxingNoFrameskip-v4, 62 | * FreewayNoFrameskip-v4. 63 | 64 | The number of episodes for training the DRL algorithm may also be specified by the user through the argument "-episodes". The parameters of the DRL algorithms can be set with the argument "-parameters" and by providing the name of the .json file containing these parameters within the "Parameters" folder. 65 | 66 | For more advanced tests and manipulations, please directly refer to the code.
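For instance, a run combining the arguments described above might look as follows; the exact value format expected by "-parameters" (with or without the ".json" extension and folder prefix) is determined by main.py, so treat this invocation as an assumption to be checked against the code:

```bash
# Hypothetical invocation, not taken from the repository documentation.
python main.py -algorithm QR_DQN -environment CartPole-v0 -episodes 1000 -parameters parameters_QR_DQN_ClassicControl.json
```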
67 | 68 | 69 | 70 | # Citation 71 | 72 | If you make use of this experimental code, please cite the associated research paper: 73 | 74 | ``` 75 | @inproceedings{Théate2021, 76 | title={Distributional Reinforcement Learning with Unconstrained Monotonic Neural Networks}, 77 | author={Thibaut Théate, Antoine Wehenkel, Adrien Bolland, Gilles Louppe and Damien Ernst}, 78 | year={2021} 79 | } 80 | ``` 81 | -------------------------------------------------------------------------------- /SavedModels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/SavedModels/__init__.py -------------------------------------------------------------------------------- /Tensorboard/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/Tensorboard/__init__.py -------------------------------------------------------------------------------- /UMDQN_KL.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import math 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import scipy.stats as stats 12 | 13 | from matplotlib import pyplot as plt 14 | 15 | import torch 16 | import torch.optim as optim 17 | 18 | from replayMemory import ReplayMemory 19 | 20 | from Models.UMDQN_KL_Model import UMDQN_KL_Model 21 | from Models.UMDQN_KL_Model_Atari import UMDQN_KL_Model_Atari 22 | 23 | from DQN import DQN 24 | 25 | 26 | 27 | ############################################################################### 28 | ################################ Class UMDQN_KL ############################### 29 | ############################################################################### 30 | 31 | class UMDQN_KL(DQN): 32 | """ 33 | GOAL: Implementing the UMDQN_KL Deep Reinforcement Learning algorithm. 34 | 35 | VARIABLES: - device: Hardware specification (CPU or GPU). 36 | - gamma: Discount factor of the RL algorithm. 37 | - learningRate: Learning rate of the DL optimizer (ADAM). 38 | - epsilon: Epsilon value for the DL optimizer (ADAM). 39 | - targetNetworkUpdate: Update frequency of the target network. 40 | - learningUpdatePeriod: Frequency of the learning procedure. 41 | - batchSize: Size of the batch to sample from the replay memory. 42 | - capacity: Capacity of the replay memory. 43 | - replayMemory: Experience Replay memory. 44 | - rewardClipping: Clipping of the RL rewards. 45 | - gradientClipping: Clipping of the training loss. 46 | - optimizer: DL optimizer (ADAM). 47 | - epsilonStart: Initial value of epsilon (Epsilon-Greedy). 48 | - epsilonEnd: Final value of epsilon (Epsilon-Greedy). 49 | - epsilonDecay: Exponential decay of epsilon (Epsilon-Greedy). 50 | - epsilonTest: Test value of epsilon (Epsilon-Greedy). 51 | - epsilonValue: Current value of epsilon (Epsilon-Greedy). 52 | - policyNetwork: Deep Neural Network representing the info used by the RL policy. 53 | - targetNetwork: Deep Neural Network representing the target network. 
54 | 55 | METHODS: - __init__: Initialization of the RL algorithm. 56 | - chooseAction: Choose a valid action based on the current state 57 | observed, according to the RL policy learned. 58 | - learning: Execute the RL algorithm learning procedure. 59 | """ 60 | 61 | def __init__(self, observationSpace, actionSpace, environment, 62 | parametersFileName='', reporting=True): 63 | """ 64 | GOAL: Initializing the RL agent based on the UMDQN_KL Deep Reinforcement Learning 65 | algorithm, by setting up the algorithm parameters as well as 66 | the Deep Neural Networks. 67 | 68 | INPUTS: - observationSpace: RL observation space. 69 | - actionSpace: RL action space. 70 | - environment: Name of the RL environment. 71 | - parametersFileName: Name of the JSON parameters file. 72 | - reporting: Enable the reporting of the results. 73 | 74 | OUTPUTS: / 75 | """ 76 | 77 | # Initialization of the DQN parent class 78 | DQN.__init__(self, observationSpace, actionSpace, environment, parametersFileName, False) 79 | 80 | # Setting of the parameters 81 | if parametersFileName == '': 82 | parametersFileName = ''.join(['Parameters/parameters_UMDQN_KL_', str(environment), '.json']) 83 | parameters = self.readParameters(parametersFileName) 84 | 85 | # Set the device for DNN computations (CPU or GPU) 86 | self.device = torch.device('cuda:'+str(parameters['GPUNumber']) if torch.cuda.is_available() else 'cpu') 87 | 88 | # Set the general parameters of the RL algorithm 89 | self.gamma = parameters['gamma'] 90 | self.learningRate = parameters['learningRate'] 91 | self.epsilon = parameters['epsilon'] 92 | self.targetUpdatePeriod = parameters['targetUpdatePeriod'] 93 | self.learningUpdatePeriod = parameters['learningUpdatePeriod'] 94 | self.rewardClipping = parameters['rewardClipping'] 95 | self.gradientClipping = parameters['gradientClipping'] 96 | 97 | # Set the Experience Replay mechanism 98 | self.batchSize = parameters['batchSize'] 99 | self.capacity = parameters['capacity'] 100 | self.replayMemory = ReplayMemory(self.capacity) 101 | 102 | # Set the distribution support 103 | self.numberOfSamples = parameters['numberOfSamples'] 104 | self.minReturn = parameters['minReturn'] 105 | self.maxReturn = parameters['maxReturn'] 106 | self.support = np.linspace(self.minReturn, self.maxReturn, self.numberOfSamples) 107 | self.supportTorch = torch.linspace(self.minReturn, self.maxReturn, self.numberOfSamples, device=self.device) 108 | self.supportRepeatedBatchSize = self.supportTorch.repeat(self.batchSize, 1).view(-1, 1) 109 | self.uniformProba = 1/(self.maxReturn - self.minReturn) 110 | self.deltaSupport = self.support[1] - self.support[0] 111 | 112 | # Enable the faster but potentially less accurate estimation of the expectation 113 | self.fasterExpectation = parameters['fasterExpectation'] 114 | 115 | # Set the two Deep Neural Networks of the RL algorithm (policy and target) 116 | self.atari = parameters['atari'] 117 | self.minatar = parameters['minatar'] 118 | if self.atari or self.minatar: 119 | self.policyNetwork = UMDQN_KL_Model_Atari(observationSpace, actionSpace, parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device, minAtar=self.minatar).to(self.device) 120 | self.targetNetwork = UMDQN_KL_Model_Atari(observationSpace, actionSpace, parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device, minAtar=self.minatar).to(self.device) 121 | else: 122 | self.policyNetwork = UMDQN_KL_Model(observationSpace, actionSpace, 
parameters['structureDNN'], parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device).to(self.device) 123 | self.targetNetwork = UMDQN_KL_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device).to(self.device) 124 | self.targetNetwork.load_state_dict(self.policyNetwork.state_dict()) 125 | 126 | # Set the Deep Learning optimizer 127 | self.optimizer = optim.Adam(self.policyNetwork.parameters(), lr=self.learningRate, eps=self.epsilon) 128 | 129 | # Set the Epsilon-Greedy exploration technique 130 | self.epsilonStart = parameters['epsilonStart'] 131 | self.epsilonEnd = parameters['epsilonEnd'] 132 | self.epsilonDecay = parameters['epsilonDecay'] 133 | self.epsilonTest = parameters['epsilonTest'] 134 | self.epsilonValue = lambda iteration: self.epsilonEnd + (self.epsilonStart - self.epsilonEnd) * math.exp(-1 * iteration / self.epsilonDecay) 135 | 136 | # Initialization of the experiment folder and tensorboard writer 137 | self.initReporting(parameters, 'UMDQN_KL') 138 | 139 | 140 | def chooseAction(self, state, plot=False): 141 | """ 142 | GOAL: Choose a valid RL action from the action space according to the 143 | RL policy as well as the current RL state observed. 144 | 145 | INPUTS: - state: RL state returned by the environment. 146 | - plot: Enable the plotting of the random returns distributions. 147 | 148 | OUTPUTS: - action: RL action chosen from the action space. 149 | """ 150 | 151 | # Choose the best action based on the RL policy 152 | with torch.no_grad(): 153 | state = torch.from_numpy(state).float().to(self.device).unsqueeze(0) 154 | if self.fasterExpectation: 155 | QValues = self.policyNetwork.getExpectation(state, self.minReturn, self.maxReturn, 10*self.numberOfSamples).squeeze(0) 156 | else: 157 | pdfs = self.policyNetwork(state, self.supportTorch.unsqueeze(1)) 158 | QValues = (pdfs * self.supportTorch).sum(1)/(self.numberOfSamples*self.uniformProba) 159 | _, action = QValues.max(0) 160 | 161 | # If required, plot the return distribution associated with each action 162 | if plot: 163 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 164 | plt.figure() 165 | ax = plt.subplot(1, 1, 1) 166 | with torch.no_grad(): 167 | accurateSupport = np.linspace(self.minReturn, self.maxReturn, self.numberOfSamples*10) 168 | accurateSupportTorch = torch.linspace(self.minReturn, self.maxReturn, self.numberOfSamples*10, device=self.device) 169 | pdfs = self.policyNetwork(state, accurateSupportTorch.unsqueeze(1)) 170 | QValues = ((pdfs * accurateSupportTorch).sum(1))/(self.numberOfSamples*10*self.uniformProba) 171 | for a in range(self.actionSpace): 172 | ax.plot(accurateSupport, pdfs[a].cpu(), linestyle='-', label=''.join(['Action ', str(a), ' random return Z']), color=colors[a]) 173 | ax.fill_between(accurateSupport, accurateSupport*0, pdfs[a].cpu(), alpha=0.25, color=colors[a]) 174 | ax.axvline(x=QValues[a], linewidth=2, linestyle='--', label=''.join(['Action ', str(a), ' expected return Q']), color=colors[a]) 175 | ax.set_xlabel('Random return') 176 | ax.set_ylabel('Probability Density Function (PDF)') 177 | ax.legend() 178 | plt.show() 179 | """ 180 | # Saving of the data into external files 181 | dataPDF = { 182 | 'Action0_x': accurateSupport, 183 | 'Action0_y': pdfs[0].cpu(), 184 | 'Action1_x': accurateSupport, 185 | 'Action1_y': pdfs[1].cpu(), 186 | 'Action2_x': accurateSupport, 187 | 'Action2_y': pdfs[2].cpu(), 188 | 
'Action3_x': accurateSupport, 189 | 'Action3_y': pdfs[3].cpu(), 190 | } 191 | dataframePDF = pd.DataFrame(dataPDF) 192 | dataframePDF.to_csv('Figures/Distributions/UMDQN_KL.csv') 193 | quit() 194 | """ 195 | 196 | return action.item() 197 | 198 | 199 | def learning(self): 200 | """ 201 | GOAL: Sample a batch of past experiences and learn from it 202 | by updating the Reinforcement Learning policy. 203 | 204 | INPUTS: / 205 | 206 | OUTPUTS: - loss: Loss of the learning procedure. 207 | """ 208 | 209 | # Check that the replay memory is filled enough 210 | if (len(self.replayMemory) >= self.batchSize): 211 | 212 | # Sample a batch of experiences from the replay memory 213 | batch = self.dataLoaderIter.next() 214 | state = batch[0].float().to(self.device) 215 | action = batch[1].float().to(self.device) 216 | reward = batch[2].float().to(self.device) 217 | nextState = batch[3].float().to(self.device) 218 | done = batch[4].float().to(self.device) 219 | 220 | # Computation of the current return distribution, according to the policy DNN 221 | pdfs = self.policyNetwork(state, self.supportRepeatedBatchSize) 222 | selection = torch.tensor([self.actionSpace*i + action[i] for i in range(self.batchSize)], dtype=torch.long, device=self.device) 223 | currentPdfs = torch.index_select(pdfs, 0, selection).view(-1, 1) 224 | 225 | # Computation of the next action, according to the policy DNN 226 | with torch.no_grad(): 227 | if self.fasterExpectation: 228 | expectedReturns = self.targetNetwork.getExpectation(nextState, self.minReturn, self.maxReturn, 10*self.numberOfSamples) 229 | else: 230 | pdfs = self.targetNetwork(nextState, self.supportRepeatedBatchSize) 231 | expectedReturns = (((pdfs * self.supportTorch).sum(1))/(self.numberOfSamples*self.uniformProba)).view(-1, self.actionSpace) 232 | _, nextAction = expectedReturns.max(1) 233 | 234 | # Computation of the new distribution to be learnt by the policy DNN 235 | with torch.no_grad(): 236 | r = reward.view(self.batchSize, 1).repeat(1, self.numberOfSamples).view(-1, 1) 237 | support = (self.supportRepeatedBatchSize - r)/self.gamma 238 | targetPdfs = self.targetNetwork(nextState, support) 239 | selection = torch.tensor([self.actionSpace*i + nextAction[i] for i in range(self.batchSize)], dtype=torch.long, device=self.device) 240 | targetPdfs = torch.index_select(targetPdfs, 0, selection) 241 | targetPdfs = targetPdfs/self.gamma 242 | for i in range(self.batchSize): 243 | if done[i] == 1: 244 | targetPdfs[i] = torch.tensor(stats.norm.pdf(self.support, reward[i].item(), self.deltaSupport)).to(self.device) 245 | targetPdfs = targetPdfs.clamp(min=1e-6) 246 | targetPdfs = targetPdfs.view(-1, 1) 247 | 248 | # Compute the loss 249 | loss = (targetPdfs*(targetPdfs.log()-currentPdfs.log())).sum() 250 | 251 | # Computation of the gradients 252 | self.optimizer.zero_grad() 253 | loss.backward() 254 | 255 | # Gradient Clipping 256 | torch.nn.utils.clip_grad_norm_(self.policyNetwork.parameters(), self.gradientClipping) 257 | 258 | # Perform the Deep Neural Network optimization 259 | self.optimizer.step() 260 | 261 | return loss.item() 262 | -------------------------------------------------------------------------------- /UMDQN_W.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | 
############################################################################### 6 | 7 | import math 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from matplotlib import pyplot as plt 13 | 14 | import torch 15 | import torch.optim as optim 16 | 17 | from replayMemory import ReplayMemory 18 | 19 | from Models.UMDQN_W_Model import UMDQN_W_Model 20 | from Models.UMDQN_W_Model_Atari import UMDQN_W_Model_Atari 21 | 22 | from DQN import DQN 23 | 24 | 25 | 26 | ############################################################################### 27 | ############################### Class UMDQN_W ################################# 28 | ############################################################################### 29 | 30 | class UMDQN_W(DQN): 31 | """ 32 | GOAL: Implementing the UMDQN_W Deep Reinforcement Learning algorithm. 33 | 34 | VARIABLES: - device: Hardware specification (CPU or GPU). 35 | - gamma: Discount factor of the RL algorithm. 36 | - learningRate: Learning rate of the DL optimizer (ADAM). 37 | - epsilon: Epsilon value for the DL optimizer (ADAM). 38 | - targetNetworkUpdate: Update frequency of the target network. 39 | - learningUpdatePeriod: Frequency of the learning procedure. 40 | - batchSize: Size of the batch to sample from the replay memory. 41 | - capacity: Capacity of the replay memory. 42 | - replayMemory: Experience Replay memory. 43 | - rewardClipping: Clipping of the RL rewards. 44 | - gradientClipping: Clipping of the training loss. 45 | - optimizer: DL optimizer (ADAM). 46 | - epsilonStart: Initial value of epsilon (Epsilon-Greedy). 47 | - epsilonEnd: Final value of epsilon (Epsilon-Greedy). 48 | - epsilonDecay: Exponential decay of epsilon (Epsilon-Greedy). 49 | - epsilonTest: Test value of epsilon (Epsilon-Greedy). 50 | - epsilonValue: Current value of epsilon (Epsilon-Greedy). 51 | - policyNetwork: Deep Neural Network representing the info used by the RL policy. 52 | - targetNetwork: Deep Neural Network representing the target network. 53 | 54 | METHODS: - __init__: Initialization of the RL algorithm. 55 | - chooseAction: Choose a valid action based on the current state 56 | observed, according to the RL policy learned. 57 | - learning: Execute the RL algorithm learning procedure. 58 | """ 59 | 60 | def __init__(self, observationSpace, actionSpace, environment, 61 | parametersFileName='', reporting=True): 62 | """ 63 | GOAL: Initializing the RL agent based on the UMDQN_W Deep Reinforcement Learning 64 | algorithm, by setting up the algorithm parameters as well as 65 | the Deep Neural Networks. 66 | 67 | INPUTS: - observationSpace: RL observation space. 68 | - actionSpace: RL action space. 69 | - environment: Name of the RL environment. 70 | - parametersFileName: Name of the JSON parameters file. 71 | - reporting: Enable the reporting of the results. 
72 | 73 | OUTPUTS: / 74 | """ 75 | 76 | # Initialization of the DQN parent class 77 | DQN.__init__(self, observationSpace, actionSpace, environment, parametersFileName, False) 78 | 79 | # Setting of the parameters 80 | if parametersFileName == '': 81 | parametersFileName = ''.join(['Parameters/parameters_UMDQN_W_', str(environment), '.json']) 82 | parameters = self.readParameters(parametersFileName) 83 | 84 | # Set the device for DNN computations (CPU or GPU) 85 | self.device = torch.device('cuda:'+str(parameters['GPUNumber']) if torch.cuda.is_available() else 'cpu') 86 | 87 | # Set the general parameters of the RL algorithm 88 | self.gamma = parameters['gamma'] 89 | self.learningRate = parameters['learningRate'] 90 | self.epsilon = parameters['epsilon'] 91 | self.targetUpdatePeriod = parameters['targetUpdatePeriod'] 92 | self.learningUpdatePeriod = parameters['learningUpdatePeriod'] 93 | self.rewardClipping = parameters['rewardClipping'] 94 | self.gradientClipping = parameters['gradientClipping'] 95 | 96 | # Set the Experience Replay mechanism 97 | self.batchSize = parameters['batchSize'] 98 | self.capacity = parameters['capacity'] 99 | self.replayMemory = ReplayMemory(self.capacity) 100 | 101 | # Set the distribution support (quantile fractions) 102 | self.numberOfSamples = parameters['numberOfSamples'] 103 | self.support = np.linspace(0.0, 1.0, self.numberOfSamples) 104 | self.supportTorch = torch.linspace(0.0, 1.0, self.numberOfSamples, device=self.device) 105 | self.supportRepeatedBatchSize = self.supportTorch.repeat(self.batchSize, 1).view(-1, 1) 106 | self.kappa = 1.0 107 | 108 | # Set the two Deep Neural Networks of the RL algorithm (policy and target) 109 | self.atari = parameters['atari'] 110 | self.minatar = parameters['minatar'] 111 | if self.atari or self.minatar: 112 | self.policyNetwork = UMDQN_W_Model_Atari(observationSpace, actionSpace, parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device, minAtar=self.minatar).to(self.device) 113 | self.targetNetwork = UMDQN_W_Model_Atari(observationSpace, actionSpace, parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device, minAtar=self.minatar).to(self.device) 114 | else: 115 | self.policyNetwork = UMDQN_W_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device).to(self.device) 116 | self.targetNetwork = UMDQN_W_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device).to(self.device) 117 | self.targetNetwork.load_state_dict(self.policyNetwork.state_dict()) 118 | 119 | # Set the Deep Learning optimizer 120 | self.optimizer = optim.Adam(self.policyNetwork.parameters(), lr=self.learningRate, eps=self.epsilon) 121 | 122 | # Set the Epsilon-Greedy exploration technique 123 | self.epsilonStart = parameters['epsilonStart'] 124 | self.epsilonEnd = parameters['epsilonEnd'] 125 | self.epsilonDecay = parameters['epsilonDecay'] 126 | self.epsilonTest = parameters['epsilonTest'] 127 | self.epsilonValue = lambda iteration: self.epsilonEnd + (self.epsilonStart - self.epsilonEnd) * math.exp(-1 * iteration / self.epsilonDecay) 128 | 129 | # Initialization of the experiment folder and tensorboard writer 130 | self.initReporting(parameters, 'UMDQN_W') 131 | 132 | 133 | def chooseAction(self, state, plot=False): 134 | """ 135 | GOAL: Choose a valid RL 
action from the action space according to the 136 | RL policy as well as the current RL state observed. 137 | 138 | INPUTS: - state: RL state returned by the environment. 139 | - plot: Enable the plotting of the random returns distributions. 140 | 141 | OUTPUTS: - action: RL action chosen from the action space. 142 | """ 143 | 144 | # Choose the best action based on the RL policy 145 | with torch.no_grad(): 146 | state = torch.from_numpy(state).float().to(self.device).unsqueeze(0) 147 | quantiles = self.policyNetwork(state, self.supportTorch.unsqueeze(1)) 148 | QValues = quantiles.mean(1) 149 | _, action = QValues.max(0) 150 | 151 | # If required, plot the return distribution associated with each action 152 | if plot: 153 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 154 | plt.figure() 155 | ax = plt.subplot(1, 1, 1) 156 | taus = torch.linspace(0.0, 1.0, self.numberOfSamples*10, device=self.device).unsqueeze(1) 157 | quantiles = self.policyNetwork(state, taus) 158 | QValues = quantiles.mean(1) 159 | taus = taus.cpu().numpy() 160 | quantiles = quantiles.squeeze(0).cpu().numpy() 161 | QValues = QValues.squeeze(0).cpu().numpy() 162 | for a in range(self.actionSpace): 163 | ax.plot(taus, quantiles[a], linestyle='-', label=''.join(['Action ', str(a), ' random return Z']), color=colors[a]) 164 | ax.axhline(y=QValues[a], linewidth=2, linestyle='--', label=''.join(['Action ', str(a), ' expected return Q']), color=colors[a]) 165 | ax.set_xlabel('Quantile fraction') 166 | ax.set_ylabel('Quantile Function (QF)') 167 | ax.legend() 168 | plt.show() 169 | """ 170 | # Saving of the data into external files 171 | taus = np.linspace(0, 1, self.numberOfSamples*10) 172 | dataQF = { 173 | 'Action0_x': taus, 174 | 'Action0_y': quantiles[0], 175 | 'Action1_x': taus, 176 | 'Action1_y': quantiles[1], 177 | 'Action2_x': taus, 178 | 'Action2_y': quantiles[2], 179 | 'Action3_x': taus, 180 | 'Action3_y': quantiles[3], 181 | } 182 | dataframeQF = pd.DataFrame(dataQF) 183 | dataframeQF.to_csv('Figures/Distributions/UMDQN_W.csv') 184 | quit() 185 | """ 186 | 187 | return action.item() 188 | 189 | 190 | def learning(self): 191 | """ 192 | GOAL: Sample a batch of past experiences and learn from it 193 | by updating the Reinforcement Learning policy. 194 | 195 | INPUTS: / 196 | 197 | OUTPUTS: - loss: Loss of the learning procedure. 
198 | """ 199 | 200 | # Check that the replay memory is filled enough 201 | if (len(self.replayMemory) >= self.batchSize): 202 | 203 | # Sample a batch of experiences from the replay memory 204 | batch = self.dataLoaderIter.next() 205 | state = batch[0].float().to(self.device) 206 | action = batch[1].long().to(self.device) 207 | reward = batch[2].float().to(self.device) 208 | nextState = batch[3].float().to(self.device) 209 | done = batch[4].float().to(self.device) 210 | 211 | # Computation of the current return distribution 212 | quantiles = self.policyNetwork(state, self.supportRepeatedBatchSize) 213 | selection = torch.tensor([self.actionSpace*i + action[i] for i in range(self.batchSize)], dtype=torch.long, device=self.device) 214 | quantiles = torch.index_select(quantiles, 0, selection) 215 | 216 | # Computation of the new distribution to be learnt by the policy DNN 217 | with torch.no_grad(): 218 | nextQuantiles = self.targetNetwork(nextState, self.supportRepeatedBatchSize) 219 | nextAction = nextQuantiles.view(self.batchSize, self.actionSpace, self.numberOfSamples).mean(2).max(1)[1] 220 | selection = torch.tensor([self.actionSpace*i + nextAction[i] for i in range(self.batchSize)], dtype=torch.long, device=self.device) 221 | nextQuantiles = torch.index_select(nextQuantiles, 0, selection) 222 | targetQuantiles = reward.unsqueeze(1) + self.gamma * nextQuantiles * (1 - done.unsqueeze(1)) 223 | 224 | #""" 225 | # Improve stability with the lower and upper bounds of the random return 226 | minZ = -1 227 | maxZ = 10 228 | quantiles = quantiles.clamp(min=minZ, max=maxZ) 229 | targetQuantiles = targetQuantiles.clamp(min=minZ, max=maxZ) 230 | #""" 231 | 232 | # Computation of the loss 233 | difference = targetQuantiles.unsqueeze(1) - quantiles.unsqueeze(2) 234 | error = difference.abs() 235 | loss = torch.where(error <= self.kappa, 0.5 * error.pow(2), self.kappa * (error - (0.5 * self.kappa))) 236 | loss = (self.supportRepeatedBatchSize.view(self.batchSize, self.numberOfSamples, 1) - (difference < 0).float()).abs() * loss/self.kappa 237 | loss = loss.mean(1).sum(1).mean() 238 | 239 | # Computation of the gradients 240 | self.optimizer.zero_grad() 241 | loss.backward() 242 | 243 | # Gradient Clipping 244 | torch.nn.utils.clip_grad_norm_(self.policyNetwork.parameters(), self.gradientClipping) 245 | 246 | # Perform the Deep Neural Network optimization 247 | self.optimizer.step() 248 | 249 | return loss.item() 250 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import argparse 8 | import importlib 9 | import gym 10 | 11 | from CustomEnvironments.stochasticGridWorld import StochasticGridWorld 12 | from CustomEnvironments.stochasticGridWorldOptimal import StochasticGridWorldOptimal 13 | from MonteCarloDistributions import MonteCarloDistributions 14 | from AtariWrapper import AtariWrapper, MinAtarWrapper 15 | 16 | 17 | 18 | ############################################################################### 19 | ################################ Global variables ############################# 20 | ############################################################################### 21 | 22 | # Supported RL 
algorithms 23 | algorithms = ['DQN', 'CDQN', 'QR_DQN', 'IQN', 'FQF', 24 | 'UMDQN_KL', 'UMDQN_C', 'UMDQN_W'] 25 | 26 | # Supported RL environments 27 | environments = ['StochasticGridWorld', 'CartPole-v0', 'Acrobot-v1', 28 | 'LunarLander-v2', 'MountainCar-v0', 'MinAtar/Asterix-v0', 29 | 'MinAtar/Breakout-v0', 'MinAtar/Freeway-v0', 'MinAtar/Seaquest-v0', 30 | 'MinAtar/SpaceInvaders-v0', 'PongNoFrameskip-v4', 31 | 'BoxingNoFrameskip-v4', 'FreewayNoFrameskip-v4'] 32 | 33 | 34 | 35 | ############################################################################### 36 | ##################################### MAIN #################################### 37 | ############################################################################### 38 | 39 | if(__name__ == '__main__'): 40 | 41 | # Retrieve the paramaters sent by the user 42 | parser = argparse.ArgumentParser(description='') 43 | parser.add_argument("-algorithm", default='UMDQN_C', type=str, help="Name of the RL algorithm") 44 | parser.add_argument("-environment", default='StochasticGridWorld', type=str, help="Name of the RL environment") 45 | parser.add_argument("-episodes", default=10000, type=str, help="Number of episodes for training") 46 | parser.add_argument("-parameters", default='parameters', type=str, help="Name of the JSON parameters file") 47 | args = parser.parse_args() 48 | 49 | # Checking of the parameters validity 50 | algorithm = args.algorithm 51 | environment = args.environment 52 | episodes = int(args.episodes) 53 | parameters = args.parameters 54 | if algorithm not in algorithms: 55 | print("The algorithm specified is not valid, only the following algorithms are supported:") 56 | for algo in algorithms: 57 | print("".join(['- ', algo])) 58 | if environment not in environments: 59 | print("The environment specified is not valid, only the following environments are supported:") 60 | for env in environments: 61 | print("".join(['- ', env])) 62 | if parameters == 'parameters': 63 | parameters = ''.join(['Parameters/parameters_', str(algorithm), '_', str(environment), '.json']) 64 | 65 | # Name of the file for saving the RL policy learned 66 | fileName = 'SavedModels/' + algorithm + '_' + environment 67 | 68 | # Initialization of the RL environment 69 | if environment == 'StochasticGridWorld': 70 | env = StochasticGridWorld() 71 | elif environment in ['CartPole-v0', 'Acrobot-v1', 'LunarLander-v2', 'MountainCar-v0']: 72 | env = gym.make(environment) 73 | parameters = ''.join(['Parameters/parameters_', algorithm, '_ClassicControl.json']) 74 | elif environment in ['MinAtar/Asterix-v0','MinAtar/Breakout-v0', 'MinAtar/Freeway-v0', 'MinAtar/Seaquest-v0', 'MinAtar/SpaceInvaders-v0']: 75 | minAtarWrapper = MinAtarWrapper() 76 | env = minAtarWrapper.wrapper(environment) 77 | parameters = ''.join(['Parameters/parameters_', algorithm, '_MinAtar.json']) 78 | else: 79 | atariWrapper = AtariWrapper() 80 | env = atariWrapper.wrapper(environment, stickyActionsProba=0.25) 81 | parameters = ''.join(['Parameters/parameters_', algorithm, '_Atari57.json']) 82 | 83 | # Determination of the state and action spaces 84 | observationSpace = env.observation_space.shape[0] 85 | actionSpace = env.action_space.n 86 | 87 | # Initialization of the DRL algorithm 88 | algorithmModule = importlib.import_module(str(algorithm)) 89 | className = getattr(algorithmModule, algorithm) 90 | RLAgent = className(observationSpace, actionSpace, environment, parameters) 91 | 92 | # Training of the RL agent 93 | RLAgent.training(env, episodes, verbose=False, rendering=False, 
plotTraining=False) 94 | #RLAgent.plotExpectedPerformance(env, episodes, iterations=5) 95 | 96 | # Saving of the RL model 97 | RLAgent.saveModel(fileName) 98 | 99 | # Loading of the RL model 100 | RLAgent.loadModel(fileName) 101 | 102 | # Testing of the RL agent 103 | RLAgent.testing(env, verbose=True, rendering=False) 104 | 105 | # Plotting of the true distribution of the random return via Monte Carlo 106 | """ 107 | state = [int(7/2)-1, 7-1] 108 | optimalPolicy = StochasticGridWorldOptimal(env) 109 | MonteCarloDistributions = MonteCarloDistributions(env, optimalPolicy, 0.5) 110 | #MonteCarloDistributions = MonteCarloDistributions(env, RLAgent, 0.5) 111 | MonteCarloDistributions.plotDistributions(state) 112 | """ 113 | -------------------------------------------------------------------------------- /replayMemory.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import random 8 | from collections import deque 9 | from torch.utils.data import Dataset 10 | 11 | 12 | 13 | ############################################################################### 14 | ############################### Class ReplayMemory ############################ 15 | ############################################################################### 16 | 17 | class ReplayMemory(Dataset): 18 | """ 19 | GOAL: Implementing the replay memory required for the Experience Replay 20 | mechanism of the DQN Reinforcement Learning algorithm. This class 21 | inherits from the Dataset class from Pytorch for being used with 22 | efficient data loaders. 23 | 24 | VARIABLES: - memory: Data structure storing the RL experiences. 25 | 26 | METHODS: - __init__: Initialization of the memory data structure. 27 | - __getitem__: Get an item from the replay memory. 28 | - __len__: Return the length of the replay memory. 29 | - push: Insert a new experience into the replay memory. 30 | - sample: Sample a batch of experiences from the replay memory. 31 | - reset: Reset the replay memory. 32 | """ 33 | 34 | def __init__(self, capacity=10000): 35 | """ 36 | GOAL: Initialization of the replay memory data structure. 37 | 38 | INPUTS: - capacity: Capacity of the data structure, specifying the 39 | maximum number of experiences to be stored 40 | simultaneously into the data structure. 41 | 42 | OUTPUTS: / 43 | """ 44 | 45 | random.seed(0) 46 | self.capacity = capacity 47 | self.memory = deque(maxlen=capacity) 48 | 49 | 50 | def __getitem__(self, index): 51 | """ 52 | GOAL: Outputing the item associated with the provided index 53 | from the replay memory. 54 | 55 | INPUTS: / 56 | 57 | OUTPUTS: - item: Selected item of the replay memory. 58 | """ 59 | 60 | return self.memory[index] 61 | 62 | 63 | def __len__(self): 64 | """ 65 | GOAL: Return the size of the replay memory, i.e. the number of experiences 66 | currently stored into the data structure. 67 | 68 | INPUTS: / 69 | 70 | OUTPUTS: - length: Size of the replay memory. 71 | """ 72 | 73 | return len(self.memory) 74 | 75 | 76 | def push(self, state, action, reward, nextState, done): 77 | """ 78 | GOAL: Insert a new experience into the replay memory. An experience 79 | is composed of a state, an action, a reward, a next state and 80 | a termination signal. 
81 | 82 | INPUTS: - state: RL state of the experience to be stored. 83 | - action: RL action of the experience to be stored. 84 | - reward: RL reward of the experience to be stored. 85 | - nextState: RL next state of the experience to be stored. 86 | - done: RL termination signal of the experience to be stored. 87 | 88 | OUTPUTS: / 89 | """ 90 | 91 | # FIFO policy 92 | self.memory.append((state, action, reward, nextState, done)) 93 | 94 | 95 | def sample(self, batchSize): 96 | """ 97 | GOAL: Sample a batch of experiences from the replay memory. 98 | 99 | INPUTS: - batchSize: Size of the batch to sample. 100 | 101 | OUTPUTS: - state: RL states of the experience batch sampled. 102 | - action: RL actions of the experience batch sampled. 103 | - reward: RL rewards of the experience batch sampled. 104 | - nextState: RL next states of the experience batch sampled. 105 | - done: RL termination signals of the experience batch sampled. 106 | """ 107 | 108 | state, action, reward, nextState, done = zip(*random.sample(self.memory, batchSize)) 109 | return state, action, reward, nextState, done 110 | 111 | 112 | def reset(self): 113 | """ 114 | GOAL: Reset (empty) the replay memory. 115 | 116 | INPUTS: / 117 | 118 | OUTPUTS: / 119 | """ 120 | 121 | random.seed(0) 122 | self.memory = deque(maxlen=self.capacity) 123 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Python version 2 | python==3.7.7 3 | 4 | # Python basic packages 5 | numpy 6 | scipy 7 | matplotlib 8 | pandas 9 | 10 | # Deep Learning framework 11 | torch 12 | tensorboard 13 | umnn 14 | 15 | # RL environment packages 16 | gym 17 | atari-py 18 | opencv-python 19 | minatar 20 | 21 | # Extra packages 22 | tqdm 23 | --------------------------------------------------------------------------------
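The dependencies above can usually be installed with pip inside a dedicated virtual environment. A minimal sketch, assuming a Python 3.7 interpreter is already available (the packages are listed explicitly because the `python==3.7.7` entry pins the interpreter version rather than a pip-installable package):

```bash
# Create and activate a virtual environment with an existing Python 3.7 interpreter
python3.7 -m venv venv
source venv/bin/activate

# Install the Python packages listed in requirements.txt
pip install numpy scipy matplotlib pandas torch tensorboard umnn gym atari-py opencv-python minatar tqdm
```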