├── AtariWrapper.py ├── CDQN.py ├── CustomEnvironments ├── __init__.py ├── stochasticGridWorld.py └── stochasticGridWorldOptimal.py ├── DQN.py ├── Experiments └── __init__.py ├── FQF.py ├── Figures ├── Distributions │ └── __init__.py ├── Performance │ └── __init__.py └── __init__.py ├── IQN.py ├── Models ├── CDQN_Model.py ├── CDQN_Model_Atari.py ├── CNN_Atari.py ├── CNN_MinAtar.py ├── DNN_Atari.py ├── DNN_MinAtar.py ├── FQF_Model.py ├── FQF_Model_Atari.py ├── FQF_Model_Bis.py ├── FeedforwardDNN.py ├── IQN_Model.py ├── IQN_Model_Atari.py ├── MonotonicNN.py ├── QR_DQN_Model.py ├── QR_DQN_Model_Atari.py ├── UMDQN_C_Model.py ├── UMDQN_C_Model_Atari.py ├── UMDQN_KL_Model.py ├── UMDQN_KL_Model_Atari.py ├── UMDQN_W_Model.py ├── UMDQN_W_Model_Atari.py └── __init__.py ├── MonteCarloDistributions.py ├── Parameters ├── parameters_CDQN_Atari57.json ├── parameters_CDQN_ClassicControl.json ├── parameters_CDQN_MinAtar.json ├── parameters_CDQN_StochasticGridWorld.json ├── parameters_DQN_Atari57.json ├── parameters_DQN_ClassicControl.json ├── parameters_DQN_MinAtar.json ├── parameters_DQN_StochasticGridWorld.json ├── parameters_FQF_Atari57.json ├── parameters_FQF_ClassicControl.json ├── parameters_FQF_MinAtar.json ├── parameters_FQF_StochasticGridWorld.json ├── parameters_IQN_Atari57.json ├── parameters_IQN_ClassicControl.json ├── parameters_IQN_MinAtar.json ├── parameters_IQN_StochasticGridWorld.json ├── parameters_QR_DQN_Atari57.json ├── parameters_QR_DQN_ClassicControl.json ├── parameters_QR_DQN_MinAtar.json ├── parameters_QR_DQN_StochasticGridWorld.json ├── parameters_UMDQN_C_Atari57.json ├── parameters_UMDQN_C_ClassicControl.json ├── parameters_UMDQN_C_MinAtar.json ├── parameters_UMDQN_C_StochasticGridWorld.json ├── parameters_UMDQN_KL_Atari57.json ├── parameters_UMDQN_KL_ClassicControl.json ├── parameters_UMDQN_KL_MinAtar.json ├── parameters_UMDQN_KL_StochasticGridWorld.json ├── parameters_UMDQN_W_Atari57.json ├── parameters_UMDQN_W_ClassicControl.json ├── parameters_UMDQN_W_MinAtar.json └── parameters_UMDQN_W_StochasticGridWorld.json ├── QR_DQN.py ├── README.md ├── SavedModels └── __init__.py ├── Tensorboard └── __init__.py ├── UMDQN_C.py ├── UMDQN_KL.py ├── UMDQN_W.py ├── main.py ├── replayMemory.py └── requirements.txt /CDQN.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import math 8 | 9 | import numpy as np 10 | 11 | from matplotlib import pyplot as plt 12 | 13 | import torch 14 | import torch.optim as optim 15 | 16 | from replayMemory import ReplayMemory 17 | 18 | from Models.CDQN_Model import CDQN_Model 19 | from Models.CDQN_Model_Atari import CDQN_Model_Atari 20 | 21 | from DQN import DQN 22 | 23 | 24 | 25 | ############################################################################### 26 | ############################### Class CDQN #################################### 27 | ############################################################################### 28 | 29 | class CDQN(DQN): 30 | """ 31 | GOAL: Implementing the Categorical DQN (C51) Deep Reinforcement Learning algorithm. 32 | 33 | VARIABLES: - device: Hardware specification (CPU or GPU). 34 | - gamma: Discount factor of the RL algorithm. 35 | - learningRate: Learning rate of the DL optimizer (ADAM). 
36 | - epsilon: Epsilon value for the DL optimizer (ADAM). 37 | - targetNetworkUpdate: Update frequency of the target network. 38 | - learningUpdatePeriod: Frequency of the learning procedure. 39 | - batchSize: Size of the batch to sample from the replay memory. 40 | - capacity: Capacity of the replay memory. 41 | - replayMemory: Experience Replay memory. 42 | - rewardClipping: Clipping of the RL rewards. 43 | - gradientClipping: Clipping of the training loss. 44 | - optimizer: DL optimizer (ADAM). 45 | - epsilonStart: Initial value of epsilon (Epsilon-Greedy). 46 | - epsilonEnd: Final value of epsilon (Epsilon-Greedy). 47 | - epsilonDecay: Exponential decay of epsilon (Epsilon-Greedy). 48 | - epsilonTest: Test value of epsilon (Epsilon-Greedy). 49 | - epsilonValue: Current value of epsilon (Epsilon-Greedy). 50 | - policyNetwork: Deep Neural Network representing the info used by the RL policy. 51 | - targetNetwork: Deep Neural Network representing the target network. 52 | 53 | METHODS: - __init__: Initialization of the RL algorithm. 54 | - chooseAction: Choose a valid action based on the current state 55 | observed, according to the RL policy learned. 56 | - learning: Execute the RL algorithm learning procedure. 57 | """ 58 | 59 | def __init__(self, observationSpace, actionSpace, environment, 60 | parametersFileName='', reporting=True): 61 | """ 62 | GOAL: Initializing the RL agent based on the CDQN Deep Reinforcement Learning 63 | algorithm, by setting up the algorithm parameters as well as 64 | the Deep Neural Networks. 65 | 66 | INPUTS: - observationSpace: RL observation space. 67 | - actionSpace: RL action space. 68 | - environment: Name of the RL environment. 69 | - parametersFileName: Name of the JSON parameters file. 70 | - reporting: Enable the reporting of the results. 
71 | 72 | OUTPUTS: / 73 | """ 74 | 75 | # Initialization of the DQN parent class 76 | DQN.__init__(self, observationSpace, actionSpace, environment, parametersFileName, False) 77 | 78 | # Setting of the parameters 79 | if parametersFileName == '': 80 | parametersFileName = ''.join(['Parameters/parameters_CDQN_', str(environment), '.json']) 81 | parameters = self.readParameters(parametersFileName) 82 | 83 | # Set the device for DNN computations (CPU or GPU) 84 | self.device = torch.device('cuda:'+str(parameters['GPUNumber']) if torch.cuda.is_available() else 'cpu') 85 | 86 | # Set the general parameters of the RL algorithm 87 | self.gamma = parameters['gamma'] 88 | self.learningRate = parameters['learningRate'] 89 | self.epsilon = parameters['epsilon'] 90 | self.targetUpdatePeriod = parameters['targetUpdatePeriod'] 91 | self.learningUpdatePeriod = parameters['learningUpdatePeriod'] 92 | self.rewardClipping = parameters['rewardClipping'] 93 | self.gradientClipping = parameters['gradientClipping'] 94 | 95 | # Set the Experience Replay mechanism 96 | self.batchSize = parameters['batchSize'] 97 | self.capacity = parameters['capacity'] 98 | self.replayMemory = ReplayMemory(self.capacity) 99 | 100 | # Set the distribution support 101 | self.numberOfAtoms = parameters['numberOfAtoms'] 102 | self.minReturn = parameters['minReturn'] 103 | self.maxReturn = parameters['maxReturn'] 104 | self.support = np.linspace(self.minReturn, self.maxReturn, self.numberOfAtoms) 105 | self.supportTorch = torch.linspace(self.minReturn, self.maxReturn, self.numberOfAtoms).to(self.device) 106 | 107 | # Set the two Deep Neural Networks of the RL algorithm (policy and target) 108 | self.atari = parameters['atari'] 109 | self.minatar = parameters['minatar'] 110 | if self.atari or self.minatar: 111 | self.policyNetwork = CDQN_Model_Atari(observationSpace, actionSpace*self.numberOfAtoms, self.numberOfAtoms, minAtar=self.minatar).to(self.device) 112 | self.targetNetwork = CDQN_Model_Atari(observationSpace, actionSpace*self.numberOfAtoms, self.numberOfAtoms, minAtar=self.minatar).to(self.device) 113 | else: 114 | self.policyNetwork = CDQN_Model(observationSpace, actionSpace*self.numberOfAtoms, parameters['structureDNN'], self.numberOfAtoms).to(self.device) 115 | self.targetNetwork = CDQN_Model(observationSpace, actionSpace*self.numberOfAtoms, parameters['structureDNN'], self.numberOfAtoms).to(self.device) 116 | self.targetNetwork.load_state_dict(self.policyNetwork.state_dict()) 117 | 118 | # Set the Deep Learning optimizer 119 | self.optimizer = optim.Adam(self.policyNetwork.parameters(), lr=self.learningRate, eps=self.epsilon) 120 | 121 | # Set the Epsilon-Greedy exploration technique 122 | self.epsilonStart = parameters['epsilonStart'] 123 | self.epsilonEnd = parameters['epsilonEnd'] 124 | self.epsilonDecay = parameters['epsilonDecay'] 125 | self.epsilonTest = parameters['epsilonTest'] 126 | self.epsilonValue = lambda iteration: self.epsilonEnd + (self.epsilonStart - self.epsilonEnd) * math.exp(-1 * iteration / self.epsilonDecay) 127 | 128 | # Initialization of the experiment folder and tensorboard writer 129 | self.initReporting(parameters, 'CDQN') 130 | 131 | 132 | def chooseAction(self, state, plot=False): 133 | """ 134 | GOAL: Choose a valid RL action from the action space according to the 135 | RL policy as well as the current RL state observed. 136 | 137 | INPUTS: - state: RL state returned by the environment. 138 | - plot: Enable the plotting of the random returns distributions. 
139 | 140 | OUTPUTS: - action: RL action chosen from the action space. 141 | """ 142 | 143 | # Choose the best action based on the RL policy 144 | with torch.no_grad(): 145 | state = torch.from_numpy(state).float().to(self.device).unsqueeze(0) 146 | distribution = self.policyNetwork(state).squeeze(0) 147 | distributionReturn = distribution * self.supportTorch 148 | QValues = distributionReturn.sum(1) 149 | _, action = QValues.max(0) 150 | 151 | # If required, plot the return distribution associated with each action 152 | if plot: 153 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 154 | fig = plt.figure() 155 | ax = fig.add_subplot() 156 | QValues = QValues.cpu().numpy() 157 | for a in range(self.actionSpace): 158 | dist = distribution[a].cpu().numpy() 159 | ax.bar(self.support, dist, label=''.join(['Action ', str(a), ' random return Z']), width=(self.maxReturn-self.minReturn)/self.numberOfAtoms, edgecolor='black', alpha=0.5, color=colors[a]) 160 | ax.axvline(x=QValues[a], linewidth=2, linestyle='--', label=''.join(['Action ', str(a), ' expected return Q']), color=colors[a]) 161 | ax.set_xlabel('Random return') 162 | ax.set_ylabel('Probability Density Function (PDF)') 163 | ax.legend() 164 | plt.show() 165 | 166 | return action.item() 167 | 168 | 169 | def learning(self): 170 | """ 171 | GOAL: Sample a batch of past experiences and learn from it 172 | by updating the Reinforcement Learning policy. 173 | 174 | INPUTS: / 175 | 176 | OUTPUTS: - loss: Loss of the learning procedure. 177 | """ 178 | 179 | # Check that the replay memory is filled enough 180 | if (len(self.replayMemory) >= self.batchSize): 181 | 182 | # Sample a batch of experiences from the replay memory 183 | batch = self.dataLoaderIter.next() 184 | state = batch[0].float().to(self.device) 185 | action = batch[1].long().to(self.device) 186 | reward = batch[2].float().to(self.device) 187 | nextState = batch[3].float().to(self.device) 188 | done = batch[4].float().to(self.device) 189 | 190 | # Computation of the current return distribution 191 | distribution = self.policyNetwork(state) 192 | action = action.unsqueeze(1).unsqueeze(1).expand(self.batchSize, 1, self.numberOfAtoms) 193 | distribution = distribution.gather(1, action).squeeze(1) 194 | 195 | # Computation of the new distribution to be learnt by the policy DNN 196 | with torch.no_grad(): 197 | nextDistribution = self.targetNetwork(nextState) 198 | nextAction = (nextDistribution * self.supportTorch).sum(2).max(1)[1].unsqueeze(1).unsqueeze(1).expand(self.batchSize, 1, self.numberOfAtoms) 199 | nextDistribution = nextDistribution.gather(1, nextAction).squeeze(1) 200 | deltaZ = float(self.maxReturn - self.minReturn) / (self.numberOfAtoms - 1) 201 | tz = reward.view(-1, 1) + (1 - done.view(-1, 1)) * self.gamma * self.supportTorch 202 | tz = tz.clamp(min=self.minReturn, max=self.maxReturn) 203 | b = ((tz - self.minReturn) / deltaZ) 204 | l = b.floor().long() 205 | u = b.ceil().long() 206 | offset = torch.linspace(0, (self.batchSize - 1) * self.numberOfAtoms, self.batchSize).long().unsqueeze(1).expand(self.batchSize, self.numberOfAtoms).to(self.device) 207 | projectedDistribution = torch.zeros(nextDistribution.size()).to(self.device) 208 | projectedDistribution.view(-1).index_add_(0, (l + offset).view(-1), (nextDistribution * (u.float() - b)).view(-1)) 209 | projectedDistribution.view(-1).index_add_(0, (u + offset).view(-1), (nextDistribution * (b - l.float())).view(-1)) 210 | 211 | # Computation of the loss 212 | loss = -(projectedDistribution * 
distribution.log()).sum(1).mean() 213 | 214 | # Computation of the gradients 215 | self.optimizer.zero_grad() 216 | loss.backward() 217 | 218 | # Gradient Clipping 219 | torch.nn.utils.clip_grad_norm_(self.policyNetwork.parameters(), self.gradientClipping) 220 | 221 | # Perform the Deep Neural Network optimization 222 | self.optimizer.step() 223 | 224 | return loss.item() 225 | -------------------------------------------------------------------------------- /CustomEnvironments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/CustomEnvironments/__init__.py -------------------------------------------------------------------------------- /CustomEnvironments/stochasticGridWorld.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import random 8 | import time 9 | 10 | import numpy as np 11 | 12 | from matplotlib import pyplot as plt 13 | 14 | import gym 15 | from gym import spaces 16 | 17 | 18 | 19 | ############################################################################### 20 | ################################ Global variables ############################# 21 | ############################################################################### 22 | 23 | # Default parameters for the environment configuration 24 | size = 7 25 | timeOut = 20 26 | 27 | # Parameters associated with stochasticity 28 | doubleProbability = 0.5 29 | stochasticRewards = True 30 | 31 | 32 | 33 | ############################################################################### 34 | ########################### Class StochasticGridWorld ######################### 35 | ############################################################################### 36 | 37 | class StochasticGridWorld(gym.Env): 38 | """ 39 | GOAL: Implementing a simple RL environment consisting of a 2D grid world 40 | where the agent has to reach a fixed objective while avoiding a trap, 41 | with potentially stochastic transitions and rewards. 42 | 43 | VARIABLES: - observation_space: RL environment observation space. 44 | - action_space: RL environment action space. 45 | - playerPosition: Position of the player (x, y). 46 | - trapPosition: Position of the trap (x, y). 47 | - targetPosition: Position of the target (x, y). 48 | - timeElapsed: Time elapsed. 49 | - state: RL state or observation. 50 | - reward: RL reward signal. 51 | - done: RL termination signal (episode). 52 | - info: Additional RL information. 53 | 54 | METHODS: - __init__: Initialization of the RL environment. 55 | - reset: Resetting of the RL environment. 56 | - step: Update the RL environment according to the agent's action. 57 | - render: Render graphically the current state of the RL environment. 58 | """ 59 | 60 | def __init__(self, size=size): 61 | """ 62 | GOAL: Perform the initialization of the RL environment. 63 | 64 | INPUTS: - size: Size of the square grid world. 
65 | 66 | OUTPUTS: / 67 | """ 68 | 69 | super(StochasticGridWorld, self).__init__() 70 | 71 | # Initialize the random number generator (seeded with the current time) 72 | random.seed(time.time()) 73 | 74 | # Definition of the observation/state and action spaces 75 | self.observation_space = spaces.Box(low=0, high=size-1, shape=(2, 1), dtype=np.uint8) 76 | self.action_space = spaces.Discrete(4) 77 | self.size = size 78 | 79 | # Initialization of the trap and target positions 80 | self.trapPosition = [int(self.size/2), int(self.size/2)] 81 | self.targetPosition = [int(self.size/2), self.size-1] 82 | 83 | # Initialization of the player position 84 | x = int(random.random() * (self.size-1)) 85 | y = int(random.random() * (self.size-1)) 86 | self.playerPosition = [x, y] 87 | while self.playerPosition == self.targetPosition or self.playerPosition == self.trapPosition: 88 | x = int(random.random() * (self.size-1)) 89 | y = int(random.random() * (self.size-1)) 90 | self.playerPosition = [x, y] 91 | 92 | # Initialization of the time elapsed 93 | self.timeElapsed = 0 94 | 95 | # Initialization of the RL variables 96 | self.state = np.array([self.playerPosition[0], self.playerPosition[1]]) 97 | self.reward = 0. 98 | self.done = 0 99 | self.info = {} 100 | 101 | 102 | def reset(self): 103 | """ 104 | GOAL: Perform a reset of the RL environment. 105 | 106 | INPUTS: / 107 | 108 | OUTPUTS: - state: RL state or observation. 109 | """ 110 | 111 | # Reset of the player position and time elapsed 112 | x = int(random.random() * (self.size-1)) 113 | y = int(random.random() * (self.size-1)) 114 | self.playerPosition = [x, y] 115 | while self.playerPosition == self.targetPosition or self.playerPosition == self.trapPosition: 116 | x = int(random.random() * (self.size-1)) 117 | y = int(random.random() * (self.size-1)) 118 | self.playerPosition = [x, y] 119 | self.timeElapsed = 0 120 | 121 | # Reset of the RL variables 122 | self.state = np.array([self.playerPosition[0], self.playerPosition[1]]) 123 | self.reward = 0. 124 | self.done = 0 125 | self.info = {} 126 | 127 | return self.state 128 | 129 | 130 | def step(self, action): 131 | """ 132 | GOAL: Update the RL environment according to the agent's action. 133 | 134 | INPUTS: - action: RL action outputted by the agent. 135 | 136 | OUTPUTS: - state: RL state or observation. 137 | - reward: RL reward signal. 138 | - done: RL termination signal. 139 | - info: Additional RL information.
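A minimal usage sketch (illustrative only, relying solely on the interface defined in this file): the transition is stochastic because the selected move is applied over one or two cells with equal probability, and the reward is drawn from a Gaussian when stochasticRewards is enabled.

            env = StochasticGridWorld(size=7)
            state = env.reset()
            nextState, reward, done, info = env.step(0)   # move right by 1 or 2 cells, clipped at the grid border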
140 | """ 141 | 142 | # Stochasticity associated with the next move of the agent 143 | rand = random.random() 144 | if rand > doubleProbability: 145 | moveRange = 1 146 | else: 147 | moveRange = 2 148 | 149 | # Go right 150 | if action == 0: 151 | self.playerPosition[0] = min(self.playerPosition[0]+moveRange, self.size-1) 152 | # Go down 153 | elif action == 1: 154 | self.playerPosition[1] = max(self.playerPosition[1]-moveRange, 0) 155 | # Go left 156 | elif action == 2: 157 | self.playerPosition[0] = max(self.playerPosition[0]-moveRange, 0) 158 | # Go up 159 | elif action == 3: 160 | self.playerPosition[1] = min(self.playerPosition[1]+moveRange, self.size-1) 161 | # Invalid action 162 | else: 163 | print("Error: invalid action...") 164 | 165 | # Incrementation of the time elapsed 166 | self.timeElapsed += 1 167 | 168 | # Assign the appropriate RL reward 169 | if stochasticRewards: 170 | self.reward = np.random.normal(loc=0.0, scale=0.1) 171 | else: 172 | self.reward = 0.0 173 | if self.playerPosition == self.targetPosition: 174 | if stochasticRewards: 175 | self.reward = np.random.normal(loc=1.0, scale=0.1) 176 | else: 177 | self.reward = 1.0 178 | self.done = 1 179 | elif self.playerPosition == self.trapPosition: 180 | if stochasticRewards: 181 | self.reward = np.random.normal(loc=-1.0, scale=0.1) 182 | else: 183 | self.reward = -1.0 184 | self.done = 1 185 | 186 | # Check if the time elapsed reaches the time limit 187 | if self.timeElapsed >= timeOut: 188 | self.done = 1 189 | 190 | # Update of the RL state 191 | self.state = np.array([self.playerPosition[0], self.playerPosition[1]]) 192 | 193 | # Return of the RL variables 194 | return self.state, self.reward, self.done, self.info 195 | 196 | 197 | def render(self, mode='human'): 198 | """ 199 | GOAL: Render graphically the current state of the RL environment. 200 | 201 | INPUTS: / 202 | 203 | OUTPUTS: / 204 | """ 205 | 206 | fig = plt.figure(figsize=(8, 8)) 207 | ax = fig.gca() 208 | ax.set_xticks(np.arange(0, self.size+1, 1)) 209 | ax.set_yticks(np.arange(0, self.size+1, 1)) 210 | ax.set(xlim=(0, self.size), ylim=(0, self.size)) 211 | plt.scatter(self.playerPosition[0]+0.5, self.playerPosition[1]+0.5, s=100, color='blue') 212 | plt.scatter(self.targetPosition[0]+0.5, self.targetPosition[1]+0.5, s=100, color='green') 213 | plt.scatter(self.trapPosition[0]+0.5, self.trapPosition[1]+0.5, s=100, color='red') 214 | plt.grid() 215 | text = ''.join(['Time elapsed: ', str(self.timeElapsed)]) 216 | plt.text(0, self.size+0.2, text, fontsize=12) 217 | plt.show() 218 | #plt.savefig("Figures/Distributions/StochasticGridWorldState.pdf", format="pdf") 219 | 220 | 221 | def setState(self, state): 222 | """ 223 | GOAL: Reset the RL environment and set a specific initial state. 224 | 225 | INPUTS: - state: Information about the state to set. 226 | 227 | OUTPUTS: - state: RL state of the environment. 
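A minimal sketch (illustrative only), e.g. to inspect the environment from a chosen cell:

            env = StochasticGridWorld()
            state = env.setState([2, 3])   # agent placed at x=2, y=3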
228 | """ 229 | 230 | # Reset of the environment 231 | self.reset() 232 | 233 | # Set the initial state as specified 234 | self.timeElapsed = 0 235 | self.playerPosition = [state[0], state[1]] 236 | self.state = np.array([self.playerPosition[0], self.playerPosition[1]]) 237 | 238 | return self.state 239 | -------------------------------------------------------------------------------- /CustomEnvironments/stochasticGridWorldOptimal.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ##################### Class StochasticGridWorldOptimal ######################## 5 | ############################################################################### 6 | 7 | class StochasticGridWorldOptimal(): 8 | """ 9 | GOAL: Implementing the optimal policy associated with the stochastic grid 10 | world environment. 11 | 12 | VARIABLES: - environment: Stochastic grid world environment. 13 | 14 | METHODS: - __init__: Initialization of the class. - processState: Preprocessing of the RL state. - chooseAction: Choose the optimal RL action. 15 | """ 16 | 17 | def __init__(self, environment): 18 | """ 19 | GOAL: Perform the initialization of the class. 20 | 21 | INPUTS: - environment: Stochastic grid world environment considered. 22 | 23 | OUTPUTS: / 24 | 25 | """ 26 | 27 | # Initialization of important variables 28 | self.environment = environment 29 | self.size = self.environment.size 30 | self.trapPosition = self.environment.trapPosition 31 | self.targetPosition = self.environment.targetPosition 32 | 33 | 34 | def processState(self, state): 35 | """ 36 | GOAL: Potentially process the RL state returned by the environment. 37 | 38 | INPUTS: - state: RL state returned by the environment. 39 | 40 | OUTPUTS: - state: RL state processed. 41 | """ 42 | 43 | return state 44 | 45 | 46 | def chooseAction(self, state, plot=False): 47 | """ 48 | GOAL: Choose the optimal RL action. 49 | 50 | INPUTS: - state: RL state returned by the environment. 51 | - plot: False, because not supported. 52 | 53 | OUTPUTS: - action: RL action selected.
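A minimal sketch (illustrative only) pairing this heuristic agent with the environment defined in stochasticGridWorld.py:

            env = StochasticGridWorld()
            expert = StochasticGridWorldOptimal(env)
            state = env.reset()
            action = expert.chooseAction(expert.processState(state))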
54 | """ 55 | 56 | # Retrieve the coordinates of the agent 57 | x = state[0] 58 | y = state[1] 59 | 60 | # Implementation of the optimal policy 61 | if x == self.targetPosition[0] and y < self.trapPosition[1]: 62 | action = 0 63 | elif x == self.targetPosition[0] and y > self.trapPosition[1]: 64 | action = 3 65 | elif y == self.targetPosition[1] and x < self.targetPosition[0]: 66 | action = 0 67 | elif y == self.targetPosition[1] and x > self.targetPosition[0]: 68 | action = 2 69 | elif (x < self.targetPosition[0] or x > self.targetPosition[0]) and y < (self.targetPosition[1]-1): 70 | action = 3 71 | elif y == (self.targetPosition[1]-1) and y > self.trapPosition[1] and x < self.targetPosition[0]: 72 | action = 0 73 | elif y == (self.targetPosition[1]-1) and y > self.trapPosition[1] and x > self.targetPosition[0]: 74 | action = 2 75 | else: 76 | action = 3 77 | 78 | # Return of the RL action selected 79 | return action 80 | -------------------------------------------------------------------------------- /Experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/Experiments/__init__.py -------------------------------------------------------------------------------- /FQF.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import math 8 | 9 | from matplotlib import pyplot as plt 10 | 11 | import torch 12 | import torch.optim as optim 13 | 14 | from replayMemory import ReplayMemory 15 | 16 | from Models.FQF_Model import FQF_Model 17 | from Models.FQF_Model_Atari import FQF_Model_Atari 18 | from Models.FQF_Model_Bis import FQF_Model_Bis 19 | 20 | from DQN import DQN 21 | 22 | 23 | 24 | ############################################################################### 25 | ################################## Class FQF ################################## 26 | ############################################################################### 27 | 28 | class FQF(DQN): 29 | """ 30 | GOAL: Implementing the FQF Deep Reinforcement Learning algorithm. 31 | 32 | VARIABLES: - device: Hardware specification (CPU or GPU). 33 | - gamma: Discount factor of the RL algorithm. 34 | - learningRate: Learning rate of the DL optimizer (ADAM). 35 | - epsilon: Epsilon value for the DL optimizer (ADAM). 36 | - targetNetworkUpdate: Update frequency of the target network. 37 | - learningUpdatePeriod: Frequency of the learning procedure. 38 | - batchSize: Size of the batch to sample from the replay memory. 39 | - capacity: Capacity of the replay memory. 40 | - replayMemory: Experience Replay memory. 41 | - rewardClipping: Clipping of the RL rewards. 42 | - gradientClipping: Clipping of the training loss. 43 | - optimizer: DL optimizer (ADAM). 44 | - epsilonStart: Initial value of epsilon (Epsilon-Greedy). 45 | - epsilonEnd: Final value of epsilon (Epsilon-Greedy). 46 | - epsilonDecay: Exponential decay of epsilon (Epsilon-Greedy). 47 | - epsilonTest: Test value of epsilon (Epsilon-Greedy). 48 | - epsilonValue: Current value of epsilon (Epsilon-Greedy). 49 | - policyNetwork: Deep Neural Network representing the info used by the RL policy. 
50 | - targetNetwork: Deep Neural Network representing the target network. 51 | 52 | METHODS: - __init__: Initialization of the RL algorithm. 53 | - chooseAction: Choose a valid action based on the current state 54 | observed, according to the RL policy learned. 55 | - learning: Execute the RL algorithm learning procedure. 56 | """ 57 | 58 | def __init__(self, observationSpace, actionSpace, environment, 59 | parametersFileName='', reporting=True): 60 | """ 61 | GOAL: Initializing the RL agent based on the FQF Deep Reinforcement Learning 62 | algorithm, by setting up the algorithm parameters as well as 63 | the Deep Neural Networks. 64 | 65 | INPUTS: - observationSpace: RL observation space. 66 | - actionSpace: RL action space. 67 | - environment: Name of the RL environment. 68 | - parametersFileName: Name of the JSON parameters file. 69 | - reporting: Enable the reporting of the results. 70 | 71 | OUTPUTS: / 72 | """ 73 | 74 | # Initialization of the DQN parent class 75 | DQN.__init__(self, observationSpace, actionSpace, environment, parametersFileName, False) 76 | 77 | # Setting of the parameters 78 | if parametersFileName == '': 79 | parametersFileName = ''.join(['Parameters/parameters_FQF_', str(environment), '.json']) 80 | parameters = self.readParameters(parametersFileName) 81 | 82 | # Set the device for DNN computations (CPU or GPU) 83 | self.device = torch.device('cuda:'+str(parameters['GPUNumber']) if torch.cuda.is_available() else 'cpu') 84 | 85 | # Set the general parameters of the RL algorithm 86 | self.gamma = parameters['gamma'] 87 | self.learningRate = parameters['learningRate'] 88 | self.epsilon = parameters['epsilon'] 89 | self.targetUpdatePeriod = parameters['targetUpdatePeriod'] 90 | self.learningUpdatePeriod = parameters['learningUpdatePeriod'] 91 | self.rewardClipping = parameters['rewardClipping'] 92 | self.gradientClipping = parameters['gradientClipping'] 93 | 94 | # Set the Experience Replay mechanism 95 | self.batchSize = parameters['batchSize'] 96 | self.capacity = parameters['capacity'] 97 | self.replayMemory = ReplayMemory(self.capacity) 98 | 99 | # Set the distribution support 100 | self.N = parameters['N'] 101 | self.K = parameters['K'] 102 | self.NCos = parameters['NCos'] 103 | self.kappa = 1.0 104 | 105 | # Set the two Deep Neural Networks of the RL algorithm (policy and target) 106 | self.atari = parameters['atari'] 107 | self.minatar = parameters['minatar'] 108 | if self.atari or self.minatar: 109 | self.policyNetwork = FQF_Model_Atari(observationSpace, actionSpace, self.NCos, self.device, minAtar=self.minatar).to(self.device) 110 | self.targetNetwork = FQF_Model_Atari(observationSpace, actionSpace, self.NCos, self.device, minAtar=self.minatar).to(self.device) 111 | stateEmbedding = self.policyNetwork.getEmbeddingSize() 112 | else: 113 | self.policyNetwork = FQF_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['stateEmbedding'], self.NCos, self.device).to(self.device) 114 | self.targetNetwork = FQF_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['stateEmbedding'], self.NCos, self.device).to(self.device) 115 | stateEmbedding = parameters['stateEmbedding'] 116 | self.targetNetwork.load_state_dict(self.policyNetwork.state_dict()) 117 | 118 | # Set the Deep Learning optimizer 119 | self.optimizer = optim.Adam(self.policyNetwork.parameters(), lr=self.learningRate, eps=self.epsilon) 120 | 121 | # Set the Fraction Proposal Network of the FQF algorithm + associated parameters 122 | self.fractionProposalNetwork 
= FQF_Model_Bis(stateEmbedding, self.N, self.device).to(self.device) 123 | self.optimizerFPN = optim.RMSprop(self.fractionProposalNetwork.parameters(), lr=0.000000001, alpha=0.95, eps=0.00001) 124 | self.entropyCoefficient = 0.001 125 | 126 | # Set the Epsilon-Greedy exploration technique 127 | self.epsilonStart = parameters['epsilonStart'] 128 | self.epsilonEnd = parameters['epsilonEnd'] 129 | self.epsilonDecay = parameters['epsilonDecay'] 130 | self.epsilonTest = parameters['epsilonTest'] 131 | self.epsilonValue = lambda iteration: self.epsilonEnd + (self.epsilonStart - self.epsilonEnd) * math.exp(-1 * iteration / self.epsilonDecay) 132 | 133 | # Initialization of the experiment folder and tensorboard writer 134 | self.initReporting(parameters, 'FQF') 135 | 136 | 137 | def chooseAction(self, state, plot=False): 138 | """ 139 | GOAL: Choose a valid RL action from the action space according to the 140 | RL policy as well as the current RL state observed. 141 | 142 | INPUTS: - state: RL state returned by the environment. 143 | - plot: Enable the plotting of the random returns distributions. 144 | 145 | OUTPUTS: - action: RL action chosen from the action space. 146 | """ 147 | 148 | # Choose the best action based on the RL policy 149 | with torch.no_grad(): 150 | state = torch.from_numpy(state).float().to(self.device).unsqueeze(0) 151 | stateEmbedding = self.policyNetwork.embedding(state) 152 | _, tausBis, _ = self.fractionProposalNetwork(stateEmbedding) 153 | quantiles = self.policyNetwork(state, tausBis, stateEmbedding) 154 | QValues = quantiles.mean(2) 155 | _, action = QValues.max(1) 156 | 157 | # If required, plot the return distribution associated with each action 158 | if plot: 159 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 160 | fig = plt.figure() 161 | ax = fig.add_subplot() 162 | taus = torch.linspace(0.0, 1.0, 10000).to(self.device) 163 | quantiles = self.policyNetwork(state, taus.unsqueeze(0), stateEmbedding) 164 | QValues = quantiles.mean(2) 165 | taus = taus.cpu().numpy() 166 | quantiles = quantiles.squeeze(0).cpu().numpy() 167 | QValues = QValues.squeeze(0).cpu().numpy() 168 | for a in range(self.actionSpace): 169 | ax.plot(taus, quantiles[a], linestyle='-', label=''.join(['Action ', str(a), ' random return Z']), color=colors[a]) 170 | ax.axhline(y=QValues[a], linewidth=2, linestyle='--', label=''.join(['Action ', str(a), ' expected return Q']), color=colors[a]) 171 | ax.set_xlabel('Quantile fraction') 172 | ax.set_ylabel('Quantile Function (QF)') 173 | ax.legend() 174 | plt.show() 175 | 176 | return action.item() 177 | 178 | 179 | def learning(self): 180 | """ 181 | GOAL: Sample a batch of past experiences and learn from it 182 | by updating the Reinforcement Learning policy. 183 | 184 | INPUTS: / 185 | 186 | OUTPUTS: - loss: Loss of the learning procedure. 
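Note on the quantile fractions (sketch under the standard FQF formulation, which the tensor shapes used below assume): the fraction proposal network provides N+1 increasing fractions taus in [0, 1], their N midpoints tausBis and an entropy term, and the return quantiles are evaluated at the midpoints:

            tausBis[:, i] = (taus[:, i] + taus[:, i+1]) / 2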
187 | """ 188 | 189 | # Check that the replay memory is filled enough 190 | if (len(self.replayMemory) >= self.batchSize): 191 | 192 | # Sample a batch of experiences from the replay memory 193 | batch = self.dataLoaderIter.next() 194 | state = batch[0].float().to(self.device) 195 | action = batch[1].long().to(self.device) 196 | reward = batch[2].float().to(self.device) 197 | nextState = batch[3].float().to(self.device) 198 | done = batch[4].float().to(self.device) 199 | 200 | # Computation of the current return distribution 201 | stateEmbedding = self.policyNetwork.embedding(state) 202 | taus, tausBis, entropy = self.fractionProposalNetwork(stateEmbedding) 203 | quantiles = self.policyNetwork(state, tausBis, stateEmbedding) 204 | actionBis = action.view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.N) 205 | quantiles = quantiles.gather(1, actionBis).squeeze(1) 206 | 207 | # Computation of the Fractional loss for the FPN 208 | with torch.no_grad(): 209 | quantilesBis = self.policyNetwork(state, taus[:, 1:-1], stateEmbedding) 210 | actionBis = action.view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.N-1) 211 | quantilesBis = quantilesBis.gather(1, actionBis).squeeze(1) 212 | gradients1 = quantilesBis - quantiles[:, :-1] 213 | gradients2 = quantilesBis - quantiles[:, 1:] 214 | flag1 = quantilesBis > torch.cat([quantiles[:, :1], quantilesBis[:, :-1]], dim=1) 215 | flag2 = quantilesBis < torch.cat([quantilesBis[:, 1:], quantiles[:, -1:]], dim=1) 216 | gradients = (torch.where(flag1, gradients1, - gradients1) + torch.where(flag2, gradients2, -gradients2)).view(self.batchSize, self.N-1) 217 | fractionalLoss = (gradients * taus[:, 1:-1]).sum(dim=1).mean() 218 | fractionalLoss += self.entropyCoefficient * entropy.mean() 219 | 220 | # Computation of the new distribution to be learnt by the policy DNN 221 | with torch.no_grad(): 222 | nextStateEmbedding = self.targetNetwork.embedding(nextState) 223 | nextQuantiles = self.targetNetwork(nextState, tausBis, nextStateEmbedding) 224 | nextAction = nextQuantiles.mean(2).max(1)[1].view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.N) 225 | nextQuantiles = nextQuantiles.gather(1, nextAction).squeeze(1) 226 | targetQuantiles = reward.unsqueeze(1) + self.gamma * nextQuantiles * (1 - done.unsqueeze(1)) 227 | 228 | # Computation of the quantile huber loss 229 | difference = targetQuantiles.unsqueeze(1) - quantiles.unsqueeze(2) 230 | error = difference.abs() 231 | loss = torch.where(error <= self.kappa, 0.5 * error.pow(2), self.kappa * (error - (0.5 * self.kappa))) 232 | loss = (tausBis.unsqueeze(2) - (difference < 0).float()).abs() * loss/self.kappa 233 | loss = loss.mean(1).sum(1).mean() 234 | 235 | # Update of the Fraction Proposal Network parameters 236 | self.optimizerFPN.zero_grad() 237 | fractionalLoss.backward(retain_graph=True) 238 | self.optimizerFPN.step() 239 | 240 | # Computation of the gradients 241 | self.optimizer.zero_grad() 242 | loss.backward() 243 | 244 | # Gradient Clipping 245 | torch.nn.utils.clip_grad_norm_(self.policyNetwork.parameters(), self.gradientClipping) 246 | 247 | # Perform the Deep Neural Network optimization 248 | self.optimizer.step() 249 | 250 | return loss.item() 251 | -------------------------------------------------------------------------------- /Figures/Distributions/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/Figures/Distributions/__init__.py -------------------------------------------------------------------------------- /Figures/Performance/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/Figures/Performance/__init__.py -------------------------------------------------------------------------------- /Figures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/Figures/__init__.py -------------------------------------------------------------------------------- /IQN.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import math 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from matplotlib import pyplot as plt 13 | 14 | import torch 15 | import torch.optim as optim 16 | 17 | from replayMemory import ReplayMemory 18 | 19 | from Models.IQN_Model import IQN_Model 20 | from Models.IQN_Model_Atari import IQN_Model_Atari 21 | 22 | from DQN import DQN 23 | 24 | 25 | 26 | ############################################################################### 27 | ################################## Class IQN ################################## 28 | ############################################################################### 29 | 30 | class IQN(DQN): 31 | """ 32 | GOAL: Implementing the IQN Deep Reinforcement Learning algorithm. 33 | 34 | VARIABLES: - device: Hardware specification (CPU or GPU). 35 | - gamma: Discount factor of the RL algorithm. 36 | - learningRate: Learning rate of the DL optimizer (ADAM). 37 | - epsilon: Epsilon value for the DL optimizer (ADAM). 38 | - targetNetworkUpdate: Update frequency of the target network. 39 | - learningUpdatePeriod: Frequency of the learning procedure. 40 | - batchSize: Size of the batch to sample from the replay memory. 41 | - capacity: Capacity of the replay memory. 42 | - replayMemory: Experience Replay memory. 43 | - rewardClipping: Clipping of the RL rewards. 44 | - gradientClipping: Clipping of the training loss. 45 | - optimizer: DL optimizer (ADAM). 46 | - epsilonStart: Initial value of epsilon (Epsilon-Greedy). 47 | - epsilonEnd: Final value of epsilon (Epsilon-Greedy). 48 | - epsilonDecay: Exponential decay of epsilon (Epsilon-Greedy). 49 | - epsilonTest: Test value of epsilon (Epsilon-Greedy). 50 | - epsilonValue: Current value of epsilon (Epsilon-Greedy). 51 | - policyNetwork: Deep Neural Network representing the info used by the RL policy. 52 | - targetNetwork: Deep Neural Network representing the target network. 53 | 54 | METHODS: - __init__: Initialization of the RL algorithm. 55 | - chooseAction: Choose a valid action based on the current state 56 | observed, according to the RL policy learned. 57 | - learning: Execute the RL algorithm learning procedure. 
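The epsilon-greedy schedule decays exponentially, as set up in __init__ below:

            epsilonValue(i) = epsilonEnd + (epsilonStart - epsilonEnd) * exp(-i / epsilonDecay)

For illustrative values epsilonStart=1.0, epsilonEnd=0.01 and epsilonDecay=10000 (the actual values come from the JSON parameters files), epsilon is roughly 0.37 after 10000 iterations and 0.14 after 20000 iterations.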
58 | """ 59 | 60 | def __init__(self, observationSpace, actionSpace, environment, 61 | parametersFileName='', reporting=True): 62 | """ 63 | GOAL: Initializing the RL agent based on the IQN Deep Reinforcement Learning 64 | algorithm, by setting up the algorithm parameters as well as 65 | the Deep Neural Networks. 66 | 67 | INPUTS: - observationSpace: RL observation space. 68 | - actionSpace: RL action space. 69 | - environment: Name of the RL environment. 70 | - parametersFileName: Name of the JSON parameters file. 71 | - reporting: Enable the reporting of the results. 72 | 73 | OUTPUTS: / 74 | """ 75 | 76 | # Initialization of the DQN parent class 77 | DQN.__init__(self, observationSpace, actionSpace, environment, parametersFileName, False) 78 | 79 | # Setting of the parameters 80 | if parametersFileName == '': 81 | parametersFileName = ''.join(['Parameters/parameters_IQN_', str(environment), '.json']) 82 | parameters = self.readParameters(parametersFileName) 83 | 84 | # Set the device for DNN computations (CPU or GPU) 85 | self.device = torch.device('cuda:'+str(parameters['GPUNumber']) if torch.cuda.is_available() else 'cpu') 86 | 87 | # Set the general parameters of the RL algorithm 88 | self.gamma = parameters['gamma'] 89 | self.learningRate = parameters['learningRate'] 90 | self.epsilon = parameters['epsilon'] 91 | self.targetUpdatePeriod = parameters['targetUpdatePeriod'] 92 | self.learningUpdatePeriod = parameters['learningUpdatePeriod'] 93 | self.rewardClipping = parameters['rewardClipping'] 94 | self.gradientClipping = parameters['gradientClipping'] 95 | 96 | # Set the Experience Replay mechanism 97 | self.batchSize = parameters['batchSize'] 98 | self.capacity = parameters['capacity'] 99 | self.replayMemory = ReplayMemory(self.capacity) 100 | 101 | # Set the distribution support 102 | self.N = parameters['N'] 103 | self.K = parameters['K'] 104 | self.NCos = parameters['NCos'] 105 | self.kappa = 1.0 106 | 107 | # Set the two Deep Neural Networks of the RL algorithm (policy and target) 108 | self.atari = parameters['atari'] 109 | self.minatar = parameters['minatar'] 110 | if self.atari or self.minatar: 111 | self.policyNetwork = IQN_Model_Atari(observationSpace, actionSpace, self.NCos, self.device, minAtar=self.minatar).to(self.device) 112 | self.targetNetwork = IQN_Model_Atari(observationSpace, actionSpace, self.NCos, self.device, minAtar=self.minatar).to(self.device) 113 | else: 114 | self.policyNetwork = IQN_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['stateEmbedding'], self.NCos, self.device).to(self.device) 115 | self.targetNetwork = IQN_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['stateEmbedding'], self.NCos, self.device).to(self.device) 116 | self.targetNetwork.load_state_dict(self.policyNetwork.state_dict()) 117 | 118 | # Set the Deep Learning optimizer 119 | self.optimizer = optim.Adam(self.policyNetwork.parameters(), lr=self.learningRate, eps=self.epsilon) 120 | 121 | # Set the Epsilon-Greedy exploration technique 122 | self.epsilonStart = parameters['epsilonStart'] 123 | self.epsilonEnd = parameters['epsilonEnd'] 124 | self.epsilonDecay = parameters['epsilonDecay'] 125 | self.epsilonTest = parameters['epsilonTest'] 126 | self.epsilonValue = lambda iteration: self.epsilonEnd + (self.epsilonStart - self.epsilonEnd) * math.exp(-1 * iteration / self.epsilonDecay) 127 | 128 | # Initialization of the experiment folder and tensorboard writer 129 | self.initReporting(parameters, 'IQN') 130 | 131 | 132 | def 
chooseAction(self, state, plot=False): 133 | """ 134 | GOAL: Choose a valid RL action from the action space according to the 135 | RL policy as well as the current RL state observed. 136 | 137 | INPUTS: - state: RL state returned by the environment. 138 | - plot: Enable the plotting of the random returns distributions. 139 | 140 | OUTPUTS: - action: RL action chosen from the action space. 141 | """ 142 | 143 | # Choose the best action based on the RL policy 144 | with torch.no_grad(): 145 | state = torch.from_numpy(state).float().to(self.device).unsqueeze(0) 146 | quantiles, _ = self.policyNetwork(state, self.K) 147 | QValues = quantiles.mean(2) 148 | _, action = QValues.max(1) 149 | 150 | # If required, plot the return distribution associated with each action 151 | if plot: 152 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 153 | fig = plt.figure() 154 | ax = fig.add_subplot() 155 | quantiles, taus = self.policyNetwork(state, 10000, False) 156 | taus = taus[0].squeeze(1).cpu().numpy() 157 | quantiles = quantiles.squeeze(0).cpu().numpy() 158 | QValues = QValues.squeeze(0).cpu().numpy() 159 | for a in range(self.actionSpace): 160 | ax.plot(taus, quantiles[a], linestyle='-', label=''.join(['Action ', str(a), ' random return Z']), color=colors[a]) 161 | ax.axhline(y=QValues[a], linewidth=2, linestyle='--', label=''.join(['Action ', str(a), ' expected return Q']), color=colors[a]) 162 | ax.set_xlabel('Quantile fraction') 163 | ax.set_ylabel('Quantile Function (QF)') 164 | ax.legend() 165 | plt.show() 166 | 167 | return action.item() 168 | 169 | 170 | def learning(self): 171 | """ 172 | GOAL: Sample a batch of past experiences and learn from it 173 | by updating the Reinforcement Learning policy. 174 | 175 | INPUTS: / 176 | 177 | OUTPUTS: - loss: Loss of the learning procedure. 
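The loss computed below is the quantile Huber loss of quantile regression: for each pairwise temporal-difference error delta = targetQuantile - quantile and quantile fraction tau, the elementary term is

            |tau - 1{delta < 0}| * L_kappa(delta) / kappa

where L_kappa is the Huber loss with threshold kappa = 1.0 (self.kappa). The pairwise terms are then reduced over the two quantile dimensions (a mean and a sum) and finally averaged over the batch, as done in the code below.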
178 | """ 179 | 180 | # Check that the replay memory is filled enough 181 | if (len(self.replayMemory) >= self.batchSize): 182 | 183 | # Sample a batch of experiences from the replay memory 184 | batch = self.dataLoaderIter.next() 185 | state = batch[0].float().to(self.device) 186 | action = batch[1].long().to(self.device) 187 | reward = batch[2].float().to(self.device) 188 | nextState = batch[3].float().to(self.device) 189 | done = batch[4].float().to(self.device) 190 | 191 | # Computation of the current return distribution 192 | quantiles, taus = self.policyNetwork(state, self.N) 193 | action = action.view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.N) 194 | quantiles = quantiles.gather(1, action).squeeze(1) 195 | 196 | # Computation of the new distribution to be learnt by the policy DNN 197 | with torch.no_grad(): 198 | nextQuantiles, _ = self.targetNetwork(nextState, self.N) 199 | nextAction = nextQuantiles.mean(2).max(1)[1].view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.N) 200 | nextQuantiles = nextQuantiles.gather(1, nextAction).squeeze(1) 201 | targetQuantiles = reward.unsqueeze(1) + self.gamma * nextQuantiles * (1 - done.unsqueeze(1)) 202 | 203 | # Computation of the loss 204 | difference = targetQuantiles.unsqueeze(1) - quantiles.unsqueeze(2) 205 | error = difference.abs() 206 | loss = torch.where(error <= self.kappa, 0.5 * error.pow(2), self.kappa * (error - (0.5 * self.kappa))) 207 | loss = (taus - (difference < 0).float()).abs() * loss/self.kappa 208 | loss = loss.mean(1).sum(1).mean() 209 | 210 | # Without Huber loss (to be tested) 211 | lossMSE = False 212 | if lossMSE: 213 | difference = targetQuantiles - quantiles 214 | error = difference.pow(2) 215 | loss = error.mean(1).sum() 216 | 217 | # Computation of the gradients 218 | self.optimizer.zero_grad() 219 | loss.backward() 220 | 221 | # Gradient Clipping 222 | torch.nn.utils.clip_grad_norm_(self.policyNetwork.parameters(), self.gradientClipping) 223 | 224 | # Perform the Deep Neural Network optimization 225 | self.optimizer.step() 226 | 227 | return loss.item() 228 | -------------------------------------------------------------------------------- /Models/CDQN_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.FeedforwardDNN import FeedForwardDNN 13 | 14 | 15 | 16 | ############################################################################### 17 | ################################ Class CDQN_Model ############################# 18 | ############################################################################### 19 | 20 | class CDQN_Model(nn.Module): 21 | """ 22 | GOAL: Implementing the DL model for the CDQN distributional RL algorithm. 23 | 24 | VARIABLES: - network: Deep Neural Network. 25 | 26 | METHODS: - __init__: Initialization of the Deep Neural Network. 27 | - forward: Forward pass of the Deep Neural Network. 28 | """ 29 | 30 | def __init__(self, numberOfInputs, numberOfOutputs, structure, numberOfAtoms=51): 31 | """ 32 | GOAL: Defining and initializing the Deep Neural Network. 
33 | 34 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 35 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 36 | - structure: Structure of the Deep Neural Network (hidden layers). 37 | - numberOfAtoms: Number of atoms for the support (see C51 algorithm). 38 | 39 | OUTPUTS: / 40 | """ 41 | 42 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 43 | super(CDQN_Model, self).__init__() 44 | 45 | # Initialization of useful variables 46 | self.numberOfAtoms = numberOfAtoms 47 | self.numberOfActions = int(numberOfOutputs/numberOfAtoms) 48 | 49 | # Initialization of the Deep Neural Network 50 | self.network = FeedForwardDNN(numberOfInputs, numberOfOutputs, structure) 51 | 52 | 53 | def forward(self, x): 54 | """ 55 | GOAL: Implementing the forward pass of the Deep Neural Network. 56 | 57 | INPUTS: - x: Input of the Deep Neural Network. 58 | 59 | OUTPUTS: - y: Output of the Deep Neural Network. 60 | """ 61 | 62 | x = self.network(x) 63 | y = F.softmax(x.view(-1, self.numberOfActions, self.numberOfAtoms), dim=-1) 64 | return y.clamp(min=1e-6) 65 | -------------------------------------------------------------------------------- /Models/CDQN_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.DNN_Atari import DNN_Atari 13 | from Models.DNN_MinAtar import DNN_MinAtar 14 | 15 | 16 | 17 | ############################################################################### 18 | ############################ Class CDQN_Model_Atari ########################### 19 | ############################################################################### 20 | 21 | class CDQN_Model_Atari(nn.Module): 22 | """ 23 | GOAL: Implementing the DL model for the CDQN distributional RL algorithm. 24 | 25 | VARIABLES: - network: Deep Neural Network. 26 | 27 | METHODS: - __init__: Initialization of the Deep Neural Network. 28 | - forward: Forward pass of the Deep Neural Network. 29 | """ 30 | 31 | def __init__(self, numberOfInputs, numberOfOutputs, numberOfAtoms, minAtar=False): 32 | """ 33 | GOAL: Defining and initializing the Deep Neural Network. 34 | 35 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 36 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 37 | - numberOfAtoms: Number of atoms for the support (see C51 algorithm). 38 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 39 | 40 | OUTPUTS: / 41 | """ 42 | 43 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 44 | super(CDQN_Model_Atari, self).__init__() 45 | 46 | # Initialization of useful variables 47 | self.numberOfAtoms = numberOfAtoms 48 | self.numberOfActions = int(numberOfOutputs/numberOfAtoms) 49 | 50 | # Initialization of the Deep Neural Network 51 | if minAtar: 52 | self.network = DNN_MinAtar(numberOfInputs, numberOfOutputs) 53 | else: 54 | self.network = DNN_Atari(numberOfInputs, numberOfOutputs) 55 | 56 | 57 | def forward(self, x): 58 | """ 59 | GOAL: Implementing the forward pass of the Deep Neural Network. 60 | 61 | INPUTS: - x: Input of the Deep Neural Network. 
62 | 63 | OUTPUTS: - y: Output of the Deep Neural Network. 64 | """ 65 | 66 | x = self.network(x) 67 | y = F.softmax(x.view(-1, self.numberOfActions, self.numberOfAtoms), dim=-1) 68 | return y.clamp(min=1e-6) 69 | -------------------------------------------------------------------------------- /Models/CNN_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | 13 | 14 | ############################################################################### 15 | ############################## Class CNN_Atari ################################ 16 | ############################################################################### 17 | 18 | class CNN_Atari(nn.Module): 19 | """ 20 | GOAL: Implementing the CNN part of the DNN designed for the DQN algorithm 21 | to successfully play Atari games. 22 | 23 | VARIABLES: - network: Convolutional Neural Network. 24 | 25 | METHODS: - __init__: Initialization of the Convolutional Neural Network. 26 | - forward: Forward pass of the Convolutional Neural Network. 27 | """ 28 | 29 | def __init__(self, numberOfInputs): 30 | """ 31 | GOAL: Defining and initializing the Convolutional Neural Network. 32 | 33 | INPUTS: - numberOfInputs: Number of inputs of the Convolutional Neural Network. 34 | 35 | OUTPUTS: / 36 | """ 37 | 38 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 39 | super(CNN_Atari, self).__init__() 40 | 41 | # Initialization of the Convolutional Neural Network 42 | self.network = nn.Sequential( 43 | nn.Conv2d(numberOfInputs, 32, kernel_size=8, stride=4), 44 | nn.ReLU(), 45 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 46 | nn.ReLU(), 47 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 48 | nn.ReLU() 49 | ) 50 | 51 | 52 | def getOutputSize(self): 53 | """ 54 | GOAL: Get the size of the Convolutional Neural Network output. 55 | 56 | INPUTS: / 57 | 58 | OUTPUTS: - size: Size of the Convolutional Neural Network. output. 59 | """ 60 | 61 | return self.network(torch.zeros(1, *(4, 84, 84))).view(1, -1).size(1) 62 | 63 | 64 | def forward(self, x): 65 | """ 66 | GOAL: Implementing the forward pass of the Convolutional Neural Network. 67 | 68 | INPUTS: - x: Input of the Convolutional Neural Network. 69 | 70 | OUTPUTS: - y: Output of the Convolutional Neural Network. 
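Worked example: for the standard DQN Atari input of shape (4, 84, 84), the three convolutions produce feature maps of spatial size 20, 9 and 7 ((84-8)/4+1, (20-4)/2+1 and (9-3)/1+1), so the flattened output holds 7 * 7 * 64 = 3136 features, which is the value returned by getOutputSize() above.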
71 | """ 72 | 73 | x = self.network(x) 74 | return x.view(x.size(0), -1) 75 | -------------------------------------------------------------------------------- /Models/CNN_MinAtar.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | 13 | 14 | ############################################################################### 15 | ############################# Class CNN_MinAtar ############################### 16 | ############################################################################### 17 | 18 | class CNN_MinAtar(nn.Module): 19 | """ 20 | GOAL: Implementing the CNN part of the DNN designed for the DQN algorithm 21 | to successfully play Atari games (MinAtar version). 22 | 23 | VARIABLES: - network: Convolutional Neural Network. 24 | 25 | METHODS: - __init__: Initialization of the Convolutional Neural Network. 26 | - forward: Forward pass of the Convolutional Neural Network. 27 | """ 28 | 29 | def __init__(self, numberOfInputs): 30 | """ 31 | GOAL: Defining and initializing the Convolutional Neural Network. 32 | 33 | INPUTS: - numberOfInputs: Number of inputs of the Convolutional Neural Network. 34 | 35 | OUTPUTS: / 36 | """ 37 | 38 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 39 | super(CNN_MinAtar, self).__init__() 40 | 41 | # Initialization of some variables 42 | self.channels = numberOfInputs 43 | self.size = 10 44 | self.filters = 16 45 | self.kernel = 3 46 | self.stride = 1 47 | 48 | # Initialization of the Convolutional Neural Network 49 | self.network = nn.Sequential( 50 | nn.Conv2d(self.channels, self.filters, self.kernel, self.stride), 51 | nn.ReLU() 52 | ) 53 | 54 | 55 | def getOutputSize(self): 56 | """ 57 | GOAL: Get the size of the Convolutional Neural Network output. 58 | 59 | INPUTS: / 60 | 61 | OUTPUTS: - size: Size of the Convolutional Neural Network. output. 62 | """ 63 | 64 | newSize = ((self.size - self.kernel)/self.stride) + 1 65 | return int(newSize * newSize * self.filters) 66 | 67 | 68 | def forward(self, x): 69 | """ 70 | GOAL: Implementing the forward pass of the Convolutional Neural Network. 71 | 72 | INPUTS: - x: Input of the Convolutional Neural Network. 73 | 74 | OUTPUTS: - y: Output of the Convolutional Neural Network. 
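Worked example: for the 10x10 MinAtar frames assumed above (self.size = 10), the single 3x3 convolution with stride 1 yields an 8x8 map with 16 filters, i.e. ((10-3)/1+1)^2 * 16 = 1024 flattened features, matching getOutputSize().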
75 | """ 76 | 77 | x = self.network(x) 78 | return x.view(x.size(0), -1) 79 | -------------------------------------------------------------------------------- /Models/DNN_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | # pylint: disable=E1101 9 | # pylint: disable=E1102 10 | 11 | from Models.FeedforwardDNN import FeedForwardDNN 12 | from Models.CNN_Atari import CNN_Atari 13 | 14 | 15 | 16 | ############################################################################### 17 | ############################## Class DNN_Atari ################################ 18 | ############################################################################### 19 | 20 | class DNN_Atari(nn.Module): 21 | """ 22 | GOAL: Implementing the orignal DNN designed for the DQN algorithm to 23 | succesfully play Atari games. 24 | 25 | VARIABLES: - network: Deep Neural Network. 26 | 27 | METHODS: - __init__: Initialization of the Deep Neural Network. 28 | - forward: Forward pass of the Deep Neural Network. 29 | """ 30 | 31 | def __init__(self, numberOfInputs, numberOfOutputs): 32 | """ 33 | GOAL: Defining and initializing the Deep Neural Network. 34 | 35 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 36 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 37 | 38 | OUTPUTS: / 39 | """ 40 | 41 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 42 | super(DNN_Atari, self).__init__() 43 | 44 | # Initialization of the Deep Neural Network. 45 | CNNOutputSize = CNN_Atari(numberOfInputs).getOutputSize() 46 | self.network = nn.Sequential( 47 | CNN_Atari(numberOfInputs), 48 | FeedForwardDNN(CNNOutputSize, numberOfOutputs, [512]) 49 | ) 50 | 51 | 52 | def forward(self, x): 53 | """ 54 | GOAL: Implementing the forward pass of the Deep Neural Network. 55 | 56 | INPUTS: - x: Input of the Deep Neural Network. 57 | 58 | OUTPUTS: - y: Output of the Deep Neural Network. 59 | """ 60 | 61 | return self.network(x) 62 | -------------------------------------------------------------------------------- /Models/DNN_MinAtar.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | # pylint: disable=E1101 9 | # pylint: disable=E1102 10 | 11 | from Models.FeedforwardDNN import FeedForwardDNN 12 | from Models.CNN_MinAtar import CNN_MinAtar 13 | 14 | 15 | 16 | ############################################################################### 17 | ############################# Class DNN_MinAtar ############################### 18 | ############################################################################### 19 | 20 | class DNN_MinAtar(nn.Module): 21 | """ 22 | GOAL: Implementing the orignal DNN designed for the DQN algorithm to 23 | succesfully play Atari games (MinAtar version). 24 | 25 | VARIABLES: - network: Deep Neural Network. 26 | 27 | METHODS: - __init__: Initialization of the Deep Neural Network. 
28 | - forward: Forward pass of the Deep Neural Network. 29 | """ 30 | 31 | def __init__(self, numberOfInputs, numberOfOutputs): 32 | """ 33 | GOAL: Defining and initializing the Deep Neural Network. 34 | 35 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 36 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 37 | 38 | OUTPUTS: / 39 | """ 40 | 41 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 42 | super(DNN_MinAtar, self).__init__() 43 | 44 | # Initialization of the Deep Neural Network. 45 | CNNOutputSize = CNN_MinAtar(numberOfInputs).getOutputSize() 46 | self.network = nn.Sequential( 47 | CNN_MinAtar(numberOfInputs), 48 | FeedForwardDNN(CNNOutputSize, numberOfOutputs, [128]) 49 | ) 50 | 51 | 52 | def forward(self, x): 53 | """ 54 | GOAL: Implementing the forward pass of the Deep Neural Network. 55 | 56 | INPUTS: - x: Input of the Deep Neural Network. 57 | 58 | OUTPUTS: - y: Output of the Deep Neural Network. 59 | """ 60 | 61 | return self.network(x) 62 | -------------------------------------------------------------------------------- /Models/FQF_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | # pylint: disable=E1101 11 | # pylint: disable=E1102 12 | 13 | from Models.FeedforwardDNN import FeedForwardDNN 14 | 15 | 16 | 17 | ############################################################################### 18 | ############################## Class FQF_Model ################################ 19 | ############################################################################### 20 | 21 | class FQF_Model(nn.Module): 22 | """ 23 | GOAL: Implementing the DL model for the FQF distributional RL algorithm 24 | (Implicit Quantile Network). 25 | 26 | VARIABLES: - network: Deep Neural Network. 27 | 28 | METHODS: - __init__: Initialization of the Deep Neural Network. 29 | - forward: Forward pass of the Deep Neural Network. 30 | """ 31 | 32 | def __init__(self, numberOfInputs, numberOfOutputs, structure, stateEmbedding, NCos=64, device='cpu'): 33 | """ 34 | GOAL: Defining and initializing the Deep Neural Network. 35 | 36 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 37 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 38 | - structure: Structure of the state embedding Deep Neural Network (hidden layers). 39 | - stateEmbedding: Number of values to represent the state. 40 | - Ncos: Number of elements in cosine function. 41 | 42 | OUTPUTS: / 43 | """ 44 | 45 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 46 | super(FQF_Model, self).__init__() 47 | 48 | # Initialization of useful variables 49 | self.device = device 50 | self.NCos = NCos 51 | self.piMultiples = torch.tensor([np.pi*i for i in range(self.NCos)], dtype=torch.float).view(1, 1, self.NCos).to(self.device) 52 | 53 | # Initialization of the Deep Neural Network. 
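        # (Sketch of the layout below, which follows the IQN-style quantile
        #  embedding: the state is embedded by a feedforward DNN, each quantile
        #  fraction tau proposed by the FPN is embedded through cos(i*pi*tau)
        #  for i = 0, ..., NCos-1 followed by a linear layer and a ReLU, both
        #  embeddings are combined by elementwise multiplication, and the final
        #  head outputs one quantile value per action.)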
54 | self.stateEmbedding = FeedForwardDNN(numberOfInputs, stateEmbedding, structure) 55 | self.cosEmbedding = nn.Sequential(nn.Linear(NCos, stateEmbedding), nn.ReLU()) 56 | self.feedForwardDNN = FeedForwardDNN(stateEmbedding, numberOfOutputs, [256]) 57 | 58 | 59 | def embedding(self, x): 60 | """ 61 | GOAL: Implementing the embedding part of the Deep Neural Network. 62 | 63 | INPUTS: - x: Input of the Deep Neural Network. 64 | 65 | OUTPUTS: - y: Embedded input of the Deep Neural Network. 66 | """ 67 | 68 | return self.stateEmbedding(x) 69 | 70 | 71 | def forward(self, x, taus, embedding=None): 72 | """ 73 | GOAL: Implementing the forward pass of the Deep Neural Network. 74 | 75 | INPUTS: - x: Input of the Deep Neural Network. 76 | - taus: Quantiles (generated by the FPN). 77 | - embedding: Embedding of the Deep Neural Network input (state). 78 | 79 | OUTPUTS: - y: Output of the Deep Neural Network. 80 | """ 81 | 82 | # State embedding part of the Deep Neural Network 83 | batchSize = x.size(0) 84 | if embedding == None: 85 | x = self.stateEmbedding(x).unsqueeze(1) 86 | else: 87 | x = embedding.unsqueeze(1) 88 | 89 | # Quantile embedding part of the Deep Neural Network 90 | N = taus.size(1) 91 | cos = torch.cos(taus.unsqueeze(2)*self.piMultiples).view(batchSize*N, self.NCos) 92 | cos = self.cosEmbedding(cos).view(batchSize, N, -1) 93 | 94 | # Multiplication of both state and cos embeddings outputs (combination) 95 | x = (x * cos).view(batchSize, N, -1) 96 | 97 | # Distribution part of the Deep Neural Network 98 | x = self.feedForwardDNN(x) 99 | return x.transpose(1, 2) 100 | -------------------------------------------------------------------------------- /Models/FQF_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | # pylint: disable=E1101 11 | # pylint: disable=E1102 12 | 13 | from Models.FeedforwardDNN import FeedForwardDNN 14 | from Models.CNN_Atari import CNN_Atari 15 | from Models.CNN_MinAtar import CNN_MinAtar 16 | 17 | 18 | 19 | ############################################################################### 20 | ########################### Class FQF_Model_Atari ############################# 21 | ############################################################################### 22 | 23 | class FQF_Model_Atari(nn.Module): 24 | """ 25 | GOAL: Implementing the DL model for the FQF distributional RL algorithm 26 | (Implicit Quantile Network). 27 | 28 | VARIABLES: - network: Deep Neural Network. 29 | 30 | METHODS: - __init__: Initialization of the Deep Neural Network. 31 | - forward: Forward pass of the Deep Neural Network. 32 | """ 33 | 34 | def __init__(self, numberOfInputs, numberOfOutputs, NCos=64, device='cpu', 35 | minAtar=False): 36 | """ 37 | GOAL: Defining and initializing the Deep Neural Network. 38 | 39 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 40 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 41 | - Ncos: Number of elements in cosine function. 42 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 
43 | 44 | OUTPUTS: / 45 | """ 46 | 47 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 48 | super(FQF_Model_Atari, self).__init__() 49 | 50 | # Initialization of useful variables 51 | self.device = device 52 | self.NCos = NCos 53 | self.piMultiples = torch.tensor([np.pi*i for i in range(self.NCos)], dtype=torch.float).view(1, 1, self.NCos).to(self.device) 54 | 55 | # Initialization of the Deep Neural Network. 56 | if minAtar: 57 | self.stateEmbedding = CNN_MinAtar(numberOfInputs) 58 | self.stateEmbeddingSize = CNN_MinAtar(numberOfInputs).getOutputSize() 59 | self.cosEmbedding = nn.Sequential(nn.Linear(NCos, self.stateEmbeddingSize), nn.ReLU()) 60 | self.feedForwardDNN = FeedForwardDNN(self.stateEmbeddingSize, numberOfOutputs, [128]) 61 | else: 62 | self.stateEmbedding = CNN_Atari(numberOfInputs) 63 | self.stateEmbeddingSize = CNN_Atari(numberOfInputs).getOutputSize() 64 | self.cosEmbedding = nn.Sequential(nn.Linear(NCos, self.stateEmbeddingSize), nn.ReLU()) 65 | self.feedForwardDNN = FeedForwardDNN(self.stateEmbeddingSize, numberOfOutputs, [512]) 66 | 67 | 68 | def embedding(self, x): 69 | """ 70 | GOAL: Implementing the embedding part of the Deep Neural Network. 71 | 72 | INPUTS: - x: Input of the Deep Neural Network. 73 | 74 | OUTPUTS: - y: Embedded input of the Deep Neural Network. 75 | """ 76 | 77 | return self.stateEmbedding(x) 78 | 79 | 80 | def getEmbeddingSize(self): 81 | """ 82 | GOAL: Return the size of the state embedding. 83 | 84 | INPUTS: / 85 | 86 | OUTPUTS: - stateEmbeddingSize: Size of the state embedding. 87 | """ 88 | 89 | return self.stateEmbeddingSize 90 | 91 | 92 | def forward(self, x, taus, embedding=None): 93 | """ 94 | GOAL: Implementing the forward pass of the Deep Neural Network. 95 | 96 | INPUTS: - x: Input of the Deep Neural Network. 97 | - taus: Quantiles (generated by the FPN). 98 | - embedding: Embedding of the Deep Neural Network input (state) 99 | 100 | OUTPUTS: - y: Output of the Deep Neural Network. 
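        EXAMPLE: Shape sketch with hypothetical dimensions (4 stacked Atari
                 frames, 6 actions, N = 32 quantile fractions as would be
                 proposed by the FPN):

                     import torch
                     model = FQF_Model_Atari(4, 6)
                     x = torch.zeros(8, 4, 84, 84)
                     taus = torch.rand(8, 32)
                     y = model(x, taus)
                     # y.shape == (8, 6, 32): one quantile value per action
                     # and per proposed quantile fraction.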
101 | """ 102 | 103 | # State embedding part of the Deep Neural Network 104 | batchSize = x.size(0) 105 | if embedding == None: 106 | x = self.stateEmbedding(x).unsqueeze(1) 107 | else: 108 | x = embedding.unsqueeze(1) 109 | 110 | # Quantile embedding part of the Deep Neural Network 111 | N = taus.size(1) 112 | cos = torch.cos(taus.unsqueeze(2)*self.piMultiples).view(batchSize*N, self.NCos) 113 | cos = self.cosEmbedding(cos).view(batchSize, N, -1) 114 | 115 | # Multiplication of both state and cos embeddings outputs (combination) 116 | x = (x * cos).view(batchSize, N, -1) 117 | 118 | # Distribution part of the Deep Neural Network 119 | x = self.feedForwardDNN(x) 120 | return x.transpose(1, 2) 121 | -------------------------------------------------------------------------------- /Models/FQF_Model_Bis.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | 13 | 14 | ############################################################################### 15 | ############################# Class FQF_Model_Bis ############################# 16 | ############################################################################### 17 | 18 | class FQF_Model_Bis(nn.Module): 19 | """ 20 | GOAL: Implementing the DL model for the FQF distributional RL algorithm 21 | (Fraction Proposal Network). 22 | 23 | VARIABLES: - network: Deep Neural Network. 24 | 25 | METHODS: - __init__: Initialization of the Deep Neural Network. 26 | - forward: Forward pass of the Deep Neural Network. 27 | """ 28 | 29 | def __init__(self, numberOfInputs, numberOfOutputs, device='cpu'): 30 | """ 31 | GOAL: Defining and initializing the Deep Neural Network. 32 | 33 | INPUTS: - numberOfInputs: Input shape (state embedding). 34 | - numberOfOutputs: Output shape (number of quantile fractions). 35 | - device: Running device (hardware acceleration). 36 | 37 | OUTPUTS: / 38 | """ 39 | 40 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 41 | super(FQF_Model_Bis, self).__init__() 42 | 43 | # Initialization of useful variables 44 | self.device = device 45 | self.N = numberOfOutputs 46 | 47 | # Initialization of the Deep Neural Network. 48 | self.network = nn.Sequential( 49 | nn.Linear(numberOfInputs, numberOfOutputs), 50 | nn.LogSoftmax(dim=1) 51 | ) 52 | 53 | 54 | def forward(self, x): 55 | """ 56 | GOAL: Implementing the forward pass of the Deep Neural Network. 57 | 58 | INPUTS: - x: Input of the Deep Neural Network. (state embedding). 59 | 60 | OUTPUTS: - taus: Quantile fractions generated. 61 | - tausBis: Quantile fractions generated. 62 | - entropy: Entropy associated with the DNN output. 63 | """ 64 | 65 | # Generation of quantile fractions 66 | out = self.network(x) 67 | taus = torch.cumsum(out.exp(), dim=1) 68 | taus = torch.cat((torch.zeros((out.shape[0], 1)).to(self.device), taus), dim=1) 69 | tausBis = (taus[:, :-1] + taus[:, 1:]).detach() / 2. 
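        # (out contains log-probabilities produced by the LogSoftmax layer, so
        #  out.exp() sums to one per row and the cumulative sum yields increasing
        #  fractions 0 = tau_0 <= tau_1 <= ... <= tau_N = 1. tausBis holds the
        #  midpoints (tau_i + tau_{i+1}) / 2, detached so that evaluating the
        #  quantile network at these points does not backpropagate into the FPN.)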
70 | 71 | # Computation of the associated entropy 72 | entropy = -(out * out.exp()).sum(dim=-1, keepdim=True) 73 | 74 | return taus, tausBis, entropy 75 | -------------------------------------------------------------------------------- /Models/FeedforwardDNN.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | # pylint: disable=E1101 9 | # pylint: disable=E1102 10 | 11 | 12 | 13 | ############################################################################### 14 | ############################ Class FeedForwardDNN ############################# 15 | ############################################################################### 16 | 17 | class FeedForwardDNN(nn.Module): 18 | """ 19 | GOAL: Implementing a classical feedforward DNN using Pytorch. 20 | 21 | VARIABLES: - network: Feedforward DNN. 22 | 23 | METHODS: - __init__: Initialization of the feedforward DNN. 24 | - forward: Forward pass of the feedforward DNN. 25 | """ 26 | 27 | def __init__(self, numberOfInputs, numberOfOutputs, structure): 28 | """ 29 | GOAL: Defining and initializing the feedforward DNN. 30 | 31 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 32 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 33 | - structure: Structure of the feedforward DNN (hidden layers). 34 | 35 | OUTPUTS: / 36 | """ 37 | 38 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 39 | super(FeedForwardDNN, self).__init__() 40 | 41 | # Initialization of the FeedForward DNN 42 | self.network = [] 43 | structure = [numberOfInputs] + structure + [numberOfOutputs] 44 | for inFeature, outFeature in zip(structure, structure[1:]): 45 | self.network.extend([ 46 | nn.Linear(inFeature, outFeature), 47 | nn.ReLU(), 48 | ]) 49 | self.network.pop() 50 | self.network = nn.Sequential(*self.network) 51 | 52 | 53 | def forward(self, x): 54 | """ 55 | GOAL: Implementing the forward pass of the feedforward DNN. 56 | 57 | INPUTS: - x: Input of the feedforward DNN. 58 | 59 | OUTPUTS: - y: Output of the feedforward DNN. 60 | """ 61 | 62 | return self.network(x) 63 | -------------------------------------------------------------------------------- /Models/IQN_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | # pylint: disable=E1101 11 | # pylint: disable=E1102 12 | 13 | from Models.FeedforwardDNN import FeedForwardDNN 14 | 15 | 16 | 17 | ############################################################################### 18 | ############################## Class IQN_Model ################################ 19 | ############################################################################### 20 | 21 | class IQN_Model(nn.Module): 22 | """ 23 | GOAL: Implementing the DL model for the IQN distributional RL algorithm. 24 | 25 | VARIABLES: - network: Deep Neural Network. 
26 | 27 | METHODS: - __init__: Initialization of the Deep Neural Network. 28 | - forward: Forward pass of the Deep Neural Network. 29 | """ 30 | 31 | def __init__(self, numberOfInputs, numberOfOutputs, structure, stateEmbedding, NCos=64, device='cpu'): 32 | """ 33 | GOAL: Defining and initializing the Deep Neural Network. 34 | 35 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 36 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 37 | - structure: Structure of the state embedding Deep Neural Network (hidden layers). 38 | - stateEmbedding: Number of values to represent the state. 39 | - Ncos: Number of elements in cosine function. 40 | 41 | OUTPUTS: / 42 | """ 43 | 44 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 45 | super(IQN_Model, self).__init__() 46 | 47 | # Initialization of useful variables 48 | self.device = device 49 | self.NCos = NCos 50 | self.piMultiples = torch.tensor([np.pi*i for i in range(self.NCos)], dtype=torch.float).view(1, 1, self.NCos).to(self.device) 51 | 52 | # Initialization of the Deep Neural Network 53 | self.stateEmbedding = FeedForwardDNN(numberOfInputs, stateEmbedding, structure) 54 | self.cosEmbedding = nn.Sequential(nn.Linear(NCos, stateEmbedding), nn.ReLU()) 55 | self.feedForwardDNN = FeedForwardDNN(stateEmbedding, numberOfOutputs, [256]) 56 | 57 | 58 | def forward(self, x, N, randomSampling=True): 59 | """ 60 | GOAL: Implementing the forward pass of the Deep Neural Network. 61 | 62 | INPUTS: - x: Input of the Deep Neural Network. 63 | - N: Number of quantiles to generate. 64 | - randomSampling: Boolean specifying whether the quantiles are 65 | sampled randomly or not (default: True). 66 | 67 | OUTPUTS: - y: Output of the Deep Neural Network. 68 | """ 69 | 70 | # State embedding part of the Deep Neural Network 71 | batchSize = x.size(0) 72 | x = self.stateEmbedding(x).unsqueeze(1) 73 | 74 | # Generate a number of quantiles (randomly or not) 75 | if randomSampling: 76 | taus = torch.rand(batchSize, N).to(self.device).unsqueeze(2) 77 | else: 78 | taus = torch.linspace(0.0, 1.0, N).to(self.device) 79 | taus = taus.repeat(batchSize, 1).unsqueeze(2) 80 | 81 | # Quantile embedding part of the Deep Neural Network 82 | cos = torch.cos(taus*self.piMultiples).view(batchSize*N, self.NCos) 83 | cos = self.cosEmbedding(cos).view(batchSize, N, -1) 84 | 85 | # Multiplication of both state and cos embeddings outputs (combination) 86 | x = (x * cos).view(batchSize, N, -1) 87 | 88 | # Distribution part of the Deep Neural Network 89 | x = self.feedForwardDNN(x) 90 | return x.transpose(1, 2), taus 91 | -------------------------------------------------------------------------------- /Models/IQN_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | # pylint: disable=E1101 11 | # pylint: disable=E1102 12 | 13 | from Models.FeedforwardDNN import FeedForwardDNN 14 | from Models.CNN_Atari import CNN_Atari 15 | from Models.CNN_MinAtar import CNN_MinAtar 16 | 17 | 18 | 19 | ############################################################################### 20 | ######################### Class IQN_Model_Atari ############################### 
21 | ############################################################################### 22 | 23 | class IQN_Model_Atari(nn.Module): 24 | """ 25 | GOAL: Implementing the DL model for the IQN distributional RL algorithm. 26 | 27 | VARIABLES: - network: Deep Neural Network. 28 | 29 | METHODS: - __init__: Initialization of the Deep Neural Network. 30 | - forward: Forward pass of the Deep Neural Network. 31 | """ 32 | 33 | def __init__(self, numberOfInputs, numberOfOutputs, NCos=64, device='cpu', 34 | minAtar=False): 35 | """ 36 | GOAL: Defining and initializing the Deep Neural Network. 37 | 38 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 39 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 40 | - Ncos: Number of elements in cosine function. 41 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 42 | 43 | OUTPUTS: / 44 | """ 45 | 46 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 47 | super(IQN_Model_Atari, self).__init__() 48 | 49 | # Initialization of useful variables 50 | self.device = device 51 | self.NCos = NCos 52 | self.piMultiples = torch.tensor([np.pi*i for i in range(self.NCos)], dtype=torch.float).view(1, 1, self.NCos).to(self.device) 53 | 54 | # Initialization of the Deep Neural Network 55 | if minAtar: 56 | self.stateEmbedding = CNN_MinAtar(numberOfInputs) 57 | stateEmbedding = CNN_MinAtar(numberOfInputs).getOutputSize() 58 | self.cosEmbedding = nn.Sequential(nn.Linear(NCos, stateEmbedding), nn.ReLU()) 59 | self.feedForwardDNN = FeedForwardDNN(stateEmbedding, numberOfOutputs, [128]) 60 | else: 61 | self.stateEmbedding = CNN_Atari(numberOfInputs) 62 | stateEmbedding = CNN_Atari(numberOfInputs).getOutputSize() 63 | self.cosEmbedding = nn.Sequential(nn.Linear(NCos, stateEmbedding), nn.ReLU()) 64 | self.feedForwardDNN = FeedForwardDNN(stateEmbedding, numberOfOutputs, [512]) 65 | 66 | 67 | def forward(self, x, N, randomSampling=True): 68 | """ 69 | GOAL: Implementing the forward pass of the Deep Neural Network. 70 | 71 | INPUTS: - x: Input of the Deep Neural Network. 72 | - N: Number of quantiles to generate. 73 | - randomSampling: Boolean specifying whether the quantiles are 74 | sampled randomly or not (default: True). 75 | 76 | OUTPUTS: - y: Output of the Deep Neural Network. 77 | """ 78 | 79 | # State embedding part of the Deep Neural Network 80 | batchSize = x.size(0) 81 | x = self.stateEmbedding(x).unsqueeze(1) 82 | 83 | # Generate a number of quantiles (randomly or not) 84 | if randomSampling: 85 | taus = torch.rand(batchSize, N).to(self.device).unsqueeze(2) 86 | else: 87 | taus = torch.linspace(0.0, 1.0, N).to(self.device) 88 | taus = taus.repeat(batchSize, 1).unsqueeze(2) 89 | 90 | # Quantile embedding part of the Deep Neural Network 91 | cos = torch.cos(taus*self.piMultiples).view(batchSize*N, self.NCos) 92 | cos = self.cosEmbedding(cos).view(batchSize, N, -1) 93 | 94 | # Multiplication of both state and cos embeddings outputs (combination) 95 | x = (x * cos).view(batchSize, N, -1) 96 | 97 | # Distribution part of the Deep Neural Network 98 | x = self.feedForwardDNN(x) 99 | return x.transpose(1, 2), taus 100 | -------------------------------------------------------------------------------- /Models/MonotonicNN.py: -------------------------------------------------------------------------------- 1 | # Credit goes to Antoine Wehenkel for this entire python file. 
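# Summary of the construction used throughout this file (a sketch of the UMNN
# idea of Wehenkel and Louppe): a scalar function that is monotonic in x and
# conditioned on h is parameterised as
#     F(x, h) = s(h) * integral from 0 to x of f(t, h) dt + b(h),
# where the integrand f is a free-form network whose output is made strictly
# positive by the ELU(.) + 1 activation and s(h) = exp(.) > 0, so that F is
# increasing in x by construction. The integral is evaluated numerically by the
# NeuralIntegral / ParallelNeuralIntegral routines imported below.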
2 | 3 | 4 | 5 | import torch 6 | import torch.nn as nn 7 | from UMNN import NeuralIntegral 8 | from UMNN import ParallelNeuralIntegral 9 | import math 10 | 11 | def _flatten(sequence): 12 | flat = [p.contiguous().view(-1) for p in sequence] 13 | return torch.cat(flat) if len(flat) > 0 else torch.tensor([]) 14 | 15 | 16 | class IntegrandNN(nn.Module): 17 | def __init__(self, in_d, hidden_layers, n_out=1): 18 | super(IntegrandNN, self).__init__() 19 | self.net = [] 20 | hs = [in_d] + hidden_layers + [n_out] 21 | for h0, h1 in zip(hs, hs[1:]): 22 | self.net.extend([ 23 | nn.Linear(h0, h1), 24 | nn.ReLU(), 25 | ]) 26 | self.net.pop() 27 | self.net.append(nn.ELU()) 28 | self.net = nn.Sequential(*self.net) 29 | 30 | def forward(self, x, h): 31 | return self.net(torch.cat((x, h), 1)) + 1. 32 | 33 | 34 | class OneDimensionnalNF(nn.Module): 35 | def __init__(self, in_d, hidden_layers, nb_steps=200, n_out=1, dev="cpu"): 36 | super(OneDimensionnalNF, self).__init__() 37 | self.device = dev 38 | self.nb_steps = nb_steps 39 | self.n_out = n_out 40 | self.net = MonotonicNN(in_d, hidden_layers, nb_steps=nb_steps, n_out=n_out, dev=dev) 41 | self.register_buffer("pi", torch.tensor(math.pi)) 42 | 43 | ''' 44 | The forward procedure takes as input x which is the variable for which the integration must be made, h are just other conditionning variables. 45 | It returns the $log(p(x|h; \theta))$. 46 | ''' 47 | def forward(self, x, h): 48 | x0 = torch.zeros(x.shape).to(self.device) 49 | out = self.net.net(h) 50 | offset = out[:, :self.n_out] 51 | scaling = torch.exp(out[:, self.n_out:]) 52 | jac = scaling * self.net.integrand(x, h) 53 | z = scaling*ParallelNeuralIntegral.apply(x0, x, self.net.integrand, _flatten(self.net.integrand.parameters()), h, self.nb_steps) + offset 54 | z.clamp_(-10., 10.) 55 | log_prob_gauss = -.5 * (torch.log(self.pi * 2) + z ** 2) 56 | ll = log_prob_gauss + torch.log(jac + 1e-10) 57 | return ll 58 | 59 | def expectation(self, h, x_func, min=-10, max=10, npts=1000): 60 | # Using first order Euler method . 61 | b_size = h.shape[0] 62 | n_out = self.n_out 63 | dx = (max-min)/(npts - 1) 64 | emb_size = h.shape[1] 65 | 66 | x = torch.arange(min, max+(max-min)/(npts - 1), dx).to(h.device) 67 | npts = x.shape[0] 68 | zero_idx = torch.argmin(x**2).item() 69 | 70 | out = self.net.net(h) 71 | offset = out[:, :self.n_out].unsqueeze(1).expand(b_size, npts, n_out) 72 | scaling = torch.exp(out[:, self.n_out:]).unsqueeze(1).expand(b_size, npts, n_out) 73 | 74 | h_values = h.unsqueeze(1).expand(b_size, npts, emb_size).reshape(-1, emb_size) 75 | x_values = x.unsqueeze(0).expand(b_size, npts).reshape(-1, 1) 76 | 77 | f_values = self.net.integrand(x_values, h_values) 78 | f_values = f_values.reshape(b_size, npts, n_out) * scaling 79 | 80 | z = (dx * f_values.cumsum(1)) 81 | z = (z - z[:, [zero_idx], :].expand(-1, npts, -1)) + offset 82 | log_prob_gauss = -.5 * (torch.log(self.pi * 2) + z ** 2) 83 | ll = log_prob_gauss + torch.log(f_values + 1e-10) 84 | 85 | 86 | expectations = (x_func(x).unsqueeze(0).unsqueeze(2).expand(b_size, npts, n_out) * torch.exp(ll)).sum(1) * dx 87 | 88 | return expectations 89 | 90 | class MonotonicNN(nn.Module): 91 | ''' 92 | in_d : The total number of inputs 93 | hidden_layers : a list a the number of neurons, to be used by a network that compresses the non-monotonic variables and by the integrand net. 
94 | nb_steps : Number of integration steps 95 | n_out : the number of output (each output will be monotonic w.r.t one variable) 96 | ''' 97 | def __init__(self, in_d, hidden_layers, nb_steps=200, n_out=1, dev="cpu"): 98 | super(MonotonicNN, self).__init__() 99 | self.integrand = IntegrandNN(in_d, hidden_layers, n_out) 100 | self.net = [] 101 | hs = [in_d-1] + hidden_layers + [2 * n_out] 102 | for h0, h1 in zip(hs, hs[1:]): 103 | self.net.extend([ 104 | nn.Linear(h0, h1), 105 | nn.ReLU(), 106 | ]) 107 | self.net.pop() 108 | self.net = nn.Sequential(*self.net) 109 | self.device = dev 110 | self.nb_steps = nb_steps 111 | self.n_out = n_out 112 | 113 | ''' 114 | The forward procedure takes as input x which is the variable for which the integration must be made, h are just other conditionning variables. 115 | ''' 116 | def forward(self, x, h, only_derivative=False): 117 | x0 = torch.zeros(x.shape).to(self.device) 118 | out = self.net(h) 119 | offset = out[:, :self.n_out] 120 | scaling = torch.exp(out[:, self.n_out:]) 121 | if only_derivative: 122 | return scaling * self.integrand(x, h) 123 | return scaling*ParallelNeuralIntegral.apply(x0, x, self.integrand, _flatten(self.integrand.parameters()), h, self.nb_steps) + offset 124 | 125 | 126 | 127 | 128 | ''' 129 | The inverse procedure takes as input y which is the variable for which the inverse must be computed, h are just other conditionning variables. 130 | One output per n_out. 131 | y should be a scalar. 132 | ''' 133 | def inverse(self, y, h, min=-10, max=10, nb_iter=10): 134 | idx = (torch.arange(0, self.n_out**2, self.n_out + 1).view(1, -1) + torch.arange(0, (self.n_out**2)*y.shape[0], self.n_out**2).view(-1, 1)).view(-1) 135 | h = h.unsqueeze(1).expand(-1, self.n_out, -1).contiguous().view(y.shape[0]*self.n_out, -1) 136 | 137 | # Old inversion by binary search 138 | x_max = torch.ones(y.shape[0], self.n_out).to(y.device) * max 139 | x_min = torch.ones(y.shape[0], self.n_out).to(y.device) * min 140 | y_max = self.forward(x_max.view(-1, 1), h).view(-1)[idx].view(-1, self.n_out) 141 | y_min = self.forward(x_min.view(-1, 1), h).view(-1)[idx].view(-1, self.n_out) 142 | 143 | for i in range(nb_iter): 144 | x_middle = (x_max + x_min) / 2 145 | y_middle = self.forward(x_middle.view(-1, 1), h).view(-1)[idx].view(-1, self.n_out) 146 | left = (y_middle > y).float() 147 | right = 1 - left 148 | x_max = left * x_middle + right * x_max 149 | x_min = right * x_middle + left * x_min 150 | y_max = left * y_middle + right * y_max 151 | y_min = right * y_middle + left * y_min 152 | return (x_max + x_min) / 2 153 | 154 | def expectation(self, h, x_func, out_deriv, min=-10, max=10, npts=1000): 155 | # Using first order Euler method . 
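        # (Sketch of the scheme: the positive integrand f(x, h) is evaluated on
        #  a uniform grid of npts points over [min, max]; dx * cumsum(f)
        #  approximates the primitive F, re-anchored so that its value at x ~ 0
        #  equals the learned offset; out_deriv supplies the derivative of the
        #  output non-linearity applied to F (e.g. sigmoid'(F) for the CDF
        #  parameterisation), so the returned Riemann sum
        #  sum_x x_func(x) * f(x) * out_deriv(F(x)) * dx estimates E[x_func(X)]
        #  under the implied probability density.)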
156 | b_size = h.shape[0] 157 | n_out = self.n_out 158 | dx = (max-min)/(npts - 1) 159 | emb_size = h.shape[1] 160 | 161 | x = torch.arange(min, max+(max-min)/(npts - 1), dx).to(h.device) 162 | npts = x.shape[0] 163 | zero_idx = torch.argmin(x**2).item() 164 | 165 | out = self.net(h) 166 | offset = out[:, :self.n_out].unsqueeze(1).expand(b_size, npts, n_out) 167 | scaling = torch.exp(out[:, self.n_out:]).unsqueeze(1).expand(b_size, npts, n_out) 168 | 169 | h_values = h.unsqueeze(1).expand(b_size, npts, emb_size).reshape(-1, emb_size) 170 | x_values = x.unsqueeze(0).expand(b_size, npts).reshape(-1, 1) 171 | 172 | f_values = self.integrand(x_values, h_values) 173 | f_values = f_values.reshape(b_size, npts, n_out) * scaling 174 | 175 | F_values = (dx * f_values.cumsum(1)) 176 | F_values = (F_values - F_values[:, [zero_idx], :].expand(-1, npts, -1)) + offset 177 | corrected_F_values = out_deriv(F_values) 178 | 179 | expectations = (x_func(x).unsqueeze(0).unsqueeze(2).expand(b_size, npts, n_out) * f_values * corrected_F_values).sum(1) * dx 180 | 181 | return expectations 182 | -------------------------------------------------------------------------------- /Models/QR_DQN_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | # pylint: disable=E1101 9 | # pylint: disable=E1102 10 | 11 | from Models.FeedforwardDNN import FeedForwardDNN 12 | 13 | 14 | 15 | ############################################################################### 16 | ############################# Class QR_DQN_Model ############################## 17 | ############################################################################### 18 | 19 | class QR_DQN_Model(nn.Module): 20 | """ 21 | GOAL: Implementing the DL model for the QR-DQN distributional RL algorithm. 22 | 23 | VARIABLES: - network: Deep Neural Network. 24 | 25 | METHODS: - __init__: Initialization of the Deep Neural Network. 26 | - forward: Forward pass of the Deep Neural Network. 27 | """ 28 | 29 | def __init__(self, numberOfInputs, numberOfOutputs, structure, numberOfQuantiles=200): 30 | """ 31 | GOAL: Defining and initializing the Deep Neural Network. 32 | 33 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 34 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 35 | - structure: Structure of the Deep Neural Network (hidden layers). 36 | - numberOfQuantiles: Number of quantiles for approximating the distribution. 37 | 38 | OUTPUTS: / 39 | """ 40 | 41 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 42 | super(QR_DQN_Model, self).__init__() 43 | 44 | # Initialization of useful variables 45 | self.numberOfQuantiles = numberOfQuantiles 46 | self.numberOfActions = int(numberOfOutputs/numberOfQuantiles) 47 | 48 | # Initialization of the Deep Neural Network. 49 | self.network = FeedForwardDNN(numberOfInputs, numberOfOutputs, structure) 50 | 51 | 52 | def forward(self, x): 53 | """ 54 | GOAL: Implementing the forward pass of the Deep Neural Network. 55 | 56 | INPUTS: - x: Input of the Deep Neural Network. 57 | 58 | OUTPUTS: - y: Output of the Deep Neural Network. 
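        EXAMPLE: Minimal usage sketch with hypothetical dimensions (state of
                 dimension 4, 2 actions, 200 quantiles); numberOfOutputs must
                 equal numberOfActions * numberOfQuantiles:

                     import torch
                     model = QR_DQN_Model(4, 2*200, [128])
                     y = model(torch.rand(32, 4))
                     # y.shape == (32, 2, 200); the expected Q-values follow
                     # as y.mean(dim=2).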
59 | """ 60 | 61 | x = self.network(x) 62 | return x.view(x.size(0), self.numberOfActions, self.numberOfQuantiles) 63 | -------------------------------------------------------------------------------- /Models/QR_DQN_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch.nn as nn 8 | # pylint: disable=E1101 9 | # pylint: disable=E1102 10 | 11 | from Models.DNN_Atari import DNN_Atari 12 | from Models.DNN_MinAtar import DNN_MinAtar 13 | 14 | 15 | 16 | ############################################################################### 17 | ######################## Class QR_DQN_Model_Atari ############################# 18 | ############################################################################### 19 | 20 | class QR_DQN_Model_Atari(nn.Module): 21 | """ 22 | GOAL: Implementing the DL model for the QR-DQN distributional RL algorithm. 23 | 24 | VARIABLES: - network: Deep Neural Network. 25 | 26 | METHODS: - __init__: Initialization of the Deep Neural Network. 27 | - forward: Forward pass of the Deep Neural Network. 28 | """ 29 | 30 | def __init__(self, numberOfInputs, numberOfOutputs, numberOfQuantiles=200, minAtar=False): 31 | """ 32 | GOAL: Defining and initializing the Deep Neural Network. 33 | 34 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 35 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 36 | - numberOfQuantiles: Number of quantiles for approximating the distribution. 37 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 38 | 39 | OUTPUTS: / 40 | """ 41 | 42 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 43 | super(QR_DQN_Model_Atari, self).__init__() 44 | 45 | # Initialization of useful variables 46 | self.numberOfQuantiles = numberOfQuantiles 47 | self.numberOfActions = int(numberOfOutputs/numberOfQuantiles) 48 | 49 | # Initialization of the Deep Neural Network. 50 | if minAtar: 51 | self.network = DNN_MinAtar(numberOfInputs, numberOfOutputs) 52 | else: 53 | self.network = DNN_Atari(numberOfInputs, numberOfOutputs) 54 | 55 | 56 | def forward(self, x): 57 | """ 58 | GOAL: Implementing the forward pass of the Deep Neural Network. 59 | 60 | INPUTS: - x: Input of the Deep Neural Network. 61 | 62 | OUTPUTS: - y: Output of the Deep Neural Network. 
63 | """ 64 | 65 | x = self.network(x) 66 | return x.view(x.size(0), self.numberOfActions, self.numberOfQuantiles) 67 | -------------------------------------------------------------------------------- /Models/UMDQN_C_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.FeedforwardDNN import FeedForwardDNN 13 | from Models.MonotonicNN import MonotonicNN 14 | 15 | 16 | 17 | ############################################################################### 18 | ############################ Class UMDQN_C_Model ############################## 19 | ############################################################################### 20 | 21 | class UMDQN_C_Model(nn.Module): 22 | """ 23 | GOAL: Implementing the DL model for the UMDQN-C distributional RL algorithm. 24 | 25 | VARIABLES: - stateEmbeddingDNN: State embedding part of the Deep Neural Network. 26 | - UMNN: UMNN part of the Deep Neural Network. 27 | 28 | METHODS: - __init__: Initialization of the Deep Neural Network. 29 | - forward: Forward pass of the Deep Neural Network. 30 | - getDerivative: Get the derivative internally computed by the UMNN. 31 | - getExpectation: Get the expectation of the PDF internally computed by the UMNN. 32 | """ 33 | 34 | def __init__(self, numberOfInputs, numberOfOutputs, 35 | structureDNN, structureUMNN, stateEmbedding, 36 | numberOfSteps, device='cpu'): 37 | """ 38 | GOAL: Defining and initializing the Deep Neural Network. 39 | 40 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 41 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 42 | - structureDNN: Structure of the feedforward DNN for state embedding. 43 | - structureUMNN: Structure of the UMNN for distribution representation. 44 | - stateEmbedding: Dimension of the state embedding. 45 | - numberOfSteps: Number of integration steps for the UMNN. 46 | - device: Hardware device (CPU or GPU). 47 | 48 | OUTPUTS: / 49 | """ 50 | 51 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 52 | super(UMDQN_C_Model, self).__init__() 53 | 54 | # Initialization of the Deep Neural Network 55 | self.stateEmbeddingDNN = FeedForwardDNN(numberOfInputs, stateEmbedding, structureDNN) 56 | self.UMNN = MonotonicNN(stateEmbedding+1, structureUMNN, numberOfSteps, numberOfOutputs, device) 57 | 58 | 59 | def forward(self, state, q): 60 | """ 61 | GOAL: Implementing the forward pass of the Deep Neural Network. 62 | 63 | INPUTS: - state: RL state. 64 | - q: Samples of potential returns. 65 | 66 | OUTPUTS: - output: Output of the Deep Neural Network. 
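        EXAMPLE: Shape sketch with hypothetical dimensions (batch of 32 states
                 of dimension 4, 2 actions, 51 return samples per state); q is
                 expected to stack the samples state by state:

                     import torch
                     model = UMDQN_C_Model(4, 2, [128], [64], 32, 50)
                     state = torch.rand(32, 4)
                     q = torch.rand(32 * 51, 1)
                     cdf = model(state, q)
                     # cdf.shape == (32 * 2, 51): row b*2 + a contains the CDF of
                     # the random return Z(s_b, a) evaluated at the 51 samples
                     # associated with state b, with values in [0, 1].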
67 | """ 68 | 69 | # State embedding part of the Deep Neural Network 70 | batchSize = state.size(0) 71 | x = self.stateEmbeddingDNN(state) 72 | x = x.repeat(1, int(len(q)/len(state))).view(-1, x.size(1)) 73 | 74 | # UMNNN part of the Deep Neural Network 75 | x = self.UMNN(q, x) 76 | 77 | # Sigmoid activation function + appropriate format 78 | x = torch.sigmoid(x) 79 | return torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 80 | 81 | 82 | def getDerivative(self, state, q): 83 | """ 84 | GOAL: Get the derivative internally computed by the UMNN. 85 | 86 | INPUTS: - state: RL state. 87 | - q: Samples of potential returns. 88 | 89 | OUTPUTS: - output: Derivative internally computed by the UMNN. 90 | """ 91 | 92 | # State embedding part of the Deep Neural Network 93 | batchSize = state.size(0) 94 | x = self.stateEmbeddingDNN(state) 95 | x = x.repeat(1, int(len(q)/len(state))).view(-1, x.size(1)) 96 | 97 | # Computation of both PDF and CDF 98 | pdf = self.UMNN(q, x, only_derivative=True) 99 | cdf = self.UMNN(q, x, only_derivative=False) 100 | 101 | # Correction of the sigmoid + appropriate format 102 | x = torch.sigmoid(cdf) 103 | x = x * (1 - x) * pdf 104 | return torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 105 | 106 | 107 | def getExpectation(self, state, minReturn, maxReturn, numberOfPoints): 108 | """ 109 | GOAL: Get the expectation of the PDF internally computed by the UMNN. 110 | 111 | INPUTS: - state: RL state. 112 | - minReturn: Minimum return. 113 | - maxReturn: Maximum return. 114 | - numberOfPoints: Number of points for the computations (accuracy). 115 | 116 | OUTPUTS: - expectation: Expectation computed. 117 | """ 118 | 119 | # State embedding part of the Deep Neural Network 120 | state = self.stateEmbeddingDNN(state) 121 | 122 | # Computation of the expectation of the PDF internally computed by the UMNN 123 | expectation = self.UMNN.expectation(state, lambda x: x, lambda x: torch.sigmoid(x)*(1-torch.sigmoid(x)), minReturn, maxReturn, numberOfPoints) 124 | return expectation 125 | 126 | -------------------------------------------------------------------------------- /Models/UMDQN_C_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.DNN_Atari import DNN_Atari 13 | from Models.DNN_MinAtar import DNN_MinAtar 14 | from Models.MonotonicNN import MonotonicNN 15 | 16 | 17 | 18 | ############################################################################### 19 | ########################## Class UMDQN_C_Model_Atari ########################## 20 | ############################################################################### 21 | 22 | class UMDQN_C_Model_Atari(nn.Module): 23 | """ 24 | GOAL: Implementing the DL model for the UMDQN-C distributional RL algorithm. 25 | 26 | VARIABLES: - stateEmbeddingDNN: State embedding part of the Deep Neural Network. 27 | - UMNN: UMNN part of the Deep Neural Network. 28 | 29 | METHODS: - __init__: Initialization of the Deep Neural Network. 30 | - forward: Forward pass of the Deep Neural Network. 31 | - getDerivative: Get the derivative internally computed by the UMNN. 
32 | - getExpectation: Get the expectation of the PDF internally computed by the UMNN. 33 | """ 34 | 35 | def __init__(self, numberOfInputs, numberOfOutputs, 36 | structureUMNN, stateEmbedding, numberOfSteps, 37 | device='cpu', minAtar=False): 38 | """ 39 | GOAL: Defining and initializing the Deep Neural Network. 40 | 41 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 42 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 43 | - structureUMNN: Structure of the UMNN for distribution representation. 44 | - stateEmbedding: Dimension of the state embedding. 45 | - numberOfSteps: Number of integration steps for the UMNN. 46 | - device: Hardware device (CPU or GPU). 47 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 48 | 49 | OUTPUTS: / 50 | """ 51 | 52 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 53 | super(UMDQN_C_Model_Atari, self).__init__() 54 | 55 | # Initialization of the Deep Neural Network 56 | if minAtar: 57 | self.stateEmbeddingDNN = DNN_MinAtar(numberOfInputs, stateEmbedding) 58 | else: 59 | self.stateEmbeddingDNN = DNN_Atari(numberOfInputs, stateEmbedding) 60 | self.UMNN = MonotonicNN(stateEmbedding+1, structureUMNN, numberOfSteps, numberOfOutputs, device) 61 | 62 | 63 | def forward(self, state, q): 64 | """ 65 | GOAL: Implementing the forward pass of the Deep Neural Network. 66 | 67 | INPUTS: - state: RL state. 68 | - q: Samples of potential returns. 69 | 70 | OUTPUTS: - output: Output of the Deep Neural Network. 71 | """ 72 | 73 | # State embedding part of the Deep Neural Network 74 | batchSize = state.size(0) 75 | x = self.stateEmbeddingDNN(state) 76 | x = x.repeat(1, int(len(q)/len(state))).view(-1, x.size(1)) 77 | 78 | # UMNNN part of the Deep Neural Network 79 | x = self.UMNN(q, x) 80 | 81 | # Sigmoid activation function + appropriate format 82 | x = torch.sigmoid(x) 83 | return torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 84 | 85 | 86 | def getDerivative(self, state, q): 87 | """ 88 | GOAL: Get the derivative internally computed by the UMNN. 89 | 90 | INPUTS: - state: RL state. 91 | - q: Samples of potential returns. 92 | 93 | OUTPUTS: - output: Derivative internally computed by the UMNN. 94 | """ 95 | 96 | # State embedding part of the Deep Neural Network 97 | batchSize = state.size(0) 98 | x = self.stateEmbeddingDNN(state) 99 | x = x.repeat(1, int(len(q)/len(state))).view(-1, x.size(1)) 100 | 101 | # Computation of both PDF and CDF 102 | pdf = self.UMNN(q, x, only_derivative=True) 103 | cdf = self.UMNN(q, x, only_derivative=False) 104 | 105 | # Correction of the sigmoid + appropriate format 106 | x = torch.sigmoid(cdf) 107 | x = x * (1 - x) * pdf 108 | return torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 109 | 110 | 111 | def getExpectation(self, state, minReturn, maxReturn, numberOfPoints): 112 | """ 113 | GOAL: Get the expectation of the PDF internally computed by the UMNN. 114 | 115 | INPUTS: - state: RL state. 116 | - minReturn: Minimum return. 117 | - maxReturn: Maximum return. 118 | - numberOfPoints: Number of points for the computations (accuracy). 119 | 120 | OUTPUTS: - expectation: Expectation computed. 
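        EXAMPLE: Sketch of how Q-values are recovered from the learned return
                 distributions (model, state, return bounds and accuracy are
                 hypothetical):

                     QValues = model.getExpectation(state, -10, 10, 1000)
                     # state: batch of Atari observations, shape (32, 4, 84, 84)
                     # QValues.shape == (32, numberOfActions); the greedy action
                     # is QValues.argmax(dim=1).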
121 | """ 122 | 123 | # State embedding part of the Deep Neural Network 124 | state = self.stateEmbeddingDNN(state) 125 | 126 | # Computation of the expectation of the PDF internally computed by the UMNN 127 | expectation = self.UMNN.expectation(state, lambda x: x, lambda x: torch.sigmoid(x)*(1-torch.sigmoid(x)), minReturn, maxReturn, numberOfPoints) 128 | return expectation 129 | 130 | -------------------------------------------------------------------------------- /Models/UMDQN_KL_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.FeedforwardDNN import FeedForwardDNN 13 | from Models.MonotonicNN import OneDimensionnalNF 14 | 15 | 16 | 17 | ############################################################################### 18 | ############################ Class UMDQN_KL_Model ############################# 19 | ############################################################################### 20 | 21 | class UMDQN_KL_Model(nn.Module): 22 | """ 23 | GOAL: Implementing the DL model for the UMDQN-KL distributional RL algorithm. 24 | 25 | VARIABLES: - stateEmbeddingDNN: State embedding part of the Deep Neural Network. 26 | - UMNN: UMNN part of the Deep Neural Network. 27 | 28 | METHODS: - __init__: Initialization of the Deep Neural Network. 29 | - forward: Forward pass of the Deep Neural Network. 30 | - getExpectation: Get the expectation of the PDF approximated by the UMNN. 31 | """ 32 | 33 | def __init__(self, numberOfInputs, numberOfOutputs, 34 | structureDNN, structureUMNN, stateEmbedding, 35 | numberOfSteps, device='cpu'): 36 | """ 37 | GOAL: Defining and initializing the Deep Neural Network. 38 | 39 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 40 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 41 | - structureDNN: Structure of the feedforward DNN for state embedding. 42 | - structureUMNN: Structure of the UMNN for distribution representation. 43 | - stateEmbedding: Dimension of the state embedding. 44 | - numberOfSteps: Number of integration steps for the UMNN. 45 | - device: Hardware device (CPU or GPU). 46 | 47 | OUTPUTS: / 48 | """ 49 | 50 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 51 | super(UMDQN_KL_Model, self).__init__() 52 | 53 | # Initialization of the Deep Neural Network 54 | self.stateEmbeddingDNN = FeedForwardDNN(numberOfInputs, stateEmbedding, structureDNN) 55 | self.UMNN = OneDimensionnalNF(stateEmbedding+1, structureUMNN, numberOfSteps, numberOfOutputs, device) 56 | 57 | 58 | def forward(self, state, q): 59 | """ 60 | GOAL: Implementing the forward pass of the Deep Neural Network. 61 | 62 | INPUTS: - state: RL state. 63 | - q: Samples of potential returns. 64 | 65 | OUTPUTS: - output: Output of the Deep Neural Network. 
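        EXAMPLE: Shape sketch with hypothetical dimensions; the call mirrors
                 UMDQN_C_Model.forward, but the output contains PDF values
                 (the exponential of the log-density computed by the normalising
                 flow, clamped to a minimum of 1e-6) rather than CDF values:

                     pdf = model(state, q)
                     # state: (32, 4), q: (32 * 51, 1)
                     # pdf.shape == (32 * numberOfActions, 51)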
66 | """ 67 | 68 | # State embedding part of the Deep Neural Network 69 | batchSize = state.size(0) 70 | x = self.stateEmbeddingDNN(state) 71 | x = x.repeat(1, int(len(q)/len(state))).view(-1, x.size(1)) 72 | 73 | # UMNN part of the Deep Neural Network 74 | x = self.UMNN(q, x) 75 | 76 | # Formatting of the output and post processing operations 77 | x = torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 78 | x = torch.exp(x) 79 | x = x.clamp(min=1e-6) 80 | 81 | return x 82 | 83 | 84 | def getExpectation(self, state, minReturn, maxReturn, numberOfPoints): 85 | """ 86 | GOAL: Get the expectation of the PDF internally computed by the UMNN. 87 | 88 | INPUTS: - state: RL state. 89 | - minReturn: Minimum return. 90 | - maxReturn: Maximum return. 91 | - numberOfPoints: Number of points for the computations (accuracy). 92 | 93 | OUTPUTS: - expectation: Expectation computed. 94 | """ 95 | 96 | # State embedding part of the Deep Neural Network 97 | state = self.stateEmbeddingDNN(state) 98 | 99 | # Computation of the expectation of the PDF internally computed by the UMNN 100 | expectation = self.UMNN.expectation(state, lambda x: x, minReturn, maxReturn, numberOfPoints) 101 | return expectation 102 | -------------------------------------------------------------------------------- /Models/UMDQN_KL_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.DNN_Atari import DNN_Atari 13 | from Models.DNN_MinAtar import DNN_MinAtar 14 | from Models.MonotonicNN import OneDimensionnalNF 15 | 16 | 17 | 18 | ############################################################################### 19 | ######################### Class UMDQN_KL_Model_Atari ########################## 20 | ############################################################################### 21 | 22 | class UMDQN_KL_Model_Atari(nn.Module): 23 | """ 24 | GOAL: Implementing the DL model for the UMDQN-KL distributional RL algorithm. 25 | 26 | VARIABLES: - stateEmbeddingDNN: State embedding part of the Deep Neural Network. 27 | - UMNN: UMNN part of the Deep Neural Network. 28 | 29 | METHODS: - __init__: Initialization of the Deep Neural Network. 30 | - forward: Forward pass of the Deep Neural Network. 31 | - getExpectation: Get the expectation of the PDF approximated by the UMNN. 32 | """ 33 | 34 | def __init__(self, numberOfInputs, numberOfOutputs, 35 | structureUMNN, stateEmbedding, numberOfSteps, 36 | device='cpu', minAtar=False): 37 | """ 38 | GOAL: Defining and initializing the Deep Neural Network. 39 | 40 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 41 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 42 | - structureUMNN: Structure of the UMNN for distribution representation. 43 | - stateEmbedding: Dimension of the state embedding. 44 | - numberOfSteps: Number of integration steps for the UMNN. 45 | - device: Hardware device (CPU or GPU). 46 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 
47 | 48 | OUTPUTS: / 49 | """ 50 | 51 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 52 | super(UMDQN_KL_Model_Atari, self).__init__() 53 | 54 | # Initialization of the Deep Neural Network 55 | if minAtar: 56 | self.stateEmbeddingDNN = DNN_MinAtar(numberOfInputs, stateEmbedding) 57 | else: 58 | self.stateEmbeddingDNN = DNN_Atari(numberOfInputs, stateEmbedding) 59 | self.UMNN = OneDimensionnalNF(stateEmbedding+1, structureUMNN, numberOfSteps, numberOfOutputs, device) 60 | 61 | 62 | def forward(self, state, q): 63 | """ 64 | GOAL: Implementing the forward pass of the Deep Neural Network. 65 | 66 | INPUTS: - state: RL state. 67 | - q: Samples of potential returns. 68 | 69 | OUTPUTS: - output: Output of the Deep Neural Network. 70 | """ 71 | 72 | # State embedding part of the Deep Neural Network 73 | batchSize = state.size(0) 74 | x = self.stateEmbeddingDNN(state) 75 | x = x.repeat(1, int(len(q)/len(state))).view(-1, x.size(1)) 76 | 77 | # UMNN part of the Deep Neural Network 78 | x = self.UMNN(q, x) 79 | 80 | # Formatting of the output and post processing operations 81 | x = torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 82 | x = torch.exp(x) 83 | x = x.clamp(min=1e-6) 84 | 85 | return x 86 | 87 | 88 | def getExpectation(self, state, minReturn, maxReturn, numberOfPoints): 89 | """ 90 | GOAL: Get the expectation of the PDF internally computed by the UMNN. 91 | 92 | INPUTS: - state: RL state. 93 | - minReturn: Minimum return. 94 | - maxReturn: Maximum return. 95 | - numberOfPoints: Number of points for the computations (accuracy). 96 | 97 | OUTPUTS: - expectation: Expectation computed. 98 | """ 99 | 100 | # State embedding part of the Deep Neural Network 101 | state = self.stateEmbeddingDNN(state) 102 | 103 | # Computation of the expectation of the PDF internally computed by the UMNN 104 | expectation = self.UMNN.expectation(state, lambda x: x, minReturn, maxReturn, numberOfPoints) 105 | return expectation 106 | -------------------------------------------------------------------------------- /Models/UMDQN_W_Model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.FeedforwardDNN import FeedForwardDNN 13 | from Models.MonotonicNN import MonotonicNN 14 | 15 | 16 | 17 | ############################################################################### 18 | ############################### Class UMDQN_W_Model ########################### 19 | ############################################################################### 20 | 21 | class UMDQN_W_Model(nn.Module): 22 | """ 23 | GOAL: Implementing the DL model for the UMDQN-W distributional RL algorithm. 24 | 25 | VARIABLES: - stateEmbeddingDNN: State embedding part of the Deep Neural Network. 26 | - UMNN: UMNN part of the Deep Neural Network. 27 | 28 | METHODS: - __init__: Initialization of the Deep Neural Network. 29 | - forward: Forward pass of the Deep Neural Network. 
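    EXAMPLE: Shape sketch with hypothetical dimensions; the UMNN here models
             the quantile function (inverse CDF) of the random return, which
             is monotonically increasing in tau by construction:

                 quantiles = model(state, taus)
                 # state: (32, 4), taus: (32 * 51, 1) with values in [0, 1]
                 # quantiles.shape == (32 * numberOfActions, 51)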
30 | """ 31 | 32 | def __init__(self, numberOfInputs, numberOfOutputs, 33 | structureDNN, structureUMNN, stateEmbedding, 34 | numberOfSteps, device='cpu'): 35 | """ 36 | GOAL: Defining and initializing the Deep Neural Network. 37 | 38 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 39 | - numberOfOutputs: Number of outputs of the Deep Neural Network. 40 | - structureDNN: Structure of the feedforward DNN for state embedding. 41 | - structureUMNN: Structure of the UMNN for distribution representation. 42 | - stateEmbedding: Dimension of the state embedding. 43 | - numberOfSteps: Number of integration steps for the UMNN. 44 | - device: Hardware device (CPU or GPU). 45 | 46 | OUTPUTS: / 47 | """ 48 | 49 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 50 | super(UMDQN_W_Model, self).__init__() 51 | 52 | # Initialization of the Deep Neural Network 53 | self.stateEmbeddingDNN = FeedForwardDNN(numberOfInputs, stateEmbedding, structureDNN) 54 | self.UMNN = MonotonicNN(stateEmbedding+1, structureUMNN, numberOfSteps, numberOfOutputs, device) 55 | 56 | 57 | def forward(self, state, taus): 58 | """ 59 | GOAL: Implementing the forward pass of the Deep Neural Network. 60 | 61 | INPUTS: - state: RL state. 62 | - taus: Samples of taus. 63 | 64 | OUTPUTS: - output: Output of the Deep Neural Network. 65 | """ 66 | 67 | # State embedding part of the Deep Neural Network 68 | batchSize = state.size(0) 69 | x = self.stateEmbeddingDNN(state) 70 | x = x.repeat(1, int(len(taus)/len(state))).view(-1, x.size(1)) 71 | 72 | # UMNN part of the Deep Neural Network 73 | x = self.UMNN(taus, x) 74 | 75 | # Appropriate format 76 | return torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 77 | -------------------------------------------------------------------------------- /Models/UMDQN_W_Model_Atari.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import torch 8 | import torch.nn as nn 9 | # pylint: disable=E1101 10 | # pylint: disable=E1102 11 | 12 | from Models.DNN_Atari import DNN_Atari 13 | from Models.DNN_MinAtar import DNN_MinAtar 14 | from Models.MonotonicNN import MonotonicNN 15 | 16 | 17 | 18 | ############################################################################### 19 | ########################### Class UMDQN_W_Model_Atari ######################### 20 | ############################################################################### 21 | 22 | class UMDQN_W_Model_Atari(nn.Module): 23 | """ 24 | GOAL: Implementing the DL model for the UMDQN-W distributional RL algorithm. 25 | 26 | VARIABLES: - stateEmbeddingDNN: State embedding part of the Deep Neural Network. 27 | - UMNN: UMNN part of the Deep Neural Network. 28 | 29 | METHODS: - __init__: Initialization of the Deep Neural Network. 30 | - forward: Forward pass of the Deep Neural Network. 31 | """ 32 | 33 | def __init__(self, numberOfInputs, numberOfOutputs, 34 | structureUMNN, stateEmbedding, 35 | numberOfSteps, device='cpu', minAtar=False): 36 | """ 37 | GOAL: Defining and initializing the Deep Neural Network. 38 | 39 | INPUTS: - numberOfInputs: Number of inputs of the Deep Neural Network. 40 | - numberOfOutputs: Number of outputs of the Deep Neural Network.
41 | - structureUMNN: Structure of the UMNN for distribution representation. 42 | - stateEmbedding: Dimension of the state embedding. 43 | - numberOfSteps: Number of integration steps for the UMNN. 44 | - device: Hardware device (CPU or GPU). 45 | - minAtar: Boolean specifying whether the env is "MinAtar" or not. 46 | 47 | OUTPUTS: / 48 | """ 49 | 50 | # Call the constructor of the parent class (Pytorch torch.nn.Module) 51 | super(UMDQN_W_Model_Atari, self).__init__() 52 | 53 | # Initialization of the Deep Neural Network 54 | if minAtar: 55 | self.stateEmbeddingDNN = DNN_MinAtar(numberOfInputs, stateEmbedding) 56 | else: 57 | self.stateEmbeddingDNN = DNN_Atari(numberOfInputs, stateEmbedding) 58 | self.UMNN = MonotonicNN(stateEmbedding+1, structureUMNN, numberOfSteps, numberOfOutputs, device) 59 | 60 | 61 | def forward(self, state, taus): 62 | """ 63 | GOAL: Implementing the forward pass of the Deep Neural Network. 64 | 65 | INPUTS: - state: RL state. 66 | - taus: Samples of taus. 67 | 68 | OUTPUTS: - output: Output of the Deep Neural Network. 69 | """ 70 | 71 | # State embedding part of the Deep Neural Network 72 | batchSize = state.size(0) 73 | x = self.stateEmbeddingDNN(state) 74 | x = x.repeat(1, int(len(taus)/len(state))).view(-1, x.size(1)) 75 | 76 | # UMNN part of the Deep Neural Network 77 | x = self.UMNN(taus, x) 78 | 79 | # Appropriate format 80 | return torch.cat(torch.chunk(torch.transpose(x, 0, 1), batchSize, dim=1), 0) 81 | -------------------------------------------------------------------------------- /Models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/Models/__init__.py -------------------------------------------------------------------------------- /MonteCarloDistributions.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import pandas as pd 8 | 9 | from matplotlib import pyplot as plt 10 | from matplotlib import rc 11 | rc('text', usetex=True) 12 | 13 | 14 | 15 | ############################################################################### 16 | ################################ Global variables ############################# 17 | ############################################################################### 18 | 19 | # Default parameters for the plotting of the distributions 20 | numberOfSamples = 10000000 21 | bins = 1000 22 | density = True 23 | plotRange = (-2.1, 2.1) 24 | histtype = 'step' 25 | 26 | 27 | 28 | ############################################################################### 29 | ####################### Class MonteCarloDistributions ######################### 30 | ############################################################################### 31 | 32 | class MonteCarloDistributions(): 33 | """ 34 | GOAL: Implementing a technique based on Monte Carlo to estimate the true 35 | distribution of the random return associated with an environment and a policy. 36 | 37 | VARIABLES: - environment: Environment analysed. 38 | - policy: Policy analysed. 39 | - gamma: Discount factor. 40 | 41 | METHODS: - __init__: Initialization of the class.
42 | - samplingMonteCarlo: Generate MC samples of the random return. 43 | - plotDistributions: Plot the distributions from the MC samples. 44 | """ 45 | 46 | def __init__(self, environment, policy, gamma): 47 | """ 48 | GOAL: Perform the initialization of the class. 49 | 50 | INPUTS: - environment: Environment analysed. 51 | - policy: Policy analysed. 52 | - gamma: Discount factor. 53 | 54 | OUTPUTS: / 55 | """ 56 | 57 | # Initialization of important variables 58 | self.environment = environment 59 | self.policy = policy 60 | self.gamma = gamma 61 | 62 | 63 | def samplingMonteCarlo(self, initialState, initialAction, numberOfSamples=numberOfSamples): 64 | """ 65 | GOAL: Collect Monte Carlo samples of the random return associated 66 | with the state and action specified. 67 | 68 | INPUTS: - initialState: RL state to start from. 69 | - initialAction: RL action to start from. 70 | 71 | - numberOfSamples: Number of Monte Carlo samples to collect. 72 | 73 | OUTPUTS: - samples: Monte Carlo samples collected. 74 | """ 75 | 76 | # Initialization of the memory storing the MC samples 77 | samples = [] 78 | 79 | # Generation of the MC samples 80 | for _i in range(numberOfSamples): 81 | 82 | # Initialization of some variables 83 | expectedReturn = 0 84 | step = 0 85 | 86 | # Reset of the environment and initialization to the desired state 87 | self.environment.reset() 88 | state = self.environment.setState(initialState) 89 | 90 | # Execution of the action specified 91 | nextState, reward, done, info = self.environment.step(initialAction) 92 | 93 | # Update of the expected return 94 | expectedReturn += (reward * (self.gamma**step)) 95 | step += 1 96 | 97 | # Loop until episode termination 98 | while done == 0: 99 | 100 | # Execute the next action according to the policy selected 101 | state = self.policy.processState(nextState) 102 | policyAction = self.policy.chooseAction(state, plot=False) 103 | nextState, reward, done, info = self.environment.step(policyAction) 104 | 105 | # Update of the expected return 106 | expectedReturn += (reward * (self.gamma**step)) 107 | step += 1 108 | 109 | # Add the MC sample to the memory 110 | samples.append(expectedReturn) 111 | 112 | # Output the MC samples collected 113 | return samples 114 | 115 | 116 | def plotDistributions(self, state, numberOfSamples=numberOfSamples): 117 | """ 118 | GOAL: Plot the PDF, CDF and QF of the random return estimated from 119 | Monte Carlo samples, for each action available in the state specified. 120 | 121 | INPUTS: - state: RL state to start from. 122 | - numberOfSamples: Number of Monte Carlo samples to collect.
123 | 124 | OUTPUTS: / 125 | """ 126 | 127 | # Generation of the Monte Carlo samples 128 | samples = [] 129 | actions = 4 130 | for action in range(actions): 131 | samples.append(self.samplingMonteCarlo(state, action, numberOfSamples)) 132 | 133 | # Initialization of the figure 134 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 135 | fig = plt.figure() 136 | 137 | # Plotting of the PDF of the random return 138 | ax1 = plt.subplot(3, 1, 1) 139 | for action in range(actions): 140 | plt.hist(samples[action], bins=bins, density=density, range=plotRange, histtype=histtype, color=colors[action]) 141 | ax1.set_xlabel('Random return') 142 | ax1.set_ylabel('PDF') 143 | ax1.set(xlim=(-2, 2)) 144 | 145 | # Plotting of the CDF of the random return 146 | ax2 = plt.subplot(3, 1, 2) 147 | for action in range(actions): 148 | plt.hist(samples[action], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color=colors[action]) 149 | ax2.set_xlabel('Random return') 150 | ax2.set_ylabel('CDF') 151 | ax2.set(xlim=(-2, 2)) 152 | 153 | # Plotting of the QF of the random return 154 | ax3 = plt.subplot(3, 1, 3) 155 | CDF0 = plt.hist(samples[0], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 156 | CDF1 = plt.hist(samples[1], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 157 | CDF2 = plt.hist(samples[2], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 158 | CDF3 = plt.hist(samples[3], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 159 | ax3.clear() 160 | ax3.plot(CDF0[0], CDF0[1][1:], color=colors[0]) 161 | ax3.plot(CDF1[0], CDF1[1][1:], color=colors[1]) 162 | ax3.plot(CDF2[0], CDF2[1][1:], color=colors[2]) 163 | ax3.plot(CDF3[0], CDF3[1][1:], color=colors[3]) 164 | ax3.set_xlabel('Quantile fraction') 165 | ax3.set_ylabel('QF') 166 | ax3.set(xlim=(0, 1)) 167 | ax3.legend(['Move right', 'Move down', 'Move left', 'Move up']) 168 | 169 | # Saving of the figure generated 170 | plt.savefig("Figures/Distributions/MonteCarloDistributions.pdf", format='pdf') 171 | 172 | # Generation of the figure for the PDF of the random return 173 | fig = plt.figure(figsize=(10, 4)) 174 | ax1 = plt.subplot(1, 1, 1) 175 | for action in range(actions): 176 | plt.hist(samples[action], bins=bins, density=density, range=plotRange, histtype=histtype, color=colors[action]) 177 | ax1.set_xlabel('Random return') 178 | ax1.set_ylabel('PDF') 179 | ax1.set(xlim=(-0.5, 1.5), ylim=(0, 3.5)) 180 | plt.savefig("Figures/Distributions/MonteCarloDistributionsPDF.pdf", format='pdf') 181 | # Generation of the figure for the CDF of the random return 182 | fig = plt.figure(figsize=(10, 4)) 183 | ax2 = plt.subplot(1, 1, 1) 184 | for action in range(actions): 185 | plt.hist(samples[action], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color=colors[action]) 186 | ax2.set_xlabel('Random return') 187 | ax2.set_ylabel('CDF') 188 | ax2.set(xlim=(-0.5, 1.5), ylim=(-0.1, 1.1)) 189 | plt.savefig("Figures/Distributions/MonteCarloDistributionsCDF.pdf", format='pdf') 190 | # Generation of the figure for the QF of the random return 191 | fig = plt.figure(figsize=(10, 4)) 192 | ax3 = plt.subplot(1, 1, 1) 193 | CDF0 = plt.hist(samples[0], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 194 | CDF1 = plt.hist(samples[1], bins=bins, density=density, 
range=plotRange, histtype=histtype, cumulative=True, color='white') 195 | CDF2 = plt.hist(samples[2], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 196 | CDF3 = plt.hist(samples[3], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 197 | ax3.clear() 198 | ax3.plot(CDF0[0], CDF0[1][1:], color=colors[0]) 199 | ax3.plot(CDF1[0], CDF1[1][1:], color=colors[1]) 200 | ax3.plot(CDF2[0], CDF2[1][1:], color=colors[2]) 201 | ax3.plot(CDF3[0], CDF3[1][1:], color=colors[3]) 202 | ax3.set_xlabel('Quantile fraction') 203 | ax3.set_ylabel('QF') 204 | ax3.set(xlim=(0, 1), ylim=(-0.5, 1.5)) 205 | plt.savefig("Figures/Distributions/MonteCarloDistributionsQF.pdf", format='pdf') 206 | 207 | # Saving of the data into external files 208 | PDF0 = plt.hist(samples[0], bins=bins, density=density, range=plotRange, histtype=histtype, color='white') 209 | PDF1 = plt.hist(samples[1], bins=bins, density=density, range=plotRange, histtype=histtype, color='white') 210 | PDF2 = plt.hist(samples[2], bins=bins, density=density, range=plotRange, histtype=histtype, color='white') 211 | PDF3 = plt.hist(samples[3], bins=bins, density=density, range=plotRange, histtype=histtype, color='white') 212 | CDF0 = plt.hist(samples[0], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 213 | CDF1 = plt.hist(samples[1], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 214 | CDF2 = plt.hist(samples[2], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 215 | CDF3 = plt.hist(samples[3], bins=bins, density=density, range=plotRange, histtype=histtype, cumulative=True, color='white') 216 | dataPDF = { 217 | 'Action0_x': PDF0[1][1:], 218 | 'Action0_y': PDF0[0], 219 | 'Action1_x': PDF1[1][1:], 220 | 'Action1_y': PDF1[0], 221 | 'Action2_x': PDF2[1][1:], 222 | 'Action2_y': PDF2[0], 223 | 'Action3_x': PDF3[1][1:], 224 | 'Action3_y': PDF3[0], 225 | } 226 | dataCDF = { 227 | 'Action0_x': CDF0[1][1:], 228 | 'Action0_y': CDF0[0], 229 | 'Action1_x': CDF1[1][1:], 230 | 'Action1_y': CDF1[0], 231 | 'Action2_x': CDF2[1][1:], 232 | 'Action2_y': CDF2[0], 233 | 'Action3_x': CDF3[1][1:], 234 | 'Action3_y': CDF3[0], 235 | } 236 | dataQF = { 237 | 'Action0_y': CDF0[1][1:], 238 | 'Action0_x': CDF0[0], 239 | 'Action1_y': CDF1[1][1:], 240 | 'Action1_x': CDF1[0], 241 | 'Action2_y': CDF2[1][1:], 242 | 'Action2_x': CDF2[0], 243 | 'Action3_y': CDF3[1][1:], 244 | 'Action3_x': CDF3[0], 245 | } 246 | dataframePDF = pd.DataFrame(dataPDF) 247 | dataframeCDF = pd.DataFrame(dataCDF) 248 | dataframeQF = pd.DataFrame(dataQF) 249 | dataframePDF.to_csv('Figures/Distributions/MonteCarloPDF.csv') 250 | dataframeCDF.to_csv('Figures/Distributions/MonteCarloCDF.csv') 251 | dataframeQF.to_csv('Figures/Distributions/MonteCarloQF.csv') 252 | -------------------------------------------------------------------------------- /Parameters/parameters_CDQN_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "numberOfAtoms": 51, 20 | "minReturn": -5, 21 | "maxReturn": 5, 22 | 23 | 
"rewardClipping": 1, 24 | "gradientClipping": 1, 25 | 26 | "atari": 1, 27 | "minatar": 0, 28 | 29 | "GPUNumber": 0 30 | } -------------------------------------------------------------------------------- /Parameters/parameters_CDQN_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128, 128], 19 | "numberOfAtoms": 51, 20 | "minReturn": -10, 21 | "maxReturn": 110, 22 | 23 | "rewardClipping": 1000, 24 | "gradientClipping": 1, 25 | 26 | "atari": 0, 27 | "minatar": 0, 28 | 29 | "GPUNumber": 0 30 | } -------------------------------------------------------------------------------- /Parameters/parameters_CDQN_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "numberOfAtoms": 51, 20 | "minReturn": -1, 21 | "maxReturn": 10, 22 | 23 | "rewardClipping": 1, 24 | "gradientClipping": 1, 25 | 26 | "atari": 0, 27 | "minatar": 1, 28 | 29 | "GPUNumber": 0 30 | } -------------------------------------------------------------------------------- /Parameters/parameters_CDQN_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128, 128], 19 | "numberOfAtoms": 51, 20 | "minReturn": -2, 21 | "maxReturn": 2, 22 | 23 | "rewardClipping": 2, 24 | "gradientClipping": 1, 25 | 26 | "atari": 0, 27 | "minatar": 0, 28 | 29 | "GPUNumber": 0 30 | } -------------------------------------------------------------------------------- /Parameters/parameters_DQN_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | 20 | "rewardClipping": 1, 21 | "gradientClipping": 1, 22 | 23 | "atari": 1, 24 | "minatar": 0, 25 | 26 | "GPUNumber": 0 27 | } -------------------------------------------------------------------------------- /Parameters/parameters_DQN_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | "learningRate": 0.0001, 4 | 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 
15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128, 128], 19 | 20 | "rewardClipping": 1000, 21 | "gradientClipping": 1, 22 | 23 | "atari": 0, 24 | "minatar": 0, 25 | 26 | "GPUNumber": 0 27 | } -------------------------------------------------------------------------------- /Parameters/parameters_DQN_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | 20 | "rewardClipping": 1, 21 | "gradientClipping": 1, 22 | 23 | "atari": 0, 24 | "minatar": 1, 25 | 26 | "GPUNumber": 0 27 | } -------------------------------------------------------------------------------- /Parameters/parameters_DQN_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | "learningRate": 0.0001, 4 | 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128, 128], 19 | 20 | "rewardClipping": 2, 21 | "gradientClipping": 1, 22 | 23 | "atari": 0, 24 | "minatar": 0, 25 | 26 | "GPUNumber": 0 27 | } -------------------------------------------------------------------------------- /Parameters/parameters_FQF_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 512, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 1, 25 | "gradientClipping": 1, 26 | 27 | "atari": 1, 28 | "minatar": 0, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_FQF_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 128, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 1000, 25 | "gradientClipping": 1, 26 | 27 | "atari": 0, 28 | "minatar": 0, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_FQF_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 
1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 128, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 1, 25 | "gradientClipping": 1, 26 | 27 | "atari": 0, 28 | "minatar": 1, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_FQF_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 128, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 2, 25 | "gradientClipping": 1, 26 | 27 | "atari": 0, 28 | "minatar": 0, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_IQN_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 512, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 1, 25 | "gradientClipping": 1, 26 | 27 | "atari": 1, 28 | "minatar": 0, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_IQN_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 128, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 1000, 25 | "gradientClipping": 1, 26 | 27 | "atari": 0, 28 | "minatar": 0, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_IQN_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 128, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 1, 25 | "gradientClipping": 1, 26 | 27 | "atari": 0, 28 | "minatar": 1, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_IQN_StochasticGridWorld.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "stateEmbedding": 128, 20 | "N": 32, 21 | "K": 32, 22 | "NCos": 64, 23 | 24 | "rewardClipping": 2, 25 | "gradientClipping": 1, 26 | 27 | "atari": 0, 28 | "minatar": 0, 29 | 30 | "GPUNumber": 0 31 | } -------------------------------------------------------------------------------- /Parameters/parameters_QR_DQN_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "numberOfQuantiles": 200, 20 | 21 | "rewardClipping": 1, 22 | "gradientClipping": 1, 23 | 24 | "atari": 1, 25 | "minatar": 0, 26 | 27 | "GPUNumber": 0 28 | } -------------------------------------------------------------------------------- /Parameters/parameters_QR_DQN_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128, 128], 19 | "numberOfQuantiles": 200, 20 | 21 | "rewardClipping": 1000, 22 | "gradientClipping": 1, 23 | 24 | "atari": 0, 25 | "minatar": 0, 26 | 27 | "GPUNumber": 0 28 | } -------------------------------------------------------------------------------- /Parameters/parameters_QR_DQN_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.0003125, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "numberOfQuantiles": 200, 20 | 21 | "rewardClipping": 1, 22 | "gradientClipping": 1, 23 | 24 | "atari": 0, 25 | "minatar": 1, 26 | 27 | "GPUNumber": 0 28 | } -------------------------------------------------------------------------------- /Parameters/parameters_QR_DQN_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128, 128], 19 | "numberOfQuantiles": 200, 20 | 21 | "rewardClipping": 2, 22 | "gradientClipping": 1, 23 | 24 | "atari": 0, 25 | "minatar": 0, 26 | 27 | "GPUNumber": 0 28 | } 
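The epsilonStart, epsilonEnd and epsilonDecay entries in the parameter files above drive the exponential epsilon-greedy exploration schedule implemented by the agents (see the epsilonValue lambda in QR_DQN.py and UMDQN_KL.py further below). A minimal standalone sketch, assuming the StochasticGridWorld values, evaluating that schedule:

```python
# Standalone sketch (not part of the repository): epsilon-greedy schedule defined by the
# epsilonStart/epsilonEnd/epsilonDecay entries, using the same formula as the epsilonValue
# lambda in QR_DQN.py and UMDQN_KL.py below.
import math

# Values assumed from parameters_QR_DQN_StochasticGridWorld.json.
epsilonStart, epsilonEnd, epsilonDecay = 1.0, 0.01, 10000

def epsilonValue(iteration):
    # Exponential decay from epsilonStart towards epsilonEnd.
    return epsilonEnd + (epsilonStart - epsilonEnd) * math.exp(-iteration / epsilonDecay)

for iteration in [0, 10000, 50000]:
    print(iteration, round(epsilonValue(iteration), 4))
# Expected output: 0 -> 1.0, 10000 -> 0.3742, 50000 -> 0.0167
```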
-------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_C_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 512, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -5, 25 | "maxReturn": 5, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 1, 30 | "gradientClipping": 1, 31 | 32 | "atari": 1, 33 | "minatar": 0, 34 | 35 | "GPUNumber": 0 36 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_C_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -10, 25 | "maxReturn": 110, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 1000, 30 | "gradientClipping": 1, 31 | 32 | "atari": 0, 33 | "minatar": 0, 34 | 35 | "GPUNumber": 0 36 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_C_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -1, 25 | "maxReturn": 10, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 1, 30 | "gradientClipping": 1, 31 | 32 | "atari": 0, 33 | "minatar": 1, 34 | 35 | "GPUNumber": 0 36 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_C_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -2, 25 | "maxReturn": 2, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 2, 30 | "gradientClipping": 1, 31 | 32 | "atari": 0, 33 | "minatar": 0, 34 | 35 | "GPUNumber": 0 36 | } 
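The minReturn, maxReturn and numberOfSamples entries of the UMDQN-C parameter files above define the interval and the number of points on which the learned return distribution is evaluated. A minimal sketch, assuming the StochasticGridWorld values, of the discrete support construction mirroring the linspace calls in UMDQN_KL.py further below:

```python
# Standalone sketch (not part of the repository): discrete support of the random return
# defined by minReturn/maxReturn/numberOfSamples, as built in UMDQN_KL.py below.
import torch

# Values assumed from parameters_UMDQN_C_StochasticGridWorld.json.
minReturn, maxReturn, numberOfSamples = -2.0, 2.0, 200

support = torch.linspace(minReturn, maxReturn, numberOfSamples)   # 200 evenly spaced return values
deltaSupport = (support[1] - support[0]).item()                   # spacing between support points
uniformProba = 1.0 / (maxReturn - minReturn)                      # density of a uniform draw on the interval

print(support.shape, round(deltaSupport, 4), uniformProba)        # torch.Size([200]) 0.0201 0.25
```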
-------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_KL_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 512, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -5, 25 | "maxReturn": 5, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 1, 30 | "gradientClipping": 1, 31 | 32 | "atari": 1, 33 | "minatar": 0, 34 | 35 | "GPUNumber": 0 36 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_KL_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -10, 25 | "maxReturn": 110, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 1000, 30 | "gradientClipping": 1, 31 | 32 | "atari": 0, 33 | "minatar": 0, 34 | 35 | "GPUNumber": 0 36 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_KL_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -1, 25 | "maxReturn": 10, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 1, 30 | "gradientClipping": 1, 31 | 32 | "atari": 0, 33 | "minatar": 1, 34 | 35 | "GPUNumber": 0 36 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_KL_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | "minReturn": -2, 25 | "maxReturn": 2, 26 | 27 | "fasterExpectation": 1, 28 | 29 | "rewardClipping": 2, 30 | "gradientClipping": 1, 31 | 32 | "atari": 0, 33 | "minatar": 0, 34 | 35 | "GPUNumber": 0 36 | } 
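For UMDQN-KL, the PDF values produced by the network on this support are turned into Q-values with a Monte Carlo estimate of the expectation E[Z] = ∫ z p(z) dz under uniform sampling of the support, matching the line `QValues = (pdfs * self.supportTorch).sum(1)/(self.numberOfSamples*self.uniformProba)` in UMDQN_KL.py further below. A self-contained sketch with hypothetical Gaussian PDFs standing in for the network output (illustrative only, not from the repository):

```python
# Standalone sketch (not part of the repository): expectation of the random return estimated
# from PDF evaluations on uniformly spaced support points, as in UMDQN_KL.chooseAction.
import math
import torch

numberOfSamples, minReturn, maxReturn = 200, -2.0, 2.0
support = torch.linspace(minReturn, maxReturn, numberOfSamples)
uniformProba = 1.0 / (maxReturn - minReturn)

# Hypothetical PDFs over the support for two actions: Gaussians with means 0.5 and -0.3.
def gaussianPDF(mean, std):
    return torch.exp(-0.5 * ((support - mean) / std) ** 2) / (std * math.sqrt(2 * math.pi))

pdfs = torch.stack([gaussianPDF(0.5, 0.2), gaussianPDF(-0.3, 0.4)])

# Average of z * p(z) over the samples, divided by the density of the uniform
# distribution they were drawn from: an estimate of E[Z] for each action.
QValues = (pdfs * support).sum(1) / (numberOfSamples * uniformProba)
print(QValues)   # roughly tensor([0.5, -0.3]), i.e. the means of the two distributions
```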
-------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_W_Atari57.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 512, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | 25 | "rewardClipping": 1, 26 | "gradientClipping": 1, 27 | 28 | "atari": 1, 29 | "minatar": 0, 30 | 31 | "GPUNumber": 0 32 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_W_ClassicControl.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | 25 | "rewardClipping": 1000, 26 | "gradientClipping": 1, 27 | 28 | "atari": 0, 29 | "minatar": 0, 30 | 31 | "GPUNumber": 0 32 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_W_MinAtar.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.99, 3 | 4 | "learningRate": 0.00005, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 10000, 8 | "learningUpdatePeriod": 4, 9 | 10 | "capacity": 100000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 1000000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | 25 | "rewardClipping": 1, 26 | "gradientClipping": 1, 27 | 28 | "atari": 0, 29 | "minatar": 1, 30 | 31 | "GPUNumber": 0 32 | } -------------------------------------------------------------------------------- /Parameters/parameters_UMDQN_W_StochasticGridWorld.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma": 0.5, 3 | 4 | "learningRate": 0.0001, 5 | "epsilon": 0.00001, 6 | 7 | "targetUpdatePeriod": 1000, 8 | "learningUpdatePeriod": 1, 9 | 10 | "capacity": 10000, 11 | "batchSize": 32, 12 | 13 | "epsilonStart": 1.0, 14 | "epsilonEnd": 0.01, 15 | "epsilonDecay": 10000, 16 | "epsilonTest": 0.001, 17 | 18 | "structureDNN": [128], 19 | "structureUMNN": [128], 20 | "stateEmbedding": 128, 21 | "numberOfSteps": 50, 22 | 23 | "numberOfSamples": 200, 24 | 25 | "rewardClipping": 2, 26 | "gradientClipping": 1, 27 | 28 | "atari": 0, 29 | "minatar": 0, 30 | 31 | "GPUNumber": 0 32 | } -------------------------------------------------------------------------------- /QR_DQN.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports 
################################### 5 | ############################################################################### 6 | 7 | import math 8 | 9 | from matplotlib import pyplot as plt 10 | 11 | import torch 12 | import torch.optim as optim 13 | 14 | from replayMemory import ReplayMemory 15 | 16 | from Models.QR_DQN_Model import QR_DQN_Model 17 | from Models.QR_DQN_Model_Atari import QR_DQN_Model_Atari 18 | 19 | from DQN import DQN 20 | 21 | 22 | 23 | ############################################################################### 24 | ################################## Class QR_DQN ################################ 25 | ############################################################################### 26 | 27 | class QR_DQN(DQN): 28 | """ 29 | GOAL: Implementing the QR-DQN Deep Reinforcement Learning algorithm. 30 | 31 | VARIABLES: - device: Hardware specification (CPU or GPU). 32 | - gamma: Discount factor of the RL algorithm. 33 | - learningRate: Learning rate of the DL optimizer (ADAM). 34 | - epsilon: Epsilon value for the DL optimizer (ADAM). 35 | - targetNetworkUpdate: Update frequency of the target network. 36 | - learningUpdatePeriod: Frequency of the learning procedure. 37 | - batchSize: Size of the batch to sample from the replay memory. 38 | - capacity: Capacity of the replay memory. 39 | - replayMemory: Experience Replay memory. 40 | - rewardClipping: Clipping of the RL rewards. 41 | - gradientClipping: Clipping of the training loss. 42 | - optimizer: DL optimizer (ADAM). 43 | - epsilonStart: Initial value of epsilon (Epsilon-Greedy). 44 | - epsilonEnd: Final value of epsilon (Epsilon-Greedy). 45 | - epsilonDecay: Exponential decay of epsilon (Epsilon-Greedy). 46 | - epsilonTest: Test value of epsilon (Epsilon-Greedy). 47 | - epsilonValue: Current value of epsilon (Epsilon-Greedy). 48 | - policyNetwork: Deep Neural Network representing the info used by the RL policy. 49 | - targetNetwork: Deep Neural Network representing the target network. 50 | 51 | METHODS: - __init__: Initialization of the RL algorithm. 52 | - chooseAction: Choose a valid action based on the current state 53 | observed, according to the RL policy learned. 54 | - learning: Execute the RL algorithm learning procedure. 55 | """ 56 | 57 | def __init__(self, observationSpace, actionSpace, environment, 58 | parametersFileName='', reporting=True): 59 | """ 60 | GOAL: Initializing the RL agent based on the QR-DQN Deep Reinforcement Learning 61 | algorithm, by setting up the algorithm parameters as well as 62 | the Deep Neural Networks. 63 | 64 | INPUTS: - observationSpace: RL observation space. 65 | - actionSpace: RL action space. 66 | - environment: Name of the RL environment. 67 | - parametersFileName: Name of the JSON parameters file. 68 | - reporting: Enable the reporting of the results. 
69 | 70 | OUTPUTS: / 71 | """ 72 | 73 | # Initialization of the DQN parent class 74 | DQN.__init__(self, observationSpace, actionSpace, environment, parametersFileName, False) 75 | 76 | # Setting of the parameters 77 | if parametersFileName == '': 78 | parametersFileName = ''.join(['Parameters/parameters_QR_DQN_', str(environment), '.json']) 79 | parameters = self.readParameters(parametersFileName) 80 | 81 | # Set the device for DNN computations (CPU or GPU) 82 | self.device = torch.device('cuda:'+str(parameters['GPUNumber']) if torch.cuda.is_available() else 'cpu') 83 | 84 | # Set the general parameters of the RL algorithm 85 | self.gamma = parameters['gamma'] 86 | self.learningRate = parameters['learningRate'] 87 | self.epsilon = parameters['epsilon'] 88 | self.targetUpdatePeriod = parameters['targetUpdatePeriod'] 89 | self.learningUpdatePeriod = parameters['learningUpdatePeriod'] 90 | self.rewardClipping = parameters['rewardClipping'] 91 | self.gradientClipping = parameters['gradientClipping'] 92 | 93 | # Set the Experience Replay mechanism 94 | self.batchSize = parameters['batchSize'] 95 | self.capacity = parameters['capacity'] 96 | self.replayMemory = ReplayMemory(self.capacity) 97 | 98 | # Set the distribution support 99 | self.numberOfQuantiles = parameters['numberOfQuantiles'] 100 | self.quantileProbability = 1./self.numberOfQuantiles 101 | self.tau = ((torch.linspace(0.0, 1.0, self.numberOfQuantiles+1)[:-1] + torch.linspace(0.0, 1.0, self.numberOfQuantiles+1)[1:])/2).to(self.device) 102 | self.kappa = 1.0 103 | 104 | # Set the two Deep Neural Networks of the RL algorithm (policy and target) 105 | self.atari = parameters['atari'] 106 | self.minatar = parameters['minatar'] 107 | if self.atari or self.minatar: 108 | self.policyNetwork = QR_DQN_Model_Atari(observationSpace, actionSpace*self.numberOfQuantiles, self.numberOfQuantiles, minAtar=self.minatar).to(self.device) 109 | self.targetNetwork = QR_DQN_Model_Atari(observationSpace, actionSpace*self.numberOfQuantiles, self.numberOfQuantiles, minAtar=self.minatar).to(self.device) 110 | else: 111 | self.policyNetwork = QR_DQN_Model(observationSpace, actionSpace*self.numberOfQuantiles, parameters['structureDNN'], self.numberOfQuantiles).to(self.device) 112 | self.targetNetwork = QR_DQN_Model(observationSpace, actionSpace*self.numberOfQuantiles, parameters['structureDNN'], self.numberOfQuantiles).to(self.device) 113 | self.targetNetwork.load_state_dict(self.policyNetwork.state_dict()) 114 | 115 | # Set the Deep Learning optimizer 116 | self.optimizer = optim.Adam(self.policyNetwork.parameters(), lr=self.learningRate, eps=self.epsilon) 117 | 118 | # Set the Epsilon-Greedy exploration technique 119 | self.epsilonStart = parameters['epsilonStart'] 120 | self.epsilonEnd = parameters['epsilonEnd'] 121 | self.epsilonDecay = parameters['epsilonDecay'] 122 | self.epsilonTest = parameters['epsilonTest'] 123 | self.epsilonValue = lambda iteration: self.epsilonEnd + (self.epsilonStart - self.epsilonEnd) * math.exp(-1 * iteration / self.epsilonDecay) 124 | 125 | # Initialization of the experiment folder and tensorboard writer 126 | self.initReporting(parameters, 'QR_DQN') 127 | 128 | 129 | def chooseAction(self, state, plot=False): 130 | """ 131 | GOAL: Choose a valid RL action from the action space according to the 132 | RL policy as well as the current RL state observed. 133 | 134 | INPUTS: - state: RL state returned by the environment. 135 | - plot: Enable the plotting of the random returns distributions. 
136 | 137 | OUTPUTS: - action: RL action chosen from the action space. 138 | """ 139 | 140 | # Choose the best action based on the RL policy 141 | with torch.no_grad(): 142 | state = torch.from_numpy(state).float().to(self.device).unsqueeze(0) 143 | quantiles = self.policyNetwork(state).squeeze(0) 144 | QValues = quantiles.mean(1) 145 | _, action = QValues.max(0) 146 | 147 | # If required, plot the return distribution associated with each action 148 | if plot: 149 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 150 | fig = plt.figure() 151 | ax = fig.add_subplot() 152 | tau = self.tau.cpu().numpy() 153 | quantiles = quantiles.cpu().numpy() 154 | QValues = QValues.cpu().numpy() 155 | for a in range(self.actionSpace): 156 | ax.plot(tau, quantiles[a], linestyle='-', label=''.join(['Action ', str(a), ' random return Z']), color=colors[a]) 157 | ax.axhline(y=QValues[a], linewidth=2, linestyle='--', label=''.join(['Action ', str(a), ' expected return Q']), color=colors[a]) 158 | ax.set_xlabel('Quantile fraction') 159 | ax.set_ylabel('Quantile Function (QF)') 160 | ax.legend() 161 | plt.show() 162 | 163 | return action.item() 164 | 165 | 166 | def learning(self): 167 | """ 168 | GOAL: Sample a batch of past experiences and learn from it 169 | by updating the Reinforcement Learning policy. 170 | 171 | INPUTS: / 172 | 173 | OUTPUTS: - loss: Loss of the learning procedure. 174 | """ 175 | 176 | # Check that the replay memory is filled enough 177 | if (len(self.replayMemory) >= self.batchSize): 178 | 179 | # Sample a batch of experiences from the replay memory 180 | batch = self.dataLoaderIter.next() 181 | state = batch[0].float().to(self.device) 182 | action = batch[1].long().to(self.device) 183 | reward = batch[2].float().to(self.device) 184 | nextState = batch[3].float().to(self.device) 185 | done = batch[4].float().to(self.device) 186 | 187 | # Computation of the current return distribution 188 | quantiles = self.policyNetwork(state) 189 | action = action.view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.numberOfQuantiles) 190 | quantiles = quantiles.gather(1, action).squeeze(1) 191 | 192 | # Computation of the new distribution to be learnt by the policy DNN 193 | with torch.no_grad(): 194 | nextQuantiles = self.targetNetwork(nextState) 195 | nextAction = nextQuantiles.mean(2).max(1)[1].view(self.batchSize, 1, 1).expand(self.batchSize, 1, self.numberOfQuantiles) 196 | nextQuantiles = nextQuantiles.gather(1, nextAction).squeeze(1) 197 | targetQuantiles = reward.unsqueeze(1) + self.gamma * nextQuantiles * (1 - done.unsqueeze(1)) 198 | 199 | # Computation of the loss 200 | difference = targetQuantiles.unsqueeze(1) - quantiles.unsqueeze(2) 201 | error = difference.abs() 202 | loss = torch.where(error <= self.kappa, 0.5 * error.pow(2), self.kappa * (error - (0.5 * self.kappa))) 203 | loss = (self.tau - (difference < 0).float()).abs() * loss/self.kappa 204 | loss = loss.mean(1).sum(1).mean() 205 | 206 | # Computation of the gradients 207 | self.optimizer.zero_grad() 208 | loss.backward() 209 | 210 | # Gradient Clipping 211 | torch.nn.utils.clip_grad_norm_(self.policyNetwork.parameters(), self.gradientClipping) 212 | 213 | # Perform the Deep Neural Network optimization 214 | self.optimizer.step() 215 | 216 | return loss.item() 217 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributional Reinforcement Learning with Unconstrained Monotonic 
Neural Networks 2 | Experimental code supporting the results presented in the scientific research paper: 3 | > Thibaut Théate, Antoine Wehenkel, Adrien Bolland, Gilles Louppe and Damien Ernst. "Distributional Reinforcement Learning with Unconstrained Monotonic Neural Networks." (2021). 4 | > [[arxiv]](https://arxiv.org/abs/) 5 | 6 | 7 | 8 | # Dependencies 9 | 10 | The dependencies are listed in the text file "requirements.txt": 11 | * Python 3.7.7 12 | * Pytorch 13 | * Tensorboard 14 | * Gym 15 | * Opencv-python 16 | * Atari-py 17 | * MinAtar 18 | * Umnn 19 | * Numpy 20 | * Pandas 21 | * Matplotlib 22 | * Scipy 23 | * Tqdm 24 | 25 | 26 | 27 | # Usage 28 | 29 | Training and testing a chosen distributional RL algorithm for the control problem of a chosen environment are performed by running the following command: 30 | 31 | ```bash 32 | python main.py -algorithm ALGORITHM -environment ENVIRONMENT 33 | ``` 34 | 35 | with: 36 | * ALGORITHM being the name of the algorithm (by default UMDQN_C), 37 | * ENVIRONMENT being the name of the environment (by default StochasticGridWorld). 38 | 39 | The (distributional) RL algorithms supported are: 40 | * DQN, 41 | * CDQN, 42 | * QR_DQN, 43 | * IQN, 44 | * FQF, 45 | * UMDQN_KL, 46 | * UMDQN_C, 47 | * UMDQN_W. 48 | 49 | The benchmark environments supported are: 50 | * StochasticGridWorld, 51 | * CartPole-v0, 52 | * Acrobot-v1, 53 | * LunarLander-v2, 54 | * MountainCar-v0, 55 | * MinAtar/Asterix-v0, 56 | * MinAtar/Breakout-v0, 57 | * MinAtar/Freeway-v0, 58 | * MinAtar/Seaquest-v0, 59 | * MinAtar/SpaceInvaders-v0, 60 | * PongNoFrameskip-v4, 61 | * BoxingNoFrameskip-v4, 62 | * FreewayNoFrameskip-v4. 63 | 64 | The number of episodes for training the DRL algorithm may also be specified by the user through the argument "-episodes". The parameters of the DRL algorithms can be set with the argument "-parameters" and by providing the name of the .json file containing these parameters within the "Parameters" folder. 65 | 66 | For more advanced tests and manipulations, please directly refer to the code.
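For instance, a run combining the arguments described above might look as follows; the exact value format expected by "-parameters" (with or without the ".json" extension and folder prefix) is determined by main.py, so treat this invocation as an assumption to be checked against the code:

```bash
# Hypothetical invocation, not taken from the repository documentation.
python main.py -algorithm QR_DQN -environment CartPole-v0 -episodes 1000 -parameters parameters_QR_DQN_ClassicControl.json
```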
67 | 68 | 69 | 70 | # Citation 71 | 72 | If you make use of this experimental code, please cite the associated research paper: 73 | 74 | ``` 75 | @inproceedings{Théate2021, 76 | title={Distributional Reinforcement Learning with Unconstrained Monotonic Neural Networks}, 77 | author={Thibaut Théate, Antoine Wehenkel, Adrien Bolland, Gilles Louppe and Damien Ernst}, 78 | year={2021} 79 | } 80 | ``` 81 | -------------------------------------------------------------------------------- /SavedModels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/SavedModels/__init__.py -------------------------------------------------------------------------------- /Tensorboard/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThibautTheate/Unconstrained-Monotonic-Deep-Q-Network-algorithm/be8a45ec9d31e8d07a43e4f6aad7fa3fd65b657c/Tensorboard/__init__.py -------------------------------------------------------------------------------- /UMDQN_KL.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import math 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import scipy.stats as stats 12 | 13 | from matplotlib import pyplot as plt 14 | 15 | import torch 16 | import torch.optim as optim 17 | 18 | from replayMemory import ReplayMemory 19 | 20 | from Models.UMDQN_KL_Model import UMDQN_KL_Model 21 | from Models.UMDQN_KL_Model_Atari import UMDQN_KL_Model_Atari 22 | 23 | from DQN import DQN 24 | 25 | 26 | 27 | ############################################################################### 28 | ################################ Class UMDQN_KL ############################### 29 | ############################################################################### 30 | 31 | class UMDQN_KL(DQN): 32 | """ 33 | GOAL: Implementing the UMDQN_KL Deep Reinforcement Learning algorithm. 34 | 35 | VARIABLES: - device: Hardware specification (CPU or GPU). 36 | - gamma: Discount factor of the RL algorithm. 37 | - learningRate: Learning rate of the DL optimizer (ADAM). 38 | - epsilon: Epsilon value for the DL optimizer (ADAM). 39 | - targetNetworkUpdate: Update frequency of the target network. 40 | - learningUpdatePeriod: Frequency of the learning procedure. 41 | - batchSize: Size of the batch to sample from the replay memory. 42 | - capacity: Capacity of the replay memory. 43 | - replayMemory: Experience Replay memory. 44 | - rewardClipping: Clipping of the RL rewards. 45 | - gradientClipping: Clipping of the training loss. 46 | - optimizer: DL optimizer (ADAM). 47 | - epsilonStart: Initial value of epsilon (Epsilon-Greedy). 48 | - epsilonEnd: Final value of epsilon (Epsilon-Greedy). 49 | - epsilonDecay: Exponential decay of epsilon (Epsilon-Greedy). 50 | - epsilonTest: Test value of epsilon (Epsilon-Greedy). 51 | - epsilonValue: Current value of epsilon (Epsilon-Greedy). 52 | - policyNetwork: Deep Neural Network representing the info used by the RL policy. 53 | - targetNetwork: Deep Neural Network representing the target network. 
54 | 55 | METHODS: - __init__: Initialization of the RL algorithm. 56 | - chooseAction: Choose a valid action based on the current state 57 | observed, according to the RL policy learned. 58 | - learning: Execute the RL algorithm learning procedure. 59 | """ 60 | 61 | def __init__(self, observationSpace, actionSpace, environment, 62 | parametersFileName='', reporting=True): 63 | """ 64 | GOAL: Initializing the RL agent based on the UMDQN_KL Deep Reinforcement Learning 65 | algorithm, by setting up the algorithm parameters as well as 66 | the Deep Neural Networks. 67 | 68 | INPUTS: - observationSpace: RL observation space. 69 | - actionSpace: RL action space. 70 | - environment: Name of the RL environment. 71 | - parametersFileName: Name of the JSON parameters file. 72 | - reporting: Enable the reporting of the results. 73 | 74 | OUTPUTS: / 75 | """ 76 | 77 | # Initialization of the DQN parent class 78 | DQN.__init__(self, observationSpace, actionSpace, environment, parametersFileName, False) 79 | 80 | # Setting of the parameters 81 | if parametersFileName == '': 82 | parametersFileName = ''.join(['Parameters/parameters_UMDQN_KL_', str(environment), '.json']) 83 | parameters = self.readParameters(parametersFileName) 84 | 85 | # Set the device for DNN computations (CPU or GPU) 86 | self.device = torch.device('cuda:'+str(parameters['GPUNumber']) if torch.cuda.is_available() else 'cpu') 87 | 88 | # Set the general parameters of the RL algorithm 89 | self.gamma = parameters['gamma'] 90 | self.learningRate = parameters['learningRate'] 91 | self.epsilon = parameters['epsilon'] 92 | self.targetUpdatePeriod = parameters['targetUpdatePeriod'] 93 | self.learningUpdatePeriod = parameters['learningUpdatePeriod'] 94 | self.rewardClipping = parameters['rewardClipping'] 95 | self.gradientClipping = parameters['gradientClipping'] 96 | 97 | # Set the Experience Replay mechanism 98 | self.batchSize = parameters['batchSize'] 99 | self.capacity = parameters['capacity'] 100 | self.replayMemory = ReplayMemory(self.capacity) 101 | 102 | # Set the distribution support 103 | self.numberOfSamples = parameters['numberOfSamples'] 104 | self.minReturn = parameters['minReturn'] 105 | self.maxReturn = parameters['maxReturn'] 106 | self.support = np.linspace(self.minReturn, self.maxReturn, self.numberOfSamples) 107 | self.supportTorch = torch.linspace(self.minReturn, self.maxReturn, self.numberOfSamples, device=self.device) 108 | self.supportRepeatedBatchSize = self.supportTorch.repeat(self.batchSize, 1).view(-1, 1) 109 | self.uniformProba = 1/(self.maxReturn - self.minReturn) 110 | self.deltaSupport = self.support[1] - self.support[0] 111 | 112 | # Enable the faster but potentially less accurate estimation of the expectation 113 | self.fasterExpectation = parameters['fasterExpectation'] 114 | 115 | # Set the two Deep Neural Networks of the RL algorithm (policy and target) 116 | self.atari = parameters['atari'] 117 | self.minatar = parameters['minatar'] 118 | if self.atari or self.minatar: 119 | self.policyNetwork = UMDQN_KL_Model_Atari(observationSpace, actionSpace, parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device, minAtar=self.minatar).to(self.device) 120 | self.targetNetwork = UMDQN_KL_Model_Atari(observationSpace, actionSpace, parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device, minAtar=self.minatar).to(self.device) 121 | else: 122 | self.policyNetwork = UMDQN_KL_Model(observationSpace, actionSpace, 
parameters['structureDNN'], parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device).to(self.device) 123 | self.targetNetwork = UMDQN_KL_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device).to(self.device) 124 | self.targetNetwork.load_state_dict(self.policyNetwork.state_dict()) 125 | 126 | # Set the Deep Learning optimizer 127 | self.optimizer = optim.Adam(self.policyNetwork.parameters(), lr=self.learningRate, eps=self.epsilon) 128 | 129 | # Set the Epsilon-Greedy exploration technique 130 | self.epsilonStart = parameters['epsilonStart'] 131 | self.epsilonEnd = parameters['epsilonEnd'] 132 | self.epsilonDecay = parameters['epsilonDecay'] 133 | self.epsilonTest = parameters['epsilonTest'] 134 | self.epsilonValue = lambda iteration: self.epsilonEnd + (self.epsilonStart - self.epsilonEnd) * math.exp(-1 * iteration / self.epsilonDecay) 135 | 136 | # Initialization of the experiment folder and tensorboard writer 137 | self.initReporting(parameters, 'UMDQN_KL') 138 | 139 | 140 | def chooseAction(self, state, plot=False): 141 | """ 142 | GOAL: Choose a valid RL action from the action space according to the 143 | RL policy as well as the current RL state observed. 144 | 145 | INPUTS: - state: RL state returned by the environment. 146 | - plot: Enable the plotting of the random returns distributions. 147 | 148 | OUTPUTS: - action: RL action chosen from the action space. 149 | """ 150 | 151 | # Choose the best action based on the RL policy 152 | with torch.no_grad(): 153 | state = torch.from_numpy(state).float().to(self.device).unsqueeze(0) 154 | if self.fasterExpectation: 155 | QValues = self.policyNetwork.getExpectation(state, self.minReturn, self.maxReturn, 10*self.numberOfSamples).squeeze(0) 156 | else: 157 | pdfs = self.policyNetwork(state, self.supportTorch.unsqueeze(1)) 158 | QValues = (pdfs * self.supportTorch).sum(1)/(self.numberOfSamples*self.uniformProba) 159 | _, action = QValues.max(0) 160 | 161 | # If required, plot the return distribution associated with each action 162 | if plot: 163 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 164 | plt.figure() 165 | ax = plt.subplot(1, 1, 1) 166 | with torch.no_grad(): 167 | accurateSupport = np.linspace(self.minReturn, self.maxReturn, self.numberOfSamples*10) 168 | accurateSupportTorch = torch.linspace(self.minReturn, self.maxReturn, self.numberOfSamples*10, device=self.device) 169 | pdfs = self.policyNetwork(state, accurateSupportTorch.unsqueeze(1)) 170 | QValues = ((pdfs * accurateSupportTorch).sum(1))/(self.numberOfSamples*10*self.uniformProba) 171 | for a in range(self.actionSpace): 172 | ax.plot(accurateSupport, pdfs[a].cpu(), linestyle='-', label=''.join(['Action ', str(a), ' random return Z']), color=colors[a]) 173 | ax.fill_between(accurateSupport, accurateSupport*0, pdfs[a].cpu(), alpha=0.25, color=colors[a]) 174 | ax.axvline(x=QValues[a], linewidth=2, linestyle='--', label=''.join(['Action ', str(a), ' expected return Q']), color=colors[a]) 175 | ax.set_xlabel('Random return') 176 | ax.set_ylabel('Probability Density Function (PDF)') 177 | ax.legend() 178 | plt.show() 179 | """ 180 | # Saving of the data into external files 181 | dataPDF = { 182 | 'Action0_x': accurateSupport, 183 | 'Action0_y': pdfs[0].cpu(), 184 | 'Action1_x': accurateSupport, 185 | 'Action1_y': pdfs[1].cpu(), 186 | 'Action2_x': accurateSupport, 187 | 'Action2_y': pdfs[2].cpu(), 188 | 
'Action3_x': accurateSupport, 189 | 'Action3_y': pdfs[3].cpu(), 190 | } 191 | dataframePDF = pd.DataFrame(dataPDF) 192 | dataframePDF.to_csv('Figures/Distributions/UMDQN_KL.csv') 193 | quit() 194 | """ 195 | 196 | return action.item() 197 | 198 | 199 | def learning(self): 200 | """ 201 | GOAL: Sample a batch of past experiences and learn from it 202 | by updating the Reinforcement Learning policy. 203 | 204 | INPUTS: / 205 | 206 | OUTPUTS: - loss: Loss of the learning procedure. 207 | """ 208 | 209 | # Check that the replay memory is filled enough 210 | if (len(self.replayMemory) >= self.batchSize): 211 | 212 | # Sample a batch of experiences from the replay memory 213 | batch = self.dataLoaderIter.next() 214 | state = batch[0].float().to(self.device) 215 | action = batch[1].float().to(self.device) 216 | reward = batch[2].float().to(self.device) 217 | nextState = batch[3].float().to(self.device) 218 | done = batch[4].float().to(self.device) 219 | 220 | # Computation of the current return distribution, according to the policy DNN 221 | pdfs = self.policyNetwork(state, self.supportRepeatedBatchSize) 222 | selection = torch.tensor([self.actionSpace*i + action[i] for i in range(self.batchSize)], dtype=torch.long, device=self.device) 223 | currentPdfs = torch.index_select(pdfs, 0, selection).view(-1, 1) 224 | 225 | # Computation of the next action, according to the policy DNN 226 | with torch.no_grad(): 227 | if self.fasterExpectation: 228 | expectedReturns = self.targetNetwork.getExpectation(nextState, self.minReturn, self.maxReturn, 10*self.numberOfSamples) 229 | else: 230 | pdfs = self.targetNetwork(nextState, self.supportRepeatedBatchSize) 231 | expectedReturns = (((pdfs * self.supportTorch).sum(1))/(self.numberOfSamples*self.uniformProba)).view(-1, self.actionSpace) 232 | _, nextAction = expectedReturns.max(1) 233 | 234 | # Computation of the new distribution to be learnt by the policy DNN 235 | with torch.no_grad(): 236 | r = reward.view(self.batchSize, 1).repeat(1, self.numberOfSamples).view(-1, 1) 237 | support = (self.supportRepeatedBatchSize - r)/self.gamma 238 | targetPdfs = self.targetNetwork(nextState, support) 239 | selection = torch.tensor([self.actionSpace*i + nextAction[i] for i in range(self.batchSize)], dtype=torch.long, device=self.device) 240 | targetPdfs = torch.index_select(targetPdfs, 0, selection) 241 | targetPdfs = targetPdfs/self.gamma 242 | for i in range(self.batchSize): 243 | if done[i] == 1: 244 | targetPdfs[i] = torch.tensor(stats.norm.pdf(self.support, reward[i].item(), self.deltaSupport)).to(self.device) 245 | targetPdfs = targetPdfs.clamp(min=1e-6) 246 | targetPdfs = targetPdfs.view(-1, 1) 247 | 248 | # Compute the loss 249 | loss = (targetPdfs*(targetPdfs.log()-currentPdfs.log())).sum() 250 | 251 | # Computation of the gradients 252 | self.optimizer.zero_grad() 253 | loss.backward() 254 | 255 | # Gradient Clipping 256 | torch.nn.utils.clip_grad_norm_(self.policyNetwork.parameters(), self.gradientClipping) 257 | 258 | # Perform the Deep Neural Network optimization 259 | self.optimizer.step() 260 | 261 | return loss.item() 262 | -------------------------------------------------------------------------------- /UMDQN_W.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | 
############################################################################### 6 | 7 | import math 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from matplotlib import pyplot as plt 13 | 14 | import torch 15 | import torch.optim as optim 16 | 17 | from replayMemory import ReplayMemory 18 | 19 | from Models.UMDQN_W_Model import UMDQN_W_Model 20 | from Models.UMDQN_W_Model_Atari import UMDQN_W_Model_Atari 21 | 22 | from DQN import DQN 23 | 24 | 25 | 26 | ############################################################################### 27 | ############################### Class UMDQN_W ################################# 28 | ############################################################################### 29 | 30 | class UMDQN_W(DQN): 31 | """ 32 | GOAL: Implementing the UMDQN_W Deep Reinforcement Learning algorithm. 33 | 34 | VARIABLES: - device: Hardware specification (CPU or GPU). 35 | - gamma: Discount factor of the RL algorithm. 36 | - learningRate: Learning rate of the DL optimizer (ADAM). 37 | - epsilon: Epsilon value for the DL optimizer (ADAM). 38 | - targetNetworkUpdate: Update frequency of the target network. 39 | - learningUpdatePeriod: Frequency of the learning procedure. 40 | - batchSize: Size of the batch to sample from the replay memory. 41 | - capacity: Capacity of the replay memory. 42 | - replayMemory: Experience Replay memory. 43 | - rewardClipping: Clipping of the RL rewards. 44 | - gradientClipping: Clipping of the training loss. 45 | - optimizer: DL optimizer (ADAM). 46 | - epsilonStart: Initial value of epsilon (Epsilon-Greedy). 47 | - epsilonEnd: Final value of epsilon (Epsilon-Greedy). 48 | - epsilonDecay: Exponential decay of epsilon (Epsilon-Greedy). 49 | - epsilonTest: Test value of epsilon (Epsilon-Greedy). 50 | - epsilonValue: Current value of epsilon (Epsilon-Greedy). 51 | - policyNetwork: Deep Neural Network representing the info used by the RL policy. 52 | - targetNetwork: Deep Neural Network representing the target network. 53 | 54 | METHODS: - __init__: Initialization of the RL algorithm. 55 | - chooseAction: Choose a valid action based on the current state 56 | observed, according to the RL policy learned. 57 | - learning: Execute the RL algorithm learning procedure. 58 | """ 59 | 60 | def __init__(self, observationSpace, actionSpace, environment, 61 | parametersFileName='', reporting=True): 62 | """ 63 | GOAL: Initializing the RL agent based on the UMDQN_W Deep Reinforcement Learning 64 | algorithm, by setting up the algorithm parameters as well as 65 | the Deep Neural Networks. 66 | 67 | INPUTS: - observationSpace: RL observation space. 68 | - actionSpace: RL action space. 69 | - environment: Name of the RL environment. 70 | - parametersFileName: Name of the JSON parameters file. 71 | - reporting: Enable the reporting of the results. 
72 | 73 | OUTPUTS: / 74 | """ 75 | 76 | # Initialization of the DQN parent class 77 | DQN.__init__(self, observationSpace, actionSpace, environment, parametersFileName, False) 78 | 79 | # Setting of the parameters 80 | if parametersFileName == '': 81 | parametersFileName = ''.join(['Parameters/parameters_UMDQN_W_', str(environment), '.json']) 82 | parameters = self.readParameters(parametersFileName) 83 | 84 | # Set the device for DNN computations (CPU or GPU) 85 | self.device = torch.device('cuda:'+str(parameters['GPUNumber']) if torch.cuda.is_available() else 'cpu') 86 | 87 | # Set the general parameters of the RL algorithm 88 | self.gamma = parameters['gamma'] 89 | self.learningRate = parameters['learningRate'] 90 | self.epsilon = parameters['epsilon'] 91 | self.targetUpdatePeriod = parameters['targetUpdatePeriod'] 92 | self.learningUpdatePeriod = parameters['learningUpdatePeriod'] 93 | self.rewardClipping = parameters['rewardClipping'] 94 | self.gradientClipping = parameters['gradientClipping'] 95 | 96 | # Set the Experience Replay mechanism 97 | self.batchSize = parameters['batchSize'] 98 | self.capacity = parameters['capacity'] 99 | self.replayMemory = ReplayMemory(self.capacity) 100 | 101 | # Set the distribution support (quantile fractions) 102 | self.numberOfSamples = parameters['numberOfSamples'] 103 | self.support = np.linspace(0.0, 1.0, self.numberOfSamples) 104 | self.supportTorch = torch.linspace(0.0, 1.0, self.numberOfSamples, device=self.device) 105 | self.supportRepeatedBatchSize = self.supportTorch.repeat(self.batchSize, 1).view(-1, 1) 106 | self.kappa = 1.0 107 | 108 | # Set the two Deep Neural Networks of the RL algorithm (policy and target) 109 | self.atari = parameters['atari'] 110 | self.minatar = parameters['minatar'] 111 | if self.atari or self.minatar: 112 | self.policyNetwork = UMDQN_W_Model_Atari(observationSpace, actionSpace, parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device, minAtar=self.minatar).to(self.device) 113 | self.targetNetwork = UMDQN_W_Model_Atari(observationSpace, actionSpace, parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device, minAtar=self.minatar).to(self.device) 114 | else: 115 | self.policyNetwork = UMDQN_W_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device).to(self.device) 116 | self.targetNetwork = UMDQN_W_Model(observationSpace, actionSpace, parameters['structureDNN'], parameters['structureUMNN'], parameters['stateEmbedding'], parameters['numberOfSteps'], self.device).to(self.device) 117 | self.targetNetwork.load_state_dict(self.policyNetwork.state_dict()) 118 | 119 | # Set the Deep Learning optimizer 120 | self.optimizer = optim.Adam(self.policyNetwork.parameters(), lr=self.learningRate, eps=self.epsilon) 121 | 122 | # Set the Epsilon-Greedy exploration technique 123 | self.epsilonStart = parameters['epsilonStart'] 124 | self.epsilonEnd = parameters['epsilonEnd'] 125 | self.epsilonDecay = parameters['epsilonDecay'] 126 | self.epsilonTest = parameters['epsilonTest'] 127 | self.epsilonValue = lambda iteration: self.epsilonEnd + (self.epsilonStart - self.epsilonEnd) * math.exp(-1 * iteration / self.epsilonDecay) 128 | 129 | # Initialization of the experiment folder and tensorboard writer 130 | self.initReporting(parameters, 'UMDQN_W') 131 | 132 | 133 | def chooseAction(self, state, plot=False): 134 | """ 135 | GOAL: Choose a valid RL 
action from the action space according to the 136 | RL policy as well as the current RL state observed. 137 | 138 | INPUTS: - state: RL state returned by the environment. 139 | - plot: Enable the plotting of the random returns distributions. 140 | 141 | OUTPUTS: - action: RL action chosen from the action space. 142 | """ 143 | 144 | # Choose the best action based on the RL policy 145 | with torch.no_grad(): 146 | state = torch.from_numpy(state).float().to(self.device).unsqueeze(0) 147 | quantiles = self.policyNetwork(state, self.supportTorch.unsqueeze(1)) 148 | QValues = quantiles.mean(1) 149 | _, action = QValues.max(0) 150 | 151 | # If required, plot the return distribution associated with each action 152 | if plot: 153 | colors = ['blue', 'red', 'orange', 'green', 'purple', 'brown'] 154 | plt.figure() 155 | ax = plt.subplot(1, 1, 1) 156 | taus = torch.linspace(0.0, 1.0, self.numberOfSamples*10, device=self.device).unsqueeze(1) 157 | quantiles = self.policyNetwork(state, taus) 158 | QValues = quantiles.mean(1) 159 | taus = taus.cpu().numpy() 160 | quantiles = quantiles.squeeze(0).cpu().numpy() 161 | QValues = QValues.squeeze(0).cpu().numpy() 162 | for a in range(self.actionSpace): 163 | ax.plot(taus, quantiles[a], linestyle='-', label=''.join(['Action ', str(a), ' random return Z']), color=colors[a]) 164 | ax.axhline(y=QValues[a], linewidth=2, linestyle='--', label=''.join(['Action ', str(a), ' expected return Q']), color=colors[a]) 165 | ax.set_xlabel('Quantile fraction') 166 | ax.set_ylabel('Quantile Function (QF)') 167 | ax.legend() 168 | plt.show() 169 | """ 170 | # Saving of the data into external files 171 | taus = np.linspace(0, 1, self.numberOfSamples*10) 172 | dataQF = { 173 | 'Action0_x': taus, 174 | 'Action0_y': quantiles[0], 175 | 'Action1_x': taus, 176 | 'Action1_y': quantiles[1], 177 | 'Action2_x': taus, 178 | 'Action2_y': quantiles[2], 179 | 'Action3_x': taus, 180 | 'Action3_y': quantiles[3], 181 | } 182 | dataframeQF = pd.DataFrame(dataQF) 183 | dataframeQF.to_csv('Figures/Distributions/UMDQN_W.csv') 184 | quit() 185 | """ 186 | 187 | return action.item() 188 | 189 | 190 | def learning(self): 191 | """ 192 | GOAL: Sample a batch of past experiences and learn from it 193 | by updating the Reinforcement Learning policy. 194 | 195 | INPUTS: / 196 | 197 | OUTPUTS: - loss: Loss of the learning procedure. 
198 | """ 199 | 200 | # Check that the replay memory is filled enough 201 | if (len(self.replayMemory) >= self.batchSize): 202 | 203 | # Sample a batch of experiences from the replay memory 204 | batch = self.dataLoaderIter.next() 205 | state = batch[0].float().to(self.device) 206 | action = batch[1].long().to(self.device) 207 | reward = batch[2].float().to(self.device) 208 | nextState = batch[3].float().to(self.device) 209 | done = batch[4].float().to(self.device) 210 | 211 | # Computation of the current return distribution 212 | quantiles = self.policyNetwork(state, self.supportRepeatedBatchSize) 213 | selection = torch.tensor([self.actionSpace*i + action[i] for i in range(self.batchSize)], dtype=torch.long, device=self.device) 214 | quantiles = torch.index_select(quantiles, 0, selection) 215 | 216 | # Computation of the new distribution to be learnt by the policy DNN 217 | with torch.no_grad(): 218 | nextQuantiles = self.targetNetwork(nextState, self.supportRepeatedBatchSize) 219 | nextAction = nextQuantiles.view(self.batchSize, self.actionSpace, self.numberOfSamples).mean(2).max(1)[1] 220 | selection = torch.tensor([self.actionSpace*i + nextAction[i] for i in range(self.batchSize)], dtype=torch.long, device=self.device) 221 | nextQuantiles = torch.index_select(nextQuantiles, 0, selection) 222 | targetQuantiles = reward.unsqueeze(1) + self.gamma * nextQuantiles * (1 - done.unsqueeze(1)) 223 | 224 | #""" 225 | # Improve stability with the lower and upper bounds of the random return 226 | minZ = -1 227 | maxZ = 10 228 | quantiles = quantiles.clamp(min=minZ, max=maxZ) 229 | targetQuantiles = targetQuantiles.clamp(min=minZ, max=maxZ) 230 | #""" 231 | 232 | # Computation of the loss 233 | difference = targetQuantiles.unsqueeze(1) - quantiles.unsqueeze(2) 234 | error = difference.abs() 235 | loss = torch.where(error <= self.kappa, 0.5 * error.pow(2), self.kappa * (error - (0.5 * self.kappa))) 236 | loss = (self.supportRepeatedBatchSize.view(self.batchSize, self.numberOfSamples, 1) - (difference < 0).float()).abs() * loss/self.kappa 237 | loss = loss.mean(1).sum(1).mean() 238 | 239 | # Computation of the gradients 240 | self.optimizer.zero_grad() 241 | loss.backward() 242 | 243 | # Gradient Clipping 244 | torch.nn.utils.clip_grad_norm_(self.policyNetwork.parameters(), self.gradientClipping) 245 | 246 | # Perform the Deep Neural Network optimization 247 | self.optimizer.step() 248 | 249 | return loss.item() 250 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import argparse 8 | import importlib 9 | import gym 10 | 11 | from CustomEnvironments.stochasticGridWorld import StochasticGridWorld 12 | from CustomEnvironments.stochasticGridWorldOptimal import StochasticGridWorldOptimal 13 | from MonteCarloDistributions import MonteCarloDistributions 14 | from AtariWrapper import AtariWrapper, MinAtarWrapper 15 | 16 | 17 | 18 | ############################################################################### 19 | ################################ Global variables ############################# 20 | ############################################################################### 21 | 22 | # Supported RL 
algorithms 23 | algorithms = ['DQN', 'CDQN', 'QR_DQN', 'IQN', 'FQF', 24 | 'UMDQN_KL', 'UMDQN_C', 'UMDQN_W'] 25 | 26 | # Supported RL environments 27 | environments = ['StochasticGridWorld', 'CartPole-v0', 'Acrobot-v1', 28 | 'LunarLander-v2', 'MountainCar-v0', 'MinAtar/Asterix-v0', 29 | 'MinAtar/Breakout-v0', 'MinAtar/Freeway-v0', 'MinAtar/Seaquest-v0', 30 | 'MinAtar/SpaceInvaders-v0', 'PongNoFrameskip-v4', 31 | 'BoxingNoFrameskip-v4', 'FreewayNoFrameskip-v4'] 32 | 33 | 34 | 35 | ############################################################################### 36 | ##################################### MAIN #################################### 37 | ############################################################################### 38 | 39 | if(__name__ == '__main__'): 40 | 41 | # Retrieve the paramaters sent by the user 42 | parser = argparse.ArgumentParser(description='') 43 | parser.add_argument("-algorithm", default='UMDQN_C', type=str, help="Name of the RL algorithm") 44 | parser.add_argument("-environment", default='StochasticGridWorld', type=str, help="Name of the RL environment") 45 | parser.add_argument("-episodes", default=10000, type=str, help="Number of episodes for training") 46 | parser.add_argument("-parameters", default='parameters', type=str, help="Name of the JSON parameters file") 47 | args = parser.parse_args() 48 | 49 | # Checking of the parameters validity 50 | algorithm = args.algorithm 51 | environment = args.environment 52 | episodes = int(args.episodes) 53 | parameters = args.parameters 54 | if algorithm not in algorithms: 55 | print("The algorithm specified is not valid, only the following algorithms are supported:") 56 | for algo in algorithms: 57 | print("".join(['- ', algo])) 58 | if environment not in environments: 59 | print("The environment specified is not valid, only the following environments are supported:") 60 | for env in environments: 61 | print("".join(['- ', env])) 62 | if parameters == 'parameters': 63 | parameters = ''.join(['Parameters/parameters_', str(algorithm), '_', str(environment), '.json']) 64 | 65 | # Name of the file for saving the RL policy learned 66 | fileName = 'SavedModels/' + algorithm + '_' + environment 67 | 68 | # Initialization of the RL environment 69 | if environment == 'StochasticGridWorld': 70 | env = StochasticGridWorld() 71 | elif environment in ['CartPole-v0', 'Acrobot-v1', 'LunarLander-v2', 'MountainCar-v0']: 72 | env = gym.make(environment) 73 | parameters = ''.join(['Parameters/parameters_', algorithm, '_ClassicControl.json']) 74 | elif environment in ['MinAtar/Asterix-v0','MinAtar/Breakout-v0', 'MinAtar/Freeway-v0', 'MinAtar/Seaquest-v0', 'MinAtar/SpaceInvaders-v0']: 75 | minAtarWrapper = MinAtarWrapper() 76 | env = minAtarWrapper.wrapper(environment) 77 | parameters = ''.join(['Parameters/parameters_', algorithm, '_MinAtar.json']) 78 | else: 79 | atariWrapper = AtariWrapper() 80 | env = atariWrapper.wrapper(environment, stickyActionsProba=0.25) 81 | parameters = ''.join(['Parameters/parameters_', algorithm, '_Atari57.json']) 82 | 83 | # Determination of the state and action spaces 84 | observationSpace = env.observation_space.shape[0] 85 | actionSpace = env.action_space.n 86 | 87 | # Initialization of the DRL algorithm 88 | algorithmModule = importlib.import_module(str(algorithm)) 89 | className = getattr(algorithmModule, algorithm) 90 | RLAgent = className(observationSpace, actionSpace, environment, parameters) 91 | 92 | # Training of the RL agent 93 | RLAgent.training(env, episodes, verbose=False, rendering=False, 
plotTraining=False) 94 | #RLAgent.plotExpectedPerformance(env, episodes, iterations=5) 95 | 96 | # Saving of the RL model 97 | RLAgent.saveModel(fileName) 98 | 99 | # Loading of the RL model 100 | RLAgent.loadModel(fileName) 101 | 102 | # Testing of the RL agent 103 | RLAgent.testing(env, verbose=True, rendering=False) 104 | 105 | # Plotting of the true distribution of the random return via Monte Carlo 106 | """ 107 | state = [int(7/2)-1, 7-1] 108 | optimalPolicy = StochasticGridWorldOptimal(env) 109 | MonteCarloDistributions = MonteCarloDistributions(env, optimalPolicy, 0.5) 110 | #MonteCarloDistributions = MonteCarloDistributions(env, RLAgent, 0.5) 111 | MonteCarloDistributions.plotDistributions(state) 112 | """ 113 | -------------------------------------------------------------------------------- /replayMemory.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | ############################################################################### 4 | ################################### Imports ################################### 5 | ############################################################################### 6 | 7 | import random 8 | from collections import deque 9 | from torch.utils.data import Dataset 10 | 11 | 12 | 13 | ############################################################################### 14 | ############################### Class ReplayMemory ############################ 15 | ############################################################################### 16 | 17 | class ReplayMemory(Dataset): 18 | """ 19 | GOAL: Implementing the replay memory required for the Experience Replay 20 | mechanism of the DQN Reinforcement Learning algorithm. This class 21 | inherits from the Dataset class from Pytorch for being used with 22 | efficient data loaders. 23 | 24 | VARIABLES: - memory: Data structure storing the RL experiences. 25 | 26 | METHODS: - __init__: Initialization of the memory data structure. 27 | - __getitem__: Get an item from the replay memory. 28 | - __len__: Return the length of the replay memory. 29 | - push: Insert a new experience into the replay memory. 30 | - sample: Sample a batch of experiences from the replay memory. 31 | - reset: Reset the replay memory. 32 | """ 33 | 34 | def __init__(self, capacity=10000): 35 | """ 36 | GOAL: Initialization of the replay memory data structure. 37 | 38 | INPUTS: - capacity: Capacity of the data structure, specifying the 39 | maximum number of experiences to be stored 40 | simultaneously into the data structure. 41 | 42 | OUTPUTS: / 43 | """ 44 | 45 | random.seed(0) 46 | self.capacity = capacity 47 | self.memory = deque(maxlen=capacity) 48 | 49 | 50 | def __getitem__(self, index): 51 | """ 52 | GOAL: Outputing the item associated with the provided index 53 | from the replay memory. 54 | 55 | INPUTS: / 56 | 57 | OUTPUTS: - item: Selected item of the replay memory. 58 | """ 59 | 60 | return self.memory[index] 61 | 62 | 63 | def __len__(self): 64 | """ 65 | GOAL: Return the size of the replay memory, i.e. the number of experiences 66 | currently stored into the data structure. 67 | 68 | INPUTS: / 69 | 70 | OUTPUTS: - length: Size of the replay memory. 71 | """ 72 | 73 | return len(self.memory) 74 | 75 | 76 | def push(self, state, action, reward, nextState, done): 77 | """ 78 | GOAL: Insert a new experience into the replay memory. An experience 79 | is composed of a state, an action, a reward, a next state and 80 | a termination signal. 
81 | 82 | INPUTS: - state: RL state of the experience to be stored. 83 | - action: RL action of the experience to be stored. 84 | - reward: RL reward of the experience to be stored. 85 | - nextState: RL next state of the experience to be stored. 86 | - done: RL termination signal of the experience to be stored. 87 | 88 | OUTPUTS: / 89 | """ 90 | 91 | # FIFO policy 92 | self.memory.append((state, action, reward, nextState, done)) 93 | 94 | 95 | def sample(self, batchSize): 96 | """ 97 | GOAL: Sample a batch of experiences from the replay memory. 98 | 99 | INPUTS: - batchSize: Size of the batch to sample. 100 | 101 | OUTPUTS: - state: RL states of the experience batch sampled. 102 | - action: RL actions of the experience batch sampled. 103 | - reward: RL rewards of the experience batch sampled. 104 | - nextState: RL next states of the experience batch sampled. 105 | - done: RL termination signals of the experience batch sampled. 106 | """ 107 | 108 | state, action, reward, nextState, done = zip(*random.sample(self.memory, batchSize)) 109 | return state, action, reward, nextState, done 110 | 111 | 112 | def reset(self): 113 | """ 114 | GOAL: Reset (empty) the replay memory. 115 | 116 | INPUTS: / 117 | 118 | OUTPUTS: / 119 | """ 120 | 121 | random.seed(0) 122 | self.memory = deque(maxlen=self.capacity) 123 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Python version 2 | python==3.7.7 3 | 4 | # Python basic packages 5 | numpy 6 | scipy 7 | matplotlib 8 | pandas 9 | 10 | # Deep Learning framework 11 | torch 12 | tensorboard 13 | umnn 14 | 15 | # RL environment packages 16 | gym 17 | atari-py 18 | opencv-python 19 | minatar 20 | 21 | # Extra packages 22 | tqdm 23 | --------------------------------------------------------------------------------
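The dependencies above can usually be installed with pip inside a dedicated virtual environment. A minimal sketch, assuming a Python 3.7 interpreter is already available (the packages are listed explicitly because the `python==3.7.7` entry pins the interpreter version rather than a pip-installable package):

```bash
# Create and activate a virtual environment with an existing Python 3.7 interpreter
python3.7 -m venv venv
source venv/bin/activate

# Install the Python packages listed in requirements.txt
pip install numpy scipy matplotlib pandas torch tensorboard umnn gym atari-py opencv-python minatar tqdm
```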