├── environments ├── ant_environments │ ├── __init__.py │ ├── README.md │ ├── ant_maze_env.py │ ├── point_maze_env.py │ ├── point.py │ ├── create_maze_env.py │ ├── maze_env_utils.py │ ├── ant.py │ └── assets │ │ └── ant.xml ├── Atari_Environment.py ├── Ant_Navigation_Environments.py ├── Long_Corridor_Environment.py └── Bit_Flipping_Environment.py ├── utilities ├── RL_image.jpeg ├── PyTorch-logo-2.jpg ├── data_structures │ ├── Node.py │ ├── Config.py │ ├── Deque.py │ ├── Tanh_Distribution.py │ ├── Replay_Buffer.py │ ├── Max_Heap.py │ └── Action_Balanced_Replay_Buffer.py ├── OU_Noise.py ├── Tensorboard.py ├── Deepmind_RMS_Prop.py ├── Parallel_Experience_Generator.py ├── Memory_Shaper.py └── Utility_Functions.py ├── requirements.txt ├── results ├── data_and_graphs │ ├── Taxi_data.pkl │ ├── Four_Rooms.pkl │ ├── Four_Rooms.png │ ├── Taxi_graph.png │ ├── HER_Experiments.png │ ├── Hopper_Results_Data.pkl │ ├── Long_Corridor_Graph.png │ ├── Walker_Results_Data.pkl │ ├── Hopper_Results_Graph.png │ ├── Taxi_graph_comparison.png │ ├── Walker_Results_Graph.png │ ├── hrl_experiments │ │ ├── Taxi.png │ │ ├── Cart_Pole.png │ │ ├── Taxi_data.pkl │ │ ├── Cart_Pole_data.pkl │ │ └── Taxi_graph_comparison.png │ ├── Cart_Pole_Results_Data.pkl │ ├── Cart_Pole_Results_Graph.png │ ├── Fetch_Reach_Results_Data.pkl │ ├── Bit_Flipping_Results_Data.pkl │ ├── Bit_Flipping_Results_Graph.png │ ├── Fetch_Reach_Results_Graph.png │ ├── Long_Corridor_Results_Data.pkl │ ├── Mountain_Car_Results_Data.pkl │ ├── Mountain_Car_Results_Graph.png │ ├── Four_Rooms_and_Long_Corridor.png │ ├── Long_Corridor_Results_Graph.png │ ├── CartPole_and_MountainCar_Graph.png │ ├── Hopper_Results_Graph_Both_Agents.png │ └── Plot_Sets_Of_Results.py ├── Bit_Flipping.py ├── Fetch_Reach.py ├── Mountain_Car.py ├── Hopper.py ├── Walker.py ├── Reacher.py ├── HRL_Experiments.py ├── Cart_Pole.py ├── Four_Rooms.py ├── Long_Corridor.py └── Taxi.py ├── .gitignore ├── tests ├── Test_Max_Heap.py ├── Test_Deque.py ├── Test_Trainer.py ├── Test_Bit_Flipping_Environment.py ├── Test_HRL.py ├── Test_Action_Balanced_Replay_Buffer.py ├── Test_Prioritised_Replay_Buffer.py ├── Test_Four_Rooms_Environment.py └── Test_DQN_HER.py ├── exploration_strategies ├── Base_Exploration_Strategy.py ├── OU_Noise_Exploration.py ├── Gaussian_Exploration.py └── Epsilon_Greedy_Exploration.py ├── agents ├── DQN_agents │ ├── DDQN.py │ ├── DQN_With_Fixed_Q_Targets.py │ ├── DQN_HER.py │ ├── DDQN_With_Prioritised_Experience_Replay.py │ ├── Dueling_DDQN.py │ └── DQN.py ├── actor_critic_agents │ ├── A2C.py │ ├── DDPG_HER.py │ ├── TD3.py │ ├── SAC_Discrete.py │ └── DDPG.py ├── policy_gradient_agents │ └── REINFORCE.py └── HER_Base.py └── .travis.yml /environments/ant_environments/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /utilities/RL_image.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/utilities/RL_image.jpeg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.15.2 2 | torch==0.4.1.post2 3 | matplotlib==3.0.0 4 | PyVirtualDisplay==0.2.1 5 | gym==0.10.9 6 | nn_builder 7 | tensorflow 8 | -------------------------------------------------------------------------------- 
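The pins above target fairly old releases (PyTorch 0.4.1, gym 0.10.9, NumPy 1.15.2). The snippet below is a hypothetical helper, not a file from this repository: a minimal sketch that warns at start-up if the installed versions drift from the pins, assuming only that numpy, torch and gym expose the standard __version__ attribute.

import numpy
import torch
import gym

# Versions pinned in requirements.txt
EXPECTED_VERSIONS = {"numpy": "1.15.2", "torch": "0.4.1.post2", "gym": "0.10.9"}

def warn_on_version_drift():
    """Prints a warning for every package whose installed version differs from the pin."""
    installed = {"numpy": numpy.__version__, "torch": torch.__version__, "gym": gym.__version__}
    for package, pinned in EXPECTED_VERSIONS.items():
        if installed[package] != pinned:
            print("WARNING: %s==%s is installed but requirements.txt pins %s"
                  % (package, installed[package], pinned))

if __name__ == "__main__":
    warn_on_version_drift()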
/utilities/PyTorch-logo-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/utilities/PyTorch-logo-2.jpg -------------------------------------------------------------------------------- /results/data_and_graphs/Taxi_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Taxi_data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Four_Rooms.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Four_Rooms.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Four_Rooms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Four_Rooms.png -------------------------------------------------------------------------------- /results/data_and_graphs/Taxi_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Taxi_graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/HER_Experiments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/HER_Experiments.png -------------------------------------------------------------------------------- /results/data_and_graphs/Hopper_Results_Data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Hopper_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Long_Corridor_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Long_Corridor_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/Walker_Results_Data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Walker_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Hopper_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Hopper_Results_Graph.png -------------------------------------------------------------------------------- 
/results/data_and_graphs/Taxi_graph_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Taxi_graph_comparison.png -------------------------------------------------------------------------------- /results/data_and_graphs/Walker_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Walker_Results_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/hrl_experiments/Taxi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/hrl_experiments/Taxi.png -------------------------------------------------------------------------------- /results/data_and_graphs/Cart_Pole_Results_Data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Cart_Pole_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Cart_Pole_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Cart_Pole_Results_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/Fetch_Reach_Results_Data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Fetch_Reach_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Bit_Flipping_Results_Data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Bit_Flipping_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Bit_Flipping_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Bit_Flipping_Results_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/Fetch_Reach_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Fetch_Reach_Results_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/Long_Corridor_Results_Data.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Long_Corridor_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Mountain_Car_Results_Data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Mountain_Car_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Mountain_Car_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Mountain_Car_Results_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/hrl_experiments/Cart_Pole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/hrl_experiments/Cart_Pole.png -------------------------------------------------------------------------------- /results/data_and_graphs/hrl_experiments/Taxi_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/hrl_experiments/Taxi_data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Four_Rooms_and_Long_Corridor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Four_Rooms_and_Long_Corridor.png -------------------------------------------------------------------------------- /results/data_and_graphs/Long_Corridor_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Long_Corridor_Results_Graph.png -------------------------------------------------------------------------------- /environments/ant_environments/README.md: -------------------------------------------------------------------------------- 1 | NOTE that all code in this folder came directly from the github repo https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 2 | and is not my code. 
-------------------------------------------------------------------------------- /results/data_and_graphs/CartPole_and_MountainCar_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/CartPole_and_MountainCar_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/hrl_experiments/Cart_Pole_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/hrl_experiments/Cart_Pole_data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Hopper_Results_Graph_Both_Agents.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Hopper_Results_Graph_Both_Agents.png -------------------------------------------------------------------------------- /results/data_and_graphs/hrl_experiments/Taxi_graph_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/hrl_experiments/Taxi_graph_comparison.png -------------------------------------------------------------------------------- /environments/ant_environments/ant_maze_env.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # NOTE THIS CODE IS TAKEN FROM https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 4 | # and is not my code. 5 | 6 | 7 | 8 | from ant_environments.maze_env import MazeEnv 9 | from ant_environments.ant import AntEnv 10 | 11 | 12 | class AntMazeEnv(MazeEnv): 13 | MODEL_CLASS = AntEnv 14 | -------------------------------------------------------------------------------- /environments/ant_environments/point_maze_env.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # NOTE THIS CODE IS TAKEN FROM https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 5 | # and is not my code. 
6 | 7 | 8 | 9 | 10 | 11 | from ant_environments.maze_env import MazeEnv 12 | from ant_environments.point import PointEnv 13 | 14 | 15 | class PointMazeEnv(MazeEnv): 16 | MODEL_CLASS = PointEnv 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *cache 3 | *.idea 4 | *.log 5 | *logs/ 6 | logs/ 7 | runs/ 8 | *__pycache__ 9 | .DS_Store 10 | drlnd/ 11 | *Banana.app/ 12 | *deep-reinforcement-learning/ 13 | *venv/ 14 | *playground_test_runs.py 15 | Game_Files/ 16 | mjpro150/ 17 | mujoco200_macos/ 18 | *to_do_list 19 | Random_Junkyard/ 20 | Notebook.ipynb 21 | Results/Notebook.ipynb 22 | *.ipynb_checkpoints 23 | *.drive_access_key.json 24 | drive_access_key.json 25 | drive_access_key 26 | settings.json 27 | launch.json 28 | results/data_and_graphs/Cart_Pole_Results_Data.pkl 29 | results/data_and_graphs/Cart_Pole_Results_Graph.png 30 | -------------------------------------------------------------------------------- /utilities/data_structures/Node.py: -------------------------------------------------------------------------------- 1 | class Node(object): 2 | """Generic Node class. Used in the implementation of a prioritised replay buffer""" 3 | def __init__(self, key, value): 4 | self.key = key 5 | self.value = value 6 | 7 | def update_key_and_value(self, new_key, new_value): 8 | self.update_key(new_key) 9 | self.update_value(new_value) 10 | 11 | def update_key(self, new_key): 12 | self.key = new_key 13 | 14 | def update_value(self, new_value): 15 | self.value = new_value 16 | 17 | def __eq__(self, other): 18 | return self.key == other.key and self.value == other.value -------------------------------------------------------------------------------- /tests/Test_Max_Heap.py: -------------------------------------------------------------------------------- 1 | import random 2 | from utilities.data_structures.Max_Heap import Max_Heap 3 | import numpy as np 4 | from utilities.data_structures.Node import Node 5 | 6 | 7 | def test_heap_always_keeps_max_element_at_top(): 8 | max_heap_size = 200 9 | for _ in range(100): 10 | heap = Max_Heap(max_heap_size, 2, 0) 11 | elements_added = [] 12 | for ix in range(1, 100): 13 | element = random.random() 14 | elements_added.append(element) 15 | heap.update_element_and_reorganise_heap(ix, Node(element, (None, None))) 16 | 17 | max_key = np.max(elements_added) 18 | assert round(heap.give_max_key(), 8) == round(max_key, 8), "{}".format(elements_added) 19 | 20 | -------------------------------------------------------------------------------- /tests/Test_Deque.py: -------------------------------------------------------------------------------- 1 | from utilities.data_structures.Deque import Deque 2 | from utilities.data_structures.Node import Node 3 | 4 | 5 | def test_Deque_initialisation(): 6 | 7 | deque = Deque(2, 1) 8 | assert all(deque.deque == [Node(0, (None,)), Node(0, (None,))]) 9 | 10 | def test_Deque_adding_elements(): 11 | 12 | deque = Deque(2, 1) 13 | deque.add_element_to_deque(3, 5) 14 | deque.add_element_to_deque(2, 4) 15 | 16 | assert all(deque.deque == [Node(3, 5), Node(2, 4)]) 17 | 18 | deque.add_element_to_deque(1, 2) 19 | 20 | assert all(deque.deque == [Node(1, 2), Node(2, 4)]) 21 | 22 | deque.add_element_to_deque(-100, 0) 23 | deque.add_element_to_deque(0, 0) 24 | 25 | assert all(deque.deque == [Node(0, 0), Node(-100, 0)]) -------------------------------------------------------------------------------- 
/utilities/OU_Noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import copy 4 | 5 | class OU_Noise(object): 6 | """Ornstein-Uhlenbeck process.""" 7 | def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2): 8 | self.mu = mu * np.ones(size) 9 | self.theta = theta 10 | self.sigma = sigma 11 | self.seed = random.seed(seed) 12 | self.reset() 13 | 14 | def reset(self): 15 | """Reset the internal state (= noise) to mean (mu).""" 16 | self.state = copy.copy(self.mu) 17 | 18 | def sample(self): 19 | """Update internal state and return it as a noise sample.""" 20 | dx = self.theta * (self.mu - self.state) + self.sigma * np.array([np.random.normal() for _ in range(len(self.state))]) 21 | self.state += dx 22 | return self.state -------------------------------------------------------------------------------- /exploration_strategies/Base_Exploration_Strategy.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Base_Exploration_Strategy(object): 4 | """Base abstract class for agent exploration strategies. Every exploration strategy must inherit from this class 5 | and implement the methods perturb_action_for_exploration_purposes and add_exploration_rewards""" 6 | def __init__(self, config): 7 | self.config = config 8 | 9 | def perturb_action_for_exploration_purposes(self, action_info): 10 | """Perturbs the action of the agent to encourage exploration""" 11 | raise ValueError("Must be implemented") 12 | 13 | def add_exploration_rewards(self, reward_info): 14 | """Actions intrinsic rewards to encourage exploration""" 15 | raise ValueError("Must be implemented") 16 | 17 | def reset(self): 18 | """Resets the noise process""" 19 | raise ValueError("Must be implemented") -------------------------------------------------------------------------------- /utilities/data_structures/Config.py: -------------------------------------------------------------------------------- 1 | class Config(object): 2 | """Object to hold the config requirements for an agent/game""" 3 | def __init__(self): 4 | self.seed = None 5 | self.environment = None 6 | self.requirements_to_solve_game = None 7 | self.num_episodes_to_run = None 8 | self.file_to_save_data_results = None 9 | self.file_to_save_results_graph = None 10 | self.runs_per_agent = None 11 | self.visualise_overall_results = None 12 | self.visualise_individual_results = None 13 | self.hyperparameters = None 14 | self.use_GPU = None 15 | self.overwrite_existing_results_file = None 16 | self.save_model = False 17 | self.standard_deviation_results = 1.0 18 | self.randomise_random_seed = True 19 | self.show_solution_score = False 20 | self.debug_mode = False 21 | 22 | 23 | -------------------------------------------------------------------------------- /agents/DQN_agents/DDQN.py: -------------------------------------------------------------------------------- 1 | from agents.DQN_agents.DQN_With_Fixed_Q_Targets import DQN_With_Fixed_Q_Targets 2 | 3 | class DDQN(DQN_With_Fixed_Q_Targets): 4 | """A double DQN agent""" 5 | agent_name = "DDQN" 6 | 7 | def __init__(self, config): 8 | DQN_With_Fixed_Q_Targets.__init__(self, config) 9 | 10 | def compute_q_values_for_next_states(self, next_states): 11 | """Computes the q_values for next state we will use to create the loss to train the Q network. Double DQN 12 | uses the local index to pick the maximum q_value action and then the target network to calculate the q_value. 
13 | The reasoning behind this is that it will help stop the network from overestimating q values""" 14 | max_action_indexes = self.q_network_local(next_states).detach().argmax(1) 15 | Q_targets_next = self.q_network_target(next_states).gather(1, max_action_indexes.unsqueeze(1)) 16 | return Q_targets_next 17 | 18 | 19 | -------------------------------------------------------------------------------- /environments/Atari_Environment.py: -------------------------------------------------------------------------------- 1 | from gym import Wrapper, spaces 2 | from .Open_AI_Wrappers import * 3 | 4 | 5 | def make_atari_game(env_id, max_episode_steps=None): 6 | env = gym.make(env_id) 7 | env.frameskip = 1 8 | env = NoopResetEnv(env, noop_max=30) 9 | env = MaxAndSkipEnv(env, skip=4) 10 | if max_episode_steps is not None: 11 | env = TimeLimit(env, max_episode_steps=max_episode_steps) 12 | env = wrap_deepmind(env) 13 | return env 14 | 15 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=True, scale=True): 16 | """Configure environment for DeepMind-style Atari """ 17 | if episode_life: 18 | env = EpisodicLifeEnv(env) 19 | if 'FIRE' in env.unwrapped.get_action_meanings(): 20 | env = FireResetEnv(env) 21 | env = WarpFrame(env) 22 | if scale: 23 | env = ScaledFloatFrame(env) 24 | if clip_rewards: 25 | env = ClipRewardEnv(env) 26 | if frame_stack: 27 | env = FrameStack(env, 4) 28 | return env 29 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: 2 | python 3 | 4 | python: 3.7 5 | dist: xenial 6 | sudo: true 7 | 8 | install: 9 | - pip install -r requirements.txt 10 | 11 | 12 | script: 13 | - export PYTHONPATH="$PYTHONPATH:$PWD" 14 | - export PYTHONPATH=""$PYTHONPATH:$PWD/agents"" 15 | - export PYTHONPATH=""$PYTHONPATH:$PWD/agents/DQN_agents"" 16 | - export PYTHONPATH=""$PYTHONPATH:$PWD/agents/hierarchical_agents"" 17 | - export PYTHONPATH=""$PYTHONPATH:$PWD/agents/actor_critic_agents"" 18 | - export PYTHONPATH=""$PYTHONPATH:$PWD/agents/policy_gradient_agents"" 19 | - export PYTHONPATH=""$PYTHONPATH:$PWD/utilities/data_structures"" 20 | - export PYTHONPATH=""$PYTHONPATH:$PWD/environments"" 21 | - export PYTHONPATH=""$PYTHONPATH:$PWD/utilities"" 22 | - export PYTHONPATH=""$PYTHONPATH:$PWD/exploration_strategies"" 23 | - export PYTHONPATH=""$PYTHONPATH:$PWD/utilities/*"" 24 | - export PYTHONPATH=""$PYTHONPATH:$PWD/*"" 25 | - export PYTHONPATH=""$PYTHONPATH:$PWD/results"" 26 | - pytest tests/*.py 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /exploration_strategies/OU_Noise_Exploration.py: -------------------------------------------------------------------------------- 1 | from utilities.OU_Noise import OU_Noise 2 | from exploration_strategies.Base_Exploration_Strategy import Base_Exploration_Strategy 3 | 4 | class OU_Noise_Exploration(Base_Exploration_Strategy): 5 | """Ornstein-Uhlenbeck noise process exploration strategy""" 6 | def __init__(self, config): 7 | super().__init__(config) 8 | self.noise = OU_Noise(self.config.action_size, self.config.seed, self.config.hyperparameters["mu"], 9 | self.config.hyperparameters["theta"], self.config.hyperparameters["sigma"]) 10 | 11 | def perturb_action_for_exploration_purposes(self, action_info): 12 | """Perturbs the action of the agent to encourage exploration""" 13 | action = action_info["action"] 14 | action 
+= self.noise.sample() 15 | return action 16 | 17 | def add_exploration_rewards(self, reward_info): 18 | """Actions intrinsic rewards to encourage exploration""" 19 | raise ValueError("Must be implemented") 20 | 21 | def reset(self): 22 | """Resets the noise process""" 23 | self.noise.reset() -------------------------------------------------------------------------------- /agents/DQN_agents/DQN_With_Fixed_Q_Targets.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from agents.Base_Agent import Base_Agent 4 | from agents.DQN_agents.DQN import DQN 5 | 6 | class DQN_With_Fixed_Q_Targets(DQN): 7 | """A DQN agent that uses an older version of the q_network as the target network""" 8 | agent_name = "DQN with Fixed Q Targets" 9 | def __init__(self, config): 10 | DQN.__init__(self, config) 11 | self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size) 12 | Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target) 13 | 14 | def learn(self, experiences=None): 15 | """Runs a learning iteration for the Q network""" 16 | super(DQN_With_Fixed_Q_Targets, self).learn(experiences=experiences) 17 | self.soft_update_of_target_network(self.q_network_local, self.q_network_target, 18 | self.hyperparameters["tau"]) # Update the target network 19 | 20 | def compute_q_values_for_next_states(self, next_states): 21 | """Computes the q_values for next state we will use to create the loss to train the Q network""" 22 | Q_targets_next = self.q_network_target(next_states).detach().max(1)[0].unsqueeze(1) 23 | return Q_targets_next -------------------------------------------------------------------------------- /agents/actor_critic_agents/A2C.py: -------------------------------------------------------------------------------- 1 | from agents.actor_critic_agents.A3C import A3C 2 | 3 | class A2C(A3C): 4 | """Synchronous version of A2C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf. 
The only 5 | difference between this and the A3C is that gradient updates get done in a batch rather than 1 by 1 as the gradients 6 | come in""" 7 | agent_name = "A2C" 8 | def __init__(self, config): 9 | super(A2C, self).__init__(config) 10 | 11 | def update_shared_model(self, gradient_updates_queue): 12 | """Worker that updates the shared model with gradients as they get put into the queue""" 13 | while True: 14 | gradients_seen = 0 15 | while gradients_seen < self.worker_processes: 16 | if gradients_seen == 0: 17 | gradients = gradient_updates_queue.get() 18 | else: 19 | new_grads = gradient_updates_queue.get() 20 | gradients = [grad + new_grad for grad, new_grad in zip(gradients, new_grads)] 21 | gradients_seen += 1 22 | self.actor_critic_optimizer.zero_grad() 23 | for grads, params in zip(gradients, self.actor_critic.parameters()): 24 | params._grad = grads 25 | self.actor_critic_optimizer.step() -------------------------------------------------------------------------------- /exploration_strategies/Gaussian_Exploration.py: -------------------------------------------------------------------------------- 1 | from exploration_strategies.Base_Exploration_Strategy import Base_Exploration_Strategy 2 | import torch 3 | from torch.distributions.normal import Normal 4 | 5 | class Gaussian_Exploration(Base_Exploration_Strategy): 6 | 7 | """Gaussian noise exploration strategy""" 8 | def __init__(self, config): 9 | super().__init__(config) 10 | self.action_noise_std = self.config.hyperparameters["action_noise_std"] 11 | self.action_noise_distribution = Normal(torch.Tensor([0.0]), torch.Tensor([self.action_noise_std])) 12 | self.action_noise_clipping_range = self.config.hyperparameters["action_noise_clipping_range"] 13 | 14 | 15 | def perturb_action_for_exploration_purposes(self, action_info): 16 | """Perturbs the action of the agent to encourage exploration""" 17 | action = action_info["action"] 18 | action_noise = self.action_noise_distribution.sample(sample_shape=action.shape) 19 | action_noise = action_noise.squeeze(-1) 20 | clipped_action_noise = torch.clamp(action_noise, min=-self.action_noise_clipping_range, 21 | max=self.action_noise_clipping_range) 22 | action += clipped_action_noise 23 | return action 24 | 25 | def add_exploration_rewards(self, reward_info): 26 | """Actions intrinsic rewards to encourage exploration""" 27 | raise ValueError("Must be implemented") 28 | 29 | def reset(self): 30 | """Resets the noise process""" 31 | pass 32 | 33 | -------------------------------------------------------------------------------- /agents/DQN_agents/DQN_HER.py: -------------------------------------------------------------------------------- 1 | from agents.DQN_agents.DQN import DQN 2 | from agents.HER_Base import HER_Base 3 | 4 | class DQN_HER(HER_Base, DQN): 5 | """DQN algorithm with hindsight experience replay""" 6 | agent_name = "DQN-HER" 7 | def __init__(self, config): 8 | DQN.__init__(self, config) 9 | HER_Base.__init__(self, self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], 10 | self.hyperparameters["HER_sample_proportion"]) 11 | 12 | def step(self): 13 | """Runs a step within a game including a learning step if required""" 14 | while not self.done: 15 | self.action = self.pick_action() 16 | self.conduct_action_in_changeable_goal_envs(self.action) 17 | if self.time_for_q_network_to_learn(): 18 | for _ in range(self.hyperparameters["learning_iterations"]): 19 | self.learn(experiences=self.sample_from_HER_and_Ordinary_Buffer()) 20 | 
self.track_changeable_goal_episodes_data() 21 | self.save_experience() 22 | if self.done: self.save_alternative_experience() 23 | self.state_dict = self.next_state_dict # this is to set the state for the next iteration 24 | self.state = self.next_state 25 | self.global_step_number += 1 26 | self.episode_number += 1 27 | 28 | def enough_experiences_to_learn_from(self): 29 | """Returns booleans indicating whether there are enough experiences in the two replay buffers to learn from""" 30 | return len(self.memory) > self.ordinary_buffer_batch_size and len(self.HER_memory) > self.HER_buffer_batch_size -------------------------------------------------------------------------------- /tests/Test_Trainer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utilities.data_structures.Config import Config 3 | from agents.Trainer import Trainer 4 | 5 | def test_get_mean_and_standard_deviation_difference_results(): 6 | """Tests that get_mean_and_standard_deviation_difference_results method produces correct output""" 7 | results = [ [1.0, 2.0, 3.0], [5.0, -33.0, 55.0], [2.5, 2.5, 2.5]] 8 | mean_results = [np.mean([1.0, 5.0, 2.5]), np.mean([2.0, -33.0, 2.5]), np.mean([3.0, 55.0, 2.5])] 9 | std_results = [np.std([1.0, 5.0, 2.5]), np.std([2.0, -33.0, 2.5]), np.std([3.0, 55.0, 2.5])] 10 | mean_minus_1_std = [ mean - std_val for mean, std_val in zip(mean_results, std_results)] 11 | mean_plus_1_std = [ mean + std_val for mean, std_val in zip(mean_results, std_results)] 12 | config = Config() 13 | config.standard_deviation_results = 1.0 14 | trainer = Trainer(config, []) 15 | mean_minus_x_std_guess, mean_results_guess, mean_plus_x_std_guess = trainer.get_mean_and_standard_deviation_difference_results(results) 16 | assert mean_results == mean_results_guess 17 | assert mean_minus_1_std == mean_minus_x_std_guess 18 | assert mean_plus_1_std == mean_plus_x_std_guess 19 | 20 | config.standard_deviation_results = 3.0 21 | trainer = Trainer(config, []) 22 | mean_minus_x_std_guess, mean_results_guess, mean_plus_x_std_guess = trainer.get_mean_and_standard_deviation_difference_results(results) 23 | mean_plus_3_std = [mean + 3.0*std_val for mean, std_val in zip(mean_results, std_results)] 24 | mean_minus_3_std = [mean - 3.0*std_val for mean, std_val in zip(mean_results, std_results)] 25 | assert mean_results == mean_results_guess 26 | assert mean_minus_3_std == mean_minus_x_std_guess 27 | assert mean_plus_3_std == mean_plus_x_std_guess 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /results/Bit_Flipping.py: -------------------------------------------------------------------------------- 1 | from gym.wrappers import FlattenDictWrapper 2 | from agents.DQN_agents.DQN_HER import DQN_HER 3 | from environments.Bit_Flipping_Environment import Bit_Flipping_Environment 4 | from agents.Trainer import Trainer 5 | from utilities.data_structures.Config import Config 6 | from agents.DQN_agents.DQN import DQN 7 | 8 | config = Config() 9 | config.seed = 1 10 | config.environment = Bit_Flipping_Environment(14) 11 | config.num_episodes_to_run = 4500 12 | config.file_to_save_data_results = None #"Data_and_Graphs/Bit_Flipping_Results_Data.pkl" 13 | config.file_to_save_results_graph = None #"Data_and_Graphs/Bit_Flipping_Results_Graph.png" 14 | config.show_solution_score = False 15 | config.visualise_individual_results = False 16 | config.visualise_overall_agent_results = True 17 | config.standard_deviation_results = 
1.0 18 | config.runs_per_agent = 3 19 | config.use_GPU = False 20 | config.overwrite_existing_results_file = False 21 | config.randomise_random_seed = True 22 | config.save_model = False 23 | 24 | 25 | config.hyperparameters = { 26 | "DQN_Agents": { 27 | "learning_rate": 0.001, 28 | "batch_size": 128, 29 | "buffer_size": 100000, 30 | "epsilon_decay_rate_denominator": 150, 31 | "discount_rate": 0.999, 32 | "incremental_td_error": 1e-8, 33 | "update_every_n_steps": 1, 34 | "linear_hidden_units": [64, 64], 35 | "final_layer_activation": None, 36 | "y_range": (-1, 14), 37 | "batch_norm": False, 38 | "gradient_clipping_norm": 5, 39 | "HER_sample_proportion": 0.8, 40 | "learning_iterations": 1, 41 | "clip_rewards": False 42 | } 43 | } 44 | 45 | if __name__== '__main__': 46 | AGENTS = [DQN_HER, DQN] 47 | trainer = Trainer(config, AGENTS) 48 | trainer.run_games_for_agents() 49 | 50 | 51 | -------------------------------------------------------------------------------- /results/Fetch_Reach.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from agents.actor_critic_agents.DDPG import DDPG 4 | from agents.actor_critic_agents.DDPG_HER import DDPG_HER 5 | from utilities.data_structures.Config import Config 6 | from agents.Trainer import Trainer 7 | 8 | 9 | config = Config() 10 | config.seed = 1 11 | config.environment = gym.make("FetchReach-v1") 12 | config.num_episodes_to_run = 1000 13 | config.file_to_save_data_results = None 14 | config.file_to_save_results_graph = None 15 | config.show_solution_score = False 16 | config.visualise_individual_results = False 17 | config.visualise_overall_agent_results = True 18 | config.standard_deviation_results = 1.0 19 | config.runs_per_agent = 3 20 | config.use_GPU = False 21 | config.overwrite_existing_results_file = False 22 | config.randomise_random_seed = True 23 | config.save_model = False 24 | 25 | 26 | config.hyperparameters = { 27 | 28 | "Actor_Critic_Agents": { 29 | "Actor": { 30 | "learning_rate": 0.001, 31 | "linear_hidden_units": [50, 50], 32 | "final_layer_activation": "TANH", 33 | "batch_norm": False, 34 | "tau": 0.01, 35 | "gradient_clipping_norm": 5 36 | }, 37 | 38 | "Critic": { 39 | "learning_rate": 0.01, 40 | "linear_hidden_units": [50, 50, 50], 41 | "final_layer_activation": None, 42 | "batch_norm": False, 43 | "buffer_size": 30000, 44 | "tau": 0.01, 45 | "gradient_clipping_norm": 5 46 | }, 47 | 48 | "batch_size": 256, 49 | "discount_rate": 0.9, 50 | "mu": 0.0, 51 | "theta": 0.15, 52 | "sigma": 0.25, 53 | "update_every_n_steps": 10, 54 | "learning_updates_per_learning_session": 10, 55 | "HER_sample_proportion": 0.8, 56 | "clip_rewards": False 57 | }} 58 | 59 | 60 | if __name__== '__main__': 61 | AGENTS = [DDPG, DDPG_HER] 62 | trainer = Trainer(config, AGENTS) 63 | trainer.run_games_for_agents() 64 | 65 | -------------------------------------------------------------------------------- /agents/actor_critic_agents/DDPG_HER.py: -------------------------------------------------------------------------------- 1 | from agents.actor_critic_agents.DDPG import DDPG 2 | from agents.HER_Base import HER_Base 3 | 4 | class DDPG_HER(HER_Base, DDPG): 5 | """DDPG algorithm with hindsight experience replay""" 6 | agent_name = "DDPG-HER" 7 | 8 | def __init__(self, config): 9 | DDPG.__init__(self, config) 10 | HER_Base.__init__(self, self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], 11 | self.hyperparameters["HER_sample_proportion"]) 12 | 13 | def step(self): 14 | """Runs a 
step within a game including a learning step if required""" 15 | while not self.done: 16 | self.action = self.pick_action() 17 | self.conduct_action_in_changeable_goal_envs(self.action) 18 | if self.time_for_critic_and_actor_to_learn(): 19 | for _ in range(self.hyperparameters["learning_updates_per_learning_session"]): 20 | states, actions, rewards, next_states, dones = self.sample_from_HER_and_Ordinary_Buffer() # Samples experiences from buffer 21 | self.critic_learn(states, actions, rewards, next_states, dones) 22 | self.actor_learn(states) 23 | self.track_changeable_goal_episodes_data() 24 | self.save_experience() 25 | if self.done: self.save_alternative_experience() 26 | self.state_dict = self.next_state_dict # this is to set the state for the next iteration 27 | self.state = self.next_state 28 | self.global_step_number += 1 29 | self.episode_number += 1 30 | 31 | def enough_experiences_to_learn_from(self): 32 | """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn""" 33 | return len(self.memory) > self.ordinary_buffer_batch_size and len(self.HER_memory) > self.HER_buffer_batch_size 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /results/data_and_graphs/Plot_Sets_Of_Results.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from utilities.data_structures.Config import Config 4 | from Trainer import Trainer 5 | 6 | 7 | trainer = Trainer(config=Config(), agents=None) 8 | 9 | # 10 | # trainer.visualise_set_of_preexisting_results(save_image_path="Four_Rooms_and_Long_Corridor.png", results_data_paths=["Long_Corridor_Results_Data.pkl", "Four_Rooms.pkl"], 11 | # plot_titles=["Long Corridor", "Four Rooms"], y_limits=[(0.0, 0.25), (-90.0, 100.25)]) 12 | 13 | 14 | 15 | trainer.visualise_preexisting_results(save_image_path="hrl_experiments/Taxi_graph_comparison.png", data_path="hrl_experiments/Taxi_data.pkl", 16 | title="Taxi v2", y_limits=(-800.0, 0.0)) 17 | 18 | 19 | # trainer.visualise_preexisting_results(save_image_path="Long_Corridor_Graph.png", data_path="Long_Corridor_Results_Data.pkl", 20 | # title="Long Corridor", y_limits=(0.0, 0.25)) 21 | 22 | 23 | # trainer.visualise_preexisting_results(save_image_path="Hopper_Results_Graph_Both_Agents.png", data_path="Hopper_Results_Data.pkl", 24 | # title="Hopper") #, y_limits=(0.0, 0.25)) 25 | 26 | # trainer.visualise_set_of_preexisting_results(results_data_paths=["Cart_Pole_Results_Data.pkl", 27 | # "Mountain_Car_Results_Data.pkl"], 28 | # plot_titles=["Cart Pole (Discrete Actions)", "Mountain Car (Continuous Actions)"], 29 | # save_image_path="CartPole_and_MountainCar_Graph.png") 30 | 31 | 32 | 33 | # trainer.visualise_set_of_preexisting_results(results_data_paths=["Data_and_Graphs/Bit_Flipping_Results_Data.pkl", 34 | # "Data_and_Graphs/Fetch_Reach_Results_Data.pkl"], 35 | # plot_titles=["Bit Flipping", "Fetch Reach"], 36 | # save_image_path="Data_and_Graphs/HER_Experiments.png") 37 | -------------------------------------------------------------------------------- /tests/Test_Bit_Flipping_Environment.py: -------------------------------------------------------------------------------- 1 | from environments.Bit_Flipping_Environment import Bit_Flipping_Environment 2 | import numpy as np 3 | 4 | 5 | def test_environment_actions(): 6 | """Tests environment is executing actions correctly""" 7 | env = Bit_Flipping_Environment(5) 8 | env.reset() 9 | env.state = [1, 0, 0, 1, 0, 1, 0, 
0, 1, 0] 10 | 11 | env.step(0) 12 | env.state = env.next_state 13 | assert env.state == [0, 0, 0, 1, 0, 1, 0, 0, 1, 0] 14 | 15 | env.step(0) 16 | env.state = env.next_state 17 | assert env.state == [1, 0, 0, 1, 0, 1, 0, 0, 1, 0] 18 | 19 | env.step(3) 20 | env.state = env.next_state 21 | assert env.state == [1, 0, 0, 0, 0, 1, 0, 0, 1, 0] 22 | 23 | env.step(6) 24 | env.state = env.next_state 25 | assert env.state == [1, 0, 0, 0, 0, 1, 0, 0, 1, 0] 26 | 27 | def test_environment_goal_achievement(): 28 | """Tests environment is registering goal achievement properly""" 29 | env = Bit_Flipping_Environment(5) 30 | env.reset() 31 | env.state = [1, 0, 0, 1, 0, 0, 0, 0, 0, 0] 32 | env.desired_goal = [0, 0, 0, 0, 0] 33 | 34 | env.step(0) 35 | assert env.reward == -1 36 | env.state = env.next_state 37 | assert env.achieved_goal == [0, 0, 0, 1, 0] 38 | 39 | env.step(2) 40 | assert env.reward == -1 41 | env.state = env.next_state 42 | assert env.achieved_goal == [0, 0, 1, 1, 0] 43 | 44 | env.step(2) 45 | assert env.reward == -1 46 | env.state = env.next_state 47 | assert env.achieved_goal == [0, 0, 0, 1, 0] 48 | 49 | env.step(3) 50 | assert env.reward == 5 51 | 52 | def test_compute_reward(): 53 | """Tests compute_reward method""" 54 | env = Bit_Flipping_Environment(5) 55 | assert env.compute_reward(np.array([0, 0, 0, 1, 0]), np.array([0, 0, 0, 1, 0]), None) == env.reward_for_achieving_goal 56 | assert env.compute_reward(np.array([1, 1, 1, 1, 1]), np.array([1, 1, 1, 1, 1]), None) == env.reward_for_achieving_goal 57 | assert env.compute_reward(np.array([0, 0, 0, 0, 0]), np.array([0, 0, 0, 0, 0]), None) == env.reward_for_achieving_goal 58 | assert env.compute_reward(np.array([1, 1, 1, 1, 1]), np.array([0, 0, 0, 1, 0]), None) == env.step_reward_for_not_achieving_goal 59 | assert env.compute_reward(np.array([1, 1, 1, 1, 1]), np.array([0, 0, 0, 0, 0]), None) == env.step_reward_for_not_achieving_goal 60 | 61 | -------------------------------------------------------------------------------- /utilities/data_structures/Deque.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utilities.data_structures.Node import Node 3 | 4 | class Deque(object): 5 | """Generic deque object""" 6 | def __init__(self, max_size, dimension_of_value_attribute): 7 | 8 | self.max_size = max_size 9 | self.dimension_of_value_attribute = dimension_of_value_attribute 10 | self.deque = self.initialise_deque() 11 | self.deque_index_to_overwrite_next = 0 12 | self.reached_max_capacity = False 13 | self.number_experiences_in_deque = 0 14 | 15 | def initialise_deque(self): 16 | """Initialises a queue of Nodes of length self.max_size""" 17 | deque = np.array([Node(0, tuple([None for _ in range(self.dimension_of_value_attribute)])) for _ in range(self.max_size)]) 18 | return deque 19 | 20 | def add_element_to_deque(self, new_key, new_value): 21 | """Adds an element to the deque and then updates the index of the next element to be overwritten and also the 22 | amount of elements in the deque""" 23 | self.update_deque_node_key_and_value(self.deque_index_to_overwrite_next, new_key, new_value) 24 | self.update_number_experiences_in_deque() 25 | self.update_deque_index_to_overwrite_next() 26 | 27 | def update_deque_node_key_and_value(self, index, new_key, new_value): 28 | self.update_deque_node_key(index, new_key) 29 | self.update_deque_node_value(index, new_value) 30 | 31 | def update_deque_node_key(self, index, new_key): 32 | self.deque[index].update_key(new_key) 33 | 34 | def 
update_deque_node_value(self, index, new_value): 35 | self.deque[index].update_value(new_value) 36 | 37 | def update_deque_index_to_overwrite_next(self): 38 | """Updates the deque index that we should write over next. When the buffer gets full we begin writing over 39 | older experiences""" 40 | if self.deque_index_to_overwrite_next < self.max_size - 1: 41 | self.deque_index_to_overwrite_next += 1 42 | else: 43 | self.reached_max_capacity = True 44 | self.deque_index_to_overwrite_next = 0 45 | 46 | def update_number_experiences_in_deque(self): 47 | """Keeps track of how many experiences there are in the buffer""" 48 | if not self.reached_max_capacity: 49 | self.number_experiences_in_deque += 1 -------------------------------------------------------------------------------- /agents/DQN_agents/DDQN_With_Prioritised_Experience_Replay.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from agents.DQN_agents.DDQN import DDQN 4 | from utilities.data_structures.Prioritised_Replay_Buffer import Prioritised_Replay_Buffer 5 | 6 | class DDQN_With_Prioritised_Experience_Replay(DDQN): 7 | """A DQN agent with prioritised experience replay""" 8 | agent_name = "DDQN with Prioritised Replay" 9 | 10 | def __init__(self, config): 11 | DDQN.__init__(self, config) 12 | self.memory = Prioritised_Replay_Buffer(self.hyperparameters, config.seed) 13 | 14 | def learn(self): 15 | """Runs a learning iteration for the Q network after sampling from the replay buffer in a prioritised way""" 16 | sampled_experiences, importance_sampling_weights = self.memory.sample() 17 | states, actions, rewards, next_states, dones = sampled_experiences 18 | loss, td_errors = self.compute_loss_and_td_errors(states, next_states, rewards, actions, dones, importance_sampling_weights) 19 | self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss, self.hyperparameters["gradient_clipping_norm"]) 20 | self.soft_update_of_target_network(self.q_network_local, self.q_network_target, self.hyperparameters["tau"]) 21 | self.memory.update_td_errors(td_errors.squeeze(1)) 22 | 23 | def save_experience(self): 24 | """Saves the latest experience including the td_error""" 25 | max_td_error_in_experiences = self.memory.give_max_td_error() + 1e-9 26 | self.memory.add_experience(max_td_error_in_experiences, self.state, self.action, self.reward, self.next_state, self.done) 27 | 28 | def compute_loss_and_td_errors(self, states, next_states, rewards, actions, dones, importance_sampling_weights): 29 | """Calculates the loss for the local Q network. It weighs each observations loss according to the importance 30 | sampling weights which come from the prioritised replay buffer""" 31 | Q_targets = self.compute_q_targets(next_states, rewards, dones) 32 | Q_expected = self.compute_expected_q_values(states, actions) 33 | loss = F.mse_loss(Q_expected, Q_targets) 34 | loss = loss * importance_sampling_weights 35 | loss = torch.mean(loss) 36 | td_errors = Q_targets.data.cpu().numpy() - Q_expected.data.cpu().numpy() 37 | return loss, td_errors -------------------------------------------------------------------------------- /environments/ant_environments/point.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # NOTE THIS CODE IS TAKEN FROM https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 6 | # and is not my code. 
7 | 8 | 9 | 10 | 11 | import math 12 | import numpy as np 13 | from gym import utils 14 | from gym.envs.mujoco import mujoco_env 15 | 16 | 17 | class PointEnv(mujoco_env.MujocoEnv, utils.EzPickle): 18 | FILE = "point.xml" 19 | ORI_IND = 2 20 | 21 | def __init__(self, file_path=None, expose_all_qpos=True): 22 | self._expose_all_qpos = expose_all_qpos 23 | 24 | mujoco_env.MujocoEnv.__init__(self, file_path, 1) 25 | utils.EzPickle.__init__(self) 26 | 27 | @property 28 | def physics(self): 29 | return self.model 30 | 31 | def _step(self, a): 32 | return self.step(a) 33 | 34 | def step(self, action): 35 | action[0] = 0.2 * action[0] 36 | qpos = np.copy(self.physics.data.qpos) 37 | qpos[2] += action[1] 38 | ori = qpos[2] 39 | # compute increment in each direction 40 | dx = math.cos(ori) * action[0] 41 | dy = math.sin(ori) * action[0] 42 | # ensure that the robot is within reasonable range 43 | qpos[0] = np.clip(qpos[0] + dx, -100, 100) 44 | qpos[1] = np.clip(qpos[1] + dy, -100, 100) 45 | qvel = self.physics.data.qvel 46 | self.set_state(qpos, qvel) 47 | for _ in range(0, self.frame_skip): 48 | self.physics.step() 49 | next_obs = self._get_obs() 50 | reward = 0 51 | done = False 52 | info = {} 53 | return next_obs, reward, done, info 54 | 55 | def _get_obs(self): 56 | if self._expose_all_qpos: 57 | return np.concatenate([ 58 | self.physics.data.qpos.flat[:3], # Only point-relevant coords. 59 | self.physics.data.qvel.flat[:3]]) 60 | return np.concatenate([ 61 | self.physics.data.qpos.flat[2:3], 62 | self.physics.data.qvel.flat[:3]]) 63 | 64 | def reset_model(self): 65 | qpos = self.init_qpos + self.np_random.uniform( 66 | size=self.physics.model.nq, low=-.1, high=.1) 67 | qvel = self.init_qvel + self.np_random.randn(self.physics.model.nv) * .1 68 | 69 | # Set everything other than point to original position and 0 velocity. 70 | qpos[3:] = self.init_qpos[3:] 71 | qvel[3:] = 0. 72 | self.set_state(qpos, qvel) 73 | return self._get_obs() 74 | 75 | def get_ori(self): 76 | return self.model.data.qpos[self.__class__.ORI_IND] 77 | 78 | def set_xy(self, xy): 79 | qpos = np.copy(self.physics.data.qpos) 80 | qpos[0] = xy[0] 81 | qpos[1] = xy[1] 82 | 83 | qvel = self.physics.data.qvel 84 | -------------------------------------------------------------------------------- /utilities/data_structures/Tanh_Distribution.py: -------------------------------------------------------------------------------- 1 | 2 | # NOTE that this is not my code. 3 | # Taken from here: https://github.com/vitchyr/rlkit/blob/master/rlkit/torch/distributions.py 4 | 5 | 6 | import torch 7 | from torch.distributions import Distribution, Normal 8 | 9 | 10 | class TanhNormal(Distribution): 11 | """ 12 | Represent distribution of X where 13 | X ~ tanh(Z) 14 | Z ~ N(mean, std) 15 | Note: this is not very numerically stable. 16 | """ 17 | def __init__(self, normal_mean, normal_std, epsilon=1e-6): 18 | """ 19 | :param normal_mean: Mean of the normal distribution 20 | :param normal_std: Std of the normal distribution 21 | :param epsilon: Numerical stability epsilon when computing log-prob. 
22 | """ 23 | self.normal_mean = normal_mean 24 | self.normal_std = normal_std 25 | self.normal = Normal(normal_mean, normal_std) 26 | self.epsilon = epsilon 27 | 28 | def sample_n(self, n, return_pre_tanh_value=False): 29 | z = self.normal.sample_n(n) 30 | if return_pre_tanh_value: 31 | return torch.tanh(z), z 32 | else: 33 | return torch.tanh(z) 34 | 35 | def log_prob(self, value, pre_tanh_value=None): 36 | """ 37 | :param value: some value, x 38 | :param pre_tanh_value: arctanh(x) 39 | :return: 40 | """ 41 | if pre_tanh_value is None: 42 | pre_tanh_value = torch.log( 43 | (1+value) / (1-value) 44 | ) / 2 45 | return self.normal.log_prob(pre_tanh_value) - torch.log( 46 | 1 - value * value + self.epsilon 47 | ) 48 | 49 | def sample(self, return_pretanh_value=False): 50 | """ 51 | Gradients will and should *not* pass through this operation. 52 | See https://github.com/pytorch/pytorch/issues/4620 for discussion. 53 | """ 54 | z = self.normal.sample().detach() 55 | 56 | if return_pretanh_value: 57 | return torch.tanh(z), z 58 | else: 59 | return torch.tanh(z) 60 | 61 | def rsample(self, return_pretanh_value=False): 62 | """ 63 | Sampling in the reparameterization case. 64 | """ 65 | z = ( 66 | self.normal_mean + 67 | self.normal_std * 68 | Normal( 69 | torch.zeros(self.normal_mean.size()), 70 | torch.ones(self.normal_std.size()) 71 | ).sample() 72 | ) 73 | z.requires_grad_() 74 | 75 | if return_pretanh_value: 76 | return torch.tanh(z), z 77 | else: 78 | return torch.tanh(z) -------------------------------------------------------------------------------- /utilities/Tensorboard.py: -------------------------------------------------------------------------------- 1 | # NOTE that this code is not mine and was taken from https://becominghuman.ai/logging-in-tensorboard-with-pytorch-or-any-other-library-c549163dee9e 2 | 3 | 4 | import io 5 | import numpy as np 6 | from PIL import Image 7 | import tensorflow as tf 8 | 9 | # run tensorboard --logdir="logs/" on command line to get up the tensorboard afterwards 10 | 11 | class Tensorboard: 12 | def __init__(self, logdir): 13 | self.writer = tf.summary.FileWriter(logdir) 14 | 15 | def close(self): 16 | self.writer.close() 17 | 18 | def log_scalar(self, tag, value, global_step): 19 | summary = tf.Summary() 20 | summary.value.add(tag=tag, simple_value=value) 21 | self.writer.add_summary(summary, global_step=global_step) 22 | self.writer.flush() 23 | 24 | def log_histogram(self, tag, values, global_step, bins): 25 | counts, bin_edges = np.histogram(values, bins=bins) 26 | 27 | hist = tf.HistogramProto() 28 | hist.min = float(np.min(values)) 29 | hist.max = float(np.max(values)) 30 | hist.num = int(np.prod(values.shape)) 31 | hist.sum = float(np.sum(values)) 32 | hist.sum_squares = float(np.sum(values ** 2)) 33 | 34 | bin_edges = bin_edges[1:] 35 | 36 | for edge in bin_edges: 37 | hist.bucket_limit.append(edge) 38 | for c in counts: 39 | hist.bucket.append(c) 40 | 41 | summary = tf.Summary() 42 | summary.value.add(tag=tag, histo=hist) 43 | self.writer.add_summary(summary, global_step=global_step) 44 | self.writer.flush() 45 | 46 | def log_image(self, tag, img, global_step): 47 | s = io.BytesIO() 48 | Image.fromarray(img).save(s, format='png') 49 | 50 | img_summary = tf.Summary.Image(encoded_image_string=s.getvalue(), 51 | height=img.shape[0], 52 | width=img.shape[1]) 53 | 54 | summary = tf.Summary() 55 | summary.value.add(tag=tag, image=img_summary) 56 | self.writer.add_summary(summary, global_step=global_step) 57 | self.writer.flush() 58 | 59 | def 
log_plot(self, tag, figure, global_step): 60 | plot_buf = io.BytesIO() 61 | figure.savefig(plot_buf, format='png') 62 | plot_buf.seek(0) 63 | img = Image.open(plot_buf) 64 | img_ar = np.array(img) 65 | 66 | img_summary = tf.Summary.Image(encoded_image_string=plot_buf.getvalue(), 67 | height=img_ar.shape[0], 68 | width=img_ar.shape[1]) 69 | 70 | summary = tf.Summary() 71 | summary.value.add(tag=tag, image=img_summary) 72 | self.writer.add_summary(summary, global_step=global_step) 73 | self.writer.flush() -------------------------------------------------------------------------------- /environments/ant_environments/create_maze_env.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # NOTE THIS CODE IS TAKEN FROM https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 4 | # and is not my code. 5 | 6 | 7 | 8 | from ant_environments.ant_maze_env import AntMazeEnv 9 | from ant_environments.point_maze_env import PointMazeEnv 10 | 11 | import tensorflow as tf 12 | import gin.tf 13 | from tf_agents.environments import gym_wrapper 14 | from tf_agents.environments import tf_py_environment 15 | 16 | 17 | @gin.configurable 18 | def create_maze_env(env_name=None, top_down_view=False): 19 | n_bins = 0 20 | manual_collision = False 21 | if env_name.startswith('Ego'): 22 | n_bins = 8 23 | env_name = env_name[3:] 24 | if env_name.startswith('Ant'): 25 | cls = AntMazeEnv 26 | env_name = env_name[3:] 27 | maze_size_scaling = 8 28 | elif env_name.startswith('Point'): 29 | cls = PointMazeEnv 30 | manual_collision = True 31 | env_name = env_name[5:] 32 | maze_size_scaling = 4 33 | else: 34 | assert False, 'unknown env %s' % env_name 35 | 36 | maze_id = None 37 | observe_blocks = False 38 | put_spin_near_agent = False 39 | if env_name == 'Maze': 40 | maze_id = 'Maze' 41 | elif env_name == 'Push': 42 | maze_id = 'Push' 43 | elif env_name == 'Fall': 44 | maze_id = 'Fall' 45 | elif env_name == 'Block': 46 | maze_id = 'Block' 47 | put_spin_near_agent = True 48 | observe_blocks = True 49 | elif env_name == 'BlockMaze': 50 | maze_id = 'BlockMaze' 51 | put_spin_near_agent = True 52 | observe_blocks = True 53 | else: 54 | raise ValueError('Unknown maze environment %s' % env_name) 55 | 56 | gym_mujoco_kwargs = { 57 | 'maze_id': maze_id, 58 | 'n_bins': n_bins, 59 | 'observe_blocks': observe_blocks, 60 | 'put_spin_near_agent': put_spin_near_agent, 61 | 'top_down_view': top_down_view, 62 | 'manual_collision': manual_collision, 63 | 'maze_size_scaling': maze_size_scaling 64 | } 65 | gym_env = cls(**gym_mujoco_kwargs) 66 | gym_env.reset() 67 | wrapped_env = gym_wrapper.GymWrapper(gym_env) 68 | return wrapped_env 69 | 70 | 71 | class TFPyEnvironment(tf_py_environment.TFPyEnvironment): 72 | 73 | def __init__(self, *args, **kwargs): 74 | super(TFPyEnvironment, self).__init__(*args, **kwargs) 75 | 76 | def start_collect(self): 77 | pass 78 | 79 | def current_obs(self): 80 | time_step = self.current_time_step() 81 | return time_step.observation[0] # For some reason, there is an extra dim. 
82 | 83 | def step(self, actions): 84 | actions = tf.expand_dims(actions, 0) 85 | next_step = super(TFPyEnvironment, self).step(actions) 86 | return next_step.is_last()[0], next_step.reward[0], next_step.discount[0] 87 | 88 | def reset(self): 89 | return super(TFPyEnvironment, self).reset() 90 | -------------------------------------------------------------------------------- /environments/Ant_Navigation_Environments.py: -------------------------------------------------------------------------------- 1 | from .ant_environments.create_maze_env import create_maze_env 2 | import numpy as np 3 | 4 | """Environments taken from HIRO paper github repo: https://github.com/tensorflow/models/tree/master/research/efficient-hrl 5 | There are three environments that can be represented by this class depending on what environment_name you provide. 6 | The options are: ["AntMaze", "AntPush", "AntFall"]. 7 | 8 | Note that "Success" for this game is defined by the authors as achieving -5 or more on the last step of the episode 9 | but that this isn't coded in any way as part of the environment. 10 | """ 11 | class Ant_Navigation_Environments(object): 12 | 13 | def __init__(self, environment_name): 14 | self.environment_name = environment_name 15 | self.base_env = create_maze_env(env_name=self.environment_name).gym # 16 | 17 | self.goal_sample_fn = self.get_goal_fn() 18 | self.reward_fn = self.get_reward_fn() 19 | self.goal = None 20 | 21 | self.unwrapped = self.base_env.unwrapped 22 | self.spec = self.base_env.spec 23 | self.action_space = self.base_env.action_space 24 | 25 | def reset(self): 26 | self.steps_taken = 0 27 | obs = self.base_env.reset() 28 | self.goal = self.goal_sample_fn() 29 | return np.concatenate([obs, self.goal]) 30 | 31 | def step(self, action): 32 | self.steps_taken += 1 33 | obs, _, _, info = self.base_env.step(action) 34 | reward = self.reward_fn(obs, self.goal) 35 | done = self.steps_taken >= 500 36 | return np.concatenate([obs, self.goal]), reward, done, info 37 | 38 | def get_goal_fn(self): 39 | """Produces the function required to generate a goal for each environment""" 40 | if self.environment_name == "AntMaze": 41 | return lambda: np.array([0., 16.]) 42 | #Can also use np.random.uniform((-4, -4), (20, 20)) for training purposes 43 | elif self.environment_name == "AntPush": 44 | return lambda: np.array([0., 19.]) 45 | elif self.environment_name == "AntFall": 46 | return lambda: np.array([0., 27., 4.5]) 47 | else: 48 | raise ValueError("Unknown environment name") 49 | 50 | def get_reward_fn(self): 51 | """Provides the function required to calculate rewards for each game""" 52 | if self.environment_name == 'AntMaze': 53 | return lambda obs, goal: -np.sum(np.square(obs[:2] - goal)) ** 0.5 54 | elif self.environment_name == 'AntPush': 55 | return lambda obs, goal: -np.sum(np.square(obs[:2] - goal)) ** 0.5 56 | elif self.environment_name == 'AntFall': 57 | return lambda obs, goal: -np.sum(np.square(obs[:3] - goal)) ** 0.5 58 | else: 59 | raise ValueError("Unknown environment name") 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /utilities/data_structures/Replay_Buffer.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple, deque 2 | import random 3 | import torch 4 | import numpy as np 5 | 6 | class Replay_Buffer(object): 7 | """Replay buffer to store past experiences that the agent can then use for training data""" 8 | 9 | def __init__(self, buffer_size,
batch_size, seed): 10 | 11 | self.memory = deque(maxlen=buffer_size) 12 | self.batch_size = batch_size 13 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 14 | self.seed = random.seed(seed) 15 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 16 | 17 | def add_experience(self, states, actions, rewards, next_states, dones): 18 | """Adds experience(s) into the replay buffer""" 19 | if type(dones) == list: 20 | assert type(dones[0]) != list, "A done shouldn't be a list" 21 | experiences = [self.experience(state, action, reward, next_state, done) 22 | for state, action, reward, next_state, done in 23 | zip(states, actions, rewards, next_states, dones)] 24 | self.memory.extend(experiences) 25 | else: 26 | experience = self.experience(states, actions, rewards, next_states, dones) 27 | self.memory.append(experience) 28 | 29 | def sample(self, num_experiences=None, separate_out_data_types=True): 30 | """Draws a random sample of experience from the replay buffer""" 31 | experiences = self.pick_experiences(num_experiences) 32 | if separate_out_data_types: 33 | states, actions, rewards, next_states, dones = self.separate_out_data_types(experiences) 34 | return states, actions, rewards, next_states, dones 35 | else: 36 | return experiences 37 | 38 | def separate_out_data_types(self, experiences): 39 | """Puts the sampled experience into the correct format for a PyTorch neural network""" 40 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(self.device) 41 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device) 42 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device) 43 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device) 44 | dones = torch.from_numpy(np.vstack([int(e.done) for e in experiences if e is not None])).float().to(self.device) 45 | 46 | return states, actions, rewards, next_states, dones 47 | 48 | def pick_experiences(self, num_experiences=None): 49 | if num_experiences is not None: batch_size = num_experiences 50 | else: batch_size = self.batch_size 51 | return random.sample(self.memory, k=batch_size) 52 | 53 | def __len__(self): 54 | return len(self.memory) -------------------------------------------------------------------------------- /utilities/Deepmind_RMS_Prop.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Optimizer 3 | 4 | 5 | class DM_RMSprop(Optimizer): 6 | """Implements the form of RMSProp used in DM 2015 Atari paper. 
7 | Inspired by https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/updates.py""" 8 | 9 | def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False): 10 | if not 0.0 <= lr: 11 | raise ValueError("Invalid learning rate: {}".format(lr)) 12 | if not 0.0 <= eps: 13 | raise ValueError("Invalid epsilon value: {}".format(eps)) 14 | if not 0.0 <= momentum: 15 | raise ValueError("Invalid momentum value: {}".format(momentum)) 16 | if not 0.0 <= weight_decay: 17 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 18 | if not 0.0 <= alpha: 19 | raise ValueError("Invalid alpha value: {}".format(alpha)) 20 | 21 | defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, weight_decay=weight_decay) 22 | super(DM_RMSprop, self).__init__(params, defaults) 23 | 24 | def __setstate__(self, state): 25 | super(DM_RMSprop, self).__setstate__(state) 26 | for group in self.param_groups: 27 | group.setdefault('momentum', 0) 28 | group.setdefault('centered', False) 29 | 30 | def step(self, closure=None): 31 | """Performs a single optimization step. 32 | 33 | Arguments: 34 | closure (callable, optional): A closure that reevaluates the model 35 | and returns the loss. 36 | """ 37 | loss = None 38 | if closure is not None: 39 | loss = closure() 40 | for group in self.param_groups: 41 | momentum = group['momentum'] 42 | sq_momentum = group['alpha'] 43 | epsilon = group['eps'] 44 | 45 | for p in group['params']: 46 | if p.grad is None: 47 | continue 48 | grad = p.grad.data 49 | if grad.is_sparse: 50 | raise RuntimeError('RMSprop does not support sparse gradients') 51 | state = self.state[p] 52 | 53 | # State initialization 54 | if len(state) == 0: 55 | state['step'] = 0 56 | state['square_avg'] = torch.zeros_like(p.data) 57 | if momentum > 0: 58 | state['momentum_buffer'] = torch.zeros_like(p.data) 59 | 60 | mom_buffer = state['momentum_buffer'] 61 | square_avg = state['square_avg'] 62 | 63 | 64 | state['step'] += 1 65 | 66 | mom_buffer.mul_(momentum) 67 | mom_buffer.add_((1 - momentum) * grad) 68 | 69 | square_avg.mul_(sq_momentum).addcmul_(1 - sq_momentum, grad, grad) 70 | 71 | avg = (square_avg - mom_buffer**2 + epsilon).sqrt() 72 | 73 | p.data.addcdiv_(-group['lr'], grad, avg) 74 | 75 | return loss 76 | 77 | -------------------------------------------------------------------------------- /utilities/data_structures/Max_Heap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utilities.data_structures.Node import Node 3 | 4 | class Max_Heap(object): 5 | """Generic max heap object""" 6 | def __init__(self, max_size, dimension_of_value_attribute, default_key_to_use): 7 | 8 | self.max_size = max_size 9 | self.dimension_of_value_attribute = dimension_of_value_attribute 10 | self.default_key_to_use = default_key_to_use 11 | self.heap = self.initialise_heap() 12 | 13 | def initialise_heap(self): 14 | """Initialises a heap of Nodes of length self.max_size * 4 + 1""" 15 | heap = np.array([Node(self.default_key_to_use, tuple([None for _ in range(self.dimension_of_value_attribute)])) for _ in range(self.max_size * 4 + 1)]) 16 | 17 | # We don't use the 0th element in a heap so we want it to have infinite value so it is never swapped with a lower node 18 | heap[0] = Node(float("inf"), (None, None, None, None, None)) 19 | return heap 20 | 21 | def update_element_and_reorganise_heap(self, heap_index_for_change, new_element): 22 | 
self.update_heap_element(heap_index_for_change, new_element) 23 | self.reorganise_heap(heap_index_for_change) 24 | 25 | def update_heap_element(self, heap_index, new_element): 26 | self.heap[heap_index] = new_element 27 | 28 | def reorganise_heap(self, heap_index_changed): 29 | """This reorganises the heap after a new value is added so as to keep the max value at the top of the heap which 30 | is index position 1 in the array self.heap""" 31 | 32 | node_key = self.heap[heap_index_changed].key 33 | parent_index = int(heap_index_changed / 2) 34 | 35 | if node_key > self.heap[parent_index].key: 36 | self.swap_heap_elements(heap_index_changed, parent_index) 37 | self.reorganise_heap(parent_index) 38 | 39 | else: 40 | biggest_child_index = self.calculate_index_of_biggest_child(heap_index_changed) 41 | if node_key < self.heap[biggest_child_index].key: 42 | self.swap_heap_elements(heap_index_changed, biggest_child_index) 43 | self.reorganise_heap(biggest_child_index) 44 | 45 | def swap_heap_elements(self, index1, index2): 46 | """Swaps the position of two heap elements""" 47 | self.heap[index1], self.heap[index2] = self.heap[index2], self.heap[index1] 48 | 49 | def calculate_index_of_biggest_child(self, heap_index_changed): 50 | """Calculates the heap index of the node's child with the biggest td_error value""" 51 | left_child = self.heap[int(heap_index_changed * 2)] 52 | right_child = self.heap[int(heap_index_changed * 2) + 1] 53 | 54 | if left_child.key > right_child.key: 55 | biggest_child_index = heap_index_changed * 2 56 | else: 57 | biggest_child_index = heap_index_changed * 2 + 1 58 | 59 | return biggest_child_index 60 | 61 | def give_max_key(self): 62 | """Returns the maximum td error currently in the heap. Because it is a max heap this is the top element of the heap""" 63 | return self.heap[1].key 64 | -------------------------------------------------------------------------------- /environments/Long_Corridor_Environment.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | import gym 4 | import numpy as np 5 | from gym import spaces 6 | from gym.utils import seeding 7 | 8 | class Long_Corridor_Environment(gym.Env): 9 | """Is the environment from pg.6 of the paper Hierarchical Deep Reinforcement Learning: Integrating Temporal 10 | Abstraction and Intrinsic Motivation. 
11 | https://papers.nips.cc/paper/6233-hierarchical-deep-reinforcement-learning-integrating-temporal-abstraction-and-intrinsic-motivation.pdf""" 12 | environment_name = "Long Corridor Environment" 13 | 14 | def __init__(self, num_states=6, stochasticity_of_action_right=0.5): 15 | self.action_space = spaces.Discrete(2) 16 | self.observation_space = spaces.Discrete(num_states) 17 | self.seed() 18 | self.reward_threshold = 1.0 19 | self.trials = 100 20 | self.max_episode_steps = 100 21 | self.id = "Long Corridor" 22 | self.action_translation = {0: "left", 1: "right"} 23 | self.stochasticity_of_action_right = stochasticity_of_action_right 24 | self.num_states = num_states 25 | self.visited_final_state = False 26 | self.reward_if_visited_final_state = 1.0 27 | self.reward_if_havent_visited_final_state = 0.01 28 | 29 | def seed(self, seed=None): 30 | self.np_random, seed = seeding.np_random(seed) 31 | return [seed] 32 | 33 | def step(self, action): 34 | self.episode_steps += 1 35 | if type(action) is np.ndarray: 36 | action = action[0] 37 | assert action in [0, 1], "Action must be a 0 or a 1" 38 | if action == 0: self.move_left() 39 | else: self.move_right() 40 | self.update_done_reward_and_visited_final_state() 41 | self.state = self.next_state 42 | self.s = np.array(self.next_state) 43 | return self.s, self.reward, self.done, {} 44 | 45 | def reset(self): 46 | self.state = 1 #environment always starts in state 1 47 | self.next_state = None 48 | self.reward = None 49 | self.done = False 50 | self.visited_final_state = False 51 | self.episode_steps = 0 52 | self.s = np.array(self.state) 53 | return self.s 54 | 55 | def update_done_reward_and_visited_final_state(self): 56 | if self.next_state == 0: 57 | self.done = True 58 | if self.visited_final_state: self.reward = self.reward_if_visited_final_state 59 | else: self.reward = self.reward_if_havent_visited_final_state 60 | else: 61 | self.reward = 0 62 | if self.next_state == self.num_states - 1: self.visited_final_state = True 63 | if self.episode_steps >= self.max_episode_steps: self.done = True 64 | 65 | def move_left(self): 66 | """Moves left in environment""" 67 | self.next_state = self.state - 1 68 | 69 | def move_right(self): 70 | """Moves right in environment""" 71 | if random.random() < self.stochasticity_of_action_right: self.next_state = self.state - 1 72 | else: self.next_state = min(self.state + 1, self.num_states - 1) 73 | -------------------------------------------------------------------------------- /results/Mountain_Car.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from agents.policy_gradient_agents.PPO import PPO 4 | from agents.actor_critic_agents.DDPG import DDPG 5 | from agents.actor_critic_agents.SAC import SAC 6 | from agents.actor_critic_agents.TD3 import TD3 7 | from agents.Trainer import Trainer 8 | from utilities.data_structures.Config import Config 9 | 10 | 11 | config = Config() 12 | config.seed = 1 13 | config.environment = gym.make("MountainCarContinuous-v0") 14 | config.num_episodes_to_run = 450 15 | config.file_to_save_data_results = None 16 | config.file_to_save_results_graph = None 17 | config.show_solution_score = False 18 | config.visualise_individual_results = False 19 | config.visualise_overall_agent_results = True 20 | config.standard_deviation_results = 1.0 21 | config.runs_per_agent = 3 22 | config.use_GPU = False 23 | config.overwrite_existing_results_file = False 24 | config.randomise_random_seed = True 25 | config.save_model = False 26 | 27 
| 28 | config.hyperparameters = { 29 | "Policy_Gradient_Agents": { 30 | "learning_rate": 0.05, 31 | "linear_hidden_units": [30, 15], 32 | "final_layer_activation": "TANH", 33 | "learning_iterations_per_round": 10, 34 | "discount_rate": 0.9, 35 | "batch_norm": False, 36 | "clip_epsilon": 0.2, 37 | "episodes_per_learning_round": 10, 38 | "normalise_rewards": True, 39 | "gradient_clipping_norm": 5, 40 | "mu": 0.0, 41 | "theta": 0.15, 42 | "sigma": 0.2, 43 | "epsilon_decay_rate_denominator": 1, 44 | "clip_rewards": False 45 | }, 46 | 47 | "Actor_Critic_Agents": { 48 | "Actor": { 49 | "learning_rate": 0.003, 50 | "linear_hidden_units": [20, 20], 51 | "final_layer_activation": None, 52 | "batch_norm": False, 53 | "tau": 0.005, 54 | "gradient_clipping_norm": 5, 55 | "initialiser": "Xavier" 56 | }, 57 | 58 | "Critic": { 59 | "learning_rate": 0.02, 60 | "linear_hidden_units": [20, 20], 61 | "final_layer_activation": None, 62 | "batch_norm": False, 63 | "buffer_size": 1000000, 64 | "tau": 0.005, 65 | "gradient_clipping_norm": 5, 66 | "initialiser": "Xavier" 67 | }, 68 | 69 | "min_steps_before_learning": 1000, #for SAC only 70 | "batch_size": 256, 71 | "discount_rate": 0.99, 72 | "mu": 0.0, # for O-H noise 73 | "theta": 0.15, # for O-H noise 74 | "sigma": 0.25, # for O-H noise 75 | "action_noise_std": 0.2, # for TD3 76 | "action_noise_clipping_range": 0.5, # for TD3 77 | "update_every_n_steps": 20, 78 | "learning_updates_per_learning_session": 10, 79 | "automatically_tune_entropy_hyperparameter": True, 80 | "entropy_term_weight": None, 81 | "add_extra_noise": True, 82 | "do_evaluation_iterations": True, 83 | "clip_rewards": False 84 | 85 | } 86 | 87 | } 88 | 89 | if __name__ == "__main__": 90 | AGENTS = [TD3, DDPG, PPO] 91 | trainer = Trainer(config, AGENTS) 92 | trainer.run_games_for_agents() 93 | 94 | # SAC, , 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /agents/actor_critic_agents/TD3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as functional 3 | from torch import optim 4 | from agents.Base_Agent import Base_Agent 5 | from .DDPG import DDPG 6 | from exploration_strategies.Gaussian_Exploration import Gaussian_Exploration 7 | 8 | class TD3(DDPG): 9 | """A TD3 Agent from the paper Addressing Function Approximation Error in Actor-Critic Methods (Fujimoto et al. 
2018) 10 | https://arxiv.org/abs/1802.09477""" 11 | agent_name = "TD3" 12 | 13 | def __init__(self, config): 14 | DDPG.__init__(self, config) 15 | self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, 16 | key_to_use="Critic", override_seed=self.config.seed + 1) 17 | self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, 18 | key_to_use="Critic") 19 | Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2) 20 | self.critic_optimizer_2 = optim.Adam(self.critic_local_2.parameters(), 21 | lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) 22 | self.exploration_strategy_critic = Gaussian_Exploration(self.config) 23 | 24 | def compute_critic_values_for_next_states(self, next_states): 25 | """Computes the critic values for next states to be used in the loss for the critic""" 26 | with torch.no_grad(): 27 | actions_next = self.actor_target(next_states) 28 | actions_next_with_noise = self.exploration_strategy_critic.perturb_action_for_exploration_purposes({"action": actions_next}) 29 | critic_targets_next_1 = self.critic_target(torch.cat((next_states, actions_next_with_noise), 1)) 30 | critic_targets_next_2 = self.critic_target_2(torch.cat((next_states, actions_next_with_noise), 1)) 31 | critic_targets_next = torch.min(torch.cat((critic_targets_next_1, critic_targets_next_2),1), dim=1)[0].unsqueeze(-1) 32 | return critic_targets_next 33 | 34 | def critic_learn(self, states, actions, rewards, next_states, dones): 35 | """Runs a learning iteration for both the critics""" 36 | critic_targets_next = self.compute_critic_values_for_next_states(next_states) 37 | critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones) 38 | 39 | critic_expected_1 = self.critic_local(torch.cat((states, actions), 1)) 40 | critic_expected_2 = self.critic_local_2(torch.cat((states, actions), 1)) 41 | 42 | critic_loss_1 = functional.mse_loss(critic_expected_1, critic_targets) 43 | critic_loss_2 = functional.mse_loss(critic_expected_2, critic_targets) 44 | 45 | self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1, self.hyperparameters["Critic"]["gradient_clipping_norm"]) 46 | self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2, 47 | self.hyperparameters["Critic"]["gradient_clipping_norm"]) 48 | 49 | self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"]) 50 | self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2, self.hyperparameters["Critic"]["tau"]) 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /exploration_strategies/Epsilon_Greedy_Exploration.py: -------------------------------------------------------------------------------- 1 | from exploration_strategies.Base_Exploration_Strategy import Base_Exploration_Strategy 2 | import numpy as np 3 | import random 4 | import torch 5 | 6 | class Epsilon_Greedy_Exploration(Base_Exploration_Strategy): 7 | """Implements an epsilon greedy exploration strategy""" 8 | def __init__(self, config): 9 | super().__init__(config) 10 | self.notified_that_exploration_turned_off = False 11 | if "exploration_cycle_episodes_length" in self.config.hyperparameters.keys(): 12 | print("Using a cyclical exploration strategy") 13 | self.exploration_cycle_episodes_length = self.config.hyperparameters["exploration_cycle_episodes_length"] 14 | else: 15 | 
self.exploration_cycle_episodes_length = None 16 | 17 | if "random_episodes_to_run" in self.config.hyperparameters.keys(): 18 | self.random_episodes_to_run = self.config.hyperparameters["random_episodes_to_run"] 19 | print("Running {} random episodes".format(self.random_episodes_to_run)) 20 | else: 21 | self.random_episodes_to_run = 0 22 | 23 | def perturb_action_for_exploration_purposes(self, action_info): 24 | """Perturbs the action of the agent to encourage exploration""" 25 | action_values = action_info["action_values"] 26 | turn_off_exploration = action_info["turn_off_exploration"] 27 | episode_number = action_info["episode_number"] 28 | if turn_off_exploration and not self.notified_that_exploration_turned_off: 29 | print(" ") 30 | print("Exploration has been turned OFF") 31 | print(" ") 32 | self.notified_that_exploration_turned_off = True 33 | epsilon = self.get_updated_epsilon_exploration(action_info) 34 | 35 | 36 | if (random.random() > epsilon or turn_off_exploration) and (episode_number >= self.random_episodes_to_run): 37 | return torch.argmax(action_values).item() 38 | return np.random.randint(0, action_values.shape[1]) 39 | 40 | def get_updated_epsilon_exploration(self, action_info, epsilon=1.0): 41 | """Gets the probability that we just pick a random action. This probability decays the more episodes we have seen""" 42 | episode_number = action_info["episode_number"] 43 | epsilon_decay_denominator = self.config.hyperparameters["epsilon_decay_rate_denominator"] 44 | 45 | if self.exploration_cycle_episodes_length is None: 46 | epsilon = epsilon / (1.0 + (episode_number / epsilon_decay_denominator)) 47 | else: 48 | epsilon = self.calculate_epsilon_with_cyclical_strategy(episode_number) 49 | return epsilon 50 | 51 | def calculate_epsilon_with_cyclical_strategy(self, episode_number): 52 | """Calculates epsilon according to a cyclical strategy""" 53 | max_epsilon = 0.5 54 | min_epsilon = 0.001 55 | increment = (max_epsilon - min_epsilon) / float(self.exploration_cycle_episodes_length / 2) 56 | cycle = [ix for ix in range(int(self.exploration_cycle_episodes_length / 2))] + [ix for ix in range( 57 | int(self.exploration_cycle_episodes_length / 2), 0, -1)] 58 | cycle_ix = episode_number % self.exploration_cycle_episodes_length 59 | epsilon = max_epsilon - cycle[cycle_ix] * increment 60 | return epsilon 61 | 62 | def add_exploration_rewards(self, reward_info): 63 | """Actions intrinsic rewards to encourage exploration""" 64 | return reward_info["reward"] 65 | 66 | def reset(self): 67 | """Resets the noise process""" 68 | pass 69 | -------------------------------------------------------------------------------- /agents/DQN_agents/Dueling_DDQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import optim 3 | from agents.Base_Agent import Base_Agent 4 | from agents.DQN_agents.DDQN import DDQN 5 | 6 | class Dueling_DDQN(DDQN): 7 | """A dueling double DQN agent as described in the paper http://proceedings.mlr.press/v48/wangf16.pdf""" 8 | agent_name = "Dueling DDQN" 9 | 10 | def __init__(self, config): 11 | DDQN.__init__(self, config) 12 | self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1) 13 | self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4) 14 | self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1) 15 | 
Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target) 16 | 17 | def pick_action(self, state=None): 18 | """Uses the local Q network and an epsilon greedy policy to pick an action""" 19 | # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add 20 | # a "fake" dimension to make it a mini-batch rather than a single observation 21 | if state is None: state = self.state 22 | state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) 23 | if len(state.shape) < 2: state = state.unsqueeze(0) 24 | self.q_network_local.eval() 25 | with torch.no_grad(): 26 | action_values = self.q_network_local(state) 27 | action_values = action_values[:, :-1] #because we treat the last output element as state-value and rest as advantages 28 | self.q_network_local.train() 29 | action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action_values": action_values, 30 | "turn_off_exploration": self.turn_off_exploration, 31 | "episode_number": self.episode_number}) 32 | return action 33 | 34 | def compute_q_values_for_next_states(self, next_states): 35 | """Computes the q_values for the next states which we will use to create the loss to train the Q network. Double DQN 36 | uses the local network to pick the maximum q_value action and then the target network to calculate the q_value. 37 | The reasoning behind this is that it will help stop the network from overestimating q values""" 38 | max_action_indexes = self.q_network_local(next_states)[:, :-1].detach().argmax(1) 39 | duelling_network_output = self.q_network_target(next_states) 40 | q_values = self.calculate_duelling_q_values(duelling_network_output) 41 | Q_targets_next = q_values.gather(1, max_action_indexes.unsqueeze(1)) 42 | return Q_targets_next 43 | 44 | def calculate_duelling_q_values(self, duelling_q_network_output): 45 | """Calculates the q_values using the duelling network architecture.
This is equation (9) in the paper 46 | referenced at the top of the class""" 47 | state_value = duelling_q_network_output[:, -1] 48 | avg_advantage = torch.mean(duelling_q_network_output[:, :-1], dim=1) 49 | q_values = state_value.unsqueeze(1) + (duelling_q_network_output[:, :-1] - avg_advantage.unsqueeze(1)) 50 | return q_values 51 | 52 | def compute_expected_q_values(self, states, actions): 53 | """Computes the expected q_values we will use to create the loss to train the Q network""" 54 | duelling_network_output = self.q_network_local(states) 55 | q_values = self.calculate_duelling_q_values(duelling_network_output) 56 | Q_expected = q_values.gather(1, actions.long()) 57 | return Q_expected 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /tests/Test_HRL.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import pytest 3 | 4 | 5 | from utilities.Utility_Functions import flatten_action_id_to_actions 6 | from utilities.data_structures.Config import Config 7 | 8 | config = Config() 9 | config.seed = 1 10 | config.environment = gym.make("Taxi-v2") 11 | config.env_parameters = {} 12 | config.num_episodes_to_run = 1000 13 | config.file_to_save_data_results = None 14 | config.file_to_save_results_graph = None 15 | config.show_solution_score = False 16 | config.visualise_individual_results = False 17 | config.visualise_overall_agent_results = True 18 | config.standard_deviation_results = 1.0 19 | config.runs_per_agent = 3 20 | config.use_GPU = False 21 | config.overwrite_existing_results_file = False 22 | config.randomise_random_seed = True 23 | config.save_model = False 24 | 25 | linear_hidden_units = [10, 5] 26 | learning_rate = 0.01 27 | buffer_size = 40000 28 | batch_size = 256 29 | batch_norm = False 30 | embedding_dimensionality = 15 31 | gradient_clipping_norm = 5 32 | update_every_n_steps = 1 33 | learning_iterations = 1 34 | epsilon_decay_rate_denominator = 400 35 | discount_rate = 0.99 36 | tau = 0.01 37 | sequitur_k = 10 38 | 39 | config.hyperparameters = { 40 | 41 | 42 | "linear_hidden_units": linear_hidden_units, 43 | "learning_rate": learning_rate, 44 | "buffer_size": buffer_size, 45 | "batch_size": batch_size, 46 | "final_layer_activation": "None", 47 | "columns_of_data_to_be_embedded": [0], 48 | "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 49 | "batch_norm": batch_norm, 50 | "gradient_clipping_norm": gradient_clipping_norm, 51 | "update_every_n_steps": update_every_n_steps, 52 | "epsilon_decay_rate_denominator": epsilon_decay_rate_denominator, 53 | "discount_rate": discount_rate, 54 | "learning_iterations": learning_iterations, 55 | "tau": tau, 56 | "sequitur_k": sequitur_k, 57 | "action_length_reward_bonus": 0.1, 58 | "episodes_to_run_with_no_exploration": 10, 59 | "pre_training_learning_iterations_multiplier": 0.1, 60 | "copy_over_hidden_layers": True, 61 | "use_global_list_of_best_performing_actions": True 62 | } 63 | 64 | 65 | # hrl = HRL(config) 66 | 67 | # def test_flatten_action_id_to_actions(): 68 | # """Tests flatten_action_id_to_actions""" 69 | # action_id_to_actions = {0: (0,), 1:(1,), 2:(0, 1), 3: (2, 1), 4:(2, 3)} 70 | # original_number_of_primitive_actions = 2 71 | # 72 | # 73 | # 74 | # flattened_action_id_to_actions = flatten_action_id_to_actions(action_id_to_actions, original_number_of_primitive_actions) 75 | # assert flattened_action_id_to_actions == {0: (0,), 1:(1,), 2:(0, 1), 3: (0, 1, 1), 
4:(0, 1, 0, 1, 1)}, flattened_action_id_to_actions 76 | # 77 | # action_id_to_actions = {0: (0,), 1:(1,), 2:(2,)} 78 | # original_number_of_primitive_actions = 3 79 | # flattened_action_id_to_actions = flatten_action_id_to_actions(action_id_to_actions, original_number_of_primitive_actions) 80 | # assert flattened_action_id_to_actions == action_id_to_actions 81 | # 82 | # with pytest.raises(AssertionError): 83 | # action_id_to_actions = {0: (0,), 1: (1,), 2: (2,)} 84 | # original_number_of_primitive_actions = 4 85 | # flattened_action_id_to_actions = flatten_action_id_to_actions(action_id_to_actions, 86 | # original_number_of_primitive_actions) 87 | # with pytest.raises(AssertionError): 88 | # action_id_to_actions = {0: (0,), 1: (1,), 2: (2, 2)} 89 | # original_number_of_primitive_actions = 3 90 | # flattened_action_id_to_actions = flatten_action_id_to_actions(action_id_to_actions, 91 | # original_number_of_primitive_actions) 92 | 93 | -------------------------------------------------------------------------------- /environments/Bit_Flipping_Environment.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import random 3 | import gym 4 | import numpy as np 5 | from gym import spaces 6 | from gym.utils import seeding 7 | 8 | class Bit_Flipping_Environment(gym.Env): 9 | environment_name = "Bit Flipping Game" 10 | 11 | def __init__(self, environment_dimension=20, deterministic=False): 12 | 13 | self.action_space = spaces.Discrete(environment_dimension) 14 | self.observation_space = spaces.Dict(dict( 15 | desired_goal=spaces.Box(0, 1, shape=(environment_dimension,), dtype='float32'), 16 | achieved_goal=spaces.Box(0, 1, shape=(environment_dimension,), dtype='float32'), 17 | observation=spaces.Box(0, 1, shape=(environment_dimension,), dtype='float32'), 18 | )) 19 | 20 | self.seed() 21 | self.reward_threshold = 0.0 22 | self.trials = 50 23 | self.max_episode_steps = environment_dimension 24 | self.id = "Bit Flipping" 25 | self.environment_dimension = environment_dimension 26 | self.reward_for_achieving_goal = self.environment_dimension 27 | self.step_reward_for_not_achieving_goal = -1 28 | 29 | self.deterministic = deterministic 30 | 31 | def seed(self, seed=None): 32 | self.np_random, seed = seeding.np_random(seed) 33 | return [seed] 34 | 35 | def reset(self): 36 | if not self.deterministic: 37 | self.desired_goal = self.randomly_pick_state_or_goal() 38 | self.state = self.randomly_pick_state_or_goal() 39 | else: 40 | self.desired_goal = [0 for _ in range(self.environment_dimension)] 41 | self.state = [1 for _ in range(self.environment_dimension)] 42 | self.state.extend(self.desired_goal) 43 | self.achieved_goal = self.state[:self.environment_dimension] 44 | self.step_count = 0 45 | return {"observation": np.array(self.state[:self.environment_dimension]), "desired_goal": np.array(self.desired_goal), 46 | "achieved_goal": np.array(self.achieved_goal)} 47 | 48 | def randomly_pick_state_or_goal(self): 49 | return [random.randint(0, 1) for _ in range(self.environment_dimension)] 50 | 51 | def step(self, action): 52 | """Conducts the discrete action chosen and updated next_state, reward and done""" 53 | if type(action) is np.ndarray: 54 | action = action[0] 55 | assert action <= self.environment_dimension + 1, "You picked an invalid action" 56 | self.step_count += 1 57 | if action != self.environment_dimension + 1: #otherwise no bit is flipped 58 | self.next_state = copy.copy(self.state) 59 | self.next_state[action] = (self.next_state[action] + 1) % 
2 60 | if self.goal_achieved(self.next_state): 61 | self.reward = self.reward_for_achieving_goal 62 | self.done = True 63 | else: 64 | self.reward = self.step_reward_for_not_achieving_goal 65 | if self.step_count >= self.environment_dimension: 66 | self.done = True 67 | else: 68 | self.done = False 69 | self.achieved_goal = self.next_state[:self.environment_dimension] 70 | self.state = self.next_state 71 | 72 | return {"observation": np.array(self.next_state[:self.environment_dimension]), 73 | "desired_goal": np.array(self.desired_goal), "achieved_goal": np.array(self.achieved_goal)}, self.reward, self.done, {} 74 | 75 | def goal_achieved(self, next_state): 76 | return next_state[:self.environment_dimension] == next_state[-self.environment_dimension:] 77 | 78 | def compute_reward(self, achieved_goal, desired_goal, info): 79 | """Computes the reward we would have got with this achieved goal and desired goal. Must be of this exact 80 | interface to fit with the open AI gym specifications""" 81 | if (achieved_goal == desired_goal).all(): 82 | reward = self.reward_for_achieving_goal 83 | else: 84 | reward = self.step_reward_for_not_achieving_goal 85 | return reward 86 | -------------------------------------------------------------------------------- /tests/Test_Action_Balanced_Replay_Buffer.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import Counter 3 | 4 | import pytest 5 | 6 | from utilities.data_structures.Action_Balanced_Replay_Buffer import Action_Balanced_Replay_Buffer 7 | 8 | def test_add_experience(): 9 | """Tests that add_experience works correctly""" 10 | buffer = Action_Balanced_Replay_Buffer(6, 4, 0, 3) 11 | 12 | rewards = [0 for _ in range(4)] 13 | next_states = [0 for _ in range(4)] 14 | states = [0 for _ in range(4)] 15 | dones = [0 for _ in range(4)] 16 | actions = [0, 1, 2, 0] 17 | 18 | for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones): 19 | buffer.add_experience(state, action, reward, next_state, done) 20 | 21 | assert len(buffer.memories[0]) == 2 22 | assert len(buffer.memories[1]) == 1 23 | assert len(buffer.memories[2]) == 1 24 | 25 | buffer.add_experience(99, 0, 0, 0, 0) 26 | assert len(buffer.memories[0]) == 2 27 | assert buffer.memories[0][1].state == 99 28 | 29 | buffer = Action_Balanced_Replay_Buffer(6, 4, 0, 3) 30 | buffer.add_experience(states, actions, rewards, next_states, dones) 31 | assert len(buffer.memories[0]) == 2 32 | assert len(buffer.memories[1]) == 1 33 | assert len(buffer.memories[2]) == 1 34 | 35 | buffer.add_experience(99, 0, 0, 0, 0) 36 | assert len(buffer.memories[0]) == 2 37 | assert buffer.memories[0][1].state == 99 38 | 39 | def test_add_experience_throws_error(): 40 | """Tests that add_experience works correctly""" 41 | buffer = Action_Balanced_Replay_Buffer(20, 4, 0, 3) 42 | with pytest.raises(KeyError): 43 | buffer.add_experience(3, 99, 1, 0, 0) 44 | buffer.sample() 45 | 46 | buffer = Action_Balanced_Replay_Buffer(20, 4, 0, 3) 47 | buffer.add_experience(3, 2, 1, 0, 0) 48 | 49 | with pytest.raises(AssertionError): 50 | buffer.sample() 51 | 52 | def test_sample_correctly(): 53 | """Tests that sample works correctly""" 54 | buffer = Action_Balanced_Replay_Buffer(20, 4, 0, 3) 55 | buffer.add_experience(3, 2, 1, 0, 0) 56 | buffer.add_experience(2, 0, 1, 0, 0) 57 | buffer.add_experience(1, 1, 1, 0, 0) 58 | states, actions, rewards, next_states, dones = buffer.sample() 59 | 60 | for var in [states, actions, rewards, 
next_states, dones]: 61 | assert len(var) == 4 62 | 63 | num_occurances = 0 64 | tries = 50 65 | 66 | for random_seed in range(tries): 67 | buffer = Action_Balanced_Replay_Buffer(20, 4, random_seed, 3) 68 | buffer.add_experience(3, 2, 1, 0, 0) 69 | buffer.add_experience(2, 0, 1, 0, 0) 70 | buffer.add_experience(1, 1, 1, 0, 0) 71 | states, actions, rewards, next_states, dones = buffer.sample() 72 | if states[2] == 3.0: num_occurances += 1 73 | print(states) 74 | assert num_occurances < tries/2 75 | assert num_occurances > tries/5 76 | 77 | def test_sample_statistics_correct(): 78 | """Tests that sampled experiences correspond to expected statistics""" 79 | tries = 5 80 | for random_seed in range(tries): 81 | for num_actions in range(1, 7): 82 | for buffer_size in [random.randint(55, 9999) for _ in range(10)]: 83 | for batch_size in [random.randint(8, 200) for _ in range(10)]: 84 | buffer = Action_Balanced_Replay_Buffer(buffer_size, batch_size, random.randint(0, 2000000), num_actions) 85 | for _ in range(500): 86 | random_action = random.randint(0, num_actions - 1) 87 | buffer.add_experience(1, random_action, 1, 0, 0) 88 | states, actions, rewards, next_states, dones = buffer.sample() 89 | actions = [action.item() for action in actions] 90 | assert len(actions) == batch_size 91 | count = Counter(actions) 92 | action_count = count[0] 93 | for action in range(num_actions): 94 | assert abs(count[action] - action_count) < 2, print(count[action]) 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /tests/Test_Prioritised_Replay_Buffer.py: -------------------------------------------------------------------------------- 1 | from utilities.data_structures.Prioritised_Replay_Buffer import Prioritised_Replay_Buffer 2 | import numpy as np 3 | import random 4 | 5 | hyperparameters = { 6 | "alpha_prioritised_replay": 0.5, 7 | "beta_prioritised_replay": 0.5, 8 | "incremental_td_error": 0.0, 9 | "buffer_size": 4, 10 | "batch_size": 3 11 | } 12 | 13 | 14 | def test_prioritised_replay_buffer(): 15 | 16 | buffer = Prioritised_Replay_Buffer(hyperparameters) 17 | buffer.add_experience(100, 1, 2, 3, 4, 5) 18 | 19 | assert buffer.deque[0].key == 100.0**hyperparameters["alpha_prioritised_replay"] 20 | assert buffer.deque[0].value == (1, 2, 3, 4, 5) 21 | assert buffer.deque[0].heap_index == 1 22 | assert buffer.heap[1].key == 100.0**hyperparameters["alpha_prioritised_replay"] 23 | assert buffer.heap[1].value == (1, 2, 3, 4, 5) 24 | 25 | buffer.add_experience(99, 1, 2, 3, 4, 5) 26 | buffer.add_experience(98, 1, 2, 3, 4, 5) 27 | 28 | assert buffer.deque[0].key == 100.0**hyperparameters["alpha_prioritised_replay"] 29 | assert buffer.deque[0].value == (1, 2, 3, 4, 5) 30 | assert buffer.deque[0].heap_index == 1 31 | assert buffer.heap[1].key == 100.0**hyperparameters["alpha_prioritised_replay"] 32 | assert buffer.heap[1].value == (1, 2, 3, 4, 5) 33 | 34 | assert buffer.deque[1].key == 99.0**hyperparameters["alpha_prioritised_replay"] 35 | assert buffer.deque[1].value == (1, 2, 3, 4, 5) 36 | assert buffer.deque[1].heap_index == 2 37 | assert buffer.heap[2].key == 99.0**hyperparameters["alpha_prioritised_replay"] 38 | assert buffer.heap[2].value == (1, 2, 3, 4, 5) 39 | 40 | assert buffer.deque[2].key == 98.0**hyperparameters["alpha_prioritised_replay"] 41 | assert buffer.deque[2].value == (1, 2, 3, 4, 5) 42 | assert buffer.deque[2].heap_index == 3 43 | assert buffer.heap[3].key == 98.0**hyperparameters["alpha_prioritised_replay"] 44 | assert buffer.heap[3].value == (1, 
2, 3, 4, 5) 45 | 46 | buffer.add_experience(105, 1, 2, 3, 4, 5) 47 | 48 | assert buffer.deque[3].key == 105.0**hyperparameters["alpha_prioritised_replay"] 49 | assert buffer.deque[3].value == (1, 2, 3, 4, 5) 50 | assert buffer.deque[3].heap_index == 1 51 | assert buffer.heap[1].key == 105.0**hyperparameters["alpha_prioritised_replay"] 52 | assert buffer.heap[1].value == (1, 2, 3, 4, 5) 53 | assert buffer.heap[2].key == 100.0 ** hyperparameters["alpha_prioritised_replay"] 54 | 55 | buffer.add_experience(101, 1, 24, 3, 4, 5) 56 | 57 | assert buffer.deque[0].key == 101.0 ** hyperparameters["alpha_prioritised_replay"] 58 | assert buffer.deque[0].value == (1, 24, 3, 4, 5) 59 | assert buffer.deque[0].heap_index == 2 60 | assert buffer.heap[2].key == 101.0 ** hyperparameters["alpha_prioritised_replay"] 61 | assert buffer.heap[2].value == (1, 24, 3, 4, 5) 62 | 63 | 64 | def test_heap_always_keeps_max_element_at_top(): 65 | hyperparameters["buffer_size"] = 200 66 | for _ in range(100): 67 | buffer = Prioritised_Replay_Buffer(hyperparameters) 68 | elements_added = [] 69 | for ix in range(1, 100): 70 | element = random.random() 71 | elements_added.append(element) 72 | buffer.add_experience(element, 0, 0, 0, 0, 0) 73 | 74 | max_key = np.max(elements_added)** hyperparameters["alpha_prioritised_replay"] 75 | assert round(buffer.give_max_td_error(), 8) == round(max_key, 8), "{}".format(elements_added) 76 | 77 | def test_give_sum_of_elements_is_always_correct(): 78 | hyperparameters["buffer_size"] = 200 79 | for _ in range(100): 80 | buffer = Prioritised_Replay_Buffer(hyperparameters) 81 | elements_added = [] 82 | for ix in range(1, 100): 83 | element = random.random() 84 | elements_added.append((abs(element) + hyperparameters["incremental_td_error"]) ** hyperparameters["alpha_prioritised_replay"]) 85 | buffer.add_experience(element, 0, 0, 0, 0, 0) 86 | 87 | sum_key = np.sum(elements_added) 88 | assert round(buffer.give_adapted_sum_of_td_errors(), 8) == round(sum_key, 8), "{}".format(elements_added) 89 | -------------------------------------------------------------------------------- /environments/ant_environments/maze_env_utils.py: -------------------------------------------------------------------------------- 1 | 2 | # NOTE THIS CODE IS TAKEN FROM https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 3 | # and is not my code. 
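# Illustrative note on the layouts returned by construct_maze below: 1 marks a wall cell,
# 0 a free cell, 'r' the robot's start cell, and the Move.* constants mark movable blocks
# (-1 appears only in the 'Fall' layout). A minimal sketch for inspecting a layout,
# assuming only this module (not a tested snippet):
#
#     structure = construct_maze(maze_id='Maze')
#     for row in structure:
#         print(row)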
4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | import numpy as np 12 | import math 13 | 14 | 15 | class Move(object): 16 | X = 11 17 | Y = 12 18 | Z = 13 19 | XY = 14 20 | XZ = 15 21 | YZ = 16 22 | XYZ = 17 23 | SpinXY = 18 24 | 25 | 26 | def can_move_x(movable): 27 | return movable in [Move.X, Move.XY, Move.XZ, Move.XYZ, 28 | Move.SpinXY] 29 | 30 | 31 | def can_move_y(movable): 32 | return movable in [Move.Y, Move.XY, Move.YZ, Move.XYZ, 33 | Move.SpinXY] 34 | 35 | 36 | def can_move_z(movable): 37 | return movable in [Move.Z, Move.XZ, Move.YZ, Move.XYZ] 38 | 39 | 40 | def can_spin(movable): 41 | return movable in [Move.SpinXY] 42 | 43 | 44 | def can_move(movable): 45 | return can_move_x(movable) or can_move_y(movable) or can_move_z(movable) 46 | 47 | 48 | def construct_maze(maze_id='Maze'): 49 | if maze_id == 'Maze': 50 | structure = [ 51 | [1, 1, 1, 1, 1], 52 | [1, 'r', 0, 0, 1], 53 | [1, 1, 1, 0, 1], 54 | [1, 0, 0, 0, 1], 55 | [1, 1, 1, 1, 1], 56 | ] 57 | elif maze_id == 'Push': 58 | structure = [ 59 | [1, 1, 1, 1, 1], 60 | [1, 0, 'r', 1, 1], 61 | [1, 0, Move.XY, 0, 1], 62 | [1, 1, 0, 1, 1], 63 | [1, 1, 1, 1, 1], 64 | ] 65 | elif maze_id == 'Fall': 66 | structure = [ 67 | [1, 1, 1, 1], 68 | [1, 'r', 0, 1], 69 | [1, 0, Move.YZ, 1], 70 | [1, -1, -1, 1], 71 | [1, 0, 0, 1], 72 | [1, 1, 1, 1], 73 | ] 74 | elif maze_id == 'Block': 75 | O = 'r' 76 | structure = [ 77 | [1, 1, 1, 1, 1], 78 | [1, O, 0, 0, 1], 79 | [1, 0, 0, 0, 1], 80 | [1, 0, 0, 0, 1], 81 | [1, 1, 1, 1, 1], 82 | ] 83 | elif maze_id == 'BlockMaze': 84 | O = 'r' 85 | structure = [ 86 | [1, 1, 1, 1], 87 | [1, O, 0, 1], 88 | [1, 1, 0, 1], 89 | [1, 0, 0, 1], 90 | [1, 1, 1, 1], 91 | ] 92 | else: 93 | raise NotImplementedError('The provided MazeId %s is not recognized' % maze_id) 94 | 95 | return structure 96 | 97 | 98 | def line_intersect(pt1, pt2, ptA, ptB): 99 | """ 100 | Taken from https://www.cs.hmc.edu/ACM/lectures/intersections.html 101 | 102 | this returns the intersection of Line(pt1,pt2) and Line(ptA,ptB) 103 | """ 104 | 105 | DET_TOLERANCE = 0.00000001 106 | 107 | # the first line is pt1 + r*(pt2-pt1) 108 | # in component form: 109 | x1, y1 = pt1 110 | x2, y2 = pt2 111 | dx1 = x2 - x1 112 | dy1 = y2 - y1 113 | 114 | # the second line is ptA + s*(ptB-ptA) 115 | x, y = ptA 116 | xB, yB = ptB 117 | dx = xB - x 118 | dy = yB - y 119 | 120 | DET = (-dx1 * dy + dy1 * dx) 121 | 122 | if math.fabs(DET) < DET_TOLERANCE: return (0, 0, 0, 0, 0) 123 | 124 | # now, the determinant should be OK 125 | DETinv = 1.0 / DET 126 | 127 | # find the scalar amount along the "self" segment 128 | r = DETinv * (-dy * (x - x1) + dx * (y - y1)) 129 | 130 | # find the scalar amount along the input line 131 | s = DETinv * (-dy1 * (x - x1) + dx1 * (y - y1)) 132 | 133 | # return the average of the two descriptions 134 | xi = (x1 + r * dx1 + x + s * dx) / 2.0 135 | yi = (y1 + r * dy1 + y + s * dy) / 2.0 136 | return (xi, yi, 1, r, s) 137 | 138 | 139 | def ray_segment_intersect(ray, segment): 140 | """ 141 | Check if the ray originated from (x, y) with direction theta intersects the line segment (x1, y1) -- (x2, y2), 142 | and return the intersection point if there is one 143 | """ 144 | (x, y), theta = ray 145 | # (x1, y1), (x2, y2) = segment 146 | pt1 = (x, y) 147 | len = 1 148 | pt2 = (x + len * math.cos(theta), y + len * math.sin(theta)) 149 | xo, yo, valid, r, s = line_intersect(pt1, pt2, *segment) 150 | if valid and r >= 0 and 0 <= s <= 1: 151 | return (xo, yo) 152 | return None 153 | 154 | 155 | def point_distance(p1, p2): 156 | x1, y1 = p1 157 | x2, y2 = p2 158 | 
return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5 159 | -------------------------------------------------------------------------------- /utilities/Parallel_Experience_Generator.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import sys 4 | from contextlib import closing 5 | # 6 | # from pathos.multiprocessing import ProcessingPool as Pool 7 | 8 | from torch.multiprocessing import Pool 9 | from random import randint 10 | 11 | from utilities.OU_Noise import OU_Noise 12 | from utilities.Utility_Functions import create_actor_distribution 13 | 14 | class Parallel_Experience_Generator(object): 15 | """ Plays n episode in parallel using a fixed agent. Only works for PPO or DDPG type agents at the moment, not Q-learning agents""" 16 | def __init__(self, environment, policy, seed, hyperparameters, action_size, use_GPU=False, action_choice_output_columns=None): 17 | self.use_GPU = use_GPU 18 | self.environment = environment 19 | self.action_types = "DISCRETE" if self.environment.action_space.dtype == int else "CONTINUOUS" 20 | self.action_size = action_size 21 | self.policy = policy 22 | self.action_choice_output_columns = action_choice_output_columns 23 | self.hyperparameters = hyperparameters 24 | if self.action_types == "CONTINUOUS": self.noise = OU_Noise(self.action_size, seed, self.hyperparameters["mu"], 25 | self.hyperparameters["theta"], self.hyperparameters["sigma"]) 26 | 27 | 28 | def play_n_episodes(self, n, exploration_epsilon=None): 29 | """Plays n episodes in parallel using the fixed policy and returns the data""" 30 | self.exploration_epsilon = exploration_epsilon 31 | with closing(Pool(processes=n)) as pool: 32 | results = pool.map(self, range(n)) 33 | pool.terminate() 34 | states_for_all_episodes = [episode[0] for episode in results] 35 | actions_for_all_episodes = [episode[1] for episode in results] 36 | rewards_for_all_episodes = [episode[2] for episode in results] 37 | return states_for_all_episodes, actions_for_all_episodes, rewards_for_all_episodes 38 | 39 | def __call__(self, n): 40 | exploration = max(0.0, random.uniform(self.exploration_epsilon / 3.0, self.exploration_epsilon * 3.0)) 41 | return self.play_1_episode(exploration) 42 | 43 | def play_1_episode(self, epsilon_exploration): 44 | """Plays 1 episode using the fixed policy and returns the data""" 45 | state = self.reset_game() 46 | done = False 47 | episode_states = [] 48 | episode_actions = [] 49 | episode_rewards = [] 50 | while not done: 51 | action = self.pick_action(self.policy, state, epsilon_exploration) 52 | next_state, reward, done, _ = self.environment.step(action) 53 | if self.hyperparameters["clip_rewards"]: reward = max(min(reward, 1.0), -1.0) 54 | episode_states.append(state) 55 | episode_actions.append(action) 56 | episode_rewards.append(reward) 57 | state = next_state 58 | return episode_states, episode_actions, episode_rewards 59 | 60 | def reset_game(self): 61 | """Resets the game environment so it is ready to play a new episode""" 62 | seed = randint(0, sys.maxsize) 63 | torch.manual_seed(seed) # Need to do this otherwise each worker generates same experience 64 | state = self.environment.reset() 65 | if self.action_types == "CONTINUOUS": self.noise.reset() 66 | return state 67 | 68 | def pick_action(self, policy, state, epsilon_exploration=None): 69 | """Picks an action using the policy""" 70 | if self.action_types == "DISCRETE": 71 | if random.random() <= epsilon_exploration: 72 | action = random.randint(0, self.action_size 
- 1) 73 | return action 74 | 75 | state = torch.from_numpy(state).float().unsqueeze(0) 76 | actor_output = policy.forward(state) 77 | if self.action_choice_output_columns is not None: 78 | actor_output = actor_output[:, self.action_choice_output_columns] 79 | action_distribution = create_actor_distribution(self.action_types, actor_output, self.action_size) 80 | action = action_distribution.sample().cpu() 81 | 82 | if self.action_types == "CONTINUOUS": action += torch.Tensor(self.noise.sample()) 83 | else: action = action.item() 84 | return action -------------------------------------------------------------------------------- /environments/ant_environments/ant.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # NOTE THIS CODE IS TAKEN FROM https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 4 | # and is not my code. 5 | 6 | 7 | 8 | 9 | """Wrapper for creating the ant environment in gym_mujoco.""" 10 | 11 | import math 12 | import numpy as np 13 | from gym import utils 14 | from gym.envs.mujoco import mujoco_env 15 | 16 | 17 | def q_inv(a): 18 | return [a[0], -a[1], -a[2], -a[3]] 19 | 20 | 21 | def q_mult(a, b): # multiply two quaternion 22 | w = a[0] * b[0] - a[1] * b[1] - a[2] * b[2] - a[3] * b[3] 23 | i = a[0] * b[1] + a[1] * b[0] + a[2] * b[3] - a[3] * b[2] 24 | j = a[0] * b[2] - a[1] * b[3] + a[2] * b[0] + a[3] * b[1] 25 | k = a[0] * b[3] + a[1] * b[2] - a[2] * b[1] + a[3] * b[0] 26 | return [w, i, j, k] 27 | 28 | 29 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): 30 | FILE = "ant.xml" 31 | ORI_IND = 3 32 | 33 | def __init__(self, file_path=None, expose_all_qpos=True, 34 | expose_body_coms=None, expose_body_comvels=None): 35 | self._expose_all_qpos = expose_all_qpos 36 | self._expose_body_coms = expose_body_coms 37 | self._expose_body_comvels = expose_body_comvels 38 | self._body_com_indices = {} 39 | self._body_comvel_indices = {} 40 | 41 | 42 | mujoco_env.MujocoEnv.__init__(self, file_path, 5) 43 | utils.EzPickle.__init__(self) 44 | 45 | @property 46 | def physics(self): 47 | return self.model 48 | 49 | def _step(self, a): 50 | return self.step(a) 51 | 52 | def step(self, a): 53 | xposbefore = self.get_body_com("torso")[0] 54 | self.do_simulation(a, self.frame_skip) 55 | xposafter = self.get_body_com("torso")[0] 56 | forward_reward = (xposafter - xposbefore) / self.dt 57 | ctrl_cost = .5 * np.square(a).sum() 58 | survive_reward = 1.0 59 | reward = forward_reward - ctrl_cost + survive_reward 60 | state = self.state_vector() 61 | done = False 62 | ob = self._get_obs() 63 | return ob, reward, done, dict( 64 | reward_forward=forward_reward, 65 | reward_ctrl=-ctrl_cost, 66 | reward_survive=survive_reward) 67 | 68 | def _get_obs(self): 69 | # No cfrc observation 70 | if self._expose_all_qpos: 71 | obs = np.concatenate([ 72 | self.data.qpos.flat[:15], # Ensures only ant obs. 
73 | self.data.qvel.flat[:14], 74 | ]) 75 | else: 76 | obs = np.concatenate([ 77 | self.data.qpos.flat[2:15], 78 | self.data.qvel.flat[:14], 79 | ]) 80 | 81 | if self._expose_body_coms is not None: 82 | for name in self._expose_body_coms: 83 | com = self.get_body_com(name) 84 | if name not in self._body_com_indices: 85 | indices = range(len(obs), len(obs) + len(com)) 86 | self._body_com_indices[name] = indices 87 | obs = np.concatenate([obs, com]) 88 | 89 | if self._expose_body_comvels is not None: 90 | for name in self._expose_body_comvels: 91 | comvel = self.get_body_comvel(name) 92 | if name not in self._body_comvel_indices: 93 | indices = range(len(obs), len(obs) + len(comvel)) 94 | self._body_comvel_indices[name] = indices 95 | obs = np.concatenate([obs, comvel]) 96 | return obs 97 | 98 | def reset_model(self): 99 | qpos = self.init_qpos + self.np_random.uniform( 100 | size=self.model.nq, low=-.1, high=.1) 101 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 102 | 103 | # Set everything other than ant to original position and 0 velocity. 104 | qpos[15:] = self.init_qpos[15:] 105 | qvel[14:] = 0. 106 | self.set_state(qpos, qvel) 107 | return self._get_obs() 108 | 109 | def viewer_setup(self): 110 | self.viewer.cam.distance = self.model.stat.extent * 0.5 111 | 112 | def get_ori(self): 113 | ori = [0, 1, 0, 0] 114 | rot = self.data.qpos[self.__class__.ORI_IND:self.__class__.ORI_IND + 4] # take the quaternion 115 | ori = q_mult(q_mult(rot, ori), q_inv(rot))[1:3] # project onto x-y plane 116 | ori = math.atan2(ori[1], ori[0]) 117 | return ori 118 | 119 | def set_xy(self, xy): 120 | qpos = np.copy(self.data.qpos) 121 | qpos[0] = xy[0] 122 | qpos[1] = xy[1] 123 | 124 | qvel = self.data.qvel 125 | self.set_state(qpos, qvel) 126 | 127 | def get_xy(self): 128 | return self.data.qpos[:2] 129 | -------------------------------------------------------------------------------- /agents/policy_gradient_agents/REINFORCE.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.optim as optim 4 | from torch.distributions import Categorical 5 | from agents.Base_Agent import Base_Agent 6 | 7 | class REINFORCE(Base_Agent): 8 | agent_name = "REINFORCE" 9 | def __init__(self, config): 10 | Base_Agent.__init__(self, config) 11 | self.policy = self.create_NN(input_dim=self.state_size, output_dim=self.action_size) 12 | self.optimizer = optim.Adam(self.policy.parameters(), lr=self.hyperparameters["learning_rate"]) 13 | self.episode_rewards = [] 14 | self.episode_log_probabilities = [] 15 | 16 | def reset_game(self): 17 | """Resets the game information so we are ready to play a new episode""" 18 | self.state = self.environment.reset_environment() 19 | self.next_state = None 20 | self.action = None 21 | self.reward = None 22 | self.done = False 23 | self.total_episode_score_so_far = 0 24 | self.episode_rewards = [] 25 | self.episode_log_probabilities = [] 26 | self.episode_step_number = 0 27 | 28 | def step(self): 29 | """Runs a step within a game including a learning step if required""" 30 | while not self.done: 31 | self.pick_and_conduct_action_and_save_log_probabilities() 32 | self.update_next_state_reward_done_and_score() 33 | self.store_reward() 34 | if self.time_to_learn(): 35 | self.actor_learn() 36 | self.state = self.next_state #this is to set the state for the next iteration 37 | self.episode_step_number += 1 38 | self.episode_number += 1 39 | 40 | def 
pick_and_conduct_action_and_save_log_probabilities(self): 41 | """Picks and then conducts actions. Then saves the log probabilities of the actions it conducted to be used for 42 | learning later""" 43 | action, log_probabilities = self.pick_action_and_get_log_probabilities() 44 | self.store_log_probabilities(log_probabilities) 45 | self.store_action(action) 46 | self.conduct_action() 47 | 48 | def pick_action_and_get_log_probabilities(self): 49 | """Picks actions and then calculates the log probabilities of the actions it picked given the policy""" 50 | # PyTorch only accepts mini-batches and not individual observations so we have to add 51 | # a "fake" dimension to our observation using unsqueeze 52 | state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device) 53 | action_probabilities = self.policy.forward(state).cpu() 54 | action_distribution = Categorical(action_probabilities) # this creates a distribution to sample from 55 | action = action_distribution.sample() 56 | return action.item(), action_distribution.log_prob(action) 57 | 58 | def store_log_probabilities(self, log_probabilities): 59 | """Stores the log probabilities of picked actions to be used for learning later""" 60 | self.episode_log_probabilities.append(log_probabilities) 61 | 62 | def store_action(self, action): 63 | """Stores the action picked""" 64 | self.action = action 65 | 66 | def store_reward(self): 67 | """Stores the reward picked""" 68 | self.episode_rewards.append(self.reward) 69 | 70 | def actor_learn(self): 71 | """Runs a learning iteration for the policy""" 72 | total_discounted_reward = self.calculate_episode_discounted_reward() 73 | policy_loss = self.calculate_policy_loss_on_episode(total_discounted_reward) 74 | self.optimizer.zero_grad() 75 | policy_loss.backward() 76 | self.optimizer.step() 77 | 78 | def calculate_episode_discounted_reward(self): 79 | """Calculates the cumulative discounted return for the episode""" 80 | discounts = self.hyperparameters["discount_rate"] ** np.arange(len(self.episode_rewards)) 81 | total_discounted_reward = np.dot(discounts, self.episode_rewards) 82 | return total_discounted_reward 83 | 84 | def calculate_policy_loss_on_episode(self, total_discounted_reward): 85 | """Calculates the loss from an episode""" 86 | policy_loss = [] 87 | for log_prob in self.episode_log_probabilities: 88 | policy_loss.append(-log_prob * total_discounted_reward) 89 | policy_loss = torch.cat(policy_loss).sum() # We need to add up the losses across the mini-batch to get 1 overall loss 90 | return policy_loss 91 | 92 | def time_to_learn(self): 93 | """Tells us whether it is time for the algorithm to learn. 
With REINFORCE we only learn at the end of every 94 | episode so this just returns whether the episode is over""" 95 | return self.done 96 | -------------------------------------------------------------------------------- /results/Hopper.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from agents.policy_gradient_agents.PPO import PPO 3 | from agents.actor_critic_agents.DDPG import DDPG 4 | from agents.actor_critic_agents.SAC import SAC 5 | from agents.actor_critic_agents.TD3 import TD3 6 | from agents.Trainer import Trainer 7 | from agents.hierarchical_agents.DIAYN import DIAYN 8 | from utilities.data_structures.Config import Config 9 | 10 | 11 | config = Config() 12 | config.seed = 1 13 | config.environment = gym.make("Hopper-v2") 14 | config.num_episodes_to_run = 1000 15 | config.file_to_save_data_results = "data_and_graphs/Hopper_Results_Data.pkl" 16 | config.file_to_save_results_graph = "data_and_graphs/Hopper_Results_Graph.png" 17 | config.show_solution_score = False 18 | config.visualise_individual_results = False 19 | config.visualise_overall_agent_results = True 20 | config.standard_deviation_results = 1.0 21 | config.runs_per_agent = 3 22 | config.use_GPU = False 23 | config.overwrite_existing_results_file = False 24 | config.randomise_random_seed = True 25 | config.save_model = False 26 | 27 | 28 | actor_critic_agent_hyperparameters = { 29 | "Actor": { 30 | "learning_rate": 0.0003, 31 | "linear_hidden_units": [64, 64], 32 | "final_layer_activation": None, 33 | "batch_norm": False, 34 | "tau": 0.005, 35 | "gradient_clipping_norm": 5, 36 | "initialiser": "Xavier" 37 | }, 38 | 39 | "Critic": { 40 | "learning_rate": 0.0003, 41 | "linear_hidden_units": [64, 64], 42 | "final_layer_activation": None, 43 | "batch_norm": False, 44 | "buffer_size": 1000000, 45 | "tau": 0.005, 46 | "gradient_clipping_norm": 5, 47 | "initialiser": "Xavier" 48 | }, 49 | 50 | "min_steps_before_learning": 400, 51 | "batch_size": 256, 52 | "discount_rate": 0.99, 53 | "mu": 0.0, #for O-H noise 54 | "theta": 0.15, #for O-H noise 55 | "sigma": 0.25, #for O-H noise 56 | "action_noise_std": 0.2, # for TD3 57 | "action_noise_clipping_range": 0.5, # for TD3 58 | "update_every_n_steps": 1, 59 | "learning_updates_per_learning_session": 1, 60 | "automatically_tune_entropy_hyperparameter": True, 61 | "entropy_term_weight": None, 62 | "add_extra_noise": False, 63 | "do_evaluation_iterations": True, 64 | "clip_rewards": False 65 | } 66 | 67 | dqn_agent_hyperparameters = { 68 | "learning_rate": 0.005, 69 | "batch_size": 128, 70 | "buffer_size": 40000, 71 | "epsilon": 1.0, 72 | "epsilon_decay_rate_denominator": 3, 73 | "discount_rate": 0.99, 74 | "tau": 0.01, 75 | "alpha_prioritised_replay": 0.6, 76 | "beta_prioritised_replay": 0.1, 77 | "incremental_td_error": 1e-8, 78 | "update_every_n_steps": 3, 79 | "linear_hidden_units": [30, 15], 80 | "final_layer_activation": "None", 81 | "batch_norm": False, 82 | "gradient_clipping_norm": 5, 83 | "clip_rewards": False 84 | } 85 | 86 | 87 | manager_hyperparameters = dqn_agent_hyperparameters 88 | manager_hyperparameters.update({"timesteps_to_give_up_control_for": 5}) 89 | 90 | 91 | config.hyperparameters = { 92 | "Policy_Gradient_Agents": { 93 | "learning_rate": 0.05, 94 | "linear_hidden_units": [30, 15], 95 | "final_layer_activation": "TANH", 96 | "learning_iterations_per_round": 10, 97 | "discount_rate": 0.9, 98 | "batch_norm": False, 99 | "clip_epsilon": 0.2, 100 | "episodes_per_learning_round": 10, 101 | "normalise_rewards": 
True, 102 | "gradient_clipping_norm": 5, 103 | "mu": 0.0, 104 | "theta": 0.15, 105 | "sigma": 0.2, 106 | "epsilon_decay_rate_denominator": 1, 107 | "clip_rewards": False 108 | }, 109 | 110 | "Actor_Critic_Agents": actor_critic_agent_hyperparameters, 111 | "DIAYN": { 112 | "DISCRIMINATOR": { 113 | "learning_rate": 0.001, 114 | "linear_hidden_units": [32, 32], 115 | "final_layer_activation": None, 116 | "gradient_clipping_norm": 5 117 | 118 | }, 119 | "AGENT": actor_critic_agent_hyperparameters, 120 | "MANAGER": manager_hyperparameters, 121 | "num_skills": 10, 122 | "num_unsupservised_episodes": 500 123 | } 124 | } 125 | 126 | 127 | if __name__ == "__main__": 128 | AGENTS = [SAC, DIAYN] #SAC] #, DDPG, PPO, TD3] ] #, 129 | trainer = Trainer(config, AGENTS) 130 | trainer.run_games_for_agents() 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /results/Walker.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from agents.policy_gradient_agents.PPO import PPO 3 | from agents.actor_critic_agents.DDPG import DDPG 4 | from agents.actor_critic_agents.SAC import SAC 5 | from agents.actor_critic_agents.TD3 import TD3 6 | from agents.Trainer import Trainer 7 | from agents.hierarchical_agents.DIAYN import DIAYN 8 | from utilities.data_structures.Config import Config 9 | 10 | 11 | config = Config() 12 | config.seed = 1 13 | config.environment = gym.make("Walker2d-v2") 14 | config.num_episodes_to_run = 400 15 | config.file_to_save_data_results = "data_and_graphs/Walker_Results_Data.pkl" 16 | config.file_to_save_results_graph = "data_and_graphs/Walker_Results_Graph.png" 17 | config.show_solution_score = False 18 | config.visualise_individual_results = False 19 | config.visualise_overall_agent_results = True 20 | config.standard_deviation_results = 1.0 21 | config.runs_per_agent = 3 22 | config.use_GPU = False 23 | config.overwrite_existing_results_file = False 24 | config.randomise_random_seed = True 25 | config.save_model = False 26 | 27 | clip_rewards = False  # whether rewards are clipped to [-1, 1]; referenced by the hyperparameter dictionaries below 28 | actor_critic_agent_hyperparameters = { 29 | "Actor": { 30 | "learning_rate": 0.0003, 31 | "linear_hidden_units": [64, 64], 32 | "final_layer_activation": None, 33 | "batch_norm": False, 34 | "tau": 0.005, 35 | "gradient_clipping_norm": 5, 36 | "initialiser": "Xavier" 37 | }, 38 | 39 | "Critic": { 40 | "learning_rate": 0.0003, 41 | "linear_hidden_units": [64, 64], 42 | "final_layer_activation": None, 43 | "batch_norm": False, 44 | "buffer_size": 1000000, 45 | "tau": 0.005, 46 | "gradient_clipping_norm": 5, 47 | "initialiser": "Xavier" 48 | }, 49 | 50 | "min_steps_before_learning": 400, 51 | "batch_size": 256, 52 | "discount_rate": 0.99, 53 | "mu": 0.0, #for O-H noise 54 | "theta": 0.15, #for O-H noise 55 | "sigma": 0.25, #for O-H noise 56 | "action_noise_std": 0.2, # for TD3 57 | "action_noise_clipping_range": 0.5, # for TD3 58 | "update_every_n_steps": 1, 59 | "learning_updates_per_learning_session": 1, 60 | "automatically_tune_entropy_hyperparameter": True, 61 | "entropy_term_weight": None, 62 | "add_extra_noise": False, 63 | "do_evaluation_iterations": True, 64 | "clip_rewards": clip_rewards 65 | } 66 | 67 | dqn_agent_hyperparameters = { 68 | "learning_rate": 0.005, 69 | "batch_size": 128, 70 | "buffer_size": 40000, 71 | "epsilon": 1.0, 72 | "epsilon_decay_rate_denominator": 3, 73 | "discount_rate": 0.99, 74 | "tau": 0.01, 75 | "alpha_prioritised_replay": 0.6, 76 | "beta_prioritised_replay": 0.1, 77 | "incremental_td_error": 1e-8, 78 |
"update_every_n_steps": 3, 79 | "linear_hidden_units": [30, 15], 80 | "final_layer_activation": "None", 81 | "batch_norm": False, 82 | "gradient_clipping_norm": 5, 83 | "clip_rewards": clip_rewards 84 | } 85 | 86 | 87 | manager_hyperparameters = dqn_agent_hyperparameters 88 | manager_hyperparameters.update({"timesteps_to_give_up_control_for": 5}) 89 | 90 | 91 | config.hyperparameters = { 92 | "Policy_Gradient_Agents": { 93 | "learning_rate": 0.05, 94 | "linear_hidden_units": [30, 15], 95 | "final_layer_activation": "TANH", 96 | "learning_iterations_per_round": 10, 97 | "discount_rate": 0.9, 98 | "batch_norm": False, 99 | "clip_epsilon": 0.2, 100 | "episodes_per_learning_round": 10, 101 | "normalise_rewards": True, 102 | "gradient_clipping_norm": 5, 103 | "mu": 0.0, 104 | "theta": 0.15, 105 | "sigma": 0.2, 106 | "epsilon_decay_rate_denominator": 1, 107 | "clip_rewards": clip_rewards 108 | }, 109 | 110 | "Actor_Critic_Agents": actor_critic_agent_hyperparameters, 111 | "DIAYN": { 112 | "DISCRIMINATOR": { 113 | "learning_rate": 0.001, 114 | "linear_hidden_units": [32, 32], 115 | "final_layer_activation": None, 116 | "gradient_clipping_norm": 5 117 | 118 | }, 119 | "AGENT": actor_critic_agent_hyperparameters, 120 | "MANAGER": manager_hyperparameters, 121 | "num_skills": 10, 122 | "num_unsupservised_episodes": 100 123 | } 124 | } 125 | 126 | 127 | if __name__ == "__main__": 128 | AGENTS = [DIAYN] #, SAC] #, DDPG, PPO, TD3] ] #,DIAYN] # 129 | trainer = Trainer(config, AGENTS) 130 | trainer.run_games_for_agents() 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /results/Reacher.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from agents.Trainer import Trainer 3 | from agents.actor_critic_agents.DDPG import DDPG 4 | from agents.hierarchical_agents.HIRO import HIRO 5 | from utilities.data_structures.Config import Config 6 | config = Config() 7 | config.seed = 1 8 | config.environment = gym.make("Reacher-v2") # Reacher-v2 "InvertedPendulum-v2") #Pendulum-v0 9 | config.num_episodes_to_run = 1500 10 | config.file_to_save_data_results = None 11 | config.file_to_save_results_graph = None 12 | config.show_solution_score = False 13 | config.visualise_individual_results = False 14 | config.visualise_overall_agent_results = True 15 | config.standard_deviation_results = 1.0 16 | config.runs_per_agent = 1 17 | config.use_GPU = False 18 | config.overwrite_existing_results_file = False 19 | config.randomise_random_seed = True 20 | config.save_model = False 21 | 22 | 23 | 24 | 25 | config.hyperparameters = { 26 | "HIRO": { 27 | 28 | "LOWER_LEVEL": { 29 | "max_lower_level_timesteps": 5, 30 | 31 | "Actor": { 32 | "learning_rate": 0.001, 33 | "linear_hidden_units": [20, 20], 34 | "final_layer_activation": "TANH", 35 | "batch_norm": False, 36 | "tau": 0.005, 37 | "gradient_clipping_norm": 5 38 | }, 39 | 40 | "Critic": { 41 | "learning_rate": 0.01, 42 | "linear_hidden_units": [20, 20], 43 | "final_layer_activation": "None", 44 | "batch_norm": False, 45 | "buffer_size": 100000, 46 | "tau": 0.005, 47 | "gradient_clipping_norm": 5 48 | }, 49 | 50 | "batch_size": 256, 51 | "discount_rate": 0.9, 52 | "mu": 0.0, # for O-H noise 53 | "theta": 0.15, # for O-H noise 54 | "sigma": 0.25, # for O-H noise 55 | "action_noise_std": 0.2, # for TD3 56 | "action_noise_clipping_range": 0.5, # for TD3 57 | "update_every_n_steps": 20, 58 | "learning_updates_per_learning_session": 10, 59 | "clip_rewards": False 
60 | 61 | } , 62 | 63 | 64 | 65 | "HIGHER_LEVEL": { 66 | 67 | "Actor": { 68 | "learning_rate": 0.001, 69 | "linear_hidden_units": [20, 20], 70 | "final_layer_activation": "TANH", 71 | "batch_norm": False, 72 | "tau": 0.005, 73 | "gradient_clipping_norm": 5 74 | }, 75 | 76 | "Critic": { 77 | "learning_rate": 0.01, 78 | "linear_hidden_units": [20, 20], 79 | "final_layer_activation": "None", 80 | "batch_norm": False, 81 | "buffer_size": 100000, 82 | "tau": 0.005, 83 | "gradient_clipping_norm": 5 84 | }, 85 | 86 | "batch_size": 256, 87 | "discount_rate": 0.9, 88 | "mu": 0.0, # for O-H noise 89 | "theta": 0.15, # for O-H noise 90 | "sigma": 0.25, # for O-H noise 91 | "action_noise_std": 0.2, # for TD3 92 | "action_noise_clipping_range": 0.5, # for TD3 93 | "update_every_n_steps": 20, 94 | "learning_updates_per_learning_session": 10, 95 | "clip_rewards": False 96 | 97 | } , 98 | 99 | 100 | }, 101 | "Actor_Critic_Agents": { # hyperparameters taken from https://arxiv.org/pdf/1802.09477.pdf 102 | "Actor": { 103 | "learning_rate": 0.001, 104 | "linear_hidden_units": [400, 300], 105 | "final_layer_activation": "TANH", 106 | "batch_norm": False, 107 | "tau": 0.01, 108 | "gradient_clipping_norm": 5 109 | }, 110 | 111 | "Critic": { 112 | "learning_rate": 0.01, 113 | "linear_hidden_units": [400, 300], 114 | "final_layer_activation": "None", 115 | "batch_norm": False, 116 | "buffer_size": 100000, 117 | "tau": 0.01, 118 | "gradient_clipping_norm": 5 119 | }, 120 | 121 | "batch_size": 64, 122 | "discount_rate": 0.99, 123 | "mu": 0.0, # for O-H noise 124 | "theta": 0.15, # for O-H noise 125 | "sigma": 0.2, # for O-H noise 126 | "action_noise_std": 0.2, # for TD3 127 | "action_noise_clipping_range": 0.5, # for TD3 128 | "update_every_n_steps": 1, 129 | "learning_updates_per_learning_session": 1, 130 | "clip_rewards": False 131 | 132 | } 133 | 134 | 135 | } 136 | 137 | 138 | if __name__ == "__main__": 139 | AGENTS = [DDPG, HIRO] 140 | trainer = Trainer(config, AGENTS) 141 | trainer.run_games_for_agents() 142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /environments/ant_environments/assets/ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 82 | -------------------------------------------------------------------------------- /results/HRL_Experiments.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from agents.hierarchical_agents.HRL.HRL import HRL 4 | from agents.Trainer import Trainer 5 | from utilities.data_structures.Config import Config 6 | 7 | config = Config() 8 | config.environment = gym.make("Taxi-v2") 9 | config.seed = 1 10 | config.env_parameters = {} 11 | config.num_episodes_to_run = 2000 12 | config.file_to_save_data_results = None 13 | config.file_to_save_results_graph = None 14 | config.show_solution_score = False 15 | config.visualise_individual_results = False 16 | config.visualise_overall_agent_results = True 17 | config.standard_deviation_results = 1.0 18 | config.runs_per_agent = 3 19 | config.use_GPU = False 20 | config.overwrite_existing_results_file = False 21 | config.randomise_random_seed = True 22 | config.save_model = False 23 | 24 | 25 | linear_hidden_units = [32, 32] 26 | learning_rate = 0.01 27 | buffer_size = 100000 28 | batch_size = 256 29 | batch_norm = False 30 | embedding_dimensionality = 10 31 | gradient_clipping_norm = 5 32 | update_every_n_steps = 1 33 | learning_iterations = 1 34 | 
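# "sequitur_k" and "action_length_reward_bonus" below belong to the HRL agent's
# macro-action machinery: macro-actions are built from repeated primitive-action
# sequences and their reshaped rewards are produced via utilities/Memory_Shaper.py
# (the exact formulas live in the agent code, not in this config file).
# The sketch below only illustrates the role "epsilon_decay_rate_denominator" plays;
# the real schedule is implemented in exploration_strategies/Epsilon_Greedy_Exploration.py
# and may differ, and the helper name here is invented for illustration.
def _epsilon_decay_sketch(initial_epsilon, episode_number, denominator):
    """E.g. (1.0, 400, 400) -> 0.5 and (1.0, 2000, 400) -> ~0.17."""
    return initial_epsilon / (1.0 + episode_number / denominator)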
epsilon_decay_rate_denominator = 400 35 | discount_rate = 0.99 36 | tau = 0.01 37 | sequitur_k = 2 38 | pre_training_learning_iterations_multiplier = 50 39 | episodes_to_run_with_no_exploration = 10 40 | action_balanced_replay_buffer = True 41 | copy_over_hidden_layers = True 42 | action_length_reward_bonus = 0.1 43 | 44 | config.hyperparameters = { 45 | 46 | "HRL": { 47 | "linear_hidden_units": linear_hidden_units, 48 | "learning_rate": learning_rate, 49 | "buffer_size": buffer_size, 50 | "batch_size": batch_size, 51 | "final_layer_activation": "None", 52 | "columns_of_data_to_be_embedded": [0], 53 | "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 54 | "batch_norm": batch_norm, 55 | "gradient_clipping_norm": gradient_clipping_norm, 56 | "update_every_n_steps": update_every_n_steps, 57 | "epsilon_decay_rate_denominator": epsilon_decay_rate_denominator, 58 | "discount_rate": discount_rate, 59 | "learning_iterations": learning_iterations, 60 | "tau": tau, 61 | "sequitur_k": sequitur_k, 62 | "action_length_reward_bonus": action_length_reward_bonus, 63 | "pre_training_learning_iterations_multiplier": pre_training_learning_iterations_multiplier, 64 | "episodes_to_run_with_no_exploration": episodes_to_run_with_no_exploration, 65 | "action_balanced_replay_buffer": action_balanced_replay_buffer, 66 | "copy_over_hidden_layers": copy_over_hidden_layers 67 | }, 68 | 69 | "DQN_Agents": { 70 | "linear_hidden_units": linear_hidden_units, 71 | "learning_rate": learning_rate, 72 | "buffer_size": buffer_size, 73 | "batch_size": batch_size, 74 | "final_layer_activation": "None", 75 | "columns_of_data_to_be_embedded": [0], 76 | "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 77 | "batch_norm": batch_norm, 78 | "gradient_clipping_norm": gradient_clipping_norm, 79 | "update_every_n_steps": update_every_n_steps, 80 | "epsilon_decay_rate_denominator": epsilon_decay_rate_denominator, 81 | "discount_rate": discount_rate, 82 | "learning_iterations": learning_iterations, 83 | "tau": tau, 84 | }, 85 | 86 | "Actor_Critic_Agents": { 87 | "Actor": { 88 | "learning_rate": 0.0003, 89 | "linear_hidden_units": [64, 64], 90 | "final_layer_activation": "Softmax", 91 | "columns_of_data_to_be_embedded": [0], 92 | "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 93 | "batch_norm": False, 94 | "tau": 0.005, 95 | "gradient_clipping_norm": 5, 96 | "initialiser": "Xavier" 97 | }, 98 | 99 | "Critic": { 100 | "learning_rate": 0.0003, 101 | "linear_hidden_units": [64, 64], 102 | "final_layer_activation": None, 103 | "columns_of_data_to_be_embedded": [0], 104 | "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 105 | "batch_norm": False, 106 | "buffer_size": 1000000, 107 | "tau": 0.005, 108 | "gradient_clipping_norm": 5, 109 | "initialiser": "Xavier" 110 | }, 111 | 112 | "min_steps_before_learning": 10000, 113 | "batch_size": 256, 114 | "discount_rate": 0.99, 115 | "mu": 0.0, # for O-H noise 116 | "theta": 0.15, # for O-H noise 117 | "sigma": 0.25, # for O-H noise 118 | "action_noise_std": 0.2, # for TD3 119 | "action_noise_clipping_range": 0.5, # for TD3 120 | "update_every_n_steps": 1, 121 | "learning_updates_per_learning_session": 1, 122 | "automatically_tune_entropy_hyperparameter": True, 123 | "entropy_term_weight": None, 124 | "add_extra_noise": False, 125 | "do_evaluation_iterations": True 126 | } 127 | } 128 | 129 | 130 | if __name__ == "__main__": 131 
| AGENTS = [HRL] #SAC_Discrete, SAC_Discrete, DDQN] #HRL] #, SNN_HRL, DQN, h_DQN] 132 | trainer = Trainer(config, AGENTS) 133 | trainer.run_games_for_agents() 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /agents/HER_Base.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from utilities.data_structures.Replay_Buffer import Replay_Buffer 4 | from utilities.Utility_Functions import abstract 5 | 6 | @abstract 7 | class HER_Base(object): 8 | """Contains methods needed to turn an algorithm into a hindsight experience replay (HER) algorithm""" 9 | def __init__(self, buffer_size, batch_size, HER_sample_proportion): 10 | self.HER_memory = Replay_Buffer(buffer_size, batch_size, self.config.seed) 11 | self.ordinary_buffer_batch_size = int(batch_size * (1.0 - HER_sample_proportion)) 12 | self.HER_buffer_batch_size = batch_size - self.ordinary_buffer_batch_size 13 | 14 | def reset_game(self): 15 | """Resets the game information so we are ready to play a new episode""" 16 | self.state_dict = self.environment.reset() 17 | self.observation = self.state_dict["observation"] 18 | self.desired_goal = self.state_dict["desired_goal"] 19 | self.achieved_goal = self.state_dict["achieved_goal"] 20 | 21 | self.state = self.create_state_from_observation_and_desired_goal(self.observation, self.desired_goal) 22 | self.next_state = None 23 | self.action = None 24 | self.reward = None 25 | self.done = False 26 | 27 | self.episode_states = [] 28 | self.episode_rewards = [] 29 | self.episode_actions = [] 30 | self.episode_next_states = [] 31 | self.episode_dones = [] 32 | 33 | self.episode_desired_goals = [] 34 | self.episode_achieved_goals = [] 35 | self.episode_observations = [] 36 | 37 | self.episode_next_desired_goals = [] 38 | self.episode_next_achieved_goals = [] 39 | self.episode_next_observations = [] 40 | 41 | self.total_episode_score_so_far = 0 42 | 43 | def track_changeable_goal_episodes_data(self): 44 | """Saves the data from the recent episodes in a way compatible with changeable goal environments""" 45 | self.episode_rewards.append(self.reward) 46 | self.episode_actions.append(self.action) 47 | self.episode_dones.append(self.done) 48 | 49 | self.episode_states.append(self.state) 50 | self.episode_next_states.append(self.next_state) 51 | 52 | self.episode_desired_goals.append(self.state_dict["desired_goal"]) 53 | self.episode_achieved_goals.append(self.state_dict["achieved_goal"]) 54 | self.episode_observations.append(self.state_dict["observation"]) 55 | 56 | self.episode_next_desired_goals.append(self.next_state_dict["desired_goal"]) 57 | self.episode_next_achieved_goals.append(self.next_state_dict["achieved_goal"]) 58 | self.episode_next_observations.append(self.next_state_dict["observation"]) 59 | 60 | def conduct_action_in_changeable_goal_envs(self, action): 61 | """Adapts conduct_action from base agent so that can handle changeable goal environments""" 62 | self.next_state_dict, self.reward, self.done, _ = self.environment.step(action) 63 | self.total_episode_score_so_far += self.reward 64 | if self.hyperparameters["clip_rewards"]: 65 | self.reward = max(min(self.reward, 1.0), -1.0) 66 | self.observation = self.next_state_dict["observation"] 67 | self.desired_goal = self.next_state_dict["desired_goal"] 68 | self.achieved_goal = self.next_state_dict["achieved_goal"] 69 | self.next_state = self.create_state_from_observation_and_desired_goal(self.observation, 
self.desired_goal) 70 | 71 | 72 | def create_state_from_observation_and_desired_goal(self, observation, desired_goal): 73 | return np.concatenate((observation, desired_goal)) 74 | 75 | def save_alternative_experience(self): 76 | """Saves the experiences as if the final state visited in the episode was the goal state""" 77 | new_goal = self.achieved_goal 78 | new_states = [self.create_state_from_observation_and_desired_goal(observation, new_goal) for observation in self.episode_observations] 79 | new_next_states = [self.create_state_from_observation_and_desired_goal(observation, new_goal) for observation in 80 | self.episode_next_observations] 81 | new_rewards = [self.environment.compute_reward(next_achieved_goal, new_goal, None) for next_achieved_goal in self.episode_next_achieved_goals] 82 | 83 | if self.hyperparameters["clip_rewards"]: 84 | new_rewards = [max(min(reward, 1.0), -1.0) for reward in new_rewards] 85 | 86 | self.HER_memory.add_experience(new_states, self.episode_actions, new_rewards, new_next_states, self.episode_dones) 87 | 88 | def sample_from_HER_and_Ordinary_Buffer(self): 89 | """Samples from the ordinary replay buffer and HER replay buffer according to a proportion specified in config""" 90 | states, actions, rewards, next_states, dones = self.memory.sample(self.ordinary_buffer_batch_size) 91 | HER_states, HER_actions, HER_rewards, HER_next_states, HER_dones = self.HER_memory.sample(self.HER_buffer_batch_size) 92 | 93 | states = torch.cat((states, HER_states)) 94 | actions = torch.cat((actions, HER_actions)) 95 | rewards = torch.cat((rewards, HER_rewards)) 96 | next_states = torch.cat((next_states, HER_next_states)) 97 | dones = torch.cat((dones, HER_dones)) 98 | return states, actions, rewards, next_states, dones 99 | 100 | 101 | -------------------------------------------------------------------------------- /utilities/Memory_Shaper.py: -------------------------------------------------------------------------------- 1 | # NOT FINISHED 2 | from .data_structures.Action_Balanced_Replay_Buffer import Action_Balanced_Replay_Buffer 3 | from .data_structures.Replay_Buffer import Replay_Buffer 4 | import numpy as np 5 | import random 6 | 7 | class Memory_Shaper(object): 8 | """Takes in the experience of full episodes and reshapes it according to macro-actions you define. Then it provides 9 | a replay buffer with this reshaped data to learn from""" 10 | def __init__(self, buffer_size, batch_size, seed, new_reward_fn, action_balanced_replay_buffer=True): 11 | self.reset() 12 | self.buffer_size = buffer_size 13 | self.batch_size = batch_size 14 | self.seed = seed 15 | self.new_reward_fn = new_reward_fn 16 | self.action_balanced_replay_buffer = action_balanced_replay_buffer 17 | 18 | def put_adapted_experiences_in_a_replay_buffer(self, action_id_to_actions): 19 | """Adds experiences to the replay buffer after re-imagining that the actions taken were macro-actions according to 20 | action_rules as well as primitive actions. 
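For example (purely illustrative): with action_id_to_actions = {0: (0,), 1: (1,), 2: (0, 1)},
an episode whose primitive actions were [0, 1] produces the two primitive transitions plus one
re-imagined transition for macro-action 2 spanning both steps, whose reward is the sum of the
two step rewards passed through new_reward_fn.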
21 | 22 | NOTE that we want to put both primitive actions and macro-actions into replay buffer so that it can learn that 23 | its better to do a macro-action rather than the same primitive actions (which we will enforce with reward penalty) 24 | """ 25 | 26 | actions_to_action_id = {v: k for k, v in action_id_to_actions.items()} 27 | 28 | self.num_actions = len(action_id_to_actions) 29 | 30 | print(actions_to_action_id) 31 | 32 | for key in actions_to_action_id.keys(): 33 | assert isinstance(key, tuple) 34 | assert isinstance(actions_to_action_id[key], int) 35 | 36 | episodes = len(self.states) 37 | for data_type in [self.states, self.next_states, self.rewards, self.actions, self.dones]: 38 | assert len(data_type) == episodes 39 | 40 | max_action_length = self.calculate_max_action_length(actions_to_action_id) 41 | 42 | if self.action_balanced_replay_buffer: 43 | print("Using action balanced replay buffer") 44 | replay_buffer = Action_Balanced_Replay_Buffer(self.buffer_size, self.batch_size, self.seed, num_actions=self.num_actions) 45 | else: 46 | print("Using ordinary replay buffer") 47 | replay_buffer = Replay_Buffer(self.buffer_size, self.batch_size, self.seed) 48 | 49 | for episode_ix in range(episodes): 50 | self.add_adapted_experience_for_an_episode(episode_ix, actions_to_action_id, max_action_length, replay_buffer) 51 | 52 | return replay_buffer 53 | 54 | def calculate_max_action_length(self, actions_to_action_id): 55 | """Calculates the max length of the provided macro-actions""" 56 | max_length = 0 57 | for key in actions_to_action_id.keys(): 58 | action_length = len(key) 59 | if action_length > max_length: 60 | max_length = action_length 61 | return max_length 62 | 63 | 64 | def add_adapted_experience_for_an_episode(self, episode_ix, action_rules, max_action_length, replay_buffer): 65 | """Adds all the experiences we have been given to a replay buffer after adapting experiences that involved doing a 66 | macro action""" 67 | states = self.states[episode_ix] 68 | next_states = self.next_states[episode_ix] 69 | rewards = self.rewards[episode_ix] 70 | actions = self.actions[episode_ix] 71 | dones = self.dones[episode_ix] 72 | 73 | assert len(states) == len(next_states) == len(rewards) == len(dones) == len(actions), "{} {} {} {} {} = {}".format(len(states), len(next_states), len(rewards), len(dones), len(actions), actions) 74 | steps = len(states) 75 | for step in range(steps): 76 | replay_buffer.add_experience(states[step], actions[step], rewards[step], next_states[step], dones[step]) 77 | for action_length in range(2, max_action_length + 1): 78 | if step < action_length - 1: continue 79 | action_sequence = tuple(actions[step - action_length + 1 : step + 1]) 80 | assert all([action in range(self.num_actions) for action in action_sequence]), "All actions should be primitive here" 81 | if action_sequence in action_rules.keys(): 82 | new_action = action_rules[action_sequence] 83 | new_state = states[step - action_length + 1] 84 | new_reward = np.sum(rewards[step - action_length + 1:step + 1]) 85 | new_reward = self.new_reward_fn(new_reward, len(action_sequence)) 86 | new_next_state = next_states[step] 87 | new_dones = dones[step] 88 | replay_buffer.add_experience(new_state, new_action, new_reward, new_next_state, new_dones) 89 | 90 | 91 | def add_episode_experience(self, states, next_states, rewards, actions, dones): 92 | """Adds in an episode of experience""" 93 | self.states.append(states) 94 | self.next_states.append(next_states) 95 | self.rewards.append(rewards) 96 | 
self.actions.append(actions) 97 | self.dones.append(dones) 98 | 99 | def reset(self): 100 | self.states = [] 101 | self.next_states = [] 102 | self.rewards = [] 103 | self.actions = [] 104 | self.dones = [] 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /tests/Test_Four_Rooms_Environment.py: -------------------------------------------------------------------------------- 1 | from environments.Four_Rooms_Environment import Four_Rooms_Environment 2 | from random import randint 3 | from collections import Counter 4 | 5 | 6 | def test_location_to_state(): 7 | """Tests location_to_state maps each location to a unique integer""" 8 | for num_rows in [12, 10]: 9 | for num_cols in [15, 9]: 10 | env = Four_Rooms_Environment(grid_width=num_cols, grid_height=num_rows) 11 | observed_states = set() 12 | for row in range(num_rows): 13 | for col in range(num_cols): 14 | state = env.location_to_state((row, col)) 15 | assert state not in observed_states 16 | observed_states.add(state) 17 | 18 | def test_actions_execute_correctly(): 19 | """Tests that actions execute correctly""" 20 | env = Four_Rooms_Environment(stochastic_actions_probability=0.0) 21 | env.reset() 22 | env.move_user(env.current_user_location, (3, 3)) 23 | 24 | env.step(0) 25 | assert env.current_user_location == (2, 3) 26 | 27 | env.step(1) 28 | assert env.current_user_location == (2, 4) 29 | 30 | env.step(2) 31 | assert env.current_user_location == (3, 4) 32 | 33 | env.step(3) 34 | assert env.current_user_location == (3, 3) 35 | 36 | env.step(0) 37 | assert env.current_user_location == (2, 3) 38 | 39 | env.step(0) 40 | assert env.current_user_location == (1, 3) 41 | 42 | env.step(0) 43 | assert env.current_user_location == (1, 3) 44 | 45 | env.step(1) 46 | assert env.current_user_location == (1, 4) 47 | 48 | env.step(1) 49 | assert env.current_user_location == (1, 5) 50 | 51 | env.step(1) 52 | assert env.current_user_location == (1, 5) 53 | 54 | def test_check_user_location_and_goal_location_match_state_and_next_state(): 55 | """Checks whether user location always matches state and next state correctly""" 56 | for _ in range(50): 57 | env = Four_Rooms_Environment() 58 | env.reset() 59 | for _ in range(50): 60 | move = randint(0, 3) 61 | env.step(move) 62 | assert env.state == [env.location_to_state(env.current_user_location), env.location_to_state(env.current_goal_location)] 63 | assert env.next_state == [env.location_to_state(env.current_user_location), env.location_to_state(env.current_goal_location)] 64 | 65 | def test_lands_on_goal_correctly(): 66 | """Checks whether getting to goal state produces the correct response""" 67 | env = Four_Rooms_Environment(stochastic_actions_probability=0.0) 68 | env.reset() 69 | env.move_user(env.current_user_location, (3, 3)) 70 | env.move_goal(env.current_goal_location, (2, 2)) 71 | 72 | env.step(0) 73 | assert env.reward == env.step_reward_for_not_achieving_goal 74 | assert not env.done 75 | 76 | env.step(3) 77 | assert env.reward == env.reward_for_achieving_goal 78 | assert env.done 79 | 80 | env = Four_Rooms_Environment(stochastic_actions_probability=0.0) 81 | env.reset() 82 | env.move_user(env.current_user_location, (2, 3)) 83 | env.move_goal(env.current_goal_location, (2, 8)) 84 | for move in [2, 1, 1, 1, 1, 1, 0]: 85 | env.step(move) 86 | if move != 0: 87 | assert env.reward == env.step_reward_for_not_achieving_goal 88 | assert not env.done 89 | else: 90 | assert env.reward == env.reward_for_achieving_goal 91 | assert env.done 
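# The checks in test_randomness_of_moves further down assume that, with
# stochastic_actions_probability p, the intended action is executed with probability
# (1 - p) and is otherwise replaced by one of the other actions uniformly at random.
# The helper below merely illustrates the expected counts under that convention; it is
# not part of the test suite and its name is invented for illustration.
def _expected_move_counts(p, num_actions=4, num_iterations=10000):
    """E.g. p=0.75 gives roughly 2500 for every action; p=1.0 gives 0 for the intended move."""
    intended = num_iterations * (1.0 - p)
    each_other_action = num_iterations * p / (num_actions - 1)
    return intended, each_other_action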
92 | 93 | def test_location_to_state_and_state_to_location_match(): 94 | """Test that location_to_state and state_to_location are inverses of each other""" 95 | env = Four_Rooms_Environment(stochastic_actions_probability=0.0) 96 | env.reset() 97 | for row in range(env.grid_height): 98 | for col in range(env.grid_width): 99 | assert env.location_to_state((row, col)) == env.location_to_state(env.state_to_location(env.location_to_state((row, col)))) 100 | 101 | def test_randomness_of_moves(): 102 | """Test that determine_which_action_will_actually_occur correctly implements stochastic_actions_probability""" 103 | env = Four_Rooms_Environment(stochastic_actions_probability=0.0) 104 | env.reset() 105 | for _ in range(10): 106 | for move in env.actions: 107 | assert move == env.determine_which_action_will_actually_occur(move) 108 | 109 | env = Four_Rooms_Environment(stochastic_actions_probability=1.0) 110 | num_iterations = 10000 111 | for move in env.actions: 112 | moves = [] 113 | for _ in range(num_iterations): 114 | moves.append(env.determine_which_action_will_actually_occur(move)) 115 | count = Counter(moves) 116 | for move_test in env.actions: 117 | if move != move_test: #We do this because stochastic probability 1.0 means the move will never be picked 118 | assert abs((num_iterations / (len(env.actions)-1)) - count[move_test]) < num_iterations / 20.0, "{}".format(count) 119 | 120 | env = Four_Rooms_Environment(stochastic_actions_probability=0.75) 121 | num_iterations = 10000 122 | for move in env.actions: 123 | moves = [] 124 | for _ in range(num_iterations): 125 | moves.append(env.determine_which_action_will_actually_occur(move)) 126 | count = Counter(moves) 127 | for move_test in env.actions: 128 | assert abs((num_iterations / len(env.actions)) - count[move_test]) < num_iterations / 20.0, "{}".format(count) 129 | 130 | 131 | -------------------------------------------------------------------------------- /results/Cart_Pole.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from os.path import dirname, abspath 4 | sys.path.append(dirname(dirname(abspath(__file__)))) 5 | 6 | import gym 7 | 8 | from agents.actor_critic_agents.A2C import A2C 9 | from agents.DQN_agents.Dueling_DDQN import Dueling_DDQN 10 | from agents.actor_critic_agents.SAC_Discrete import SAC_Discrete 11 | from agents.actor_critic_agents.A3C import A3C 12 | from agents.policy_gradient_agents.PPO import PPO 13 | from agents.Trainer import Trainer 14 | from utilities.data_structures.Config import Config 15 | from agents.DQN_agents.DDQN import DDQN 16 | from agents.DQN_agents.DDQN_With_Prioritised_Experience_Replay import DDQN_With_Prioritised_Experience_Replay 17 | from agents.DQN_agents.DQN import DQN 18 | from agents.DQN_agents.DQN_With_Fixed_Q_Targets import DQN_With_Fixed_Q_Targets 19 | 20 | config = Config() 21 | config.seed = 1 22 | config.environment = gym.make("CartPole-v0") 23 | config.num_episodes_to_run = 450 24 | config.file_to_save_data_results = "results/data_and_graphs/Cart_Pole_Results_Data.pkl" 25 | config.file_to_save_results_graph = "results/data_and_graphs/Cart_Pole_Results_Graph.png" 26 | config.show_solution_score = False 27 | config.visualise_individual_results = False 28 | config.visualise_overall_agent_results = True 29 | config.standard_deviation_results = 1.0 30 | config.runs_per_agent = 1 31 | config.use_GPU = False 32 | config.overwrite_existing_results_file = False 33 | config.randomise_random_seed = True 34 | config.save_model = 
False 35 | 36 | 37 | config.hyperparameters = { 38 | "DQN_Agents": { 39 | "learning_rate": 0.01, 40 | "batch_size": 256, 41 | "buffer_size": 40000, 42 | "epsilon": 1.0, 43 | "epsilon_decay_rate_denominator": 1, 44 | "discount_rate": 0.99, 45 | "tau": 0.01, 46 | "alpha_prioritised_replay": 0.6, 47 | "beta_prioritised_replay": 0.1, 48 | "incremental_td_error": 1e-8, 49 | "update_every_n_steps": 1, 50 | "linear_hidden_units": [30, 15], 51 | "final_layer_activation": "None", 52 | "batch_norm": False, 53 | "gradient_clipping_norm": 0.7, 54 | "learning_iterations": 1, 55 | "clip_rewards": False 56 | }, 57 | "Stochastic_Policy_Search_Agents": { 58 | "policy_network_type": "Linear", 59 | "noise_scale_start": 1e-2, 60 | "noise_scale_min": 1e-3, 61 | "noise_scale_max": 2.0, 62 | "noise_scale_growth_factor": 2.0, 63 | "stochastic_action_decision": False, 64 | "num_policies": 10, 65 | "episodes_per_policy": 1, 66 | "num_policies_to_keep": 5, 67 | "clip_rewards": False 68 | }, 69 | "Policy_Gradient_Agents": { 70 | "learning_rate": 0.05, 71 | "linear_hidden_units": [20, 20], 72 | "final_layer_activation": "SOFTMAX", 73 | "learning_iterations_per_round": 5, 74 | "discount_rate": 0.99, 75 | "batch_norm": False, 76 | "clip_epsilon": 0.1, 77 | "episodes_per_learning_round": 4, 78 | "normalise_rewards": True, 79 | "gradient_clipping_norm": 7.0, 80 | "mu": 0.0, #only required for continuous action games 81 | "theta": 0.0, #only required for continuous action games 82 | "sigma": 0.0, #only required for continuous action games 83 | "epsilon_decay_rate_denominator": 1.0, 84 | "clip_rewards": False 85 | }, 86 | 87 | "Actor_Critic_Agents": { 88 | 89 | "learning_rate": 0.005, 90 | "linear_hidden_units": [20, 10], 91 | "final_layer_activation": ["SOFTMAX", None], 92 | "gradient_clipping_norm": 5.0, 93 | "discount_rate": 0.99, 94 | "epsilon_decay_rate_denominator": 1.0, 95 | "normalise_rewards": True, 96 | "exploration_worker_difference": 2.0, 97 | "clip_rewards": False, 98 | 99 | "Actor": { 100 | "learning_rate": 0.0003, 101 | "linear_hidden_units": [64, 64], 102 | "final_layer_activation": "Softmax", 103 | "batch_norm": False, 104 | "tau": 0.005, 105 | "gradient_clipping_norm": 5, 106 | "initialiser": "Xavier" 107 | }, 108 | 109 | "Critic": { 110 | "learning_rate": 0.0003, 111 | "linear_hidden_units": [64, 64], 112 | "final_layer_activation": None, 113 | "batch_norm": False, 114 | "buffer_size": 1000000, 115 | "tau": 0.005, 116 | "gradient_clipping_norm": 5, 117 | "initialiser": "Xavier" 118 | }, 119 | 120 | "min_steps_before_learning": 400, 121 | "batch_size": 256, 122 | "discount_rate": 0.99, 123 | "mu": 0.0, #for O-H noise 124 | "theta": 0.15, #for O-H noise 125 | "sigma": 0.25, #for O-H noise 126 | "action_noise_std": 0.2, # for TD3 127 | "action_noise_clipping_range": 0.5, # for TD3 128 | "update_every_n_steps": 1, 129 | "learning_updates_per_learning_session": 1, 130 | "automatically_tune_entropy_hyperparameter": True, 131 | "entropy_term_weight": None, 132 | "add_extra_noise": False, 133 | "do_evaluation_iterations": True 134 | } 135 | } 136 | 137 | if __name__ == "__main__": 138 | AGENTS = [SAC_Discrete, DDQN, Dueling_DDQN, DQN, DQN_With_Fixed_Q_Targets, 139 | DDQN_With_Prioritised_Experience_Replay, A2C, PPO, A3C ] 140 | trainer = Trainer(config, AGENTS) 141 | trainer.run_games_for_agents() 142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /results/Four_Rooms.py: 
-------------------------------------------------------------------------------- 1 | from agents.DQN_agents.DDQN import DDQN 2 | from environments.Four_Rooms_Environment import Four_Rooms_Environment 3 | from agents.Trainer import Trainer 4 | from utilities.data_structures.Config import Config 5 | 6 | config = Config() 7 | config.seed = 1 8 | 9 | height = 15 10 | width = 15 11 | random_goal_place = False 12 | num_possible_states = (height * width) ** (1 + 1*random_goal_place) 13 | embedding_dimensions = [[num_possible_states, 20]] 14 | print("Num possible states ", num_possible_states) 15 | 16 | config.environment = Four_Rooms_Environment(height, width, stochastic_actions_probability=0.0, random_start_user_place=True, random_goal_place=random_goal_place) 17 | 18 | config.num_episodes_to_run = 1000 19 | config.file_to_save_data_results = "Data_and_Graphs/Four_Rooms.pkl" 20 | config.file_to_save_results_graph = "Data_and_Graphs/Four_Rooms.png" 21 | config.show_solution_score = False 22 | config.visualise_individual_results = False 23 | config.visualise_overall_agent_results = True 24 | config.standard_deviation_results = 1.0 25 | config.runs_per_agent = 3 26 | config.use_GPU = False 27 | config.overwrite_existing_results_file = False 28 | config.randomise_random_seed = True 29 | config.save_model = False 30 | 31 | 32 | config.hyperparameters = { 33 | "DQN_Agents": { 34 | "linear_hidden_units": [30, 10], 35 | "learning_rate": 0.01, 36 | "buffer_size": 40000, 37 | "batch_size": 256, 38 | "final_layer_activation": "None", 39 | "columns_of_data_to_be_embedded": [0], 40 | "embedding_dimensions": embedding_dimensions, 41 | "batch_norm": False, 42 | "gradient_clipping_norm": 5, 43 | "update_every_n_steps": 1, 44 | "epsilon_decay_rate_denominator": 10, 45 | "discount_rate": 0.99, 46 | "learning_iterations": 1, 47 | "tau": 0.01, 48 | "exploration_cycle_episodes_length": None, 49 | "learning_iterations": 1, 50 | "clip_rewards": False 51 | }, 52 | 53 | "SNN_HRL": { 54 | "SKILL_AGENT": { 55 | "num_skills": 20, 56 | "regularisation_weight": 1.5, 57 | "visitations_decay": 0.9999, 58 | "episodes_for_pretraining": 300, 59 | "batch_size": 256, 60 | "learning_rate": 0.001, 61 | "buffer_size": 40000, 62 | "linear_hidden_units": [20, 10], 63 | "final_layer_activation": "None", 64 | "columns_of_data_to_be_embedded": [0, 1], 65 | "embedding_dimensions": [embedding_dimensions[0], 66 | [20, 6]], 67 | "batch_norm": False, 68 | "gradient_clipping_norm": 2, 69 | "update_every_n_steps": 1, 70 | "epsilon_decay_rate_denominator": 500, 71 | "discount_rate": 0.999, 72 | "learning_iterations": 1, 73 | "tau": 0.01, 74 | "clip_rewards": False 75 | }, 76 | 77 | "MANAGER": { 78 | "timesteps_before_changing_skill": 6, 79 | "linear_hidden_units": [10, 5], 80 | "learning_rate": 0.01, 81 | "buffer_size": 40000, 82 | "batch_size": 256, 83 | "final_layer_activation": "None", 84 | "columns_of_data_to_be_embedded": [0], 85 | "embedding_dimensions": embedding_dimensions, 86 | "batch_norm": False, 87 | "gradient_clipping_norm": 5, 88 | "update_every_n_steps": 1, 89 | "epsilon_decay_rate_denominator": 50, 90 | "discount_rate": 0.99, 91 | "learning_iterations": 1, 92 | "tau": 0.01, 93 | "clip_rewards": False 94 | 95 | } 96 | 97 | }, 98 | 99 | "Actor_Critic_Agents": { 100 | 101 | "learning_rate": 0.005, 102 | "linear_hidden_units": [20, 10], 103 | 104 | "columns_of_data_to_be_embedded": [0], 105 | "embedding_dimensions": embedding_dimensions, 106 | "final_layer_activation": ["SOFTMAX", None], 107 | "gradient_clipping_norm": 5.0, 108 | 
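# Column 0 of the state (the discrete grid-location index) is looked up in an embedding
# table of shape [num_possible_states, 20] before the linear layers; with the 15x15 grid
# and random_goal_place = False set at the top of this file that is a [225, 20] table
# (the formula there gives 225**2 = 50625 possible states if the goal is also randomised).
# The two-entry final_layer_activation above appears to correspond to the separate policy
# (softmax) and value (linear) heads used by the A2C/A3C-style agents.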
"discount_rate": 0.99, 109 | "epsilon_decay_rate_denominator": 50.0, 110 | "normalise_rewards": True, 111 | "clip_rewards": False 112 | 113 | }, 114 | 115 | 116 | "DIAYN": { 117 | 118 | "num_skills": 5, 119 | "DISCRIMINATOR": { 120 | "learning_rate": 0.01, 121 | "linear_hidden_units": [20, 10], 122 | "columns_of_data_to_be_embedded": [0], 123 | "embedding_dimensions": embedding_dimensions, 124 | }, 125 | 126 | "AGENT": { 127 | "learning_rate": 0.01, 128 | "linear_hidden_units": [20, 10], 129 | } 130 | }, 131 | 132 | 133 | "HRL": { 134 | "linear_hidden_units": [10, 5], 135 | "learning_rate": 0.01, 136 | "buffer_size": 40000, 137 | "batch_size": 256, 138 | "final_layer_activation": "None", 139 | "columns_of_data_to_be_embedded": [0], 140 | "embedding_dimensions": embedding_dimensions, 141 | "batch_norm": False, 142 | "gradient_clipping_norm": 5, 143 | "update_every_n_steps": 1, 144 | "epsilon_decay_rate_denominator": 400, 145 | "discount_rate": 0.99, 146 | "learning_iterations": 1, 147 | "tau": 0.01 148 | 149 | } 150 | 151 | 152 | } 153 | 154 | if __name__== '__main__': 155 | 156 | 157 | AGENTS = [DDQN] #DIAYN] # A3C] #SNN_HRL] #, DDQN] 158 | trainer = Trainer(config, AGENTS) 159 | trainer.run_games_for_agents() 160 | 161 | 162 | -------------------------------------------------------------------------------- /results/Long_Corridor.py: -------------------------------------------------------------------------------- 1 | from agents.hierarchical_agents.SNN_HRL import SNN_HRL 2 | from agents.Trainer import Trainer 3 | from utilities.data_structures.Config import Config 4 | from agents.DQN_agents.DQN import DQN 5 | from agents.hierarchical_agents.h_DQN import h_DQN 6 | from environments.Long_Corridor_Environment import Long_Corridor_Environment 7 | 8 | config = Config() 9 | config.seed = 1 10 | config.env_parameters = {"stochasticity_of_action_right": 0.5} 11 | config.environment = Long_Corridor_Environment(stochasticity_of_action_right=config.env_parameters["stochasticity_of_action_right"]) 12 | config.num_episodes_to_run = 10000 13 | config.file_to_save_data_results = "Data_and_Graphs/Long_Corridor_Results_Data.pkl" 14 | config.file_to_save_results_graph = "Data_and_Graphs/Long_Corridor_Results_Graph.png" 15 | config.show_solution_score = False 16 | config.visualise_individual_results = False 17 | config.visualise_overall_agent_results = True 18 | config.standard_deviation_results = 1.0 19 | config.runs_per_agent = 3 20 | config.use_GPU = False 21 | config.overwrite_existing_results_file = False 22 | config.randomise_random_seed = True 23 | config.save_model = False 24 | 25 | config.hyperparameters = { 26 | 27 | "h_DQN": { 28 | "CONTROLLER": { 29 | "batch_size": 256, 30 | "learning_rate": 0.01, 31 | "buffer_size": 40000, 32 | "linear_hidden_units": [20, 10], 33 | "final_layer_activation": "None", 34 | "columns_of_data_to_be_embedded": [0, 1], 35 | "embedding_dimensions": [[config.environment.observation_space.n, 36 | max(4, int(config.environment.observation_space.n / 10.0))], 37 | [config.environment.observation_space.n, 38 | max(4, int(config.environment.observation_space.n / 10.0))]], 39 | "batch_norm": False, 40 | "gradient_clipping_norm": 5, 41 | "update_every_n_steps": 1, 42 | "epsilon_decay_rate_denominator": 1500, 43 | "discount_rate": 0.999, 44 | "learning_iterations": 1 45 | }, 46 | "META_CONTROLLER": { 47 | "batch_size": 256, 48 | "learning_rate": 0.001, 49 | "buffer_size": 40000, 50 | "linear_hidden_units": [20, 10], 51 | "final_layer_activation": "None", 52 | 
"columns_of_data_to_be_embedded": [0], 53 | "embedding_dimensions": [[config.environment.observation_space.n, 54 | max(4, int(config.environment.observation_space.n / 10.0))]], 55 | "batch_norm": False, 56 | "gradient_clipping_norm": 5, 57 | "update_every_n_steps": 1, 58 | "epsilon_decay_rate_denominator": 2500, 59 | "discount_rate": 0.999, 60 | "learning_iterations": 1 61 | } 62 | }, 63 | 64 | "SNN_HRL": { 65 | "SKILL_AGENT": { 66 | "num_skills": 2, 67 | "regularisation_weight": 1.5, 68 | "visitations_decay": 0.99, 69 | "episodes_for_pretraining": 2000, 70 | # "batch_size": 256, 71 | # "learning_rate": 0.01, 72 | # "buffer_size": 40000, 73 | # "linear_hidden_units": [20, 10], 74 | # "final_layer_activation": "None", 75 | # "columns_of_data_to_be_embedded": [0, 1], 76 | # "embedding_dimensions": [[config.environment.observation_space.n, 77 | # max(4, int(config.environment.observation_space.n / 10.0))], 78 | # [6, 4]], 79 | # "batch_norm": False, 80 | # "gradient_clipping_norm": 5, 81 | # "update_every_n_steps": 1, 82 | # "epsilon_decay_rate_denominator": 50, 83 | # "discount_rate": 0.999, 84 | # "learning_iterations": 1 85 | 86 | 87 | "learning_rate": 0.05, 88 | "linear_hidden_units": [20, 20], 89 | "final_layer_activation": "SOFTMAX", 90 | "learning_iterations_per_round": 5, 91 | "discount_rate": 0.99, 92 | "batch_norm": False, 93 | "clip_epsilon": 0.1, 94 | "episodes_per_learning_round": 4, 95 | "normalise_rewards": True, 96 | "gradient_clipping_norm": 7.0, 97 | "mu": 0.0, # only required for continuous action games 98 | "theta": 0.0, # only required for continuous action games 99 | "sigma": 0.0, # only required for continuous action games 100 | "epsilon_decay_rate_denominator": 1.0 101 | 102 | 103 | 104 | }, 105 | 106 | "MANAGER": { 107 | "timesteps_before_changing_skill": 4, 108 | "linear_hidden_units": [10, 5], 109 | "learning_rate": 0.01, 110 | "buffer_size": 40000, 111 | "batch_size": 256, 112 | "final_layer_activation": "None", 113 | "columns_of_data_to_be_embedded": [0], 114 | "embedding_dimensions": [[config.environment.observation_space.n, 115 | max(4, int(config.environment.observation_space.n / 10.0))]], 116 | "batch_norm": False, 117 | "gradient_clipping_norm": 5, 118 | "update_every_n_steps": 1, 119 | "epsilon_decay_rate_denominator": 1000, 120 | "discount_rate": 0.999, 121 | "learning_iterations": 1 122 | 123 | } 124 | 125 | } 126 | 127 | } 128 | 129 | config.hyperparameters["DQN_Agents"] = config.hyperparameters["h_DQN"]["META_CONTROLLER"] 130 | 131 | 132 | if __name__ == "__main__": 133 | AGENTS = [SNN_HRL, DQN, h_DQN] 134 | trainer = Trainer(config, AGENTS) 135 | trainer.run_games_for_agents() 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /utilities/data_structures/Action_Balanced_Replay_Buffer.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | import torch 4 | import numpy as np 5 | from .Replay_Buffer import Replay_Buffer 6 | 7 | class Action_Balanced_Replay_Buffer(Replay_Buffer): 8 | """Replay buffer that provides sample of experiences that have an equal number of each action being conducted""" 9 | def __init__(self, buffer_size, batch_size, seed, num_actions): 10 | self.num_actions = num_actions 11 | self.buffer_size_per_memory = int(buffer_size / self.num_actions) 12 | 13 | print("NUM ACTIONS ", self.num_actions) 14 | self.memories = {action: deque(maxlen=self.buffer_size_per_memory) for action in 
range(self.num_actions)} 15 | self.batch_size = batch_size 16 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 17 | self.seed = random.seed(seed) 18 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 19 | 20 | def add_experience(self, states, actions, rewards, next_states, dones): 21 | """Adds experience or list of experiences into the replay buffer""" 22 | if type(dones) == list: 23 | assert type(dones[0]) != list, "A done shouldn't be a list" 24 | experiences = [self.experience(state, action, reward, next_state, done) 25 | for state, action, reward, next_state, done in 26 | zip(states, actions, rewards, next_states, dones)] 27 | for experience in experiences: 28 | action = experience.action 29 | self.memories[action].append(experience) 30 | else: 31 | experience = self.experience(states, actions, rewards, next_states, dones) 32 | self.memories[actions].append(experience) 33 | 34 | def pick_experiences(self, num_experiences=None): 35 | """Picks the experiences that the sample function will return as a random sample of experiences. It works by picking 36 | an equal number of experiences that used each action (as far as possible)""" 37 | if num_experiences: batch_size = num_experiences 38 | else: batch_size = self.batch_size 39 | batch_per_action = self.calculate_batch_sizes_per_action(batch_size) 40 | samples_split_by_action = self.sample_each_action_equally(batch_per_action) 41 | combined_sample = [] 42 | for key in samples_split_by_action.keys(): 43 | combined_sample.extend(samples_split_by_action[key]) 44 | return combined_sample 45 | 46 | def calculate_batch_sizes_per_action(self, batch_size): 47 | """Calculates the batch size we need to randomly draw from each action to make sure there is equal coverage 48 | per action and that the batch gets filled up""" 49 | min_batch_per_action = int(batch_size / self.num_actions) 50 | batch_per_action = {k: min_batch_per_action for k in range(self.num_actions)} 51 | current_batch_size = np.sum([batch_per_action[k] for k in range(self.num_actions)]) 52 | remainder = batch_size - current_batch_size 53 | give_remainder_to = random.sample(range(self.num_actions), remainder) 54 | for action in give_remainder_to: 55 | batch_per_action[action] += 1 56 | return batch_per_action 57 | 58 | def sample_each_action_equally(self, batch_per_action): 59 | """Samples a number of experiences (determined by batch_per_action) from the memory buffer for each action""" 60 | samples = {} 61 | for action in range(self.num_actions): 62 | memory = self.memories[action] 63 | batch_size_for_action = batch_per_action[action] 64 | action_memory_size = len(memory) 65 | assert action_memory_size > 0, "Need at least 1 experience for each action" 66 | if action_memory_size >= batch_size_for_action: 67 | samples[action] = random.sample(memory, batch_size_for_action) 68 | else: 69 | print("Memory size {} vs. 
required batch size {}".format(action_memory_size, batch_size_for_action)) 70 | samples_for_action = [] 71 | while len(samples_for_action) < batch_per_action[action]: 72 | remainder = batch_per_action[action] - len(samples_for_action) 73 | sampled_experiences = random.sample(memory, min(remainder, action_memory_size)) 74 | samples_for_action.extend(sampled_experiences) 75 | samples[action] = samples_for_action 76 | return samples 77 | 78 | def __len__(self): 79 | return np.sum([len(memory) for memory in self.memories.values()]) 80 | 81 | def sample_experiences_with_certain_actions(self, allowed_actions, num_all_actions, required_batch_size): 82 | """Samples a number of experiences where the action conducted was in the list of required actions""" 83 | assert isinstance(allowed_actions, list) 84 | assert len(allowed_actions) > 0 85 | 86 | num_new_actions = len(allowed_actions) 87 | experiences_to_sample = int(required_batch_size * float(num_all_actions) / float(num_new_actions)) 88 | experiences = self.sample(num_experiences=experiences_to_sample) 89 | states, actions, rewards, next_states, dones = experiences 90 | matching_indexes = np.argwhere((np.in1d(actions.numpy(), allowed_actions))) 91 | assert matching_indexes.shape[1] == 1 92 | 93 | matching_indexes = matching_indexes[:, 0] 94 | 95 | states = states[matching_indexes] 96 | actions = actions[matching_indexes] 97 | rewards = rewards[matching_indexes] 98 | next_states = next_states[matching_indexes] 99 | dones = dones[matching_indexes] 100 | 101 | assert abs(states.shape[0] - required_batch_size) <= 0.05*required_batch_size, "{} vs. {}".format(states.shape[0], required_batch_size) 102 | 103 | 104 | return (states, actions, rewards, next_states, dones) 105 | -------------------------------------------------------------------------------- /tests/Test_DQN_HER.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import Counter 3 | 4 | import pytest 5 | 6 | from agents.DQN_agents.DQN_HER import DQN_HER 7 | from agents.DQN_agents.DDQN import DDQN 8 | from agents.DQN_agents.DDQN_With_Prioritised_Experience_Replay import DDQN_With_Prioritised_Experience_Replay 9 | from agents.DQN_agents.DQN_With_Fixed_Q_Targets import DQN_With_Fixed_Q_Targets 10 | from environments.Bit_Flipping_Environment import Bit_Flipping_Environment 11 | from agents.policy_gradient_agents.PPO import PPO 12 | from agents.Trainer import Trainer 13 | from utilities.data_structures.Config import Config 14 | from agents.DQN_agents.DQN import DQN 15 | import numpy as np 16 | import torch 17 | 18 | random.seed(1) 19 | np.random.seed(1) 20 | torch.manual_seed(1) 21 | 22 | config = Config() 23 | config.seed = 1 24 | config.environment = Bit_Flipping_Environment(4) 25 | config.num_episodes_to_run = 1 26 | config.file_to_save_data_results = None 27 | config.file_to_save_results_graph = None 28 | config.visualise_individual_results = False 29 | config.visualise_overall_agent_results = False 30 | config.randomise_random_seed = False 31 | config.runs_per_agent = 1 32 | config.use_GPU = False 33 | config.hyperparameters = { 34 | 35 | "DQN_Agents": { 36 | 37 | "learning_rate": 0.005, 38 | "batch_size": 3, 39 | "buffer_size": 40000, 40 | "epsilon": 0.1, 41 | "epsilon_decay_rate_denominator": 200, 42 | "discount_rate": 0.99, 43 | "tau": 0.1, 44 | "alpha_prioritised_replay": 0.6, 45 | "beta_prioritised_replay": 0.4, 46 | "incremental_td_error": 1e-8, 47 | "update_every_n_steps": 3, 48 | "linear_hidden_units": [20, 20, 
20], 49 | "final_layer_activation": "None", 50 | "batch_norm": False, 51 | "gradient_clipping_norm": 5, 52 | "HER_sample_proportion": 0.8, 53 | "clip_rewards": False 54 | } 55 | } 56 | 57 | 58 | trainer = Trainer(config, [DQN_HER]) 59 | config.hyperparameters = config.hyperparameters["DQN_Agents"] 60 | agent = DQN_HER(config) 61 | agent.reset_game() 62 | 63 | def test_initiation(): 64 | """Tests whether DQN_HER initiates correctly""" 65 | config.hyperparameters["batch_size"] = 64 66 | agent = DQN_HER(config) 67 | agent.reset_game() 68 | 69 | 70 | assert agent.ordinary_buffer_batch_size == int(0.2 * 64) 71 | assert agent.HER_buffer_batch_size == 64 - int(0.2 * 64) 72 | 73 | assert agent.q_network_local.input_dim == 8 74 | assert agent.q_network_local.output_layers[0].out_features == 4 75 | 76 | assert isinstance(agent.state_dict, dict) 77 | 78 | assert agent.observation.shape[0] == 4 79 | assert agent.desired_goal.shape[0] == 4 80 | assert agent.achieved_goal.shape[0] == 4 81 | 82 | assert agent.state.shape[0] == 8 83 | assert not agent.done 84 | assert agent.next_state is None 85 | assert agent.reward is None 86 | 87 | config.hyperparameters["batch_size"] = 3 88 | 89 | def test_action(): 90 | """Tests whether DQN_HER picks and conducts actions correctly""" 91 | num_tries = 1000 92 | actions = [] 93 | for _ in range(num_tries): 94 | action = agent.pick_action() 95 | actions.append(action) 96 | 97 | actions_count = Counter(actions) 98 | assert actions_count[0] > num_tries*0.1 99 | assert actions_count[1] > num_tries*0.1 100 | assert actions_count[2] > num_tries*0.1 101 | assert actions_count[3] > num_tries*0.1 102 | assert actions_count[0] + actions_count[1] + actions_count[2] + actions_count[3] == num_tries 103 | 104 | assert agent.next_state is None 105 | 106 | def test_tracks_changes_from_one_action(): 107 | """Tests that it tracks the changes as a result of actions correctly""" 108 | 109 | previous_obs = agent.observation 110 | previous_desired_goal = agent.desired_goal 111 | previous_achieved_goal = agent.achieved_goal 112 | 113 | agent.action = 0 114 | agent.conduct_action_in_changeable_goal_envs(agent.action) 115 | 116 | assert agent.next_state.shape[0] == 8 117 | assert isinstance(agent.next_state_dict, dict) 118 | assert not all (agent.observation == previous_obs) 119 | assert not all(agent.achieved_goal == previous_achieved_goal) 120 | assert all (agent.desired_goal == previous_desired_goal) 121 | 122 | agent.track_changeable_goal_episodes_data() 123 | 124 | with pytest.raises(Exception): 125 | agent.HER_memory.sample(1) 126 | 127 | agent.save_alternative_experience() 128 | 129 | sample = agent.HER_memory.sample(1) 130 | 131 | assert sample[1].item() == agent.action 132 | assert sample[2].item() == 4 133 | 134 | def test_tracks_changes_from_multiple_actions(): 135 | """Tests that it tracks the changes as a result of actions correctly""" 136 | 137 | agent = DQN_HER(config) 138 | agent.reset_game() 139 | 140 | for ix in range(4): 141 | previous_obs = agent.observation 142 | previous_desired_goal = agent.desired_goal 143 | previous_achieved_goal = agent.achieved_goal 144 | 145 | agent.action = ix 146 | agent.conduct_action_in_changeable_goal_envs(agent.action) 147 | 148 | assert agent.next_state.shape[0] == 8 149 | assert isinstance(agent.next_state_dict, dict) 150 | assert not all(agent.observation == previous_obs) 151 | assert not all(agent.achieved_goal == previous_achieved_goal) 152 | assert all(agent.desired_goal == previous_desired_goal) 153 | 154 | 
agent.track_changeable_goal_episodes_data() 155 | agent.save_experience() 156 | if agent.done: agent.save_alternative_experience() 157 | 158 | agent.state_dict = agent.next_state_dict # this is to set the state for the next iteration 159 | agent.state = agent.next_state 160 | 161 | states, actions, rewards, next_states, dones = agent.HER_memory.sample(4) 162 | 163 | assert all(states[1] == torch.Tensor([1.0, 1., 1., 1., 0., 0., 0. , 0.])) 164 | assert all(actions == torch.Tensor([[1.], [0.], [3.], [2.]])) 165 | assert all(rewards == torch.Tensor([[-1.], [-1.], [4.], [-1.]])) 166 | assert all(dones == torch.Tensor([[0.], [0.], [1.], [0.]])) 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /agents/actor_critic_agents/SAC_Discrete.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from agents.Base_Agent import Base_Agent 6 | from utilities.data_structures.Replay_Buffer import Replay_Buffer 7 | from agents.actor_critic_agents.SAC import SAC 8 | from utilities.Utility_Functions import create_actor_distribution 9 | 10 | class SAC_Discrete(SAC): 11 | """The Soft Actor Critic for discrete actions. It inherits from SAC for continuous actions and only changes a few 12 | methods.""" 13 | agent_name = "SAC" 14 | def __init__(self, config): 15 | Base_Agent.__init__(self, config) 16 | assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions" 17 | assert self.config.hyperparameters["Actor"]["final_layer_activation"] == "Softmax", "Final actor layer must be softmax" 18 | self.hyperparameters = config.hyperparameters 19 | self.critic_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic") 20 | self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, 21 | key_to_use="Critic", override_seed=self.config.seed + 1) 22 | self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(), 23 | lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) 24 | self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(), 25 | lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) 26 | self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, 27 | key_to_use="Critic") 28 | self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, 29 | key_to_use="Critic") 30 | Base_Agent.copy_model_over(self.critic_local, self.critic_target) 31 | Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2) 32 | self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], 33 | self.config.seed) 34 | 35 | self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") 36 | self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), 37 | lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) 38 | self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"] 39 | if self.automatic_entropy_tuning: 40 | # we set the max possible entropy as the target entropy 41 | self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98 42 | self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) 43 | self.alpha = 
self.log_alpha.exp() 44 | self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) 45 | else: 46 | self.alpha = self.hyperparameters["entropy_term_weight"] 47 | assert not self.hyperparameters["add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment" 48 | self.add_extra_noise = False 49 | self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"] 50 | 51 | def produce_action_and_action_info(self, state): 52 | """Given the state, produces an action, the probability of the action, the log probability of the action, and 53 | the argmax action""" 54 | action_probabilities = self.actor_local(state) 55 | max_probability_action = torch.argmax(action_probabilities).unsqueeze(0) 56 | action_distribution = create_actor_distribution(self.action_types, action_probabilities, self.action_size) 57 | action = action_distribution.sample().cpu() 58 | # Have to deal with situation of 0.0 probabilities because we can't do log 0 59 | z = action_probabilities == 0.0 60 | z = z.float() * 1e-8 61 | log_action_probabilities = torch.log(action_probabilities + z) 62 | return action, (action_probabilities, log_action_probabilities), max_probability_action 63 | 64 | def calculate_critic_losses(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch): 65 | """Calculates the losses for the two critics. This is the ordinary Q-learning loss except the additional entropy 66 | term is taken into account""" 67 | with torch.no_grad(): 68 | next_state_action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(next_state_batch) 69 | qf1_next_target = self.critic_target(next_state_batch) 70 | qf2_next_target = self.critic_target_2(next_state_batch) 71 | min_qf_next_target = action_probabilities * (torch.min(qf1_next_target, qf2_next_target) - self.alpha * log_action_probabilities) 72 | min_qf_next_target = min_qf_next_target.mean(dim=1).unsqueeze(-1) 73 | next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target) 74 | 75 | qf1 = self.critic_local(state_batch).gather(1, action_batch.long()) 76 | qf2 = self.critic_local_2(state_batch).gather(1, action_batch.long()) 77 | qf1_loss = F.mse_loss(qf1, next_q_value) 78 | qf2_loss = F.mse_loss(qf2, next_q_value) 79 | return qf1_loss, qf2_loss 80 | 81 | def calculate_actor_loss(self, state_batch): 82 | """Calculates the loss for the actor. 
This loss includes the additional entropy term""" 83 | action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(state_batch) 84 | qf1_pi = self.critic_local(state_batch) 85 | qf2_pi = self.critic_local_2(state_batch) 86 | min_qf_pi = torch.min(qf1_pi, qf2_pi) 87 | inside_term = self.alpha * log_action_probabilities - min_qf_pi 88 | policy_loss = action_probabilities * inside_term 89 | policy_loss = policy_loss.mean() 90 | log_action_probabilities = torch.sum(log_action_probabilities * action_probabilities, dim=1) 91 | return policy_loss, log_action_probabilities 92 | -------------------------------------------------------------------------------- /utilities/Utility_Functions.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | from abc import ABCMeta 5 | import torch 6 | from nn_builder.pytorch.NN import NN 7 | from torch.distributions import Categorical, normal, MultivariateNormal 8 | 9 | def abstract(cls): 10 | return ABCMeta(cls.__name__, cls.__bases__, dict(cls.__dict__)) 11 | 12 | def save_score_results(file_path, results): 13 | """Saves results as a numpy file at given path""" 14 | np.save(file_path, results) 15 | 16 | def normalise_rewards(rewards): 17 | """Normalises rewards to mean 0 and standard deviation 1""" 18 | mean_reward = np.mean(rewards) 19 | std_reward = np.std(rewards) 20 | return (rewards - mean_reward) / (std_reward + 1e-8) #1e-8 added for stability 21 | 22 | def create_actor_distribution(action_types, actor_output, action_size): 23 | """Creates a distribution that the actor can then use to randomly draw actions""" 24 | if action_types == "DISCRETE": 25 | assert actor_output.size()[1] == action_size, "Actor output the wrong size" 26 | action_distribution = Categorical(actor_output) # this creates a distribution to sample from 27 | else: 28 | assert actor_output.size()[1] == action_size * 2, "Actor output the wrong size" 29 | means = actor_output[:, :action_size].squeeze(0) 30 | stds = actor_output[:, action_size:].squeeze(0) 31 | if len(means.shape) == 2: means = means.squeeze(-1) 32 | if len(stds.shape) == 2: stds = stds.squeeze(-1) 33 | if len(stds.shape) > 1 or len(means.shape) > 1: 34 | raise ValueError("Wrong mean and std shapes - {} -- {}".format(stds.shape, means.shape)) 35 | action_distribution = normal.Normal(means.squeeze(0), torch.abs(stds)) 36 | return action_distribution 37 | 38 | class SharedAdam(torch.optim.Adam): 39 | """Creates an adam optimizer object that is shareable between processes. Useful for algorithms like A3C. Code 40 | taken from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py""" 41 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False): 42 | super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad) 43 | for group in self.param_groups: 44 | for p in group['params']: 45 | state = self.state[p] 46 | state['step'] = torch.zeros(1) 47 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 48 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 49 | 50 | def share_memory(self): 51 | for group in self.param_groups: 52 | for p in group['params']: 53 | state = self.state[p] 54 | state['step'].share_memory_() 55 | state['exp_avg'].share_memory_() 56 | state['exp_avg_sq'].share_memory_() 57 | 58 | def step(self, closure=None): 59 | """Performs a single optimization step. 
60 | Arguments: 61 | closure (callable, optional): A closure that reevaluates the model 62 | and returns the loss. 63 | """ 64 | loss = None 65 | if closure is not None: 66 | loss = closure() 67 | for group in self.param_groups: 68 | for p in group['params']: 69 | if p.grad is None: 70 | continue 71 | grad = p.grad.data 72 | amsgrad = group['amsgrad'] 73 | state = self.state[p] 74 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 75 | if amsgrad: 76 | max_exp_avg_sq = state['max_exp_avg_sq'] 77 | beta1, beta2 = group['betas'] 78 | state['step'] += 1 79 | if group['weight_decay'] != 0: 80 | grad = grad.add(group['weight_decay'], p.data) 81 | # Decay the first and second moment running average coefficient 82 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 83 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 84 | if amsgrad: 85 | # Maintains the maximum of all 2nd moment running avg. till now 86 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 87 | # Use the max. for normalizing running avg. of gradient 88 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 89 | else: 90 | denom = exp_avg_sq.sqrt().add_(group['eps']) 91 | bias_correction1 = 1 - beta1 ** state['step'].item() 92 | bias_correction2 = 1 - beta2 ** state['step'].item() 93 | step_size = group['lr'] * math.sqrt( 94 | bias_correction2) / bias_correction1 95 | 96 | p.data.addcdiv_(-step_size, exp_avg, denom) 97 | return loss 98 | 99 | def flatten_action_id_to_actions(action_id_to_actions, global_action_id_to_primitive_action, num_primitive_actions): 100 | """Converts the values in an action_id_to_actions dictionary back to the primitive actions they represent""" 101 | flattened_action_id_to_actions = {} 102 | for key in action_id_to_actions.keys(): 103 | actions = action_id_to_actions[key] 104 | raw_actions = backtrack_action_to_primitive_actions(actions, global_action_id_to_primitive_action, num_primitive_actions) 105 | flattened_action_id_to_actions[key] = raw_actions 106 | return flattened_action_id_to_actions 107 | 108 | def backtrack_action_to_primitive_actions(action_tuple, global_action_id_to_primitive_action, num_primitive_actions): 109 | """Converts an action tuple back to the primitive actions it represents in a recursive way.""" 110 | print("Recursing to backtrack on ", action_tuple) 111 | primitive_actions = range(num_primitive_actions) 112 | if all(action in primitive_actions for action in action_tuple): return action_tuple #base case 113 | new_action_tuple = [] 114 | for action in action_tuple: 115 | if action in primitive_actions: new_action_tuple.append(action) 116 | else: 117 | converted_action = global_action_id_to_primitive_action[action] 118 | print(new_action_tuple) 119 | new_action_tuple.extend(converted_action) 120 | print("Should have changed: ", new_action_tuple) 121 | new_action_tuple = tuple(new_action_tuple) 122 | return backtrack_action_to_primitive_actions(new_action_tuple, global_action_id_to_primitive_action, num_primitive_actions) # pass the mappings through the recursive call 123 | -------------------------------------------------------------------------------- /agents/DQN_agents/DQN.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import torch 4 | import random 5 | import torch.optim as optim 6 | import torch.nn.functional as F 7 | import numpy as np 8 | from agents.Base_Agent import Base_Agent 9 | from exploration_strategies.Epsilon_Greedy_Exploration import Epsilon_Greedy_Exploration 10 | from utilities.data_structures.Replay_Buffer import Replay_Buffer 11 | 12 | class DQN(Base_Agent): 13 | """A deep
Q learning agent""" 14 | agent_name = "DQN" 15 | def __init__(self, config): 16 | Base_Agent.__init__(self, config) 17 | self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed) 18 | self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size) 19 | self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), 20 | lr=self.hyperparameters["learning_rate"], eps=1e-4) 21 | self.exploration_strategy = Epsilon_Greedy_Exploration(config) 22 | 23 | def reset_game(self): 24 | super(DQN, self).reset_game() 25 | self.update_learning_rate(self.hyperparameters["learning_rate"], self.q_network_optimizer) 26 | 27 | def step(self): 28 | """Runs a step within a game including a learning step if required""" 29 | while not self.done: 30 | self.action = self.pick_action() 31 | self.conduct_action(self.action) 32 | if self.time_for_q_network_to_learn(): 33 | for _ in range(self.hyperparameters["learning_iterations"]): 34 | self.learn() 35 | self.save_experience() 36 | self.state = self.next_state #this is to set the state for the next iteration 37 | self.global_step_number += 1 38 | self.episode_number += 1 39 | 40 | def pick_action(self, state=None): 41 | """Uses the local Q network and an epsilon greedy policy to pick an action""" 42 | # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add 43 | # a "fake" dimension to make it a mini-batch rather than a single observation 44 | if state is None: state = self.state 45 | if isinstance(state, np.int64) or isinstance(state, int): state = np.array([state]) 46 | state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) 47 | if len(state.shape) < 2: state = state.unsqueeze(0) 48 | self.q_network_local.eval() #puts network in evaluation mode 49 | with torch.no_grad(): 50 | action_values = self.q_network_local(state) 51 | self.q_network_local.train() #puts network back in training mode 52 | action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action_values": action_values, 53 | "turn_off_exploration": self.turn_off_exploration, 54 | "episode_number": self.episode_number}) 55 | self.logger.info("Q values {} -- Action chosen {}".format(action_values, action)) 56 | return action 57 | 58 | def learn(self, experiences=None): 59 | """Runs a learning iteration for the Q network""" 60 | if experiences is None: states, actions, rewards, next_states, dones = self.sample_experiences() #Sample experiences 61 | else: states, actions, rewards, next_states, dones = experiences 62 | loss = self.compute_loss(states, next_states, rewards, actions, dones) 63 | 64 | actions_list = [action_X.item() for action_X in actions ] 65 | 66 | self.logger.info("Action counts {}".format(Counter(actions_list))) 67 | self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss, self.hyperparameters["gradient_clipping_norm"]) 68 | 69 | def compute_loss(self, states, next_states, rewards, actions, dones): 70 | """Computes the loss required to train the Q network""" 71 | with torch.no_grad(): 72 | Q_targets = self.compute_q_targets(next_states, rewards, dones) 73 | Q_expected = self.compute_expected_q_values(states, actions) 74 | loss = F.mse_loss(Q_expected, Q_targets) 75 | return loss 76 | 77 | def compute_q_targets(self, next_states, rewards, dones): 78 | """Computes the q_targets we will compare to predicted q values to create the loss to train the Q network""" 79 | Q_targets_next = 
self.compute_q_values_for_next_states(next_states) 80 | Q_targets = self.compute_q_values_for_current_states(rewards, Q_targets_next, dones) 81 | return Q_targets 82 | 83 | def compute_q_values_for_next_states(self, next_states): 84 | """Computes the q_values for next state we will use to create the loss to train the Q network""" 85 | Q_targets_next = self.q_network_local(next_states).detach().max(1)[0].unsqueeze(1) 86 | return Q_targets_next 87 | 88 | def compute_q_values_for_current_states(self, rewards, Q_targets_next, dones): 89 | """Computes the q_values for current state we will use to create the loss to train the Q network""" 90 | Q_targets_current = rewards + (self.hyperparameters["discount_rate"] * Q_targets_next * (1 - dones)) 91 | return Q_targets_current 92 | 93 | def compute_expected_q_values(self, states, actions): 94 | """Computes the expected q_values we will use to create the loss to train the Q network""" 95 | Q_expected = self.q_network_local(states).gather(1, actions.long()) #must convert actions to long so can be used as index 96 | return Q_expected 97 | 98 | def locally_save_policy(self): 99 | """Saves the policy""" 100 | torch.save(self.q_network_local.state_dict(), "Models/{}_local_network.pt".format(self.agent_name)) 101 | 102 | def time_for_q_network_to_learn(self): 103 | """Returns boolean indicating whether enough steps have been taken for learning to begin and there are 104 | enough experiences in the replay buffer to learn from""" 105 | return self.right_amount_of_steps_taken() and self.enough_experiences_to_learn_from() 106 | 107 | def right_amount_of_steps_taken(self): 108 | """Returns boolean indicating whether enough steps have been taken for learning to begin""" 109 | return self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0 110 | 111 | def sample_experiences(self): 112 | """Draws a random sample of experience from the memory buffer""" 113 | experiences = self.memory.sample() 114 | states, actions, rewards, next_states, dones = experiences 115 | return states, actions, rewards, next_states, dones -------------------------------------------------------------------------------- /agents/actor_critic_agents/DDPG.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as functional 3 | from torch import optim 4 | from agents.Base_Agent import Base_Agent 5 | from utilities.data_structures.Replay_Buffer import Replay_Buffer 6 | from exploration_strategies.OU_Noise_Exploration import OU_Noise_Exploration 7 | 8 | class DDPG(Base_Agent): 9 | """A DDPG Agent""" 10 | agent_name = "DDPG" 11 | 12 | def __init__(self, config): 13 | Base_Agent.__init__(self, config) 14 | self.hyperparameters = config.hyperparameters 15 | self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") 16 | self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") 17 | Base_Agent.copy_model_over(self.critic_local, self.critic_target) 18 | 19 | self.critic_optimizer = optim.Adam(self.critic_local.parameters(), 20 | lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) 21 | self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], 22 | self.config.seed) 23 | self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") 24 | self.actor_target = 
self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") 25 | Base_Agent.copy_model_over(self.actor_local, self.actor_target) 26 | 27 | self.actor_optimizer = optim.Adam(self.actor_local.parameters(), 28 | lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) 29 | self.exploration_strategy = OU_Noise_Exploration(self.config) 30 | 31 | def step(self): 32 | """Runs a step in the game""" 33 | while not self.done: 34 | # print("State ", self.state.shape) 35 | self.action = self.pick_action() 36 | self.conduct_action(self.action) 37 | if self.time_for_critic_and_actor_to_learn(): 38 | for _ in range(self.hyperparameters["learning_updates_per_learning_session"]): 39 | states, actions, rewards, next_states, dones = self.sample_experiences() 40 | self.critic_learn(states, actions, rewards, next_states, dones) 41 | self.actor_learn(states) 42 | self.save_experience() 43 | self.state = self.next_state #this is to set the state for the next iteration 44 | self.global_step_number += 1 45 | self.episode_number += 1 46 | 47 | def sample_experiences(self): 48 | return self.memory.sample() 49 | 50 | def pick_action(self, state=None): 51 | """Picks an action using the actor network and then adds some noise to it to ensure exploration""" 52 | if state is None: state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device) 53 | self.actor_local.eval() 54 | with torch.no_grad(): 55 | action = self.actor_local(state).cpu().data.numpy() 56 | self.actor_local.train() 57 | action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action": action}) 58 | return action.squeeze(0) 59 | 60 | def critic_learn(self, states, actions, rewards, next_states, dones): 61 | """Runs a learning iteration for the critic""" 62 | loss = self.compute_loss(states, next_states, rewards, actions, dones) 63 | self.take_optimisation_step(self.critic_optimizer, self.critic_local, loss, self.hyperparameters["Critic"]["gradient_clipping_norm"]) 64 | self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"]) 65 | 66 | def compute_loss(self, states, next_states, rewards, actions, dones): 67 | """Computes the loss for the critic""" 68 | with torch.no_grad(): 69 | critic_targets = self.compute_critic_targets(next_states, rewards, dones) 70 | critic_expected = self.compute_expected_critic_values(states, actions) 71 | loss = functional.mse_loss(critic_expected, critic_targets) 72 | return loss 73 | 74 | def compute_critic_targets(self, next_states, rewards, dones): 75 | """Computes the critic target values to be used in the loss for the critic""" 76 | critic_targets_next = self.compute_critic_values_for_next_states(next_states) 77 | critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones) 78 | return critic_targets 79 | 80 | def compute_critic_values_for_next_states(self, next_states): 81 | """Computes the critic values for next states to be used in the loss for the critic""" 82 | with torch.no_grad(): 83 | actions_next = self.actor_target(next_states) 84 | critic_targets_next = self.critic_target(torch.cat((next_states, actions_next), 1)) 85 | return critic_targets_next 86 | 87 | def compute_critic_values_for_current_states(self, rewards, critic_targets_next, dones): 88 | """Computes the critic values for current states to be used in the loss for the critic""" 89 | critic_targets_current = rewards + (self.hyperparameters["discount_rate"] * critic_targets_next * (1.0 - dones)) 90 | 
return critic_targets_current 91 | 92 | def compute_expected_critic_values(self, states, actions): 93 | """Computes the expected critic values to be used in the loss for the critic""" 94 | critic_expected = self.critic_local(torch.cat((states, actions), 1)) 95 | return critic_expected 96 | 97 | def time_for_critic_and_actor_to_learn(self): 98 | """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the 99 | actor and critic""" 100 | return self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0 101 | 102 | def actor_learn(self, states): 103 | """Runs a learning iteration for the actor""" 104 | if self.done: #we only update the learning rate at end of each episode 105 | self.update_learning_rate(self.hyperparameters["Actor"]["learning_rate"], self.actor_optimizer) 106 | actor_loss = self.calculate_actor_loss(states) 107 | self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss, 108 | self.hyperparameters["Actor"]["gradient_clipping_norm"]) 109 | self.soft_update_of_target_network(self.actor_local, self.actor_target, self.hyperparameters["Actor"]["tau"]) 110 | 111 | def calculate_actor_loss(self, states): 112 | """Calculates the loss for the actor""" 113 | actions_pred = self.actor_local(states) 114 | actor_loss = -self.critic_local(torch.cat((states, actions_pred), 1)).mean() 115 | return actor_loss -------------------------------------------------------------------------------- /results/Taxi.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from agents.DQN_agents.DDQN import DDQN 4 | from agents.hierarchical_agents.HRL.HRL import HRL 5 | from agents.hierarchical_agents.HRL.Model_HRL import Model_HRL 6 | from agents.Trainer import Trainer 7 | from utilities.data_structures.Config import Config 8 | 9 | config = Config() 10 | config.seed = 1 11 | config.environment = gym.make("CartPole-v0") 12 | config.env_parameters = {} 13 | config.num_episodes_to_run = 500 14 | config.file_to_save_data_results = "data_and_graphs/hrl_experiments/Cart_Pole_data.pkl" 15 | config.file_to_save_results_graph = "data_and_graphs/hrl_experiments/Cart_Pole.png" 16 | config.show_solution_score = False 17 | config.visualise_individual_results = False 18 | config.visualise_overall_agent_results = True 19 | config.standard_deviation_results = 1.0 20 | config.runs_per_agent = 10 21 | config.use_GPU = False 22 | config.overwrite_existing_results_file = False 23 | config.randomise_random_seed = True 24 | config.save_model = False 25 | 26 | 27 | # Loss is not drawing a random sample! Otherwise it wouldn't jump around that much!
28 | 29 | linear_hidden_units = [32, 32] 30 | learning_rate = 0.005 # 0.001 taxi 31 | buffer_size = 1000000 32 | batch_size = 256 33 | batch_norm = False 34 | embedding_dimensionality = 10 35 | gradient_clipping_norm = 0.5 #needs to be optimised 36 | update_every_n_steps = 1 37 | learning_iterations = 1 38 | epsilon_decay_rate_denominator = 2 #150 39 | episodes_per_round = 50 #80 40 | discount_rate = 0.99 41 | tau = 0.004 42 | sequitur_k = 2 43 | pre_training_learning_iterations_multiplier = 0 44 | episodes_to_run_with_no_exploration = 0 45 | action_balanced_replay_buffer = True 46 | copy_over_hidden_layers = True 47 | 48 | num_top_results_to_use = 10 49 | action_frequency_required_in_top_results = 0.8 50 | 51 | random_episodes_to_run = 0 52 | 53 | action_length_reward_bonus = 0.0 54 | 55 | only_train_new_actions = True 56 | only_train_final_layer = True 57 | reduce_macro_action_appearance_cutoff_throughout_training = False 58 | add_1_macro_action_at_a_time = True 59 | 60 | calculate_q_values_as_increments = True 61 | abandon_ship = True 62 | clip_rewards = True 63 | use_relative_counts = True 64 | 65 | config.debug_mode = False 66 | 67 | config.hyperparameters = { 68 | 69 | "HRL": { 70 | "linear_hidden_units": linear_hidden_units, 71 | "learning_rate": learning_rate, 72 | "buffer_size": buffer_size, 73 | "batch_size": batch_size, 74 | "final_layer_activation": "None", 75 | # "columns_of_data_to_be_embedded": [0], 76 | # "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 77 | "batch_norm": batch_norm, 78 | "gradient_clipping_norm": gradient_clipping_norm, 79 | "update_every_n_steps": update_every_n_steps, 80 | "epsilon_decay_rate_denominator": epsilon_decay_rate_denominator, 81 | "discount_rate": discount_rate, 82 | "learning_iterations": learning_iterations, 83 | "tau": tau, 84 | "sequitur_k": sequitur_k, 85 | "use_relative_counts": use_relative_counts, 86 | "action_length_reward_bonus": action_length_reward_bonus, 87 | "pre_training_learning_iterations_multiplier": pre_training_learning_iterations_multiplier, 88 | "episodes_to_run_with_no_exploration": episodes_to_run_with_no_exploration, 89 | "action_balanced_replay_buffer": action_balanced_replay_buffer, 90 | "copy_over_hidden_layers": copy_over_hidden_layers, 91 | "random_episodes_to_run": random_episodes_to_run, 92 | "only_train_new_actions": only_train_new_actions, 93 | "only_train_final_layer": only_train_final_layer, 94 | "num_top_results_to_use": num_top_results_to_use, 95 | "action_frequency_required_in_top_results": action_frequency_required_in_top_results, 96 | "reduce_macro_action_appearance_cutoff_throughout_training": reduce_macro_action_appearance_cutoff_throughout_training, 97 | "add_1_macro_action_at_a_time": add_1_macro_action_at_a_time, 98 | "calculate_q_values_as_increments": calculate_q_values_as_increments, 99 | "episodes_per_round": episodes_per_round, 100 | "abandon_ship": abandon_ship, 101 | "clip_rewards": clip_rewards 102 | }, 103 | 104 | "DQN_Agents": { 105 | "linear_hidden_units": linear_hidden_units, 106 | "learning_rate": learning_rate, 107 | "buffer_size": buffer_size, 108 | "batch_size": batch_size, 109 | "final_layer_activation": "None", 110 | # "columns_of_data_to_be_embedded": [0], 111 | # "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 112 | "batch_norm": batch_norm, 113 | "gradient_clipping_norm": gradient_clipping_norm, 114 | "update_every_n_steps": update_every_n_steps, 115 | "epsilon_decay_rate_denominator": 
epsilon_decay_rate_denominator, 116 | "discount_rate": discount_rate, 117 | "learning_iterations": learning_iterations, 118 | "tau": tau, 119 | "clip_rewards": clip_rewards 120 | }, 121 | 122 | "Actor_Critic_Agents": { 123 | "Actor": { 124 | "learning_rate": 0.0003, 125 | "linear_hidden_units": [64, 64], 126 | "final_layer_activation": "Softmax", 127 | # "columns_of_data_to_be_embedded": [0], 128 | # "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 129 | "batch_norm": False, 130 | "tau": 0.005, 131 | "gradient_clipping_norm": 5, 132 | "initialiser": "Xavier" 133 | }, 134 | 135 | "Critic": { 136 | "learning_rate": 0.0003, 137 | "linear_hidden_units": [64, 64], 138 | "final_layer_activation": None, 139 | # "columns_of_data_to_be_embedded": [0], 140 | # "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 141 | "batch_norm": False, 142 | "buffer_size": 1000000, 143 | "tau": 0.005, 144 | "gradient_clipping_norm": 5, 145 | "initialiser": "Xavier" 146 | }, 147 | 148 | "min_steps_before_learning": 10000, 149 | "batch_size": 256, 150 | "discount_rate": 0.99, 151 | "mu": 0.0, # for OU noise 152 | "theta": 0.15, # for OU noise 153 | "sigma": 0.25, # for OU noise 154 | "action_noise_std": 0.2, # for TD3 155 | "action_noise_clipping_range": 0.5, # for TD3 156 | "update_every_n_steps": 1, 157 | "learning_updates_per_learning_session": 1, 158 | "automatically_tune_entropy_hyperparameter": True, 159 | "entropy_term_weight": None, 160 | "add_extra_noise": False, 161 | "do_evaluation_iterations": True, 162 | "clip_rewards": clip_rewards 163 | } 164 | } 165 | 166 | 167 | if __name__ == "__main__": 168 | AGENTS = [HRL, DDQN] # alternatives tried in commented-out runs: SAC_Discrete, SNN_HRL, DQN, h_DQN 169 | trainer = Trainer(config, AGENTS) 170 | trainer.run_games_for_agents() 171 | 172 | 173 | 174 | --------------------------------------------------------------------------------
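
Usage note: the experiment scripts above all follow the same Config/Trainer pattern. The snippet below is a minimal, illustrative sketch (it is not a file in this repository) showing how the plain DQN agent could be run on CartPole with that pattern. It assumes the Config, Trainer and DQN interfaces shown above, and every hyperparameter value in it is a placeholder rather than a tuned setting.

import gym

from agents.DQN_agents.DQN import DQN
from agents.Trainer import Trainer
from utilities.data_structures.Config import Config

config = Config()
config.seed = 1
config.environment = gym.make("CartPole-v0")
config.num_episodes_to_run = 300          # placeholder episode budget
config.file_to_save_data_results = None
config.file_to_save_results_graph = None
config.show_solution_score = False
config.visualise_individual_results = False
config.visualise_overall_agent_results = True
config.standard_deviation_results = 1.0
config.runs_per_agent = 1
config.use_GPU = False
config.overwrite_existing_results_file = False
config.randomise_random_seed = True
config.save_model = False

# Hyperparameters are grouped under the agent-family key, mirroring the configs above.
# The values below are illustrative placeholders, not tuned settings.
config.hyperparameters = {
    "DQN_Agents": {
        "learning_rate": 0.005,
        "batch_size": 64,
        "buffer_size": 40000,
        "epsilon": 0.1,
        "epsilon_decay_rate_denominator": 200,
        "discount_rate": 0.99,
        "tau": 0.1,
        "update_every_n_steps": 3,
        "learning_iterations": 1,
        "linear_hidden_units": [30, 15],
        "final_layer_activation": "None",
        "batch_norm": False,
        "gradient_clipping_norm": 5,
        "clip_rewards": False
    }
}

if __name__ == "__main__":
    # Trainer runs each agent class in the list for config.num_episodes_to_run episodes
    trainer = Trainer(config, [DQN])
    trainer.run_games_for_agents()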