├── environments ├── ant_environments │ ├── __init__.py │ ├── README.md │ ├── ant_maze_env.py │ ├── point_maze_env.py │ ├── point.py │ ├── create_maze_env.py │ ├── maze_env_utils.py │ ├── ant.py │ └── assets │ │ └── ant.xml ├── Atari_Environment.py ├── Ant_Navigation_Environments.py ├── Long_Corridor_Environment.py └── Bit_Flipping_Environment.py ├── utilities ├── RL_image.jpeg ├── PyTorch-logo-2.jpg ├── data_structures │ ├── Node.py │ ├── Config.py │ ├── Deque.py │ ├── Tanh_Distribution.py │ ├── Replay_Buffer.py │ ├── Max_Heap.py │ └── Action_Balanced_Replay_Buffer.py ├── OU_Noise.py ├── Tensorboard.py ├── Deepmind_RMS_Prop.py ├── Parallel_Experience_Generator.py ├── Memory_Shaper.py └── Utility_Functions.py ├── requirements.txt ├── results ├── data_and_graphs │ ├── Taxi_data.pkl │ ├── Four_Rooms.pkl │ ├── Four_Rooms.png │ ├── Taxi_graph.png │ ├── HER_Experiments.png │ ├── Hopper_Results_Data.pkl │ ├── Long_Corridor_Graph.png │ ├── Walker_Results_Data.pkl │ ├── Hopper_Results_Graph.png │ ├── Taxi_graph_comparison.png │ ├── Walker_Results_Graph.png │ ├── hrl_experiments │ │ ├── Taxi.png │ │ ├── Cart_Pole.png │ │ ├── Taxi_data.pkl │ │ ├── Cart_Pole_data.pkl │ │ └── Taxi_graph_comparison.png │ ├── Cart_Pole_Results_Data.pkl │ ├── Cart_Pole_Results_Graph.png │ ├── Fetch_Reach_Results_Data.pkl │ ├── Bit_Flipping_Results_Data.pkl │ ├── Bit_Flipping_Results_Graph.png │ ├── Fetch_Reach_Results_Graph.png │ ├── Long_Corridor_Results_Data.pkl │ ├── Mountain_Car_Results_Data.pkl │ ├── Mountain_Car_Results_Graph.png │ ├── Four_Rooms_and_Long_Corridor.png │ ├── Long_Corridor_Results_Graph.png │ ├── CartPole_and_MountainCar_Graph.png │ ├── Hopper_Results_Graph_Both_Agents.png │ └── Plot_Sets_Of_Results.py ├── Bit_Flipping.py ├── Fetch_Reach.py ├── Mountain_Car.py ├── Hopper.py ├── Walker.py ├── Reacher.py ├── HRL_Experiments.py ├── Cart_Pole.py ├── Four_Rooms.py ├── Long_Corridor.py └── Taxi.py ├── .gitignore ├── tests ├── Test_Max_Heap.py ├── Test_Deque.py ├── Test_Trainer.py ├── Test_Bit_Flipping_Environment.py ├── Test_HRL.py ├── Test_Action_Balanced_Replay_Buffer.py ├── Test_Prioritised_Replay_Buffer.py ├── Test_Four_Rooms_Environment.py └── Test_DQN_HER.py ├── exploration_strategies ├── Base_Exploration_Strategy.py ├── OU_Noise_Exploration.py ├── Gaussian_Exploration.py └── Epsilon_Greedy_Exploration.py ├── agents ├── DQN_agents │ ├── DDQN.py │ ├── DQN_With_Fixed_Q_Targets.py │ ├── DQN_HER.py │ ├── DDQN_With_Prioritised_Experience_Replay.py │ ├── Dueling_DDQN.py │ └── DQN.py ├── actor_critic_agents │ ├── A2C.py │ ├── DDPG_HER.py │ ├── TD3.py │ ├── SAC_Discrete.py │ └── DDPG.py ├── policy_gradient_agents │ └── REINFORCE.py └── HER_Base.py └── .travis.yml /environments/ant_environments/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /utilities/RL_image.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/utilities/RL_image.jpeg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.15.2 2 | torch==0.4.1.post2 3 | matplotlib==3.0.0 4 | PyVirtualDisplay==0.2.1 5 | gym==0.10.9 6 | nn_builder 7 | tensorflow 8 | -------------------------------------------------------------------------------- 
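The pins above target fairly old releases (PyTorch 0.4.1, gym 0.10.9, NumPy 1.15.2). The snippet below is a hypothetical helper, not a file from this repository: a minimal sketch that warns at start-up if the installed versions drift from the pins, assuming only that numpy, torch and gym expose the standard __version__ attribute.

import numpy
import torch
import gym

# Versions pinned in requirements.txt
EXPECTED_VERSIONS = {"numpy": "1.15.2", "torch": "0.4.1.post2", "gym": "0.10.9"}

def warn_on_version_drift():
    """Prints a warning for every package whose installed version differs from the pin."""
    installed = {"numpy": numpy.__version__, "torch": torch.__version__, "gym": gym.__version__}
    for package, pinned in EXPECTED_VERSIONS.items():
        if installed[package] != pinned:
            print("WARNING: %s==%s is installed but requirements.txt pins %s"
                  % (package, installed[package], pinned))

if __name__ == "__main__":
    warn_on_version_drift()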
/utilities/PyTorch-logo-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/utilities/PyTorch-logo-2.jpg -------------------------------------------------------------------------------- /results/data_and_graphs/Taxi_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Taxi_data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Four_Rooms.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Four_Rooms.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Four_Rooms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Four_Rooms.png -------------------------------------------------------------------------------- /results/data_and_graphs/Taxi_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Taxi_graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/HER_Experiments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/HER_Experiments.png -------------------------------------------------------------------------------- /results/data_and_graphs/Hopper_Results_Data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Hopper_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Long_Corridor_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Long_Corridor_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/Walker_Results_Data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Walker_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Hopper_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Hopper_Results_Graph.png -------------------------------------------------------------------------------- 
/results/data_and_graphs/Taxi_graph_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Taxi_graph_comparison.png -------------------------------------------------------------------------------- /results/data_and_graphs/Walker_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Walker_Results_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/hrl_experiments/Taxi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/hrl_experiments/Taxi.png -------------------------------------------------------------------------------- /results/data_and_graphs/Cart_Pole_Results_Data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Cart_Pole_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Cart_Pole_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Cart_Pole_Results_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/Fetch_Reach_Results_Data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Fetch_Reach_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Bit_Flipping_Results_Data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Bit_Flipping_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Bit_Flipping_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Bit_Flipping_Results_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/Fetch_Reach_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Fetch_Reach_Results_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/Long_Corridor_Results_Data.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Long_Corridor_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Mountain_Car_Results_Data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Mountain_Car_Results_Data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Mountain_Car_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Mountain_Car_Results_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/hrl_experiments/Cart_Pole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/hrl_experiments/Cart_Pole.png -------------------------------------------------------------------------------- /results/data_and_graphs/hrl_experiments/Taxi_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/hrl_experiments/Taxi_data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Four_Rooms_and_Long_Corridor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Four_Rooms_and_Long_Corridor.png -------------------------------------------------------------------------------- /results/data_and_graphs/Long_Corridor_Results_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Long_Corridor_Results_Graph.png -------------------------------------------------------------------------------- /environments/ant_environments/README.md: -------------------------------------------------------------------------------- 1 | NOTE that all code in this folder came directly from the github repo https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 2 | and is not my code. 
-------------------------------------------------------------------------------- /results/data_and_graphs/CartPole_and_MountainCar_Graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/CartPole_and_MountainCar_Graph.png -------------------------------------------------------------------------------- /results/data_and_graphs/hrl_experiments/Cart_Pole_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/hrl_experiments/Cart_Pole_data.pkl -------------------------------------------------------------------------------- /results/data_and_graphs/Hopper_Results_Graph_Both_Agents.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/Hopper_Results_Graph_Both_Agents.png -------------------------------------------------------------------------------- /results/data_and_graphs/hrl_experiments/Taxi_graph_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/findmyway/Deep-Reinforcement-Learning-Algorithms-with-PyTorch/master/results/data_and_graphs/hrl_experiments/Taxi_graph_comparison.png -------------------------------------------------------------------------------- /environments/ant_environments/ant_maze_env.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # NOTE THIS CODE IS TAKEN FROM https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 4 | # and is not my code. 5 | 6 | 7 | 8 | from ant_environments.maze_env import MazeEnv 9 | from ant_environments.ant import AntEnv 10 | 11 | 12 | class AntMazeEnv(MazeEnv): 13 | MODEL_CLASS = AntEnv 14 | -------------------------------------------------------------------------------- /environments/ant_environments/point_maze_env.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # NOTE THIS CODE IS TAKEN FROM https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 5 | # and is not my code. 
6 | 7 | 8 | 9 | 10 | 11 | from ant_environments.maze_env import MazeEnv 12 | from ant_environments.point import PointEnv 13 | 14 | 15 | class PointMazeEnv(MazeEnv): 16 | MODEL_CLASS = PointEnv 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *cache 3 | *.idea 4 | *.log 5 | *logs/ 6 | logs/ 7 | runs/ 8 | *__pycache__ 9 | .DS_Store 10 | drlnd/ 11 | *Banana.app/ 12 | *deep-reinforcement-learning/ 13 | *venv/ 14 | *playground_test_runs.py 15 | Game_Files/ 16 | mjpro150/ 17 | mujoco200_macos/ 18 | *to_do_list 19 | Random_Junkyard/ 20 | Notebook.ipynb 21 | Results/Notebook.ipynb 22 | *.ipynb_checkpoints 23 | *.drive_access_key.json 24 | drive_access_key.json 25 | drive_access_key 26 | settings.json 27 | launch.json 28 | results/data_and_graphs/Cart_Pole_Results_Data.pkl 29 | results/data_and_graphs/Cart_Pole_Results_Graph.png 30 | -------------------------------------------------------------------------------- /utilities/data_structures/Node.py: -------------------------------------------------------------------------------- 1 | class Node(object): 2 | """Generic Node class. Used in the implementation of a prioritised replay buffer""" 3 | def __init__(self, key, value): 4 | self.key = key 5 | self.value = value 6 | 7 | def update_key_and_value(self, new_key, new_value): 8 | self.update_key(new_key) 9 | self.update_value(new_value) 10 | 11 | def update_key(self, new_key): 12 | self.key = new_key 13 | 14 | def update_value(self, new_value): 15 | self.value = new_value 16 | 17 | def __eq__(self, other): 18 | return self.key == other.key and self.value == other.value -------------------------------------------------------------------------------- /tests/Test_Max_Heap.py: -------------------------------------------------------------------------------- 1 | import random 2 | from utilities.data_structures.Max_Heap import Max_Heap 3 | import numpy as np 4 | from utilities.data_structures.Node import Node 5 | 6 | 7 | def test_heap_always_keeps_max_element_at_top(): 8 | max_heap_size = 200 9 | for _ in range(100): 10 | heap = Max_Heap(max_heap_size, 2, 0) 11 | elements_added = [] 12 | for ix in range(1, 100): 13 | element = random.random() 14 | elements_added.append(element) 15 | heap.update_element_and_reorganise_heap(ix, Node(element, (None, None))) 16 | 17 | max_key = np.max(elements_added) 18 | assert round(heap.give_max_key(), 8) == round(max_key, 8), "{}".format(elements_added) 19 | 20 | -------------------------------------------------------------------------------- /tests/Test_Deque.py: -------------------------------------------------------------------------------- 1 | from utilities.data_structures.Deque import Deque 2 | from utilities.data_structures.Node import Node 3 | 4 | 5 | def test_Deque_initialisation(): 6 | 7 | deque = Deque(2, 1) 8 | assert all(deque.deque == [Node(0, (None,)), Node(0, (None,))]) 9 | 10 | def test_Deque_adding_elements(): 11 | 12 | deque = Deque(2, 1) 13 | deque.add_element_to_deque(3, 5) 14 | deque.add_element_to_deque(2, 4) 15 | 16 | assert all(deque.deque == [Node(3, 5), Node(2, 4)]) 17 | 18 | deque.add_element_to_deque(1, 2) 19 | 20 | assert all(deque.deque == [Node(1, 2), Node(2, 4)]) 21 | 22 | deque.add_element_to_deque(-100, 0) 23 | deque.add_element_to_deque(0, 0) 24 | 25 | assert all(deque.deque == [Node(0, 0), Node(-100, 0)]) -------------------------------------------------------------------------------- 
/utilities/OU_Noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import copy 4 | 5 | class OU_Noise(object): 6 | """Ornstein-Uhlenbeck process.""" 7 | def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2): 8 | self.mu = mu * np.ones(size) 9 | self.theta = theta 10 | self.sigma = sigma 11 | self.seed = random.seed(seed) 12 | self.reset() 13 | 14 | def reset(self): 15 | """Reset the internal state (= noise) to mean (mu).""" 16 | self.state = copy.copy(self.mu) 17 | 18 | def sample(self): 19 | """Update internal state and return it as a noise sample.""" 20 | dx = self.theta * (self.mu - self.state) + self.sigma * np.array([np.random.normal() for _ in range(len(self.state))]) 21 | self.state += dx 22 | return self.state -------------------------------------------------------------------------------- /exploration_strategies/Base_Exploration_Strategy.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Base_Exploration_Strategy(object): 4 | """Base abstract class for agent exploration strategies. Every exploration strategy must inherit from this class 5 | and implement the methods perturb_action_for_exploration_purposes and add_exploration_rewards""" 6 | def __init__(self, config): 7 | self.config = config 8 | 9 | def perturb_action_for_exploration_purposes(self, action_info): 10 | """Perturbs the action of the agent to encourage exploration""" 11 | raise ValueError("Must be implemented") 12 | 13 | def add_exploration_rewards(self, reward_info): 14 | """Actions intrinsic rewards to encourage exploration""" 15 | raise ValueError("Must be implemented") 16 | 17 | def reset(self): 18 | """Resets the noise process""" 19 | raise ValueError("Must be implemented") -------------------------------------------------------------------------------- /utilities/data_structures/Config.py: -------------------------------------------------------------------------------- 1 | class Config(object): 2 | """Object to hold the config requirements for an agent/game""" 3 | def __init__(self): 4 | self.seed = None 5 | self.environment = None 6 | self.requirements_to_solve_game = None 7 | self.num_episodes_to_run = None 8 | self.file_to_save_data_results = None 9 | self.file_to_save_results_graph = None 10 | self.runs_per_agent = None 11 | self.visualise_overall_results = None 12 | self.visualise_individual_results = None 13 | self.hyperparameters = None 14 | self.use_GPU = None 15 | self.overwrite_existing_results_file = None 16 | self.save_model = False 17 | self.standard_deviation_results = 1.0 18 | self.randomise_random_seed = True 19 | self.show_solution_score = False 20 | self.debug_mode = False 21 | 22 | 23 | -------------------------------------------------------------------------------- /agents/DQN_agents/DDQN.py: -------------------------------------------------------------------------------- 1 | from agents.DQN_agents.DQN_With_Fixed_Q_Targets import DQN_With_Fixed_Q_Targets 2 | 3 | class DDQN(DQN_With_Fixed_Q_Targets): 4 | """A double DQN agent""" 5 | agent_name = "DDQN" 6 | 7 | def __init__(self, config): 8 | DQN_With_Fixed_Q_Targets.__init__(self, config) 9 | 10 | def compute_q_values_for_next_states(self, next_states): 11 | """Computes the q_values for next state we will use to create the loss to train the Q network. Double DQN 12 | uses the local index to pick the maximum q_value action and then the target network to calculate the q_value. 
13 | The reasoning behind this is that it will help stop the network from overestimating q values""" 14 | max_action_indexes = self.q_network_local(next_states).detach().argmax(1) 15 | Q_targets_next = self.q_network_target(next_states).gather(1, max_action_indexes.unsqueeze(1)) 16 | return Q_targets_next 17 | 18 | 19 | -------------------------------------------------------------------------------- /environments/Atari_Environment.py: -------------------------------------------------------------------------------- 1 | from gym import Wrapper, spaces 2 | from .Open_AI_Wrappers import * 3 | 4 | 5 | def make_atari_game(env_id, max_episode_steps=None): 6 | env = gym.make(env_id) 7 | env.frameskip = 1 8 | env = NoopResetEnv(env, noop_max=30) 9 | env = MaxAndSkipEnv(env, skip=4) 10 | if max_episode_steps is not None: 11 | env = TimeLimit(env, max_episode_steps=max_episode_steps) 12 | env = wrap_deepmind(env) 13 | return env 14 | 15 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=True, scale=True): 16 | """Configure environment for DeepMind-style Atari """ 17 | if episode_life: 18 | env = EpisodicLifeEnv(env) 19 | if 'FIRE' in env.unwrapped.get_action_meanings(): 20 | env = FireResetEnv(env) 21 | env = WarpFrame(env) 22 | if scale: 23 | env = ScaledFloatFrame(env) 24 | if clip_rewards: 25 | env = ClipRewardEnv(env) 26 | if frame_stack: 27 | env = FrameStack(env, 4) 28 | return env 29 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: 2 | python 3 | 4 | python: 3.7 5 | dist: xenial 6 | sudo: true 7 | 8 | install: 9 | - pip install -r requirements.txt 10 | 11 | 12 | script: 13 | - export PYTHONPATH="$PYTHONPATH:$PWD" 14 | - export PYTHONPATH=""$PYTHONPATH:$PWD/agents"" 15 | - export PYTHONPATH=""$PYTHONPATH:$PWD/agents/DQN_agents"" 16 | - export PYTHONPATH=""$PYTHONPATH:$PWD/agents/hierarchical_agents"" 17 | - export PYTHONPATH=""$PYTHONPATH:$PWD/agents/actor_critic_agents"" 18 | - export PYTHONPATH=""$PYTHONPATH:$PWD/agents/policy_gradient_agents"" 19 | - export PYTHONPATH=""$PYTHONPATH:$PWD/utilities/data_structures"" 20 | - export PYTHONPATH=""$PYTHONPATH:$PWD/environments"" 21 | - export PYTHONPATH=""$PYTHONPATH:$PWD/utilities"" 22 | - export PYTHONPATH=""$PYTHONPATH:$PWD/exploration_strategies"" 23 | - export PYTHONPATH=""$PYTHONPATH:$PWD/utilities/*"" 24 | - export PYTHONPATH=""$PYTHONPATH:$PWD/*"" 25 | - export PYTHONPATH=""$PYTHONPATH:$PWD/results"" 26 | - pytest tests/*.py 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /exploration_strategies/OU_Noise_Exploration.py: -------------------------------------------------------------------------------- 1 | from utilities.OU_Noise import OU_Noise 2 | from exploration_strategies.Base_Exploration_Strategy import Base_Exploration_Strategy 3 | 4 | class OU_Noise_Exploration(Base_Exploration_Strategy): 5 | """Ornstein-Uhlenbeck noise process exploration strategy""" 6 | def __init__(self, config): 7 | super().__init__(config) 8 | self.noise = OU_Noise(self.config.action_size, self.config.seed, self.config.hyperparameters["mu"], 9 | self.config.hyperparameters["theta"], self.config.hyperparameters["sigma"]) 10 | 11 | def perturb_action_for_exploration_purposes(self, action_info): 12 | """Perturbs the action of the agent to encourage exploration""" 13 | action = action_info["action"] 14 | action 
+= self.noise.sample() 15 | return action 16 | 17 | def add_exploration_rewards(self, reward_info): 18 | """Actions intrinsic rewards to encourage exploration""" 19 | raise ValueError("Must be implemented") 20 | 21 | def reset(self): 22 | """Resets the noise process""" 23 | self.noise.reset() -------------------------------------------------------------------------------- /agents/DQN_agents/DQN_With_Fixed_Q_Targets.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from agents.Base_Agent import Base_Agent 4 | from agents.DQN_agents.DQN import DQN 5 | 6 | class DQN_With_Fixed_Q_Targets(DQN): 7 | """A DQN agent that uses an older version of the q_network as the target network""" 8 | agent_name = "DQN with Fixed Q Targets" 9 | def __init__(self, config): 10 | DQN.__init__(self, config) 11 | self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size) 12 | Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target) 13 | 14 | def learn(self, experiences=None): 15 | """Runs a learning iteration for the Q network""" 16 | super(DQN_With_Fixed_Q_Targets, self).learn(experiences=experiences) 17 | self.soft_update_of_target_network(self.q_network_local, self.q_network_target, 18 | self.hyperparameters["tau"]) # Update the target network 19 | 20 | def compute_q_values_for_next_states(self, next_states): 21 | """Computes the q_values for next state we will use to create the loss to train the Q network""" 22 | Q_targets_next = self.q_network_target(next_states).detach().max(1)[0].unsqueeze(1) 23 | return Q_targets_next -------------------------------------------------------------------------------- /agents/actor_critic_agents/A2C.py: -------------------------------------------------------------------------------- 1 | from agents.actor_critic_agents.A3C import A3C 2 | 3 | class A2C(A3C): 4 | """Synchronous version of A2C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf. 
The only 5 | difference between this and the A3C is that gradient updates get done in a batch rather than 1 by 1 as the gradients 6 | come in""" 7 | agent_name = "A2C" 8 | def __init__(self, config): 9 | super(A2C, self).__init__(config) 10 | 11 | def update_shared_model(self, gradient_updates_queue): 12 | """Worker that updates the shared model with gradients as they get put into the queue""" 13 | while True: 14 | gradients_seen = 0 15 | while gradients_seen < self.worker_processes: 16 | if gradients_seen == 0: 17 | gradients = gradient_updates_queue.get() 18 | else: 19 | new_grads = gradient_updates_queue.get() 20 | gradients = [grad + new_grad for grad, new_grad in zip(gradients, new_grads)] 21 | gradients_seen += 1 22 | self.actor_critic_optimizer.zero_grad() 23 | for grads, params in zip(gradients, self.actor_critic.parameters()): 24 | params._grad = grads 25 | self.actor_critic_optimizer.step() -------------------------------------------------------------------------------- /exploration_strategies/Gaussian_Exploration.py: -------------------------------------------------------------------------------- 1 | from exploration_strategies.Base_Exploration_Strategy import Base_Exploration_Strategy 2 | import torch 3 | from torch.distributions.normal import Normal 4 | 5 | class Gaussian_Exploration(Base_Exploration_Strategy): 6 | 7 | """Gaussian noise exploration strategy""" 8 | def __init__(self, config): 9 | super().__init__(config) 10 | self.action_noise_std = self.config.hyperparameters["action_noise_std"] 11 | self.action_noise_distribution = Normal(torch.Tensor([0.0]), torch.Tensor([self.action_noise_std])) 12 | self.action_noise_clipping_range = self.config.hyperparameters["action_noise_clipping_range"] 13 | 14 | 15 | def perturb_action_for_exploration_purposes(self, action_info): 16 | """Perturbs the action of the agent to encourage exploration""" 17 | action = action_info["action"] 18 | action_noise = self.action_noise_distribution.sample(sample_shape=action.shape) 19 | action_noise = action_noise.squeeze(-1) 20 | clipped_action_noise = torch.clamp(action_noise, min=-self.action_noise_clipping_range, 21 | max=self.action_noise_clipping_range) 22 | action += clipped_action_noise 23 | return action 24 | 25 | def add_exploration_rewards(self, reward_info): 26 | """Actions intrinsic rewards to encourage exploration""" 27 | raise ValueError("Must be implemented") 28 | 29 | def reset(self): 30 | """Resets the noise process""" 31 | pass 32 | 33 | -------------------------------------------------------------------------------- /agents/DQN_agents/DQN_HER.py: -------------------------------------------------------------------------------- 1 | from agents.DQN_agents.DQN import DQN 2 | from agents.HER_Base import HER_Base 3 | 4 | class DQN_HER(HER_Base, DQN): 5 | """DQN algorithm with hindsight experience replay""" 6 | agent_name = "DQN-HER" 7 | def __init__(self, config): 8 | DQN.__init__(self, config) 9 | HER_Base.__init__(self, self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], 10 | self.hyperparameters["HER_sample_proportion"]) 11 | 12 | def step(self): 13 | """Runs a step within a game including a learning step if required""" 14 | while not self.done: 15 | self.action = self.pick_action() 16 | self.conduct_action_in_changeable_goal_envs(self.action) 17 | if self.time_for_q_network_to_learn(): 18 | for _ in range(self.hyperparameters["learning_iterations"]): 19 | self.learn(experiences=self.sample_from_HER_and_Ordinary_Buffer()) 20 | 
self.track_changeable_goal_episodes_data() 21 | self.save_experience() 22 | if self.done: self.save_alternative_experience() 23 | self.state_dict = self.next_state_dict # this is to set the state for the next iteration 24 | self.state = self.next_state 25 | self.global_step_number += 1 26 | self.episode_number += 1 27 | 28 | def enough_experiences_to_learn_from(self): 29 | """Returns booleans indicating whether there are enough experiences in the two replay buffers to learn from""" 30 | return len(self.memory) > self.ordinary_buffer_batch_size and len(self.HER_memory) > self.HER_buffer_batch_size -------------------------------------------------------------------------------- /tests/Test_Trainer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utilities.data_structures.Config import Config 3 | from agents.Trainer import Trainer 4 | 5 | def test_get_mean_and_standard_deviation_difference_results(): 6 | """Tests that get_mean_and_standard_deviation_difference_results method produces correct output""" 7 | results = [ [1.0, 2.0, 3.0], [5.0, -33.0, 55.0], [2.5, 2.5, 2.5]] 8 | mean_results = [np.mean([1.0, 5.0, 2.5]), np.mean([2.0, -33.0, 2.5]), np.mean([3.0, 55.0, 2.5])] 9 | std_results = [np.std([1.0, 5.0, 2.5]), np.std([2.0, -33.0, 2.5]), np.std([3.0, 55.0, 2.5])] 10 | mean_minus_1_std = [ mean - std_val for mean, std_val in zip(mean_results, std_results)] 11 | mean_plus_1_std = [ mean + std_val for mean, std_val in zip(mean_results, std_results)] 12 | config = Config() 13 | config.standard_deviation_results = 1.0 14 | trainer = Trainer(config, []) 15 | mean_minus_x_std_guess, mean_results_guess, mean_plus_x_std_guess = trainer.get_mean_and_standard_deviation_difference_results(results) 16 | assert mean_results == mean_results_guess 17 | assert mean_minus_1_std == mean_minus_x_std_guess 18 | assert mean_plus_1_std == mean_plus_x_std_guess 19 | 20 | config.standard_deviation_results = 3.0 21 | trainer = Trainer(config, []) 22 | mean_minus_x_std_guess, mean_results_guess, mean_plus_x_std_guess = trainer.get_mean_and_standard_deviation_difference_results(results) 23 | mean_plus_3_std = [mean + 3.0*std_val for mean, std_val in zip(mean_results, std_results)] 24 | mean_minus_3_std = [mean - 3.0*std_val for mean, std_val in zip(mean_results, std_results)] 25 | assert mean_results == mean_results_guess 26 | assert mean_minus_3_std == mean_minus_x_std_guess 27 | assert mean_plus_3_std == mean_plus_x_std_guess 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /results/Bit_Flipping.py: -------------------------------------------------------------------------------- 1 | from gym.wrappers import FlattenDictWrapper 2 | from agents.DQN_agents.DQN_HER import DQN_HER 3 | from environments.Bit_Flipping_Environment import Bit_Flipping_Environment 4 | from agents.Trainer import Trainer 5 | from utilities.data_structures.Config import Config 6 | from agents.DQN_agents.DQN import DQN 7 | 8 | config = Config() 9 | config.seed = 1 10 | config.environment = Bit_Flipping_Environment(14) 11 | config.num_episodes_to_run = 4500 12 | config.file_to_save_data_results = None #"Data_and_Graphs/Bit_Flipping_Results_Data.pkl" 13 | config.file_to_save_results_graph = None #"Data_and_Graphs/Bit_Flipping_Results_Graph.png" 14 | config.show_solution_score = False 15 | config.visualise_individual_results = False 16 | config.visualise_overall_agent_results = True 17 | config.standard_deviation_results = 
1.0 18 | config.runs_per_agent = 3 19 | config.use_GPU = False 20 | config.overwrite_existing_results_file = False 21 | config.randomise_random_seed = True 22 | config.save_model = False 23 | 24 | 25 | config.hyperparameters = { 26 | "DQN_Agents": { 27 | "learning_rate": 0.001, 28 | "batch_size": 128, 29 | "buffer_size": 100000, 30 | "epsilon_decay_rate_denominator": 150, 31 | "discount_rate": 0.999, 32 | "incremental_td_error": 1e-8, 33 | "update_every_n_steps": 1, 34 | "linear_hidden_units": [64, 64], 35 | "final_layer_activation": None, 36 | "y_range": (-1, 14), 37 | "batch_norm": False, 38 | "gradient_clipping_norm": 5, 39 | "HER_sample_proportion": 0.8, 40 | "learning_iterations": 1, 41 | "clip_rewards": False 42 | } 43 | } 44 | 45 | if __name__== '__main__': 46 | AGENTS = [DQN_HER, DQN] 47 | trainer = Trainer(config, AGENTS) 48 | trainer.run_games_for_agents() 49 | 50 | 51 | -------------------------------------------------------------------------------- /results/Fetch_Reach.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from agents.actor_critic_agents.DDPG import DDPG 4 | from agents.actor_critic_agents.DDPG_HER import DDPG_HER 5 | from utilities.data_structures.Config import Config 6 | from agents.Trainer import Trainer 7 | 8 | 9 | config = Config() 10 | config.seed = 1 11 | config.environment = gym.make("FetchReach-v1") 12 | config.num_episodes_to_run = 1000 13 | config.file_to_save_data_results = None 14 | config.file_to_save_results_graph = None 15 | config.show_solution_score = False 16 | config.visualise_individual_results = False 17 | config.visualise_overall_agent_results = True 18 | config.standard_deviation_results = 1.0 19 | config.runs_per_agent = 3 20 | config.use_GPU = False 21 | config.overwrite_existing_results_file = False 22 | config.randomise_random_seed = True 23 | config.save_model = False 24 | 25 | 26 | config.hyperparameters = { 27 | 28 | "Actor_Critic_Agents": { 29 | "Actor": { 30 | "learning_rate": 0.001, 31 | "linear_hidden_units": [50, 50], 32 | "final_layer_activation": "TANH", 33 | "batch_norm": False, 34 | "tau": 0.01, 35 | "gradient_clipping_norm": 5 36 | }, 37 | 38 | "Critic": { 39 | "learning_rate": 0.01, 40 | "linear_hidden_units": [50, 50, 50], 41 | "final_layer_activation": None, 42 | "batch_norm": False, 43 | "buffer_size": 30000, 44 | "tau": 0.01, 45 | "gradient_clipping_norm": 5 46 | }, 47 | 48 | "batch_size": 256, 49 | "discount_rate": 0.9, 50 | "mu": 0.0, 51 | "theta": 0.15, 52 | "sigma": 0.25, 53 | "update_every_n_steps": 10, 54 | "learning_updates_per_learning_session": 10, 55 | "HER_sample_proportion": 0.8, 56 | "clip_rewards": False 57 | }} 58 | 59 | 60 | if __name__== '__main__': 61 | AGENTS = [DDPG, DDPG_HER] 62 | trainer = Trainer(config, AGENTS) 63 | trainer.run_games_for_agents() 64 | 65 | -------------------------------------------------------------------------------- /agents/actor_critic_agents/DDPG_HER.py: -------------------------------------------------------------------------------- 1 | from agents.actor_critic_agents.DDPG import DDPG 2 | from agents.HER_Base import HER_Base 3 | 4 | class DDPG_HER(HER_Base, DDPG): 5 | """DDPG algorithm with hindsight experience replay""" 6 | agent_name = "DDPG-HER" 7 | 8 | def __init__(self, config): 9 | DDPG.__init__(self, config) 10 | HER_Base.__init__(self, self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], 11 | self.hyperparameters["HER_sample_proportion"]) 12 | 13 | def step(self): 14 | """Runs a 
step within a game including a learning step if required""" 15 | while not self.done: 16 | self.action = self.pick_action() 17 | self.conduct_action_in_changeable_goal_envs(self.action) 18 | if self.time_for_critic_and_actor_to_learn(): 19 | for _ in range(self.hyperparameters["learning_updates_per_learning_session"]): 20 | states, actions, rewards, next_states, dones = self.sample_from_HER_and_Ordinary_Buffer() # Samples experiences from buffer 21 | self.critic_learn(states, actions, rewards, next_states, dones) 22 | self.actor_learn(states) 23 | self.track_changeable_goal_episodes_data() 24 | self.save_experience() 25 | if self.done: self.save_alternative_experience() 26 | self.state_dict = self.next_state_dict # this is to set the state for the next iteration 27 | self.state = self.next_state 28 | self.global_step_number += 1 29 | self.episode_number += 1 30 | 31 | def enough_experiences_to_learn_from(self): 32 | """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn""" 33 | return len(self.memory) > self.ordinary_buffer_batch_size and len(self.HER_memory) > self.HER_buffer_batch_size 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /results/data_and_graphs/Plot_Sets_Of_Results.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from utilities.data_structures.Config import Config 4 | from Trainer import Trainer 5 | 6 | 7 | trainer = Trainer(config=Config(), agents=None) 8 | 9 | # 10 | # trainer.visualise_set_of_preexisting_results(save_image_path="Four_Rooms_and_Long_Corridor.png", results_data_paths=["Long_Corridor_Results_Data.pkl", "Four_Rooms.pkl"], 11 | # plot_titles=["Long Corridor", "Four Rooms"], y_limits=[(0.0, 0.25), (-90.0, 100.25)]) 12 | 13 | 14 | 15 | trainer.visualise_preexisting_results(save_image_path="hrl_experiments/Taxi_graph_comparison.png", data_path="hrl_experiments/Taxi_data.pkl", 16 | title="Taxi v2", y_limits=(-800.0, 0.0)) 17 | 18 | 19 | # trainer.visualise_preexisting_results(save_image_path="Long_Corridor_Graph.png", data_path="Long_Corridor_Results_Data.pkl", 20 | # title="Long Corridor", y_limits=(0.0, 0.25)) 21 | 22 | 23 | # trainer.visualise_preexisting_results(save_image_path="Hopper_Results_Graph_Both_Agents.png", data_path="Hopper_Results_Data.pkl", 24 | # title="Hopper") #, y_limits=(0.0, 0.25)) 25 | 26 | # trainer.visualise_set_of_preexisting_results(results_data_paths=["Cart_Pole_Results_Data.pkl", 27 | # "Mountain_Car_Results_Data.pkl"], 28 | # plot_titles=["Cart Pole (Discrete Actions)", "Mountain Car (Continuous Actions)"], 29 | # save_image_path="CartPole_and_MountainCar_Graph.png") 30 | 31 | 32 | 33 | # trainer.visualise_set_of_preexisting_results(results_data_paths=["Data_and_Graphs/Bit_Flipping_Results_Data.pkl", 34 | # "Data_and_Graphs/Fetch_Reach_Results_Data.pkl"], 35 | # plot_titles=["Bit Flipping", "Fetch Reach"], 36 | # save_image_path="Data_and_Graphs/HER_Experiments.png") 37 | -------------------------------------------------------------------------------- /tests/Test_Bit_Flipping_Environment.py: -------------------------------------------------------------------------------- 1 | from environments.Bit_Flipping_Environment import Bit_Flipping_Environment 2 | import numpy as np 3 | 4 | 5 | def test_environment_actions(): 6 | """Tests environment is executing actions correctly""" 7 | env = Bit_Flipping_Environment(5) 8 | env.reset() 9 | env.state = [1, 0, 0, 1, 0, 1, 0, 
0, 1, 0] 10 | 11 | env.step(0) 12 | env.state = env.next_state 13 | assert env.state == [0, 0, 0, 1, 0, 1, 0, 0, 1, 0] 14 | 15 | env.step(0) 16 | env.state = env.next_state 17 | assert env.state == [1, 0, 0, 1, 0, 1, 0, 0, 1, 0] 18 | 19 | env.step(3) 20 | env.state = env.next_state 21 | assert env.state == [1, 0, 0, 0, 0, 1, 0, 0, 1, 0] 22 | 23 | env.step(6) 24 | env.state = env.next_state 25 | assert env.state == [1, 0, 0, 0, 0, 1, 0, 0, 1, 0] 26 | 27 | def test_environment_goal_achievement(): 28 | """Tests environment is registering goal achievement properly""" 29 | env = Bit_Flipping_Environment(5) 30 | env.reset() 31 | env.state = [1, 0, 0, 1, 0, 0, 0, 0, 0, 0] 32 | env.desired_goal = [0, 0, 0, 0, 0] 33 | 34 | env.step(0) 35 | assert env.reward == -1 36 | env.state = env.next_state 37 | assert env.achieved_goal == [0, 0, 0, 1, 0] 38 | 39 | env.step(2) 40 | assert env.reward == -1 41 | env.state = env.next_state 42 | assert env.achieved_goal == [0, 0, 1, 1, 0] 43 | 44 | env.step(2) 45 | assert env.reward == -1 46 | env.state = env.next_state 47 | assert env.achieved_goal == [0, 0, 0, 1, 0] 48 | 49 | env.step(3) 50 | assert env.reward == 5 51 | 52 | def test_compute_reward(): 53 | """Tests compute_reward method""" 54 | env = Bit_Flipping_Environment(5) 55 | assert env.compute_reward(np.array([0, 0, 0, 1, 0]), np.array([0, 0, 0, 1, 0]), None) == env.reward_for_achieving_goal 56 | assert env.compute_reward(np.array([1, 1, 1, 1, 1]), np.array([1, 1, 1, 1, 1]), None) == env.reward_for_achieving_goal 57 | assert env.compute_reward(np.array([0, 0, 0, 0, 0]), np.array([0, 0, 0, 0, 0]), None) == env.reward_for_achieving_goal 58 | assert env.compute_reward(np.array([1, 1, 1, 1, 1]), np.array([0, 0, 0, 1, 0]), None) == env.step_reward_for_not_achieving_goal 59 | assert env.compute_reward(np.array([1, 1, 1, 1, 1]), np.array([0, 0, 0, 0, 0]), None) == env.step_reward_for_not_achieving_goal 60 | 61 | -------------------------------------------------------------------------------- /utilities/data_structures/Deque.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utilities.data_structures.Node import Node 3 | 4 | class Deque(object): 5 | """Generic deque object""" 6 | def __init__(self, max_size, dimension_of_value_attribute): 7 | 8 | self.max_size = max_size 9 | self.dimension_of_value_attribute = dimension_of_value_attribute 10 | self.deque = self.initialise_deque() 11 | self.deque_index_to_overwrite_next = 0 12 | self.reached_max_capacity = False 13 | self.number_experiences_in_deque = 0 14 | 15 | def initialise_deque(self): 16 | """Initialises a queue of Nodes of length self.max_size""" 17 | deque = np.array([Node(0, tuple([None for _ in range(self.dimension_of_value_attribute)])) for _ in range(self.max_size)]) 18 | return deque 19 | 20 | def add_element_to_deque(self, new_key, new_value): 21 | """Adds an element to the deque and then updates the index of the next element to be overwritten and also the 22 | amount of elements in the deque""" 23 | self.update_deque_node_key_and_value(self.deque_index_to_overwrite_next, new_key, new_value) 24 | self.update_number_experiences_in_deque() 25 | self.update_deque_index_to_overwrite_next() 26 | 27 | def update_deque_node_key_and_value(self, index, new_key, new_value): 28 | self.update_deque_node_key(index, new_key) 29 | self.update_deque_node_value(index, new_value) 30 | 31 | def update_deque_node_key(self, index, new_key): 32 | self.deque[index].update_key(new_key) 33 | 34 | def 
update_deque_node_value(self, index, new_value): 35 | self.deque[index].update_value(new_value) 36 | 37 | def update_deque_index_to_overwrite_next(self): 38 | """Updates the deque index that we should write over next. When the buffer gets full we begin writing over 39 | older experiences""" 40 | if self.deque_index_to_overwrite_next < self.max_size - 1: 41 | self.deque_index_to_overwrite_next += 1 42 | else: 43 | self.reached_max_capacity = True 44 | self.deque_index_to_overwrite_next = 0 45 | 46 | def update_number_experiences_in_deque(self): 47 | """Keeps track of how many experiences there are in the buffer""" 48 | if not self.reached_max_capacity: 49 | self.number_experiences_in_deque += 1 -------------------------------------------------------------------------------- /agents/DQN_agents/DDQN_With_Prioritised_Experience_Replay.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from agents.DQN_agents.DDQN import DDQN 4 | from utilities.data_structures.Prioritised_Replay_Buffer import Prioritised_Replay_Buffer 5 | 6 | class DDQN_With_Prioritised_Experience_Replay(DDQN): 7 | """A DQN agent with prioritised experience replay""" 8 | agent_name = "DDQN with Prioritised Replay" 9 | 10 | def __init__(self, config): 11 | DDQN.__init__(self, config) 12 | self.memory = Prioritised_Replay_Buffer(self.hyperparameters, config.seed) 13 | 14 | def learn(self): 15 | """Runs a learning iteration for the Q network after sampling from the replay buffer in a prioritised way""" 16 | sampled_experiences, importance_sampling_weights = self.memory.sample() 17 | states, actions, rewards, next_states, dones = sampled_experiences 18 | loss, td_errors = self.compute_loss_and_td_errors(states, next_states, rewards, actions, dones, importance_sampling_weights) 19 | self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss, self.hyperparameters["gradient_clipping_norm"]) 20 | self.soft_update_of_target_network(self.q_network_local, self.q_network_target, self.hyperparameters["tau"]) 21 | self.memory.update_td_errors(td_errors.squeeze(1)) 22 | 23 | def save_experience(self): 24 | """Saves the latest experience including the td_error""" 25 | max_td_error_in_experiences = self.memory.give_max_td_error() + 1e-9 26 | self.memory.add_experience(max_td_error_in_experiences, self.state, self.action, self.reward, self.next_state, self.done) 27 | 28 | def compute_loss_and_td_errors(self, states, next_states, rewards, actions, dones, importance_sampling_weights): 29 | """Calculates the loss for the local Q network. It weighs each observations loss according to the importance 30 | sampling weights which come from the prioritised replay buffer""" 31 | Q_targets = self.compute_q_targets(next_states, rewards, dones) 32 | Q_expected = self.compute_expected_q_values(states, actions) 33 | loss = F.mse_loss(Q_expected, Q_targets) 34 | loss = loss * importance_sampling_weights 35 | loss = torch.mean(loss) 36 | td_errors = Q_targets.data.cpu().numpy() - Q_expected.data.cpu().numpy() 37 | return loss, td_errors -------------------------------------------------------------------------------- /environments/ant_environments/point.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # NOTE THIS CODE IS TAKEN FROM https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 6 | # and is not my code. 
7 | 8 | 9 | 10 | 11 | import math 12 | import numpy as np 13 | from gym import utils 14 | from gym.envs.mujoco import mujoco_env 15 | 16 | 17 | class PointEnv(mujoco_env.MujocoEnv, utils.EzPickle): 18 | FILE = "point.xml" 19 | ORI_IND = 2 20 | 21 | def __init__(self, file_path=None, expose_all_qpos=True): 22 | self._expose_all_qpos = expose_all_qpos 23 | 24 | mujoco_env.MujocoEnv.__init__(self, file_path, 1) 25 | utils.EzPickle.__init__(self) 26 | 27 | @property 28 | def physics(self): 29 | return self.model 30 | 31 | def _step(self, a): 32 | return self.step(a) 33 | 34 | def step(self, action): 35 | action[0] = 0.2 * action[0] 36 | qpos = np.copy(self.physics.data.qpos) 37 | qpos[2] += action[1] 38 | ori = qpos[2] 39 | # compute increment in each direction 40 | dx = math.cos(ori) * action[0] 41 | dy = math.sin(ori) * action[0] 42 | # ensure that the robot is within reasonable range 43 | qpos[0] = np.clip(qpos[0] + dx, -100, 100) 44 | qpos[1] = np.clip(qpos[1] + dy, -100, 100) 45 | qvel = self.physics.data.qvel 46 | self.set_state(qpos, qvel) 47 | for _ in range(0, self.frame_skip): 48 | self.physics.step() 49 | next_obs = self._get_obs() 50 | reward = 0 51 | done = False 52 | info = {} 53 | return next_obs, reward, done, info 54 | 55 | def _get_obs(self): 56 | if self._expose_all_qpos: 57 | return np.concatenate([ 58 | self.physics.data.qpos.flat[:3], # Only point-relevant coords. 59 | self.physics.data.qvel.flat[:3]]) 60 | return np.concatenate([ 61 | self.physics.data.qpos.flat[2:3], 62 | self.physics.data.qvel.flat[:3]]) 63 | 64 | def reset_model(self): 65 | qpos = self.init_qpos + self.np_random.uniform( 66 | size=self.physics.model.nq, low=-.1, high=.1) 67 | qvel = self.init_qvel + self.np_random.randn(self.physics.model.nv) * .1 68 | 69 | # Set everything other than point to original position and 0 velocity. 70 | qpos[3:] = self.init_qpos[3:] 71 | qvel[3:] = 0. 72 | self.set_state(qpos, qvel) 73 | return self._get_obs() 74 | 75 | def get_ori(self): 76 | return self.model.data.qpos[self.__class__.ORI_IND] 77 | 78 | def set_xy(self, xy): 79 | qpos = np.copy(self.physics.data.qpos) 80 | qpos[0] = xy[0] 81 | qpos[1] = xy[1] 82 | 83 | qvel = self.physics.data.qvel 84 | -------------------------------------------------------------------------------- /utilities/data_structures/Tanh_Distribution.py: -------------------------------------------------------------------------------- 1 | 2 | # NOTE that this is not my code. 3 | # Taken from here: https://github.com/vitchyr/rlkit/blob/master/rlkit/torch/distributions.py 4 | 5 | 6 | import torch 7 | from torch.distributions import Distribution, Normal 8 | 9 | 10 | class TanhNormal(Distribution): 11 | """ 12 | Represent distribution of X where 13 | X ~ tanh(Z) 14 | Z ~ N(mean, std) 15 | Note: this is not very numerically stable. 16 | """ 17 | def __init__(self, normal_mean, normal_std, epsilon=1e-6): 18 | """ 19 | :param normal_mean: Mean of the normal distribution 20 | :param normal_std: Std of the normal distribution 21 | :param epsilon: Numerical stability epsilon when computing log-prob. 
22 | """ 23 | self.normal_mean = normal_mean 24 | self.normal_std = normal_std 25 | self.normal = Normal(normal_mean, normal_std) 26 | self.epsilon = epsilon 27 | 28 | def sample_n(self, n, return_pre_tanh_value=False): 29 | z = self.normal.sample_n(n) 30 | if return_pre_tanh_value: 31 | return torch.tanh(z), z 32 | else: 33 | return torch.tanh(z) 34 | 35 | def log_prob(self, value, pre_tanh_value=None): 36 | """ 37 | :param value: some value, x 38 | :param pre_tanh_value: arctanh(x) 39 | :return: 40 | """ 41 | if pre_tanh_value is None: 42 | pre_tanh_value = torch.log( 43 | (1+value) / (1-value) 44 | ) / 2 45 | return self.normal.log_prob(pre_tanh_value) - torch.log( 46 | 1 - value * value + self.epsilon 47 | ) 48 | 49 | def sample(self, return_pretanh_value=False): 50 | """ 51 | Gradients will and should *not* pass through this operation. 52 | See https://github.com/pytorch/pytorch/issues/4620 for discussion. 53 | """ 54 | z = self.normal.sample().detach() 55 | 56 | if return_pretanh_value: 57 | return torch.tanh(z), z 58 | else: 59 | return torch.tanh(z) 60 | 61 | def rsample(self, return_pretanh_value=False): 62 | """ 63 | Sampling in the reparameterization case. 64 | """ 65 | z = ( 66 | self.normal_mean + 67 | self.normal_std * 68 | Normal( 69 | torch.zeros(self.normal_mean.size()), 70 | torch.ones(self.normal_std.size()) 71 | ).sample() 72 | ) 73 | z.requires_grad_() 74 | 75 | if return_pretanh_value: 76 | return torch.tanh(z), z 77 | else: 78 | return torch.tanh(z) -------------------------------------------------------------------------------- /utilities/Tensorboard.py: -------------------------------------------------------------------------------- 1 | # NOTE that this code is not mine and was taken from https://becominghuman.ai/logging-in-tensorboard-with-pytorch-or-any-other-library-c549163dee9e 2 | 3 | 4 | import io 5 | import numpy as np 6 | from PIL import Image 7 | import tensorflow as tf 8 | 9 | # run tensorboard --logdir="logs/" on command line to get up the tensorboard afterwards 10 | 11 | class Tensorboard: 12 | def __init__(self, logdir): 13 | self.writer = tf.summary.FileWriter(logdir) 14 | 15 | def close(self): 16 | self.writer.close() 17 | 18 | def log_scalar(self, tag, value, global_step): 19 | summary = tf.Summary() 20 | summary.value.add(tag=tag, simple_value=value) 21 | self.writer.add_summary(summary, global_step=global_step) 22 | self.writer.flush() 23 | 24 | def log_histogram(self, tag, values, global_step, bins): 25 | counts, bin_edges = np.histogram(values, bins=bins) 26 | 27 | hist = tf.HistogramProto() 28 | hist.min = float(np.min(values)) 29 | hist.max = float(np.max(values)) 30 | hist.num = int(np.prod(values.shape)) 31 | hist.sum = float(np.sum(values)) 32 | hist.sum_squares = float(np.sum(values ** 2)) 33 | 34 | bin_edges = bin_edges[1:] 35 | 36 | for edge in bin_edges: 37 | hist.bucket_limit.append(edge) 38 | for c in counts: 39 | hist.bucket.append(c) 40 | 41 | summary = tf.Summary() 42 | summary.value.add(tag=tag, histo=hist) 43 | self.writer.add_summary(summary, global_step=global_step) 44 | self.writer.flush() 45 | 46 | def log_image(self, tag, img, global_step): 47 | s = io.BytesIO() 48 | Image.fromarray(img).save(s, format='png') 49 | 50 | img_summary = tf.Summary.Image(encoded_image_string=s.getvalue(), 51 | height=img.shape[0], 52 | width=img.shape[1]) 53 | 54 | summary = tf.Summary() 55 | summary.value.add(tag=tag, image=img_summary) 56 | self.writer.add_summary(summary, global_step=global_step) 57 | self.writer.flush() 58 | 59 | def 
log_plot(self, tag, figure, global_step): 60 | plot_buf = io.BytesIO() 61 | figure.savefig(plot_buf, format='png') 62 | plot_buf.seek(0) 63 | img = Image.open(plot_buf) 64 | img_ar = np.array(img) 65 | 66 | img_summary = tf.Summary.Image(encoded_image_string=plot_buf.getvalue(), 67 | height=img_ar.shape[0], 68 | width=img_ar.shape[1]) 69 | 70 | summary = tf.Summary() 71 | summary.value.add(tag=tag, image=img_summary) 72 | self.writer.add_summary(summary, global_step=global_step) 73 | self.writer.flush() -------------------------------------------------------------------------------- /environments/ant_environments/create_maze_env.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # NOTE THIS CODE IS TAKEN FROM https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 4 | # and is not my code. 5 | 6 | 7 | 8 | from ant_environments.ant_maze_env import AntMazeEnv 9 | from ant_environments.point_maze_env import PointMazeEnv 10 | 11 | import tensorflow as tf 12 | import gin.tf 13 | from tf_agents.environments import gym_wrapper 14 | from tf_agents.environments import tf_py_environment 15 | 16 | 17 | @gin.configurable 18 | def create_maze_env(env_name=None, top_down_view=False): 19 | n_bins = 0 20 | manual_collision = False 21 | if env_name.startswith('Ego'): 22 | n_bins = 8 23 | env_name = env_name[3:] 24 | if env_name.startswith('Ant'): 25 | cls = AntMazeEnv 26 | env_name = env_name[3:] 27 | maze_size_scaling = 8 28 | elif env_name.startswith('Point'): 29 | cls = PointMazeEnv 30 | manual_collision = True 31 | env_name = env_name[5:] 32 | maze_size_scaling = 4 33 | else: 34 | assert False, 'unknown env %s' % env_name 35 | 36 | maze_id = None 37 | observe_blocks = False 38 | put_spin_near_agent = False 39 | if env_name == 'Maze': 40 | maze_id = 'Maze' 41 | elif env_name == 'Push': 42 | maze_id = 'Push' 43 | elif env_name == 'Fall': 44 | maze_id = 'Fall' 45 | elif env_name == 'Block': 46 | maze_id = 'Block' 47 | put_spin_near_agent = True 48 | observe_blocks = True 49 | elif env_name == 'BlockMaze': 50 | maze_id = 'BlockMaze' 51 | put_spin_near_agent = True 52 | observe_blocks = True 53 | else: 54 | raise ValueError('Unknown maze environment %s' % env_name) 55 | 56 | gym_mujoco_kwargs = { 57 | 'maze_id': maze_id, 58 | 'n_bins': n_bins, 59 | 'observe_blocks': observe_blocks, 60 | 'put_spin_near_agent': put_spin_near_agent, 61 | 'top_down_view': top_down_view, 62 | 'manual_collision': manual_collision, 63 | 'maze_size_scaling': maze_size_scaling 64 | } 65 | gym_env = cls(**gym_mujoco_kwargs) 66 | gym_env.reset() 67 | wrapped_env = gym_wrapper.GymWrapper(gym_env) 68 | return wrapped_env 69 | 70 | 71 | class TFPyEnvironment(tf_py_environment.TFPyEnvironment): 72 | 73 | def __init__(self, *args, **kwargs): 74 | super(TFPyEnvironment, self).__init__(*args, **kwargs) 75 | 76 | def start_collect(self): 77 | pass 78 | 79 | def current_obs(self): 80 | time_step = self.current_time_step() 81 | return time_step.observation[0] # For some reason, there is an extra dim. 
82 | 83 | def step(self, actions): 84 | actions = tf.expand_dims(actions, 0) 85 | next_step = super(TFPyEnvironment, self).step(actions) 86 | return next_step.is_last()[0], next_step.reward[0], next_step.discount[0] 87 | 88 | def reset(self): 89 | return super(TFPyEnvironment, self).reset() 90 | -------------------------------------------------------------------------------- /environments/Ant_Navigation_Environments.py: -------------------------------------------------------------------------------- 1 | from .ant_environments.create_maze_env import create_maze_env 2 | import numpy as np 3 | 4 | """Environments taken from HIRO paper github repo: https://github.com/tensorflow/models/tree/master/research/efficient-hrl 5 | There are three environments that can be represented by this class depending on what environment_name you provide. 6 | The options are: ["AntMaze", "AntPush", "AntFall"]. 7 | 8 | Note that "Success" for this game is defined by the authors as achieving -5 or more on the last step of the episode 9 | but that this isn't coded in any way as part of the environment. 10 | """ 11 | class Ant_Navigation_Environments(object): 12 | 13 | def __init__(self, environment_name): 14 | self.environment_name = environment_name 15 | self.base_env = create_maze_env(env_name=self.environment_name).gym # 16 | 17 | self.goal_sample_fn = self.get_goal_fn() 18 | self.reward_fn = self.get_reward_fn() 19 | self.goal = None 20 | 21 | self.unwrapped = self.base_env.unwrapped 22 | self.spec = self.base_env.spec 23 | self.action_space = self.base_env.action_space 24 | 25 | def reset(self): 26 | self.steps_taken = 0 27 | obs = self.base_env.reset() 28 | self.goal = self.goal_sample_fn() 29 | return np.concatenate([obs, self.goal]) 30 | 31 | def step(self, action): 32 | self.steps_taken += 1 33 | obs, _, _, info = self.base_env.step(action) 34 | reward = self.reward_fn(obs, self.goal) 35 | done = self.steps_taken >= 500 36 | return np.concatenate([obs, self.goal]), reward, done, info 37 | 38 | def get_goal_fn(self): 39 | """Produces the function required to generate a goal for each environment""" 40 | if self.environment_name == "AntMaze": 41 | return lambda: np.array([0., 16.]) 42 | #Can also use np.random.uniform((-4, -4), (20, 20)) for training purposes 43 | elif self.environment_name == "AntPush": 44 | return lambda: np.array([0., 19.]) 45 | elif self.environment_name == "AntFall": 46 | return lambda: np.array([0., 27., 4.5]) 47 | else: 48 | raise ValueError("Unknown environment name") 49 | 50 | def get_reward_fn(self): 51 | """Provides the function required to calculate rewards for each game""" 52 | if self.environment_name == 'AntMaze': 53 | return lambda obs, goal: -np.sum(np.square(obs[:2] - goal)) ** 0.5 54 | elif self.environment_name == 'AntPush': 55 | return lambda obs, goal: -np.sum(np.square(obs[:2] - goal)) ** 0.5 56 | elif self.environment_name == 'AntFall': 57 | return lambda obs, goal: -np.sum(np.square(obs[:3] - goal)) ** 0.5 58 | else: 59 | raise ValueError("Unknown environment name") 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /utilities/data_structures/Replay_Buffer.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple, deque 2 | import random 3 | import torch 4 | import numpy as np 5 | 6 | class Replay_Buffer(object): 7 | """Replay buffer to store past experiences that the agent can then use for training data""" 8 | 9 | def __init__(self, buffer_size,
batch_size, seed): 10 | 11 | self.memory = deque(maxlen=buffer_size) 12 | self.batch_size = batch_size 13 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 14 | self.seed = random.seed(seed) 15 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 16 | 17 | def add_experience(self, states, actions, rewards, next_states, dones): 18 | """Adds experience(s) into the replay buffer""" 19 | if type(dones) == list: 20 | assert type(dones[0]) != list, "A done shouldn't be a list" 21 | experiences = [self.experience(state, action, reward, next_state, done) 22 | for state, action, reward, next_state, done in 23 | zip(states, actions, rewards, next_states, dones)] 24 | self.memory.extend(experiences) 25 | else: 26 | experience = self.experience(states, actions, rewards, next_states, dones) 27 | self.memory.append(experience) 28 | 29 | def sample(self, num_experiences=None, separate_out_data_types=True): 30 | """Draws a random sample of experience from the replay buffer""" 31 | experiences = self.pick_experiences(num_experiences) 32 | if separate_out_data_types: 33 | states, actions, rewards, next_states, dones = self.separate_out_data_types(experiences) 34 | return states, actions, rewards, next_states, dones 35 | else: 36 | return experiences 37 | 38 | def separate_out_data_types(self, experiences): 39 | """Puts the sampled experience into the correct format for a PyTorch neural network""" 40 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(self.device) 41 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device) 42 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device) 43 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device) 44 | dones = torch.from_numpy(np.vstack([int(e.done) for e in experiences if e is not None])).float().to(self.device) 45 | 46 | return states, actions, rewards, next_states, dones 47 | 48 | def pick_experiences(self, num_experiences=None): 49 | if num_experiences is not None: batch_size = num_experiences 50 | else: batch_size = self.batch_size 51 | return random.sample(self.memory, k=batch_size) 52 | 53 | def __len__(self): 54 | return len(self.memory) -------------------------------------------------------------------------------- /utilities/Deepmind_RMS_Prop.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Optimizer 3 | 4 | 5 | class DM_RMSprop(Optimizer): 6 | """Implements the form of RMSProp used in DM 2015 Atari paper. 
7 | Inspired by https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/updates.py""" 8 | 9 | def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False): 10 | if not 0.0 <= lr: 11 | raise ValueError("Invalid learning rate: {}".format(lr)) 12 | if not 0.0 <= eps: 13 | raise ValueError("Invalid epsilon value: {}".format(eps)) 14 | if not 0.0 <= momentum: 15 | raise ValueError("Invalid momentum value: {}".format(momentum)) 16 | if not 0.0 <= weight_decay: 17 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 18 | if not 0.0 <= alpha: 19 | raise ValueError("Invalid alpha value: {}".format(alpha)) 20 | 21 | defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, weight_decay=weight_decay) 22 | super(DM_RMSprop, self).__init__(params, defaults) 23 | 24 | def __setstate__(self, state): 25 | super(DM_RMSprop, self).__setstate__(state) 26 | for group in self.param_groups: 27 | group.setdefault('momentum', 0) 28 | group.setdefault('centered', False) 29 | 30 | def step(self, closure=None): 31 | """Performs a single optimization step. 32 | 33 | Arguments: 34 | closure (callable, optional): A closure that reevaluates the model 35 | and returns the loss. 36 | """ 37 | loss = None 38 | if closure is not None: 39 | loss = closure() 40 | for group in self.param_groups: 41 | momentum = group['momentum'] 42 | sq_momentum = group['alpha'] 43 | epsilon = group['eps'] 44 | 45 | for p in group['params']: 46 | if p.grad is None: 47 | continue 48 | grad = p.grad.data 49 | if grad.is_sparse: 50 | raise RuntimeError('RMSprop does not support sparse gradients') 51 | state = self.state[p] 52 | 53 | # State initialization 54 | if len(state) == 0: 55 | state['step'] = 0 56 | state['square_avg'] = torch.zeros_like(p.data) 57 | if momentum > 0: 58 | state['momentum_buffer'] = torch.zeros_like(p.data) 59 | 60 | mom_buffer = state['momentum_buffer'] 61 | square_avg = state['square_avg'] 62 | 63 | 64 | state['step'] += 1 65 | 66 | mom_buffer.mul_(momentum) 67 | mom_buffer.add_((1 - momentum) * grad) 68 | 69 | square_avg.mul_(sq_momentum).addcmul_(1 - sq_momentum, grad, grad) 70 | 71 | avg = (square_avg - mom_buffer**2 + epsilon).sqrt() 72 | 73 | p.data.addcdiv_(-group['lr'], grad, avg) 74 | 75 | return loss 76 | 77 | -------------------------------------------------------------------------------- /utilities/data_structures/Max_Heap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utilities.data_structures.Node import Node 3 | 4 | class Max_Heap(object): 5 | """Generic max heap object""" 6 | def __init__(self, max_size, dimension_of_value_attribute, default_key_to_use): 7 | 8 | self.max_size = max_size 9 | self.dimension_of_value_attribute = dimension_of_value_attribute 10 | self.default_key_to_use = default_key_to_use 11 | self.heap = self.initialise_heap() 12 | 13 | def initialise_heap(self): 14 | """Initialises a heap of Nodes of length self.max_size * 4 + 1""" 15 | heap = np.array([Node(self.default_key_to_use, tuple([None for _ in range(self.dimension_of_value_attribute)])) for _ in range(self.max_size * 4 + 1)]) 16 | 17 | # We don't use the 0th element in a heap so we want it to have infinite value so it is never swapped with a lower node 18 | heap[0] = Node(float("inf"), (None, None, None, None, None)) 19 | return heap 20 | 21 | def update_element_and_reorganise_heap(self, heap_index_for_change, new_element): 22 | 
self.update_heap_element(heap_index_for_change, new_element) 23 | self.reorganise_heap(heap_index_for_change) 24 | 25 | def update_heap_element(self, heap_index, new_element): 26 | self.heap[heap_index] = new_element 27 | 28 | def reorganise_heap(self, heap_index_changed): 29 | """This reorganises the heap after a new value is added so as to keep the max value at the top of the heap which 30 | is index position 1 in the array self.heap""" 31 | 32 | node_key = self.heap[heap_index_changed].key 33 | parent_index = int(heap_index_changed / 2) 34 | 35 | if node_key > self.heap[parent_index].key: 36 | self.swap_heap_elements(heap_index_changed, parent_index) 37 | self.reorganise_heap(parent_index) 38 | 39 | else: 40 | biggest_child_index = self.calculate_index_of_biggest_child(heap_index_changed) 41 | if node_key < self.heap[biggest_child_index].key: 42 | self.swap_heap_elements(heap_index_changed, biggest_child_index) 43 | self.reorganise_heap(biggest_child_index) 44 | 45 | def swap_heap_elements(self, index1, index2): 46 | """Swaps the position of two heap elements""" 47 | self.heap[index1], self.heap[index2] = self.heap[index2], self.heap[index1] 48 | 49 | def calculate_index_of_biggest_child(self, heap_index_changed): 50 | """Calculates the heap index of the node's child with the biggest td_error value""" 51 | left_child = self.heap[int(heap_index_changed * 2)] 52 | right_child = self.heap[int(heap_index_changed * 2) + 1] 53 | 54 | if left_child.key > right_child.key: 55 | biggest_child_index = heap_index_changed * 2 56 | else: 57 | biggest_child_index = heap_index_changed * 2 + 1 58 | 59 | return biggest_child_index 60 | 61 | def give_max_key(self): 62 | """Returns the maximum td error currently in the heap. Because it is a max heap this is the top element of the heap""" 63 | return self.heap[1].key 64 | -------------------------------------------------------------------------------- /environments/Long_Corridor_Environment.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | import gym 4 | import numpy as np 5 | from gym import spaces 6 | from gym.utils import seeding 7 | 8 | class Long_Corridor_Environment(gym.Env): 9 | """Is the environment from pg.6 of the paper Hierarchical Deep Reinforcement Learning: Integrating Temporal 10 | Abstraction and Intrinsic Motivation. 
11 | https://papers.nips.cc/paper/6233-hierarchical-deep-reinforcement-learning-integrating-temporal-abstraction-and-intrinsic-motivation.pdf""" 12 | environment_name = "Long Corridor Environment" 13 | 14 | def __init__(self, num_states=6, stochasticity_of_action_right=0.5): 15 | self.action_space = spaces.Discrete(2) 16 | self.observation_space = spaces.Discrete(num_states) 17 | self.seed() 18 | self.reward_threshold = 1.0 19 | self.trials = 100 20 | self.max_episode_steps = 100 21 | self.id = "Long Corridor" 22 | self.action_translation = {0: "left", 1: "right"} 23 | self.stochasticity_of_action_right = stochasticity_of_action_right 24 | self.num_states = num_states 25 | self.visited_final_state = False 26 | self.reward_if_visited_final_state = 1.0 27 | self.reward_if_havent_visited_final_state = 0.01 28 | 29 | def seed(self, seed=None): 30 | self.np_random, seed = seeding.np_random(seed) 31 | return [seed] 32 | 33 | def step(self, action): 34 | self.episode_steps += 1 35 | if type(action) is np.ndarray: 36 | action = action[0] 37 | assert action in [0, 1], "Action must be a 0 or a 1" 38 | if action == 0: self.move_left() 39 | else: self.move_right() 40 | self.update_done_reward_and_visited_final_state() 41 | self.state = self.next_state 42 | self.s = np.array(self.next_state) 43 | return self.s, self.reward, self.done, {} 44 | 45 | def reset(self): 46 | self.state = 1 #environment always starts in state 1 47 | self.next_state = None 48 | self.reward = None 49 | self.done = False 50 | self.visited_final_state = False 51 | self.episode_steps = 0 52 | self.s = np.array(self.state) 53 | return self.s 54 | 55 | def update_done_reward_and_visited_final_state(self): 56 | if self.next_state == 0: 57 | self.done = True 58 | if self.visited_final_state: self.reward = self.reward_if_visited_final_state 59 | else: self.reward = self.reward_if_havent_visited_final_state 60 | else: 61 | self.reward = 0 62 | if self.next_state == self.num_states - 1: self.visited_final_state = True 63 | if self.episode_steps >= self.max_episode_steps: self.done = True 64 | 65 | def move_left(self): 66 | """Moves left in environment""" 67 | self.next_state = self.state - 1 68 | 69 | def move_right(self): 70 | """Moves right in environment""" 71 | if random.random() < self.stochasticity_of_action_right: self.next_state = self.state - 1 72 | else: self.next_state = min(self.state + 1, self.num_states - 1) 73 | -------------------------------------------------------------------------------- /results/Mountain_Car.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from agents.policy_gradient_agents.PPO import PPO 4 | from agents.actor_critic_agents.DDPG import DDPG 5 | from agents.actor_critic_agents.SAC import SAC 6 | from agents.actor_critic_agents.TD3 import TD3 7 | from agents.Trainer import Trainer 8 | from utilities.data_structures.Config import Config 9 | 10 | 11 | config = Config() 12 | config.seed = 1 13 | config.environment = gym.make("MountainCarContinuous-v0") 14 | config.num_episodes_to_run = 450 15 | config.file_to_save_data_results = None 16 | config.file_to_save_results_graph = None 17 | config.show_solution_score = False 18 | config.visualise_individual_results = False 19 | config.visualise_overall_agent_results = True 20 | config.standard_deviation_results = 1.0 21 | config.runs_per_agent = 3 22 | config.use_GPU = False 23 | config.overwrite_existing_results_file = False 24 | config.randomise_random_seed = True 25 | config.save_model = False 26 | 27 
| 28 | config.hyperparameters = { 29 | "Policy_Gradient_Agents": { 30 | "learning_rate": 0.05, 31 | "linear_hidden_units": [30, 15], 32 | "final_layer_activation": "TANH", 33 | "learning_iterations_per_round": 10, 34 | "discount_rate": 0.9, 35 | "batch_norm": False, 36 | "clip_epsilon": 0.2, 37 | "episodes_per_learning_round": 10, 38 | "normalise_rewards": True, 39 | "gradient_clipping_norm": 5, 40 | "mu": 0.0, 41 | "theta": 0.15, 42 | "sigma": 0.2, 43 | "epsilon_decay_rate_denominator": 1, 44 | "clip_rewards": False 45 | }, 46 | 47 | "Actor_Critic_Agents": { 48 | "Actor": { 49 | "learning_rate": 0.003, 50 | "linear_hidden_units": [20, 20], 51 | "final_layer_activation": None, 52 | "batch_norm": False, 53 | "tau": 0.005, 54 | "gradient_clipping_norm": 5, 55 | "initialiser": "Xavier" 56 | }, 57 | 58 | "Critic": { 59 | "learning_rate": 0.02, 60 | "linear_hidden_units": [20, 20], 61 | "final_layer_activation": None, 62 | "batch_norm": False, 63 | "buffer_size": 1000000, 64 | "tau": 0.005, 65 | "gradient_clipping_norm": 5, 66 | "initialiser": "Xavier" 67 | }, 68 | 69 | "min_steps_before_learning": 1000, #for SAC only 70 | "batch_size": 256, 71 | "discount_rate": 0.99, 72 | "mu": 0.0, # for O-H noise 73 | "theta": 0.15, # for O-H noise 74 | "sigma": 0.25, # for O-H noise 75 | "action_noise_std": 0.2, # for TD3 76 | "action_noise_clipping_range": 0.5, # for TD3 77 | "update_every_n_steps": 20, 78 | "learning_updates_per_learning_session": 10, 79 | "automatically_tune_entropy_hyperparameter": True, 80 | "entropy_term_weight": None, 81 | "add_extra_noise": True, 82 | "do_evaluation_iterations": True, 83 | "clip_rewards": False 84 | 85 | } 86 | 87 | } 88 | 89 | if __name__ == "__main__": 90 | AGENTS = [TD3, DDPG, PPO] 91 | trainer = Trainer(config, AGENTS) 92 | trainer.run_games_for_agents() 93 | 94 | # SAC, , 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /agents/actor_critic_agents/TD3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as functional 3 | from torch import optim 4 | from agents.Base_Agent import Base_Agent 5 | from .DDPG import DDPG 6 | from exploration_strategies.Gaussian_Exploration import Gaussian_Exploration 7 | 8 | class TD3(DDPG): 9 | """A TD3 Agent from the paper Addressing Function Approximation Error in Actor-Critic Methods (Fujimoto et al. 
2018) 10 | https://arxiv.org/abs/1802.09477""" 11 | agent_name = "TD3" 12 | 13 | def __init__(self, config): 14 | DDPG.__init__(self, config) 15 | self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, 16 | key_to_use="Critic", override_seed=self.config.seed + 1) 17 | self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, 18 | key_to_use="Critic") 19 | Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2) 20 | self.critic_optimizer_2 = optim.Adam(self.critic_local_2.parameters(), 21 | lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) 22 | self.exploration_strategy_critic = Gaussian_Exploration(self.config) 23 | 24 | def compute_critic_values_for_next_states(self, next_states): 25 | """Computes the critic values for next states to be used in the loss for the critic""" 26 | with torch.no_grad(): 27 | actions_next = self.actor_target(next_states) 28 | actions_next_with_noise = self.exploration_strategy_critic.perturb_action_for_exploration_purposes({"action": actions_next}) 29 | critic_targets_next_1 = self.critic_target(torch.cat((next_states, actions_next_with_noise), 1)) 30 | critic_targets_next_2 = self.critic_target_2(torch.cat((next_states, actions_next_with_noise), 1)) 31 | critic_targets_next = torch.min(torch.cat((critic_targets_next_1, critic_targets_next_2),1), dim=1)[0].unsqueeze(-1) 32 | return critic_targets_next 33 | 34 | def critic_learn(self, states, actions, rewards, next_states, dones): 35 | """Runs a learning iteration for both the critics""" 36 | critic_targets_next = self.compute_critic_values_for_next_states(next_states) 37 | critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones) 38 | 39 | critic_expected_1 = self.critic_local(torch.cat((states, actions), 1)) 40 | critic_expected_2 = self.critic_local_2(torch.cat((states, actions), 1)) 41 | 42 | critic_loss_1 = functional.mse_loss(critic_expected_1, critic_targets) 43 | critic_loss_2 = functional.mse_loss(critic_expected_2, critic_targets) 44 | 45 | self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1, self.hyperparameters["Critic"]["gradient_clipping_norm"]) 46 | self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2, 47 | self.hyperparameters["Critic"]["gradient_clipping_norm"]) 48 | 49 | self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"]) 50 | self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2, self.hyperparameters["Critic"]["tau"]) 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /exploration_strategies/Epsilon_Greedy_Exploration.py: -------------------------------------------------------------------------------- 1 | from exploration_strategies.Base_Exploration_Strategy import Base_Exploration_Strategy 2 | import numpy as np 3 | import random 4 | import torch 5 | 6 | class Epsilon_Greedy_Exploration(Base_Exploration_Strategy): 7 | """Implements an epsilon greedy exploration strategy""" 8 | def __init__(self, config): 9 | super().__init__(config) 10 | self.notified_that_exploration_turned_off = False 11 | if "exploration_cycle_episodes_length" in self.config.hyperparameters.keys(): 12 | print("Using a cyclical exploration strategy") 13 | self.exploration_cycle_episodes_length = self.config.hyperparameters["exploration_cycle_episodes_length"] 14 | else: 15 | 
self.exploration_cycle_episodes_length = None 16 | 17 | if "random_episodes_to_run" in self.config.hyperparameters.keys(): 18 | self.random_episodes_to_run = self.config.hyperparameters["random_episodes_to_run"] 19 | print("Running {} random episodes".format(self.random_episodes_to_run)) 20 | else: 21 | self.random_episodes_to_run = 0 22 | 23 | def perturb_action_for_exploration_purposes(self, action_info): 24 | """Perturbs the action of the agent to encourage exploration""" 25 | action_values = action_info["action_values"] 26 | turn_off_exploration = action_info["turn_off_exploration"] 27 | episode_number = action_info["episode_number"] 28 | if turn_off_exploration and not self.notified_that_exploration_turned_off: 29 | print(" ") 30 | print("Exploration has been turned OFF") 31 | print(" ") 32 | self.notified_that_exploration_turned_off = True 33 | epsilon = self.get_updated_epsilon_exploration(action_info) 34 | 35 | 36 | if (random.random() > epsilon or turn_off_exploration) and (episode_number >= self.random_episodes_to_run): 37 | return torch.argmax(action_values).item() 38 | return np.random.randint(0, action_values.shape[1]) 39 | 40 | def get_updated_epsilon_exploration(self, action_info, epsilon=1.0): 41 | """Gets the probability that we just pick a random action. This probability decays the more episodes we have seen""" 42 | episode_number = action_info["episode_number"] 43 | epsilon_decay_denominator = self.config.hyperparameters["epsilon_decay_rate_denominator"] 44 | 45 | if self.exploration_cycle_episodes_length is None: 46 | epsilon = epsilon / (1.0 + (episode_number / epsilon_decay_denominator)) 47 | else: 48 | epsilon = self.calculate_epsilon_with_cyclical_strategy(episode_number) 49 | return epsilon 50 | 51 | def calculate_epsilon_with_cyclical_strategy(self, episode_number): 52 | """Calculates epsilon according to a cyclical strategy""" 53 | max_epsilon = 0.5 54 | min_epsilon = 0.001 55 | increment = (max_epsilon - min_epsilon) / float(self.exploration_cycle_episodes_length / 2) 56 | cycle = [ix for ix in range(int(self.exploration_cycle_episodes_length / 2))] + [ix for ix in range( 57 | int(self.exploration_cycle_episodes_length / 2), 0, -1)] 58 | cycle_ix = episode_number % self.exploration_cycle_episodes_length 59 | epsilon = max_epsilon - cycle[cycle_ix] * increment 60 | return epsilon 61 | 62 | def add_exploration_rewards(self, reward_info): 63 | """Actions intrinsic rewards to encourage exploration""" 64 | return reward_info["reward"] 65 | 66 | def reset(self): 67 | """Resets the noise process""" 68 | pass 69 | -------------------------------------------------------------------------------- /agents/DQN_agents/Dueling_DDQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import optim 3 | from agents.Base_Agent import Base_Agent 4 | from agents.DQN_agents.DDQN import DDQN 5 | 6 | class Dueling_DDQN(DDQN): 7 | """A dueling double DQN agent as described in the paper http://proceedings.mlr.press/v48/wangf16.pdf""" 8 | agent_name = "Dueling DDQN" 9 | 10 | def __init__(self, config): 11 | DDQN.__init__(self, config) 12 | self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1) 13 | self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4) 14 | self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1) 15 | 
Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target) 16 | 17 | def pick_action(self, state=None): 18 | """Uses the local Q network and an epsilon greedy policy to pick an action""" 19 | # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add 20 | # a "fake" dimension to make it a mini-batch rather than a single observation 21 | if state is None: state = self.state 22 | state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) 23 | if len(state.shape) < 2: state = state.unsqueeze(0) 24 | self.q_network_local.eval() 25 | with torch.no_grad(): 26 | action_values = self.q_network_local(state) 27 | action_values = action_values[:, :-1] #because we treat the last output element as state-value and rest as advantages 28 | self.q_network_local.train() 29 | action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action_values": action_values, 30 | "turn_off_exploration": self.turn_off_exploration, 31 | "episode_number": self.episode_number}) 32 | return action 33 | 34 | def compute_q_values_for_next_states(self, next_states): 35 | """Computes the q_values for the next states which we will use to create the loss to train the Q network. Double DQN 36 | uses the local network to pick the maximum q_value action and then the target network to calculate the q_value. 37 | The reasoning behind this is that it will help stop the network from overestimating q values""" 38 | max_action_indexes = self.q_network_local(next_states)[:, :-1].detach().argmax(1) 39 | duelling_network_output = self.q_network_target(next_states) 40 | q_values = self.calculate_duelling_q_values(duelling_network_output) 41 | Q_targets_next = q_values.gather(1, max_action_indexes.unsqueeze(1)) 42 | return Q_targets_next 43 | 44 | def calculate_duelling_q_values(self, duelling_q_network_output): 45 | """Calculates the q_values using the duelling network architecture.
This is equation (9) in the paper 46 | referenced at the top of the class""" 47 | state_value = duelling_q_network_output[:, -1] 48 | avg_advantage = torch.mean(duelling_q_network_output[:, :-1], dim=1) 49 | q_values = state_value.unsqueeze(1) + (duelling_q_network_output[:, :-1] - avg_advantage.unsqueeze(1)) 50 | return q_values 51 | 52 | def compute_expected_q_values(self, states, actions): 53 | """Computes the expected q_values we will use to create the loss to train the Q network""" 54 | duelling_network_output = self.q_network_local(states) 55 | q_values = self.calculate_duelling_q_values(duelling_network_output) 56 | Q_expected = q_values.gather(1, actions.long()) 57 | return Q_expected 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /tests/Test_HRL.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import pytest 3 | 4 | 5 | from utilities.Utility_Functions import flatten_action_id_to_actions 6 | from utilities.data_structures.Config import Config 7 | 8 | config = Config() 9 | config.seed = 1 10 | config.environment = gym.make("Taxi-v2") 11 | config.env_parameters = {} 12 | config.num_episodes_to_run = 1000 13 | config.file_to_save_data_results = None 14 | config.file_to_save_results_graph = None 15 | config.show_solution_score = False 16 | config.visualise_individual_results = False 17 | config.visualise_overall_agent_results = True 18 | config.standard_deviation_results = 1.0 19 | config.runs_per_agent = 3 20 | config.use_GPU = False 21 | config.overwrite_existing_results_file = False 22 | config.randomise_random_seed = True 23 | config.save_model = False 24 | 25 | linear_hidden_units = [10, 5] 26 | learning_rate = 0.01 27 | buffer_size = 40000 28 | batch_size = 256 29 | batch_norm = False 30 | embedding_dimensionality = 15 31 | gradient_clipping_norm = 5 32 | update_every_n_steps = 1 33 | learning_iterations = 1 34 | epsilon_decay_rate_denominator = 400 35 | discount_rate = 0.99 36 | tau = 0.01 37 | sequitur_k = 10 38 | 39 | config.hyperparameters = { 40 | 41 | 42 | "linear_hidden_units": linear_hidden_units, 43 | "learning_rate": learning_rate, 44 | "buffer_size": buffer_size, 45 | "batch_size": batch_size, 46 | "final_layer_activation": "None", 47 | "columns_of_data_to_be_embedded": [0], 48 | "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 49 | "batch_norm": batch_norm, 50 | "gradient_clipping_norm": gradient_clipping_norm, 51 | "update_every_n_steps": update_every_n_steps, 52 | "epsilon_decay_rate_denominator": epsilon_decay_rate_denominator, 53 | "discount_rate": discount_rate, 54 | "learning_iterations": learning_iterations, 55 | "tau": tau, 56 | "sequitur_k": sequitur_k, 57 | "action_length_reward_bonus": 0.1, 58 | "episodes_to_run_with_no_exploration": 10, 59 | "pre_training_learning_iterations_multiplier": 0.1, 60 | "copy_over_hidden_layers": True, 61 | "use_global_list_of_best_performing_actions": True 62 | } 63 | 64 | 65 | # hrl = HRL(config) 66 | 67 | # def test_flatten_action_id_to_actions(): 68 | # """Tests flatten_action_id_to_actions""" 69 | # action_id_to_actions = {0: (0,), 1:(1,), 2:(0, 1), 3: (2, 1), 4:(2, 3)} 70 | # original_number_of_primitive_actions = 2 71 | # 72 | # 73 | # 74 | # flattened_action_id_to_actions = flatten_action_id_to_actions(action_id_to_actions, original_number_of_primitive_actions) 75 | # assert flattened_action_id_to_actions == {0: (0,), 1:(1,), 2:(0, 1), 3: (0, 1, 1), 
4:(0, 1, 0, 1, 1)}, flattened_action_id_to_actions 76 | # 77 | # action_id_to_actions = {0: (0,), 1:(1,), 2:(2,)} 78 | # original_number_of_primitive_actions = 3 79 | # flattened_action_id_to_actions = flatten_action_id_to_actions(action_id_to_actions, original_number_of_primitive_actions) 80 | # assert flattened_action_id_to_actions == action_id_to_actions 81 | # 82 | # with pytest.raises(AssertionError): 83 | # action_id_to_actions = {0: (0,), 1: (1,), 2: (2,)} 84 | # original_number_of_primitive_actions = 4 85 | # flattened_action_id_to_actions = flatten_action_id_to_actions(action_id_to_actions, 86 | # original_number_of_primitive_actions) 87 | # with pytest.raises(AssertionError): 88 | # action_id_to_actions = {0: (0,), 1: (1,), 2: (2, 2)} 89 | # original_number_of_primitive_actions = 3 90 | # flattened_action_id_to_actions = flatten_action_id_to_actions(action_id_to_actions, 91 | # original_number_of_primitive_actions) 92 | 93 | -------------------------------------------------------------------------------- /environments/Bit_Flipping_Environment.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import random 3 | import gym 4 | import numpy as np 5 | from gym import spaces 6 | from gym.utils import seeding 7 | 8 | class Bit_Flipping_Environment(gym.Env): 9 | environment_name = "Bit Flipping Game" 10 | 11 | def __init__(self, environment_dimension=20, deterministic=False): 12 | 13 | self.action_space = spaces.Discrete(environment_dimension) 14 | self.observation_space = spaces.Dict(dict( 15 | desired_goal=spaces.Box(0, 1, shape=(environment_dimension,), dtype='float32'), 16 | achieved_goal=spaces.Box(0, 1, shape=(environment_dimension,), dtype='float32'), 17 | observation=spaces.Box(0, 1, shape=(environment_dimension,), dtype='float32'), 18 | )) 19 | 20 | self.seed() 21 | self.reward_threshold = 0.0 22 | self.trials = 50 23 | self.max_episode_steps = environment_dimension 24 | self.id = "Bit Flipping" 25 | self.environment_dimension = environment_dimension 26 | self.reward_for_achieving_goal = self.environment_dimension 27 | self.step_reward_for_not_achieving_goal = -1 28 | 29 | self.deterministic = deterministic 30 | 31 | def seed(self, seed=None): 32 | self.np_random, seed = seeding.np_random(seed) 33 | return [seed] 34 | 35 | def reset(self): 36 | if not self.deterministic: 37 | self.desired_goal = self.randomly_pick_state_or_goal() 38 | self.state = self.randomly_pick_state_or_goal() 39 | else: 40 | self.desired_goal = [0 for _ in range(self.environment_dimension)] 41 | self.state = [1 for _ in range(self.environment_dimension)] 42 | self.state.extend(self.desired_goal) 43 | self.achieved_goal = self.state[:self.environment_dimension] 44 | self.step_count = 0 45 | return {"observation": np.array(self.state[:self.environment_dimension]), "desired_goal": np.array(self.desired_goal), 46 | "achieved_goal": np.array(self.achieved_goal)} 47 | 48 | def randomly_pick_state_or_goal(self): 49 | return [random.randint(0, 1) for _ in range(self.environment_dimension)] 50 | 51 | def step(self, action): 52 | """Conducts the discrete action chosen and updated next_state, reward and done""" 53 | if type(action) is np.ndarray: 54 | action = action[0] 55 | assert action <= self.environment_dimension + 1, "You picked an invalid action" 56 | self.step_count += 1 57 | if action != self.environment_dimension + 1: #otherwise no bit is flipped 58 | self.next_state = copy.copy(self.state) 59 | self.next_state[action] = (self.next_state[action] + 1) % 
2 60 | if self.goal_achieved(self.next_state): 61 | self.reward = self.reward_for_achieving_goal 62 | self.done = True 63 | else: 64 | self.reward = self.step_reward_for_not_achieving_goal 65 | if self.step_count >= self.environment_dimension: 66 | self.done = True 67 | else: 68 | self.done = False 69 | self.achieved_goal = self.next_state[:self.environment_dimension] 70 | self.state = self.next_state 71 | 72 | return {"observation": np.array(self.next_state[:self.environment_dimension]), 73 | "desired_goal": np.array(self.desired_goal), "achieved_goal": np.array(self.achieved_goal)}, self.reward, self.done, {} 74 | 75 | def goal_achieved(self, next_state): 76 | return next_state[:self.environment_dimension] == next_state[-self.environment_dimension:] 77 | 78 | def compute_reward(self, achieved_goal, desired_goal, info): 79 | """Computes the reward we would have got with this achieved goal and desired goal. Must be of this exact 80 | interface to fit with the open AI gym specifications""" 81 | if (achieved_goal == desired_goal).all(): 82 | reward = self.reward_for_achieving_goal 83 | else: 84 | reward = self.step_reward_for_not_achieving_goal 85 | return reward 86 | -------------------------------------------------------------------------------- /tests/Test_Action_Balanced_Replay_Buffer.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import Counter 3 | 4 | import pytest 5 | 6 | from utilities.data_structures.Action_Balanced_Replay_Buffer import Action_Balanced_Replay_Buffer 7 | 8 | def test_add_experience(): 9 | """Tests that add_experience works correctly""" 10 | buffer = Action_Balanced_Replay_Buffer(6, 4, 0, 3) 11 | 12 | rewards = [0 for _ in range(4)] 13 | next_states = [0 for _ in range(4)] 14 | states = [0 for _ in range(4)] 15 | dones = [0 for _ in range(4)] 16 | actions = [0, 1, 2, 0] 17 | 18 | for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones): 19 | buffer.add_experience(state, action, reward, next_state, done) 20 | 21 | assert len(buffer.memories[0]) == 2 22 | assert len(buffer.memories[1]) == 1 23 | assert len(buffer.memories[2]) == 1 24 | 25 | buffer.add_experience(99, 0, 0, 0, 0) 26 | assert len(buffer.memories[0]) == 2 27 | assert buffer.memories[0][1].state == 99 28 | 29 | buffer = Action_Balanced_Replay_Buffer(6, 4, 0, 3) 30 | buffer.add_experience(states, actions, rewards, next_states, dones) 31 | assert len(buffer.memories[0]) == 2 32 | assert len(buffer.memories[1]) == 1 33 | assert len(buffer.memories[2]) == 1 34 | 35 | buffer.add_experience(99, 0, 0, 0, 0) 36 | assert len(buffer.memories[0]) == 2 37 | assert buffer.memories[0][1].state == 99 38 | 39 | def test_add_experience_throws_error(): 40 | """Tests that add_experience works correctly""" 41 | buffer = Action_Balanced_Replay_Buffer(20, 4, 0, 3) 42 | with pytest.raises(KeyError): 43 | buffer.add_experience(3, 99, 1, 0, 0) 44 | buffer.sample() 45 | 46 | buffer = Action_Balanced_Replay_Buffer(20, 4, 0, 3) 47 | buffer.add_experience(3, 2, 1, 0, 0) 48 | 49 | with pytest.raises(AssertionError): 50 | buffer.sample() 51 | 52 | def test_sample_correctly(): 53 | """Tests that sample works correctly""" 54 | buffer = Action_Balanced_Replay_Buffer(20, 4, 0, 3) 55 | buffer.add_experience(3, 2, 1, 0, 0) 56 | buffer.add_experience(2, 0, 1, 0, 0) 57 | buffer.add_experience(1, 1, 1, 0, 0) 58 | states, actions, rewards, next_states, dones = buffer.sample() 59 | 60 | for var in [states, actions, rewards, 
next_states, dones]: 61 | assert len(var) == 4 62 | 63 | num_occurances = 0 64 | tries = 50 65 | 66 | for random_seed in range(tries): 67 | buffer = Action_Balanced_Replay_Buffer(20, 4, random_seed, 3) 68 | buffer.add_experience(3, 2, 1, 0, 0) 69 | buffer.add_experience(2, 0, 1, 0, 0) 70 | buffer.add_experience(1, 1, 1, 0, 0) 71 | states, actions, rewards, next_states, dones = buffer.sample() 72 | if states[2] == 3.0: num_occurances += 1 73 | print(states) 74 | assert num_occurances < tries/2 75 | assert num_occurances > tries/5 76 | 77 | def test_sample_statistics_correct(): 78 | """Tests that sampled experiences correspond to expected statistics""" 79 | tries = 5 80 | for random_seed in range(tries): 81 | for num_actions in range(1, 7): 82 | for buffer_size in [random.randint(55, 9999) for _ in range(10)]: 83 | for batch_size in [random.randint(8, 200) for _ in range(10)]: 84 | buffer = Action_Balanced_Replay_Buffer(buffer_size, batch_size, random.randint(0, 2000000), num_actions) 85 | for _ in range(500): 86 | random_action = random.randint(0, num_actions - 1) 87 | buffer.add_experience(1, random_action, 1, 0, 0) 88 | states, actions, rewards, next_states, dones = buffer.sample() 89 | actions = [action.item() for action in actions] 90 | assert len(actions) == batch_size 91 | count = Counter(actions) 92 | action_count = count[0] 93 | for action in range(num_actions): 94 | assert abs(count[action] - action_count) < 2, print(count[action]) 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /tests/Test_Prioritised_Replay_Buffer.py: -------------------------------------------------------------------------------- 1 | from utilities.data_structures.Prioritised_Replay_Buffer import Prioritised_Replay_Buffer 2 | import numpy as np 3 | import random 4 | 5 | hyperparameters = { 6 | "alpha_prioritised_replay": 0.5, 7 | "beta_prioritised_replay": 0.5, 8 | "incremental_td_error": 0.0, 9 | "buffer_size": 4, 10 | "batch_size": 3 11 | } 12 | 13 | 14 | def test_prioritised_replay_buffer(): 15 | 16 | buffer = Prioritised_Replay_Buffer(hyperparameters) 17 | buffer.add_experience(100, 1, 2, 3, 4, 5) 18 | 19 | assert buffer.deque[0].key == 100.0**hyperparameters["alpha_prioritised_replay"] 20 | assert buffer.deque[0].value == (1, 2, 3, 4, 5) 21 | assert buffer.deque[0].heap_index == 1 22 | assert buffer.heap[1].key == 100.0**hyperparameters["alpha_prioritised_replay"] 23 | assert buffer.heap[1].value == (1, 2, 3, 4, 5) 24 | 25 | buffer.add_experience(99, 1, 2, 3, 4, 5) 26 | buffer.add_experience(98, 1, 2, 3, 4, 5) 27 | 28 | assert buffer.deque[0].key == 100.0**hyperparameters["alpha_prioritised_replay"] 29 | assert buffer.deque[0].value == (1, 2, 3, 4, 5) 30 | assert buffer.deque[0].heap_index == 1 31 | assert buffer.heap[1].key == 100.0**hyperparameters["alpha_prioritised_replay"] 32 | assert buffer.heap[1].value == (1, 2, 3, 4, 5) 33 | 34 | assert buffer.deque[1].key == 99.0**hyperparameters["alpha_prioritised_replay"] 35 | assert buffer.deque[1].value == (1, 2, 3, 4, 5) 36 | assert buffer.deque[1].heap_index == 2 37 | assert buffer.heap[2].key == 99.0**hyperparameters["alpha_prioritised_replay"] 38 | assert buffer.heap[2].value == (1, 2, 3, 4, 5) 39 | 40 | assert buffer.deque[2].key == 98.0**hyperparameters["alpha_prioritised_replay"] 41 | assert buffer.deque[2].value == (1, 2, 3, 4, 5) 42 | assert buffer.deque[2].heap_index == 3 43 | assert buffer.heap[3].key == 98.0**hyperparameters["alpha_prioritised_replay"] 44 | assert buffer.heap[3].value == (1, 
2, 3, 4, 5) 45 | 46 | buffer.add_experience(105, 1, 2, 3, 4, 5) 47 | 48 | assert buffer.deque[3].key == 105.0**hyperparameters["alpha_prioritised_replay"] 49 | assert buffer.deque[3].value == (1, 2, 3, 4, 5) 50 | assert buffer.deque[3].heap_index == 1 51 | assert buffer.heap[1].key == 105.0**hyperparameters["alpha_prioritised_replay"] 52 | assert buffer.heap[1].value == (1, 2, 3, 4, 5) 53 | assert buffer.heap[2].key == 100.0 ** hyperparameters["alpha_prioritised_replay"] 54 | 55 | buffer.add_experience(101, 1, 24, 3, 4, 5) 56 | 57 | assert buffer.deque[0].key == 101.0 ** hyperparameters["alpha_prioritised_replay"] 58 | assert buffer.deque[0].value == (1, 24, 3, 4, 5) 59 | assert buffer.deque[0].heap_index == 2 60 | assert buffer.heap[2].key == 101.0 ** hyperparameters["alpha_prioritised_replay"] 61 | assert buffer.heap[2].value == (1, 24, 3, 4, 5) 62 | 63 | 64 | def test_heap_always_keeps_max_element_at_top(): 65 | hyperparameters["buffer_size"] = 200 66 | for _ in range(100): 67 | buffer = Prioritised_Replay_Buffer(hyperparameters) 68 | elements_added = [] 69 | for ix in range(1, 100): 70 | element = random.random() 71 | elements_added.append(element) 72 | buffer.add_experience(element, 0, 0, 0, 0, 0) 73 | 74 | max_key = np.max(elements_added)** hyperparameters["alpha_prioritised_replay"] 75 | assert round(buffer.give_max_td_error(), 8) == round(max_key, 8), "{}".format(elements_added) 76 | 77 | def test_give_sum_of_elements_is_always_correct(): 78 | hyperparameters["buffer_size"] = 200 79 | for _ in range(100): 80 | buffer = Prioritised_Replay_Buffer(hyperparameters) 81 | elements_added = [] 82 | for ix in range(1, 100): 83 | element = random.random() 84 | elements_added.append((abs(element) + hyperparameters["incremental_td_error"]) ** hyperparameters["alpha_prioritised_replay"]) 85 | buffer.add_experience(element, 0, 0, 0, 0, 0) 86 | 87 | sum_key = np.sum(elements_added) 88 | assert round(buffer.give_adapted_sum_of_td_errors(), 8) == round(sum_key, 8), "{}".format(elements_added) 89 | -------------------------------------------------------------------------------- /environments/ant_environments/maze_env_utils.py: -------------------------------------------------------------------------------- 1 | 2 | # NOTE THIS CODE IS TAKEN FROM https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 3 | # and is not my code. 
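# Illustrative note on the layouts returned by construct_maze below: 1 marks a wall cell,
# 0 a free cell, 'r' the robot's start cell, and the Move.* constants mark movable blocks
# (-1 appears only in the 'Fall' layout). A minimal sketch for inspecting a layout,
# assuming only this module (not a tested snippet):
#
#     structure = construct_maze(maze_id='Maze')
#     for row in structure:
#         print(row)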
4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | import numpy as np 12 | import math 13 | 14 | 15 | class Move(object): 16 | X = 11 17 | Y = 12 18 | Z = 13 19 | XY = 14 20 | XZ = 15 21 | YZ = 16 22 | XYZ = 17 23 | SpinXY = 18 24 | 25 | 26 | def can_move_x(movable): 27 | return movable in [Move.X, Move.XY, Move.XZ, Move.XYZ, 28 | Move.SpinXY] 29 | 30 | 31 | def can_move_y(movable): 32 | return movable in [Move.Y, Move.XY, Move.YZ, Move.XYZ, 33 | Move.SpinXY] 34 | 35 | 36 | def can_move_z(movable): 37 | return movable in [Move.Z, Move.XZ, Move.YZ, Move.XYZ] 38 | 39 | 40 | def can_spin(movable): 41 | return movable in [Move.SpinXY] 42 | 43 | 44 | def can_move(movable): 45 | return can_move_x(movable) or can_move_y(movable) or can_move_z(movable) 46 | 47 | 48 | def construct_maze(maze_id='Maze'): 49 | if maze_id == 'Maze': 50 | structure = [ 51 | [1, 1, 1, 1, 1], 52 | [1, 'r', 0, 0, 1], 53 | [1, 1, 1, 0, 1], 54 | [1, 0, 0, 0, 1], 55 | [1, 1, 1, 1, 1], 56 | ] 57 | elif maze_id == 'Push': 58 | structure = [ 59 | [1, 1, 1, 1, 1], 60 | [1, 0, 'r', 1, 1], 61 | [1, 0, Move.XY, 0, 1], 62 | [1, 1, 0, 1, 1], 63 | [1, 1, 1, 1, 1], 64 | ] 65 | elif maze_id == 'Fall': 66 | structure = [ 67 | [1, 1, 1, 1], 68 | [1, 'r', 0, 1], 69 | [1, 0, Move.YZ, 1], 70 | [1, -1, -1, 1], 71 | [1, 0, 0, 1], 72 | [1, 1, 1, 1], 73 | ] 74 | elif maze_id == 'Block': 75 | O = 'r' 76 | structure = [ 77 | [1, 1, 1, 1, 1], 78 | [1, O, 0, 0, 1], 79 | [1, 0, 0, 0, 1], 80 | [1, 0, 0, 0, 1], 81 | [1, 1, 1, 1, 1], 82 | ] 83 | elif maze_id == 'BlockMaze': 84 | O = 'r' 85 | structure = [ 86 | [1, 1, 1, 1], 87 | [1, O, 0, 1], 88 | [1, 1, 0, 1], 89 | [1, 0, 0, 1], 90 | [1, 1, 1, 1], 91 | ] 92 | else: 93 | raise NotImplementedError('The provided MazeId %s is not recognized' % maze_id) 94 | 95 | return structure 96 | 97 | 98 | def line_intersect(pt1, pt2, ptA, ptB): 99 | """ 100 | Taken from https://www.cs.hmc.edu/ACM/lectures/intersections.html 101 | 102 | this returns the intersection of Line(pt1,pt2) and Line(ptA,ptB) 103 | """ 104 | 105 | DET_TOLERANCE = 0.00000001 106 | 107 | # the first line is pt1 + r*(pt2-pt1) 108 | # in component form: 109 | x1, y1 = pt1 110 | x2, y2 = pt2 111 | dx1 = x2 - x1 112 | dy1 = y2 - y1 113 | 114 | # the second line is ptA + s*(ptB-ptA) 115 | x, y = ptA 116 | xB, yB = ptB 117 | dx = xB - x 118 | dy = yB - y 119 | 120 | DET = (-dx1 * dy + dy1 * dx) 121 | 122 | if math.fabs(DET) < DET_TOLERANCE: return (0, 0, 0, 0, 0) 123 | 124 | # now, the determinant should be OK 125 | DETinv = 1.0 / DET 126 | 127 | # find the scalar amount along the "self" segment 128 | r = DETinv * (-dy * (x - x1) + dx * (y - y1)) 129 | 130 | # find the scalar amount along the input line 131 | s = DETinv * (-dy1 * (x - x1) + dx1 * (y - y1)) 132 | 133 | # return the average of the two descriptions 134 | xi = (x1 + r * dx1 + x + s * dx) / 2.0 135 | yi = (y1 + r * dy1 + y + s * dy) / 2.0 136 | return (xi, yi, 1, r, s) 137 | 138 | 139 | def ray_segment_intersect(ray, segment): 140 | """ 141 | Check if the ray originated from (x, y) with direction theta intersects the line segment (x1, y1) -- (x2, y2), 142 | and return the intersection point if there is one 143 | """ 144 | (x, y), theta = ray 145 | # (x1, y1), (x2, y2) = segment 146 | pt1 = (x, y) 147 | len = 1 148 | pt2 = (x + len * math.cos(theta), y + len * math.sin(theta)) 149 | xo, yo, valid, r, s = line_intersect(pt1, pt2, *segment) 150 | if valid and r >= 0 and 0 <= s <= 1: 151 | return (xo, yo) 152 | return None 153 | 154 | 155 | def point_distance(p1, p2): 156 | x1, y1 = p1 157 | x2, y2 = p2 158 | 
return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5 159 | -------------------------------------------------------------------------------- /utilities/Parallel_Experience_Generator.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import sys 4 | from contextlib import closing 5 | # 6 | # from pathos.multiprocessing import ProcessingPool as Pool 7 | 8 | from torch.multiprocessing import Pool 9 | from random import randint 10 | 11 | from utilities.OU_Noise import OU_Noise 12 | from utilities.Utility_Functions import create_actor_distribution 13 | 14 | class Parallel_Experience_Generator(object): 15 | """ Plays n episode in parallel using a fixed agent. Only works for PPO or DDPG type agents at the moment, not Q-learning agents""" 16 | def __init__(self, environment, policy, seed, hyperparameters, action_size, use_GPU=False, action_choice_output_columns=None): 17 | self.use_GPU = use_GPU 18 | self.environment = environment 19 | self.action_types = "DISCRETE" if self.environment.action_space.dtype == int else "CONTINUOUS" 20 | self.action_size = action_size 21 | self.policy = policy 22 | self.action_choice_output_columns = action_choice_output_columns 23 | self.hyperparameters = hyperparameters 24 | if self.action_types == "CONTINUOUS": self.noise = OU_Noise(self.action_size, seed, self.hyperparameters["mu"], 25 | self.hyperparameters["theta"], self.hyperparameters["sigma"]) 26 | 27 | 28 | def play_n_episodes(self, n, exploration_epsilon=None): 29 | """Plays n episodes in parallel using the fixed policy and returns the data""" 30 | self.exploration_epsilon = exploration_epsilon 31 | with closing(Pool(processes=n)) as pool: 32 | results = pool.map(self, range(n)) 33 | pool.terminate() 34 | states_for_all_episodes = [episode[0] for episode in results] 35 | actions_for_all_episodes = [episode[1] for episode in results] 36 | rewards_for_all_episodes = [episode[2] for episode in results] 37 | return states_for_all_episodes, actions_for_all_episodes, rewards_for_all_episodes 38 | 39 | def __call__(self, n): 40 | exploration = max(0.0, random.uniform(self.exploration_epsilon / 3.0, self.exploration_epsilon * 3.0)) 41 | return self.play_1_episode(exploration) 42 | 43 | def play_1_episode(self, epsilon_exploration): 44 | """Plays 1 episode using the fixed policy and returns the data""" 45 | state = self.reset_game() 46 | done = False 47 | episode_states = [] 48 | episode_actions = [] 49 | episode_rewards = [] 50 | while not done: 51 | action = self.pick_action(self.policy, state, epsilon_exploration) 52 | next_state, reward, done, _ = self.environment.step(action) 53 | if self.hyperparameters["clip_rewards"]: reward = max(min(reward, 1.0), -1.0) 54 | episode_states.append(state) 55 | episode_actions.append(action) 56 | episode_rewards.append(reward) 57 | state = next_state 58 | return episode_states, episode_actions, episode_rewards 59 | 60 | def reset_game(self): 61 | """Resets the game environment so it is ready to play a new episode""" 62 | seed = randint(0, sys.maxsize) 63 | torch.manual_seed(seed) # Need to do this otherwise each worker generates same experience 64 | state = self.environment.reset() 65 | if self.action_types == "CONTINUOUS": self.noise.reset() 66 | return state 67 | 68 | def pick_action(self, policy, state, epsilon_exploration=None): 69 | """Picks an action using the policy""" 70 | if self.action_types == "DISCRETE": 71 | if random.random() <= epsilon_exploration: 72 | action = random.randint(0, self.action_size 
- 1) 73 | return action 74 | 75 | state = torch.from_numpy(state).float().unsqueeze(0) 76 | actor_output = policy.forward(state) 77 | if self.action_choice_output_columns is not None: 78 | actor_output = actor_output[:, self.action_choice_output_columns] 79 | action_distribution = create_actor_distribution(self.action_types, actor_output, self.action_size) 80 | action = action_distribution.sample().cpu() 81 | 82 | if self.action_types == "CONTINUOUS": action += torch.Tensor(self.noise.sample()) 83 | else: action = action.item() 84 | return action -------------------------------------------------------------------------------- /environments/ant_environments/ant.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # NOTE THIS CODE IS TAKEN FROM https://github.com/tensorflow/models/tree/master/research/efficient-hrl/environments 4 | # and is not my code. 5 | 6 | 7 | 8 | 9 | """Wrapper for creating the ant environment in gym_mujoco.""" 10 | 11 | import math 12 | import numpy as np 13 | from gym import utils 14 | from gym.envs.mujoco import mujoco_env 15 | 16 | 17 | def q_inv(a): 18 | return [a[0], -a[1], -a[2], -a[3]] 19 | 20 | 21 | def q_mult(a, b): # multiply two quaternion 22 | w = a[0] * b[0] - a[1] * b[1] - a[2] * b[2] - a[3] * b[3] 23 | i = a[0] * b[1] + a[1] * b[0] + a[2] * b[3] - a[3] * b[2] 24 | j = a[0] * b[2] - a[1] * b[3] + a[2] * b[0] + a[3] * b[1] 25 | k = a[0] * b[3] + a[1] * b[2] - a[2] * b[1] + a[3] * b[0] 26 | return [w, i, j, k] 27 | 28 | 29 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): 30 | FILE = "ant.xml" 31 | ORI_IND = 3 32 | 33 | def __init__(self, file_path=None, expose_all_qpos=True, 34 | expose_body_coms=None, expose_body_comvels=None): 35 | self._expose_all_qpos = expose_all_qpos 36 | self._expose_body_coms = expose_body_coms 37 | self._expose_body_comvels = expose_body_comvels 38 | self._body_com_indices = {} 39 | self._body_comvel_indices = {} 40 | 41 | 42 | mujoco_env.MujocoEnv.__init__(self, file_path, 5) 43 | utils.EzPickle.__init__(self) 44 | 45 | @property 46 | def physics(self): 47 | return self.model 48 | 49 | def _step(self, a): 50 | return self.step(a) 51 | 52 | def step(self, a): 53 | xposbefore = self.get_body_com("torso")[0] 54 | self.do_simulation(a, self.frame_skip) 55 | xposafter = self.get_body_com("torso")[0] 56 | forward_reward = (xposafter - xposbefore) / self.dt 57 | ctrl_cost = .5 * np.square(a).sum() 58 | survive_reward = 1.0 59 | reward = forward_reward - ctrl_cost + survive_reward 60 | state = self.state_vector() 61 | done = False 62 | ob = self._get_obs() 63 | return ob, reward, done, dict( 64 | reward_forward=forward_reward, 65 | reward_ctrl=-ctrl_cost, 66 | reward_survive=survive_reward) 67 | 68 | def _get_obs(self): 69 | # No cfrc observation 70 | if self._expose_all_qpos: 71 | obs = np.concatenate([ 72 | self.data.qpos.flat[:15], # Ensures only ant obs. 
73 | self.data.qvel.flat[:14], 74 | ]) 75 | else: 76 | obs = np.concatenate([ 77 | self.data.qpos.flat[2:15], 78 | self.data.qvel.flat[:14], 79 | ]) 80 | 81 | if self._expose_body_coms is not None: 82 | for name in self._expose_body_coms: 83 | com = self.get_body_com(name) 84 | if name not in self._body_com_indices: 85 | indices = range(len(obs), len(obs) + len(com)) 86 | self._body_com_indices[name] = indices 87 | obs = np.concatenate([obs, com]) 88 | 89 | if self._expose_body_comvels is not None: 90 | for name in self._expose_body_comvels: 91 | comvel = self.get_body_comvel(name) 92 | if name not in self._body_comvel_indices: 93 | indices = range(len(obs), len(obs) + len(comvel)) 94 | self._body_comvel_indices[name] = indices 95 | obs = np.concatenate([obs, comvel]) 96 | return obs 97 | 98 | def reset_model(self): 99 | qpos = self.init_qpos + self.np_random.uniform( 100 | size=self.model.nq, low=-.1, high=.1) 101 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 102 | 103 | # Set everything other than ant to original position and 0 velocity. 104 | qpos[15:] = self.init_qpos[15:] 105 | qvel[14:] = 0. 106 | self.set_state(qpos, qvel) 107 | return self._get_obs() 108 | 109 | def viewer_setup(self): 110 | self.viewer.cam.distance = self.model.stat.extent * 0.5 111 | 112 | def get_ori(self): 113 | ori = [0, 1, 0, 0] 114 | rot = self.data.qpos[self.__class__.ORI_IND:self.__class__.ORI_IND + 4] # take the quaternion 115 | ori = q_mult(q_mult(rot, ori), q_inv(rot))[1:3] # project onto x-y plane 116 | ori = math.atan2(ori[1], ori[0]) 117 | return ori 118 | 119 | def set_xy(self, xy): 120 | qpos = np.copy(self.data.qpos) 121 | qpos[0] = xy[0] 122 | qpos[1] = xy[1] 123 | 124 | qvel = self.data.qvel 125 | self.set_state(qpos, qvel) 126 | 127 | def get_xy(self): 128 | return self.data.qpos[:2] 129 | -------------------------------------------------------------------------------- /agents/policy_gradient_agents/REINFORCE.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.optim as optim 4 | from torch.distributions import Categorical 5 | from agents.Base_Agent import Base_Agent 6 | 7 | class REINFORCE(Base_Agent): 8 | agent_name = "REINFORCE" 9 | def __init__(self, config): 10 | Base_Agent.__init__(self, config) 11 | self.policy = self.create_NN(input_dim=self.state_size, output_dim=self.action_size) 12 | self.optimizer = optim.Adam(self.policy.parameters(), lr=self.hyperparameters["learning_rate"]) 13 | self.episode_rewards = [] 14 | self.episode_log_probabilities = [] 15 | 16 | def reset_game(self): 17 | """Resets the game information so we are ready to play a new episode""" 18 | self.state = self.environment.reset_environment() 19 | self.next_state = None 20 | self.action = None 21 | self.reward = None 22 | self.done = False 23 | self.total_episode_score_so_far = 0 24 | self.episode_rewards = [] 25 | self.episode_log_probabilities = [] 26 | self.episode_step_number = 0 27 | 28 | def step(self): 29 | """Runs a step within a game including a learning step if required""" 30 | while not self.done: 31 | self.pick_and_conduct_action_and_save_log_probabilities() 32 | self.update_next_state_reward_done_and_score() 33 | self.store_reward() 34 | if self.time_to_learn(): 35 | self.actor_learn() 36 | self.state = self.next_state #this is to set the state for the next iteration 37 | self.episode_step_number += 1 38 | self.episode_number += 1 39 | 40 | def 
pick_and_conduct_action_and_save_log_probabilities(self): 41 | """Picks and then conducts actions. Then saves the log probabilities of the actions it conducted to be used for 42 | learning later""" 43 | action, log_probabilities = self.pick_action_and_get_log_probabilities() 44 | self.store_log_probabilities(log_probabilities) 45 | self.store_action(action) 46 | self.conduct_action() 47 | 48 | def pick_action_and_get_log_probabilities(self): 49 | """Picks actions and then calculates the log probabilities of the actions it picked given the policy""" 50 | # PyTorch only accepts mini-batches and not individual observations so we have to add 51 | # a "fake" dimension to our observation using unsqueeze 52 | state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device) 53 | action_probabilities = self.policy.forward(state).cpu() 54 | action_distribution = Categorical(action_probabilities) # this creates a distribution to sample from 55 | action = action_distribution.sample() 56 | return action.item(), action_distribution.log_prob(action) 57 | 58 | def store_log_probabilities(self, log_probabilities): 59 | """Stores the log probabilities of picked actions to be used for learning later""" 60 | self.episode_log_probabilities.append(log_probabilities) 61 | 62 | def store_action(self, action): 63 | """Stores the action picked""" 64 | self.action = action 65 | 66 | def store_reward(self): 67 | """Stores the reward picked""" 68 | self.episode_rewards.append(self.reward) 69 | 70 | def actor_learn(self): 71 | """Runs a learning iteration for the policy""" 72 | total_discounted_reward = self.calculate_episode_discounted_reward() 73 | policy_loss = self.calculate_policy_loss_on_episode(total_discounted_reward) 74 | self.optimizer.zero_grad() 75 | policy_loss.backward() 76 | self.optimizer.step() 77 | 78 | def calculate_episode_discounted_reward(self): 79 | """Calculates the cumulative discounted return for the episode""" 80 | discounts = self.hyperparameters["discount_rate"] ** np.arange(len(self.episode_rewards)) 81 | total_discounted_reward = np.dot(discounts, self.episode_rewards) 82 | return total_discounted_reward 83 | 84 | def calculate_policy_loss_on_episode(self, total_discounted_reward): 85 | """Calculates the loss from an episode""" 86 | policy_loss = [] 87 | for log_prob in self.episode_log_probabilities: 88 | policy_loss.append(-log_prob * total_discounted_reward) 89 | policy_loss = torch.cat(policy_loss).sum() # We need to add up the losses across the mini-batch to get 1 overall loss 90 | return policy_loss 91 | 92 | def time_to_learn(self): 93 | """Tells us whether it is time for the algorithm to learn. 
With REINFORCE we only learn at the end of every 94 | episode so this just returns whether the episode is over""" 95 | return self.done 96 | -------------------------------------------------------------------------------- /results/Hopper.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from agents.policy_gradient_agents.PPO import PPO 3 | from agents.actor_critic_agents.DDPG import DDPG 4 | from agents.actor_critic_agents.SAC import SAC 5 | from agents.actor_critic_agents.TD3 import TD3 6 | from agents.Trainer import Trainer 7 | from agents.hierarchical_agents.DIAYN import DIAYN 8 | from utilities.data_structures.Config import Config 9 | 10 | 11 | config = Config() 12 | config.seed = 1 13 | config.environment = gym.make("Hopper-v2") 14 | config.num_episodes_to_run = 1000 15 | config.file_to_save_data_results = "data_and_graphs/Hopper_Results_Data.pkl" 16 | config.file_to_save_results_graph = "data_and_graphs/Hopper_Results_Graph.png" 17 | config.show_solution_score = False 18 | config.visualise_individual_results = False 19 | config.visualise_overall_agent_results = True 20 | config.standard_deviation_results = 1.0 21 | config.runs_per_agent = 3 22 | config.use_GPU = False 23 | config.overwrite_existing_results_file = False 24 | config.randomise_random_seed = True 25 | config.save_model = False 26 | 27 | 28 | actor_critic_agent_hyperparameters = { 29 | "Actor": { 30 | "learning_rate": 0.0003, 31 | "linear_hidden_units": [64, 64], 32 | "final_layer_activation": None, 33 | "batch_norm": False, 34 | "tau": 0.005, 35 | "gradient_clipping_norm": 5, 36 | "initialiser": "Xavier" 37 | }, 38 | 39 | "Critic": { 40 | "learning_rate": 0.0003, 41 | "linear_hidden_units": [64, 64], 42 | "final_layer_activation": None, 43 | "batch_norm": False, 44 | "buffer_size": 1000000, 45 | "tau": 0.005, 46 | "gradient_clipping_norm": 5, 47 | "initialiser": "Xavier" 48 | }, 49 | 50 | "min_steps_before_learning": 400, 51 | "batch_size": 256, 52 | "discount_rate": 0.99, 53 | "mu": 0.0, #for O-H noise 54 | "theta": 0.15, #for O-H noise 55 | "sigma": 0.25, #for O-H noise 56 | "action_noise_std": 0.2, # for TD3 57 | "action_noise_clipping_range": 0.5, # for TD3 58 | "update_every_n_steps": 1, 59 | "learning_updates_per_learning_session": 1, 60 | "automatically_tune_entropy_hyperparameter": True, 61 | "entropy_term_weight": None, 62 | "add_extra_noise": False, 63 | "do_evaluation_iterations": True, 64 | "clip_rewards": False 65 | } 66 | 67 | dqn_agent_hyperparameters = { 68 | "learning_rate": 0.005, 69 | "batch_size": 128, 70 | "buffer_size": 40000, 71 | "epsilon": 1.0, 72 | "epsilon_decay_rate_denominator": 3, 73 | "discount_rate": 0.99, 74 | "tau": 0.01, 75 | "alpha_prioritised_replay": 0.6, 76 | "beta_prioritised_replay": 0.1, 77 | "incremental_td_error": 1e-8, 78 | "update_every_n_steps": 3, 79 | "linear_hidden_units": [30, 15], 80 | "final_layer_activation": "None", 81 | "batch_norm": False, 82 | "gradient_clipping_norm": 5, 83 | "clip_rewards": False 84 | } 85 | 86 | 87 | manager_hyperparameters = dqn_agent_hyperparameters 88 | manager_hyperparameters.update({"timesteps_to_give_up_control_for": 5}) 89 | 90 | 91 | config.hyperparameters = { 92 | "Policy_Gradient_Agents": { 93 | "learning_rate": 0.05, 94 | "linear_hidden_units": [30, 15], 95 | "final_layer_activation": "TANH", 96 | "learning_iterations_per_round": 10, 97 | "discount_rate": 0.9, 98 | "batch_norm": False, 99 | "clip_epsilon": 0.2, 100 | "episodes_per_learning_round": 10, 101 | "normalise_rewards": 
True, 102 | "gradient_clipping_norm": 5, 103 | "mu": 0.0, 104 | "theta": 0.15, 105 | "sigma": 0.2, 106 | "epsilon_decay_rate_denominator": 1, 107 | "clip_rewards": False 108 | }, 109 | 110 | "Actor_Critic_Agents": actor_critic_agent_hyperparameters, 111 | "DIAYN": { 112 | "DISCRIMINATOR": { 113 | "learning_rate": 0.001, 114 | "linear_hidden_units": [32, 32], 115 | "final_layer_activation": None, 116 | "gradient_clipping_norm": 5 117 | 118 | }, 119 | "AGENT": actor_critic_agent_hyperparameters, 120 | "MANAGER": manager_hyperparameters, 121 | "num_skills": 10, 122 | "num_unsupservised_episodes": 500 123 | } 124 | } 125 | 126 | 127 | if __name__ == "__main__": 128 | AGENTS = [SAC, DIAYN] #SAC] #, DDPG, PPO, TD3] ] #, 129 | trainer = Trainer(config, AGENTS) 130 | trainer.run_games_for_agents() 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /results/Walker.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from agents.policy_gradient_agents.PPO import PPO 3 | from agents.actor_critic_agents.DDPG import DDPG 4 | from agents.actor_critic_agents.SAC import SAC 5 | from agents.actor_critic_agents.TD3 import TD3 6 | from agents.Trainer import Trainer 7 | from agents.hierarchical_agents.DIAYN import DIAYN 8 | from utilities.data_structures.Config import Config 9 | 10 | 11 | config = Config() 12 | config.seed = 1 13 | config.environment = gym.make("Walker2d-v2") 14 | config.num_episodes_to_run = 400 15 | config.file_to_save_data_results = "data_and_graphs/Walker_Results_Data.pkl" 16 | config.file_to_save_results_graph = "data_and_graphs/Walker_Results_Graph.png" 17 | config.show_solution_score = False 18 | config.visualise_individual_results = False 19 | config.visualise_overall_agent_results = True 20 | config.standard_deviation_results = 1.0 21 | config.runs_per_agent = 3 22 | config.use_GPU = False 23 | config.overwrite_existing_results_file = False 24 | config.randomise_random_seed = True 25 | config.save_model = False 26 | 27 | clip_rewards = False  # whether rewards are clipped to [-1, 1]; referenced by the hyperparameter dictionaries below 28 | actor_critic_agent_hyperparameters = { 29 | "Actor": { 30 | "learning_rate": 0.0003, 31 | "linear_hidden_units": [64, 64], 32 | "final_layer_activation": None, 33 | "batch_norm": False, 34 | "tau": 0.005, 35 | "gradient_clipping_norm": 5, 36 | "initialiser": "Xavier" 37 | }, 38 | 39 | "Critic": { 40 | "learning_rate": 0.0003, 41 | "linear_hidden_units": [64, 64], 42 | "final_layer_activation": None, 43 | "batch_norm": False, 44 | "buffer_size": 1000000, 45 | "tau": 0.005, 46 | "gradient_clipping_norm": 5, 47 | "initialiser": "Xavier" 48 | }, 49 | 50 | "min_steps_before_learning": 400, 51 | "batch_size": 256, 52 | "discount_rate": 0.99, 53 | "mu": 0.0, #for O-H noise 54 | "theta": 0.15, #for O-H noise 55 | "sigma": 0.25, #for O-H noise 56 | "action_noise_std": 0.2, # for TD3 57 | "action_noise_clipping_range": 0.5, # for TD3 58 | "update_every_n_steps": 1, 59 | "learning_updates_per_learning_session": 1, 60 | "automatically_tune_entropy_hyperparameter": True, 61 | "entropy_term_weight": None, 62 | "add_extra_noise": False, 63 | "do_evaluation_iterations": True, 64 | "clip_rewards": clip_rewards 65 | } 66 | 67 | dqn_agent_hyperparameters = { 68 | "learning_rate": 0.005, 69 | "batch_size": 128, 70 | "buffer_size": 40000, 71 | "epsilon": 1.0, 72 | "epsilon_decay_rate_denominator": 3, 73 | "discount_rate": 0.99, 74 | "tau": 0.01, 75 | "alpha_prioritised_replay": 0.6, 76 | "beta_prioritised_replay": 0.1, 77 | "incremental_td_error": 1e-8, 78 |
"update_every_n_steps": 3, 79 | "linear_hidden_units": [30, 15], 80 | "final_layer_activation": "None", 81 | "batch_norm": False, 82 | "gradient_clipping_norm": 5, 83 | "clip_rewards": clip_rewards 84 | } 85 | 86 | 87 | manager_hyperparameters = dqn_agent_hyperparameters 88 | manager_hyperparameters.update({"timesteps_to_give_up_control_for": 5}) 89 | 90 | 91 | config.hyperparameters = { 92 | "Policy_Gradient_Agents": { 93 | "learning_rate": 0.05, 94 | "linear_hidden_units": [30, 15], 95 | "final_layer_activation": "TANH", 96 | "learning_iterations_per_round": 10, 97 | "discount_rate": 0.9, 98 | "batch_norm": False, 99 | "clip_epsilon": 0.2, 100 | "episodes_per_learning_round": 10, 101 | "normalise_rewards": True, 102 | "gradient_clipping_norm": 5, 103 | "mu": 0.0, 104 | "theta": 0.15, 105 | "sigma": 0.2, 106 | "epsilon_decay_rate_denominator": 1, 107 | "clip_rewards": clip_rewards 108 | }, 109 | 110 | "Actor_Critic_Agents": actor_critic_agent_hyperparameters, 111 | "DIAYN": { 112 | "DISCRIMINATOR": { 113 | "learning_rate": 0.001, 114 | "linear_hidden_units": [32, 32], 115 | "final_layer_activation": None, 116 | "gradient_clipping_norm": 5 117 | 118 | }, 119 | "AGENT": actor_critic_agent_hyperparameters, 120 | "MANAGER": manager_hyperparameters, 121 | "num_skills": 10, 122 | "num_unsupservised_episodes": 100 123 | } 124 | } 125 | 126 | 127 | if __name__ == "__main__": 128 | AGENTS = [DIAYN] #, SAC] #, DDPG, PPO, TD3] ] #,DIAYN] # 129 | trainer = Trainer(config, AGENTS) 130 | trainer.run_games_for_agents() 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /results/Reacher.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from agents.Trainer import Trainer 3 | from agents.actor_critic_agents.DDPG import DDPG 4 | from agents.hierarchical_agents.HIRO import HIRO 5 | from utilities.data_structures.Config import Config 6 | config = Config() 7 | config.seed = 1 8 | config.environment = gym.make("Reacher-v2") # Reacher-v2 "InvertedPendulum-v2") #Pendulum-v0 9 | config.num_episodes_to_run = 1500 10 | config.file_to_save_data_results = None 11 | config.file_to_save_results_graph = None 12 | config.show_solution_score = False 13 | config.visualise_individual_results = False 14 | config.visualise_overall_agent_results = True 15 | config.standard_deviation_results = 1.0 16 | config.runs_per_agent = 1 17 | config.use_GPU = False 18 | config.overwrite_existing_results_file = False 19 | config.randomise_random_seed = True 20 | config.save_model = False 21 | 22 | 23 | 24 | 25 | config.hyperparameters = { 26 | "HIRO": { 27 | 28 | "LOWER_LEVEL": { 29 | "max_lower_level_timesteps": 5, 30 | 31 | "Actor": { 32 | "learning_rate": 0.001, 33 | "linear_hidden_units": [20, 20], 34 | "final_layer_activation": "TANH", 35 | "batch_norm": False, 36 | "tau": 0.005, 37 | "gradient_clipping_norm": 5 38 | }, 39 | 40 | "Critic": { 41 | "learning_rate": 0.01, 42 | "linear_hidden_units": [20, 20], 43 | "final_layer_activation": "None", 44 | "batch_norm": False, 45 | "buffer_size": 100000, 46 | "tau": 0.005, 47 | "gradient_clipping_norm": 5 48 | }, 49 | 50 | "batch_size": 256, 51 | "discount_rate": 0.9, 52 | "mu": 0.0, # for O-H noise 53 | "theta": 0.15, # for O-H noise 54 | "sigma": 0.25, # for O-H noise 55 | "action_noise_std": 0.2, # for TD3 56 | "action_noise_clipping_range": 0.5, # for TD3 57 | "update_every_n_steps": 20, 58 | "learning_updates_per_learning_session": 10, 59 | "clip_rewards": False 
60 | 61 | } , 62 | 63 | 64 | 65 | "HIGHER_LEVEL": { 66 | 67 | "Actor": { 68 | "learning_rate": 0.001, 69 | "linear_hidden_units": [20, 20], 70 | "final_layer_activation": "TANH", 71 | "batch_norm": False, 72 | "tau": 0.005, 73 | "gradient_clipping_norm": 5 74 | }, 75 | 76 | "Critic": { 77 | "learning_rate": 0.01, 78 | "linear_hidden_units": [20, 20], 79 | "final_layer_activation": "None", 80 | "batch_norm": False, 81 | "buffer_size": 100000, 82 | "tau": 0.005, 83 | "gradient_clipping_norm": 5 84 | }, 85 | 86 | "batch_size": 256, 87 | "discount_rate": 0.9, 88 | "mu": 0.0, # for O-H noise 89 | "theta": 0.15, # for O-H noise 90 | "sigma": 0.25, # for O-H noise 91 | "action_noise_std": 0.2, # for TD3 92 | "action_noise_clipping_range": 0.5, # for TD3 93 | "update_every_n_steps": 20, 94 | "learning_updates_per_learning_session": 10, 95 | "clip_rewards": False 96 | 97 | } , 98 | 99 | 100 | }, 101 | "Actor_Critic_Agents": { # hyperparameters taken from https://arxiv.org/pdf/1802.09477.pdf 102 | "Actor": { 103 | "learning_rate": 0.001, 104 | "linear_hidden_units": [400, 300], 105 | "final_layer_activation": "TANH", 106 | "batch_norm": False, 107 | "tau": 0.01, 108 | "gradient_clipping_norm": 5 109 | }, 110 | 111 | "Critic": { 112 | "learning_rate": 0.01, 113 | "linear_hidden_units": [400, 300], 114 | "final_layer_activation": "None", 115 | "batch_norm": False, 116 | "buffer_size": 100000, 117 | "tau": 0.01, 118 | "gradient_clipping_norm": 5 119 | }, 120 | 121 | "batch_size": 64, 122 | "discount_rate": 0.99, 123 | "mu": 0.0, # for O-H noise 124 | "theta": 0.15, # for O-H noise 125 | "sigma": 0.2, # for O-H noise 126 | "action_noise_std": 0.2, # for TD3 127 | "action_noise_clipping_range": 0.5, # for TD3 128 | "update_every_n_steps": 1, 129 | "learning_updates_per_learning_session": 1, 130 | "clip_rewards": False 131 | 132 | } 133 | 134 | 135 | } 136 | 137 | 138 | if __name__ == "__main__": 139 | AGENTS = [DDPG, HIRO] 140 | trainer = Trainer(config, AGENTS) 141 | trainer.run_games_for_agents() 142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /environments/ant_environments/assets/ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 82 | -------------------------------------------------------------------------------- /results/HRL_Experiments.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from agents.hierarchical_agents.HRL.HRL import HRL 4 | from agents.Trainer import Trainer 5 | from utilities.data_structures.Config import Config 6 | 7 | config = Config() 8 | config.environment = gym.make("Taxi-v2") 9 | config.seed = 1 10 | config.env_parameters = {} 11 | config.num_episodes_to_run = 2000 12 | config.file_to_save_data_results = None 13 | config.file_to_save_results_graph = None 14 | config.show_solution_score = False 15 | config.visualise_individual_results = False 16 | config.visualise_overall_agent_results = True 17 | config.standard_deviation_results = 1.0 18 | config.runs_per_agent = 3 19 | config.use_GPU = False 20 | config.overwrite_existing_results_file = False 21 | config.randomise_random_seed = True 22 | config.save_model = False 23 | 24 | 25 | linear_hidden_units = [32, 32] 26 | learning_rate = 0.01 27 | buffer_size = 100000 28 | batch_size = 256 29 | batch_norm = False 30 | embedding_dimensionality = 10 31 | gradient_clipping_norm = 5 32 | update_every_n_steps = 1 33 | learning_iterations = 1 34 | 
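# "sequitur_k" and "action_length_reward_bonus" below belong to the HRL agent's
# macro-action machinery: macro-actions are built from repeated primitive-action
# sequences and their reshaped rewards are produced via utilities/Memory_Shaper.py
# (the exact formulas live in the agent code, not in this config file).
# The sketch below only illustrates the role "epsilon_decay_rate_denominator" plays;
# the real schedule is implemented in exploration_strategies/Epsilon_Greedy_Exploration.py
# and may differ, and the helper name here is invented for illustration.
def _epsilon_decay_sketch(initial_epsilon, episode_number, denominator):
    """E.g. (1.0, 400, 400) -> 0.5 and (1.0, 2000, 400) -> ~0.17."""
    return initial_epsilon / (1.0 + episode_number / denominator)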
epsilon_decay_rate_denominator = 400 35 | discount_rate = 0.99 36 | tau = 0.01 37 | sequitur_k = 2 38 | pre_training_learning_iterations_multiplier = 50 39 | episodes_to_run_with_no_exploration = 10 40 | action_balanced_replay_buffer = True 41 | copy_over_hidden_layers = True 42 | action_length_reward_bonus = 0.1 43 | 44 | config.hyperparameters = { 45 | 46 | "HRL": { 47 | "linear_hidden_units": linear_hidden_units, 48 | "learning_rate": learning_rate, 49 | "buffer_size": buffer_size, 50 | "batch_size": batch_size, 51 | "final_layer_activation": "None", 52 | "columns_of_data_to_be_embedded": [0], 53 | "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 54 | "batch_norm": batch_norm, 55 | "gradient_clipping_norm": gradient_clipping_norm, 56 | "update_every_n_steps": update_every_n_steps, 57 | "epsilon_decay_rate_denominator": epsilon_decay_rate_denominator, 58 | "discount_rate": discount_rate, 59 | "learning_iterations": learning_iterations, 60 | "tau": tau, 61 | "sequitur_k": sequitur_k, 62 | "action_length_reward_bonus": action_length_reward_bonus, 63 | "pre_training_learning_iterations_multiplier": pre_training_learning_iterations_multiplier, 64 | "episodes_to_run_with_no_exploration": episodes_to_run_with_no_exploration, 65 | "action_balanced_replay_buffer": action_balanced_replay_buffer, 66 | "copy_over_hidden_layers": copy_over_hidden_layers 67 | }, 68 | 69 | "DQN_Agents": { 70 | "linear_hidden_units": linear_hidden_units, 71 | "learning_rate": learning_rate, 72 | "buffer_size": buffer_size, 73 | "batch_size": batch_size, 74 | "final_layer_activation": "None", 75 | "columns_of_data_to_be_embedded": [0], 76 | "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 77 | "batch_norm": batch_norm, 78 | "gradient_clipping_norm": gradient_clipping_norm, 79 | "update_every_n_steps": update_every_n_steps, 80 | "epsilon_decay_rate_denominator": epsilon_decay_rate_denominator, 81 | "discount_rate": discount_rate, 82 | "learning_iterations": learning_iterations, 83 | "tau": tau, 84 | }, 85 | 86 | "Actor_Critic_Agents": { 87 | "Actor": { 88 | "learning_rate": 0.0003, 89 | "linear_hidden_units": [64, 64], 90 | "final_layer_activation": "Softmax", 91 | "columns_of_data_to_be_embedded": [0], 92 | "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 93 | "batch_norm": False, 94 | "tau": 0.005, 95 | "gradient_clipping_norm": 5, 96 | "initialiser": "Xavier" 97 | }, 98 | 99 | "Critic": { 100 | "learning_rate": 0.0003, 101 | "linear_hidden_units": [64, 64], 102 | "final_layer_activation": None, 103 | "columns_of_data_to_be_embedded": [0], 104 | "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 105 | "batch_norm": False, 106 | "buffer_size": 1000000, 107 | "tau": 0.005, 108 | "gradient_clipping_norm": 5, 109 | "initialiser": "Xavier" 110 | }, 111 | 112 | "min_steps_before_learning": 10000, 113 | "batch_size": 256, 114 | "discount_rate": 0.99, 115 | "mu": 0.0, # for O-H noise 116 | "theta": 0.15, # for O-H noise 117 | "sigma": 0.25, # for O-H noise 118 | "action_noise_std": 0.2, # for TD3 119 | "action_noise_clipping_range": 0.5, # for TD3 120 | "update_every_n_steps": 1, 121 | "learning_updates_per_learning_session": 1, 122 | "automatically_tune_entropy_hyperparameter": True, 123 | "entropy_term_weight": None, 124 | "add_extra_noise": False, 125 | "do_evaluation_iterations": True 126 | } 127 | } 128 | 129 | 130 | if __name__ == "__main__": 131 
| AGENTS = [HRL] #SAC_Discrete, SAC_Discrete, DDQN] #HRL] #, SNN_HRL, DQN, h_DQN] 132 | trainer = Trainer(config, AGENTS) 133 | trainer.run_games_for_agents() 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /agents/HER_Base.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from utilities.data_structures.Replay_Buffer import Replay_Buffer 4 | from utilities.Utility_Functions import abstract 5 | 6 | @abstract 7 | class HER_Base(object): 8 | """Contains methods needed to turn an algorithm into a hindsight experience replay (HER) algorithm""" 9 | def __init__(self, buffer_size, batch_size, HER_sample_proportion): 10 | self.HER_memory = Replay_Buffer(buffer_size, batch_size, self.config.seed) 11 | self.ordinary_buffer_batch_size = int(batch_size * (1.0 - HER_sample_proportion)) 12 | self.HER_buffer_batch_size = batch_size - self.ordinary_buffer_batch_size 13 | 14 | def reset_game(self): 15 | """Resets the game information so we are ready to play a new episode""" 16 | self.state_dict = self.environment.reset() 17 | self.observation = self.state_dict["observation"] 18 | self.desired_goal = self.state_dict["desired_goal"] 19 | self.achieved_goal = self.state_dict["achieved_goal"] 20 | 21 | self.state = self.create_state_from_observation_and_desired_goal(self.observation, self.desired_goal) 22 | self.next_state = None 23 | self.action = None 24 | self.reward = None 25 | self.done = False 26 | 27 | self.episode_states = [] 28 | self.episode_rewards = [] 29 | self.episode_actions = [] 30 | self.episode_next_states = [] 31 | self.episode_dones = [] 32 | 33 | self.episode_desired_goals = [] 34 | self.episode_achieved_goals = [] 35 | self.episode_observations = [] 36 | 37 | self.episode_next_desired_goals = [] 38 | self.episode_next_achieved_goals = [] 39 | self.episode_next_observations = [] 40 | 41 | self.total_episode_score_so_far = 0 42 | 43 | def track_changeable_goal_episodes_data(self): 44 | """Saves the data from the recent episodes in a way compatible with changeable goal environments""" 45 | self.episode_rewards.append(self.reward) 46 | self.episode_actions.append(self.action) 47 | self.episode_dones.append(self.done) 48 | 49 | self.episode_states.append(self.state) 50 | self.episode_next_states.append(self.next_state) 51 | 52 | self.episode_desired_goals.append(self.state_dict["desired_goal"]) 53 | self.episode_achieved_goals.append(self.state_dict["achieved_goal"]) 54 | self.episode_observations.append(self.state_dict["observation"]) 55 | 56 | self.episode_next_desired_goals.append(self.next_state_dict["desired_goal"]) 57 | self.episode_next_achieved_goals.append(self.next_state_dict["achieved_goal"]) 58 | self.episode_next_observations.append(self.next_state_dict["observation"]) 59 | 60 | def conduct_action_in_changeable_goal_envs(self, action): 61 | """Adapts conduct_action from base agent so that can handle changeable goal environments""" 62 | self.next_state_dict, self.reward, self.done, _ = self.environment.step(action) 63 | self.total_episode_score_so_far += self.reward 64 | if self.hyperparameters["clip_rewards"]: 65 | self.reward = max(min(self.reward, 1.0), -1.0) 66 | self.observation = self.next_state_dict["observation"] 67 | self.desired_goal = self.next_state_dict["desired_goal"] 68 | self.achieved_goal = self.next_state_dict["achieved_goal"] 69 | self.next_state = self.create_state_from_observation_and_desired_goal(self.observation, 
self.desired_goal) 70 | 71 | 72 | def create_state_from_observation_and_desired_goal(self, observation, desired_goal): 73 | return np.concatenate((observation, desired_goal)) 74 | 75 | def save_alternative_experience(self): 76 | """Saves the experiences as if the final state visited in the episode was the goal state""" 77 | new_goal = self.achieved_goal 78 | new_states = [self.create_state_from_observation_and_desired_goal(observation, new_goal) for observation in self.episode_observations] 79 | new_next_states = [self.create_state_from_observation_and_desired_goal(observation, new_goal) for observation in 80 | self.episode_next_observations] 81 | new_rewards = [self.environment.compute_reward(next_achieved_goal, new_goal, None) for next_achieved_goal in self.episode_next_achieved_goals] 82 | 83 | if self.hyperparameters["clip_rewards"]: 84 | new_rewards = [max(min(reward, 1.0), -1.0) for reward in new_rewards] 85 | 86 | self.HER_memory.add_experience(new_states, self.episode_actions, new_rewards, new_next_states, self.episode_dones) 87 | 88 | def sample_from_HER_and_Ordinary_Buffer(self): 89 | """Samples from the ordinary replay buffer and HER replay buffer according to a proportion specified in config""" 90 | states, actions, rewards, next_states, dones = self.memory.sample(self.ordinary_buffer_batch_size) 91 | HER_states, HER_actions, HER_rewards, HER_next_states, HER_dones = self.HER_memory.sample(self.HER_buffer_batch_size) 92 | 93 | states = torch.cat((states, HER_states)) 94 | actions = torch.cat((actions, HER_actions)) 95 | rewards = torch.cat((rewards, HER_rewards)) 96 | next_states = torch.cat((next_states, HER_next_states)) 97 | dones = torch.cat((dones, HER_dones)) 98 | return states, actions, rewards, next_states, dones 99 | 100 | 101 | -------------------------------------------------------------------------------- /utilities/Memory_Shaper.py: -------------------------------------------------------------------------------- 1 | # NOT FINISHED 2 | from .data_structures.Action_Balanced_Replay_Buffer import Action_Balanced_Replay_Buffer 3 | from .data_structures.Replay_Buffer import Replay_Buffer 4 | import numpy as np 5 | import random 6 | 7 | class Memory_Shaper(object): 8 | """Takes in the experience of full episodes and reshapes it according to macro-actions you define. Then it provides 9 | a replay buffer with this reshaped data to learn from""" 10 | def __init__(self, buffer_size, batch_size, seed, new_reward_fn, action_balanced_replay_buffer=True): 11 | self.reset() 12 | self.buffer_size = buffer_size 13 | self.batch_size = batch_size 14 | self.seed = seed 15 | self.new_reward_fn = new_reward_fn 16 | self.action_balanced_replay_buffer = action_balanced_replay_buffer 17 | 18 | def put_adapted_experiences_in_a_replay_buffer(self, action_id_to_actions): 19 | """Adds experiences to the replay buffer after re-imagining that the actions taken were macro-actions according to 20 | action_rules as well as primitive actions. 
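For example (purely illustrative): with action_id_to_actions = {0: (0,), 1: (1,), 2: (0, 1)},
an episode whose primitive actions were [0, 1] produces the two primitive transitions plus one
re-imagined transition for macro-action 2 spanning both steps, whose reward is the sum of the
two step rewards passed through new_reward_fn.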
21 | 22 | NOTE that we want to put both primitive actions and macro-actions into replay buffer so that it can learn that 23 | its better to do a macro-action rather than the same primitive actions (which we will enforce with reward penalty) 24 | """ 25 | 26 | actions_to_action_id = {v: k for k, v in action_id_to_actions.items()} 27 | 28 | self.num_actions = len(action_id_to_actions) 29 | 30 | print(actions_to_action_id) 31 | 32 | for key in actions_to_action_id.keys(): 33 | assert isinstance(key, tuple) 34 | assert isinstance(actions_to_action_id[key], int) 35 | 36 | episodes = len(self.states) 37 | for data_type in [self.states, self.next_states, self.rewards, self.actions, self.dones]: 38 | assert len(data_type) == episodes 39 | 40 | max_action_length = self.calculate_max_action_length(actions_to_action_id) 41 | 42 | if self.action_balanced_replay_buffer: 43 | print("Using action balanced replay buffer") 44 | replay_buffer = Action_Balanced_Replay_Buffer(self.buffer_size, self.batch_size, self.seed, num_actions=self.num_actions) 45 | else: 46 | print("Using ordinary replay buffer") 47 | replay_buffer = Replay_Buffer(self.buffer_size, self.batch_size, self.seed) 48 | 49 | for episode_ix in range(episodes): 50 | self.add_adapted_experience_for_an_episode(episode_ix, actions_to_action_id, max_action_length, replay_buffer) 51 | 52 | return replay_buffer 53 | 54 | def calculate_max_action_length(self, actions_to_action_id): 55 | """Calculates the max length of the provided macro-actions""" 56 | max_length = 0 57 | for key in actions_to_action_id.keys(): 58 | action_length = len(key) 59 | if action_length > max_length: 60 | max_length = action_length 61 | return max_length 62 | 63 | 64 | def add_adapted_experience_for_an_episode(self, episode_ix, action_rules, max_action_length, replay_buffer): 65 | """Adds all the experiences we have been given to a replay buffer after adapting experiences that involved doing a 66 | macro action""" 67 | states = self.states[episode_ix] 68 | next_states = self.next_states[episode_ix] 69 | rewards = self.rewards[episode_ix] 70 | actions = self.actions[episode_ix] 71 | dones = self.dones[episode_ix] 72 | 73 | assert len(states) == len(next_states) == len(rewards) == len(dones) == len(actions), "{} {} {} {} {} = {}".format(len(states), len(next_states), len(rewards), len(dones), len(actions), actions) 74 | steps = len(states) 75 | for step in range(steps): 76 | replay_buffer.add_experience(states[step], actions[step], rewards[step], next_states[step], dones[step]) 77 | for action_length in range(2, max_action_length + 1): 78 | if step < action_length - 1: continue 79 | action_sequence = tuple(actions[step - action_length + 1 : step + 1]) 80 | assert all([action in range(self.num_actions) for action in action_sequence]), "All actions should be primitive here" 81 | if action_sequence in action_rules.keys(): 82 | new_action = action_rules[action_sequence] 83 | new_state = states[step - action_length + 1] 84 | new_reward = np.sum(rewards[step - action_length + 1:step + 1]) 85 | new_reward = self.new_reward_fn(new_reward, len(action_sequence)) 86 | new_next_state = next_states[step] 87 | new_dones = dones[step] 88 | replay_buffer.add_experience(new_state, new_action, new_reward, new_next_state, new_dones) 89 | 90 | 91 | def add_episode_experience(self, states, next_states, rewards, actions, dones): 92 | """Adds in an episode of experience""" 93 | self.states.append(states) 94 | self.next_states.append(next_states) 95 | self.rewards.append(rewards) 96 | 
self.actions.append(actions) 97 | self.dones.append(dones) 98 | 99 | def reset(self): 100 | self.states = [] 101 | self.next_states = [] 102 | self.rewards = [] 103 | self.actions = [] 104 | self.dones = [] 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /tests/Test_Four_Rooms_Environment.py: -------------------------------------------------------------------------------- 1 | from environments.Four_Rooms_Environment import Four_Rooms_Environment 2 | from random import randint 3 | from collections import Counter 4 | 5 | 6 | def test_location_to_state(): 7 | """Tests location_to_state maps each location to a unique integer""" 8 | for num_rows in [12, 10]: 9 | for num_cols in [15, 9]: 10 | env = Four_Rooms_Environment(grid_width=num_cols, grid_height=num_rows) 11 | observed_states = set() 12 | for row in range(num_rows): 13 | for col in range(num_cols): 14 | state = env.location_to_state((row, col)) 15 | assert state not in observed_states 16 | observed_states.add(state) 17 | 18 | def test_actions_execute_correctly(): 19 | """Tests that actions execute correctly""" 20 | env = Four_Rooms_Environment(stochastic_actions_probability=0.0) 21 | env.reset() 22 | env.move_user(env.current_user_location, (3, 3)) 23 | 24 | env.step(0) 25 | assert env.current_user_location == (2, 3) 26 | 27 | env.step(1) 28 | assert env.current_user_location == (2, 4) 29 | 30 | env.step(2) 31 | assert env.current_user_location == (3, 4) 32 | 33 | env.step(3) 34 | assert env.current_user_location == (3, 3) 35 | 36 | env.step(0) 37 | assert env.current_user_location == (2, 3) 38 | 39 | env.step(0) 40 | assert env.current_user_location == (1, 3) 41 | 42 | env.step(0) 43 | assert env.current_user_location == (1, 3) 44 | 45 | env.step(1) 46 | assert env.current_user_location == (1, 4) 47 | 48 | env.step(1) 49 | assert env.current_user_location == (1, 5) 50 | 51 | env.step(1) 52 | assert env.current_user_location == (1, 5) 53 | 54 | def test_check_user_location_and_goal_location_match_state_and_next_state(): 55 | """Checks whether user location always matches state and next state correctly""" 56 | for _ in range(50): 57 | env = Four_Rooms_Environment() 58 | env.reset() 59 | for _ in range(50): 60 | move = randint(0, 3) 61 | env.step(move) 62 | assert env.state == [env.location_to_state(env.current_user_location), env.location_to_state(env.current_goal_location)] 63 | assert env.next_state == [env.location_to_state(env.current_user_location), env.location_to_state(env.current_goal_location)] 64 | 65 | def test_lands_on_goal_correctly(): 66 | """Checks whether getting to goal state produces the correct response""" 67 | env = Four_Rooms_Environment(stochastic_actions_probability=0.0) 68 | env.reset() 69 | env.move_user(env.current_user_location, (3, 3)) 70 | env.move_goal(env.current_goal_location, (2, 2)) 71 | 72 | env.step(0) 73 | assert env.reward == env.step_reward_for_not_achieving_goal 74 | assert not env.done 75 | 76 | env.step(3) 77 | assert env.reward == env.reward_for_achieving_goal 78 | assert env.done 79 | 80 | env = Four_Rooms_Environment(stochastic_actions_probability=0.0) 81 | env.reset() 82 | env.move_user(env.current_user_location, (2, 3)) 83 | env.move_goal(env.current_goal_location, (2, 8)) 84 | for move in [2, 1, 1, 1, 1, 1, 0]: 85 | env.step(move) 86 | if move != 0: 87 | assert env.reward == env.step_reward_for_not_achieving_goal 88 | assert not env.done 89 | else: 90 | assert env.reward == env.reward_for_achieving_goal 91 | assert env.done 
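# The checks in test_randomness_of_moves further down assume that, with
# stochastic_actions_probability p, the intended action is executed with probability
# (1 - p) and is otherwise replaced by one of the other actions uniformly at random.
# The helper below merely illustrates the expected counts under that convention; it is
# not part of the test suite and its name is invented for illustration.
def _expected_move_counts(p, num_actions=4, num_iterations=10000):
    """E.g. p=0.75 gives roughly 2500 for every action; p=1.0 gives 0 for the intended move."""
    intended = num_iterations * (1.0 - p)
    each_other_action = num_iterations * p / (num_actions - 1)
    return intended, each_other_action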
92 | 93 | def test_location_to_state_and_state_to_location_match(): 94 | """Test that location_to_state and state_to_location are inverses of each other""" 95 | env = Four_Rooms_Environment(stochastic_actions_probability=0.0) 96 | env.reset() 97 | for row in range(env.grid_height): 98 | for col in range(env.grid_width): 99 | assert env.location_to_state((row, col)) == env.location_to_state(env.state_to_location(env.location_to_state((row, col)))) 100 | 101 | def test_randomness_of_moves(): 102 | """Test that determine_which_action_will_actually_occur correctly implements stochastic_actions_probability""" 103 | env = Four_Rooms_Environment(stochastic_actions_probability=0.0) 104 | env.reset() 105 | for _ in range(10): 106 | for move in env.actions: 107 | assert move == env.determine_which_action_will_actually_occur(move) 108 | 109 | env = Four_Rooms_Environment(stochastic_actions_probability=1.0) 110 | num_iterations = 10000 111 | for move in env.actions: 112 | moves = [] 113 | for _ in range(num_iterations): 114 | moves.append(env.determine_which_action_will_actually_occur(move)) 115 | count = Counter(moves) 116 | for move_test in env.actions: 117 | if move != move_test: #We do this because stochastic probability 1.0 means the move will never be picked 118 | assert abs((num_iterations / (len(env.actions)-1)) - count[move_test]) < num_iterations / 20.0, "{}".format(count) 119 | 120 | env = Four_Rooms_Environment(stochastic_actions_probability=0.75) 121 | num_iterations = 10000 122 | for move in env.actions: 123 | moves = [] 124 | for _ in range(num_iterations): 125 | moves.append(env.determine_which_action_will_actually_occur(move)) 126 | count = Counter(moves) 127 | for move_test in env.actions: 128 | assert abs((num_iterations / len(env.actions)) - count[move_test]) < num_iterations / 20.0, "{}".format(count) 129 | 130 | 131 | -------------------------------------------------------------------------------- /results/Cart_Pole.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from os.path import dirname, abspath 4 | sys.path.append(dirname(dirname(abspath(__file__)))) 5 | 6 | import gym 7 | 8 | from agents.actor_critic_agents.A2C import A2C 9 | from agents.DQN_agents.Dueling_DDQN import Dueling_DDQN 10 | from agents.actor_critic_agents.SAC_Discrete import SAC_Discrete 11 | from agents.actor_critic_agents.A3C import A3C 12 | from agents.policy_gradient_agents.PPO import PPO 13 | from agents.Trainer import Trainer 14 | from utilities.data_structures.Config import Config 15 | from agents.DQN_agents.DDQN import DDQN 16 | from agents.DQN_agents.DDQN_With_Prioritised_Experience_Replay import DDQN_With_Prioritised_Experience_Replay 17 | from agents.DQN_agents.DQN import DQN 18 | from agents.DQN_agents.DQN_With_Fixed_Q_Targets import DQN_With_Fixed_Q_Targets 19 | 20 | config = Config() 21 | config.seed = 1 22 | config.environment = gym.make("CartPole-v0") 23 | config.num_episodes_to_run = 450 24 | config.file_to_save_data_results = "results/data_and_graphs/Cart_Pole_Results_Data.pkl" 25 | config.file_to_save_results_graph = "results/data_and_graphs/Cart_Pole_Results_Graph.png" 26 | config.show_solution_score = False 27 | config.visualise_individual_results = False 28 | config.visualise_overall_agent_results = True 29 | config.standard_deviation_results = 1.0 30 | config.runs_per_agent = 1 31 | config.use_GPU = False 32 | config.overwrite_existing_results_file = False 33 | config.randomise_random_seed = True 34 | config.save_model = 
False 35 | 36 | 37 | config.hyperparameters = { 38 | "DQN_Agents": { 39 | "learning_rate": 0.01, 40 | "batch_size": 256, 41 | "buffer_size": 40000, 42 | "epsilon": 1.0, 43 | "epsilon_decay_rate_denominator": 1, 44 | "discount_rate": 0.99, 45 | "tau": 0.01, 46 | "alpha_prioritised_replay": 0.6, 47 | "beta_prioritised_replay": 0.1, 48 | "incremental_td_error": 1e-8, 49 | "update_every_n_steps": 1, 50 | "linear_hidden_units": [30, 15], 51 | "final_layer_activation": "None", 52 | "batch_norm": False, 53 | "gradient_clipping_norm": 0.7, 54 | "learning_iterations": 1, 55 | "clip_rewards": False 56 | }, 57 | "Stochastic_Policy_Search_Agents": { 58 | "policy_network_type": "Linear", 59 | "noise_scale_start": 1e-2, 60 | "noise_scale_min": 1e-3, 61 | "noise_scale_max": 2.0, 62 | "noise_scale_growth_factor": 2.0, 63 | "stochastic_action_decision": False, 64 | "num_policies": 10, 65 | "episodes_per_policy": 1, 66 | "num_policies_to_keep": 5, 67 | "clip_rewards": False 68 | }, 69 | "Policy_Gradient_Agents": { 70 | "learning_rate": 0.05, 71 | "linear_hidden_units": [20, 20], 72 | "final_layer_activation": "SOFTMAX", 73 | "learning_iterations_per_round": 5, 74 | "discount_rate": 0.99, 75 | "batch_norm": False, 76 | "clip_epsilon": 0.1, 77 | "episodes_per_learning_round": 4, 78 | "normalise_rewards": True, 79 | "gradient_clipping_norm": 7.0, 80 | "mu": 0.0, #only required for continuous action games 81 | "theta": 0.0, #only required for continuous action games 82 | "sigma": 0.0, #only required for continuous action games 83 | "epsilon_decay_rate_denominator": 1.0, 84 | "clip_rewards": False 85 | }, 86 | 87 | "Actor_Critic_Agents": { 88 | 89 | "learning_rate": 0.005, 90 | "linear_hidden_units": [20, 10], 91 | "final_layer_activation": ["SOFTMAX", None], 92 | "gradient_clipping_norm": 5.0, 93 | "discount_rate": 0.99, 94 | "epsilon_decay_rate_denominator": 1.0, 95 | "normalise_rewards": True, 96 | "exploration_worker_difference": 2.0, 97 | "clip_rewards": False, 98 | 99 | "Actor": { 100 | "learning_rate": 0.0003, 101 | "linear_hidden_units": [64, 64], 102 | "final_layer_activation": "Softmax", 103 | "batch_norm": False, 104 | "tau": 0.005, 105 | "gradient_clipping_norm": 5, 106 | "initialiser": "Xavier" 107 | }, 108 | 109 | "Critic": { 110 | "learning_rate": 0.0003, 111 | "linear_hidden_units": [64, 64], 112 | "final_layer_activation": None, 113 | "batch_norm": False, 114 | "buffer_size": 1000000, 115 | "tau": 0.005, 116 | "gradient_clipping_norm": 5, 117 | "initialiser": "Xavier" 118 | }, 119 | 120 | "min_steps_before_learning": 400, 121 | "batch_size": 256, 122 | "discount_rate": 0.99, 123 | "mu": 0.0, #for O-H noise 124 | "theta": 0.15, #for O-H noise 125 | "sigma": 0.25, #for O-H noise 126 | "action_noise_std": 0.2, # for TD3 127 | "action_noise_clipping_range": 0.5, # for TD3 128 | "update_every_n_steps": 1, 129 | "learning_updates_per_learning_session": 1, 130 | "automatically_tune_entropy_hyperparameter": True, 131 | "entropy_term_weight": None, 132 | "add_extra_noise": False, 133 | "do_evaluation_iterations": True 134 | } 135 | } 136 | 137 | if __name__ == "__main__": 138 | AGENTS = [SAC_Discrete, DDQN, Dueling_DDQN, DQN, DQN_With_Fixed_Q_Targets, 139 | DDQN_With_Prioritised_Experience_Replay, A2C, PPO, A3C ] 140 | trainer = Trainer(config, AGENTS) 141 | trainer.run_games_for_agents() 142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /results/Four_Rooms.py: 
-------------------------------------------------------------------------------- 1 | from agents.DQN_agents.DDQN import DDQN 2 | from environments.Four_Rooms_Environment import Four_Rooms_Environment 3 | from agents.Trainer import Trainer 4 | from utilities.data_structures.Config import Config 5 | 6 | config = Config() 7 | config.seed = 1 8 | 9 | height = 15 10 | width = 15 11 | random_goal_place = False 12 | num_possible_states = (height * width) ** (1 + 1*random_goal_place) 13 | embedding_dimensions = [[num_possible_states, 20]] 14 | print("Num possible states ", num_possible_states) 15 | 16 | config.environment = Four_Rooms_Environment(height, width, stochastic_actions_probability=0.0, random_start_user_place=True, random_goal_place=random_goal_place) 17 | 18 | config.num_episodes_to_run = 1000 19 | config.file_to_save_data_results = "Data_and_Graphs/Four_Rooms.pkl" 20 | config.file_to_save_results_graph = "Data_and_Graphs/Four_Rooms.png" 21 | config.show_solution_score = False 22 | config.visualise_individual_results = False 23 | config.visualise_overall_agent_results = True 24 | config.standard_deviation_results = 1.0 25 | config.runs_per_agent = 3 26 | config.use_GPU = False 27 | config.overwrite_existing_results_file = False 28 | config.randomise_random_seed = True 29 | config.save_model = False 30 | 31 | 32 | config.hyperparameters = { 33 | "DQN_Agents": { 34 | "linear_hidden_units": [30, 10], 35 | "learning_rate": 0.01, 36 | "buffer_size": 40000, 37 | "batch_size": 256, 38 | "final_layer_activation": "None", 39 | "columns_of_data_to_be_embedded": [0], 40 | "embedding_dimensions": embedding_dimensions, 41 | "batch_norm": False, 42 | "gradient_clipping_norm": 5, 43 | "update_every_n_steps": 1, 44 | "epsilon_decay_rate_denominator": 10, 45 | "discount_rate": 0.99, 46 | "learning_iterations": 1, 47 | "tau": 0.01, 48 | "exploration_cycle_episodes_length": None, 49 | "learning_iterations": 1, 50 | "clip_rewards": False 51 | }, 52 | 53 | "SNN_HRL": { 54 | "SKILL_AGENT": { 55 | "num_skills": 20, 56 | "regularisation_weight": 1.5, 57 | "visitations_decay": 0.9999, 58 | "episodes_for_pretraining": 300, 59 | "batch_size": 256, 60 | "learning_rate": 0.001, 61 | "buffer_size": 40000, 62 | "linear_hidden_units": [20, 10], 63 | "final_layer_activation": "None", 64 | "columns_of_data_to_be_embedded": [0, 1], 65 | "embedding_dimensions": [embedding_dimensions[0], 66 | [20, 6]], 67 | "batch_norm": False, 68 | "gradient_clipping_norm": 2, 69 | "update_every_n_steps": 1, 70 | "epsilon_decay_rate_denominator": 500, 71 | "discount_rate": 0.999, 72 | "learning_iterations": 1, 73 | "tau": 0.01, 74 | "clip_rewards": False 75 | }, 76 | 77 | "MANAGER": { 78 | "timesteps_before_changing_skill": 6, 79 | "linear_hidden_units": [10, 5], 80 | "learning_rate": 0.01, 81 | "buffer_size": 40000, 82 | "batch_size": 256, 83 | "final_layer_activation": "None", 84 | "columns_of_data_to_be_embedded": [0], 85 | "embedding_dimensions": embedding_dimensions, 86 | "batch_norm": False, 87 | "gradient_clipping_norm": 5, 88 | "update_every_n_steps": 1, 89 | "epsilon_decay_rate_denominator": 50, 90 | "discount_rate": 0.99, 91 | "learning_iterations": 1, 92 | "tau": 0.01, 93 | "clip_rewards": False 94 | 95 | } 96 | 97 | }, 98 | 99 | "Actor_Critic_Agents": { 100 | 101 | "learning_rate": 0.005, 102 | "linear_hidden_units": [20, 10], 103 | 104 | "columns_of_data_to_be_embedded": [0], 105 | "embedding_dimensions": embedding_dimensions, 106 | "final_layer_activation": ["SOFTMAX", None], 107 | "gradient_clipping_norm": 5.0, 108 | 
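# Column 0 of the state (the discrete grid-location index) is looked up in an embedding
# table of shape [num_possible_states, 20] before the linear layers; with the 15x15 grid
# and random_goal_place = False set at the top of this file that is a [225, 20] table
# (the formula there gives 225**2 = 50625 possible states if the goal is also randomised).
# The two-entry final_layer_activation above appears to correspond to the separate policy
# (softmax) and value (linear) heads used by the A2C/A3C-style agents.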
"discount_rate": 0.99, 109 | "epsilon_decay_rate_denominator": 50.0, 110 | "normalise_rewards": True, 111 | "clip_rewards": False 112 | 113 | }, 114 | 115 | 116 | "DIAYN": { 117 | 118 | "num_skills": 5, 119 | "DISCRIMINATOR": { 120 | "learning_rate": 0.01, 121 | "linear_hidden_units": [20, 10], 122 | "columns_of_data_to_be_embedded": [0], 123 | "embedding_dimensions": embedding_dimensions, 124 | }, 125 | 126 | "AGENT": { 127 | "learning_rate": 0.01, 128 | "linear_hidden_units": [20, 10], 129 | } 130 | }, 131 | 132 | 133 | "HRL": { 134 | "linear_hidden_units": [10, 5], 135 | "learning_rate": 0.01, 136 | "buffer_size": 40000, 137 | "batch_size": 256, 138 | "final_layer_activation": "None", 139 | "columns_of_data_to_be_embedded": [0], 140 | "embedding_dimensions": embedding_dimensions, 141 | "batch_norm": False, 142 | "gradient_clipping_norm": 5, 143 | "update_every_n_steps": 1, 144 | "epsilon_decay_rate_denominator": 400, 145 | "discount_rate": 0.99, 146 | "learning_iterations": 1, 147 | "tau": 0.01 148 | 149 | } 150 | 151 | 152 | } 153 | 154 | if __name__== '__main__': 155 | 156 | 157 | AGENTS = [DDQN] #DIAYN] # A3C] #SNN_HRL] #, DDQN] 158 | trainer = Trainer(config, AGENTS) 159 | trainer.run_games_for_agents() 160 | 161 | 162 | -------------------------------------------------------------------------------- /results/Long_Corridor.py: -------------------------------------------------------------------------------- 1 | from agents.hierarchical_agents.SNN_HRL import SNN_HRL 2 | from agents.Trainer import Trainer 3 | from utilities.data_structures.Config import Config 4 | from agents.DQN_agents.DQN import DQN 5 | from agents.hierarchical_agents.h_DQN import h_DQN 6 | from environments.Long_Corridor_Environment import Long_Corridor_Environment 7 | 8 | config = Config() 9 | config.seed = 1 10 | config.env_parameters = {"stochasticity_of_action_right": 0.5} 11 | config.environment = Long_Corridor_Environment(stochasticity_of_action_right=config.env_parameters["stochasticity_of_action_right"]) 12 | config.num_episodes_to_run = 10000 13 | config.file_to_save_data_results = "Data_and_Graphs/Long_Corridor_Results_Data.pkl" 14 | config.file_to_save_results_graph = "Data_and_Graphs/Long_Corridor_Results_Graph.png" 15 | config.show_solution_score = False 16 | config.visualise_individual_results = False 17 | config.visualise_overall_agent_results = True 18 | config.standard_deviation_results = 1.0 19 | config.runs_per_agent = 3 20 | config.use_GPU = False 21 | config.overwrite_existing_results_file = False 22 | config.randomise_random_seed = True 23 | config.save_model = False 24 | 25 | config.hyperparameters = { 26 | 27 | "h_DQN": { 28 | "CONTROLLER": { 29 | "batch_size": 256, 30 | "learning_rate": 0.01, 31 | "buffer_size": 40000, 32 | "linear_hidden_units": [20, 10], 33 | "final_layer_activation": "None", 34 | "columns_of_data_to_be_embedded": [0, 1], 35 | "embedding_dimensions": [[config.environment.observation_space.n, 36 | max(4, int(config.environment.observation_space.n / 10.0))], 37 | [config.environment.observation_space.n, 38 | max(4, int(config.environment.observation_space.n / 10.0))]], 39 | "batch_norm": False, 40 | "gradient_clipping_norm": 5, 41 | "update_every_n_steps": 1, 42 | "epsilon_decay_rate_denominator": 1500, 43 | "discount_rate": 0.999, 44 | "learning_iterations": 1 45 | }, 46 | "META_CONTROLLER": { 47 | "batch_size": 256, 48 | "learning_rate": 0.001, 49 | "buffer_size": 40000, 50 | "linear_hidden_units": [20, 10], 51 | "final_layer_activation": "None", 52 | 
"columns_of_data_to_be_embedded": [0], 53 | "embedding_dimensions": [[config.environment.observation_space.n, 54 | max(4, int(config.environment.observation_space.n / 10.0))]], 55 | "batch_norm": False, 56 | "gradient_clipping_norm": 5, 57 | "update_every_n_steps": 1, 58 | "epsilon_decay_rate_denominator": 2500, 59 | "discount_rate": 0.999, 60 | "learning_iterations": 1 61 | } 62 | }, 63 | 64 | "SNN_HRL": { 65 | "SKILL_AGENT": { 66 | "num_skills": 2, 67 | "regularisation_weight": 1.5, 68 | "visitations_decay": 0.99, 69 | "episodes_for_pretraining": 2000, 70 | # "batch_size": 256, 71 | # "learning_rate": 0.01, 72 | # "buffer_size": 40000, 73 | # "linear_hidden_units": [20, 10], 74 | # "final_layer_activation": "None", 75 | # "columns_of_data_to_be_embedded": [0, 1], 76 | # "embedding_dimensions": [[config.environment.observation_space.n, 77 | # max(4, int(config.environment.observation_space.n / 10.0))], 78 | # [6, 4]], 79 | # "batch_norm": False, 80 | # "gradient_clipping_norm": 5, 81 | # "update_every_n_steps": 1, 82 | # "epsilon_decay_rate_denominator": 50, 83 | # "discount_rate": 0.999, 84 | # "learning_iterations": 1 85 | 86 | 87 | "learning_rate": 0.05, 88 | "linear_hidden_units": [20, 20], 89 | "final_layer_activation": "SOFTMAX", 90 | "learning_iterations_per_round": 5, 91 | "discount_rate": 0.99, 92 | "batch_norm": False, 93 | "clip_epsilon": 0.1, 94 | "episodes_per_learning_round": 4, 95 | "normalise_rewards": True, 96 | "gradient_clipping_norm": 7.0, 97 | "mu": 0.0, # only required for continuous action games 98 | "theta": 0.0, # only required for continuous action games 99 | "sigma": 0.0, # only required for continuous action games 100 | "epsilon_decay_rate_denominator": 1.0 101 | 102 | 103 | 104 | }, 105 | 106 | "MANAGER": { 107 | "timesteps_before_changing_skill": 4, 108 | "linear_hidden_units": [10, 5], 109 | "learning_rate": 0.01, 110 | "buffer_size": 40000, 111 | "batch_size": 256, 112 | "final_layer_activation": "None", 113 | "columns_of_data_to_be_embedded": [0], 114 | "embedding_dimensions": [[config.environment.observation_space.n, 115 | max(4, int(config.environment.observation_space.n / 10.0))]], 116 | "batch_norm": False, 117 | "gradient_clipping_norm": 5, 118 | "update_every_n_steps": 1, 119 | "epsilon_decay_rate_denominator": 1000, 120 | "discount_rate": 0.999, 121 | "learning_iterations": 1 122 | 123 | } 124 | 125 | } 126 | 127 | } 128 | 129 | config.hyperparameters["DQN_Agents"] = config.hyperparameters["h_DQN"]["META_CONTROLLER"] 130 | 131 | 132 | if __name__ == "__main__": 133 | AGENTS = [SNN_HRL, DQN, h_DQN] 134 | trainer = Trainer(config, AGENTS) 135 | trainer.run_games_for_agents() 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /utilities/data_structures/Action_Balanced_Replay_Buffer.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple, deque 3 | import torch 4 | import numpy as np 5 | from .Replay_Buffer import Replay_Buffer 6 | 7 | class Action_Balanced_Replay_Buffer(Replay_Buffer): 8 | """Replay buffer that provides sample of experiences that have an equal number of each action being conducted""" 9 | def __init__(self, buffer_size, batch_size, seed, num_actions): 10 | self.num_actions = num_actions 11 | self.buffer_size_per_memory = int(buffer_size / self.num_actions) 12 | 13 | print("NUM ACTIONS ", self.num_actions) 14 | self.memories = {action: deque(maxlen=self.buffer_size_per_memory) for action in 
range(self.num_actions)} 15 | self.batch_size = batch_size 16 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 17 | self.seed = random.seed(seed) 18 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 19 | 20 | def add_experience(self, states, actions, rewards, next_states, dones): 21 | """Adds experience or list of experiences into the replay buffer""" 22 | if type(dones) == list: 23 | assert type(dones[0]) != list, "A done shouldn't be a list" 24 | experiences = [self.experience(state, action, reward, next_state, done) 25 | for state, action, reward, next_state, done in 26 | zip(states, actions, rewards, next_states, dones)] 27 | for experience in experiences: 28 | action = experience.action 29 | self.memories[action].append(experience) 30 | else: 31 | experience = self.experience(states, actions, rewards, next_states, dones) 32 | self.memories[actions].append(experience) 33 | 34 | def pick_experiences(self, num_experiences=None): 35 | """Picks the experiences that the sample function will return as a random sample of experiences. It works by picking 36 | an equal number of experiences that used each action (as far as possible)""" 37 | if num_experiences: batch_size = num_experiences 38 | else: batch_size = self.batch_size 39 | batch_per_action = self.calculate_batch_sizes_per_action(batch_size) 40 | samples_split_by_action = self.sample_each_action_equally(batch_per_action) 41 | combined_sample = [] 42 | for key in samples_split_by_action.keys(): 43 | combined_sample.extend(samples_split_by_action[key]) 44 | return combined_sample 45 | 46 | def calculate_batch_sizes_per_action(self, batch_size): 47 | """Calculates the batch size we need to randomly draw from each action to make sure there is equal coverage 48 | per action and that the batch gets filled up""" 49 | min_batch_per_action = int(batch_size / self.num_actions) 50 | batch_per_action = {k: min_batch_per_action for k in range(self.num_actions)} 51 | current_batch_size = np.sum([batch_per_action[k] for k in range(self.num_actions)]) 52 | remainder = batch_size - current_batch_size 53 | give_remainder_to = random.sample(range(self.num_actions), remainder) 54 | for action in give_remainder_to: 55 | batch_per_action[action] += 1 56 | return batch_per_action 57 | 58 | def sample_each_action_equally(self, batch_per_action): 59 | """Samples a number of experiences (determined by batch_per_action) from the memory buffer for each action""" 60 | samples = {} 61 | for action in range(self.num_actions): 62 | memory = self.memories[action] 63 | batch_size_for_action = batch_per_action[action] 64 | action_memory_size = len(memory) 65 | assert action_memory_size > 0, "Need at least 1 experience for each action" 66 | if action_memory_size >= batch_size_for_action: 67 | samples[action] = random.sample(memory, batch_size_for_action) 68 | else: 69 | print("Memory size {} vs. 
required batch size {}".format(action_memory_size, batch_size_for_action)) 70 | samples_for_action = [] 71 | while len(samples_for_action) < batch_per_action[action]: 72 | remainder = batch_per_action[action] - len(samples_for_action) 73 | sampled_experiences = random.sample(memory, min(remainder, action_memory_size)) 74 | samples_for_action.extend(sampled_experiences) 75 | samples[action] = samples_for_action 76 | return samples 77 | 78 | def __len__(self): 79 | return np.sum([len(memory) for memory in self.memories.values()]) 80 | 81 | def sample_experiences_with_certain_actions(self, allowed_actions, num_all_actions, required_batch_size): 82 | """Samples a number of experiences where the action conducted was in the list of required actions""" 83 | assert isinstance(allowed_actions, list) 84 | assert len(allowed_actions) > 0 85 | 86 | num_new_actions = len(allowed_actions) 87 | experiences_to_sample = int(required_batch_size * float(num_all_actions) / float(num_new_actions)) 88 | experiences = self.sample(num_experiences=experiences_to_sample) 89 | states, actions, rewards, next_states, dones = experiences 90 | matching_indexes = np.argwhere((np.in1d(actions.numpy(), allowed_actions))) 91 | assert matching_indexes.shape[1] == 1 92 | 93 | matching_indexes = matching_indexes[:, 0] 94 | 95 | states = states[matching_indexes] 96 | actions = actions[matching_indexes] 97 | rewards = rewards[matching_indexes] 98 | next_states = next_states[matching_indexes] 99 | dones = dones[matching_indexes] 100 | 101 | assert abs(states.shape[0] - required_batch_size) <= 0.05*required_batch_size, "{} vs. {}".format(states.shape[0], required_batch_size) 102 | 103 | 104 | return (states, actions, rewards, next_states, dones) 105 | -------------------------------------------------------------------------------- /tests/Test_DQN_HER.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import Counter 3 | 4 | import pytest 5 | 6 | from agents.DQN_agents.DQN_HER import DQN_HER 7 | from agents.DQN_agents.DDQN import DDQN 8 | from agents.DQN_agents.DDQN_With_Prioritised_Experience_Replay import DDQN_With_Prioritised_Experience_Replay 9 | from agents.DQN_agents.DQN_With_Fixed_Q_Targets import DQN_With_Fixed_Q_Targets 10 | from environments.Bit_Flipping_Environment import Bit_Flipping_Environment 11 | from agents.policy_gradient_agents.PPO import PPO 12 | from agents.Trainer import Trainer 13 | from utilities.data_structures.Config import Config 14 | from agents.DQN_agents.DQN import DQN 15 | import numpy as np 16 | import torch 17 | 18 | random.seed(1) 19 | np.random.seed(1) 20 | torch.manual_seed(1) 21 | 22 | config = Config() 23 | config.seed = 1 24 | config.environment = Bit_Flipping_Environment(4) 25 | config.num_episodes_to_run = 1 26 | config.file_to_save_data_results = None 27 | config.file_to_save_results_graph = None 28 | config.visualise_individual_results = False 29 | config.visualise_overall_agent_results = False 30 | config.randomise_random_seed = False 31 | config.runs_per_agent = 1 32 | config.use_GPU = False 33 | config.hyperparameters = { 34 | 35 | "DQN_Agents": { 36 | 37 | "learning_rate": 0.005, 38 | "batch_size": 3, 39 | "buffer_size": 40000, 40 | "epsilon": 0.1, 41 | "epsilon_decay_rate_denominator": 200, 42 | "discount_rate": 0.99, 43 | "tau": 0.1, 44 | "alpha_prioritised_replay": 0.6, 45 | "beta_prioritised_replay": 0.4, 46 | "incremental_td_error": 1e-8, 47 | "update_every_n_steps": 3, 48 | "linear_hidden_units": [20, 20, 
20], 49 | "final_layer_activation": "None", 50 | "batch_norm": False, 51 | "gradient_clipping_norm": 5, 52 | "HER_sample_proportion": 0.8, 53 | "clip_rewards": False 54 | } 55 | } 56 | 57 | 58 | trainer = Trainer(config, [DQN_HER]) 59 | config.hyperparameters = config.hyperparameters["DQN_Agents"] 60 | agent = DQN_HER(config) 61 | agent.reset_game() 62 | 63 | def test_initiation(): 64 | """Tests whether DQN_HER initiates correctly""" 65 | config.hyperparameters["batch_size"] = 64 66 | agent = DQN_HER(config) 67 | agent.reset_game() 68 | 69 | 70 | assert agent.ordinary_buffer_batch_size == int(0.2 * 64) 71 | assert agent.HER_buffer_batch_size == 64 - int(0.2 * 64) 72 | 73 | assert agent.q_network_local.input_dim == 8 74 | assert agent.q_network_local.output_layers[0].out_features == 4 75 | 76 | assert isinstance(agent.state_dict, dict) 77 | 78 | assert agent.observation.shape[0] == 4 79 | assert agent.desired_goal.shape[0] == 4 80 | assert agent.achieved_goal.shape[0] == 4 81 | 82 | assert agent.state.shape[0] == 8 83 | assert not agent.done 84 | assert agent.next_state is None 85 | assert agent.reward is None 86 | 87 | config.hyperparameters["batch_size"] = 3 88 | 89 | def test_action(): 90 | """Tests whether DQN_HER picks and conducts actions correctly""" 91 | num_tries = 1000 92 | actions = [] 93 | for _ in range(num_tries): 94 | action = agent.pick_action() 95 | actions.append(action) 96 | 97 | actions_count = Counter(actions) 98 | assert actions_count[0] > num_tries*0.1 99 | assert actions_count[1] > num_tries*0.1 100 | assert actions_count[2] > num_tries*0.1 101 | assert actions_count[3] > num_tries*0.1 102 | assert actions_count[0] + actions_count[1] + actions_count[2] + actions_count[3] == num_tries 103 | 104 | assert agent.next_state is None 105 | 106 | def test_tracks_changes_from_one_action(): 107 | """Tests that it tracks the changes as a result of actions correctly""" 108 | 109 | previous_obs = agent.observation 110 | previous_desired_goal = agent.desired_goal 111 | previous_achieved_goal = agent.achieved_goal 112 | 113 | agent.action = 0 114 | agent.conduct_action_in_changeable_goal_envs(agent.action) 115 | 116 | assert agent.next_state.shape[0] == 8 117 | assert isinstance(agent.next_state_dict, dict) 118 | assert not all (agent.observation == previous_obs) 119 | assert not all(agent.achieved_goal == previous_achieved_goal) 120 | assert all (agent.desired_goal == previous_desired_goal) 121 | 122 | agent.track_changeable_goal_episodes_data() 123 | 124 | with pytest.raises(Exception): 125 | agent.HER_memory.sample(1) 126 | 127 | agent.save_alternative_experience() 128 | 129 | sample = agent.HER_memory.sample(1) 130 | 131 | assert sample[1].item() == agent.action 132 | assert sample[2].item() == 4 133 | 134 | def test_tracks_changes_from_multiple_actions(): 135 | """Tests that it tracks the changes as a result of actions correctly""" 136 | 137 | agent = DQN_HER(config) 138 | agent.reset_game() 139 | 140 | for ix in range(4): 141 | previous_obs = agent.observation 142 | previous_desired_goal = agent.desired_goal 143 | previous_achieved_goal = agent.achieved_goal 144 | 145 | agent.action = ix 146 | agent.conduct_action_in_changeable_goal_envs(agent.action) 147 | 148 | assert agent.next_state.shape[0] == 8 149 | assert isinstance(agent.next_state_dict, dict) 150 | assert not all(agent.observation == previous_obs) 151 | assert not all(agent.achieved_goal == previous_achieved_goal) 152 | assert all(agent.desired_goal == previous_desired_goal) 153 | 154 | 
agent.track_changeable_goal_episodes_data() 155 | agent.save_experience() 156 | if agent.done: agent.save_alternative_experience() 157 | 158 | agent.state_dict = agent.next_state_dict # this is to set the state for the next iteration 159 | agent.state = agent.next_state 160 | 161 | states, actions, rewards, next_states, dones = agent.HER_memory.sample(4) 162 | 163 | assert all(states[1] == torch.Tensor([1.0, 1., 1., 1., 0., 0., 0. , 0.])) 164 | assert all(actions == torch.Tensor([[1.], [0.], [3.], [2.]])) 165 | assert all(rewards == torch.Tensor([[-1.], [-1.], [4.], [-1.]])) 166 | assert all(dones == torch.Tensor([[0.], [0.], [1.], [0.]])) 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /agents/actor_critic_agents/SAC_Discrete.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from agents.Base_Agent import Base_Agent 6 | from utilities.data_structures.Replay_Buffer import Replay_Buffer 7 | from agents.actor_critic_agents.SAC import SAC 8 | from utilities.Utility_Functions import create_actor_distribution 9 | 10 | class SAC_Discrete(SAC): 11 | """The Soft Actor Critic for discrete actions. It inherits from SAC for continuous actions and only changes a few 12 | methods.""" 13 | agent_name = "SAC" 14 | def __init__(self, config): 15 | Base_Agent.__init__(self, config) 16 | assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions" 17 | assert self.config.hyperparameters["Actor"]["final_layer_activation"] == "Softmax", "Final actor layer must be softmax" 18 | self.hyperparameters = config.hyperparameters 19 | self.critic_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic") 20 | self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, 21 | key_to_use="Critic", override_seed=self.config.seed + 1) 22 | self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(), 23 | lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) 24 | self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(), 25 | lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) 26 | self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, 27 | key_to_use="Critic") 28 | self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, 29 | key_to_use="Critic") 30 | Base_Agent.copy_model_over(self.critic_local, self.critic_target) 31 | Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2) 32 | self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], 33 | self.config.seed) 34 | 35 | self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") 36 | self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), 37 | lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) 38 | self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"] 39 | if self.automatic_entropy_tuning: 40 | # we set the max possible entropy as the target entropy 41 | self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98 42 | self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) 43 | self.alpha = 
self.log_alpha.exp() 44 | self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) 45 | else: 46 | self.alpha = self.hyperparameters["entropy_term_weight"] 47 | assert not self.hyperparameters["add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment" 48 | self.add_extra_noise = False 49 | self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"] 50 | 51 | def produce_action_and_action_info(self, state): 52 | """Given the state, produces an action, the probability of the action, the log probability of the action, and 53 | the argmax action""" 54 | action_probabilities = self.actor_local(state) 55 | max_probability_action = torch.argmax(action_probabilities).unsqueeze(0) 56 | action_distribution = create_actor_distribution(self.action_types, action_probabilities, self.action_size) 57 | action = action_distribution.sample().cpu() 58 | # Have to deal with situation of 0.0 probabilities because we can't do log 0 59 | z = action_probabilities == 0.0 60 | z = z.float() * 1e-8 61 | log_action_probabilities = torch.log(action_probabilities + z) 62 | return action, (action_probabilities, log_action_probabilities), max_probability_action 63 | 64 | def calculate_critic_losses(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch): 65 | """Calculates the losses for the two critics. This is the ordinary Q-learning loss except the additional entropy 66 | term is taken into account""" 67 | with torch.no_grad(): 68 | next_state_action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(next_state_batch) 69 | qf1_next_target = self.critic_target(next_state_batch) 70 | qf2_next_target = self.critic_target_2(next_state_batch) 71 | min_qf_next_target = action_probabilities * (torch.min(qf1_next_target, qf2_next_target) - self.alpha * log_action_probabilities) 72 | min_qf_next_target = min_qf_next_target.mean(dim=1).unsqueeze(-1) 73 | next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target) 74 | 75 | qf1 = self.critic_local(state_batch).gather(1, action_batch.long()) 76 | qf2 = self.critic_local_2(state_batch).gather(1, action_batch.long()) 77 | qf1_loss = F.mse_loss(qf1, next_q_value) 78 | qf2_loss = F.mse_loss(qf2, next_q_value) 79 | return qf1_loss, qf2_loss 80 | 81 | def calculate_actor_loss(self, state_batch): 82 | """Calculates the loss for the actor. 
This loss includes the additional entropy term""" 83 | action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(state_batch) 84 | qf1_pi = self.critic_local(state_batch) 85 | qf2_pi = self.critic_local_2(state_batch) 86 | min_qf_pi = torch.min(qf1_pi, qf2_pi) 87 | inside_term = self.alpha * log_action_probabilities - min_qf_pi 88 | policy_loss = action_probabilities * inside_term 89 | policy_loss = policy_loss.mean() 90 | log_action_probabilities = torch.sum(log_action_probabilities * action_probabilities, dim=1) 91 | return policy_loss, log_action_probabilities 92 | -------------------------------------------------------------------------------- /utilities/Utility_Functions.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | from abc import ABCMeta 5 | import torch 6 | from nn_builder.pytorch.NN import NN 7 | from torch.distributions import Categorical, normal, MultivariateNormal 8 | 9 | def abstract(cls): 10 | return ABCMeta(cls.__name__, cls.__bases__, dict(cls.__dict__)) 11 | 12 | def save_score_results(file_path, results): 13 | """Saves results as a numpy file at given path""" 14 | np.save(file_path, results) 15 | 16 | def normalise_rewards(rewards): 17 | """Normalises rewards to mean 0 and standard deviation 1""" 18 | mean_reward = np.mean(rewards) 19 | std_reward = np.std(rewards) 20 | return (rewards - mean_reward) / (std_reward + 1e-8) #1e-8 added for stability 21 | 22 | def create_actor_distribution(action_types, actor_output, action_size): 23 | """Creates a distribution that the actor can then use to randomly draw actions""" 24 | if action_types == "DISCRETE": 25 | assert actor_output.size()[1] == action_size, "Actor output the wrong size" 26 | action_distribution = Categorical(actor_output) # this creates a distribution to sample from 27 | else: 28 | assert actor_output.size()[1] == action_size * 2, "Actor output the wrong size" 29 | means = actor_output[:, :action_size].squeeze(0) 30 | stds = actor_output[:, action_size:].squeeze(0) 31 | if len(means.shape) == 2: means = means.squeeze(-1) 32 | if len(stds.shape) == 2: stds = stds.squeeze(-1) 33 | if len(stds.shape) > 1 or len(means.shape) > 1: 34 | raise ValueError("Wrong mean and std shapes - {} -- {}".format(stds.shape, means.shape)) 35 | action_distribution = normal.Normal(means.squeeze(0), torch.abs(stds)) 36 | return action_distribution 37 | 38 | class SharedAdam(torch.optim.Adam): 39 | """Creates an adam optimizer object that is shareable between processes. Useful for algorithms like A3C. Code 40 | taken from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py""" 41 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False): 42 | super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad) 43 | for group in self.param_groups: 44 | for p in group['params']: 45 | state = self.state[p] 46 | state['step'] = torch.zeros(1) 47 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 48 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 49 | 50 | def share_memory(self): 51 | for group in self.param_groups: 52 | for p in group['params']: 53 | state = self.state[p] 54 | state['step'].share_memory_() 55 | state['exp_avg'].share_memory_() 56 | state['exp_avg_sq'].share_memory_() 57 | 58 | def step(self, closure=None): 59 | """Performs a single optimization step. 
60 | Arguments: 61 | closure (callable, optional): A closure that reevaluates the model 62 | and returns the loss. 63 | """ 64 | loss = None 65 | if closure is not None: 66 | loss = closure() 67 | for group in self.param_groups: 68 | for p in group['params']: 69 | if p.grad is None: 70 | continue 71 | grad = p.grad.data 72 | amsgrad = group['amsgrad'] 73 | state = self.state[p] 74 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 75 | if amsgrad: 76 | max_exp_avg_sq = state['max_exp_avg_sq'] 77 | beta1, beta2 = group['betas'] 78 | state['step'] += 1 79 | if group['weight_decay'] != 0: 80 | grad = grad.add(group['weight_decay'], p.data) 81 | # Decay the first and second moment running average coefficient 82 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 83 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 84 | if amsgrad: 85 | # Maintains the maximum of all 2nd moment running avg. till now 86 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 87 | # Use the max. for normalizing running avg. of gradient 88 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 89 | else: 90 | denom = exp_avg_sq.sqrt().add_(group['eps']) 91 | bias_correction1 = 1 - beta1 ** state['step'].item() 92 | bias_correction2 = 1 - beta2 ** state['step'].item() 93 | step_size = group['lr'] * math.sqrt( 94 | bias_correction2) / bias_correction1 95 | 96 | p.data.addcdiv_(-step_size, exp_avg, denom) 97 | return loss 98 | 99 | def flatten_action_id_to_actions(action_id_to_actions, global_action_id_to_primitive_action, num_primitive_actions): 100 | """Converts the values in an action_id_to_actions dictionary back to the primitive actions they represent""" 101 | flattened_action_id_to_actions = {} 102 | for key in action_id_to_actions.keys(): 103 | actions = action_id_to_actions[key] 104 | raw_actions = backtrack_action_to_primitive_actions(actions, global_action_id_to_primitive_action, num_primitive_actions) 105 | flattened_action_id_to_actions[key] = raw_actions 106 | return flattened_action_id_to_actions 107 | 108 | def backtrack_action_to_primitive_actions(action_tuple, global_action_id_to_primitive_action, num_primitive_actions): 109 | """Converts an action tuple back to the primitive actions it represents in a recursive way.""" 110 | print("Recursing to backtrack on ", action_tuple) 111 | primitive_actions = range(num_primitive_actions) 112 | if all(action in primitive_actions for action in action_tuple): return action_tuple #base case 113 | new_action_tuple = [] 114 | for action in action_tuple: 115 | if action in primitive_actions: new_action_tuple.append(action) 116 | else: 117 | converted_action = global_action_id_to_primitive_action[action] 118 | print(new_action_tuple) 119 | new_action_tuple.extend(converted_action) 120 | print("Should have changed: ", new_action_tuple) 121 | new_action_tuple = tuple(new_action_tuple) 122 | return backtrack_action_to_primitive_actions(new_action_tuple, global_action_id_to_primitive_action, num_primitive_actions) # pass the mappings through the recursive call 123 | -------------------------------------------------------------------------------- /agents/DQN_agents/DQN.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import torch 4 | import random 5 | import torch.optim as optim 6 | import torch.nn.functional as F 7 | import numpy as np 8 | from agents.Base_Agent import Base_Agent 9 | from exploration_strategies.Epsilon_Greedy_Exploration import Epsilon_Greedy_Exploration 10 | from utilities.data_structures.Replay_Buffer import Replay_Buffer 11 | 12 | class DQN(Base_Agent): 13 | """A deep
Q learning agent""" 14 | agent_name = "DQN" 15 | def __init__(self, config): 16 | Base_Agent.__init__(self, config) 17 | self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed) 18 | self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size) 19 | self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), 20 | lr=self.hyperparameters["learning_rate"], eps=1e-4) 21 | self.exploration_strategy = Epsilon_Greedy_Exploration(config) 22 | 23 | def reset_game(self): 24 | super(DQN, self).reset_game() 25 | self.update_learning_rate(self.hyperparameters["learning_rate"], self.q_network_optimizer) 26 | 27 | def step(self): 28 | """Runs a step within a game including a learning step if required""" 29 | while not self.done: 30 | self.action = self.pick_action() 31 | self.conduct_action(self.action) 32 | if self.time_for_q_network_to_learn(): 33 | for _ in range(self.hyperparameters["learning_iterations"]): 34 | self.learn() 35 | self.save_experience() 36 | self.state = self.next_state #this is to set the state for the next iteration 37 | self.global_step_number += 1 38 | self.episode_number += 1 39 | 40 | def pick_action(self, state=None): 41 | """Uses the local Q network and an epsilon greedy policy to pick an action""" 42 | # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add 43 | # a "fake" dimension to make it a mini-batch rather than a single observation 44 | if state is None: state = self.state 45 | if isinstance(state, np.int64) or isinstance(state, int): state = np.array([state]) 46 | state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) 47 | if len(state.shape) < 2: state = state.unsqueeze(0) 48 | self.q_network_local.eval() #puts network in evaluation mode 49 | with torch.no_grad(): 50 | action_values = self.q_network_local(state) 51 | self.q_network_local.train() #puts network back in training mode 52 | action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action_values": action_values, 53 | "turn_off_exploration": self.turn_off_exploration, 54 | "episode_number": self.episode_number}) 55 | self.logger.info("Q values {} -- Action chosen {}".format(action_values, action)) 56 | return action 57 | 58 | def learn(self, experiences=None): 59 | """Runs a learning iteration for the Q network""" 60 | if experiences is None: states, actions, rewards, next_states, dones = self.sample_experiences() #Sample experiences 61 | else: states, actions, rewards, next_states, dones = experiences 62 | loss = self.compute_loss(states, next_states, rewards, actions, dones) 63 | 64 | actions_list = [action_X.item() for action_X in actions ] 65 | 66 | self.logger.info("Action counts {}".format(Counter(actions_list))) 67 | self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss, self.hyperparameters["gradient_clipping_norm"]) 68 | 69 | def compute_loss(self, states, next_states, rewards, actions, dones): 70 | """Computes the loss required to train the Q network""" 71 | with torch.no_grad(): 72 | Q_targets = self.compute_q_targets(next_states, rewards, dones) 73 | Q_expected = self.compute_expected_q_values(states, actions) 74 | loss = F.mse_loss(Q_expected, Q_targets) 75 | return loss 76 | 77 | def compute_q_targets(self, next_states, rewards, dones): 78 | """Computes the q_targets we will compare to predicted q values to create the loss to train the Q network""" 79 | Q_targets_next = 
self.compute_q_values_for_next_states(next_states) 80 | Q_targets = self.compute_q_values_for_current_states(rewards, Q_targets_next, dones) 81 | return Q_targets 82 | 83 | def compute_q_values_for_next_states(self, next_states): 84 | """Computes the q_values for next state we will use to create the loss to train the Q network""" 85 | Q_targets_next = self.q_network_local(next_states).detach().max(1)[0].unsqueeze(1) 86 | return Q_targets_next 87 | 88 | def compute_q_values_for_current_states(self, rewards, Q_targets_next, dones): 89 | """Computes the q_values for current state we will use to create the loss to train the Q network""" 90 | Q_targets_current = rewards + (self.hyperparameters["discount_rate"] * Q_targets_next * (1 - dones)) 91 | return Q_targets_current 92 | 93 | def compute_expected_q_values(self, states, actions): 94 | """Computes the expected q_values we will use to create the loss to train the Q network""" 95 | Q_expected = self.q_network_local(states).gather(1, actions.long()) #must convert actions to long so can be used as index 96 | return Q_expected 97 | 98 | def locally_save_policy(self): 99 | """Saves the policy""" 100 | torch.save(self.q_network_local.state_dict(), "Models/{}_local_network.pt".format(self.agent_name)) 101 | 102 | def time_for_q_network_to_learn(self): 103 | """Returns boolean indicating whether enough steps have been taken for learning to begin and there are 104 | enough experiences in the replay buffer to learn from""" 105 | return self.right_amount_of_steps_taken() and self.enough_experiences_to_learn_from() 106 | 107 | def right_amount_of_steps_taken(self): 108 | """Returns boolean indicating whether enough steps have been taken for learning to begin""" 109 | return self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0 110 | 111 | def sample_experiences(self): 112 | """Draws a random sample of experience from the memory buffer""" 113 | experiences = self.memory.sample() 114 | states, actions, rewards, next_states, dones = experiences 115 | return states, actions, rewards, next_states, dones -------------------------------------------------------------------------------- /agents/actor_critic_agents/DDPG.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as functional 3 | from torch import optim 4 | from agents.Base_Agent import Base_Agent 5 | from utilities.data_structures.Replay_Buffer import Replay_Buffer 6 | from exploration_strategies.OU_Noise_Exploration import OU_Noise_Exploration 7 | 8 | class DDPG(Base_Agent): 9 | """A DDPG Agent""" 10 | agent_name = "DDPG" 11 | 12 | def __init__(self, config): 13 | Base_Agent.__init__(self, config) 14 | self.hyperparameters = config.hyperparameters 15 | self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") 16 | self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") 17 | Base_Agent.copy_model_over(self.critic_local, self.critic_target) 18 | 19 | self.critic_optimizer = optim.Adam(self.critic_local.parameters(), 20 | lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) 21 | self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], 22 | self.config.seed) 23 | self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") 24 | self.actor_target = 
self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") 25 | Base_Agent.copy_model_over(self.actor_local, self.actor_target) 26 | 27 | self.actor_optimizer = optim.Adam(self.actor_local.parameters(), 28 | lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) 29 | self.exploration_strategy = OU_Noise_Exploration(self.config) 30 | 31 | def step(self): 32 | """Runs a step in the game""" 33 | while not self.done: 34 | # print("State ", self.state.shape) 35 | self.action = self.pick_action() 36 | self.conduct_action(self.action) 37 | if self.time_for_critic_and_actor_to_learn(): 38 | for _ in range(self.hyperparameters["learning_updates_per_learning_session"]): 39 | states, actions, rewards, next_states, dones = self.sample_experiences() 40 | self.critic_learn(states, actions, rewards, next_states, dones) 41 | self.actor_learn(states) 42 | self.save_experience() 43 | self.state = self.next_state #this is to set the state for the next iteration 44 | self.global_step_number += 1 45 | self.episode_number += 1 46 | 47 | def sample_experiences(self): 48 | return self.memory.sample() 49 | 50 | def pick_action(self, state=None): 51 | """Picks an action using the actor network and then adds some noise to it to ensure exploration""" 52 | if state is None: state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device) 53 | self.actor_local.eval() 54 | with torch.no_grad(): 55 | action = self.actor_local(state).cpu().data.numpy() 56 | self.actor_local.train() 57 | action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action": action}) 58 | return action.squeeze(0) 59 | 60 | def critic_learn(self, states, actions, rewards, next_states, dones): 61 | """Runs a learning iteration for the critic""" 62 | loss = self.compute_loss(states, next_states, rewards, actions, dones) 63 | self.take_optimisation_step(self.critic_optimizer, self.critic_local, loss, self.hyperparameters["Critic"]["gradient_clipping_norm"]) 64 | self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"]) 65 | 66 | def compute_loss(self, states, next_states, rewards, actions, dones): 67 | """Computes the loss for the critic""" 68 | with torch.no_grad(): 69 | critic_targets = self.compute_critic_targets(next_states, rewards, dones) 70 | critic_expected = self.compute_expected_critic_values(states, actions) 71 | loss = functional.mse_loss(critic_expected, critic_targets) 72 | return loss 73 | 74 | def compute_critic_targets(self, next_states, rewards, dones): 75 | """Computes the critic target values to be used in the loss for the critic""" 76 | critic_targets_next = self.compute_critic_values_for_next_states(next_states) 77 | critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones) 78 | return critic_targets 79 | 80 | def compute_critic_values_for_next_states(self, next_states): 81 | """Computes the critic values for next states to be used in the loss for the critic""" 82 | with torch.no_grad(): 83 | actions_next = self.actor_target(next_states) 84 | critic_targets_next = self.critic_target(torch.cat((next_states, actions_next), 1)) 85 | return critic_targets_next 86 | 87 | def compute_critic_values_for_current_states(self, rewards, critic_targets_next, dones): 88 | """Computes the critic values for current states to be used in the loss for the critic""" 89 | critic_targets_current = rewards + (self.hyperparameters["discount_rate"] * critic_targets_next * (1.0 - dones)) 90 | 
return critic_targets_current 91 | 92 | def compute_expected_critic_values(self, states, actions): 93 | """Computes the expected critic values to be used in the loss for the critic""" 94 | critic_expected = self.critic_local(torch.cat((states, actions), 1)) 95 | return critic_expected 96 | 97 | def time_for_critic_and_actor_to_learn(self): 98 | """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the 99 | actor and critic""" 100 | return self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0 101 | 102 | def actor_learn(self, states): 103 | """Runs a learning iteration for the actor""" 104 | if self.done: #we only update the learning rate at end of each episode 105 | self.update_learning_rate(self.hyperparameters["Actor"]["learning_rate"], self.actor_optimizer) 106 | actor_loss = self.calculate_actor_loss(states) 107 | self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss, 108 | self.hyperparameters["Actor"]["gradient_clipping_norm"]) 109 | self.soft_update_of_target_network(self.actor_local, self.actor_target, self.hyperparameters["Actor"]["tau"]) 110 | 111 | def calculate_actor_loss(self, states): 112 | """Calculates the loss for the actor""" 113 | actions_pred = self.actor_local(states) 114 | actor_loss = -self.critic_local(torch.cat((states, actions_pred), 1)).mean() 115 | return actor_loss -------------------------------------------------------------------------------- /results/Taxi.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from agents.DQN_agents.DDQN import DDQN 4 | from agents.hierarchical_agents.HRL.HRL import HRL 5 | from agents.hierarchical_agents.HRL.Model_HRL import Model_HRL 6 | from agents.Trainer import Trainer 7 | from utilities.data_structures.Config import Config 8 | 9 | config = Config() 10 | config.seed = 1 11 | config.environment = gym.make("CartPole-v0") 12 | config.env_parameters = {} 13 | config.num_episodes_to_run = 500 14 | config.file_to_save_data_results = "data_and_graphs/hrl_experiments/Cart_Pole_data.pkl" 15 | config.file_to_save_results_graph = "data_and_graphs/hrl_experiments/Cart_Pole.png" 16 | config.show_solution_score = False 17 | config.visualise_individual_results = False 18 | config.visualise_overall_agent_results = True 19 | config.standard_deviation_results = 1.0 20 | config.runs_per_agent = 10 21 | config.use_GPU = False 22 | config.overwrite_existing_results_file = False 23 | config.randomise_random_seed = True 24 | config.save_model = False 25 | 26 | 27 | # Loss is not drawing a random sample! Otherwise it wouldn't jump around that much!
28 | 29 | linear_hidden_units = [32, 32] 30 | learning_rate = 0.005 # 0.001 taxi 31 | buffer_size = 1000000 32 | batch_size = 256 33 | batch_norm = False 34 | embedding_dimensionality = 10 35 | gradient_clipping_norm = 0.5 #needs to be optimised 36 | update_every_n_steps = 1 37 | learning_iterations = 1 38 | epsilon_decay_rate_denominator = 2 #150 39 | episodes_per_round = 50 #80 40 | discount_rate = 0.99 41 | tau = 0.004 42 | sequitur_k = 2 43 | pre_training_learning_iterations_multiplier = 0 44 | episodes_to_run_with_no_exploration = 0 45 | action_balanced_replay_buffer = True 46 | copy_over_hidden_layers = True 47 | 48 | num_top_results_to_use = 10 49 | action_frequency_required_in_top_results = 0.8 50 | 51 | random_episodes_to_run = 0 52 | 53 | action_length_reward_bonus = 0.0 54 | 55 | only_train_new_actions = True 56 | only_train_final_layer = True 57 | reduce_macro_action_appearance_cutoff_throughout_training = False 58 | add_1_macro_action_at_a_time = True 59 | 60 | calculate_q_values_as_increments = True 61 | abandon_ship = True 62 | clip_rewards = True 63 | use_relative_counts = True 64 | 65 | config.debug_mode = False 66 | 67 | config.hyperparameters = { 68 | 69 | "HRL": { 70 | "linear_hidden_units": linear_hidden_units, 71 | "learning_rate": learning_rate, 72 | "buffer_size": buffer_size, 73 | "batch_size": batch_size, 74 | "final_layer_activation": "None", 75 | # "columns_of_data_to_be_embedded": [0], 76 | # "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 77 | "batch_norm": batch_norm, 78 | "gradient_clipping_norm": gradient_clipping_norm, 79 | "update_every_n_steps": update_every_n_steps, 80 | "epsilon_decay_rate_denominator": epsilon_decay_rate_denominator, 81 | "discount_rate": discount_rate, 82 | "learning_iterations": learning_iterations, 83 | "tau": tau, 84 | "sequitur_k": sequitur_k, 85 | "use_relative_counts": use_relative_counts, 86 | "action_length_reward_bonus": action_length_reward_bonus, 87 | "pre_training_learning_iterations_multiplier": pre_training_learning_iterations_multiplier, 88 | "episodes_to_run_with_no_exploration": episodes_to_run_with_no_exploration, 89 | "action_balanced_replay_buffer": action_balanced_replay_buffer, 90 | "copy_over_hidden_layers": copy_over_hidden_layers, 91 | "random_episodes_to_run": random_episodes_to_run, 92 | "only_train_new_actions": only_train_new_actions, 93 | "only_train_final_layer": only_train_final_layer, 94 | "num_top_results_to_use": num_top_results_to_use, 95 | "action_frequency_required_in_top_results": action_frequency_required_in_top_results, 96 | "reduce_macro_action_appearance_cutoff_throughout_training": reduce_macro_action_appearance_cutoff_throughout_training, 97 | "add_1_macro_action_at_a_time": add_1_macro_action_at_a_time, 98 | "calculate_q_values_as_increments": calculate_q_values_as_increments, 99 | "episodes_per_round": episodes_per_round, 100 | "abandon_ship": abandon_ship, 101 | "clip_rewards": clip_rewards 102 | }, 103 | 104 | "DQN_Agents": { 105 | "linear_hidden_units": linear_hidden_units, 106 | "learning_rate": learning_rate, 107 | "buffer_size": buffer_size, 108 | "batch_size": batch_size, 109 | "final_layer_activation": "None", 110 | # "columns_of_data_to_be_embedded": [0], 111 | # "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 112 | "batch_norm": batch_norm, 113 | "gradient_clipping_norm": gradient_clipping_norm, 114 | "update_every_n_steps": update_every_n_steps, 115 | "epsilon_decay_rate_denominator": 
epsilon_decay_rate_denominator, 116 | "discount_rate": discount_rate, 117 | "learning_iterations": learning_iterations, 118 | "tau": tau, 119 | "clip_rewards": clip_rewards 120 | }, 121 | 122 | "Actor_Critic_Agents": { 123 | "Actor": { 124 | "learning_rate": 0.0003, 125 | "linear_hidden_units": [64, 64], 126 | "final_layer_activation": "Softmax", 127 | # "columns_of_data_to_be_embedded": [0], 128 | # "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 129 | "batch_norm": False, 130 | "tau": 0.005, 131 | "gradient_clipping_norm": 5, 132 | "initialiser": "Xavier" 133 | }, 134 | 135 | "Critic": { 136 | "learning_rate": 0.0003, 137 | "linear_hidden_units": [64, 64], 138 | "final_layer_activation": None, 139 | # "columns_of_data_to_be_embedded": [0], 140 | # "embedding_dimensions": [[config.environment.observation_space.n, embedding_dimensionality]], 141 | "batch_norm": False, 142 | "buffer_size": 1000000, 143 | "tau": 0.005, 144 | "gradient_clipping_norm": 5, 145 | "initialiser": "Xavier" 146 | }, 147 | 148 | "min_steps_before_learning": 10000, 149 | "batch_size": 256, 150 | "discount_rate": 0.99, 151 | "mu": 0.0, # for OU noise 152 | "theta": 0.15, # for OU noise 153 | "sigma": 0.25, # for OU noise 154 | "action_noise_std": 0.2, # for TD3 155 | "action_noise_clipping_range": 0.5, # for TD3 156 | "update_every_n_steps": 1, 157 | "learning_updates_per_learning_session": 1, 158 | "automatically_tune_entropy_hyperparameter": True, 159 | "entropy_term_weight": None, 160 | "add_extra_noise": False, 161 | "do_evaluation_iterations": True, 162 | "clip_rewards": clip_rewards 163 | } 164 | } 165 | 166 | 167 | if __name__ == "__main__": 168 | AGENTS = [HRL, DDQN] # alternatives tried in commented-out runs: SAC_Discrete, SNN_HRL, DQN, h_DQN 169 | trainer = Trainer(config, AGENTS) 170 | trainer.run_games_for_agents() 171 | 172 | 173 | 174 | --------------------------------------------------------------------------------
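
Usage note: the experiment scripts above all follow the same Config/Trainer pattern. The snippet below is a minimal, illustrative sketch (it is not a file in this repository) showing how the plain DQN agent could be run on CartPole with that pattern. It assumes the Config, Trainer and DQN interfaces shown above, and every hyperparameter value in it is a placeholder rather than a tuned setting.

import gym

from agents.DQN_agents.DQN import DQN
from agents.Trainer import Trainer
from utilities.data_structures.Config import Config

config = Config()
config.seed = 1
config.environment = gym.make("CartPole-v0")
config.num_episodes_to_run = 300          # placeholder episode budget
config.file_to_save_data_results = None
config.file_to_save_results_graph = None
config.show_solution_score = False
config.visualise_individual_results = False
config.visualise_overall_agent_results = True
config.standard_deviation_results = 1.0
config.runs_per_agent = 1
config.use_GPU = False
config.overwrite_existing_results_file = False
config.randomise_random_seed = True
config.save_model = False

# Hyperparameters are grouped under the agent-family key, mirroring the configs above.
# The values below are illustrative placeholders, not tuned settings.
config.hyperparameters = {
    "DQN_Agents": {
        "learning_rate": 0.005,
        "batch_size": 64,
        "buffer_size": 40000,
        "epsilon": 0.1,
        "epsilon_decay_rate_denominator": 200,
        "discount_rate": 0.99,
        "tau": 0.1,
        "update_every_n_steps": 3,
        "learning_iterations": 1,
        "linear_hidden_units": [30, 15],
        "final_layer_activation": "None",
        "batch_norm": False,
        "gradient_clipping_norm": 5,
        "clip_rewards": False
    }
}

if __name__ == "__main__":
    # Trainer runs each agent class in the list for config.num_episodes_to_run episodes
    trainer = Trainer(config, [DQN])
    trainer.run_games_for_agents()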