├── HER ├── bitflip │ ├── results.npy │ ├── test_env.py │ ├── networks.py │ ├── test_her.py │ ├── episode.py │ ├── bit_flip.py │ ├── main.py │ ├── her.py │ └── agent.py └── robotic │ ├── wrappers.py │ ├── env_test.py │ ├── utils.py │ ├── normalizer.py │ ├── episode.py │ ├── main.py │ ├── networks.py │ ├── her.py │ └── agent.py ├── README.md └── PER ├── max_heap └── test_maxheap.py ├── ranked ├── network.py ├── main.py ├── utils.py ├── agent.py └── memory.py └── proportional ├── network.py ├── main.py ├── atari ├── main.py ├── network.py ├── wrappers.py ├── agent.py ├── utils.py └── memory.py ├── memory.py ├── utils.py └── agent.py /HER/bitflip/results.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Advanced-Replay-Strategies/HEAD/HER/bitflip/results.npy -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Source code for my new course: Advanced Replay Strategies. 2 | You can find this course at https://www.neuralnet.ai/courses 3 | -------------------------------------------------------------------------------- /HER/robotic/wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | 5 | class FlattenDictWrapper(gym.ObservationWrapper): 6 | def __init__(self, env): 7 | super(FlattenDictWrapper, self).__init__(env) 8 | 9 | def observation(self, obs): 10 | obs = np.concatenate([obs[k] for k in obs.keys()]) 11 | return obs 12 | -------------------------------------------------------------------------------- /HER/robotic/env_test.py: -------------------------------------------------------------------------------- 1 | import panda_gym 2 | import gym 3 | import time 4 | 5 | env = gym.make('PandaReach-v2', render=True) 6 | 7 | for _ in range(100): 8 | obs = env.reset() 9 | done = False 10 | while not done: 11 | action = env.action_space.sample() 12 | _, _, done, _ = env.step(action) 13 | time.sleep(0.05) 14 | env.close() 15 | -------------------------------------------------------------------------------- /HER/bitflip/test_env.py: -------------------------------------------------------------------------------- 1 | from bit_flip import BitFlipEnv 2 | 3 | 4 | if __name__ == '__main__': 5 | env = BitFlipEnv(n_bits=4) 6 | 7 | for _ in range(2): 8 | done = False 9 | obs = env.reset() 10 | print('starting new episode') 11 | while not done: 12 | action = env.action_space_sample() 13 | obs_, reward, done, info = env.step(action) 14 | env.render() 15 | -------------------------------------------------------------------------------- /PER/max_heap/test_maxheap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def max_heapify(array, i, N=None): 5 | pass 6 | 7 | 8 | def build_max_heap(array): 9 | pass 10 | 11 | 12 | if __name__ == '__main__': 13 | np.random.seed(42) 14 | a = np.random.choice(np.arange(100), 21, replace=False) 15 | print('unsorted array: {}'.format(a)) 16 | a = build_max_heap(a) 17 | reference = np.array([90., 80., 83., 77., 55., 73., 70., 76., 18 | 53., 44., 18., 30., 39., 33., 22., 4., 19 | 45., 10., 12., 31., 0]) 20 | print('max heap array: {}'.format(a)) 21 | assert (a == reference).all() 22 | -------------------------------------------------------------------------------- /HER/robotic/utils.py: 
-------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | import torch as T 4 | 5 | 6 | def sync_networks(network): 7 | comm = MPI.COMM_WORLD 8 | 9 | params = np.concatenate([getattr(p, 'data').cpu().numpy().flatten() 10 | for p in network.parameters()]) 11 | comm.Bcast(params) 12 | idx = 0 13 | for p in network.parameters(): 14 | getattr(p, 'data').copy_(T.tensor( 15 | params[idx:idx + p.data.numel()]).view_as(p.data)) 16 | idx += p.data.numel() 17 | 18 | 19 | def sync_grads(network): 20 | comm = MPI.COMM_WORLD 21 | 22 | grads = np.concatenate([getattr(p, 'grad').cpu().numpy().flatten() 23 | for p in network.parameters()]) 24 | global_grads = np.zeros_like(grads) 25 | comm.Allreduce(grads, global_grads, op=MPI.SUM) 26 | idx = 0 27 | for p in network.parameters(): 28 | getattr(p, 'grad').copy_(T.tensor( 29 | global_grads[idx:idx + p.data.numel()]).view_as(p.data)) 30 | idx += p.data.numel() 31 | -------------------------------------------------------------------------------- /PER/ranked/network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | 8 | class LinearDeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(LinearDeepQNetwork, self).__init__() 11 | self.checkpoint_dir = chkpt_dir 12 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 13 | 14 | self.fc1 = nn.Linear(*input_dims, 32) 15 | self.fc2 = nn.Linear(32, 32) 16 | self.q = nn.Linear(32, n_actions) 17 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 18 | 19 | self.loss = nn.MSELoss() 20 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 21 | self.to(self.device) 22 | 23 | def forward(self, state): 24 | flat1 = F.relu(self.fc1(state)) 25 | flat2 = F.relu(self.fc2(flat1)) 26 | q = self.q(flat2) 27 | 28 | return q 29 | 30 | def save_checkpoint(self): 31 | T.save(self.state_dict(), self.checkpoint_file) 32 | 33 | def load_checkpoint(self): 34 | self.load_state_dict(T.load(self.checkpoint_file)) 35 | -------------------------------------------------------------------------------- /PER/proportional/network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | 8 | class LinearDeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(LinearDeepQNetwork, self).__init__() 11 | self.checkpoint_dir = chkpt_dir 12 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 13 | 14 | self.fc1 = nn.Linear(*input_dims, 32) 15 | self.fc2 = nn.Linear(32, 32) 16 | self.q = nn.Linear(32, n_actions) 17 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 18 | 19 | self.loss = nn.MSELoss() 20 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 21 | self.to(self.device) 22 | 23 | def forward(self, state): 24 | flat1 = F.relu(self.fc1(state)) 25 | flat2 = F.relu(self.fc2(flat1)) 26 | q = self.q(flat2) 27 | 28 | return q 29 | 30 | def save_checkpoint(self): 31 | T.save(self.state_dict(), self.checkpoint_file) 32 | 33 | def load_checkpoint(self): 34 | self.load_state_dict(T.load(self.checkpoint_file)) 35 | -------------------------------------------------------------------------------- 
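Note on the max-heap exercise: PER/max_heap/test_maxheap.py above leaves max_heapify and build_max_heap as empty stubs. The sketch below is one plausible way to fill them in, using the standard recursive sift-down and Floyd's bottom-up construction; it is an illustrative solution rather than the course's canonical one, and whether it reproduces the exact reference array in that test also depends on the seeded input draw and on tie-breaking order.

import numpy as np


def max_heapify(array, i, N=None):
    # Sift the element at index i down until the subtree rooted at i
    # satisfies the max-heap property. N bounds the heap size (defaults
    # to the whole array), which also makes this reusable for heapsort.
    N = len(array) if N is None else N
    left, right = 2 * i + 1, 2 * i + 2
    largest = i
    if left < N and array[left] > array[largest]:
        largest = left
    if right < N and array[right] > array[largest]:
        largest = right
    if largest != i:
        array[i], array[largest] = array[largest], array[i]
        max_heapify(array, largest, N)
    return array


def build_max_heap(array):
    # Floyd's bottom-up construction: heapify every internal node,
    # starting from the last parent and working back to the root.
    for i in range(len(array) // 2 - 1, -1, -1):
        max_heapify(array, i)
    return array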
/HER/bitflip/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | 8 | class DeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(DeepQNetwork, self).__init__() 11 | self.checkpoint_dir = chkpt_dir 12 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 13 | 14 | self.fc1 = nn.Linear(input_dims, 256) 15 | self.fc2 = nn.Linear(256, n_actions) 16 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 17 | 18 | self.loss = nn.MSELoss() 19 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 20 | self.to(self.device) 21 | 22 | def forward(self, state): 23 | flat1 = F.relu(self.fc1(state)) 24 | actions = self.fc2(flat1) 25 | 26 | return actions 27 | 28 | def save_checkpoint(self): 29 | print('... saving checkpoint ...') 30 | T.save(self.state_dict(), self.checkpoint_file) 31 | 32 | def load_checkpoint(self): 33 | print('... loading checkpoint ...') 34 | self.load_state_dict(T.load(self.checkpoint_file)) 35 | -------------------------------------------------------------------------------- /HER/bitflip/test_her.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from bit_flip import BitFlipEnv 4 | from her import HER 5 | 6 | n_bits = 16 7 | env = BitFlipEnv(n_bits) 8 | random.seed(123) 9 | np.random.seed(123) 10 | batch_size = 2 11 | max_size = 1_000 12 | input_shape = n_bits 13 | memory = HER(max_mem=max_size, input_shape=input_shape, n_actions=1, 14 | batch_size=batch_size, goal_shape=n_bits, 15 | strategy='final', reward_fn=env.compute_reward) 16 | 17 | for _ in range(40): 18 | o = env.reset() 19 | agl = o[n_bits:2*n_bits] 20 | dg_ = o[2*n_bits:3*n_bits] 21 | d = False 22 | s, a, re, dn, s_, dg, ag, ag_ = [], [], [], [], [], [], [], [] 23 | while not d: 24 | action = env.action_space_sample() 25 | o_, r, d, i = env.step(action) 26 | agl_ = o_[2*n_bits:3*n_bits] 27 | s.append(o[:n_bits]) 28 | a.append(action) 29 | re.append(r) 30 | dn.append(d) 31 | s_.append(o_[:n_bits]) 32 | dg.append(dg_) 33 | ag.append(agl) 34 | ag_.append(agl_) 35 | agl = agl_ 36 | o = o_ 37 | memory.store_episode([s, a, re, s_, dn, dg, ag, ag_]) 38 | assert memory.ready(), 'Unexpected number of memories in buffer' 39 | 40 | s, a, re, s_, dn, dg, ag = memory.sample_memory() 41 | 42 | data = np.load('results.npy') 43 | 44 | assert (s[0] == data[0]).all(), 'Unexpected values for sampling of states' 45 | assert (s[1] == data[1]).all(), 'Unexpected values for sampling of states' 46 | assert (s_[0] == data[2]).all(), 'Unexpected values for sampling of states_' 47 | assert (s_[1] == data[3]).all(), 'Unexpected values for sampling of states_' 48 | -------------------------------------------------------------------------------- /HER/bitflip/episode.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class EpisodeWorker: 5 | def __init__(self, env, agent, memory): 6 | self.agent = agent 7 | self.env = env 8 | self.memory = memory 9 | self.get_slices() 10 | 11 | def get_slices(self): 12 | OB = self.env.observation_space['observation'].shape[0] 13 | A = self.env.observation_space['achieved_goal'].shape[0] 14 | D = self.env.observation_space['desired_goal'].shape[0] 15 | 16 | self.ob = slice(0, OB) 17 | self.ag = slice(OB, OB + A) 18 | 
self.dg = slice(OB + A, OB + A + D) 19 | 20 | def play_episode(self, evaluate=False): 21 | observation = self.env.reset() 22 | done = False 23 | score = 0 24 | desired_goal = observation[self.dg] 25 | achieved_goal = observation[self.ag] 26 | observation = observation[self.ob] 27 | 28 | states, actions, rewards, states_,\ 29 | dones, dg, ag, ag_ = [], [], [], [], [], [], [], [] 30 | 31 | while not done: 32 | action = self.agent.choose_action(np.concatenate( 33 | [observation, desired_goal]), evaluate) 34 | observation_, reward, done, info = self.env.step(action) 35 | achieved_goal_new = observation_[self.ag] 36 | states.append(observation) 37 | states_.append(observation_[self.ob]) 38 | rewards.append(reward) 39 | actions.append(action) 40 | dones.append(done) 41 | dg.append(desired_goal) 42 | ag.append(achieved_goal) 43 | ag_.append(achieved_goal_new) 44 | score += reward 45 | achieved_goal = achieved_goal_new 46 | observation = observation_[self.ob] 47 | if not evaluate: 48 | self.memory.store_episode([states, actions, rewards, 49 | states_, dones, dg, ag, ag_]) 50 | success = info['is_success'] 51 | return score, success 52 | -------------------------------------------------------------------------------- /HER/bitflip/bit_flip.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BitFlipEnv: 5 | def __init__(self, n_bits, max_steps=50): 6 | self.n_bits = n_bits 7 | self.max_steps = max_steps 8 | self.n_actions = n_bits 9 | 10 | self.observation = self.reset() 11 | self.observation_space = {'observation': np.empty((self.n_bits)), 12 | 'achieved_goal': np.empty((self.n_bits)), 13 | 'desired_goal': np.empty((self.n_bits)), 14 | } 15 | 16 | def reset(self): 17 | self._bits = np.array([np.random.randint(2) 18 | for _ in range(self.n_bits)]) 19 | self._desired_goal = np.array([np.random.randint(2) 20 | for _ in range(self.n_bits)]) 21 | self._achieved_goal = self._bits.copy() 22 | 23 | obs = np.concatenate([self._bits, 24 | self._achieved_goal, 25 | self._desired_goal]) 26 | self._step = 0 27 | return obs 28 | 29 | def compute_reward(self, desired_goal, achieved_goal, info): 30 | reward = 0.0 if (desired_goal == achieved_goal).all() else -1.0 31 | return reward 32 | 33 | def step(self, action): 34 | assert action <= self.n_actions, "Invalid Action" 35 | new_bit = 0 if self._bits[action] == 1 else 1 36 | self._bits[action] = new_bit 37 | info = {} 38 | self._achieved_goal = self._bits.copy() 39 | reward = self.compute_reward(self._desired_goal, 40 | self._achieved_goal, {}) 41 | self._step += 1 42 | if reward == 0.0 or self._step >= self.max_steps: 43 | done = True 44 | else: 45 | done = False 46 | info['is_success'] = 1.0 if reward == 0.0 else 0.0 47 | obs = np.concatenate([self._bits, self._achieved_goal, 48 | self._desired_goal]) 49 | return obs, reward, done, info 50 | 51 | def action_space_sample(self): 52 | return np.random.randint(0, self.n_actions) 53 | 54 | def render(self): 55 | for bit in self._bits: 56 | print(bit, end=' ') 57 | print('\n') 58 | -------------------------------------------------------------------------------- /HER/bitflip/main.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch as T 4 | from agent import Agent 5 | from episode import EpisodeWorker 6 | from her import HER 7 | from bit_flip import BitFlipEnv 8 | 9 | 10 | def train(agent, worker, memory): 11 | epochs = 100 12 | cycle_length = 16 13 | n_cycles 
= 50 14 | n_updates = 40 15 | n_tests = 10 16 | for epoch in range(epochs): 17 | for cycle in range(n_cycles): 18 | score_history, success_history = [], [] 19 | for i in range(cycle_length): 20 | score, success = worker.play_episode() 21 | score_history.append(score) 22 | success_history.append(success) 23 | # cycle_avg_score = np.mean(score_history) 24 | # cycle_avg_success = np.mean(success_history) 25 | # print('Epoch: {} Cycle: {} Training Avg Score {:.1f} ' 26 | # 'Trainig Avg Success: {:.3f}'. 27 | # format(epoch, cycle, cycle_avg_score, cycle_avg_success)) 28 | if memory.ready(): 29 | for _ in range(n_updates): 30 | memories = memory.sample_memory() 31 | agent.learn(memories) 32 | score_history, success_history = [], [] 33 | for episode in range(n_tests): 34 | score, success = worker.play_episode(evaluate=True) 35 | success_history.append(success) 36 | score_history.append(score) 37 | avg_success = np.mean(success_history) 38 | avg_score = np.mean(score_history) 39 | print('Epoch: {} Testing Agent. Avg Score: {:.1f} ' 40 | 'Avg Sucess: {:.3f}'. 41 | format(epoch, avg_score, avg_success)) 42 | 43 | 44 | def main(): 45 | n_bits = 32 46 | env = BitFlipEnv(n_bits, max_steps=n_bits) 47 | random.seed(123) 48 | np.random.seed(123) 49 | T.manual_seed(123) 50 | T.cuda.manual_seed(123) 51 | 52 | batch_size = 128 53 | max_size = 1_000_000 54 | input_shape = n_bits 55 | memory = HER(max_mem=max_size, input_shape=input_shape, n_actions=1, 56 | batch_size=batch_size, goal_shape=n_bits, strategy=None, 57 | reward_fn=env.compute_reward) 58 | agent = Agent(lr=0.001, epsilon=0.2, n_actions=n_bits, eps_dec=0.0, 59 | batch_size=batch_size, input_dims=2*input_shape, gamma=0.98) 60 | ep_worker = EpisodeWorker(env, agent, memory) 61 | 62 | train(agent, ep_worker, memory) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /HER/robotic/normalizer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import numpy as np 3 | from mpi4py import MPI 4 | 5 | 6 | class Normalizer: 7 | def __init__(self, size, eps=1e-2, default_clip_range=np.inf): 8 | self.size = size 9 | self.eps = eps 10 | self.default_clip_range = default_clip_range 11 | 12 | self.local_sum = np.zeros(self.size, dtype=np.float32) 13 | self.local_sum_sq = np.zeros(self.size, dtype=np.float32) 14 | self.local_cnt = np.zeros(1, dtype=np.float32) 15 | 16 | self.lock = threading.Lock() 17 | 18 | self.running_mean = np.zeros(self.size, dtype=np.float32) 19 | self.running_std = np.ones(self.size, dtype=np.float32) 20 | self.running_sum = np.zeros(self.size, dtype=np.float32) 21 | self.running_sum_sq = np.zeros(self.size, dtype=np.float32) 22 | self.running_cnt = 1 23 | 24 | def update_local_stats(self, new_data): 25 | with self.lock: 26 | self.local_sum += new_data.sum(axis=0) 27 | self.local_sum_sq += (np.square(new_data)).sum(axis=0) 28 | self.local_cnt[0] += new_data.shape[0] 29 | 30 | def sync_thread_stats(self, local_sum, local_sum_sq, local_cnt): 31 | local_sum[...] = self.mpi_average(local_sum) 32 | local_sum_sq[...] = self.mpi_average(local_sum_sq) 33 | local_cnt[...] 
= self.mpi_average(local_cnt) 34 | return local_sum, local_sum_sq, local_cnt 35 | 36 | def mpi_average(self, x): 37 | buf = np.zeros_like(x) 38 | MPI.COMM_WORLD.Allreduce(x, buf, op=MPI.SUM) 39 | buf /= MPI.COMM_WORLD.Get_size() 40 | return buf 41 | 42 | def normalize_observation(self, v): 43 | clip_range = self.default_clip_range 44 | return np.clip((v - self.running_mean) / self.running_std, 45 | -clip_range, clip_range).astype(np.float32) 46 | 47 | def recompute_global_stats(self): 48 | with self.lock: 49 | local_cnt = self.local_cnt.copy() 50 | local_sum = self.local_sum.copy() 51 | local_sum_sq = self.local_sum_sq.copy() 52 | 53 | self.local_cnt[...] = 0 54 | self.local_sum[...] = 0 55 | self.local_sum_sq[...] = 0 56 | 57 | sync_sum, sync_sum_sq, sync_cnt = self.sync_thread_stats( 58 | local_sum, local_sum_sq, local_cnt) 59 | 60 | self.running_cnt += sync_cnt 61 | self.running_sum += sync_sum 62 | self.running_sum_sq += sync_sum_sq 63 | 64 | self.running_mean = self.running_sum / self.running_cnt 65 | tmp = self.running_sum_sq / self.running_cnt -\ 66 | np.square(self.running_sum / self.running_cnt) 67 | self.running_std = np.sqrt(np.maximum(np.square(self.eps), tmp)) 68 | -------------------------------------------------------------------------------- /PER/proportional/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve 5 | 6 | 7 | def clip_reward(r): 8 | if r > 1: 9 | return 1 10 | elif r < -1: 11 | return -1 12 | else: 13 | return r 14 | 15 | 16 | if __name__ == '__main__': 17 | env = gym.make('CartPole-v0') 18 | best_score = -np.inf 19 | load_checkpoint = False 20 | n_games = 1000 21 | alpha = 0.6 22 | beta = 0.4 23 | replace = 250 24 | bs = 64 25 | agent = Agent(gamma=0.99, epsilon=1, lr=1e-4, alpha=alpha, 26 | beta=beta, input_dims=(env.observation_space.shape), 27 | n_actions=env.action_space.n, mem_size=50*1024, eps_min=0.01, 28 | batch_size=bs, replace=replace, eps_dec=1e-4, 29 | chkpt_dir='models/', algo='ddqn', env_name='CartPole-v0') 30 | 31 | if load_checkpoint: 32 | agent.load_models() 33 | 34 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 35 | + str(n_games) + 'games' + str(alpha) +\ 36 | 'alpha_' + str(beta) + '_replace_' + str(replace) 37 | figure_file = 'plots/' + fname + '.png' 38 | # if you want to record video of your agent playing, 39 | # do a mkdir tmp && mkdir tmp/dqn-video 40 | # and uncomment the following 2 lines. 41 | # env = wrappers.Monitor(env, "tmp/dqn-video", 42 | # video_callable=lambda episode_id: True, force=True) 43 | n_steps = 0 44 | scores, eps_history, steps_array = [], [], [] 45 | 46 | for i in range(n_games): 47 | done = False 48 | observation = env.reset() 49 | 50 | score = 0 51 | while not done: 52 | action = agent.choose_action(observation) 53 | observation_, reward, done, info = env.step(action) 54 | score += reward 55 | r = clip_reward(reward) 56 | if not load_checkpoint: 57 | agent.store_transition(observation, action, 58 | r, observation_, done) 59 | agent.learn() 60 | observation = observation_ 61 | n_steps += 1 62 | scores.append(score) 63 | steps_array.append(n_steps) 64 | 65 | avg_score = np.mean(scores[-100:]) 66 | print('episode {} score {:.1f} eps {:.2f} n steps {}'. 
67 | format(i, avg_score, agent.epsilon, n_steps)) 68 | 69 | if avg_score > best_score: 70 | if not load_checkpoint: 71 | agent.save_models() 72 | best_score = avg_score 73 | 74 | eps_history.append(agent.epsilon) 75 | 76 | x = [i+1 for i in range(len(scores))] 77 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 78 | -------------------------------------------------------------------------------- /PER/proportional/atari/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve 5 | from wrappers import make_env 6 | 7 | def clip_reward(r): 8 | if r > 1: 9 | return 1 10 | elif r < -1: 11 | return -1 12 | else: 13 | return r 14 | 15 | 16 | if __name__ == '__main__': 17 | env_name = 'SpaceInvadersNoFrameskip-v4' 18 | # env = gym.make('CartPole-v0') 19 | env = make_env(env_name) 20 | best_score = -np.inf 21 | load_checkpoint = False 22 | n_games = 1500 23 | alpha = 0.6 24 | beta = 0.4 25 | bs = 64 26 | agent = Agent(gamma=0.99, epsilon=1, lr=5e-5, alpha=alpha, 27 | beta=beta, input_dims=(env.observation_space.shape), 28 | n_actions=env.action_space.n, mem_size=50*1024, eps_min=0.01, 29 | batch_size=bs, eps_dec=1e-5, 30 | chkpt_dir='models/', algo='ddqn', env_name='SpaceInvaders') 31 | 32 | if load_checkpoint: 33 | agent.load_models() 34 | 35 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 36 | + str(n_games) + 'games' + str(alpha) +\ 37 | 'alpha_' + str(beta) 38 | figure_file = 'plots/' + fname + '.png' 39 | # if you want to record video of your agent playing, 40 | # do a mkdir tmp && mkdir tmp/dqn-video 41 | # and uncomment the following 2 lines. 42 | # env = wrappers.Monitor(env, "tmp/dqn-video", 43 | # video_callable=lambda episode_id: True, force=True) 44 | n_steps = 0 45 | scores, eps_history, steps_array = [], [], [] 46 | 47 | for i in range(n_games): 48 | done = False 49 | observation = env.reset() 50 | 51 | score = 0 52 | while not done: 53 | action = agent.choose_action(observation) 54 | observation_, reward, done, info = env.step(action) 55 | score += reward 56 | r = clip_reward(reward) 57 | if not load_checkpoint: 58 | agent.store_transition(observation, action, 59 | r, observation_, done) 60 | agent.learn() 61 | observation = observation_ 62 | n_steps += 1 63 | scores.append(score) 64 | steps_array.append(n_steps) 65 | 66 | avg_score = np.mean(scores[-100:]) 67 | print('episode {} score {:.1f} eps {:.2f} n steps {}'. 
68 | format(i, avg_score, agent.epsilon, n_steps)) 69 | 70 | if avg_score > best_score: 71 | if not load_checkpoint: 72 | agent.save_models() 73 | best_score = avg_score 74 | 75 | eps_history.append(agent.epsilon) 76 | agent.memory.anneal_beta(i, n_games) 77 | 78 | x = [i+1 for i in range(len(scores))] 79 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 80 | -------------------------------------------------------------------------------- /PER/proportional/atari/network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch as T 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | 8 | 9 | class DeepQNetwork(nn.Module): 10 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 11 | super(DeepQNetwork, self).__init__() 12 | self.checkpoint_dir = chkpt_dir 13 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 14 | 15 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 16 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 17 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 18 | 19 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 20 | 21 | self.fc1 = nn.Linear(fc_input_dims, 512) 22 | self.fc2 = nn.Linear(512, n_actions) 23 | 24 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 25 | 26 | self.loss = nn.MSELoss() 27 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 28 | self.to(self.device) 29 | 30 | def forward(self, state): 31 | conv1 = F.relu(self.conv1(state)) 32 | conv2 = F.relu(self.conv2(conv1)) 33 | conv3 = F.relu(self.conv3(conv2)) 34 | conv_state = conv3.view(conv3.size()[0], -1) 35 | 36 | flat1 = F.relu(self.fc1(conv_state)) 37 | q = self.fc2(flat1) 38 | 39 | return q 40 | 41 | def calculate_conv_output_dims(self, input_dims): 42 | state = T.zeros(1, *input_dims) 43 | dims = self.conv1(state) 44 | dims = self.conv2(dims) 45 | dims = self.conv3(dims) 46 | return int(np.prod(dims.size())) 47 | 48 | def save_checkpoint(self): 49 | T.save(self.state_dict(), self.checkpoint_file) 50 | 51 | def load_checkpoint(self): 52 | self.load_state_dict(T.load(self.checkpoint_file)) 53 | 54 | 55 | class LinearDeepQNetwork(nn.Module): 56 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 57 | super(LinearDeepQNetwork, self).__init__() 58 | self.checkpoint_dir = chkpt_dir 59 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 60 | 61 | self.fc1 = nn.Linear(*input_dims, 32) 62 | self.fc2 = nn.Linear(32, 32) 63 | self.q = nn.Linear(32, n_actions) 64 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 65 | 66 | self.loss = nn.MSELoss() 67 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 68 | self.to(self.device) 69 | 70 | def forward(self, state): 71 | flat1 = F.relu(self.fc1(state)) 72 | flat2 = F.relu(self.fc2(flat1)) 73 | q = self.q(flat2) 74 | 75 | return q 76 | 77 | def save_checkpoint(self): 78 | T.save(self.state_dict(), self.checkpoint_file) 79 | 80 | def load_checkpoint(self): 81 | self.load_state_dict(T.load(self.checkpoint_file)) 82 | -------------------------------------------------------------------------------- /HER/bitflip/her.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class HER: 5 | def __init__(self, max_mem, input_shape, n_actions, goal_shape, batch_size, 6 | reward_fn, strategy='final'): 7 | self.max_mem = max_mem 8 | self.strategy = strategy 9 | 
self.mem_cntr = 0 10 | self.batch_size = batch_size 11 | self.input_shape = input_shape 12 | self.reward_fn = reward_fn 13 | 14 | self.states = np.zeros((max_mem, input_shape), 15 | dtype=np.float64) 16 | self.states_ = np.zeros((max_mem, input_shape), 17 | dtype=np.float64) 18 | self.actions = np.zeros((max_mem, n_actions), 19 | dtype=np.float32) 20 | self.rewards = np.zeros(max_mem, dtype=np.float32) 21 | self.dones = np.zeros(max_mem, dtype=np.bool) 22 | self.desired_goals = np.zeros((max_mem, goal_shape), dtype=np.float64) 23 | self.achieved_goals = np.zeros((max_mem, goal_shape), dtype=np.float64) 24 | self.achieved_goals_ = np.zeros((max_mem, goal_shape), 25 | dtype=np.float64) 26 | 27 | def store_memory(self, state, action, reward, state_, done, 28 | d_goal, a_goal, a_goal_): 29 | index = self.mem_cntr % self.max_mem 30 | self.states[index] = state 31 | self.states_[index] = state_ 32 | self.actions[index] = action 33 | self.rewards[index] = reward 34 | self.dones[index] = done 35 | self.desired_goals[index] = d_goal 36 | self.achieved_goals[index] = a_goal 37 | self.achieved_goals_[index] = a_goal_ 38 | self.mem_cntr += 1 39 | 40 | def store_episode(self, ep_memory): 41 | states, actions, rewards, states_, dones, dg, ag, ag_ = ep_memory 42 | 43 | if self.strategy == 'final': 44 | hindsight_goals = [[ag_[-1]]] * len(ag_) 45 | 46 | elif self.strategy is None: 47 | hindsight_goals = [[dg[0]]] * len(dg) 48 | 49 | for idx, s in enumerate(states): 50 | self.store_memory(s, actions[idx], rewards[idx], states_[idx], 51 | dones[idx], dg[idx], ag[idx], ag_[idx]) 52 | for goal in hindsight_goals[idx]: 53 | reward = self.reward_fn(ag_[idx], goal, {}) 54 | self.store_memory(s, actions[idx], reward, states_[idx], 55 | dones[idx], goal, ag[idx], ag_[idx]) 56 | 57 | def sample_memory(self): 58 | last_mem = min(self.mem_cntr, self.max_mem) 59 | batch = np.random.choice(last_mem, self.batch_size, replace=False) 60 | 61 | return self.states[batch], self.actions[batch], self.rewards[batch],\ 62 | self.states_[batch], self.dones[batch],\ 63 | self.desired_goals[batch] 64 | 65 | def ready(self): 66 | return self.mem_cntr > self.batch_size 67 | -------------------------------------------------------------------------------- /PER/ranked/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import DQNAgent 4 | from utils import plot_learning_curve 5 | 6 | 7 | def clip_reward(r): 8 | if r > 1: 9 | return 1 10 | elif r < -1: 11 | return -1 12 | else: 13 | return r 14 | 15 | 16 | if __name__ == '__main__': 17 | env = gym.make('CartPole-v0') 18 | best_score = -np.inf 19 | load_checkpoint = False 20 | n_games = 500 21 | r_iter = 64 22 | alpha = 0.25 23 | beta = 0.5 24 | replace = 250 25 | agent = DQNAgent(gamma=0.99, epsilon=1, lr=2.5e-4, alpha=alpha, 26 | beta=beta, r_iter=r_iter, 27 | input_dims=(env.observation_space.shape), 28 | n_actions=env.action_space.n, mem_size=20*1024, 29 | eps_min=0.01, 30 | batch_size=64, replace=replace, eps_dec=1e-4, 31 | chkpt_dir='models/', algo='DQNAgent', 32 | env_name='CartPole-v0') 33 | 34 | if load_checkpoint: 35 | agent.load_models() 36 | 37 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 38 | + str(n_games) + 'games' + str(r_iter) + '_r_iter_' + str(alpha) +\ 39 | 'alpha_' + str(beta) + '_replace_' + str(replace) 40 | figure_file = 'plots/' + fname + '.png' 41 | # if you want to record video of your agent playing, do a mkdir tmp && mkdir tmp/dqn-video 42 | 
# and uncomment the following 2 lines. 43 | # env = wrappers.Monitor(env, "tmp/dqn-video", 44 | # video_callable=lambda episode_id: True, force=True) 45 | n_steps = 0 46 | scores, eps_history, steps_array = [], [], [] 47 | 48 | for i in range(n_games): 49 | done = False 50 | observation = env.reset() 51 | 52 | score = 0 53 | while not done: 54 | action = agent.choose_action(observation) 55 | observation_, reward, done, info = env.step(action) 56 | score += reward 57 | r = clip_reward(reward) 58 | if not load_checkpoint: 59 | agent.store_transition(observation, action, 60 | r, observation_, done) 61 | agent.learn() 62 | observation = observation_ 63 | n_steps += 1 64 | scores.append(score) 65 | steps_array.append(n_steps) 66 | 67 | avg_score = np.mean(scores[-100:]) 68 | print('episode: ', i, 'score: ', score, 69 | ' average score %.1f' % avg_score, 'best score %.2f' % best_score, 70 | 'epsilon %.2f' % agent.epsilon) 71 | 72 | if avg_score > best_score: 73 | if not load_checkpoint: 74 | agent.save_models() 75 | best_score = avg_score 76 | 77 | eps_history.append(agent.epsilon) 78 | 79 | x = [i+1 for i in range(len(scores))] 80 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 81 | -------------------------------------------------------------------------------- /HER/robotic/episode.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class EpisodeWorker: 5 | def __init__(self, env, agent, memory): 6 | self.agent = agent 7 | self.env = env 8 | self.memory = memory 9 | self.get_slices() 10 | 11 | def get_slices(self): 12 | OB = self.env.observation_space['observation'].shape[0] 13 | A = self.env.observation_space['achieved_goal'].shape[0] 14 | D = self.env.observation_space['desired_goal'].shape[0] 15 | 16 | self.ob = slice(0, OB) 17 | self.ag = slice(OB, OB + A) 18 | self.dg = slice(OB + A, OB + A + D) 19 | 20 | def play_episode(self, evaluate=False): 21 | observation = self.env.reset() 22 | done = False 23 | score = 0 24 | desired_goal = observation[self.dg] 25 | achieved_goal = observation[self.ag] 26 | observation = observation[self.ob] 27 | 28 | self.agent.obs_stats.update_local_stats(observation) 29 | self.agent.goal_stats.update_local_stats(desired_goal) 30 | 31 | achieved_goal = self.agent.goal_stats.normalize_observation( 32 | achieved_goal) 33 | desired_goal = self.agent.goal_stats.normalize_observation( 34 | desired_goal) 35 | observation = self.agent.obs_stats.normalize_observation(observation) 36 | 37 | states, actions, rewards, states_,\ 38 | dones, dg, ag, ag_ = [], [], [], [], [], [], [], [] 39 | 40 | while not done: 41 | action = self.agent.choose_action(np.concatenate( 42 | [observation, desired_goal]), evaluate) 43 | observation_, reward, done, info = self.env.step(action) 44 | 45 | achieved_goal_new = observation_[self.ag] 46 | observation_ = observation_[self.ob] 47 | 48 | if not evaluate: 49 | self.agent.goal_stats.update_local_stats(achieved_goal) 50 | self.agent.obs_stats.update_local_stats(observation_) 51 | 52 | observation_ = self.agent.obs_stats.normalize_observation( 53 | observation_) 54 | achieved_goal_new = self.agent.goal_stats.normalize_observation( 55 | achieved_goal_new) 56 | 57 | states.append(observation) 58 | states_.append(observation_) 59 | rewards.append(reward) 60 | actions.append(action) 61 | dones.append(done) 62 | dg.append(desired_goal) 63 | ag.append(achieved_goal) 64 | ag_.append(achieved_goal_new) 65 | 66 | score += reward 67 | 68 | achieved_goal = 
achieved_goal_new 69 | observation = observation_ 70 | 71 | if not evaluate: 72 | self.agent.obs_stats.recompute_global_stats() 73 | self.agent.goal_stats.recompute_global_stats() 74 | self.memory.store_episode([states, actions, rewards, 75 | states_, dones, dg, ag, ag_]) 76 | success = info['is_success'] 77 | return score, success 78 | -------------------------------------------------------------------------------- /HER/bitflip/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from networks import DeepQNetwork 4 | 5 | 6 | class Agent: 7 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 8 | batch_size, eps_min=0.01, eps_dec=5e-7, 9 | replace=1000, algo='dqn', env_name='bit_flip', 10 | chkpt_dir='models'): 11 | self.gamma = gamma 12 | self.epsilon = epsilon 13 | self.lr = lr 14 | self.n_actions = n_actions 15 | self.input_dims = input_dims 16 | self.batch_size = batch_size 17 | self.eps_min = eps_min 18 | self.eps_dec = eps_dec 19 | self.replace_target_cnt = replace 20 | self.algo = algo 21 | self.env_name = env_name 22 | self.chkpt_dir = chkpt_dir 23 | self.action_space = [i for i in range(n_actions)] 24 | self.learn_step_counter = 0 25 | 26 | self.q_eval = DeepQNetwork(self.lr, self.n_actions, 27 | input_dims=self.input_dims, 28 | name=self.env_name+'_'+self.algo+'_q_eval', 29 | chkpt_dir=self.chkpt_dir) 30 | 31 | self.q_next = DeepQNetwork(self.lr, self.n_actions, 32 | input_dims=self.input_dims, 33 | name=self.env_name+'_'+self.algo+'_q_next', 34 | chkpt_dir=self.chkpt_dir) 35 | 36 | def choose_action(self, observation, evaluate=False): 37 | if np.random.random() > self.epsilon or evaluate: 38 | state = T.tensor([observation], 39 | dtype=T.float).to(self.q_eval.device) 40 | actions = self.q_eval.forward(state) 41 | action = T.argmax(actions).item() 42 | else: 43 | action = np.random.choice(self.action_space) 44 | 45 | return action 46 | 47 | def replace_target_network(self): 48 | if self.learn_step_counter % self.replace_target_cnt == 0: 49 | self.q_next.load_state_dict(self.q_eval.state_dict()) 50 | 51 | def decrement_epsilon(self): 52 | self.epsilon = self.epsilon - self.eps_dec \ 53 | if self.epsilon > self.eps_min else self.eps_min 54 | 55 | def save_models(self): 56 | self.q_eval.save_checkpoint() 57 | self.q_next.save_checkpoint() 58 | 59 | def load_models(self): 60 | self.q_eval.load_checkpoint() 61 | self.q_next.load_checkpoint() 62 | 63 | def learn(self, memories): 64 | state, action, reward, new_state, done, dg = memories 65 | state = np.concatenate([state, dg], axis=1) 66 | new_state = np.concatenate([new_state, dg], axis=1) 67 | 68 | states = T.tensor(state, dtype=T.float).to(self.q_eval.device) 69 | rewards = T.tensor(reward).to(self.q_eval.device) 70 | dones = T.tensor(done).to(self.q_eval.device) 71 | actions = T.tensor(action, dtype=T.long).to(self.q_eval.device) 72 | states_ = T.tensor(new_state, dtype=T.float).to(self.q_eval.device) 73 | 74 | actions = actions.view(-1) 75 | 76 | self.q_eval.optimizer.zero_grad() 77 | 78 | self.replace_target_network() 79 | 80 | indices = np.arange(self.batch_size) 81 | q_pred = self.q_eval.forward(states)[indices, actions] 82 | 83 | q_next = self.q_next.forward(states_).max(dim=1)[0] 84 | q_next[dones] = 0.0 85 | q_target = rewards + self.gamma*q_next 86 | 87 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 88 | loss.backward() 89 | self.q_eval.optimizer.step() 90 | self.learn_step_counter += 1 91 | 
self.decrement_epsilon() 92 | -------------------------------------------------------------------------------- /HER/robotic/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import gym 4 | import panda_gym 5 | from mpi4py import MPI 6 | import numpy as np 7 | import torch as T 8 | from agent import Agent 9 | from episode import EpisodeWorker 10 | from her import HER 11 | from wrappers import FlattenDictWrapper 12 | 13 | 14 | def train(agent, worker, memory, environ): 15 | epochs = 50 16 | cycle_length = 16 17 | n_cycles = 50 18 | n_updates = 40 19 | n_tests = 10 20 | for epoch in range(epochs): 21 | for cycle in range(n_cycles): 22 | score_history, success_history = [], [] 23 | for i in range(cycle_length): 24 | score, success = worker.play_episode() 25 | score_history.append(score) 26 | success_history.append(success) 27 | """ 28 | if MPI.COMM_WORLD.Get_rank() == 0: 29 | cycle_avg_score = np.mean(score_history) 30 | cycle_avg_success = np.mean(success_history) 31 | 32 | print('Epoch: {} Cycle: {} Training Avg Score {:.1f} ' 33 | 'Training Avg Success: {:.3f}'. 34 | format(epoch, cycle, cycle_avg_score, cycle_avg_success)) 35 | """ 36 | if memory.ready(): 37 | for _ in range(n_updates): 38 | memories = memory.sample_memory() 39 | agent.learn(memories) 40 | agent.update_network_parameters() 41 | score_history, success_history = [], [] 42 | for episode in range(n_tests): 43 | score, success = worker.play_episode(evaluate=True) 44 | success_history.append(success) 45 | score_history.append(score) 46 | avg_success = np.mean(success_history) 47 | avg_score = np.mean(score_history) 48 | global_success = MPI.COMM_WORLD.allreduce(avg_success, op=MPI.SUM) 49 | global_score = MPI.COMM_WORLD.allreduce(avg_score, op=MPI.SUM) 50 | eval_score = global_score / MPI.COMM_WORLD.Get_size() 51 | eval_success = global_success / MPI.COMM_WORLD.Get_size() 52 | if MPI.COMM_WORLD.Get_rank() == 0: 53 | print('Epoch: {} Testing Agent. Avg Score: {:.1f} ' 54 | 'Avg Sucess: {:.3f} Environment: {}'. 
55 | format(epoch, eval_score, eval_success, environ)) 56 | 57 | 58 | def main(): 59 | env_string = 'PandaPickAndPlace-v2' 60 | # env_string = 'PandaPush-v2' 61 | env = gym.make(env_string) 62 | env = FlattenDictWrapper(env) 63 | seed = 123 + MPI.COMM_WORLD.Get_rank() 64 | 65 | random.seed(seed) 66 | np.random.seed(seed) 67 | T.manual_seed(seed) 68 | T.cuda.manual_seed(seed) 69 | 70 | batch_size = 256 71 | max_size = 1_000_000 72 | obs_shape = env.observation_space['observation'].shape[0] 73 | goal_shape = env.observation_space['achieved_goal'].shape[0] 74 | input_shape = obs_shape 75 | memory = HER(max_mem=max_size, input_shape=input_shape, 76 | n_actions=env.action_space.shape[0], 77 | batch_size=batch_size, goal_shape=goal_shape, 78 | strategy='future', reward_fn=env.compute_reward) 79 | input_shape = obs_shape + goal_shape 80 | agent = Agent(alpha=0.001, beta=0.001, action_space=env.action_space, 81 | input_dims=input_shape, tau=0.05, gamma=0.98, 82 | fc1_dims=256, fc2_dims=256, fc3_dims=256, 83 | n_actions=env.action_space.shape[0], explore=0.3, 84 | obs_shape=obs_shape, goal_shape=goal_shape, 85 | action_noise=0.2) 86 | ep_worker = EpisodeWorker(env, agent, memory) 87 | 88 | train(agent, ep_worker, memory, env_string) 89 | 90 | 91 | if __name__ == '__main__': 92 | os.environ['OMP_NUM_THREADS'] = '1' 93 | os.environ['MKL_NUM_THREADS'] = '1' 94 | os.environ['IN_MPI'] = '1' 95 | main() 96 | -------------------------------------------------------------------------------- /HER/robotic/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | 8 | class CriticNetwork(nn.Module): 9 | def __init__(self, beta, input_dims, fc1_dims, fc2_dims, fc3_dims, 10 | n_actions, name, chkpt_dir='tmp/ddpg'): 11 | super(CriticNetwork, self).__init__() 12 | self.input_dims = input_dims 13 | self.fc1_dims = fc1_dims 14 | self.fc2_dims = fc2_dims 15 | self.fc3_dims = fc3_dims 16 | self.n_actions = n_actions 17 | self.name = name 18 | self.checkpoint_dir = chkpt_dir 19 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ddpg') 20 | 21 | self.fc1 = nn.Linear(self.input_dims + self.n_actions, self.fc1_dims) 22 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 23 | self.fc3 = nn.Linear(self.fc2_dims, self.fc3_dims) 24 | 25 | self.q = nn.Linear(self.fc3_dims, 1) 26 | 27 | self.optimizer = optim.Adam(self.parameters(), lr=beta) 28 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 29 | 30 | self.to(self.device) 31 | 32 | def forward(self, state, action): 33 | state_value = F.relu(self.fc1(T.cat((state, action), dim=1))) 34 | state_value = F.relu(self.fc2(state_value)) 35 | state_value = F.relu(self.fc3(state_value)) 36 | 37 | q = self.q(state_value) 38 | 39 | return q 40 | 41 | def save_checkpoint(self): 42 | print('... saving checkpoint ...') 43 | T.save(self.state_dict(), self.checkpoint_file) 44 | 45 | def load_checkpoint(self): 46 | print('... loading checkpoint ...') 47 | self.load_state_dict(T.load(self.checkpoint_file)) 48 | 49 | def save_best(self): 50 | print('... 
saving best checkpoint ...') 51 | checkpoint_file = os.path.join(self.checkpoint_dir, self.name+'_best') 52 | T.save(self.state_dict(), checkpoint_file) 53 | 54 | 55 | class ActorNetwork(nn.Module): 56 | def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, fc3_dims, 57 | n_actions, name, chkpt_dir='tmp/ddpg'): 58 | super(ActorNetwork, self).__init__() 59 | self.input_dims = input_dims 60 | self.fc1_dims = fc1_dims 61 | self.fc2_dims = fc2_dims 62 | self.fc3_dims = fc3_dims 63 | self.n_actions = n_actions 64 | self.name = name 65 | self.checkpoint_dir = chkpt_dir 66 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ddpg') 67 | 68 | self.fc1 = nn.Linear(self.input_dims, self.fc1_dims) 69 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 70 | self.fc3 = nn.Linear(self.fc2_dims, self.fc3_dims) 71 | self.mu = nn.Linear(self.fc3_dims, self.n_actions) 72 | 73 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 74 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 75 | 76 | self.to(self.device) 77 | 78 | def forward(self, state): 79 | x = F.relu(self.fc1(state)) 80 | x = F.relu(self.fc2(x)) 81 | x = F.relu(self.fc3(x)) 82 | 83 | mu = self.mu(x) 84 | pi = T.clamp(T.tanh(mu), -5, 5) 85 | 86 | return mu, pi 87 | 88 | def save_checkpoint(self): 89 | print('... saving checkpoint ...') 90 | T.save(self.state_dict(), self.checkpoint_file) 91 | 92 | def load_checkpoint(self): 93 | print('... loading checkpoint ...') 94 | self.load_state_dict(T.load(self.checkpoint_file)) 95 | 96 | def save_best(self): 97 | print('... saving best checkpoint ...') 98 | checkpoint_file = os.path.join(self.checkpoint_dir, self.name+'_best') 99 | T.save(self.state_dict(), checkpoint_file) 100 | -------------------------------------------------------------------------------- /PER/proportional/atari/wrappers.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import gym 5 | 6 | 7 | class RepeatActionAndMaxFrame(gym.Wrapper): 8 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 9 | fire_first=False): 10 | super(RepeatActionAndMaxFrame, self).__init__(env) 11 | self.repeat = repeat 12 | self.shape = env.observation_space.low.shape 13 | self.frame_buffer = np.zeros(shape=(2, *self.shape), dtype=np.float16) 14 | self.clip_reward = clip_reward 15 | self.no_ops = no_ops 16 | self.fire_first = fire_first 17 | 18 | def step(self, action): 19 | t_reward = 0.0 20 | done = False 21 | for i in range(self.repeat): 22 | obs, reward, done, info = self.env.step(action) 23 | if self.clip_reward: 24 | reward = np.clip(np.array([reward]), -1, 1)[0] 25 | t_reward += reward 26 | idx = i % 2 27 | self.frame_buffer[idx] = obs 28 | if done: 29 | break 30 | 31 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 32 | return max_frame, t_reward, done, info 33 | 34 | def reset(self): 35 | obs = self.env.reset() 36 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 37 | for _ in range(no_ops): 38 | _, _, done, _ = self.env.step(0) 39 | if done: 40 | self.env.reset() 41 | if self.fire_first: 42 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 43 | obs, _, _, _ = self.env.step(1) 44 | 45 | self.frame_buffer = np.zeros(shape=(2, *self.shape), dtype=np.float16) 46 | self.frame_buffer[0] = obs 47 | 48 | return obs 49 | 50 | 51 | class PreprocessFrame(gym.ObservationWrapper): 52 | def __init__(self, shape, env=None): 53 | super(PreprocessFrame, 
self).__init__(env) 54 | self.shape = (shape[2], shape[0], shape[1]) 55 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 56 | shape=self.shape, 57 | dtype=np.float16) 58 | 59 | def observation(self, obs): 60 | new_frame = cv2.cvtColor(obs.astype(np.uint8), cv2.COLOR_RGB2GRAY) 61 | resized_screen = cv2.resize(new_frame, self.shape[1:], 62 | interpolation=cv2.INTER_AREA) 63 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 64 | new_obs = new_obs / 255.0 65 | 66 | return new_obs.astype(np.float16) 67 | 68 | 69 | class StackFrames(gym.ObservationWrapper): 70 | def __init__(self, env, repeat): 71 | super(StackFrames, self).__init__(env) 72 | self.observation_space = gym.spaces.Box( 73 | env.observation_space.low.repeat(repeat, axis=0), 74 | env.observation_space.high.repeat(repeat, axis=0), 75 | dtype=np.float16) 76 | self.stack = collections.deque(maxlen=repeat) 77 | 78 | def reset(self): 79 | self.stack.clear() 80 | observation = self.env.reset() 81 | for _ in range(self.stack.maxlen): 82 | self.stack.append(observation) 83 | 84 | return np.array(self.stack, dtype=np.float16).reshape( 85 | self.observation_space.low.shape) 86 | 87 | def observation(self, observation): 88 | self.stack.append(observation) 89 | 90 | return np.array(self.stack).reshape(self.observation_space.low.shape) 91 | 92 | 93 | def make_env(env_name, shape=(84, 84, 1), repeat=4, clip_rewards=False, 94 | no_ops=0, fire_first=False): 95 | env = gym.make(env_name) 96 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, 97 | no_ops, fire_first) 98 | env = PreprocessFrame(shape, env) 99 | env = StackFrames(env, repeat) 100 | 101 | return env 102 | -------------------------------------------------------------------------------- /PER/proportional/memory.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | import numpy as np 4 | 5 | 6 | @dataclass 7 | class Node: 8 | value: float = 0.01 9 | total: float = 0.01 10 | 11 | def update_priority(self, priority: float): 12 | delta = priority - self.value 13 | self.value = priority 14 | self.total += delta 15 | return delta 16 | 17 | def update_total(self, delta: float): 18 | self.total += delta 19 | 20 | 21 | class SumTree: 22 | def __init__(self, max_size: int = 1_00_000, batch_size: int = 32, 23 | alpha: float = 0.5, beta: float = 0.5): 24 | self.counter = 0 25 | self.max_size = max_size 26 | self.batch_size = batch_size 27 | self.alpha = alpha 28 | self.beta = beta 29 | self.alpha_start = alpha 30 | self.beta_start = beta 31 | 32 | self.sum_tree = [] 33 | self.transitions = [] 34 | 35 | def _insert(self, transition: List): 36 | if self.counter < self.max_size: 37 | self.transitions.append(transition) 38 | self.sum_tree.append(Node()) 39 | else: 40 | index = self.counter % self.max_size 41 | self.transitions[index] = transition 42 | self.counter += 1 43 | 44 | def store_transition(self, transition: List): 45 | self._insert(transition) 46 | 47 | def _calculate_parents(self, index: int): 48 | parents = [] 49 | while index > 0: 50 | parents.append(int((index-1)//2)) 51 | index = int((index-1)//2) 52 | return parents 53 | 54 | def update_priorities(self, indices: List, priorities: List): 55 | self._propagate_changes(indices, priorities) 56 | 57 | def _propagate_changes(self, indices: List, priorities: List): 58 | for idx, p in zip(indices, priorities): 59 | delta = self.sum_tree[idx].update_priority(p**self.alpha) 60 | parents = 
self._calculate_parents(idx) 61 | for parent in parents: 62 | self.sum_tree[parent].update_total(delta) 63 | 64 | def _sample(self): 65 | total_weight = self.sum_tree[0].total 66 | 67 | if total_weight == 0.01: 68 | samples = np.random.choice(self.batch_size, self.batch_size, 69 | replace=False) 70 | probs = [1 / self.batch_size for _ in range(self.batch_size)] 71 | return samples, probs 72 | 73 | samples, probs, n_samples = [], [], 1 74 | index = self.counter % self.max_size - 1 75 | samples.append(index) 76 | probs.append(self.sum_tree[index].value / self.sum_tree[0].total) 77 | while n_samples < self.batch_size: 78 | index = 0 79 | target = total_weight * np.random.random() 80 | while True: 81 | left = 2 * index + 1 82 | right = 2 * index + 2 83 | if left > len(self.sum_tree) - 1\ 84 | or right > len(self.sum_tree) - 1: 85 | break 86 | left_sum = self.sum_tree[left].total 87 | if target < left_sum: 88 | index = left 89 | continue 90 | target -= left_sum 91 | right_sum = self.sum_tree[right].total 92 | if target < right_sum: 93 | index = right 94 | continue 95 | target -= right_sum 96 | break 97 | samples.append(index) 98 | n_samples += 1 99 | probs.append(self.sum_tree[index].value / self.sum_tree[0].total) 100 | return samples, probs 101 | 102 | def sample(self): 103 | samples, probs = self._sample() 104 | weights = self._calculate_weights(probs) 105 | mems = [self.transitions[s] for s in samples] 106 | return mems, samples, weights 107 | 108 | def _calculate_weights(self, probs: List): 109 | weights = np.array([(1 / self.counter * 1 / prob)**self.beta 110 | for prob in probs]) 111 | weights *= 1 / max(weights) 112 | return weights 113 | 114 | def ready(self): 115 | return self.counter >= self.batch_size 116 | 117 | def anneal_beta(self, ep: int, ep_max: int): 118 | self.beta = self.beta_start + ep / ep_max * (1 - self.beta_start) 119 | 120 | def anneal_alpha(self, ep: int, ep_max: int): 121 | self.alpha = self.alpha_start * (1 - ep / ep_max) 122 | -------------------------------------------------------------------------------- /HER/robotic/her.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import numpy as np 3 | 4 | 5 | class HER: 6 | def __init__(self, max_mem, input_shape, n_actions, goal_shape, batch_size, 7 | reward_fn, strategy='final', k=4): 8 | self.max_mem = max_mem 9 | self.strategy = strategy 10 | self.mem_cntr = 0 11 | self.batch_size = batch_size 12 | self.input_shape = input_shape 13 | self.reward_fn = reward_fn 14 | self.k = k 15 | self.lock = threading.Lock() 16 | 17 | self.states = np.zeros((max_mem, input_shape), 18 | dtype=np.float64) 19 | self.states_ = np.zeros((max_mem, input_shape), 20 | dtype=np.float64) 21 | self.actions = np.zeros((max_mem, n_actions), 22 | dtype=np.float32) 23 | self.rewards = np.zeros(max_mem, dtype=np.float32) 24 | self.dones = np.zeros(max_mem, dtype=np.bool) 25 | self.desired_goals = np.zeros((max_mem, goal_shape), dtype=np.float64) 26 | self.achieved_goals = np.zeros((max_mem, goal_shape), dtype=np.float64) 27 | self.achieved_goals_ = np.zeros((max_mem, goal_shape), 28 | dtype=np.float64) 29 | 30 | def store_memory(self, state, action, reward, state_, done, 31 | d_goal, a_goal, a_goal_): 32 | with self.lock: 33 | index = self.mem_cntr % self.max_mem 34 | self.states[index] = state 35 | self.states_[index] = state_ 36 | self.actions[index] = action 37 | self.rewards[index] = reward 38 | self.dones[index] = done 39 | self.desired_goals[index] = d_goal 40 | 
self.achieved_goals[index] = a_goal 41 | self.achieved_goals_[index] = a_goal_ 42 | self.mem_cntr += 1 43 | 44 | def store_episode(self, ep_memory): 45 | states, actions, rewards, states_, dones, dg, ag, ag_ = ep_memory 46 | hindsight_goals = [] 47 | 48 | if self.strategy == 'final': 49 | hindsight_goals = [[ag_[-1]]] * len(ag_) 50 | 51 | elif self.strategy is None: 52 | hindsight_goals = [[dg[0]]] * len(dg) 53 | 54 | elif self.strategy == 'future': 55 | for idx, _ in enumerate(ag_): 56 | t_step_goals = [] 57 | for m in range(self.k): 58 | if idx + m >= len(ag_) - 1: 59 | break 60 | goal_idx = np.random.randint(idx + 1, len(ag_)) 61 | t_step_goals.append(ag_[goal_idx]) 62 | hindsight_goals.append(t_step_goals) 63 | 64 | elif self.strategy == 'random': 65 | if self.mem_cntr <= len(ag_): 66 | max_mem = len(ag_) 67 | memory = ag_ 68 | else: 69 | max_mem = min(self.mem_cntr, self.max_mem) 70 | memory = self.achieved_goals_ 71 | for idx, _ in enumerate(ag_): 72 | t_step_goals = [] 73 | for m in range(self.k): 74 | goal_idx = np.random.randint(0, max_mem) 75 | t_step_goals.append(memory[goal_idx]) 76 | hindsight_goals.append(t_step_goals) 77 | 78 | elif self.strategy == 'episode': 79 | for idx, _ in enumerate(ag_): 80 | t_step_goals = [] 81 | for m in range(self.k): 82 | goal_idx = np.random.randint(0, len(ag_)) 83 | t_step_goals.append(ag_[goal_idx]) 84 | hindsight_goals.append(t_step_goals) 85 | 86 | for idx, s in enumerate(states): 87 | self.store_memory(s, actions[idx], rewards[idx], states_[idx], 88 | dones[idx], dg[idx], ag[idx], ag_[idx]) 89 | for goal in hindsight_goals[idx]: 90 | reward = self.reward_fn(ag_[idx], goal, {}) 91 | self.store_memory(s, actions[idx], reward, states_[idx], 92 | dones[idx], goal, ag[idx], ag_[idx]) 93 | 94 | def sample_memory(self): 95 | with self.lock: 96 | last_mem = min(self.mem_cntr, self.max_mem) 97 | batch = np.random.choice(last_mem, self.batch_size, replace=False) 98 | 99 | return self.states[batch], self.actions[batch], self.rewards[batch],\ 100 | self.states_[batch], self.dones[batch],\ 101 | self.desired_goals[batch] 102 | 103 | def ready(self): 104 | return self.mem_cntr > self.batch_size 105 | -------------------------------------------------------------------------------- /PER/proportional/atari/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from network import DeepQNetwork 4 | from memory import SumTree 5 | 6 | 7 | class Agent: 8 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 9 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 10 | replace=1000, alpha=0.5, beta=0, r_iter=32, 11 | algo=None, env_name=None, chkpt_dir='tmp/dqn'): 12 | self.gamma = gamma 13 | self.epsilon = epsilon 14 | self.lr = lr 15 | self.n_actions = n_actions 16 | self.input_dims = input_dims 17 | self.batch_size = batch_size 18 | self.eps_min = eps_min 19 | self.eps_dec = eps_dec 20 | self.replace_target_cnt = replace 21 | self.algo = algo 22 | self.env_name = env_name 23 | self.chkpt_dir = chkpt_dir 24 | self.action_space = [i for i in range(n_actions)] 25 | self.learn_step_counter = 0 26 | self.rebalance_iter = r_iter 27 | 28 | self.memory = SumTree(mem_size, batch_size, alpha=alpha, beta=beta) 29 | 30 | self.q_eval = DeepQNetwork(self.lr, self.n_actions, 31 | input_dims=self.input_dims, 32 | name=self.env_name+'_q_eval', 33 | chkpt_dir=self.chkpt_dir) 34 | 35 | self.q_next = DeepQNetwork(self.lr, self.n_actions, 36 | input_dims=self.input_dims, 37 | 
name=self.env_name+'_q_next', 38 | chkpt_dir=self.chkpt_dir) 39 | 40 | def choose_action(self, observation): 41 | if np.random.random() > self.epsilon: 42 | state = T.tensor([observation], 43 | dtype=T.float).to(self.q_eval.device) 44 | actions = self.q_eval.forward(state) 45 | action = T.argmax(actions).item() 46 | else: 47 | action = np.random.choice(self.action_space) 48 | 49 | return action 50 | 51 | def store_transition(self, state, action, reward, state_, done): 52 | self.memory.store_transition([state, action, reward, state_, done]) 53 | 54 | def sample_memory(self): 55 | sarsd, sample_idx, weights = self.memory.sample() 56 | 57 | states, actions, rewards, states_, dones = sarsd 58 | 59 | states = T.tensor(states, dtype=T.float).to(self.q_eval.device) 60 | rewards = T.tensor(rewards, dtype=T.float).to(self.q_eval.device) 61 | dones = T.tensor(dones).to(self.q_eval.device) 62 | actions = T.tensor(actions).to(self.q_eval.device) 63 | states_ = T.tensor(states_, dtype=T.float).to(self.q_eval.device) 64 | 65 | weights = T.tensor(weights, dtype=T.float).to(self.q_eval.device) 66 | 67 | return states, actions, rewards, states_, dones, sample_idx, weights 68 | 69 | def replace_target_network(self): 70 | if self.learn_step_counter % self.replace_target_cnt == 0: 71 | self.q_next.load_state_dict(self.q_eval.state_dict()) 72 | 73 | def decrement_epsilon(self): 74 | self.epsilon = self.epsilon - self.eps_dec \ 75 | if self.epsilon > self.eps_min else self.eps_min 76 | 77 | def save_models(self): 78 | self.q_eval.save_checkpoint() 79 | self.q_next.save_checkpoint() 80 | 81 | def load_models(self): 82 | self.q_eval.load_checkpoint() 83 | self.q_next.load_checkpoint() 84 | 85 | def learn(self): 86 | if not self.memory.ready(): 87 | return 88 | 89 | self.q_eval.optimizer.zero_grad() 90 | 91 | self.replace_target_network() 92 | 93 | states, actions, rewards, states_, dones,\ 94 | sample_idx, weights = self.sample_memory() 95 | indices = np.arange(self.batch_size) 96 | 97 | q_pred = self.q_eval.forward(states)[indices, actions] 98 | q_next = self.q_next.forward(states_) 99 | q_eval = self.q_eval.forward(states_) 100 | 101 | max_actions = T.argmax(q_eval, dim=1) 102 | q_next[dones] = 0.0 103 | q_target = rewards + self.gamma * q_next[indices, max_actions] 104 | 105 | td_error = np.abs((q_target.detach().cpu().numpy() - 106 | q_pred.detach().cpu().numpy())) 107 | td_error = np.clip(td_error, 0., 1.) 
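        # The clipped absolute TD error is written back as each sampled transition's
        # new priority (update_priorities() stores (|delta| + 1e-3)**alpha in the sum
        # tree), and the importance-sampling weights returned by sample() are applied
        # to both the targets and the predictions before the MSE loss below.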
108 | 109 | self.memory.update_priorities(sample_idx, td_error) 110 | 111 | q_target *= weights 112 | q_pred *= weights 113 | 114 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 115 | loss.backward() 116 | self.q_eval.optimizer.step() 117 | self.learn_step_counter += 1 118 | self.decrement_epsilon() 119 | -------------------------------------------------------------------------------- /PER/ranked/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 38 | fire_first=False): 39 | super(RepeatActionAndMaxFrame, self).__init__(env) 40 | self.repeat = repeat 41 | self.shape = env.observation_space.low.shape 42 | self.frame_buffer = np.zeros_like((2, self.shape)) 43 | self.clip_reward = clip_reward 44 | self.no_ops = no_ops 45 | self.fire_first = fire_first 46 | 47 | def step(self, action): 48 | t_reward = 0.0 49 | done = False 50 | for i in range(self.repeat): 51 | obs, reward, done, info = self.env.step(action) 52 | if self.clip_reward: 53 | reward = np.clip(np.array([reward]), -1, 1)[0] 54 | t_reward += reward 55 | idx = i % 2 56 | self.frame_buffer[idx] = obs 57 | if done: 58 | break 59 | 60 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 61 | return max_frame, t_reward, done, info 62 | 63 | def reset(self): 64 | obs = self.env.reset() 65 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 66 | for _ in range(no_ops): 67 | _, _, done, _ = self.env.step(0) 68 | if done: 69 | self.env.reset() 70 | if self.fire_first: 71 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 72 | obs, _, _, _ = self.env.step(1) 73 | 74 | self.frame_buffer = np.zeros_like((2,self.shape)) 75 | self.frame_buffer[0] = obs 76 | 77 | return obs 78 | 79 | class PreprocessFrame(gym.ObservationWrapper): 80 | def __init__(self, shape, env=None): 81 | super(PreprocessFrame, self).__init__(env) 82 | self.shape = (shape[2], shape[0], shape[1]) 83 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 84 | shape=self.shape, dtype=np.float32) 85 | 86 | def observation(self, obs): 87 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 88 | resized_screen = cv2.resize(new_frame, self.shape[1:], 89 | interpolation=cv2.INTER_AREA) 90 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 91 | new_obs = new_obs / 255.0 92 | 93 | return new_obs 94 | 95 | class 
StackFrames(gym.ObservationWrapper): 96 | def __init__(self, env, repeat): 97 | super(StackFrames, self).__init__(env) 98 | self.observation_space = gym.spaces.Box( 99 | env.observation_space.low.repeat(repeat, axis=0), 100 | env.observation_space.high.repeat(repeat, axis=0), 101 | dtype=np.float32) 102 | self.stack = collections.deque(maxlen=repeat) 103 | 104 | def reset(self): 105 | self.stack.clear() 106 | observation = self.env.reset() 107 | for _ in range(self.stack.maxlen): 108 | self.stack.append(observation) 109 | 110 | return np.array(self.stack).reshape(self.observation_space.low.shape) 111 | 112 | def observation(self, observation): 113 | self.stack.append(observation) 114 | 115 | return np.array(self.stack).reshape(self.observation_space.low.shape) 116 | 117 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 118 | no_ops=0, fire_first=False): 119 | env = gym.make(env_name) 120 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 121 | env = PreprocessFrame(shape, env) 122 | env = StackFrames(env, repeat) 123 | 124 | return env 125 | -------------------------------------------------------------------------------- /PER/proportional/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 38 | fire_first=False): 39 | super(RepeatActionAndMaxFrame, self).__init__(env) 40 | self.repeat = repeat 41 | self.shape = env.observation_space.low.shape 42 | self.frame_buffer = np.zeros_like((2, self.shape)) 43 | self.clip_reward = clip_reward 44 | self.no_ops = no_ops 45 | self.fire_first = fire_first 46 | 47 | def step(self, action): 48 | t_reward = 0.0 49 | done = False 50 | for i in range(self.repeat): 51 | obs, reward, done, info = self.env.step(action) 52 | if self.clip_reward: 53 | reward = np.clip(np.array([reward]), -1, 1)[0] 54 | t_reward += reward 55 | idx = i % 2 56 | self.frame_buffer[idx] = obs 57 | if done: 58 | break 59 | 60 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 61 | return max_frame, t_reward, done, info 62 | 63 | def reset(self): 64 | obs = self.env.reset() 65 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 66 | for _ in range(no_ops): 67 | _, _, done, _ = self.env.step(0) 68 | if done: 69 | self.env.reset() 70 | if self.fire_first: 71 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 72 | obs, _, 
_, _ = self.env.step(1) 73 | 74 | self.frame_buffer = np.zeros_like((2,self.shape)) 75 | self.frame_buffer[0] = obs 76 | 77 | return obs 78 | 79 | class PreprocessFrame(gym.ObservationWrapper): 80 | def __init__(self, shape, env=None): 81 | super(PreprocessFrame, self).__init__(env) 82 | self.shape = (shape[2], shape[0], shape[1]) 83 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 84 | shape=self.shape, dtype=np.float32) 85 | 86 | def observation(self, obs): 87 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 88 | resized_screen = cv2.resize(new_frame, self.shape[1:], 89 | interpolation=cv2.INTER_AREA) 90 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 91 | new_obs = new_obs / 255.0 92 | 93 | return new_obs 94 | 95 | class StackFrames(gym.ObservationWrapper): 96 | def __init__(self, env, repeat): 97 | super(StackFrames, self).__init__(env) 98 | self.observation_space = gym.spaces.Box( 99 | env.observation_space.low.repeat(repeat, axis=0), 100 | env.observation_space.high.repeat(repeat, axis=0), 101 | dtype=np.float32) 102 | self.stack = collections.deque(maxlen=repeat) 103 | 104 | def reset(self): 105 | self.stack.clear() 106 | observation = self.env.reset() 107 | for _ in range(self.stack.maxlen): 108 | self.stack.append(observation) 109 | 110 | return np.array(self.stack).reshape(self.observation_space.low.shape) 111 | 112 | def observation(self, observation): 113 | self.stack.append(observation) 114 | 115 | return np.array(self.stack).reshape(self.observation_space.low.shape) 116 | 117 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 118 | no_ops=0, fire_first=False): 119 | env = gym.make(env_name) 120 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 121 | env = PreprocessFrame(shape, env) 122 | env = StackFrames(env, repeat) 123 | 124 | return env 125 | -------------------------------------------------------------------------------- /PER/proportional/atari/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 38 | fire_first=False): 39 | super(RepeatActionAndMaxFrame, self).__init__(env) 40 | self.repeat = repeat 41 | self.shape = env.observation_space.low.shape 42 | self.frame_buffer = np.zeros_like((2, self.shape)) 43 | self.clip_reward = clip_reward 44 | self.no_ops = no_ops 45 | self.fire_first = fire_first 46 | 47 | def 
step(self, action): 48 | t_reward = 0.0 49 | done = False 50 | for i in range(self.repeat): 51 | obs, reward, done, info = self.env.step(action) 52 | if self.clip_reward: 53 | reward = np.clip(np.array([reward]), -1, 1)[0] 54 | t_reward += reward 55 | idx = i % 2 56 | self.frame_buffer[idx] = obs 57 | if done: 58 | break 59 | 60 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 61 | return max_frame, t_reward, done, info 62 | 63 | def reset(self): 64 | obs = self.env.reset() 65 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 66 | for _ in range(no_ops): 67 | _, _, done, _ = self.env.step(0) 68 | if done: 69 | self.env.reset() 70 | if self.fire_first: 71 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 72 | obs, _, _, _ = self.env.step(1) 73 | 74 | self.frame_buffer = np.zeros_like((2,self.shape)) 75 | self.frame_buffer[0] = obs 76 | 77 | return obs 78 | 79 | class PreprocessFrame(gym.ObservationWrapper): 80 | def __init__(self, shape, env=None): 81 | super(PreprocessFrame, self).__init__(env) 82 | self.shape = (shape[2], shape[0], shape[1]) 83 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 84 | shape=self.shape, dtype=np.float32) 85 | 86 | def observation(self, obs): 87 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 88 | resized_screen = cv2.resize(new_frame, self.shape[1:], 89 | interpolation=cv2.INTER_AREA) 90 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 91 | new_obs = new_obs / 255.0 92 | 93 | return new_obs 94 | 95 | class StackFrames(gym.ObservationWrapper): 96 | def __init__(self, env, repeat): 97 | super(StackFrames, self).__init__(env) 98 | self.observation_space = gym.spaces.Box( 99 | env.observation_space.low.repeat(repeat, axis=0), 100 | env.observation_space.high.repeat(repeat, axis=0), 101 | dtype=np.float32) 102 | self.stack = collections.deque(maxlen=repeat) 103 | 104 | def reset(self): 105 | self.stack.clear() 106 | observation = self.env.reset() 107 | for _ in range(self.stack.maxlen): 108 | self.stack.append(observation) 109 | 110 | return np.array(self.stack).reshape(self.observation_space.low.shape) 111 | 112 | def observation(self, observation): 113 | self.stack.append(observation) 114 | 115 | return np.array(self.stack).reshape(self.observation_space.low.shape) 116 | 117 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 118 | no_ops=0, fire_first=False): 119 | env = gym.make(env_name) 120 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 121 | env = PreprocessFrame(shape, env) 122 | env = StackFrames(env, repeat) 123 | 124 | return env 125 | -------------------------------------------------------------------------------- /PER/proportional/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from network import LinearDeepQNetwork 4 | from memory import SumTree 5 | 6 | 7 | class Agent: 8 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 9 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 10 | replace=1000, alpha=0.5, beta=0, r_iter=32, 11 | algo=None, env_name=None, chkpt_dir='tmp/dqn'): 12 | self.gamma = gamma 13 | self.epsilon = epsilon 14 | self.lr = lr 15 | self.n_actions = n_actions 16 | self.input_dims = input_dims 17 | self.batch_size = batch_size 18 | self.eps_min = eps_min 19 | self.eps_dec = eps_dec 20 | self.replace_target_cnt = replace 21 | self.algo = algo 22 | self.env_name = env_name 23 | self.chkpt_dir 
= chkpt_dir 24 | self.action_space = [i for i in range(n_actions)] 25 | self.learn_step_counter = 0 26 | self.rebalance_iter = r_iter 27 | 28 | self.memory = SumTree(mem_size, batch_size, alpha=alpha, beta=beta) 29 | 30 | self.q_eval = LinearDeepQNetwork(self.lr, self.n_actions, 31 | input_dims=self.input_dims, 32 | name=self.env_name+'_q_eval', 33 | chkpt_dir=self.chkpt_dir) 34 | 35 | self.q_next = LinearDeepQNetwork(self.lr, self.n_actions, 36 | input_dims=self.input_dims, 37 | name=self.env_name+'_q_next', 38 | chkpt_dir=self.chkpt_dir) 39 | 40 | def choose_action(self, observation): 41 | if np.random.random() > self.epsilon: 42 | state = T.tensor([observation], 43 | dtype=T.float).to(self.q_eval.device) 44 | actions = self.q_eval.forward(state) 45 | action = T.argmax(actions).item() 46 | else: 47 | action = np.random.choice(self.action_space) 48 | 49 | return action 50 | 51 | def store_transition(self, state, action, reward, state_, done): 52 | self.memory.store_transition([state, action, reward, state_, done]) 53 | 54 | def sample_memory(self): 55 | sarsd, sample_idx, weights = self.memory.sample() 56 | 57 | states = np.array([row[0] for row in sarsd]) 58 | actions = np.array([row[1] for row in sarsd]) 59 | rewards = np.array([row[2] for row in sarsd]) 60 | states_ = np.array([row[3] for row in sarsd]) 61 | dones = np.array([row[4] for row in sarsd]) 62 | 63 | states = T.tensor(states, dtype=T.float).to(self.q_eval.device) 64 | rewards = T.tensor(rewards, dtype=T.float).to(self.q_eval.device) 65 | dones = T.tensor(dones).to(self.q_eval.device) 66 | actions = T.tensor(actions).to(self.q_eval.device) 67 | states_ = T.tensor(states_, dtype=T.float).to(self.q_eval.device) 68 | 69 | weights = T.tensor(weights, dtype=T.float).to(self.q_eval.device) 70 | 71 | return states, actions, rewards, states_, dones, sample_idx, weights 72 | 73 | def replace_target_network(self): 74 | if self.learn_step_counter % self.replace_target_cnt == 0: 75 | self.q_next.load_state_dict(self.q_eval.state_dict()) 76 | 77 | def decrement_epsilon(self): 78 | self.epsilon = self.epsilon - self.eps_dec \ 79 | if self.epsilon > self.eps_min else self.eps_min 80 | 81 | def save_models(self): 82 | self.q_eval.save_checkpoint() 83 | self.q_next.save_checkpoint() 84 | 85 | def load_models(self): 86 | self.q_eval.load_checkpoint() 87 | self.q_next.load_checkpoint() 88 | 89 | def learn(self): 90 | if not self.memory.ready(): 91 | return 92 | 93 | self.q_eval.optimizer.zero_grad() 94 | 95 | self.replace_target_network() 96 | 97 | states, actions, rewards, states_, dones,\ 98 | sample_idx, weights = self.sample_memory() 99 | indices = np.arange(self.batch_size) 100 | 101 | q_pred = self.q_eval.forward(states)[indices, actions] 102 | q_next = self.q_next.forward(states_) 103 | q_eval = self.q_eval.forward(states_) 104 | 105 | max_actions = T.argmax(q_eval, dim=1) 106 | q_next[dones] = 0.0 107 | q_target = rewards + self.gamma * q_next[indices, max_actions] 108 | 109 | td_error = np.abs((q_target.detach().cpu().numpy() - 110 | q_pred.detach().cpu().numpy())) 111 | td_error = np.clip(td_error, 0., 1.) 
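        # Multiplying both q_target and q_pred by the importance weights scales each
        # squared TD error by weight**2; the weighting in Schaul et al. applies each
        # weight once, which would instead look like
        # loss = (weights * (q_target - q_pred) ** 2).mean()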
112 | 113 | self.memory.update_priorities(sample_idx, td_error) 114 | 115 | q_target *= weights 116 | q_pred *= weights 117 | 118 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 119 | loss.backward() 120 | self.q_eval.optimizer.step() 121 | self.learn_step_counter += 1 122 | self.decrement_epsilon() 123 | -------------------------------------------------------------------------------- /PER/ranked/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from network import LinearDeepQNetwork 4 | from memory import MaxHeap 5 | 6 | 7 | class DQNAgent: 8 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 9 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 10 | replace=1000, alpha=0.5, beta=0, r_iter=32, 11 | algo=None, env_name=None, chkpt_dir='tmp/dqn'): 12 | self.gamma = gamma 13 | self.epsilon = epsilon 14 | self.lr = lr 15 | self.n_actions = n_actions 16 | self.input_dims = input_dims 17 | self.batch_size = batch_size 18 | self.eps_min = eps_min 19 | self.eps_dec = eps_dec 20 | self.replace_target_cnt = replace 21 | self.algo = algo 22 | self.env_name = env_name 23 | self.chkpt_dir = chkpt_dir 24 | self.action_space = [i for i in range(n_actions)] 25 | self.learn_step_counter = 0 26 | self.rebalance_iter = r_iter 27 | 28 | self.memory = MaxHeap(mem_size, batch_size, alpha=alpha, beta=beta, 29 | r_iter=r_iter) 30 | 31 | self.q_eval = LinearDeepQNetwork(self.lr, self.n_actions, 32 | input_dims=self.input_dims, 33 | name=self.env_name+'_'+self.algo+'_q_eval', 34 | chkpt_dir=self.chkpt_dir) 35 | 36 | self.q_next = LinearDeepQNetwork(self.lr, self.n_actions, 37 | input_dims=self.input_dims, 38 | name=self.env_name+'_'+self.algo+'_q_next', 39 | chkpt_dir=self.chkpt_dir) 40 | 41 | def choose_action(self, observation): 42 | if np.random.random() > self.epsilon: 43 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 44 | actions = self.q_eval.forward(state) 45 | action = T.argmax(actions).item() 46 | else: 47 | action = np.random.choice(self.action_space) 48 | 49 | return action 50 | 51 | def store_transition(self, state, action, reward, state_, done): 52 | self.memory.store_transition([state, action, reward, state_, done]) 53 | 54 | def sample_memory(self): 55 | sarsd, sample_idx, weights = self.memory.sample() 56 | states = np.array([row[0] for row in sarsd]) 57 | actions = np.array([row[1] for row in sarsd]) 58 | rewards = np.array([row[2] for row in sarsd]) 59 | states_ = np.array([row[3] for row in sarsd]) 60 | dones = np.array([row[4] for row in sarsd]) 61 | 62 | states = T.tensor(states, dtype=T.float).to(self.q_eval.device) 63 | rewards = T.tensor(rewards, dtype=T.float).to(self.q_eval.device) 64 | dones = T.tensor(dones).to(self.q_eval.device) 65 | actions = T.tensor(actions).to(self.q_eval.device) 66 | states_ = T.tensor(states_, dtype=T.float).to(self.q_eval.device) 67 | 68 | weights = T.tensor(weights, dtype=T.float).to(self.q_eval.device) 69 | 70 | return states, actions, rewards, states_, dones, sample_idx, weights 71 | 72 | def replace_target_network(self): 73 | if self.learn_step_counter % self.replace_target_cnt == 0: 74 | self.q_next.load_state_dict(self.q_eval.state_dict()) 75 | 76 | def decrement_epsilon(self): 77 | self.epsilon = self.epsilon - self.eps_dec \ 78 | if self.epsilon > self.eps_min else self.eps_min 79 | 80 | def save_models(self): 81 | self.q_eval.save_checkpoint() 82 | self.q_next.save_checkpoint() 83 | 84 | def 
load_models(self): 85 | self.q_eval.load_checkpoint() 86 | self.q_next.load_checkpoint() 87 | 88 | def rebalance_heap(self): 89 | if self.rebalance_iter > 1: 90 | if self.learn_step_counter % self.rebalance_iter == 0: 91 | self.memory.rebalance_heap() 92 | self.memory._update_ranks() 93 | self.memory.compute_probs() 94 | 95 | def learn(self): 96 | if not self.memory.ready(): 97 | return 98 | 99 | self.q_eval.optimizer.zero_grad() 100 | 101 | self.replace_target_network() 102 | 103 | self.rebalance_heap() 104 | 105 | states, actions, rewards, states_, dones,\ 106 | sample_idx, weights = self.sample_memory() 107 | indices = np.arange(self.batch_size) 108 | q_pred = self.q_eval.forward(states)[indices, actions] 109 | 110 | q_next = self.q_next.forward(states_).max(dim=1)[0] 111 | q_next[dones] = 0.0 112 | q_target = rewards + self.gamma*q_next 113 | 114 | td_error = np.abs((q_target.detach().cpu().numpy() - 115 | q_pred.detach().cpu().numpy())) 116 | td_error = np.clip(td_error, -1., 1.) 117 | 118 | self.memory.update_priorities(sample_idx, td_error) 119 | 120 | q_target *= weights 121 | q_pred *= weights 122 | 123 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 124 | loss.backward() 125 | self.q_eval.optimizer.step() 126 | self.learn_step_counter += 1 127 | self.decrement_epsilon() 128 | -------------------------------------------------------------------------------- /PER/proportional/atari/memory.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | import numpy as np 4 | 5 | 6 | @dataclass 7 | class Node: 8 | value: float = 0.01 9 | total: float = 0.01 10 | 11 | def update_priority(self, priority: float): 12 | delta = priority - self.value 13 | self.value = priority 14 | self.total += delta 15 | return delta 16 | 17 | def update_total(self, delta: float): 18 | self.total += delta 19 | 20 | 21 | class SumTree: 22 | def __init__(self, max_size: int = 1_00_000, batch_size: int = 32, 23 | alpha: float = 0.5, beta: float = 0.5, 24 | input_shape=(4, 84, 84)): 25 | self.counter = 0 26 | self.max_size = max_size 27 | self.batch_size = batch_size 28 | self.alpha = alpha 29 | self.beta = beta 30 | self.alpha_start = alpha 31 | self.beta_start = beta 32 | 33 | self.sum_tree = [] 34 | self.states = np.zeros(shape=(max_size, *input_shape), 35 | dtype=np.float16) 36 | self.actions = np.zeros(shape=(max_size,), dtype=np.int64) 37 | self.rewards = np.zeros(shape=(max_size,), dtype=np.float16) 38 | self.states_ = np.zeros(shape=(max_size, *input_shape), 39 | dtype=np.float16) 40 | self.dones = np.zeros(shape=(max_size,), dtype=np.bool) 41 | 42 | def _insert(self, transition: List): 43 | state, action, reward, state_, done = transition 44 | index = self.counter % self.max_size 45 | self.states[index] = state 46 | self.actions[index] = action 47 | self.rewards[index] = reward 48 | self.states_[index] = state_ 49 | self.dones[index] = done 50 | if self.counter < self.max_size: 51 | self.sum_tree.append(Node()) 52 | self.counter += 1 53 | 54 | def store_transition(self, transition: List): 55 | self._insert(transition) 56 | 57 | def _calculate_parents(self, index: int): 58 | parents = [] 59 | while index > 0: 60 | parents.append(int((index-1)//2)) 61 | index = int((index-1)//2) 62 | return parents 63 | 64 | def update_priorities(self, indices: List, priorities: List): 65 | self._propagate_changes(indices, priorities) 66 | 67 | def _propagate_changes(self, indices: List, priorities: List): 68 | 
for idx, p in zip(indices, priorities): 69 | delta = self.sum_tree[idx].update_priority((p+1e-3)**self.alpha) 70 | parents = self._calculate_parents(idx) 71 | for parent in parents: 72 | self.sum_tree[parent].update_total(delta) 73 | 74 | def _sample(self): 75 | total_weight = self.sum_tree[0].total 76 | 77 | if total_weight == 0.01: 78 | samples = np.random.choice(self.batch_size, self.batch_size, 79 | replace=False) 80 | probs = [1 / self.batch_size for _ in range(self.batch_size)] 81 | return samples, probs 82 | 83 | samples, probs, n_samples = [], [], 1 84 | index = self.counter % self.max_size - 1 85 | samples.append(index) 86 | probs.append(self.sum_tree[index].value / self.sum_tree[0].total) 87 | while n_samples < self.batch_size: 88 | index = 0 89 | target = total_weight * np.random.random() 90 | while True: 91 | left = 2 * index + 1 92 | right = 2 * index + 2 93 | if left > len(self.sum_tree) - 1\ 94 | or right > len(self.sum_tree) - 1: 95 | break 96 | left_sum = self.sum_tree[left].total 97 | if target < left_sum: 98 | index = left 99 | continue 100 | target -= left_sum 101 | right_sum = self.sum_tree[right].total 102 | if target < right_sum: 103 | index = right 104 | continue 105 | target -= right_sum 106 | break 107 | samples.append(index) 108 | n_samples += 1 109 | probs.append(self.sum_tree[index].value / self.sum_tree[0].total) 110 | return samples, probs 111 | 112 | def sample(self): 113 | samples, probs = self._sample() 114 | weights = self._calculate_weights(probs) 115 | mems = [self.states[samples], self.actions[samples], 116 | self.rewards[samples], self.states_[samples], 117 | self.dones[samples]] 118 | return mems, samples, weights 119 | 120 | def _calculate_weights(self, probs: List): 121 | weights = np.array([(1 / self.counter * 1 / prob)**self.beta 122 | for prob in probs]) 123 | weights *= 1 / max(weights) 124 | return weights 125 | 126 | def ready(self): 127 | return self.counter >= self.batch_size 128 | 129 | def anneal_beta(self, ep: int, ep_max: int): 130 | self.beta = self.beta_start + ep / ep_max * (1 - self.beta_start) 131 | 132 | def anneal_alpha(self, ep: int, ep_max: int): 133 | self.alpha = self.alpha_start * (1 - ep / ep_max) 134 | -------------------------------------------------------------------------------- /PER/ranked/memory.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from dataclasses import dataclass, field 3 | from typing import List 4 | import numpy as np 5 | 6 | 7 | @dataclass 8 | class MemoryCell: 9 | priority: float 10 | rank: int 11 | transition: List[np.array] = field(repr=False) 12 | 13 | def update_priority(self, new_priority: float): 14 | self.priority = new_priority 15 | 16 | def update_rank(self, new_rank: int): 17 | self.rank = new_rank 18 | 19 | def __gt__(self, other): 20 | return self.priority > other.priority 21 | 22 | def __ge__(self, other): 23 | return self.priority >= other.priority 24 | 25 | def __lt__(self, other): 26 | return self.priority < other.priority 27 | 28 | def __le__(self, other): 29 | return self.priority < other.priority 30 | 31 | 32 | class MaxHeap: 33 | def __init__(self, max_size: int = 1e6, n_batches: int = 32, 34 | alpha: float = 0.5, beta: float = 0, r_iter: int = 32): 35 | self.array: List[MemoryCell] = [] 36 | self.max_size = max_size 37 | self.mem_cntr: int = 0 38 | self.n_batches = n_batches 39 | self.alpha = alpha 40 | self.beta = beta 41 | self.beta_start = beta 42 | self.alpha_start = alpha 43 | self.r_iter = r_iter 44 | 
self._precompute_indices() 45 | 46 | def store_transition(self, sarsd: List[np.array]): 47 | priority = 10 48 | rank = 1 49 | transition = MemoryCell(priority, rank, sarsd) 50 | self._insert(transition) 51 | 52 | def _insert(self, transition: MemoryCell): 53 | if self.mem_cntr < self.max_size: 54 | self.array.append(transition) 55 | else: 56 | index = self.mem_cntr % self.max_size 57 | self.array[index] = transition 58 | self.mem_cntr += 1 59 | 60 | def _update_ranks(self): 61 | array = deepcopy(self.array) 62 | indices = [i for i in range(len(array))] 63 | sorted_array = [list(x) for x in zip(*sorted(zip(array, indices), 64 | key=lambda pair: pair[0], 65 | reverse=True))] 66 | 67 | for index, value in enumerate(sorted_array[1]): 68 | self.array[value].rank = index + 1 69 | 70 | def print_array(self, a=None): 71 | array = self.array if a is None else a 72 | for cell in array: 73 | print(cell) 74 | print('\n') 75 | 76 | def _max_heapify(self, array: List[MemoryCell], i: int, N: int = None): 77 | N = len(array) if N is None else N 78 | left = 2 * i + 1 79 | right = 2 * i + 2 80 | largest = i 81 | if left < N and array[left] > array[i]: 82 | largest = left 83 | if right < N and array[right] > array[largest]: 84 | largest = right 85 | if largest != i: 86 | array[i], array[largest] = array[largest], array[i] 87 | self._max_heapify(array, largest, N) 88 | return array 89 | 90 | def _build_max_heap(self): 91 | array = deepcopy(self.array) 92 | N = len(array) 93 | for i in range(N//2, -1, -1): 94 | array = self._max_heapify(array, i) 95 | return array 96 | 97 | def rebalance_heap(self): 98 | self.array = self._build_max_heap() 99 | 100 | def update_priorities(self, indices: List[int], priorities: List[float]): 101 | for idx, index in enumerate(indices): 102 | self.array[index].update_priority(priorities[idx]) 103 | 104 | def ready(self): 105 | return self.mem_cntr >= self.n_batches 106 | 107 | def anneal_beta(self, ep: int, ep_max: int): 108 | self.beta = self.beta_start + ep / ep_max * (1 - self.beta_start) 109 | 110 | def anneal_alpha(self, ep: int, ep_max: int): 111 | self.alpha = self.alpha_start * (1 - ep / ep_max) 112 | 113 | def _precompute_indices(self): 114 | print('precomputing indices') 115 | self.indices = [] 116 | n_batches = self.n_batches if self.r_iter > 1 else self.r_iter 117 | start = [i for i in range(n_batches, self.max_size + 1, n_batches)] 118 | for start_idx in start: 119 | bs = start_idx // n_batches 120 | indices = np.array([[j * bs + k for k in range(bs)] 121 | for j in range(n_batches)], dtype=np.int16) 122 | self.indices.append(indices) 123 | 124 | def compute_probs(self): 125 | self.probs = [] 126 | n_batches = self.n_batches if self.r_iter > 1 else self.r_iter 127 | idx = min(self.mem_cntr, self.max_size) // n_batches - 1 128 | for indices in self.indices[idx]: 129 | probs = [] 130 | for index in indices: 131 | p = 1 / (self.array[index].rank)**self.alpha 132 | probs.append(p) 133 | z = [p / sum(probs) for p in probs] 134 | self.probs.append(z) 135 | 136 | def _calculate_weights(self, probs: List): 137 | weights = np.array([(1 / self.mem_cntr * 1 / prob)**self.beta 138 | for prob in probs]) 139 | weights *= 1 / (max(weights)) 140 | return weights 141 | 142 | def sample(self): 143 | n_batches = self.n_batches if self.r_iter > 1 else self.r_iter 144 | idx = min(self.mem_cntr, self.max_size) // n_batches - 1 145 | if self.r_iter != 1: 146 | samples = [np.random.choice(self.indices[idx][row], 147 | p=self.probs[row]) 148 | for row in range(len(self.indices[idx]))] 149 | p 
= [val for row in self.probs for val in row] 150 | probs = [p[s] for s in samples] 151 | else: 152 | samples = np.random.choice(self.indices[idx][0], self.n_batches) 153 | probs = [1 / len(samples) for _ in range(len(samples))] 154 | weights = self._calculate_weights(probs) 155 | mems = np.array([self.array[s] for s in samples]) 156 | sarsd = [] 157 | for item in mems: 158 | row = [] 159 | for i in range(len(item.transition)): 160 | row.append(np.array(item.transition[i])) 161 | sarsd.append(row) 162 | return sarsd, samples, weights 163 | -------------------------------------------------------------------------------- /HER/robotic/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | import torch.nn.functional as F 4 | from torch.distributions.normal import Normal 5 | from networks import ActorNetwork, CriticNetwork 6 | from normalizer import Normalizer 7 | from utils import sync_networks, sync_grads 8 | 9 | 10 | class Agent: 11 | def __init__(self, alpha, beta, input_dims, tau, n_actions, action_space, 12 | gamma=0.99, action_noise=0.05, explore=0.2, obs_shape=[8], 13 | goal_shape=[3], max_size=1_000_000, fc1_dims=256, 14 | fc2_dims=256, fc3_dims=256): 15 | self.gamma = gamma 16 | self.tau = tau 17 | self.alpha = alpha 18 | self.beta = beta 19 | self.action_space = action_space 20 | self.n_actions = n_actions 21 | self.limit = -1 / (1 - self.gamma) 22 | self.action_noise = action_noise * self.action_space.high 23 | self.explore = explore 24 | 25 | self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, 26 | fc3_dims, n_actions=n_actions, 27 | name='actor') 28 | self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims, 29 | fc3_dims, n_actions=n_actions, 30 | name='critic') 31 | 32 | self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, 33 | fc3_dims, n_actions=n_actions, 34 | name='target_actor') 35 | 36 | self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, 37 | fc2_dims, fc3_dims, 38 | n_actions=n_actions, 39 | name='target_critic') 40 | 41 | self.noise = Normal(T.zeros(n_actions), T.tensor(self.action_noise)) 42 | 43 | self.update_network_parameters(tau=1) 44 | 45 | sync_networks(self.actor) 46 | sync_networks(self.critic) 47 | 48 | self.obs_stats = Normalizer(obs_shape, 0.01, 5) 49 | self.goal_stats = Normalizer(goal_shape, 0.01, 5) 50 | 51 | def choose_action(self, observation, evaluate): 52 | if evaluate: 53 | with T.no_grad(): 54 | state = T.tensor([observation], 55 | dtype=T.float).to(self.actor.device) 56 | _, pi = self.target_actor.forward(state) 57 | action = pi.cpu().detach().numpy().squeeze() 58 | return action 59 | if np.random.uniform() <= self.explore: 60 | action = self.action_space.sample() 61 | else: 62 | state = T.tensor([observation], 63 | dtype=T.float).to(self.actor.device) 64 | _, pi = self.actor.forward(state) 65 | noise = self.noise.sample().to(self.actor.device) 66 | action = (pi + noise).cpu().detach().numpy().squeeze() 67 | action = np.clip(action, -1., 1.) 
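        # Training-time actions are either a uniform random sample (with probability
        # self.explore) or the policy output plus Gaussian noise, clipped to [-1, 1]
        # to stay inside the environment's normalized action bounds.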
68 | return action 69 | 70 | def save_models(self): 71 | self.actor.save_checkpoint() 72 | self.target_actor.save_checkpoint() 73 | self.critic.save_checkpoint() 74 | self.target_critic.save_checkpoint() 75 | 76 | def load_models(self): 77 | self.actor.load_checkpoint() 78 | self.target_actor.load_checkpoint() 79 | self.critic.load_checkpoint() 80 | self.target_critic.load_checkpoint() 81 | 82 | def learn(self, memories): 83 | states, actions, rewards, states_, done, goals = memories 84 | states = np.concatenate([states, goals], axis=1) 85 | states_ = np.concatenate([states_, goals], axis=1) 86 | 87 | states = T.tensor(states, dtype=T.float).to(self.actor.device) 88 | states_ = T.tensor(states_, dtype=T.float).to(self.actor.device) 89 | actions = T.tensor(actions, dtype=T.float).to(self.actor.device) 90 | rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device) 91 | done = T.tensor(done).to(self.actor.device) 92 | 93 | _, target_actions = self.target_actor.forward(states_) 94 | critic_value_ = self.target_critic.forward(states_, target_actions) 95 | critic_value = self.critic.forward(states, actions) 96 | 97 | critic_value_[done] = 0.0 98 | critic_value_ = critic_value_.view(-1) 99 | critic_value = critic_value.view(-1) 100 | 101 | target = rewards + self.gamma*critic_value_ 102 | target = target.view(critic_value_.size(), 1) 103 | target = T.clamp(target, min=self.limit, max=0) 104 | 105 | self.critic.optimizer.zero_grad() 106 | critic_loss = F.mse_loss(target, critic_value) 107 | critic_loss.backward() 108 | sync_grads(self.critic) 109 | self.critic.optimizer.step() 110 | 111 | self.actor.optimizer.zero_grad() 112 | mu, pi = self.actor.forward(states) 113 | actor_loss = self.critic.forward(states, pi) 114 | actor_loss = -T.mean(actor_loss) 115 | actor_loss += mu.pow(2).mean() 116 | actor_loss.backward() 117 | sync_grads(self.actor) 118 | self.actor.optimizer.step() 119 | 120 | def update_network_parameters(self, tau=None): 121 | if tau is None: 122 | tau = self.tau 123 | 124 | actor_params = self.actor.named_parameters() 125 | critic_params = self.critic.named_parameters() 126 | target_actor_params = self.target_actor.named_parameters() 127 | target_critic_params = self.target_critic.named_parameters() 128 | 129 | critic_state_dict = dict(critic_params) 130 | actor_state_dict = dict(actor_params) 131 | target_critic_state_dict = dict(target_critic_params) 132 | target_actor_state_dict = dict(target_actor_params) 133 | 134 | for name in critic_state_dict: 135 | critic_state_dict[name] = tau*critic_state_dict[name].clone() + \ 136 | (1-tau)*target_critic_state_dict[name].clone() 137 | 138 | for name in actor_state_dict: 139 | actor_state_dict[name] = tau*actor_state_dict[name].clone() + \ 140 | (1-tau)*target_actor_state_dict[name].clone() 141 | 142 | self.target_critic.load_state_dict(critic_state_dict) 143 | self.target_actor.load_state_dict(actor_state_dict) 144 | --------------------------------------------------------------------------------
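A minimal usage sketch of the proportional SumTree buffer, mirroring the calls made by PER/proportional/agent.py. The 4-dimensional toy observations, the loop lengths, and the random stand-in TD errors below are placeholder values for illustration, not settings from the course code.

# Minimal usage sketch for the proportional SumTree buffer (illustrative only;
# the toy observations and random TD errors are made up for the example).
import numpy as np

from memory import SumTree  # PER/proportional/memory.py

buffer = SumTree(1000, 4, alpha=0.5, beta=0.5)

# Fill the buffer with a handful of fake transitions.
for _ in range(16):
    state = np.random.randn(4)
    action = np.random.randint(2)
    reward = float(np.random.rand())
    state_ = np.random.randn(4)
    done = False
    buffer.store_transition([state, action, reward, state_, done])

if buffer.ready():
    # sample() returns the stored transitions, their tree indices, and the
    # importance-sampling weights (1/N * 1/P(i))**beta, normalized by their max.
    transitions, indices, weights = buffer.sample()

    # In the agent these would be |Q_target - Q_pred| clipped to [0, 1];
    # random values stand in for them here.
    td_errors = np.clip(np.abs(np.random.randn(len(indices))), 0., 1.)
    buffer.update_priorities(indices, td_errors)

# Anneal beta toward 1 (and alpha toward 0) as training progresses.
buffer.anneal_beta(ep=10, ep_max=100)
buffer.anneal_alpha(ep=10, ep_max=100)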