├── HER ├── bitflip │ ├── results.npy │ ├── test_env.py │ ├── networks.py │ ├── test_her.py │ ├── episode.py │ ├── bit_flip.py │ ├── main.py │ ├── her.py │ └── agent.py └── robotic │ ├── wrappers.py │ ├── env_test.py │ ├── utils.py │ ├── normalizer.py │ ├── episode.py │ ├── main.py │ ├── networks.py │ ├── her.py │ └── agent.py ├── README.md └── PER ├── max_heap └── test_maxheap.py ├── ranked ├── network.py ├── main.py ├── utils.py ├── agent.py └── memory.py └── proportional ├── network.py ├── main.py ├── atari ├── main.py ├── network.py ├── wrappers.py ├── agent.py ├── utils.py └── memory.py ├── memory.py ├── utils.py └── agent.py /HER/bitflip/results.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Advanced-Replay-Strategies/HEAD/HER/bitflip/results.npy -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Source code for my new course: Advanced Replay Strategies. 2 | You can find this course at https://www.neuralnet.ai/courses 3 | -------------------------------------------------------------------------------- /HER/robotic/wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | 5 | class FlattenDictWrapper(gym.ObservationWrapper): 6 | def __init__(self, env): 7 | super(FlattenDictWrapper, self).__init__(env) 8 | 9 | def observation(self, obs): 10 | obs = np.concatenate([obs[k] for k in obs.keys()]) 11 | return obs 12 | -------------------------------------------------------------------------------- /HER/robotic/env_test.py: -------------------------------------------------------------------------------- 1 | import panda_gym 2 | import gym 3 | import time 4 | 5 | env = gym.make('PandaReach-v2', render=True) 6 | 7 | for _ in range(100): 8 | obs = env.reset() 9 | done = False 10 | while not done: 11 | action = env.action_space.sample() 12 | _, _, done, _ = env.step(action) 13 | time.sleep(0.05) 14 | env.close() 15 | -------------------------------------------------------------------------------- /HER/bitflip/test_env.py: -------------------------------------------------------------------------------- 1 | from bit_flip import BitFlipEnv 2 | 3 | 4 | if __name__ == '__main__': 5 | env = BitFlipEnv(n_bits=4) 6 | 7 | for _ in range(2): 8 | done = False 9 | obs = env.reset() 10 | print('starting new episode') 11 | while not done: 12 | action = env.action_space_sample() 13 | obs_, reward, done, info = env.step(action) 14 | env.render() 15 | -------------------------------------------------------------------------------- /PER/max_heap/test_maxheap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def max_heapify(array, i, N=None): 5 | pass 6 | 7 | 8 | def build_max_heap(array): 9 | pass 10 | 11 | 12 | if __name__ == '__main__': 13 | np.random.seed(42) 14 | a = np.random.choice(np.arange(100), 21, replace=False) 15 | print('unsorted array: {}'.format(a)) 16 | a = build_max_heap(a) 17 | reference = np.array([90., 80., 83., 77., 55., 73., 70., 76., 18 | 53., 44., 18., 30., 39., 33., 22., 4., 19 | 45., 10., 12., 31., 0]) 20 | print('max heap array: {}'.format(a)) 21 | assert (a == reference).all() 22 | -------------------------------------------------------------------------------- /HER/robotic/utils.py: 
-------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | import torch as T 4 | 5 | 6 | def sync_networks(network): 7 | comm = MPI.COMM_WORLD 8 | 9 | params = np.concatenate([getattr(p, 'data').cpu().numpy().flatten() 10 | for p in network.parameters()]) 11 | comm.Bcast(params) 12 | idx = 0 13 | for p in network.parameters(): 14 | getattr(p, 'data').copy_(T.tensor( 15 | params[idx:idx + p.data.numel()]).view_as(p.data)) 16 | idx += p.data.numel() 17 | 18 | 19 | def sync_grads(network): 20 | comm = MPI.COMM_WORLD 21 | 22 | grads = np.concatenate([getattr(p, 'grad').cpu().numpy().flatten() 23 | for p in network.parameters()]) 24 | global_grads = np.zeros_like(grads) 25 | comm.Allreduce(grads, global_grads, op=MPI.SUM) 26 | idx = 0 27 | for p in network.parameters(): 28 | getattr(p, 'grad').copy_(T.tensor( 29 | global_grads[idx:idx + p.data.numel()]).view_as(p.data)) 30 | idx += p.data.numel() 31 | -------------------------------------------------------------------------------- /PER/ranked/network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | 8 | class LinearDeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(LinearDeepQNetwork, self).__init__() 11 | self.checkpoint_dir = chkpt_dir 12 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 13 | 14 | self.fc1 = nn.Linear(*input_dims, 32) 15 | self.fc2 = nn.Linear(32, 32) 16 | self.q = nn.Linear(32, n_actions) 17 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 18 | 19 | self.loss = nn.MSELoss() 20 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 21 | self.to(self.device) 22 | 23 | def forward(self, state): 24 | flat1 = F.relu(self.fc1(state)) 25 | flat2 = F.relu(self.fc2(flat1)) 26 | q = self.q(flat2) 27 | 28 | return q 29 | 30 | def save_checkpoint(self): 31 | T.save(self.state_dict(), self.checkpoint_file) 32 | 33 | def load_checkpoint(self): 34 | self.load_state_dict(T.load(self.checkpoint_file)) 35 | -------------------------------------------------------------------------------- /PER/proportional/network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | 8 | class LinearDeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(LinearDeepQNetwork, self).__init__() 11 | self.checkpoint_dir = chkpt_dir 12 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 13 | 14 | self.fc1 = nn.Linear(*input_dims, 32) 15 | self.fc2 = nn.Linear(32, 32) 16 | self.q = nn.Linear(32, n_actions) 17 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 18 | 19 | self.loss = nn.MSELoss() 20 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 21 | self.to(self.device) 22 | 23 | def forward(self, state): 24 | flat1 = F.relu(self.fc1(state)) 25 | flat2 = F.relu(self.fc2(flat1)) 26 | q = self.q(flat2) 27 | 28 | return q 29 | 30 | def save_checkpoint(self): 31 | T.save(self.state_dict(), self.checkpoint_file) 32 | 33 | def load_checkpoint(self): 34 | self.load_state_dict(T.load(self.checkpoint_file)) 35 | -------------------------------------------------------------------------------- 
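Note on the max-heap exercise: PER/max_heap/test_maxheap.py above leaves max_heapify and build_max_heap as empty stubs. The sketch below is one plausible way to fill them in, using the standard recursive sift-down and Floyd's bottom-up construction; it is an illustrative solution rather than the course's canonical one, and whether it reproduces the exact reference array in that test also depends on the seeded input draw and on tie-breaking order.

import numpy as np


def max_heapify(array, i, N=None):
    # Sift the element at index i down until the subtree rooted at i
    # satisfies the max-heap property. N bounds the heap size (defaults
    # to the whole array), which also makes this reusable for heapsort.
    N = len(array) if N is None else N
    left, right = 2 * i + 1, 2 * i + 2
    largest = i
    if left < N and array[left] > array[largest]:
        largest = left
    if right < N and array[right] > array[largest]:
        largest = right
    if largest != i:
        array[i], array[largest] = array[largest], array[i]
        max_heapify(array, largest, N)
    return array


def build_max_heap(array):
    # Floyd's bottom-up construction: heapify every internal node,
    # starting from the last parent and working back to the root.
    for i in range(len(array) // 2 - 1, -1, -1):
        max_heapify(array, i)
    return array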
/HER/bitflip/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | 8 | class DeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(DeepQNetwork, self).__init__() 11 | self.checkpoint_dir = chkpt_dir 12 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 13 | 14 | self.fc1 = nn.Linear(input_dims, 256) 15 | self.fc2 = nn.Linear(256, n_actions) 16 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 17 | 18 | self.loss = nn.MSELoss() 19 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 20 | self.to(self.device) 21 | 22 | def forward(self, state): 23 | flat1 = F.relu(self.fc1(state)) 24 | actions = self.fc2(flat1) 25 | 26 | return actions 27 | 28 | def save_checkpoint(self): 29 | print('... saving checkpoint ...') 30 | T.save(self.state_dict(), self.checkpoint_file) 31 | 32 | def load_checkpoint(self): 33 | print('... loading checkpoint ...') 34 | self.load_state_dict(T.load(self.checkpoint_file)) 35 | -------------------------------------------------------------------------------- /HER/bitflip/test_her.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from bit_flip import BitFlipEnv 4 | from her import HER 5 | 6 | n_bits = 16 7 | env = BitFlipEnv(n_bits) 8 | random.seed(123) 9 | np.random.seed(123) 10 | batch_size = 2 11 | max_size = 1_000 12 | input_shape = n_bits 13 | memory = HER(max_mem=max_size, input_shape=input_shape, n_actions=1, 14 | batch_size=batch_size, goal_shape=n_bits, 15 | strategy='final', reward_fn=env.compute_reward) 16 | 17 | for _ in range(40): 18 | o = env.reset() 19 | agl = o[n_bits:2*n_bits] 20 | dg_ = o[2*n_bits:3*n_bits] 21 | d = False 22 | s, a, re, dn, s_, dg, ag, ag_ = [], [], [], [], [], [], [], [] 23 | while not d: 24 | action = env.action_space_sample() 25 | o_, r, d, i = env.step(action) 26 | agl_ = o_[2*n_bits:3*n_bits] 27 | s.append(o[:n_bits]) 28 | a.append(action) 29 | re.append(r) 30 | dn.append(d) 31 | s_.append(o_[:n_bits]) 32 | dg.append(dg_) 33 | ag.append(agl) 34 | ag_.append(agl_) 35 | agl = agl_ 36 | o = o_ 37 | memory.store_episode([s, a, re, s_, dn, dg, ag, ag_]) 38 | assert memory.ready(), 'Unexpected number of memories in buffer' 39 | 40 | s, a, re, s_, dn, dg, ag = memory.sample_memory() 41 | 42 | data = np.load('results.npy') 43 | 44 | assert (s[0] == data[0]).all(), 'Unexpected values for sampling of states' 45 | assert (s[1] == data[1]).all(), 'Unexpected values for sampling of states' 46 | assert (s_[0] == data[2]).all(), 'Unexpected values for sampling of states_' 47 | assert (s_[1] == data[3]).all(), 'Unexpected values for sampling of states_' 48 | -------------------------------------------------------------------------------- /HER/bitflip/episode.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class EpisodeWorker: 5 | def __init__(self, env, agent, memory): 6 | self.agent = agent 7 | self.env = env 8 | self.memory = memory 9 | self.get_slices() 10 | 11 | def get_slices(self): 12 | OB = self.env.observation_space['observation'].shape[0] 13 | A = self.env.observation_space['achieved_goal'].shape[0] 14 | D = self.env.observation_space['desired_goal'].shape[0] 15 | 16 | self.ob = slice(0, OB) 17 | self.ag = slice(OB, OB + A) 18 | 
self.dg = slice(OB + A, OB + A + D) 19 | 20 | def play_episode(self, evaluate=False): 21 | observation = self.env.reset() 22 | done = False 23 | score = 0 24 | desired_goal = observation[self.dg] 25 | achieved_goal = observation[self.ag] 26 | observation = observation[self.ob] 27 | 28 | states, actions, rewards, states_,\ 29 | dones, dg, ag, ag_ = [], [], [], [], [], [], [], [] 30 | 31 | while not done: 32 | action = self.agent.choose_action(np.concatenate( 33 | [observation, desired_goal]), evaluate) 34 | observation_, reward, done, info = self.env.step(action) 35 | achieved_goal_new = observation_[self.ag] 36 | states.append(observation) 37 | states_.append(observation_[self.ob]) 38 | rewards.append(reward) 39 | actions.append(action) 40 | dones.append(done) 41 | dg.append(desired_goal) 42 | ag.append(achieved_goal) 43 | ag_.append(achieved_goal_new) 44 | score += reward 45 | achieved_goal = achieved_goal_new 46 | observation = observation_[self.ob] 47 | if not evaluate: 48 | self.memory.store_episode([states, actions, rewards, 49 | states_, dones, dg, ag, ag_]) 50 | success = info['is_success'] 51 | return score, success 52 | -------------------------------------------------------------------------------- /HER/bitflip/bit_flip.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BitFlipEnv: 5 | def __init__(self, n_bits, max_steps=50): 6 | self.n_bits = n_bits 7 | self.max_steps = max_steps 8 | self.n_actions = n_bits 9 | 10 | self.observation = self.reset() 11 | self.observation_space = {'observation': np.empty((self.n_bits)), 12 | 'achieved_goal': np.empty((self.n_bits)), 13 | 'desired_goal': np.empty((self.n_bits)), 14 | } 15 | 16 | def reset(self): 17 | self._bits = np.array([np.random.randint(2) 18 | for _ in range(self.n_bits)]) 19 | self._desired_goal = np.array([np.random.randint(2) 20 | for _ in range(self.n_bits)]) 21 | self._achieved_goal = self._bits.copy() 22 | 23 | obs = np.concatenate([self._bits, 24 | self._achieved_goal, 25 | self._desired_goal]) 26 | self._step = 0 27 | return obs 28 | 29 | def compute_reward(self, desired_goal, achieved_goal, info): 30 | reward = 0.0 if (desired_goal == achieved_goal).all() else -1.0 31 | return reward 32 | 33 | def step(self, action): 34 | assert action <= self.n_actions, "Invalid Action" 35 | new_bit = 0 if self._bits[action] == 1 else 1 36 | self._bits[action] = new_bit 37 | info = {} 38 | self._achieved_goal = self._bits.copy() 39 | reward = self.compute_reward(self._desired_goal, 40 | self._achieved_goal, {}) 41 | self._step += 1 42 | if reward == 0.0 or self._step >= self.max_steps: 43 | done = True 44 | else: 45 | done = False 46 | info['is_success'] = 1.0 if reward == 0.0 else 0.0 47 | obs = np.concatenate([self._bits, self._achieved_goal, 48 | self._desired_goal]) 49 | return obs, reward, done, info 50 | 51 | def action_space_sample(self): 52 | return np.random.randint(0, self.n_actions) 53 | 54 | def render(self): 55 | for bit in self._bits: 56 | print(bit, end=' ') 57 | print('\n') 58 | -------------------------------------------------------------------------------- /HER/bitflip/main.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch as T 4 | from agent import Agent 5 | from episode import EpisodeWorker 6 | from her import HER 7 | from bit_flip import BitFlipEnv 8 | 9 | 10 | def train(agent, worker, memory): 11 | epochs = 100 12 | cycle_length = 16 13 | n_cycles 
= 50 14 | n_updates = 40 15 | n_tests = 10 16 | for epoch in range(epochs): 17 | for cycle in range(n_cycles): 18 | score_history, success_history = [], [] 19 | for i in range(cycle_length): 20 | score, success = worker.play_episode() 21 | score_history.append(score) 22 | success_history.append(success) 23 | # cycle_avg_score = np.mean(score_history) 24 | # cycle_avg_success = np.mean(success_history) 25 | # print('Epoch: {} Cycle: {} Training Avg Score {:.1f} ' 26 | # 'Trainig Avg Success: {:.3f}'. 27 | # format(epoch, cycle, cycle_avg_score, cycle_avg_success)) 28 | if memory.ready(): 29 | for _ in range(n_updates): 30 | memories = memory.sample_memory() 31 | agent.learn(memories) 32 | score_history, success_history = [], [] 33 | for episode in range(n_tests): 34 | score, success = worker.play_episode(evaluate=True) 35 | success_history.append(success) 36 | score_history.append(score) 37 | avg_success = np.mean(success_history) 38 | avg_score = np.mean(score_history) 39 | print('Epoch: {} Testing Agent. Avg Score: {:.1f} ' 40 | 'Avg Sucess: {:.3f}'. 41 | format(epoch, avg_score, avg_success)) 42 | 43 | 44 | def main(): 45 | n_bits = 32 46 | env = BitFlipEnv(n_bits, max_steps=n_bits) 47 | random.seed(123) 48 | np.random.seed(123) 49 | T.manual_seed(123) 50 | T.cuda.manual_seed(123) 51 | 52 | batch_size = 128 53 | max_size = 1_000_000 54 | input_shape = n_bits 55 | memory = HER(max_mem=max_size, input_shape=input_shape, n_actions=1, 56 | batch_size=batch_size, goal_shape=n_bits, strategy=None, 57 | reward_fn=env.compute_reward) 58 | agent = Agent(lr=0.001, epsilon=0.2, n_actions=n_bits, eps_dec=0.0, 59 | batch_size=batch_size, input_dims=2*input_shape, gamma=0.98) 60 | ep_worker = EpisodeWorker(env, agent, memory) 61 | 62 | train(agent, ep_worker, memory) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /HER/robotic/normalizer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import numpy as np 3 | from mpi4py import MPI 4 | 5 | 6 | class Normalizer: 7 | def __init__(self, size, eps=1e-2, default_clip_range=np.inf): 8 | self.size = size 9 | self.eps = eps 10 | self.default_clip_range = default_clip_range 11 | 12 | self.local_sum = np.zeros(self.size, dtype=np.float32) 13 | self.local_sum_sq = np.zeros(self.size, dtype=np.float32) 14 | self.local_cnt = np.zeros(1, dtype=np.float32) 15 | 16 | self.lock = threading.Lock() 17 | 18 | self.running_mean = np.zeros(self.size, dtype=np.float32) 19 | self.running_std = np.ones(self.size, dtype=np.float32) 20 | self.running_sum = np.zeros(self.size, dtype=np.float32) 21 | self.running_sum_sq = np.zeros(self.size, dtype=np.float32) 22 | self.running_cnt = 1 23 | 24 | def update_local_stats(self, new_data): 25 | with self.lock: 26 | self.local_sum += new_data.sum(axis=0) 27 | self.local_sum_sq += (np.square(new_data)).sum(axis=0) 28 | self.local_cnt[0] += new_data.shape[0] 29 | 30 | def sync_thread_stats(self, local_sum, local_sum_sq, local_cnt): 31 | local_sum[...] = self.mpi_average(local_sum) 32 | local_sum_sq[...] = self.mpi_average(local_sum_sq) 33 | local_cnt[...] 
= self.mpi_average(local_cnt) 34 | return local_sum, local_sum_sq, local_cnt 35 | 36 | def mpi_average(self, x): 37 | buf = np.zeros_like(x) 38 | MPI.COMM_WORLD.Allreduce(x, buf, op=MPI.SUM) 39 | buf /= MPI.COMM_WORLD.Get_size() 40 | return buf 41 | 42 | def normalize_observation(self, v): 43 | clip_range = self.default_clip_range 44 | return np.clip((v - self.running_mean) / self.running_std, 45 | -clip_range, clip_range).astype(np.float32) 46 | 47 | def recompute_global_stats(self): 48 | with self.lock: 49 | local_cnt = self.local_cnt.copy() 50 | local_sum = self.local_sum.copy() 51 | local_sum_sq = self.local_sum_sq.copy() 52 | 53 | self.local_cnt[...] = 0 54 | self.local_sum[...] = 0 55 | self.local_sum_sq[...] = 0 56 | 57 | sync_sum, sync_sum_sq, sync_cnt = self.sync_thread_stats( 58 | local_sum, local_sum_sq, local_cnt) 59 | 60 | self.running_cnt += sync_cnt 61 | self.running_sum += sync_sum 62 | self.running_sum_sq += sync_sum_sq 63 | 64 | self.running_mean = self.running_sum / self.running_cnt 65 | tmp = self.running_sum_sq / self.running_cnt -\ 66 | np.square(self.running_sum / self.running_cnt) 67 | self.running_std = np.sqrt(np.maximum(np.square(self.eps), tmp)) 68 | -------------------------------------------------------------------------------- /PER/proportional/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve 5 | 6 | 7 | def clip_reward(r): 8 | if r > 1: 9 | return 1 10 | elif r < -1: 11 | return -1 12 | else: 13 | return r 14 | 15 | 16 | if __name__ == '__main__': 17 | env = gym.make('CartPole-v0') 18 | best_score = -np.inf 19 | load_checkpoint = False 20 | n_games = 1000 21 | alpha = 0.6 22 | beta = 0.4 23 | replace = 250 24 | bs = 64 25 | agent = Agent(gamma=0.99, epsilon=1, lr=1e-4, alpha=alpha, 26 | beta=beta, input_dims=(env.observation_space.shape), 27 | n_actions=env.action_space.n, mem_size=50*1024, eps_min=0.01, 28 | batch_size=bs, replace=replace, eps_dec=1e-4, 29 | chkpt_dir='models/', algo='ddqn', env_name='CartPole-v0') 30 | 31 | if load_checkpoint: 32 | agent.load_models() 33 | 34 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 35 | + str(n_games) + 'games' + str(alpha) +\ 36 | 'alpha_' + str(beta) + '_replace_' + str(replace) 37 | figure_file = 'plots/' + fname + '.png' 38 | # if you want to record video of your agent playing, 39 | # do a mkdir tmp && mkdir tmp/dqn-video 40 | # and uncomment the following 2 lines. 41 | # env = wrappers.Monitor(env, "tmp/dqn-video", 42 | # video_callable=lambda episode_id: True, force=True) 43 | n_steps = 0 44 | scores, eps_history, steps_array = [], [], [] 45 | 46 | for i in range(n_games): 47 | done = False 48 | observation = env.reset() 49 | 50 | score = 0 51 | while not done: 52 | action = agent.choose_action(observation) 53 | observation_, reward, done, info = env.step(action) 54 | score += reward 55 | r = clip_reward(reward) 56 | if not load_checkpoint: 57 | agent.store_transition(observation, action, 58 | r, observation_, done) 59 | agent.learn() 60 | observation = observation_ 61 | n_steps += 1 62 | scores.append(score) 63 | steps_array.append(n_steps) 64 | 65 | avg_score = np.mean(scores[-100:]) 66 | print('episode {} score {:.1f} eps {:.2f} n steps {}'. 
67 | format(i, avg_score, agent.epsilon, n_steps)) 68 | 69 | if avg_score > best_score: 70 | if not load_checkpoint: 71 | agent.save_models() 72 | best_score = avg_score 73 | 74 | eps_history.append(agent.epsilon) 75 | 76 | x = [i+1 for i in range(len(scores))] 77 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 78 | -------------------------------------------------------------------------------- /PER/proportional/atari/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve 5 | from wrappers import make_env 6 | 7 | def clip_reward(r): 8 | if r > 1: 9 | return 1 10 | elif r < -1: 11 | return -1 12 | else: 13 | return r 14 | 15 | 16 | if __name__ == '__main__': 17 | env_name = 'SpaceInvadersNoFrameskip-v4' 18 | # env = gym.make('CartPole-v0') 19 | env = make_env(env_name) 20 | best_score = -np.inf 21 | load_checkpoint = False 22 | n_games = 1500 23 | alpha = 0.6 24 | beta = 0.4 25 | bs = 64 26 | agent = Agent(gamma=0.99, epsilon=1, lr=5e-5, alpha=alpha, 27 | beta=beta, input_dims=(env.observation_space.shape), 28 | n_actions=env.action_space.n, mem_size=50*1024, eps_min=0.01, 29 | batch_size=bs, eps_dec=1e-5, 30 | chkpt_dir='models/', algo='ddqn', env_name='SpaceInvaders') 31 | 32 | if load_checkpoint: 33 | agent.load_models() 34 | 35 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 36 | + str(n_games) + 'games' + str(alpha) +\ 37 | 'alpha_' + str(beta) 38 | figure_file = 'plots/' + fname + '.png' 39 | # if you want to record video of your agent playing, 40 | # do a mkdir tmp && mkdir tmp/dqn-video 41 | # and uncomment the following 2 lines. 42 | # env = wrappers.Monitor(env, "tmp/dqn-video", 43 | # video_callable=lambda episode_id: True, force=True) 44 | n_steps = 0 45 | scores, eps_history, steps_array = [], [], [] 46 | 47 | for i in range(n_games): 48 | done = False 49 | observation = env.reset() 50 | 51 | score = 0 52 | while not done: 53 | action = agent.choose_action(observation) 54 | observation_, reward, done, info = env.step(action) 55 | score += reward 56 | r = clip_reward(reward) 57 | if not load_checkpoint: 58 | agent.store_transition(observation, action, 59 | r, observation_, done) 60 | agent.learn() 61 | observation = observation_ 62 | n_steps += 1 63 | scores.append(score) 64 | steps_array.append(n_steps) 65 | 66 | avg_score = np.mean(scores[-100:]) 67 | print('episode {} score {:.1f} eps {:.2f} n steps {}'. 
68 | format(i, avg_score, agent.epsilon, n_steps)) 69 | 70 | if avg_score > best_score: 71 | if not load_checkpoint: 72 | agent.save_models() 73 | best_score = avg_score 74 | 75 | eps_history.append(agent.epsilon) 76 | agent.memory.anneal_beta(i, n_games) 77 | 78 | x = [i+1 for i in range(len(scores))] 79 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 80 | -------------------------------------------------------------------------------- /PER/proportional/atari/network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch as T 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | 8 | 9 | class DeepQNetwork(nn.Module): 10 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 11 | super(DeepQNetwork, self).__init__() 12 | self.checkpoint_dir = chkpt_dir 13 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 14 | 15 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 16 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 17 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 18 | 19 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 20 | 21 | self.fc1 = nn.Linear(fc_input_dims, 512) 22 | self.fc2 = nn.Linear(512, n_actions) 23 | 24 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 25 | 26 | self.loss = nn.MSELoss() 27 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 28 | self.to(self.device) 29 | 30 | def forward(self, state): 31 | conv1 = F.relu(self.conv1(state)) 32 | conv2 = F.relu(self.conv2(conv1)) 33 | conv3 = F.relu(self.conv3(conv2)) 34 | conv_state = conv3.view(conv3.size()[0], -1) 35 | 36 | flat1 = F.relu(self.fc1(conv_state)) 37 | q = self.fc2(flat1) 38 | 39 | return q 40 | 41 | def calculate_conv_output_dims(self, input_dims): 42 | state = T.zeros(1, *input_dims) 43 | dims = self.conv1(state) 44 | dims = self.conv2(dims) 45 | dims = self.conv3(dims) 46 | return int(np.prod(dims.size())) 47 | 48 | def save_checkpoint(self): 49 | T.save(self.state_dict(), self.checkpoint_file) 50 | 51 | def load_checkpoint(self): 52 | self.load_state_dict(T.load(self.checkpoint_file)) 53 | 54 | 55 | class LinearDeepQNetwork(nn.Module): 56 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 57 | super(LinearDeepQNetwork, self).__init__() 58 | self.checkpoint_dir = chkpt_dir 59 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 60 | 61 | self.fc1 = nn.Linear(*input_dims, 32) 62 | self.fc2 = nn.Linear(32, 32) 63 | self.q = nn.Linear(32, n_actions) 64 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 65 | 66 | self.loss = nn.MSELoss() 67 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 68 | self.to(self.device) 69 | 70 | def forward(self, state): 71 | flat1 = F.relu(self.fc1(state)) 72 | flat2 = F.relu(self.fc2(flat1)) 73 | q = self.q(flat2) 74 | 75 | return q 76 | 77 | def save_checkpoint(self): 78 | T.save(self.state_dict(), self.checkpoint_file) 79 | 80 | def load_checkpoint(self): 81 | self.load_state_dict(T.load(self.checkpoint_file)) 82 | -------------------------------------------------------------------------------- /HER/bitflip/her.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class HER: 5 | def __init__(self, max_mem, input_shape, n_actions, goal_shape, batch_size, 6 | reward_fn, strategy='final'): 7 | self.max_mem = max_mem 8 | self.strategy = strategy 9 | 
self.mem_cntr = 0 10 | self.batch_size = batch_size 11 | self.input_shape = input_shape 12 | self.reward_fn = reward_fn 13 | 14 | self.states = np.zeros((max_mem, input_shape), 15 | dtype=np.float64) 16 | self.states_ = np.zeros((max_mem, input_shape), 17 | dtype=np.float64) 18 | self.actions = np.zeros((max_mem, n_actions), 19 | dtype=np.float32) 20 | self.rewards = np.zeros(max_mem, dtype=np.float32) 21 | self.dones = np.zeros(max_mem, dtype=np.bool) 22 | self.desired_goals = np.zeros((max_mem, goal_shape), dtype=np.float64) 23 | self.achieved_goals = np.zeros((max_mem, goal_shape), dtype=np.float64) 24 | self.achieved_goals_ = np.zeros((max_mem, goal_shape), 25 | dtype=np.float64) 26 | 27 | def store_memory(self, state, action, reward, state_, done, 28 | d_goal, a_goal, a_goal_): 29 | index = self.mem_cntr % self.max_mem 30 | self.states[index] = state 31 | self.states_[index] = state_ 32 | self.actions[index] = action 33 | self.rewards[index] = reward 34 | self.dones[index] = done 35 | self.desired_goals[index] = d_goal 36 | self.achieved_goals[index] = a_goal 37 | self.achieved_goals_[index] = a_goal_ 38 | self.mem_cntr += 1 39 | 40 | def store_episode(self, ep_memory): 41 | states, actions, rewards, states_, dones, dg, ag, ag_ = ep_memory 42 | 43 | if self.strategy == 'final': 44 | hindsight_goals = [[ag_[-1]]] * len(ag_) 45 | 46 | elif self.strategy is None: 47 | hindsight_goals = [[dg[0]]] * len(dg) 48 | 49 | for idx, s in enumerate(states): 50 | self.store_memory(s, actions[idx], rewards[idx], states_[idx], 51 | dones[idx], dg[idx], ag[idx], ag_[idx]) 52 | for goal in hindsight_goals[idx]: 53 | reward = self.reward_fn(ag_[idx], goal, {}) 54 | self.store_memory(s, actions[idx], reward, states_[idx], 55 | dones[idx], goal, ag[idx], ag_[idx]) 56 | 57 | def sample_memory(self): 58 | last_mem = min(self.mem_cntr, self.max_mem) 59 | batch = np.random.choice(last_mem, self.batch_size, replace=False) 60 | 61 | return self.states[batch], self.actions[batch], self.rewards[batch],\ 62 | self.states_[batch], self.dones[batch],\ 63 | self.desired_goals[batch] 64 | 65 | def ready(self): 66 | return self.mem_cntr > self.batch_size 67 | -------------------------------------------------------------------------------- /PER/ranked/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import DQNAgent 4 | from utils import plot_learning_curve 5 | 6 | 7 | def clip_reward(r): 8 | if r > 1: 9 | return 1 10 | elif r < -1: 11 | return -1 12 | else: 13 | return r 14 | 15 | 16 | if __name__ == '__main__': 17 | env = gym.make('CartPole-v0') 18 | best_score = -np.inf 19 | load_checkpoint = False 20 | n_games = 500 21 | r_iter = 64 22 | alpha = 0.25 23 | beta = 0.5 24 | replace = 250 25 | agent = DQNAgent(gamma=0.99, epsilon=1, lr=2.5e-4, alpha=alpha, 26 | beta=beta, r_iter=r_iter, 27 | input_dims=(env.observation_space.shape), 28 | n_actions=env.action_space.n, mem_size=20*1024, 29 | eps_min=0.01, 30 | batch_size=64, replace=replace, eps_dec=1e-4, 31 | chkpt_dir='models/', algo='DQNAgent', 32 | env_name='CartPole-v0') 33 | 34 | if load_checkpoint: 35 | agent.load_models() 36 | 37 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 38 | + str(n_games) + 'games' + str(r_iter) + '_r_iter_' + str(alpha) +\ 39 | 'alpha_' + str(beta) + '_replace_' + str(replace) 40 | figure_file = 'plots/' + fname + '.png' 41 | # if you want to record video of your agent playing, do a mkdir tmp && mkdir tmp/dqn-video 42 | 
# and uncomment the following 2 lines. 43 | # env = wrappers.Monitor(env, "tmp/dqn-video", 44 | # video_callable=lambda episode_id: True, force=True) 45 | n_steps = 0 46 | scores, eps_history, steps_array = [], [], [] 47 | 48 | for i in range(n_games): 49 | done = False 50 | observation = env.reset() 51 | 52 | score = 0 53 | while not done: 54 | action = agent.choose_action(observation) 55 | observation_, reward, done, info = env.step(action) 56 | score += reward 57 | r = clip_reward(reward) 58 | if not load_checkpoint: 59 | agent.store_transition(observation, action, 60 | r, observation_, done) 61 | agent.learn() 62 | observation = observation_ 63 | n_steps += 1 64 | scores.append(score) 65 | steps_array.append(n_steps) 66 | 67 | avg_score = np.mean(scores[-100:]) 68 | print('episode: ', i, 'score: ', score, 69 | ' average score %.1f' % avg_score, 'best score %.2f' % best_score, 70 | 'epsilon %.2f' % agent.epsilon) 71 | 72 | if avg_score > best_score: 73 | if not load_checkpoint: 74 | agent.save_models() 75 | best_score = avg_score 76 | 77 | eps_history.append(agent.epsilon) 78 | 79 | x = [i+1 for i in range(len(scores))] 80 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 81 | -------------------------------------------------------------------------------- /HER/robotic/episode.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class EpisodeWorker: 5 | def __init__(self, env, agent, memory): 6 | self.agent = agent 7 | self.env = env 8 | self.memory = memory 9 | self.get_slices() 10 | 11 | def get_slices(self): 12 | OB = self.env.observation_space['observation'].shape[0] 13 | A = self.env.observation_space['achieved_goal'].shape[0] 14 | D = self.env.observation_space['desired_goal'].shape[0] 15 | 16 | self.ob = slice(0, OB) 17 | self.ag = slice(OB, OB + A) 18 | self.dg = slice(OB + A, OB + A + D) 19 | 20 | def play_episode(self, evaluate=False): 21 | observation = self.env.reset() 22 | done = False 23 | score = 0 24 | desired_goal = observation[self.dg] 25 | achieved_goal = observation[self.ag] 26 | observation = observation[self.ob] 27 | 28 | self.agent.obs_stats.update_local_stats(observation) 29 | self.agent.goal_stats.update_local_stats(desired_goal) 30 | 31 | achieved_goal = self.agent.goal_stats.normalize_observation( 32 | achieved_goal) 33 | desired_goal = self.agent.goal_stats.normalize_observation( 34 | desired_goal) 35 | observation = self.agent.obs_stats.normalize_observation(observation) 36 | 37 | states, actions, rewards, states_,\ 38 | dones, dg, ag, ag_ = [], [], [], [], [], [], [], [] 39 | 40 | while not done: 41 | action = self.agent.choose_action(np.concatenate( 42 | [observation, desired_goal]), evaluate) 43 | observation_, reward, done, info = self.env.step(action) 44 | 45 | achieved_goal_new = observation_[self.ag] 46 | observation_ = observation_[self.ob] 47 | 48 | if not evaluate: 49 | self.agent.goal_stats.update_local_stats(achieved_goal) 50 | self.agent.obs_stats.update_local_stats(observation_) 51 | 52 | observation_ = self.agent.obs_stats.normalize_observation( 53 | observation_) 54 | achieved_goal_new = self.agent.goal_stats.normalize_observation( 55 | achieved_goal_new) 56 | 57 | states.append(observation) 58 | states_.append(observation_) 59 | rewards.append(reward) 60 | actions.append(action) 61 | dones.append(done) 62 | dg.append(desired_goal) 63 | ag.append(achieved_goal) 64 | ag_.append(achieved_goal_new) 65 | 66 | score += reward 67 | 68 | achieved_goal = 
achieved_goal_new 69 | observation = observation_ 70 | 71 | if not evaluate: 72 | self.agent.obs_stats.recompute_global_stats() 73 | self.agent.goal_stats.recompute_global_stats() 74 | self.memory.store_episode([states, actions, rewards, 75 | states_, dones, dg, ag, ag_]) 76 | success = info['is_success'] 77 | return score, success 78 | -------------------------------------------------------------------------------- /HER/bitflip/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from networks import DeepQNetwork 4 | 5 | 6 | class Agent: 7 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 8 | batch_size, eps_min=0.01, eps_dec=5e-7, 9 | replace=1000, algo='dqn', env_name='bit_flip', 10 | chkpt_dir='models'): 11 | self.gamma = gamma 12 | self.epsilon = epsilon 13 | self.lr = lr 14 | self.n_actions = n_actions 15 | self.input_dims = input_dims 16 | self.batch_size = batch_size 17 | self.eps_min = eps_min 18 | self.eps_dec = eps_dec 19 | self.replace_target_cnt = replace 20 | self.algo = algo 21 | self.env_name = env_name 22 | self.chkpt_dir = chkpt_dir 23 | self.action_space = [i for i in range(n_actions)] 24 | self.learn_step_counter = 0 25 | 26 | self.q_eval = DeepQNetwork(self.lr, self.n_actions, 27 | input_dims=self.input_dims, 28 | name=self.env_name+'_'+self.algo+'_q_eval', 29 | chkpt_dir=self.chkpt_dir) 30 | 31 | self.q_next = DeepQNetwork(self.lr, self.n_actions, 32 | input_dims=self.input_dims, 33 | name=self.env_name+'_'+self.algo+'_q_next', 34 | chkpt_dir=self.chkpt_dir) 35 | 36 | def choose_action(self, observation, evaluate=False): 37 | if np.random.random() > self.epsilon or evaluate: 38 | state = T.tensor([observation], 39 | dtype=T.float).to(self.q_eval.device) 40 | actions = self.q_eval.forward(state) 41 | action = T.argmax(actions).item() 42 | else: 43 | action = np.random.choice(self.action_space) 44 | 45 | return action 46 | 47 | def replace_target_network(self): 48 | if self.learn_step_counter % self.replace_target_cnt == 0: 49 | self.q_next.load_state_dict(self.q_eval.state_dict()) 50 | 51 | def decrement_epsilon(self): 52 | self.epsilon = self.epsilon - self.eps_dec \ 53 | if self.epsilon > self.eps_min else self.eps_min 54 | 55 | def save_models(self): 56 | self.q_eval.save_checkpoint() 57 | self.q_next.save_checkpoint() 58 | 59 | def load_models(self): 60 | self.q_eval.load_checkpoint() 61 | self.q_next.load_checkpoint() 62 | 63 | def learn(self, memories): 64 | state, action, reward, new_state, done, dg = memories 65 | state = np.concatenate([state, dg], axis=1) 66 | new_state = np.concatenate([new_state, dg], axis=1) 67 | 68 | states = T.tensor(state, dtype=T.float).to(self.q_eval.device) 69 | rewards = T.tensor(reward).to(self.q_eval.device) 70 | dones = T.tensor(done).to(self.q_eval.device) 71 | actions = T.tensor(action, dtype=T.long).to(self.q_eval.device) 72 | states_ = T.tensor(new_state, dtype=T.float).to(self.q_eval.device) 73 | 74 | actions = actions.view(-1) 75 | 76 | self.q_eval.optimizer.zero_grad() 77 | 78 | self.replace_target_network() 79 | 80 | indices = np.arange(self.batch_size) 81 | q_pred = self.q_eval.forward(states)[indices, actions] 82 | 83 | q_next = self.q_next.forward(states_).max(dim=1)[0] 84 | q_next[dones] = 0.0 85 | q_target = rewards + self.gamma*q_next 86 | 87 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 88 | loss.backward() 89 | self.q_eval.optimizer.step() 90 | self.learn_step_counter += 1 91 | 
self.decrement_epsilon() 92 | -------------------------------------------------------------------------------- /HER/robotic/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import gym 4 | import panda_gym 5 | from mpi4py import MPI 6 | import numpy as np 7 | import torch as T 8 | from agent import Agent 9 | from episode import EpisodeWorker 10 | from her import HER 11 | from wrappers import FlattenDictWrapper 12 | 13 | 14 | def train(agent, worker, memory, environ): 15 | epochs = 50 16 | cycle_length = 16 17 | n_cycles = 50 18 | n_updates = 40 19 | n_tests = 10 20 | for epoch in range(epochs): 21 | for cycle in range(n_cycles): 22 | score_history, success_history = [], [] 23 | for i in range(cycle_length): 24 | score, success = worker.play_episode() 25 | score_history.append(score) 26 | success_history.append(success) 27 | """ 28 | if MPI.COMM_WORLD.Get_rank() == 0: 29 | cycle_avg_score = np.mean(score_history) 30 | cycle_avg_success = np.mean(success_history) 31 | 32 | print('Epoch: {} Cycle: {} Training Avg Score {:.1f} ' 33 | 'Training Avg Success: {:.3f}'. 34 | format(epoch, cycle, cycle_avg_score, cycle_avg_success)) 35 | """ 36 | if memory.ready(): 37 | for _ in range(n_updates): 38 | memories = memory.sample_memory() 39 | agent.learn(memories) 40 | agent.update_network_parameters() 41 | score_history, success_history = [], [] 42 | for episode in range(n_tests): 43 | score, success = worker.play_episode(evaluate=True) 44 | success_history.append(success) 45 | score_history.append(score) 46 | avg_success = np.mean(success_history) 47 | avg_score = np.mean(score_history) 48 | global_success = MPI.COMM_WORLD.allreduce(avg_success, op=MPI.SUM) 49 | global_score = MPI.COMM_WORLD.allreduce(avg_score, op=MPI.SUM) 50 | eval_score = global_score / MPI.COMM_WORLD.Get_size() 51 | eval_success = global_success / MPI.COMM_WORLD.Get_size() 52 | if MPI.COMM_WORLD.Get_rank() == 0: 53 | print('Epoch: {} Testing Agent. Avg Score: {:.1f} ' 54 | 'Avg Sucess: {:.3f} Environment: {}'. 
55 | format(epoch, eval_score, eval_success, environ)) 56 | 57 | 58 | def main(): 59 | env_string = 'PandaPickAndPlace-v2' 60 | # env_string = 'PandaPush-v2' 61 | env = gym.make(env_string) 62 | env = FlattenDictWrapper(env) 63 | seed = 123 + MPI.COMM_WORLD.Get_rank() 64 | 65 | random.seed(seed) 66 | np.random.seed(seed) 67 | T.manual_seed(seed) 68 | T.cuda.manual_seed(seed) 69 | 70 | batch_size = 256 71 | max_size = 1_000_000 72 | obs_shape = env.observation_space['observation'].shape[0] 73 | goal_shape = env.observation_space['achieved_goal'].shape[0] 74 | input_shape = obs_shape 75 | memory = HER(max_mem=max_size, input_shape=input_shape, 76 | n_actions=env.action_space.shape[0], 77 | batch_size=batch_size, goal_shape=goal_shape, 78 | strategy='future', reward_fn=env.compute_reward) 79 | input_shape = obs_shape + goal_shape 80 | agent = Agent(alpha=0.001, beta=0.001, action_space=env.action_space, 81 | input_dims=input_shape, tau=0.05, gamma=0.98, 82 | fc1_dims=256, fc2_dims=256, fc3_dims=256, 83 | n_actions=env.action_space.shape[0], explore=0.3, 84 | obs_shape=obs_shape, goal_shape=goal_shape, 85 | action_noise=0.2) 86 | ep_worker = EpisodeWorker(env, agent, memory) 87 | 88 | train(agent, ep_worker, memory, env_string) 89 | 90 | 91 | if __name__ == '__main__': 92 | os.environ['OMP_NUM_THREADS'] = '1' 93 | os.environ['MKL_NUM_THREADS'] = '1' 94 | os.environ['IN_MPI'] = '1' 95 | main() 96 | -------------------------------------------------------------------------------- /HER/robotic/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | 8 | class CriticNetwork(nn.Module): 9 | def __init__(self, beta, input_dims, fc1_dims, fc2_dims, fc3_dims, 10 | n_actions, name, chkpt_dir='tmp/ddpg'): 11 | super(CriticNetwork, self).__init__() 12 | self.input_dims = input_dims 13 | self.fc1_dims = fc1_dims 14 | self.fc2_dims = fc2_dims 15 | self.fc3_dims = fc3_dims 16 | self.n_actions = n_actions 17 | self.name = name 18 | self.checkpoint_dir = chkpt_dir 19 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ddpg') 20 | 21 | self.fc1 = nn.Linear(self.input_dims + self.n_actions, self.fc1_dims) 22 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 23 | self.fc3 = nn.Linear(self.fc2_dims, self.fc3_dims) 24 | 25 | self.q = nn.Linear(self.fc3_dims, 1) 26 | 27 | self.optimizer = optim.Adam(self.parameters(), lr=beta) 28 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 29 | 30 | self.to(self.device) 31 | 32 | def forward(self, state, action): 33 | state_value = F.relu(self.fc1(T.cat((state, action), dim=1))) 34 | state_value = F.relu(self.fc2(state_value)) 35 | state_value = F.relu(self.fc3(state_value)) 36 | 37 | q = self.q(state_value) 38 | 39 | return q 40 | 41 | def save_checkpoint(self): 42 | print('... saving checkpoint ...') 43 | T.save(self.state_dict(), self.checkpoint_file) 44 | 45 | def load_checkpoint(self): 46 | print('... loading checkpoint ...') 47 | self.load_state_dict(T.load(self.checkpoint_file)) 48 | 49 | def save_best(self): 50 | print('... 
saving best checkpoint ...') 51 | checkpoint_file = os.path.join(self.checkpoint_dir, self.name+'_best') 52 | T.save(self.state_dict(), checkpoint_file) 53 | 54 | 55 | class ActorNetwork(nn.Module): 56 | def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, fc3_dims, 57 | n_actions, name, chkpt_dir='tmp/ddpg'): 58 | super(ActorNetwork, self).__init__() 59 | self.input_dims = input_dims 60 | self.fc1_dims = fc1_dims 61 | self.fc2_dims = fc2_dims 62 | self.fc3_dims = fc3_dims 63 | self.n_actions = n_actions 64 | self.name = name 65 | self.checkpoint_dir = chkpt_dir 66 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ddpg') 67 | 68 | self.fc1 = nn.Linear(self.input_dims, self.fc1_dims) 69 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 70 | self.fc3 = nn.Linear(self.fc2_dims, self.fc3_dims) 71 | self.mu = nn.Linear(self.fc3_dims, self.n_actions) 72 | 73 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 74 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 75 | 76 | self.to(self.device) 77 | 78 | def forward(self, state): 79 | x = F.relu(self.fc1(state)) 80 | x = F.relu(self.fc2(x)) 81 | x = F.relu(self.fc3(x)) 82 | 83 | mu = self.mu(x) 84 | pi = T.clamp(T.tanh(mu), -5, 5) 85 | 86 | return mu, pi 87 | 88 | def save_checkpoint(self): 89 | print('... saving checkpoint ...') 90 | T.save(self.state_dict(), self.checkpoint_file) 91 | 92 | def load_checkpoint(self): 93 | print('... loading checkpoint ...') 94 | self.load_state_dict(T.load(self.checkpoint_file)) 95 | 96 | def save_best(self): 97 | print('... saving best checkpoint ...') 98 | checkpoint_file = os.path.join(self.checkpoint_dir, self.name+'_best') 99 | T.save(self.state_dict(), checkpoint_file) 100 | -------------------------------------------------------------------------------- /PER/proportional/atari/wrappers.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import gym 5 | 6 | 7 | class RepeatActionAndMaxFrame(gym.Wrapper): 8 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 9 | fire_first=False): 10 | super(RepeatActionAndMaxFrame, self).__init__(env) 11 | self.repeat = repeat 12 | self.shape = env.observation_space.low.shape 13 | self.frame_buffer = np.zeros(shape=(2, *self.shape), dtype=np.float16) 14 | self.clip_reward = clip_reward 15 | self.no_ops = no_ops 16 | self.fire_first = fire_first 17 | 18 | def step(self, action): 19 | t_reward = 0.0 20 | done = False 21 | for i in range(self.repeat): 22 | obs, reward, done, info = self.env.step(action) 23 | if self.clip_reward: 24 | reward = np.clip(np.array([reward]), -1, 1)[0] 25 | t_reward += reward 26 | idx = i % 2 27 | self.frame_buffer[idx] = obs 28 | if done: 29 | break 30 | 31 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 32 | return max_frame, t_reward, done, info 33 | 34 | def reset(self): 35 | obs = self.env.reset() 36 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 37 | for _ in range(no_ops): 38 | _, _, done, _ = self.env.step(0) 39 | if done: 40 | self.env.reset() 41 | if self.fire_first: 42 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 43 | obs, _, _, _ = self.env.step(1) 44 | 45 | self.frame_buffer = np.zeros(shape=(2, *self.shape), dtype=np.float16) 46 | self.frame_buffer[0] = obs 47 | 48 | return obs 49 | 50 | 51 | class PreprocessFrame(gym.ObservationWrapper): 52 | def __init__(self, shape, env=None): 53 | super(PreprocessFrame, 
self).__init__(env) 54 | self.shape = (shape[2], shape[0], shape[1]) 55 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 56 | shape=self.shape, 57 | dtype=np.float16) 58 | 59 | def observation(self, obs): 60 | new_frame = cv2.cvtColor(obs.astype(np.uint8), cv2.COLOR_RGB2GRAY) 61 | resized_screen = cv2.resize(new_frame, self.shape[1:], 62 | interpolation=cv2.INTER_AREA) 63 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 64 | new_obs = new_obs / 255.0 65 | 66 | return new_obs.astype(np.float16) 67 | 68 | 69 | class StackFrames(gym.ObservationWrapper): 70 | def __init__(self, env, repeat): 71 | super(StackFrames, self).__init__(env) 72 | self.observation_space = gym.spaces.Box( 73 | env.observation_space.low.repeat(repeat, axis=0), 74 | env.observation_space.high.repeat(repeat, axis=0), 75 | dtype=np.float16) 76 | self.stack = collections.deque(maxlen=repeat) 77 | 78 | def reset(self): 79 | self.stack.clear() 80 | observation = self.env.reset() 81 | for _ in range(self.stack.maxlen): 82 | self.stack.append(observation) 83 | 84 | return np.array(self.stack, dtype=np.float16).reshape( 85 | self.observation_space.low.shape) 86 | 87 | def observation(self, observation): 88 | self.stack.append(observation) 89 | 90 | return np.array(self.stack).reshape(self.observation_space.low.shape) 91 | 92 | 93 | def make_env(env_name, shape=(84, 84, 1), repeat=4, clip_rewards=False, 94 | no_ops=0, fire_first=False): 95 | env = gym.make(env_name) 96 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, 97 | no_ops, fire_first) 98 | env = PreprocessFrame(shape, env) 99 | env = StackFrames(env, repeat) 100 | 101 | return env 102 | -------------------------------------------------------------------------------- /PER/proportional/memory.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | import numpy as np 4 | 5 | 6 | @dataclass 7 | class Node: 8 | value: float = 0.01 9 | total: float = 0.01 10 | 11 | def update_priority(self, priority: float): 12 | delta = priority - self.value 13 | self.value = priority 14 | self.total += delta 15 | return delta 16 | 17 | def update_total(self, delta: float): 18 | self.total += delta 19 | 20 | 21 | class SumTree: 22 | def __init__(self, max_size: int = 1_00_000, batch_size: int = 32, 23 | alpha: float = 0.5, beta: float = 0.5): 24 | self.counter = 0 25 | self.max_size = max_size 26 | self.batch_size = batch_size 27 | self.alpha = alpha 28 | self.beta = beta 29 | self.alpha_start = alpha 30 | self.beta_start = beta 31 | 32 | self.sum_tree = [] 33 | self.transitions = [] 34 | 35 | def _insert(self, transition: List): 36 | if self.counter < self.max_size: 37 | self.transitions.append(transition) 38 | self.sum_tree.append(Node()) 39 | else: 40 | index = self.counter % self.max_size 41 | self.transitions[index] = transition 42 | self.counter += 1 43 | 44 | def store_transition(self, transition: List): 45 | self._insert(transition) 46 | 47 | def _calculate_parents(self, index: int): 48 | parents = [] 49 | while index > 0: 50 | parents.append(int((index-1)//2)) 51 | index = int((index-1)//2) 52 | return parents 53 | 54 | def update_priorities(self, indices: List, priorities: List): 55 | self._propagate_changes(indices, priorities) 56 | 57 | def _propagate_changes(self, indices: List, priorities: List): 58 | for idx, p in zip(indices, priorities): 59 | delta = self.sum_tree[idx].update_priority(p**self.alpha) 60 | parents = 
self._calculate_parents(idx) 61 | for parent in parents: 62 | self.sum_tree[parent].update_total(delta) 63 | 64 | def _sample(self): 65 | total_weight = self.sum_tree[0].total 66 | 67 | if total_weight == 0.01: 68 | samples = np.random.choice(self.batch_size, self.batch_size, 69 | replace=False) 70 | probs = [1 / self.batch_size for _ in range(self.batch_size)] 71 | return samples, probs 72 | 73 | samples, probs, n_samples = [], [], 1 74 | index = self.counter % self.max_size - 1 75 | samples.append(index) 76 | probs.append(self.sum_tree[index].value / self.sum_tree[0].total) 77 | while n_samples < self.batch_size: 78 | index = 0 79 | target = total_weight * np.random.random() 80 | while True: 81 | left = 2 * index + 1 82 | right = 2 * index + 2 83 | if left > len(self.sum_tree) - 1\ 84 | or right > len(self.sum_tree) - 1: 85 | break 86 | left_sum = self.sum_tree[left].total 87 | if target < left_sum: 88 | index = left 89 | continue 90 | target -= left_sum 91 | right_sum = self.sum_tree[right].total 92 | if target < right_sum: 93 | index = right 94 | continue 95 | target -= right_sum 96 | break 97 | samples.append(index) 98 | n_samples += 1 99 | probs.append(self.sum_tree[index].value / self.sum_tree[0].total) 100 | return samples, probs 101 | 102 | def sample(self): 103 | samples, probs = self._sample() 104 | weights = self._calculate_weights(probs) 105 | mems = [self.transitions[s] for s in samples] 106 | return mems, samples, weights 107 | 108 | def _calculate_weights(self, probs: List): 109 | weights = np.array([(1 / self.counter * 1 / prob)**self.beta 110 | for prob in probs]) 111 | weights *= 1 / max(weights) 112 | return weights 113 | 114 | def ready(self): 115 | return self.counter >= self.batch_size 116 | 117 | def anneal_beta(self, ep: int, ep_max: int): 118 | self.beta = self.beta_start + ep / ep_max * (1 - self.beta_start) 119 | 120 | def anneal_alpha(self, ep: int, ep_max: int): 121 | self.alpha = self.alpha_start * (1 - ep / ep_max) 122 | -------------------------------------------------------------------------------- /HER/robotic/her.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import numpy as np 3 | 4 | 5 | class HER: 6 | def __init__(self, max_mem, input_shape, n_actions, goal_shape, batch_size, 7 | reward_fn, strategy='final', k=4): 8 | self.max_mem = max_mem 9 | self.strategy = strategy 10 | self.mem_cntr = 0 11 | self.batch_size = batch_size 12 | self.input_shape = input_shape 13 | self.reward_fn = reward_fn 14 | self.k = k 15 | self.lock = threading.Lock() 16 | 17 | self.states = np.zeros((max_mem, input_shape), 18 | dtype=np.float64) 19 | self.states_ = np.zeros((max_mem, input_shape), 20 | dtype=np.float64) 21 | self.actions = np.zeros((max_mem, n_actions), 22 | dtype=np.float32) 23 | self.rewards = np.zeros(max_mem, dtype=np.float32) 24 | self.dones = np.zeros(max_mem, dtype=np.bool) 25 | self.desired_goals = np.zeros((max_mem, goal_shape), dtype=np.float64) 26 | self.achieved_goals = np.zeros((max_mem, goal_shape), dtype=np.float64) 27 | self.achieved_goals_ = np.zeros((max_mem, goal_shape), 28 | dtype=np.float64) 29 | 30 | def store_memory(self, state, action, reward, state_, done, 31 | d_goal, a_goal, a_goal_): 32 | with self.lock: 33 | index = self.mem_cntr % self.max_mem 34 | self.states[index] = state 35 | self.states_[index] = state_ 36 | self.actions[index] = action 37 | self.rewards[index] = reward 38 | self.dones[index] = done 39 | self.desired_goals[index] = d_goal 40 | 
self.achieved_goals[index] = a_goal 41 | self.achieved_goals_[index] = a_goal_ 42 | self.mem_cntr += 1 43 | 44 | def store_episode(self, ep_memory): 45 | states, actions, rewards, states_, dones, dg, ag, ag_ = ep_memory 46 | hindsight_goals = [] 47 | 48 | if self.strategy == 'final': 49 | hindsight_goals = [[ag_[-1]]] * len(ag_) 50 | 51 | elif self.strategy is None: 52 | hindsight_goals = [[dg[0]]] * len(dg) 53 | 54 | elif self.strategy == 'future': 55 | for idx, _ in enumerate(ag_): 56 | t_step_goals = [] 57 | for m in range(self.k): 58 | if idx + m >= len(ag_) - 1: 59 | break 60 | goal_idx = np.random.randint(idx + 1, len(ag_)) 61 | t_step_goals.append(ag_[goal_idx]) 62 | hindsight_goals.append(t_step_goals) 63 | 64 | elif self.strategy == 'random': 65 | if self.mem_cntr <= len(ag_): 66 | max_mem = len(ag_) 67 | memory = ag_ 68 | else: 69 | max_mem = min(self.mem_cntr, self.max_mem) 70 | memory = self.achieved_goals_ 71 | for idx, _ in enumerate(ag_): 72 | t_step_goals = [] 73 | for m in range(self.k): 74 | goal_idx = np.random.randint(0, max_mem) 75 | t_step_goals.append(memory[goal_idx]) 76 | hindsight_goals.append(t_step_goals) 77 | 78 | elif self.strategy == 'episode': 79 | for idx, _ in enumerate(ag_): 80 | t_step_goals = [] 81 | for m in range(self.k): 82 | goal_idx = np.random.randint(0, len(ag_)) 83 | t_step_goals.append(ag_[goal_idx]) 84 | hindsight_goals.append(t_step_goals) 85 | 86 | for idx, s in enumerate(states): 87 | self.store_memory(s, actions[idx], rewards[idx], states_[idx], 88 | dones[idx], dg[idx], ag[idx], ag_[idx]) 89 | for goal in hindsight_goals[idx]: 90 | reward = self.reward_fn(ag_[idx], goal, {}) 91 | self.store_memory(s, actions[idx], reward, states_[idx], 92 | dones[idx], goal, ag[idx], ag_[idx]) 93 | 94 | def sample_memory(self): 95 | with self.lock: 96 | last_mem = min(self.mem_cntr, self.max_mem) 97 | batch = np.random.choice(last_mem, self.batch_size, replace=False) 98 | 99 | return self.states[batch], self.actions[batch], self.rewards[batch],\ 100 | self.states_[batch], self.dones[batch],\ 101 | self.desired_goals[batch] 102 | 103 | def ready(self): 104 | return self.mem_cntr > self.batch_size 105 | -------------------------------------------------------------------------------- /PER/proportional/atari/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from network import DeepQNetwork 4 | from memory import SumTree 5 | 6 | 7 | class Agent: 8 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 9 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 10 | replace=1000, alpha=0.5, beta=0, r_iter=32, 11 | algo=None, env_name=None, chkpt_dir='tmp/dqn'): 12 | self.gamma = gamma 13 | self.epsilon = epsilon 14 | self.lr = lr 15 | self.n_actions = n_actions 16 | self.input_dims = input_dims 17 | self.batch_size = batch_size 18 | self.eps_min = eps_min 19 | self.eps_dec = eps_dec 20 | self.replace_target_cnt = replace 21 | self.algo = algo 22 | self.env_name = env_name 23 | self.chkpt_dir = chkpt_dir 24 | self.action_space = [i for i in range(n_actions)] 25 | self.learn_step_counter = 0 26 | self.rebalance_iter = r_iter 27 | 28 | self.memory = SumTree(mem_size, batch_size, alpha=alpha, beta=beta) 29 | 30 | self.q_eval = DeepQNetwork(self.lr, self.n_actions, 31 | input_dims=self.input_dims, 32 | name=self.env_name+'_q_eval', 33 | chkpt_dir=self.chkpt_dir) 34 | 35 | self.q_next = DeepQNetwork(self.lr, self.n_actions, 36 | input_dims=self.input_dims, 37 | 
name=self.env_name+'_q_next', 38 | chkpt_dir=self.chkpt_dir) 39 | 40 | def choose_action(self, observation): 41 | if np.random.random() > self.epsilon: 42 | state = T.tensor([observation], 43 | dtype=T.float).to(self.q_eval.device) 44 | actions = self.q_eval.forward(state) 45 | action = T.argmax(actions).item() 46 | else: 47 | action = np.random.choice(self.action_space) 48 | 49 | return action 50 | 51 | def store_transition(self, state, action, reward, state_, done): 52 | self.memory.store_transition([state, action, reward, state_, done]) 53 | 54 | def sample_memory(self): 55 | sarsd, sample_idx, weights = self.memory.sample() 56 | 57 | states, actions, rewards, states_, dones = sarsd 58 | 59 | states = T.tensor(states, dtype=T.float).to(self.q_eval.device) 60 | rewards = T.tensor(rewards, dtype=T.float).to(self.q_eval.device) 61 | dones = T.tensor(dones).to(self.q_eval.device) 62 | actions = T.tensor(actions).to(self.q_eval.device) 63 | states_ = T.tensor(states_, dtype=T.float).to(self.q_eval.device) 64 | 65 | weights = T.tensor(weights, dtype=T.float).to(self.q_eval.device) 66 | 67 | return states, actions, rewards, states_, dones, sample_idx, weights 68 | 69 | def replace_target_network(self): 70 | if self.learn_step_counter % self.replace_target_cnt == 0: 71 | self.q_next.load_state_dict(self.q_eval.state_dict()) 72 | 73 | def decrement_epsilon(self): 74 | self.epsilon = self.epsilon - self.eps_dec \ 75 | if self.epsilon > self.eps_min else self.eps_min 76 | 77 | def save_models(self): 78 | self.q_eval.save_checkpoint() 79 | self.q_next.save_checkpoint() 80 | 81 | def load_models(self): 82 | self.q_eval.load_checkpoint() 83 | self.q_next.load_checkpoint() 84 | 85 | def learn(self): 86 | if not self.memory.ready(): 87 | return 88 | 89 | self.q_eval.optimizer.zero_grad() 90 | 91 | self.replace_target_network() 92 | 93 | states, actions, rewards, states_, dones,\ 94 | sample_idx, weights = self.sample_memory() 95 | indices = np.arange(self.batch_size) 96 | 97 | q_pred = self.q_eval.forward(states)[indices, actions] 98 | q_next = self.q_next.forward(states_) 99 | q_eval = self.q_eval.forward(states_) 100 | 101 | max_actions = T.argmax(q_eval, dim=1) 102 | q_next[dones] = 0.0 103 | q_target = rewards + self.gamma * q_next[indices, max_actions] 104 | 105 | td_error = np.abs((q_target.detach().cpu().numpy() - 106 | q_pred.detach().cpu().numpy())) 107 | td_error = np.clip(td_error, 0., 1.) 
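        # The clipped absolute TD error is written back as each sampled transition's
        # new priority (update_priorities() stores (|delta| + 1e-3)**alpha in the sum
        # tree), and the importance-sampling weights returned by sample() are applied
        # to both the targets and the predictions before the MSE loss below.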
108 | 109 | self.memory.update_priorities(sample_idx, td_error) 110 | 111 | q_target *= weights 112 | q_pred *= weights 113 | 114 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 115 | loss.backward() 116 | self.q_eval.optimizer.step() 117 | self.learn_step_counter += 1 118 | self.decrement_epsilon() 119 | -------------------------------------------------------------------------------- /PER/ranked/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 38 | fire_first=False): 39 | super(RepeatActionAndMaxFrame, self).__init__(env) 40 | self.repeat = repeat 41 | self.shape = env.observation_space.low.shape 42 | self.frame_buffer = np.zeros_like((2, self.shape)) 43 | self.clip_reward = clip_reward 44 | self.no_ops = no_ops 45 | self.fire_first = fire_first 46 | 47 | def step(self, action): 48 | t_reward = 0.0 49 | done = False 50 | for i in range(self.repeat): 51 | obs, reward, done, info = self.env.step(action) 52 | if self.clip_reward: 53 | reward = np.clip(np.array([reward]), -1, 1)[0] 54 | t_reward += reward 55 | idx = i % 2 56 | self.frame_buffer[idx] = obs 57 | if done: 58 | break 59 | 60 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 61 | return max_frame, t_reward, done, info 62 | 63 | def reset(self): 64 | obs = self.env.reset() 65 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 66 | for _ in range(no_ops): 67 | _, _, done, _ = self.env.step(0) 68 | if done: 69 | self.env.reset() 70 | if self.fire_first: 71 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 72 | obs, _, _, _ = self.env.step(1) 73 | 74 | self.frame_buffer = np.zeros_like((2,self.shape)) 75 | self.frame_buffer[0] = obs 76 | 77 | return obs 78 | 79 | class PreprocessFrame(gym.ObservationWrapper): 80 | def __init__(self, shape, env=None): 81 | super(PreprocessFrame, self).__init__(env) 82 | self.shape = (shape[2], shape[0], shape[1]) 83 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 84 | shape=self.shape, dtype=np.float32) 85 | 86 | def observation(self, obs): 87 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 88 | resized_screen = cv2.resize(new_frame, self.shape[1:], 89 | interpolation=cv2.INTER_AREA) 90 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 91 | new_obs = new_obs / 255.0 92 | 93 | return new_obs 94 | 95 | class 
StackFrames(gym.ObservationWrapper): 96 | def __init__(self, env, repeat): 97 | super(StackFrames, self).__init__(env) 98 | self.observation_space = gym.spaces.Box( 99 | env.observation_space.low.repeat(repeat, axis=0), 100 | env.observation_space.high.repeat(repeat, axis=0), 101 | dtype=np.float32) 102 | self.stack = collections.deque(maxlen=repeat) 103 | 104 | def reset(self): 105 | self.stack.clear() 106 | observation = self.env.reset() 107 | for _ in range(self.stack.maxlen): 108 | self.stack.append(observation) 109 | 110 | return np.array(self.stack).reshape(self.observation_space.low.shape) 111 | 112 | def observation(self, observation): 113 | self.stack.append(observation) 114 | 115 | return np.array(self.stack).reshape(self.observation_space.low.shape) 116 | 117 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 118 | no_ops=0, fire_first=False): 119 | env = gym.make(env_name) 120 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 121 | env = PreprocessFrame(shape, env) 122 | env = StackFrames(env, repeat) 123 | 124 | return env 125 | -------------------------------------------------------------------------------- /PER/proportional/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 38 | fire_first=False): 39 | super(RepeatActionAndMaxFrame, self).__init__(env) 40 | self.repeat = repeat 41 | self.shape = env.observation_space.low.shape 42 | self.frame_buffer = np.zeros_like((2, self.shape)) 43 | self.clip_reward = clip_reward 44 | self.no_ops = no_ops 45 | self.fire_first = fire_first 46 | 47 | def step(self, action): 48 | t_reward = 0.0 49 | done = False 50 | for i in range(self.repeat): 51 | obs, reward, done, info = self.env.step(action) 52 | if self.clip_reward: 53 | reward = np.clip(np.array([reward]), -1, 1)[0] 54 | t_reward += reward 55 | idx = i % 2 56 | self.frame_buffer[idx] = obs 57 | if done: 58 | break 59 | 60 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 61 | return max_frame, t_reward, done, info 62 | 63 | def reset(self): 64 | obs = self.env.reset() 65 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 66 | for _ in range(no_ops): 67 | _, _, done, _ = self.env.step(0) 68 | if done: 69 | self.env.reset() 70 | if self.fire_first: 71 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 72 | obs, _, 
_, _ = self.env.step(1) 73 | 74 | self.frame_buffer = np.zeros_like((2,self.shape)) 75 | self.frame_buffer[0] = obs 76 | 77 | return obs 78 | 79 | class PreprocessFrame(gym.ObservationWrapper): 80 | def __init__(self, shape, env=None): 81 | super(PreprocessFrame, self).__init__(env) 82 | self.shape = (shape[2], shape[0], shape[1]) 83 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 84 | shape=self.shape, dtype=np.float32) 85 | 86 | def observation(self, obs): 87 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 88 | resized_screen = cv2.resize(new_frame, self.shape[1:], 89 | interpolation=cv2.INTER_AREA) 90 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 91 | new_obs = new_obs / 255.0 92 | 93 | return new_obs 94 | 95 | class StackFrames(gym.ObservationWrapper): 96 | def __init__(self, env, repeat): 97 | super(StackFrames, self).__init__(env) 98 | self.observation_space = gym.spaces.Box( 99 | env.observation_space.low.repeat(repeat, axis=0), 100 | env.observation_space.high.repeat(repeat, axis=0), 101 | dtype=np.float32) 102 | self.stack = collections.deque(maxlen=repeat) 103 | 104 | def reset(self): 105 | self.stack.clear() 106 | observation = self.env.reset() 107 | for _ in range(self.stack.maxlen): 108 | self.stack.append(observation) 109 | 110 | return np.array(self.stack).reshape(self.observation_space.low.shape) 111 | 112 | def observation(self, observation): 113 | self.stack.append(observation) 114 | 115 | return np.array(self.stack).reshape(self.observation_space.low.shape) 116 | 117 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 118 | no_ops=0, fire_first=False): 119 | env = gym.make(env_name) 120 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 121 | env = PreprocessFrame(shape, env) 122 | env = StackFrames(env, repeat) 123 | 124 | return env 125 | -------------------------------------------------------------------------------- /PER/proportional/atari/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 38 | fire_first=False): 39 | super(RepeatActionAndMaxFrame, self).__init__(env) 40 | self.repeat = repeat 41 | self.shape = env.observation_space.low.shape 42 | self.frame_buffer = np.zeros_like((2, self.shape)) 43 | self.clip_reward = clip_reward 44 | self.no_ops = no_ops 45 | self.fire_first = fire_first 46 | 47 | def 
step(self, action): 48 | t_reward = 0.0 49 | done = False 50 | for i in range(self.repeat): 51 | obs, reward, done, info = self.env.step(action) 52 | if self.clip_reward: 53 | reward = np.clip(np.array([reward]), -1, 1)[0] 54 | t_reward += reward 55 | idx = i % 2 56 | self.frame_buffer[idx] = obs 57 | if done: 58 | break 59 | 60 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 61 | return max_frame, t_reward, done, info 62 | 63 | def reset(self): 64 | obs = self.env.reset() 65 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 66 | for _ in range(no_ops): 67 | _, _, done, _ = self.env.step(0) 68 | if done: 69 | self.env.reset() 70 | if self.fire_first: 71 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 72 | obs, _, _, _ = self.env.step(1) 73 | 74 | self.frame_buffer = np.zeros_like((2,self.shape)) 75 | self.frame_buffer[0] = obs 76 | 77 | return obs 78 | 79 | class PreprocessFrame(gym.ObservationWrapper): 80 | def __init__(self, shape, env=None): 81 | super(PreprocessFrame, self).__init__(env) 82 | self.shape = (shape[2], shape[0], shape[1]) 83 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 84 | shape=self.shape, dtype=np.float32) 85 | 86 | def observation(self, obs): 87 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 88 | resized_screen = cv2.resize(new_frame, self.shape[1:], 89 | interpolation=cv2.INTER_AREA) 90 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 91 | new_obs = new_obs / 255.0 92 | 93 | return new_obs 94 | 95 | class StackFrames(gym.ObservationWrapper): 96 | def __init__(self, env, repeat): 97 | super(StackFrames, self).__init__(env) 98 | self.observation_space = gym.spaces.Box( 99 | env.observation_space.low.repeat(repeat, axis=0), 100 | env.observation_space.high.repeat(repeat, axis=0), 101 | dtype=np.float32) 102 | self.stack = collections.deque(maxlen=repeat) 103 | 104 | def reset(self): 105 | self.stack.clear() 106 | observation = self.env.reset() 107 | for _ in range(self.stack.maxlen): 108 | self.stack.append(observation) 109 | 110 | return np.array(self.stack).reshape(self.observation_space.low.shape) 111 | 112 | def observation(self, observation): 113 | self.stack.append(observation) 114 | 115 | return np.array(self.stack).reshape(self.observation_space.low.shape) 116 | 117 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 118 | no_ops=0, fire_first=False): 119 | env = gym.make(env_name) 120 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 121 | env = PreprocessFrame(shape, env) 122 | env = StackFrames(env, repeat) 123 | 124 | return env 125 | -------------------------------------------------------------------------------- /PER/proportional/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from network import LinearDeepQNetwork 4 | from memory import SumTree 5 | 6 | 7 | class Agent: 8 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 9 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 10 | replace=1000, alpha=0.5, beta=0, r_iter=32, 11 | algo=None, env_name=None, chkpt_dir='tmp/dqn'): 12 | self.gamma = gamma 13 | self.epsilon = epsilon 14 | self.lr = lr 15 | self.n_actions = n_actions 16 | self.input_dims = input_dims 17 | self.batch_size = batch_size 18 | self.eps_min = eps_min 19 | self.eps_dec = eps_dec 20 | self.replace_target_cnt = replace 21 | self.algo = algo 22 | self.env_name = env_name 23 | self.chkpt_dir 
= chkpt_dir 24 | self.action_space = [i for i in range(n_actions)] 25 | self.learn_step_counter = 0 26 | self.rebalance_iter = r_iter 27 | 28 | self.memory = SumTree(mem_size, batch_size, alpha=alpha, beta=beta) 29 | 30 | self.q_eval = LinearDeepQNetwork(self.lr, self.n_actions, 31 | input_dims=self.input_dims, 32 | name=self.env_name+'_q_eval', 33 | chkpt_dir=self.chkpt_dir) 34 | 35 | self.q_next = LinearDeepQNetwork(self.lr, self.n_actions, 36 | input_dims=self.input_dims, 37 | name=self.env_name+'_q_next', 38 | chkpt_dir=self.chkpt_dir) 39 | 40 | def choose_action(self, observation): 41 | if np.random.random() > self.epsilon: 42 | state = T.tensor([observation], 43 | dtype=T.float).to(self.q_eval.device) 44 | actions = self.q_eval.forward(state) 45 | action = T.argmax(actions).item() 46 | else: 47 | action = np.random.choice(self.action_space) 48 | 49 | return action 50 | 51 | def store_transition(self, state, action, reward, state_, done): 52 | self.memory.store_transition([state, action, reward, state_, done]) 53 | 54 | def sample_memory(self): 55 | sarsd, sample_idx, weights = self.memory.sample() 56 | 57 | states = np.array([row[0] for row in sarsd]) 58 | actions = np.array([row[1] for row in sarsd]) 59 | rewards = np.array([row[2] for row in sarsd]) 60 | states_ = np.array([row[3] for row in sarsd]) 61 | dones = np.array([row[4] for row in sarsd]) 62 | 63 | states = T.tensor(states, dtype=T.float).to(self.q_eval.device) 64 | rewards = T.tensor(rewards, dtype=T.float).to(self.q_eval.device) 65 | dones = T.tensor(dones).to(self.q_eval.device) 66 | actions = T.tensor(actions).to(self.q_eval.device) 67 | states_ = T.tensor(states_, dtype=T.float).to(self.q_eval.device) 68 | 69 | weights = T.tensor(weights, dtype=T.float).to(self.q_eval.device) 70 | 71 | return states, actions, rewards, states_, dones, sample_idx, weights 72 | 73 | def replace_target_network(self): 74 | if self.learn_step_counter % self.replace_target_cnt == 0: 75 | self.q_next.load_state_dict(self.q_eval.state_dict()) 76 | 77 | def decrement_epsilon(self): 78 | self.epsilon = self.epsilon - self.eps_dec \ 79 | if self.epsilon > self.eps_min else self.eps_min 80 | 81 | def save_models(self): 82 | self.q_eval.save_checkpoint() 83 | self.q_next.save_checkpoint() 84 | 85 | def load_models(self): 86 | self.q_eval.load_checkpoint() 87 | self.q_next.load_checkpoint() 88 | 89 | def learn(self): 90 | if not self.memory.ready(): 91 | return 92 | 93 | self.q_eval.optimizer.zero_grad() 94 | 95 | self.replace_target_network() 96 | 97 | states, actions, rewards, states_, dones,\ 98 | sample_idx, weights = self.sample_memory() 99 | indices = np.arange(self.batch_size) 100 | 101 | q_pred = self.q_eval.forward(states)[indices, actions] 102 | q_next = self.q_next.forward(states_) 103 | q_eval = self.q_eval.forward(states_) 104 | 105 | max_actions = T.argmax(q_eval, dim=1) 106 | q_next[dones] = 0.0 107 | q_target = rewards + self.gamma * q_next[indices, max_actions] 108 | 109 | td_error = np.abs((q_target.detach().cpu().numpy() - 110 | q_pred.detach().cpu().numpy())) 111 | td_error = np.clip(td_error, 0., 1.) 
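        # Multiplying both q_target and q_pred by the importance weights scales each
        # squared TD error by weight**2; the weighting in Schaul et al. applies each
        # weight once, which would instead look like
        # loss = (weights * (q_target - q_pred) ** 2).mean()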
112 | 113 | self.memory.update_priorities(sample_idx, td_error) 114 | 115 | q_target *= weights 116 | q_pred *= weights 117 | 118 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 119 | loss.backward() 120 | self.q_eval.optimizer.step() 121 | self.learn_step_counter += 1 122 | self.decrement_epsilon() 123 | -------------------------------------------------------------------------------- /PER/ranked/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from network import LinearDeepQNetwork 4 | from memory import MaxHeap 5 | 6 | 7 | class DQNAgent: 8 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 9 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 10 | replace=1000, alpha=0.5, beta=0, r_iter=32, 11 | algo=None, env_name=None, chkpt_dir='tmp/dqn'): 12 | self.gamma = gamma 13 | self.epsilon = epsilon 14 | self.lr = lr 15 | self.n_actions = n_actions 16 | self.input_dims = input_dims 17 | self.batch_size = batch_size 18 | self.eps_min = eps_min 19 | self.eps_dec = eps_dec 20 | self.replace_target_cnt = replace 21 | self.algo = algo 22 | self.env_name = env_name 23 | self.chkpt_dir = chkpt_dir 24 | self.action_space = [i for i in range(n_actions)] 25 | self.learn_step_counter = 0 26 | self.rebalance_iter = r_iter 27 | 28 | self.memory = MaxHeap(mem_size, batch_size, alpha=alpha, beta=beta, 29 | r_iter=r_iter) 30 | 31 | self.q_eval = LinearDeepQNetwork(self.lr, self.n_actions, 32 | input_dims=self.input_dims, 33 | name=self.env_name+'_'+self.algo+'_q_eval', 34 | chkpt_dir=self.chkpt_dir) 35 | 36 | self.q_next = LinearDeepQNetwork(self.lr, self.n_actions, 37 | input_dims=self.input_dims, 38 | name=self.env_name+'_'+self.algo+'_q_next', 39 | chkpt_dir=self.chkpt_dir) 40 | 41 | def choose_action(self, observation): 42 | if np.random.random() > self.epsilon: 43 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 44 | actions = self.q_eval.forward(state) 45 | action = T.argmax(actions).item() 46 | else: 47 | action = np.random.choice(self.action_space) 48 | 49 | return action 50 | 51 | def store_transition(self, state, action, reward, state_, done): 52 | self.memory.store_transition([state, action, reward, state_, done]) 53 | 54 | def sample_memory(self): 55 | sarsd, sample_idx, weights = self.memory.sample() 56 | states = np.array([row[0] for row in sarsd]) 57 | actions = np.array([row[1] for row in sarsd]) 58 | rewards = np.array([row[2] for row in sarsd]) 59 | states_ = np.array([row[3] for row in sarsd]) 60 | dones = np.array([row[4] for row in sarsd]) 61 | 62 | states = T.tensor(states, dtype=T.float).to(self.q_eval.device) 63 | rewards = T.tensor(rewards, dtype=T.float).to(self.q_eval.device) 64 | dones = T.tensor(dones).to(self.q_eval.device) 65 | actions = T.tensor(actions).to(self.q_eval.device) 66 | states_ = T.tensor(states_, dtype=T.float).to(self.q_eval.device) 67 | 68 | weights = T.tensor(weights, dtype=T.float).to(self.q_eval.device) 69 | 70 | return states, actions, rewards, states_, dones, sample_idx, weights 71 | 72 | def replace_target_network(self): 73 | if self.learn_step_counter % self.replace_target_cnt == 0: 74 | self.q_next.load_state_dict(self.q_eval.state_dict()) 75 | 76 | def decrement_epsilon(self): 77 | self.epsilon = self.epsilon - self.eps_dec \ 78 | if self.epsilon > self.eps_min else self.eps_min 79 | 80 | def save_models(self): 81 | self.q_eval.save_checkpoint() 82 | self.q_next.save_checkpoint() 83 | 84 | def 
load_models(self): 85 | self.q_eval.load_checkpoint() 86 | self.q_next.load_checkpoint() 87 | 88 | def rebalance_heap(self): 89 | if self.rebalance_iter > 1: 90 | if self.learn_step_counter % self.rebalance_iter == 0: 91 | self.memory.rebalance_heap() 92 | self.memory._update_ranks() 93 | self.memory.compute_probs() 94 | 95 | def learn(self): 96 | if not self.memory.ready(): 97 | return 98 | 99 | self.q_eval.optimizer.zero_grad() 100 | 101 | self.replace_target_network() 102 | 103 | self.rebalance_heap() 104 | 105 | states, actions, rewards, states_, dones,\ 106 | sample_idx, weights = self.sample_memory() 107 | indices = np.arange(self.batch_size) 108 | q_pred = self.q_eval.forward(states)[indices, actions] 109 | 110 | q_next = self.q_next.forward(states_).max(dim=1)[0] 111 | q_next[dones] = 0.0 112 | q_target = rewards + self.gamma*q_next 113 | 114 | td_error = np.abs((q_target.detach().cpu().numpy() - 115 | q_pred.detach().cpu().numpy())) 116 | td_error = np.clip(td_error, -1., 1.) 117 | 118 | self.memory.update_priorities(sample_idx, td_error) 119 | 120 | q_target *= weights 121 | q_pred *= weights 122 | 123 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 124 | loss.backward() 125 | self.q_eval.optimizer.step() 126 | self.learn_step_counter += 1 127 | self.decrement_epsilon() 128 | -------------------------------------------------------------------------------- /PER/proportional/atari/memory.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List 3 | import numpy as np 4 | 5 | 6 | @dataclass 7 | class Node: 8 | value: float = 0.01 9 | total: float = 0.01 10 | 11 | def update_priority(self, priority: float): 12 | delta = priority - self.value 13 | self.value = priority 14 | self.total += delta 15 | return delta 16 | 17 | def update_total(self, delta: float): 18 | self.total += delta 19 | 20 | 21 | class SumTree: 22 | def __init__(self, max_size: int = 1_00_000, batch_size: int = 32, 23 | alpha: float = 0.5, beta: float = 0.5, 24 | input_shape=(4, 84, 84)): 25 | self.counter = 0 26 | self.max_size = max_size 27 | self.batch_size = batch_size 28 | self.alpha = alpha 29 | self.beta = beta 30 | self.alpha_start = alpha 31 | self.beta_start = beta 32 | 33 | self.sum_tree = [] 34 | self.states = np.zeros(shape=(max_size, *input_shape), 35 | dtype=np.float16) 36 | self.actions = np.zeros(shape=(max_size,), dtype=np.int64) 37 | self.rewards = np.zeros(shape=(max_size,), dtype=np.float16) 38 | self.states_ = np.zeros(shape=(max_size, *input_shape), 39 | dtype=np.float16) 40 | self.dones = np.zeros(shape=(max_size,), dtype=np.bool) 41 | 42 | def _insert(self, transition: List): 43 | state, action, reward, state_, done = transition 44 | index = self.counter % self.max_size 45 | self.states[index] = state 46 | self.actions[index] = action 47 | self.rewards[index] = reward 48 | self.states_[index] = state_ 49 | self.dones[index] = done 50 | if self.counter < self.max_size: 51 | self.sum_tree.append(Node()) 52 | self.counter += 1 53 | 54 | def store_transition(self, transition: List): 55 | self._insert(transition) 56 | 57 | def _calculate_parents(self, index: int): 58 | parents = [] 59 | while index > 0: 60 | parents.append(int((index-1)//2)) 61 | index = int((index-1)//2) 62 | return parents 63 | 64 | def update_priorities(self, indices: List, priorities: List): 65 | self._propagate_changes(indices, priorities) 66 | 67 | def _propagate_changes(self, indices: List, priorities: List): 68 | 
for idx, p in zip(indices, priorities): 69 | delta = self.sum_tree[idx].update_priority((p+1e-3)**self.alpha) 70 | parents = self._calculate_parents(idx) 71 | for parent in parents: 72 | self.sum_tree[parent].update_total(delta) 73 | 74 | def _sample(self): 75 | total_weight = self.sum_tree[0].total 76 | 77 | if total_weight == 0.01: 78 | samples = np.random.choice(self.batch_size, self.batch_size, 79 | replace=False) 80 | probs = [1 / self.batch_size for _ in range(self.batch_size)] 81 | return samples, probs 82 | 83 | samples, probs, n_samples = [], [], 1 84 | index = self.counter % self.max_size - 1 85 | samples.append(index) 86 | probs.append(self.sum_tree[index].value / self.sum_tree[0].total) 87 | while n_samples < self.batch_size: 88 | index = 0 89 | target = total_weight * np.random.random() 90 | while True: 91 | left = 2 * index + 1 92 | right = 2 * index + 2 93 | if left > len(self.sum_tree) - 1\ 94 | or right > len(self.sum_tree) - 1: 95 | break 96 | left_sum = self.sum_tree[left].total 97 | if target < left_sum: 98 | index = left 99 | continue 100 | target -= left_sum 101 | right_sum = self.sum_tree[right].total 102 | if target < right_sum: 103 | index = right 104 | continue 105 | target -= right_sum 106 | break 107 | samples.append(index) 108 | n_samples += 1 109 | probs.append(self.sum_tree[index].value / self.sum_tree[0].total) 110 | return samples, probs 111 | 112 | def sample(self): 113 | samples, probs = self._sample() 114 | weights = self._calculate_weights(probs) 115 | mems = [self.states[samples], self.actions[samples], 116 | self.rewards[samples], self.states_[samples], 117 | self.dones[samples]] 118 | return mems, samples, weights 119 | 120 | def _calculate_weights(self, probs: List): 121 | weights = np.array([(1 / self.counter * 1 / prob)**self.beta 122 | for prob in probs]) 123 | weights *= 1 / max(weights) 124 | return weights 125 | 126 | def ready(self): 127 | return self.counter >= self.batch_size 128 | 129 | def anneal_beta(self, ep: int, ep_max: int): 130 | self.beta = self.beta_start + ep / ep_max * (1 - self.beta_start) 131 | 132 | def anneal_alpha(self, ep: int, ep_max: int): 133 | self.alpha = self.alpha_start * (1 - ep / ep_max) 134 | -------------------------------------------------------------------------------- /PER/ranked/memory.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from dataclasses import dataclass, field 3 | from typing import List 4 | import numpy as np 5 | 6 | 7 | @dataclass 8 | class MemoryCell: 9 | priority: float 10 | rank: int 11 | transition: List[np.array] = field(repr=False) 12 | 13 | def update_priority(self, new_priority: float): 14 | self.priority = new_priority 15 | 16 | def update_rank(self, new_rank: int): 17 | self.rank = new_rank 18 | 19 | def __gt__(self, other): 20 | return self.priority > other.priority 21 | 22 | def __ge__(self, other): 23 | return self.priority >= other.priority 24 | 25 | def __lt__(self, other): 26 | return self.priority < other.priority 27 | 28 | def __le__(self, other): 29 | return self.priority < other.priority 30 | 31 | 32 | class MaxHeap: 33 | def __init__(self, max_size: int = 1e6, n_batches: int = 32, 34 | alpha: float = 0.5, beta: float = 0, r_iter: int = 32): 35 | self.array: List[MemoryCell] = [] 36 | self.max_size = max_size 37 | self.mem_cntr: int = 0 38 | self.n_batches = n_batches 39 | self.alpha = alpha 40 | self.beta = beta 41 | self.beta_start = beta 42 | self.alpha_start = alpha 43 | self.r_iter = r_iter 44 | 
self._precompute_indices() 45 | 46 | def store_transition(self, sarsd: List[np.array]): 47 | priority = 10 48 | rank = 1 49 | transition = MemoryCell(priority, rank, sarsd) 50 | self._insert(transition) 51 | 52 | def _insert(self, transition: MemoryCell): 53 | if self.mem_cntr < self.max_size: 54 | self.array.append(transition) 55 | else: 56 | index = self.mem_cntr % self.max_size 57 | self.array[index] = transition 58 | self.mem_cntr += 1 59 | 60 | def _update_ranks(self): 61 | array = deepcopy(self.array) 62 | indices = [i for i in range(len(array))] 63 | sorted_array = [list(x) for x in zip(*sorted(zip(array, indices), 64 | key=lambda pair: pair[0], 65 | reverse=True))] 66 | 67 | for index, value in enumerate(sorted_array[1]): 68 | self.array[value].rank = index + 1 69 | 70 | def print_array(self, a=None): 71 | array = self.array if a is None else a 72 | for cell in array: 73 | print(cell) 74 | print('\n') 75 | 76 | def _max_heapify(self, array: List[MemoryCell], i: int, N: int = None): 77 | N = len(array) if N is None else N 78 | left = 2 * i + 1 79 | right = 2 * i + 2 80 | largest = i 81 | if left < N and array[left] > array[i]: 82 | largest = left 83 | if right < N and array[right] > array[largest]: 84 | largest = right 85 | if largest != i: 86 | array[i], array[largest] = array[largest], array[i] 87 | self._max_heapify(array, largest, N) 88 | return array 89 | 90 | def _build_max_heap(self): 91 | array = deepcopy(self.array) 92 | N = len(array) 93 | for i in range(N//2, -1, -1): 94 | array = self._max_heapify(array, i) 95 | return array 96 | 97 | def rebalance_heap(self): 98 | self.array = self._build_max_heap() 99 | 100 | def update_priorities(self, indices: List[int], priorities: List[float]): 101 | for idx, index in enumerate(indices): 102 | self.array[index].update_priority(priorities[idx]) 103 | 104 | def ready(self): 105 | return self.mem_cntr >= self.n_batches 106 | 107 | def anneal_beta(self, ep: int, ep_max: int): 108 | self.beta = self.beta_start + ep / ep_max * (1 - self.beta_start) 109 | 110 | def anneal_alpha(self, ep: int, ep_max: int): 111 | self.alpha = self.alpha_start * (1 - ep / ep_max) 112 | 113 | def _precompute_indices(self): 114 | print('precomputing indices') 115 | self.indices = [] 116 | n_batches = self.n_batches if self.r_iter > 1 else self.r_iter 117 | start = [i for i in range(n_batches, self.max_size + 1, n_batches)] 118 | for start_idx in start: 119 | bs = start_idx // n_batches 120 | indices = np.array([[j * bs + k for k in range(bs)] 121 | for j in range(n_batches)], dtype=np.int16) 122 | self.indices.append(indices) 123 | 124 | def compute_probs(self): 125 | self.probs = [] 126 | n_batches = self.n_batches if self.r_iter > 1 else self.r_iter 127 | idx = min(self.mem_cntr, self.max_size) // n_batches - 1 128 | for indices in self.indices[idx]: 129 | probs = [] 130 | for index in indices: 131 | p = 1 / (self.array[index].rank)**self.alpha 132 | probs.append(p) 133 | z = [p / sum(probs) for p in probs] 134 | self.probs.append(z) 135 | 136 | def _calculate_weights(self, probs: List): 137 | weights = np.array([(1 / self.mem_cntr * 1 / prob)**self.beta 138 | for prob in probs]) 139 | weights *= 1 / (max(weights)) 140 | return weights 141 | 142 | def sample(self): 143 | n_batches = self.n_batches if self.r_iter > 1 else self.r_iter 144 | idx = min(self.mem_cntr, self.max_size) // n_batches - 1 145 | if self.r_iter != 1: 146 | samples = [np.random.choice(self.indices[idx][row], 147 | p=self.probs[row]) 148 | for row in range(len(self.indices[idx]))] 149 | p 
= [val for row in self.probs for val in row] 150 | probs = [p[s] for s in samples] 151 | else: 152 | samples = np.random.choice(self.indices[idx][0], self.n_batches) 153 | probs = [1 / len(samples) for _ in range(len(samples))] 154 | weights = self._calculate_weights(probs) 155 | mems = np.array([self.array[s] for s in samples]) 156 | sarsd = [] 157 | for item in mems: 158 | row = [] 159 | for i in range(len(item.transition)): 160 | row.append(np.array(item.transition[i])) 161 | sarsd.append(row) 162 | return sarsd, samples, weights 163 | -------------------------------------------------------------------------------- /HER/robotic/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | import torch.nn.functional as F 4 | from torch.distributions.normal import Normal 5 | from networks import ActorNetwork, CriticNetwork 6 | from normalizer import Normalizer 7 | from utils import sync_networks, sync_grads 8 | 9 | 10 | class Agent: 11 | def __init__(self, alpha, beta, input_dims, tau, n_actions, action_space, 12 | gamma=0.99, action_noise=0.05, explore=0.2, obs_shape=[8], 13 | goal_shape=[3], max_size=1_000_000, fc1_dims=256, 14 | fc2_dims=256, fc3_dims=256): 15 | self.gamma = gamma 16 | self.tau = tau 17 | self.alpha = alpha 18 | self.beta = beta 19 | self.action_space = action_space 20 | self.n_actions = n_actions 21 | self.limit = -1 / (1 - self.gamma) 22 | self.action_noise = action_noise * self.action_space.high 23 | self.explore = explore 24 | 25 | self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, 26 | fc3_dims, n_actions=n_actions, 27 | name='actor') 28 | self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims, 29 | fc3_dims, n_actions=n_actions, 30 | name='critic') 31 | 32 | self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, 33 | fc3_dims, n_actions=n_actions, 34 | name='target_actor') 35 | 36 | self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, 37 | fc2_dims, fc3_dims, 38 | n_actions=n_actions, 39 | name='target_critic') 40 | 41 | self.noise = Normal(T.zeros(n_actions), T.tensor(self.action_noise)) 42 | 43 | self.update_network_parameters(tau=1) 44 | 45 | sync_networks(self.actor) 46 | sync_networks(self.critic) 47 | 48 | self.obs_stats = Normalizer(obs_shape, 0.01, 5) 49 | self.goal_stats = Normalizer(goal_shape, 0.01, 5) 50 | 51 | def choose_action(self, observation, evaluate): 52 | if evaluate: 53 | with T.no_grad(): 54 | state = T.tensor([observation], 55 | dtype=T.float).to(self.actor.device) 56 | _, pi = self.target_actor.forward(state) 57 | action = pi.cpu().detach().numpy().squeeze() 58 | return action 59 | if np.random.uniform() <= self.explore: 60 | action = self.action_space.sample() 61 | else: 62 | state = T.tensor([observation], 63 | dtype=T.float).to(self.actor.device) 64 | _, pi = self.actor.forward(state) 65 | noise = self.noise.sample().to(self.actor.device) 66 | action = (pi + noise).cpu().detach().numpy().squeeze() 67 | action = np.clip(action, -1., 1.) 
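        # Training-time actions are either a uniform random sample (with probability
        # self.explore) or the policy output plus Gaussian noise, clipped to [-1, 1]
        # to stay inside the environment's normalized action bounds.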
68 | return action 69 | 70 | def save_models(self): 71 | self.actor.save_checkpoint() 72 | self.target_actor.save_checkpoint() 73 | self.critic.save_checkpoint() 74 | self.target_critic.save_checkpoint() 75 | 76 | def load_models(self): 77 | self.actor.load_checkpoint() 78 | self.target_actor.load_checkpoint() 79 | self.critic.load_checkpoint() 80 | self.target_critic.load_checkpoint() 81 | 82 | def learn(self, memories): 83 | states, actions, rewards, states_, done, goals = memories 84 | states = np.concatenate([states, goals], axis=1) 85 | states_ = np.concatenate([states_, goals], axis=1) 86 | 87 | states = T.tensor(states, dtype=T.float).to(self.actor.device) 88 | states_ = T.tensor(states_, dtype=T.float).to(self.actor.device) 89 | actions = T.tensor(actions, dtype=T.float).to(self.actor.device) 90 | rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device) 91 | done = T.tensor(done).to(self.actor.device) 92 | 93 | _, target_actions = self.target_actor.forward(states_) 94 | critic_value_ = self.target_critic.forward(states_, target_actions) 95 | critic_value = self.critic.forward(states, actions) 96 | 97 | critic_value_[done] = 0.0 98 | critic_value_ = critic_value_.view(-1) 99 | critic_value = critic_value.view(-1) 100 | 101 | target = rewards + self.gamma*critic_value_ 102 | target = target.view(critic_value_.size(), 1) 103 | target = T.clamp(target, min=self.limit, max=0) 104 | 105 | self.critic.optimizer.zero_grad() 106 | critic_loss = F.mse_loss(target, critic_value) 107 | critic_loss.backward() 108 | sync_grads(self.critic) 109 | self.critic.optimizer.step() 110 | 111 | self.actor.optimizer.zero_grad() 112 | mu, pi = self.actor.forward(states) 113 | actor_loss = self.critic.forward(states, pi) 114 | actor_loss = -T.mean(actor_loss) 115 | actor_loss += mu.pow(2).mean() 116 | actor_loss.backward() 117 | sync_grads(self.actor) 118 | self.actor.optimizer.step() 119 | 120 | def update_network_parameters(self, tau=None): 121 | if tau is None: 122 | tau = self.tau 123 | 124 | actor_params = self.actor.named_parameters() 125 | critic_params = self.critic.named_parameters() 126 | target_actor_params = self.target_actor.named_parameters() 127 | target_critic_params = self.target_critic.named_parameters() 128 | 129 | critic_state_dict = dict(critic_params) 130 | actor_state_dict = dict(actor_params) 131 | target_critic_state_dict = dict(target_critic_params) 132 | target_actor_state_dict = dict(target_actor_params) 133 | 134 | for name in critic_state_dict: 135 | critic_state_dict[name] = tau*critic_state_dict[name].clone() + \ 136 | (1-tau)*target_critic_state_dict[name].clone() 137 | 138 | for name in actor_state_dict: 139 | actor_state_dict[name] = tau*actor_state_dict[name].clone() + \ 140 | (1-tau)*target_actor_state_dict[name].clone() 141 | 142 | self.target_critic.load_state_dict(critic_state_dict) 143 | self.target_actor.load_state_dict(actor_state_dict) 144 | --------------------------------------------------------------------------------
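A minimal usage sketch of the proportional SumTree buffer, mirroring the calls made by PER/proportional/agent.py. The 4-dimensional toy observations, the loop lengths, and the random stand-in TD errors below are placeholder values for illustration, not settings from the course code.

# Minimal usage sketch for the proportional SumTree buffer (illustrative only;
# the toy observations and random TD errors are made up for the example).
import numpy as np

from memory import SumTree  # PER/proportional/memory.py

buffer = SumTree(1000, 4, alpha=0.5, beta=0.5)

# Fill the buffer with a handful of fake transitions.
for _ in range(16):
    state = np.random.randn(4)
    action = np.random.randint(2)
    reward = float(np.random.rand())
    state_ = np.random.randn(4)
    done = False
    buffer.store_transition([state, action, reward, state_, done])

if buffer.ready():
    # sample() returns the stored transitions, their tree indices, and the
    # importance-sampling weights (1/N * 1/P(i))**beta, normalized by their max.
    transitions, indices, weights = buffer.sample()

    # In the agent these would be |Q_target - Q_pred| clipped to [0, 1];
    # random values stand in for them here.
    td_errors = np.clip(np.abs(np.random.randn(len(indices))), 0., 1.)
    buffer.update_priorities(indices, td_errors)

# Anneal beta toward 1 (and alpha toward 0) as training progresses.
buffer.anneal_beta(ep=10, ep_max=100)
buffer.anneal_alpha(ep=10, ep_max=100)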