├── smooth_signal.py ├── log_utils.py ├── HER.py ├── test_trained_model_pg.py ├── test_trained_model.py ├── main.py ├── README.md ├── Models.py ├── Nav2D.py ├── dqn_HER.py └── PG_HER_KaggleVersion.py /smooth_signal.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Or 3 | """ 4 | 5 | import numpy as np 6 | 7 | 8 | def smooth(x,window_len=11,window='hanning'): 9 | if window_len<3: 10 | return x 11 | 12 | s=np.r_[x[window_len-1:0:-1],x,x[-2:-window_len-1:-1]] 13 | #print(len(s)) 14 | if window == 'flat': #moving average 15 | w=np.ones(window_len,'d') 16 | else: 17 | w=eval('np.'+window+'(window_len)') 18 | 19 | y=np.convolve(w/w.sum(),s,mode='valid') 20 | return y 21 | -------------------------------------------------------------------------------- /log_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: orrivlin 3 | """ 4 | 5 | class mean_val: 6 | def __init__(self): 7 | self.k = 0 8 | self.val = 0 9 | self.mean = 0 10 | 11 | def append(self,x): 12 | self.k += 1 13 | self.val += x 14 | self.mean = self.val/self.k 15 | 16 | def get(self): 17 | return self.mean 18 | 19 | 20 | class logger: 21 | def __init__(self): 22 | self.log = dict() 23 | 24 | def add_log(self,name): 25 | self.log[name] = [] 26 | 27 | def add_item(self,name,x): 28 | self.log[name].append(x) 29 | 30 | def get_log(self,name): 31 | return self.log[name] 32 | 33 | def get_keys(self): 34 | return self.log.keys() 35 | 36 | def get_current(self,name): 37 | return self.log[name][-1] 38 | 39 | -------------------------------------------------------------------------------- /HER.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 8 23:42:57 2019 4 | 5 | @author: Or 6 | """ 7 | from collections import deque 8 | import torch 9 | import numpy as np 10 | import copy 11 | 12 | 13 | class HER: 14 | def __init__(self): 15 | self.buffer = deque() 16 | 17 | def reset(self): 18 | self.buffer = deque() 19 | 20 | def keep(self,item): 21 | self.buffer.append(item) 22 | 23 | def backward(self): 24 | num = len(self.buffer) 25 | goal = self.buffer[-1][-2][1,:,:] 26 | for i in range(num): 27 | self.buffer[-1-i][-2][2,:,:] = goal 28 | self.buffer[-1-i][0][2,:,:] = goal 29 | self.buffer[-1-i][2] = -1.0 30 | self.buffer[-1-i][4] = False 31 | if np.sum(np.abs(self.buffer[-1-i][-2][1,:,:] - goal)) == 0: 32 | self.buffer[-1-i][2] = 0.0 33 | self.buffer[-1-i][4] = True 34 | return self.buffer 35 | 36 | -------------------------------------------------------------------------------- /test_trained_model_pg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Feb 25 07:57:53 2019 4 | 5 | @author: orrivlin 6 | """ 7 | 8 | import torch 9 | import numpy as np 10 | from Models import ConvNet2 11 | from Nav2D import Navigate2D 12 | from copy import deepcopy as dc 13 | import torch.nn.functional as F 14 | 15 | 16 | N = 20 17 | Nobs = 15 18 | Dobs = 2 19 | Rmin = 10 20 | env = Navigate2D(N,Nobs,Dobs,Rmin) 21 | [Sdim,Adim] = env.get_dims() 22 | model = ConvNet2(Sdim[0],Sdim[0],3,Adim).cuda() 23 | model.load_state_dict(torch.load('nav2d_model_PG.pt')) 24 | model.eval() 25 | start_obs, done = env.reset() 26 | cum_obs = dc(start_obs) 27 | obs = dc(start_obs) 28 | done = False 29 | state = env.get_tensor(obs) 30 | sum_r = 0 31 | epsilon = 0.0 32 | for t in range(50): 33 | [pi,val] = 
model(state.cuda()) 34 | num = np.random.rand() 35 | pi = F.softmax(pi,dim=1) 36 | dist = torch.distributions.categorical.Categorical(pi.squeeze()) 37 | action = dist.sample().item() 38 | new_obs, reward, done, dist = env.step(obs,action) 39 | new_state = env.get_tensor(new_obs) 40 | sum_r = sum_r + reward 41 | state = dc(new_state) 42 | obs = dc(new_obs) 43 | cum_obs[:,:,1] += obs[:,:,1] 44 | if done: 45 | break 46 | env.render(cum_obs) 47 | print('time: {}'.format(t)) 48 | print('return: {}'.format(sum_r)) 49 | -------------------------------------------------------------------------------- /test_trained_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Feb 25 07:57:53 2019 4 | 5 | @author: orrivlin 6 | """ 7 | 8 | import torch 9 | import numpy as np 10 | from Models import ConvNet 11 | from Nav2D import Navigate2D 12 | from copy import deepcopy as dc 13 | 14 | 15 | N = 20 16 | Nobs = 15 17 | Dobs = 2 18 | Rmin = 10 19 | env = Navigate2D(N,Nobs,Dobs,Rmin) 20 | [Sdim,Adim] = env.get_dims() 21 | model = ConvNet(Sdim[0],Sdim[0],3,Adim).cuda() 22 | model.load_state_dict(torch.load('model.pt')) 23 | image_mean = torch.load('norm.pt').cuda() 24 | 25 | start_obs, done = env.reset() 26 | cum_obs = dc(start_obs) 27 | obs = dc(start_obs) 28 | done = False 29 | state = env.get_tensor(obs) 30 | sum_r = 0 31 | epsilon = 0.0 32 | for t in range(50): 33 | Q = model(state.cuda() - image_mean) 34 | num = np.random.rand() 35 | if (num < epsilon): 36 | action = torch.randint(0,Q.shape[1],(1,)).type(torch.LongTensor) 37 | else: 38 | action = torch.argmax(Q,dim=1) 39 | new_obs, reward, done, dist = env.step(obs,action.item()) 40 | new_state = env.get_tensor(new_obs) 41 | sum_r = sum_r + reward 42 | state = dc(new_state) 43 | obs = dc(new_obs) 44 | cum_obs[:,:,1] += obs[:,:,1] 45 | if done: 46 | break 47 | env.render(cum_obs) 48 | print('time: {}'.format(t)) 49 | print('return: {}'.format(sum_r)) -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 8 23:23:01 2019 4 | 5 | @author: Or 6 | """ 7 | 8 | import torch 9 | import numpy as np 10 | from matplotlib.pyplot import imshow 11 | import matplotlib.pyplot as plt 12 | from smooth_signal import smooth 13 | from Nav2D import Navigate2D 14 | from Models import ConvNet 15 | import time 16 | from dqn_HER import DQN_HER 17 | from copy import deepcopy as dc 18 | import time 19 | from collections import deque 20 | 21 | 22 | N = 20 23 | Nobs = 15 24 | Dobs = 2 25 | Rmin = 10 26 | env = Navigate2D(N,Nobs,Dobs,Rmin) 27 | gamma = 0.99 28 | buffer_size = 1000000 29 | ddqn = True 30 | alg = DQN_HER(env, gamma, buffer_size, ddqn) 31 | epochs = 15000 32 | 33 | distances = deque(maxlen=100) 34 | 35 | n_data = 400 36 | for i in range(n_data): 37 | min_dist = alg.gather_data() 38 | distances.append(min_dist) 39 | alg.calc_norm() 40 | for i in range(epochs): 41 | T1 = time.time() 42 | log = alg.run_epoch() 43 | T2 = time.time() 44 | distances.append(log.get_current('final_dist')) 45 | Y = np.asarray(distances) 46 | Y[Y > 1] = 1.0 47 | Y = 1 - Y 48 | print('done: {} of {}. loss: {}. success rate: {}. 
time: {}'.format(i,epochs,np.round(log.get_current('avg_loss'),2),np.round(np.mean(Y),2),np.round(T2-T1,3))) 49 | if (i % 100) == 0: 50 | torch.save(alg.model.state_dict(),'model.pt') 51 | torch.save(alg.image_mean, 'norm.pt') 52 | 53 | 54 | Y = np.asarray(log.get_log('final_dist')) 55 | Y2 = smooth(Y) 56 | x = np.linspace(0, len(Y), len(Y)) 57 | fig1 = plt.figure() 58 | ax1 = plt.axes() 59 | ax1.plot(x, Y, Y2) 60 | plt.xlabel('episodes') 61 | plt.ylabel('minimum episode distance') 62 | 63 | Y = np.asarray(log.get_log('avg_loss')) 64 | Y2 = smooth(Y) 65 | x = np.linspace(0, len(Y), len(Y)) 66 | fig2 = plt.figure() 67 | ax2 = plt.axes() 68 | ax2.plot(x, Y, Y2) 69 | plt.xlabel('episodes') 70 | plt.ylabel('average loss') 71 | 72 | Y = np.asarray(log.get_log('tot_return')) 73 | Y2 = smooth(Y) 74 | x = np.linspace(0, len(Y), len(Y)) 75 | fig3 = plt.figure() 76 | ax3 = plt.axes() 77 | ax3.plot(x, Y, Y2) 78 | plt.xlabel('episodes') 79 | plt.ylabel('episode return') 80 | 81 | 82 | Y = np.asarray(log.get_log('final_dist')) 83 | Y[Y > 1] = 1.0 84 | Y = 1 - Y 85 | K = 100 86 | Y2 = smooth(Y,window_len=K) 87 | x = np.linspace(0, len(Y2), len(Y2)) 88 | fig3 = plt.figure() 89 | ax3 = plt.axes() 90 | ax3.plot(x,Y2) 91 | plt.xlabel('episodes') 92 | plt.ylabel('success rate') 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gridworld Navigation - Hindsight Experience Replay 2 | ### 2D navigation using DQN/Actor-Critic and Hindsight Experience Replay 3 | 4 | This repository contains a PyTorch implementation of a simple 2D navigation environment, in which an agent needs to traverse a map and arrive at a destination pixel while circumventing obstacles. Both the agent position and the goal are given implicitly in the input image. 5 | For every step in which the agent has not yet arrived at the goal, it receives a reward of -1, which makes the problem difficult. To train the agent, I started with a standard DQN algorithm coupled with HER (Hindsight Experience Replay), which helps overcome the sparse rewards. This only reached around an 80% success rate in arriving at the goal, and training takes quite a few hours. Next, I implemented an actor-critic version of HER, and recently achieved a ~90% success rate in getting to the goal pixel. I think a more sophisticated learning algorithm such as Proximal Policy Optimization or Soft Actor-Critic could probably get better results. This was great fun to work on. The core hindsight relabeling idea is sketched below. 
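The relabeling trick itself is small: once an episode ends, the goal channel of every stored transition is overwritten with the position the agent actually reached, and the rewards and done flags are recomputed against that substitute goal (this is what `HER.backward()` does on the image tensors). Below is a minimal, stand-alone sketch of the same "final goal" relabeling on plain transition dictionaries; the field names and the helper `relabel_with_final_goal` are illustrative only and do not match the exact tensor layout used in `HER.py`.

```python
from copy import deepcopy


def relabel_with_final_goal(episode):
    """Hindsight relabeling with the 'final' strategy.

    `episode` is a list of dicts with keys: state, action, achieved, goal,
    reward, done. This layout is illustrative; HER.py works on image tensors.
    """
    new_goal = episode[-1]["achieved"]  # pretend the position we ended up in was the goal
    relabeled = []
    for tr in episode:
        tr = deepcopy(tr)
        tr["goal"] = new_goal                    # overwrite the desired goal
        reached = tr["achieved"] == new_goal
        tr["reward"] = 0.0 if reached else -1.0  # same sparse reward scheme as the environment
        tr["done"] = reached
        relabeled.append(tr)
    return relabeled


# Usage: store both the original and the relabeled transitions in the replay buffer.
episode = [
    {"state": (0, 0), "action": 1, "achieved": (0, 1), "goal": (5, 5), "reward": -1.0, "done": False},
    {"state": (0, 1), "action": 1, "achieved": (0, 2), "goal": (5, 5), "reward": -1.0, "done": False},
]
for tr in relabel_with_final_goal(episode):
    print(tr["goal"], tr["reward"], tr["done"])
```

Every failed trajectory is thereby turned into a successful one for some goal, so the otherwise uninformative -1-per-step reward starts producing a useful learning signal early in training.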
I also wrote a Medium article on Hindsight-Experience-Replay, feel free to [check it out](https://towardsdatascience.com/reinforcement-learning-with-hindsight-experience-replay-1fee5704f2f8) 6 | 7 | Learning curve for DQN-HER: 8 | 9 | ![alt text](https://user-images.githubusercontent.com/46422351/55673889-a6f6e000-58b6-11e9-980f-b07cac8b8b13.png) 10 | 11 | 12 | 13 | Learning curve for PG-HER: 14 | 15 | ![alt text](https://user-images.githubusercontent.com/46422351/58496974-9f9bd680-8183-11e9-929e-679b2fb3ef6b.png) 16 | 17 | 18 | 19 | And some examples of trajectories using a trained agent: 20 | 21 | ![alt text](https://user-images.githubusercontent.com/46422351/55673893-b1b17500-58b6-11e9-9293-364000ef4751.png) 22 | ![alt text](https://user-images.githubusercontent.com/46422351/55673896-b8d88300-58b6-11e9-8ced-4fe95394bd9b.png) 23 | ![alt text](https://user-images.githubusercontent.com/46422351/55673898-c4c44500-58b6-11e9-8a27-ffadcc98eb73.png) 24 | ![alt text](https://user-images.githubusercontent.com/46422351/55673901-d0177080-58b6-11e9-94a6-744ca3c52a85.png) 25 | ![alt text](https://user-images.githubusercontent.com/46422351/55673904-d6a5e800-58b6-11e9-8c76-8573d8781633.png) 26 | ![alt text](https://user-images.githubusercontent.com/46422351/55673910-e0c7e680-58b6-11e9-9f5d-3cf488c36318.png) 27 | 28 | 29 | Not so evident in the trajectories shown here, but I noticed the agent tends to exploit the fact that the edges of the map are free by construction, and often maneuvers along the edges even if it's not mandatory. 30 | -------------------------------------------------------------------------------- /Models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 8 20:28:07 2019 4 | 5 | @author: Or 6 | """ 7 | import torch 8 | import torch.nn.functional as F 9 | 10 | 11 | class ConvNet(torch.nn.Module): 12 | def __init__(self,H,W,C,Dout): 13 | super(ConvNet, self).__init__() 14 | self.H = H 15 | self.W = W 16 | self.C = C 17 | self.Dout = Dout 18 | self.Chid = 32 19 | self.Chid2 = 64 20 | self.Chid3 = 64 21 | 22 | self.conv1 = torch.nn.Conv2d(in_channels=self.C,out_channels=self.Chid,kernel_size=3,stride=1,padding=1) 23 | self.conv2 = torch.nn.Conv2d(in_channels=self.Chid,out_channels=self.Chid2,kernel_size=3,stride=1,padding=1) 24 | self.conv3 = torch.nn.Conv2d(in_channels=self.Chid2,out_channels=self.Chid3,kernel_size=3,stride=1,padding=1) 25 | self.fc1 = torch.nn.Linear(int(self.Chid3*H*W/16),564) 26 | self.fc2 = torch.nn.Linear(564,Dout) 27 | 28 | def forward(self,x): 29 | batch_size = x.shape[0] 30 | x = F.max_pool2d(F.relu(self.conv1(x)),2) 31 | x = F.relu(self.conv2(x)) 32 | x = F.max_pool2d(F.relu(self.conv3(x)),2) 33 | x = x.view(batch_size,int(self.Chid3*self.H*self.W/16)) 34 | x = F.relu(self.fc1(x)) 35 | x = self.fc2(x) 36 | return x 37 | 38 | class ConvNet2(torch.nn.Module): 39 | def __init__(self,H,W,C,Dout): 40 | super(ConvNet2, self).__init__() 41 | self.H = H 42 | self.W = W 43 | self.C = C 44 | self.Dout = Dout 45 | self.Chid = 32 46 | self.Chid2 = 64 47 | self.Chid3 = 64 48 | 49 | self.conv1 = torch.nn.Conv2d(in_channels=self.C,out_channels=self.Chid,kernel_size=3,stride=1,padding=1) 50 | self.conv2 = torch.nn.Conv2d(in_channels=self.Chid,out_channels=self.Chid2,kernel_size=3,stride=1,padding=1) 51 | self.conv3 = torch.nn.Conv2d(in_channels=self.Chid2,out_channels=self.Chid3,kernel_size=3,stride=1,padding=1) 52 | self.fc1 = torch.nn.Linear(int(self.Chid3*H*W/16),564) 53 | 
self.policy = torch.nn.Linear(564,Dout) 54 | self.value = torch.nn.Linear(564,1) 55 | 56 | def forward(self,x): 57 | batch_size = x.shape[0] 58 | x = F.max_pool2d(F.relu(self.conv1(x)),2) 59 | x = F.relu(self.conv2(x)) 60 | x = F.max_pool2d(F.relu(self.conv3(x)),2) 61 | x = x.view(batch_size,int(self.Chid3*self.H*self.W/16)) 62 | x = F.relu(self.fc1(x)) 63 | pi = self.policy(x) 64 | val = self.value(x) 65 | return pi,val 66 | -------------------------------------------------------------------------------- /Nav2D.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 8 19:33:29 2019 4 | 5 | @author: Or 6 | """ 7 | 8 | import torch 9 | import numpy as np 10 | from matplotlib.pyplot import imshow 11 | import matplotlib as plt 12 | from copy import deepcopy as dc 13 | 14 | class Navigate2D: 15 | def __init__(self,N,Nobs,Dobs,Rmin): 16 | self.N = N 17 | self.Nobs = Nobs 18 | self.Dobs = Dobs 19 | self.Rmin = Rmin 20 | self.state_dim = [N,N,3] 21 | self.action_dim = 4 22 | self.scale = 10.0 23 | 24 | def get_dims(self): 25 | return self.state_dim, self.action_dim 26 | 27 | def reset(self): 28 | grid = np.zeros((self.N,self.N,3)) 29 | for i in range(self.Nobs): 30 | center = np.random.randint(0,self.N,(1,2)) 31 | minX = np.maximum(center[0,0] - self.Dobs,1) 32 | minY = np.maximum(center[0,1] - self.Dobs,1) 33 | maxX = np.minimum(center[0,0] + self.Dobs,self.N-1) 34 | maxY = np.minimum(center[0,1] + self.Dobs,self.N-1) 35 | grid[minX:maxX,minY:maxY,0] = 1.0 36 | 37 | free_idx = np.argwhere(grid[:,:,0] == 0.0) 38 | start = free_idx[np.random.randint(0,free_idx.shape[0],1),:].squeeze() 39 | while (True): 40 | finish = free_idx[np.random.randint(0,free_idx.shape[0],1),:].squeeze() 41 | if ((start[0] != finish[0]) and (start[1] != finish[1]) and (np.linalg.norm(start - finish) >= self.Rmin)): 42 | break 43 | grid[start[0],start[1],1] = self.scale*1.0 44 | grid[finish[0],finish[1],2] = self.scale*1.0 45 | done = False 46 | return grid, done 47 | 48 | def step(self,grid,action): 49 | max_norm = self.N 50 | 51 | new_grid = dc(grid) 52 | done = False 53 | reward = -1.0 54 | act = np.array([[1,0],[0,1],[-1,0],[0,-1]]) 55 | pos = np.argwhere(grid[:,:,1] == self.scale**1.0)[0] 56 | target = np.argwhere(grid[:,:,2] == self.scale*1.0)[0] 57 | new_pos = pos + act[action] 58 | 59 | dist1 = np.linalg.norm(pos - target) 60 | dist2 = np.linalg.norm(new_pos - target) 61 | #reward = (dist1 - dist2)*(max_norm - dist2) 62 | #reward = -dist2 63 | reward = -1 64 | if (np.any(new_pos < 0.0) or np.any(new_pos > (self.N - 1)) or (grid[new_pos[0],new_pos[1],0] == 1.0)): 65 | #dist = np.linalg.norm(pos - target) 66 | #reward = (dist1 - dist2) 67 | return grid, reward, done, dist2 68 | new_grid[pos[0],pos[1],1] = 0.0 69 | new_grid[new_pos[0],new_pos[1],1] = self.scale*1.0 70 | if ((new_pos[0] == target[0]) and (new_pos[1] == target[1])): 71 | reward = 0.0 72 | done = True 73 | #dist = np.linalg.norm(new_pos - target) 74 | #reward = (dist1 - dist2) 75 | return new_grid, reward, done, dist2 76 | 77 | def get_tensor(self,grid): 78 | S = torch.Tensor(grid).transpose(2,1).transpose(1,0).unsqueeze(0) 79 | return S 80 | 81 | def render(self,grid): 82 | #imshow(grid) 83 | plot = imshow(grid) 84 | return plot -------------------------------------------------------------------------------- /dqn_HER.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | @author: orrivlin 4 | """ 5 | 6 | import torch 7 | 
import numpy as np 8 | import copy 9 | import torch.nn.functional as F 10 | from collections import deque 11 | from Models import ConvNet 12 | import random 13 | from log_utils import logger, mean_val 14 | from HER import HER 15 | from copy import deepcopy as dc 16 | 17 | 18 | 19 | class DQN_HER: 20 | def __init__(self, env, gamma, buffer_size, ddqn): 21 | self.env = env 22 | [Sdim,Adim] = env.get_dims() 23 | self.model = ConvNet(Sdim[0],Sdim[0],3,Adim).cuda() 24 | self.target_model = copy.deepcopy(self.model).cuda() 25 | self.her = HER() 26 | self.gamma = gamma 27 | self.optimizer = torch.optim.Adam(self.model.parameters(),lr=0.0001) 28 | self.batch_size = 16 29 | self.epsilon = 0.1 30 | self.buffer_size = buffer_size 31 | self.step_counter = 0 32 | self.epsi_high = 0.9 33 | self.epsi_low = 0.1 34 | self.steps = 0 35 | self.count = 0 36 | self.decay = 2000 37 | self.eps = self.epsi_high 38 | self.update_target_step = 3000 39 | self.log = logger() 40 | self.log.add_log('tot_return') 41 | self.log.add_log('avg_loss') 42 | self.log.add_log('final_dist') 43 | self.log.add_log('buffer') 44 | self.image_mean = 0 45 | self.image_std = 0 46 | self.ddqn = ddqn 47 | 48 | self.replay_buffer = deque(maxlen=buffer_size) 49 | 50 | def run_episode(self): 51 | self.her.reset() 52 | obs, done = self.env.reset() 53 | done = False 54 | state = self.env.get_tensor(obs) 55 | sum_r = 0 56 | mean_loss = mean_val() 57 | min_dist = 100000 58 | max_t = 50 59 | 60 | for t in range(max_t): 61 | self.steps += 1 62 | self.eps = self.epsi_low + (self.epsi_high-self.epsi_low) * (np.exp(-1.0 * self.steps/self.decay)) 63 | Q = self.model(self.norm(state.cuda())) 64 | num = np.random.rand() 65 | if (num < self.eps): 66 | action = torch.randint(0,Q.shape[1],(1,)).type(torch.LongTensor) 67 | else: 68 | action = torch.argmax(Q,dim=1) 69 | new_obs, reward, done, dist = self.env.step(obs,action.item()) 70 | new_state = self.env.get_tensor(new_obs) 71 | sum_r = sum_r + reward 72 | if dist < min_dist: 73 | min_dist = dist 74 | if (t+1) == max_t: 75 | done = True 76 | 77 | self.replay_buffer.append([dc(state.squeeze(0).numpy()),dc(action),dc(reward),dc(new_state.squeeze(0).numpy()),dc(done)]) 78 | self.her.keep([state.squeeze(0).numpy(),action,reward,new_state.squeeze(0).numpy(),done]) 79 | loss = self.update_model() 80 | mean_loss.append(loss) 81 | state = dc(new_state) 82 | obs = dc(new_obs) 83 | 84 | self.step_counter = self.step_counter + 1 85 | if (self.step_counter > self.update_target_step): 86 | self.target_model.load_state_dict(self.model.state_dict()) 87 | self.step_counter = 0 88 | print('updated target model') 89 | her_list = self.her.backward() 90 | for item in her_list: 91 | self.replay_buffer.append(item) 92 | self.log.add_item('tot_return',sum_r) 93 | self.log.add_item('avg_loss',mean_loss.get()) 94 | self.log.add_item('final_dist',min_dist) 95 | 96 | def gather_data(self): 97 | self.her.reset() 98 | obs, done = self.env.reset() 99 | done = False 100 | state = self.env.get_tensor(obs) 101 | sum_r = 0 102 | min_dist = 100000 103 | max_t = 50 104 | 105 | for t in range(max_t): 106 | self.eps = 1.0 107 | Q = self.model(state.cuda()) 108 | num = np.random.rand() 109 | if (num < self.eps): 110 | action = torch.randint(0,Q.shape[1],(1,)).type(torch.LongTensor) 111 | else: 112 | action = torch.argmax(Q,dim=1) 113 | new_obs, reward, done, dist = self.env.step(obs,action.item()) 114 | new_state = self.env.get_tensor(new_obs) 115 | sum_r = sum_r + reward 116 | if dist < min_dist: 117 | min_dist = dist 118 | if (t+1) == 
max_t: 119 | done = True 120 | 121 | self.replay_buffer.append([dc(state.squeeze(0).numpy()),dc(action),dc(reward),dc(new_state.squeeze(0).numpy()),dc(done)]) 122 | state = dc(new_state) 123 | obs = dc(new_obs) 124 | return min_dist 125 | 126 | def calc_norm(self): 127 | S0, A0, R1, S1, D1 = zip(*self.replay_buffer) 128 | S0 = torch.tensor( S0, dtype=torch.float) 129 | self.image_mean = S0.mean(dim=0).cuda() 130 | self.image_std = S0.std(dim=0).cuda() 131 | 132 | def norm(self,state): 133 | return state - self.image_mean 134 | 135 | def update_model(self): 136 | self.optimizer.zero_grad() 137 | num = len(self.replay_buffer) 138 | K = np.min([num,self.batch_size]) 139 | samples = random.sample(self.replay_buffer, K) 140 | 141 | S0, A0, R1, S1, D1 = zip(*samples) 142 | S0 = torch.tensor( S0, dtype=torch.float) 143 | A0 = torch.tensor( A0, dtype=torch.long).view(K, -1) 144 | R1 = torch.tensor( R1, dtype=torch.float).view(K, -1) 145 | S1 = torch.tensor( S1, dtype=torch.float) 146 | D1 = torch.tensor( D1, dtype=torch.float) 147 | 148 | S0 = self.norm(S0.cuda()) 149 | S1 = self.norm(S1.cuda()) 150 | if self.ddqn == True: 151 | model_next_acts = self.model(S1).detach().max(dim=1)[1] 152 | target_q = R1.squeeze().cuda() + self.gamma*self.target_model(S1).gather(1,model_next_acts.unsqueeze(1)).squeeze()*(1 - D1.cuda()) 153 | else: 154 | target_q = R1.squeeze().cuda() + self.gamma*self.target_model(S1).max(dim=1)[0].detach()*(1 - D1.cuda()) 155 | policy_q = self.model(S0).gather(1,A0.cuda()) 156 | L = F.smooth_l1_loss(policy_q.squeeze(),target_q.squeeze()) 157 | L.backward() 158 | self.optimizer.step() 159 | return L.detach().item() 160 | 161 | def run_epoch(self): 162 | self.run_episode() 163 | self.log.add_item('buffer',len(self.replay_buffer)) 164 | return self.log 165 | 166 | -------------------------------------------------------------------------------- /PG_HER_KaggleVersion.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import torch 5 | import matplotlib.pyplot as plt 6 | from matplotlib.pyplot import imshow 7 | from copy import deepcopy as dc 8 | import torch.nn.functional as F 9 | from collections import deque 10 | import time 11 | 12 | 13 | def smooth(x,window_len=11,window='hanning'): 14 | if window_len<3: 15 | return x 16 | 17 | s=np.r_[x[window_len-1:0:-1],x,x[-2:-window_len-1:-1]] 18 | if window == 'flat': #moving average 19 | w=np.ones(window_len,'d') 20 | else: 21 | w=eval('np.'+window+'(window_len)') 22 | 23 | y=np.convolve(w/w.sum(),s,mode='valid') 24 | return y 25 | 26 | class mean_val: 27 | def __init__(self): 28 | self.k = 0 29 | self.val = 0 30 | self.mean = 0 31 | 32 | def append(self,x): 33 | self.k += 1 34 | self.val += x 35 | self.mean = self.val/self.k 36 | 37 | def get(self): 38 | return self.mean 39 | 40 | 41 | class logger: 42 | def __init__(self): 43 | self.log = dict() 44 | 45 | def add_log(self,name): 46 | self.log[name] = [] 47 | 48 | def add_item(self,name,x): 49 | self.log[name].append(x) 50 | 51 | def get_log(self,name): 52 | return self.log[name] 53 | 54 | def get_keys(self): 55 | return self.log.keys() 56 | 57 | def get_current(self,name): 58 | return self.log[name][-1] 59 | 60 | class Navigate2D: 61 | def __init__(self,N,Nobs,Dobs,Rmin): 62 | self.N = N 63 | self.Nobs = Nobs 64 | self.Dobs = Dobs 65 | self.Rmin = Rmin 66 | self.state_dim = [N,N,3] 67 | self.action_dim = 4 68 | self.scale = 10.0 69 | 70 | def get_dims(self): 71 | return self.state_dim, self.action_dim 72 | 73 | def 
reset(self): 74 | grid = np.zeros((self.N,self.N,3)) 75 | for i in range(self.Nobs): 76 | center = np.random.randint(0,self.N,(1,2)) 77 | minX = np.maximum(center[0,0] - self.Dobs,1) 78 | minY = np.maximum(center[0,1] - self.Dobs,1) 79 | maxX = np.minimum(center[0,0] + self.Dobs,self.N-1) 80 | maxY = np.minimum(center[0,1] + self.Dobs,self.N-1) 81 | grid[minX:maxX,minY:maxY,0] = 1.0 82 | 83 | free_idx = np.argwhere(grid[:,:,0] == 0.0) 84 | start = free_idx[np.random.randint(0,free_idx.shape[0],1),:].squeeze() 85 | while (True): 86 | finish = free_idx[np.random.randint(0,free_idx.shape[0],1),:].squeeze() 87 | if ((start[0] != finish[0]) and (start[1] != finish[1]) and (np.linalg.norm(start - finish) >= self.Rmin)): 88 | break 89 | grid[start[0],start[1],1] = self.scale*1.0 90 | grid[finish[0],finish[1],2] = self.scale*1.0 91 | done = False 92 | return grid, done 93 | 94 | def step(self,grid,action): 95 | new_grid = dc(grid) 96 | done = False 97 | reward = 0.0 98 | act = np.array([[1,0],[0,1],[-1,0],[0,-1]]) 99 | pos = np.argwhere(grid[:,:,1] == self.scale**1.0)[0] 100 | target = np.argwhere(grid[:,:,2] == self.scale*1.0)[0] 101 | new_pos = pos + act[action] 102 | 103 | dist2 = np.linalg.norm(new_pos - target) 104 | if (np.any(new_pos < 0.0) or np.any(new_pos > (self.N - 1)) or (grid[new_pos[0],new_pos[1],0] == 1.0)): 105 | return grid, reward, done, dist2 106 | new_grid[pos[0],pos[1],1] = 0.0 107 | new_grid[new_pos[0],new_pos[1],1] = self.scale*1.0 108 | if ((new_pos[0] == target[0]) and (new_pos[1] == target[1])): 109 | reward = 10.0 110 | done = True 111 | return new_grid, reward, done, dist2 112 | 113 | def get_tensor(self,grid): 114 | S = torch.Tensor(grid).transpose(2,1).transpose(1,0).unsqueeze(0) 115 | return S 116 | 117 | def render(self,grid): 118 | plot = imshow(grid) 119 | return plot 120 | 121 | 122 | 123 | class ConvNet(torch.nn.Module): 124 | def __init__(self,H,W,C,Dout): 125 | super(ConvNet, self).__init__() 126 | self.H = H 127 | self.W = W 128 | self.C = C 129 | self.Dout = Dout 130 | self.Chid = 32 131 | self.Chid2 = 64 132 | self.Chid3 = 64 133 | 134 | self.conv1 = torch.nn.Conv2d(in_channels=self.C,out_channels=self.Chid,kernel_size=3,stride=1,padding=1) 135 | self.conv2 = torch.nn.Conv2d(in_channels=self.Chid,out_channels=self.Chid2,kernel_size=3,stride=1,padding=1) 136 | self.conv3 = torch.nn.Conv2d(in_channels=self.Chid2,out_channels=self.Chid3,kernel_size=3,stride=1,padding=1) 137 | self.fc1 = torch.nn.Linear(int(self.Chid3*H*W/16),564) 138 | self.policy = torch.nn.Linear(564,Dout) 139 | self.value = torch.nn.Linear(564,1) 140 | 141 | def forward(self,x): 142 | batch_size = x.shape[0] 143 | x = F.max_pool2d(F.relu(self.conv1(x)),2) 144 | x = F.relu(self.conv2(x)) 145 | x = F.max_pool2d(F.relu(self.conv3(x)),2) 146 | x = x.view(batch_size,int(self.Chid3*self.H*self.W/16)) 147 | x = F.relu(self.fc1(x)) 148 | pi = self.policy(x) 149 | val = self.value(x) 150 | return pi,val 151 | 152 | 153 | class HER: 154 | def __init__(self,N,cuda_flag): 155 | self.buffer = deque() 156 | self.N = N 157 | self.cuda = cuda_flag 158 | 159 | def reset(self): 160 | self.buffer = deque() 161 | 162 | def keep(self,item): 163 | self.buffer.append(item) 164 | 165 | def backward(self,model,gamma): 166 | K = len(self.buffer) 167 | new_buffer = deque() 168 | for i in range(K): 169 | new_buffer.append(self.buffer[i]) 170 | num = len(new_buffer) 171 | goal = new_buffer[-1][2][0,1,:,:] 172 | for i in range(num): 173 | new_buffer[-1-i][3] = 10.0 174 | new_buffer[-1-i][0][0,2,:,:] = goal 175 | if 
self.cuda: 176 | [pi,v] = model(new_buffer[-1-i][0].cuda()) 177 | new_buffer[-1-i][1] = F.softmax(pi,dim=1).squeeze()[new_buffer[-1-i][1]] 178 | else: 179 | [pi,v] = model(new_buffer[-1-i][0]) 180 | new_buffer[-1-i][1] = F.softmax(pi,dim=1).squeeze()[new_buffer[-1-i][1]] 181 | if ((new_buffer[-1-i][2][0,1,:,:] - goal).abs().sum() == 0): 182 | new_buffer[-1-i][3] = 10.0 183 | for t in range(len(new_buffer)): 184 | if (t==0): 185 | X = new_buffer[t][0] 186 | ratio = new_buffer[t][1].detach().item()/new_buffer[t][4] 187 | PI = ratio*new_buffer[t][1].unsqueeze(0) 188 | R = torch.Tensor(np.array(new_buffer[t][3])).unsqueeze(0) 189 | V = v.unsqueeze(0) 190 | else: 191 | X = torch.cat([X,new_buffer[t][0]],dim=0) 192 | ratio = new_buffer[t][1].detach().item()/new_buffer[t][4] 193 | PI = torch.cat([PI,ratio*new_buffer[t][1].unsqueeze(0)],dim=0) 194 | R = torch.cat([R,torch.Tensor(np.array(new_buffer[t][3])).unsqueeze(0)],dim=0) 195 | V = torch.cat([V,v.unsqueeze(0)]) 196 | return X,PI,R,V 197 | 198 | 199 | class DiscretePolicyGradient: 200 | def __init__(self, env, gamma, num_episodes,cuda_flag): 201 | self.env = env 202 | [Sdim,Adim] = env.get_dims() 203 | if cuda_flag: 204 | self.model = ConvNet(Sdim[0],Sdim[0],3,Adim).cuda() 205 | else: 206 | self.model = ConvNet(Sdim[0],Sdim[0],3,Adim) 207 | self.gamma = gamma 208 | self.optimizer = torch.optim.Adam(self.model.parameters(),lr=0.0003) 209 | self.batch_size = 32 210 | self.epsilon = 1e-8 211 | self.num_episodes = num_episodes 212 | self.her = HER(self.env.N,cuda_flag) 213 | self.cuda = cuda_flag 214 | self.log = logger() 215 | self.log.add_log('reward') 216 | self.log.add_log('final_dist') 217 | self.log.add_log('TD_error') 218 | self.log.add_log('entropy') 219 | 220 | def run_episode(self): 221 | obs, done = self.env.reset() 222 | state = self.env.get_tensor(obs) 223 | sum_r = 0 224 | min_dist = self.env.N 225 | self.her.reset() 226 | max_time = 50 227 | for t in range(max_time): 228 | if self.cuda: 229 | [pi,val] = self.model(state.cuda()) 230 | else: 231 | [pi,val] = self.model(state) 232 | pi = F.softmax(pi,dim=1) 233 | dist = torch.distributions.categorical.Categorical(pi.squeeze()) 234 | action = dist.sample().item() 235 | new_obs, reward, done, dist = self.env.step(obs,action) 236 | obs = dc(new_obs) 237 | new_state = self.env.get_tensor(obs) 238 | sum_r = sum_r + reward 239 | if dist < min_dist: 240 | min_dist = dist 241 | 242 | self.her.keep([dc(state),dc(action),dc(new_state),dc(reward),dc(pi[0,action].detach().item())]) 243 | 244 | if (t==0): 245 | X = state 246 | PI = pi[0,action].unsqueeze(0) 247 | R = torch.Tensor(np.array(reward)).unsqueeze(0) 248 | V = val.unsqueeze(0) 249 | else: 250 | X = torch.cat([X,state],dim=0) 251 | PI = torch.cat([PI,pi[0,action].unsqueeze(0)],dim=0) 252 | R = torch.cat([R,torch.Tensor(np.array(reward)).unsqueeze(0)],dim=0) 253 | V = torch.cat([V,val.unsqueeze(0)],dim=0) 254 | state = new_state 255 | if done: 256 | break 257 | self.log.add_item('reward',sum_r) 258 | self.log.add_item('final_dist',min_dist) 259 | tot_return = R.sum().item() 260 | for i in range(R.shape[0] - 1): 261 | R[-2-i] = R[-1] 262 | [XX,PIPI,RR,VV] = self.her.backward(self.model,self.gamma) 263 | X = torch.cat((X,XX),dim=0) 264 | PI = torch.cat((PI,PIPI),dim=0) 265 | R = torch.cat((R,RR),dim=0) 266 | V = torch.cat((V,VV),dim=0) 267 | 268 | 269 | return X, PI, R, V, tot_return 270 | 271 | 272 | def update_model(self,PI,R,V): 273 | self.optimizer.zero_grad() 274 | if self.cuda: 275 | R = R.cuda() 276 | A = R.squeeze() - 
V.squeeze().detach() 277 | #A = R 278 | L_policy = -(torch.log(PI)*A).mean() 279 | L_value = F.smooth_l1_loss(V.squeeze(), R.squeeze()) 280 | L_entropy = -(PI*PI.log()).mean() 281 | L = L_policy + L_value - 0.01*L_entropy 282 | L.backward() 283 | self.optimizer.step() 284 | self.log.add_item('TD_error',L_value.detach().item()) 285 | self.log.add_item('entropy',L_entropy.detach().item()) 286 | 287 | 288 | def run_epoch(self): 289 | mean_return = 0 290 | for i in range(self.num_episodes): 291 | [x,pi,r,val,tot_return] = self.run_episode() 292 | mean_return = mean_return + tot_return 293 | if (i == 0): 294 | PI = pi 295 | R = r 296 | V = val 297 | else: 298 | PI = torch.cat([PI,pi],dim=0) 299 | R = torch.cat([R,r],dim=0) 300 | V = torch.cat([V,val],dim=0) 301 | 302 | mean_return = mean_return/self.num_episodes 303 | self.update_model(PI,R,V) 304 | return self.log 305 | 306 | 307 | N = 20 308 | Nobs = 15 309 | Dobs = 2 310 | Rmin = 10 311 | env = Navigate2D(N,Nobs,Dobs,Rmin) 312 | 313 | gamma = 0.99 314 | num_episodes = 10 315 | alg = DiscretePolicyGradient(env,gamma,num_episodes,True) 316 | num_epochs = 400 317 | 318 | for i in range(num_epochs): 319 | mean_time = mean_val() 320 | success_rate = mean_val() 321 | mean_loss = mean_val() 322 | mean_h = mean_val() 323 | for j in range(100): 324 | T1 = time.time() 325 | log = alg.run_epoch() 326 | T2 = time.time() 327 | mean_time.append(T2-T1) 328 | mean_loss.append(log.get_current('TD_error')) 329 | mean_h.append(log.get_current('entropy')) 330 | if log.get_current('final_dist') > 0.0: 331 | success_rate.append(0.0) 332 | else: 333 | success_rate.append(1.0) 334 | if (j % 10) == 0: 335 | torch.save(alg.model.state_dict(),'nav2d_model_PG.pt') 336 | 337 | print('Done: {} of {}. TD error: {}. success rate: {}. entropy: {}. mean iteration time: {}'.format(i*100,num_epochs*100,np.round(mean_loss.get(),2),np.round(success_rate.get(),2),np.round(mean_h.get(),2),np.round(mean_time.get(),3))) 338 | 339 | 340 | 341 | tot_ret_i = log.get_log('reward') 342 | Y = np.asarray(tot_ret_i) 343 | Y2 = smooth(Y) 344 | x = np.linspace(0, len(Y), len(Y)) 345 | fig1 = plt.figure() 346 | ax1 = plt.axes() 347 | ax1.plot(x, Y, Y2) 348 | plt.xlabel('episodes') 349 | plt.ylabel('episode return') 350 | 351 | tot_ret_i = log.get_log('final_dist') 352 | Y = np.asarray(tot_ret_i) 353 | Y2 = smooth(Y) 354 | x = np.linspace(0, len(Y), len(Y)) 355 | fig2 = plt.figure() 356 | ax2 = plt.axes() 357 | ax2.plot(x, Y, Y2) 358 | plt.xlabel('episodes') 359 | plt.ylabel('episode final distance') 360 | 361 | Y = np.asarray(log.get_log('final_dist')) 362 | Y[Y > 1] = 1.0 363 | Y = 1 - Y 364 | K = 1000 365 | Y2 = smooth(Y,window_len=K) 366 | x = np.linspace(0, len(Y2), len(Y2)) 367 | fig3 = plt.figure() 368 | ax3 = plt.axes() 369 | ax3.plot(x,Y2) 370 | plt.xlabel('episodes') 371 | plt.ylabel('success rate') --------------------------------------------------------------------------------