├── DQN
│   ├── MountainCar_success.pt
│   ├── dqn.py
│   ├── main_dqn.py
│   ├── play.py
│   ├── readme.md
│   └── utils.py
├── README.md
├── REINFORCE
│   ├── README.md
│   ├── REINFORCE.py
│   ├── images
│   │   ├── CartPole-v0.png
│   │   └── LunarLander-v2.png
│   ├── main.py
│   └── utils.py
└── REINFORCE_baselines
    ├── README.md
    ├── REINFORCE_baselines.py
    ├── main.py
    └── utils.py

/DQN/MountainCar_success.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RajGhugare19/RL-algorithms/8501fc5a99a80383e91047807841f200a6c277a4/DQN/MountainCar_success.pt
--------------------------------------------------------------------------------

/DQN/dqn.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | import matplotlib
6 | import matplotlib.pyplot as plt
7 | from torch import optim
8 | import torchvision.transforms as T
9 | import cv2
10 | 
11 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
12 | 
13 | class DeepQNetwork(nn.Module):
14 | 
15 |     def __init__(self,learning_rate,h,w,n_actions):
16 |         super(DeepQNetwork, self).__init__()
17 |         self.conv1 = nn.Conv2d(1, 32, kernel_size=8, stride=4)
18 |         self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
19 |         self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
20 | 
21 |         def conv2d_size(size, kernel_size = 3, stride = 1):
22 |             return (size - kernel_size)// stride + 1
23 | 
24 |         convw = conv2d_size(conv2d_size(conv2d_size(w,8,4),4,2))
25 |         convh = conv2d_size(conv2d_size(conv2d_size(h,8,4),4,2))
26 |         lin_1 = convw*convh
27 | 
28 | 
29 |         self.linear1 = nn.Linear(lin_1*64,256)
30 |         self.linear2 = nn.Linear(256, n_actions)
31 | 
32 |         self.criterion = nn.MSELoss()
33 |         self.optimizer = optim.Adam(self.parameters(),lr = learning_rate)
34 | 
35 | 
36 |     def forward(self, x):
37 |         x = torch.relu(self.conv1(x))
38 |         x = torch.relu(self.conv2(x))
39 |         x = torch.relu(self.conv3(x))
40 |         x = x.view(x.shape[0],-1)
41 |         x = torch.relu(self.linear1(x))
42 |         action_values = self.linear2(x)
43 |         return action_values
44 | 
45 | class agent():
46 | 
47 |     def __init__(self,epsilon,eps_decay,epsilon_min,gamma,l_r,n_actions,memory,batch_size,target_update,env,save=False):
48 |         self.epsilon = epsilon
49 |         self.eps_decay = eps_decay
50 |         self.epsilon_min = epsilon_min
51 |         self.gamma = gamma
52 |         self.env = env
53 |         self.n_actions = n_actions
54 |         self.batch_size = batch_size
55 |         self.memory = memory
56 |         self.memory_count = 0
57 |         self.ROWS = 84
58 |         self.COLS = 84
59 |         self.state_memory = torch.zeros([self.memory,1,self.ROWS,self.COLS],dtype = torch.float32)
60 |         self.next_state_memory = torch.zeros([self.memory,1,self.ROWS,self.COLS],dtype = torch.float32)
61 |         self.action_memory = torch.zeros(self.memory,dtype=torch.int32)
62 |         self.terminal_memory = torch.zeros(self.memory,dtype=torch.uint8)
63 |         self.reward_memory = torch.zeros(self.memory)
64 |         self.policy_net = DeepQNetwork(learning_rate = l_r,h=self.ROWS,w=self.COLS,n_actions=self.n_actions).to(device)
65 |         self.target_net = DeepQNetwork(learning_rate = l_r,h=self.ROWS,w=self.COLS,n_actions=self.n_actions).to(device)
66 |         self.target_update = target_update
67 |         self.save = save
68 | 
69 | 
70 | 
71 |     def choose_action(self,state):
72 |         r = np.random.random()
73 |         if r
147 |             self.epsilon = self.epsilon-self.eps_decay
148 |         return self.epsilon
149 | 
150 | 
151 | print('Done')
152 | 
--------------------------------------------------------------------------------
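
This dump truncates `/DQN/dqn.py` between lines 73 and 147, so the exploration branch of `choose_action` and the `get_state`, `store_experience`, and `learn_with_experience_replay` methods that `main_dqn.py` relies on are not shown. As a rough orientation only, and not the author's missing code, a minimal sketch of a fixed-target, experience-replay update over buffers like the ones allocated in `agent.__init__` could look as follows (the function name and argument list are hypothetical):

```python
import torch

def dqn_replay_update(policy_net, target_net, optimizer, criterion,
                      states, actions, rewards, next_states, terminals, gamma=0.99):
    """Illustrative sketch of one DQN update with experience replay and a
    fixed target network. Assumes: states/next_states are float tensors of
    shape (batch, 1, 84, 84), actions is a LongTensor of chosen action
    indices, and terminals flags episode ends, matching the buffers above.
    """
    # Q(s, a) for the actions that were actually taken
    q_pred = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # Bootstrap the target from the frozen target network (no gradient flow)
    with torch.no_grad():
        q_next = target_net(next_states).max(dim=1)[0]
        q_next[terminals.bool()] = 0.0   # no future return after a terminal state
        q_target = rewards + gamma * q_next

    loss = criterion(q_pred, q_target)   # e.g. the MSELoss held by DeepQNetwork
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```

The target network is only refreshed every `target_update` episodes in `main_dqn.py`, which is what keeps the bootstrapped targets fixed between refreshes.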
/DQN/main_dqn.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from dqn import agent
3 | import numpy as np
4 | import torch
5 | from utils import plot_durations
6 | from utils import plot_learning_curve
7 | 
8 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
9 | 
10 | if __name__ == '__main__':
11 | 
12 |     env = gym.make('MountainCar-v0')
13 | 
14 |     A = agent(epsilon=1,eps_decay=0.005,epsilon_min=0.01,gamma=0.99,l_r=0.0001,n_actions=3,
15 |               memory=20000,batch_size=32,target_update=7,env=env,save=True)
16 | 
17 |     scores, avg_score, epsilon_history = [], [], []
18 |     best_score = -np.inf
19 |     n_games = 1000
20 |     score = 0
21 | 
22 |     print("Save is currently set to", A.save)
23 | 
24 |     for i in range(n_games):
25 |         A.env.reset()
26 |         last_screen = A.get_state()
27 |         current_screen = A.get_state()
28 |         state = current_screen-last_screen
29 | 
30 |         done = False
31 |         score = 0
32 | 
33 |         if i%20==0 and i>0:
34 |             plot_durations(scores, 0.001)
35 |             print('----------------- training --------------------')
36 |             print('episode number', i)
37 |             print("Average score ",avg_score[-1])
38 |             print('----------------- training --------------------')
39 | 
40 |         while not done:
41 |             action = A.choose_action(state)
42 | 
43 |             _, reward, done, _ = A.env.step(action)
44 | 
45 |             last_screen = current_screen
46 |             current_screen = A.get_state()
47 | 
48 |             next_state = current_screen - last_screen
49 | 
50 |             A.store_experience(state,action,reward,done,next_state)
51 |             A.learn_with_experience_replay()
52 | 
53 |             score += reward
54 |             state = next_state
55 | 
56 |         scores.append(score)
57 |         if i>30:
58 |             avg_score.append(np.mean(scores[-30:]))
59 |         else:
60 |             avg_score.append(np.mean(scores))
61 | 
62 |         if avg_score[-1] > best_score:
63 |             torch.save(A.policy_net.state_dict(),'/home/raj/My_projects/DQN/MountanCar.pt')
64 |             best_score = avg_score[-1]
65 |             print("***************\ncurrent best average score is "+ str(best_score) +"\n***************")
66 | 
67 |         if i%A.target_update == 0:
68 |             A.target_net.load_state_dict(A.policy_net.state_dict())
69 | 
70 |         A.epsilon_decay()
71 | 
72 |     plot_durations(scores,5)
73 | 
--------------------------------------------------------------------------------

/DQN/play.py:
--------------------------------------------------------------------------------
1 | from dqn import agent
2 | import torch
3 | import gym
4 | import time
5 | import numpy as np
6 | from utils import plot_learning_curve
7 | 
8 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
9 | env = gym.make('MountainCar-v0').unwrapped
10 | 
11 | if __name__ == '__main__':
12 |     player = agent(epsilon=0,eps_decay=0,epsilon_min=0,gamma=0,l_r=0,n_actions=3,
13 |                    memory=0,batch_size=0,target_update=0,env = env,save = True)
14 |     n_games = 3
15 |     scores = []
16 |     player.policy_net.load_state_dict(torch.load('/home/raj/My_projects/DQN/MountanCar.pt'))
17 | 
18 |     for i in range(n_games):
19 |         env.reset()
20 |         last_screen = player.get_state()
21 |         current_screen = player.get_state()
22 |         state = current_screen-last_screen
23 | 
24 |         done = False
25 |         score = 0
26 |         while not done:
27 |             action = player.choose_action(state)
28 |             time.sleep(0.05)
29 |             _, reward, done, _ = player.env.step(action)
30 | 
31 |             last_screen = current_screen
32 |             current_screen = player.get_state()
33 | 
34 |             next_state = current_screen - last_screen
35 |             score += reward
36 |             state = next_state
37 | 
38 | 
39 |         scores.append(score)
40 |         print(np.mean(scores))
41 |         plot_learning_curve(i, scores,0)
42 | 
--------------------------------------------------------------------------------

/DQN/readme.md:
--------------------------------------------------------------------------------
1 | 
2 | # Deep Q-learning using fixed Q-targets and experience replay
3 | 
4 | ## Results
5 | 
6 | ### Trained Mountain Car:
7 | ![](https://media.giphy.com/media/dZopKlQbCgEBTPBy8n/giphy.gif)
8 | 
9 | ### Trained Cart Pole:
10 | ![](https://media.giphy.com/media/J5Yh1aY9WhlJc4TZFR/giphy.gif)
11 | 
12 | ## Abstract:
13 | 
14 | Function approximators such as neural networks have been combined successfully with reinforcement learning because they can approximate value functions of the environment directly from high-dimensional inputs such as audio and images. This is an implementation of *Human-level control through deep reinforcement learning* with a few practical tweaks. The implementation was first tested on the low-dimensional state inputs of the CartPole environment from OpenAI Gym, and was then applied successfully to other OpenAI Gym environments using only high-dimensional sensory inputs, without any major hyper-parameter tuning.
15 | 
16 | 
17 | ## Environments:
18 | 
19 | - **CartPole** - [https://gym.openai.com/envs/CartPole-v1/]
20 | - **MountainCar** - [https://gym.openai.com/envs/MountainCar-v0/]
21 | 
22 | ## Instructions:
23 | 
24 | ``` Hyper-parameter tuning for new problems should be done accordingly ```
25 | ``` The path used to save PyTorch model checkpoints should be changed ```
26 | 
27 | ## Dependencies:
28 | 
29 | - Anaconda: [link](https://docs.anaconda.com/anaconda/install/linux/)
30 | - OpenAI Gym: [link](https://gym.openai.com/)
31 | - PyTorch: [link](https://pytorch.org/)
32 | 
33 | ## References:
34 | 
35 | - [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)
36 | 
--------------------------------------------------------------------------------

/DQN/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from IPython.display import clear_output
4 | import matplotlib
5 | import torch
6 | import matplotlib.pyplot as plt
7 | 
8 | is_ipython = 'inline' in matplotlib.get_backend()
9 | if is_ipython:
10 |     from IPython import display
11 | 
12 | def plot_learning_curve(episode, scores, epsilon):
13 |     clear_output(True)
14 |     plt.figure(figsize=(20,5))
15 |     plt.subplot(131)
16 |     plt.title('episode %s. average_reward: %s' % (episode, np.mean(scores[-10:])))
17 |     plt.plot(scores)
18 |     plt.subplot(132)
19 |     plt.title('epsilon')
20 |     plt.plot(epsilon)
21 |     plt.show()
22 | 
23 | def plot_playing_curve(episode, scores):
24 |     clear_output(True)
25 |     plt.figure(figsize=(5,5))
26 |     plt.title('episode %s. average_reward: %s' % (episode, np.mean(scores[-10:])))
27 |     plt.plot(scores)
28 |     plt.show()
29 | 
30 | def plot_durations(scores,pause):
31 |     plt.ion()
32 |     plt.figure(2)
33 |     plt.clf()
34 | 
35 |     durations_t = torch.tensor(scores, dtype=torch.float)
36 |     plt.title('Training...')
37 |     plt.xlabel('Episode')
38 |     plt.ylabel('Scores')
39 |     plt.plot(durations_t.numpy())
40 |     # Take 20 episode averages and plot them too
41 |     if len(durations_t) >= 20:
42 |         means = durations_t.unfold(0, 20, 1).mean(1).view(-1)
43 |         means = torch.cat((torch.zeros(19), means))
44 |         plt.plot(means.numpy())
45 | 
46 |     plt.pause(pause)  # pause a bit so that plots are updated
47 |     if is_ipython:
48 |         display.clear_output(wait=True)
49 |         display.display(plt.gcf())
50 | 
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
1 | # RL-algorithms
2 | This is a repository of my code implementing deep RL research papers.
3 | 
4 | ### Algorithms implemented
5 | 
6 | ```(There is a separate readme with further details about the implementation in every folder) ```
7 | 
8 | #### Value based methods
9 | - [X] Deep Q-learning using fixed Q-targets and experience replay
10 | 
11 | #### Policy based methods
12 | - [X] REINFORCE
13 | - [X] REINFORCE with baseline
14 | 
--------------------------------------------------------------------------------

/REINFORCE/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # REINFORCE
3 | 
4 | REINFORCE is a vanilla policy-gradient approach to RL problems. This algorithm was implemented successfully on the following problems from OpenAI Gym:
5 | 
6 | ### Results:
7 | 
8 | #### CartPole-v0
9 | 
10 | ![](./images/CartPole-v0.png)
11 | 
12 | #### LunarLander-v2
13 | 
14 | ![](./images/LunarLander-v2.png)
15 | 
16 | ### Observations:
17 | In theory the method works in expectation, but individual trials occasionally give sub-optimal results. Because the new data generated always depends on the previous policy, this technique cannot be used in high-stakes situations.
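
For reference, the update implemented in `improve()` (see `REINFORCE.py` below) is the Monte-Carlo policy-gradient estimator. Written out, with discounted returns computed from each time step onwards,

```math
G_t = \sum_{k=t}^{T} \gamma^{\,k-t} r_k ,
\qquad
\mathcal{L}(\theta) = -\sum_{t} G_t \log \pi_\theta(a_t \mid s_t),
```

where the returns are additionally normalised to zero mean and unit standard deviation before the loss is formed. Each action is reinforced in proportion to the (normalised) return that followed it, which is exactly why single noisy trajectories can pull the policy toward sub-optimal behaviour.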
18 | 
19 | ### Dependencies:
20 | 
21 | * [OpenAI Gym](https://gym.openai.com/)
22 | * [PyTorch](https://pytorch.org/)
23 | 
24 | 
--------------------------------------------------------------------------------

/REINFORCE/REINFORCE.py:
--------------------------------------------------------------------------------
1 | import torch as T
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | import numpy as np
6 | 
7 | class Policy(nn.Module):
8 |     def __init__(self, lr, input_dims, h1, h2, n_actions):
9 |         super(Policy,self).__init__()
10 |         self.input_dims = input_dims
11 |         self.lr = lr
12 |         self.h1 = h1
13 |         self.h2 = h2
14 |         self.n_actions = n_actions
15 |         self.linear1 = nn.Linear(*self.input_dims, self.h1)
16 |         self.linear2 = nn.Linear(self.h1, self.h2)
17 |         self.linear3 = nn.Linear(self.h2, self.n_actions)
18 |         self.optimizer = optim.Adam(self.parameters(), lr=lr)
19 | 
20 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu:0')
21 |         self.to(self.device)
22 | 
23 |     def forward(self,obs):
24 |         x = T.tensor(obs,dtype=T.float).to(self.device)
25 |         x = F.relu(self.linear1(x))
26 |         x = F.relu(self.linear2(x))
27 |         x = self.linear3(x)
28 | 
29 |         return x
30 | 
31 | class Agent(object):
32 |     def __init__(self, lr, input_dims, gamma=0.99, n_actions=2, h1=128, h2=128):
33 |         self.gamma = gamma
34 |         self.reward_memory = []
35 |         self.action_memory = []
36 |         self.policy = Policy(lr, input_dims, h1, h2, n_actions)
37 | 
38 |     def choose_action(self, observation):
39 |         probs = F.softmax(self.policy(observation),dim=0)
40 |         action_probs = T.distributions.Categorical(probs)
41 |         action = action_probs.sample()
42 |         log_probs = T.log(probs[action])
43 |         self.action_memory.append(log_probs)
44 | 
45 |         return action.item()
46 | 
47 |     def store_rewards(self, reward):
48 |         self.reward_memory.append(reward)
49 | 
50 |     def improve(self):
51 |         self.policy.optimizer.zero_grad()
52 |         G = np.zeros_like(self.reward_memory, dtype=np.float64)
53 |         for t in range(len(self.reward_memory)):
54 |             g_sum = 0
55 |             disc = 1
56 |             for i in range(t, len(self.reward_memory)):
57 |                 g_sum += self.reward_memory[i]*disc
58 |                 disc *= self.gamma
59 |             G[t] = g_sum
60 |         G = (G - np.mean(G))/(np.std(G) if np.std(G) > 0 else 1)
61 | 
62 |         G = T.tensor(G, dtype=T.float).to(self.policy.device)
63 | 
64 |         loss = 0
65 |         for g,log_prob in zip(G, self.action_memory):
66 |             loss += -g * log_prob
67 | 
68 |         loss.backward()
69 |         self.policy.optimizer.step()
70 | 
71 |         self.action_memory = []
72 |         self.reward_memory = []
73 | 
--------------------------------------------------------------------------------

/REINFORCE/images/CartPole-v0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RajGhugare19/RL-algorithms/8501fc5a99a80383e91047807841f200a6c277a4/REINFORCE/images/CartPole-v0.png
--------------------------------------------------------------------------------

/REINFORCE/images/LunarLander-v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RajGhugare19/RL-algorithms/8501fc5a99a80383e91047807841f200a6c277a4/REINFORCE/images/LunarLander-v2.png
--------------------------------------------------------------------------------

/REINFORCE/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from REINFORCE import Agent
3 | from utils import plot_score
4 | import numpy as np
5 | import torch
6 | from gym import wrappers
7 | 
8 | 
9 | NAME = "LunarLander-v2"
10 | INPUT_DIMS = [8]
11 | GAMMA = 0.99
12 | N_ACTIONS = 4
13 | N_GAMES = 200
14 | 
15 | if __name__ == '__main__':
16 |     env = gym.make(NAME)
17 |     agent = Agent(lr=0.001, input_dims=INPUT_DIMS, gamma=GAMMA, n_actions=N_ACTIONS,
18 |                   h1=64, h2=32)
19 |     score_history = []
20 |     score = 0
21 |     best_score = -1000
22 | 
23 |     for i in range(N_GAMES):
24 |         print('episode: ', i, 'score %.3f' % score)
25 |         done = False
26 |         score = 0
27 |         state = env.reset()
28 |         while not done:
29 |             action = agent.choose_action(state)
30 |             next_state, reward, done, _ = env.step(action)
31 |             agent.store_rewards(reward)
32 |             state = next_state
33 |             score += reward
34 |         if(np.mean(score_history[-20:])>best_score and i>20):
35 |             torch.save(agent.policy.state_dict(),'./params/'+NAME+'.pt')
36 |             best_score = np.mean(score_history[-20:])
37 |         score_history.append(score)
38 |         agent.improve()
39 | 
40 |     plot_score(score_history,NAME,save=True)
41 | 
--------------------------------------------------------------------------------

/REINFORCE/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import time
3 | import numpy as np
4 | 
5 | def plot_score(score_history,exp,save=False):
6 |     score = np.array(score_history)
7 |     iters = np.arange(len(score_history))
8 |     plt.plot(iters,score)
9 |     plt.xlabel('training iterations')
10 |     plt.ylabel('Total scores obtained')
11 |     plt.title('REINFORCE ' + exp)
12 |     if(save):
13 |         plt.savefig('./images/'+exp+'.png')
14 |     plt.legend()
15 |     plt.show()
16 | 
--------------------------------------------------------------------------------

/REINFORCE_baselines/README.md:
--------------------------------------------------------------------------------
1 | # REINFORCE with baseline
2 | 
3 | 
4 | ### Observations:
5 | Baselines are used to reduce the variance of the gradient estimate of the policy's performance without changing its expectation.
6 | 
7 | ### Dependencies:
8 | 
9 | * [OpenAI Gym](https://gym.openai.com/)
10 | * [PyTorch](https://pytorch.org/)
11 | 
--------------------------------------------------------------------------------

/REINFORCE_baselines/REINFORCE_baselines.py:
--------------------------------------------------------------------------------
1 | import torch as T
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | import numpy as np
6 | 
7 | class Policy(nn.Module):
8 |     def __init__(self, lr, input_dims, h1, h2, n_actions):
9 |         super(Policy,self).__init__()
10 |         self.input_dims = input_dims
11 |         self.lr = lr
12 |         self.h1 = h1
13 |         self.h2 = h2
14 |         self.n_actions = n_actions
15 |         self.linear1 = nn.Linear(*self.input_dims, self.h1)
16 |         self.linear2 = nn.Linear(self.h1, self.h2)
17 |         self.linear3 = nn.Linear(self.h2, self.n_actions)
18 |         self.optimizer = optim.Adam(self.parameters(), lr=lr)
19 | 
20 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu:0')
21 |         self.to(self.device)
22 | 
23 |     def forward(self,obs):
24 |         x = T.tensor(obs,dtype=T.float).to(self.device)
25 |         x = F.relu(self.linear1(x))
26 |         x = F.relu(self.linear2(x))
27 |         x = self.linear3(x)
28 | 
29 |         return x
30 | 
31 | class Value(nn.Module):
32 |     def __init__(self, lr, input_dims, h1):
33 |         super(Value,self).__init__()
34 |         self.input_dims = input_dims
35 |         self.lr = lr
36 |         self.h1 = h1
37 |         self.linear1 = nn.Linear(*self.input_dims, self.h1)
38 |         self.linear2 = nn.Linear(self.h1, 1)
39 | 
40 |         self.optimizer = optim.Adam(self.parameters(), lr=lr)
41 |         self.loss = nn.MSELoss()
42 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu:0')
43 |         self.to(self.device)
44 | 
45 |     def forward(self,obs):
46 |         x = T.tensor(obs,dtype=T.float).to(self.device)
47 |         x = F.relu(self.linear1(x))
48 |         x = self.linear2(x)
49 | 
50 |         return x
51 | 
52 | 
53 | class Agent(object):
54 |     def __init__(self, lr, input_dims, gamma=0.96, n_actions=2, h1=128, h2=128, vh1=256, alpha=0.0003):
55 |         self.gamma = gamma
56 |         self.reward_memory = []
57 |         self.action_memory = []
58 |         self.state_memory = []
59 |         self.policy = Policy(lr, input_dims, h1, h2, n_actions)
60 |         self.value = Value(alpha, input_dims, vh1)
61 | 
62 |     def choose_action(self, observation):
63 |         probs = F.softmax(self.policy(observation),dim=0)
64 |         action_probs = T.distributions.Categorical(probs)
65 |         action = action_probs.sample()
66 |         log_probs = T.log(probs[action])
67 |         self.action_memory.append(log_probs)
68 | 
69 |         return action.item()
70 | 
71 |     def store_rewards(self, reward):
72 |         self.reward_memory.append(reward)
73 | 
74 |     def store_state(self, state):
75 |         self.state_memory.append(state)
76 | 
77 |     def improve(self):
78 |         G = np.zeros_like(self.reward_memory, dtype=np.float64)
79 |         for t in range(len(self.reward_memory)):
80 |             g_sum = 0
81 |             disc = 1
82 |             for i in range(t, len(self.reward_memory)):
83 |                 g_sum += self.reward_memory[i]*disc
84 |                 disc *= self.gamma
85 |             G[t] = g_sum
86 | 
87 |         G = (G - np.mean(G))/(np.std(G) if np.std(G) > 0 else 1)
88 |         G = T.tensor(G, dtype=T.float).to(self.policy.device)
89 | 
90 |         s = np.array(self.state_memory)
91 |         s = T.tensor(s, dtype=T.float).to(self.value.device)
92 |         val = self.value.forward(s).squeeze(dim=1)
93 | 
94 |         delta = G.clone().detach()
95 |         delta = delta.to(self.value.device)
96 |         self.value.optimizer.zero_grad()
97 | 
98 |         lossv = self.value.loss(val, delta)
99 |         lossv.backward(retain_graph = True)
100 |         self.value.optimizer.step()
101 | 
102 |         self.policy.optimizer.zero_grad()
103 | 
104 |         G = G - val
105 |         lossp = 0
106 |         for g,log_prob in zip(G, self.action_memory):
107 |             lossp += -g * log_prob
108 | 
109 |         lossp.backward()
110 |         self.policy.optimizer.step()
111 | 
112 | 
113 |         self.action_memory = []
114 |         self.reward_memory = []
115 |         self.state_memory = []
116 | 
--------------------------------------------------------------------------------

/REINFORCE_baselines/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from REINFORCE_baselines import Agent
3 | from utils import plot_score
4 | import numpy as np
5 | import torch
6 | from gym import wrappers
7 | 
8 | NAME = "CartPole-v0"
9 | INPUT_DIMS = [4]
10 | GAMMA = 0.99
11 | N_ACTIONS = 2
12 | N_GAMES = 400
13 | best_score = -1000
14 | 
15 | if __name__ == '__main__':
16 |     env = gym.make(NAME)
17 |     agent = Agent(lr=0.001, input_dims=INPUT_DIMS, gamma=GAMMA, n_actions=N_ACTIONS,
18 |                   h1=64, h2=32, alpha = 0.001)
19 |     score_history = []
20 |     score = 0
21 |     n_games = N_GAMES
22 |     for i in range(n_games):
23 |         print('episode: ', i, 'score %.3f' % score)
24 |         done = False
25 |         score = 0
26 |         state = env.reset()
27 |         while not done:
28 |             action = agent.choose_action(state)
29 |             next_state, reward, done, _ = env.step(action)
30 |             agent.store_rewards(reward)
31 |             agent.store_state(state)
32 |             state = next_state
33 |             score += reward
34 |         score_history.append(score)
35 |         agent.improve()
36 |         if(np.mean(score_history[-20:])>best_score and i>20):
37 |             torch.save(agent.policy.state_dict(),'./params/'+NAME+'.pt')
38 |             best_score = np.mean(score_history[-20:])
39 |     plot_score(score_history,NAME,save=True)
40 | 
41 | 
42 | def play(n_games, agent):
43 |     score = 0
44 | 
--------------------------------------------------------------------------------

/REINFORCE_baselines/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import time
3 | import numpy as np
4 | 
5 | def plot_score(score_history,exp,save=False):
6 |     score = np.array(score_history)
7 |     iters = np.arange(len(score_history))
8 |     plt.plot(iters,score)
9 |     plt.xlabel('training iterations')
10 |     plt.ylabel('Total scores obtained')
11 | 
12 |     if(save):
13 |         plt.savefig('./images/'+exp+'.png')
14 | 
15 |     plt.show()
16 | 
--------------------------------------------------------------------------------
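
`/REINFORCE_baselines/main.py` above ends with an unfinished `play(n_games, agent)` stub. A minimal sketch of what such an evaluation loop could look like, assuming the same old-style Gym `reset`/`step` API used throughout this repository and the `Agent.choose_action` interface from `REINFORCE_baselines.py` (the default environment name and the rendering flag are illustrative choices):

```python
import gym
import numpy as np

def play(n_games, agent, env_name="CartPole-v0", render=True):
    """Roll out the trained policy for a few episodes and report the mean score.

    Sketch only: `agent` is assumed to expose choose_action(state) as in
    REINFORCE_baselines.Agent.
    """
    env = gym.make(env_name)
    scores = []
    for _ in range(n_games):
        state = env.reset()
        done, score = False, 0
        while not done:
            if render:
                env.render()
            action = agent.choose_action(state)   # samples from the learned policy
            state, reward, done, _ = env.step(action)
            score += reward
        scores.append(score)
    env.close()
    # choose_action logs log-probabilities for training; discard them after evaluation
    agent.action_memory = []
    print('mean score over %d games: %.2f' % (n_games, np.mean(scores)))
    return scores
```

Loading saved weights first (for example with `agent.policy.load_state_dict(torch.load('./params/' + NAME + '.pt'))`) would mirror how `play.py` restores the DQN checkpoint.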