├── DQN
│   ├── MountainCar_success.pt
│   ├── dqn.py
│   ├── main_dqn.py
│   ├── play.py
│   ├── readme.md
│   └── utils.py
├── README.md
├── REINFORCE
│   ├── README.md
│   ├── REINFORCE.py
│   ├── images
│   │   ├── CartPole-v0.png
│   │   └── LunarLander-v2.png
│   ├── main.py
│   └── utils.py
└── REINFORCE_baselines
    ├── README.md
    ├── REINFORCE_baselines.py
    ├── main.py
    └── utils.py

/DQN/MountainCar_success.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RajGhugare19/RL-algorithms/8501fc5a99a80383e91047807841f200a6c277a4/DQN/MountainCar_success.pt
--------------------------------------------------------------------------------

/DQN/dqn.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | import matplotlib
6 | import matplotlib.pyplot as plt
7 | from torch import optim
8 | import torchvision.transforms as T
9 | import cv2
10 | 
11 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
12 | 
13 | class DeepQNetwork(nn.Module):
14 | 
15 |     def __init__(self,learning_rate,h,w,n_actions):
16 |         super(DeepQNetwork, self).__init__()
17 |         self.conv1 = nn.Conv2d(1, 32, kernel_size=8, stride=4)
18 |         self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
19 |         self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
20 | 
21 |         def conv2d_size(size, kernel_size = 3, stride = 1):
22 |             return (size - kernel_size)// stride + 1
23 | 
24 |         convw = conv2d_size(conv2d_size(conv2d_size(w,8,4),4,2))
25 |         convh = conv2d_size(conv2d_size(conv2d_size(h,8,4),4,2))
26 |         lin_1 = convw*convh
27 | 
28 | 
29 |         self.linear1 = nn.Linear(lin_1*64,256)
30 |         self.linear2 = nn.Linear(256, n_actions)
31 | 
32 |         self.criterion = nn.MSELoss()
33 |         self.optimizer = optim.Adam(self.parameters(),lr = learning_rate)
34 | 
35 | 
36 |     def forward(self, x):
37 |         x = torch.relu(self.conv1(x))
38 |         x = torch.relu(self.conv2(x))
39 |         x = torch.relu(self.conv3(x))
40 |         x = x.view(x.shape[0],-1)
41 |         x = torch.relu(self.linear1(x))
42 |         action_values = self.linear2(x)
43 |         return action_values
44 | 
45 | class agent():
46 | 
47 |     def __init__(self,epsilon,eps_decay,epsilon_min,gamma,l_r,n_actions,memory,batch_size,target_update,env,save=False):
48 |         self.epsilon = epsilon
49 |         self.eps_decay = eps_decay
50 |         self.epsilon_min = epsilon_min
51 |         self.gamma = gamma
52 |         self.env = env
53 |         self.n_actions = n_actions
54 |         self.batch_size = batch_size
55 |         self.memory = memory
56 |         self.memory_count = 0
57 |         self.ROWS = 84
58 |         self.COLS = 84
59 |         self.state_memory = torch.zeros([self.memory,1,self.ROWS,self.COLS],dtype = torch.float32)
60 |         self.next_state_memory = torch.zeros([self.memory,1,self.ROWS,self.COLS],dtype = torch.float32)
61 |         self.action_memory = torch.zeros(self.memory,dtype=torch.int32)
62 |         self.terminal_memory = torch.zeros(self.memory,dtype=torch.uint8)
63 |         self.reward_memory = torch.zeros(self.memory)
64 |         self.policy_net = DeepQNetwork(learning_rate = l_r,h=self.ROWS,w=self.COLS,n_actions=self.n_actions).to(device)
65 |         self.target_net = DeepQNetwork(learning_rate = l_r,h=self.ROWS,w=self.COLS,n_actions=self.n_actions).to(device)
66 |         self.target_update = target_update
67 |         self.save = save
68 | 
69 | 
70 | 
71 |     def choose_action(self,state):
72 |         r = np.random.random()
73 |         if r
147 |             self.epsilon = self.epsilon-self.eps_decay
148 |         return self.epsilon
149 | 
150 | 
151 | print('Done')
152 | 
--------------------------------------------------------------------------------
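
This dump truncates `/DQN/dqn.py` between lines 73 and 147, so the exploration branch of `choose_action` and the `get_state`, `store_experience`, and `learn_with_experience_replay` methods that `main_dqn.py` relies on are not shown. As a rough orientation only, and not the author's missing code, a minimal sketch of a fixed-target, experience-replay update over buffers like the ones allocated in `agent.__init__` could look as follows (the function name and argument list are hypothetical):

```python
import torch

def dqn_replay_update(policy_net, target_net, optimizer, criterion,
                      states, actions, rewards, next_states, terminals, gamma=0.99):
    """Illustrative sketch of one DQN update with experience replay and a
    fixed target network. Assumes: states/next_states are float tensors of
    shape (batch, 1, 84, 84), actions is a LongTensor of chosen action
    indices, and terminals flags episode ends, matching the buffers above.
    """
    # Q(s, a) for the actions that were actually taken
    q_pred = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # Bootstrap the target from the frozen target network (no gradient flow)
    with torch.no_grad():
        q_next = target_net(next_states).max(dim=1)[0]
        q_next[terminals.bool()] = 0.0   # no future return after a terminal state
        q_target = rewards + gamma * q_next

    loss = criterion(q_pred, q_target)   # e.g. the MSELoss held by DeepQNetwork
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```

The target network is only refreshed every `target_update` episodes in `main_dqn.py`, which is what keeps the bootstrapped targets fixed between refreshes.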
/DQN/main_dqn.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from dqn import agent
3 | import numpy as np
4 | import torch
5 | from utils import plot_durations
6 | from utils import plot_learning_curve
7 | 
8 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
9 | 
10 | if __name__ == '__main__':
11 | 
12 |     env = gym.make('MountainCar-v0')
13 | 
14 |     A = agent(epsilon=1,eps_decay=0.005,epsilon_min=0.01,gamma=0.99,l_r=0.0001,n_actions=3,
15 |               memory=20000,batch_size=32,target_update=7,env=env,save=True)
16 | 
17 |     scores, avg_score, epsilon_history = [], [], []
18 |     best_score = -np.inf
19 |     n_games = 1000
20 |     score = 0
21 | 
22 |     print("Save is currently set to", A.save)
23 | 
24 |     for i in range(n_games):
25 |         A.env.reset()
26 |         last_screen = A.get_state()
27 |         current_screen = A.get_state()
28 |         state = current_screen-last_screen
29 | 
30 |         done = False
31 |         score = 0
32 | 
33 |         if i%20==0 and i>0:
34 |             plot_durations(scores, 0.001)
35 |             print('----------------- training --------------------')
36 |             print('episode number', i)
37 |             print("Average score ",avg_score[-1])
38 |             print('----------------- training --------------------')
39 | 
40 |         while not done:
41 |             action = A.choose_action(state)
42 | 
43 |             _, reward, done, _ = A.env.step(action)
44 | 
45 |             last_screen = current_screen
46 |             current_screen = A.get_state()
47 | 
48 |             next_state = current_screen - last_screen
49 | 
50 |             A.store_experience(state,action,reward,done,next_state)
51 |             A.learn_with_experience_replay()
52 | 
53 |             score += reward
54 |             state = next_state
55 | 
56 |         scores.append(score)
57 |         if i>30:
58 |             avg_score.append(np.mean(scores[-30:]))
59 |         else:
60 |             avg_score.append(np.mean(scores))
61 | 
62 |         if avg_score[-1] > best_score:
63 |             torch.save(A.policy_net.state_dict(),'/home/raj/My_projects/DQN/MountanCar.pt')
64 |             best_score = avg_score[-1]
65 |             print("***************\ncurrent best average score is "+ str(best_score) +"\n***************")
66 | 
67 |         if i%A.target_update == 0:
68 |             A.target_net.load_state_dict(A.policy_net.state_dict())
69 | 
70 |         A.epsilon_decay()
71 | 
72 |     plot_durations(scores,5)
73 | 
--------------------------------------------------------------------------------

/DQN/play.py:
--------------------------------------------------------------------------------
1 | from dqn import agent
2 | import torch
3 | import gym
4 | import time
5 | import numpy as np
6 | from utils import plot_learning_curve
7 | 
8 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
9 | env = gym.make('MountainCar-v0').unwrapped
10 | 
11 | if __name__ == '__main__':
12 |     player = agent(epsilon=0,eps_decay=0,epsilon_min=0,gamma=0,l_r=0,n_actions=3,
13 |                    memory=0,batch_size=0,target_update=0,env = env,save = True)
14 |     n_games = 3
15 |     scores = []
16 |     player.policy_net.load_state_dict(torch.load('/home/raj/My_projects/DQN/MountanCar.pt'))
17 | 
18 |     for i in range(n_games):
19 |         env.reset()
20 |         last_screen = player.get_state()
21 |         current_screen = player.get_state()
22 |         state = current_screen-last_screen
23 | 
24 |         done = False
25 |         score = 0
26 |         while not done:
27 |             action = player.choose_action(state)
28 |             time.sleep(0.05)
29 |             _, reward, done, _ = player.env.step(action)
30 | 
31 |             last_screen = current_screen
32 |             current_screen = player.get_state()
33 | 
34 |             next_state = current_screen - last_screen
35 |             score += reward
36 |             state = next_state
37 | 
38 | 
39 |         scores.append(score)
40 |         print(np.mean(scores))
41 |         plot_learning_curve(i, scores,0)
42 | 
--------------------------------------------------------------------------------

/DQN/readme.md:
--------------------------------------------------------------------------------
1 | 
2 | # Deep Q-learning using fixed Q-targets and experience replay
3 | 
4 | ## Results
5 | 
6 | ### Trained Mountain Car:
7 | ![](https://media.giphy.com/media/dZopKlQbCgEBTPBy8n/giphy.gif)
8 | 
9 | ### Trained Cart Pole:
10 | ![](https://media.giphy.com/media/J5Yh1aY9WhlJc4TZFR/giphy.gif)
11 | 
12 | ## Abstract:
13 | 
14 | Function approximators such as neural networks have been combined successfully with reinforcement learning because they can approximate value functions of the environment directly from high-dimensional inputs such as audio and images. This is an implementation of *Human-level control through deep reinforcement learning* with a few practical tweaks. The implementation was first tested on the low-dimensional state inputs of the CartPole environment from OpenAI Gym, and was then applied successfully to other OpenAI Gym environments using only high-dimensional sensory inputs, without any major hyper-parameter tuning.
15 | 
16 | 
17 | ## Environments:
18 | 
19 | - **CartPole** - [https://gym.openai.com/envs/CartPole-v1/]
20 | - **MountainCar** - [https://gym.openai.com/envs/MountainCar-v0/]
21 | 
22 | ## Instructions:
23 | 
24 | ``` Hyper-parameter tuning for new problems should be done accordingly ```
25 | ``` The path used to save PyTorch model checkpoints should be changed ```
26 | 
27 | ## Dependencies:
28 | 
29 | - Anaconda: [link](https://docs.anaconda.com/anaconda/install/linux/)
30 | - OpenAI Gym: [link](https://gym.openai.com/)
31 | - PyTorch: [link](https://pytorch.org/)
32 | 
33 | ## References:
34 | 
35 | - [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)
36 | 
--------------------------------------------------------------------------------

/DQN/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from IPython.display import clear_output
4 | import matplotlib
5 | import torch
6 | import matplotlib.pyplot as plt
7 | 
8 | is_ipython = 'inline' in matplotlib.get_backend()
9 | if is_ipython:
10 |     from IPython import display
11 | 
12 | def plot_learning_curve(episode, scores, epsilon):
13 |     clear_output(True)
14 |     plt.figure(figsize=(20,5))
15 |     plt.subplot(131)
16 |     plt.title('episode %s. average_reward: %s' % (episode, np.mean(scores[-10:])))
17 |     plt.plot(scores)
18 |     plt.subplot(132)
19 |     plt.title('epsilon')
20 |     plt.plot(epsilon)
21 |     plt.show()
22 | 
23 | def plot_playing_curve(episode, scores):
24 |     clear_output(True)
25 |     plt.figure(figsize=(5,5))
26 |     plt.title('episode %s. average_reward: %s' % (episode, np.mean(scores[-10:])))
27 |     plt.plot(scores)
28 |     plt.show()
29 | 
30 | def plot_durations(scores,pause):
31 |     plt.ion()
32 |     plt.figure(2)
33 |     plt.clf()
34 | 
35 |     durations_t = torch.tensor(scores, dtype=torch.float)
36 |     plt.title('Training...')
37 |     plt.xlabel('Episode')
38 |     plt.ylabel('Scores')
39 |     plt.plot(durations_t.numpy())
40 |     # Take 20 episode averages and plot them too
41 |     if len(durations_t) >= 20:
42 |         means = durations_t.unfold(0, 20, 1).mean(1).view(-1)
43 |         means = torch.cat((torch.zeros(19), means))
44 |         plt.plot(means.numpy())
45 | 
46 |     plt.pause(pause)  # pause a bit so that plots are updated
47 |     if is_ipython:
48 |         display.clear_output(wait=True)
49 |         display.display(plt.gcf())
50 | 
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
1 | # RL-algorithms
2 | This is a repository of my code implementing deep RL research papers.
3 | 
4 | ### Algorithms implemented
5 | 
6 | ```(There is a separate readme with further details about the implementation in every folder) ```
7 | 
8 | #### Value based methods
9 | - [X] Deep Q-learning using fixed Q-targets and experience replay
10 | 
11 | #### Policy based methods
12 | - [X] REINFORCE
13 | - [X] REINFORCE with baseline
14 | 
--------------------------------------------------------------------------------

/REINFORCE/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # REINFORCE
3 | 
4 | REINFORCE is a vanilla policy-gradient approach to RL problems. This algorithm was implemented successfully on the following problems from OpenAI Gym:
5 | 
6 | ### Results:
7 | 
8 | #### CartPole-v0
9 | 
10 | ![](./images/CartPole-v0.png)
11 | 
12 | #### LunarLander-v2
13 | 
14 | ![](./images/LunarLander-v2.png)
15 | 
16 | ### Observations:
17 | In theory the method works in expectation, but individual trials occasionally give sub-optimal results. Because the new data generated always depends on the previous policy, this technique cannot be used in high-stakes situations.
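
For reference, the update implemented in `improve()` (see `REINFORCE.py` below) is the Monte-Carlo policy-gradient estimator. Written out, with discounted returns computed from each time step onwards,

```math
G_t = \sum_{k=t}^{T} \gamma^{\,k-t} r_k ,
\qquad
\mathcal{L}(\theta) = -\sum_{t} G_t \log \pi_\theta(a_t \mid s_t),
```

where the returns are additionally normalised to zero mean and unit standard deviation before the loss is formed. Each action is reinforced in proportion to the (normalised) return that followed it, which is exactly why single noisy trajectories can pull the policy toward sub-optimal behaviour.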
18 | 
19 | ### Dependencies:
20 | 
21 | * [OpenAI Gym](https://gym.openai.com/)
22 | * [PyTorch](https://pytorch.org/)
23 | 
24 | 
--------------------------------------------------------------------------------

/REINFORCE/REINFORCE.py:
--------------------------------------------------------------------------------
1 | import torch as T
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | import numpy as np
6 | 
7 | class Policy(nn.Module):
8 |     def __init__(self, lr, input_dims, h1, h2, n_actions):
9 |         super(Policy,self).__init__()
10 |         self.input_dims = input_dims
11 |         self.lr = lr
12 |         self.h1 = h1
13 |         self.h2 = h2
14 |         self.n_actions = n_actions
15 |         self.linear1 = nn.Linear(*self.input_dims, self.h1)
16 |         self.linear2 = nn.Linear(self.h1, self.h2)
17 |         self.linear3 = nn.Linear(self.h2, self.n_actions)
18 |         self.optimizer = optim.Adam(self.parameters(), lr=lr)
19 | 
20 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu:0')
21 |         self.to(self.device)
22 | 
23 |     def forward(self,obs):
24 |         x = T.tensor(obs,dtype=T.float).to(self.device)
25 |         x = F.relu(self.linear1(x))
26 |         x = F.relu(self.linear2(x))
27 |         x = self.linear3(x)
28 | 
29 |         return x
30 | 
31 | class Agent(object):
32 |     def __init__(self, lr, input_dims, gamma=0.99, n_actions=2, h1=128, h2=128):
33 |         self.gamma = gamma
34 |         self.reward_memory = []
35 |         self.action_memory = []
36 |         self.policy = Policy(lr, input_dims, h1, h2, n_actions)
37 | 
38 |     def choose_action(self, observation):
39 |         probs = F.softmax(self.policy(observation),dim=0)
40 |         action_probs = T.distributions.Categorical(probs)
41 |         action = action_probs.sample()
42 |         log_probs = T.log(probs[action])
43 |         self.action_memory.append(log_probs)
44 | 
45 |         return action.item()
46 | 
47 |     def store_rewards(self, reward):
48 |         self.reward_memory.append(reward)
49 | 
50 |     def improve(self):
51 |         self.policy.optimizer.zero_grad()
52 |         G = np.zeros_like(self.reward_memory, dtype=np.float64)
53 |         for t in range(len(self.reward_memory)):
54 |             g_sum = 0
55 |             disc = 1
56 |             for i in range(t, len(self.reward_memory)):
57 |                 g_sum += self.reward_memory[i]*disc
58 |                 disc *= self.gamma
59 |             G[t] = g_sum
60 |         G = (G - np.mean(G))/(np.std(G) if np.std(G) > 0 else 1)
61 | 
62 |         G = T.tensor(G, dtype=T.float).to(self.policy.device)
63 | 
64 |         loss = 0
65 |         for g,log_prob in zip(G, self.action_memory):
66 |             loss += -g * log_prob
67 | 
68 |         loss.backward()
69 |         self.policy.optimizer.step()
70 | 
71 |         self.action_memory = []
72 |         self.reward_memory = []
73 | 
--------------------------------------------------------------------------------

/REINFORCE/images/CartPole-v0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RajGhugare19/RL-algorithms/8501fc5a99a80383e91047807841f200a6c277a4/REINFORCE/images/CartPole-v0.png
--------------------------------------------------------------------------------

/REINFORCE/images/LunarLander-v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RajGhugare19/RL-algorithms/8501fc5a99a80383e91047807841f200a6c277a4/REINFORCE/images/LunarLander-v2.png
--------------------------------------------------------------------------------

/REINFORCE/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from REINFORCE import Agent
3 | from utils import plot_score
4 | import numpy as np
5 | import torch
6 | from gym import wrappers
7 | 
8 | 
9 | NAME = "LunarLander-v2"
10 | INPUT_DIMS = [8]
11 | GAMMA = 0.99
12 | N_ACTIONS = 4
13 | N_GAMES = 200
14 | 
15 | if __name__ == '__main__':
16 |     env = gym.make(NAME)
17 |     agent = Agent(lr=0.001, input_dims=INPUT_DIMS, gamma=GAMMA, n_actions=N_ACTIONS,
18 |                   h1=64, h2=32)
19 |     score_history = []
20 |     score = 0
21 |     best_score = -1000
22 | 
23 |     for i in range(N_GAMES):
24 |         print('episode: ', i, 'score %.3f' % score)
25 |         done = False
26 |         score = 0
27 |         state = env.reset()
28 |         while not done:
29 |             action = agent.choose_action(state)
30 |             next_state, reward, done, _ = env.step(action)
31 |             agent.store_rewards(reward)
32 |             state = next_state
33 |             score += reward
34 |         if(np.mean(score_history[-20:])>best_score and i>20):
35 |             torch.save(agent.policy.state_dict(),'./params/'+NAME+'.pt')
36 |             best_score = np.mean(score_history[-20:])
37 |         score_history.append(score)
38 |         agent.improve()
39 | 
40 |     plot_score(score_history,NAME,save=True)
41 | 
--------------------------------------------------------------------------------

/REINFORCE/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import time
3 | import numpy as np
4 | 
5 | def plot_score(score_history,exp,save=False):
6 |     score = np.array(score_history)
7 |     iters = np.arange(len(score_history))
8 |     plt.plot(iters,score)
9 |     plt.xlabel('training iterations')
10 |     plt.ylabel('Total scores obtained')
11 |     plt.title('REINFORCE ' + exp)
12 |     if(save):
13 |         plt.savefig('./images/'+exp+'.png')
14 |     plt.legend()
15 |     plt.show()
16 | 
--------------------------------------------------------------------------------

/REINFORCE_baselines/README.md:
--------------------------------------------------------------------------------
1 | # REINFORCE with baseline
2 | 
3 | 
4 | ### Observations:
5 | Baselines are used to reduce the variance of the gradient estimate of the policy's performance without changing its expectation.
6 | 
7 | ### Dependencies:
8 | 
9 | * [OpenAI Gym](https://gym.openai.com/)
10 | * [PyTorch](https://pytorch.org/)
11 | 
--------------------------------------------------------------------------------

/REINFORCE_baselines/REINFORCE_baselines.py:
--------------------------------------------------------------------------------
1 | import torch as T
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | import numpy as np
6 | 
7 | class Policy(nn.Module):
8 |     def __init__(self, lr, input_dims, h1, h2, n_actions):
9 |         super(Policy,self).__init__()
10 |         self.input_dims = input_dims
11 |         self.lr = lr
12 |         self.h1 = h1
13 |         self.h2 = h2
14 |         self.n_actions = n_actions
15 |         self.linear1 = nn.Linear(*self.input_dims, self.h1)
16 |         self.linear2 = nn.Linear(self.h1, self.h2)
17 |         self.linear3 = nn.Linear(self.h2, self.n_actions)
18 |         self.optimizer = optim.Adam(self.parameters(), lr=lr)
19 | 
20 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu:0')
21 |         self.to(self.device)
22 | 
23 |     def forward(self,obs):
24 |         x = T.tensor(obs,dtype=T.float).to(self.device)
25 |         x = F.relu(self.linear1(x))
26 |         x = F.relu(self.linear2(x))
27 |         x = self.linear3(x)
28 | 
29 |         return x
30 | 
31 | class Value(nn.Module):
32 |     def __init__(self, lr, input_dims, h1):
33 |         super(Value,self).__init__()
34 |         self.input_dims = input_dims
35 |         self.lr = lr
36 |         self.h1 = h1
37 |         self.linear1 = nn.Linear(*self.input_dims, self.h1)
38 |         self.linear2 = nn.Linear(self.h1, 1)
39 | 
40 |         self.optimizer = optim.Adam(self.parameters(), lr=lr)
41 |         self.loss = nn.MSELoss()
42 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu:0')
43 |         self.to(self.device)
44 | 
45 |     def forward(self,obs):
46 |         x = T.tensor(obs,dtype=T.float).to(self.device)
47 |         x = F.relu(self.linear1(x))
48 |         x = self.linear2(x)
49 | 
50 |         return x
51 | 
52 | 
53 | class Agent(object):
54 |     def __init__(self, lr, input_dims, gamma=0.96, n_actions=2, h1=128, h2=128, vh1=256, alpha=0.0003):
55 |         self.gamma = gamma
56 |         self.reward_memory = []
57 |         self.action_memory = []
58 |         self.state_memory = []
59 |         self.policy = Policy(lr, input_dims, h1, h2, n_actions)
60 |         self.value = Value(alpha, input_dims, vh1)
61 | 
62 |     def choose_action(self, observation):
63 |         probs = F.softmax(self.policy(observation),dim=0)
64 |         action_probs = T.distributions.Categorical(probs)
65 |         action = action_probs.sample()
66 |         log_probs = T.log(probs[action])
67 |         self.action_memory.append(log_probs)
68 | 
69 |         return action.item()
70 | 
71 |     def store_rewards(self, reward):
72 |         self.reward_memory.append(reward)
73 | 
74 |     def store_state(self, state):
75 |         self.state_memory.append(state)
76 | 
77 |     def improve(self):
78 |         G = np.zeros_like(self.reward_memory, dtype=np.float64)
79 |         for t in range(len(self.reward_memory)):
80 |             g_sum = 0
81 |             disc = 1
82 |             for i in range(t, len(self.reward_memory)):
83 |                 g_sum += self.reward_memory[i]*disc
84 |                 disc *= self.gamma
85 |             G[t] = g_sum
86 | 
87 |         G = (G - np.mean(G))/(np.std(G) if np.std(G) > 0 else 1)
88 |         G = T.tensor(G, dtype=T.float).to(self.policy.device)
89 | 
90 |         s = np.array(self.state_memory)
91 |         s = T.tensor(s, dtype=T.float).to(self.value.device)
92 |         val = self.value.forward(s).squeeze(dim=1)
93 | 
94 |         delta = G.clone().detach()
95 |         delta = delta.to(self.value.device)
96 |         self.value.optimizer.zero_grad()
97 | 
98 |         lossv = self.value.loss(val, delta)
99 |         lossv.backward(retain_graph = True)
100 |         self.value.optimizer.step()
101 | 
102 |         self.policy.optimizer.zero_grad()
103 | 
104 |         G = G - val
105 |         lossp = 0
106 |         for g,log_prob in zip(G, self.action_memory):
107 |             lossp += -g * log_prob
108 | 
109 |         lossp.backward()
110 |         self.policy.optimizer.step()
111 | 
112 | 
113 |         self.action_memory = []
114 |         self.reward_memory = []
115 |         self.state_memory = []
116 | 
--------------------------------------------------------------------------------

/REINFORCE_baselines/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from REINFORCE_baselines import Agent
3 | from utils import plot_score
4 | import numpy as np
5 | import torch
6 | from gym import wrappers
7 | 
8 | NAME = "CartPole-v0"
9 | INPUT_DIMS = [4]
10 | GAMMA = 0.99
11 | N_ACTIONS = 2
12 | N_GAMES = 400
13 | best_score = -1000
14 | 
15 | if __name__ == '__main__':
16 |     env = gym.make(NAME)
17 |     agent = Agent(lr=0.001, input_dims=INPUT_DIMS, gamma=GAMMA, n_actions=N_ACTIONS,
18 |                   h1=64, h2=32, alpha = 0.001)
19 |     score_history = []
20 |     score = 0
21 |     n_games = N_GAMES
22 |     for i in range(n_games):
23 |         print('episode: ', i, 'score %.3f' % score)
24 |         done = False
25 |         score = 0
26 |         state = env.reset()
27 |         while not done:
28 |             action = agent.choose_action(state)
29 |             next_state, reward, done, _ = env.step(action)
30 |             agent.store_rewards(reward)
31 |             agent.store_state(state)
32 |             state = next_state
33 |             score += reward
34 |         score_history.append(score)
35 |         agent.improve()
36 |         if(np.mean(score_history[-20:])>best_score and i>20):
37 |             torch.save(agent.policy.state_dict(),'./params/'+NAME+'.pt')
38 |             best_score = np.mean(score_history[-20:])
39 |     plot_score(score_history,NAME,save=True)
40 | 
41 | 
42 | def play(n_games, agent):
43 |     score = 0
44 | 
--------------------------------------------------------------------------------

/REINFORCE_baselines/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import time
3 | import numpy as np
4 | 
5 | def plot_score(score_history,exp,save=False):
6 |     score = np.array(score_history)
7 |     iters = np.arange(len(score_history))
8 |     plt.plot(iters,score)
9 |     plt.xlabel('training iterations')
10 |     plt.ylabel('Total scores obtained')
11 | 
12 |     if(save):
13 |         plt.savefig('./images/'+exp+'.png')
14 | 
15 |     plt.show()
16 | 
--------------------------------------------------------------------------------
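
`/REINFORCE_baselines/main.py` above ends with an unfinished `play(n_games, agent)` stub. A minimal sketch of what such an evaluation loop could look like, assuming the same old-style Gym `reset`/`step` API used throughout this repository and the `Agent.choose_action` interface from `REINFORCE_baselines.py` (the default environment name and the rendering flag are illustrative choices):

```python
import gym
import numpy as np

def play(n_games, agent, env_name="CartPole-v0", render=True):
    """Roll out the trained policy for a few episodes and report the mean score.

    Sketch only: `agent` is assumed to expose choose_action(state) as in
    REINFORCE_baselines.Agent.
    """
    env = gym.make(env_name)
    scores = []
    for _ in range(n_games):
        state = env.reset()
        done, score = False, 0
        while not done:
            if render:
                env.render()
            action = agent.choose_action(state)   # samples from the learned policy
            state, reward, done, _ = env.step(action)
            score += reward
        scores.append(score)
    env.close()
    # choose_action logs log-probabilities for training; discard them after evaluation
    agent.action_memory = []
    print('mean score over %d games: %.2f' % (n_games, np.mean(scores)))
    return scores
```

Loading saved weights first (for example with `agent.policy.load_state_dict(torch.load('./params/' + NAME + '.pt'))`) would mirror how `play.py` restores the DQN checkpoint.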