├── .gitignore
├── 1. Bandits
│   ├── Epsilon-greedy.py
│   ├── Gradient-bandit.py
│   ├── Optim-init-value.py
│   └── README.md
├── 2. DP methods
│   ├── One cycle PI.py
│   ├── README.md
│   ├── algorithms.py
│   └── deterministic VI.py
├── 3. Model free methods
│   ├── Monte Carlo
│   │   ├── MC evaluation.py
│   │   ├── On policy MC.py
│   │   ├── README.md
│   │   └── utils.py
│   ├── README.md
│   └── TD Learning
│       ├── QLearning.py
│       ├── README.md
│       ├── SARSA.py
│       └── sarsa taxi.png
├── Deterministic PG
│   ├── DDPG
│   │   ├── Inverted pendulum.png
│   │   ├── README.md
│   │   ├── agent.py
│   │   └── ddpg.py
│   ├── README.md
│   └── TD3
│       ├── agent.py
│       └── td3.py
├── README.md
└── Vanilla Policy gradient methods
    ├── Actor critic
    │   ├── agent.py
    │   └── model.py
    ├── README.md
    └── REINFORCE
        ├── Cartpole_result.png
        ├── README.md
        ├── REINFORCE.py
        ├── model.py
        └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.vscode
2 | *.pyc
3 | *.pyo
4 | *.ipynb_checkpoints
5 | .installed.cfg
6 | bin
7 | develop-eggs
8 | dist
9 | downloads
10 | eggs
11 | parts
12 | src/*.egg-info
13 | __pycache__/
14 | *.py[cod]
--------------------------------------------------------------------------------
/1. Bandits/Epsilon-greedy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 |
4 | class GaussianBandit(object):
5 |
6 | def __init__(self, num_arms, variance=1, mean=10):
7 |
8 | self.mean = [random.uniform(-mean, mean) for i in range(num_arms)]
9 | self.max_value_arm = np.argmax(self.mean)
10 | self.variance = variance
11 |
12 | def Rewards(self, arm):
13 |
14 | return random.gauss(self.mean[arm], self.variance)
15 |
16 |
17 | def EpsilonGreedy(epsilon, num_arms, iter, alpha):
18 |
19 | q_value = np.zeros(num_arms)
20 | arm_occur = np.zeros(num_arms)
21 | arm_reward = np.zeros(num_arms)
22 |
23 | for i in range(iter):
24 |
25 | rand = random.uniform(0,1)
26 |
27 | if(rand > epsilon):
28 | arm = np.argmax(q_value)
29 | else:
30 | arm = int(random.uniform(0,num_arms))
31 |
32 | arm_occur[arm] += 1
33 | reward = bandit.Rewards(arm)
34 | arm_reward[arm] += reward
35 |
36 | q_value[arm] = q_value[arm] + alpha*(reward - q_value[arm])
37 |
38 |
39 | if(i%1000 == 0):
40 | epsilon /= 2
41 | print('\nrewards are - ', arm_reward)
42 | print('-----------------------------------------------------------')
43 | print('\narm chosen maximum number of times -', np.argmax(arm_occur)+1)
44 | print('arm with max expected return is - ', bandit.max_value_arm+1)
45 |
46 |
47 | iter = 10000
48 | num_arms = 10
49 | alpha = 0.5
50 | epsilon = 0.7
51 | mean = 10
52 | variance = 1
53 |
54 | bandit = GaussianBandit(num_arms, variance, mean)
55 | print('true expected rewards\n\n',bandit.mean,'\n')
56 | print('---------------------------------------------------')
57 |
58 | EpsilonGreedy(epsilon, num_arms, iter, alpha)
59 |
--------------------------------------------------------------------------------
/1. Bandits/Gradient-bandit.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random as rd
3 |
4 | class GaussianBandit(object):
5 |
6 | def __init__(self, num_arms, mean=10, variance=1):
7 |
8 | self.variance = variance
9 | self.mean = [rd.uniform(-mean, mean) for i in range(num_arms)]
10 | self.max_value_arm = np.argmax(self.mean)
11 |
12 | def Rewards(self, arm):
13 |
14 | return rd.gauss(self.mean[arm], self.variance)
15 |
16 |
17 | def Softmax(arm_preference):
18 |
19 |     # numerically stable softmax over the preference vector
20 |     a = np.exp(arm_preference - np.max(arm_preference))
21 |     z = np.sum(a)
22 |
23 |     return a/z
24 |
25 |
26 | def GradientBandit(num_arms, alpha, iterations):
27 |
28 |     arm_preference = np.zeros(num_arms)
29 |     arm_reward = np.zeros(num_arms)
30 |     arm_occur = np.zeros(num_arms)
31 |     avg_reward = 0      # running average of all rewards so far, used as the baseline
32 |
33 |     for i in range(iterations):
34 |
35 |         # sample an arm from the softmax distribution over the preferences
36 |         softmax = Softmax(arm_preference)
37 |         arm = int(np.random.choice(num_arms, p=softmax))
38 |         arm_occur[arm] += 1
39 |
40 |         reward = bandit.Rewards(arm)
41 |         arm_reward[arm] += reward
42 |         avg_reward = avg_reward + (reward - avg_reward)/(i+1)
43 |
44 |         # gradient-bandit preference update (Sutton & Barto, eq. 2.12)
45 |         for a in range(num_arms):
46 |
47 |             if(a == arm):
48 |                 arm_preference[a] = arm_preference[a] + alpha*(reward - avg_reward)*(1 - softmax[a])
49 |             else:
50 |                 arm_preference[a] = arm_preference[a] - alpha*(reward - avg_reward)*softmax[a]
51 |
52 |
53 |         if(i%1000 == 0):
54 |             print('\nrewards are - ', arm_reward)
55 |             print('\narm with max preference is - ', np.argmax(arm_preference))
56 |             print('\narm with max expected return is - ', bandit.max_value_arm)
57 |             print('-----------------------------------------------------------')
58 |
59 |
60 |
61 | num_arms = 10
62 | mean = 10
63 | variance = 1
64 | alpha = 0.5
65 | iterations = 5000
66 |
67 | bandit = GaussianBandit(num_arms, mean, variance)
68 | print(bandit.mean)
69 | print('----------------------------------------------------')
70 |
71 | GradientBandit(num_arms, alpha, iterations)
--------------------------------------------------------------------------------
/1. Bandits/Optim-init-value.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random as rd
3 |
4 | class GaussianBandit(object):
5 |
6 | def __init__(self, num_arms, variance=1, mean=10):
7 |
8 | self.mean = [rd.uniform(-mean,mean) for i in range(num_arms)]
9 | self.max_value_arm = np.argmax(self.mean)
10 | self.variance = variance
11 |
12 | def Rewards(self, arm):
13 |
14 | return rd.gauss(self.mean[arm], self.variance)
15 |
16 |
17 | def OptimValue(optim_value, alpha, num_arms, iterations):
18 |
19 | q_value = optim_value*(np.ones(num_arms))
20 | arm_occur = np.zeros(num_arms)
21 | arm_reward = np.zeros(num_arms)
22 |
23 | arm = int(rd.uniform(0,num_arms))
24 |
25 | for i in range(iterations):
26 |
27 | arm_occur[arm] += 1
28 | reward = bandit.Rewards(arm)
29 | arm_reward[arm] += reward
30 |
31 | q_value[arm] = q_value[arm] + alpha*(reward - q_value[arm])
32 |
33 | arm = np.argmax(q_value)
34 |
35 | if(i%1000 == 0):
36 | print('\nrewards are - ', arm_reward)
37 | print('-----------------------------------------------------------')
38 | print('\narm chosen maximum number of times', np.argmax(arm_occur)+1)
39 | print('arm with maximum true expected reward', bandit.max_value_arm+1)
40 |
41 | optim_value = 30
42 | num_arms = 10
43 | variance = 1
44 | mean = 10
45 | alpha = 0.5
46 | iterations = 5000
47 |
48 | bandit = GaussianBandit(num_arms, variance, mean)
49 | print(bandit.mean)
50 | print('\narm with max expected return is - ', bandit.max_value_arm + 1)
51 | print('----------------------------------------------------')
52 |
53 | OptimValue(optim_value, alpha, num_arms, iterations)
54 |
--------------------------------------------------------------------------------
/1. Bandits/README.md:
--------------------------------------------------------------------------------
1 | # Multi-Armed Bandit
2 |
3 | ### Summary
4 | This is a classic reinforcement learning problem that exemplifies the exploration-exploitation tradeoff. Each arm can be thought of as the arm of a slot machine. The reward may differ every time we pull an arm, but there is a true expected reward associated with each machine.
5 | To estimate that true expected reward we would have to pull the same arm again and again and observe the rewards we get, even if the immediate rewards are low.
6 | The agent needs to *explore* as well as *exploit* its current knowledge of which arm yields a high immediate reward. This aim of maximizing the expected total reward leads to the exploration-exploitation dilemma.
7 |
8 | **Problem Statement** : There are 10 one-arm bandits. Reward associated with each arm is a Gaussian distribution with some fixed mean and variance. The mean of the gaussian represents the expected reward and is sampled randomly for each arm from some uniform distribution.
9 |
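All three scripts in this folder share the same incremental action-value update; below is a minimal sketch of the epsilon-greedy variant (arm count, step size and seed here are arbitrary illustration values):

```python
import numpy as np

rng = np.random.default_rng(0)
num_arms, epsilon, alpha = 10, 0.1, 0.5
true_means = rng.uniform(-10, 10, size=num_arms)      # hidden Gaussian means of the arms
q_values = np.zeros(num_arms)                         # our running estimates

for step in range(5000):
    # explore with probability epsilon, otherwise exploit the current estimates
    arm = rng.integers(num_arms) if rng.random() < epsilon else int(np.argmax(q_values))
    reward = rng.normal(true_means[arm], 1.0)
    q_values[arm] += alpha * (reward - q_values[arm])  # Q <- Q + alpha * (R - Q)

print('estimated best arm:', int(np.argmax(q_values)), '| true best arm:', int(np.argmax(true_means)))
```
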
10 | ### Algorithms implemented
11 | - [x] Epsilon Greedy
12 | - [x] Softmax exploration
13 | - [x] Optimistic initialisation
14 | - [ ] UCB
15 | - [ ] Median Elimination
16 | - [ ] Thompson Sampling
17 |
18 | ### Resources
19 | 1. Chapters 1 and 2 from [Sutton and Barto](http://incompleteideas.net/book/RLbook2020.pdf)
--------------------------------------------------------------------------------
/2. DP methods/One cycle PI.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random as rd
3 | import gym
4 | import time
5 |
6 | env = gym.make('FrozenLake8x8-v0')
7 | #this is a stochastic environment, action taken may be different from what you chose
8 |
9 | env = env.unwrapped
10 |
11 | state_space = env.nS
12 | action_space = env.nA
13 |
14 | policy = [int(rd.uniform(0,env.nA)) for i in range(state_space)]
15 | values = np.zeros(env.nS)
16 | discount = 0.9
17 | delta = 0
18 | theta = 1e-9
19 | i = 0
20 |
21 | while(True):
22 |
23 | prev_values = values.copy()
24 |
25 | for state in range(state_space):
26 |
27 | temp_value = 0
28 |         nextstates = len(env.P[state][policy[state]])
29 |         # the environment is slippery, so a single action can lead to several next states;
30 |         # each entry of env.P[state][action] is a (probability, next_state, reward, done) tuple
31 |
32 |         for next in range(nextstates):
33 |
34 |             # expectation over all the possible next states we could end up in
35 |
36 |             probability, next_state, reward, _ = env.P[state][policy[state]][next]
37 |             temp_value += probability*(reward + discount*values[next_state])
38 |         values[state] = temp_value # Bellman expectation backup for the current policy
39 |
40 | for state in range(state_space):
41 |
42 |         temp_values = []
43 |
44 |         for action in range(action_space):
45 |
46 |             temp_value = 0 # reset for every action
47 |
48 |             for next in range(len(env.P[state][action])): # expectation over all possible next states again
49 |
50 |                 probability, next_state, reward, _ = env.P[state][action][next]
51 |                 temp_value += probability*(reward + discount*values[next_state])
52 |
53 |             temp_values.append(temp_value)
54 |
55 | values[state] = np.max(temp_values)
56 | policy[state] = np.argmax(temp_values)
57 |
58 | i += 1
59 |     delta = np.max(np.abs(prev_values - values))
60 |
61 | if(delta < theta):
62 | break
63 |
64 | print('trained agent in', i, 'iteration(s)', '\n')
65 | print('value function\n', values, '\n')
66 | print('policy\n', policy, '\n')
67 |
68 | nextstate = 0
69 |
70 | while(1):
71 | time.sleep(0.5)
72 | action = policy[nextstate]
73 | nextstate, reward, done, info = env.step(int(action))
74 | env.render()
75 | if(done):
76 | break
77 |
--------------------------------------------------------------------------------
/2. DP methods/README.md:
--------------------------------------------------------------------------------
1 | # Dynamic Programming Methods
2 |
3 | ### Summary
4 | In settings where we know exactly how the environment works (i.e. its transition dynamics), we can use simpler planning algorithms such as Policy Iteration (PI) and Value Iteration (VI). Essentially, these methods compute the true expected return of each state and then make decisions using the learnt state/action values.
5 |
6 | **Problem Statement** : [Frozen Lake environment](https://github.com/openai/gym/blob/master/gym/envs/toy_text/frozen_lake.py). The agent controls the movement of a character in a grid world. Some tiles of the grid are walkable, and others lead to the agent falling into the water. The agent is rewarded for finding a walkable path to a goal tile.
7 |
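For reference, a compact value-iteration sketch over the `env.P[state][action]` transition table that the Gym toy-text environments expose (the scripts in this folder unroll the same Bellman backup explicitly):

```python
import numpy as np

def value_iteration(P, n_states, n_actions, discount=0.9, theta=1e-9):
    """P[s][a] is a list of (probability, next_state, reward, done) tuples, as in FrozenLake."""
    values = np.zeros(n_states)
    while True:
        delta = 0.0
        for s in range(n_states):
            # Bellman optimality backup: best expected return over all actions
            q_sa = [sum(p * (r + discount * values[ns]) for p, ns, r, _ in P[s][a])
                    for a in range(n_actions)]
            delta = max(delta, abs(max(q_sa) - values[s]))
            values[s] = max(q_sa)
        if delta < theta:
            break
    # greedy policy extracted from the converged value function
    policy = [int(np.argmax([sum(p * (r + discount * values[ns]) for p, ns, r, _ in P[s][a])
                             for a in range(n_actions)])) for s in range(n_states)]
    return values, policy
```
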
8 | ### Algorithms implemented :
9 | - [x] Policy Iteration
10 | - [x] Value Iteration
11 |
12 | ### Resources
13 | 1. Chapters 3 and 4 from [Sutton and Barto](http://incompleteideas.net/book/RLbook2020.pdf)
14 | 2. David Silver's course - [lectures 1, 2 and 3](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ)
15 | 3. Stanford CS234 - [lecture 1 and 2](https://www.youtube.com/playlist?list=PLoROMvodv4rOSOPzutgyCTapiGlY2Nd8u)
--------------------------------------------------------------------------------
/2. DP methods/algorithms.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def PolicyEvaluation(env, policy, values, discount, state_space, action_space):
4 |
5 | for state in range(state_space):
6 | probability, nextstate, reward, _ = env.P[state][policy[state]][0]
7 | values[state] = reward + discount * probability * values[nextstate]
8 |
9 | return values
10 |
11 | def PolicyImprovement(env, policy, values, discount, state_space, action_space):
12 |
13 | for state in range(state_space):
14 |
15 | temp = []
16 | for action in range(action_space):
17 |
18 | probability, nextstate, reward, _ = env.P[state][action][0]
19 | temp.append(reward + discount * probability * values[nextstate])
20 | policy[state] = np.argmax(temp)
21 |
22 | return policy
23 |
24 | def ValueIteration(env, values, discount, state_space, action_space):
25 |
26 | for state in range(state_space):
27 |
28 | temp_values = []
29 | for action in range(action_space):
30 |
31 | probability, nextstate, reward, _ = env.P[state][action][0]
32 | temp_values.append(reward + discount * probability * values[nextstate])
33 | values[state] = np.max(temp_values)
34 |
35 | return values
36 |
--------------------------------------------------------------------------------
/2. DP methods/deterministic VI.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | import random as rd
4 | import time
5 | from algorithms import PolicyImprovement, PolicyEvaluation, ValueIteration
6 |
7 | from gym.envs.registration import register
8 | register(
9 | id='FrozenLakeNotSlippery-v0',
10 | entry_point='gym.envs.toy_text:FrozenLakeEnv',
11 | kwargs={'map_name' : '4x4', 'is_slippery': False},
12 | max_episode_steps=200,
13 | reward_threshold=0.78, # optimum = .8196
14 | )
15 |
16 | direction = {
17 | 0: "LEFT",
18 | 1: "DOWN",
19 | 2: "RIGHT",
20 | 3: "UP"
21 | }
22 |
23 | env = gym.make('FrozenLakeNotSlippery-v0')
24 | env = env.unwrapped
25 | env.render()
26 |
27 | state_space = env.nS
28 | action_space = env.nA
29 |
30 | values = np.zeros(state_space)
31 | policy = np.zeros(state_space)
32 | theta = 1e-9
33 | discount = 0.9
34 | i = 0
35 | iterations = 10
36 |
37 | def PolicyIteration(env, policy, values, discount, state_space, action_space):
38 |
39 | values = PolicyEvaluation(env, policy, values, discount, state_space, action_space)
40 | policy = PolicyImprovement(env, policy, values, discount, state_space, action_space)
41 |
42 | return values, policy
43 |
44 | while(1):
45 |
46 | # depending on which algorithm you want to run comment one of the two lines below
47 | values, policy = PolicyIteration(env, policy, values, discount, state_space, action_space)
48 | values = ValueIteration(env, values, discount, state_space, action_space)
49 |
50 | i += 1
51 | if(i>iterations):
52 | break
53 |
54 | # comment this if you ran policy iteration.
55 | # This finds the policy because in value iteration you don't change the policy,
56 | # you can find it using the value function returned from the algorithm
57 | for state in range(state_space):
58 |
59 | temp = []
60 | for action in range(action_space):
61 |
62 | probability, nextstate, reward, _ = env.P[state][action][0]
63 | temp.append(reward + discount * probability * values[nextstate])
64 | policy[state] = np.argmax(temp)
65 |
66 |
67 | print('value function\n', values,'\n')
68 | print('policy\n',policy, '\n')
69 | print('agent trained for', i, 'episodes')
70 | nextstate = 0
71 |
72 |
73 | while(1):
74 | time.sleep(0.5)
75 | action = policy[nextstate]
76 | nextstate, reward, done, info = env.step(int(action))
77 | env.render()
78 | if(done):
79 | break
80 |
81 | # even if you don't uncomment anything, the code will still work
--------------------------------------------------------------------------------
/3. Model free methods/Monte Carlo/MC evaluation.py:
--------------------------------------------------------------------------------
1 | # simplified blackjack state space, On policy MC.py has the full solution
2 |
3 | import gym
4 | import numpy as np
5 | import random as rd
6 | from utils import *
7 |
8 | env = gym.make('Blackjack-v0')
9 | env = env.unwrapped
10 |
11 | state_space = env.observation_space
12 | action_space = env.action_space
13 |
14 | policy = {}
15 | state_occur = {}
16 | values = {}
17 | for i in range(21): # because 21 is the number of states we can see - [1, 21]
18 |
19 | # the probability for lower value hands is 0, the changes have not been made accordingly to make the code efficient
20 |
21 | values.update({i+1 : 0}) # initialise value fn
22 | state_occur.update({i+1 : 0}) # initialise occurence of state
23 | policy.update({i+1 : int(rd.uniform(0,2))}) # initialise random policy
24 |
25 | for j in range(500000):
26 |
27 | episodes = generateEpisode(env) # generates episode using policy same as dealer's
28 | observed_states = []
29 |
30 | for i in range(len(episodes)):
31 | if episodes[i][0] < 22:
32 | observed_states.append(episodes[i][0])
33 | state_occur[episodes[i][0]] += 1
34 | rewards = episodes[i][2]
35 |
36 | for state in observed_states:
37 | values[state] = values[state] + (rewards - values[state]) / state_occur[state]
38 |
39 |     if j%50000 == 0: # progress logging
40 | print('iteration number', j)
--------------------------------------------------------------------------------
/3. Model free methods/Monte Carlo/On policy MC.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import random as rd
4 | from utils import *
5 |
6 | env = gym.make('Blackjack-v0')
7 | env = env.unwrapped
8 |
9 | action_space = [0,1]
10 |
11 | policy = {}
12 | state_occur = {}
13 | values = {}
14 | q_values = {}
15 |
16 | for i in range(4,22): # our card sum can be from [4, 21]
17 |
18 | for j in range(1,11): # dealer's card value can be [1-10] where 1 is ace
19 |
20 | for ace in [True, False]: # whether or not we have a usable ace
21 |
22 | values.update({(i, j, ace) : 0 }) # initialise value fn
23 | state_occur.update({(i, j, ace) : 0}) # initialise occurence of state
24 | policy.update({(i, j, ace) : int(rd.uniform(0,2))}) # initialise random policy
25 |
26 | for action in action_space:
27 | q_values.update({(i, j, ace, action) : 0}) # initialise Q value lookup table
28 |
29 |
30 |
31 | ## This is the code for monte carlo control
32 |
33 | temp_qvalues = np.zeros(2)
34 | epsilon = 0.7
35 |
36 | for i in range(100000):
37 |
38 | # for _ in range(50): # this loop can be thrown away if we don't want to converge to the true value of that policy
39 |
40 | episodes = Policyimproving(env, policy) # episodes = (our hand, dealer hand, action, reward, usable ace)
41 | observed_states = []
42 |
43 | for j in range(len(episodes)):
44 |
45 | if episodes[j][0] < 22:
46 | observed_states.append((episodes[j][0], episodes[j][1], episodes[j][4], episodes[j][2]))
47 | # observed_states contains : tuple (our hand, dealer hand, usable ace, action)
48 | state_occur[(episodes[j][0], episodes[j][1], episodes[j][4])] += 1 # increasing count of current state
49 |
50 | rewards = episodes[j][3]
51 |
52 | for state_action in observed_states:
53 |
54 | state = (state_action[0], state_action[1], state_action[2])
55 | q_values[state_action] = q_values[state_action] + (rewards - q_values[state_action]) / state_occur[state]
56 |
57 |
58 | # choosing new policy
59 | for state in range(4,22):
60 |
61 | for dealer in range(1,11):
62 |
63 | for ace in [True, False]:
64 |
65 | for action in range(2):
66 |                     temp_qvalues[action] = q_values[(state, dealer, ace, action)]
67 |
68 | chance = rd.uniform(0,1)
69 | if(chance > epsilon):
70 |                     policy[(state, dealer, ace)] = np.argmax(temp_qvalues)
71 | else:
72 | policy[(state, dealer, ace)] = int(rd.uniform(0,2))
73 |
74 |
75 | if(i%10000==0):
76 | epsilon /= 2
77 | print('iteration',i)
78 |
79 |
80 |
81 | print('q_values\n', q_values)
82 |
--------------------------------------------------------------------------------
/3. Model free methods/Monte Carlo/README.md:
--------------------------------------------------------------------------------
1 | # Monte Carlo solutions to Model-free RL
2 |
3 | ### Summary
4 | Let's say we sampled a trajectory from our MDP. To judge how good the trajectory/policy was, we would look at the total return of the trajectory. Similarly, to judge how good the encountered state-action pairs were, we could use the total return *starting from that state-action pair*. This is the basic idea behind Monte Carlo control: we sample trajectories, estimate Q-values with the total return obtained after each state-action pair, and improve the policy using these estimates.
5 |
6 | **Problem Statement** : [Blackjack environment](https://github.com/openai/gym/blob/master/gym/envs/toy_text/blackjack.py)
7 |
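A minimal every-visit Monte Carlo evaluation sketch, assuming a Gym-style environment with hashable states (Blackjack returns tuples) and a `policy` callable mapping state to action; the scripts in this folder specialise this idea to Blackjack:

```python
def mc_evaluate(env, policy, num_episodes=10000, discount=1.0):
    values, counts = {}, {}
    for _ in range(num_episodes):
        state, done, trajectory = env.reset(), False, []
        while not done:
            action = policy(state)
            state_next, reward, done, _ = env.step(action)
            trajectory.append((state, reward))
            state = state_next
        g = 0.0
        for state, reward in reversed(trajectory):   # accumulate the return backwards
            g = reward + discount * g
            counts[state] = counts.get(state, 0) + 1
            values[state] = values.get(state, 0.0) + (g - values.get(state, 0.0)) / counts[state]
    return values
```
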
8 | ### Algorithms implemented:
9 | - [x] On-policy MC evaluation and control (Blackjack environment)
10 | - [ ] Off-policy MC evaluation and control (Gridworld environment)
11 |
12 | ### Resources:
13 | 1. Chapter 5 from [Sutton and Barto](http://incompleteideas.net/book/RLbook2020.pdf)
14 | 2. David Silver's course - [Lectures 4 and 5](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ)
15 | 3. Stanford CS234 - [Lectures 3 and 4](https://www.youtube.com/playlist?list=PLoROMvodv4rOSOPzutgyCTapiGlY2Nd8u)
--------------------------------------------------------------------------------
/3. Model free methods/Monte Carlo/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random as rd
3 |
4 | def generateEpisode(env):
5 |
6 | state = env.reset() # env.reset() returns 3 things - current hand (total sum), dealer show card and usable ace (bool)
7 | episodes = []
8 | usable_ace = state[2]
9 | dealer_hand = state[1]
10 | curr_hand = state[0]
11 | while(1):
12 |
13 | action = 0 if curr_hand > 16 else 1 # policy which we evaluate, same as dealer's policy : 0 is stay 1 is hit
14 | nextstate, reward, done, _ = env.step(action) # nextstate contains {player hand, dealer hand, usable ace}
15 |
16 | sample = (curr_hand, action, reward, usable_ace)
17 | episodes.append(sample)
18 | if done:
19 | break
20 | curr_hand = nextstate[0] # nextstate[0] is total sum of player hand
21 | usable_ace = nextstate[2]
22 |
23 | return episodes
24 |
25 |
26 | def Policyimproving(env, policy):
27 |
28 | episodes = []
29 | state = env.reset()
30 | curr_hand = state[0]
31 | dealer_hand = state[1]
32 | usable_ace = state[2]
33 |
34 | while(1):
35 |
36 | action = policy[(curr_hand, dealer_hand, usable_ace)]
37 | nextstate, reward, done, _ = env.step(action)
38 |
39 | sample = (curr_hand, dealer_hand, action, reward, usable_ace)
40 | episodes.append(sample)
41 | if done:
42 | break
43 | curr_hand = nextstate[0]
44 | dealer_hand = nextstate[1]
45 | usable_ace = nextstate[2]
46 |
47 | return episodes
48 |
49 |
50 | def Softmax(x):
51 |
52 | return np.exp(x) / np.sum(np.exp(x))
--------------------------------------------------------------------------------
/3. Model free methods/README.md:
--------------------------------------------------------------------------------
1 | # Model-free RL
2 |
3 | ### Summary
4 | One drawback of dynamic programming methods is that they need the transition dynamics of the environment to find the optimal policy. Model-free methods are a different class of algorithms that can solve for the optimal policy without knowledge of the MDP's dynamics. The idea is to sample trajectories from the MDP and estimate Q-values for all the encountered state-action pairs.
5 |
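Both the Monte Carlo and TD implementations below store Q-values in a dictionary keyed by `(state, action)` and act epsilon-greedily with respect to it; a minimal sketch of that action-selection step:

```python
import numpy as np

def epsilon_greedy(q_values, state, actions, epsilon=0.1):
    """Explore with probability epsilon, otherwise pick the greedy action from the Q table."""
    if np.random.uniform() < epsilon:
        return int(np.random.choice(actions))
    return int(actions[np.argmax([q_values[(state, a)] for a in actions])])
```
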
6 | ### Algorithms implemented :
7 | - [x] On-policy Monte Carlo control
8 | - [ ] Off-policy Monte Carlo control using importance sampling
9 | - [x] Q-Learning
10 | - [x] SARSA
11 |
12 | ### Resources
13 | 1. Chapter 4, 5 and 6 from [Sutton and Barto](http://incompleteideas.net/book/RLbook2020.pdf)
14 | 2. David Silver's course - [Lectures 4 and 5](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ)
15 | 3. Stanford CS234 - [Lectures 3 and 4](https://www.youtube.com/playlist?list=PLoROMvodv4rOSOPzutgyCTapiGlY2Nd8u)
16 |
--------------------------------------------------------------------------------
/3. Model free methods/TD Learning/QLearning.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | import time
4 |
5 | env = gym.make('Taxi-v2')
6 | env = env.unwrapped
7 |
8 | print(env.observation_space) # Discrete(500)
9 | print(env.action_space) # Discrete(6)
10 | action_space = [0,1,2,3,4,5]
11 |
12 | q_values = {}
13 | policy = {}
14 |
15 | for i in range(env.observation_space.n):
16 | policy.update({i : np.random.choice(action_space)}) # initialise policy
17 | for j in range(env.action_space.n):
18 | q_values.update({(i,j) : 0}) # initialise q_value table
19 |
20 | epsilon = 0.9
21 | alpha = 0.1
22 | total_reward = 0
23 | state = env.reset()
24 |
25 | for iterations in range(100000):
26 |
27 | nextstate, reward, done, _ = env.step(policy[state])
28 | total_reward += reward
29 |
30 | temp_q = []
31 | for action in range(env.action_space.n):
32 | temp_q.append(q_values[(nextstate, action)])
33 | target = reward + max(temp_q)
34 |
35 | # action value update
36 | q_values[(state, policy[state])] += alpha*(target - q_values[(state, policy[state])])
37 | state = nextstate
38 |
39 | # random behaviour policy (epsilon greedy)
40 | for i in range(env.observation_space.n):
41 |
42 | temp = []
43 | for j in range(env.action_space.n):
44 | temp.append(q_values[(i, j)])
45 | temp = np.asarray(temp)
46 |
47 | chance = np.random.uniform(0,1)
48 | if chance < epsilon:
49 | policy[i] = env.action_space.sample()
50 | else:
51 | policy[i] = np.argmax(temp)
52 |
53 | if done:
54 | state = env.reset()
55 |
56 | if (iterations+1)%10000 == 0:
57 | print(f'iterations completed : {iterations+1}, total reward = {total_reward}')
58 | epsilon /= 1.5
59 |
60 |
61 | # forming the policy
62 | for i in range(env.observation_space.n):
63 | temp = []
64 | for j in range(env.action_space.n):
65 | temp.append(q_values[(i,j)])
66 |     policy[i] = np.argmax(temp)
67 |
68 | # try out the policy
69 | state = env.reset()
70 | done = False
71 | while not done:
72 | env.render()
73 | time.sleep(0.5)
74 | nextstate, reward, done, _ = env.step(policy[state])
75 | state = nextstate
76 |
--------------------------------------------------------------------------------
/3. Model free methods/TD Learning/README.md:
--------------------------------------------------------------------------------
1 | # Temporal-Difference Learning
2 |
3 | ### Summary
4 | Temporal difference learning is a family of algorithms in which the agent estimates the Q-values by bootstrapping from the current estimate of Q-values. Advantages of bootstrapping are reduced variance and faster propagation of reward signals.
5 |
6 |
7 | **Problem Statement** : [Taxi environment](https://github.com/openai/gym/blob/master/gym/envs/toy_text/taxi.py). There are 4 locations (labeled by different letters) and the agent's job is to pick up the passenger at one location and drop him off in another. The agent receives +20 points for a successful dropoff, and loses 1 point for every timestep it takes. There is also a 10 point penalty for illegal pick-up and drop-off actions.
8 |
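The core backup used by the scripts in this folder, written as a single helper over the same `(state, action)`-keyed Q table (the discount `gamma` shown here is an assumption; the scripts effectively use 1):

```python
def q_learning_update(q_values, state, action, reward, next_state, actions, alpha=0.1, gamma=0.99):
    """One-step Q-Learning (SARSAMAX) backup; SARSA instead bootstraps from the action actually taken next."""
    td_target = reward + gamma * max(q_values[(next_state, a)] for a in actions)
    q_values[(state, action)] += alpha * (td_target - q_values[(state, action)])
```
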
9 | ### Algorithms implemented:
10 | - [x] SARSA
11 | - [x] Q-Learning/SARSAMAX
12 | - [ ] Double Q-Learning
13 | - [ ] Expected SARSA
14 |
15 | ### Results
16 | 
17 |
18 | 
19 |
20 | ### Resources:
21 | 1. Chapter 6 from [Sutton and Barto](http://incompleteideas.net/book/RLbook2020.pdf)
22 | 2. David Silver's course - [Lectures 4 and 5](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ)
23 | 3. Stanford CS234 - [Lectures 3 and 4](https://www.youtube.com/playlist?list=PLoROMvodv4rOSOPzutgyCTapiGlY2Nd8u)
24 |
--------------------------------------------------------------------------------
/3. Model free methods/TD Learning/SARSA.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | import matplotlib.pyplot as plt
4 | from scipy.ndimage import gaussian_filter
5 | import time
6 |
7 | env = gym.make('Taxi-v2')
8 | env = env.unwrapped
9 | # reward : +20 for successful dropoff and -1 for each timestep. -10 for illegal pickup and dropoff actions
10 | # actions : {0,1,2,3,4,5} = {south, north, east, west, pickup, dropoff}
11 |
12 |
13 | # Observation space = Discrete(500)
14 | # Action space = Discrete(6)
15 | action_space = [0,1,2,3,4,5]
16 |
17 | q_values = {}
18 | policy = {}
19 |
20 | for i in range(env.observation_space.n):
21 | policy.update({i : np.random.choice(action_space)}) # initialise policy
22 | for j in range(env.action_space.n):
23 | q_values.update({(i,j) : 0}) # initialise q_value table
24 |
25 | epsilon = 0.9
26 | alpha = 0.1
27 |
28 | rewards = []
29 | timestep = 0
30 | total_reward = 0
31 | state = env.reset() # returns the state of the environment
32 |
33 | # since TD is online learning algo, no need to generate episode like in MC control
34 | for iteration in range(100000):
35 |
36 | # policy evaluation
37 | nextstate, reward, done, _ = env.step(policy[state])
38 | timestep += 1
39 | total_reward += reward
40 | q_values[(state, policy[state])] += alpha * (reward + q_values[(nextstate, policy[nextstate])] - q_values[(state, policy[state])])
41 | state = nextstate
42 |
43 | # (epsilon greedy) policy improvement
44 | for i in range(env.observation_space.n):
45 | temp_policy = []
46 | for j in action_space:
47 | temp_policy.append(q_values[(i,j)])
48 |
49 | chance = np.random.uniform(0,1)
50 | if chance < epsilon:
51 | policy[i] = np.random.choice(action_space)
52 | else:
53 | policy[i] = np.argmax(temp_policy)
54 |
55 | if (iteration+1)%10000 == 0:
56 | print(f'iteration number {iteration+1} : avg reward per timestep = {rewards[-1:]}')
57 | epsilon /= 1.5
58 |
59 | if done:
60 | state = env.reset()
61 | rewards.append(total_reward / timestep)
62 | total_reward = 0
63 | timestep = 0
64 |
65 |
66 | # plot average rewards
67 | plt.title('Rewards per timestep')
68 | plt.xlabel('Episodes')
69 | plt.ylabel('Rewards')
70 | plt.plot(rewards)
71 | plt.plot(gaussian_filter(rewards, sigma = 10))
72 | plt.show()
73 |
74 |
75 | # try out the policy
76 | state = env.reset()
77 | while(1):
78 | env.render()
79 | time.sleep(0.5)
80 | action = policy[state]
81 | state, reward, done, _ = env.step(action)
82 | if done :
83 | break
--------------------------------------------------------------------------------
/3. Model free methods/TD Learning/sarsa taxi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayeshk7/RL-Algorithms/8ba63014fb4607253b047a192c3b08bddb9a9f4c/3. Model free methods/TD Learning/sarsa taxi.png
--------------------------------------------------------------------------------
/Deterministic PG/DDPG/Inverted pendulum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayeshk7/RL-Algorithms/8ba63014fb4607253b047a192c3b08bddb9a9f4c/Deterministic PG/DDPG/Inverted pendulum.png
--------------------------------------------------------------------------------
/Deterministic PG/DDPG/README.md:
--------------------------------------------------------------------------------
1 | # Deep Deterministic Policy Gradients
2 |
3 | ### Summary
4 | DDPG builds on the idea of [Deterministic policy gradients](http://proceedings.mlr.press/v32/silver14.pdf) and uses deep function approximators to solve continuous control problems. This implementation tries to solve the inverted pendulum task.
5 |
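For reference, the critic target in the original DDPG paper discounts the bootstrapped value and keeps the target networks out of the gradient graph; a minimal sketch reusing the `actor_forward`/`critic_forward` interface from `ddpg.py` (`gamma = 0.99` is an assumed value):

```python
import torch

def ddpg_critic_target(reward, next_state, target_actor, target_critic, gamma=0.99):
    with torch.no_grad():                                   # no gradients through the target networks
        next_action = target_actor.actor_forward(next_state)
        return reward + gamma * target_critic.critic_forward(next_state, next_action)
```
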
6 | ### Results and plots
7 |
8 | **Rewards v/s Episodes**
9 | 
10 |
11 | **Observations** : The policy starts performing badly after a point and it keeps getting worse. This was observed in every run of this algorithm and I am not exactly sure why this happens.
--------------------------------------------------------------------------------
/Deterministic PG/DDPG/agent.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | import gym
5 | from ddpg import *
6 | from collections import deque
7 | import random
8 | import matplotlib.pyplot as plt
9 | from scipy.ndimage import gaussian_filter
10 |
11 | env = gym.make('Pendulum-v0') # INVERTED PENDULUM TASK (SEE README)
12 | env = env.unwrapped
13 | action_space = env.action_space.shape[0]
14 | state_space = env.observation_space.shape[0]
15 |
16 | behaviour_critic = ddpg_critic(state_space, action_space).cuda()
17 | target_critic = ddpg_critic(state_space, action_space).cuda()
18 | target_critic.eval()
19 | target_critic.load_state_dict(behaviour_critic.state_dict())
20 |
21 | behaviour_actor = ddpg_actor(state_space, action_space).cuda()
22 | target_actor = ddpg_actor(state_space, action_space).cuda()
23 | target_actor.eval()
24 | target_actor.load_state_dict(behaviour_actor.state_dict())
25 |
26 |
27 | episodes = 4000
28 | episode_length = 200 # THIS IS REQUIRED BECAUSE THE EPISODE NEVER ENDS FOR INVERTED PENDULUM (DONE == FALSE ALWAYS)
29 | BATCH_SIZE = 64
30 | MEMORY = 1000000 # REPLAY MEMORY CAPACITY
31 | TAU, GAMMA = 0.001, 0.99 # POLYAK AVERAGING COEFFICIENT AND DISCOUNT FACTOR
32 | replay_buffer = deque([]) # INITIALISED REPLAY BUFFER
33 | episode_reward = []
34 |
35 | alpha_critic = 0.0003
36 | alpha_actor = 0.0003
37 | lossfn = nn.MSELoss()
38 | critic_optimizer = torch.optim.Adam(behaviour_critic.parameters(), lr = alpha_critic)
39 | actor_optimizer = torch.optim.Adam(behaviour_actor.parameters(), lr = alpha_actor)
40 |
41 |
42 | for episode in range(episodes):
43 |
44 | state = env.reset()
45 | done = False
46 | total_reward = 0
47 | for _ in range(episode_length):
48 | # SELECT ACTION
49 | action = behaviour_actor.actor_forward(state).detach().cpu()
50 | action = action + torch.randn(1) # ADDING NOISE FOR EXPLORATION
51 | nextstate, reward, _, _ = env.step([action.item()])
52 | total_reward += reward
53 |
54 | # STORE THE TRANSITION
55 | experience = (state, action.item(), reward, nextstate)
56 | if len(replay_buffer) < MEMORY :
57 | replay_buffer.append(experience)
58 | else:
59 | replay_buffer.popleft()
60 | replay_buffer.append(experience)
61 |
62 | # TRAINING NETWORK
63 | if len(replay_buffer) >= BATCH_SIZE :
64 | batch_buffer = random.sample(replay_buffer, BATCH_SIZE)
65 | s, a, r, ns = map(np.stack, zip(*batch_buffer)) # LEARNT SOMETHING NEW HERE
66 | s = torch.FloatTensor(s)
67 | a = torch.FloatTensor(a).view(-1,1).cuda()
68 | r = torch.FloatTensor(r).view(-1,1).cuda()
69 | ns = torch.FloatTensor(ns)
70 |
71 | # UPDATE THE CRITIC
72 | ns_actions = target_actor.actor_forward(ns)
73 |                 target_qvalues = (r + GAMMA*target_critic.critic_forward(ns, ns_actions)).detach() # DISCOUNTED TARGET, NO GRADIENT THROUGH TARGET NETS
74 | predicted_qvalues = behaviour_critic.critic_forward(s, a)
75 | loss_critic = lossfn(predicted_qvalues, target_qvalues)
76 |
77 | critic_optimizer.zero_grad()
78 | loss_critic.backward()
79 | critic_optimizer.step()
80 |
81 | # UPDATE THE ACTOR
82 | loss_actor = behaviour_critic.critic_forward(s, behaviour_actor.actor_forward(s))
83 | loss_actor = -loss_actor.mean()
84 | actor_optimizer.zero_grad()
85 | loss_actor.backward()
86 | actor_optimizer.step()
87 |
88 | # POLYAK AVERAGING
89 | for target_param, param in zip(target_critic.parameters(), behaviour_critic.parameters()):
90 | target_param.data.copy_(
91 | target_param.data * (1.0 - TAU) + param.data * TAU
92 | )
93 |
94 | for target_param, param in zip(target_actor.parameters(), behaviour_actor.parameters()):
95 | target_param.data.copy_(
96 | target_param.data * (1.0 - TAU) + param.data * TAU
97 | )
98 |
99 | state = nextstate
100 |
101 |
102 | episode_reward.append(total_reward)
103 | if (episode+1)%200 == 0:
104 | print(f'episode number {episode+1}; average reward of last 200 episodes = {np.mean(episode_reward[-200:])}')
105 | # TEST THE NETWORK
106 | x = env.reset()
107 | for i in range(200) :
108 | env.render()
109 |             u = target_actor.actor_forward(x).detach().cpu()
110 |             x_, _, done, _ = env.step([u.item()])
111 | if done :
112 | break
113 | x = x_
114 |
115 | plt.ylabel('Rewards')
116 | plt.xlabel('Episodes')
117 | plt.plot(episode_reward)
118 | plt.plot(gaussian_filter(episode_reward, 25))
119 | plt.show()
120 |
--------------------------------------------------------------------------------
/Deterministic PG/DDPG/ddpg.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as f
4 |
5 | class ddpg_actor(nn.Module): # TAKES STATE AS INPUT AND OUTPUTS ACTION
6 | def __init__(self, state_space, action_space):
7 | super(ddpg_actor, self).__init__()
8 |
9 | self.actor1 = nn.Linear(state_space, 400)
10 | self.actor2 = nn.Linear(400, 200)
11 | self.actor3 = nn.Linear(200, action_space)
12 |
13 | def actor_forward(self, state):
14 |
15 | state = torch.FloatTensor(state).cuda()
16 | hidden1 = f.relu(self.actor1(state))
17 | hidden2 = f.relu(self.actor2(hidden1))
18 | out = self.actor3(hidden2)
19 |
20 | return out
21 |
22 | class ddpg_critic(nn.Module): # TAKES STATE AND ACTION AS INPUT, OUTPUTS Q VALUE
23 | def __init__(self, state_space, action_space):
24 | super(ddpg_critic, self).__init__()
25 |
26 | self.critic1 = nn.Linear(state_space + action_space, 400)
27 | self.critic2 = nn.Linear(400, 200)
28 | self.critic3 = nn.Linear(200, 1)
29 |
30 | def critic_forward(self, state, action): # ACTION IS ALREADY A TENSOR, STATE IS NOT A TENSOR
31 |
32 | state = torch.FloatTensor(state).cuda()
33 | # action = action.cuda()
34 | state = torch.cat((state, action), 1).cuda()
35 | hidden1 = f.relu(self.critic1(state))
36 | hidden2 = f.relu(self.critic2(hidden1))
37 | out = self.critic3(hidden2)
38 |
39 | return out
--------------------------------------------------------------------------------
/Deterministic PG/README.md:
--------------------------------------------------------------------------------
1 | # Deterministic Policy Gradients
2 |
3 | ### Summary
4 | This family of deep RL algorithms builds on the idea presented in [Deterministic Policy Gradients](http://proceedings.mlr.press/v32/silver14.pdf). They model the policy as a deterministic function of the state features rather than a probability distribution over actions. Deep Deterministic Policy Gradients (DDPG) and Twin Delayed Deep Deterministic Policy Gradients (TD3) are the most widely used algorithms in this family.
5 |
6 | These algorithms can be applied to any continuous control task, although how well they do depends on the difficulty of the task. TD3 usually outperforms DDPG and makes progress even in environments where DDPG fails.
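
Both DDPG and TD3 ascend the deterministic policy gradient from the paper above, with $\mu_\theta$ the deterministic policy and $Q^{\mu}$ its action-value function:

$$
\nabla_\theta J(\mu_\theta) = \mathbb{E}_{s \sim \rho^{\mu}}\left[\nabla_\theta \mu_\theta(s)\, \nabla_a Q^{\mu}(s,a)\big|_{a=\mu_\theta(s)}\right]
$$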
--------------------------------------------------------------------------------
/Deterministic PG/TD3/agent.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as f
4 | import numpy as np
5 | import gym
6 | from td3 import *
7 |
8 | env = gym.make('Pendulum-v0')
9 | env = env.unwrapped
10 | state_space = env.observation_space.shape[0]
11 | action_space = env.action_space.shape[0]
12 |
13 | critic1 = Critic(state_space, action_space).cuda()
14 | target_critic1 = Critic(state_space, action_space)
15 | critic2 = Critic(state_space, action_space)
16 | target_critic2 = Critic(state_space, action_space)
17 |
18 | actor = Actor(state_space, action_space)
19 | target_actor = Actor(state_space, action_space)
20 |
21 |
--------------------------------------------------------------------------------
/Deterministic PG/TD3/td3.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as f
4 |
5 | class Actor(nn.Module):
6 | def __init__(self, state_space, action_space):
7 |         super(Actor, self).__init__()
8 |
9 | self.linear1 = nn.Linear(state_space, 400)
10 | self.linear2 = nn.Linear(400, 200)
11 |         self.linear3 = nn.Linear(200, action_space)
12 |
13 | def forward(self, state) :
14 |
15 | state = torch.FloatTensor(state).cuda()
16 | hidden1 = f.relu(self.linear1(state))
17 | hidden2 = f.relu(self.linear2(hidden1))
18 | action = self.linear3(hidden2)
19 |
20 | return action
21 |
22 | class Critic(nn.Module):
23 | def __init__(self, state_space, action_space):
24 |         super(Critic, self).__init__()
25 |
26 | self.linear1 = nn.Linear(state_space+action_space, 400)
27 | self.linear2 = nn.Linear(400, 200)
28 | self.linear3 = nn.Linear(200, 1)
29 |
30 | def forward(self, state, action):
31 |
32 | state = torch.FloatTensor(state)
33 | action = torch.FloatTensor(action)
34 |         x = torch.cat([state, action], dim=-1).cuda()
35 | hidden1 = f.relu(self.linear1(x))
36 | hidden2 = f.relu(self.linear2(hidden1))
37 | q_value = self.linear3(hidden2)
38 |
39 | return q_value
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning
2 |
3 | All the algorithms I implemented (using Python 3 and NumPy) while reading *Reinforcement Learning: An Introduction* by Sutton and Barto.
4 | *There's a separate ReadMe for each topic*
5 |
6 |
7 | High Level structure of the repo :
8 | 1. [Bandits](https://github.com/jayeshk7/RL-Algorithms/tree/master/1.%20Bandits)
9 | - Epsilon-greedy
10 | - Optimistic initial value
11 | - Softmax exploration
12 | 2. [Dynamic Programming methods](https://github.com/jayeshk7/RL-Algorithms/tree/master/2.%20DP%20methods)
13 | - Policy iteration
14 | - Value iteration
15 | 3. [Model free methods](https://github.com/jayeshk7/RL-Algorithms/tree/master/3.%20Model%20free%20methods)
16 | - [Monte Carlo control](https://github.com/jayeshk7/RL-Algorithms/tree/master/3.%20Model%20free%20methods/Monte%20Carlo)
17 | - On-Policy Monte Carlo
18 | - Off-policy Monte Carlo using Importance Sampling (incomplete)
19 | - [Temporal-difference methods](https://github.com/jayeshk7/RL-Algorithms/tree/master/3.%20Model%20free%20methods/TD%20Learning)
20 | - Q-Learning
21 | - SARSA
22 |
23 | ### Resources
24 | CS234 and David Silver often use different notation, so it is better to follow just one of them in the beginning (I prefer David Silver's lectures).
25 | 1. [Introduction to Reinforcement Learning by Sutton and Barto](http://incompleteideas.net/book/RLbook2020.pdf)
26 | 2. [David Silver's lectures](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ)
27 | 3. [Stanford CS234](https://www.youtube.com/playlist?list=PLoROMvodv4rOSOPzutgyCTapiGlY2Nd8u)
28 |
29 |
30 | [Check this out](https://github.com/IvLabs/resources) for more resources!
31 |
--------------------------------------------------------------------------------
/Vanilla Policy gradient methods/Actor critic/agent.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import gym
4 | from model import *
5 |
6 | env = gym.make('CartPole-v0')
7 | state_space = env.observation_space.shape[0]
8 | action_space = env.action_space.n
9 |
10 | policy = Actor(state_space, action_space)
11 |
--------------------------------------------------------------------------------
/Vanilla Policy gradient methods/Actor critic/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as f
4 |
5 | class Actor(nn.Module):
6 | def __init__(self, state_space, action_space):
7 |         super(Actor, self).__init__()
8 |
9 | self.actor1 = nn.Linear(state_space, 256)
10 | self.actor2 = nn.Linear(256, 128)
11 | self.actor3 = nn.Linear(128, action_space)
12 |
13 | def forward(self, state):
14 |
15 | state = torch.FloatTensor(state).cuda()
16 | hid1 = f.relu(self.actor1(state))
17 | hid2 = f.relu(self.actor2(hid1))
18 |         action_probs = f.softmax(self.actor3(hid2), dim=-1)
19 |
20 | return action_probs
21 |
22 | class Critic(nn.Module):
23 | def __init__(self, state_space, action_space):
24 |         super(Critic, self).__init__()
25 |
26 |         self.critic1 = nn.Linear(state_space, 256) # state-value critic : input is the state only
27 | self.critic2 = nn.Linear(256, 128)
28 | self.critic3 = nn.Linear(128, 1)
29 |
30 | def forward(self, state):
31 |
32 |         state = torch.FloatTensor(state).cuda()
33 | hid1 = f.relu(self.critic1(state))
34 | hid2 = f.relu(self.critic2(hid1))
35 | state_value = self.critic3(hid2)
36 |
37 | return state_value
--------------------------------------------------------------------------------
/Vanilla Policy gradient methods/README.md:
--------------------------------------------------------------------------------
1 | # Policy gradient methods
2 |
3 | ### Summary
4 | Policy gradient methods are a class of algorithms in which we directly optimize the policy. The policy is represented by a function approximator such as a neural network, which takes in the state and outputs a distribution over actions; the agent then samples its action from this distribution.
5 | Since the policy is stochastic, exploration is handled implicitly and we don't need a separate exploration scheme.
6 |
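A minimal sketch of the surrogate loss that `REINFORCE/REINFORCE.py` builds up term by term (the return normalisation shown here is an optional extra, not something the implementation in this repo does):

```python
import torch

def reinforce_loss(log_probs, returns):
    """log_probs: list of log pi(a_t|s_t) tensors for the actions taken; returns: list of returns-to-go G_t."""
    returns = torch.as_tensor(returns, dtype=torch.float32)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)   # optional baseline / variance reduction
    return -(torch.stack(log_probs) * returns).sum()                # minimising this ascends the policy gradient
```
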
7 | ### Results
8 |
9 | **REINFORCE on CartPole-v0**
10 | 
11 |
12 |
--------------------------------------------------------------------------------
/Vanilla Policy gradient methods/REINFORCE/Cartpole_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayeshk7/RL-Algorithms/8ba63014fb4607253b047a192c3b08bddb9a9f4c/Vanilla Policy gradient methods/REINFORCE/Cartpole_result.png
--------------------------------------------------------------------------------
/Vanilla Policy gradient methods/REINFORCE/README.md:
--------------------------------------------------------------------------------
1 | # REINFORCE
2 |
3 | PyTorch implementation of the Monte Carlo policy gradient algorithm (also known as REINFORCE).
4 |
5 | ### Results
6 |
7 | **Cartpole-v0**
8 | 
--------------------------------------------------------------------------------
/Vanilla Policy gradient methods/REINFORCE/REINFORCE.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from model import Policynetwork
4 | from utils import generate_episode, plot
5 | import numpy as np
6 | import gym
7 |
8 | env = gym.make('CartPole-v0')
9 |
10 | state_space = env.observation_space.shape[0]
11 | action_space = env.action_space.n
12 |
13 | alpha = 0.001
14 | policy_network = Policynetwork(state_space, action_space)
15 | optimizer = torch.optim.Adam(policy_network.parameters(), lr = alpha)
16 |
17 | ## TRAIN
18 |
19 | episodes = 1000
20 | rewards = []
21 |
22 | for episode in range(episodes):
23 |
24 | episode_experience, reward_list = generate_episode(env, policy_network) # RETURNS (S,A,R,S') TUPLES OF THE EPISODE AND LIST OF REWARDS OBTAINED AT EACH STEP
25 | total_reward = np.sum(reward_list) # TOTAL REWARD OF THE EPISODE
26 | rewards.append(total_reward) # STORING TOTAL REWARD OF EACH EPISODE
27 | loss = 0
28 |
29 | for i,sars in enumerate(episode_experience):
30 |
31 | state, action, _, nextstate = sars
32 |         target_reward = np.sum(reward_list[i:]) - np.mean(reward_list)
33 |         reward_weight = torch.tensor(target_reward) # Monte Carlo return-to-go from step i (causality), with a mean-reward baseline
34 | action_logprob = -torch.log(policy_network(state))[action]
35 | loss += action_logprob*reward_weight
36 |
37 | optimizer.zero_grad()
38 | loss.backward()
39 | optimizer.step()
40 | if (episode+1)%100 == 0:
41 | print(f'{episode+1}th episode; average reward of past 100 episodes :', np.mean(rewards[-100:])) # PRINTING AVG OF LAST 100 EPISODES
42 |
43 |
44 | plot(rewards)
45 |
46 | # TESTING THE NETWORK
47 |
48 | done = False
49 | state = env.reset()
50 | policy_network.eval()
51 | while not done:
52 | env.render()
53 | action_probs = policy_network(state)
54 | action = int(np.random.choice(np.arange(action_space), p=action_probs.detach().numpy()))
55 | nextstate, reward, done, _ = env.step(action)
56 | if done:
57 | break
58 | state = nextstate
59 |
60 |
--------------------------------------------------------------------------------
/Vanilla Policy gradient methods/REINFORCE/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Policynetwork(nn.Module):
5 | def __init__(self, state_space, action_space):
6 | super(Policynetwork, self).__init__()
7 |
8 | self.linear1 = nn.Linear(state_space, 64)
9 | self.linear2 = nn.Linear(64, 16)
10 | self.linear3 = nn.Linear(16, action_space)
11 |
12 | def forward(self, state):
13 |
14 | state = torch.FloatTensor(state)
15 | hidden1 = nn.functional.relu(self.linear1(state))
16 | hidden2 = nn.functional.relu(self.linear2(hidden1))
17 | action_preds = nn.functional.softmax(self.linear3(hidden2), dim=0)
18 |
19 | return action_preds
--------------------------------------------------------------------------------
/Vanilla Policy gradient methods/REINFORCE/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from scipy.ndimage import gaussian_filter
4 |
5 |
6 | def generate_episode(env, policy):
7 |
8 | action_space = env.action_space.n
9 | experience = []
10 | rewards = []
11 | state = env.reset()
12 | action_probs = policy.forward(state)
13 | action = int(np.random.choice(np.arange(action_space), p=action_probs.detach().numpy())) # SELECTING ACTION ACCORDING TO THE PROBABILITY GIVEN BY NETWORK
14 |
15 | done = False
16 | while not done:
17 | nextstate, reward, done, _ = env.step(action)
18 | rewards.append(reward) # APPENDING REWARD AT EACH STEP
19 | experience.append((state, action, reward, nextstate)) # APPENDING (S,A,R,S') TUPLE
20 | state = nextstate
21 | action_probs = policy.forward(state)
22 | action = int(np.random.choice(np.arange(action_space), p=action_probs.detach().numpy())) # SELECTING ACTION
23 |
24 | return experience, rewards
25 |
26 | def plot(rewards):
27 | plt.plot(rewards)
28 | plt.xlabel('Episodes')
29 | plt.ylabel('Rewards')
30 | plt.plot(gaussian_filter(rewards, sigma=50))
31 | plt.show()
--------------------------------------------------------------------------------