├── .gitignore ├── 1. Bandits ├── Epsilon-greedy.py ├── Gradient-bandit.py ├── Optim-init-value.py └── README.md ├── 2. DP methods ├── One cycle PI.py ├── README.md ├── algorithms.py └── deterministic VI.py ├── 3. Model free methods ├── Monte Carlo │ ├── MC evaluation.py │ ├── On policy MC.py │ ├── README.md │ └── utils.py ├── README.md └── TD Learning │ ├── QLearning.py │ ├── README.md │ ├── SARSA.py │ └── sarsa taxi.png ├── Deterministic PG ├── DDPG │ ├── Inverted pendulum.png │ ├── README.md │ ├── agent.py │ └── ddpg.py ├── README.md └── TD3 │ ├── agent.py │ └── td3.py ├── README.md └── Vanilla Policy gradient methods ├── Actor critic ├── agent.py └── model.py ├── README.md └── REINFORCE ├── Cartpole_result.png ├── README.md ├── REINFORCE.py ├── model.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.vscode 2 | *.pyc 3 | *.pyo 4 | *.ipynb_checkpoints 5 | .installed.cfg 6 | bin 7 | develop-eggs 8 | dist 9 | downloads 10 | eggs 11 | parts 12 | src/*.egg-info 13 | __pycache__/ 14 | *.py[cod] -------------------------------------------------------------------------------- /1. Bandits/Epsilon-greedy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class GaussianBandit(object): 5 | 6 | def __init__(self, num_arms, variance=1, mean=10): 7 | 8 | self.mean = [random.uniform(-mean, mean) for i in range(num_arms)] 9 | self.max_value_arm = np.argmax(self.mean) 10 | self.variance = variance 11 | 12 | def Rewards(self, arm): 13 | 14 | return random.gauss(self.mean[arm], self.variance) 15 | 16 | 17 | def EpsilonGreedy(epsilon, num_arms, iter, alpha): 18 | 19 | q_value = np.zeros(num_arms) 20 | arm_occur = np.zeros(num_arms) 21 | arm_reward = np.zeros(num_arms) 22 | 23 | for i in range(iter): 24 | 25 | rand = random.uniform(0,1) 26 | 27 | if(rand > epsilon): 28 | arm = np.argmax(q_value) 29 | else: 30 | arm = int(random.uniform(0,num_arms)) 31 | 32 | arm_occur[arm] += 1 33 | reward = bandit.Rewards(arm) 34 | arm_reward[arm] += reward 35 | 36 | q_value[arm] = q_value[arm] + alpha*(reward - q_value[arm]) 37 | 38 | 39 | if(i%1000 == 0): 40 | epsilon /= 2 41 | print('\nrewards are - ', arm_reward) 42 | print('-----------------------------------------------------------') 43 | print('\narm chosen maximum number of times -', np.argmax(arm_occur)+1) 44 | print('arm with max expected return is - ', bandit.max_value_arm+1) 45 | 46 | 47 | iter = 10000 48 | num_arms = 10 49 | alpha = 0.5 50 | epsilon = 0.7 51 | mean = 10 52 | variance = 1 53 | 54 | bandit = GaussianBandit(num_arms) 55 | print('true expected rewards\n\n',bandit.mean,'\n') 56 | print('---------------------------------------------------') 57 | 58 | EpsilonGreedy(epsilon, num_arms, iter, alpha) 59 | -------------------------------------------------------------------------------- /1. 
Bandits/Gradient-bandit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random as rd 3 | 4 | class GaussianBandit(object): 5 | 6 | def __init__(self, num_arms, mean=10, variance=1): 7 | 8 | self.variance = variance 9 | self.mean = [rd.uniform(-mean, mean) for i in range(num_arms)] 10 | self.max_value_arm = np.argmax(self.mean) 11 | 12 | def Rewards(self, arm): 13 | 14 | return rd.gauss(self.mean[arm], self.variance) 15 | 16 | 17 | def Softmax(arm_preference): 18 | 19 | a = np.exp(arm_preference) 20 | print(a) 21 | z = np.sum(np.exp(arm_preference)) 22 | print(z) 23 | return a/z 24 | 25 | 26 | def GradientBandit(num_arms, alpha, iterations): 27 | 28 | arm_preference = np.zeros(num_arms) 29 | arm_reward = np.zeros(num_arms) 30 | arm_occur = np.zeros(num_arms) 31 | avg_reward = np.zeros(num_arms) 32 | 33 | for i in range(iterations): 34 | 35 | arm = int(rd.uniform(0,num_arms)) 36 | arm_occur[arm] += 1 37 | 38 | reward = bandit.Rewards(arm) 39 | arm_reward[arm] += reward 40 | avg_reward[arm] = avg_reward[arm] + (reward - avg_reward[arm])/arm_occur[arm] 41 | 42 | softmax = Softmax(arm_preference) 43 | 44 | for a in range(num_arms): 45 | 46 | if(a == arm): 47 | arm_preference[a] = arm_preference[a] + alpha*(reward - avg_reward[a])*(1 - softmax[a]) 48 | else: 49 | arm_preference[a] = arm_preference[a] + alpha*(reward - avg_reward[a])*softmax[a] 50 | 51 | print(arm_preference) 52 | 53 | print('here\n') 54 | if(i%1000 == 0): 55 | print('\nrewards are - ', arm_reward) 56 | print('\narm with max preference is - ', np.argmax(arm_preference)) 57 | print('\narm with max expected return is - ', bandit.max_value_arm) 58 | print('-----------------------------------------------------------') 59 | 60 | 61 | num_arms = 10 62 | mean = 10 63 | variance = 1 64 | alpha = 0.5 65 | iterations = 5000 66 | 67 | bandit = GaussianBandit(num_arms, mean, variance) 68 | print(bandit.mean) 69 | print('----------------------------------------------------') 70 | 71 | GradientBandit(num_arms, alpha, iterations) -------------------------------------------------------------------------------- /1. 
Bandits/Optim-init-value.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random as rd 3 | 4 | class GaussianBandit(object): 5 | 6 | def __init__(self, num_arms, variance=1, mean=10): 7 | 8 | self.mean = [rd.uniform(-mean,mean) for i in range(num_arms)] 9 | self.max_value_arm = np.argmax(self.mean) 10 | self.variance = variance 11 | 12 | def Rewards(self, arm): 13 | 14 | return rd.gauss(self.mean[arm], self.variance) 15 | 16 | 17 | def OptimValue(optim_value, alpha, num_arms, iterations): 18 | 19 | q_value = optim_value*(np.ones(num_arms)) 20 | arm_occur = np.zeros(num_arms) 21 | arm_reward = np.zeros(num_arms) 22 | 23 | arm = int(rd.uniform(0,num_arms)) 24 | 25 | for i in range(iterations): 26 | 27 | arm_occur[arm] += 1 28 | reward = bandit.Rewards(arm) 29 | arm_reward[arm] += reward 30 | 31 | q_value[arm] = q_value[arm] + alpha*(reward - q_value[arm]) 32 | 33 | arm = np.argmax(q_value) 34 | 35 | if(i%1000 == 0): 36 | print('\nrewards are - ', arm_reward) 37 | print('-----------------------------------------------------------') 38 | print('\narm chosen maximum number of times', np.argmax(arm_occur)+1) 39 | print('arm with maximum true expected reward', bandit.max_value_arm+1) 40 | 41 | optim_value = 30 42 | num_arms = 10 43 | variance = 1 44 | mean = 10 45 | alpha = 0.5 46 | iterations = 5000 47 | 48 | bandit = GaussianBandit(num_arms, variance, mean) 49 | print(bandit.mean) 50 | print('\narm with max expected return is - ', bandit.max_value_arm + 1) 51 | print('----------------------------------------------------') 52 | 53 | OptimValue(optim_value, alpha, num_arms, iterations) 54 | -------------------------------------------------------------------------------- /1. Bandits/README.md: -------------------------------------------------------------------------------- 1 | # Multi-Armed Bandit 2 | 3 | ### Summary 4 | This is a classic reinforcement learning problem that exemplies the exploration-exploitation tradeoff. Each arm can be thought of one arm of a slot machine. The rewards may differ every time we pull the arm but there's a true expected reward associated with every slot machine.
5 | To estimate the true expected reward of an arm we would have to pull it again and again and average the rewards we observe, even if the immediate rewards are low.<br>
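For example, the scripts in this folder keep a running estimate of each arm's value and refine it incrementally after every pull. A minimal, self-contained sketch (the true mean of 5.0 and the step size are illustrative, not taken from the scripts):

```python
import random

q, alpha = 0.0, 0.1                  # current estimate and step size (a 1/n step would give the exact sample average)
for n in range(1, 1001):
    reward = random.gauss(5.0, 1.0)  # stand-in for one pull of an arm whose true expected reward is 5.0
    q += alpha * (reward - q)        # incremental update Q <- Q + alpha*(r - Q); no need to store past rewards
print(q)                             # drifts towards the true mean as pulls accumulate
```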
6 | The agent needs to *explore* as well as *exploit* it's current knowledge of which arm yields high immediate reward. This aim of maximizing the objective of expected rewards leads to the exploration-exploitation dilemma. 7 | 8 | **Problem Statement** : There are 10 one-arm bandits. Reward associated with each arm is a Gaussian distribution with some fixed mean and variance. The mean of the gaussian represents the expected reward and is sampled randomly for each arm from some uniform distribution. 9 | 10 | ### Algorithms implemented 11 | - [x] Epsilon Greedy 12 | - [x] Softmax exploration 13 | - [x] Optimistic initialisation 14 | - [ ] UCB 15 | - [ ] Median Elimination 16 | - [ ] Thompson Sampling 17 | 18 | ### Resources 19 | 1. Chapters 1 and 2 from [Sutton and Barto](http://incompleteideas.net/book/RLbook2020.pdf) -------------------------------------------------------------------------------- /2. DP methods/One cycle PI.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random as rd 3 | import gym 4 | import time 5 | 6 | env = gym.make('FrozenLake8x8-v0') 7 | #this is a stochastic environment, action taken may be different from what you chose 8 | 9 | env = env.unwrapped 10 | 11 | state_space = env.nS 12 | action_space = env.nA 13 | 14 | policy = [int(rd.uniform(0,env.nA)) for i in range(state_space)] 15 | values = np.zeros(env.nS) 16 | discount = 0.9 17 | delta = 0 18 | theta = 1e-9 19 | i = 0 20 | 21 | while(True): 22 | 23 | prev_values = values.copy() 24 | 25 | for state in range(state_space): 26 | 27 | temp_value = 0 28 | nextstates = len(env.P[state][policy[state]]) 29 | # multiple nextstates because environment throws you anywhere randomly 30 | # there is equal probability assigned to every action, doesn't really matter what action you take lmao 31 | 32 | for next in range(nextstates): 33 | 34 | # because there are multiple nextstates, so iterating over all possible states we could end up in 35 | 36 | probability, next_state, reward, _ = env.P[state][policy[state]][next] 37 | temp_value += reward + discount*probability*values[next_state] 38 | values[state] = temp_value / nextstates # value of the current state = average over the values of states we can end up in 39 | 40 | for state in range(state_space): 41 | 42 | temp_value = 0 43 | temp_values = [] 44 | 45 | for action in range(action_space): 46 | 47 | for next in range(len(env.P[state][action])): # same reason for iterating here 48 | 49 | probability, next_state, reward, _ = env.P[state][action][next] 50 | temp_value += reward + discount*probability*values[next_state] 51 | 52 | temp_value /= len(env.P[state][action]) 53 | temp_values.append(temp_value) 54 | 55 | values[state] = np.max(temp_values) 56 | policy[state] = np.argmax(temp_values) 57 | 58 | i += 1 59 | delta = max(0, abs(np.sum(prev_values) - np.sum(values))) 60 | 61 | if(delta < theta): 62 | break 63 | 64 | print('trained agent in', i, 'episdode(s)', '\n') 65 | print('value function\n', values, '\n') 66 | print('policy\n', policy, '\n') 67 | 68 | nextstate = 0 69 | 70 | while(1): 71 | time.sleep(0.5) 72 | action = policy[nextstate] 73 | nextstate, reward, done, info = env.step(int(action)) 74 | env.render() 75 | if(done): 76 | break 77 | -------------------------------------------------------------------------------- /2. 
DP methods/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Programming Methods 2 | 3 | ### Summary 4 | In settings where we know how the world/environment works, there are some simpler planning algorithms which we can use like Policy Iteration (PI) and Value Iteration (VI). Essentially, it's about trying to learn the true expected reward of each state and then making decisions using this knowledge of learnt state/action values. 5 | 6 | **Problem Statement** : [Frozen Lake environment](https://github.com/openai/gym/blob/master/gym/envs/toy_text/frozen_lake.py). The agent controls the movement of a character in a grid world. Some tiles of the grid are walkable, and others lead to the agent falling into the water. The agent is rewarded for finding a walkable path to a goal tile. 7 | 8 | ### Algorithms implemented : 9 | - [x] Policy Iteration 10 | - [x] Value Iteration 11 | 12 | ### Resources 13 | 1. Chapters 2 and 3 from [Sutton and Barto]() 14 | 2. David Silver's course - [lectures 1, 2 and 3](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ) 15 | 3. Stanford CS234 - [lecture 1 and 2](https://www.youtube.com/playlist?list=PLoROMvodv4rOSOPzutgyCTapiGlY2Nd8u) -------------------------------------------------------------------------------- /2. DP methods/algorithms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def PolicyEvaluation(env, policy, values, discount, state_space, action_space): 4 | 5 | for state in range(state_space): 6 | probability, nextstate, reward, _ = env.P[state][policy[state]][0] 7 | values[state] = reward + discount * probability * values[nextstate] 8 | 9 | return values 10 | 11 | def PolicyImprovement(env, policy, values, discount, state_space, action_space): 12 | 13 | for state in range(state_space): 14 | 15 | temp = [] 16 | for action in range(action_space): 17 | 18 | probability, nextstate, reward, _ = env.P[state][action][0] 19 | temp.append(reward + discount * probability * values[nextstate]) 20 | policy[state] = np.argmax(temp) 21 | 22 | return policy 23 | 24 | def ValueIteration(env, values, discount, state_space, action_space): 25 | 26 | for state in range(state_space): 27 | 28 | temp_values = [] 29 | for action in range(action_space): 30 | 31 | probability, nextstate, reward, _ = env.P[state][action][0] 32 | temp_values.append(reward + discount * probability * values[nextstate]) 33 | values[state] = np.max(temp_values) 34 | 35 | return values 36 | -------------------------------------------------------------------------------- /2. 
DP methods/deterministic VI.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import random as rd 4 | import time 5 | from algorithms import PolicyImprovement, PolicyEvaluation, ValueIteration 6 | 7 | from gym.envs.registration import register 8 | register( 9 | id='FrozenLakeNotSlippery-v0', 10 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 11 | kwargs={'map_name' : '4x4', 'is_slippery': False}, 12 | max_episode_steps=200, 13 | reward_threshold=0.78, # optimum = .8196 14 | ) 15 | 16 | direction = { 17 | 0: "LEFT", 18 | 1: "DOWN", 19 | 2: "RIGHT", 20 | 3: "UP" 21 | } 22 | 23 | env = gym.make('FrozenLakeNotSlippery-v0') 24 | env = env.unwrapped 25 | env.render() 26 | 27 | state_space = env.nS 28 | action_space = env.nA 29 | 30 | values = np.zeros(state_space) 31 | policy = np.zeros(state_space) 32 | theta = 1e-9 33 | discount = 0.9 34 | i = 0 35 | iterations = 10 36 | 37 | def PolicyIteration(env, policy, values, discount, state_space, action_space): 38 | 39 | values = PolicyEvaluation(env, policy, values, discount, state_space, action_space) 40 | policy = PolicyImprovement(env, policy, values, discount, state_space, action_space) 41 | 42 | return values, policy 43 | 44 | while(1): 45 | 46 | # depending on which algorithm you want to run comment one of the two lines below 47 | values, policy = PolicyIteration(env, policy, values, discount, state_space, action_space) 48 | values = ValueIteration(env, values, discount, state_space, action_space) 49 | 50 | i += 1 51 | if(i>iterations): 52 | break 53 | 54 | # comment this if you ran policy iteration. 55 | # This finds the policy because in value iteration you don't change the policy, 56 | # you can find it using the value function returned from the algorithm 57 | for state in range(state_space): 58 | 59 | temp = [] 60 | for action in range(action_space): 61 | 62 | probability, nextstate, reward, _ = env.P[state][action][0] 63 | temp.append(reward + discount * probability * values[nextstate]) 64 | policy[state] = np.argmax(temp) 65 | 66 | 67 | print('value function\n', values,'\n') 68 | print('policy\n',policy, '\n') 69 | print('agent trained for', i, 'episodes') 70 | nextstate = 0 71 | 72 | 73 | while(1): 74 | time.sleep(0.5) 75 | action = policy[nextstate] 76 | nextstate, reward, done, info = env.step(int(action)) 77 | env.render() 78 | if(done): 79 | break 80 | 81 | # even if you don't uncomment anything, the code will still work -------------------------------------------------------------------------------- /3. 
Model free methods/Monte Carlo/MC evaluation.py: -------------------------------------------------------------------------------- 1 | # simplified blackjack state space, On policy MC.py has the full solution 2 | 3 | import gym 4 | import numpy as np 5 | import random as rd 6 | from utils import * 7 | 8 | env = gym.make('Blackjack-v0') 9 | env.unwrapped 10 | 11 | state_space = env.observation_space 12 | action_space = env.action_space 13 | 14 | policy = {} 15 | state_occur = {} 16 | values = {} 17 | for i in range(21): # because 21 is the number of states we can see - [1, 21] 18 | 19 | # the probability for lower value hands is 0, the changes have not been made accordingly to make the code efficient 20 | 21 | values.update({i+1 : 0}) # initialise value fn 22 | state_occur.update({i+1 : 0}) # initialise occurence of state 23 | policy.update({i+1 : int(rd.uniform(0,2))}) # initialise random policy 24 | 25 | for j in range(500000): 26 | 27 | episodes = generateEpisode(env) # generates episode using policy same as dealer's 28 | observed_states = [] 29 | 30 | for i in range(len(episodes)): 31 | if episodes[i][0] < 22: 32 | observed_states.append(episodes[i][0]) 33 | state_occur[episodes[i][0]] += 1 34 | rewards = episodes[i][2] 35 | 36 | for state in observed_states: 37 | values[state] = values[state] + (rewards - values[state]) / state_occur[state] 38 | 39 | if j%500000==0: 40 | print('iteration number', j) -------------------------------------------------------------------------------- /3. Model free methods/Monte Carlo/On policy MC.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import random as rd 4 | from utils import * 5 | 6 | env = gym.make('Blackjack-v0') 7 | env.unwrapped 8 | 9 | action_space = [0,1] 10 | 11 | policy = {} 12 | state_occur = {} 13 | values = {} 14 | q_values = {} 15 | 16 | for i in range(4,22): # our card sum can be from [4, 21] 17 | 18 | for j in range(1,11): # dealer's card value can be [1-10] where 1 is ace 19 | 20 | for ace in [True, False]: # whether or not we have a usable ace 21 | 22 | values.update({(i, j, ace) : 0 }) # initialise value fn 23 | state_occur.update({(i, j, ace) : 0}) # initialise occurence of state 24 | policy.update({(i, j, ace) : int(rd.uniform(0,2))}) # initialise random policy 25 | 26 | for action in action_space: 27 | q_values.update({(i, j, ace, action) : 0}) # initialise Q value lookup table 28 | 29 | 30 | 31 | ## This is the code for monte carlo control 32 | 33 | temp_qvlaues = np.zeros(2) 34 | epsilon = 0.7 35 | 36 | for i in range(100000): 37 | 38 | # for _ in range(50): # this loop can be thrown away if we don't want to converge to the true value of that policy 39 | 40 | episodes = Policyimproving(env, policy) # episodes = (our hand, dealer hand, action, reward, usable ace) 41 | observed_states = [] 42 | 43 | for j in range(len(episodes)): 44 | 45 | if episodes[j][0] < 22: 46 | observed_states.append((episodes[j][0], episodes[j][1], episodes[j][4], episodes[j][2])) 47 | # observed_states contains : tuple (our hand, dealer hand, usable ace, action) 48 | state_occur[(episodes[j][0], episodes[j][1], episodes[j][4])] += 1 # increasing count of current state 49 | 50 | rewards = episodes[j][3] 51 | 52 | for state_action in observed_states: 53 | 54 | state = (state_action[0], state_action[1], state_action[2]) 55 | q_values[state_action] = q_values[state_action] + (rewards - q_values[state_action]) / state_occur[state] 56 | 57 | 58 | # choosing new policy 59 | for 
state in range(4,22): 60 | 61 | for dealer in range(1,11): 62 | 63 | for ace in [True, False]: 64 | 65 | for action in range(2): 66 | temp_qvlaues[ace] = q_values[(state, dealer, ace, action)] 67 | 68 | chance = rd.uniform(0,1) 69 | if(chance > epsilon): 70 | policy[(state, dealer, ace)] = np.argmax(temp_qvlaues) 71 | else: 72 | policy[(state, dealer, ace)] = int(rd.uniform(0,2)) 73 | 74 | 75 | if(i%10000==0): 76 | epsilon /= 2 77 | print('iteration',i) 78 | 79 | 80 | 81 | print('q_values\n', q_values) 82 | -------------------------------------------------------------------------------- /3. Model free methods/Monte Carlo/README.md: -------------------------------------------------------------------------------- 1 | # Monte Carlo solutions to Model-free RL 2 | 3 | ### Summary 4 | Let's say we sampled a trajectory from our MDP. Now, to judge how good the trajectory/policy was we would look at the total return of the trajectory. Similarly, to judge how good the encountered state-action pairs were we could use the total return *starting from that state-action pair*. This is the basic idea behind Monte Carlo control where we sample trajectories and estimate Q-values with the total return we get starting from that state, and we improve our policy using our knowledge of estimated Q-values. 5 | 6 | **Problem Statement** : [Blackjack environment](https://github.com/openai/gym/blob/master/gym/envs/toy_text/blackjack.py) 7 | 8 | ### Algorithms implemented: 9 | - [x] On-policy MC evaluation and control (Blackjack environment) 10 | - [ ] Off-policy MC evaluation and control (Gridworld environment) 11 | 12 | ### Resources: 13 | 1. Chapter 5 from [Sutton and Barto](http://incompleteideas.net/book/RLbook2020.pdf) 14 | 2. David Silver's course - [Lectures 4 and 5](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ) 15 | 3. Stanford CS234 - [Lectures 3 and 4](https://www.youtube.com/playlist?list=PLoROMvodv4rOSOPzutgyCTapiGlY2Nd8u) -------------------------------------------------------------------------------- /3. 
Model free methods/Monte Carlo/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random as rd 3 | 4 | def generateEpisode(env): 5 | 6 | state = env.reset() # env.reset() returns 3 things - current hand (total sum), dealer show card and usable ace (bool) 7 | episodes = [] 8 | usable_ace = state[2] 9 | dealer_hand = state[1] 10 | curr_hand = state[0] 11 | while(1): 12 | 13 | action = 0 if curr_hand > 16 else 1 # policy which we evaluate, same as dealer's policy : 0 is stay 1 is hit 14 | nextstate, reward, done, _ = env.step(action) # nextstate contains {player hand, dealer hand, usable ace} 15 | 16 | sample = (curr_hand, action, reward, usable_ace) 17 | episodes.append(sample) 18 | if done: 19 | break 20 | curr_hand = nextstate[0] # nextstate[0] is total sum of player hand 21 | usable_ace = nextstate[2] 22 | 23 | return episodes 24 | 25 | 26 | def Policyimproving(env, policy): 27 | 28 | episodes = [] 29 | state = env.reset() 30 | curr_hand = state[0] 31 | dealer_hand = state[1] 32 | usable_ace = state[2] 33 | 34 | while(1): 35 | 36 | action = policy[(curr_hand, dealer_hand, usable_ace)] 37 | nextstate, reward, done, _ = env.step(action) 38 | 39 | sample = (curr_hand, dealer_hand, action, reward, usable_ace) 40 | episodes.append(sample) 41 | if done: 42 | break 43 | curr_hand = nextstate[0] 44 | dealer_hand = nextstate[1] 45 | usable_ace = nextstate[2] 46 | 47 | return episodes 48 | 49 | 50 | def Softmax(x): 51 | 52 | return np.exp(x) / np.sum(np.exp(x)) -------------------------------------------------------------------------------- /3. Model free methods/README.md: -------------------------------------------------------------------------------- 1 | # Model-free RL 2 | 3 | ### Summary 4 | One drawback of dynamic programming methods is that it needs the transition dynamics of the environment to find the optimal policy. There is a different class of algorithms which can solve for the optimal policy without the knowledge of the MDP. The idea here is to sample trajectories from the MDP and estimate Q-values for all the encountered state-action pairs. 5 | 6 | ### Algorithms implemented : 7 | - [x] On-policy Monte Carlo control 8 | - [ ] Off-policy Monte Carlo control using importance sampling 9 | - [x] Q-Learning 10 | - [x] SARSA 11 | 12 | ### Resources 13 | 1. Chapter 4, 5 and 6 from [Sutton and Barto](http://incompleteideas.net/book/RLbook2020.pdf) 14 | 2. David Silver's course - [Lectures 4 and 5](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ) 15 | 3. Stanford CS234 - [Lectures 3 and 4](https://www.youtube.com/playlist?list=PLoROMvodv4rOSOPzutgyCTapiGlY2Nd8u) 16 | -------------------------------------------------------------------------------- /3. 
Model free methods/TD Learning/QLearning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import time 4 | 5 | env = gym.make('Taxi-v2') 6 | env.unwrapped 7 | 8 | print(env.observation_space) # Discrete(500) 9 | print(env.action_space) # Discrete(6) 10 | action_space = [0,1,2,3,4,5] 11 | 12 | q_values = {} 13 | policy = {} 14 | 15 | for i in range(env.observation_space.n): 16 | policy.update({i : np.random.choice(action_space)}) # initialise policy 17 | for j in range(env.action_space.n): 18 | q_values.update({(i,j) : 0}) # initialise q_value table 19 | 20 | epsilon = 0.9 21 | alpha = 0.1 22 | total_reward = 0 23 | state = env.reset() 24 | 25 | for iterations in range(100000): 26 | 27 | nextstate, reward, done, _ = env.step(policy[state]) 28 | total_reward += reward 29 | 30 | temp_q = [] 31 | for action in range(env.action_space.n): 32 | temp_q.append(q_values[(nextstate, action)]) 33 | target = reward + max(temp_q) 34 | 35 | # action value update 36 | q_values[(state, policy[state])] += alpha*(target - q_values[(state, policy[state])]) 37 | state = nextstate 38 | 39 | # random behaviour policy (epsilon greedy) 40 | for i in range(env.observation_space.n): 41 | 42 | temp = [] 43 | for j in range(env.action_space.n): 44 | temp.append(q_values[(i, j)]) 45 | temp = np.asarray(temp) 46 | 47 | chance = np.random.uniform(0,1) 48 | if chance < epsilon: 49 | policy[i] = env.action_space.sample() 50 | else: 51 | policy[i] = np.argmax(temp) 52 | 53 | if done: 54 | state = env.reset() 55 | 56 | if (iterations+1)%10000 == 0: 57 | print(f'iterations completed : {iterations+1}, total reward = {total_reward}') 58 | epsilon /= 1.5 59 | 60 | 61 | # forming the policy 62 | for i in range(env.observation_space.n): 63 | temp = [] 64 | for j in range(env.action_space.n): 65 | temp.append(q_values[(i,j)]) 66 | policy[state] = np.argmax(temp) 67 | 68 | # try out the policy 69 | state = env.reset() 70 | done = False 71 | while not done: 72 | env.render() 73 | time.sleep(0.5) 74 | nextstate, reward, done, _ = env.step(policy[state]) 75 | state = nextstate 76 | -------------------------------------------------------------------------------- /3. Model free methods/TD Learning/README.md: -------------------------------------------------------------------------------- 1 | # Temporal-Difference Learning 2 | 3 | ### Summary 4 | Temporal difference learning is a family of algorithms in which the agent estimates the Q-values by bootstrapping from the current estimate of Q-values. Advantages of bootstrapping are reduced variance and faster propagation of reward signals. 5 | 6 | 7 | **Problem Statement** : [Taxi environment](https://github.com/openai/gym/blob/master/gym/envs/toy_text/taxi.py). There are 4 locations (labeled by different letters) and the agent's job is to pick up the passenger at one location and drop him off in another. The agent receives +20 points for a successful dropoff, and loses 1 point for every timestep it takes. There is also a 10 point penalty for illegal pick-up and drop-off actions. 8 | 9 | ### Algorithms implemented: 10 | - [x] SARSA 11 | - [x] Q-Learning/SARSAMAX 12 | - [ ] Double Q-Learning 13 | - [ ] Expected SARSA 14 | 15 | ### Results 16 | ![taxi_sarsa](https://github.com/jayeshk7/RL-Algorithms/blob/master/3.%20Model%20free%20methods/TD%20Learning/sarsa%20taxi.png) 17 | 18 | ![taxi_sarsa](https://machinelearningjourney.com/wp-content/uploads/2020/07/Taxi-demo.gif) 19 | 20 | ### Resources: 21 | 1. 
Chapter 5 from [Sutton and Barto](http://incompleteideas.net/book/RLbook2020.pdf) 22 | 2. David Silver's course - [Lectures 4 and 5](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ) 23 | 3. Stanford CS234 - [Lectures 3 and 4](https://www.youtube.com/playlist?list=PLoROMvodv4rOSOPzutgyCTapiGlY2Nd8u) 24 | -------------------------------------------------------------------------------- /3. Model free methods/TD Learning/SARSA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import matplotlib.pyplot as plt 4 | from scipy.ndimage import gaussian_filter 5 | import time 6 | 7 | env = gym.make('Taxi-v2') 8 | env.unwrapped 9 | # reward : +20 for successful dropoff and -1 for each timestep. -10 for illegal pickup and dropoff actions 10 | # actions : {0,1,2,3,4,5} = {south, north, east, west, pickup, dropoff} 11 | 12 | 13 | # Observation space = Discrete(500) 14 | # Action space = Discrete(6) 15 | action_space = [0,1,2,3,4,5] 16 | 17 | q_values = {} 18 | policy = {} 19 | 20 | for i in range(env.observation_space.n): 21 | policy.update({i : np.random.choice(action_space)}) # initialise policy 22 | for j in range(env.action_space.n): 23 | q_values.update({(i,j) : 0}) # initialise q_value table 24 | 25 | epsilon = 0.9 26 | alpha = 0.1 27 | 28 | rewards = [] 29 | timestep = 0 30 | total_reward = 0 31 | state = env.reset() # returns the state of the environment 32 | 33 | # since TD is online learning algo, no need to generate episode like in MC control 34 | for iteration in range(100000): 35 | 36 | # policy evaluation 37 | nextstate, reward, done, _ = env.step(policy[state]) 38 | timestep += 1 39 | total_reward += reward 40 | q_values[(state, policy[state])] += alpha * (reward + q_values[(nextstate, policy[nextstate])] - q_values[(state, policy[state])]) 41 | state = nextstate 42 | 43 | # (epsilon greedy) policy improvement 44 | for i in range(env.observation_space.n): 45 | temp_policy = [] 46 | for j in action_space: 47 | temp_policy.append(q_values[(i,j)]) 48 | 49 | chance = np.random.uniform(0,1) 50 | if chance < epsilon: 51 | policy[i] = np.random.choice(action_space) 52 | else: 53 | policy[i] = np.argmax(temp_policy) 54 | 55 | if (iteration+1)%10000 == 0: 56 | print(f'iteration number {iteration+1} : avg reward per timestep = {rewards[-1:]}') 57 | epsilon /= 1.5 58 | 59 | if done: 60 | state = env.reset() 61 | rewards.append(total_reward / timestep) 62 | total_reward = 0 63 | timestep = 0 64 | 65 | 66 | # plot average rewards 67 | plt.title('Rewards per timestep') 68 | plt.xlabel('Episodes') 69 | plt.ylabel('Rewards') 70 | plt.plot(rewards) 71 | plt.plot(gaussian_filter(rewards, sigma = 10)) 72 | plt.show() 73 | 74 | 75 | # try out the policy 76 | state = env.reset() 77 | while(1): 78 | env.render() 79 | time.sleep(0.5) 80 | action = policy[state] 81 | state, reward, done, _ = env.step(action) 82 | if done : 83 | break -------------------------------------------------------------------------------- /3. Model free methods/TD Learning/sarsa taxi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jayeshk7/RL-Algorithms/8ba63014fb4607253b047a192c3b08bddb9a9f4c/3. 
Model free methods/TD Learning/sarsa taxi.png -------------------------------------------------------------------------------- /Deterministic PG/DDPG/Inverted pendulum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jayeshk7/RL-Algorithms/8ba63014fb4607253b047a192c3b08bddb9a9f4c/Deterministic PG/DDPG/Inverted pendulum.png -------------------------------------------------------------------------------- /Deterministic PG/DDPG/README.md: -------------------------------------------------------------------------------- 1 | # Deep Deterministic Policy Gradients 2 | 3 | ### Summary 4 | This paper builds on the idea of [Deterministic policy gradients]() and uses function approximation to solve continuous control problems. This implementation tries to solve the inverted pendulum task. 5 | 6 | ### Results and plots 7 | 8 | **Rewards v/s Episodes** 9 | ![rewardplot](https://github.com/jayeshk7/Deep-RL/blob/master/3.%20Deterministic%20PG/DDPG/Inverted%20pendulum.png) 10 | 11 | **Observations** : The policy starts performing badly after a point and it keeps getting worse. This was observed in every run of this algorithm and I am not exactly sure why this happens. -------------------------------------------------------------------------------- /Deterministic PG/DDPG/agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import gym 5 | from ddpg import * 6 | from collections import deque 7 | import random 8 | import matplotlib.pyplot as plt 9 | from scipy.ndimage import gaussian_filter 10 | 11 | env = gym.make('BipedalWalker-v2') 12 | env.unwrapped 13 | action_space = env.action_space.shape[0] 14 | state_space = env.observation_space.shape[0] 15 | 16 | behaviour_critic = ddpg_critic(state_space, action_space).cuda() 17 | target_critic = ddpg_critic(state_space, action_space).cuda() 18 | target_critic.eval() 19 | target_critic.load_state_dict(behaviour_critic.state_dict()) 20 | 21 | behaviour_actor = ddpg_actor(state_space, action_space).cuda() 22 | target_actor = ddpg_actor(state_space, action_space).cuda() 23 | target_actor.eval() 24 | target_actor.load_state_dict(behaviour_actor.state_dict()) 25 | 26 | 27 | episodes = 4000 28 | episode_length = 200 # THIS IS REQUIRED BECAUSE THE EPISODE NEVER ENDS FOR INVERTED PENDULUM (DONE == FALSE ALWAYS) 29 | BATCH_SIZE = 64 30 | MEMORY = 1000000 # REPLAY MEMORY CAPACITY 31 | TAU = 0.001 # FOR POLYAK AVERAGING 32 | replay_buffer = deque([]) # INITIALISED REPLAY BUFFER 33 | episode_reward = [] 34 | 35 | alpha_critic = 0.0003 36 | alpha_actor = 0.0003 37 | lossfn = nn.MSELoss() 38 | critic_optimizer = torch.optim.Adam(behaviour_critic.parameters(), lr = alpha_critic) 39 | actor_optimizer = torch.optim.Adam(behaviour_actor.parameters(), lr = alpha_actor) 40 | 41 | 42 | for episode in range(episodes): 43 | 44 | state = env.reset() 45 | done = False 46 | total_reward = 0 47 | for _ in range(episode_length): 48 | # SELECT ACTION 49 | action = behaviour_actor.actor_forward(state).detach().cpu() 50 | action = action + torch.randn(1) # ADDING NOISE FOR EXPLORATION 51 | nextstate, reward, _, _ = env.step([action.item()]) 52 | total_reward += reward 53 | 54 | # STORE THE TRANSITION 55 | experience = (state, action.item(), reward, nextstate) 56 | if len(replay_buffer) < MEMORY : 57 | replay_buffer.append(experience) 58 | else: 59 | replay_buffer.popleft() 60 | replay_buffer.append(experience) 61 | 62 | 
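# NOTE: the deque above is used as a fixed-size FIFO replay memory; once it holds
# MEMORY transitions, the oldest one is popped before the newest is appended.
# The block below samples a uniform minibatch from it, trains the critic on
# TD targets built from the target networks, and updates the actor by ascending
# the behaviour critic's Q-value estimate (hence the negated mean used as its loss).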
# TRAINING NETWORK 63 | if len(replay_buffer) >= BATCH_SIZE : 64 | batch_buffer = random.sample(replay_buffer, BATCH_SIZE) 65 | s, a, r, ns = map(np.stack, zip(*batch_buffer)) # LEARNT SOMETHING NEW HERE 66 | s = torch.FloatTensor(s) 67 | a = torch.FloatTensor(a).view(-1,1).cuda() 68 | r = torch.FloatTensor(r).view(-1,1).cuda() 69 | ns = torch.FloatTensor(ns) 70 | 71 | # UPDATE THE CRITIC 72 | ns_actions = target_actor.actor_forward(ns) 73 | target_qvalues = r + target_critic.critic_forward(ns, ns_actions) 74 | predicted_qvalues = behaviour_critic.critic_forward(s, a) 75 | loss_critic = lossfn(predicted_qvalues, target_qvalues) 76 | 77 | critic_optimizer.zero_grad() 78 | loss_critic.backward() 79 | critic_optimizer.step() 80 | 81 | # UPDATE THE ACTOR 82 | loss_actor = behaviour_critic.critic_forward(s, behaviour_actor.actor_forward(s)) 83 | loss_actor = -loss_actor.mean() 84 | actor_optimizer.zero_grad() 85 | loss_actor.backward() 86 | actor_optimizer.step() 87 | 88 | # POLYAK AVERAGING 89 | for target_param, param in zip(target_critic.parameters(), behaviour_critic.parameters()): 90 | target_param.data.copy_( 91 | target_param.data * (1.0 - TAU) + param.data * TAU 92 | ) 93 | 94 | for target_param, param in zip(target_actor.parameters(), behaviour_actor.parameters()): 95 | target_param.data.copy_( 96 | target_param.data * (1.0 - TAU) + param.data * TAU 97 | ) 98 | 99 | state = nextstate 100 | 101 | 102 | episode_reward.append(total_reward) 103 | if (episode+1)%200 == 0: 104 | print(f'episode number {episode+1}; average reward of last 200 episodes = {np.mean(episode_reward[-200:])}') 105 | # TEST THE NETWORK 106 | x = env.reset() 107 | for i in range(200) : 108 | env.render() 109 | u = target_actor.actor_forward(x) 110 | x_, _, _, _ = env.step([action.item()]) 111 | if done : 112 | break 113 | x = x_ 114 | 115 | plt.ylabel('Rewards') 116 | plt.xlabel('Episodes') 117 | plt.plot(episode_reward) 118 | plt.plot(gaussian_filter(episode_reward, 25)) 119 | plt.show() 120 | -------------------------------------------------------------------------------- /Deterministic PG/DDPG/ddpg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as f 4 | 5 | class ddpg_actor(nn.Module): # TAKES STATE AS INPUT AND OUTPUTS ACTION 6 | def __init__(self, state_space, action_space): 7 | super(ddpg_actor, self).__init__() 8 | 9 | self.actor1 = nn.Linear(state_space, 400) 10 | self.actor2 = nn.Linear(400, 200) 11 | self.actor3 = nn.Linear(200, action_space) 12 | 13 | def actor_forward(self, state): 14 | 15 | state = torch.FloatTensor(state).cuda() 16 | hidden1 = f.relu(self.actor1(state)) 17 | hidden2 = f.relu(self.actor2(hidden1)) 18 | out = self.actor3(hidden2) 19 | 20 | return out 21 | 22 | class ddpg_critic(nn.Module): # TAKES STATE AND ACTION AS INPUT, OUTPUTS Q VALUE 23 | def __init__(self, state_space, action_space): 24 | super(ddpg_critic, self).__init__() 25 | 26 | self.critic1 = nn.Linear(state_space + action_space, 400) 27 | self.critic2 = nn.Linear(400, 200) 28 | self.critic3 = nn.Linear(200, 1) 29 | 30 | def critic_forward(self, state, action): # ACTION IS ALREADY A TENSOR, STATE IS NOT A TENSOR 31 | 32 | state = torch.FloatTensor(state).cuda() 33 | # action = action.cuda() 34 | state = torch.cat((state, action), 1).cuda() 35 | hidden1 = f.relu(self.critic1(state)) 36 | hidden2 = f.relu(self.critic2(hidden1)) 37 | out = self.critic3(hidden2) 38 | 39 | return out 
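# Usage sketch (variable names here are illustrative; see agent.py in this folder):
#   actor  = ddpg_actor(state_dim, action_dim).cuda()
#   critic = ddpg_critic(state_dim, action_dim).cuda()
#   q_vals = critic.critic_forward(states, actor.actor_forward(states))
# The actor maps a batch of states to actions and the critic scores the resulting
# (state, action) pairs, which is how the actor loss is built in agent.py
# (negated and averaged there).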
-------------------------------------------------------------------------------- /Deterministic PG/README.md: -------------------------------------------------------------------------------- 1 | # Deterministic Policy Gradients 2 | 3 | ### Summary 4 | This family of deep RL algorithms build on top of the idea presented in [Deterministic Policy Gradients](http://proceedings.mlr.press/v32/silver14.pdf). These assume the policy to be a deterministic function of the features and not a probability distribution over all actions. Deep deterministic policy gradients (DDPG) and Twin Delayed Deep Deterministic Policy Gradients (TD3) are some of the widely used algorithms. 5 | 6 | These algorithms can be used to solve any continuous control task while the result may or may not be good depending on the difficulty of the task. TD3 usually outperforms DDPG and makes progress in environments even where DDPG fails. -------------------------------------------------------------------------------- /Deterministic PG/TD3/agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as f 4 | import numpy as np 5 | import gym 6 | from td3 import * 7 | 8 | env = gym.make('Pendulum-v0') 9 | env.unwrapped 10 | state_space = env.observation_space.shape[0] 11 | action_space = env.action_space.shape[0] 12 | 13 | critic1 = Critic(state_space, action_space).cuda() 14 | target_critic1 = Critic(state_space, action_space) 15 | critic2 = Critic(state_space, action_space) 16 | target_critic2 = Critic(state_space, action_space) 17 | 18 | actor = Actor(state_space, action_space) 19 | target_actor = Actor(state_space, action_space) 20 | 21 | -------------------------------------------------------------------------------- /Deterministic PG/TD3/td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as f 4 | 5 | class Actor(nn.Module): 6 | def __init__(self, state_space, action_space): 7 | super(Actor, self).__init__ 8 | 9 | self.linear1 = nn.Linear(state_space, 400) 10 | self.linear2 = nn.Linear(400, 200) 11 | self.linear3 = nn.Linear(200, 1) 12 | 13 | def forward(self, state) : 14 | 15 | state = torch.FloatTensor(state).cuda() 16 | hidden1 = f.relu(self.linear1(state)) 17 | hidden2 = f.relu(self.linear2(hidden1)) 18 | action = self.linear3(hidden2) 19 | 20 | return action 21 | 22 | class Critic(nn.Module): 23 | def __init__(self, state_space, action_space): 24 | super(Critic, self).__init__ 25 | 26 | self.linear1 = nn.Linear(state_space+action_space, 400) 27 | self.linear2 = nn.Linear(400, 200) 28 | self.linear3 = nn.Linear(200, 1) 29 | 30 | def forward(self, state, action): 31 | 32 | state = torch.FloatTensor(state) 33 | action = torch.FloatTensor(action) 34 | x = torch.cat([state, action]).cuda() 35 | hidden1 = f.relu(self.linear1(x)) 36 | hidden2 = f.relu(self.linear2(hidden1)) 37 | q_value = self.linear3(hidden2) 38 | 39 | return q_value -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning 2 | 3 | All the algorithms I implemented (using Python3 and NumPy) while reading Introduction to Reinforcement Learning by Sutton and Barto.
4 | *There's a separate ReadMe for each topic* 5 | 6 | 7 | High Level structure of the repo : 8 | 1. [Bandits](https://github.com/jayeshk7/RL-Algorithms/tree/master/1.%20Bandits) 9 | - Epsilon-greedy 10 | - Optimistic initial value 11 | - Softmax exploration 12 | 2. [Dynamic Programming methods](https://github.com/jayeshk7/RL-Algorithms/tree/master/2.%20DP%20methods) 13 | - Policy iteration 14 | - Value iteration 15 | 3. [Model free methods](https://github.com/jayeshk7/RL-Algorithms/tree/master/3.%20Model%20free%20methods) 16 | - [Monte Carlo control](https://github.com/jayeshk7/RL-Algorithms/tree/master/3.%20Model%20free%20methods/Monte%20Carlo) 17 | - On-Policy Monte Carlo 18 | - Off-policy Monte Carlo using Importance Sampling (incomplete) 19 | - [Temporal-difference methods](https://github.com/jayeshk7/RL-Algorithms/tree/master/3.%20Model%20free%20methods/TD%20Learning) 20 | - Q-Learning 21 | - SARSA 22 | 23 | ### Resources 24 | CS234 and David silver often use different notations, it would be better to follow just one of them in the beginning (I prefer David Silver's lectures) 25 | 1. [Introduction to Reinforcement Learning by Sutton and Barto](http://incompleteideas.net/book/RLbook2020.pdf) 26 | 2. [David Silver's lectures](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ) 27 | 3. [Stanford CS234](https://www.youtube.com/playlist?list=PLoROMvodv4rOSOPzutgyCTapiGlY2Nd8u) 28 | 29 | 30 | [Check this out](https://github.com/IvLabs/resources) for more resources! 31 | -------------------------------------------------------------------------------- /Vanilla Policy gradient methods/Actor critic/agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import gym 4 | from model import * 5 | 6 | env = gym.make('CartPole-v0') 7 | state_space = env.observation_space.shape[0] 8 | action_space = env.action_space.n 9 | 10 | policy = Actor(state_space, action_space) 11 | -------------------------------------------------------------------------------- /Vanilla Policy gradient methods/Actor critic/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as f 4 | 5 | class Actor(nn.Module): 6 | def __init__(self, state_space, action_space): 7 | super(Actor, self).__init__ 8 | 9 | self.actor1 = nn.Linear(state_space, 256) 10 | self.actor2 = nn.Linear(256, 128) 11 | self.actor3 = nn.Linear(128, action_space) 12 | 13 | def forward(self, state): 14 | 15 | state = torch.FloatTensor(state).cuda() 16 | hid1 = f.relu(self.actor1(state)) 17 | hid2 = f.relu(self.actor2(hid1)) 18 | action_probs = f.softmax(self.actor3(hid2)) 19 | 20 | return action_probs 21 | 22 | class Critic(nn.Module): 23 | def __init__(self, state_space, action_space): 24 | super(Critic, self).__init__ 25 | 26 | self.critic1 = nn.Linear(state_space+action_space, 256) 27 | self.critic2 = nn.Linear(256, 128) 28 | self.critic3 = nn.Linear(128, 1) 29 | 30 | def forward(self, state): 31 | 32 | state = torch.FloatTensor(state).cuda 33 | hid1 = f.relu(self.critic1(state)) 34 | hid2 = f.relu(self.critic2(hid1)) 35 | state_value = self.critic3(hid2) 36 | 37 | return state_value -------------------------------------------------------------------------------- /Vanilla Policy gradient methods/README.md: -------------------------------------------------------------------------------- 1 | # Policy gradient methods 2 | 3 | ### Summary 4 | Policy 
gradient methods are a class of algorithms in which we optimize the policy directly. The policy is represented by a function approximator, such as a neural network, that takes the state as input and outputs a probability distribution over actions; the agent then samples its action from this distribution.<br>
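The parameters are then updated with the standard policy-gradient estimator ∇J(θ) ≈ Σ_t ∇ log π_θ(a_t|s_t) · G_t, where G_t is the return from step t onwards; the REINFORCE script in this repo follows this form and subtracts a mean-reward baseline from G_t.<br>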
5 | Since the policy is already stochastic in this case, we don't have to think about exploration here. 6 | 7 | ### Results 8 | 9 | **REINFORCE on CartPole-v0** 10 | ![reinforce_cartpole](https://github.com/jayeshk7/Deep-RL/blob/master/2.%20Vanilla%20Policy%20gradient%20methods/REINFORCE/Cartpole_result.png) 11 | 12 | -------------------------------------------------------------------------------- /Vanilla Policy gradient methods/REINFORCE/Cartpole_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jayeshk7/RL-Algorithms/8ba63014fb4607253b047a192c3b08bddb9a9f4c/Vanilla Policy gradient methods/REINFORCE/Cartpole_result.png -------------------------------------------------------------------------------- /Vanilla Policy gradient methods/REINFORCE/README.md: -------------------------------------------------------------------------------- 1 | # REINFORCE 2 | 3 | PyTorch implementation of the Monte carlo policy gradient algorithm (also known as REINFORCE). 4 | 5 | ### Results 6 | 7 | **Cartpole-v0** 8 | ![image](https://github.com/jayeshk7/Deep-RL/blob/master/2.%20Vanilla%20Policy%20gradient%20methods/REINFORCE/Cartpole_result.png) -------------------------------------------------------------------------------- /Vanilla Policy gradient methods/REINFORCE/REINFORCE.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from model import Policynetwork 4 | from utils import generate_episode, plot 5 | import numpy as np 6 | import gym 7 | 8 | env = gym.make('CartPole-v0') 9 | 10 | state_space = env.observation_space.shape[0] 11 | action_space = env.action_space.n 12 | 13 | alpha = 0.001 14 | policy_network = Policynetwork(state_space, action_space) 15 | optimizer = torch.optim.Adam(policy_network.parameters(), lr = alpha) 16 | 17 | ## TRAIN 18 | 19 | episodes = 1000 20 | rewards = [] 21 | 22 | for episode in range(episodes): 23 | 24 | episode_experience, reward_list = generate_episode(env, policy_network) # RETURNS (S,A,R,S') TUPLES OF THE EPISODE AND LIST OF REWARDS OBTAINED AT EACH STEP 25 | total_reward = np.sum(reward_list) # TOTAL REWARD OF THE EPISODE 26 | rewards.append(total_reward) # STORING TOTAL REWARD OF EACH EPISODE 27 | loss = 0 28 | 29 | for i,sars in enumerate(episode_experience): 30 | 31 | state, action, _, nextstate = sars 32 | target_reward = np.sum(reward_list[i:-1]) - np.mean(reward_list) 33 | reward_weight = torch.tensor(target_reward) # using monte carlo estimate for target, have i incorporated causality here?? 
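# Causality check: summing reward_list from index i onwards weights each action only by
# rewards obtained from that step on (the reward-to-go), with the mean episode reward
# subtracted as a simple baseline. Note the slice [i:-1] stops one element early and so
# drops the final reward of the episode; reward_list[i:] would include it.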
34 | action_logprob = -torch.log(policy_network(state))[action] 35 | loss += action_logprob*reward_weight 36 | 37 | optimizer.zero_grad() 38 | loss.backward() 39 | optimizer.step() 40 | if (episode+1)%100 == 0: 41 | print(f'{episode+1}th episode; average reward of past 100 episodes :', np.mean(rewards[-100:])) # PRINTING AVG OF LAST 100 EPISODES 42 | 43 | 44 | plot(rewards) 45 | 46 | # TESTING THE NETWORK 47 | 48 | done = False 49 | state = env.reset() 50 | policy_network.eval() 51 | while not done: 52 | env.render() 53 | action_probs = policy_network(state) 54 | action = int(np.random.choice(np.arange(action_space), p=action_probs.detach().numpy())) 55 | nextstate, reward, done, _ = env.step(action) 56 | if done: 57 | break 58 | state = nextstate 59 | 60 | -------------------------------------------------------------------------------- /Vanilla Policy gradient methods/REINFORCE/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Policynetwork(nn.Module): 5 | def __init__(self, state_space, action_space): 6 | super(Policynetwork, self).__init__() 7 | 8 | self.linear1 = nn.Linear(state_space, 64) 9 | self.linear2 = nn.Linear(64, 16) 10 | self.linear3 = nn.Linear(16, action_space) 11 | 12 | def forward(self, state): 13 | 14 | state = torch.FloatTensor(state) 15 | hidden1 = nn.functional.relu(self.linear1(state)) 16 | hidden2 = nn.functional.relu(self.linear2(hidden1)) 17 | action_preds = nn.functional.softmax(self.linear3(hidden2), dim=0) 18 | 19 | return action_preds -------------------------------------------------------------------------------- /Vanilla Policy gradient methods/REINFORCE/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from scipy.ndimage import gaussian_filter 4 | 5 | 6 | def generate_episode(env, policy): 7 | 8 | action_space = env.action_space.n 9 | experience = [] 10 | rewards = [] 11 | state = env.reset() 12 | action_probs = policy.forward(state) 13 | action = int(np.random.choice(np.arange(action_space), p=action_probs.detach().numpy())) # SELECTING ACTION ACCORDING TO THE PROBABILITY GIVEN BY NETWORK 14 | 15 | done = False 16 | while not done: 17 | nextstate, reward, done, _ = env.step(action) 18 | rewards.append(reward) # APPENDING REWARD AT EACH STEP 19 | experience.append((state, action, reward, nextstate)) # APPENDING (S,A,R,S') TUPLE 20 | state = nextstate 21 | action_probs = policy.forward(state) 22 | action = int(np.random.choice(np.arange(action_space), p=action_probs.detach().numpy())) # SELECTING ACTION 23 | 24 | return experience, rewards 25 | 26 | def plot(rewards): 27 | plt.plot(rewards) 28 | plt.xlabel('Episodes') 29 | plt.ylabel('Rewards') 30 | plt.plot(gaussian_filter(rewards, sigma=50)) 31 | plt.show() --------------------------------------------------------------------------------