├── .DS_Store ├── .gitattributes ├── Atari-GAN └── atari-gan-generation.py ├── BlackJack_First_Vist_MC ├── .DS_Store ├── BlackJack_MC_uda.py ├── Figure 2020-08-16 113558.png ├── Figure_1.png ├── Figure_2.png ├── Monte_Carlo_FirstVisit_BlackJack.py ├── __pycache__ │ └── plot_utils.cpython-38.pyc └── plot_utils.py ├── Cart_pole ├── .DS_Store ├── cartpole_ActionWrapper.py ├── cartpole_cross_entropy.py ├── cartpole_random.py ├── cartpole_random_monitor.py └── runs │ ├── Sep20_10-07-14_Pavans-MacBook-Pro.local-cartpole │ └── events.out.tfevents.1600576634.Pavans-MacBook-Pro.local │ ├── Sep20_10-07-59_Pavans-MacBook-Pro.local-cartpole │ └── events.out.tfevents.1600576679.Pavans-MacBook-Pro.local │ └── Sep29_12-52-32_Pavans-MacBook-Pro.local-cartpole │ └── events.out.tfevents.1601364152.Pavans-MacBook-Pro.local ├── Cross-Entrorpy ├── CEM_method.py └── checkpoint.pth ├── Deep-Q-Learning ├── .DS_Store ├── dqn_pong.py └── lib_dep │ ├── __pycache__ │ ├── dqn_model.cpython-38.pyc │ └── wrappers.cpython-38.pyc │ ├── dqn_model.py │ └── wrappers.py ├── Deep-Q-Network ├── Deep_Q_network.py ├── __pycache__ │ ├── dqn_agent.cpython-38.pyc │ └── model.cpython-38.pyc ├── dqn_agent.py ├── lunar_lander_test.py └── model.py ├── Discretization └── Discretization_udacity.py ├── Frozen_lake ├── .DS_Store ├── .ipynb_checkpoints │ └── Untitled-checkpoint.ipynb └── Frozen_lake_v_0.py ├── Gradient_Bandit ├── Figure_1.png └── gradient_bandit.py ├── K-armed-Bandit ├── .DS_Store ├── K-armed_Bandit-Problem.py └── output │ ├── actions.png │ └── rewards.png ├── K-armed-Greedy ├── Figure_1.png ├── Figure_2.png ├── Figure_3.png ├── Figure_4.png ├── Figure_5.png └── K-armed-Greedy-and-rest.py ├── Monte_Carlo_Frozen_lake ├── .DS_Store └── MC_Frozenlake.py ├── MountainCar_Q_learn └── mountainCarQlearn.py ├── Off_Policy_Monte_Carlo └── Off_Policy_MC.py ├── Pac-Man └── pacman_DQN.py ├── Ping_pong ├── .DS_Store └── ping_pong.py ├── Policy_eval_Grid_World ├── .DS_Store ├── Figure_1.png └── policy_eval_GridWorld.py ├── Q-Learning └── Q-learning.py ├── README.md ├── Reinforce └── policy_graident.py ├── Sarsa ├── Sarsa.py └── n-Sarsa_and_Sarsa(lambda).py ├── Temporal-Difference ├── TD_Udacity.py ├── __pycache__ │ ├── check_test.cpython-38.pyc │ └── plot_utils.cpython-38.pyc ├── check_test.py ├── plot_graph.png └── plot_utils.py ├── Tile-coding └── Tile_coding_Uda.py ├── Upper-Confidence-Bound ├── Figure_1.png └── UCB.py ├── _config.yml └── cheatsheet.pdf /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /Atari-GAN/atari-gan-generation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Sep 18 16:38:32 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import random 10 | import argparse 11 | import cv2 12 | 13 | #torch 14 | import torch 15 | import torch.nn as nn 16 | import torch.optim as optim 17 | from tensorboardX import SummaryWriter 18 | 19 | import torchvision.utils as vutils 20 | 21 | #GYM 22 | import gym 23 | import 
gym.spaces 24 | import numpy as np 25 | 26 | log = gym.logger 27 | log.setLevel(gym.logger.INFO) 28 | 29 | 30 | LATENT_VECTOR_SIZE = 100 31 | DISCR_FILTERS = 64 32 | GENER_FILTERS = 64 33 | BATCH_SIZE = 16 34 | 35 | # dimension input image will be rescaled 36 | IMAGE_SIZE = 64 37 | 38 | LEARNING_RATE = 0.0001 39 | REPORT_EVERY_ITER = 100 40 | SAVE_IMAGE_EVERY_ITER = 1000 41 | 42 | class InputWrapper(gym.ObservationWrapper): 43 | 44 | def __init__(self, *args): 45 | super(InputWrapper, self).__init__(*args) 46 | assert isinstance(self.observation_space,gym.spaces.Box) 47 | old_space = self.observation_space 48 | self.observation_space = gym.spaces.Box( 49 | self.observation(old_space.low), 50 | self.observation(old_space.high), 51 | dtype=np.float32) 52 | 53 | 54 | def observation(self,observation): 55 | 56 | #resizing the imagw 57 | new_obs = cv2.resize(observation , (IMAGE_SIZE,IMAGE_SIZE)) 58 | # transform (210, 160, 3) -> (3, 210, 160) 59 | new_obs = np.moveaxis(new_obs, 2, 0) 60 | return new_obs.astype(np.float32) 61 | 62 | class Discriminator(nn.Module): 63 | 64 | def __init__(self,input_shape): 65 | super(Discriminator,self).__init__() 66 | 67 | #convering images into single number 68 | self.conv_pipe = nn.Sequential( 69 | nn.Conv2d(in_channels= input_shape[0], out_channels = DISCR_FILTERS 70 | , kernel_size = 4 , stride=2 , padding=1), 71 | nn.ReLU(), 72 | 73 | nn.Conv2d(in_channels= DISCR_FILTERS, out_channels = DISCR_FILTERS *2 74 | , kernel_size = 4 , stride=2 , padding=1), 75 | nn.BatchNorm2d(DISCR_FILTERS *2), 76 | nn.ReLU(), 77 | 78 | nn.Conv2d(in_channels= DISCR_FILTERS *2, out_channels = DISCR_FILTERS *4 79 | , kernel_size = 4 , stride=2 , padding=1), 80 | nn.BatchNorm2d(DISCR_FILTERS *4), 81 | nn.ReLU(), 82 | 83 | nn.Conv2d(in_channels= DISCR_FILTERS *4, out_channels = DISCR_FILTERS *8 84 | , kernel_size = 4 , stride=2 , padding=1), 85 | nn.BatchNorm2d(DISCR_FILTERS *8), 86 | nn.ReLU(), 87 | 88 | nn.Conv2d(in_channels=DISCR_FILTERS * 8, out_channels=1, 89 | kernel_size=4, stride=1, padding=0), 90 | nn.Sigmoid() 91 | 92 | ) 93 | 94 | def forward(self,x): 95 | conv_out = self.conv_pipe(x) 96 | 97 | return conv_out.view(-1,1).squeeze(dim = 1) 98 | 99 | class Generator(nn.Module): 100 | 101 | def __init__(self,output_shape): 102 | super(Generator,self).__init__() 103 | # deconvise 104 | self.pipe = nn.Sequential( 105 | nn.ConvTranspose2d(in_channels = LATENT_VECTOR_SIZE, out_channels = GENER_FILTERS*8 106 | , kernel_size = 4,stride = 1, padding = 0), 107 | 108 | nn.BatchNorm2d(GENER_FILTERS *8), 109 | nn.ReLU(), 110 | 111 | nn.ConvTranspose2d(in_channels = GENER_FILTERS *8, out_channels = GENER_FILTERS*4 112 | , kernel_size = 4,stride = 2, padding = 1), 113 | 114 | nn.BatchNorm2d(GENER_FILTERS *4), 115 | nn.ReLU(), 116 | 117 | nn.ConvTranspose2d(in_channels =GENER_FILTERS *4, out_channels = GENER_FILTERS*2 118 | , kernel_size = 4,stride = 1, padding = 0), 119 | 120 | nn.BatchNorm2d(GENER_FILTERS *2), 121 | nn.ReLU(), 122 | 123 | nn.ConvTranspose2d(in_channels =GENER_FILTERS *2, out_channels = GENER_FILTERS 124 | , kernel_size = 4,stride = 1, padding = 0), 125 | 126 | nn.BatchNorm2d(GENER_FILTERS ), 127 | nn.ReLU(), 128 | 129 | nn.ConvTranspose2d(in_channels=GENER_FILTERS, out_channels=output_shape[0], 130 | kernel_size=4, stride=2, padding=1), 131 | nn.Tanh() 132 | 133 | 134 | ) 135 | 136 | def forward(self,x): 137 | return self.pipe(x) 138 | 139 | 140 | def iterate_batches(envs, batch_size = BATCH_SIZE): 141 | 142 | batch = [e.reset() for e in envs] 143 | env_gen = iter(lambda 
: random.choice(envs), None) 144 | 145 | 146 | while True: 147 | 148 | e = next(env_gen) 149 | 150 | obs, reward,is_done,_ = e.step(e.action_sample.sample()) 151 | 152 | if np.mean(obs) > 0.01: 153 | batch.append(obs) 154 | 155 | if len(batch) == batch_size: 156 | #normalising betn 1 and -1 157 | batch_np = np.array(batch, dtype=np.float32) 158 | batch_np *= 2.0 / 255.0 - 1.0 159 | 160 | yield torch.tensor(batch_np) 161 | batch.clear() 162 | 163 | if is_done: 164 | e.reset() 165 | 166 | if __name__ == 'main': 167 | 168 | parser= argparse.ArgumentParser() 169 | parser.add_argument( "--cuda", default=False, action='store_true', 170 | help="Enable cuda computation") 171 | 172 | args = parser.parse_args() 173 | 174 | device= torch.device('cuda' if args.cuda else 'cpu') 175 | 176 | envs = [ 177 | InputWrapper(gym.make(name)) 178 | for name in ('Breakout-v0','AirRaid-v0', 'Pong-v0') 179 | ] 180 | input_shape = envs[0].observation_space.shape 181 | 182 | net_discr = Discriminator(input_shape=input_shape).to(device) 183 | net_gener = Generator(output_shape=input_shape).to(device) 184 | 185 | objective = nn.BCELoss() 186 | 187 | gen_optimizer = optim.Adam(params=net_gener.parameters(), lr=LEARNING_RATE, 188 | betas=(0.5, 0.999)) 189 | 190 | dis_optimizer = optim.Adam( 191 | params=net_discr.parameters(), lr=LEARNING_RATE, 192 | betas=(0.5, 0.999)) 193 | writer = SummaryWriter() 194 | 195 | gen_losses = [] 196 | dis_losses = [] 197 | iter_no = 0 198 | 199 | true_labels_v = torch.ones(BATCH_SIZE, device=device) 200 | fake_labels_v = torch.zeros(BATCH_SIZE, device=device) 201 | 202 | 203 | for batch_v in iterate_batches(envs): 204 | # fake samples, input is 4D: batch, filters, x, y 205 | gen_input_v = torch.FloatTensor( 206 | BATCH_SIZE, LATENT_VECTOR_SIZE, 1, 1) 207 | gen_input_v.normal_(0, 1) 208 | gen_input_v = gen_input_v.to(device) 209 | batch_v = batch_v.to(device) 210 | gen_output_v = net_gener(gen_input_v) 211 | 212 | 213 | # train discriminator 214 | dis_optimizer.zero_grad() 215 | dis_output_true_v = net_discr(batch_v) 216 | dis_output_fake_v = net_discr(gen_output_v.detach()) 217 | dis_loss = objective(dis_output_true_v, true_labels_v) + \ 218 | objective(dis_output_fake_v, fake_labels_v) 219 | dis_loss.backward() 220 | dis_optimizer.step() 221 | dis_losses.append(dis_loss.item()) 222 | 223 | # train generator 224 | gen_optimizer.zero_grad() 225 | dis_output_v = net_discr(gen_output_v) 226 | gen_loss_v = objective(dis_output_v, true_labels_v) 227 | gen_loss_v.backward() 228 | gen_optimizer.step() 229 | gen_losses.append(gen_loss_v.item()) 230 | 231 | iter_no += 1 232 | if iter_no % REPORT_EVERY_ITER == 0: 233 | log.info("Iter %d: gen_loss=%.3e, dis_loss=%.3e", 234 | iter_no, np.mean(gen_losses), 235 | np.mean(dis_losses)) 236 | writer.add_scalar( 237 | "gen_loss", np.mean(gen_losses), iter_no) 238 | writer.add_scalar( 239 | "dis_loss", np.mean(dis_losses), iter_no) 240 | gen_losses = [] 241 | dis_losses = [] 242 | if iter_no % SAVE_IMAGE_EVERY_ITER == 0: 243 | writer.add_image("fake", vutils.make_grid( 244 | gen_output_v.data[:64], normalize=True), iter_no) 245 | writer.add_image("real", vutils.make_grid( 246 | batch_v.data[:64], normalize=True), iter_no) 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/BlackJack_First_Vist_MC/.DS_Store -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/BlackJack_MC_uda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Aug 16 11:20:06 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import sys 11 | import gym 12 | import numpy as np 13 | from collections import defaultdict 14 | 15 | from plot_utils import plot_blackjack_values, plot_policy 16 | 17 | env = gym.make('Blackjack-v0') 18 | 19 | #checking the env 20 | print(env.observation_space) 21 | print(env.action_space) 22 | print(" ") 23 | 24 | #using blackjack env to play random state 25 | 26 | for i in range(3): 27 | state = env.reset() 28 | while True: 29 | print(state) 30 | action = env.action_space.sample() 31 | state,reward,done,info= env.step(action) 32 | 33 | if done: 34 | 35 | print('End game! Reward: ', reward) 36 | print('You won :)\n') if reward > 0 else print('You lost :(\n') 37 | 38 | break 39 | 40 | 41 | def generate_episode_from_limit_stochastic(bj_env): 42 | episode = [] 43 | state = bj_env.reset() 44 | while True: 45 | probs = [0.8, 0.2] if state[0] > 18 else [0.2, 0.8] 46 | action = np.random.choice(np.arange(2), p=probs) 47 | next_state, reward, done, info = bj_env.step(action) 48 | episode.append((state, action, reward)) 49 | state = next_state 50 | if done: 51 | break 52 | return episode 53 | 54 | 55 | for i in range(3): 56 | print(generate_episode_from_limit_stochastic(env)) 57 | 58 | 59 | def mc_prediction_q(env, num_episodes, generate_episode, gamma=1.0): 60 | # initialize empty dictionaries of arrays 61 | returns_sum = defaultdict(lambda: np.zeros(env.action_space.n)) 62 | N = defaultdict(lambda: np.zeros(env.action_space.n)) 63 | Q = defaultdict(lambda: np.zeros(env.action_space.n)) 64 | # loop over episodes 65 | for i_episode in range(1, num_episodes+1): 66 | # monitor progress 67 | if i_episode % 1000 == 0: 68 | print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="") 69 | sys.stdout.flush() 70 | # generate an episode 71 | episode = generate_episode(env) 72 | # obtain the states, actions, and rewards 73 | states, actions, rewards = zip(*episode) 74 | # prepare for discounting 75 | discounts = np.array([gamma**i for i in range(len(rewards)+1)]) 76 | # update the sum of the returns, number of visits, and action-value 77 | # function estimates for each state-action pair in the episode 78 | for i, state in enumerate(states): 79 | returns_sum[state][actions[i]] += sum(rewards[i:]*discounts[:-(1+i)]) 80 | N[state][actions[i]] += 1.0 81 | Q[state][actions[i]] = returns_sum[state][actions[i]] / N[state][actions[i]] 82 | return Q 83 | 84 | 85 | # obtain the action-value function 86 | Q = mc_prediction_q(env, 500000, generate_episode_from_limit_stochastic) 87 | 88 | # obtain the corresponding state-value function 89 | V_to_plot = dict((k,(k[0]>18)*(np.dot([0.8, 0.2],v)) + (k[0]<=18)*(np.dot([0.2, 0.8],v))) \ 90 | for k, v in Q.items()) 91 | 92 | # plot the state-value function 93 | plot_blackjack_values(V_to_plot) 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/Figure 2020-08-16 113558.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/BlackJack_First_Vist_MC/Figure 2020-08-16 113558.png -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/BlackJack_First_Vist_MC/Figure_1.png -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/Figure_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/BlackJack_First_Vist_MC/Figure_2.png -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/Monte_Carlo_FirstVisit_BlackJack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 12 09:16:57 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | from mpl_toolkits.mplot3d import Axes3D 13 | 14 | import matplotlib as mpl 15 | from matplotlib import cm 16 | 17 | env = gym.make('Blackjack-v0') 18 | 19 | 20 | state_val = np.zeros((22,12,2)) 21 | 22 | state_count = np.zeros(state_val.shape) 23 | 24 | #Holds value at greater than 20 25 | policy = 20 26 | 27 | 28 | episodes = 100000 29 | 30 | for episode in range(episodes): 31 | complete = False 32 | s_0 = env.reset() 33 | 34 | G = [] 35 | states =[s_0] 36 | 37 | while complete == False: 38 | #implement the policy 39 | if s_0[0] >= policy: 40 | s_1,reward,complete , _ = env.step(0) 41 | 42 | else: 43 | s_1,reward,complete, _ = env.step(1) 44 | G.append(reward) 45 | states.append(s_1) 46 | 47 | 48 | if complete == True: 49 | 50 | for s_i , s in enumerate(states[:-1]): 51 | 52 | if s[2] == True: 53 | 54 | s_ace = 1 55 | 56 | else: 57 | 58 | s_ace = 0 59 | 60 | 61 | returns = np.mean(G[s_i:]) 62 | 63 | #update values 64 | 65 | state_count[s[0], s[1],s_ace] +=1 66 | 67 | state_val[s[0],s[1],s_ace] += (returns-state_val[s[0],s[1],s_ace])/state_count[s[0],s[1],s_ace] 68 | 69 | s_0 =s_1 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | fig = plt.figure(figsize=(12,8)) 78 | ax = fig.gca(projection='3d') 79 | player_range = np.arange(11, 22) 80 | dealer_range = np.arange(1, 11) 81 | 82 | X, Y = np.meshgrid(dealer_range, player_range) 83 | Z = state_val[11:22,1:11,0].reshape(X.shape) 84 | ax.plot_surface(X, Y, Z, cmap=cm.coolwarm, linewidth=1, 85 | rstride=1, cstride=1) 86 | ax.set_title("Without Ace") 87 | ax.set_xlabel("Dealer Showing") 88 | ax.set_ylabel("Player Hand") 89 | ax.set_zlabel("State Value") 90 | plt.show() 91 | 92 | # With usable ace 93 | fig = plt.figure(figsize=(12,8)) 94 | ax = fig.gca(projection='3d') 95 | player = np.arange(11, 22) 96 | dealer = np.arange(2, 12) 97 | 98 | X, Y = np.meshgrid(dealer_range, player_range) 99 | Z = state_val[11:22,1:11,1].reshape(X.shape) 100 | ax.plot_surface(X, Y, Z, cmap=cm.coolwarm, linewidth=1, 101 | rstride=1, cstride=1) 102 | ax.set_title("With Ace") 103 | ax.set_xlabel("Dealer Showing") 104 | ax.set_ylabel("Player Hand") 105 | ax.set_zlabel("State Value") 106 | plt.show() 107 | 108 
| 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/__pycache__/plot_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/BlackJack_First_Vist_MC/__pycache__/plot_utils.cpython-38.pyc -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/plot_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Aug 16 11:23:19 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import numpy as np 10 | from mpl_toolkits.mplot3d import Axes3D 11 | import matplotlib.pyplot as plt 12 | from mpl_toolkits.axes_grid1 import make_axes_locatable 13 | 14 | def plot_blackjack_values(V): 15 | 16 | def get_Z(x, y, usable_ace): 17 | if (x,y,usable_ace) in V: 18 | return V[x,y,usable_ace] 19 | else: 20 | return 0 21 | 22 | def get_figure(usable_ace, ax): 23 | x_range = np.arange(11, 22) 24 | y_range = np.arange(1, 11) 25 | X, Y = np.meshgrid(x_range, y_range) 26 | 27 | Z = np.array([get_Z(x,y,usable_ace) for x,y in zip(np.ravel(X), np.ravel(Y))]).reshape(X.shape) 28 | 29 | surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=plt.cm.coolwarm, vmin=-1.0, vmax=1.0) 30 | ax.set_xlabel('Player\'s Current Sum') 31 | ax.set_ylabel('Dealer\'s Showing Card') 32 | ax.set_zlabel('State Value') 33 | ax.view_init(ax.elev, -120) 34 | 35 | fig = plt.figure(figsize=(20, 20)) 36 | ax = fig.add_subplot(211, projection='3d') 37 | ax.set_title('Usable Ace') 38 | get_figure(True, ax) 39 | ax = fig.add_subplot(212, projection='3d') 40 | ax.set_title('No Usable Ace') 41 | get_figure(False, ax) 42 | plt.show() 43 | 44 | def plot_policy(policy): 45 | 46 | def get_Z(x, y, usable_ace): 47 | if (x,y,usable_ace) in policy: 48 | return policy[x,y,usable_ace] 49 | else: 50 | return 1 51 | 52 | def get_figure(usable_ace, ax): 53 | x_range = np.arange(11, 22) 54 | y_range = np.arange(10, 0, -1) 55 | X, Y = np.meshgrid(x_range, y_range) 56 | Z = np.array([[get_Z(x,y,usable_ace) for x in x_range] for y in y_range]) 57 | surf = ax.imshow(Z, cmap=plt.get_cmap('Pastel2', 2), vmin=0, vmax=1, extent=[10.5, 21.5, 0.5, 10.5]) 58 | plt.xticks(x_range) 59 | plt.yticks(y_range) 60 | plt.gca().invert_yaxis() 61 | ax.set_xlabel('Player\'s Current Sum') 62 | ax.set_ylabel('Dealer\'s Showing Card') 63 | ax.grid(color='w', linestyle='-', linewidth=1) 64 | divider = make_axes_locatable(ax) 65 | cax = divider.append_axes("right", size="5%", pad=0.1) 66 | cbar = plt.colorbar(surf, ticks=[0,1], cax=cax) 67 | cbar.ax.set_yticklabels(['0 (STICK)','1 (HIT)']) 68 | 69 | fig = plt.figure(figsize=(15, 15)) 70 | ax = fig.add_subplot(121) 71 | ax.set_title('Usable Ace') 72 | get_figure(True, ax) 73 | ax = fig.add_subplot(122) 74 | ax.set_title('No Usable Ace') 75 | get_figure(False, ax) 76 | plt.show() -------------------------------------------------------------------------------- /Cart_pole/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Cart_pole/.DS_Store 
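A minimal sketch (not part of the repository) of the incremental first-visit Monte Carlo update used in Monte_Carlo_FirstVisit_BlackJack.py and mc_prediction_q above; the episode returns below are made-up numbers purely for illustration.

# Incremental running mean: V(s) <- V(s) + (G - V(s)) / N(s),
# equivalent to averaging all first-visit returns observed for state s.
value, count = 0.0, 0
for G in [1.0, -1.0, 1.0, 1.0]:   # hypothetical returns for one state
    count += 1
    value += (G - value) / count
print(value)                       # 0.5, the mean of the four returns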
-------------------------------------------------------------------------------- /Cart_pole/cartpole_ActionWrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Sep 17 12:56:05 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | from typing import TypeVar 11 | import random 12 | 13 | Action = TypeVar('Action') 14 | 15 | 16 | class RandomActionWrapper(gym.ActionWrapper): 17 | def __init__(self, env, epsilon=0.1): 18 | super(RandomActionWrapper, self).__init__(env) 19 | self.epsilon = epsilon 20 | 21 | def action(self, action: Action) -> Action: 22 | if random.random() < self.epsilon: 23 | print("Random!") 24 | return self.env.action_space.sample() 25 | return action 26 | 27 | 28 | if __name__ == "__main__": 29 | env = RandomActionWrapper(gym.make("CartPole-v0")) 30 | 31 | obs = env.reset() 32 | total_reward = 0.0 33 | 34 | while True: 35 | obs, reward, done, _ = env.step(0) 36 | total_reward += reward 37 | env.render() 38 | if done: 39 | break 40 | 41 | print("Reward got: %.2f" % total_reward) -------------------------------------------------------------------------------- /Cart_pole/cartpole_cross_entropy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Sep 19 22:37:30 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | from collections import namedtuple 11 | import numpy as np 12 | from tensorboardX import SummaryWriter 13 | 14 | import torch 15 | import torch.nn as nn 16 | import torch.optim as optim 17 | 18 | 19 | 20 | 21 | 22 | HIDDEN_SIZE = 128 23 | BATCH_SIZE = 16 24 | PERCENTILE = 70 25 | 26 | 27 | class Net(nn.Module): 28 | def __init__(self, obs_size, hidden_size, n_actions): 29 | super(Net, self).__init__() 30 | self.net = nn.Sequential( 31 | nn.Linear(obs_size, hidden_size), 32 | nn.ReLU(), 33 | nn.Linear(hidden_size, n_actions) 34 | ) 35 | 36 | def forward(self, x): 37 | return self.net(x) 38 | 39 | 40 | Episode = namedtuple('Episode', field_names=['reward', 'steps']) 41 | EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action']) 42 | 43 | 44 | def iterate_batches(env, net, batch_size): 45 | batch = [] 46 | episode_reward = 0.0 47 | episode_steps = [] 48 | obs = env.reset() 49 | sm = nn.Softmax(dim=1) 50 | while True: 51 | obs_v = torch.FloatTensor([obs]) 52 | act_probs_v = sm(net(obs_v)) 53 | act_probs = act_probs_v.data.numpy()[0] 54 | action = np.random.choice(len(act_probs), p=act_probs) 55 | next_obs, reward, is_done, _ = env.step(action) 56 | episode_reward += reward 57 | step = EpisodeStep(observation=obs, action=action) 58 | episode_steps.append(step) 59 | if is_done: 60 | e = Episode(reward=episode_reward, steps=episode_steps) 61 | batch.append(e) 62 | episode_reward = 0.0 63 | episode_steps = [] 64 | next_obs = env.reset() 65 | if len(batch) == batch_size: 66 | yield batch 67 | batch = [] 68 | obs = next_obs 69 | 70 | 71 | def filter_batch(batch, percentile): 72 | rewards = list(map(lambda s: s.reward, batch)) 73 | reward_bound = np.percentile(rewards, percentile) 74 | reward_mean = float(np.mean(rewards)) 75 | 76 | train_obs = [] 77 | train_act = [] 78 | for reward, steps in batch: 79 | if reward < reward_bound: 80 | continue 81 | train_obs.extend(map(lambda step: step.observation, steps)) 82 | train_act.extend(map(lambda step: step.action, steps)) 83 | 84 | train_obs_v = 
torch.FloatTensor(train_obs) 85 | train_act_v = torch.LongTensor(train_act) 86 | return train_obs_v, train_act_v, reward_bound, reward_mean 87 | 88 | 89 | if __name__ == "__main__": 90 | env = gym.make("CartPole-v0") 91 | # env = gym.wrappers.Monitor(env, directory="mon", force=True) 92 | obs_size = env.observation_space.shape[0] 93 | n_actions = env.action_space.n 94 | 95 | net = Net(obs_size, HIDDEN_SIZE, n_actions) 96 | objective = nn.CrossEntropyLoss() 97 | optimizer = optim.Adam(params=net.parameters(), lr=0.01) 98 | writer = SummaryWriter(comment="-cartpole") 99 | 100 | for iter_no, batch in enumerate(iterate_batches( 101 | env, net, BATCH_SIZE)): 102 | obs_v, acts_v, reward_b, reward_m = \ 103 | filter_batch(batch, PERCENTILE) 104 | optimizer.zero_grad() 105 | action_scores_v = net(obs_v) 106 | loss_v = objective(action_scores_v, acts_v) 107 | loss_v.backward() 108 | optimizer.step() 109 | print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" % ( 110 | iter_no, loss_v.item(), reward_m, reward_b)) 111 | writer.add_scalar("loss", loss_v.item(), iter_no) 112 | writer.add_scalar("reward_bound", reward_b, iter_no) 113 | writer.add_scalar("reward_mean", reward_m, iter_no) 114 | if reward_m > 199: 115 | print("Solved!") 116 | break 117 | writer.close() -------------------------------------------------------------------------------- /Cart_pole/cartpole_random.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Sep 17 12:17:45 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | 11 | if __name__ == "__main__": 12 | 13 | env = gym.make("CartPole-v0") 14 | total_reward= 0.0 15 | total_steps = 0 16 | obs = env.reset() 17 | 18 | while True: 19 | action = env.action_space.sample() 20 | obs,reward,done,_ = env.step(action) 21 | total_reward += reward 22 | total_steps+=1 23 | env.render() 24 | 25 | if done: 26 | break 27 | print("Episode done in %d steps, total reward %.2f" % ( total_steps, total_reward)) 28 | 29 | 30 | -------------------------------------------------------------------------------- /Cart_pole/cartpole_random_monitor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Sep 17 14:22:39 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | 11 | 12 | if __name__ == "__main__": 13 | env = gym.make("CartPole-v0") 14 | env = gym.wrappers.Monitor(env, "recording", force = True) 15 | 16 | total_reward = 0.0 17 | total_steps = 0 18 | obs = env.reset() 19 | 20 | while True: 21 | action = env.action_space.sample() 22 | obs, reward, done, _ = env.step(action) 23 | total_reward += reward 24 | total_steps += 1 25 | if done: 26 | break 27 | 28 | print("Episode done in %d steps, total reward %.2f" % ( 29 | total_steps, total_reward)) 30 | env.close() 31 | env.env.close() -------------------------------------------------------------------------------- /Cart_pole/runs/Sep20_10-07-14_Pavans-MacBook-Pro.local-cartpole/events.out.tfevents.1600576634.Pavans-MacBook-Pro.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Cart_pole/runs/Sep20_10-07-14_Pavans-MacBook-Pro.local-cartpole/events.out.tfevents.1600576634.Pavans-MacBook-Pro.local 
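A small illustrative sketch of the percentile filtering idea behind filter_batch in cartpole_cross_entropy.py above (and the elite-fraction selection in the CEM script that follows); the reward values are invented for the example and are not output from the scripts.

import numpy as np

# Keep only episodes whose total reward reaches the 70th percentile,
# then train on the (observation, action) pairs of those elite episodes.
rewards = np.array([10., 40., 60., 90.])   # hypothetical episode rewards
bound = np.percentile(rewards, 70)         # 63.0 for this batch
elite = [i for i, r in enumerate(rewards) if r >= bound]
print(bound, elite)                        # 63.0 [3]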
-------------------------------------------------------------------------------- /Cart_pole/runs/Sep20_10-07-59_Pavans-MacBook-Pro.local-cartpole/events.out.tfevents.1600576679.Pavans-MacBook-Pro.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Cart_pole/runs/Sep20_10-07-59_Pavans-MacBook-Pro.local-cartpole/events.out.tfevents.1600576679.Pavans-MacBook-Pro.local -------------------------------------------------------------------------------- /Cart_pole/runs/Sep29_12-52-32_Pavans-MacBook-Pro.local-cartpole/events.out.tfevents.1601364152.Pavans-MacBook-Pro.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Cart_pole/runs/Sep29_12-52-32_Pavans-MacBook-Pro.local-cartpole/events.out.tfevents.1601364152.Pavans-MacBook-Pro.local -------------------------------------------------------------------------------- /Cross-Entrorpy/CEM_method.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Aug 25 11:50:01 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | #importing stuff 10 | 11 | import gym 12 | import numpy as np 13 | import math 14 | from collections import deque 15 | import matplotlib.pyplot as plt 16 | 17 | #torch importing 18 | import torch 19 | import torch.nn as nn 20 | import torch.nn.functional as F 21 | from torch.autograd import Variable 22 | 23 | #Initalizing the environment 24 | 25 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 26 | 27 | env = gym.make('MountainCarContinuous-v0') 28 | env.seed(101) 29 | np.random.seed(101) 30 | 31 | print('observation space:', env.observation_space) 32 | print('action space:', env.action_space) 33 | print(' - low:', env.action_space.low) 34 | print(' - high:', env.action_space.high) 35 | 36 | 37 | #Creating the agent 38 | 39 | class Agent(nn.Module): 40 | 41 | 42 | def __init__(self,env, h_size = 16): 43 | super(Agent, self).__init__() 44 | self.env = env 45 | 46 | #state 47 | self.s_size = env.observation_space.shape[0] 48 | #Hidden layer 49 | self.h_size = h_size 50 | #actionsize 51 | self.a_size = env.action_space.shape[0] 52 | 53 | #defining the layers 54 | self.fc1 = nn.Linear(self.s_size, self.h_size) 55 | self.fc2 = nn.Linear(self.h_size,self.a_size) 56 | 57 | 58 | def set_weights(self, weights): 59 | 60 | s_size = self.s_size 61 | h_size= self.h_size 62 | a_size = self.a_size 63 | 64 | # seprate the weighs for each layer 65 | fc1_end = (s_size * h_size) + h_size 66 | fc1_W = torch.from_numpy(weights[: s_size * h_size].reshape(s_size,h_size)) 67 | fc1_b = torch.from_numpy(weights[ s_size* h_size : fc1_end]) 68 | fc2_W = torch.from_numpy(weights[fc1_end :fc1_end +(h_size * a_size)] . 
reshape(h_size,a_size)) 69 | fc2_b = torch.from_numpy(weights[fc1_end + (h_size * a_size) : ]) 70 | 71 | #set the weights for each layer 72 | self.fc1.weight.data.copy_(fc1_W.view_as(self.fc1.weight.data)) 73 | self.fc1.bias.data.copy_(fc1_b.view_as(self.fc1.bias.data)) 74 | self.fc2.weight.data.copy_(fc2_W.view_as(self.fc2.weight.data)) 75 | self.fc2.bias.data.copy_(fc2_b.view_as(self.fc2.bias.data)) 76 | 77 | def get_weights_dim(self): 78 | return (self.s_size +1) * self.h_size + (self.h_size +1)* self.a_size 79 | 80 | def forward(self, x): 81 | x = F.relu(self.fc1(x)) 82 | x = F.tanh(self.fc2(x)) 83 | return x.cpu().data 84 | 85 | def evaluate(self, weights, gamma=1.0, max_t=5000): 86 | self.set_weights(weights) 87 | episode_return = 0.0 88 | state = self.env.reset() 89 | for t in range(max_t): 90 | state = torch.from_numpy(state).float().to(device) 91 | action = self.forward(state) 92 | state, reward, done, _ = self.env.step(action) 93 | episode_return += reward * math.pow(gamma, t) 94 | if done: 95 | break 96 | return episode_return 97 | 98 | 99 | agent = Agent(env).to(device) 100 | 101 | 102 | def cem(n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, elite_frac=0.2, sigma=0.5): 103 | 104 | n_elite=int(pop_size*elite_frac) 105 | 106 | scores_deque = deque(maxlen=100) 107 | scores = [] 108 | best_weight = sigma*np.random.randn(agent.get_weights_dim()) 109 | 110 | for i_iteration in range(1, n_iterations+1): 111 | weights_pop = [best_weight + (sigma*np.random.randn(agent.get_weights_dim())) for i in range(pop_size)] 112 | rewards = np.array([agent.evaluate(weights, gamma, max_t) for weights in weights_pop]) 113 | 114 | elite_idxs = rewards.argsort()[-n_elite:] 115 | elite_weights = [weights_pop[i] for i in elite_idxs] 116 | best_weight = np.array(elite_weights).mean(axis=0) 117 | 118 | reward = agent.evaluate(best_weight, gamma=1.0) 119 | scores_deque.append(reward) 120 | scores.append(reward) 121 | 122 | torch.save(agent.state_dict(), 'checkpoint.pth') 123 | 124 | if i_iteration % print_every == 0: 125 | print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, np.mean(scores_deque))) 126 | 127 | if np.mean(scores_deque)>=90.0: 128 | print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(i_iteration-100, np.mean(scores_deque))) 129 | break 130 | return scores 131 | 132 | scores = cem() 133 | 134 | # plot the scores 135 | fig = plt.figure() 136 | ax = fig.add_subplot(111) 137 | plt.plot(np.arange(1, len(scores)+1), scores) 138 | plt.ylabel('Score') 139 | plt.xlabel('Episode #') 140 | plt.show() 141 | 142 | 143 | agent.load_state_dict(torch.load('checkpoint.pth')) 144 | 145 | state = env.reset() 146 | while True: 147 | state = torch.from_numpy(state).float().to(device) 148 | with torch.no_grad(): 149 | action = agent(state) 150 | env.render() 151 | next_state, reward, done, _ = env.step(action) 152 | state = next_state 153 | if done: 154 | break 155 | 156 | env.close() 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /Cross-Entrorpy/checkpoint.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Cross-Entrorpy/checkpoint.pth -------------------------------------------------------------------------------- 
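A quick check (added here for clarity, not part of the repository) of the flat weight-vector size that Agent.get_weights_dim in CEM_method.py reports for MountainCarContinuous-v0, whose observation space is 2-dimensional and action space 1-dimensional, with the default hidden size of 16; the breakdown just spells out the weight/bias counting.

s_size, h_size, a_size = 2, 16, 1
fc1 = s_size * h_size + h_size        # 32 weights + 16 biases = 48
fc2 = h_size * a_size + a_size        # 16 weights +  1 bias   = 17
assert fc1 + fc2 == (s_size + 1) * h_size + (h_size + 1) * a_size == 65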
/Deep-Q-Learning/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Deep-Q-Learning/.DS_Store -------------------------------------------------------------------------------- /Deep-Q-Learning/dqn_pong.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Sep 21 10:56:19 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | from lib_dep import wrappers 10 | from lib_dep import dqn_model 11 | 12 | import argparse 13 | import time 14 | import numpy as np 15 | import collections 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.optim as optim 20 | 21 | from tensorboardX import SummaryWriter 22 | 23 | 24 | DEFAULT_ENV_NAME = "PongNoFrameskip-v4" 25 | MEAN_REWARD_BOUND = 19 26 | 27 | GAMMA = 0.99 28 | BATCH_SIZE = 32 29 | REPLAY_SIZE = 10000 30 | REPLAY_START_SIZE = 10000 31 | LEARNING_RATE = 1e-4 32 | SYNC_TARGET_FRAMES = 1000 33 | 34 | EPSILON_DECAY_LAST_FRAME = 150000 35 | EPSILON_START = 1.0 36 | EPSILON_FINAL = 0.01 37 | 38 | Experience = collections.namedtuple( 'Experience', field_names = ['state', 'action', 39 | 'reward', 'done', 'new_state']) 40 | 41 | class Experience_Buffer: 42 | 43 | def __init__(self,capacity): 44 | self.buffer = collections.deque(maxlen = capacity) 45 | 46 | 47 | def __len__(self): 48 | return len(self.buffer) 49 | 50 | # adding the experience to the buffer 51 | def append(self,experience): 52 | self.buffer.append(experience) 53 | 54 | def sample(self, batch_size): 55 | indices = np.random.choice(len(self.buffer),batch_size,replace= False) 56 | 57 | states,actions,rewards,dones,next_states= \ 58 | zip(*[self.buffer[idx] for idx in indices]) 59 | 60 | 61 | return np.array(states), np.array(actions), \ 62 | np.array(rewards, dtype=np.float32), \ 63 | np.array(dones, dtype=np.uint8), \ 64 | np.array(next_states) 65 | 66 | 67 | class Agent: 68 | 69 | def __init__(self, env,exp_buffer): 70 | 71 | self.env = env 72 | self.exp_buffer = exp_buffer 73 | self._reset() 74 | 75 | def _reset(self): 76 | self.state = self.env.reset() 77 | self.total_reward = 0.0 78 | 79 | 80 | @torch.no_grad() 81 | def play_step(self, net, epsilon=0.0, device="cpu"): 82 | done_reward = None 83 | 84 | if np.random.random() < epsilon: 85 | action = self.env.action_space.sample() 86 | else: 87 | state_a = np.array([self.state], copy=False) 88 | state_v = torch.tensor(state_a).to(device) 89 | q_vals_v = net(state_v) 90 | _, act_v = torch.max(q_vals_v, dim=1) 91 | action = int(act_v.item()) 92 | 93 | 94 | 95 | new_state, reward, is_done, _ = self.env.step(action) 96 | self.total_reward += reward 97 | 98 | exp = Experience(self.state, action, reward, 99 | is_done, new_state) 100 | self.exp_buffer.append(exp) 101 | self.state = new_state 102 | if is_done: 103 | done_reward = self.total_reward 104 | self._reset() 105 | return done_reward 106 | 107 | 108 | def calc_loss(batch, net, tgt_net , device = 'cpu'): 109 | 110 | states,actions,rewards,dones ,next_states = batch 111 | 112 | states_v= torch.tensor(np.array(states, copy= False)).to(device) 113 | 114 | next_states_v = torch.tensor(np.array(next_states, copy = False)).to(device) 115 | 116 | actions_v = torch.tensor(actions).to(device) 117 | rewards_v = torch.tensor(rewards).to(device) 118 | done_mask = torch.BoolTensor(dones).to(device) 119 | 120 | state_action_values = 
net(states_v).gather( 121 | 1, actions_v.unsqueeze(-1)).squeeze(-1) 122 | 123 | with torch.no_grad(): 124 | next_state_values = tgt_net(next_states_v).max(1)[0] 125 | next_state_values[done_mask] = 0.0 126 | next_state_values = next_state_values.detach() 127 | 128 | expected_state_action_values = next_state_values * GAMMA + rewards_v 129 | 130 | return nn.MSELoss()(state_action_values, expected_state_action_values) 131 | 132 | 133 | if __name__ == "__main__": 134 | parser = argparse.ArgumentParser() 135 | parser.add_argument("--cuda", default=False, 136 | action="store_true", help="Enable cuda") 137 | parser.add_argument("--env", default=DEFAULT_ENV_NAME, 138 | help="Name of the environment, default=" + 139 | DEFAULT_ENV_NAME) 140 | args = parser.parse_args() 141 | device = torch.device("cuda" if args.cuda else "cpu") 142 | 143 | env = wrappers.make_env(args.env) 144 | 145 | net = dqn_model.DQN(env.observation_space.shape, 146 | env.action_space.n).to(device) 147 | tgt_net = dqn_model.DQN(env.observation_space.shape, 148 | env.action_space.n).to(device) 149 | writer = SummaryWriter(comment="-" + args.env) 150 | print(net) 151 | 152 | buffer = Experience_Buffer(REPLAY_SIZE) 153 | agent = Agent(env, buffer) 154 | epsilon = EPSILON_START 155 | 156 | optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) 157 | total_rewards = [] 158 | frame_idx = 0 159 | ts_frame = 0 160 | ts = time.time() 161 | best_m_reward = None 162 | 163 | while True: 164 | frame_idx += 1 165 | epsilon = max(EPSILON_FINAL, EPSILON_START - 166 | frame_idx / EPSILON_DECAY_LAST_FRAME) 167 | 168 | reward = agent.play_step(net, epsilon, device=device) 169 | env.render() 170 | if reward is not None: 171 | total_rewards.append(reward) 172 | speed = (frame_idx - ts_frame) / (time.time() - ts) 173 | ts_frame = frame_idx 174 | ts = time.time() 175 | m_reward = np.mean(total_rewards[-100:]) 176 | print("%d: done %d games, reward %.3f, " 177 | "eps %.2f, speed %.2f f/s" % ( 178 | frame_idx, len(total_rewards), m_reward, epsilon, 179 | speed 180 | )) 181 | writer.add_scalar("epsilon", epsilon, frame_idx) 182 | writer.add_scalar("speed", speed, frame_idx) 183 | writer.add_scalar("reward_100", m_reward, frame_idx) 184 | writer.add_scalar("reward", reward, frame_idx) 185 | if best_m_reward is None or best_m_reward < m_reward: 186 | torch.save(net.state_dict(), args.env + 187 | "-best_%.0f.dat" % m_reward) 188 | if best_m_reward is not None: 189 | print("Best reward updated %.3f -> %.3f" % ( 190 | best_m_reward, m_reward)) 191 | best_m_reward = m_reward 192 | if m_reward > MEAN_REWARD_BOUND: 193 | print("Solved in %d frames!" 
% frame_idx) 194 | break 195 | 196 | if len(buffer) < REPLAY_START_SIZE: 197 | continue 198 | 199 | if frame_idx % SYNC_TARGET_FRAMES == 0: 200 | tgt_net.load_state_dict(net.state_dict()) 201 | 202 | optimizer.zero_grad() 203 | batch = buffer.sample(BATCH_SIZE) 204 | loss_t = calc_loss(batch, net, tgt_net, device=device) 205 | loss_t.backward() 206 | optimizer.step() 207 | writer.close() 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /Deep-Q-Learning/lib_dep/__pycache__/dqn_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Deep-Q-Learning/lib_dep/__pycache__/dqn_model.cpython-38.pyc -------------------------------------------------------------------------------- /Deep-Q-Learning/lib_dep/__pycache__/wrappers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Deep-Q-Learning/lib_dep/__pycache__/wrappers.cpython-38.pyc -------------------------------------------------------------------------------- /Deep-Q-Learning/lib_dep/dqn_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Sep 20 22:19:54 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import torch 10 | import torch.nn as nn 11 | import numpy as np 12 | 13 | class DQN(nn.Module): 14 | 15 | def __init__(self,input_shape, n_actions): 16 | super(DQN,self).__init__() 17 | 18 | self.conv = nn.Sequential( 19 | 20 | nn.Conv2d(in_channels = input_shape[0], out_channels = 32, 21 | kernel_size = 4 , stride = 2), 22 | nn.ReLU(), 23 | 24 | nn.Conv2d(in_channels = 32, out_channels = 64, 25 | kernel_size = 8 , stride = 4), 26 | nn.ReLU(), 27 | 28 | nn.Conv2d(in_channels = 64, out_channels = 64, 29 | kernel_size = 3 , stride = 1), 30 | nn.ReLU(), 31 | 32 | 33 | ) 34 | 35 | conv_out_size = self._get_conv_out(input_shape) 36 | self.fc = nn.Sequential( 37 | nn.Linear(conv_out_size, 512), 38 | nn.ReLU(), 39 | nn.Linear(512, n_actions) 40 | ) 41 | 42 | 43 | def _get_conv_out(self, shape): 44 | o = self.conv(torch.zeros(1, *shape)) 45 | return int(np.prod(o.size())) 46 | 47 | def forward(self, x): 48 | conv_out = self.conv(x).view(x.size()[0], -1) 49 | return self.fc(conv_out) 50 | -------------------------------------------------------------------------------- /Deep-Q-Learning/lib_dep/wrappers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Sep 20 20:27:55 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import cv2 10 | import gym 11 | import gym.spaces 12 | import numpy as np 13 | import collections 14 | 15 | class FireResetEnv(gym.Wrapper): 16 | 17 | def __init__(self,env = None): 18 | super(FireResetEnv,self).__init__(env) 19 | 20 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 21 | assert len(env.unwrapped.get_action_meanings()) >= 3 22 | 23 | def step(self, action): 24 | return self.env.step(action) 25 | 26 | def reset(self): 27 | self.env.reset() 28 | obs, _, done, _ = self.env.step(1) 29 | if done: 30 | self.env.reset() 31 | obs, 
_, done, _ = self.env.step(2) 32 | if done: 33 | self.env.reset() 34 | return obs 35 | 36 | class MaxAndSkipEnv(gym.Wrapper): 37 | 38 | def __init__(self, env =None, skip = 4): 39 | 40 | 41 | super(MaxAndSkipEnv, self).__init__(env) 42 | self._obs_buffer = collections.deque(maxlen=2) 43 | self._skip = skip 44 | 45 | def step(self, action): 46 | total_reward = 0.0 47 | done = None 48 | for _ in range(self._skip): 49 | obs, reward, done, info = self.env.step(action) 50 | self._obs_buffer.append(obs) 51 | total_reward += reward 52 | if done: 53 | break 54 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 55 | return max_frame, total_reward, done, info 56 | def reset(self): 57 | 58 | self._obs_buffer.clear() 59 | obs = self.env.reset() 60 | self._obs_buffer.append(obs) 61 | return obs 62 | 63 | class ProcessFrame84(gym.ObservationWrapper): 64 | 65 | def __init__(self, env = None): 66 | super(ProcessFrame84,self).__init__(env) 67 | self.observation_space = gym.spaces.Box( 68 | low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) 69 | 70 | 71 | def observation(self, obs): 72 | return ProcessFrame84.process(obs) 73 | 74 | 75 | @staticmethod 76 | def process(frame): 77 | if frame.size == 210 * 160 * 3: 78 | img = np.reshape(frame, [210, 160, 3]).astype( 79 | np.float32) 80 | elif frame.size == 250 * 160 * 3: 81 | img = np.reshape(frame, [250, 160, 3]).astype( 82 | np.float32) 83 | else: 84 | assert False, "Unknown resolution." 85 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + \ 86 | img[:, :, 2] * 0.114 87 | resized_screen = cv2.resize( 88 | img, (84, 110), interpolation=cv2.INTER_AREA) 89 | x_t = resized_screen[18:102, :] 90 | x_t = np.reshape(x_t, [84, 84, 1]) 91 | return x_t.astype(np.uint8) 92 | 93 | class ImageToPyTorch(gym.ObservationWrapper): 94 | def __init__(self, env): 95 | super(ImageToPyTorch, self).__init__(env) 96 | old_shape = self.observation_space.shape 97 | new_shape = (old_shape[-1], old_shape[0], old_shape[1]) 98 | self.observation_space = gym.spaces.Box( 99 | low=0.0, high=1.0, shape=new_shape, dtype=np.float32) 100 | 101 | def observation(self, observation): 102 | return np.moveaxis(observation, 2, 0) 103 | 104 | 105 | 106 | class ScaledFloatFrame(gym.ObservationWrapper): 107 | def observation(self, obs): 108 | return np.array(obs).astype(np.float32) / 255.0 109 | 110 | 111 | class BufferWrapper(gym.ObservationWrapper): 112 | 113 | def __init__(self, env,n_steps,dtype = np.float32): 114 | super(BufferWrapper, self).__init__(env) 115 | self.dtype = dtype 116 | old_space = env.observation_space 117 | self.observation_space = gym.spaces.Box( 118 | old_space.low.repeat(n_steps, axis=0), 119 | old_space.high.repeat(n_steps, axis=0), dtype=dtype) 120 | 121 | 122 | def reset(self): 123 | self.buffer = np.zeros_like( 124 | self.observation_space.low, dtype=self.dtype) 125 | return self.observation(self.env.reset()) 126 | 127 | def observation(self, observation): 128 | self.buffer[:-1] = self.buffer[1:] 129 | self.buffer[-1] = observation 130 | return self.buffer 131 | 132 | 133 | def make_env(env_name): 134 | env = gym.make(env_name) 135 | env = MaxAndSkipEnv(env) 136 | env = FireResetEnv(env) 137 | env = ProcessFrame84(env) 138 | env = ImageToPyTorch(env) 139 | env = BufferWrapper(env, 4) 140 | return ScaledFloatFrame(env) 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /Deep-Q-Network/Deep_Q_network.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Aug 22 09:59:22 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | import gym 9 | import random 10 | import torch 11 | import numpy as np 12 | from collections import deque 13 | import matplotlib.pyplot as plt 14 | 15 | env = gym.make('LunarLander-v2') 16 | env.seed(0) 17 | print('State shape: ', env.observation_space.shape) 18 | print('Number of actions: ', env.action_space.n) 19 | 20 | # a random agent 21 | from dqn_agent import Agent 22 | 23 | agent = Agent(state_size=8, action_size=4, seed=0) 24 | 25 | # watch an untrained agenta 26 | state = env.reset() 27 | for j in range(200): 28 | action = agent.act(state) 29 | #env.render() 30 | state, reward, done, _ = env.step(action) 31 | if done: 32 | break 33 | 34 | env.close() 35 | 36 | def dqn(n_episodes=100, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995): 37 | 38 | scores = [] # list containing scores from each episode 39 | scores_window = deque(maxlen=100) # last 100 scores 40 | eps = eps_start # initialize epsilon 41 | for i_episode in range(1, n_episodes+1): 42 | state = env.reset() 43 | score = 0 44 | for t in range(max_t): 45 | action = agent.act(state, eps) 46 | next_state, reward, done, _ = env.step(action) 47 | agent.step(state, action, reward, next_state, done) 48 | state = next_state 49 | score += reward 50 | if done: 51 | break 52 | scores_window.append(score) # save most recent score 53 | scores.append(score) # save most recent score 54 | eps = max(eps_end, eps_decay*eps) # decrease epsilon 55 | print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="") 56 | if i_episode % 100 == 0: 57 | print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window))) 58 | if np.mean(scores_window)>=200.0: 59 | print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window))) 60 | torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth') 61 | break 62 | return scores 63 | 64 | scores = dqn() 65 | 66 | # plot the scores 67 | fig = plt.figure() 68 | ax = fig.add_subplot(111) 69 | plt.plot(np.arange(len(scores)), scores) 70 | plt.ylabel('Score') 71 | plt.xlabel('Episode #') 72 | plt.show() 73 | 74 | agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth')) 75 | 76 | for i in range(3): 77 | state = env.reset() 78 | for j in range(200): 79 | action = agent.act(state) 80 | env.render() 81 | state, reward, done, _ = env.step(action) 82 | if done: 83 | break 84 | 85 | env.close() 86 | 87 | -------------------------------------------------------------------------------- /Deep-Q-Network/__pycache__/dqn_agent.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Deep-Q-Network/__pycache__/dqn_agent.cpython-38.pyc -------------------------------------------------------------------------------- /Deep-Q-Network/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Deep-Q-Network/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /Deep-Q-Network/dqn_agent.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Aug 22 08:46:19 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import numpy as np 10 | import random 11 | from collections import namedtuple, deque 12 | 13 | from model import QNetwork 14 | 15 | import torch 16 | import torch.nn.functional as F 17 | import torch.optim as optim 18 | 19 | BUFFER_SIZE = int(1e5) #Replay Buffer size 20 | BATCH_SIZE = 64 # min Batch size 21 | GAMMA = 0.9 # discount Factor 22 | TAU = 1e-3 #for soft update of target parameters 23 | LR = 5e-4 # learning rate 24 | UPDATE_EVERY = 4 #How often do you update the network 25 | 26 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 27 | 28 | 29 | 30 | 31 | 32 | class Agent(): 33 | 34 | def __init__(self, state_size, action_size, seed): 35 | 36 | self.state_size = state_size 37 | self.action_size = action_size 38 | self.seed = random.seed(seed) 39 | 40 | # Q-Network 41 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) 42 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) 43 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) 44 | 45 | # Replay memory 46 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) 47 | # Initialize time step (for updating every UPDATE_EVERY steps) 48 | self.t_step = 0 49 | 50 | def step(self, state, action, reward, next_state, done): 51 | # Save experience in replay memory 52 | self.memory.add(state, action, reward, next_state, done) 53 | 54 | # Learn every UPDATE_EVERY time steps. 55 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 56 | if self.t_step == 0: 57 | # If enough samples are available in memory, get random subset and learn 58 | if len(self.memory) > BATCH_SIZE: 59 | experiences = self.memory.sample() 60 | self.learn(experiences, GAMMA) 61 | 62 | def act(self, state, eps=0.): 63 | 64 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 65 | self.qnetwork_local.eval() 66 | with torch.no_grad(): 67 | action_values = self.qnetwork_local(state) 68 | self.qnetwork_local.train() 69 | 70 | # Epsilon-greedy action selection 71 | if random.random() > eps: 72 | return np.argmax(action_values.cpu().data.numpy()) 73 | else: 74 | return random.choice(np.arange(self.action_size)) 75 | 76 | def learn(self, experiences, gamma): 77 | 78 | states, actions, rewards, next_states, dones = experiences 79 | 80 | # Get max predicted Q values (for next states) from target model 81 | Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) 82 | # Compute Q targets for current states 83 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) 84 | 85 | # Get expected Q values from local model 86 | Q_expected = self.qnetwork_local(states).gather(1, actions) 87 | 88 | # Compute loss 89 | loss = F.mse_loss(Q_expected, Q_targets) 90 | # Minimize the loss 91 | self.optimizer.zero_grad() 92 | loss.backward() 93 | self.optimizer.step() 94 | 95 | # ------------------- update target network ------------------- # 96 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 97 | 98 | def soft_update(self, local_model, target_model, tau): 99 | 100 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 101 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) 102 | 103 | 104 | class ReplayBuffer: 105 | """Fixed-size buffer to store 
experience tuples.""" 106 | 107 | def __init__(self, action_size, buffer_size, batch_size, seed): 108 | 109 | self.action_size = action_size 110 | self.memory = deque(maxlen=buffer_size) 111 | self.batch_size = batch_size 112 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 113 | self.seed = random.seed(seed) 114 | 115 | def add(self, state, action, reward, next_state, done): 116 | """Add a new experience to memory.""" 117 | e = self.experience(state, action, reward, next_state, done) 118 | self.memory.append(e) 119 | 120 | def sample(self): 121 | """Randomly sample a batch of experiences from memory.""" 122 | experiences = random.sample(self.memory, k=self.batch_size) 123 | 124 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 125 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) 126 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 127 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device) 128 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device) 129 | 130 | return (states, actions, rewards, next_states, dones) 131 | 132 | def __len__(self): 133 | """Return the current size of internal memory.""" 134 | return len(self.memory) 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /Deep-Q-Network/lunar_lander_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Aug 22 14:33:12 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | import numpy as np 11 | 12 | 13 | # creating the environment 14 | 15 | env = gym.make('LunarLander-v2') 16 | env.seed(0) 17 | print('State shape: ', env.observation_space.shape) 18 | print('Number of actions: ', env.action_space.n) 19 | 20 | 21 | # creating a random agent 22 | env.reset() 23 | 24 | score = 0 25 | 26 | for i in range(500): 27 | action = env.action_space.sample() 28 | env.render() 29 | state,reward,done,info = env.step(action) 30 | score += reward 31 | if done: 32 | break 33 | 34 | env.close() 35 | 36 | 37 | print("Score is:", score) -------------------------------------------------------------------------------- /Deep-Q-Network/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Aug 22 07:53:55 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import torch 10 | 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | 15 | class QNetwork(nn.Module): 16 | 17 | def __init__(self,state_size,action_size,seed,fc1_units = 64,fc2_units = 64): 18 | 19 | super(QNetwork,self).__init__() 20 | self.seed= torch.manual_seed(seed) 21 | self.fc1= nn.Linear(state_size,fc1_units) # number of nodes hidden in first hidden layer 22 | self.fc2 = nn.Linear(fc1_units,fc2_units) # number of nodes hidden in first hidden layer 23 | self.fc3 = nn.Linear(fc2_units,action_size) 24 | 25 | def forward(self,state): 26 | 27 | x = F.relu(self.fc1(state)) 28 | x = F.relu(self.fc2(x)) 29 | return self.fc3(x) 30 | 31 | 32 | 33 | 34 | 35 | 36 | 
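A minimal usage sketch of the QNetwork defined in model.py above, sized for the LunarLander-v2 state and action dimensions that the training script prints (8 and 4); it only checks the output shape and is an illustration, not code from the repository.

import torch
from model import QNetwork

net = QNetwork(state_size=8, action_size=4, seed=0)
dummy_state = torch.zeros(1, 8)        # a batch containing one zero-state
q_values = net(dummy_state)            # forward() returns one Q-value per action
print(q_values.shape)                  # torch.Size([1, 4])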
-------------------------------------------------------------------------------- /Discretization/Discretization_udacity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Aug 20 15:09:47 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | #Importing stuff 11 | 12 | import sys 13 | import gym 14 | import numpy as np 15 | 16 | import pandas as pd 17 | import matplotlib.pyplot as plt 18 | import matplotlib.collections as mc 19 | plt.style.use('ggplot') 20 | np.set_printoptions(precision=3, linewidth=120) 21 | 22 | #creating the environment 23 | 24 | env= gym.make('MountainCar-v0') 25 | env.seed(505) #Random seeding 26 | 27 | #creating a Random agent 28 | state= env.reset() 29 | score = 0 30 | for t in range(200): 31 | action = env.action_space.sample() 32 | #env.render() 33 | state,reward, done,_ = env.step(action) 34 | score += reward 35 | if done: 36 | break 37 | 38 | print('Final Score: ',score) 39 | env.close() 40 | 41 | 42 | 43 | 44 | # now we have to create a uniform grid in this environment 45 | 46 | def create_uniform_grid(low,high,bins= (10,10)): 47 | 48 | grid = [np.linspace(low[dim], high[dim],bins[dim]+1)[1 :-1] for dim in range(len(bins))] 49 | 50 | print("Uniform grid: [, ] / => ") 51 | for l, h, b, splits in zip(low, high, bins, grid): 52 | print(" [{}, {}] / {} => {}".format(l, h, b, splits)) 53 | return grid 54 | 55 | 56 | 57 | low = [-1.0, -5.0] 58 | high = [1.0, 5.0] 59 | create_uniform_grid(low, high) # testing 60 | 61 | # we are going to discretize 62 | 63 | def discretize(sample, gird): 64 | return list(int(np.digitize(s,g)) for s ,g in zip(sample,gird)) # applying on each dimension 65 | 66 | # testing 67 | grid = create_uniform_grid([-1.0, -5.0], [1.0, 5.0]) 68 | samples = np.array( 69 | [[-1.0 , -5.0], 70 | [-0.81, -4.1], 71 | [-0.8 , -4.0], 72 | [-0.5 , 0.0], 73 | [ 0.2 , -1.9], 74 | [ 0.8 , 4.0], 75 | [ 0.81, 4.1], 76 | [ 1.0 , 5.0]]) 77 | discretized_samples = np.array([discretize(sample, grid) for sample in samples]) 78 | print("\nSamples:", repr(samples), sep="\n") 79 | print("\nDiscretized samples:", repr(discretized_samples), sep="\n") 80 | 81 | 82 | 83 | def visualize_samples(samples, discretized_samples, grid, low=None, high=None): 84 | """Visualize original and discretized samples on a given 2-dimensional grid.""" 85 | 86 | fig, ax = plt.subplots(figsize=(10, 10)) 87 | 88 | # Show grid 89 | ax.xaxis.set_major_locator(plt.FixedLocator(grid[0])) 90 | ax.yaxis.set_major_locator(plt.FixedLocator(grid[1])) 91 | ax.grid(True) 92 | 93 | # If bounds (low, high) are specified, use them to set axis limits 94 | if low is not None and high is not None: 95 | ax.set_xlim(low[0], high[0]) 96 | ax.set_ylim(low[1], high[1]) 97 | else: 98 | # Otherwise use first, last grid locations as low, high (for further mapping discretized samples) 99 | low = [splits[0] for splits in grid] 100 | high = [splits[-1] for splits in grid] 101 | 102 | # Map each discretized sample (which is really an index) to the center of corresponding grid cell 103 | grid_extended = np.hstack((np.array([low]).T, grid, np.array([high]).T)) # add low and high ends 104 | grid_centers = (grid_extended[:, 1:] + grid_extended[:, :-1]) / 2 # compute center of each grid cell 105 | locs = np.stack(grid_centers[i, discretized_samples[:, i]] for i in range(len(grid))).T # map discretized samples 106 | 107 | ax.plot(samples[:, 0], samples[:, 1], 'o') # plot original samples 108 | ax.plot(locs[:, 0], 
locs[:, 1], 's') # plot discretized samples in mapped locations 109 | ax.add_collection(mc.LineCollection(list(zip(samples, locs)), colors='orange')) # add a line connecting each original-discretized sample 110 | ax.legend(['original', 'discretized']) 111 | 112 | 113 | visualize_samples(samples, discretized_samples, grid, low, high) 114 | 115 | #Create a grid to discretize the state space 116 | state_grid = create_uniform_grid(env.observation_space.low, env.observation_space.high, bins=(10, 10)) 117 | state_grid 118 | # Obtain some samples from the space, discretize them, and then visualize them 119 | state_samples = np.array([env.observation_space.sample() for i in range(10)]) 120 | discretized_state_samples = np.array([discretize(sample, state_grid) for sample in state_samples]) 121 | visualize_samples(state_samples, discretized_state_samples, state_grid, 122 | env.observation_space.low, env.observation_space.high) 123 | plt.xlabel('position'); plt.ylabel('velocity'); # axis labels for MountainCar-v0 state space 124 | 125 | 126 | # now as we are done with the discretization stuff let's get to Q learning 127 | 128 | 129 | class QLearningAgent: 130 | 131 | # we can use this Agent to act on contious space by discretizing it 132 | 133 | def __init__(self,env,state_grid,alpha = 0.02,gamma = 0.99, epsilon = 1.0, 134 | epsilon_decay_rate = 0.9995,min_epsilon = .01,seed = 505): 135 | 136 | 137 | # Environment Info 138 | self.env = env 139 | self.state_grid = state_grid 140 | self.state_size = tuple(len(splits) +1 for splits in self.state_grid) # n dimendional space 141 | self.action_size = self.env.action_space.n #dimensional discrete space size 142 | self.seed = np.random.seed(seed) 143 | print(" ") 144 | print("Environment:", self.env) 145 | print("State space size:", self.state_size) 146 | print("Action space size:", self.action_size) 147 | print(" ") 148 | 149 | 150 | #Learning parameters 151 | self.alpha = alpha # learning rate 152 | self.gamma = gamma # discount factor 153 | self.epsilon = self.inital_epsilon = epsilon #Exploratory factor 154 | self.epsilon_decay_rate = epsilon_decay_rate # how quickly should we decrease the epsilon 155 | self.min_epsilon = epsilon 156 | 157 | #creating a Q table 158 | self.q_table = np.zeros(shape = (self.state_size +(self.action_size,))) 159 | 160 | print("Q table size:", self.q_table.shape) 161 | print(" ") 162 | 163 | 164 | def preprocess_state(self,state): 165 | return tuple(discretize(state, self.state_grid)) 166 | 167 | def reset_episode(self,state): 168 | 169 | #Gradually decreasing the exploratory rate 170 | 171 | self.epsilon *= self.epsilon_decay_rate 172 | self.epsilon = max(self.epsilon,self.min_epsilon) 173 | 174 | self.last_state = self.preprocess_state(state) 175 | self.last_action = np.argmax(self.q_table[self.last_state]) 176 | return self.last_action 177 | 178 | def reset_exploration(self,epsilon = None): 179 | 180 | self.epsilon = epsilon if epsilon is not None else self.initial_epsilon 181 | 182 | def act(self,state , reward = None, done = None, mode = 'train'): 183 | 184 | state = self.preprocess_state(state) 185 | 186 | if mode == 'test': 187 | 188 | action = np.argmax(self.q_table[state]) 189 | 190 | else: 191 | 192 | self.q_table[self.last_state + (self.last_action,)] += self.alpha * (reward + self.gamma * max(self.q_table[state]) - self.q_table[self.last_state + (self.last_action,)]) 193 | 194 | # Exploration vs. 
exploitation 195 | do_exploration = np.random.uniform(0, 1) < self.epsilon 196 | if do_exploration: 197 | # Pick a random action 198 | action = np.random.randint(0, self.action_size) 199 | else: 200 | # Pick the best action from Q table 201 | action = np.argmax(self.q_table[state]) 202 | 203 | self.last_state = state 204 | self.last_action = action 205 | return action 206 | 207 | q_agent = QLearningAgent(env, state_grid) 208 | 209 | 210 | #Running the agent 211 | 212 | def run(agent,env, num_episodes = 20000,mode = 'train'): 213 | 214 | scores = [] 215 | max_avg_score = -np.inf 216 | 217 | for i_episode in range(1, num_episodes +1): 218 | state = env.reset() 219 | action= agent.reset_episode(state) 220 | total_reward = 0 221 | done = False 222 | 223 | while not done: 224 | 225 | state,reward,done,info = env.step(action) 226 | total_reward += reward 227 | action = agent.act(state,reward,done,mode) 228 | 229 | 230 | 231 | #save final scores 232 | scores.append(total_reward) 233 | 234 | #print episode stats 235 | 236 | if mode == 'train': 237 | if len(scores) > 100: 238 | avg_score = np.mean(scores[-100:]) 239 | if avg_score > max_avg_score: 240 | max_avg_score = avg_score 241 | if i_episode % 100 == 0: 242 | print("\rEpisode {}/{} | Max Average Score: {}".format(i_episode, num_episodes, max_avg_score), end="") 243 | sys.stdout.flush() 244 | 245 | return scores 246 | 247 | scores = run(q_agent, env) 248 | 249 | # Plot scores obtained per episode 250 | plt.plot(scores); plt.title("Scores"); 251 | 252 | def plot_scores(scores, rolling_window=100): 253 | """Plot scores and optional rolling mean using specified window.""" 254 | plt.plot(scores); plt.title("Scores"); 255 | rolling_mean = pd.Series(scores).rolling(rolling_window).mean() 256 | plt.plot(rolling_mean); 257 | return rolling_mean 258 | 259 | rolling_mean = plot_scores(scores) 260 | 261 | # Run in test mode and analyze scores obtained 262 | test_scores = run(q_agent, env, num_episodes=100, mode='test') 263 | print("[TEST] Completed {} episodes with avg. 
score = {}".format(len(test_scores), np.mean(test_scores))) 264 | _ = plot_scores(test_scores) 265 | 266 | def plot_q_table(q_table): 267 | """Visualize max Q-value for each state and corresponding action.""" 268 | q_image = np.max(q_table, axis=2) # max Q-value for each state 269 | q_actions = np.argmax(q_table, axis=2) # best action for each state 270 | 271 | fig, ax = plt.subplots(figsize=(10, 10)) 272 | cax = ax.imshow(q_image, cmap='jet'); 273 | cbar = fig.colorbar(cax) 274 | for x in range(q_image.shape[0]): 275 | for y in range(q_image.shape[1]): 276 | ax.text(x, y, q_actions[x, y], color='white', 277 | horizontalalignment='center', verticalalignment='center') 278 | ax.grid(False) 279 | ax.set_title("Q-table, size: {}".format(q_table.shape)) 280 | ax.set_xlabel('position') 281 | ax.set_ylabel('velocity') 282 | 283 | 284 | plot_q_table(q_agent.q_table) 285 | 286 | state_grid_new = create_uniform_grid(env.observation_space.low, env.observation_space.high, bins=(20, 20)) 287 | q_agent_new = QLearningAgent(env, state_grid_new) 288 | q_agent_new.scores = [] 289 | 290 | q_agent_new.scores += run(q_agent_new, env, num_episodes=50000) # accumulate scores 291 | rolling_mean_new = plot_scores(q_agent_new.scores) 292 | 293 | 294 | plot_q_table(q_agent_new.q_table) 295 | 296 | state = env.reset() 297 | score = 0 298 | for t in range(200): 299 | action = q_agent_new.act(state, mode='test') 300 | env.render() 301 | state, reward, done, _ = env.step(action) 302 | score += reward 303 | if done: 304 | break 305 | print('Final score:', score) 306 | env.close() 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | -------------------------------------------------------------------------------- /Frozen_lake/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Frozen_lake/.DS_Store -------------------------------------------------------------------------------- /Frozen_lake/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /Frozen_lake/Frozen_lake_v_0.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | env = gym.make('FrozenLake-v0') 4 | 5 | 6 | def policy_evaluation(policy, environment, discount_factor=1.0, theta=1e-9, max_iterations=1e9): 7 | # Number of evaluation iterations 8 | evaluation_iterations = 1 9 | # Initialize a value function for each state as zero 10 | V = np.zeros(environment.nS) 11 | # Repeat until change in value is below the threshold 12 | for i in range(int(max_iterations)): 13 | 14 | # Initialize a change of value function as zero 15 | delta = 0 16 | # Iterate though each state 17 | for state in range(environment.nS): 18 | 19 | # Initial a new value of current state 20 | v = 0 21 | # Try all possible actions which can be taken from this state 22 | for action, action_probability in enumerate(policy[state]): 23 | # Check how good next state will be 24 | for 
state_probability, next_state, reward, terminated in environment.P[state][action]: 25 | # Calculate the expected value 26 | v += action_probability * state_probability * (reward + discount_factor * V[next_state]) 27 | 28 | # Calculate the absolute change of value function 29 | delta = max(delta, np.abs(V[state] - v)) 30 | # Update value function 31 | V[state] = v 32 | evaluation_iterations += 1 33 | 34 | # Terminate if value change is insignificant 35 | if delta < theta: 36 | 37 | print(f'Policy evaluated in {evaluation_iterations} iterations.') 38 | return V 39 | 40 | def one_step_lookahead(environment, state, V, discount_factor): 41 | action_values = np.zeros(environment.nA) 42 | for action in range(environment.nA): 43 | for probability, next_state, reward, terminated in environment.P[state][action]: 44 | action_values[action] += probability * (reward + discount_factor * V[next_state]) 45 | return action_values 46 | 47 | 48 | def policy_iteration(environment, discount_factor=1.0, max_iterations=1e9): 49 | # Start with a random policy 50 | #num states x num actions / num actions 51 | policy = np.ones([environment.nS, environment.nA]) / environment.nA 52 | # Initialize counter of evaluated policies 53 | evaluated_policies = 1 54 | # Repeat until convergence or critical number of iterations reached 55 | for i in range(int(max_iterations)): 56 | stable_policy = True 57 | # Evaluate current policy 58 | V = policy_evaluation(policy, environment, discount_factor=discount_factor) 59 | # Go through each state and try to improve actions that were taken (policy Improvement) 60 | for state in range(environment.nS): 61 | # Choose the best action in a current state under current policy 62 | current_action = np.argmax(policy[state]) 63 | # Look one step ahead and evaluate if current action is optimal 64 | # We will try every possible action in a current state 65 | action_value = one_step_lookahead(environment, state, V, discount_factor) 66 | # Select a better action 67 | best_action = np.argmax(action_value) 68 | # If action didn't change 69 | if current_action != best_action: 70 | stable_policy = True 71 | # Greedy policy update 72 | policy[state] = np.eye(environment.nA)[best_action] 73 | evaluated_policies += 1 74 | # If the algorithm converged and policy is not changing anymore, then return final policy and value function 75 | if stable_policy: 76 | print(f'Evaluated {evaluated_policies} policies.') 77 | return policy, V 78 | 79 | def value_iteration(environment, discount_factor=1.0, theta=1e-9, max_iterations=1e9): 80 | # Initialize state-value function with zeros for each environment state 81 | V = np.zeros(environment.nS) 82 | for i in range(int(max_iterations)): 83 | # Early stopping condition 84 | delta = 0 85 | # Update each state 86 | for state in range(environment.nS): 87 | # Do a one-step lookahead to calculate state-action values 88 | action_value = one_step_lookahead(environment, state, V, discount_factor) 89 | # Select best action to perform based on the highest state-action value 90 | best_action_value = np.max(action_value) 91 | # Calculate change in value 92 | delta = max(delta, np.abs(V[state] - best_action_value)) 93 | # Update the value function for current state 94 | V[state] = best_action_value 95 | # Check if we can stop 96 | if delta < theta: 97 | print(f'Value-iteration converged at iteration#{i}.') 98 | break 99 | 100 | # Create a deterministic policy using the optimal value function 101 | policy = np.zeros([environment.nS, environment.nA]) 102 | for state in 
range(environment.nS): 103 | # One step lookahead to find the best action for this state 104 | action_value = one_step_lookahead(environment, state, V, discount_factor) 105 | # Select best action based on the highest state-action value 106 | best_action = np.argmax(action_value) 107 | # Update the policy to perform a better action at a current state 108 | policy[state, best_action] = 1.0 109 | return policy, V 110 | 111 | 112 | 113 | def play_episodes(environment, n_episodes, policy): 114 | wins = 0 115 | total_reward = 0 116 | for episode in range(n_episodes): 117 | terminated = False 118 | state = environment.reset() 119 | while not terminated: 120 | # Select best action to perform in a current state 121 | action = np.argmax(policy[state]) 122 | # Perform an action an observe how environment acted in response 123 | next_state, reward, terminated, info = environment.step(action) 124 | # Summarize total reward 125 | total_reward += reward 126 | # Update current state 127 | state = next_state 128 | # Calculate number of wins over episodes 129 | if terminated and reward == 1.0: 130 | wins += 1 131 | average_reward = total_reward / n_episodes 132 | return wins, total_reward, average_reward 133 | 134 | # Number of episodes to play 135 | n_episodes = 10000 136 | # Functions to find best policy 137 | solvers = [('Policy Iteration', policy_iteration), 138 | ('Value Iteration', value_iteration)] 139 | for iteration_name, iteration_func in solvers: 140 | # Load a Frozen Lake environment 141 | environment = gym.make('FrozenLake-v0') 142 | # Search for an optimal policy using policy iteration 143 | policy, V = iteration_func(environment.env) 144 | # Apply best policy to the real environment 145 | wins, total_reward, average_reward = play_episodes(environment, n_episodes, policy) 146 | print(f'{iteration_name} :: number of wins over {n_episodes} episodes = {wins}') 147 | print(f'{iteration_name} :: average reward over {n_episodes} episodes = {average_reward} \n\n') 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /Gradient_Bandit/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Gradient_Bandit/Figure_1.png -------------------------------------------------------------------------------- /Gradient_Bandit/gradient_bandit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Jul 2 08:35:45 2020 5 | 6 | @author: pavankunchala 7 | 8 | """ 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | """ 14 | in Gradient algortihm we use preferences if the following actions is more prefered the agent is more 15 | likely to prefer that action we can find the preference using softmax 16 | """ 17 | # def softmax(x): 18 | # return np.exp(x - x.max()) /np.sum(np.exp(x - x.max()),axis = 0) 19 | 20 | 21 | class grad_bandit: 22 | 23 | def __init__(self, k, alpha, iters, mu='random'): 24 | # Number of arms 25 | self.k = k 26 | self.actions = np.arange(k) 27 | # Number of iterations 28 | self.iters = iters 29 | # Step count 30 | self.n = 1 31 | # Step count for each arm 32 | self.k_n = np.ones(k) 33 | # Total mean reward 34 | 
self.mean_reward = 0 35 | self.reward = np.zeros(iters) 36 | # Mean reward for each arm 37 | self.k_reward = np.zeros(k) 38 | # Initialize preferences 39 | self.H = np.zeros(k) 40 | # Learning rate 41 | self.alpha = alpha 42 | 43 | if type(mu) == list or type(mu).__module__ == np.__name__: 44 | # User-defined averages 45 | self.mu = np.array(mu) 46 | elif mu == 'random': 47 | # Draw means from probability distribution 48 | self.mu = np.random.normal(0, 1, k) 49 | elif mu == 'sequence': 50 | # Increase the mean for each arm by one 51 | self.mu = np.linspace(0, k-1, k) 52 | 53 | def softmax(self): 54 | self.prob_action = np.exp(self.H - np.max(self.H)) \ 55 | / np.sum(np.exp(self.H - np.max(self.H)), axis=0) 56 | 57 | def pull(self): 58 | # Update probabilities 59 | self.softmax() 60 | # Select highest preference action 61 | a = np.random.choice(self.actions, p=self.prob_action) 62 | 63 | reward = np.random.normal(self.mu[a], 1) 64 | 65 | # Update counts 66 | self.n += 1 67 | self.k_n[a] += 1 68 | 69 | # Update total 70 | self.mean_reward = self.mean_reward + ( 71 | reward - self.mean_reward) / self.n 72 | 73 | # Update results for a_k 74 | self.k_reward[a] = self.k_reward[a] + ( 75 | reward - self.k_reward[a]) / self.k_n[a] 76 | 77 | # Update preferences 78 | self.H[a] = self.H[a] + \ 79 | self.alpha * (reward - self.mean_reward) * (1 - 80 | self.prob_action[a]) 81 | actions_not_taken = self.actions!=a 82 | self.H[actions_not_taken] = self.H[actions_not_taken] - self.alpha * (reward - self.mean_reward) * self.prob_action[actions_not_taken] 83 | 84 | def run(self): 85 | for i in range(self.iters): 86 | self.pull() 87 | self.reward[i] = self.mean_reward 88 | 89 | def reset(self, mu=None): 90 | # Resets results while keeping settings 91 | self.n = 0 92 | self.k_n = np.zeros(self.k) 93 | self.mean_reward = 0 94 | self.reward = np.zeros(iters) 95 | self.k_reward = np.zeros(self.k) 96 | self.H = np.zeros(self.k) 97 | if mu == 'random': 98 | self.mu = np.random.normal(0, 1, self.k) 99 | 100 | 101 | 102 | 103 | 104 | k = 10 105 | iters = 1000 106 | # Initialize bandits 107 | grad = grad_bandit(k, 0.1, iters, mu='random') 108 | 109 | grad_rewards = np.zeros(iters) 110 | opt_grad = 0 111 | 112 | episodes = 1000 113 | # Run experiments 114 | for i in range(episodes): 115 | # Reset counts and rewards 116 | grad.reset('random') 117 | 118 | 119 | grad.run() 120 | 121 | grad_rewards = grad_rewards + ( 122 | grad.reward - grad_rewards) / (i + 1) 123 | 124 | opt_grad += grad.k_n[np.argmax(grad.mu)] 125 | 126 | 127 | 128 | 129 | 130 | 131 | plt.figure(figsize=(12,8)) 132 | plt.plot(grad_rewards, label="Gradient") 133 | 134 | plt.xlabel("Iterations") 135 | plt.ylabel("Average Reward") 136 | plt.title("Average Gradient Bandit Rewards after " 137 | + str(episodes) + " Episodes") 138 | plt.show() 139 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /K-armed-Bandit/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Bandit/.DS_Store -------------------------------------------------------------------------------- /K-armed-Bandit/K-armed_Bandit-Problem.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | np.random.seed(0) 5 | 6 | class Environment: 7 | 8 | 9 | def 
__init__(self,probs): 10 | #succesfull probabilties for each arm 11 | self.probs = probs 12 | 13 | def step(self,action): 14 | return 1 if(np.random.random() < self.probs[action]) else 0 15 | 16 | 17 | class Agent: 18 | 19 | def __init__(self,nActions,eps): 20 | self.nActions= nActions 21 | self.eps = eps 22 | self.n = np.zeros(nActions,dtype = np.int) #action values n(a) 23 | self.Q = np.zeros(nActions,dtype = np.float) # value Q(a) 24 | 25 | 26 | 27 | 28 | def update_Q(self,action,reward): 29 | 30 | # this is formulae for finding the Q(a) 31 | # New estimate = Old estimate +1/n(Reward - old Estimate) 32 | self.n[action] += 1 33 | self.Q[action] += (1.0/self.n[action])*(reward - self.Q[action] ) 34 | 35 | 36 | def get_action(self): 37 | 38 | # epislon greedy policy 39 | # explore % times and exploit 1 -% times 40 | if np.random.random() < self.eps: 41 | 42 | #explore 43 | return(np.random.randint(self.nActions)) 44 | else: #exploit 45 | return np.random.choice(np.flatnonzero(self.Q == self.Q.max())) 46 | 47 | # Multi armed bandit simulation 48 | 49 | def experiment(probs , N_episodes): 50 | 51 | env = Environment(probs) # initalizinf the arm probablites 52 | agent = Agent(len(env.probs),eps) 53 | actions, rewards = [], [] 54 | 55 | for episodes in range(N_episodes): 56 | action = agent.get_action() 57 | reward = env.step(action) 58 | agent.update_Q(action, reward) 59 | actions.append(action) 60 | rewards.append(reward) 61 | 62 | return np.array(actions),np.array(rewards) 63 | 64 | 65 | 66 | #Settings 67 | 68 | probs = [0.10, 0.50, 0.60, 0.80, 0.10, 69 | 0.25, 0.60, 0.45, 0.75, 0.65] # bandit arm probabilities of success 70 | N_experiments = 10000 # number of experiments to perform 71 | N_steps = 500 # number of steps (episodes) 72 | eps = 0.1 # probability of random exploration (fraction) 73 | save_fig = True # save file in same directory 74 | output_dir = os.path.join(os.getcwd(), "output") 75 | 76 | # Run multi-armed bandit experiments 77 | print("Running multi-armed bandits with nActions = {}, eps = {}".format(len(probs), eps)) 78 | R = np.zeros((N_steps,)) # reward history sum 79 | A = np.zeros((N_steps, len(probs))) # action history sum 80 | for i in range(N_experiments): 81 | actions, rewards = experiment(probs, N_steps) # perform experiment 82 | if (i + 1) % (N_experiments / 100) == 0: 83 | print("[Experiment {}/{}] ".format(i + 1, N_experiments) + 84 | "n_steps = {}, ".format(N_steps) + 85 | "reward_avg = {}".format(np.sum(rewards) / len(rewards))) 86 | R += rewards 87 | for j, a in enumerate(actions): 88 | A[j][a] += 1 89 | 90 | # Plot reward results 91 | R_avg = R / np.float(N_experiments) 92 | plt.plot(R_avg, ".") 93 | plt.xlabel("Step") 94 | plt.ylabel("Average Reward") 95 | plt.grid() 96 | ax = plt.gca() 97 | plt.xlim([1, N_steps]) 98 | if save_fig: 99 | if not os.path.exists(output_dir): os.mkdir(output_dir) 100 | plt.savefig(os.path.join(output_dir, "rewards.png"), bbox_inches="tight") 101 | else: 102 | plt.show() 103 | plt.close() 104 | 105 | # Plot action results 106 | for i in range(len(probs)): 107 | A_pct = 100 * A[:,i] / N_experiments 108 | steps = list(np.array(range(len(A_pct)))+1) 109 | plt.plot(steps, A_pct, "-", 110 | linewidth=4, 111 | label="Arm {} ({:.0f}%)".format(i+1, 100*probs[i])) 112 | plt.xlabel("Step") 113 | plt.ylabel("Count Percentage (%)") 114 | leg = plt.legend(loc='upper left', shadow=True) 115 | plt.xlim([1, N_steps]) 116 | plt.ylim([0, 100]) 117 | for legobj in leg.legendHandles: 118 | legobj.set_linewidth(4.0) 119 | if save_fig: 120 | if not 
os.path.exists(output_dir): os.mkdir(output_dir) 121 | plt.savefig(os.path.join(output_dir, "actions.png"), bbox_inches="tight") 122 | else: 123 | plt.show() 124 | plt.close() 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /K-armed-Bandit/output/actions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Bandit/output/actions.png -------------------------------------------------------------------------------- /K-armed-Bandit/output/rewards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Bandit/output/rewards.png -------------------------------------------------------------------------------- /K-armed-Greedy/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Greedy/Figure_1.png -------------------------------------------------------------------------------- /K-armed-Greedy/Figure_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Greedy/Figure_2.png -------------------------------------------------------------------------------- /K-armed-Greedy/Figure_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Greedy/Figure_3.png -------------------------------------------------------------------------------- /K-armed-Greedy/Figure_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Greedy/Figure_4.png -------------------------------------------------------------------------------- /K-armed-Greedy/Figure_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Greedy/Figure_5.png -------------------------------------------------------------------------------- /K-armed-Greedy/K-armed-Greedy-and-rest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 24 15:55:24 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import pandas as pd 13 | 14 | 15 | class eps_bandit: 16 | ''' 17 | epsilon-greedy k-bandit problem 18 | 19 | Inputs 20 | ===================================================== 21 | k: number of arms (int) 22 | eps: probability of random action 0 < eps < 1 (float) 23 | iters: number of steps (int) 24 | mu: set the average rewards for each of the k-arms. 25 | Set to "random" for the rewards to be selected from 26 | a normal distribution with mean = 0. 27 | Set to "sequence" for the means to be ordered from 28 | 0 to k-1. 
29 | Pass a list or array of length = k for user-defined 30 | values. 31 | ''' 32 | 33 | def __init__(self, k, eps, iters, mu='random'): 34 | # Number of arms 35 | self.k = k 36 | # Search probability 37 | self.eps = eps 38 | # Number of iterations 39 | self.iters = iters 40 | # Step count 41 | self.n = 0 42 | # Step count for each arm 43 | self.k_n = np.zeros(k) 44 | # Total mean reward 45 | self.mean_reward = 0 46 | self.reward = np.zeros(iters) 47 | # Mean reward for each arm 48 | self.k_reward = np.zeros(k) 49 | 50 | if type(mu) == list or type(mu).__module__ == np.__name__: 51 | # User-defined averages 52 | self.mu = np.array(mu) 53 | elif mu == 'random': 54 | # Draw means from probability distribution 55 | self.mu = np.random.normal(0, 1, k) 56 | elif mu == 'sequence': 57 | # Increase the mean for each arm by one 58 | self.mu = np.linspace(0, k-1, k) 59 | 60 | def pull(self): 61 | # Generate random number 62 | p = np.random.rand() 63 | if self.eps == 0 and self.n == 0: 64 | a = np.random.choice(self.k) 65 | elif p < self.eps: 66 | # Randomly select an action 67 | a = np.random.choice(self.k) 68 | else: 69 | # Take greedy action 70 | a = np.argmax(self.k_reward) 71 | 72 | reward = np.random.normal(self.mu[a], 1) 73 | 74 | # Update counts 75 | self.n += 1 76 | self.k_n[a] += 1 77 | 78 | # Update total 79 | self.mean_reward = self.mean_reward + ( 80 | reward - self.mean_reward) / self.n 81 | 82 | # Update results for a_k 83 | self.k_reward[a] = self.k_reward[a] + ( 84 | reward - self.k_reward[a]) / self.k_n[a] 85 | 86 | def run(self): 87 | for i in range(self.iters): 88 | self.pull() 89 | self.reward[i] = self.mean_reward 90 | 91 | def reset(self): 92 | # Resets results while keeping settings 93 | self.n = 0 94 | self.k_n = np.zeros(k) 95 | self.mean_reward = 0 96 | self.reward = np.zeros(iters) 97 | self.k_reward = np.zeros(k) 98 | 99 | k = 10 100 | iters = 1000 101 | 102 | eps_0_rewards = np.zeros(iters) 103 | eps_01_rewards = np.zeros(iters) 104 | eps_1_rewards = np.zeros(iters) 105 | 106 | episodes = 1000 107 | # Run experiments 108 | for i in range(episodes): 109 | # Initialize bandits 110 | eps_0 = eps_bandit(k, 0, iters) 111 | eps_01 = eps_bandit(k, 0.01, iters, eps_0.mu.copy()) 112 | eps_1 = eps_bandit(k, 0.1, iters, eps_0.mu.copy()) 113 | 114 | # Run experiments 115 | eps_0.run() 116 | eps_01.run() 117 | eps_1.run() 118 | 119 | # Update long-term averages 120 | eps_0_rewards = eps_0_rewards + ( 121 | eps_0.reward - eps_0_rewards) / (i + 1) 122 | eps_01_rewards = eps_01_rewards + ( 123 | eps_01.reward - eps_01_rewards) / (i + 1) 124 | eps_1_rewards = eps_1_rewards + ( 125 | eps_1.reward - eps_1_rewards) / (i + 1) 126 | 127 | plt.figure(figsize=(12,8)) 128 | plt.plot(eps_0_rewards, label="$\epsilon=0$ (greedy)") 129 | plt.plot(eps_01_rewards, label="$\epsilon=0.01$") 130 | plt.plot(eps_1_rewards, label="$\epsilon=0.1$") 131 | plt.legend(bbox_to_anchor=(1.3, 0.5)) 132 | plt.xlabel("Iterations") 133 | plt.ylabel("Average Reward") 134 | plt.title("Average $\epsilon-greedy$ Rewards after " + str(episodes) 135 | + " Episodes") 136 | plt.show() 137 | 138 | k = 10 139 | iters = 1000 140 | 141 | eps_0_rewards = np.zeros(iters) 142 | eps_01_rewards = np.zeros(iters) 143 | eps_1_rewards = np.zeros(iters) 144 | eps_0_selection = np.zeros(k) 145 | eps_01_selection = np.zeros(k) 146 | eps_1_selection = np.zeros(k) 147 | 148 | episodes = 1000 149 | # Run experiments 150 | for i in range(episodes): 151 | # Initialize bandits 152 | eps_0 = eps_bandit(k, 0, iters, mu='sequence') 153 | eps_01 = 
eps_bandit(k, 0.01, iters, eps_0.mu.copy()) 154 | eps_1 = eps_bandit(k, 0.1, iters, eps_0.mu.copy()) 155 | 156 | # Run experiments 157 | eps_0.run() 158 | eps_01.run() 159 | eps_1.run() 160 | 161 | # Update long-term averages 162 | eps_0_rewards = eps_0_rewards + ( 163 | eps_0.reward - eps_0_rewards) / (i + 1) 164 | eps_01_rewards = eps_01_rewards + ( 165 | eps_01.reward - eps_01_rewards) / (i + 1) 166 | eps_1_rewards = eps_1_rewards + ( 167 | eps_1.reward - eps_1_rewards) / (i + 1) 168 | 169 | # Average actions per episode 170 | eps_0_selection = eps_0_selection + ( 171 | eps_0.k_n - eps_0_selection) / (i + 1) 172 | eps_01_selection = eps_01_selection + ( 173 | eps_01.k_n - eps_01_selection) / (i + 1) 174 | eps_1_selection = eps_1_selection + ( 175 | eps_1.k_n - eps_1_selection) / (i + 1) 176 | 177 | plt.figure(figsize=(12,8)) 178 | plt.plot(eps_0_rewards, label="$\epsilon=0$ (greedy)") 179 | plt.plot(eps_01_rewards, label="$\epsilon=0.01$") 180 | plt.plot(eps_1_rewards, label="$\epsilon=0.1$") 181 | for i in range(k): 182 | plt.hlines(eps_0.mu[i], xmin=0, 183 | xmax=iters, alpha=0.5, 184 | linestyle="--") 185 | plt.legend(bbox_to_anchor=(1.3, 0.5)) 186 | plt.xlabel("Iterations") 187 | plt.ylabel("Average Reward") 188 | plt.title("Average $\epsilon-greedy$ Rewards after " + 189 | str(episodes) + " Episodes") 190 | plt.show() 191 | 192 | 193 | bins = np.linspace(0, k-1, k) 194 | 195 | plt.figure(figsize=(12,8)) 196 | plt.bar(bins, eps_0_selection, 197 | width = 0.33, color='b', 198 | label="$\epsilon=0$") 199 | plt.bar(bins+0.33, eps_01_selection, 200 | width=0.33, color='g', 201 | label="$\epsilon=0.01$") 202 | plt.bar(bins+0.66, eps_1_selection, 203 | width=0.33, color='r', 204 | label="$\epsilon=0.1$") 205 | plt.legend(bbox_to_anchor=(1.2, 0.5)) 206 | plt.xlim([0,k]) 207 | plt.title("Actions Selected by Each Algorithm") 208 | plt.xlabel("Action") 209 | plt.ylabel("Number of Actions Taken") 210 | plt.show() 211 | 212 | opt_per = np.array([eps_0_selection, eps_01_selection, 213 | eps_1_selection]) / iters * 100 214 | df = pd.DataFrame(opt_per, index=['$\epsilon=0$', 215 | '$\epsilon=0.01$', '$\epsilon=0.1$'], 216 | columns=["a = " + str(x) for x in range(0, k)]) 217 | print("Percentage of actions selected:") 218 | df 219 | 220 | 221 | 222 | 223 | class eps_decay_bandit: 224 | ''' 225 | epsilon-decay k-bandit problem 226 | 227 | Inputs 228 | ===================================================== 229 | k: number of arms (int) 230 | iters: number of steps (int) 231 | mu: set the average rewards for each of the k-arms. 232 | Set to "random" for the rewards to be selected from 233 | a normal distribution with mean = 0. 234 | Set to "sequence" for the means to be ordered from 235 | 0 to k-1. 236 | Pass a list or array of length = k for user-defined 237 | values. 
238 | ''' 239 | 240 | def __init__(self, k, iters, mu='random'): 241 | # Number of arms 242 | self.k = k 243 | # Number of iterations 244 | self.iters = iters 245 | # Step count 246 | self.n = 0 247 | # Step count for each arm 248 | self.k_n = np.zeros(k) 249 | # Total mean reward 250 | self.mean_reward = 0 251 | self.reward = np.zeros(iters) 252 | # Mean reward for each arm 253 | self.k_reward = np.zeros(k) 254 | 255 | if type(mu) == list or type(mu).__module__ == np.__name__: 256 | # User-defined averages 257 | self.mu = np.array(mu) 258 | elif mu == 'random': 259 | # Draw means from probability distribution 260 | self.mu = np.random.normal(0, 1, k) 261 | elif mu == 'sequence': 262 | # Increase the mean for each arm by one 263 | self.mu = np.linspace(0, k-1, k) 264 | 265 | def pull(self): 266 | # Generate random number 267 | p = np.random.rand() 268 | if p < 1 / (1 + self.n / self.k): 269 | # Randomly select an action 270 | a = np.random.choice(self.k) 271 | else: 272 | # Take greedy action 273 | a = np.argmax(self.k_reward) 274 | 275 | reward = np.random.normal(self.mu[a], 1) 276 | 277 | # Update counts 278 | self.n += 1 279 | self.k_n[a] += 1 280 | 281 | # Update total 282 | self.mean_reward = self.mean_reward + ( 283 | reward - self.mean_reward) / self.n 284 | 285 | # Update results for a_k 286 | self.k_reward[a] = self.k_reward[a] + ( 287 | reward - self.k_reward[a]) / self.k_n[a] 288 | 289 | def run(self): 290 | for i in range(self.iters): 291 | self.pull() 292 | self.reward[i] = self.mean_reward 293 | 294 | def reset(self): 295 | # Resets results while keeping settings 296 | self.n = 0 297 | self.k_n = np.zeros(k) 298 | self.mean_reward = 0 299 | self.reward = np.zeros(iters) 300 | self.k_reward = np.zeros(k) 301 | 302 | k = 10 303 | iters = 1000 304 | eps_decay_rewards = np.zeros(iters) 305 | eps_1_rewards = np.zeros(iters) 306 | episodes = 1000 307 | # Run experiments 308 | for i in range(episodes): 309 | # Initialize bandits 310 | eps_decay = eps_decay_bandit(k, iters) 311 | eps_1 = eps_bandit(k, 0.1, iters, eps_decay.mu.copy()) 312 | 313 | # Run experiments 314 | eps_decay.run() 315 | eps_1.run() 316 | 317 | # Update long-term averages 318 | eps_decay_rewards = eps_decay_rewards + ( 319 | eps_decay.reward - eps_decay_rewards) / (i + 1) 320 | eps_1_rewards = eps_1_rewards + ( 321 | eps_1.reward - eps_1_rewards) / (i + 1) 322 | 323 | plt.figure(figsize=(12,8)) 324 | plt.plot(eps_decay_rewards, label="$\epsilon-decay$") 325 | plt.plot(eps_1_rewards, label="$\epsilon=0.1$") 326 | plt.legend(bbox_to_anchor=(1.2, 0.5)) 327 | plt.xlabel("Iterations") 328 | plt.ylabel("Average Reward") 329 | plt.title("Average $\epsilon-decay$ and" + 330 | "$\epsilon-greedy$ Rewards after " 331 | + str(episodes) + " Episodes") 332 | plt.show() 333 | 334 | 335 | k = 10 336 | iters = 1000 337 | oiv_rewards = np.zeros(iters) 338 | eps_decay_rewards = np.zeros(iters) 339 | eps_1_rewards = np.zeros(iters) 340 | # Select initial values 341 | oiv_init = np.repeat(5., k) 342 | episodes = 1000 343 | # Run experiments 344 | for i in range(episodes): 345 | # Initialize bandits 346 | oiv_bandit = eps_bandit(k, 0, iters) 347 | oiv_bandit.k_reward = oiv_init.copy() 348 | oiv_bandit.k_n = np.ones(k) 349 | eps_decay = eps_decay_bandit(k, iters, oiv_bandit.mu.copy()) 350 | eps_1 = eps_bandit(k, 0.1, iters, oiv_bandit.mu.copy()) 351 | 352 | # Run experiments 353 | oiv_bandit.run() 354 | eps_decay.run() 355 | eps_1.run() 356 | 357 | # Update long-term averages 358 | oiv_rewards = oiv_rewards + ( 359 | oiv_bandit.reward - 
oiv_rewards) / (i + 1) 360 | eps_decay_rewards = eps_decay_rewards + ( 361 | eps_decay.reward - eps_decay_rewards) / (i + 1) 362 | eps_1_rewards = eps_1_rewards + ( 363 | eps_1.reward - eps_1_rewards) / (i + 1) 364 | 365 | plt.figure(figsize=(12,8)) 366 | plt.plot(oiv_rewards, label="Optimistic") 367 | plt.plot(eps_decay_rewards, label="$\epsilon-decay$") 368 | plt.plot(eps_1_rewards, label="$\epsilon=0.1$") 369 | plt.legend(bbox_to_anchor=(1.2, 0.5)) 370 | plt.xlabel("Iterations") 371 | plt.ylabel("Average Reward") 372 | plt.title("Average Bandit Strategy Rewards after " + 373 | str(episodes) + " Episodes") 374 | plt.show() 375 | 376 | -------------------------------------------------------------------------------- /Monte_Carlo_Frozen_lake/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Monte_Carlo_Frozen_lake/.DS_Store -------------------------------------------------------------------------------- /Monte_Carlo_Frozen_lake/MC_Frozenlake.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jul 11 09:34:37 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import gym 11 | import numpy as np 12 | import operator 13 | 14 | from IPython.display import clear_output 15 | 16 | from time import sleep 17 | import random 18 | 19 | import itertools 20 | import tqdm 21 | 22 | tqdm.monitor_interval = 0 23 | 24 | 25 | #Random Policy 26 | 27 | def create_random_policy(env): 28 | policy = {} 29 | for key in range(0, env.observation_space.n): 30 | current_end = 0 31 | p = {} 32 | 33 | for action in range(0,env.action_space.n): 34 | 35 | p[action] = 1/env.action_space.n 36 | 37 | policy[key] = p 38 | 39 | return policy 40 | 41 | 42 | 43 | #dictionary for thr state_action_value 44 | 45 | 46 | def create_state_action_dictionary(env,policy): 47 | 48 | Q = {} 49 | 50 | for key in policy.keys(): 51 | Q[key] = {a: 0.0 for a in range(0, env.action_space.n)} 52 | 53 | return Q 54 | 55 | 56 | #To play episodes 57 | 58 | 59 | 60 | def run_game(env,policy,display = True): 61 | 62 | env.reset() 63 | 64 | episode =[] 65 | 66 | finished = False 67 | 68 | while not finished: 69 | s = env.env.s 70 | 71 | if display: 72 | 73 | clear_output(True) 74 | env.render() 75 | sleep(1) 76 | 77 | 78 | timestep =[] 79 | timestep.append(s) 80 | 81 | n = random.uniform(0, sum(policy[s].values())) 82 | 83 | 84 | top_range = 0 85 | 86 | for prob in policy[s].items(): 87 | 88 | top_range += prob[1] 89 | 90 | if n action values 69 | Q = defaultdict(lambda: np.zeros(env.action_space.n)) 70 | 71 | #the cumulative denominator of weighted importance sampling 72 | 73 | C = defaultdict(lambda:np.zeros(env.action_space.n)) 74 | 75 | #the greedy polcy we want to learn 76 | target_policy = create_greedy_policy(Q) 77 | 78 | 79 | for i_episode in range(1,num_episodes+1): 80 | 81 | 82 | if i_episode % 1000 == 0: 83 | print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="") 84 | sys.stdout.flush() 85 | 86 | #Generate an episode {it's an array of state state , action ,reward} 87 | 88 | episode = [] 89 | state = env.reset() 90 | 91 | for t in range(100): 92 | 93 | #sampling an action from our policy 94 | probs = behaviour_policy(state) 95 | action = np.random.choice(np.arange(len(probs)), p = probs) 96 | 97 | next_state, reward , done ,_ = env.step(action) 98 | 
episode.append((state,action,reward)) 99 | 100 | if done: 101 | break 102 | state = next_state 103 | 104 | 105 | #Sum of discounted sums 106 | G = 0.0 107 | #The importance samplingg ratio 108 | W = 1.0 109 | 110 | 111 | #for each step in episode,backwards 112 | for t in range(len(episode))[::-1]: 113 | 114 | 115 | state,action,reward = episode[t] 116 | 117 | # update total reward 118 | G = discount_factor *G +reward 119 | 120 | #updating weight importance sampling 121 | C[state][action] +=W 122 | 123 | Q[state][action] += (W/C[state][action])* (G - Q[state][action]) 124 | 125 | 126 | if action != np.argmax(target_policy(state)): 127 | break 128 | W = W*1/behaviour_policy(state)[action] 129 | 130 | return Q, target_policy 131 | 132 | 133 | 134 | random_policy = create_random_policy(env.action_space.n) 135 | Q, policy = mc_control_importance_sampling(env, num_episodes = 500000, behaviour_policy= random_policy) 136 | 137 | 138 | # For plotting: Create value function from action-value function 139 | # by picking the best action at each state 140 | V = defaultdict(float) 141 | for state, action_values in Q.items(): 142 | action_value = np.max(action_values) 143 | V[state] = action_value 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /Pac-Man/pacman_DQN.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Sep 4 15:26:40 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | import time 11 | import numpy as np 12 | 13 | import tensorflow as tf 14 | import keras 15 | #creating the environment 16 | env=gym.make('MsPacman-v0') 17 | 18 | print(env.action_space) 19 | 20 | print(env.observation_space) 21 | 22 | n_height = 210 23 | n_width = 160 24 | n_depth = 3 25 | n_shape = [n_height,n_width,n_depth] 26 | n_inputs = n_height * n_width * n_depth 27 | env.frameskip = 3 28 | 29 | 30 | frame_time = 1.0 / 15 # seconds 31 | 32 | n_episodes = 500 33 | scores = [] 34 | for i_episode in range(n_episodes): 35 | t=0 36 | score=0 37 | then = 0 38 | done = False 39 | env.reset() 40 | while not done: 41 | now = time.time() 42 | if frame_time < now - then: 43 | action = env.action_space.sample() 44 | observation, reward, done, info = env.step(action) 45 | score += reward 46 | #env.render() 47 | then = now 48 | t=t+1 49 | scores.append(score) 50 | 51 | print('Average score {}, max {}, min {}'.format(np.mean(scores),np.max(scores),np.min(scores) )) 52 | 53 | tf.reset_default_graph() 54 | keras.backend.clear_session() 55 | 56 | #Applying deep - Q - learning 57 | 58 | def policy_q_nn(obs,env): 59 | 60 | #explore strategy 61 | if np.random.random() < explore_rate: 62 | 63 | action = env.action_space.sample() 64 | #exploitation strategy 65 | else : 66 | action = np.argmax(q_nn.predict(np.array([obs]))) 67 | return action 68 | 69 | def episode(env, policy, r_max = 0,t_max = 0): 70 | 71 | # create the empty list to contain game memory 72 | memory = deque(maxlen=1000) 73 | 74 | obs = env.reset() 75 | state_prev = obs 76 | 77 | episode_reward = 0 78 | done = False 79 | t = 0 80 | 81 | while not done: 82 | 83 | action = policy(state_prev, env) 84 | obs, reward, done, info = env.step(action) 85 | state_next = obs 86 | 87 | # add the state_prev, action, reward, state_new, done to memory 88 | memory.append([state_prev,action,reward,state_next,done]) 89 | 90 | # Generate and update the q_values with 91 | # maximum 
future rewards using bellman function: 92 | states = np.array([x[0] for x in memory]) 93 | states_next = np.array([np.zeros(n_shape) if x[4] else x[3] for x in memory]) 94 | 95 | 96 | q_values = q_nn.predict(states) 97 | q_values_next = q_nn.predict(states_next) 98 | 99 | for i in range(len(memory)): 100 | 101 | state_prev,action,reward,state_next,done = memory[i] 102 | if done: 103 | 104 | q_values[i,action] = reward 105 | 106 | else: 107 | 108 | best_q = np.amax(q_values_next[i]) 109 | 110 | bellman_q = reward + discount_rate * best_q 111 | q_values[i,action] = bellman_q 112 | 113 | # train the q_nn with states and q_values, same as updating the q_table 114 | q_nn.fit(states,q_values,epochs=1,batch_size=50,verbose=0) 115 | 116 | state_prev = state_next 117 | 118 | episode_reward += reward 119 | if r_max > 0 and episode_reward > r_max: 120 | break 121 | t+=1 122 | if t_max > 0 and t == t_max: 123 | break 124 | return episode_reward 125 | # experiment collect observations and rewards for each episode 126 | def experiment(env, policy, n_episodes,r_max=0, t_max=0): 127 | 128 | rewards=np.empty(shape=[n_episodes]) 129 | for i in range(n_episodes): 130 | val = episode(env, policy, r_max, t_max) 131 | #print('episode:{}, reward {}'.format(i,val)) 132 | rewards[i]=val 133 | 134 | print('Policy:{}, Min reward:{}, Max reward:{}, Average reward:{}' 135 | .format(policy.__name__, 136 | np.min(rewards), 137 | np.max(rewards), 138 | np.mean(rewards))) 139 | 140 | 141 | from collections import deque 142 | from tensorflow.keras.models import Sequential 143 | from tensorflow.keras.layers import Dense, Flatten 144 | 145 | # build the Q-Network 146 | model = Sequential() 147 | model.add(Flatten(input_shape = n_shape)) 148 | model.add(Dense(512, activation='relu',name='hidden1')) 149 | model.add(Dense(9, activation='softmax', name='output')) 150 | model.compile(loss='categorical_crossentropy',optimizer='adam') 151 | model.summary() 152 | q_nn = model 153 | 154 | # Hyperparameters 155 | 156 | discount_rate = 0.9 157 | explore_rate = 0.2 158 | n_episodes = 1 159 | 160 | # create the empty list to contain game memory 161 | memory = deque(maxlen=1000) 162 | 163 | experiment(env, policy_q_nn, n_episodes) 164 | 165 | 166 | # Hyperparameters 167 | 168 | discount_rate = 0.9 169 | explore_rate = 0.2 170 | n_episodes = 100 171 | 172 | # create the empty list to contain game memory 173 | memory = deque(maxlen=1000) 174 | 175 | experiment(env, policy_q_nn, n_episodes) 176 | 177 | 178 | from collections import deque 179 | from tensorflow.keras.models import Sequential 180 | from tensorflow.keras.layers import Dense, Flatten 181 | from tensorflow.keras.layers import Conv2D, MaxPooling2D 182 | 183 | # build the CNN Q-Network 184 | model = Sequential() 185 | model.add(Conv2D(16, kernel_size=(5, 5), 186 | strides=(1, 1), 187 | activation='relu', 188 | input_shape=n_shape)) 189 | model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2))) 190 | model.add(Flatten()) 191 | model.add(Dense(512, activation='relu',name='hidden1')) 192 | model.add(Dense(9, activation='softmax', name='output')) 193 | model.compile(loss='categorical_crossentropy',optimizer='adam') 194 | model.summary() 195 | q_nn = model 196 | 197 | 198 | # Hyperparameters 199 | 200 | discount_rate = 0.9 201 | explore_rate = 0.2 202 | n_episodes = 100 203 | 204 | # create the empty list to contain game memory 205 | memory = deque(maxlen=1000) 206 | 207 | experiment(env, policy_q_nn, n_episodes) 208 | 209 | env.close() 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 
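
The episode() function above recomputes Bellman targets for every transition currently held in memory and refits q_nn on the whole deque at every step. A more common variant samples a small random minibatch from the replay memory for each update; the sketch below shows that variant using the same q_nn, memory, and discount_rate names as the script. The function name, the batch size of 32, and the sampling strategy are assumptions added here for illustration, not part of the original code.

import random
import numpy as np

def replay_update(q_nn, memory, batch_size=32, discount_rate=0.9):
    # Wait until enough transitions have been collected
    if len(memory) < batch_size:
        return
    batch = random.sample(list(memory), batch_size)
    states = np.array([x[0] for x in batch])
    # Terminal transitions get a dummy zero next-state, mirroring episode() above
    next_states = np.array([np.zeros(states.shape[1:]) if x[4] else x[3] for x in batch])
    q_values = q_nn.predict(states)
    q_next = q_nn.predict(next_states)
    for i, (_, action, reward, _, done) in enumerate(batch):
        # Bellman target: r for terminal transitions, r + gamma * max_a' Q(s', a') otherwise
        q_values[i, action] = reward if done else reward + discount_rate * np.amax(q_next[i])
    q_nn.fit(states, q_values, epochs=1, verbose=0)
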
218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /Ping_pong/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Ping_pong/.DS_Store -------------------------------------------------------------------------------- /Ping_pong/ping_pong.py: -------------------------------------------------------------------------------- 1 | import gym import numpy as np def downsample(image): # Take only alternate pixels - basically halves the resolution of the image (which is fine for us) return image[::2, ::2, :] def remove_color(image): """Convert all color (RGB is the third dimension in the image)""" return image[:, :, 0] def remove_background(image): image[image == 144] = 0 image[image == 109] = 0 return image def preprocess_observations(input_observation, prev_processed_observation, input_dimensions): """ convert the 210x160x3 uint8 frame into a 6400 float vector """ processed_observation = input_observation[35:195] # crop processed_observation = downsample(processed_observation) processed_observation = remove_color(processed_observation) processed_observation = remove_background(processed_observation) processed_observation[processed_observation != 0] = 1 # everything else (paddles, ball) just set to 1 # Convert from 80 x 80 matrix to 1600 x 1 matrix processed_observation = processed_observation.astype(np.float).ravel() # subtract the previous frame from the current one so we are only processing on changes in the game if prev_processed_observation is not None: input_observation = processed_observation - prev_processed_observation else: input_observation = np.zeros(input_dimensions) # store the previous frame so we can subtract from it next time prev_processed_observations = processed_observation return input_observation, prev_processed_observations def sigmoid(x): return 1.0/(1.0 + np.exp(-x)) def relu(vector): vector[vector < 0] = 0 return vector def apply_neural_nets(observation_matrix, weights): """ Based on the observation_matrix and weights, compute the new hidden layer values and the new output layer values""" hidden_layer_values = np.dot(weights['1'], observation_matrix) hidden_layer_values = relu(hidden_layer_values) output_layer_values = np.dot(hidden_layer_values, weights['2']) output_layer_values = sigmoid(output_layer_values) return hidden_layer_values, output_layer_values def choose_action(probability): random_value = np.random.uniform() if random_value < probability: # signifies up in openai gym return 2 else: # signifies down in openai gym return 3 def compute_gradient(gradient_log_p, hidden_layer_values, observation_values, weights): """ See here: http://neuralnetworksanddeeplearning.com/chap2.html""" delta_L = gradient_log_p dC_dw2 = np.dot(hidden_layer_values.T, delta_L).ravel() delta_l2 = np.outer(delta_L, weights['2']) delta_l2 = relu(delta_l2) dC_dw1 = np.dot(delta_l2.T, observation_values) return { '1': dC_dw1, '2': dC_dw2 } def update_weights(weights, expectation_g_squared, g_dict, decay_rate, learning_rate): """ See here: http://sebastianruder.com/optimizing-gradient-descent/index.html#rmsprop""" epsilon = 1e-5 for layer_name in weights.keys(): g = g_dict[layer_name] expectation_g_squared[layer_name] = decay_rate * expectation_g_squared[layer_name] + (1 - decay_rate) * g**2 weights[layer_name] += (learning_rate * 
g)/(np.sqrt(expectation_g_squared[layer_name] + epsilon)) g_dict[layer_name] = np.zeros_like(weights[layer_name]) # reset batch gradient buffer def discount_rewards(rewards, gamma): """ Actions you took 20 steps before the end result are less important to the overall result than an action you took a step ago. This implements that logic by discounting the reward on previous actions based on how long ago they were taken""" discounted_rewards = np.zeros_like(rewards) running_add = 0 for t in reversed(range(0, rewards.size)): if rewards[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!) running_add = running_add * gamma + rewards[t] discounted_rewards[t] = running_add return discounted_rewards def discount_with_rewards(gradient_log_p, episode_rewards, gamma): """ discount the gradient with the normalized rewards """ discounted_episode_rewards = discount_rewards(episode_rewards, gamma) # standardize the rewards to be unit normal (helps control the gradient estimator variance) discounted_episode_rewards -= np.mean(discounted_episode_rewards) discounted_episode_rewards /= np.std(discounted_episode_rewards) return gradient_log_p * discounted_episode_rewards def main(): env = gym.make("Pong-v0") observation = env.reset() # This gets us the image # hyperparameters episode_number = 0 batch_size = 10 gamma = 0.99 # discount factor for reward decay_rate = 0.99 num_hidden_layer_neurons = 200 input_dimensions = 80 * 80 learning_rate = 1e-4 episode_number = 0 reward_sum = 0 running_reward = None prev_processed_observations = None weights = { '1': np.random.randn(num_hidden_layer_neurons, input_dimensions) / np.sqrt(input_dimensions), '2': np.random.randn(num_hidden_layer_neurons) / np.sqrt(num_hidden_layer_neurons) } # To be used with rmsprop algorithm (http://sebastianruder.com/optimizing-gradient-descent/index.html#rmsprop) expectation_g_squared = {} g_dict = {} for layer_name in weights.keys(): expectation_g_squared[layer_name] = np.zeros_like(weights[layer_name]) g_dict[layer_name] = np.zeros_like(weights[layer_name]) episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], [] while True: env.render() processed_observations, prev_processed_observations = preprocess_observations(observation, prev_processed_observations, input_dimensions) hidden_layer_values, up_probability = apply_neural_nets(processed_observations, weights) episode_observations.append(processed_observations) episode_hidden_layer_values.append(hidden_layer_values) action = choose_action(up_probability) # carry out the chosen action observation, reward, done, info = env.step(action) reward_sum += reward episode_rewards.append(reward) # see here: http://cs231n.github.io/neural-networks-2/#losses fake_label = 1 if action == 2 else 0 loss_function_gradient = fake_label - up_probability episode_gradient_log_ps.append(loss_function_gradient) if done: # an episode finished episode_number += 1 # Combine the following values for the episode episode_hidden_layer_values = np.vstack(episode_hidden_layer_values) episode_observations = np.vstack(episode_observations) episode_gradient_log_ps = np.vstack(episode_gradient_log_ps) episode_rewards = np.vstack(episode_rewards) # Tweak the gradient of the log_ps based on the discounted rewards episode_gradient_log_ps_discounted = discount_with_rewards(episode_gradient_log_ps, episode_rewards, gamma) gradient = compute_gradient( episode_gradient_log_ps_discounted, episode_hidden_layer_values, episode_observations, 
weights ) # Sum the gradient for use when we hit the batch size for layer_name in gradient: g_dict[layer_name] += gradient[layer_name] if episode_number % batch_size == 0: update_weights(weights, expectation_g_squared, g_dict, decay_rate, learning_rate) episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], [] # reset values observation = env.reset() # reset env running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01 print ('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward)) reward_sum = 0 prev_processed_observations = None main() -------------------------------------------------------------------------------- /Policy_eval_Grid_World/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Policy_eval_Grid_World/.DS_Store -------------------------------------------------------------------------------- /Policy_eval_Grid_World/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Policy_eval_Grid_World/Figure_1.png -------------------------------------------------------------------------------- /Policy_eval_Grid_World/policy_eval_GridWorld.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jul 3 10:42:03 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import numpy as np 11 | from tqdm import tqdm 12 | import matplotlib.pyplot as plt 13 | import seaborn as sns 14 | sns.set_style('darkgrid') 15 | import random 16 | 17 | 18 | #Parameters 19 | 20 | gamma = 1 #Discounting Rate range from (0 to 1) 21 | rewardSize = -1 22 | gridSize = 4 23 | terminationStates = [[0,0], [gridSize-1,gridSize-1]] 24 | actions = [[-1,0],[1,0],[0,1],[0,-1]] 25 | numIterations = 1000 26 | 27 | #Utilites 28 | 29 | def actionRewardFunction(initalPostion,action): 30 | 31 | if initalPostion in terminationStates: 32 | return initalPostion,0 33 | reward = rewardSize 34 | finalPosition = np.array(initalPostion) + np.array(action) 35 | 36 | 37 | if -1 in finalPosition or 4 in finalPosition: 38 | finalPosition = initalPostion 39 | 40 | return finalPosition,reward 41 | 42 | 43 | 44 | #initalization 45 | 46 | valueMap = np.zeros((gridSize,gridSize)) 47 | 48 | 49 | states = [[i,j] for i in range(gridSize) for j in range(gridSize)] 50 | 51 | 52 | 53 | #policiy evaluation 54 | 55 | deltas = [] 56 | for it in range(numIterations): 57 | copyValueMap = np.copy(valueMap) 58 | deltaState = [] 59 | for state in states: 60 | weightedRewards = 0 61 | for action in actions: 62 | finalPosition, reward = actionRewardFunction(state, action) 63 | weightedRewards += (1/len(actions))*(reward+(gamma*valueMap[finalPosition[0], finalPosition[1]])) 64 | deltaState.append(np.abs(copyValueMap[state[0], state[1]]-weightedRewards)) 65 | copyValueMap[state[0], state[1]] = weightedRewards 66 | deltas.append(deltaState) 67 | valueMap = copyValueMap 68 | if it in [0,1,2,9, 99, numIterations-1]: 69 | print("Iteration {}".format(it+1)) 70 | print(valueMap) 71 | print("") 72 | 73 | 74 | 75 | 76 | plt.figure(figsize=(20, 10)) 77 | plt.legend() 78 | plt.plot(deltas) 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 
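For reference, the sweep in policy_eval_GridWorld.py above is the synchronous iterative-policy-evaluation backup for the equiprobable random policy: each of the four moves is taken with probability 1/4, every transition pays rewardSize = -1, gamma = 1, and a move that would leave the grid keeps the agent in place. One update of a non-terminal state s is

$$ v_{k+1}(s) \;=\; \sum_{a} \tfrac{1}{4}\bigl[\, -1 + \gamma\, v_k(s'_a) \,\bigr], $$

where s'_a is the successor of s under action a. The deltas list stores |v_{k+1}(s) - v_k(s)| for every state in each sweep, which is what the final plot shows shrinking towards zero.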
| 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /Q-Learning/Q-learning.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jul 18 15:01:39 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | #Q learning 11 | 12 | import random 13 | import gym 14 | 15 | env = gym.make('Taxi-v3') 16 | 17 | env.render() 18 | 19 | 20 | q = {} 21 | for s in range(env.observation_space.n): 22 | for a in range(env.action_space.n): 23 | q[(s,a)] = 0.0 24 | 25 | 26 | def update_q_table(prev_state,action,reward,next_state,alpha,gamma): 27 | qa = max([q[(next_state,a)] for a in range(env.action_space.n)]) 28 | q[prev_state,action] += alpha *(reward + qa - q[prev_state,action]) 29 | 30 | 31 | 32 | def epsilon_greedy(state,epsilon): 33 | 34 | if random.uniform(0, 1) < epsilon: 35 | 36 | #takinf a random action 37 | return env.action_space.sample() 38 | 39 | else: 40 | 41 | #taking a greedy action 42 | return max(list(range(env.action_space.n)), key = lambda x: q[(state,x)]) 43 | 44 | 45 | 46 | 47 | 48 | alpha = 0.4 49 | gamma = 0.999 50 | epsilon = 0.017 51 | 52 | for i in range(8000): 53 | r = 0 54 | prev_state = env.reset() 55 | #env.render() 56 | 57 | 58 | while True: 59 | 60 | env.render() 61 | 62 | 63 | action = epsilon_greedy(prev_state, epsilon) 64 | 65 | next_state, reward,done , _ = env.step(action) 66 | 67 | update_q_table(prev_state, action, reward, next_state, alpha, gamma) 68 | 69 | prev_state = next_state 70 | 71 | r += reward 72 | 73 | 74 | if done: 75 | break 76 | 77 | print("total reward:", r) 78 | 79 | 80 | env.close() 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcment-Learning 2 | **Reinforcement learning (RL)** is an area of machine learning concerned with how software agents ought to take actions in an environment in order to maximize the notion of cumulative reward. Reinforcement learning is one of three basic machine learning paradigms, alongside supervised learning and unsupervised learning. 3 | 4 | In this repository we are going have codes for the algorithms of reinforcement learning 5 | 6 | * You can also check the instructions to installation of **Gym** [here](https://gym.openai.com/docs/) 7 | 8 | 9 | ## Install Gym 10 | 11 | `pip install gym` 12 | 13 | or 14 | 15 | ``` 16 | git clone https://github.com/openai/gym 17 | cd gym 18 | pip install -e . 
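# optional: quick sanity check that the editable install above worked
python -c "import gym; print(gym.__version__)"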
19 | ``` 20 | 21 | ### Example 22 | 23 | An example to see wheter _gym_ is working or not 24 | 25 | 26 | ``` 27 | import gym 28 | env = gym.make('CartPole-v0') 29 | env.reset() 30 | for _ in range(1000): 31 | env.render() 32 | env.step(env.action_space.sample()) # take a random action 33 | env.close() 34 | 35 | ``` 36 | 37 | ### The code for Cartpole environment 38 | 39 | ``` 40 | import gym 41 | env = gym.make('CartPole-v0') 42 | for i_episode in range(20): 43 | observation = env.reset() 44 | for t in range(100): 45 | env.render() 46 | print(observation) 47 | action = env.action_space.sample() 48 | observation, reward, done, info = env.step(action) 49 | if done: 50 | print("Episode finished after {} timesteps".format(t+1)) 51 | break 52 | env.close() 53 | ``` 54 | 55 | ## Table of Contents 56 | * [Temporal-Difference](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Temporal-Difference) 57 | * [K-Armed-Bandit](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/K-armed-Bandit) 58 | 59 | * [Tile-Coding](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Tile-coding%20) 60 | 61 | 62 | * [Q-learnig](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Q-Learning) 63 | 64 | * [Deep-Q-network](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Deep-Q-Network) 65 | 66 | 67 | * [Sarsa](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Sarsa) 68 | 69 | 70 | * [Pacman](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Pac-Man) 71 | 72 | 73 | * [Frozen-Lake](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Frozen_lake) 74 | 75 | 76 | * [Reinforce](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Reinforce) 77 | 78 | 79 | * [Upper-Confidence-Bound](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Upper-Confidence-Bound%20) 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /Reinforce/policy_graident.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Aug 25 16:38:59 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | #importing stuff 10 | import gym 11 | gym.logger.set_level(40) # suppress warnings (please remove if gives error) 12 | import numpy as np 13 | from collections import deque 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | 18 | import torch 19 | torch.manual_seed(0) # set random seed 20 | import torch.nn as nn 21 | import torch.nn.functional as F 22 | import torch.optim as optim 23 | from torch.distributions import Categorical 24 | 25 | 26 | env = gym.make('CartPole-v0') 27 | env.seed(0) 28 | print('observation space:', env.observation_space) 29 | print('action space:', env.action_space) 30 | 31 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 32 | 33 | class Policy(nn.Module): 34 | def __init__(self, s_size=4, h_size=16, a_size=2): 35 | super(Policy, self).__init__() 36 | self.fc1 = nn.Linear(s_size, h_size) 37 | self.fc2 = nn.Linear(h_size, a_size) 38 | 39 | def forward(self, x): 40 | x = F.relu(self.fc1(x)) 41 | x = self.fc2(x) 42 | return F.softmax(x, dim=1) 43 | 44 | def act(self, state): 45 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 46 | probs = self.forward(state).cpu() 47 | m = Categorical(probs) 48 | action = m.sample() 49 | return action.item(), m.log_prob(action) 50 | 51 | 52 | 53 | policy = 
Policy().to(device) 54 | optimizer = optim.Adam(policy.parameters(), lr=1e-2) 55 | 56 | def reinforce(n_episodes=1000, max_t=1000, gamma=1.0, print_every=100): 57 | scores_deque = deque(maxlen=100) 58 | scores = [] 59 | for i_episode in range(1, n_episodes+1): 60 | saved_log_probs = [] 61 | rewards = [] 62 | state = env.reset() 63 | for t in range(max_t): 64 | action, log_prob = policy.act(state) 65 | saved_log_probs.append(log_prob) 66 | state, reward, done, _ = env.step(action) 67 | rewards.append(reward) 68 | if done: 69 | break 70 | scores_deque.append(sum(rewards)) 71 | scores.append(sum(rewards)) 72 | 73 | discounts = [gamma**i for i in range(len(rewards)+1)] 74 | R = sum([a*b for a,b in zip(discounts, rewards)]) 75 | 76 | policy_loss = [] 77 | for log_prob in saved_log_probs: 78 | policy_loss.append(-log_prob * R) 79 | policy_loss = torch.cat(policy_loss).sum() 80 | 81 | optimizer.zero_grad() 82 | policy_loss.backward() 83 | optimizer.step() 84 | 85 | if i_episode % print_every == 0: 86 | print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque))) 87 | if np.mean(scores_deque)>=195.0: 88 | print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque))) 89 | break 90 | 91 | return scores 92 | 93 | scores = reinforce() 94 | 95 | 96 | fig = plt.figure() 97 | ax = fig.add_subplot(111) 98 | plt.plot(np.arange(1, len(scores)+1), scores) 99 | plt.ylabel('Score') 100 | plt.xlabel('Episode #') 101 | plt.show() 102 | 103 | 104 | env = gym.make('CartPole-v0') 105 | 106 | state = env.reset() 107 | for t in range(1000): 108 | action, _ = policy.act(state) 109 | env.render() 110 | state, reward, done, _ = env.step(action) 111 | if done: 112 | break 113 | 114 | env.close() 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /Sarsa/Sarsa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jul 18 11:21:52 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import gym 13 | 14 | import random 15 | 16 | env = gym.make('Taxi-v3') 17 | 18 | env.render() 19 | 20 | Q = {} 21 | 22 | for s in range(env.observation_space.n): 23 | for a in range(env.action_space.n): 24 | Q[(s,a)] = 0.0 25 | 26 | 27 | 28 | def epsilon_greedy(state,epsilon): 29 | 30 | if random.uniform(0,1) < epsilon: 31 | 32 | #taking a random action 33 | 34 | return env.action_space.sample() 35 | 36 | else: 37 | 38 | #taking a gredy action 39 | 40 | return max(list(range(env.action_space.n)), key = lambda x: Q[(state,x)]) 41 | 42 | 43 | 44 | 45 | alpha = 0.85 46 | gamma = 0.9 47 | epsilon = 0.8 48 | 49 | #performing Sarsa 50 | 51 | 52 | for i in range(4000): 53 | 54 | # we store cumulative reward of each episodes in r 55 | r = 0 56 | 57 | # initialize the state, 58 | state = env.reset() 59 | 60 | # select the action using epsilon-greedy policy 61 | action = epsilon_greedy(state,epsilon) 62 | 63 | while True: 64 | 65 | 66 | env.render() 67 | 68 | # then we perform the action and move to the next state, and receive the reward 69 | nextstate, reward, done, _ = env.step(action) 70 | 71 | # again, we select the next action using epsilon greedy policy 72 | nextaction = epsilon_greedy(nextstate,epsilon) 73 | 74 | # we 
calculate the Q value of previous state using our update rule 75 | Q[(state,action)] += alpha * (reward + gamma * Q[(nextstate,nextaction)]-Q[(state,action)]) 76 | 77 | # finally we update our state and action with next action and next state 78 | action = nextaction 79 | state = nextstate 80 | 81 | # store the rewards 82 | r += reward 83 | 84 | # we will break the loop, if we are at the terminal state of the episode 85 | if done: 86 | break 87 | 88 | print("total reward: ", r) 89 | 90 | env.close() 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /Sarsa/n-Sarsa_and_Sarsa(lambda).py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Aug 6 14:28:44 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | # Importing part 11 | 12 | import gym 13 | import itertools 14 | import matplotlib 15 | import numpy as np 16 | import pandas as pd 17 | import sys 18 | import time 19 | import timeit 20 | 21 | from collections import namedtuple 22 | 23 | import os 24 | 25 | import glob 26 | 27 | from lib.tile_coding import IHT, tiles 28 | 29 | from matplotlib import pyplot as plt 30 | from matplotlib import cm 31 | matplotlib.style.use('ggplot') 32 | 33 | import io 34 | import base64 35 | 36 | from IPython.display import HTML 37 | 38 | #creating the environment 39 | 40 | env = gym.make('MountainCar-v0') 41 | 42 | env._max_episode_steps = 3000 #increse the upper time limit 43 | np.random.seed(6) # Make plots reproducible 44 | 45 | 46 | 47 | class QEstimator(): 48 | 49 | 50 | def __init__(self,step_size,num_tilings = 8,max_size = 4096,tiling_dim = None, trace = False): 51 | 52 | 53 | self.trace = trace 54 | self.max_size = max_size 55 | self.num_tilings = num_tilings 56 | self.tiling_dim = tiling_dim or num_tilings 57 | 58 | #alpha is the fraction of step_size and num_tilings 59 | 60 | self.alpha = step_size/num_tilings 61 | 62 | #initalzinf the hash table for tile coding and keeping it in max 63 | self.iht = IHT(max_size) 64 | 65 | 66 | #initalzizinf the weights 67 | self.weights = np.zeros(max_size) 68 | if self.trace: 69 | self.z = np.zeros(max_size) 70 | 71 | 72 | #tilecoding software partitions at integer boundaries 73 | 74 | 75 | self.postion_scale = self.tiling_dim / (env.observation_space.high[0] 76 | - env.observation_space.low[0] ) 77 | self.velocity_scale = self.tiling_dim/ ( env.observation_space.high[1] 78 | - env.observation_space.low[1] ) 79 | 80 | 81 | def featurize_state_action(self,state,action): 82 | 83 | #returns the featurized repesentation of state action pair 84 | 85 | 86 | featurized = tiles(self.iht,self.num_tilings, 87 | [self.postion_scale * state[0], 88 | self.velocity_scale * state[1]], 89 | [action] 90 | ) 91 | 92 | return featurized 93 | 94 | def predict(self,s , a = None): 95 | 96 | #predicitng q-value(s) 97 | 98 | 99 | if a is None: 100 | features = [self.featurize_state_action(s, i) for i in 101 | range(env.action_space.n)] 102 | 103 | else: 104 | features = [ self.featurize_state_action(s, a)] 105 | 106 | return [np.sum(self.weights[f]) for f in features] 107 | 108 | def update(self,s,a,target): 109 | 110 | # updates the estimator parameters 111 | 112 | features = self.featurize_state_action(s, a) 113 | 114 | # linear function Approx 115 | estimation = np.sum(self.weights[features]) 116 | 117 | delta = (target-estimation) 118 | 119 | if self.trace: 120 | # self.z[features] += 1 # 
Accumulating trace 121 | self.z[features] = 1 # Replacing trace 122 | self.weights += self.alpha * delta * self.z 123 | else: 124 | self.weights[features] += self.alpha * delta 125 | 126 | 127 | def reset(self,z_only = False): 128 | 129 | 130 | if z_only: 131 | 132 | assert self.trace #'q-value estimator has no z to reset.'m 133 | self.z = np.zeros(self.max_size) 134 | else: 135 | if self.trace: 136 | self.z = np.zeros(self.max_size) 137 | self.weights = np.zeros(self.max_size) 138 | 139 | 140 | 141 | def make_epsilon_greedy_policy(estimator,epsilon,num_actions ) : 142 | 143 | def policy_fn(observation): 144 | 145 | action_probs = np.ones(num_actions,dtype = float)*epsilon / num_actions 146 | 147 | q_values= estimator.predict(observation) 148 | 149 | best_action_idx = np.argmax(q_values) 150 | 151 | action_probs[best_action_idx] += (1.0- epsilon ) 152 | 153 | return action_probs 154 | return policy_fn 155 | 156 | 157 | 158 | # defining Sarsa n 159 | 160 | 161 | 162 | def sarsa_n(n,env,estimator,gamma = 1.0,epsilon= 1.0): 163 | 164 | 165 | # create epslion greedy policy 166 | 167 | policy = make_epsilon_greedy_policy(estimator, epsilon, env.action_space.n) 168 | 169 | #Resetting the environment 170 | 171 | state = env.reset() 172 | 173 | action_probs = policy(state) 174 | 175 | action = np.random.choice(np.arange(len(action_probs)), p = action_probs) 176 | 177 | 178 | #setting up the stuff 179 | 180 | states = [state] 181 | actions = [ action] 182 | rewards = [0.0] 183 | 184 | #stepping through epsiodes 185 | 186 | T = float('inf') 187 | 188 | for t in itertools.count(): 189 | 190 | if t= 0: 223 | 224 | # Build target 225 | target = 0 226 | for i in range(update_time + 1, min(T, update_time + n) + 1): 227 | target += np.power(gamma, i - update_time - 1) * rewards[i] 228 | if update_time + n < T: 229 | q_values_next = estimator.predict(states[update_time + n]) 230 | target += q_values_next[actions[update_time + n]] 231 | 232 | # Update step 233 | estimator.update(states[update_time], actions[update_time], target) 234 | 235 | if update_time == T - 1: 236 | break 237 | 238 | state = next_state 239 | action = next_action 240 | 241 | ret = np.sum(rewards) 242 | 243 | return t, ret 244 | 245 | 246 | 247 | def sarsa_lambda(lmbda, env, estimator, gamma=1.0, epsilon=0): 248 | 249 | 250 | 251 | # Reset the eligibility trace 252 | estimator.reset(z_only=True) 253 | 254 | # Create epsilon-greedy policy 255 | policy = make_epsilon_greedy_policy( 256 | estimator, epsilon, env.action_space.n) 257 | 258 | # Reset the environment and pick the first action 259 | state = env.reset() 260 | action_probs = policy(state) 261 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 262 | 263 | ret = 0 264 | # Step through episode 265 | for t in itertools.count(): 266 | # Take a step 267 | next_state, reward, done, _ = env.step(action) 268 | ret += reward 269 | 270 | if done: 271 | target = reward 272 | estimator.update(state, action, target) 273 | break 274 | 275 | else: 276 | # Take next step 277 | next_action_probs = policy(next_state) 278 | next_action = np.random.choice( 279 | np.arange(len(next_action_probs)), p=next_action_probs) 280 | 281 | # Estimate q-value at next state-action 282 | q_new = estimator.predict( 283 | next_state, next_action)[0] 284 | target = reward + gamma * q_new 285 | # Update step 286 | estimator.update(state, action, target) 287 | estimator.z *= gamma * lmbda 288 | 289 | state = next_state 290 | action = next_action 291 | 292 | return t, ret 293 | 294 | 295 | 296 | 
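For reference, the pairing of QEstimator.update (with trace=True, using the replacing-trace line) and the z *= gamma * lmbda decay inside sarsa_lambda above amounts to the standard SARSA(lambda) update with replacing traces over binary tile features:

$$ \delta_t = r_{t+1} + \gamma\,\hat q(s_{t+1},a_{t+1}) - \hat q(s_t,a_t), \qquad z_t(i) = \begin{cases} 1 & i \text{ active for } (s_t,a_t) \\ \gamma\lambda\, z_{t-1}(i) & \text{otherwise,} \end{cases} \qquad w \leftarrow w + \alpha\,\delta_t\, z_t . $$

Re-enabling the commented-out self.z[features] += 1 line switches this to the accumulating-trace variant.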
297 | 298 | 299 | # plotting stuff 300 | 301 | def plot_cost_to_go(env, estimator, num_partitions=50): 302 | 303 | 304 | x = np.linspace(env.observation_space.low[0], env.observation_space.high[0], num=num_partitions) 305 | y = np.linspace(env.observation_space.low[1], env.observation_space.high[1], num=num_partitions) 306 | X, Y = np.meshgrid(x, y) 307 | Z = np.apply_along_axis( 308 | lambda obs: -np.max(estimator.predict(obs)), 2, np.stack([X, Y], axis=2)) 309 | 310 | fig, ax = plt.subplots(figsize=(10, 5)) 311 | p = ax.pcolor(X, Y, Z, cmap=cm.RdBu, vmin=0, vmax=200) 312 | 313 | ax.set_xlabel('Position') 314 | ax.set_ylabel('Velocity') 315 | ax.set_title("\"Cost To Go\" Function") 316 | fig.colorbar(p) 317 | plt.show() 318 | 319 | 320 | 321 | def generate_greedy_policy_animation(env, estimator, save_dir): 322 | """ 323 | Follows (deterministic) greedy policy 324 | with respect to the given q-value estimator 325 | and saves animation using openAI gym's Monitor 326 | wrapper. Monitor will throw an error if monitor 327 | files already exist in save_dir so use unique 328 | save_dir for each call. 329 | """ 330 | 331 | if not os.path.exists(save_dir): 332 | os.makedirs(save_dir) 333 | 334 | try: 335 | env = gym.wrappers.Monitor( 336 | env, save_dir, video_callable=lambda episode_id: True) 337 | except gym.error.Error as e: 338 | print(e.what()) 339 | 340 | # Set epsilon to zero to follow greedy policy 341 | policy = make_epsilon_greedy_policy( 342 | estimator=estimator, epsilon=0, num_actions=env.action_space.n) 343 | # Reset the environment 344 | state = env.reset() 345 | for t in itertools.count(): 346 | time.sleep(0.01) # Slow down animation 347 | action_probs = policy(state) # Compute action-values 348 | [action] = np.nonzero(action_probs)[0] # Greedy action 349 | state, _, done, _ = env.step(action) # Take step 350 | env.render() # Animate 351 | if done: 352 | print('Solved in {} steps'.format(t)) 353 | break 354 | 355 | 356 | 357 | def display_animation(filepath): 358 | """ Displays mp4 animation in Jupyter.""" 359 | 360 | video = io.open(filepath, 'r+b').read() 361 | encoded = base64.b64encode(video) 362 | return HTML(data=''''''.format(encoded.decode('ascii'))) 365 | 366 | 367 | 368 | def plot_learning_curves(stats, smoothing_window=10): 369 | """ 370 | Plots the number of steps taken by the agent 371 | to solve the task as a function of episode number, 372 | smoothed over the last smoothing_window episodes. 373 | """ 374 | 375 | plt.figure(figsize=(10,5)) 376 | for algo_stats in stats: 377 | steps_per_episode = pd.Series(algo_stats.steps).rolling( 378 | smoothing_window).mean() # smooth 379 | plt.plot(steps_per_episode, label=algo_stats.algorithm) 380 | plt.xlabel("Episode") 381 | plt.ylabel("Steps") 382 | plt.title("Steps per Episode") 383 | plt.legend() 384 | plt.show() 385 | 386 | 387 | 388 | def plot_grid_search(stats, truncate_steps=400): 389 | """ 390 | Plots average number of steps taken by the agent 391 | to solve the task for each combination of 392 | step size and boostrapping parameter 393 | (n or lambda). 394 | """ 395 | # Truncate high step values for clearer plotting 396 | stats.steps[stats.steps > truncate_steps] = truncate_steps 397 | 398 | # We use -1 step values indicate corresponding combination of 399 | # parameters doesn't converge. Set these to truncate_steps for plotting. 
400 | stats.steps[stats.steps == -1] = truncate_steps 401 | 402 | plt.figure() 403 | for b_idx in range(len(stats.bootstrappings)): 404 | plt.plot(stats.step_sizes, stats.steps[b_idx, :], 405 | label='Bootstrapping: {}'.format(stats.bootstrappings[b_idx])) 406 | plt.xlabel('Step size (alpha * number of tilings)') 407 | plt.ylabel('Average steps per episode') 408 | plt.title('Grid Search {}'.format(stats.algorithm)) 409 | plt.ylim(140, truncate_steps - 100) 410 | plt.legend() 411 | 412 | 413 | 414 | 415 | 416 | RunStats = namedtuple('RunStats', ['algorithm', 'steps', 'returns']) 417 | 418 | 419 | def run(algorithm, num_episodes=500, **algorithm_kwargs): 420 | 421 | """ 422 | Runs algorithm over multilple episodes and logs 423 | for each episode the complete return (G_t) and the 424 | number of steps taken. 425 | """ 426 | 427 | stats = RunStats( 428 | algorithm=algorithm, 429 | steps=np.zeros(num_episodes), 430 | returns=np.zeros(num_episodes)) 431 | 432 | algorithm_fn = globals()[algorithm] 433 | 434 | for i in range(num_episodes): 435 | episode_steps, episode_return = algorithm_fn(**algorithm_kwargs) 436 | stats.steps[i] = episode_steps 437 | stats.returns[i] = episode_return 438 | sys.stdout.flush() 439 | print("\rEpisode {}/{} Return {}".format( 440 | i + 1, num_episodes, episode_return), end="") 441 | return stats 442 | 443 | 444 | 445 | 446 | 447 | step_size = 0.5 # Fraction of the way we want to move towards target 448 | n = 4 # Level of bootstrapping (set to intermediate value) 449 | num_episodes = 500 450 | 451 | estimator_n = QEstimator(step_size=step_size) 452 | 453 | start_time = timeit.default_timer() 454 | run_stats_n = run('sarsa_n', num_episodes, n=n, env=env, estimator=estimator_n) 455 | elapsed_time = timeit.default_timer() - start_time 456 | 457 | plot_cost_to_go(env, estimator_n) 458 | print('{} episodes completed in {:.2f}s'.format(num_episodes, elapsed_time)) 459 | 460 | 461 | 462 | 463 | # Animate learned policy 464 | save_dir='./animations/n-step_sarsa/' 465 | generate_greedy_policy_animation(env, estimator_n, save_dir=save_dir) 466 | [filepath] = glob.glob(os.path.join(save_dir, '*.mp4')) 467 | display_animation(filepath) 468 | 469 | 470 | step_size = 0.5 # Fraction of the way we want to move towards target 471 | lmbda = 0.92 # Level of bootstrapping (set to intermediate value) 472 | num_episodes = 500 473 | 474 | estimator_lambda = QEstimator(step_size=step_size, trace=True) 475 | 476 | start_time = timeit.default_timer() 477 | run_stats_lambda = run('sarsa_lambda', num_episodes, lmbda=lmbda, env=env, estimator=estimator_lambda) 478 | elapsed_time = timeit.default_timer() - start_time 479 | 480 | plot_cost_to_go(env, estimator_lambda) 481 | print('{} episodes completed in {:.2f}s'.format(num_episodes, elapsed_time)) 482 | 483 | # Animate learned policy 484 | save_dir='./animations/sarsa_lambda/' 485 | generate_greedy_policy_animation(env, estimator_lambda, save_dir=save_dir) 486 | [filepath] = glob.glob(os.path.join(save_dir, '*.mp4')) 487 | display_animation(filepath) 488 | 489 | 490 | plot_learning_curves([run_stats_n, run_stats_lambda]) 491 | 492 | 493 | 494 | # comparing 495 | 496 | GridSearchStats = namedtuple('GridSearchStats', ['algorithm', 'steps', 'step_sizes', 'bootstrappings']) 497 | 498 | 499 | def run_grid_search(algorithm, step_sizes, bootstrappings, episodes=100, num_runs=5, 500 | **algorithm_kwargs): 501 | 502 | 503 | 504 | stats = GridSearchStats( 505 | algorithm=algorithm, 506 | steps=np.zeros((len(bootstrappings), len(step_sizes))), 507 | 
step_sizes=step_sizes, 508 | bootstrappings=bootstrappings) 509 | 510 | algorithm_fn = globals()[algorithm] 511 | trace = True if algorithm == 'sarsa_lambda' else False 512 | 513 | for run_idx in range(num_runs): 514 | for b_idx, bootstrapping in enumerate(bootstrappings): 515 | for s_idx, step_size in enumerate(step_sizes): 516 | if algorithm == 'sarsa_n': 517 | if (bootstrapping == 8 and step_size > 1) or \ 518 | (bootstrapping == 16 and step_size > 0.75): 519 | # sarsa_n doesn't converge in these cases so 520 | # assign a default value and skip over. 521 | stats.steps[b_idx, s_idx] = -1 * num_runs * episodes 522 | continue 523 | estimator = QEstimator(step_size=step_size, trace=trace) 524 | for episode in range(episodes): 525 | sys.stdout.flush() 526 | print('\r run: {}, step_size: {}, bootstrapping: {}, episode: {}'.format( 527 | run_idx, step_size, bootstrapping, episode), end="") 528 | episode_steps, _ = algorithm_fn( 529 | bootstrapping, estimator=estimator, **algorithm_kwargs) 530 | stats.steps[b_idx, s_idx] += episode_steps 531 | 532 | 533 | # Average over independent runs and episodes 534 | stats.steps[:] /= (num_runs * episodes) 535 | 536 | return stats 537 | 538 | 539 | step_sizes = np.arange(0.1, 1.8, 0.1) 540 | ns = np.power(2, np.arange(0, 5)) 541 | grid_search_stats_n = run_grid_search('sarsa_n', step_sizes, ns, env=env) 542 | plot_grid_search(grid_search_stats_n) 543 | 544 | step_sizes = np.arange(0.1, 1.8, 0.1) 545 | lambdas = np.array([0, 0.68, 0.84, 0.92, 0.98, 0.99]) 546 | grid_search_stats_lambda = run_grid_search('sarsa_lambda', step_sizes, lambdas, env=env) 547 | plot_grid_search(grid_search_stats_lambda) 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | -------------------------------------------------------------------------------- /Temporal-Difference/TD_Udacity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Aug 18 12:26:04 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import sys 10 | import gym 11 | import numpy as np 12 | import random 13 | import math 14 | from collections import defaultdict,deque 15 | import matplotlib.pyplot as plt 16 | 17 | import check_test 18 | from plot_utils import plot_values 19 | 20 | #creating the cliffenvironment 21 | 22 | env = gym.make('CliffWalking-v0') 23 | 24 | #print env-action-space 25 | print(env.action_space) 26 | print(env.observation_space) 27 | 28 | V_opt = np.zeros((4,12)) 29 | V_opt[0][0:13] = -np.arange(3, 15)[::-1] 30 | V_opt[1][0:13] = -np.arange(3, 15)[::-1] + 1 31 | V_opt[2][0:13] = -np.arange(3, 15)[::-1] + 2 32 | V_opt[3][0] = -13 33 | 34 | plot_values(V_opt) 35 | 36 | #TD control SarSA 37 | 38 | def update_Q_sarsa(alpha,gamma,Q,state,action,reward,next_state= None, next_action = None): 39 | 40 | current = Q[state][action] 41 | 42 | Qsa_next = Q[next_state][next_action]if next_state is not None else 0 43 | 44 | target = reward +(gamma * Qsa_next) 45 | 46 | new_value = current +(alpha*(target - current)) # getting the updated value 47 | 48 | return new_value 49 | 50 | #epsilon greedy 51 | 52 | def epsilon_greedy(Q,state,nA,eps): 53 | 54 | if random.random() >eps: 55 | 56 | return np.argmax(Q[state]) 57 | else: 58 | return random.choice(np.arange(env.action_space.n)) 59 | 60 | 61 | 62 | 63 | #sarsa algo 64 | 65 | def sarsa(env, num_episodes, alpha, gamma=1.0, plot_every=100): 66 | nA = env.action_space.n # number of actions 67 | Q = defaultdict(lambda: np.zeros(nA)) # initialize empty 
dictionary of arrays 68 | 69 | # monitor performance 70 | tmp_scores = deque(maxlen=plot_every) # deque for keeping track of scores 71 | avg_scores = deque(maxlen=num_episodes) # average scores over every plot_every episodes 72 | 73 | for i_episode in range(1, num_episodes+1): 74 | # monitor progress 75 | if i_episode % 100 == 0: 76 | print("\rEpisode {}/{}".format(i_episode, num_episodes), end="") 77 | sys.stdout.flush() 78 | score = 0 # initialize score 79 | state = env.reset() # start episode 80 | 81 | eps = 1.0 / i_episode # set value of epsilon 82 | action = epsilon_greedy(Q, state, nA, eps) # epsilon-greedy action selection 83 | 84 | while True: 85 | next_state, reward, done, info = env.step(action) # take action A, observe R, S' 86 | score += reward # add reward to agent's score 87 | if not done: 88 | next_action = epsilon_greedy(Q, next_state, nA, eps) # epsilon-greedy action 89 | Q[state][action] = update_Q_sarsa(alpha, gamma, Q, \ 90 | state, action, reward, next_state, next_action) 91 | 92 | state = next_state # S <- S' 93 | action = next_action # A <- A' 94 | if done: 95 | Q[state][action] = update_Q_sarsa(alpha, gamma, Q, \ 96 | state, action, reward) 97 | tmp_scores.append(score) # append score 98 | break 99 | if (i_episode % plot_every == 0): 100 | avg_scores.append(np.mean(tmp_scores)) 101 | 102 | # plot performance 103 | plt.plot(np.linspace(0,num_episodes,len(avg_scores),endpoint=False), np.asarray(avg_scores)) 104 | plt.xlabel('Episode Number') 105 | plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every) 106 | plt.show() 107 | # print best 100-episode performance 108 | print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(avg_scores)) 109 | return Q 110 | 111 | 112 | 113 | 114 | 115 | Q_sarsa = sarsa(env, 5000, .01) 116 | 117 | # print the estimated optimal policy 118 | policy_sarsa = np.array([np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1 for key in np.arange(48)]).reshape(4,12) 119 | check_test.run_check('td_control_check', policy_sarsa) 120 | print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):") 121 | print(policy_sarsa) 122 | 123 | # plot the estimated optimal state-value function 124 | V_sarsa = ([np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)]) 125 | plot_values(V_sarsa) 126 | 127 | 128 | 129 | # Q learning 130 | 131 | def update_Q_sarsamax(alpha,gamma,Q,state,action,reward,next_state =None): 132 | 133 | current = Q[state][action] 134 | Qsa_next = np.max(Q[next_state]) if next_state is not None else 0 135 | #constructing TD target 136 | target = reward +(gamma * Qsa_next) 137 | 138 | new_value = current +(alpha *(target - current)) 139 | 140 | return new_value 141 | 142 | 143 | 144 | def q_learning(env,num_episodes,alpha,gamma = 1.0,plot_every = 100): 145 | 146 | nA = env.action_space.n 147 | Q = defaultdict(lambda : np.zeros(nA)) 148 | 149 | tmp_scores = deque(maxlen=plot_every) # deque for keeping track of scores 150 | avg_scores = deque(maxlen=num_episodes) # average scores over every plot_every episodes 151 | 152 | for i_episode in range(1, num_episodes+1): 153 | # monitor progress 154 | if i_episode % 100 == 0: 155 | print("\rEpisode {}/{}".format(i_episode, num_episodes), end="") 156 | sys.stdout.flush() 157 | score = 0 # initialize score 158 | state = env.reset() # start episode 159 | eps = 1.0 / i_episode # set value of epsilon 160 | 161 | 162 | while True: 163 | action = epsilon_greedy(Q, state, nA, eps) 164 | next_state, reward,done , info = env.step(action) 165 | 166 
| score += reward 167 | Q[state][action] = update_Q_sarsamax(alpha, gamma, Q, state, action, reward,next_state) 168 | 169 | state= next_state 170 | 171 | 172 | if done: 173 | 174 | tmp_scores.append(score) # append score 175 | break 176 | 177 | if (i_episode % plot_every == 0): 178 | avg_scores.append(np.mean(tmp_scores)) 179 | 180 | 181 | #plot performances 182 | plt.plot(np.linspace(0,num_episodes,len(avg_scores),endpoint=False), np.asarray(avg_scores)) 183 | plt.xlabel('Episode Number') 184 | plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every) 185 | plt.show() 186 | # print best 100-episode performance 187 | print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(avg_scores)) 188 | return Q 189 | 190 | # obtain the estimated optimal policy and corresponding action-value function 191 | Q_sarsamax = q_learning(env, 5000, .01) 192 | 193 | # print the estimated optimal policy 194 | policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1 for key in np.arange(48)]).reshape((4,12)) 195 | check_test.run_check('td_control_check', policy_sarsamax) 196 | print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):") 197 | print(policy_sarsamax) 198 | 199 | # plot the estimated optimal state-value function 200 | plot_values([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0 for key in np.arange(48)]) 201 | 202 | 203 | #CONTROL EXPECTED SARSA 204 | 205 | def update_Q_expsarsa(alpha,gamma,nA,eps,Q,state,action,reward,next_state = None): 206 | 207 | current = Q[state][action] 208 | policy_s =np.ones(nA) *eps/nA # current policy 209 | policy_s[np.argmax(Q[next_state])] = 1-eps +(eps/nA) #greedy action 210 | Qsa_next = np.dot(Q[next_state()],policy_s) 211 | target = reward +(gamma* Qsa_next) 212 | new_value = current +(alpha*(target- current)) 213 | 214 | return new_value 215 | 216 | 217 | def expected_sarsa(env, num_episodes, alpha, gamma=1.0, plot_every=100): 218 | 219 | nA = env.action_space.n # number of actions 220 | Q = defaultdict(lambda: np.zeros(nA)) # initialize empty dictionary of arrays 221 | 222 | # monitor performance 223 | tmp_scores = deque(maxlen=plot_every) # deque for keeping track of scores 224 | avg_scores = deque(maxlen=num_episodes) # average scores over every plot_every episodes 225 | 226 | for i_episode in range(1, num_episodes+1): 227 | # monitor progress 228 | if i_episode % 100 == 0: 229 | print("\rEpisode {}/{}".format(i_episode, num_episodes), end="") 230 | sys.stdout.flush() 231 | 232 | score = 0 # initialize score 233 | state = env.reset() # start episode 234 | eps = 0.005 # set value of epsilon 235 | 236 | while True: 237 | action = epsilon_greedy(Q, state, nA, eps) # epsilon-greedy action selection 238 | next_state, reward, done, info = env.step(action) # take action A, observe R, S' 239 | score += reward # add reward to agent's score 240 | # update Q 241 | Q[state][action] = update_Q_expsarsa(alpha, gamma, nA, eps, Q, \ 242 | state, action, reward, next_state) 243 | state = next_state # S <- S' 244 | if done: 245 | tmp_scores.append(score) # append score 246 | break 247 | if (i_episode % plot_every == 0): 248 | avg_scores.append(np.mean(tmp_scores)) 249 | 250 | # plot performance 251 | plt.plot(np.linspace(0,num_episodes,len(avg_scores),endpoint=False), np.asarray(avg_scores)) 252 | plt.xlabel('Episode Number') 253 | plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every) 254 | plt.show() 255 | # print best 100-episode performance 256 | print(('Best Average Reward over %d 
Episodes: ' % plot_every), np.max(avg_scores)) 257 | return Q 258 | 259 | 260 | Q_expsarsa = expected_sarsa(env, 5000, 1) 261 | 262 | # print the estimated optimal policy 263 | policy_expsarsa = np.array([np.argmax(Q_expsarsa[key]) if key in Q_expsarsa else -1 for key in np.arange(48)]).reshape(4,12) 264 | check_test.run_check('td_control_check', policy_expsarsa) 265 | print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):") 266 | print(policy_expsarsa) 267 | 268 | # plot the estimated optimal state-value function 269 | plot_values([np.max(Q_expsarsa[key]) if key in Q_expsarsa else 0 for key in np.arange(48)]) 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | -------------------------------------------------------------------------------- /Temporal-Difference/__pycache__/check_test.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Temporal-Difference/__pycache__/check_test.cpython-38.pyc -------------------------------------------------------------------------------- /Temporal-Difference/__pycache__/plot_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Temporal-Difference/__pycache__/plot_utils.cpython-38.pyc -------------------------------------------------------------------------------- /Temporal-Difference/check_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Aug 18 12:31:14 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import unittest 11 | from IPython.display import Markdown, display 12 | import numpy as np 13 | 14 | def printmd(string): 15 | display(Markdown(string)) 16 | 17 | V_opt = np.zeros((4,12)) 18 | V_opt[0:13][0] = -np.arange(3, 15)[::-1] 19 | V_opt[0:13][1] = -np.arange(3, 15)[::-1] + 1 20 | V_opt[0:13][2] = -np.arange(3, 15)[::-1] + 2 21 | V_opt[3][0] = -13 22 | 23 | pol_opt = np.hstack((np.ones(11), 2, 0)) 24 | 25 | V_true = np.zeros((4,12)) 26 | for i in range(3): 27 | V_true[0:13][i] = -np.arange(3, 15)[::-1] - i 28 | V_true[1][11] = -2 29 | V_true[2][11] = -1 30 | V_true[3][0] = -17 31 | 32 | def get_long_path(V): 33 | return np.array(np.hstack((V[0:13][0], V[1][0], V[1][11], V[2][0], V[2][11], V[3][0], V[3][11]))) 34 | 35 | def get_optimal_path(policy): 36 | return np.array(np.hstack((policy[2][:], policy[3][0]))) 37 | 38 | class Tests(unittest.TestCase): 39 | 40 | def td_prediction_check(self, V): 41 | to_check = get_long_path(V) 42 | soln = get_long_path(V_true) 43 | np.testing.assert_array_almost_equal(soln, to_check) 44 | 45 | def td_control_check(self, policy): 46 | to_check = get_optimal_path(policy) 47 | np.testing.assert_equal(pol_opt, to_check) 48 | 49 | check = Tests() 50 | 51 | def run_check(check_name, func): 52 | try: 53 | getattr(check, check_name)(func) 54 | except check.failureException as e: 55 | printmd('**PLEASE TRY AGAIN**') 56 | return 57 | printmd('**PASSED**') -------------------------------------------------------------------------------- /Temporal-Difference/plot_graph.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Temporal-Difference/plot_graph.png -------------------------------------------------------------------------------- /Temporal-Difference/plot_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Aug 18 12:27:03 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import seaborn as sns 12 | sns.set_style("white") 13 | 14 | def plot_values(V): 15 | # reshape the state-value function 16 | V = np.reshape(V, (4,12)) 17 | # plot the state-value function 18 | fig = plt.figure(figsize=(15,5)) 19 | ax = fig.add_subplot(111) 20 | im = ax.imshow(V, cmap='cool') 21 | for (j,i),label in np.ndenumerate(V): 22 | ax.text(i, j, np.round(label,3), ha='center', va='center', fontsize=14) 23 | plt.tick_params(bottom='off', left='off', labelbottom='off', labelleft='off') 24 | plt.title('State-Value Function') 25 | plt.show() -------------------------------------------------------------------------------- /Tile-coding /Tile_coding_Uda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Aug 21 08:22:58 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | #importing stuff 10 | 11 | import sys 12 | import gym 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | 17 | 18 | #plotting stuff 19 | 20 | plt.style.use('ggplot') 21 | np.set_printoptions(precision=3, linewidth=120) 22 | 23 | #creating the environemetn 24 | env = gym.make('Acrobot-v1') 25 | env.seed(505) 26 | 27 | #Exploratorry state 28 | print(" ") 29 | print("State space:", env.observation_space) 30 | print("- low:", env.observation_space.low) 31 | print("- high:", env.observation_space.high) 32 | print(" ") 33 | 34 | # Explore action space 35 | 36 | print("Action space:", env.action_space) 37 | print(" ") 38 | 39 | 40 | #creatinf a random agent 41 | state = env.reset() 42 | score = 0 43 | 44 | for t in range(200): 45 | action = env.action_space.sample() 46 | #env.render() 47 | state,reward , done ,info = env.step(action) 48 | score += reward 49 | 50 | if done: 51 | break 52 | print("Final Score:",score) 53 | env.close() 54 | 55 | #Tiling 56 | 57 | def create_tiling_grid(low,high,bins = (10,10),offsets = (0.0,0.0)): 58 | 59 | grid = [np.linspace(low[dim], high[dim],bins[dim]+1)[1:-1] +offsets[dim] for dim in range(len(bins))] 60 | 61 | print(" ") 62 | print("Tiling: [, ] / + () => ") 63 | print(" ") 64 | for l, h, b, o, splits in zip(low, high, bins, offsets, grid): 65 | print(" [{}, {}] / {} + ({}) => {}".format(l, h, b, o, splits)) 66 | return grid 67 | 68 | 69 | #testing the tiling 70 | low = [-1.0, -5.0] 71 | high = [1.0, 5.0] 72 | create_tiling_grid(low, high, bins=(10, 10), offsets=(-0.1, 0.5)) 73 | 74 | def create_tilings(low,high,tiling_specs): 75 | 76 | return [create_tiling_grid(low, high, bins,offsets) for bins,offsets in tiling_specs] 77 | 78 | # Tiling specs: [(, ), ...] 
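# Worked example for the specs below (computed from create_tiling_grid above): with
# low = [-1.0, -5.0], high = [1.0, 5.0] and bins = (10, 10), each spec is a 10x10 tiling,
# and the first offset (-0.066, -0.33) shifts the 9 interior split points per dimension to
#   x: [-0.866, -0.666, -0.466, -0.266, -0.066, 0.134, 0.334, 0.534, 0.734]
#   y: [-4.33, -3.33, -2.33, -1.33, -0.33, 0.67, 1.67, 2.67, 3.67]
# The second spec is the unshifted grid and the third shifts by (+0.066, +0.33).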
79 | tiling_specs = [((10, 10), (-0.066, -0.33)), 80 | ((10, 10), (0.0, 0.0)), 81 | ((10, 10), (0.066, 0.33))] 82 | tilings = create_tilings(low, high, tiling_specs) 83 | 84 | from matplotlib.lines import Line2D 85 | 86 | def visualize_tilings(tilings): 87 | """Plot each tiling as a grid.""" 88 | prop_cycle = plt.rcParams['axes.prop_cycle'] 89 | colors = prop_cycle.by_key()['color'] 90 | linestyles = ['-', '--', ':'] 91 | legend_lines = [] 92 | 93 | fig, ax = plt.subplots(figsize=(10, 10)) 94 | for i, grid in enumerate(tilings): 95 | for x in grid[0]: 96 | l = ax.axvline(x=x, color=colors[i % len(colors)], linestyle=linestyles[i % len(linestyles)], label=i) 97 | for y in grid[1]: 98 | l = ax.axhline(y=y, color=colors[i % len(colors)], linestyle=linestyles[i % len(linestyles)]) 99 | legend_lines.append(l) 100 | ax.grid('off') 101 | ax.legend(legend_lines, ["Tiling #{}".format(t) for t in range(len(legend_lines))], facecolor='white', framealpha=0.9) 102 | ax.set_title("Tilings") 103 | return ax # return Axis object to draw on later, if needed 104 | 105 | 106 | visualize_tilings(tilings); 107 | 108 | #Discretize 109 | 110 | def discretize(sample,grid): 111 | return tuple(int(np.digitize(s, g)) for s,g in zip(sample,grid)) # applying along each dimension 112 | 113 | def tile_encode(sample,tilings,flatten =False): 114 | 115 | #encoded sample 116 | encoded_sample = [discretize(sample, grid) for grid in tilings] 117 | return np.concatenate(encoded_sample) if flatten else encoded_sample 118 | 119 | # Test with some sample values 120 | samples = [(-1.2 , -5.1 ), 121 | (-0.75, 3.25), 122 | (-0.5 , 0.0 ), 123 | ( 0.25, -1.9 ), 124 | ( 0.15, -1.75), 125 | ( 0.75, 2.5 ), 126 | ( 0.7 , -3.7 ), 127 | ( 1.0 , 5.0 )] 128 | encoded_samples = [tile_encode(sample, tilings) for sample in samples] 129 | print(" ") 130 | print("\nSamples:", repr(samples), sep="\n") 131 | print("\nEncoded samples:", repr(encoded_samples), sep="\n") 132 | print(" ") 133 | 134 | 135 | # Visulalizing the tiling 136 | 137 | from matplotlib.patches import Rectangle 138 | 139 | def visualize_encoded_samples(samples, encoded_samples, tilings, low=None, high=None): 140 | """Visualize samples by activating the respective tiles.""" 141 | samples = np.array(samples) # for ease of indexing 142 | 143 | # Show tiling grids 144 | ax = visualize_tilings(tilings) 145 | 146 | # If bounds (low, high) are specified, use them to set axis limits 147 | if low is not None and high is not None: 148 | ax.set_xlim(low[0], high[0]) 149 | ax.set_ylim(low[1], high[1]) 150 | else: 151 | # Pre-render (invisible) samples to automatically set reasonable axis limits, and use them as (low, high) 152 | ax.plot(samples[:, 0], samples[:, 1], 'o', alpha=0.0) 153 | low = [ax.get_xlim()[0], ax.get_ylim()[0]] 154 | high = [ax.get_xlim()[1], ax.get_ylim()[1]] 155 | 156 | # Map each encoded sample (which is really a list of indices) to the corresponding tiles it belongs to 157 | tilings_extended = [np.hstack((np.array([low]).T, grid, np.array([high]).T)) for grid in tilings] # add low and high ends 158 | tile_centers = [(grid_extended[:, 1:] + grid_extended[:, :-1]) / 2 for grid_extended in tilings_extended] # compute center of each tile 159 | tile_toplefts = [grid_extended[:, :-1] for grid_extended in tilings_extended] # compute topleft of each tile 160 | tile_bottomrights = [grid_extended[:, 1:] for grid_extended in tilings_extended] # compute bottomright of each tile 161 | 162 | prop_cycle = plt.rcParams['axes.prop_cycle'] 163 | colors = prop_cycle.by_key()['color'] 164 
| for sample, encoded_sample in zip(samples, encoded_samples): 165 | for i, tile in enumerate(encoded_sample): 166 | # Shade the entire tile with a rectangle 167 | topleft = tile_toplefts[i][0][tile[0]], tile_toplefts[i][1][tile[1]] 168 | bottomright = tile_bottomrights[i][0][tile[0]], tile_bottomrights[i][1][tile[1]] 169 | ax.add_patch(Rectangle(topleft, bottomright[0] - topleft[0], bottomright[1] - topleft[1], 170 | color=colors[i], alpha=0.33)) 171 | 172 | # In case sample is outside tile bounds, it may not have been highlighted properly 173 | if any(sample < topleft) or any(sample > bottomright): 174 | # So plot a point in the center of the tile and draw a connecting line 175 | cx, cy = tile_centers[i][0][tile[0]], tile_centers[i][1][tile[1]] 176 | ax.add_line(Line2D([sample[0], cx], [sample[1], cy], color=colors[i])) 177 | ax.plot(cx, cy, 's', color=colors[i]) 178 | 179 | # Finally, plot original samples 180 | ax.plot(samples[:, 0], samples[:, 1], 'o', color='r') 181 | 182 | ax.margins(x=0, y=0) # remove unnecessary margins 183 | ax.set_title("Tile-encoded samples") 184 | return ax 185 | 186 | visualize_encoded_samples(samples, encoded_samples, tilings); 187 | 188 | 189 | # Now a Q-table with tile coding 190 | 191 | class QTable: 192 | 193 | def __init__(self,state_size, action_size): 194 | self.state_size = state_size 195 | self.action_size = action_size 196 | 197 | self.q_table = np.zeros(shape= (self.state_size + (self.action_size,))) 198 | print(" ") 199 | print(" Q Table size = ", self.q_table.shape) 200 | 201 | 202 | # Now with the tile coding part 203 | 204 | class TiledQTable: 205 | 206 | 207 | def __init__(self, low, high, tiling_specs, action_size): 208 | 209 | self.tilings = create_tilings(low, high, tiling_specs) 210 | self.state_sizes = [tuple(len(splits)+1 for splits in tiling_grid) for tiling_grid in self.tilings] 211 | self.action_size = action_size 212 | self.q_tables = [QTable(state_size, self.action_size) for state_size in self.state_sizes] 213 | print("TiledQTable(): no. 
of internal tables = ", len(self.q_tables)) 214 | 215 | def get(self, state, action): 216 | 217 | 218 | encoded_state = tile_encode(state, self.tilings) 219 | 220 | 221 | value = 0.0 222 | for idx, q_table in zip(encoded_state, self.q_tables): 223 | value += q_table.q_table[tuple(idx + (action,))] 224 | value /= len(self.q_tables) 225 | return value 226 | 227 | def update(self, state, action, value, alpha=0.1): 228 | 229 | 230 | encoded_state = tile_encode(state, self.tilings) 231 | 232 | 233 | for idx, q_table in zip(encoded_state, self.q_tables): 234 | value_ = q_table.q_table[tuple(idx + (action,))] # current value 235 | q_table.q_table[tuple(idx + (action,))] = alpha * value + (1.0 - alpha) * value_ 236 | 237 | 238 | 239 | 240 | 241 | # Test with a sample Q-table 242 | tq = TiledQTable(low, high, tiling_specs, 2) 243 | s1 = 3; s2 = 4; a = 0; q = 1.0 244 | print("[GET] Q({}, {}) = {}".format(samples[s1], a, tq.get(samples[s1], a))) # check value at sample = s1, action = a 245 | print("[UPDATE] Q({}, {}) = {}".format(samples[s2], a, q)); tq.update(samples[s2], a, q) # update value for sample with some common tile(s) 246 | print("[GET] Q({}, {}) = {}".format(samples[s1], a, tq.get(samples[s1], a))) 247 | print(" ") 248 | 249 | 250 | class QLearningAgent: 251 | 252 | 253 | def __init__(self, env, tq, alpha=0.02, gamma=0.99, 254 | epsilon=1.0, epsilon_decay_rate=0.9995, min_epsilon=.01, seed=0): 255 | """Initialize variables, create grid for discretization.""" 256 | # Environment info 257 | self.env = env 258 | self.tq = tq 259 | self.state_sizes = tq.state_sizes # list of state sizes for each tiling 260 | self.action_size = self.env.action_space.n # 1-dimensional discrete action space 261 | self.seed = np.random.seed(seed) 262 | print("Environment:", self.env) 263 | print("State space sizes:", self.state_sizes) 264 | print("Action space size:", self.action_size) 265 | 266 | # Learning parameters 267 | self.alpha = alpha # learning rate 268 | self.gamma = gamma # discount factor 269 | self.epsilon = self.initial_epsilon = epsilon # initial exploration rate 270 | self.epsilon_decay_rate = epsilon_decay_rate # how quickly should we decrease epsilon 271 | self.min_epsilon = min_epsilon 272 | 273 | def reset_episode(self, state): 274 | """Reset variables for a new episode.""" 275 | # Gradually decrease exploration rate 276 | self.epsilon *= self.epsilon_decay_rate 277 | self.epsilon = max(self.epsilon, self.min_epsilon) 278 | 279 | self.last_state = state 280 | Q_s = [self.tq.get(state, action) for action in range(self.action_size)] 281 | self.last_action = np.argmax(Q_s) 282 | return self.last_action 283 | 284 | def reset_exploration(self, epsilon=None): 285 | """Reset exploration rate used when training.""" 286 | self.epsilon = epsilon if epsilon is not None else self.initial_epsilon 287 | 288 | def act(self, state, reward=None, done=None, mode='train'): 289 | """Pick next action and update internal Q table (when mode != 'test').""" 290 | Q_s = [self.tq.get(state, action) for action in range(self.action_size)] 291 | # Pick the best action from Q table 292 | greedy_action = np.argmax(Q_s) 293 | if mode == 'test': 294 | # Test mode: Simply produce an action 295 | action = greedy_action 296 | else: 297 | # Train mode (default): Update Q table, pick next action 298 | # Note: We update the Q table entry for the *last* (state, action) pair with current state, reward 299 | value = reward + self.gamma * max(Q_s) 300 | self.tq.update(self.last_state, self.last_action, value, self.alpha) 301 | 302 | # 
Exploration vs. exploitation 303 | do_exploration = np.random.uniform(0, 1) < self.epsilon 304 | if do_exploration: 305 | # Pick a random action 306 | action = np.random.randint(0, self.action_size) 307 | else: 308 | # Pick the greedy action 309 | action = greedy_action 310 | 311 | # Roll over current state, action for next step 312 | self.last_state = state 313 | self.last_action = action 314 | return action 315 | 316 | 317 | n_bins = 5 318 | bins = tuple([n_bins]*env.observation_space.shape[0]) 319 | offset_pos = (env.observation_space.high - env.observation_space.low)/(3*n_bins) 320 | 321 | tiling_specs = [(bins, -offset_pos), 322 | (bins, tuple([0.0]*env.observation_space.shape[0])), 323 | (bins, offset_pos)] 324 | 325 | tq = TiledQTable(env.observation_space.low, 326 | env.observation_space.high, 327 | tiling_specs, 328 | env.action_space.n) 329 | agent = QLearningAgent(env, tq) 330 | 331 | def run(agent, env, num_episodes=10000, mode='train'): 332 | """Run agent in given reinforcement learning environment and return scores.""" 333 | scores = [] 334 | max_avg_score = -np.inf 335 | for i_episode in range(1, num_episodes+1): 336 | # Initialize episode 337 | state = env.reset() 338 | action = agent.reset_episode(state) 339 | total_reward = 0 340 | done = False 341 | 342 | # Roll out steps until done 343 | while not done: 344 | state, reward, done, info = env.step(action) 345 | total_reward += reward 346 | action = agent.act(state, reward, done, mode) 347 | 348 | 349 | 350 | # Save final score 351 | scores.append(total_reward) 352 | 353 | # Print episode stats 354 | if mode == 'train': 355 | if len(scores) > 100: 356 | avg_score = np.mean(scores[-100:]) 357 | if avg_score > max_avg_score: 358 | max_avg_score = avg_score 359 | if i_episode % 100 == 0: 360 | print("\rEpisode {}/{} | Max Average Score: {}".format(i_episode, num_episodes, max_avg_score), end="") 361 | sys.stdout.flush() 362 | 363 | 364 | return scores 365 | 366 | scores = run(agent, env) 367 | 368 | 369 | def plot_scores(scores, rolling_window=100): 370 | """Plot scores and optional rolling mean using specified window.""" 371 | plt.plot(scores); plt.title("Scores"); 372 | rolling_mean = pd.Series(scores).rolling(rolling_window).mean() 373 | plt.plot(rolling_mean); 374 | return rolling_mean 375 | 376 | rolling_mean = plot_scores(scores) 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | -------------------------------------------------------------------------------- /Upper-Confidence-Bound /Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Upper-Confidence-Bound /Figure_1.png -------------------------------------------------------------------------------- /Upper-Confidence-Bound /UCB.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 24 22:25:26 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import pandas as pd 13 | 14 | class ucb_bandit: 15 | 16 | 17 | def __init__(self, k ,c, iters, mu = 'random'): 18 | # number of arms 19 | 20 | self.k = k 21 | # the condindencre bound(exploration parameter) 22 | self.c = c 23 | 24 | # number 
of iters 25 | self.iters = iters 26 | 27 | #no of times steps 28 | self.n = 1 29 | #step count for each arm 30 | self.k_n =np.ones(k) 31 | #mean reward 32 | self.mean_reward = 0 33 | self.reward = np.zeros(iters) 34 | 35 | #reward for each arm 36 | self.k_reward=np.zeros(k) 37 | 38 | if type(mu) == list or type(mu).__module__ == np.__name__: 39 | #user defined average 40 | self.mu = np.array(mu) 41 | elif mu == 'random': 42 | #draws random from the prob distrubtytion 43 | self.mu = np.random.normal(0,1,k) 44 | elif mu == 'sequence': 45 | self.mu = np.linspace(0, k-1, k) 46 | 47 | def pull(self): 48 | 49 | a = np.argmax(self.k_reward +self.c *np.sqrt(np.log(self.n) / self.k_n)) 50 | 51 | reward = np.random.normal(self.mu[a],1) 52 | 53 | #updte 54 | self.n +=1 55 | self.k_n[a]+=1 56 | 57 | 58 | #update total 59 | self.mean_reward = self.mean_reward +(reward - self.mean_reward)/self.n 60 | 61 | self.k_reward[a] = self.k_reward[a] +(reward - self.k_reward[a])/self.k_n[a] 62 | 63 | def run(self): 64 | for i in range(self.iters): 65 | self.pull() 66 | self.reward[i]= self.mean_reward 67 | 68 | def reset(self,mu = 'none'): 69 | 70 | self.n = 1 71 | self.k_n = np.ones(self.k) 72 | self.mean_reward = 0 73 | self.reward = np.zeros(iters) 74 | self.k_reward = np.zeros(self.k) 75 | if mu == 'random': 76 | self.mu = np.random.normal(0, 1, self.k) 77 | 78 | k = 10 # number of arms 79 | iters = 1000 80 | ucb_rewards = np.zeros(iters) 81 | # Initialize bandits 82 | ucb = ucb_bandit(k, 2, iters) 83 | episodes = 1000 84 | # Run experiments 85 | for i in range(episodes): 86 | ucb.reset('random') 87 | # Run experiments 88 | ucb.run() 89 | 90 | # Update long-term averages 91 | ucb_rewards = ucb_rewards + ( 92 | ucb.reward - ucb_rewards) / (i + 1) 93 | 94 | plt.figure(figsize=(12,8)) 95 | plt.plot(ucb_rewards, label="UCB") 96 | plt.legend(bbox_to_anchor=(1.2, 0.5)) 97 | plt.xlabel("Iterations") 98 | plt.ylabel("Average Reward") 99 | plt.title("Average UCB Rewards after " 100 | + str(episodes) + " Episodes") 101 | plt.show() 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-merlot -------------------------------------------------------------------------------- /cheatsheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/cheatsheet.pdf --------------------------------------------------------------------------------
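As a closing note on Upper-Confidence-Bound/UCB.py above: the pull() method implements the classic UCB action-selection rule

$$ a_t = \arg\max_a \Bigl[\, \bar Q(a) + c \sqrt{ \ln n / N(a) } \,\Bigr], $$

where \bar Q(a) is the incrementally updated mean reward per arm (self.k_reward), N(a) is the per-arm pull count (self.k_n, initialised to ones so the bonus is defined on the first step), n is the total number of steps taken, and c is the exploration parameter passed to the constructor.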