├── .DS_Store ├── .gitattributes ├── Atari-GAN └── atari-gan-generation.py ├── BlackJack_First_Vist_MC ├── .DS_Store ├── BlackJack_MC_uda.py ├── Figure 2020-08-16 113558.png ├── Figure_1.png ├── Figure_2.png ├── Monte_Carlo_FirstVisit_BlackJack.py ├── __pycache__ │ └── plot_utils.cpython-38.pyc └── plot_utils.py ├── Cart_pole ├── .DS_Store ├── cartpole_ActionWrapper.py ├── cartpole_cross_entropy.py ├── cartpole_random.py ├── cartpole_random_monitor.py └── runs │ ├── Sep20_10-07-14_Pavans-MacBook-Pro.local-cartpole │ └── events.out.tfevents.1600576634.Pavans-MacBook-Pro.local │ ├── Sep20_10-07-59_Pavans-MacBook-Pro.local-cartpole │ └── events.out.tfevents.1600576679.Pavans-MacBook-Pro.local │ └── Sep29_12-52-32_Pavans-MacBook-Pro.local-cartpole │ └── events.out.tfevents.1601364152.Pavans-MacBook-Pro.local ├── Cross-Entrorpy ├── CEM_method.py └── checkpoint.pth ├── Deep-Q-Learning ├── .DS_Store ├── dqn_pong.py └── lib_dep │ ├── __pycache__ │ ├── dqn_model.cpython-38.pyc │ └── wrappers.cpython-38.pyc │ ├── dqn_model.py │ └── wrappers.py ├── Deep-Q-Network ├── Deep_Q_network.py ├── __pycache__ │ ├── dqn_agent.cpython-38.pyc │ └── model.cpython-38.pyc ├── dqn_agent.py ├── lunar_lander_test.py └── model.py ├── Discretization └── Discretization_udacity.py ├── Frozen_lake ├── .DS_Store ├── .ipynb_checkpoints │ └── Untitled-checkpoint.ipynb └── Frozen_lake_v_0.py ├── Gradient_Bandit ├── Figure_1.png └── gradient_bandit.py ├── K-armed-Bandit ├── .DS_Store ├── K-armed_Bandit-Problem.py └── output │ ├── actions.png │ └── rewards.png ├── K-armed-Greedy ├── Figure_1.png ├── Figure_2.png ├── Figure_3.png ├── Figure_4.png ├── Figure_5.png └── K-armed-Greedy-and-rest.py ├── Monte_Carlo_Frozen_lake ├── .DS_Store └── MC_Frozenlake.py ├── MountainCar_Q_learn └── mountainCarQlearn.py ├── Off_Policy_Monte_Carlo └── Off_Policy_MC.py ├── Pac-Man └── pacman_DQN.py ├── Ping_pong ├── .DS_Store └── ping_pong.py ├── Policy_eval_Grid_World ├── .DS_Store ├── Figure_1.png └── policy_eval_GridWorld.py ├── Q-Learning └── Q-learning.py ├── README.md ├── Reinforce └── policy_graident.py ├── Sarsa ├── Sarsa.py └── n-Sarsa_and_Sarsa(lambda).py ├── Temporal-Difference ├── TD_Udacity.py ├── __pycache__ │ ├── check_test.cpython-38.pyc │ └── plot_utils.cpython-38.pyc ├── check_test.py ├── plot_graph.png └── plot_utils.py ├── Tile-coding └── Tile_coding_Uda.py ├── Upper-Confidence-Bound ├── Figure_1.png └── UCB.py ├── _config.yml └── cheatsheet.pdf /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /Atari-GAN/atari-gan-generation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Sep 18 16:38:32 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import random 10 | import argparse 11 | import cv2 12 | 13 | #torch 14 | import torch 15 | import torch.nn as nn 16 | import torch.optim as optim 17 | from tensorboardX import SummaryWriter 18 | 19 | import torchvision.utils as vutils 20 | 21 | #GYM 22 | import gym 23 | import 
gym.spaces 24 | import numpy as np 25 | 26 | log = gym.logger 27 | log.setLevel(gym.logger.INFO) 28 | 29 | 30 | LATENT_VECTOR_SIZE = 100 31 | DISCR_FILTERS = 64 32 | GENER_FILTERS = 64 33 | BATCH_SIZE = 16 34 | 35 | # dimension input image will be rescaled 36 | IMAGE_SIZE = 64 37 | 38 | LEARNING_RATE = 0.0001 39 | REPORT_EVERY_ITER = 100 40 | SAVE_IMAGE_EVERY_ITER = 1000 41 | 42 | class InputWrapper(gym.ObservationWrapper): 43 | 44 | def __init__(self, *args): 45 | super(InputWrapper, self).__init__(*args) 46 | assert isinstance(self.observation_space,gym.spaces.Box) 47 | old_space = self.observation_space 48 | self.observation_space = gym.spaces.Box( 49 | self.observation(old_space.low), 50 | self.observation(old_space.high), 51 | dtype=np.float32) 52 | 53 | 54 | def observation(self,observation): 55 | 56 | #resizing the imagw 57 | new_obs = cv2.resize(observation , (IMAGE_SIZE,IMAGE_SIZE)) 58 | # transform (210, 160, 3) -> (3, 210, 160) 59 | new_obs = np.moveaxis(new_obs, 2, 0) 60 | return new_obs.astype(np.float32) 61 | 62 | class Discriminator(nn.Module): 63 | 64 | def __init__(self,input_shape): 65 | super(Discriminator,self).__init__() 66 | 67 | #convering images into single number 68 | self.conv_pipe = nn.Sequential( 69 | nn.Conv2d(in_channels= input_shape[0], out_channels = DISCR_FILTERS 70 | , kernel_size = 4 , stride=2 , padding=1), 71 | nn.ReLU(), 72 | 73 | nn.Conv2d(in_channels= DISCR_FILTERS, out_channels = DISCR_FILTERS *2 74 | , kernel_size = 4 , stride=2 , padding=1), 75 | nn.BatchNorm2d(DISCR_FILTERS *2), 76 | nn.ReLU(), 77 | 78 | nn.Conv2d(in_channels= DISCR_FILTERS *2, out_channels = DISCR_FILTERS *4 79 | , kernel_size = 4 , stride=2 , padding=1), 80 | nn.BatchNorm2d(DISCR_FILTERS *4), 81 | nn.ReLU(), 82 | 83 | nn.Conv2d(in_channels= DISCR_FILTERS *4, out_channels = DISCR_FILTERS *8 84 | , kernel_size = 4 , stride=2 , padding=1), 85 | nn.BatchNorm2d(DISCR_FILTERS *8), 86 | nn.ReLU(), 87 | 88 | nn.Conv2d(in_channels=DISCR_FILTERS * 8, out_channels=1, 89 | kernel_size=4, stride=1, padding=0), 90 | nn.Sigmoid() 91 | 92 | ) 93 | 94 | def forward(self,x): 95 | conv_out = self.conv_pipe(x) 96 | 97 | return conv_out.view(-1,1).squeeze(dim = 1) 98 | 99 | class Generator(nn.Module): 100 | 101 | def __init__(self,output_shape): 102 | super(Generator,self).__init__() 103 | # deconvise 104 | self.pipe = nn.Sequential( 105 | nn.ConvTranspose2d(in_channels = LATENT_VECTOR_SIZE, out_channels = GENER_FILTERS*8 106 | , kernel_size = 4,stride = 1, padding = 0), 107 | 108 | nn.BatchNorm2d(GENER_FILTERS *8), 109 | nn.ReLU(), 110 | 111 | nn.ConvTranspose2d(in_channels = GENER_FILTERS *8, out_channels = GENER_FILTERS*4 112 | , kernel_size = 4,stride = 2, padding = 1), 113 | 114 | nn.BatchNorm2d(GENER_FILTERS *4), 115 | nn.ReLU(), 116 | 117 | nn.ConvTranspose2d(in_channels =GENER_FILTERS *4, out_channels = GENER_FILTERS*2 118 | , kernel_size = 4,stride = 1, padding = 0), 119 | 120 | nn.BatchNorm2d(GENER_FILTERS *2), 121 | nn.ReLU(), 122 | 123 | nn.ConvTranspose2d(in_channels =GENER_FILTERS *2, out_channels = GENER_FILTERS 124 | , kernel_size = 4,stride = 1, padding = 0), 125 | 126 | nn.BatchNorm2d(GENER_FILTERS ), 127 | nn.ReLU(), 128 | 129 | nn.ConvTranspose2d(in_channels=GENER_FILTERS, out_channels=output_shape[0], 130 | kernel_size=4, stride=2, padding=1), 131 | nn.Tanh() 132 | 133 | 134 | ) 135 | 136 | def forward(self,x): 137 | return self.pipe(x) 138 | 139 | 140 | def iterate_batches(envs, batch_size = BATCH_SIZE): 141 | 142 | batch = [e.reset() for e in envs] 143 | env_gen = iter(lambda 
: random.choice(envs), None) 144 | 145 | 146 | while True: 147 | 148 | e = next(env_gen) 149 | 150 | obs, reward,is_done,_ = e.step(e.action_sample.sample()) 151 | 152 | if np.mean(obs) > 0.01: 153 | batch.append(obs) 154 | 155 | if len(batch) == batch_size: 156 | #normalising betn 1 and -1 157 | batch_np = np.array(batch, dtype=np.float32) 158 | batch_np *= 2.0 / 255.0 - 1.0 159 | 160 | yield torch.tensor(batch_np) 161 | batch.clear() 162 | 163 | if is_done: 164 | e.reset() 165 | 166 | if __name__ == 'main': 167 | 168 | parser= argparse.ArgumentParser() 169 | parser.add_argument( "--cuda", default=False, action='store_true', 170 | help="Enable cuda computation") 171 | 172 | args = parser.parse_args() 173 | 174 | device= torch.device('cuda' if args.cuda else 'cpu') 175 | 176 | envs = [ 177 | InputWrapper(gym.make(name)) 178 | for name in ('Breakout-v0','AirRaid-v0', 'Pong-v0') 179 | ] 180 | input_shape = envs[0].observation_space.shape 181 | 182 | net_discr = Discriminator(input_shape=input_shape).to(device) 183 | net_gener = Generator(output_shape=input_shape).to(device) 184 | 185 | objective = nn.BCELoss() 186 | 187 | gen_optimizer = optim.Adam(params=net_gener.parameters(), lr=LEARNING_RATE, 188 | betas=(0.5, 0.999)) 189 | 190 | dis_optimizer = optim.Adam( 191 | params=net_discr.parameters(), lr=LEARNING_RATE, 192 | betas=(0.5, 0.999)) 193 | writer = SummaryWriter() 194 | 195 | gen_losses = [] 196 | dis_losses = [] 197 | iter_no = 0 198 | 199 | true_labels_v = torch.ones(BATCH_SIZE, device=device) 200 | fake_labels_v = torch.zeros(BATCH_SIZE, device=device) 201 | 202 | 203 | for batch_v in iterate_batches(envs): 204 | # fake samples, input is 4D: batch, filters, x, y 205 | gen_input_v = torch.FloatTensor( 206 | BATCH_SIZE, LATENT_VECTOR_SIZE, 1, 1) 207 | gen_input_v.normal_(0, 1) 208 | gen_input_v = gen_input_v.to(device) 209 | batch_v = batch_v.to(device) 210 | gen_output_v = net_gener(gen_input_v) 211 | 212 | 213 | # train discriminator 214 | dis_optimizer.zero_grad() 215 | dis_output_true_v = net_discr(batch_v) 216 | dis_output_fake_v = net_discr(gen_output_v.detach()) 217 | dis_loss = objective(dis_output_true_v, true_labels_v) + \ 218 | objective(dis_output_fake_v, fake_labels_v) 219 | dis_loss.backward() 220 | dis_optimizer.step() 221 | dis_losses.append(dis_loss.item()) 222 | 223 | # train generator 224 | gen_optimizer.zero_grad() 225 | dis_output_v = net_discr(gen_output_v) 226 | gen_loss_v = objective(dis_output_v, true_labels_v) 227 | gen_loss_v.backward() 228 | gen_optimizer.step() 229 | gen_losses.append(gen_loss_v.item()) 230 | 231 | iter_no += 1 232 | if iter_no % REPORT_EVERY_ITER == 0: 233 | log.info("Iter %d: gen_loss=%.3e, dis_loss=%.3e", 234 | iter_no, np.mean(gen_losses), 235 | np.mean(dis_losses)) 236 | writer.add_scalar( 237 | "gen_loss", np.mean(gen_losses), iter_no) 238 | writer.add_scalar( 239 | "dis_loss", np.mean(dis_losses), iter_no) 240 | gen_losses = [] 241 | dis_losses = [] 242 | if iter_no % SAVE_IMAGE_EVERY_ITER == 0: 243 | writer.add_image("fake", vutils.make_grid( 244 | gen_output_v.data[:64], normalize=True), iter_no) 245 | writer.add_image("real", vutils.make_grid( 246 | batch_v.data[:64], normalize=True), iter_no) 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/BlackJack_First_Vist_MC/.DS_Store -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/BlackJack_MC_uda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Aug 16 11:20:06 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import sys 11 | import gym 12 | import numpy as np 13 | from collections import defaultdict 14 | 15 | from plot_utils import plot_blackjack_values, plot_policy 16 | 17 | env = gym.make('Blackjack-v0') 18 | 19 | #checking the env 20 | print(env.observation_space) 21 | print(env.action_space) 22 | print(" ") 23 | 24 | #using blackjack env to play random state 25 | 26 | for i in range(3): 27 | state = env.reset() 28 | while True: 29 | print(state) 30 | action = env.action_space.sample() 31 | state,reward,done,info= env.step(action) 32 | 33 | if done: 34 | 35 | print('End game! Reward: ', reward) 36 | print('You won :)\n') if reward > 0 else print('You lost :(\n') 37 | 38 | break 39 | 40 | 41 | def generate_episode_from_limit_stochastic(bj_env): 42 | episode = [] 43 | state = bj_env.reset() 44 | while True: 45 | probs = [0.8, 0.2] if state[0] > 18 else [0.2, 0.8] 46 | action = np.random.choice(np.arange(2), p=probs) 47 | next_state, reward, done, info = bj_env.step(action) 48 | episode.append((state, action, reward)) 49 | state = next_state 50 | if done: 51 | break 52 | return episode 53 | 54 | 55 | for i in range(3): 56 | print(generate_episode_from_limit_stochastic(env)) 57 | 58 | 59 | def mc_prediction_q(env, num_episodes, generate_episode, gamma=1.0): 60 | # initialize empty dictionaries of arrays 61 | returns_sum = defaultdict(lambda: np.zeros(env.action_space.n)) 62 | N = defaultdict(lambda: np.zeros(env.action_space.n)) 63 | Q = defaultdict(lambda: np.zeros(env.action_space.n)) 64 | # loop over episodes 65 | for i_episode in range(1, num_episodes+1): 66 | # monitor progress 67 | if i_episode % 1000 == 0: 68 | print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="") 69 | sys.stdout.flush() 70 | # generate an episode 71 | episode = generate_episode(env) 72 | # obtain the states, actions, and rewards 73 | states, actions, rewards = zip(*episode) 74 | # prepare for discounting 75 | discounts = np.array([gamma**i for i in range(len(rewards)+1)]) 76 | # update the sum of the returns, number of visits, and action-value 77 | # function estimates for each state-action pair in the episode 78 | for i, state in enumerate(states): 79 | returns_sum[state][actions[i]] += sum(rewards[i:]*discounts[:-(1+i)]) 80 | N[state][actions[i]] += 1.0 81 | Q[state][actions[i]] = returns_sum[state][actions[i]] / N[state][actions[i]] 82 | return Q 83 | 84 | 85 | # obtain the action-value function 86 | Q = mc_prediction_q(env, 500000, generate_episode_from_limit_stochastic) 87 | 88 | # obtain the corresponding state-value function 89 | V_to_plot = dict((k,(k[0]>18)*(np.dot([0.8, 0.2],v)) + (k[0]<=18)*(np.dot([0.2, 0.8],v))) \ 90 | for k, v in Q.items()) 91 | 92 | # plot the state-value function 93 | plot_blackjack_values(V_to_plot) 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/Figure 2020-08-16 113558.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/BlackJack_First_Vist_MC/Figure 2020-08-16 113558.png -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/BlackJack_First_Vist_MC/Figure_1.png -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/Figure_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/BlackJack_First_Vist_MC/Figure_2.png -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/Monte_Carlo_FirstVisit_BlackJack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 12 09:16:57 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | from mpl_toolkits.mplot3d import Axes3D 13 | 14 | import matplotlib as mpl 15 | from matplotlib import cm 16 | 17 | env = gym.make('Blackjack-v0') 18 | 19 | 20 | state_val = np.zeros((22,12,2)) 21 | 22 | state_count = np.zeros(state_val.shape) 23 | 24 | #Holds value at greater than 20 25 | policy = 20 26 | 27 | 28 | episodes = 100000 29 | 30 | for episode in range(episodes): 31 | complete = False 32 | s_0 = env.reset() 33 | 34 | G = [] 35 | states =[s_0] 36 | 37 | while complete == False: 38 | #implement the policy 39 | if s_0[0] >= policy: 40 | s_1,reward,complete , _ = env.step(0) 41 | 42 | else: 43 | s_1,reward,complete, _ = env.step(1) 44 | G.append(reward) 45 | states.append(s_1) 46 | 47 | 48 | if complete == True: 49 | 50 | for s_i , s in enumerate(states[:-1]): 51 | 52 | if s[2] == True: 53 | 54 | s_ace = 1 55 | 56 | else: 57 | 58 | s_ace = 0 59 | 60 | 61 | returns = np.mean(G[s_i:]) 62 | 63 | #update values 64 | 65 | state_count[s[0], s[1],s_ace] +=1 66 | 67 | state_val[s[0],s[1],s_ace] += (returns-state_val[s[0],s[1],s_ace])/state_count[s[0],s[1],s_ace] 68 | 69 | s_0 =s_1 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | fig = plt.figure(figsize=(12,8)) 78 | ax = fig.gca(projection='3d') 79 | player_range = np.arange(11, 22) 80 | dealer_range = np.arange(1, 11) 81 | 82 | X, Y = np.meshgrid(dealer_range, player_range) 83 | Z = state_val[11:22,1:11,0].reshape(X.shape) 84 | ax.plot_surface(X, Y, Z, cmap=cm.coolwarm, linewidth=1, 85 | rstride=1, cstride=1) 86 | ax.set_title("Without Ace") 87 | ax.set_xlabel("Dealer Showing") 88 | ax.set_ylabel("Player Hand") 89 | ax.set_zlabel("State Value") 90 | plt.show() 91 | 92 | # With usable ace 93 | fig = plt.figure(figsize=(12,8)) 94 | ax = fig.gca(projection='3d') 95 | player = np.arange(11, 22) 96 | dealer = np.arange(2, 12) 97 | 98 | X, Y = np.meshgrid(dealer_range, player_range) 99 | Z = state_val[11:22,1:11,1].reshape(X.shape) 100 | ax.plot_surface(X, Y, Z, cmap=cm.coolwarm, linewidth=1, 101 | rstride=1, cstride=1) 102 | ax.set_title("With Ace") 103 | ax.set_xlabel("Dealer Showing") 104 | ax.set_ylabel("Player Hand") 105 | ax.set_zlabel("State Value") 106 | plt.show() 107 | 108 
| 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/__pycache__/plot_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/BlackJack_First_Vist_MC/__pycache__/plot_utils.cpython-38.pyc -------------------------------------------------------------------------------- /BlackJack_First_Vist_MC/plot_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Aug 16 11:23:19 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import numpy as np 10 | from mpl_toolkits.mplot3d import Axes3D 11 | import matplotlib.pyplot as plt 12 | from mpl_toolkits.axes_grid1 import make_axes_locatable 13 | 14 | def plot_blackjack_values(V): 15 | 16 | def get_Z(x, y, usable_ace): 17 | if (x,y,usable_ace) in V: 18 | return V[x,y,usable_ace] 19 | else: 20 | return 0 21 | 22 | def get_figure(usable_ace, ax): 23 | x_range = np.arange(11, 22) 24 | y_range = np.arange(1, 11) 25 | X, Y = np.meshgrid(x_range, y_range) 26 | 27 | Z = np.array([get_Z(x,y,usable_ace) for x,y in zip(np.ravel(X), np.ravel(Y))]).reshape(X.shape) 28 | 29 | surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=plt.cm.coolwarm, vmin=-1.0, vmax=1.0) 30 | ax.set_xlabel('Player\'s Current Sum') 31 | ax.set_ylabel('Dealer\'s Showing Card') 32 | ax.set_zlabel('State Value') 33 | ax.view_init(ax.elev, -120) 34 | 35 | fig = plt.figure(figsize=(20, 20)) 36 | ax = fig.add_subplot(211, projection='3d') 37 | ax.set_title('Usable Ace') 38 | get_figure(True, ax) 39 | ax = fig.add_subplot(212, projection='3d') 40 | ax.set_title('No Usable Ace') 41 | get_figure(False, ax) 42 | plt.show() 43 | 44 | def plot_policy(policy): 45 | 46 | def get_Z(x, y, usable_ace): 47 | if (x,y,usable_ace) in policy: 48 | return policy[x,y,usable_ace] 49 | else: 50 | return 1 51 | 52 | def get_figure(usable_ace, ax): 53 | x_range = np.arange(11, 22) 54 | y_range = np.arange(10, 0, -1) 55 | X, Y = np.meshgrid(x_range, y_range) 56 | Z = np.array([[get_Z(x,y,usable_ace) for x in x_range] for y in y_range]) 57 | surf = ax.imshow(Z, cmap=plt.get_cmap('Pastel2', 2), vmin=0, vmax=1, extent=[10.5, 21.5, 0.5, 10.5]) 58 | plt.xticks(x_range) 59 | plt.yticks(y_range) 60 | plt.gca().invert_yaxis() 61 | ax.set_xlabel('Player\'s Current Sum') 62 | ax.set_ylabel('Dealer\'s Showing Card') 63 | ax.grid(color='w', linestyle='-', linewidth=1) 64 | divider = make_axes_locatable(ax) 65 | cax = divider.append_axes("right", size="5%", pad=0.1) 66 | cbar = plt.colorbar(surf, ticks=[0,1], cax=cax) 67 | cbar.ax.set_yticklabels(['0 (STICK)','1 (HIT)']) 68 | 69 | fig = plt.figure(figsize=(15, 15)) 70 | ax = fig.add_subplot(121) 71 | ax.set_title('Usable Ace') 72 | get_figure(True, ax) 73 | ax = fig.add_subplot(122) 74 | ax.set_title('No Usable Ace') 75 | get_figure(False, ax) 76 | plt.show() -------------------------------------------------------------------------------- /Cart_pole/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Cart_pole/.DS_Store 
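A minimal sketch (not part of the repository) of the incremental first-visit Monte Carlo update used in Monte_Carlo_FirstVisit_BlackJack.py and mc_prediction_q above; the episode returns below are made-up numbers purely for illustration.

# Incremental running mean: V(s) <- V(s) + (G - V(s)) / N(s),
# equivalent to averaging all first-visit returns observed for state s.
value, count = 0.0, 0
for G in [1.0, -1.0, 1.0, 1.0]:   # hypothetical returns for one state
    count += 1
    value += (G - value) / count
print(value)                       # 0.5, the mean of the four returns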
-------------------------------------------------------------------------------- /Cart_pole/cartpole_ActionWrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Sep 17 12:56:05 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | from typing import TypeVar 11 | import random 12 | 13 | Action = TypeVar('Action') 14 | 15 | 16 | class RandomActionWrapper(gym.ActionWrapper): 17 | def __init__(self, env, epsilon=0.1): 18 | super(RandomActionWrapper, self).__init__(env) 19 | self.epsilon = epsilon 20 | 21 | def action(self, action: Action) -> Action: 22 | if random.random() < self.epsilon: 23 | print("Random!") 24 | return self.env.action_space.sample() 25 | return action 26 | 27 | 28 | if __name__ == "__main__": 29 | env = RandomActionWrapper(gym.make("CartPole-v0")) 30 | 31 | obs = env.reset() 32 | total_reward = 0.0 33 | 34 | while True: 35 | obs, reward, done, _ = env.step(0) 36 | total_reward += reward 37 | env.render() 38 | if done: 39 | break 40 | 41 | print("Reward got: %.2f" % total_reward) -------------------------------------------------------------------------------- /Cart_pole/cartpole_cross_entropy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Sep 19 22:37:30 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | from collections import namedtuple 11 | import numpy as np 12 | from tensorboardX import SummaryWriter 13 | 14 | import torch 15 | import torch.nn as nn 16 | import torch.optim as optim 17 | 18 | 19 | 20 | 21 | 22 | HIDDEN_SIZE = 128 23 | BATCH_SIZE = 16 24 | PERCENTILE = 70 25 | 26 | 27 | class Net(nn.Module): 28 | def __init__(self, obs_size, hidden_size, n_actions): 29 | super(Net, self).__init__() 30 | self.net = nn.Sequential( 31 | nn.Linear(obs_size, hidden_size), 32 | nn.ReLU(), 33 | nn.Linear(hidden_size, n_actions) 34 | ) 35 | 36 | def forward(self, x): 37 | return self.net(x) 38 | 39 | 40 | Episode = namedtuple('Episode', field_names=['reward', 'steps']) 41 | EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action']) 42 | 43 | 44 | def iterate_batches(env, net, batch_size): 45 | batch = [] 46 | episode_reward = 0.0 47 | episode_steps = [] 48 | obs = env.reset() 49 | sm = nn.Softmax(dim=1) 50 | while True: 51 | obs_v = torch.FloatTensor([obs]) 52 | act_probs_v = sm(net(obs_v)) 53 | act_probs = act_probs_v.data.numpy()[0] 54 | action = np.random.choice(len(act_probs), p=act_probs) 55 | next_obs, reward, is_done, _ = env.step(action) 56 | episode_reward += reward 57 | step = EpisodeStep(observation=obs, action=action) 58 | episode_steps.append(step) 59 | if is_done: 60 | e = Episode(reward=episode_reward, steps=episode_steps) 61 | batch.append(e) 62 | episode_reward = 0.0 63 | episode_steps = [] 64 | next_obs = env.reset() 65 | if len(batch) == batch_size: 66 | yield batch 67 | batch = [] 68 | obs = next_obs 69 | 70 | 71 | def filter_batch(batch, percentile): 72 | rewards = list(map(lambda s: s.reward, batch)) 73 | reward_bound = np.percentile(rewards, percentile) 74 | reward_mean = float(np.mean(rewards)) 75 | 76 | train_obs = [] 77 | train_act = [] 78 | for reward, steps in batch: 79 | if reward < reward_bound: 80 | continue 81 | train_obs.extend(map(lambda step: step.observation, steps)) 82 | train_act.extend(map(lambda step: step.action, steps)) 83 | 84 | train_obs_v = 
torch.FloatTensor(train_obs) 85 | train_act_v = torch.LongTensor(train_act) 86 | return train_obs_v, train_act_v, reward_bound, reward_mean 87 | 88 | 89 | if __name__ == "__main__": 90 | env = gym.make("CartPole-v0") 91 | # env = gym.wrappers.Monitor(env, directory="mon", force=True) 92 | obs_size = env.observation_space.shape[0] 93 | n_actions = env.action_space.n 94 | 95 | net = Net(obs_size, HIDDEN_SIZE, n_actions) 96 | objective = nn.CrossEntropyLoss() 97 | optimizer = optim.Adam(params=net.parameters(), lr=0.01) 98 | writer = SummaryWriter(comment="-cartpole") 99 | 100 | for iter_no, batch in enumerate(iterate_batches( 101 | env, net, BATCH_SIZE)): 102 | obs_v, acts_v, reward_b, reward_m = \ 103 | filter_batch(batch, PERCENTILE) 104 | optimizer.zero_grad() 105 | action_scores_v = net(obs_v) 106 | loss_v = objective(action_scores_v, acts_v) 107 | loss_v.backward() 108 | optimizer.step() 109 | print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" % ( 110 | iter_no, loss_v.item(), reward_m, reward_b)) 111 | writer.add_scalar("loss", loss_v.item(), iter_no) 112 | writer.add_scalar("reward_bound", reward_b, iter_no) 113 | writer.add_scalar("reward_mean", reward_m, iter_no) 114 | if reward_m > 199: 115 | print("Solved!") 116 | break 117 | writer.close() -------------------------------------------------------------------------------- /Cart_pole/cartpole_random.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Sep 17 12:17:45 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | 11 | if __name__ == "__main__": 12 | 13 | env = gym.make("CartPole-v0") 14 | total_reward= 0.0 15 | total_steps = 0 16 | obs = env.reset() 17 | 18 | while True: 19 | action = env.action_space.sample() 20 | obs,reward,done,_ = env.step(action) 21 | total_reward += reward 22 | total_steps+=1 23 | env.render() 24 | 25 | if done: 26 | break 27 | print("Episode done in %d steps, total reward %.2f" % ( total_steps, total_reward)) 28 | 29 | 30 | -------------------------------------------------------------------------------- /Cart_pole/cartpole_random_monitor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Sep 17 14:22:39 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | 11 | 12 | if __name__ == "__main__": 13 | env = gym.make("CartPole-v0") 14 | env = gym.wrappers.Monitor(env, "recording", force = True) 15 | 16 | total_reward = 0.0 17 | total_steps = 0 18 | obs = env.reset() 19 | 20 | while True: 21 | action = env.action_space.sample() 22 | obs, reward, done, _ = env.step(action) 23 | total_reward += reward 24 | total_steps += 1 25 | if done: 26 | break 27 | 28 | print("Episode done in %d steps, total reward %.2f" % ( 29 | total_steps, total_reward)) 30 | env.close() 31 | env.env.close() -------------------------------------------------------------------------------- /Cart_pole/runs/Sep20_10-07-14_Pavans-MacBook-Pro.local-cartpole/events.out.tfevents.1600576634.Pavans-MacBook-Pro.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Cart_pole/runs/Sep20_10-07-14_Pavans-MacBook-Pro.local-cartpole/events.out.tfevents.1600576634.Pavans-MacBook-Pro.local 
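A small illustrative sketch of the percentile filtering idea behind filter_batch in cartpole_cross_entropy.py above (and the elite-fraction selection in the CEM script that follows); the reward values are invented for the example and are not output from the scripts.

import numpy as np

# Keep only episodes whose total reward reaches the 70th percentile,
# then train on the (observation, action) pairs of those elite episodes.
rewards = np.array([10., 40., 60., 90.])   # hypothetical episode rewards
bound = np.percentile(rewards, 70)         # 63.0 for this batch
elite = [i for i, r in enumerate(rewards) if r >= bound]
print(bound, elite)                        # 63.0 [3]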
-------------------------------------------------------------------------------- /Cart_pole/runs/Sep20_10-07-59_Pavans-MacBook-Pro.local-cartpole/events.out.tfevents.1600576679.Pavans-MacBook-Pro.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Cart_pole/runs/Sep20_10-07-59_Pavans-MacBook-Pro.local-cartpole/events.out.tfevents.1600576679.Pavans-MacBook-Pro.local -------------------------------------------------------------------------------- /Cart_pole/runs/Sep29_12-52-32_Pavans-MacBook-Pro.local-cartpole/events.out.tfevents.1601364152.Pavans-MacBook-Pro.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Cart_pole/runs/Sep29_12-52-32_Pavans-MacBook-Pro.local-cartpole/events.out.tfevents.1601364152.Pavans-MacBook-Pro.local -------------------------------------------------------------------------------- /Cross-Entrorpy/CEM_method.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Aug 25 11:50:01 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | #importing stuff 10 | 11 | import gym 12 | import numpy as np 13 | import math 14 | from collections import deque 15 | import matplotlib.pyplot as plt 16 | 17 | #torch importing 18 | import torch 19 | import torch.nn as nn 20 | import torch.nn.functional as F 21 | from torch.autograd import Variable 22 | 23 | #Initalizing the environment 24 | 25 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 26 | 27 | env = gym.make('MountainCarContinuous-v0') 28 | env.seed(101) 29 | np.random.seed(101) 30 | 31 | print('observation space:', env.observation_space) 32 | print('action space:', env.action_space) 33 | print(' - low:', env.action_space.low) 34 | print(' - high:', env.action_space.high) 35 | 36 | 37 | #Creating the agent 38 | 39 | class Agent(nn.Module): 40 | 41 | 42 | def __init__(self,env, h_size = 16): 43 | super(Agent, self).__init__() 44 | self.env = env 45 | 46 | #state 47 | self.s_size = env.observation_space.shape[0] 48 | #Hidden layer 49 | self.h_size = h_size 50 | #actionsize 51 | self.a_size = env.action_space.shape[0] 52 | 53 | #defining the layers 54 | self.fc1 = nn.Linear(self.s_size, self.h_size) 55 | self.fc2 = nn.Linear(self.h_size,self.a_size) 56 | 57 | 58 | def set_weights(self, weights): 59 | 60 | s_size = self.s_size 61 | h_size= self.h_size 62 | a_size = self.a_size 63 | 64 | # seprate the weighs for each layer 65 | fc1_end = (s_size * h_size) + h_size 66 | fc1_W = torch.from_numpy(weights[: s_size * h_size].reshape(s_size,h_size)) 67 | fc1_b = torch.from_numpy(weights[ s_size* h_size : fc1_end]) 68 | fc2_W = torch.from_numpy(weights[fc1_end :fc1_end +(h_size * a_size)] . 
reshape(h_size,a_size)) 69 | fc2_b = torch.from_numpy(weights[fc1_end + (h_size * a_size) : ]) 70 | 71 | #set the weights for each layer 72 | self.fc1.weight.data.copy_(fc1_W.view_as(self.fc1.weight.data)) 73 | self.fc1.bias.data.copy_(fc1_b.view_as(self.fc1.bias.data)) 74 | self.fc2.weight.data.copy_(fc2_W.view_as(self.fc2.weight.data)) 75 | self.fc2.bias.data.copy_(fc2_b.view_as(self.fc2.bias.data)) 76 | 77 | def get_weights_dim(self): 78 | return (self.s_size +1) * self.h_size + (self.h_size +1)* self.a_size 79 | 80 | def forward(self, x): 81 | x = F.relu(self.fc1(x)) 82 | x = F.tanh(self.fc2(x)) 83 | return x.cpu().data 84 | 85 | def evaluate(self, weights, gamma=1.0, max_t=5000): 86 | self.set_weights(weights) 87 | episode_return = 0.0 88 | state = self.env.reset() 89 | for t in range(max_t): 90 | state = torch.from_numpy(state).float().to(device) 91 | action = self.forward(state) 92 | state, reward, done, _ = self.env.step(action) 93 | episode_return += reward * math.pow(gamma, t) 94 | if done: 95 | break 96 | return episode_return 97 | 98 | 99 | agent = Agent(env).to(device) 100 | 101 | 102 | def cem(n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, elite_frac=0.2, sigma=0.5): 103 | 104 | n_elite=int(pop_size*elite_frac) 105 | 106 | scores_deque = deque(maxlen=100) 107 | scores = [] 108 | best_weight = sigma*np.random.randn(agent.get_weights_dim()) 109 | 110 | for i_iteration in range(1, n_iterations+1): 111 | weights_pop = [best_weight + (sigma*np.random.randn(agent.get_weights_dim())) for i in range(pop_size)] 112 | rewards = np.array([agent.evaluate(weights, gamma, max_t) for weights in weights_pop]) 113 | 114 | elite_idxs = rewards.argsort()[-n_elite:] 115 | elite_weights = [weights_pop[i] for i in elite_idxs] 116 | best_weight = np.array(elite_weights).mean(axis=0) 117 | 118 | reward = agent.evaluate(best_weight, gamma=1.0) 119 | scores_deque.append(reward) 120 | scores.append(reward) 121 | 122 | torch.save(agent.state_dict(), 'checkpoint.pth') 123 | 124 | if i_iteration % print_every == 0: 125 | print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, np.mean(scores_deque))) 126 | 127 | if np.mean(scores_deque)>=90.0: 128 | print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(i_iteration-100, np.mean(scores_deque))) 129 | break 130 | return scores 131 | 132 | scores = cem() 133 | 134 | # plot the scores 135 | fig = plt.figure() 136 | ax = fig.add_subplot(111) 137 | plt.plot(np.arange(1, len(scores)+1), scores) 138 | plt.ylabel('Score') 139 | plt.xlabel('Episode #') 140 | plt.show() 141 | 142 | 143 | agent.load_state_dict(torch.load('checkpoint.pth')) 144 | 145 | state = env.reset() 146 | while True: 147 | state = torch.from_numpy(state).float().to(device) 148 | with torch.no_grad(): 149 | action = agent(state) 150 | env.render() 151 | next_state, reward, done, _ = env.step(action) 152 | state = next_state 153 | if done: 154 | break 155 | 156 | env.close() 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /Cross-Entrorpy/checkpoint.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Cross-Entrorpy/checkpoint.pth -------------------------------------------------------------------------------- 
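A quick check (added here for clarity, not part of the repository) of the flat weight-vector size that Agent.get_weights_dim in CEM_method.py reports for MountainCarContinuous-v0, whose observation space is 2-dimensional and action space 1-dimensional, with the default hidden size of 16; the breakdown just spells out the weight/bias counting.

s_size, h_size, a_size = 2, 16, 1
fc1 = s_size * h_size + h_size        # 32 weights + 16 biases = 48
fc2 = h_size * a_size + a_size        # 16 weights +  1 bias   = 17
assert fc1 + fc2 == (s_size + 1) * h_size + (h_size + 1) * a_size == 65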
/Deep-Q-Learning/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Deep-Q-Learning/.DS_Store -------------------------------------------------------------------------------- /Deep-Q-Learning/dqn_pong.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Sep 21 10:56:19 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | from lib_dep import wrappers 10 | from lib_dep import dqn_model 11 | 12 | import argparse 13 | import time 14 | import numpy as np 15 | import collections 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.optim as optim 20 | 21 | from tensorboardX import SummaryWriter 22 | 23 | 24 | DEFAULT_ENV_NAME = "PongNoFrameskip-v4" 25 | MEAN_REWARD_BOUND = 19 26 | 27 | GAMMA = 0.99 28 | BATCH_SIZE = 32 29 | REPLAY_SIZE = 10000 30 | REPLAY_START_SIZE = 10000 31 | LEARNING_RATE = 1e-4 32 | SYNC_TARGET_FRAMES = 1000 33 | 34 | EPSILON_DECAY_LAST_FRAME = 150000 35 | EPSILON_START = 1.0 36 | EPSILON_FINAL = 0.01 37 | 38 | Experience = collections.namedtuple( 'Experience', field_names = ['state', 'action', 39 | 'reward', 'done', 'new_state']) 40 | 41 | class Experience_Buffer: 42 | 43 | def __init__(self,capacity): 44 | self.buffer = collections.deque(maxlen = capacity) 45 | 46 | 47 | def __len__(self): 48 | return len(self.buffer) 49 | 50 | # adding the experience to the buffer 51 | def append(self,experience): 52 | self.buffer.append(experience) 53 | 54 | def sample(self, batch_size): 55 | indices = np.random.choice(len(self.buffer),batch_size,replace= False) 56 | 57 | states,actions,rewards,dones,next_states= \ 58 | zip(*[self.buffer[idx] for idx in indices]) 59 | 60 | 61 | return np.array(states), np.array(actions), \ 62 | np.array(rewards, dtype=np.float32), \ 63 | np.array(dones, dtype=np.uint8), \ 64 | np.array(next_states) 65 | 66 | 67 | class Agent: 68 | 69 | def __init__(self, env,exp_buffer): 70 | 71 | self.env = env 72 | self.exp_buffer = exp_buffer 73 | self._reset() 74 | 75 | def _reset(self): 76 | self.state = self.env.reset() 77 | self.total_reward = 0.0 78 | 79 | 80 | @torch.no_grad() 81 | def play_step(self, net, epsilon=0.0, device="cpu"): 82 | done_reward = None 83 | 84 | if np.random.random() < epsilon: 85 | action = self.env.action_space.sample() 86 | else: 87 | state_a = np.array([self.state], copy=False) 88 | state_v = torch.tensor(state_a).to(device) 89 | q_vals_v = net(state_v) 90 | _, act_v = torch.max(q_vals_v, dim=1) 91 | action = int(act_v.item()) 92 | 93 | 94 | 95 | new_state, reward, is_done, _ = self.env.step(action) 96 | self.total_reward += reward 97 | 98 | exp = Experience(self.state, action, reward, 99 | is_done, new_state) 100 | self.exp_buffer.append(exp) 101 | self.state = new_state 102 | if is_done: 103 | done_reward = self.total_reward 104 | self._reset() 105 | return done_reward 106 | 107 | 108 | def calc_loss(batch, net, tgt_net , device = 'cpu'): 109 | 110 | states,actions,rewards,dones ,next_states = batch 111 | 112 | states_v= torch.tensor(np.array(states, copy= False)).to(device) 113 | 114 | next_states_v = torch.tensor(np.array(next_states, copy = False)).to(device) 115 | 116 | actions_v = torch.tensor(actions).to(device) 117 | rewards_v = torch.tensor(rewards).to(device) 118 | done_mask = torch.BoolTensor(dones).to(device) 119 | 120 | state_action_values = 
net(states_v).gather( 121 | 1, actions_v.unsqueeze(-1)).squeeze(-1) 122 | 123 | with torch.no_grad(): 124 | next_state_values = tgt_net(next_states_v).max(1)[0] 125 | next_state_values[done_mask] = 0.0 126 | next_state_values = next_state_values.detach() 127 | 128 | expected_state_action_values = next_state_values * GAMMA + rewards_v 129 | 130 | return nn.MSELoss()(state_action_values, expected_state_action_values) 131 | 132 | 133 | if __name__ == "__main__": 134 | parser = argparse.ArgumentParser() 135 | parser.add_argument("--cuda", default=False, 136 | action="store_true", help="Enable cuda") 137 | parser.add_argument("--env", default=DEFAULT_ENV_NAME, 138 | help="Name of the environment, default=" + 139 | DEFAULT_ENV_NAME) 140 | args = parser.parse_args() 141 | device = torch.device("cuda" if args.cuda else "cpu") 142 | 143 | env = wrappers.make_env(args.env) 144 | 145 | net = dqn_model.DQN(env.observation_space.shape, 146 | env.action_space.n).to(device) 147 | tgt_net = dqn_model.DQN(env.observation_space.shape, 148 | env.action_space.n).to(device) 149 | writer = SummaryWriter(comment="-" + args.env) 150 | print(net) 151 | 152 | buffer = Experience_Buffer(REPLAY_SIZE) 153 | agent = Agent(env, buffer) 154 | epsilon = EPSILON_START 155 | 156 | optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) 157 | total_rewards = [] 158 | frame_idx = 0 159 | ts_frame = 0 160 | ts = time.time() 161 | best_m_reward = None 162 | 163 | while True: 164 | frame_idx += 1 165 | epsilon = max(EPSILON_FINAL, EPSILON_START - 166 | frame_idx / EPSILON_DECAY_LAST_FRAME) 167 | 168 | reward = agent.play_step(net, epsilon, device=device) 169 | env.render() 170 | if reward is not None: 171 | total_rewards.append(reward) 172 | speed = (frame_idx - ts_frame) / (time.time() - ts) 173 | ts_frame = frame_idx 174 | ts = time.time() 175 | m_reward = np.mean(total_rewards[-100:]) 176 | print("%d: done %d games, reward %.3f, " 177 | "eps %.2f, speed %.2f f/s" % ( 178 | frame_idx, len(total_rewards), m_reward, epsilon, 179 | speed 180 | )) 181 | writer.add_scalar("epsilon", epsilon, frame_idx) 182 | writer.add_scalar("speed", speed, frame_idx) 183 | writer.add_scalar("reward_100", m_reward, frame_idx) 184 | writer.add_scalar("reward", reward, frame_idx) 185 | if best_m_reward is None or best_m_reward < m_reward: 186 | torch.save(net.state_dict(), args.env + 187 | "-best_%.0f.dat" % m_reward) 188 | if best_m_reward is not None: 189 | print("Best reward updated %.3f -> %.3f" % ( 190 | best_m_reward, m_reward)) 191 | best_m_reward = m_reward 192 | if m_reward > MEAN_REWARD_BOUND: 193 | print("Solved in %d frames!" 
% frame_idx) 194 | break 195 | 196 | if len(buffer) < REPLAY_START_SIZE: 197 | continue 198 | 199 | if frame_idx % SYNC_TARGET_FRAMES == 0: 200 | tgt_net.load_state_dict(net.state_dict()) 201 | 202 | optimizer.zero_grad() 203 | batch = buffer.sample(BATCH_SIZE) 204 | loss_t = calc_loss(batch, net, tgt_net, device=device) 205 | loss_t.backward() 206 | optimizer.step() 207 | writer.close() 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /Deep-Q-Learning/lib_dep/__pycache__/dqn_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Deep-Q-Learning/lib_dep/__pycache__/dqn_model.cpython-38.pyc -------------------------------------------------------------------------------- /Deep-Q-Learning/lib_dep/__pycache__/wrappers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Deep-Q-Learning/lib_dep/__pycache__/wrappers.cpython-38.pyc -------------------------------------------------------------------------------- /Deep-Q-Learning/lib_dep/dqn_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Sep 20 22:19:54 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import torch 10 | import torch.nn as nn 11 | import numpy as np 12 | 13 | class DQN(nn.Module): 14 | 15 | def __init__(self,input_shape, n_actions): 16 | super(DQN,self).__init__() 17 | 18 | self.conv = nn.Sequential( 19 | 20 | nn.Conv2d(in_channels = input_shape[0], out_channels = 32, 21 | kernel_size = 4 , stride = 2), 22 | nn.ReLU(), 23 | 24 | nn.Conv2d(in_channels = 32, out_channels = 64, 25 | kernel_size = 8 , stride = 4), 26 | nn.ReLU(), 27 | 28 | nn.Conv2d(in_channels = 64, out_channels = 64, 29 | kernel_size = 3 , stride = 1), 30 | nn.ReLU(), 31 | 32 | 33 | ) 34 | 35 | conv_out_size = self._get_conv_out(input_shape) 36 | self.fc = nn.Sequential( 37 | nn.Linear(conv_out_size, 512), 38 | nn.ReLU(), 39 | nn.Linear(512, n_actions) 40 | ) 41 | 42 | 43 | def _get_conv_out(self, shape): 44 | o = self.conv(torch.zeros(1, *shape)) 45 | return int(np.prod(o.size())) 46 | 47 | def forward(self, x): 48 | conv_out = self.conv(x).view(x.size()[0], -1) 49 | return self.fc(conv_out) 50 | -------------------------------------------------------------------------------- /Deep-Q-Learning/lib_dep/wrappers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Sep 20 20:27:55 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import cv2 10 | import gym 11 | import gym.spaces 12 | import numpy as np 13 | import collections 14 | 15 | class FireResetEnv(gym.Wrapper): 16 | 17 | def __init__(self,env = None): 18 | super(FireResetEnv,self).__init__(env) 19 | 20 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 21 | assert len(env.unwrapped.get_action_meanings()) >= 3 22 | 23 | def step(self, action): 24 | return self.env.step(action) 25 | 26 | def reset(self): 27 | self.env.reset() 28 | obs, _, done, _ = self.env.step(1) 29 | if done: 30 | self.env.reset() 31 | obs, 
_, done, _ = self.env.step(2) 32 | if done: 33 | self.env.reset() 34 | return obs 35 | 36 | class MaxAndSkipEnv(gym.Wrapper): 37 | 38 | def __init__(self, env =None, skip = 4): 39 | 40 | 41 | super(MaxAndSkipEnv, self).__init__(env) 42 | self._obs_buffer = collections.deque(maxlen=2) 43 | self._skip = skip 44 | 45 | def step(self, action): 46 | total_reward = 0.0 47 | done = None 48 | for _ in range(self._skip): 49 | obs, reward, done, info = self.env.step(action) 50 | self._obs_buffer.append(obs) 51 | total_reward += reward 52 | if done: 53 | break 54 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 55 | return max_frame, total_reward, done, info 56 | def reset(self): 57 | 58 | self._obs_buffer.clear() 59 | obs = self.env.reset() 60 | self._obs_buffer.append(obs) 61 | return obs 62 | 63 | class ProcessFrame84(gym.ObservationWrapper): 64 | 65 | def __init__(self, env = None): 66 | super(ProcessFrame84,self).__init__(env) 67 | self.observation_space = gym.spaces.Box( 68 | low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) 69 | 70 | 71 | def observation(self, obs): 72 | return ProcessFrame84.process(obs) 73 | 74 | 75 | @staticmethod 76 | def process(frame): 77 | if frame.size == 210 * 160 * 3: 78 | img = np.reshape(frame, [210, 160, 3]).astype( 79 | np.float32) 80 | elif frame.size == 250 * 160 * 3: 81 | img = np.reshape(frame, [250, 160, 3]).astype( 82 | np.float32) 83 | else: 84 | assert False, "Unknown resolution." 85 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + \ 86 | img[:, :, 2] * 0.114 87 | resized_screen = cv2.resize( 88 | img, (84, 110), interpolation=cv2.INTER_AREA) 89 | x_t = resized_screen[18:102, :] 90 | x_t = np.reshape(x_t, [84, 84, 1]) 91 | return x_t.astype(np.uint8) 92 | 93 | class ImageToPyTorch(gym.ObservationWrapper): 94 | def __init__(self, env): 95 | super(ImageToPyTorch, self).__init__(env) 96 | old_shape = self.observation_space.shape 97 | new_shape = (old_shape[-1], old_shape[0], old_shape[1]) 98 | self.observation_space = gym.spaces.Box( 99 | low=0.0, high=1.0, shape=new_shape, dtype=np.float32) 100 | 101 | def observation(self, observation): 102 | return np.moveaxis(observation, 2, 0) 103 | 104 | 105 | 106 | class ScaledFloatFrame(gym.ObservationWrapper): 107 | def observation(self, obs): 108 | return np.array(obs).astype(np.float32) / 255.0 109 | 110 | 111 | class BufferWrapper(gym.ObservationWrapper): 112 | 113 | def __init__(self, env,n_steps,dtype = np.float32): 114 | super(BufferWrapper, self).__init__(env) 115 | self.dtype = dtype 116 | old_space = env.observation_space 117 | self.observation_space = gym.spaces.Box( 118 | old_space.low.repeat(n_steps, axis=0), 119 | old_space.high.repeat(n_steps, axis=0), dtype=dtype) 120 | 121 | 122 | def reset(self): 123 | self.buffer = np.zeros_like( 124 | self.observation_space.low, dtype=self.dtype) 125 | return self.observation(self.env.reset()) 126 | 127 | def observation(self, observation): 128 | self.buffer[:-1] = self.buffer[1:] 129 | self.buffer[-1] = observation 130 | return self.buffer 131 | 132 | 133 | def make_env(env_name): 134 | env = gym.make(env_name) 135 | env = MaxAndSkipEnv(env) 136 | env = FireResetEnv(env) 137 | env = ProcessFrame84(env) 138 | env = ImageToPyTorch(env) 139 | env = BufferWrapper(env, 4) 140 | return ScaledFloatFrame(env) 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /Deep-Q-Network/Deep_Q_network.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Aug 22 09:59:22 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | import gym 9 | import random 10 | import torch 11 | import numpy as np 12 | from collections import deque 13 | import matplotlib.pyplot as plt 14 | 15 | env = gym.make('LunarLander-v2') 16 | env.seed(0) 17 | print('State shape: ', env.observation_space.shape) 18 | print('Number of actions: ', env.action_space.n) 19 | 20 | # a random agent 21 | from dqn_agent import Agent 22 | 23 | agent = Agent(state_size=8, action_size=4, seed=0) 24 | 25 | # watch an untrained agenta 26 | state = env.reset() 27 | for j in range(200): 28 | action = agent.act(state) 29 | #env.render() 30 | state, reward, done, _ = env.step(action) 31 | if done: 32 | break 33 | 34 | env.close() 35 | 36 | def dqn(n_episodes=100, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995): 37 | 38 | scores = [] # list containing scores from each episode 39 | scores_window = deque(maxlen=100) # last 100 scores 40 | eps = eps_start # initialize epsilon 41 | for i_episode in range(1, n_episodes+1): 42 | state = env.reset() 43 | score = 0 44 | for t in range(max_t): 45 | action = agent.act(state, eps) 46 | next_state, reward, done, _ = env.step(action) 47 | agent.step(state, action, reward, next_state, done) 48 | state = next_state 49 | score += reward 50 | if done: 51 | break 52 | scores_window.append(score) # save most recent score 53 | scores.append(score) # save most recent score 54 | eps = max(eps_end, eps_decay*eps) # decrease epsilon 55 | print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="") 56 | if i_episode % 100 == 0: 57 | print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window))) 58 | if np.mean(scores_window)>=200.0: 59 | print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window))) 60 | torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth') 61 | break 62 | return scores 63 | 64 | scores = dqn() 65 | 66 | # plot the scores 67 | fig = plt.figure() 68 | ax = fig.add_subplot(111) 69 | plt.plot(np.arange(len(scores)), scores) 70 | plt.ylabel('Score') 71 | plt.xlabel('Episode #') 72 | plt.show() 73 | 74 | agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth')) 75 | 76 | for i in range(3): 77 | state = env.reset() 78 | for j in range(200): 79 | action = agent.act(state) 80 | env.render() 81 | state, reward, done, _ = env.step(action) 82 | if done: 83 | break 84 | 85 | env.close() 86 | 87 | -------------------------------------------------------------------------------- /Deep-Q-Network/__pycache__/dqn_agent.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Deep-Q-Network/__pycache__/dqn_agent.cpython-38.pyc -------------------------------------------------------------------------------- /Deep-Q-Network/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Deep-Q-Network/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /Deep-Q-Network/dqn_agent.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Aug 22 08:46:19 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import numpy as np 10 | import random 11 | from collections import namedtuple, deque 12 | 13 | from model import QNetwork 14 | 15 | import torch 16 | import torch.nn.functional as F 17 | import torch.optim as optim 18 | 19 | BUFFER_SIZE = int(1e5) #Replay Buffer size 20 | BATCH_SIZE = 64 # min Batch size 21 | GAMMA = 0.9 # discount Factor 22 | TAU = 1e-3 #for soft update of target parameters 23 | LR = 5e-4 # learning rate 24 | UPDATE_EVERY = 4 #How often do you update the network 25 | 26 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 27 | 28 | 29 | 30 | 31 | 32 | class Agent(): 33 | 34 | def __init__(self, state_size, action_size, seed): 35 | 36 | self.state_size = state_size 37 | self.action_size = action_size 38 | self.seed = random.seed(seed) 39 | 40 | # Q-Network 41 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) 42 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) 43 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) 44 | 45 | # Replay memory 46 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) 47 | # Initialize time step (for updating every UPDATE_EVERY steps) 48 | self.t_step = 0 49 | 50 | def step(self, state, action, reward, next_state, done): 51 | # Save experience in replay memory 52 | self.memory.add(state, action, reward, next_state, done) 53 | 54 | # Learn every UPDATE_EVERY time steps. 55 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 56 | if self.t_step == 0: 57 | # If enough samples are available in memory, get random subset and learn 58 | if len(self.memory) > BATCH_SIZE: 59 | experiences = self.memory.sample() 60 | self.learn(experiences, GAMMA) 61 | 62 | def act(self, state, eps=0.): 63 | 64 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 65 | self.qnetwork_local.eval() 66 | with torch.no_grad(): 67 | action_values = self.qnetwork_local(state) 68 | self.qnetwork_local.train() 69 | 70 | # Epsilon-greedy action selection 71 | if random.random() > eps: 72 | return np.argmax(action_values.cpu().data.numpy()) 73 | else: 74 | return random.choice(np.arange(self.action_size)) 75 | 76 | def learn(self, experiences, gamma): 77 | 78 | states, actions, rewards, next_states, dones = experiences 79 | 80 | # Get max predicted Q values (for next states) from target model 81 | Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) 82 | # Compute Q targets for current states 83 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) 84 | 85 | # Get expected Q values from local model 86 | Q_expected = self.qnetwork_local(states).gather(1, actions) 87 | 88 | # Compute loss 89 | loss = F.mse_loss(Q_expected, Q_targets) 90 | # Minimize the loss 91 | self.optimizer.zero_grad() 92 | loss.backward() 93 | self.optimizer.step() 94 | 95 | # ------------------- update target network ------------------- # 96 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 97 | 98 | def soft_update(self, local_model, target_model, tau): 99 | 100 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 101 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) 102 | 103 | 104 | class ReplayBuffer: 105 | """Fixed-size buffer to store 
experience tuples.""" 106 | 107 | def __init__(self, action_size, buffer_size, batch_size, seed): 108 | 109 | self.action_size = action_size 110 | self.memory = deque(maxlen=buffer_size) 111 | self.batch_size = batch_size 112 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 113 | self.seed = random.seed(seed) 114 | 115 | def add(self, state, action, reward, next_state, done): 116 | """Add a new experience to memory.""" 117 | e = self.experience(state, action, reward, next_state, done) 118 | self.memory.append(e) 119 | 120 | def sample(self): 121 | """Randomly sample a batch of experiences from memory.""" 122 | experiences = random.sample(self.memory, k=self.batch_size) 123 | 124 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 125 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) 126 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 127 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device) 128 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device) 129 | 130 | return (states, actions, rewards, next_states, dones) 131 | 132 | def __len__(self): 133 | """Return the current size of internal memory.""" 134 | return len(self.memory) 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /Deep-Q-Network/lunar_lander_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Aug 22 14:33:12 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | import numpy as np 11 | 12 | 13 | # creating the environment 14 | 15 | env = gym.make('LunarLander-v2') 16 | env.seed(0) 17 | print('State shape: ', env.observation_space.shape) 18 | print('Number of actions: ', env.action_space.n) 19 | 20 | 21 | # creating a random agent 22 | env.reset() 23 | 24 | score = 0 25 | 26 | for i in range(500): 27 | action = env.action_space.sample() 28 | env.render() 29 | state,reward,done,info = env.step(action) 30 | score += reward 31 | if done: 32 | break 33 | 34 | env.close() 35 | 36 | 37 | print("Score is:", score) -------------------------------------------------------------------------------- /Deep-Q-Network/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Aug 22 07:53:55 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import torch 10 | 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | 15 | class QNetwork(nn.Module): 16 | 17 | def __init__(self,state_size,action_size,seed,fc1_units = 64,fc2_units = 64): 18 | 19 | super(QNetwork,self).__init__() 20 | self.seed= torch.manual_seed(seed) 21 | self.fc1= nn.Linear(state_size,fc1_units) # number of nodes hidden in first hidden layer 22 | self.fc2 = nn.Linear(fc1_units,fc2_units) # number of nodes hidden in first hidden layer 23 | self.fc3 = nn.Linear(fc2_units,action_size) 24 | 25 | def forward(self,state): 26 | 27 | x = F.relu(self.fc1(state)) 28 | x = F.relu(self.fc2(x)) 29 | return self.fc3(x) 30 | 31 | 32 | 33 | 34 | 35 | 36 | 
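A minimal usage sketch of the QNetwork defined in model.py above, sized for the LunarLander-v2 state and action dimensions that the training script prints (8 and 4); it only checks the output shape and is an illustration, not code from the repository.

import torch
from model import QNetwork

net = QNetwork(state_size=8, action_size=4, seed=0)
dummy_state = torch.zeros(1, 8)        # a batch containing one zero-state
q_values = net(dummy_state)            # forward() returns one Q-value per action
print(q_values.shape)                  # torch.Size([1, 4])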
-------------------------------------------------------------------------------- /Discretization/Discretization_udacity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Aug 20 15:09:47 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | #Importing stuff 11 | 12 | import sys 13 | import gym 14 | import numpy as np 15 | 16 | import pandas as pd 17 | import matplotlib.pyplot as plt 18 | import matplotlib.collections as mc 19 | plt.style.use('ggplot') 20 | np.set_printoptions(precision=3, linewidth=120) 21 | 22 | #creating the environment 23 | 24 | env= gym.make('MountainCar-v0') 25 | env.seed(505) #Random seeding 26 | 27 | #creating a Random agent 28 | state= env.reset() 29 | score = 0 30 | for t in range(200): 31 | action = env.action_space.sample() 32 | #env.render() 33 | state,reward, done,_ = env.step(action) 34 | score += reward 35 | if done: 36 | break 37 | 38 | print('Final Score: ',score) 39 | env.close() 40 | 41 | 42 | 43 | 44 | # now we have to create a uniform grid in this environment 45 | 46 | def create_uniform_grid(low,high,bins= (10,10)): 47 | 48 | grid = [np.linspace(low[dim], high[dim],bins[dim]+1)[1 :-1] for dim in range(len(bins))] 49 | 50 | print("Uniform grid: [, ] / => ") 51 | for l, h, b, splits in zip(low, high, bins, grid): 52 | print(" [{}, {}] / {} => {}".format(l, h, b, splits)) 53 | return grid 54 | 55 | 56 | 57 | low = [-1.0, -5.0] 58 | high = [1.0, 5.0] 59 | create_uniform_grid(low, high) # testing 60 | 61 | # we are going to discretize 62 | 63 | def discretize(sample, gird): 64 | return list(int(np.digitize(s,g)) for s ,g in zip(sample,gird)) # applying on each dimension 65 | 66 | # testing 67 | grid = create_uniform_grid([-1.0, -5.0], [1.0, 5.0]) 68 | samples = np.array( 69 | [[-1.0 , -5.0], 70 | [-0.81, -4.1], 71 | [-0.8 , -4.0], 72 | [-0.5 , 0.0], 73 | [ 0.2 , -1.9], 74 | [ 0.8 , 4.0], 75 | [ 0.81, 4.1], 76 | [ 1.0 , 5.0]]) 77 | discretized_samples = np.array([discretize(sample, grid) for sample in samples]) 78 | print("\nSamples:", repr(samples), sep="\n") 79 | print("\nDiscretized samples:", repr(discretized_samples), sep="\n") 80 | 81 | 82 | 83 | def visualize_samples(samples, discretized_samples, grid, low=None, high=None): 84 | """Visualize original and discretized samples on a given 2-dimensional grid.""" 85 | 86 | fig, ax = plt.subplots(figsize=(10, 10)) 87 | 88 | # Show grid 89 | ax.xaxis.set_major_locator(plt.FixedLocator(grid[0])) 90 | ax.yaxis.set_major_locator(plt.FixedLocator(grid[1])) 91 | ax.grid(True) 92 | 93 | # If bounds (low, high) are specified, use them to set axis limits 94 | if low is not None and high is not None: 95 | ax.set_xlim(low[0], high[0]) 96 | ax.set_ylim(low[1], high[1]) 97 | else: 98 | # Otherwise use first, last grid locations as low, high (for further mapping discretized samples) 99 | low = [splits[0] for splits in grid] 100 | high = [splits[-1] for splits in grid] 101 | 102 | # Map each discretized sample (which is really an index) to the center of corresponding grid cell 103 | grid_extended = np.hstack((np.array([low]).T, grid, np.array([high]).T)) # add low and high ends 104 | grid_centers = (grid_extended[:, 1:] + grid_extended[:, :-1]) / 2 # compute center of each grid cell 105 | locs = np.stack(grid_centers[i, discretized_samples[:, i]] for i in range(len(grid))).T # map discretized samples 106 | 107 | ax.plot(samples[:, 0], samples[:, 1], 'o') # plot original samples 108 | ax.plot(locs[:, 0], 
locs[:, 1], 's') # plot discretized samples in mapped locations 109 | ax.add_collection(mc.LineCollection(list(zip(samples, locs)), colors='orange')) # add a line connecting each original-discretized sample 110 | ax.legend(['original', 'discretized']) 111 | 112 | 113 | visualize_samples(samples, discretized_samples, grid, low, high) 114 | 115 | #Create a grid to discretize the state space 116 | state_grid = create_uniform_grid(env.observation_space.low, env.observation_space.high, bins=(10, 10)) 117 | state_grid 118 | # Obtain some samples from the space, discretize them, and then visualize them 119 | state_samples = np.array([env.observation_space.sample() for i in range(10)]) 120 | discretized_state_samples = np.array([discretize(sample, state_grid) for sample in state_samples]) 121 | visualize_samples(state_samples, discretized_state_samples, state_grid, 122 | env.observation_space.low, env.observation_space.high) 123 | plt.xlabel('position'); plt.ylabel('velocity'); # axis labels for MountainCar-v0 state space 124 | 125 | 126 | # now as we are done with the discretization stuff let's get to Q learning 127 | 128 | 129 | class QLearningAgent: 130 | 131 | # we can use this Agent to act on contious space by discretizing it 132 | 133 | def __init__(self,env,state_grid,alpha = 0.02,gamma = 0.99, epsilon = 1.0, 134 | epsilon_decay_rate = 0.9995,min_epsilon = .01,seed = 505): 135 | 136 | 137 | # Environment Info 138 | self.env = env 139 | self.state_grid = state_grid 140 | self.state_size = tuple(len(splits) +1 for splits in self.state_grid) # n dimendional space 141 | self.action_size = self.env.action_space.n #dimensional discrete space size 142 | self.seed = np.random.seed(seed) 143 | print(" ") 144 | print("Environment:", self.env) 145 | print("State space size:", self.state_size) 146 | print("Action space size:", self.action_size) 147 | print(" ") 148 | 149 | 150 | #Learning parameters 151 | self.alpha = alpha # learning rate 152 | self.gamma = gamma # discount factor 153 | self.epsilon = self.inital_epsilon = epsilon #Exploratory factor 154 | self.epsilon_decay_rate = epsilon_decay_rate # how quickly should we decrease the epsilon 155 | self.min_epsilon = epsilon 156 | 157 | #creating a Q table 158 | self.q_table = np.zeros(shape = (self.state_size +(self.action_size,))) 159 | 160 | print("Q table size:", self.q_table.shape) 161 | print(" ") 162 | 163 | 164 | def preprocess_state(self,state): 165 | return tuple(discretize(state, self.state_grid)) 166 | 167 | def reset_episode(self,state): 168 | 169 | #Gradually decreasing the exploratory rate 170 | 171 | self.epsilon *= self.epsilon_decay_rate 172 | self.epsilon = max(self.epsilon,self.min_epsilon) 173 | 174 | self.last_state = self.preprocess_state(state) 175 | self.last_action = np.argmax(self.q_table[self.last_state]) 176 | return self.last_action 177 | 178 | def reset_exploration(self,epsilon = None): 179 | 180 | self.epsilon = epsilon if epsilon is not None else self.initial_epsilon 181 | 182 | def act(self,state , reward = None, done = None, mode = 'train'): 183 | 184 | state = self.preprocess_state(state) 185 | 186 | if mode == 'test': 187 | 188 | action = np.argmax(self.q_table[state]) 189 | 190 | else: 191 | 192 | self.q_table[self.last_state + (self.last_action,)] += self.alpha * (reward + self.gamma * max(self.q_table[state]) - self.q_table[self.last_state + (self.last_action,)]) 193 | 194 | # Exploration vs. 
exploitation 195 | do_exploration = np.random.uniform(0, 1) < self.epsilon 196 | if do_exploration: 197 | # Pick a random action 198 | action = np.random.randint(0, self.action_size) 199 | else: 200 | # Pick the best action from Q table 201 | action = np.argmax(self.q_table[state]) 202 | 203 | self.last_state = state 204 | self.last_action = action 205 | return action 206 | 207 | q_agent = QLearningAgent(env, state_grid) 208 | 209 | 210 | #Running the agent 211 | 212 | def run(agent,env, num_episodes = 20000,mode = 'train'): 213 | 214 | scores = [] 215 | max_avg_score = -np.inf 216 | 217 | for i_episode in range(1, num_episodes +1): 218 | state = env.reset() 219 | action= agent.reset_episode(state) 220 | total_reward = 0 221 | done = False 222 | 223 | while not done: 224 | 225 | state,reward,done,info = env.step(action) 226 | total_reward += reward 227 | action = agent.act(state,reward,done,mode) 228 | 229 | 230 | 231 | #save final scores 232 | scores.append(total_reward) 233 | 234 | #print episode stats 235 | 236 | if mode == 'train': 237 | if len(scores) > 100: 238 | avg_score = np.mean(scores[-100:]) 239 | if avg_score > max_avg_score: 240 | max_avg_score = avg_score 241 | if i_episode % 100 == 0: 242 | print("\rEpisode {}/{} | Max Average Score: {}".format(i_episode, num_episodes, max_avg_score), end="") 243 | sys.stdout.flush() 244 | 245 | return scores 246 | 247 | scores = run(q_agent, env) 248 | 249 | # Plot scores obtained per episode 250 | plt.plot(scores); plt.title("Scores"); 251 | 252 | def plot_scores(scores, rolling_window=100): 253 | """Plot scores and optional rolling mean using specified window.""" 254 | plt.plot(scores); plt.title("Scores"); 255 | rolling_mean = pd.Series(scores).rolling(rolling_window).mean() 256 | plt.plot(rolling_mean); 257 | return rolling_mean 258 | 259 | rolling_mean = plot_scores(scores) 260 | 261 | # Run in test mode and analyze scores obtained 262 | test_scores = run(q_agent, env, num_episodes=100, mode='test') 263 | print("[TEST] Completed {} episodes with avg. 
score = {}".format(len(test_scores), np.mean(test_scores))) 264 | _ = plot_scores(test_scores) 265 | 266 | def plot_q_table(q_table): 267 | """Visualize max Q-value for each state and corresponding action.""" 268 | q_image = np.max(q_table, axis=2) # max Q-value for each state 269 | q_actions = np.argmax(q_table, axis=2) # best action for each state 270 | 271 | fig, ax = plt.subplots(figsize=(10, 10)) 272 | cax = ax.imshow(q_image, cmap='jet'); 273 | cbar = fig.colorbar(cax) 274 | for x in range(q_image.shape[0]): 275 | for y in range(q_image.shape[1]): 276 | ax.text(x, y, q_actions[x, y], color='white', 277 | horizontalalignment='center', verticalalignment='center') 278 | ax.grid(False) 279 | ax.set_title("Q-table, size: {}".format(q_table.shape)) 280 | ax.set_xlabel('position') 281 | ax.set_ylabel('velocity') 282 | 283 | 284 | plot_q_table(q_agent.q_table) 285 | 286 | state_grid_new = create_uniform_grid(env.observation_space.low, env.observation_space.high, bins=(20, 20)) 287 | q_agent_new = QLearningAgent(env, state_grid_new) 288 | q_agent_new.scores = [] 289 | 290 | q_agent_new.scores += run(q_agent_new, env, num_episodes=50000) # accumulate scores 291 | rolling_mean_new = plot_scores(q_agent_new.scores) 292 | 293 | 294 | plot_q_table(q_agent_new.q_table) 295 | 296 | state = env.reset() 297 | score = 0 298 | for t in range(200): 299 | action = q_agent_new.act(state, mode='test') 300 | env.render() 301 | state, reward, done, _ = env.step(action) 302 | score += reward 303 | if done: 304 | break 305 | print('Final score:', score) 306 | env.close() 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | -------------------------------------------------------------------------------- /Frozen_lake/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Frozen_lake/.DS_Store -------------------------------------------------------------------------------- /Frozen_lake/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /Frozen_lake/Frozen_lake_v_0.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | env = gym.make('FrozenLake-v0') 4 | 5 | 6 | def policy_evaluation(policy, environment, discount_factor=1.0, theta=1e-9, max_iterations=1e9): 7 | # Number of evaluation iterations 8 | evaluation_iterations = 1 9 | # Initialize a value function for each state as zero 10 | V = np.zeros(environment.nS) 11 | # Repeat until change in value is below the threshold 12 | for i in range(int(max_iterations)): 13 | 14 | # Initialize a change of value function as zero 15 | delta = 0 16 | # Iterate though each state 17 | for state in range(environment.nS): 18 | 19 | # Initial a new value of current state 20 | v = 0 21 | # Try all possible actions which can be taken from this state 22 | for action, action_probability in enumerate(policy[state]): 23 | # Check how good next state will be 24 | for 
state_probability, next_state, reward, terminated in environment.P[state][action]: 25 | # Calculate the expected value 26 | v += action_probability * state_probability * (reward + discount_factor * V[next_state]) 27 | 28 | # Calculate the absolute change of value function 29 | delta = max(delta, np.abs(V[state] - v)) 30 | # Update value function 31 | V[state] = v 32 | evaluation_iterations += 1 33 | 34 | # Terminate if value change is insignificant 35 | if delta < theta: 36 | 37 | print(f'Policy evaluated in {evaluation_iterations} iterations.') 38 | return V 39 | 40 | def one_step_lookahead(environment, state, V, discount_factor): 41 | action_values = np.zeros(environment.nA) 42 | for action in range(environment.nA): 43 | for probability, next_state, reward, terminated in environment.P[state][action]: 44 | action_values[action] += probability * (reward + discount_factor * V[next_state]) 45 | return action_values 46 | 47 | 48 | def policy_iteration(environment, discount_factor=1.0, max_iterations=1e9): 49 | # Start with a random policy 50 | #num states x num actions / num actions 51 | policy = np.ones([environment.nS, environment.nA]) / environment.nA 52 | # Initialize counter of evaluated policies 53 | evaluated_policies = 1 54 | # Repeat until convergence or critical number of iterations reached 55 | for i in range(int(max_iterations)): 56 | stable_policy = True 57 | # Evaluate current policy 58 | V = policy_evaluation(policy, environment, discount_factor=discount_factor) 59 | # Go through each state and try to improve actions that were taken (policy Improvement) 60 | for state in range(environment.nS): 61 | # Choose the best action in a current state under current policy 62 | current_action = np.argmax(policy[state]) 63 | # Look one step ahead and evaluate if current action is optimal 64 | # We will try every possible action in a current state 65 | action_value = one_step_lookahead(environment, state, V, discount_factor) 66 | # Select a better action 67 | best_action = np.argmax(action_value) 68 | # If action didn't change 69 | if current_action != best_action: 70 | stable_policy = True 71 | # Greedy policy update 72 | policy[state] = np.eye(environment.nA)[best_action] 73 | evaluated_policies += 1 74 | # If the algorithm converged and policy is not changing anymore, then return final policy and value function 75 | if stable_policy: 76 | print(f'Evaluated {evaluated_policies} policies.') 77 | return policy, V 78 | 79 | def value_iteration(environment, discount_factor=1.0, theta=1e-9, max_iterations=1e9): 80 | # Initialize state-value function with zeros for each environment state 81 | V = np.zeros(environment.nS) 82 | for i in range(int(max_iterations)): 83 | # Early stopping condition 84 | delta = 0 85 | # Update each state 86 | for state in range(environment.nS): 87 | # Do a one-step lookahead to calculate state-action values 88 | action_value = one_step_lookahead(environment, state, V, discount_factor) 89 | # Select best action to perform based on the highest state-action value 90 | best_action_value = np.max(action_value) 91 | # Calculate change in value 92 | delta = max(delta, np.abs(V[state] - best_action_value)) 93 | # Update the value function for current state 94 | V[state] = best_action_value 95 | # Check if we can stop 96 | if delta < theta: 97 | print(f'Value-iteration converged at iteration#{i}.') 98 | break 99 | 100 | # Create a deterministic policy using the optimal value function 101 | policy = np.zeros([environment.nS, environment.nA]) 102 | for state in 
range(environment.nS): 103 | # One step lookahead to find the best action for this state 104 | action_value = one_step_lookahead(environment, state, V, discount_factor) 105 | # Select best action based on the highest state-action value 106 | best_action = np.argmax(action_value) 107 | # Update the policy to perform a better action at a current state 108 | policy[state, best_action] = 1.0 109 | return policy, V 110 | 111 | 112 | 113 | def play_episodes(environment, n_episodes, policy): 114 | wins = 0 115 | total_reward = 0 116 | for episode in range(n_episodes): 117 | terminated = False 118 | state = environment.reset() 119 | while not terminated: 120 | # Select best action to perform in a current state 121 | action = np.argmax(policy[state]) 122 | # Perform an action an observe how environment acted in response 123 | next_state, reward, terminated, info = environment.step(action) 124 | # Summarize total reward 125 | total_reward += reward 126 | # Update current state 127 | state = next_state 128 | # Calculate number of wins over episodes 129 | if terminated and reward == 1.0: 130 | wins += 1 131 | average_reward = total_reward / n_episodes 132 | return wins, total_reward, average_reward 133 | 134 | # Number of episodes to play 135 | n_episodes = 10000 136 | # Functions to find best policy 137 | solvers = [('Policy Iteration', policy_iteration), 138 | ('Value Iteration', value_iteration)] 139 | for iteration_name, iteration_func in solvers: 140 | # Load a Frozen Lake environment 141 | environment = gym.make('FrozenLake-v0') 142 | # Search for an optimal policy using policy iteration 143 | policy, V = iteration_func(environment.env) 144 | # Apply best policy to the real environment 145 | wins, total_reward, average_reward = play_episodes(environment, n_episodes, policy) 146 | print(f'{iteration_name} :: number of wins over {n_episodes} episodes = {wins}') 147 | print(f'{iteration_name} :: average reward over {n_episodes} episodes = {average_reward} \n\n') 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /Gradient_Bandit/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Gradient_Bandit/Figure_1.png -------------------------------------------------------------------------------- /Gradient_Bandit/gradient_bandit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Jul 2 08:35:45 2020 5 | 6 | @author: pavankunchala 7 | 8 | """ 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | """ 14 | in Gradient algortihm we use preferences if the following actions is more prefered the agent is more 15 | likely to prefer that action we can find the preference using softmax 16 | """ 17 | # def softmax(x): 18 | # return np.exp(x - x.max()) /np.sum(np.exp(x - x.max()),axis = 0) 19 | 20 | 21 | class grad_bandit: 22 | 23 | def __init__(self, k, alpha, iters, mu='random'): 24 | # Number of arms 25 | self.k = k 26 | self.actions = np.arange(k) 27 | # Number of iterations 28 | self.iters = iters 29 | # Step count 30 | self.n = 1 31 | # Step count for each arm 32 | self.k_n = np.ones(k) 33 | # Total mean reward 34 | 
self.mean_reward = 0 35 | self.reward = np.zeros(iters) 36 | # Mean reward for each arm 37 | self.k_reward = np.zeros(k) 38 | # Initialize preferences 39 | self.H = np.zeros(k) 40 | # Learning rate 41 | self.alpha = alpha 42 | 43 | if type(mu) == list or type(mu).__module__ == np.__name__: 44 | # User-defined averages 45 | self.mu = np.array(mu) 46 | elif mu == 'random': 47 | # Draw means from probability distribution 48 | self.mu = np.random.normal(0, 1, k) 49 | elif mu == 'sequence': 50 | # Increase the mean for each arm by one 51 | self.mu = np.linspace(0, k-1, k) 52 | 53 | def softmax(self): 54 | self.prob_action = np.exp(self.H - np.max(self.H)) \ 55 | / np.sum(np.exp(self.H - np.max(self.H)), axis=0) 56 | 57 | def pull(self): 58 | # Update probabilities 59 | self.softmax() 60 | # Select highest preference action 61 | a = np.random.choice(self.actions, p=self.prob_action) 62 | 63 | reward = np.random.normal(self.mu[a], 1) 64 | 65 | # Update counts 66 | self.n += 1 67 | self.k_n[a] += 1 68 | 69 | # Update total 70 | self.mean_reward = self.mean_reward + ( 71 | reward - self.mean_reward) / self.n 72 | 73 | # Update results for a_k 74 | self.k_reward[a] = self.k_reward[a] + ( 75 | reward - self.k_reward[a]) / self.k_n[a] 76 | 77 | # Update preferences 78 | self.H[a] = self.H[a] + \ 79 | self.alpha * (reward - self.mean_reward) * (1 - 80 | self.prob_action[a]) 81 | actions_not_taken = self.actions!=a 82 | self.H[actions_not_taken] = self.H[actions_not_taken] - self.alpha * (reward - self.mean_reward) * self.prob_action[actions_not_taken] 83 | 84 | def run(self): 85 | for i in range(self.iters): 86 | self.pull() 87 | self.reward[i] = self.mean_reward 88 | 89 | def reset(self, mu=None): 90 | # Resets results while keeping settings 91 | self.n = 0 92 | self.k_n = np.zeros(self.k) 93 | self.mean_reward = 0 94 | self.reward = np.zeros(iters) 95 | self.k_reward = np.zeros(self.k) 96 | self.H = np.zeros(self.k) 97 | if mu == 'random': 98 | self.mu = np.random.normal(0, 1, self.k) 99 | 100 | 101 | 102 | 103 | 104 | k = 10 105 | iters = 1000 106 | # Initialize bandits 107 | grad = grad_bandit(k, 0.1, iters, mu='random') 108 | 109 | grad_rewards = np.zeros(iters) 110 | opt_grad = 0 111 | 112 | episodes = 1000 113 | # Run experiments 114 | for i in range(episodes): 115 | # Reset counts and rewards 116 | grad.reset('random') 117 | 118 | 119 | grad.run() 120 | 121 | grad_rewards = grad_rewards + ( 122 | grad.reward - grad_rewards) / (i + 1) 123 | 124 | opt_grad += grad.k_n[np.argmax(grad.mu)] 125 | 126 | 127 | 128 | 129 | 130 | 131 | plt.figure(figsize=(12,8)) 132 | plt.plot(grad_rewards, label="Gradient") 133 | 134 | plt.xlabel("Iterations") 135 | plt.ylabel("Average Reward") 136 | plt.title("Average Gradient Bandit Rewards after " 137 | + str(episodes) + " Episodes") 138 | plt.show() 139 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /K-armed-Bandit/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Bandit/.DS_Store -------------------------------------------------------------------------------- /K-armed-Bandit/K-armed_Bandit-Problem.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | np.random.seed(0) 5 | 6 | class Environment: 7 | 8 | 9 | def 
__init__(self,probs): 10 | #succesfull probabilties for each arm 11 | self.probs = probs 12 | 13 | def step(self,action): 14 | return 1 if(np.random.random() < self.probs[action]) else 0 15 | 16 | 17 | class Agent: 18 | 19 | def __init__(self,nActions,eps): 20 | self.nActions= nActions 21 | self.eps = eps 22 | self.n = np.zeros(nActions,dtype = np.int) #action values n(a) 23 | self.Q = np.zeros(nActions,dtype = np.float) # value Q(a) 24 | 25 | 26 | 27 | 28 | def update_Q(self,action,reward): 29 | 30 | # this is formulae for finding the Q(a) 31 | # New estimate = Old estimate +1/n(Reward - old Estimate) 32 | self.n[action] += 1 33 | self.Q[action] += (1.0/self.n[action])*(reward - self.Q[action] ) 34 | 35 | 36 | def get_action(self): 37 | 38 | # epislon greedy policy 39 | # explore % times and exploit 1 -% times 40 | if np.random.random() < self.eps: 41 | 42 | #explore 43 | return(np.random.randint(self.nActions)) 44 | else: #exploit 45 | return np.random.choice(np.flatnonzero(self.Q == self.Q.max())) 46 | 47 | # Multi armed bandit simulation 48 | 49 | def experiment(probs , N_episodes): 50 | 51 | env = Environment(probs) # initalizinf the arm probablites 52 | agent = Agent(len(env.probs),eps) 53 | actions, rewards = [], [] 54 | 55 | for episodes in range(N_episodes): 56 | action = agent.get_action() 57 | reward = env.step(action) 58 | agent.update_Q(action, reward) 59 | actions.append(action) 60 | rewards.append(reward) 61 | 62 | return np.array(actions),np.array(rewards) 63 | 64 | 65 | 66 | #Settings 67 | 68 | probs = [0.10, 0.50, 0.60, 0.80, 0.10, 69 | 0.25, 0.60, 0.45, 0.75, 0.65] # bandit arm probabilities of success 70 | N_experiments = 10000 # number of experiments to perform 71 | N_steps = 500 # number of steps (episodes) 72 | eps = 0.1 # probability of random exploration (fraction) 73 | save_fig = True # save file in same directory 74 | output_dir = os.path.join(os.getcwd(), "output") 75 | 76 | # Run multi-armed bandit experiments 77 | print("Running multi-armed bandits with nActions = {}, eps = {}".format(len(probs), eps)) 78 | R = np.zeros((N_steps,)) # reward history sum 79 | A = np.zeros((N_steps, len(probs))) # action history sum 80 | for i in range(N_experiments): 81 | actions, rewards = experiment(probs, N_steps) # perform experiment 82 | if (i + 1) % (N_experiments / 100) == 0: 83 | print("[Experiment {}/{}] ".format(i + 1, N_experiments) + 84 | "n_steps = {}, ".format(N_steps) + 85 | "reward_avg = {}".format(np.sum(rewards) / len(rewards))) 86 | R += rewards 87 | for j, a in enumerate(actions): 88 | A[j][a] += 1 89 | 90 | # Plot reward results 91 | R_avg = R / np.float(N_experiments) 92 | plt.plot(R_avg, ".") 93 | plt.xlabel("Step") 94 | plt.ylabel("Average Reward") 95 | plt.grid() 96 | ax = plt.gca() 97 | plt.xlim([1, N_steps]) 98 | if save_fig: 99 | if not os.path.exists(output_dir): os.mkdir(output_dir) 100 | plt.savefig(os.path.join(output_dir, "rewards.png"), bbox_inches="tight") 101 | else: 102 | plt.show() 103 | plt.close() 104 | 105 | # Plot action results 106 | for i in range(len(probs)): 107 | A_pct = 100 * A[:,i] / N_experiments 108 | steps = list(np.array(range(len(A_pct)))+1) 109 | plt.plot(steps, A_pct, "-", 110 | linewidth=4, 111 | label="Arm {} ({:.0f}%)".format(i+1, 100*probs[i])) 112 | plt.xlabel("Step") 113 | plt.ylabel("Count Percentage (%)") 114 | leg = plt.legend(loc='upper left', shadow=True) 115 | plt.xlim([1, N_steps]) 116 | plt.ylim([0, 100]) 117 | for legobj in leg.legendHandles: 118 | legobj.set_linewidth(4.0) 119 | if save_fig: 120 | if not 
os.path.exists(output_dir): os.mkdir(output_dir) 121 | plt.savefig(os.path.join(output_dir, "actions.png"), bbox_inches="tight") 122 | else: 123 | plt.show() 124 | plt.close() 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /K-armed-Bandit/output/actions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Bandit/output/actions.png -------------------------------------------------------------------------------- /K-armed-Bandit/output/rewards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Bandit/output/rewards.png -------------------------------------------------------------------------------- /K-armed-Greedy/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Greedy/Figure_1.png -------------------------------------------------------------------------------- /K-armed-Greedy/Figure_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Greedy/Figure_2.png -------------------------------------------------------------------------------- /K-armed-Greedy/Figure_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Greedy/Figure_3.png -------------------------------------------------------------------------------- /K-armed-Greedy/Figure_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Greedy/Figure_4.png -------------------------------------------------------------------------------- /K-armed-Greedy/Figure_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/K-armed-Greedy/Figure_5.png -------------------------------------------------------------------------------- /K-armed-Greedy/K-armed-Greedy-and-rest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 24 15:55:24 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import pandas as pd 13 | 14 | 15 | class eps_bandit: 16 | ''' 17 | epsilon-greedy k-bandit problem 18 | 19 | Inputs 20 | ===================================================== 21 | k: number of arms (int) 22 | eps: probability of random action 0 < eps < 1 (float) 23 | iters: number of steps (int) 24 | mu: set the average rewards for each of the k-arms. 25 | Set to "random" for the rewards to be selected from 26 | a normal distribution with mean = 0. 27 | Set to "sequence" for the means to be ordered from 28 | 0 to k-1. 
29 | Pass a list or array of length = k for user-defined 30 | values. 31 | ''' 32 | 33 | def __init__(self, k, eps, iters, mu='random'): 34 | # Number of arms 35 | self.k = k 36 | # Search probability 37 | self.eps = eps 38 | # Number of iterations 39 | self.iters = iters 40 | # Step count 41 | self.n = 0 42 | # Step count for each arm 43 | self.k_n = np.zeros(k) 44 | # Total mean reward 45 | self.mean_reward = 0 46 | self.reward = np.zeros(iters) 47 | # Mean reward for each arm 48 | self.k_reward = np.zeros(k) 49 | 50 | if type(mu) == list or type(mu).__module__ == np.__name__: 51 | # User-defined averages 52 | self.mu = np.array(mu) 53 | elif mu == 'random': 54 | # Draw means from probability distribution 55 | self.mu = np.random.normal(0, 1, k) 56 | elif mu == 'sequence': 57 | # Increase the mean for each arm by one 58 | self.mu = np.linspace(0, k-1, k) 59 | 60 | def pull(self): 61 | # Generate random number 62 | p = np.random.rand() 63 | if self.eps == 0 and self.n == 0: 64 | a = np.random.choice(self.k) 65 | elif p < self.eps: 66 | # Randomly select an action 67 | a = np.random.choice(self.k) 68 | else: 69 | # Take greedy action 70 | a = np.argmax(self.k_reward) 71 | 72 | reward = np.random.normal(self.mu[a], 1) 73 | 74 | # Update counts 75 | self.n += 1 76 | self.k_n[a] += 1 77 | 78 | # Update total 79 | self.mean_reward = self.mean_reward + ( 80 | reward - self.mean_reward) / self.n 81 | 82 | # Update results for a_k 83 | self.k_reward[a] = self.k_reward[a] + ( 84 | reward - self.k_reward[a]) / self.k_n[a] 85 | 86 | def run(self): 87 | for i in range(self.iters): 88 | self.pull() 89 | self.reward[i] = self.mean_reward 90 | 91 | def reset(self): 92 | # Resets results while keeping settings 93 | self.n = 0 94 | self.k_n = np.zeros(k) 95 | self.mean_reward = 0 96 | self.reward = np.zeros(iters) 97 | self.k_reward = np.zeros(k) 98 | 99 | k = 10 100 | iters = 1000 101 | 102 | eps_0_rewards = np.zeros(iters) 103 | eps_01_rewards = np.zeros(iters) 104 | eps_1_rewards = np.zeros(iters) 105 | 106 | episodes = 1000 107 | # Run experiments 108 | for i in range(episodes): 109 | # Initialize bandits 110 | eps_0 = eps_bandit(k, 0, iters) 111 | eps_01 = eps_bandit(k, 0.01, iters, eps_0.mu.copy()) 112 | eps_1 = eps_bandit(k, 0.1, iters, eps_0.mu.copy()) 113 | 114 | # Run experiments 115 | eps_0.run() 116 | eps_01.run() 117 | eps_1.run() 118 | 119 | # Update long-term averages 120 | eps_0_rewards = eps_0_rewards + ( 121 | eps_0.reward - eps_0_rewards) / (i + 1) 122 | eps_01_rewards = eps_01_rewards + ( 123 | eps_01.reward - eps_01_rewards) / (i + 1) 124 | eps_1_rewards = eps_1_rewards + ( 125 | eps_1.reward - eps_1_rewards) / (i + 1) 126 | 127 | plt.figure(figsize=(12,8)) 128 | plt.plot(eps_0_rewards, label="$\epsilon=0$ (greedy)") 129 | plt.plot(eps_01_rewards, label="$\epsilon=0.01$") 130 | plt.plot(eps_1_rewards, label="$\epsilon=0.1$") 131 | plt.legend(bbox_to_anchor=(1.3, 0.5)) 132 | plt.xlabel("Iterations") 133 | plt.ylabel("Average Reward") 134 | plt.title("Average $\epsilon-greedy$ Rewards after " + str(episodes) 135 | + " Episodes") 136 | plt.show() 137 | 138 | k = 10 139 | iters = 1000 140 | 141 | eps_0_rewards = np.zeros(iters) 142 | eps_01_rewards = np.zeros(iters) 143 | eps_1_rewards = np.zeros(iters) 144 | eps_0_selection = np.zeros(k) 145 | eps_01_selection = np.zeros(k) 146 | eps_1_selection = np.zeros(k) 147 | 148 | episodes = 1000 149 | # Run experiments 150 | for i in range(episodes): 151 | # Initialize bandits 152 | eps_0 = eps_bandit(k, 0, iters, mu='sequence') 153 | eps_01 = 
eps_bandit(k, 0.01, iters, eps_0.mu.copy()) 154 | eps_1 = eps_bandit(k, 0.1, iters, eps_0.mu.copy()) 155 | 156 | # Run experiments 157 | eps_0.run() 158 | eps_01.run() 159 | eps_1.run() 160 | 161 | # Update long-term averages 162 | eps_0_rewards = eps_0_rewards + ( 163 | eps_0.reward - eps_0_rewards) / (i + 1) 164 | eps_01_rewards = eps_01_rewards + ( 165 | eps_01.reward - eps_01_rewards) / (i + 1) 166 | eps_1_rewards = eps_1_rewards + ( 167 | eps_1.reward - eps_1_rewards) / (i + 1) 168 | 169 | # Average actions per episode 170 | eps_0_selection = eps_0_selection + ( 171 | eps_0.k_n - eps_0_selection) / (i + 1) 172 | eps_01_selection = eps_01_selection + ( 173 | eps_01.k_n - eps_01_selection) / (i + 1) 174 | eps_1_selection = eps_1_selection + ( 175 | eps_1.k_n - eps_1_selection) / (i + 1) 176 | 177 | plt.figure(figsize=(12,8)) 178 | plt.plot(eps_0_rewards, label="$\epsilon=0$ (greedy)") 179 | plt.plot(eps_01_rewards, label="$\epsilon=0.01$") 180 | plt.plot(eps_1_rewards, label="$\epsilon=0.1$") 181 | for i in range(k): 182 | plt.hlines(eps_0.mu[i], xmin=0, 183 | xmax=iters, alpha=0.5, 184 | linestyle="--") 185 | plt.legend(bbox_to_anchor=(1.3, 0.5)) 186 | plt.xlabel("Iterations") 187 | plt.ylabel("Average Reward") 188 | plt.title("Average $\epsilon-greedy$ Rewards after " + 189 | str(episodes) + " Episodes") 190 | plt.show() 191 | 192 | 193 | bins = np.linspace(0, k-1, k) 194 | 195 | plt.figure(figsize=(12,8)) 196 | plt.bar(bins, eps_0_selection, 197 | width = 0.33, color='b', 198 | label="$\epsilon=0$") 199 | plt.bar(bins+0.33, eps_01_selection, 200 | width=0.33, color='g', 201 | label="$\epsilon=0.01$") 202 | plt.bar(bins+0.66, eps_1_selection, 203 | width=0.33, color='r', 204 | label="$\epsilon=0.1$") 205 | plt.legend(bbox_to_anchor=(1.2, 0.5)) 206 | plt.xlim([0,k]) 207 | plt.title("Actions Selected by Each Algorithm") 208 | plt.xlabel("Action") 209 | plt.ylabel("Number of Actions Taken") 210 | plt.show() 211 | 212 | opt_per = np.array([eps_0_selection, eps_01_selection, 213 | eps_1_selection]) / iters * 100 214 | df = pd.DataFrame(opt_per, index=['$\epsilon=0$', 215 | '$\epsilon=0.01$', '$\epsilon=0.1$'], 216 | columns=["a = " + str(x) for x in range(0, k)]) 217 | print("Percentage of actions selected:") 218 | df 219 | 220 | 221 | 222 | 223 | class eps_decay_bandit: 224 | ''' 225 | epsilon-decay k-bandit problem 226 | 227 | Inputs 228 | ===================================================== 229 | k: number of arms (int) 230 | iters: number of steps (int) 231 | mu: set the average rewards for each of the k-arms. 232 | Set to "random" for the rewards to be selected from 233 | a normal distribution with mean = 0. 234 | Set to "sequence" for the means to be ordered from 235 | 0 to k-1. 236 | Pass a list or array of length = k for user-defined 237 | values. 
238 | ''' 239 | 240 | def __init__(self, k, iters, mu='random'): 241 | # Number of arms 242 | self.k = k 243 | # Number of iterations 244 | self.iters = iters 245 | # Step count 246 | self.n = 0 247 | # Step count for each arm 248 | self.k_n = np.zeros(k) 249 | # Total mean reward 250 | self.mean_reward = 0 251 | self.reward = np.zeros(iters) 252 | # Mean reward for each arm 253 | self.k_reward = np.zeros(k) 254 | 255 | if type(mu) == list or type(mu).__module__ == np.__name__: 256 | # User-defined averages 257 | self.mu = np.array(mu) 258 | elif mu == 'random': 259 | # Draw means from probability distribution 260 | self.mu = np.random.normal(0, 1, k) 261 | elif mu == 'sequence': 262 | # Increase the mean for each arm by one 263 | self.mu = np.linspace(0, k-1, k) 264 | 265 | def pull(self): 266 | # Generate random number 267 | p = np.random.rand() 268 | if p < 1 / (1 + self.n / self.k): 269 | # Randomly select an action 270 | a = np.random.choice(self.k) 271 | else: 272 | # Take greedy action 273 | a = np.argmax(self.k_reward) 274 | 275 | reward = np.random.normal(self.mu[a], 1) 276 | 277 | # Update counts 278 | self.n += 1 279 | self.k_n[a] += 1 280 | 281 | # Update total 282 | self.mean_reward = self.mean_reward + ( 283 | reward - self.mean_reward) / self.n 284 | 285 | # Update results for a_k 286 | self.k_reward[a] = self.k_reward[a] + ( 287 | reward - self.k_reward[a]) / self.k_n[a] 288 | 289 | def run(self): 290 | for i in range(self.iters): 291 | self.pull() 292 | self.reward[i] = self.mean_reward 293 | 294 | def reset(self): 295 | # Resets results while keeping settings 296 | self.n = 0 297 | self.k_n = np.zeros(k) 298 | self.mean_reward = 0 299 | self.reward = np.zeros(iters) 300 | self.k_reward = np.zeros(k) 301 | 302 | k = 10 303 | iters = 1000 304 | eps_decay_rewards = np.zeros(iters) 305 | eps_1_rewards = np.zeros(iters) 306 | episodes = 1000 307 | # Run experiments 308 | for i in range(episodes): 309 | # Initialize bandits 310 | eps_decay = eps_decay_bandit(k, iters) 311 | eps_1 = eps_bandit(k, 0.1, iters, eps_decay.mu.copy()) 312 | 313 | # Run experiments 314 | eps_decay.run() 315 | eps_1.run() 316 | 317 | # Update long-term averages 318 | eps_decay_rewards = eps_decay_rewards + ( 319 | eps_decay.reward - eps_decay_rewards) / (i + 1) 320 | eps_1_rewards = eps_1_rewards + ( 321 | eps_1.reward - eps_1_rewards) / (i + 1) 322 | 323 | plt.figure(figsize=(12,8)) 324 | plt.plot(eps_decay_rewards, label="$\epsilon-decay$") 325 | plt.plot(eps_1_rewards, label="$\epsilon=0.1$") 326 | plt.legend(bbox_to_anchor=(1.2, 0.5)) 327 | plt.xlabel("Iterations") 328 | plt.ylabel("Average Reward") 329 | plt.title("Average $\epsilon-decay$ and" + 330 | "$\epsilon-greedy$ Rewards after " 331 | + str(episodes) + " Episodes") 332 | plt.show() 333 | 334 | 335 | k = 10 336 | iters = 1000 337 | oiv_rewards = np.zeros(iters) 338 | eps_decay_rewards = np.zeros(iters) 339 | eps_1_rewards = np.zeros(iters) 340 | # Select initial values 341 | oiv_init = np.repeat(5., k) 342 | episodes = 1000 343 | # Run experiments 344 | for i in range(episodes): 345 | # Initialize bandits 346 | oiv_bandit = eps_bandit(k, 0, iters) 347 | oiv_bandit.k_reward = oiv_init.copy() 348 | oiv_bandit.k_n = np.ones(k) 349 | eps_decay = eps_decay_bandit(k, iters, oiv_bandit.mu.copy()) 350 | eps_1 = eps_bandit(k, 0.1, iters, oiv_bandit.mu.copy()) 351 | 352 | # Run experiments 353 | oiv_bandit.run() 354 | eps_decay.run() 355 | eps_1.run() 356 | 357 | # Update long-term averages 358 | oiv_rewards = oiv_rewards + ( 359 | oiv_bandit.reward - 
oiv_rewards) / (i + 1) 360 | eps_decay_rewards = eps_decay_rewards + ( 361 | eps_decay.reward - eps_decay_rewards) / (i + 1) 362 | eps_1_rewards = eps_1_rewards + ( 363 | eps_1.reward - eps_1_rewards) / (i + 1) 364 | 365 | plt.figure(figsize=(12,8)) 366 | plt.plot(oiv_rewards, label="Optimistic") 367 | plt.plot(eps_decay_rewards, label="$\epsilon-decay$") 368 | plt.plot(eps_1_rewards, label="$\epsilon=0.1$") 369 | plt.legend(bbox_to_anchor=(1.2, 0.5)) 370 | plt.xlabel("Iterations") 371 | plt.ylabel("Average Reward") 372 | plt.title("Average Bandit Strategy Rewards after " + 373 | str(episodes) + " Episodes") 374 | plt.show() 375 | 376 | -------------------------------------------------------------------------------- /Monte_Carlo_Frozen_lake/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Monte_Carlo_Frozen_lake/.DS_Store -------------------------------------------------------------------------------- /Monte_Carlo_Frozen_lake/MC_Frozenlake.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jul 11 09:34:37 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import gym 11 | import numpy as np 12 | import operator 13 | 14 | from IPython.display import clear_output 15 | 16 | from time import sleep 17 | import random 18 | 19 | import itertools 20 | import tqdm 21 | 22 | tqdm.monitor_interval = 0 23 | 24 | 25 | #Random Policy 26 | 27 | def create_random_policy(env): 28 | policy = {} 29 | for key in range(0, env.observation_space.n): 30 | current_end = 0 31 | p = {} 32 | 33 | for action in range(0,env.action_space.n): 34 | 35 | p[action] = 1/env.action_space.n 36 | 37 | policy[key] = p 38 | 39 | return policy 40 | 41 | 42 | 43 | #dictionary for thr state_action_value 44 | 45 | 46 | def create_state_action_dictionary(env,policy): 47 | 48 | Q = {} 49 | 50 | for key in policy.keys(): 51 | Q[key] = {a: 0.0 for a in range(0, env.action_space.n)} 52 | 53 | return Q 54 | 55 | 56 | #To play episodes 57 | 58 | 59 | 60 | def run_game(env,policy,display = True): 61 | 62 | env.reset() 63 | 64 | episode =[] 65 | 66 | finished = False 67 | 68 | while not finished: 69 | s = env.env.s 70 | 71 | if display: 72 | 73 | clear_output(True) 74 | env.render() 75 | sleep(1) 76 | 77 | 78 | timestep =[] 79 | timestep.append(s) 80 | 81 | n = random.uniform(0, sum(policy[s].values())) 82 | 83 | 84 | top_range = 0 85 | 86 | for prob in policy[s].items(): 87 | 88 | top_range += prob[1] 89 | 90 | if n action values 69 | Q = defaultdict(lambda: np.zeros(env.action_space.n)) 70 | 71 | #the cumulative denominator of weighted importance sampling 72 | 73 | C = defaultdict(lambda:np.zeros(env.action_space.n)) 74 | 75 | #the greedy polcy we want to learn 76 | target_policy = create_greedy_policy(Q) 77 | 78 | 79 | for i_episode in range(1,num_episodes+1): 80 | 81 | 82 | if i_episode % 1000 == 0: 83 | print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="") 84 | sys.stdout.flush() 85 | 86 | #Generate an episode {it's an array of state state , action ,reward} 87 | 88 | episode = [] 89 | state = env.reset() 90 | 91 | for t in range(100): 92 | 93 | #sampling an action from our policy 94 | probs = behaviour_policy(state) 95 | action = np.random.choice(np.arange(len(probs)), p = probs) 96 | 97 | next_state, reward , done ,_ = env.step(action) 98 | 
episode.append((state,action,reward)) 99 | 100 | if done: 101 | break 102 | state = next_state 103 | 104 | 105 | #Sum of discounted sums 106 | G = 0.0 107 | #The importance samplingg ratio 108 | W = 1.0 109 | 110 | 111 | #for each step in episode,backwards 112 | for t in range(len(episode))[::-1]: 113 | 114 | 115 | state,action,reward = episode[t] 116 | 117 | # update total reward 118 | G = discount_factor *G +reward 119 | 120 | #updating weight importance sampling 121 | C[state][action] +=W 122 | 123 | Q[state][action] += (W/C[state][action])* (G - Q[state][action]) 124 | 125 | 126 | if action != np.argmax(target_policy(state)): 127 | break 128 | W = W*1/behaviour_policy(state)[action] 129 | 130 | return Q, target_policy 131 | 132 | 133 | 134 | random_policy = create_random_policy(env.action_space.n) 135 | Q, policy = mc_control_importance_sampling(env, num_episodes = 500000, behaviour_policy= random_policy) 136 | 137 | 138 | # For plotting: Create value function from action-value function 139 | # by picking the best action at each state 140 | V = defaultdict(float) 141 | for state, action_values in Q.items(): 142 | action_value = np.max(action_values) 143 | V[state] = action_value 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /Pac-Man/pacman_DQN.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Sep 4 15:26:40 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import gym 10 | import time 11 | import numpy as np 12 | 13 | import tensorflow as tf 14 | import keras 15 | #creating the environment 16 | env=gym.make('MsPacman-v0') 17 | 18 | print(env.action_space) 19 | 20 | print(env.observation_space) 21 | 22 | n_height = 210 23 | n_width = 160 24 | n_depth = 3 25 | n_shape = [n_height,n_width,n_depth] 26 | n_inputs = n_height * n_width * n_depth 27 | env.frameskip = 3 28 | 29 | 30 | frame_time = 1.0 / 15 # seconds 31 | 32 | n_episodes = 500 33 | scores = [] 34 | for i_episode in range(n_episodes): 35 | t=0 36 | score=0 37 | then = 0 38 | done = False 39 | env.reset() 40 | while not done: 41 | now = time.time() 42 | if frame_time < now - then: 43 | action = env.action_space.sample() 44 | observation, reward, done, info = env.step(action) 45 | score += reward 46 | #env.render() 47 | then = now 48 | t=t+1 49 | scores.append(score) 50 | 51 | print('Average score {}, max {}, min {}'.format(np.mean(scores),np.max(scores),np.min(scores) )) 52 | 53 | tf.reset_default_graph() 54 | keras.backend.clear_session() 55 | 56 | #Applying deep - Q - learning 57 | 58 | def policy_q_nn(obs,env): 59 | 60 | #explore strategy 61 | if np.random.random() < explore_rate: 62 | 63 | action = env.action_space.sample() 64 | #exploitation strategy 65 | else : 66 | action = np.argmax(q_nn.predict(np.array([obs]))) 67 | return action 68 | 69 | def episode(env, policy, r_max = 0,t_max = 0): 70 | 71 | # create the empty list to contain game memory 72 | memory = deque(maxlen=1000) 73 | 74 | obs = env.reset() 75 | state_prev = obs 76 | 77 | episode_reward = 0 78 | done = False 79 | t = 0 80 | 81 | while not done: 82 | 83 | action = policy(state_prev, env) 84 | obs, reward, done, info = env.step(action) 85 | state_next = obs 86 | 87 | # add the state_prev, action, reward, state_new, done to memory 88 | memory.append([state_prev,action,reward,state_next,done]) 89 | 90 | # Generate and update the q_values with 91 | # maximum 
future rewards using bellman function: 92 | states = np.array([x[0] for x in memory]) 93 | states_next = np.array([np.zeros(n_shape) if x[4] else x[3] for x in memory]) 94 | 95 | 96 | q_values = q_nn.predict(states) 97 | q_values_next = q_nn.predict(states_next) 98 | 99 | for i in range(len(memory)): 100 | 101 | state_prev,action,reward,state_next,done = memory[i] 102 | if done: 103 | 104 | q_values[i,action] = reward 105 | 106 | else: 107 | 108 | best_q = np.amax(q_values_next[i]) 109 | 110 | bellman_q = reward + discount_rate * best_q 111 | q_values[i,action] = bellman_q 112 | 113 | # train the q_nn with states and q_values, same as updating the q_table 114 | q_nn.fit(states,q_values,epochs=1,batch_size=50,verbose=0) 115 | 116 | state_prev = state_next 117 | 118 | episode_reward += reward 119 | if r_max > 0 and episode_reward > r_max: 120 | break 121 | t+=1 122 | if t_max > 0 and t == t_max: 123 | break 124 | return episode_reward 125 | # experiment collect observations and rewards for each episode 126 | def experiment(env, policy, n_episodes,r_max=0, t_max=0): 127 | 128 | rewards=np.empty(shape=[n_episodes]) 129 | for i in range(n_episodes): 130 | val = episode(env, policy, r_max, t_max) 131 | #print('episode:{}, reward {}'.format(i,val)) 132 | rewards[i]=val 133 | 134 | print('Policy:{}, Min reward:{}, Max reward:{}, Average reward:{}' 135 | .format(policy.__name__, 136 | np.min(rewards), 137 | np.max(rewards), 138 | np.mean(rewards))) 139 | 140 | 141 | from collections import deque 142 | from tensorflow.keras.models import Sequential 143 | from tensorflow.keras.layers import Dense, Flatten 144 | 145 | # build the Q-Network 146 | model = Sequential() 147 | model.add(Flatten(input_shape = n_shape)) 148 | model.add(Dense(512, activation='relu',name='hidden1')) 149 | model.add(Dense(9, activation='softmax', name='output')) 150 | model.compile(loss='categorical_crossentropy',optimizer='adam') 151 | model.summary() 152 | q_nn = model 153 | 154 | # Hyperparameters 155 | 156 | discount_rate = 0.9 157 | explore_rate = 0.2 158 | n_episodes = 1 159 | 160 | # create the empty list to contain game memory 161 | memory = deque(maxlen=1000) 162 | 163 | experiment(env, policy_q_nn, n_episodes) 164 | 165 | 166 | # Hyperparameters 167 | 168 | discount_rate = 0.9 169 | explore_rate = 0.2 170 | n_episodes = 100 171 | 172 | # create the empty list to contain game memory 173 | memory = deque(maxlen=1000) 174 | 175 | experiment(env, policy_q_nn, n_episodes) 176 | 177 | 178 | from collections import deque 179 | from tensorflow.keras.models import Sequential 180 | from tensorflow.keras.layers import Dense, Flatten 181 | from tensorflow.keras.layers import Conv2D, MaxPooling2D 182 | 183 | # build the CNN Q-Network 184 | model = Sequential() 185 | model.add(Conv2D(16, kernel_size=(5, 5), 186 | strides=(1, 1), 187 | activation='relu', 188 | input_shape=n_shape)) 189 | model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2))) 190 | model.add(Flatten()) 191 | model.add(Dense(512, activation='relu',name='hidden1')) 192 | model.add(Dense(9, activation='softmax', name='output')) 193 | model.compile(loss='categorical_crossentropy',optimizer='adam') 194 | model.summary() 195 | q_nn = model 196 | 197 | 198 | # Hyperparameters 199 | 200 | discount_rate = 0.9 201 | explore_rate = 0.2 202 | n_episodes = 100 203 | 204 | # create the empty list to contain game memory 205 | memory = deque(maxlen=1000) 206 | 207 | experiment(env, policy_q_nn, n_episodes) 208 | 209 | env.close() 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 
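
The episode() function above recomputes Bellman targets for every transition currently held in memory and refits q_nn on the whole deque at every step. A more common variant samples a small random minibatch from the replay memory for each update; the sketch below shows that variant using the same q_nn, memory, and discount_rate names as the script. The function name, the batch size of 32, and the sampling strategy are assumptions added here for illustration, not part of the original code.

import random
import numpy as np

def replay_update(q_nn, memory, batch_size=32, discount_rate=0.9):
    # Wait until enough transitions have been collected
    if len(memory) < batch_size:
        return
    batch = random.sample(list(memory), batch_size)
    states = np.array([x[0] for x in batch])
    # Terminal transitions get a dummy zero next-state, mirroring episode() above
    next_states = np.array([np.zeros(states.shape[1:]) if x[4] else x[3] for x in batch])
    q_values = q_nn.predict(states)
    q_next = q_nn.predict(next_states)
    for i, (_, action, reward, _, done) in enumerate(batch):
        # Bellman target: r for terminal transitions, r + gamma * max_a' Q(s', a') otherwise
        q_values[i, action] = reward if done else reward + discount_rate * np.amax(q_next[i])
    q_nn.fit(states, q_values, epochs=1, verbose=0)
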
218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /Ping_pong/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Ping_pong/.DS_Store -------------------------------------------------------------------------------- /Ping_pong/ping_pong.py: -------------------------------------------------------------------------------- 1 | import gym import numpy as np def downsample(image): # Take only alternate pixels - basically halves the resolution of the image (which is fine for us) return image[::2, ::2, :] def remove_color(image): """Convert all color (RGB is the third dimension in the image)""" return image[:, :, 0] def remove_background(image): image[image == 144] = 0 image[image == 109] = 0 return image def preprocess_observations(input_observation, prev_processed_observation, input_dimensions): """ convert the 210x160x3 uint8 frame into a 6400 float vector """ processed_observation = input_observation[35:195] # crop processed_observation = downsample(processed_observation) processed_observation = remove_color(processed_observation) processed_observation = remove_background(processed_observation) processed_observation[processed_observation != 0] = 1 # everything else (paddles, ball) just set to 1 # Convert from 80 x 80 matrix to 1600 x 1 matrix processed_observation = processed_observation.astype(np.float).ravel() # subtract the previous frame from the current one so we are only processing on changes in the game if prev_processed_observation is not None: input_observation = processed_observation - prev_processed_observation else: input_observation = np.zeros(input_dimensions) # store the previous frame so we can subtract from it next time prev_processed_observations = processed_observation return input_observation, prev_processed_observations def sigmoid(x): return 1.0/(1.0 + np.exp(-x)) def relu(vector): vector[vector < 0] = 0 return vector def apply_neural_nets(observation_matrix, weights): """ Based on the observation_matrix and weights, compute the new hidden layer values and the new output layer values""" hidden_layer_values = np.dot(weights['1'], observation_matrix) hidden_layer_values = relu(hidden_layer_values) output_layer_values = np.dot(hidden_layer_values, weights['2']) output_layer_values = sigmoid(output_layer_values) return hidden_layer_values, output_layer_values def choose_action(probability): random_value = np.random.uniform() if random_value < probability: # signifies up in openai gym return 2 else: # signifies down in openai gym return 3 def compute_gradient(gradient_log_p, hidden_layer_values, observation_values, weights): """ See here: http://neuralnetworksanddeeplearning.com/chap2.html""" delta_L = gradient_log_p dC_dw2 = np.dot(hidden_layer_values.T, delta_L).ravel() delta_l2 = np.outer(delta_L, weights['2']) delta_l2 = relu(delta_l2) dC_dw1 = np.dot(delta_l2.T, observation_values) return { '1': dC_dw1, '2': dC_dw2 } def update_weights(weights, expectation_g_squared, g_dict, decay_rate, learning_rate): """ See here: http://sebastianruder.com/optimizing-gradient-descent/index.html#rmsprop""" epsilon = 1e-5 for layer_name in weights.keys(): g = g_dict[layer_name] expectation_g_squared[layer_name] = decay_rate * expectation_g_squared[layer_name] + (1 - decay_rate) * g**2 weights[layer_name] += (learning_rate * 
g)/(np.sqrt(expectation_g_squared[layer_name] + epsilon)) g_dict[layer_name] = np.zeros_like(weights[layer_name]) # reset batch gradient buffer def discount_rewards(rewards, gamma): """ Actions you took 20 steps before the end result are less important to the overall result than an action you took a step ago. This implements that logic by discounting the reward on previous actions based on how long ago they were taken""" discounted_rewards = np.zeros_like(rewards) running_add = 0 for t in reversed(range(0, rewards.size)): if rewards[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!) running_add = running_add * gamma + rewards[t] discounted_rewards[t] = running_add return discounted_rewards def discount_with_rewards(gradient_log_p, episode_rewards, gamma): """ discount the gradient with the normalized rewards """ discounted_episode_rewards = discount_rewards(episode_rewards, gamma) # standardize the rewards to be unit normal (helps control the gradient estimator variance) discounted_episode_rewards -= np.mean(discounted_episode_rewards) discounted_episode_rewards /= np.std(discounted_episode_rewards) return gradient_log_p * discounted_episode_rewards def main(): env = gym.make("Pong-v0") observation = env.reset() # This gets us the image # hyperparameters episode_number = 0 batch_size = 10 gamma = 0.99 # discount factor for reward decay_rate = 0.99 num_hidden_layer_neurons = 200 input_dimensions = 80 * 80 learning_rate = 1e-4 episode_number = 0 reward_sum = 0 running_reward = None prev_processed_observations = None weights = { '1': np.random.randn(num_hidden_layer_neurons, input_dimensions) / np.sqrt(input_dimensions), '2': np.random.randn(num_hidden_layer_neurons) / np.sqrt(num_hidden_layer_neurons) } # To be used with rmsprop algorithm (http://sebastianruder.com/optimizing-gradient-descent/index.html#rmsprop) expectation_g_squared = {} g_dict = {} for layer_name in weights.keys(): expectation_g_squared[layer_name] = np.zeros_like(weights[layer_name]) g_dict[layer_name] = np.zeros_like(weights[layer_name]) episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], [] while True: env.render() processed_observations, prev_processed_observations = preprocess_observations(observation, prev_processed_observations, input_dimensions) hidden_layer_values, up_probability = apply_neural_nets(processed_observations, weights) episode_observations.append(processed_observations) episode_hidden_layer_values.append(hidden_layer_values) action = choose_action(up_probability) # carry out the chosen action observation, reward, done, info = env.step(action) reward_sum += reward episode_rewards.append(reward) # see here: http://cs231n.github.io/neural-networks-2/#losses fake_label = 1 if action == 2 else 0 loss_function_gradient = fake_label - up_probability episode_gradient_log_ps.append(loss_function_gradient) if done: # an episode finished episode_number += 1 # Combine the following values for the episode episode_hidden_layer_values = np.vstack(episode_hidden_layer_values) episode_observations = np.vstack(episode_observations) episode_gradient_log_ps = np.vstack(episode_gradient_log_ps) episode_rewards = np.vstack(episode_rewards) # Tweak the gradient of the log_ps based on the discounted rewards episode_gradient_log_ps_discounted = discount_with_rewards(episode_gradient_log_ps, episode_rewards, gamma) gradient = compute_gradient( episode_gradient_log_ps_discounted, episode_hidden_layer_values, episode_observations, 
weights ) # Sum the gradient for use when we hit the batch size for layer_name in gradient: g_dict[layer_name] += gradient[layer_name] if episode_number % batch_size == 0: update_weights(weights, expectation_g_squared, g_dict, decay_rate, learning_rate) episode_hidden_layer_values, episode_observations, episode_gradient_log_ps, episode_rewards = [], [], [], [] # reset values observation = env.reset() # reset env running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01 print ('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward)) reward_sum = 0 prev_processed_observations = None main() -------------------------------------------------------------------------------- /Policy_eval_Grid_World/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Policy_eval_Grid_World/.DS_Store -------------------------------------------------------------------------------- /Policy_eval_Grid_World/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Policy_eval_Grid_World/Figure_1.png -------------------------------------------------------------------------------- /Policy_eval_Grid_World/policy_eval_GridWorld.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jul 3 10:42:03 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import numpy as np 11 | from tqdm import tqdm 12 | import matplotlib.pyplot as plt 13 | import seaborn as sns 14 | sns.set_style('darkgrid') 15 | import random 16 | 17 | 18 | #Parameters 19 | 20 | gamma = 1 #Discounting Rate range from (0 to 1) 21 | rewardSize = -1 22 | gridSize = 4 23 | terminationStates = [[0,0], [gridSize-1,gridSize-1]] 24 | actions = [[-1,0],[1,0],[0,1],[0,-1]] 25 | numIterations = 1000 26 | 27 | #Utilites 28 | 29 | def actionRewardFunction(initalPostion,action): 30 | 31 | if initalPostion in terminationStates: 32 | return initalPostion,0 33 | reward = rewardSize 34 | finalPosition = np.array(initalPostion) + np.array(action) 35 | 36 | 37 | if -1 in finalPosition or 4 in finalPosition: 38 | finalPosition = initalPostion 39 | 40 | return finalPosition,reward 41 | 42 | 43 | 44 | #initalization 45 | 46 | valueMap = np.zeros((gridSize,gridSize)) 47 | 48 | 49 | states = [[i,j] for i in range(gridSize) for j in range(gridSize)] 50 | 51 | 52 | 53 | #policiy evaluation 54 | 55 | deltas = [] 56 | for it in range(numIterations): 57 | copyValueMap = np.copy(valueMap) 58 | deltaState = [] 59 | for state in states: 60 | weightedRewards = 0 61 | for action in actions: 62 | finalPosition, reward = actionRewardFunction(state, action) 63 | weightedRewards += (1/len(actions))*(reward+(gamma*valueMap[finalPosition[0], finalPosition[1]])) 64 | deltaState.append(np.abs(copyValueMap[state[0], state[1]]-weightedRewards)) 65 | copyValueMap[state[0], state[1]] = weightedRewards 66 | deltas.append(deltaState) 67 | valueMap = copyValueMap 68 | if it in [0,1,2,9, 99, numIterations-1]: 69 | print("Iteration {}".format(it+1)) 70 | print(valueMap) 71 | print("") 72 | 73 | 74 | 75 | 76 | plt.figure(figsize=(20, 10)) 77 | plt.legend() 78 | plt.plot(deltas) 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 
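For reference, the sweep in policy_eval_GridWorld.py above is the synchronous iterative-policy-evaluation backup for the equiprobable random policy: each of the four moves is taken with probability 1/4, every transition pays rewardSize = -1, gamma = 1, and a move that would leave the grid keeps the agent in place. One update of a non-terminal state s is

$$ v_{k+1}(s) \;=\; \sum_{a} \tfrac{1}{4}\bigl[\, -1 + \gamma\, v_k(s'_a) \,\bigr], $$

where s'_a is the successor of s under action a. The deltas list stores |v_{k+1}(s) - v_k(s)| for every state in each sweep, which is what the final plot shows shrinking towards zero.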
| 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /Q-Learning/Q-learning.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jul 18 15:01:39 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | #Q learning 11 | 12 | import random 13 | import gym 14 | 15 | env = gym.make('Taxi-v3') 16 | 17 | env.render() 18 | 19 | 20 | q = {} 21 | for s in range(env.observation_space.n): 22 | for a in range(env.action_space.n): 23 | q[(s,a)] = 0.0 24 | 25 | 26 | def update_q_table(prev_state,action,reward,next_state,alpha,gamma): 27 | qa = max([q[(next_state,a)] for a in range(env.action_space.n)]) 28 | q[prev_state,action] += alpha *(reward + qa - q[prev_state,action]) 29 | 30 | 31 | 32 | def epsilon_greedy(state,epsilon): 33 | 34 | if random.uniform(0, 1) < epsilon: 35 | 36 | #takinf a random action 37 | return env.action_space.sample() 38 | 39 | else: 40 | 41 | #taking a greedy action 42 | return max(list(range(env.action_space.n)), key = lambda x: q[(state,x)]) 43 | 44 | 45 | 46 | 47 | 48 | alpha = 0.4 49 | gamma = 0.999 50 | epsilon = 0.017 51 | 52 | for i in range(8000): 53 | r = 0 54 | prev_state = env.reset() 55 | #env.render() 56 | 57 | 58 | while True: 59 | 60 | env.render() 61 | 62 | 63 | action = epsilon_greedy(prev_state, epsilon) 64 | 65 | next_state, reward,done , _ = env.step(action) 66 | 67 | update_q_table(prev_state, action, reward, next_state, alpha, gamma) 68 | 69 | prev_state = next_state 70 | 71 | r += reward 72 | 73 | 74 | if done: 75 | break 76 | 77 | print("total reward:", r) 78 | 79 | 80 | env.close() 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcment-Learning 2 | **Reinforcement learning (RL)** is an area of machine learning concerned with how software agents ought to take actions in an environment in order to maximize the notion of cumulative reward. Reinforcement learning is one of three basic machine learning paradigms, alongside supervised learning and unsupervised learning. 3 | 4 | In this repository we are going have codes for the algorithms of reinforcement learning 5 | 6 | * You can also check the instructions to installation of **Gym** [here](https://gym.openai.com/docs/) 7 | 8 | 9 | ## Install Gym 10 | 11 | `pip install gym` 12 | 13 | or 14 | 15 | ``` 16 | git clone https://github.com/openai/gym 17 | cd gym 18 | pip install -e . 
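# optional: quick sanity check that the editable install above worked
python -c "import gym; print(gym.__version__)"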
19 | ``` 20 | 21 | ### Example 22 | 23 | An example to see wheter _gym_ is working or not 24 | 25 | 26 | ``` 27 | import gym 28 | env = gym.make('CartPole-v0') 29 | env.reset() 30 | for _ in range(1000): 31 | env.render() 32 | env.step(env.action_space.sample()) # take a random action 33 | env.close() 34 | 35 | ``` 36 | 37 | ### The code for Cartpole environment 38 | 39 | ``` 40 | import gym 41 | env = gym.make('CartPole-v0') 42 | for i_episode in range(20): 43 | observation = env.reset() 44 | for t in range(100): 45 | env.render() 46 | print(observation) 47 | action = env.action_space.sample() 48 | observation, reward, done, info = env.step(action) 49 | if done: 50 | print("Episode finished after {} timesteps".format(t+1)) 51 | break 52 | env.close() 53 | ``` 54 | 55 | ## Table of Contents 56 | * [Temporal-Difference](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Temporal-Difference) 57 | * [K-Armed-Bandit](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/K-armed-Bandit) 58 | 59 | * [Tile-Coding](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Tile-coding%20) 60 | 61 | 62 | * [Q-learnig](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Q-Learning) 63 | 64 | * [Deep-Q-network](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Deep-Q-Network) 65 | 66 | 67 | * [Sarsa](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Sarsa) 68 | 69 | 70 | * [Pacman](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Pac-Man) 71 | 72 | 73 | * [Frozen-Lake](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Frozen_lake) 74 | 75 | 76 | * [Reinforce](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Reinforce) 77 | 78 | 79 | * [Upper-Confidence-Bound](https://github.com/Pavankunchala/Reinforcement-Learning/tree/master/Upper-Confidence-Bound%20) 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /Reinforce/policy_graident.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Aug 25 16:38:59 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | #importing stuff 10 | import gym 11 | gym.logger.set_level(40) # suppress warnings (please remove if gives error) 12 | import numpy as np 13 | from collections import deque 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | 18 | import torch 19 | torch.manual_seed(0) # set random seed 20 | import torch.nn as nn 21 | import torch.nn.functional as F 22 | import torch.optim as optim 23 | from torch.distributions import Categorical 24 | 25 | 26 | env = gym.make('CartPole-v0') 27 | env.seed(0) 28 | print('observation space:', env.observation_space) 29 | print('action space:', env.action_space) 30 | 31 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 32 | 33 | class Policy(nn.Module): 34 | def __init__(self, s_size=4, h_size=16, a_size=2): 35 | super(Policy, self).__init__() 36 | self.fc1 = nn.Linear(s_size, h_size) 37 | self.fc2 = nn.Linear(h_size, a_size) 38 | 39 | def forward(self, x): 40 | x = F.relu(self.fc1(x)) 41 | x = self.fc2(x) 42 | return F.softmax(x, dim=1) 43 | 44 | def act(self, state): 45 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 46 | probs = self.forward(state).cpu() 47 | m = Categorical(probs) 48 | action = m.sample() 49 | return action.item(), m.log_prob(action) 50 | 51 | 52 | 53 | policy = 
Policy().to(device) 54 | optimizer = optim.Adam(policy.parameters(), lr=1e-2) 55 | 56 | def reinforce(n_episodes=1000, max_t=1000, gamma=1.0, print_every=100): 57 | scores_deque = deque(maxlen=100) 58 | scores = [] 59 | for i_episode in range(1, n_episodes+1): 60 | saved_log_probs = [] 61 | rewards = [] 62 | state = env.reset() 63 | for t in range(max_t): 64 | action, log_prob = policy.act(state) 65 | saved_log_probs.append(log_prob) 66 | state, reward, done, _ = env.step(action) 67 | rewards.append(reward) 68 | if done: 69 | break 70 | scores_deque.append(sum(rewards)) 71 | scores.append(sum(rewards)) 72 | 73 | discounts = [gamma**i for i in range(len(rewards)+1)] 74 | R = sum([a*b for a,b in zip(discounts, rewards)]) 75 | 76 | policy_loss = [] 77 | for log_prob in saved_log_probs: 78 | policy_loss.append(-log_prob * R) 79 | policy_loss = torch.cat(policy_loss).sum() 80 | 81 | optimizer.zero_grad() 82 | policy_loss.backward() 83 | optimizer.step() 84 | 85 | if i_episode % print_every == 0: 86 | print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque))) 87 | if np.mean(scores_deque)>=195.0: 88 | print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque))) 89 | break 90 | 91 | return scores 92 | 93 | scores = reinforce() 94 | 95 | 96 | fig = plt.figure() 97 | ax = fig.add_subplot(111) 98 | plt.plot(np.arange(1, len(scores)+1), scores) 99 | plt.ylabel('Score') 100 | plt.xlabel('Episode #') 101 | plt.show() 102 | 103 | 104 | env = gym.make('CartPole-v0') 105 | 106 | state = env.reset() 107 | for t in range(1000): 108 | action, _ = policy.act(state) 109 | env.render() 110 | state, reward, done, _ = env.step(action) 111 | if done: 112 | break 113 | 114 | env.close() 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /Sarsa/Sarsa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Jul 18 11:21:52 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import gym 13 | 14 | import random 15 | 16 | env = gym.make('Taxi-v3') 17 | 18 | env.render() 19 | 20 | Q = {} 21 | 22 | for s in range(env.observation_space.n): 23 | for a in range(env.action_space.n): 24 | Q[(s,a)] = 0.0 25 | 26 | 27 | 28 | def epsilon_greedy(state,epsilon): 29 | 30 | if random.uniform(0,1) < epsilon: 31 | 32 | #taking a random action 33 | 34 | return env.action_space.sample() 35 | 36 | else: 37 | 38 | #taking a gredy action 39 | 40 | return max(list(range(env.action_space.n)), key = lambda x: Q[(state,x)]) 41 | 42 | 43 | 44 | 45 | alpha = 0.85 46 | gamma = 0.9 47 | epsilon = 0.8 48 | 49 | #performing Sarsa 50 | 51 | 52 | for i in range(4000): 53 | 54 | # we store cumulative reward of each episodes in r 55 | r = 0 56 | 57 | # initialize the state, 58 | state = env.reset() 59 | 60 | # select the action using epsilon-greedy policy 61 | action = epsilon_greedy(state,epsilon) 62 | 63 | while True: 64 | 65 | 66 | env.render() 67 | 68 | # then we perform the action and move to the next state, and receive the reward 69 | nextstate, reward, done, _ = env.step(action) 70 | 71 | # again, we select the next action using epsilon greedy policy 72 | nextaction = epsilon_greedy(nextstate,epsilon) 73 | 74 | # we 
calculate the Q value of previous state using our update rule 75 | Q[(state,action)] += alpha * (reward + gamma * Q[(nextstate,nextaction)]-Q[(state,action)]) 76 | 77 | # finally we update our state and action with next action and next state 78 | action = nextaction 79 | state = nextstate 80 | 81 | # store the rewards 82 | r += reward 83 | 84 | # we will break the loop, if we are at the terminal state of the episode 85 | if done: 86 | break 87 | 88 | print("total reward: ", r) 89 | 90 | env.close() 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /Sarsa/n-Sarsa_and_Sarsa(lambda).py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Aug 6 14:28:44 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | # Importing part 11 | 12 | import gym 13 | import itertools 14 | import matplotlib 15 | import numpy as np 16 | import pandas as pd 17 | import sys 18 | import time 19 | import timeit 20 | 21 | from collections import namedtuple 22 | 23 | import os 24 | 25 | import glob 26 | 27 | from lib.tile_coding import IHT, tiles 28 | 29 | from matplotlib import pyplot as plt 30 | from matplotlib import cm 31 | matplotlib.style.use('ggplot') 32 | 33 | import io 34 | import base64 35 | 36 | from IPython.display import HTML 37 | 38 | #creating the environment 39 | 40 | env = gym.make('MountainCar-v0') 41 | 42 | env._max_episode_steps = 3000 #increse the upper time limit 43 | np.random.seed(6) # Make plots reproducible 44 | 45 | 46 | 47 | class QEstimator(): 48 | 49 | 50 | def __init__(self,step_size,num_tilings = 8,max_size = 4096,tiling_dim = None, trace = False): 51 | 52 | 53 | self.trace = trace 54 | self.max_size = max_size 55 | self.num_tilings = num_tilings 56 | self.tiling_dim = tiling_dim or num_tilings 57 | 58 | #alpha is the fraction of step_size and num_tilings 59 | 60 | self.alpha = step_size/num_tilings 61 | 62 | #initalzinf the hash table for tile coding and keeping it in max 63 | self.iht = IHT(max_size) 64 | 65 | 66 | #initalzizinf the weights 67 | self.weights = np.zeros(max_size) 68 | if self.trace: 69 | self.z = np.zeros(max_size) 70 | 71 | 72 | #tilecoding software partitions at integer boundaries 73 | 74 | 75 | self.postion_scale = self.tiling_dim / (env.observation_space.high[0] 76 | - env.observation_space.low[0] ) 77 | self.velocity_scale = self.tiling_dim/ ( env.observation_space.high[1] 78 | - env.observation_space.low[1] ) 79 | 80 | 81 | def featurize_state_action(self,state,action): 82 | 83 | #returns the featurized repesentation of state action pair 84 | 85 | 86 | featurized = tiles(self.iht,self.num_tilings, 87 | [self.postion_scale * state[0], 88 | self.velocity_scale * state[1]], 89 | [action] 90 | ) 91 | 92 | return featurized 93 | 94 | def predict(self,s , a = None): 95 | 96 | #predicitng q-value(s) 97 | 98 | 99 | if a is None: 100 | features = [self.featurize_state_action(s, i) for i in 101 | range(env.action_space.n)] 102 | 103 | else: 104 | features = [ self.featurize_state_action(s, a)] 105 | 106 | return [np.sum(self.weights[f]) for f in features] 107 | 108 | def update(self,s,a,target): 109 | 110 | # updates the estimator parameters 111 | 112 | features = self.featurize_state_action(s, a) 113 | 114 | # linear function Approx 115 | estimation = np.sum(self.weights[features]) 116 | 117 | delta = (target-estimation) 118 | 119 | if self.trace: 120 | # self.z[features] += 1 # 
Accumulating trace 121 | self.z[features] = 1 # Replacing trace 122 | self.weights += self.alpha * delta * self.z 123 | else: 124 | self.weights[features] += self.alpha * delta 125 | 126 | 127 | def reset(self,z_only = False): 128 | 129 | 130 | if z_only: 131 | 132 | assert self.trace #'q-value estimator has no z to reset.'m 133 | self.z = np.zeros(self.max_size) 134 | else: 135 | if self.trace: 136 | self.z = np.zeros(self.max_size) 137 | self.weights = np.zeros(self.max_size) 138 | 139 | 140 | 141 | def make_epsilon_greedy_policy(estimator,epsilon,num_actions ) : 142 | 143 | def policy_fn(observation): 144 | 145 | action_probs = np.ones(num_actions,dtype = float)*epsilon / num_actions 146 | 147 | q_values= estimator.predict(observation) 148 | 149 | best_action_idx = np.argmax(q_values) 150 | 151 | action_probs[best_action_idx] += (1.0- epsilon ) 152 | 153 | return action_probs 154 | return policy_fn 155 | 156 | 157 | 158 | # defining Sarsa n 159 | 160 | 161 | 162 | def sarsa_n(n,env,estimator,gamma = 1.0,epsilon= 1.0): 163 | 164 | 165 | # create epslion greedy policy 166 | 167 | policy = make_epsilon_greedy_policy(estimator, epsilon, env.action_space.n) 168 | 169 | #Resetting the environment 170 | 171 | state = env.reset() 172 | 173 | action_probs = policy(state) 174 | 175 | action = np.random.choice(np.arange(len(action_probs)), p = action_probs) 176 | 177 | 178 | #setting up the stuff 179 | 180 | states = [state] 181 | actions = [ action] 182 | rewards = [0.0] 183 | 184 | #stepping through epsiodes 185 | 186 | T = float('inf') 187 | 188 | for t in itertools.count(): 189 | 190 | if t= 0: 223 | 224 | # Build target 225 | target = 0 226 | for i in range(update_time + 1, min(T, update_time + n) + 1): 227 | target += np.power(gamma, i - update_time - 1) * rewards[i] 228 | if update_time + n < T: 229 | q_values_next = estimator.predict(states[update_time + n]) 230 | target += q_values_next[actions[update_time + n]] 231 | 232 | # Update step 233 | estimator.update(states[update_time], actions[update_time], target) 234 | 235 | if update_time == T - 1: 236 | break 237 | 238 | state = next_state 239 | action = next_action 240 | 241 | ret = np.sum(rewards) 242 | 243 | return t, ret 244 | 245 | 246 | 247 | def sarsa_lambda(lmbda, env, estimator, gamma=1.0, epsilon=0): 248 | 249 | 250 | 251 | # Reset the eligibility trace 252 | estimator.reset(z_only=True) 253 | 254 | # Create epsilon-greedy policy 255 | policy = make_epsilon_greedy_policy( 256 | estimator, epsilon, env.action_space.n) 257 | 258 | # Reset the environment and pick the first action 259 | state = env.reset() 260 | action_probs = policy(state) 261 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 262 | 263 | ret = 0 264 | # Step through episode 265 | for t in itertools.count(): 266 | # Take a step 267 | next_state, reward, done, _ = env.step(action) 268 | ret += reward 269 | 270 | if done: 271 | target = reward 272 | estimator.update(state, action, target) 273 | break 274 | 275 | else: 276 | # Take next step 277 | next_action_probs = policy(next_state) 278 | next_action = np.random.choice( 279 | np.arange(len(next_action_probs)), p=next_action_probs) 280 | 281 | # Estimate q-value at next state-action 282 | q_new = estimator.predict( 283 | next_state, next_action)[0] 284 | target = reward + gamma * q_new 285 | # Update step 286 | estimator.update(state, action, target) 287 | estimator.z *= gamma * lmbda 288 | 289 | state = next_state 290 | action = next_action 291 | 292 | return t, ret 293 | 294 | 295 | 296 | 
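For reference, the pairing of QEstimator.update (with trace=True, using the replacing-trace line) and the z *= gamma * lmbda decay inside sarsa_lambda above amounts to the standard SARSA(lambda) update with replacing traces over binary tile features:

$$ \delta_t = r_{t+1} + \gamma\,\hat q(s_{t+1},a_{t+1}) - \hat q(s_t,a_t), \qquad z_t(i) = \begin{cases} 1 & i \text{ active for } (s_t,a_t) \\ \gamma\lambda\, z_{t-1}(i) & \text{otherwise,} \end{cases} \qquad w \leftarrow w + \alpha\,\delta_t\, z_t . $$

Re-enabling the commented-out self.z[features] += 1 line switches this to the accumulating-trace variant.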
297 | 298 | 299 | # plotting stuff 300 | 301 | def plot_cost_to_go(env, estimator, num_partitions=50): 302 | 303 | 304 | x = np.linspace(env.observation_space.low[0], env.observation_space.high[0], num=num_partitions) 305 | y = np.linspace(env.observation_space.low[1], env.observation_space.high[1], num=num_partitions) 306 | X, Y = np.meshgrid(x, y) 307 | Z = np.apply_along_axis( 308 | lambda obs: -np.max(estimator.predict(obs)), 2, np.stack([X, Y], axis=2)) 309 | 310 | fig, ax = plt.subplots(figsize=(10, 5)) 311 | p = ax.pcolor(X, Y, Z, cmap=cm.RdBu, vmin=0, vmax=200) 312 | 313 | ax.set_xlabel('Position') 314 | ax.set_ylabel('Velocity') 315 | ax.set_title("\"Cost To Go\" Function") 316 | fig.colorbar(p) 317 | plt.show() 318 | 319 | 320 | 321 | def generate_greedy_policy_animation(env, estimator, save_dir): 322 | """ 323 | Follows (deterministic) greedy policy 324 | with respect to the given q-value estimator 325 | and saves animation using openAI gym's Monitor 326 | wrapper. Monitor will throw an error if monitor 327 | files already exist in save_dir so use unique 328 | save_dir for each call. 329 | """ 330 | 331 | if not os.path.exists(save_dir): 332 | os.makedirs(save_dir) 333 | 334 | try: 335 | env = gym.wrappers.Monitor( 336 | env, save_dir, video_callable=lambda episode_id: True) 337 | except gym.error.Error as e: 338 | print(e.what()) 339 | 340 | # Set epsilon to zero to follow greedy policy 341 | policy = make_epsilon_greedy_policy( 342 | estimator=estimator, epsilon=0, num_actions=env.action_space.n) 343 | # Reset the environment 344 | state = env.reset() 345 | for t in itertools.count(): 346 | time.sleep(0.01) # Slow down animation 347 | action_probs = policy(state) # Compute action-values 348 | [action] = np.nonzero(action_probs)[0] # Greedy action 349 | state, _, done, _ = env.step(action) # Take step 350 | env.render() # Animate 351 | if done: 352 | print('Solved in {} steps'.format(t)) 353 | break 354 | 355 | 356 | 357 | def display_animation(filepath): 358 | """ Displays mp4 animation in Jupyter.""" 359 | 360 | video = io.open(filepath, 'r+b').read() 361 | encoded = base64.b64encode(video) 362 | return HTML(data=''''''.format(encoded.decode('ascii'))) 365 | 366 | 367 | 368 | def plot_learning_curves(stats, smoothing_window=10): 369 | """ 370 | Plots the number of steps taken by the agent 371 | to solve the task as a function of episode number, 372 | smoothed over the last smoothing_window episodes. 373 | """ 374 | 375 | plt.figure(figsize=(10,5)) 376 | for algo_stats in stats: 377 | steps_per_episode = pd.Series(algo_stats.steps).rolling( 378 | smoothing_window).mean() # smooth 379 | plt.plot(steps_per_episode, label=algo_stats.algorithm) 380 | plt.xlabel("Episode") 381 | plt.ylabel("Steps") 382 | plt.title("Steps per Episode") 383 | plt.legend() 384 | plt.show() 385 | 386 | 387 | 388 | def plot_grid_search(stats, truncate_steps=400): 389 | """ 390 | Plots average number of steps taken by the agent 391 | to solve the task for each combination of 392 | step size and boostrapping parameter 393 | (n or lambda). 394 | """ 395 | # Truncate high step values for clearer plotting 396 | stats.steps[stats.steps > truncate_steps] = truncate_steps 397 | 398 | # We use -1 step values indicate corresponding combination of 399 | # parameters doesn't converge. Set these to truncate_steps for plotting. 
400 | stats.steps[stats.steps == -1] = truncate_steps 401 | 402 | plt.figure() 403 | for b_idx in range(len(stats.bootstrappings)): 404 | plt.plot(stats.step_sizes, stats.steps[b_idx, :], 405 | label='Bootstrapping: {}'.format(stats.bootstrappings[b_idx])) 406 | plt.xlabel('Step size (alpha * number of tilings)') 407 | plt.ylabel('Average steps per episode') 408 | plt.title('Grid Search {}'.format(stats.algorithm)) 409 | plt.ylim(140, truncate_steps - 100) 410 | plt.legend() 411 | 412 | 413 | 414 | 415 | 416 | RunStats = namedtuple('RunStats', ['algorithm', 'steps', 'returns']) 417 | 418 | 419 | def run(algorithm, num_episodes=500, **algorithm_kwargs): 420 | 421 | """ 422 | Runs algorithm over multilple episodes and logs 423 | for each episode the complete return (G_t) and the 424 | number of steps taken. 425 | """ 426 | 427 | stats = RunStats( 428 | algorithm=algorithm, 429 | steps=np.zeros(num_episodes), 430 | returns=np.zeros(num_episodes)) 431 | 432 | algorithm_fn = globals()[algorithm] 433 | 434 | for i in range(num_episodes): 435 | episode_steps, episode_return = algorithm_fn(**algorithm_kwargs) 436 | stats.steps[i] = episode_steps 437 | stats.returns[i] = episode_return 438 | sys.stdout.flush() 439 | print("\rEpisode {}/{} Return {}".format( 440 | i + 1, num_episodes, episode_return), end="") 441 | return stats 442 | 443 | 444 | 445 | 446 | 447 | step_size = 0.5 # Fraction of the way we want to move towards target 448 | n = 4 # Level of bootstrapping (set to intermediate value) 449 | num_episodes = 500 450 | 451 | estimator_n = QEstimator(step_size=step_size) 452 | 453 | start_time = timeit.default_timer() 454 | run_stats_n = run('sarsa_n', num_episodes, n=n, env=env, estimator=estimator_n) 455 | elapsed_time = timeit.default_timer() - start_time 456 | 457 | plot_cost_to_go(env, estimator_n) 458 | print('{} episodes completed in {:.2f}s'.format(num_episodes, elapsed_time)) 459 | 460 | 461 | 462 | 463 | # Animate learned policy 464 | save_dir='./animations/n-step_sarsa/' 465 | generate_greedy_policy_animation(env, estimator_n, save_dir=save_dir) 466 | [filepath] = glob.glob(os.path.join(save_dir, '*.mp4')) 467 | display_animation(filepath) 468 | 469 | 470 | step_size = 0.5 # Fraction of the way we want to move towards target 471 | lmbda = 0.92 # Level of bootstrapping (set to intermediate value) 472 | num_episodes = 500 473 | 474 | estimator_lambda = QEstimator(step_size=step_size, trace=True) 475 | 476 | start_time = timeit.default_timer() 477 | run_stats_lambda = run('sarsa_lambda', num_episodes, lmbda=lmbda, env=env, estimator=estimator_lambda) 478 | elapsed_time = timeit.default_timer() - start_time 479 | 480 | plot_cost_to_go(env, estimator_lambda) 481 | print('{} episodes completed in {:.2f}s'.format(num_episodes, elapsed_time)) 482 | 483 | # Animate learned policy 484 | save_dir='./animations/sarsa_lambda/' 485 | generate_greedy_policy_animation(env, estimator_lambda, save_dir=save_dir) 486 | [filepath] = glob.glob(os.path.join(save_dir, '*.mp4')) 487 | display_animation(filepath) 488 | 489 | 490 | plot_learning_curves([run_stats_n, run_stats_lambda]) 491 | 492 | 493 | 494 | # comparing 495 | 496 | GridSearchStats = namedtuple('GridSearchStats', ['algorithm', 'steps', 'step_sizes', 'bootstrappings']) 497 | 498 | 499 | def run_grid_search(algorithm, step_sizes, bootstrappings, episodes=100, num_runs=5, 500 | **algorithm_kwargs): 501 | 502 | 503 | 504 | stats = GridSearchStats( 505 | algorithm=algorithm, 506 | steps=np.zeros((len(bootstrappings), len(step_sizes))), 507 | 
step_sizes=step_sizes, 508 | bootstrappings=bootstrappings) 509 | 510 | algorithm_fn = globals()[algorithm] 511 | trace = True if algorithm == 'sarsa_lambda' else False 512 | 513 | for run_idx in range(num_runs): 514 | for b_idx, bootstrapping in enumerate(bootstrappings): 515 | for s_idx, step_size in enumerate(step_sizes): 516 | if algorithm == 'sarsa_n': 517 | if (bootstrapping == 8 and step_size > 1) or \ 518 | (bootstrapping == 16 and step_size > 0.75): 519 | # sarsa_n doesn't converge in these cases so 520 | # assign a default value and skip over. 521 | stats.steps[b_idx, s_idx] = -1 * num_runs * episodes 522 | continue 523 | estimator = QEstimator(step_size=step_size, trace=trace) 524 | for episode in range(episodes): 525 | sys.stdout.flush() 526 | print('\r run: {}, step_size: {}, bootstrapping: {}, episode: {}'.format( 527 | run_idx, step_size, bootstrapping, episode), end="") 528 | episode_steps, _ = algorithm_fn( 529 | bootstrapping, estimator=estimator, **algorithm_kwargs) 530 | stats.steps[b_idx, s_idx] += episode_steps 531 | 532 | 533 | # Average over independent runs and episodes 534 | stats.steps[:] /= (num_runs * episodes) 535 | 536 | return stats 537 | 538 | 539 | step_sizes = np.arange(0.1, 1.8, 0.1) 540 | ns = np.power(2, np.arange(0, 5)) 541 | grid_search_stats_n = run_grid_search('sarsa_n', step_sizes, ns, env=env) 542 | plot_grid_search(grid_search_stats_n) 543 | 544 | step_sizes = np.arange(0.1, 1.8, 0.1) 545 | lambdas = np.array([0, 0.68, 0.84, 0.92, 0.98, 0.99]) 546 | grid_search_stats_lambda = run_grid_search('sarsa_lambda', step_sizes, lambdas, env=env) 547 | plot_grid_search(grid_search_stats_lambda) 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | -------------------------------------------------------------------------------- /Temporal-Difference/TD_Udacity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Aug 18 12:26:04 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import sys 10 | import gym 11 | import numpy as np 12 | import random 13 | import math 14 | from collections import defaultdict,deque 15 | import matplotlib.pyplot as plt 16 | 17 | import check_test 18 | from plot_utils import plot_values 19 | 20 | #creating the cliffenvironment 21 | 22 | env = gym.make('CliffWalking-v0') 23 | 24 | #print env-action-space 25 | print(env.action_space) 26 | print(env.observation_space) 27 | 28 | V_opt = np.zeros((4,12)) 29 | V_opt[0][0:13] = -np.arange(3, 15)[::-1] 30 | V_opt[1][0:13] = -np.arange(3, 15)[::-1] + 1 31 | V_opt[2][0:13] = -np.arange(3, 15)[::-1] + 2 32 | V_opt[3][0] = -13 33 | 34 | plot_values(V_opt) 35 | 36 | #TD control SarSA 37 | 38 | def update_Q_sarsa(alpha,gamma,Q,state,action,reward,next_state= None, next_action = None): 39 | 40 | current = Q[state][action] 41 | 42 | Qsa_next = Q[next_state][next_action]if next_state is not None else 0 43 | 44 | target = reward +(gamma * Qsa_next) 45 | 46 | new_value = current +(alpha*(target - current)) # getting the updated value 47 | 48 | return new_value 49 | 50 | #epsilon greedy 51 | 52 | def epsilon_greedy(Q,state,nA,eps): 53 | 54 | if random.random() >eps: 55 | 56 | return np.argmax(Q[state]) 57 | else: 58 | return random.choice(np.arange(env.action_space.n)) 59 | 60 | 61 | 62 | 63 | #sarsa algo 64 | 65 | def sarsa(env, num_episodes, alpha, gamma=1.0, plot_every=100): 66 | nA = env.action_space.n # number of actions 67 | Q = defaultdict(lambda: np.zeros(nA)) # initialize empty 
dictionary of arrays 68 | 69 | # monitor performance 70 | tmp_scores = deque(maxlen=plot_every) # deque for keeping track of scores 71 | avg_scores = deque(maxlen=num_episodes) # average scores over every plot_every episodes 72 | 73 | for i_episode in range(1, num_episodes+1): 74 | # monitor progress 75 | if i_episode % 100 == 0: 76 | print("\rEpisode {}/{}".format(i_episode, num_episodes), end="") 77 | sys.stdout.flush() 78 | score = 0 # initialize score 79 | state = env.reset() # start episode 80 | 81 | eps = 1.0 / i_episode # set value of epsilon 82 | action = epsilon_greedy(Q, state, nA, eps) # epsilon-greedy action selection 83 | 84 | while True: 85 | next_state, reward, done, info = env.step(action) # take action A, observe R, S' 86 | score += reward # add reward to agent's score 87 | if not done: 88 | next_action = epsilon_greedy(Q, next_state, nA, eps) # epsilon-greedy action 89 | Q[state][action] = update_Q_sarsa(alpha, gamma, Q, \ 90 | state, action, reward, next_state, next_action) 91 | 92 | state = next_state # S <- S' 93 | action = next_action # A <- A' 94 | if done: 95 | Q[state][action] = update_Q_sarsa(alpha, gamma, Q, \ 96 | state, action, reward) 97 | tmp_scores.append(score) # append score 98 | break 99 | if (i_episode % plot_every == 0): 100 | avg_scores.append(np.mean(tmp_scores)) 101 | 102 | # plot performance 103 | plt.plot(np.linspace(0,num_episodes,len(avg_scores),endpoint=False), np.asarray(avg_scores)) 104 | plt.xlabel('Episode Number') 105 | plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every) 106 | plt.show() 107 | # print best 100-episode performance 108 | print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(avg_scores)) 109 | return Q 110 | 111 | 112 | 113 | 114 | 115 | Q_sarsa = sarsa(env, 5000, .01) 116 | 117 | # print the estimated optimal policy 118 | policy_sarsa = np.array([np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1 for key in np.arange(48)]).reshape(4,12) 119 | check_test.run_check('td_control_check', policy_sarsa) 120 | print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):") 121 | print(policy_sarsa) 122 | 123 | # plot the estimated optimal state-value function 124 | V_sarsa = ([np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)]) 125 | plot_values(V_sarsa) 126 | 127 | 128 | 129 | # Q learning 130 | 131 | def update_Q_sarsamax(alpha,gamma,Q,state,action,reward,next_state =None): 132 | 133 | current = Q[state][action] 134 | Qsa_next = np.max(Q[next_state]) if next_state is not None else 0 135 | #constructing TD target 136 | target = reward +(gamma * Qsa_next) 137 | 138 | new_value = current +(alpha *(target - current)) 139 | 140 | return new_value 141 | 142 | 143 | 144 | def q_learning(env,num_episodes,alpha,gamma = 1.0,plot_every = 100): 145 | 146 | nA = env.action_space.n 147 | Q = defaultdict(lambda : np.zeros(nA)) 148 | 149 | tmp_scores = deque(maxlen=plot_every) # deque for keeping track of scores 150 | avg_scores = deque(maxlen=num_episodes) # average scores over every plot_every episodes 151 | 152 | for i_episode in range(1, num_episodes+1): 153 | # monitor progress 154 | if i_episode % 100 == 0: 155 | print("\rEpisode {}/{}".format(i_episode, num_episodes), end="") 156 | sys.stdout.flush() 157 | score = 0 # initialize score 158 | state = env.reset() # start episode 159 | eps = 1.0 / i_episode # set value of epsilon 160 | 161 | 162 | while True: 163 | action = epsilon_greedy(Q, state, nA, eps) 164 | next_state, reward,done , info = env.step(action) 165 | 166 
| score += reward 167 | Q[state][action] = update_Q_sarsamax(alpha, gamma, Q, state, action, reward,next_state) 168 | 169 | state= next_state 170 | 171 | 172 | if done: 173 | 174 | tmp_scores.append(score) # append score 175 | break 176 | 177 | if (i_episode % plot_every == 0): 178 | avg_scores.append(np.mean(tmp_scores)) 179 | 180 | 181 | #plot performances 182 | plt.plot(np.linspace(0,num_episodes,len(avg_scores),endpoint=False), np.asarray(avg_scores)) 183 | plt.xlabel('Episode Number') 184 | plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every) 185 | plt.show() 186 | # print best 100-episode performance 187 | print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(avg_scores)) 188 | return Q 189 | 190 | # obtain the estimated optimal policy and corresponding action-value function 191 | Q_sarsamax = q_learning(env, 5000, .01) 192 | 193 | # print the estimated optimal policy 194 | policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1 for key in np.arange(48)]).reshape((4,12)) 195 | check_test.run_check('td_control_check', policy_sarsamax) 196 | print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):") 197 | print(policy_sarsamax) 198 | 199 | # plot the estimated optimal state-value function 200 | plot_values([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0 for key in np.arange(48)]) 201 | 202 | 203 | #CONTROL EXPECTED SARSA 204 | 205 | def update_Q_expsarsa(alpha,gamma,nA,eps,Q,state,action,reward,next_state = None): 206 | 207 | current = Q[state][action] 208 | policy_s =np.ones(nA) *eps/nA # current policy 209 | policy_s[np.argmax(Q[next_state])] = 1-eps +(eps/nA) #greedy action 210 | Qsa_next = np.dot(Q[next_state()],policy_s) 211 | target = reward +(gamma* Qsa_next) 212 | new_value = current +(alpha*(target- current)) 213 | 214 | return new_value 215 | 216 | 217 | def expected_sarsa(env, num_episodes, alpha, gamma=1.0, plot_every=100): 218 | 219 | nA = env.action_space.n # number of actions 220 | Q = defaultdict(lambda: np.zeros(nA)) # initialize empty dictionary of arrays 221 | 222 | # monitor performance 223 | tmp_scores = deque(maxlen=plot_every) # deque for keeping track of scores 224 | avg_scores = deque(maxlen=num_episodes) # average scores over every plot_every episodes 225 | 226 | for i_episode in range(1, num_episodes+1): 227 | # monitor progress 228 | if i_episode % 100 == 0: 229 | print("\rEpisode {}/{}".format(i_episode, num_episodes), end="") 230 | sys.stdout.flush() 231 | 232 | score = 0 # initialize score 233 | state = env.reset() # start episode 234 | eps = 0.005 # set value of epsilon 235 | 236 | while True: 237 | action = epsilon_greedy(Q, state, nA, eps) # epsilon-greedy action selection 238 | next_state, reward, done, info = env.step(action) # take action A, observe R, S' 239 | score += reward # add reward to agent's score 240 | # update Q 241 | Q[state][action] = update_Q_expsarsa(alpha, gamma, nA, eps, Q, \ 242 | state, action, reward, next_state) 243 | state = next_state # S <- S' 244 | if done: 245 | tmp_scores.append(score) # append score 246 | break 247 | if (i_episode % plot_every == 0): 248 | avg_scores.append(np.mean(tmp_scores)) 249 | 250 | # plot performance 251 | plt.plot(np.linspace(0,num_episodes,len(avg_scores),endpoint=False), np.asarray(avg_scores)) 252 | plt.xlabel('Episode Number') 253 | plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every) 254 | plt.show() 255 | # print best 100-episode performance 256 | print(('Best Average Reward over %d 
Episodes: ' % plot_every), np.max(avg_scores)) 257 | return Q 258 | 259 | 260 | Q_expsarsa = expected_sarsa(env, 5000, 1) 261 | 262 | # print the estimated optimal policy 263 | policy_expsarsa = np.array([np.argmax(Q_expsarsa[key]) if key in Q_expsarsa else -1 for key in np.arange(48)]).reshape(4,12) 264 | check_test.run_check('td_control_check', policy_expsarsa) 265 | print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):") 266 | print(policy_expsarsa) 267 | 268 | # plot the estimated optimal state-value function 269 | plot_values([np.max(Q_expsarsa[key]) if key in Q_expsarsa else 0 for key in np.arange(48)]) 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | -------------------------------------------------------------------------------- /Temporal-Difference/__pycache__/check_test.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Temporal-Difference/__pycache__/check_test.cpython-38.pyc -------------------------------------------------------------------------------- /Temporal-Difference/__pycache__/plot_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Temporal-Difference/__pycache__/plot_utils.cpython-38.pyc -------------------------------------------------------------------------------- /Temporal-Difference/check_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Aug 18 12:31:14 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import unittest 11 | from IPython.display import Markdown, display 12 | import numpy as np 13 | 14 | def printmd(string): 15 | display(Markdown(string)) 16 | 17 | V_opt = np.zeros((4,12)) 18 | V_opt[0:13][0] = -np.arange(3, 15)[::-1] 19 | V_opt[0:13][1] = -np.arange(3, 15)[::-1] + 1 20 | V_opt[0:13][2] = -np.arange(3, 15)[::-1] + 2 21 | V_opt[3][0] = -13 22 | 23 | pol_opt = np.hstack((np.ones(11), 2, 0)) 24 | 25 | V_true = np.zeros((4,12)) 26 | for i in range(3): 27 | V_true[0:13][i] = -np.arange(3, 15)[::-1] - i 28 | V_true[1][11] = -2 29 | V_true[2][11] = -1 30 | V_true[3][0] = -17 31 | 32 | def get_long_path(V): 33 | return np.array(np.hstack((V[0:13][0], V[1][0], V[1][11], V[2][0], V[2][11], V[3][0], V[3][11]))) 34 | 35 | def get_optimal_path(policy): 36 | return np.array(np.hstack((policy[2][:], policy[3][0]))) 37 | 38 | class Tests(unittest.TestCase): 39 | 40 | def td_prediction_check(self, V): 41 | to_check = get_long_path(V) 42 | soln = get_long_path(V_true) 43 | np.testing.assert_array_almost_equal(soln, to_check) 44 | 45 | def td_control_check(self, policy): 46 | to_check = get_optimal_path(policy) 47 | np.testing.assert_equal(pol_opt, to_check) 48 | 49 | check = Tests() 50 | 51 | def run_check(check_name, func): 52 | try: 53 | getattr(check, check_name)(func) 54 | except check.failureException as e: 55 | printmd('**PLEASE TRY AGAIN**') 56 | return 57 | printmd('**PASSED**') -------------------------------------------------------------------------------- /Temporal-Difference/plot_graph.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Temporal-Difference/plot_graph.png -------------------------------------------------------------------------------- /Temporal-Difference/plot_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Aug 18 12:27:03 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import seaborn as sns 12 | sns.set_style("white") 13 | 14 | def plot_values(V): 15 | # reshape the state-value function 16 | V = np.reshape(V, (4,12)) 17 | # plot the state-value function 18 | fig = plt.figure(figsize=(15,5)) 19 | ax = fig.add_subplot(111) 20 | im = ax.imshow(V, cmap='cool') 21 | for (j,i),label in np.ndenumerate(V): 22 | ax.text(i, j, np.round(label,3), ha='center', va='center', fontsize=14) 23 | plt.tick_params(bottom='off', left='off', labelbottom='off', labelleft='off') 24 | plt.title('State-Value Function') 25 | plt.show() -------------------------------------------------------------------------------- /Tile-coding /Tile_coding_Uda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Aug 21 08:22:58 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | #importing stuff 10 | 11 | import sys 12 | import gym 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | 17 | 18 | #plotting stuff 19 | 20 | plt.style.use('ggplot') 21 | np.set_printoptions(precision=3, linewidth=120) 22 | 23 | #creating the environemetn 24 | env = gym.make('Acrobot-v1') 25 | env.seed(505) 26 | 27 | #Exploratorry state 28 | print(" ") 29 | print("State space:", env.observation_space) 30 | print("- low:", env.observation_space.low) 31 | print("- high:", env.observation_space.high) 32 | print(" ") 33 | 34 | # Explore action space 35 | 36 | print("Action space:", env.action_space) 37 | print(" ") 38 | 39 | 40 | #creatinf a random agent 41 | state = env.reset() 42 | score = 0 43 | 44 | for t in range(200): 45 | action = env.action_space.sample() 46 | #env.render() 47 | state,reward , done ,info = env.step(action) 48 | score += reward 49 | 50 | if done: 51 | break 52 | print("Final Score:",score) 53 | env.close() 54 | 55 | #Tiling 56 | 57 | def create_tiling_grid(low,high,bins = (10,10),offsets = (0.0,0.0)): 58 | 59 | grid = [np.linspace(low[dim], high[dim],bins[dim]+1)[1:-1] +offsets[dim] for dim in range(len(bins))] 60 | 61 | print(" ") 62 | print("Tiling: [, ] / + () => ") 63 | print(" ") 64 | for l, h, b, o, splits in zip(low, high, bins, offsets, grid): 65 | print(" [{}, {}] / {} + ({}) => {}".format(l, h, b, o, splits)) 66 | return grid 67 | 68 | 69 | #testing the tiling 70 | low = [-1.0, -5.0] 71 | high = [1.0, 5.0] 72 | create_tiling_grid(low, high, bins=(10, 10), offsets=(-0.1, 0.5)) 73 | 74 | def create_tilings(low,high,tiling_specs): 75 | 76 | return [create_tiling_grid(low, high, bins,offsets) for bins,offsets in tiling_specs] 77 | 78 | # Tiling specs: [(, ), ...] 
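# Worked example for the specs below (computed from create_tiling_grid above): with
# low = [-1.0, -5.0], high = [1.0, 5.0] and bins = (10, 10), each spec is a 10x10 tiling,
# and the first offset (-0.066, -0.33) shifts the 9 interior split points per dimension to
#   x: [-0.866, -0.666, -0.466, -0.266, -0.066, 0.134, 0.334, 0.534, 0.734]
#   y: [-4.33, -3.33, -2.33, -1.33, -0.33, 0.67, 1.67, 2.67, 3.67]
# The second spec is the unshifted grid and the third shifts by (+0.066, +0.33).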
79 | tiling_specs = [((10, 10), (-0.066, -0.33)), 80 | ((10, 10), (0.0, 0.0)), 81 | ((10, 10), (0.066, 0.33))] 82 | tilings = create_tilings(low, high, tiling_specs) 83 | 84 | from matplotlib.lines import Line2D 85 | 86 | def visualize_tilings(tilings): 87 | """Plot each tiling as a grid.""" 88 | prop_cycle = plt.rcParams['axes.prop_cycle'] 89 | colors = prop_cycle.by_key()['color'] 90 | linestyles = ['-', '--', ':'] 91 | legend_lines = [] 92 | 93 | fig, ax = plt.subplots(figsize=(10, 10)) 94 | for i, grid in enumerate(tilings): 95 | for x in grid[0]: 96 | l = ax.axvline(x=x, color=colors[i % len(colors)], linestyle=linestyles[i % len(linestyles)], label=i) 97 | for y in grid[1]: 98 | l = ax.axhline(y=y, color=colors[i % len(colors)], linestyle=linestyles[i % len(linestyles)]) 99 | legend_lines.append(l) 100 | ax.grid('off') 101 | ax.legend(legend_lines, ["Tiling #{}".format(t) for t in range(len(legend_lines))], facecolor='white', framealpha=0.9) 102 | ax.set_title("Tilings") 103 | return ax # return Axis object to draw on later, if needed 104 | 105 | 106 | visualize_tilings(tilings); 107 | 108 | #Discretize 109 | 110 | def discretize(sample,grid): 111 | return tuple(int(np.digitize(s, g)) for s,g in zip(sample,grid)) # applying along each dimension 112 | 113 | def tile_encode(sample,tilings,flatten =False): 114 | 115 | #encoded sample 116 | encoded_sample = [discretize(sample, grid) for grid in tilings] 117 | return np.concatenate(encoded_sample) if flatten else encoded_sample 118 | 119 | # Test with some sample values 120 | samples = [(-1.2 , -5.1 ), 121 | (-0.75, 3.25), 122 | (-0.5 , 0.0 ), 123 | ( 0.25, -1.9 ), 124 | ( 0.15, -1.75), 125 | ( 0.75, 2.5 ), 126 | ( 0.7 , -3.7 ), 127 | ( 1.0 , 5.0 )] 128 | encoded_samples = [tile_encode(sample, tilings) for sample in samples] 129 | print(" ") 130 | print("\nSamples:", repr(samples), sep="\n") 131 | print("\nEncoded samples:", repr(encoded_samples), sep="\n") 132 | print(" ") 133 | 134 | 135 | # Visulalizing the tiling 136 | 137 | from matplotlib.patches import Rectangle 138 | 139 | def visualize_encoded_samples(samples, encoded_samples, tilings, low=None, high=None): 140 | """Visualize samples by activating the respective tiles.""" 141 | samples = np.array(samples) # for ease of indexing 142 | 143 | # Show tiling grids 144 | ax = visualize_tilings(tilings) 145 | 146 | # If bounds (low, high) are specified, use them to set axis limits 147 | if low is not None and high is not None: 148 | ax.set_xlim(low[0], high[0]) 149 | ax.set_ylim(low[1], high[1]) 150 | else: 151 | # Pre-render (invisible) samples to automatically set reasonable axis limits, and use them as (low, high) 152 | ax.plot(samples[:, 0], samples[:, 1], 'o', alpha=0.0) 153 | low = [ax.get_xlim()[0], ax.get_ylim()[0]] 154 | high = [ax.get_xlim()[1], ax.get_ylim()[1]] 155 | 156 | # Map each encoded sample (which is really a list of indices) to the corresponding tiles it belongs to 157 | tilings_extended = [np.hstack((np.array([low]).T, grid, np.array([high]).T)) for grid in tilings] # add low and high ends 158 | tile_centers = [(grid_extended[:, 1:] + grid_extended[:, :-1]) / 2 for grid_extended in tilings_extended] # compute center of each tile 159 | tile_toplefts = [grid_extended[:, :-1] for grid_extended in tilings_extended] # compute topleft of each tile 160 | tile_bottomrights = [grid_extended[:, 1:] for grid_extended in tilings_extended] # compute bottomright of each tile 161 | 162 | prop_cycle = plt.rcParams['axes.prop_cycle'] 163 | colors = prop_cycle.by_key()['color'] 164 
| for sample, encoded_sample in zip(samples, encoded_samples): 165 | for i, tile in enumerate(encoded_sample): 166 | # Shade the entire tile with a rectangle 167 | topleft = tile_toplefts[i][0][tile[0]], tile_toplefts[i][1][tile[1]] 168 | bottomright = tile_bottomrights[i][0][tile[0]], tile_bottomrights[i][1][tile[1]] 169 | ax.add_patch(Rectangle(topleft, bottomright[0] - topleft[0], bottomright[1] - topleft[1], 170 | color=colors[i], alpha=0.33)) 171 | 172 | # In case sample is outside tile bounds, it may not have been highlighted properly 173 | if any(sample < topleft) or any(sample > bottomright): 174 | # So plot a point in the center of the tile and draw a connecting line 175 | cx, cy = tile_centers[i][0][tile[0]], tile_centers[i][1][tile[1]] 176 | ax.add_line(Line2D([sample[0], cx], [sample[1], cy], color=colors[i])) 177 | ax.plot(cx, cy, 's', color=colors[i]) 178 | 179 | # Finally, plot original samples 180 | ax.plot(samples[:, 0], samples[:, 1], 'o', color='r') 181 | 182 | ax.margins(x=0, y=0) # remove unnecessary margins 183 | ax.set_title("Tile-encoded samples") 184 | return ax 185 | 186 | visualize_encoded_samples(samples, encoded_samples, tilings); 187 | 188 | 189 | # Now a Q-table with tile coding 190 | 191 | class QTable: 192 | 193 | def __init__(self,state_size, action_size): 194 | self.state_size = state_size 195 | self.action_size = action_size 196 | 197 | self.q_table = np.zeros(shape= (self.state_size + (self.action_size,))) 198 | print(" ") 199 | print(" Q Table size = ", self.q_table.shape) 200 | 201 | 202 | # Now with the tile coding part 203 | 204 | class TiledQTable: 205 | 206 | 207 | def __init__(self, low, high, tiling_specs, action_size): 208 | 209 | self.tilings = create_tilings(low, high, tiling_specs) 210 | self.state_sizes = [tuple(len(splits)+1 for splits in tiling_grid) for tiling_grid in self.tilings] 211 | self.action_size = action_size 212 | self.q_tables = [QTable(state_size, self.action_size) for state_size in self.state_sizes] 213 | print("TiledQTable(): no. 
of internal tables = ", len(self.q_tables)) 214 | 215 | def get(self, state, action): 216 | 217 | 218 | encoded_state = tile_encode(state, self.tilings) 219 | 220 | 221 | value = 0.0 222 | for idx, q_table in zip(encoded_state, self.q_tables): 223 | value += q_table.q_table[tuple(idx + (action,))] 224 | value /= len(self.q_tables) 225 | return value 226 | 227 | def update(self, state, action, value, alpha=0.1): 228 | 229 | 230 | encoded_state = tile_encode(state, self.tilings) 231 | 232 | 233 | for idx, q_table in zip(encoded_state, self.q_tables): 234 | value_ = q_table.q_table[tuple(idx + (action,))] # current value 235 | q_table.q_table[tuple(idx + (action,))] = alpha * value + (1.0 - alpha) * value_ 236 | 237 | 238 | 239 | 240 | 241 | # Test with a sample Q-table 242 | tq = TiledQTable(low, high, tiling_specs, 2) 243 | s1 = 3; s2 = 4; a = 0; q = 1.0 244 | print("[GET] Q({}, {}) = {}".format(samples[s1], a, tq.get(samples[s1], a))) # check value at sample = s1, action = a 245 | print("[UPDATE] Q({}, {}) = {}".format(samples[s2], a, q)); tq.update(samples[s2], a, q) # update value for sample with some common tile(s) 246 | print("[GET] Q({}, {}) = {}".format(samples[s1], a, tq.get(samples[s1], a))) 247 | print(" ") 248 | 249 | 250 | class QLearningAgent: 251 | 252 | 253 | def __init__(self, env, tq, alpha=0.02, gamma=0.99, 254 | epsilon=1.0, epsilon_decay_rate=0.9995, min_epsilon=.01, seed=0): 255 | """Initialize variables, create grid for discretization.""" 256 | # Environment info 257 | self.env = env 258 | self.tq = tq 259 | self.state_sizes = tq.state_sizes # list of state sizes for each tiling 260 | self.action_size = self.env.action_space.n # 1-dimensional discrete action space 261 | self.seed = np.random.seed(seed) 262 | print("Environment:", self.env) 263 | print("State space sizes:", self.state_sizes) 264 | print("Action space size:", self.action_size) 265 | 266 | # Learning parameters 267 | self.alpha = alpha # learning rate 268 | self.gamma = gamma # discount factor 269 | self.epsilon = self.initial_epsilon = epsilon # initial exploration rate 270 | self.epsilon_decay_rate = epsilon_decay_rate # how quickly should we decrease epsilon 271 | self.min_epsilon = min_epsilon 272 | 273 | def reset_episode(self, state): 274 | """Reset variables for a new episode.""" 275 | # Gradually decrease exploration rate 276 | self.epsilon *= self.epsilon_decay_rate 277 | self.epsilon = max(self.epsilon, self.min_epsilon) 278 | 279 | self.last_state = state 280 | Q_s = [self.tq.get(state, action) for action in range(self.action_size)] 281 | self.last_action = np.argmax(Q_s) 282 | return self.last_action 283 | 284 | def reset_exploration(self, epsilon=None): 285 | """Reset exploration rate used when training.""" 286 | self.epsilon = epsilon if epsilon is not None else self.initial_epsilon 287 | 288 | def act(self, state, reward=None, done=None, mode='train'): 289 | """Pick next action and update internal Q table (when mode != 'test').""" 290 | Q_s = [self.tq.get(state, action) for action in range(self.action_size)] 291 | # Pick the best action from Q table 292 | greedy_action = np.argmax(Q_s) 293 | if mode == 'test': 294 | # Test mode: Simply produce an action 295 | action = greedy_action 296 | else: 297 | # Train mode (default): Update Q table, pick next action 298 | # Note: We update the Q table entry for the *last* (state, action) pair with current state, reward 299 | value = reward + self.gamma * max(Q_s) 300 | self.tq.update(self.last_state, self.last_action, value, self.alpha) 301 | 302 | # 
Exploration vs. exploitation 303 | do_exploration = np.random.uniform(0, 1) < self.epsilon 304 | if do_exploration: 305 | # Pick a random action 306 | action = np.random.randint(0, self.action_size) 307 | else: 308 | # Pick the greedy action 309 | action = greedy_action 310 | 311 | # Roll over current state, action for next step 312 | self.last_state = state 313 | self.last_action = action 314 | return action 315 | 316 | 317 | n_bins = 5 318 | bins = tuple([n_bins]*env.observation_space.shape[0]) 319 | offset_pos = (env.observation_space.high - env.observation_space.low)/(3*n_bins) 320 | 321 | tiling_specs = [(bins, -offset_pos), 322 | (bins, tuple([0.0]*env.observation_space.shape[0])), 323 | (bins, offset_pos)] 324 | 325 | tq = TiledQTable(env.observation_space.low, 326 | env.observation_space.high, 327 | tiling_specs, 328 | env.action_space.n) 329 | agent = QLearningAgent(env, tq) 330 | 331 | def run(agent, env, num_episodes=10000, mode='train'): 332 | """Run agent in given reinforcement learning environment and return scores.""" 333 | scores = [] 334 | max_avg_score = -np.inf 335 | for i_episode in range(1, num_episodes+1): 336 | # Initialize episode 337 | state = env.reset() 338 | action = agent.reset_episode(state) 339 | total_reward = 0 340 | done = False 341 | 342 | # Roll out steps until done 343 | while not done: 344 | state, reward, done, info = env.step(action) 345 | total_reward += reward 346 | action = agent.act(state, reward, done, mode) 347 | 348 | 349 | 350 | # Save final score 351 | scores.append(total_reward) 352 | 353 | # Print episode stats 354 | if mode == 'train': 355 | if len(scores) > 100: 356 | avg_score = np.mean(scores[-100:]) 357 | if avg_score > max_avg_score: 358 | max_avg_score = avg_score 359 | if i_episode % 100 == 0: 360 | print("\rEpisode {}/{} | Max Average Score: {}".format(i_episode, num_episodes, max_avg_score), end="") 361 | sys.stdout.flush() 362 | 363 | 364 | return scores 365 | 366 | scores = run(agent, env) 367 | 368 | 369 | def plot_scores(scores, rolling_window=100): 370 | """Plot scores and optional rolling mean using specified window.""" 371 | plt.plot(scores); plt.title("Scores"); 372 | rolling_mean = pd.Series(scores).rolling(rolling_window).mean() 373 | plt.plot(rolling_mean); 374 | return rolling_mean 375 | 376 | rolling_mean = plot_scores(scores) 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | -------------------------------------------------------------------------------- /Upper-Confidence-Bound /Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/Upper-Confidence-Bound /Figure_1.png -------------------------------------------------------------------------------- /Upper-Confidence-Bound /UCB.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 24 22:25:26 2020 5 | 6 | @author: pavankunchala 7 | """ 8 | 9 | 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import pandas as pd 13 | 14 | class ucb_bandit: 15 | 16 | 17 | def __init__(self, k ,c, iters, mu = 'random'): 18 | # number of arms 19 | 20 | self.k = k 21 | # the condindencre bound(exploration parameter) 22 | self.c = c 23 | 24 | # number 
of iters 25 | self.iters = iters 26 | 27 | #no of times steps 28 | self.n = 1 29 | #step count for each arm 30 | self.k_n =np.ones(k) 31 | #mean reward 32 | self.mean_reward = 0 33 | self.reward = np.zeros(iters) 34 | 35 | #reward for each arm 36 | self.k_reward=np.zeros(k) 37 | 38 | if type(mu) == list or type(mu).__module__ == np.__name__: 39 | #user defined average 40 | self.mu = np.array(mu) 41 | elif mu == 'random': 42 | #draws random from the prob distrubtytion 43 | self.mu = np.random.normal(0,1,k) 44 | elif mu == 'sequence': 45 | self.mu = np.linspace(0, k-1, k) 46 | 47 | def pull(self): 48 | 49 | a = np.argmax(self.k_reward +self.c *np.sqrt(np.log(self.n) / self.k_n)) 50 | 51 | reward = np.random.normal(self.mu[a],1) 52 | 53 | #updte 54 | self.n +=1 55 | self.k_n[a]+=1 56 | 57 | 58 | #update total 59 | self.mean_reward = self.mean_reward +(reward - self.mean_reward)/self.n 60 | 61 | self.k_reward[a] = self.k_reward[a] +(reward - self.k_reward[a])/self.k_n[a] 62 | 63 | def run(self): 64 | for i in range(self.iters): 65 | self.pull() 66 | self.reward[i]= self.mean_reward 67 | 68 | def reset(self,mu = 'none'): 69 | 70 | self.n = 1 71 | self.k_n = np.ones(self.k) 72 | self.mean_reward = 0 73 | self.reward = np.zeros(iters) 74 | self.k_reward = np.zeros(self.k) 75 | if mu == 'random': 76 | self.mu = np.random.normal(0, 1, self.k) 77 | 78 | k = 10 # number of arms 79 | iters = 1000 80 | ucb_rewards = np.zeros(iters) 81 | # Initialize bandits 82 | ucb = ucb_bandit(k, 2, iters) 83 | episodes = 1000 84 | # Run experiments 85 | for i in range(episodes): 86 | ucb.reset('random') 87 | # Run experiments 88 | ucb.run() 89 | 90 | # Update long-term averages 91 | ucb_rewards = ucb_rewards + ( 92 | ucb.reward - ucb_rewards) / (i + 1) 93 | 94 | plt.figure(figsize=(12,8)) 95 | plt.plot(ucb_rewards, label="UCB") 96 | plt.legend(bbox_to_anchor=(1.2, 0.5)) 97 | plt.xlabel("Iterations") 98 | plt.ylabel("Average Reward") 99 | plt.title("Average UCB Rewards after " 100 | + str(episodes) + " Episodes") 101 | plt.show() 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-merlot -------------------------------------------------------------------------------- /cheatsheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Pavankunchala/Reinforcement-Learning/0e453264ee9a2fd06ab9ca41e0fa46304cc364fd/cheatsheet.pdf --------------------------------------------------------------------------------
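As a closing note on Upper-Confidence-Bound/UCB.py above: the pull() method implements the classic UCB action-selection rule

$$ a_t = \arg\max_a \Bigl[\, \bar Q(a) + c \sqrt{ \ln n / N(a) } \,\Bigr], $$

where \bar Q(a) is the incrementally updated mean reward per arm (self.k_reward), N(a) is the per-arm pull count (self.k_n, initialised to ones so the bonus is defined on the first step), n is the total number of steps taken, and c is the exploration parameter passed to the constructor.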