├── DQN ├── .ipynb_checkpoints │ ├── Deep_Q_network-checkpoint.ipynb │ ├── dqn_cartpole-checkpoint.ipynb │ └── dqn_cnn-checkpoint.ipynb ├── dqn_cnn.ipynb ├── dqn_low_state.ipynb ├── models │ └── CartPole-v0.pth ├── rewards.npy └── testing.py ├── Model-Based-Learn ├── Makefile ├── __pycache__ │ └── lake_envs.cpython-36.pyc ├── collect_submission.sh ├── discrete_env.py ├── frozen_lake.py ├── lake_envs.py ├── requirements.txt └── vi_and_pi.py ├── Model-Free-Learn ├── custom_environment.py └── q_learning.py └── README.md /DQN/.ipynb_checkpoints/Deep_Q_network-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import numpy as np\n", 11 | "import gym\n", 12 | "import torch.nn as nn\n", 13 | "import time\n", 14 | "import random\n", 15 | "import torch.optim as optim\n", 16 | "import math\n", 17 | "\n", 18 | "\n", 19 | "from IPython.display import clear_output\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "%matplotlib inline\n", 22 | "\n", 23 | "\n", 24 | "if torch.cuda.is_available():\n", 25 | " device = 'cuda'\n", 26 | "else:\n", 27 | " device = 'cpu'" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Network architecture for DQN\n", 37 | "class q_network(nn.Module):\n", 38 | " def __init__(self, observations, actions):\n", 39 | " super(q_network, self).__init__()\n", 40 | " self.network = nn.Sequential(\n", 41 | " nn.Linear(observations, 64),\n", 42 | " nn.ReLU(),\n", 43 | " nn.Linear(64, 32),\n", 44 | " nn.ReLU(),\n", 45 | " nn.Linear(32, actions),\n", 46 | " )\n", 47 | " def forward(self, x):\n", 48 | " return self.network(x)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 
58 | "output_type": "stream", 59 | "text": [ 60 | "0.95\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "done = False\n", 66 | "learning_rate = 0.0001\n", 67 | "discount= 0.99\n", 68 | "epsilon = 0.95\n", 69 | "epsilon_decay = 0.9999\n", 70 | "min_epsilon = 0.1\n", 71 | "n_episodes = 1000\n", 72 | "batch_size = 128\n", 73 | "Reward_Path = 'rewards.npy'\n", 74 | "env_name = 'CartPole-v0'\n", 75 | "model_path = './models/' + env_name + '.pth'\n", 76 | "\n", 77 | "print(epsilon)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "array([-0.00270691, -0.01540017, 0.0037088 , 0.01664034])" 89 | ] 90 | }, 91 | "execution_count": 4, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "# creating a cartpole environment\n", 98 | "env = gym.make(env_name)\n", 99 | "env.reset()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "'\\n# Testing the environment with random actions\\nfor _ in range(1000):\\n env.render(100)\\n time.sleep(0.05)\\n env.step(env.action_space.sample())\\nenv.close()\\n'" 111 | ] 112 | }, 113 | "execution_count": 5, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "'''\n", 120 | "# Testing the environment with random actions\n", 121 | "for _ in range(1000):\n", 122 | " env.render(100)\n", 123 | " time.sleep(0.05)\n", 124 | " env.step(env.action_space.sample())\n", 125 | "env.close()\n", 126 | "'''" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 6, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# for experience replay\n", 136 | "from collections import deque\n", 137 | "\n", 138 | "class Exp_Replay:\n", 139 | " def __init__(self, limit):\n", 140 | " self.memory = 
deque(maxlen=limit) \n", 141 | " \n", 142 | " def push(self, state, action, reward, next_state, done):\n", 143 | " state = np.expand_dims(state,0)\n", 144 | " next_state = np.expand_dims(next_state,0)\n", 145 | "\n", 146 | " self.memory.append((state, action, reward, next_state, done))\n", 147 | " \n", 148 | " def sample(self, batch_size):\n", 149 | " state, action, reward, next_state, done = zip(*random.sample(self.memory, batch_size))\n", 150 | " return np.concatenate(state), action, reward, np.concatenate(next_state), done\n", 151 | " \n", 152 | " def __len__(self):\n", 153 | " return len(self.memory)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 7, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "" 165 | ] 166 | }, 167 | "execution_count": 7, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "# creating object for network\n", 174 | "q_hat = q_network(env.observation_space.shape[0], env.action_space.n).to(device)\n", 175 | "q_hat_target = q_network(env.observation_space.shape[0], env.action_space.n).to(device)\n", 176 | "\n", 177 | "q_hat_target.load_state_dict(q_hat.state_dict())" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 8, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "criterion = nn.MSELoss().to(device)\n", 187 | "optimizer = optim.RMSprop(q_hat.parameters(), lr = learning_rate)\n", 188 | "\n", 189 | "memory = Exp_Replay(10000)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 9, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "def plot(episode, avg_loss, eps, eps_rewards):\n", 199 | " clear_output(True)\n", 200 | " plt.figure(figsize=(20,5))\n", 201 | " plt.subplot(131)\n", 202 | " plt.title('Episode: %5d | Epsilon: %4.2f | Avg. 
Reward: %5.2f'%(episode, eps, np.mean(ep_rewards[-50:])))\n", 203 | " plt.plot(eps_rewards)\n", 204 | " plt.subplot(132)\n", 205 | " plt.title('loss | Average Loss: %5.2f'%np.mean(ep_loss[-50:]))\n", 206 | " plt.plot(ep_loss)\n", 207 | " plt.show()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 10, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "def action_select(state, epsilon):\n", 217 | " \n", 218 | " if random.random() < epsilon:\n", 219 | " action = env.action_space.sample()\n", 220 | " else:\n", 221 | " state = torch.FloatTensor(state).unsqueeze(0).to(device) \n", 222 | " action = torch.argmax(q_hat(state)).item()\n", 223 | " \n", 224 | " return action" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 11, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "def compute_td_loss(batch_size,criterion, optimizer, target_net, loss, limit):\n", 234 | " if limit< batch_size:\n", 235 | " return 0\n", 236 | " \n", 237 | " state, action, reward, next_state, done = memory.sample(batch_size)\n", 238 | "\n", 239 | " state = torch.FloatTensor(np.float32(state)).to(device)\n", 240 | " next_state = torch.FloatTensor(np.float32(next_state)).to(device)\n", 241 | " action = torch.LongTensor(action).to(device)\n", 242 | " reward = torch.FloatTensor(reward).to(device)\n", 243 | " done = torch.FloatTensor(done).to(device)\n", 244 | " \n", 245 | " current_q = q_hat(state).gather(1, action.unsqueeze(1)).squeeze(1)\n", 246 | " \n", 247 | " \n", 248 | " next_q = target_net(next_state).max(dim=1)[0]\n", 249 | " q_target = reward + discount*next_q*(1 - done)\n", 250 | " \n", 251 | " tdloss = criterion(current_q, q_target.detach())\n", 252 | " loss += tdloss.item()\n", 253 | " \n", 254 | " optimizer.zero_grad()\n", 255 | " tdloss.backward()\n", 256 | " optimizer.step()\n", 257 | " \n", 258 | " return loss" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 12, 264 | 
"metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "image/png": "\n", 269 | "text/plain": [ 270 | "
" 271 | ] 272 | }, 273 | "metadata": { 274 | "needs_background": "light" 275 | }, 276 | "output_type": "display_data" 277 | } 278 | ], 279 | "source": [ 280 | "avg_loss = 0\n", 281 | "steps = 200\n", 282 | "ep_rewards = np.array([])\n", 283 | "ep_loss = np.array([])\n", 284 | "\n", 285 | "# Training the agent\n", 286 | "for episode in range(n_episodes):\n", 287 | " state = env.reset()\n", 288 | " ep_reward = 0\n", 289 | " running_loss = 0\n", 290 | " done = False\n", 291 | " \n", 292 | " while not done:\n", 293 | " \n", 294 | " action = action_select(state, epsilon)\n", 295 | " epsilon = max(epsilon*epsilon_decay, min_epsilon)\n", 296 | " \n", 297 | " \n", 298 | " next_state, reward, done, _ = env.step(action)\n", 299 | " \n", 300 | " ep_reward += reward\n", 301 | " memory.push(state, action, reward, next_state, done)\n", 302 | " \n", 303 | " state = next_state\n", 304 | " \n", 305 | " running_loss = compute_td_loss(batch_size,criterion, optimizer, q_hat_target, running_loss, len(memory))\n", 306 | " \n", 307 | " ep_rewards = np.append(ep_rewards, ep_reward)\n", 308 | " ep_loss = np.append(ep_loss, running_loss)\n", 309 | " q_hat_target.load_state_dict(q_hat.state_dict())\n", 310 | " \n", 311 | " if episode%50==0:\n", 312 | " plot(episode, ep_loss, epsilon, ep_rewards)\n", 313 | " avg_loss = 0\n", 314 | " ep_reward =0\n", 315 | " np.save(Reward_Path, ep_rewards)\n", 316 | " \n", 317 | "env.close()" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 14, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/plain": [ 328 | "" 329 | ] 330 | }, 331 | "execution_count": 14, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "# Save model\n", 338 | "torch.save(q_hat_target.state_dict(),model_path)\n", 339 | "\n", 340 | "q_hat_target = q_network(env.observation_space.shape[0], env.action_space.n).to(device)\n", 341 | "q_hat_target.load_state_dict(torch.load(model_path))" 342 
| ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 16, 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "Total Reward: 200.0\n" 354 | ] 355 | } 356 | ], 357 | "source": [ 358 | "import gym\n", 359 | "import time\n", 360 | "\n", 361 | "env = gym.make(env_name)\n", 362 | "# env.reset()\n", 363 | "state = env.reset()\n", 364 | "done = False\n", 365 | "testing_epsilon = 0.05\n", 366 | "tot_reward = 0\n", 367 | "\n", 368 | "for step in range(2000):\n", 369 | " state = torch.from_numpy(state).float().to(device)\n", 370 | " env.render()\n", 371 | " action = torch.argmax(q_hat_target(state)).item()\n", 372 | " state, reward, done, _ = env.step(action)\n", 373 | " tot_reward += reward\n", 374 | " if done:\n", 375 | "# print('DONE')\n", 376 | " break\n", 377 | " time.sleep(.05)\n", 378 | " \n", 379 | "print('Total Reward:', tot_reward)\n", 380 | " \n", 381 | "env.close()" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [] 390 | } 391 | ], 392 | "metadata": { 393 | "kernelspec": { 394 | "display_name": "Python 3", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.6.9" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 2 413 | } 414 | -------------------------------------------------------------------------------- /DQN/dqn_cnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import 
torch.nn as nn\n", 11 | "import torch.optim as optim\n", 12 | "import torch.autograd as autograd\n", 13 | "import torch.nn.functional as F\n", 14 | "import torchvision.transforms as T\n", 15 | "from collections import namedtuple\n", 16 | "\n", 17 | "import gym\n", 18 | "import numpy as np\n", 19 | "import time\n", 20 | "import cv2\n", 21 | "\n", 22 | "import random\n", 23 | "import math\n", 24 | "from itertools import count\n", 25 | "\n", 26 | "\n", 27 | "from PIL import Image\n", 28 | "import matplotlib\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "\n", 31 | "%matplotlib inline\n", 32 | "is_ipython = 'inline' in matplotlib.get_backend()\n", 33 | "if is_ipython:\n", 34 | " from IPython import display\n", 35 | "\n", 36 | "plt.ion()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Checking the Environment" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "array([-0.01334107, -0.03290919, 0.02486646, -0.02966823])" 55 | ] 56 | }, 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "output_type": "execute_result" 60 | } 61 | ], 62 | "source": [ 63 | "env_id = 'CartPole-v0'\n", 64 | "\n", 65 | "env = gym.make(env_id)\n", 66 | "env.reset()\n", 67 | "# for _ in range(400):\n", 68 | "# env.render()\n", 69 | "# time.sleep(0.01)\n", 70 | "# env.step(env.action_space.sample()) # take random actions\n", 71 | "# env.close()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Checking whether CUDA is available and using it if so" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 3, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "cuda\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "\n", 96 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 97 | "print(device)" 
98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Experience replay buffer that stores past transitions for the model to sample from" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "from collections import deque\n", 114 | "\n", 115 | "class exp_replay(object):\n", 116 | " def __init__(self, capacity):\n", 117 | " self.buffer = deque(maxlen=capacity)\n", 118 | "\n", 119 | " def push(self, state, action, reward, next_state, done):\n", 120 | " self.buffer.append((state, action, reward, next_state, done))\n", 121 | "\n", 122 | " def sample(self, batch_size):\n", 123 | " state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))\n", 124 | " return np.concatenate(state), action, reward, np.concatenate(next_state), done\n", 125 | "\n", 126 | " def __len__(self):\n", 127 | " return len(self.buffer)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "epsilon = 1\n", 137 | "epsilon_min = 0.01\n", 138 | "epsilon_decay = 3000\n", 139 | "gamma = 0.999\n", 140 | "batch_size = 128\n", 141 | "\n", 142 | "replay_buffer_size = 10000\n", 143 | "learning_rate = 0.0001\n", 144 | "num_episodes = 500\n", 145 | "target_update_freq = 10\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 6, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "def get_cart_loc(screen_width):\n", 155 | " world_width = env.x_threshold*2\n", 156 | " scale = screen_width/world_width\n", 157 | "# print(scale)\n", 158 | "# print(int(env.state[0]*scale + screen_width/2.0))\n", 159 | " return int(env.state[0]*scale + screen_width/2.0)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 7, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# def get_screen():\n", 169 | "# screen = 
env.render(mode='rgb_array')\n", 170 | "# screen_1 = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)\n", 171 | "# r_screen = cv2.resize(screen_1, (84,84), interpolation=cv2.INTER_AREA)\n", 172 | "# r_screen = np.array(r_screen)\n", 173 | "# r_screen = np.expand_dims(r_screen,axis=0)\n", 174 | "# r_screen = torch.Tensor(r_screen)\n", 175 | "# return r_screen.unsqueeze(0).to(device)\n" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 8, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAADECAYAAACGNXroAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAARKklEQVR4nO3de5RV5X3G8e8Dw80LF3W0CNYxhuAljZdOEWsMFDUlGgN/2EabGOJylSTLrkhjq8b+UWxdq3GtVG1XsqwoJlRTLyEm3tIkiqhpa9AB7yKCaGQUZVDBKyry6x/7nfEwzmEOM+ec4T3zfNbaa96993v2/u3Zw8M777mMIgIzM8vPkIEuwMzM+sYBbmaWKQe4mVmmHOBmZplygJuZZcoBbmaWKQe4WQ8ktUgKSU0DXYtZOQ5w6xdJz0t6V9JbJcsPKnjcdEntNaxrvqTra3V8s12BRxdWDadGxN3VPqikpojYWu3j7goa+dqsfjwCt5qRdKWkxSXrl0paIml34L+B/UtG7funUfNiSddLegP4uqQpkh6QtEnSekk/kDS85JiHS7pL0muSXpF0kaSZwEXAl9OxH019x0hamI7zoqRLJA1N+4ZK+r6kjZLWAqf0cm0XpGO8KWmVpBNKjnORpGfTvuWSDkj7QtI5klYDq9O2Q0rqXyXpL0vOMSLV9EK6tv+QNCrtmy6pXdJ5kjakazqrCrfNchIRXrz0eQGeB04ss2834Bng68DxwEZgYto3HWjv1n8+8AEwm2JwMQr4Y2AqxW+LLcBKYF7qvyewHjgPGJnWjyk51vXdjv8L4Cpgd2Bf4EHgG2nfN4GngQOAvYClQABNPVzXZGAdsH9abwEOTu2/Bx5PfQQcAeyd9gVwVzr+qFTHOuCsdH1Hp+/R4an/FcBtqf+ewO3Av5R8/7YC/wQMA04G3gHGDfTPhJf6LQNegJe8lxTgbwGbSpa/Ltk/BXgN+D1wRsn2cgF+fy/nmwf8PLXPAB4u02+7AAf2A94DRpVsOwNYmtr3AN8s2ff5HQT4J4ENwInAsG77VgGzytQUwIyS9S8Dv+3W5yrgH1P4v935H0PadyzwXMn3793S+lJNUwf6Z8JL/RbPgVs1zI4yc+AR8WCaktgXuLmCY60rXZH0KeAyoJViRN8ELE+7DwCerbDGAylGqusldW4bUnK+/bud+/flDhQRayTNo/hP4nBJvwa+ExEvVVBT6TkOBI6RtKlkWxNwHdBMcb3LS+oVMLSk76ux/Tz6O8AeOzi3NRjPgVtNSToHGAG8BJxfsqvcx2B2334lxdTGpIgYTTG33Zlo64CDKzzOOooR+D4RMTYtoyPi8LR/PUX4dvrDMsctDh7xXxHxWYoQDuDSCmrqXtc64L6SesZGxB4R8S2KqZR3KaZTOveNiQgHtHVxgFvNpNHzJcBXgTOB8yUdmXa
/AuwtaUwvh9kTeAN4S9IhwLdK9t0B/IGkeekJvz0lHVNy/BZJQwAiYj3wG+BfJY2WNETSwZKmpf43A9+WNFHSOODCHVzXZEkzJI0AtlAE7Ydp9zXAP0uapMJnJO1d5lB3AJ+SdKakYWn5E0mHRsQ24Grgckn7pvNOkPTnvXy/bBBxgFs13N7tdeA/T2+AuR64NCIejYjVFKPn6ySNiIingRuAtekVJvuXOfbfAX8FvEkRaDd17oiIN4GTgFOBlyle2fFnafdP09dXJa1I7a8Bw4GngNeBxcD4tO9q4NfAo8AK4JYdXO8I4HsUo+SXKaaHLkr7LqP4z+A3FP/xLKR4wvJjUv2fB06n+A3lZYqR/IjU5QJgDfC79KqcuymeHDUDQBH+gw5mZjnyCNzMLFMOcDOzTDnAzcwy1a8AlzQzvf13jaSyz9qbmVn19flJzPQZEs9QvAqgHXiI4p12T1WvPDMzK6c/78ScAqyJiLUAkm4EZlG8RKtH++yzT7S0tPTjlGZmg8/y5cs3RkRz9+39CfAJbP+24HbgmDJ9AWhpaaGtra0fpzQzG3wk9fjRDv2ZA1cP2z42HyNprqQ2SW0dHR39OJ2ZmZXqT4C3s/1nR0ykeDfZdiJiQUS0RkRrc/PHfgMwM7M+6k+APwRMknRQ+oD90yk+u9jMzOqgz3PgEbFV0t9QfH7EUODaiHiyapWZmdkO9evzwCPil8Avq1SLmZntBL8T08wsUw5wM7NM+U+q2aD11suru9rbPnh/u32j9p7Y1R46/KOP8x7SNLz2hZlVyCNwM7NMOcDNzDLlKRQbtF5dvayrvXHlb8v2O/D4r3a19zn0+JrWZLYzPAI3M8uUA9zMLFMOcDOzTDnAzcwy5QA3M8uUA9zMLFMOcDOzTDnAzcwy5QA3M8uUA9zMLFMOcDOzTDnAzcwy5QA3M8uUA9zMLFMOcDOzTDnAzcwy1WuAS7pW0gZJT5Rs20vSXZJWp6/jalummZl1V8kI/MfAzG7bLgSWRMQkYElaNzOzOuo1wCPifuC1bptnAYtSexEwu8p1mZlZL/o6B75fRKwHSF/3rV5JZmZWiZo/iSlprqQ2SW0dHR21Pp2Z2aDR1wB/RdJ4gPR1Q7mOEbEgIlojorW5ubmPpzMzs+76GuC3AXNSew5wa3XKMTOzSlXyMsIbgAeAyZLaJZ0NfA84SdJq4KS0bmZmddTUW4eIOKPMrhOqXIuZme0EvxPTzCxTDnAzs0z1OoVi1qje2/RKRf1GjN2vxpWY9Y1H4GZmmXKAm5llylMoNmht2VzZFMrIMZ5CsV2TR+BmZplygJuZZcoBbmaWKQe4mVmmHOBmZplygJuZZcoBbmaWKQe4mVmmHOBmZplygJuZZcoBbmaWKQe4mVmmHOBmZplygJuZZcoBbmaWKQe4mVmmeg1wSQdIWipppaQnJZ2btu8l6S5Jq9PXcbUv18zMOlUyAt8KnBcRhwJTgXMkHQZcCCyJiEnAkrRuZmZ10muAR8T6iFiR2m8CK4EJwCxgUeq2CJhdqyLNzOzjdmoOXFILcBSwDNgvItZDEfLAvtUuzszMyqs4wCXtAfwMmBcRb+zE4+ZKapPU1tHR0ZcazcysBxUFuKRhFOH9k4i4JW1+RdL4tH88sKGnx0bEgohojYjW5ubmatRsZmZU9ioUAQuBlRFxWcmu24A5qT0HuLX65ZmZWTlNFfQ5DjgTeFzSI2nbRcD3gJslnQ28APxFbUo0M7Oe9BrgEfE/gMrsPqG65ZiZWaX8Tkwzs0w5wM3MMuUANzPLlAPczCxTDnAzs0w5wM3MMuUANzPLlAPczCxTDnAzs0xV8lZ6s4axbev7Xe3Y9mHZfkOahne1NWRoTWsy6yuPwM3MMuUANzPLlKdQbFDZsml9V3vrlrfK9hs5dnxXu2nkHjWtyayvPAI3M8uUA9zMLFMOcDOzTDnAzcwy5QA3M8uUA9zMLFMOcDOzTDnAzcwy5QA3M8tUrwEuaaSkByU
9KulJSRen7QdJWiZptaSbJA3v7VhmZlY9lYzA3wNmRMQRwJHATElTgUuByyNiEvA6cHbtyjQzs+56DfAodH5oxLC0BDADWJy2LwJm16RCMzPrUUVz4JKGSnoE2ADcBTwLbIqIralLOzChNiWamVlPKgrwiPgwIo4EJgJTgEN76tbTYyXNldQmqa2jo6PvlZqZ2XZ26lUoEbEJuBeYCoyV1PlxtBOBl8o8ZkFEtEZEa3Nzc39qNTOzEpW8CqVZ0tjUHgWcCKwElgKnpW5zgFtrVaSZmX1cJX/QYTywSNJQisC/OSLukPQUcKOkS4CHgYU1rNPMzLrpNcAj4jHgqB62r6WYDzczswHgd2KamWXKAW5mlikHuJlZphzgZmaZcoCbmWXKAW5mlikHuJlZphzgZmaZcoCbmWXKAW5mlikHuJlZphzgZmaZcoCbmWXKAW5mlikHuJlZpir5gw5mA+7ee+/tat933319Ps6E0epqH7V7+X7PrH2+q33zxRf3+XzTpk3rak+fPr3PxzHriUfgZmaZcoCbmWXKUyiWhdJpk/nz5/f5OGedcmxXe9i0b+zgfFd1tX9059V9Pl9prZ5CsWrzCNzMLFMOcDOzTHkKxQaVF9/9ZFf7ubc/Xbbfc2//UcnaAzWsyKzvKh6BSxoq6WFJd6T1gyQtk7Ra0k2ShteuTDMz625nplDOBVaWrF8KXB4Rk4DXgbOrWZiZme1YRQEuaSJwCnBNWhcwA1icuiwCZteiQLNqGjNsY9cyRNvKLqObXu1azHZVlY7ArwDOB7al9b2BTRGxNa23AxOqXJuZme1ArwEu6YvAhohYXrq5h65R5vFzJbVJauvo6OhjmWZm1l0lr0I5DviSpJOBkcBoihH5WElNaRQ+EXippwdHxAJgAUBra2uPIW9WL3fed3dX+1f/92BX+3OfOXC7fmvbX6hbTWZ91esIPCK+GxETI6IFOB24JyK+AiwFTkvd5gC31qxKMzP7mP68kecC4DuS1lDMiS+sTklmZlaJnXojT0TcC9yb2muBKTvz+M2bN3P77bfvzEPMAFi1alVVjvPOlg8+Wtny0StM7vzf2rzapLRu/+xbtfmt9GZmmXKAm5llqq6fhTJmzBhOPfXUep7SGsSKFSsGuoQ+mTx5clfbP/tWbR6Bm5llygFuZpYpB7iZWaYc4GZmmXKAm5llygFuZpYpB7iZWaYc4GZmmXKAm5llyn+V3rIwbdq0rvb8+fMHrpCdVFq3WbV5BG5mlikHuJlZpjyFYlmYPn16j22zwcwjcDOzTDnAzcwypYj6/aF4SR3A28DGup1017EPvu7BxNc9uNT6ug+MiObuG+sa4ACS2iKita4n3QX4ugcXX/fgMlDX7SkUM7NMOcDNzDI1EAG+YADOuSvwdQ8uvu7BZUCuu+5z4GZmVh2eQjEzy1RdA1zSTEmrJK2RdGE9z11Pkg6QtFTSSklPSjo3bd9L0l2SVqev4wa61lqQNFTSw5LuSOsHSVqWrvsmScMHusZqkzRW0mJJT6f7fuxguN+S/jb9jD8h6QZJIxvxfku6VtIGSU+UbOvx/qrw7ynnHpN0dK3qqluASxoK/BD4AnAYcIakw+p1/jrbCpwXEYcCU4Fz0rVeCCyJiEnAkrTeiM4FVpasXwpcnq77deDsAamqtv4N+FVEHAIcQXH9DX2/JU0Avg20RsSngaHA6TTm/f4xMLPbtnL39wvApLTMBa6sVVH1HIFPAdZExNqIeB+4EZhVx/PXTUSsj4gVqf0mxT/mCRTXuyh1WwTMHpgKa0fSROAU4Jq0LmAGsDh1abjrljQa+BywECAi3o+ITQyC+03xeUqjJDUBuwHracD7HRH3A69121zu/s4C/jMKvwPGShpfi7rqGeATgHUl6+1pW0OT1AIcBSwD9ouI9VCEPLDvwFVWM1cA5wPb0vrewKaI2JrWG/G+fwLoAH6Upo6ukbQ7DX6/I+JF4PvACxTBvRlYTuPf707l7m/dsq6
eAa4etjX0S2Ak7QH8DJgXEW8MdD21JumLwIaIWF66uYeujXbfm4CjgSsj4iiKj4toqOmSnqQ531nAQcD+wO4U0wfdNdr97k3dfubrGeDtwAEl6xOBl+p4/rqSNIwivH8SEbekza90/iqVvm4YqPpq5DjgS5Kep5gim0ExIh+bfsWGxrzv7UB7RCxL64spAr3R7/eJwHMR0RERHwC3AH9K49/vTuXub92yrp4B/hAwKT1DPZziyY7b6nj+uknzvguBlRFxWcmu24A5qT0HuLXetdVSRHw3IiZGRAvF/b0nIr4CLAVOS90a8bpfBtZJmpw2nQA8RYPfb4qpk6mSdks/853X3dD3u0S5+3sb8LX0apSpwObOqZaqi4i6LcDJwDPAs8A/1PPcdb7Oz1L8yvQY8EhaTqaYD14CrE5f9xroWmv4PZgO3JHanwAeBNYAPwVGDHR9NbjeI4G2dM9/AYwbDPcbuBh4GngCuA4Y0Yj3G7iBYp7/A4oR9tnl7i/FFMoPU849TvEqnZrU5Xdimpllyu/ENDPLlAPczCxTDnAzs0w5wM3MMuUANzPLlAPczCxTDnAzs0w5wM3MMvX/d/xHlJwaU4oAAAAASUVORK5CYII=\n", 186 | "text/plain": [ 187 | "
" 188 | ] 189 | }, 190 | "metadata": { 191 | "needs_background": "light" 192 | }, 193 | "output_type": "display_data" 194 | } 195 | ], 196 | "source": [ 197 | "def get_screen():\n", 198 | " screen = env.render(mode = 'rgb_array')\n", 199 | " ### Captured screen is returned in shape (H,W,C) = (400,600,3)\n", 200 | " \n", 201 | " ### Cart is in lower half of screen. We crop away the part of the frame that is not useful\n", 202 | " screen_height, screen_width, _ = screen.shape\n", 203 | " screen = screen[int(screen_height*0.4):int(screen_height*0.8),:] # Screen Shape is now (160,600,3)\n", 204 | "# print(screen.shape)\n", 205 | "\n", 206 | " view_width = int(screen_width*0.6) # width required when pole becomes horizontal. \n", 207 | "# print(view_width//2)\n", 208 | " cart_loc = get_cart_loc(screen_width)\n", 209 | "\n", 210 | " ### Extract screen where cart is located and discard the remaining part\n", 211 | " if cart_loc < view_width//2:\n", 212 | " slice_range = slice(view_width)\n", 213 | " elif cart_loc > (screen_width - view_width//2):\n", 214 | " slice_range = slice(-view_width, None)\n", 215 | " else:\n", 216 | " slice_range = slice(cart_loc - view_width//2, cart_loc + view_width//2)\n", 217 | "# print(slice_range)\n", 218 | " screen = screen[:,slice_range,:]\n", 219 | "# print(screen.shape)\n", 220 | " \n", 221 | " width = int(screen.shape[1] * 30 / 100)\n", 222 | " height = int(screen.shape[0] * 30 / 100)\n", 223 | " dim = (width, height)\n", 224 | "# print(dim)\n", 225 | " # resize image\n", 226 | " screen = cv2.resize(screen, dim, interpolation = cv2.INTER_AREA)\n", 227 | "# print(screen.shape)\n", 228 | " ### PyTorch works with format (C,H,W). 
So we take transpose\n", 229 | " screen = screen.transpose((2,0,1))\n", 230 | "# \n", 231 | " \n", 232 | " ### Convert to float and normalize\n", 233 | " screen = np.ascontiguousarray(screen, dtype = np.float32)/255\n", 234 | " screen=np.expand_dims(screen,0)\n", 235 | " \n", 236 | " ### Convert to tensor\n", 237 | " screen = torch.from_numpy(screen)\n", 238 | "# print(screen.shape)\n", 239 | " return screen\n", 240 | " \n", 241 | "\n", 242 | "### Image appears blurry due to interpolation\n", 243 | "plt.figure()\n", 244 | "plt.imshow(get_screen().cpu().squeeze(0).permute(1, 2, 0).numpy())\n", 245 | "plt.title('Extracted screen')\n", 246 | "plt.show()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "Deep Q-Network" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 9, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "class dqn(nn.Module):\n", 263 | " def __init__(self, input_shape, num_actions):\n", 264 | " super(dqn, self).__init__()\n", 265 | " self.device='cuda' if torch.cuda.is_available() else 'cpu'\n", 266 | " self.input_shape = input_shape\n", 267 | " self.num_actions = num_actions\n", 268 | " \n", 269 | " self.features = nn.Sequential(\n", 270 | " nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),\n", 271 | " nn.ReLU(),\n", 272 | " nn.Conv2d(32, 64, kernel_size=4, stride=2),\n", 273 | " nn.ReLU(),\n", 274 | " nn.Conv2d(64, 64, kernel_size=3, stride=1),\n", 275 | " nn.ReLU()\n", 276 | " )\n", 277 | " \n", 278 | " self.fc = nn.Sequential(\n", 279 | " nn.Linear(self.feature_size(), 512),\n", 280 | " nn.ReLU(),\n", 281 | " nn.Linear(512, self.num_actions)\n", 282 | " )\n", 283 | " def forward(self, x):\n", 284 | " x=x.to(self.device)\n", 285 | " x = self.features(x)\n", 286 | " x = x.view(x.size(0), -1)\n", 287 | " x = self.fc(x)\n", 288 | " return x\n", 289 | " \n", 290 | " def feature_size(self):\n", 291 | " return self.features(torch.zeros(1, 
*self.input_shape)).view(1, -1).size(1)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 10, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "init_screen = get_screen()\n", 301 | "init_screen = init_screen.squeeze(0)\n", 302 | "# print(init_screen.shape[2])\n", 303 | "\n", 304 | "\n", 305 | "num_actions = env.action_space.n\n", 306 | "policy_net = dqn(init_screen.shape, num_actions).to(device)\n", 307 | "target_net = dqn(init_screen.shape, num_actions).to(device)\n", 308 | "\n", 309 | "target_net.load_state_dict(policy_net.state_dict())\n", 310 | "target_net.eval()\n", 311 | "\n", 312 | "criterion = nn.MSELoss()\n", 313 | "optimizer = optim.Adam(policy_net.parameters())\n", 314 | "\n", 315 | "memory = exp_replay(10000)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 11, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "steps_done = 0\n", 325 | "def select_action(state, epsilon):\n", 326 | " global steps_done\n", 327 | " sample = random.random()\n", 328 | "\n", 329 | " if sample < epsilon:\n", 330 | " action = random.randrange(env.action_space.n) \n", 331 | " else:\n", 332 | " q_val = policy_net(state)\n", 333 | " action = torch.argmax(q_val).item()\n", 334 | " \n", 335 | " return action\n", 336 | "\n", 337 | "\n", 338 | "def plot_durations(scores,pause):\n", 339 | " plt.ion()\n", 340 | " plt.figure(2)\n", 341 | " plt.clf()\n", 342 | "\n", 343 | " durations_t = torch.tensor(scores, dtype=torch.float)\n", 344 | " plt.title('Training...')\n", 345 | " plt.xlabel('Episode')\n", 346 | " plt.ylabel('Scores')\n", 347 | " plt.plot(durations_t.numpy())\n", 348 | " # Take 20 episode averages and plot them too\n", 349 | " if len(durations_t) >= 20:\n", 350 | " means = durations_t.unfold(0, 20, 1).mean(1).view(-1)\n", 351 | " means = torch.cat((torch.zeros(19), means))\n", 352 | " plt.plot(means.numpy())\n", 353 | " if pause< 25:\n", 354 | " plt.pause(pause) # pause a bit so that 
plots are updated\n", 355 | " if is_ipython:\n", 356 | " display.clear_output(wait=True)\n", 357 | " display.display(plt.gcf())" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 12, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "def learn():\n", 367 | " if len(memory) < batch_size:\n", 368 | " return\n", 369 | " \n", 370 | " \n", 371 | " state, action, reward, next_state, done = memory.sample(batch_size)\n", 372 | " \n", 373 | " \n", 374 | " state=torch.FloatTensor(state)\n", 375 | " action=torch.tensor(action).to(device)\n", 376 | " reward=torch.tensor(reward).to(device)\n", 377 | " next_state=torch.FloatTensor(next_state)\n", 378 | " done=torch.tensor(done).float().to(device)\n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | "# state_batch, action_batch, reward_batch, next_state_batch, done_batch = memory.sample(batch_size)\n", 383 | "# reward_batch = torch.tensor(reward_batch, device = device)\n", 384 | "# done_batch = torch.tensor(done_batch, device = device)\n", 385 | "# # print(done_batch)\n", 386 | "# action_batch = torch.tensor(action_batch,device = device)\n", 387 | "# # print(action_batch.size())\n", 388 | "# done_batch = torch.tensor(done, device = device, dtype = torch.float)\n", 389 | "# # print(done_batch)\n", 390 | "\n", 391 | " q_vals = policy_net(state)\n", 392 | " q_next = target_net(next_state).detach()\n", 393 | "# # print(q_val.size, q_target.size)\n", 394 | "# batch_index = np.arange(batch_size)\n", 395 | " q_val = q_vals.gather(1, action.unsqueeze(1)).squeeze(1)\n", 396 | " q_next_val = q_next.max(1)[0]\n", 397 | " \n", 398 | "\n", 399 | " expected_q_val = reward + gamma*q_next_val*(1-done)\n", 400 | " \n", 401 | " loss = criterion(q_val, expected_q_val)\n", 402 | " optimizer.zero_grad()\n", 403 | " loss.backward()\n", 404 | " optimizer.step()\n", 405 | " \n", 406 | " return loss" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 13, 412 | "metadata": { 413 | 
"scrolled": true 414 | }, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "
" 420 | ] 421 | }, 422 | "metadata": {}, 423 | "output_type": "display_data" 424 | }, 425 | { 426 | "name": "stdout", 427 | "output_type": "stream", 428 | "text": [ 429 | "Complete\n" 430 | ] 431 | }, 432 | { 433 | "data": { 434 | "text/plain": [ 435 | "
" 436 | ] 437 | }, 438 | "metadata": {}, 439 | "output_type": "display_data" 440 | } 441 | ], 442 | "source": [ 443 | "ep_reward = []\n", 444 | "num_frame = 120\n", 445 | "\n", 446 | "\n", 447 | "epsilon_by_frame = lambda frame_idx: epsilon_min + (epsilon - epsilon_min) * math.exp(-1. * frame_idx / epsilon_decay)\n", 448 | "\n", 449 | "\n", 450 | "\n", 451 | "steps = 0\n", 452 | "for ep in range(1, num_frame+1):\n", 453 | " env.reset()\n", 454 | " last_screen = get_screen()\n", 455 | " current_screen = get_screen()\n", 456 | " state = current_screen - last_screen\n", 457 | " done = 0\n", 458 | " total_reward = 0\n", 459 | " while not done:\n", 460 | " \n", 461 | " epsilon = epsilon_by_frame(steps)\n", 462 | " steps += 1\n", 463 | " \n", 464 | " action = select_action(state, epsilon)\n", 465 | " \n", 466 | " _, reward, done, _ = env.step(action)\n", 467 | "# next_state = torch.tensor(next_state, device = device, dtype = torch.float)\n", 468 | " # Observe new state\n", 469 | " last_screen = current_screen\n", 470 | " current_screen = get_screen()\n", 471 | " next_state = current_screen - last_screen\n", 472 | " \n", 473 | " \n", 474 | " memory.push(state, action, reward, next_state, done)\n", 475 | " loss = learn() # Learning of model with help of experience replay\n", 476 | " \n", 477 | " total_reward += reward\n", 478 | " state = next_state\n", 479 | "\n", 480 | " \n", 481 | " ep_reward.append(total_reward)\n", 482 | " # Update the target network, copying all weights and biases in DQN\n", 483 | " if ep % target_update_freq == 0:\n", 484 | " target_net.load_state_dict(policy_net.state_dict())\n", 485 | " plot_durations(ep_reward,2)\n", 486 | "print('Complete')\n", 487 | "env.close()" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": 14, 493 | "metadata": {}, 494 | "outputs": [ 495 | { 496 | "data": { 497 | "image/png": "\n", 498 | "text/plain": [ 499 | "
" 500 | ] 501 | }, 502 | "metadata": { 503 | "needs_background": "light" 504 | }, 505 | "output_type": "display_data" 506 | } 507 | ], 508 | "source": [ 509 | "plot_durations(ep_reward,25)" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [] 532 | } 533 | ], 534 | "metadata": { 535 | "kernelspec": { 536 | "display_name": "Python 3", 537 | "language": "python", 538 | "name": "python3" 539 | }, 540 | "language_info": { 541 | "codemirror_mode": { 542 | "name": "ipython", 543 | "version": 3 544 | }, 545 | "file_extension": ".py", 546 | "mimetype": "text/x-python", 547 | "name": "python", 548 | "nbconvert_exporter": "python", 549 | "pygments_lexer": "ipython3", 550 | "version": "3.6.9" 551 | } 552 | }, 553 | "nbformat": 4, 554 | "nbformat_minor": 2 555 | } 556 | -------------------------------------------------------------------------------- /DQN/dqn_low_state.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import numpy as np\n", 11 | "import gym\n", 12 | "import torch.nn as nn\n", 13 | "import time\n", 14 | "import random\n", 15 | "import torch.optim as optim\n", 16 | "import math\n", 17 | "\n", 18 | "\n", 19 | "from IPython.display import clear_output\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "%matplotlib inline\n", 22 | "\n", 23 | "\n", 24 | "if torch.cuda.is_available():\n", 25 | " device = 'cuda'\n", 26 | "else:\n", 27 | " device = 'cpu'" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | 
"execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Network architecture for DQN\n", 37 | "class q_network(nn.Module):\n", 38 | " def __init__(self, observations, actions):\n", 39 | " super(q_network, self).__init__()\n", 40 | " self.network = nn.Sequential(\n", 41 | " nn.Linear(observations, 64),\n", 42 | " nn.ReLU(),\n", 43 | " nn.Linear(64, 32),\n", 44 | " nn.ReLU(),\n", 45 | " nn.Linear(32, actions),\n", 46 | " )\n", 47 | " def forward(self, x):\n", 48 | " return self.network(x)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "0.95\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "done = False\n", 66 | "learning_rate = 0.0001\n", 67 | "discount= 0.99\n", 68 | "epsilon = 0.95\n", 69 | "epsilon_decay = 0.9999\n", 70 | "min_epsilon = 0.1\n", 71 | "n_episodes = 1000\n", 72 | "batch_size = 128\n", 73 | "Reward_Path = 'rewards.npy'\n", 74 | "env_name = 'CartPole-v0'\n", 75 | "model_path = './models/' + env_name + '.pth'\n", 76 | "\n", 77 | "print(epsilon)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "array([-0.00270691, -0.01540017, 0.0037088 , 0.01664034])" 89 | ] 90 | }, 91 | "execution_count": 4, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "# creating a cartpole environment\n", 98 | "env = gym.make(env_name)\n", 99 | "env.reset()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "'\\n# Testing the environment with random actions\\nfor _ in range(1000):\\n env.render(100)\\n time.sleep(0.05)\\n env.step(env.action_space.sample())\\nenv.close()\\n'" 111 | ] 112 | }, 113 | "execution_count": 5, 114 | 
"metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "'''\n", 120 | "# Testing the environment with random actions\n", 121 | "for _ in range(1000):\n", 122 | " env.render(100)\n", 123 | " time.sleep(0.05)\n", 124 | " env.step(env.action_space.sample())\n", 125 | "env.close()\n", 126 | "'''" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 6, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# for experience replay\n", 136 | "from collections import deque\n", 137 | "\n", 138 | "class Exp_Replay:\n", 139 | " def __init__(self, limit):\n", 140 | " self.memory = deque(maxlen=limit) \n", 141 | " \n", 142 | " def push(self, state, action, reward, next_state, done):\n", 143 | " state = np.expand_dims(state,0)\n", 144 | " next_state = np.expand_dims(next_state,0)\n", 145 | "\n", 146 | " self.memory.append((state, action, reward, next_state, done))\n", 147 | " \n", 148 | " def sample(self, batch_size):\n", 149 | " state, action, reward, next_state, done = zip(*random.sample(self.memory, batch_size))\n", 150 | " return np.concatenate(state), action, reward, np.concatenate(next_state), done\n", 151 | " \n", 152 | " def __len__(self):\n", 153 | " return len(self.memory)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 7, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "" 165 | ] 166 | }, 167 | "execution_count": 7, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "# creating object for network\n", 174 | "q_hat = q_network(env.observation_space.shape[0], env.action_space.n).to(device)\n", 175 | "q_hat_target = q_network(env.observation_space.shape[0], env.action_space.n).to(device)\n", 176 | "\n", 177 | "q_hat_target.load_state_dict(q_hat.state_dict())" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 8, 183 | "metadata": {}, 184 | 
"outputs": [], 185 | "source": [ 186 | "criterion = nn.MSELoss().to(device)\n", 187 | "optimizer = optim.RMSprop(q_hat.parameters(), lr = learning_rate)\n", 188 | "\n", 189 | "memory = Exp_Replay(10000)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 9, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "def plot(episode, avg_loss, eps, eps_rewards):\n", 199 | " clear_output(True)\n", 200 | " plt.figure(figsize=(20,5))\n", 201 | " plt.subplot(131)\n", 202 | " plt.title('Episode: %5d | Epsilon: %4.2f | Avg. Reward: %5.2f'%(episode, eps, np.mean(ep_rewards[-50:])))\n", 203 | " plt.plot(eps_rewards)\n", 204 | " plt.subplot(132)\n", 205 | " plt.title('loss | Average Loss: %5.2f'%np.mean(ep_loss[-50:]))\n", 206 | " plt.plot(ep_loss)\n", 207 | " plt.show()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 10, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "def action_select(state, epsilon):\n", 217 | " \n", 218 | " if random.random() < epsilon:\n", 219 | " action = env.action_space.sample()\n", 220 | " else:\n", 221 | " state = torch.FloatTensor(state).unsqueeze(0).to(device) \n", 222 | " action = torch.argmax(q_hat(state)).item()\n", 223 | " \n", 224 | " return action" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 11, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "def compute_td_loss(batch_size,criterion, optimizer, target_net, loss, limit):\n", 234 | " if limit< batch_size:\n", 235 | " return 0\n", 236 | " \n", 237 | " state, action, reward, next_state, done = memory.sample(batch_size)\n", 238 | "\n", 239 | " state = torch.FloatTensor(np.float32(state)).to(device)\n", 240 | " next_state = torch.FloatTensor(np.float32(next_state)).to(device)\n", 241 | " action = torch.LongTensor(action).to(device)\n", 242 | " reward = torch.FloatTensor(reward).to(device)\n", 243 | " done = torch.FloatTensor(done).to(device)\n", 244 | " \n", 
245 | " current_q = q_hat(state).gather(1, action.unsqueeze(1)).squeeze(1)\n", 246 | " \n", 247 | " \n", 248 | " next_q = target_net(next_state).max(dim=1)[0]\n", 249 | " q_target = reward + discount*next_q*(1 - done)\n", 250 | " \n", 251 | " tdloss = criterion(current_q, q_target.detach())\n", 252 | " loss += tdloss.item()\n", 253 | " \n", 254 | " optimizer.zero_grad()\n", 255 | " tdloss.backward()\n", 256 | " optimizer.step()\n", 257 | " \n", 258 | " return loss" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 12, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "image/png": "\n", 269 | "text/plain": [ 270 | "
" 271 | ] 272 | }, 273 | "metadata": { 274 | "needs_background": "light" 275 | }, 276 | "output_type": "display_data" 277 | } 278 | ], 279 | "source": [ 280 | "avg_loss = 0\n", 281 | "steps = 200\n", 282 | "ep_rewards = np.array([])\n", 283 | "ep_loss = np.array([])\n", 284 | "\n", 285 | "# Training the agent\n", 286 | "for episode in range(n_episodes):\n", 287 | " state = env.reset()\n", 288 | " ep_reward = 0\n", 289 | " running_loss = 0\n", 290 | " done = False\n", 291 | " \n", 292 | " while not done:\n", 293 | " \n", 294 | " action = action_select(state, epsilon)\n", 295 | " epsilon = max(epsilon*epsilon_decay, min_epsilon)\n", 296 | " \n", 297 | " \n", 298 | " next_state, reward, done, _ = env.step(action)\n", 299 | " \n", 300 | " ep_reward += reward\n", 301 | " memory.push(state, action, reward, next_state, done)\n", 302 | " \n", 303 | " state = next_state\n", 304 | " \n", 305 | " running_loss = compute_td_loss(batch_size,criterion, optimizer, q_hat_target, running_loss, len(memory))\n", 306 | " \n", 307 | " ep_rewards = np.append(ep_rewards, ep_reward)\n", 308 | " ep_loss = np.append(ep_loss, running_loss)\n", 309 | " q_hat_target.load_state_dict(q_hat.state_dict())\n", 310 | " \n", 311 | " if episode%50==0:\n", 312 | " plot(episode, ep_loss, epsilon, ep_rewards)\n", 313 | " avg_loss = 0\n", 314 | " ep_reward =0\n", 315 | " np.save(Reward_Path, ep_rewards)\n", 316 | " \n", 317 | "env.close()" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 14, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/plain": [ 328 | "" 329 | ] 330 | }, 331 | "execution_count": 14, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "# Save model\n", 338 | "torch.save(q_hat_target.state_dict(),model_path)\n", 339 | "\n", 340 | "q_hat_target = q_network(env.observation_space.shape[0], env.action_space.n).to(device)\n", 341 | "q_hat_target.load_state_dict(torch.load(model_path))" 342 
| ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 16, 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "Total Reward: 200.0\n" 354 | ] 355 | } 356 | ], 357 | "source": [ 358 | "import gym\n", 359 | "import time\n", 360 | "\n", 361 | "env = gym.make(env_name)\n", 362 | "# env.reset()\n", 363 | "state = env.reset()\n", 364 | "done = False\n", 365 | "testing_epsilon = 0.05\n", 366 | "tot_reward = 0\n", 367 | "\n", 368 | "for step in range(2000):\n", 369 | " state = torch.from_numpy(state).float().to(device)\n", 370 | " env.render()\n", 371 | " action = torch.argmax(q_hat_target(state)).item()\n", 372 | " state, reward, done, _ = env.step(action)\n", 373 | " tot_reward += reward\n", 374 | " if done:\n", 375 | "# print('DONE')\n", 376 | " break\n", 377 | " time.sleep(.05)\n", 378 | " \n", 379 | "print('Total Reward:', tot_reward)\n", 380 | " \n", 381 | "env.close()" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [] 390 | } 391 | ], 392 | "metadata": { 393 | "kernelspec": { 394 | "display_name": "Python 3", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.6.9" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 2 413 | } 414 | -------------------------------------------------------------------------------- /DQN/models/CartPole-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiPatil/Value-based-RL/a4421c816572162740fbda2798e12ad787e1617c/DQN/models/CartPole-v0.pth 
-------------------------------------------------------------------------------- /DQN/rewards.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiPatil/Value-based-RL/a4421c816572162740fbda2798e12ad787e1617c/DQN/rewards.npy -------------------------------------------------------------------------------- /DQN/testing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | 8 | import math, random 9 | 10 | import gym 11 | import numpy as np 12 | 13 | import torch 14 | import torch.nn as nn 15 | import torch.optim as optim 16 | import torch.autograd as autograd 17 | import torch.nn.functional as F 18 | 19 | 20 | # In[2]: 21 | 22 | 23 | # from IPython.display import clear_output 24 | # import matplotlib.pyplot as plt 25 | # get_ipython().run_line_magic('matplotlib', 'inline') 26 | 27 | 28 | # In[3]: 29 | 30 | 31 | USE_CUDA = torch.cuda.is_available() 32 | Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs) 33 | 34 | 35 | # In[4]: 36 | 37 | 38 | from collections import deque 39 | 40 | class ReplayBuffer(object): 41 | def __init__(self, capacity): 42 | self.buffer = deque(maxlen=capacity) 43 | 44 | def push(self, state, action, reward, next_state, done): 45 | state = np.expand_dims(state, 0) 46 | next_state = np.expand_dims(next_state, 0) 47 | 48 | self.buffer.append((state, action, reward, next_state, done)) 49 | 50 | def sample(self, batch_size): 51 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size)) 52 | return np.concatenate(state), action, reward, np.concatenate(next_state), done 53 | 54 | def __len__(self): 55 | return len(self.buffer) 56 | 57 | 58 | # In[5]: 59 | 60 | 61 | env_id = "CartPole-v0" 62 | env = gym.make(env_id) 63 | 64 | 65 | # In[6]: 66 | 67 | 68 | epsilon_start = 1.0 
69 | epsilon_final = 0.01 70 | epsilon_decay = 500 71 | 72 | epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay) 73 | 74 | 75 | # In[7]: 76 | 77 | 78 | # plt.plot([epsilon_by_frame(i) for i in range(10000)]) 79 | 80 | 81 | # In[8]: 82 | 83 | 84 | class DQN(nn.Module): 85 | def __init__(self, num_inputs, num_actions): 86 | super(DQN, self).__init__() 87 | 88 | self.layers = nn.Sequential( 89 | nn.Linear(env.observation_space.shape[0], 128), 90 | nn.ReLU(), 91 | nn.Linear(128, 128), 92 | nn.ReLU(), 93 | nn.Linear(128, env.action_space.n) 94 | ) 95 | 96 | def forward(self, x): 97 | return self.layers(x) 98 | 99 | def act(self, state, epsilon): 100 | if random.random() > epsilon: 101 | state = Variable(torch.FloatTensor(state).unsqueeze(0), volatile=True) 102 | q_value = self.forward(state) 103 | action = q_value.max(1)[1].data[0] 104 | else: 105 | action = random.randrange(env.action_space.n) 106 | return action 107 | 108 | 109 | # In[9]: 110 | 111 | 112 | model = DQN(env.observation_space.shape[0], env.action_space.n) 113 | 114 | if USE_CUDA: 115 | model = model.cuda() 116 | 117 | optimizer = optim.Adam(model.parameters()) 118 | 119 | replay_buffer = ReplayBuffer(1000) 120 | 121 | 122 | # In[10]: 123 | 124 | 125 | def compute_td_loss(batch_size): 126 | state, action, reward, next_state, done = replay_buffer.sample(batch_size) 127 | 128 | state = Variable(torch.FloatTensor(np.float32(state))) 129 | next_state = Variable(torch.FloatTensor(np.float32(next_state)), volatile=True) 130 | action = Variable(torch.LongTensor(action)) 131 | reward = Variable(torch.FloatTensor(reward)) 132 | done = Variable(torch.FloatTensor(done)) 133 | 134 | q_values = model(state) 135 | next_q_values = model(next_state) 136 | 137 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) 138 | next_q_value = next_q_values.max(1)[0] 139 | expected_q_value = reward + gamma * next_q_value * (1 - done) 140 | 141 | 
loss = (q_value - Variable(expected_q_value.data)).pow(2).mean() 142 | 143 | optimizer.zero_grad() 144 | loss.backward() 145 | optimizer.step() 146 | 147 | return loss 148 | 149 | 150 | # In[11]: 151 | 152 | 153 | # def plot(frame_idx, rewards, losses): 154 | # clear_output(True) 155 | # plt.figure(figsize=(20,5)) 156 | # plt.subplot(131) 157 | # plt.title('frame %s. reward: %s' % (frame_idx, np.mean(rewards[-10:]))) 158 | # plt.plot(rewards) 159 | # plt.subplot(132) 160 | # plt.title('loss') 161 | # plt.plot(losses) 162 | # plt.show() 163 | 164 | 165 | # In[12]: 166 | 167 | 168 | num_frames = 10000 169 | batch_size = 32 170 | gamma = 0.99 171 | 172 | losses = [] 173 | all_rewards = [] 174 | episode_reward = 0 175 | 176 | state = env.reset() 177 | for frame_idx in range(1, num_frames + 1): 178 | epsilon = epsilon_by_frame(frame_idx) 179 | action = model.act(state, epsilon) 180 | 181 | next_state, reward, done, _ = env.step(action) 182 | replay_buffer.push(state, action, reward, next_state, done) 183 | 184 | state = next_state 185 | episode_reward += reward 186 | 187 | if done: 188 | state = env.reset() 189 | all_rewards.append(episode_reward) 190 | episode_reward = 0 191 | 192 | if len(replay_buffer) > batch_size: 193 | loss = compute_td_loss(batch_size) 194 | losses.append(loss.item()) 195 | 196 | # if frame_idx % 200 == 0: 197 | # plot(frame_idx, all_rewards, losses) 198 | 199 | 200 | # In[ ]: 201 | 202 | 203 | 204 | 205 | -------------------------------------------------------------------------------- /Model-Based-Learn/Makefile: -------------------------------------------------------------------------------- 1 | submit: 2 | sh collect_submission.sh 3 | 4 | clean: 5 | rm -f assignment1.zip 6 | rm -f *.pyc *.png *.npy utils/*.pyc 7 | 8 | -------------------------------------------------------------------------------- /Model-Based-Learn/__pycache__/lake_envs.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HiPatil/Value-based-RL/a4421c816572162740fbda2798e12ad787e1617c/Model-Based-Learn/__pycache__/lake_envs.cpython-36.pyc -------------------------------------------------------------------------------- /Model-Based-Learn/collect_submission.sh: -------------------------------------------------------------------------------- 1 | rm -f assignment1.zip 2 | zip -r assignment1.zip vi_and_pi.py 3 | -------------------------------------------------------------------------------- /Model-Based-Learn/discrete_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym import Env, spaces 4 | from gym.utils import seeding 5 | 6 | def categorical_sample(prob_n, np_random): 7 | """ 8 | Sample from categorical distribution 9 | Each row specifies class probabilities 10 | """ 11 | prob_n = np.asarray(prob_n) 12 | csprob_n = np.cumsum(prob_n) 13 | return (csprob_n > np_random.rand()).argmax() 14 | 15 | 16 | class DiscreteEnv(Env): 17 | 18 | """ 19 | Has the following members 20 | - nS: number of states 21 | - nA: number of actions 22 | - P: transitions (*) 23 | - isd: initial state distribution (**) 24 | 25 | (*) dictionary dict of dicts of lists, where 26 | P[s][a] == [(probability, nextstate, reward, done), ...] 
27 | (**) list or array of length nS 28 | 29 | 30 | """ 31 | def __init__(self, nS, nA, P, isd): 32 | self.P = P 33 | self.isd = isd 34 | self.lastaction=None # for rendering 35 | self.nS = nS 36 | self.nA = nA 37 | 38 | self.action_space = spaces.Discrete(self.nA) 39 | self.observation_space = spaces.Discrete(self.nS) 40 | 41 | self._seed() 42 | self._reset() 43 | 44 | def _seed(self, seed=None): 45 | self.np_random, seed = seeding.np_random(seed) 46 | return [seed] 47 | 48 | def _reset(self): 49 | self.s = categorical_sample(self.isd, self.np_random) 50 | self.lastaction=None 51 | return self.s 52 | 53 | def _step(self, a): 54 | transitions = self.P[self.s][a] 55 | i = categorical_sample([t[0] for t in transitions], self.np_random) 56 | p, s, r, d= transitions[i] 57 | self.s = s 58 | self.lastaction=a 59 | return (s, r, d, {"prob" : p}) 60 | -------------------------------------------------------------------------------- /Model-Based-Learn/frozen_lake.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | from six import StringIO, b 4 | from gym import utils 5 | import discrete_env 6 | 7 | # Mapping between directions and index number 8 | LEFT = 0 9 | DOWN = 1 10 | RIGHT = 2 11 | UP = 3 12 | 13 | # Maps for the two different environments 14 | MAPS = { 15 | "4x4": [ 16 | "SFFF", 17 | "FHFH", 18 | "FFFH", 19 | "HFFG" 20 | ], 21 | "8x8": [ 22 | "SFFFFFFF", 23 | "FFFFFFFF", 24 | "FFFHFFFF", 25 | "FHFFFHFF", 26 | "FFFHFFFF", 27 | "FFHFFFHF", 28 | "FHFFHFHF", 29 | "FFFHFFFG" 30 | ], 31 | } 32 | 33 | class FrozenLakeEnv(discrete_env.DiscreteEnv): 34 | """ 35 | Winter is here. You and your friends were tossing around a frisbee at the park 36 | when you made a wild throw that left the frisbee out in the middle of the lake. 37 | The water is mostly frozen, but there are a few holes where the ice has melted. 38 | If you step into one of those holes, you'll fall into the freezing water. 
39 | At this time, there's an international frisbee shortage, so it's absolutely imperative that 40 | you navigate across the lake and retrieve the disc. 41 | However, the ice is slippery, so you won't always move in the direction you intend. 42 | The surface is described using a grid like the following 43 | 44 | SFFF 45 | FHFH 46 | FFFH 47 | HFFG 48 | 49 | S : starting point, safe 50 | F : frozen surface, safe 51 | H : hole, fall to your doom 52 | G : goal, where the frisbee is located 53 | 54 | The episode ends when you reach the goal or fall in a hole. 55 | You receive a reward of 1 if you reach the goal, and zero otherwise. 56 | 57 | """ 58 | 59 | metadata = {'render.modes': ['human', 'ansi']} 60 | 61 | def __init__(self, desc=None, map_name="4x4",is_slippery=True): 62 | if desc is None and map_name is None: 63 | raise ValueError('Must provide either desc or map_name') 64 | elif desc is None: 65 | desc = MAPS[map_name] 66 | self.desc = desc = np.asarray(desc,dtype='c') 67 | self.nrow, self.ncol = nrow, ncol = desc.shape 68 | 69 | nA = 4 # number of actions 70 | nS = nrow * ncol # number of states 71 | 72 | isd = np.array(desc == b'S').astype('float64').ravel() 73 | isd /= isd.sum() 74 | 75 | P = {s : {a : [] for a in range(nA)} for s in range(nS)} 76 | 77 | def to_s(row, col): 78 | return row*ncol + col 79 | def inc(row, col, a): 80 | if a==0: # left 81 | col = max(col-1,0) 82 | elif a==1: # down 83 | row = min(row+1,nrow-1) 84 | elif a==2: # right 85 | col = min(col+1,ncol-1) 86 | elif a==3: # up 87 | row = max(row-1,0) 88 | return (row, col) 89 | 90 | for row in range(nrow): 91 | for col in range(ncol): 92 | s = to_s(row, col) 93 | for a in range(4): 94 | li = P[s][a] 95 | letter = desc[row, col] 96 | if letter in b'GH': 97 | li.append((1.0, s, 0, True)) 98 | else: 99 | if is_slippery: 100 | for b in [(a-1)%4, a, (a+1)%4]: 101 | newrow, newcol = inc(row, col, b) 102 | newstate = to_s(newrow, newcol) 103 | newletter = desc[newrow, newcol] 104 | done = 
bytes(newletter) in b'GH' 105 | rew = float(newletter == b'G') 106 | li.append((0.8 if b==a else 0.1, newstate, rew, done)) 107 | else: 108 | newrow, newcol = inc(row, col, a) 109 | newstate = to_s(newrow, newcol) 110 | newletter = desc[newrow, newcol] 111 | done = bytes(newletter) in b'GH' 112 | rew = float(newletter == b'G') 113 | li.append((1.0, newstate, rew, done)) 114 | 115 | super(FrozenLakeEnv, self).__init__(nS, nA, P, isd) 116 | 117 | def _render(self, mode='human', close=False): 118 | if close: 119 | return 120 | outfile = StringIO() if mode == 'ansi' else sys.stdout 121 | 122 | row, col = self.s // self.ncol, self.s % self.ncol 123 | desc = self.desc.tolist() 124 | desc = [[c.decode('utf-8') for c in line] for line in desc] 125 | desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True) 126 | if self.lastaction is not None: 127 | outfile.write(" ({})\n".format(["Left","Down","Right","Up"][self.lastaction])) 128 | else: 129 | outfile.write("\n") 130 | outfile.write("\n".join(''.join(line) for line in desc)+"\n") 131 | 132 | return outfile 133 | -------------------------------------------------------------------------------- /Model-Based-Learn/lake_envs.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Defines some frozen lake maps.""" 3 | from gym.envs.toy_text import frozen_lake, discrete 4 | from gym.envs.registration import register 5 | 6 | 7 | register( 8 | id='Deterministic-4x4-FrozenLake-v0', 9 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv', 10 | kwargs={'map_name': '4x4', 11 | 'is_slippery': False}) 12 | 13 | register( 14 | id='Deterministic-8x8-FrozenLake-v0', 15 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv', 16 | kwargs={'map_name': '8x8', 17 | 'is_slippery': False}) 18 | 19 | register( 20 | id='Stochastic-4x4-FrozenLake-v0', 21 | entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv', 22 | kwargs={'map_name': '4x4', 23 | 'is_slippery': 
True}) 24 | -------------------------------------------------------------------------------- /Model-Based-Learn/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.9 2 | matplotlib 3 | numpy 4 | scipy 5 | -------------------------------------------------------------------------------- /Model-Based-Learn/vi_and_pi.py: -------------------------------------------------------------------------------- 1 | ### MDP Value Iteration and Policy Iteration 2 | 3 | import numpy as np 4 | import gym 5 | import time 6 | from lake_envs import * 7 | import random 8 | 9 | np.set_printoptions(precision=3) 10 | 11 | """ 12 | For policy_evaluation, policy_improvement, policy_iteration and value_iteration, 13 | the parameters P, nS, nA, gamma are defined as follows: 14 | 15 | P: nested dictionary 16 | From gym.core.Environment 17 | For each pair of states in [1, nS] and actions in [1, nA], P[state][action] is a 18 | tuple of the form (probability, nextstate, reward, terminal) where 19 | - probability: float 20 | the probability of transitioning from "state" to "nextstate" with "action" 21 | - nextstate: int 22 | denotes the state we transition to (in range [0, nS - 1]) 23 | - reward: int 24 | either 0 or 1, the reward for transitioning from "state" to 25 | "nextstate" with "action" 26 | - terminal: bool 27 | True when "nextstate" is a terminal state (hole or goal), False otherwise 28 | nS: int 29 | number of states in the environment 30 | nA: int 31 | number of actions in the environment 32 | gamma: float 33 | Discount factor. Number in range [0, 1) 34 | """ 35 | 36 | def policy_evaluation(P, nS, nA, policy, gamma=0.9, tol=1e-3): 37 | """Evaluate the value function from a given policy. 38 | 39 | Parameters 40 | ---------- 41 | P, nS, nA, gamma: 42 | defined at beginning of file 43 | policy: np.array[nS] 44 | The policy to evaluate. Maps states to actions. 
45 | tol: float 46 | Terminate policy evaluation when 47 | max |value_function(s) - prev_value_function(s)| < tol 48 | Returns 49 | ------- 50 | value_function: np.ndarray[nS] 51 | The value function of the given policy, where value_function[s] is 52 | the value of state s 53 | """ 54 | # print(nA) 55 | v_new = np.zeros(nS, dtype=float) 56 | ############################ 57 | # YOUR IMPLEMENTATION HERE # 58 | for i in range(1000): 59 | value=v_new.copy() 60 | for state in range(nS): 61 | # for action in range(nA): 62 | # print(P[state][policy[state]]) 63 | for probability, nextstate, reward, terminal in P[state][policy[state]]: 64 | v_new[state] = probability*(reward+gamma*v_new[nextstate]) 65 | 66 | if terminal: 67 | v_new[state] = reward 68 | 69 | if(np.all(np.abs(value-v_new) valMax: 105 | new_policy[state] = action 106 | valMax = Qval 107 | elif Qval == valMax: 108 | if random.random()<0.5: 109 | new_policy[state]=action 110 | 111 | return new_policy 112 | ############################ 113 | 114 | 115 | 116 | def policy_iteration(P, nS, nA, gamma=0.9, tol=10e-3): 117 | """Runs policy iteration. 118 | 119 | You should call the policy_evaluation() and policy_improvement() methods to 120 | implement this method. 
121 | 122 | Parameters 123 | ---------- 124 | P, nS, nA, gamma: 125 | defined at beginning of file 126 | tol: float 127 | tol parameter used in policy_evaluation() 128 | Returns: 129 | ---------- 130 | value_function: np.ndarray[nS] 131 | policy: np.ndarray[nS] 132 | """ 133 | 134 | value_function = np.zeros(nS) 135 | policy = np.zeros(nS, dtype=int) 136 | 137 | ############################ 138 | # YOUR IMPLEMENTATION HERE # 139 | for s in range(nS): 140 | policy[s] = 1#s%nA 141 | for i in range(1000): 142 | v_new = policy_evaluation(P, nS, nA, policy, gamma) 143 | policy_new = policy_improvement(P, nS, nA, v_new, policy, gamma) 144 | # if np.all(np.abs(value_function - v_new) < tol): 145 | # print('tatti') 146 | # break 147 | 148 | value_function = v_new.copy() 149 | policy = policy_new.copy() 150 | 151 | print(value_function) 152 | 153 | 154 | ############################ 155 | return value_function, policy 156 | 157 | def value_iteration(P, nS, nA, gamma=0.9, tol=1e-3): 158 | """ 159 | Learn value function and policy by using value iteration method for a given 160 | gamma and environment. 
class ENVIRONMENT():
    """Square grid world containing one player, enemies and food.

    Only the player moves (via ``step``); enemies and food keep the cells
    they were given at construction time.  An episode ends when the player
    stands on an enemy cell (reward -100) or a food cell (reward +100);
    every other step yields a reward of -1.

    Parameters
    ----------
    num_player : int
        Accepted for interface compatibility; a single player is always
        created.
    num_enemy, num_food : int
        Number of enemy / food blobs to create.
    size : int
        Side length of the grid.
    diagonal : bool
        Forwarded to ``Blob.act`` on every step.
    """

    def __init__(self, num_player=1, num_enemy=1, num_food=1, size=10, diagonal=False):
        self.size = size
        self.naction = 4
        self.diagonal = diagonal
        self.num_enemy = num_enemy
        self.num_food = num_food
        # Every blob is built with the grid size, mirroring the player.
        # NOTE(review): assumes Blob's first positional argument is the grid
        # size (as used for the player) -- confirm against Blob.__init__.
        self.player = Blob(size)
        self.enemy = [Blob(size) for _ in range(num_enemy)]
        self.food = [Blob(size) for _ in range(num_food)]
        self.reward = 0
        # Colour keys: 1 = player, 2 = food, 3 = enemy (see render()).
        self.colors = {1: (255, 0, 0),
                       2: (0, 255, 0),
                       3: (0, 0, 255)}
        # Remember the initial layout so startover() can restore it.
        self.px, self.py = self.player.x, self.player.y
        self.ex = [blob.x for blob in self.enemy]
        self.ey = [blob.y for blob in self.enemy]
        self.fx = [blob.x for blob in self.food]
        self.fy = [blob.y for blob in self.food]

    def startover(self, newpos=False):
        """Reset the board to its initial layout and begin a new episode.

        Parameters
        ----------
        newpos : bool
            When true, the player is replaced by a freshly constructed
            ``Blob`` instead of returning to its initial cell.

        Returns
        -------
        tuple
            ``((x, y), reward, done)`` with ``reward == 0`` and
            ``done == False``.
        """
        self.player.x, self.player.y = self.px, self.py
        for blob, x, y in zip(self.enemy, self.ex, self.ey):
            blob.x, blob.y = x, y
        for blob, x, y in zip(self.food, self.fx, self.fy):
            blob.x, blob.y = x, y
        if newpos:
            self.player = Blob(self.size)
        self.reward = 0

        return (self.player.x, self.player.y), self.reward, False

    def step(self, action):
        """Apply ``action`` to the player and score the resulting position.

        Returns
        -------
        tuple
            ``((x, y), (reward, done))``.  The reward and the terminal flag
            are deliberately nested in one tuple because callers unpack the
            result as ``state, (reward, done) = env.step(action)``.
        """
        self.player.act(action, self.diagonal)
        self.reward = self.calculate_reward()
        return (self.player.x, self.player.y), self.reward

    def calculate_reward(self):
        """Return ``(reward, done)`` for the player's current cell.

        A collision requires the player to share *both* coordinates with the
        same blob, so positions are compared as (x, y) pairs.  Enemies are
        checked before food.
        """
        position = (self.player.x, self.player.y)

        if any(position == (blob.x, blob.y) for blob in self.enemy):
            return -100, True

        if any(position == (blob.x, blob.y) for blob in self.food):
            return 100, True

        return -1, False

    def render(self, renderTime=100):
        """Draw the board in an OpenCV window for ``renderTime`` milliseconds.

        The player is painted last so it stays visible when it shares a cell
        with food or an enemy.  The window is left open; closing it is the
        caller's job.
        """
        board = np.zeros((self.size, self.size, 3), dtype=np.uint8)
        for blob in self.food:
            board[blob.x][blob.y] = self.colors[2]
        for blob in self.enemy:
            board[blob.x][blob.y] = self.colors[3]
        board[self.player.x][self.player.y] = self.colors[1]
        img = Image.fromarray(board, 'RGB')
        img = img.resize((300, 300))
        cv2.imshow("image", np.array(img))
        cv2.waitKey(renderTime)

    def sample_action(self):
        """Return a uniformly random action index in ``[0, naction)``."""
        return np.random.randint(0, self.naction)
class parameter():
    """Hyper-parameters for tabular Q-learning.

    Parameters
    ----------
    size : int
        Side length of the grid the Q-table covers.
    episode : int
        Number of training episodes.
    discount : float
        Discount factor applied to the bootstrapped future value.
    epsilon : float
        Initial exploration probability of the epsilon-greedy policy.
    learning_rate : float
        Step size of the Q-value update.
    render_every : int
        Render every N-th episode; 0 disables rendering.
    verbose_every : int
        Print progress every N-th episode; 0 disables printing.
    EPS_DECAY : float
        Factor ``epsilon`` is multiplied by in ``decay_epsilon``.
    random_start : bool
        Passed to ``env.startover(newpos=...)`` at the start of each episode.
    """

    def __init__(self, size, episode, discount, epsilon, learning_rate,
                 render_every=500, verbose_every=500, EPS_DECAY=0.998,
                 random_start=True):
        # Store the constructor arguments themselves rather than the
        # module-level globals `episodes` / `display_every`.
        self.episode = episode
        self.discount = discount
        self.size = size
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.render_every = render_every
        self.random_start = random_start
        self.verbose_every = verbose_every
        self.EPS_DECAY = EPS_DECAY

    def decay_epsilon(self):
        """Multiply the exploration rate by ``EPS_DECAY``."""
        self.epsilon *= self.EPS_DECAY


def q_table(size, action):
    """Return a ``(size, size, action)`` Q-table of standard-normal samples."""
    return np.random.randn(size, size, action)


def q_improve(env, q, parameter, verbose=True):
    """Train the Q-table ``q`` in place with epsilon-greedy Q-learning.

    Parameters
    ----------
    env
        Environment exposing ``startover(newpos=...) -> (state, reward, done)``,
        ``step(action) -> (state, (reward, done))``, ``sample_action()`` and
        ``render(ms)``.  States are ``(x, y)`` index pairs into ``q``.
    q : np.ndarray
        Q-table indexed as ``q[x, y, action]``; updated in place.
    parameter
        Hyper-parameter object (see the ``parameter`` class).  Its ``epsilon``
        is decayed once per episode.
    verbose : bool
        Print a progress line every ``parameter.verbose_every`` episodes.

    Returns
    -------
    np.ndarray
        The same array object as ``q``.
    """
    total_reward = 0
    episodes_in_window = 0  # episodes accumulated since the last report
    for episode in range(parameter.episode):
        state, _, terminal = env.startover(newpos=parameter.random_start)
        # A period of 0 means "never" instead of raising ZeroDivisionError.
        render_episode = bool(parameter.render_every) and episode % parameter.render_every == 0
        rendered = False
        while not terminal:
            row, col = state[0], state[1]
            current_q = q[row, col, :]
            # Epsilon-greedy: exploit with probability 1 - epsilon.
            if random.random() > parameter.epsilon:
                action = np.argmax(current_q)
            else:
                action = env.sample_action()
            # Take the step and observe the outcome.
            next_state, (next_reward, terminal) = env.step(action)
            total_reward += next_reward

            # A terminal state has no future, so its value must not be
            # bootstrapped: the target is the reward alone.
            if terminal:
                target = next_reward
            else:
                future_q = q[next_state[0], next_state[1], :]
                target = next_reward + parameter.discount * np.max(future_q)

            q[row, col, action] = current_q[action] + parameter.learning_rate * (target - current_q[action])

            if render_episode:
                env.render(100)
                rendered = True
            state = next_state
        parameter.decay_epsilon()
        # Only tear down the window if this episode actually opened one.
        if rendered:
            cv2.destroyAllWindows()
        episodes_in_window += 1
        if verbose and parameter.verbose_every and episode % parameter.verbose_every == 0:
            print('Episode: ', episode, 'state:', state, '| Total Average Reward:', total_reward / episodes_in_window, '| Epsilon:', parameter.epsilon)
            total_reward = 0
            episodes_in_window = 0
    return q