├── .gitignore ├── 01.DQN.ipynb ├── 02.NStep_DQN.ipynb ├── 03.Double_DQN.ipynb ├── 04.Dueling_DQN.ipynb ├── 05.DQN-NoisyNets.ipynb ├── 06.DQN_PriorityReplay.ipynb ├── 07.Categorical-DQN.ipynb ├── 08.Rainbow.ipynb ├── 09.QuantileRegression-DQN.ipynb ├── 10.Quantile-Rainbow.ipynb ├── 11.DRQN.ipynb ├── 12.A2C.ipynb ├── 13.GAE.ipynb ├── 14.PPO.ipynb ├── README.md ├── a2c_devel.py ├── agents ├── A2C.py ├── BaseAgent.py ├── Categorical_DQN.py ├── DQN.py ├── DRQN.py ├── Double_DQN.py ├── Dueling_DQN.py ├── PPO.py ├── QuantileRegression_DQN.py ├── Quantile_Rainbow.py ├── Rainbow.py └── __init__.py ├── dqn_devel.py ├── networks ├── __init__.py ├── layers.py ├── network_bodies.py └── networks.py ├── saved_agents ├── __init__.py ├── model.dump └── optim.dump └── utils ├── ReplayMemory.py ├── RolloutStorage.py ├── __init__.py ├── data_structures.py ├── hyperparameters.py ├── plot.py └── wrappers.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optmized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # Ipython Checkpoints 6 | .ipynb_checkpoints/ 7 | 8 | #VSCode Meta 9 | .vscode/ 10 | 11 | #linting 12 | .mypy_cache/ -------------------------------------------------------------------------------- /04.Dueling_DQN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dueling Deep Q Network" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import gym\n", 24 | "import numpy as np\n", 25 | "\n", 26 | "import torch\n", 27 | "import torch.nn as nn\n", 28 | "import torch.nn.functional as F\n", 29 | "import torch.optim as optim\n", 30 | "\n", 31 | "from IPython.display import clear_output\n", 32 | "from matplotlib import pyplot as plt\n", 33 | "%matplotlib inline\n", 34 | "\n", 35 | "from timeit import default_timer as timer\n", 36 | "from datetime import timedelta\n", 37 | "import math\n", 38 | "\n", 39 | "from utils.wrappers import *\n", 40 | "from agents.DQN import Model as DQN_Agent\n", 41 | "from utils.ReplayMemory import ExperienceReplayMemory\n", 42 | "\n", 43 | "from utils.hyperparameters import Config" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Hyperparameters" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "config = Config()\n", 60 | "\n", 61 | "config.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 62 | "\n", 63 | "#epsilon variables\n", 64 | "config.epsilon_start = 1.0\n", 65 | "config.epsilon_final = 0.01\n", 66 | "config.epsilon_decay = 30000\n", 67 | "config.epsilon_by_frame = lambda frame_idx: config.epsilon_final + (config.epsilon_start - config.epsilon_final) * math.exp(-1. * frame_idx / config.epsilon_decay)\n", 68 | "\n", 69 | "#misc agent variables\n", 70 | "config.GAMMA=0.99\n", 71 | "config.LR=1e-4\n", 72 | "\n", 73 | "#memory\n", 74 | "config.TARGET_NET_UPDATE_FREQ = 1000\n", 75 | "config.EXP_REPLAY_SIZE = 100000\n", 76 | "config.BATCH_SIZE = 32\n", 77 | "\n", 78 | "#Learning control variables\n", 79 | "config.LEARN_START = 10000\n", 80 | "config.MAX_FRAMES=1000000\n", 81 | "\n", 82 | "#Nstep controls\n", 83 | "config.N_STEPS=1" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Network" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 3, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "class DuelingDQN(nn.Module):\n", 100 | " def __init__(self, input_shape, num_outputs):\n", 101 | " super(DuelingDQN, self).__init__()\n", 102 | " \n", 103 | " self.input_shape = input_shape\n", 104 | " self.num_actions = num_outputs\n", 105 | " \n", 106 | " self.conv1 = nn.Conv2d(self.input_shape[0], 32, kernel_size=8, stride=4)\n", 107 | " self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)\n", 108 | " self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)\n", 109 | "\n", 110 | " self.adv1 = nn.Linear(self.feature_size(), 512)\n", 111 | " self.adv2 = nn.Linear(512, self.num_actions)\n", 112 | "\n", 113 | " self.val1 = nn.Linear(self.feature_size(), 512)\n", 114 | " self.val2 = nn.Linear(512, 1)\n", 115 | " \n", 116 | " def forward(self, x):\n", 117 | " x = F.relu(self.conv1(x))\n", 118 | " x = F.relu(self.conv2(x))\n", 119 | " x = F.relu(self.conv3(x))\n", 120 | " x = x.view(x.size(0), -1)\n", 121 | "\n", 122 | " adv = F.relu(self.adv1(x))\n", 123 | " adv = self.adv2(adv)\n", 124 | "\n", 125 | " val = F.relu(self.val1(x))\n", 126 | " val = self.val2(val)\n", 127 | "\n", 128 | " return val + adv - adv.mean()\n", 129 | " \n", 130 | " def feature_size(self):\n", 131 | " return self.conv3(self.conv2(self.conv1(torch.zeros(1, *self.input_shape)))).view(1, -1).size(1)\n", 132 | " \n", 133 | " def sample_noise(self):\n", 134 | " #ignore this for now\n", 135 | " pass" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "## Agent" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 4, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "class Model(DQN_Agent):\n", 152 | " def __init__(self, static_policy=False, env=None, config=None):\n", 153 | " super(Model, self).__init__(static_policy, env, config)\n", 154 | "\n", 155 | " def declare_networks(self):\n", 156 | " self.model = DuelingDQN(self.env.observation_space.shape, self.env.action_space.n)\n", 157 | " self.target_model = DuelingDQN(self.env.observation_space.shape, self.env.action_space.n)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Plot Results" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 5, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "def plot(frame_idx, rewards, losses, sigma, elapsed_time):\n", 174 | " clear_output(True)\n", 175 | " plt.figure(figsize=(20,5))\n", 176 | " plt.subplot(131)\n", 177 | " plt.title('frame %s. reward: %s. time: %s' % (frame_idx, np.mean(rewards[-10:]), elapsed_time))\n", 178 | " plt.plot(rewards)\n", 179 | " if losses:\n", 180 | " plt.subplot(132)\n", 181 | " plt.title('loss')\n", 182 | " plt.plot(losses)\n", 183 | " if sigma:\n", 184 | " plt.subplot(133)\n", 185 | " plt.title('noisy param magnitude')\n", 186 | " plt.plot(sigma)\n", 187 | " plt.show()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "## Training Loop" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "scrolled": true 202 | }, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "image/png": "\n", 207 | "text/plain": [ 208 | "
" 209 | ] 210 | }, 211 | "metadata": {}, 212 | "output_type": "display_data" 213 | } 214 | ], 215 | "source": [ 216 | "start=timer()\n", 217 | "\n", 218 | "env_id = \"PongNoFrameskip-v4\"\n", 219 | "env = make_atari(env_id)\n", 220 | "env = wrap_deepmind(env, frame_stack=False)\n", 221 | "env = wrap_pytorch(env)\n", 222 | "model = Model(env=env, config=config)\n", 223 | "\n", 224 | "episode_reward = 0\n", 225 | "\n", 226 | "observation = env.reset()\n", 227 | "for frame_idx in range(1, config.MAX_FRAMES + 1):\n", 228 | " epsilon = config.epsilon_by_frame(frame_idx)\n", 229 | "\n", 230 | " action = model.get_action(observation, epsilon)\n", 231 | " prev_observation=observation\n", 232 | " observation, reward, done, _ = env.step(action)\n", 233 | " observation = None if done else observation\n", 234 | "\n", 235 | " model.update(prev_observation, action, reward, observation, frame_idx)\n", 236 | " episode_reward += reward\n", 237 | "\n", 238 | " if done:\n", 239 | " model.finish_nstep()\n", 240 | " model.reset_hx()\n", 241 | " observation = env.reset()\n", 242 | " model.save_reward(episode_reward)\n", 243 | " episode_reward = 0\n", 244 | " \n", 245 | " if np.mean(model.rewards[-10:]) > 19:\n", 246 | " plot(frame_idx, model.rewards, model.losses, model.sigma_parameter_mag, timedelta(seconds=int(timer()-start)))\n", 247 | " break\n", 248 | "\n", 249 | " if frame_idx % 10000 == 0:\n", 250 | " plot(frame_idx, model.rewards, model.losses, model.sigma_parameter_mag, timedelta(seconds=int(timer()-start)))\n", 251 | "\n", 252 | "model.save_w()\n", 253 | "env.close()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 3", 267 | "language": "python", 268 | "name": "python3" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 3 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython3", 280 | "version": "3.6.5" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 2 285 | } 286 | -------------------------------------------------------------------------------- /11.DRQN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Deep Recurrent Q Network" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import gym\n", 24 | "import numpy as np\n", 25 | "\n", 26 | "import torch\n", 27 | "import torch.nn as nn\n", 28 | "import torch.nn.functional as F\n", 29 | "import torch.optim as optim\n", 30 | "\n", 31 | "from IPython.display import clear_output\n", 32 | "from matplotlib import pyplot as plt\n", 33 | "%matplotlib inline\n", 34 | "\n", 35 | "from timeit import default_timer as timer\n", 36 | "from datetime import timedelta\n", 37 | "import math\n", 38 | "import random\n", 39 | "\n", 40 | "from utils.wrappers import *\n", 41 | "\n", 42 | "from agents.DQN import Model as DQN_Agent\n", 43 | "\n", 44 | "from networks.network_bodies import SimpleBody, AtariBody\n", 45 | "\n", 46 | "from utils.ReplayMemory import ExperienceReplayMemory\n", 47 | "from utils.hyperparameters import Config" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Hyperparameters" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "config = Config()\n", 64 | "\n", 65 | "config.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 66 | "device = config.device\n", 67 | "\n", 68 | "#epsilon variables\n", 69 | "config.epsilon_start = 1.0\n", 70 | "config.epsilon_final = 0.01\n", 71 | "config.epsilon_decay = 30000\n", 72 | "config.epsilon_by_frame = lambda frame_idx: config.epsilon_final + (config.epsilon_start - config.epsilon_final) * math.exp(-1. * frame_idx / config.epsilon_decay)\n", 73 | "\n", 74 | "#misc agent variables\n", 75 | "config.GAMMA=0.99\n", 76 | "config.LR=1e-4\n", 77 | "\n", 78 | "#memory\n", 79 | "config.TARGET_NET_UPDATE_FREQ = 1024\n", 80 | "config.EXP_REPLAY_SIZE = 10000\n", 81 | "config.BATCH_SIZE = 32\n", 82 | "\n", 83 | "#Learning control variables\n", 84 | "config.LEARN_START = 10000\n", 85 | "config.MAX_FRAMES=1500000\n", 86 | "\n", 87 | "#Nstep controls\n", 88 | "config.N_STEPS=1\n", 89 | "\n", 90 | "#DRQN Parameters\n", 91 | "config.SEQUENCE_LENGTH=8" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## Replay Buffer" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 3, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "class RecurrentExperienceReplayMemory:\n", 108 | " def __init__(self, capacity, sequence_length=10):\n", 109 | " self.capacity = capacity\n", 110 | " self.memory = []\n", 111 | " self.seq_length=sequence_length\n", 112 | "\n", 113 | " def push(self, transition):\n", 114 | " self.memory.append(transition)\n", 115 | " if len(self.memory) > self.capacity:\n", 116 | " del self.memory[0]\n", 117 | "\n", 118 | " def sample(self, batch_size):\n", 119 | " finish = random.sample(range(0, len(self.memory)), batch_size)\n", 120 | " begin = [x-self.seq_length for x in finish]\n", 121 | " samp = []\n", 122 | " for start, end in zip(begin, finish):\n", 123 | " #correct for sampling near beginning\n", 124 | " final = self.memory[max(start+1,0):end+1]\n", 125 | " \n", 126 | " #correct for sampling across episodes\n", 127 | " for i in range(len(final)-2, -1, -1):\n", 128 | " if final[i][3] is None:\n", 129 | " final = final[i+1:]\n", 130 | " break\n", 131 | " \n", 132 | " #pad beginning to account for corrections\n", 133 | " while(len(final)