├── .gitignore ├── Colab Starter For RL.ipynb ├── DQN ├── DQN_torch.ipynb └── README.md ├── LICENSE ├── Policy Gradients and Actor Critic ├── ActorCritic_torch.ipynb ├── PolicyGradient_torch.ipynb ├── README.md └── Synchronous_A2C_torch.ipynb ├── README.md ├── environment.yml ├── my_path_in_RL.md └── suggested_path_in_RL.md /.gitignore: -------------------------------------------------------------------------------- 1 | ./*/.ipynb_checkpoints/ 2 | .ipynb_checkpoints/ 3 | video/ 4 | checkpoint.pth 5 | .vscode/ 6 | ./*/video/ -------------------------------------------------------------------------------- /Colab Starter For RL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Colab specific cells" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!apt update\n", 17 | "!pip install pyvirtualdisplay\n", 18 | "!apt-get install -y xvfb python-opengl ffmpeg\n", 19 | "!pip install gym[atari,box2d,classic_control]" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# from google.colab import drive\n", 29 | "# drive.mount('/content/gdrive')\n", 30 | "\n", 31 | "# root_path = 'gdrive/My Drive/Colab Notebooks/RL/'\n", 32 | "# import os\n", 33 | "# os.chdir(root_path)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# %tensorflow_version 2.x\n", 43 | "# %tensorflow_version 1.x\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "from pyvirtualdisplay import Display\n", 53 | "display = Display(visible=0, size=(1400, 900))\n", 54 | "display.start()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import glob\n", 64 | "import io\n", 65 | "from IPython.display import HTML\n", 66 | "from IPython import display as ipythondisplay\n", 67 | "import base64\n", 68 | "\n", 69 | "\"\"\"\n", 70 | "Utility functions to enable video recording of gym environment and displaying it\n", 71 | "To enable video, just do \"env = wrap_env(env)\"\"\n", 72 | "\"\"\"\n", 73 | "\n", 74 | "def show_video():\n", 75 | " mp4list = glob.glob('video/*.mp4')\n", 76 | " if len(mp4list) > 0:\n", 77 | " mp4 = mp4list[-1]\n", 78 | " video = io.open(mp4, 'r+b').read()\n", 79 | " encoded = base64.b64encode(video)\n", 80 | "\n", 81 | " # you can add \"loop\" after autoplay to keep the video looping after it ends\n", 82 | " ipythondisplay.display(HTML(data=''''''.format(encoded.decode('ascii'))))\n", 86 | " else: \n", 87 | " print(\"Could not find video\")\n", 88 | "\n", 89 | " \n", 90 | "\n", 91 | "from gym.wrappers import Monitor\n", 92 | "\n", 93 | "def wrap_env(env):\n", 94 | " env = Monitor(env, './video', force=True)\n", 95 | " return env" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## If not using colab change the value to false" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "using_colab = False\n", 112 | "\n", 113 | "import torch\n", 114 | "\n", 115 | "device = torch.device(\"cuda:0\" if 
torch.cuda.is_available() else \"cpu\")\n", 116 | "\n", 117 | "if \"cuda\" in str(device):\n", 118 | " print(\"Using GPU\")\n", 119 | "else:\n", 120 | " print(\"Not using GPU\")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "# Add your algorithm here" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python [conda env:rl] *", 141 | "language": "python", 142 | "name": "conda-env-rl-py" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.8.5" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 4 159 | } 160 | -------------------------------------------------------------------------------- /DQN/DQN_torch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Colab specific cells" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!apt update\n", 17 | "!pip install pyvirtualdisplay\n", 18 | "!apt-get install -y xvfb python-opengl ffmpeg\n", 19 | "!pip install gym[atari,box2d,classic_control]" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# from google.colab import drive\n", 29 | "# drive.mount('/content/gdrive')\n", 30 | "\n", 31 | "# root_path = 'gdrive/My Drive/Colab Notebooks/RL/'\n", 32 | "# import os\n", 33 | "# os.chdir(root_path)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# %tensorflow_version 2.x\n", 43 | "# %tensorflow_version 1.x\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "from pyvirtualdisplay import Display\n", 53 | "display = Display(visible=0, size=(1400, 900))\n", 54 | "display.start()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import glob\n", 64 | "import io\n", 65 | "from IPython.display import HTML\n", 66 | "from IPython import display as ipythondisplay\n", 67 | "import base64\n", 68 | "\n", 69 | "\"\"\"\n", 70 | "Utility functions to enable video recording of gym environment and displaying it\n", 71 | "To enable video, just do \"env = wrap_env(env)\"\"\n", 72 | "\"\"\"\n", 73 | "\n", 74 | "def show_video():\n", 75 | " mp4list = glob.glob('video/*.mp4')\n", 76 | " if len(mp4list) > 0:\n", 77 | " mp4 = mp4list[-1]\n", 78 | " video = io.open(mp4, 'r+b').read()\n", 79 | " encoded = base64.b64encode(video)\n", 80 | "\n", 81 | " # you can add \"loop\" after autoplay to keep the video looping after it ends\n", 82 | " ipythondisplay.display(HTML(data=''''''.format(encoded.decode('ascii'))))\n", 86 | " else: \n", 87 | " print(\"Could not find video\")\n", 88 | "\n", 89 | "\n", 90 | "from gym.wrappers import Monitor\n", 91 | "\n", 92 | "def wrap_env(env):\n", 93 | " env = Monitor(env, './video', 
force=True)\n", 94 | " return env" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## If not using colab change the value to false" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "using_colab = False\n", 111 | "\n", 112 | "import torch\n", 113 | "\n", 114 | "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", 115 | "\n", 116 | "if \"cuda\" in str(device):\n", 117 | " print(\"Using GPU\")\n", 118 | "else:\n", 119 | " print(\"Not using GPU\")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "# DQN" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "import gym\n", 136 | "from gym import logger as gymlogger\n", 137 | "gymlogger.set_level(40) #error only\n", 138 | "\n", 139 | "import numpy as np\n", 140 | "import random\n", 141 | "import math\n", 142 | "\n", 143 | "import matplotlib\n", 144 | "import matplotlib.pyplot as plt\n", 145 | "%matplotlib inline" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "env_list = [\"LunarLander-v2\", \"MsPacman-ram-v0\", \"CartPole-v0\", \"MountainCar-v0\",\n", 155 | " \"Breakout-ram-v4\", \"Acrobot-v1\"]\n", 156 | "\n", 157 | "env_to_use = env_list[2]" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "### Observing the environment using random actions" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "env = gym.make(env_to_use)\n", 174 | "if using_colab:\n", 175 | " env = wrap_env(env)\n", 176 | "\n", 177 | "#check out the environment's action and observation space!\n", 178 | "print(env.action_space)\n", 179 | "print(env.observation_space)\n", 180 | "\n", 181 | "from gym import spaces\n", 182 | "assert type(env.observation_space)==spaces.Box, print(\"State space should be continuous\")\n", 183 | "assert len(env.observation_space.shape)==1, print(\"State space should be 1-D\")\n", 184 | "assert type(env.action_space)==spaces.Discrete, print(\"Action space should be discrete\")\n", 185 | "\n", 186 | "observation = env.reset()\n", 187 | "while True:\n", 188 | " env.render()\n", 189 | " #your agent goes here\n", 190 | " action = env.action_space.sample() # selecting a random action from the action space\n", 191 | " observation, reward, done, info = env.step(action) \n", 192 | " if done: \n", 193 | " break\n", 194 | " # break\n", 195 | "\n", 196 | "env.close()\n", 197 | "if using_colab:\n", 198 | " show_video()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "### Q-Network definition" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "import torch\n", 215 | "import torch.nn as nn\n", 216 | "import torch.nn.functional as F\n", 217 | "\n", 218 | "class QNetwork(nn.Module):\n", 219 | " \"\"\"Actor (Policy) Model.\"\"\"\n", 220 | "\n", 221 | " def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):\n", 222 | " \"\"\"Initialize parameters and build model.\n", 223 | " Params\n", 224 | " ======\n", 225 | " state_size 
(int): Dimension of each state\n", 226 | " action_size (int): Dimension of each action\n", 227 | " seed (int): Random seed\n", 228 | " fc1_units (int): Number of nodes in first hidden layer\n", 229 | " fc2_units (int): Number of nodes in second hidden layer\n", 230 | " \"\"\"\n", 231 | " super(QNetwork, self).__init__()\n", 232 | " self.seed = torch.manual_seed(seed)\n", 233 | " self.fc1 = nn.Linear(state_size, fc1_units)\n", 234 | " self.fc2 = nn.Linear(fc1_units, fc2_units)\n", 235 | " self.fc3 = nn.Linear(fc2_units, action_size)\n", 236 | "\n", 237 | " def forward(self, state):\n", 238 | " \"\"\"Build a network that maps state -> action values.\"\"\"\n", 239 | " x = F.relu(self.fc1(state))\n", 240 | " x = F.relu(self.fc2(x))\n", 241 | " return self.fc3(x)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### Replay buffer definition" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "from collections import namedtuple, deque\n", 258 | "\n", 259 | "class ReplayBuffer:\n", 260 | " \"\"\"Fixed-size buffer to store experience tuples.\"\"\"\n", 261 | "\n", 262 | " def __init__(self, action_size, buffer_size, batch_size, seed):\n", 263 | " \"\"\"Initialize a ReplayBuffer object.\n", 264 | " Params\n", 265 | " ======\n", 266 | " action_size (int): dimension of each action\n", 267 | " buffer_size (int): maximum size of buffer\n", 268 | " batch_size (int): size of each training batch\n", 269 | " seed (int): random seed\n", 270 | " \"\"\"\n", 271 | " self.action_size = action_size\n", 272 | " self.memory = deque(maxlen=buffer_size) \n", 273 | " self.batch_size = batch_size\n", 274 | " self.experience = namedtuple(\"Experience\", field_names=[\"state\", \"action\", \"reward\", \"next_state\", \"done\"])\n", 275 | " self.seed = random.seed(seed)\n", 276 | " \n", 277 | " def add(self, state, action, reward, next_state, done):\n", 278 | " \"\"\"Add a new experience to memory.\"\"\"\n", 279 | " e = self.experience(state, action, reward, next_state, done)\n", 280 | " self.memory.append(e)\n", 281 | " \n", 282 | " def sample(self):\n", 283 | " \"\"\"Randomly sample a batch of experiences from memory.\"\"\"\n", 284 | " experiences = random.sample(self.memory, k=self.batch_size)\n", 285 | "\n", 286 | " states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)\n", 287 | " actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)\n", 288 | " rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)\n", 289 | " next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)\n", 290 | " dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)\n", 291 | " \n", 292 | " return (states, actions, rewards, next_states, dones)\n", 293 | "\n", 294 | " def __len__(self):\n", 295 | " \"\"\"Return the current size of internal memory.\"\"\"\n", 296 | " return len(self.memory)\n" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "### Agent Definition" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "import torch.optim as optim\n", 313 | "\n", 314 | "BUFFER_SIZE = 
int(1e5) # replay buffer size\n", 315 | "BATCH_SIZE = 64 # minibatch size\n", 316 | "GAMMA = 0.99 # discount factor\n", 317 | "TAU = 1e-3 # for soft update of target parameters\n", 318 | "LR = 5e-4 # learning rate \n", 319 | "UPDATE_EVERY = 4 # how often to update the network\n", 320 | "\n", 321 | "\n", 322 | "class Agent():\n", 323 | " \"\"\"Interacts with and learns from the environment.\"\"\"\n", 324 | "\n", 325 | " def __init__(self, state_size, action_size, seed):\n", 326 | " \"\"\"Initialize an Agent object.\n", 327 | " \n", 328 | " Params\n", 329 | " ======\n", 330 | " state_size (int): dimension of each state\n", 331 | " action_size (int): dimension of each action\n", 332 | " seed (int): random seed\n", 333 | " \"\"\"\n", 334 | " self.state_size = state_size\n", 335 | " self.action_size = action_size\n", 336 | " self.seed = random.seed(seed)\n", 337 | "\n", 338 | " # Q-Network\n", 339 | " self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)\n", 340 | " self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)\n", 341 | " self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)\n", 342 | "\n", 343 | " # Replay memory\n", 344 | " self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)\n", 345 | " # Initialize time step (for updating every UPDATE_EVERY steps)\n", 346 | " self.t_step = 0\n", 347 | " \n", 348 | " def step(self, state, action, reward, next_state, done):\n", 349 | " # Save experience in replay memory\n", 350 | " self.memory.add(state, action, reward, next_state, done)\n", 351 | " \n", 352 | " # Learn every UPDATE_EVERY time steps.\n", 353 | " self.t_step = (self.t_step + 1) % UPDATE_EVERY\n", 354 | " if self.t_step == 0:\n", 355 | " # If enough samples are available in memory, get random subset and learn\n", 356 | " if len(self.memory) > BATCH_SIZE:\n", 357 | " experiences = self.memory.sample()\n", 358 | " self.learn(experiences, GAMMA)\n", 359 | "\n", 360 | " def act(self, state, eps=0.):\n", 361 | " \"\"\"Returns actions for given state as per current policy.\n", 362 | " \n", 363 | " Params\n", 364 | " ======\n", 365 | " state (array_like): current state\n", 366 | " eps (float): epsilon, for epsilon-greedy action selection\n", 367 | " \"\"\"\n", 368 | " state = torch.from_numpy(state).float().unsqueeze(0).to(device)\n", 369 | " self.qnetwork_local.eval()\n", 370 | " with torch.no_grad():\n", 371 | " action_values = self.qnetwork_local(state)\n", 372 | " self.qnetwork_local.train()\n", 373 | "\n", 374 | " # Epsilon-greedy action selection\n", 375 | " if random.random() > eps:\n", 376 | " return np.argmax(action_values.cpu().data.numpy())\n", 377 | " else:\n", 378 | " return random.choice(np.arange(self.action_size))\n", 379 | "\n", 380 | " def learn(self, experiences, gamma):\n", 381 | " \"\"\"Update value parameters using given batch of experience tuples.\n", 382 | " Params\n", 383 | " ======\n", 384 | " experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples \n", 385 | " gamma (float): discount factor\n", 386 | " \"\"\"\n", 387 | " states, actions, rewards, next_states, dones = experiences\n", 388 | "\n", 389 | " \n", 390 | " \"\"\"\n", 391 | " In the following lines we are using 2 networks, qnetwork_target and qnetwork_local\n", 392 | " We continuously keep training the local network and update the weights (soft update or hard update)\n", 393 | " of target network at some specific interval.\n", 394 | " \n", 395 | " Soft interval: update the weights of the target network as 
(1-tau)*target_weights + tau*local_weights\n", 396 | " This formula slowly convergres the target weights towards local weights\n", 397 | " Hard interval: update the weights of target network with the local_weights every few steps\n", 398 | " \n", 399 | " This solves the problem of moving targets as mentioned in Double DQN paper\n", 400 | " which leads to unstable learning in DQN.\n", 401 | " \"\"\"\n", 402 | " \n", 403 | " \n", 404 | " ### In below calculation if we use qnetwork_local to calculate Q_targets_next\n", 405 | " ### then its standard DQN, but here we perform Double DQN\n", 406 | " \n", 407 | " # Get max predicted Q values (for next states) from target model\n", 408 | " Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)\n", 409 | " \n", 410 | " # Compute Q targets for current states \n", 411 | " Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))\n", 412 | "\n", 413 | " # Get expected Q values from local model\n", 414 | " Q_expected = self.qnetwork_local(states).gather(1, actions)\n", 415 | "\n", 416 | " # Compute loss\n", 417 | " loss = F.mse_loss(Q_expected, Q_targets)\n", 418 | " # Minimize the loss\n", 419 | " self.optimizer.zero_grad()\n", 420 | " loss.backward()\n", 421 | " self.optimizer.step()\n", 422 | "\n", 423 | " # ------------------- update target network ------------------- #\n", 424 | " self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) \n", 425 | " ### here we can alternatively use hard_update, but not at every call/step\n", 426 | "\n", 427 | " def soft_update(self, local_model, target_model, tau):\n", 428 | " \"\"\"Soft update model parameters.\n", 429 | " θ_target = τ*θ_local + (1 - τ)*θ_target\n", 430 | " Params\n", 431 | " ======\n", 432 | " local_model (PyTorch model): weights will be copied from\n", 433 | " target_model (PyTorch model): weights will be copied to\n", 434 | " tau (float): interpolation parameter \n", 435 | " \"\"\"\n", 436 | " for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):\n", 437 | " target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)\n", 438 | " \n", 439 | " def hard_update(self, local_model, target_model):\n", 440 | " \"\"\"Hard update model parameters\n", 441 | " Params\n", 442 | " ======\n", 443 | " local_model (PyTorch model): weights will be copied from\n", 444 | " target_model (PyTorch model): weights will be copied to\n", 445 | " tau (float): interpolation parameter \n", 446 | " \"\"\"\n", 447 | " target_model.load_state_dict(local_model.state_dict())\n" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "### Watch the untrained agent" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "\n", 464 | "env = gym.make(env_to_use)\n", 465 | "if using_colab:\n", 466 | " env = wrap_env(env)\n", 467 | "\n", 468 | "env.seed(0)\n", 469 | "\n", 470 | "agent = Agent(state_size=env.observation_space.shape[0], action_size=env.action_space.n, seed=0)\n", 471 | "\n", 472 | "# watch an untrained agent\n", 473 | "state = env.reset()\n", 474 | "for j in range(200):\n", 475 | " action = agent.act(state)\n", 476 | " env.render()\n", 477 | " state, reward, done, _ = env.step(action)\n", 478 | " if done:\n", 479 | " break \n", 480 | " \n", 481 | "env.close()\n", 482 | "if using_colab:\n", 483 | " show_video()" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 
489 | "source": [ 490 | "### Training the agent" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):\n", 500 | " \"\"\"Deep Q-Learning.\n", 501 | " \n", 502 | " Params\n", 503 | " ======\n", 504 | " n_episodes (int): maximum number of training episodes\n", 505 | " max_t (int): maximum number of timesteps per episode\n", 506 | " eps_start (float): starting value of epsilon, for epsilon-greedy action selection\n", 507 | " eps_end (float): minimum value of epsilon\n", 508 | " eps_decay (float): multiplicative factor (per episode) for decreasing epsilon\n", 509 | " \"\"\"\n", 510 | " scores = [] # list containing scores from each episode\n", 511 | " eps = eps_start # initialize epsilon\n", 512 | " for i_episode in range(1, n_episodes+1):\n", 513 | " state = env.reset()\n", 514 | " score = 0\n", 515 | " for t in range(max_t):\n", 516 | " action = agent.act(state, eps)\n", 517 | " next_state, reward, done, _ = env.step(action)\n", 518 | " agent.step(state, action, reward, next_state, done)\n", 519 | " state = next_state\n", 520 | " score += reward\n", 521 | " if done:\n", 522 | " break \n", 523 | " scores.append(score) # save most recent score\n", 524 | " eps = max(eps_end, eps_decay*eps) # decrease epsilon\n", 525 | " if i_episode % 100 == 0:\n", 526 | " print('\\rEpisode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores[-100:])))\n", 527 | " return scores\n", 528 | "\n", 529 | "scores = dqn()\n", 530 | "\n", 531 | "# plot the scores\n", 532 | "fig = plt.figure()\n", 533 | "ax = fig.add_subplot(111)\n", 534 | "plt.plot(np.arange(len(scores)), scores)\n", 535 | "plt.ylabel('Score')\n", 536 | "plt.xlabel('Episode #')\n", 537 | "plt.show()" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "### Watch the trained agent play" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "# watch the trained agent\n", 554 | "env = gym.make(env_to_use)\n", 555 | "if using_colab:\n", 556 | " env = wrap_env(env)\n", 557 | "\n", 558 | "state = env.reset()\n", 559 | "done=False\n", 560 | "while not done:\n", 561 | " action = agent.act(state)\n", 562 | " env.render()\n", 563 | " state, reward, done, _ = env.step(action)\n", 564 | " if done:\n", 565 | " break\n", 566 | " \n", 567 | "env.close()\n", 568 | "if using_colab:\n", 569 | " show_video()\n" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [] 578 | } 579 | ], 580 | "metadata": { 581 | "kernelspec": { 582 | "display_name": "Python [conda env:rl] *", 583 | "language": "python", 584 | "name": "conda-env-rl-py" 585 | }, 586 | "language_info": { 587 | "codemirror_mode": { 588 | "name": "ipython", 589 | "version": 3 590 | }, 591 | "file_extension": ".py", 592 | "mimetype": "text/x-python", 593 | "name": "python", 594 | "nbconvert_exporter": "python", 595 | "pygments_lexer": "ipython3", 596 | "version": "3.8.5" 597 | } 598 | }, 599 | "nbformat": 4, 600 | "nbformat_minor": 4 601 | } 602 | -------------------------------------------------------------------------------- /DQN/README.md: -------------------------------------------------------------------------------- 1 | 2 | # DQN 3 | 4 | Different versions of DQN for some of the OpanAI Gym 
environment. 5 | 6 | #### To-Do 7 | - [x] DQN implementation [1,2] [notebook](DQN_torch.ipynb) 8 | - [x] Double Q-Learning [3] 9 | - [ ] Dueling Network Architectures for DRL [4] 10 | - [ ] Prioritized Experience Replay [5] 11 | 12 | #### References 13 | 1. https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 14 | 2. https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf 15 | 3. https://arxiv.org/pdf/1509.06461.pdf 16 | 4. https://arxiv.org/pdf/1511.06581.pdf 17 | 5. https://arxiv.org/abs/1511.05952 18 | 6. https://github.com/udacity/deep-reinforcement-learning/tree/master/dqn 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Kinal Mehta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Policy Gradients and Actor Critic/ActorCritic_torch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Colab specific cells" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!apt update\n", 17 | "!pip install pyvirtualdisplay\n", 18 | "!apt-get install -y xvfb python-opengl ffmpeg\n", 19 | "!pip install gym[atari,box2d,classic_control]" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# from google.colab import drive\n", 29 | "# drive.mount('/content/gdrive')\n", 30 | "\n", 31 | "# root_path = 'gdrive/My Drive/Colab Notebooks/RL/'\n", 32 | "# import os\n", 33 | "# os.chdir(root_path)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# %tensorflow_version 2.x\n", 43 | "# %tensorflow_version 1.x\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "from pyvirtualdisplay import Display\n", 53 | "display = Display(visible=0, size=(1400, 900))\n", 54 | "display.start()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import glob\n", 64 | "import io\n", 65 | "from IPython.display import HTML\n", 66 | "from IPython import display as ipythondisplay\n", 67 | "import base64\n", 68 | "\n", 69 | "\"\"\"\n", 70 | "Utility functions to enable video recording of gym environment and displaying it\n", 71 | "To enable video, just do \"env = wrap_env(env)\"\"\n", 72 | "\"\"\"\n", 73 | "\n", 74 | "def show_video():\n", 75 | " mp4list = glob.glob('video/*.mp4')\n", 76 | " if len(mp4list) > 0:\n", 77 | " mp4 = mp4list[-1]\n", 78 | " video = io.open(mp4, 'r+b').read()\n", 79 | " encoded = base64.b64encode(video)\n", 80 | "\n", 81 | " # you can add \"loop\" after autoplay to keep the video looping after it ends\n", 82 | " ipythondisplay.display(HTML(data=''''''.format(encoded.decode('ascii'))))\n", 86 | " else: \n", 87 | " print(\"Could not find video\")\n", 88 | "\n", 89 | " \n", 90 | "\n", 91 | "from gym.wrappers import Monitor\n", 92 | "\n", 93 | "def wrap_env(env):\n", 94 | " env = Monitor(env, './video', force=True)\n", 95 | " return env" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## If not using colab change the value to false" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "using_colab = False\n", 112 | "\n", 113 | "import torch\n", 114 | "\n", 115 | "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", 116 | "\n", 117 | "if \"cuda\" in str(device):\n", 118 | " print(\"Using GPU\")\n", 119 | "else:\n", 120 | " print(\"Not using GPU\")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Actor Critic Algorithm" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "import gym\n", 137 | "from gym import 
logger as gymlogger\n", 138 | "gymlogger.set_level(40) #error only\n", 139 | "\n", 140 | "import numpy as np\n", 141 | "import random\n", 142 | "import math\n", 143 | "\n", 144 | "import matplotlib\n", 145 | "import matplotlib.pyplot as plt\n", 146 | "%matplotlib inline" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "env_list = [\"LunarLander-v2\", \"MsPacman-ram-v0\", \"CartPole-v0\", \"MountainCar-v0\",\n", 156 | " \"Breakout-ram-v4\", \"Acrobot-v1\"]\n", 157 | "\n", 158 | "env_to_use = env_list[2]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "### Observing the environment using random actions" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "env = gym.make(env_to_use)\n", 175 | "if using_colab:\n", 176 | " env = wrap_env(env)\n", 177 | "\n", 178 | "#check out the environment's action and observation space!\n", 179 | "print(env.action_space)\n", 180 | "print(env.observation_space)\n", 181 | "\n", 182 | "from gym import spaces\n", 183 | "assert type(env.observation_space)==spaces.Box, print(\"State space should be continuous\")\n", 184 | "assert len(env.observation_space.shape)==1, print(\"State space should be 1-D\")\n", 185 | "assert type(env.action_space)==spaces.Discrete, print(\"Action space should be discrete\")\n", 186 | "\n", 187 | "observation = env.reset()\n", 188 | "while True:\n", 189 | " env.render()\n", 190 | " #your agent goes here\n", 191 | " action = env.action_space.sample() # selecting a random action from the action space\n", 192 | " observation, reward, done, info = env.step(action) \n", 193 | " if done: \n", 194 | " break\n", 195 | " # break\n", 196 | "\n", 197 | "env.close()\n", 198 | "if using_colab:\n", 199 | " show_video()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "### Defining the Network to be used" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "import torch\n", 216 | "import torch.nn as nn\n", 217 | "import torch.nn.functional as F\n", 218 | "\n", 219 | "class Actor(nn.Module):\n", 220 | " \"\"\"Actor (Policy) Model.\"\"\"\n", 221 | "\n", 222 | " def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):\n", 223 | " \"\"\"Initialize parameters and build model.\n", 224 | " Params\n", 225 | " ======\n", 226 | " state_size (int): Dimension of each state\n", 227 | " action_size (int): Dimension of each action\n", 228 | " seed (int): Random seed\n", 229 | " fc1_units (int): Number of nodes in first hidden layer\n", 230 | " fc2_units (int): Number of nodes in second hidden layer\n", 231 | " \"\"\"\n", 232 | " super(Actor, self).__init__()\n", 233 | " torch.manual_seed(seed)\n", 234 | " self.fc1 = nn.Linear(state_size, fc1_units)\n", 235 | " self.fc2 = nn.Linear(fc1_units, fc2_units)\n", 236 | " self.actor = nn.Linear(fc2_units, action_size)\n", 237 | "\n", 238 | " def forward(self, state):\n", 239 | " \"\"\"Build a network that maps state -> action values.\"\"\"\n", 240 | " x = F.relu(self.fc1(state))\n", 241 | " x = F.relu(self.fc2(x))\n", 242 | " return self.actor(x)\n", 243 | "\n", 244 | "class Critic(nn.Module):\n", 245 | " \"\"\"Actor (Policy) Model.\"\"\"\n", 246 | "\n", 247 | " def __init__(self, state_size, seed, 
fc1_units=64, fc2_units=64):\n", 248 | " \"\"\"Initialize parameters and build model.\n", 249 | " Params\n", 250 | " ======\n", 251 | " state_size (int): Dimension of each state\n", 252 | " action_size (int): Dimension of each action\n", 253 | " seed (int): Random seed\n", 254 | " fc1_units (int): Number of nodes in first hidden layer\n", 255 | " fc2_units (int): Number of nodes in second hidden layer\n", 256 | " \"\"\"\n", 257 | " super(Critic, self).__init__()\n", 258 | " torch.manual_seed(seed)\n", 259 | " self.fc1 = nn.Linear(state_size, fc1_units)\n", 260 | " self.fc2 = nn.Linear(fc1_units, fc2_units)\n", 261 | " self.critic = nn.Linear(fc2_units, 1)\n", 262 | "\n", 263 | " def forward(self, state):\n", 264 | " \"\"\"Build a network that maps state -> action values.\"\"\"\n", 265 | " x = F.relu(self.fc1(state))\n", 266 | " x = F.relu(self.fc2(x))\n", 267 | " return self.critic(x)\n", 268 | "\n", 269 | "class PGLoss(nn.Module):\n", 270 | " def forward(self, policy, act, rew_wt):\n", 271 | " logp = policy.log_prob(act)\n", 272 | " return -(logp * rew_wt).mean()\n" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "### Defining the agent" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "import torch\n", 289 | "import torch.optim as optim\n", 290 | "\n", 291 | "from torch.distributions.categorical import Categorical\n", 292 | "\n", 293 | "from collections import defaultdict\n", 294 | "\n", 295 | "class Agent:\n", 296 | " def __init__(self, env, batch_size, device=None):\n", 297 | " self.env = env\n", 298 | " if not device:\n", 299 | " device = torch.device('cpu')\n", 300 | " self.device = device\n", 301 | " \n", 302 | " self.state_size = self.env.observation_space.shape[0]\n", 303 | " self.action_size = self.env.action_space.n\n", 304 | "\n", 305 | " self.actor_network = Actor(self.state_size, self.action_size, 4).to(self.device)\n", 306 | " self.critic_network = Critic(self.state_size, 4).to(self.device)\n", 307 | "\n", 308 | " self.actor_loss = PGLoss()\n", 309 | " self.critic_loss = nn.MSELoss()\n", 310 | " self.batch_size = batch_size\n", 311 | "\n", 312 | "\n", 313 | " self.train_stats = defaultdict(list)\n", 314 | "\n", 315 | " def get_policy(self, obs):\n", 316 | " net_op = self.actor_network(torch.as_tensor(obs, dtype=torch.float32).to(self.device))\n", 317 | " return Categorical(logits=net_op.cpu())\n", 318 | " def get_critic_value(self,obs):\n", 319 | " return self.critic_network(torch.as_tensor(obs, dtype=torch.float32).to(self.device)).cpu()\n", 320 | " def get_action(self, policy):\n", 321 | " return policy.sample().item()\n", 322 | "\n", 323 | " def get_episode_batch(self):\n", 324 | " batch_obs = []\n", 325 | " batch_action = []\n", 326 | " batch_weights = []\n", 327 | " batch_critic = []\n", 328 | " batch_returns = []\n", 329 | " batch_lengths = []\n", 330 | "\n", 331 | " cur_obs = self.env.reset()\n", 332 | " episode_rewards = []\n", 333 | " episode_critic_op = []\n", 334 | " done = False\n", 335 | "\n", 336 | " while True:\n", 337 | " batch_obs.append(cur_obs.copy())\n", 338 | " \n", 339 | " policy = self.get_policy(cur_obs)\n", 340 | " critic_op = self.get_critic_value(cur_obs)\n", 341 | " cur_action = self.get_action(policy)\n", 342 | " cur_obs, cur_reward, done, _ = self.env.step(cur_action)\n", 343 | "\n", 344 | " batch_action.append(cur_action)\n", 345 | " batch_critic.append(critic_op)\n", 346 | " 
episode_rewards.append(cur_reward)\n", 347 | " \n", 348 | " if done:\n", 349 | " episode_return, episode_length = sum(episode_rewards), len(episode_rewards)\n", 350 | " \n", 351 | " batch_returns.append(episode_return)\n", 352 | " batch_lengths.append(episode_length)\n", 353 | " \n", 354 | " def get_reward_to_go(rewards_list, gamma=1):\n", 355 | " rtg = []\n", 356 | " sum_rewards = 0\n", 357 | " for i in rewards_list[::-1]:\n", 358 | " sum_rewards = i + sum_rewards*gamma\n", 359 | " rtg.append(sum_rewards)\n", 360 | " return rtg[::-1]\n", 361 | " batch_weights += get_reward_to_go(episode_rewards, 0.9)\n", 362 | " \n", 363 | " if len(batch_obs) > self.batch_size:\n", 364 | " break\n", 365 | " \n", 366 | " cur_obs, done, episode_rewards, episode_critic_op = self.env.reset(), False, [], []\n", 367 | " return batch_obs, batch_action, batch_weights, batch_critic, batch_returns, batch_lengths\n", 368 | "\n", 369 | " def train(self, epochs):\n", 370 | " actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=1e-2)\n", 371 | " critic_optimizer = optim.Adam(self.critic_network.parameters(), lr=1e-2)\n", 372 | "\n", 373 | " for i in range(epochs):\n", 374 | " obs, act, rew_wt, critic_op, eps_rew, eps_len = self.get_episode_batch()\n", 375 | " obs = torch.as_tensor(obs)\n", 376 | " act = torch.as_tensor(act)\n", 377 | " rew_wt = torch.as_tensor(rew_wt)\n", 378 | " critic_op = torch.cat(critic_op)\n", 379 | " \n", 380 | " actor_optimizer.zero_grad()\n", 381 | " critic_optimizer.zero_grad()\n", 382 | " \n", 383 | " actor_loss_val = self.actor_loss(self.get_policy(obs), act, rew_wt-critic_op.detach())\n", 384 | " critic_loss_val = self.critic_loss(critic_op, rew_wt)\n", 385 | "\n", 386 | " actor_loss_val.backward()\n", 387 | " actor_optimizer.step()\n", 388 | "\n", 389 | " critic_loss_val.backward()\n", 390 | " critic_optimizer.step()\n", 391 | "\n", 392 | " self.train_stats[\"actor_loss\"] += [actor_loss_val.item()]\n", 393 | " self.train_stats[\"critic_loss\"] += [critic_loss_val.item()]\n", 394 | " # self.train_stats[\"total_loss\"] += [loss.item()]\n", 395 | " self.train_stats[\"rewards\"] += [np.mean(eps_rew)]\n", 396 | "\n", 397 | " print(\"Epoch:\", i, actor_loss_val.item(), critic_loss_val.item(), np.mean(eps_rew), np.mean(eps_len))\n", 398 | " \n", 399 | " def train_ac(self, epochs):\n", 400 | " actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=1e-30)\n", 401 | " critic_optimizer = optim.Adam(self.critic_network.parameters(), lr=1e-30)\n", 402 | "\n", 403 | " cur_obs = self.env.reset()\n", 404 | " policy = self.get_policy(cur_obs)\n", 405 | " critic_op = self.get_critic_value(cur_obs)\n", 406 | "\n", 407 | " rewards = []\n", 408 | " ep_reward = 0\n", 409 | "\n", 410 | " actor_optimizer.zero_grad()\n", 411 | " critic_optimizer.zero_grad()\n", 412 | " \n", 413 | " for i in range(epochs):\n", 414 | " cur_action = self.get_action(policy)\n", 415 | " nxt_obs, cur_reward, done, _ = self.env.step(cur_action)\n", 416 | " # print(nxt_obs, cur_reward, done, critic_op, cur_action)\n", 417 | " ep_reward += cur_reward\n", 418 | "\n", 419 | " if done:\n", 420 | " nxt_obs = self.env.reset()\n", 421 | " rewards.append(ep_reward)\n", 422 | " ep_reward = 0\n", 423 | "\n", 424 | " nxt_policy = self.get_policy(nxt_obs)\n", 425 | " nxt_critic_op = self.get_critic_value(nxt_obs)\n", 426 | "\n", 427 | " # print(\"[INFO] Reward\", cur_reward, nxt_critic_op)\n", 428 | " target_ret = cur_reward + 0.9*nxt_critic_op.detach()*(1-done)\n", 429 | "\n", 430 | " \n", 431 | " actor_loss_val = 
self.actor_loss(policy, torch.tensor([cur_action]), target_ret-critic_op.item())\n", 432 | " # print(critic_op, target_ret)\n", 433 | " actor_loss_val.backward()\n", 434 | " \n", 435 | " critic_loss_val = self.critic_loss(critic_op, target_ret)\n", 436 | " critic_loss_val.backward()\n", 437 | "\n", 438 | " policy = nxt_policy\n", 439 | " cur_obs = nxt_obs\n", 440 | " critic_op = nxt_critic_op\n", 441 | " \n", 442 | " if (i+1)%100==0:\n", 443 | " actor_optimizer.step()\n", 444 | " critic_optimizer.step()\n", 445 | " actor_optimizer.zero_grad()\n", 446 | " critic_optimizer.zero_grad()\n", 447 | " policy = self.get_policy(nxt_obs)\n", 448 | " cur_obs = nxt_obs\n", 449 | " critic_op = self.get_critic_value(nxt_obs)\n", 450 | " \n", 451 | "\n", 452 | " # break\n", 453 | " if (i+1)%1000==0:\n", 454 | " self.train_stats[\"actor_loss\"] += [actor_loss_val.item()]\n", 455 | " self.train_stats[\"critic_loss\"] += [critic_loss_val.item()]\n", 456 | " self.train_stats[\"rewards\"] += [np.mean(rewards)]\n", 457 | " print(\"Epoch:\", i, actor_loss_val.item(), critic_loss_val.item(), np.mean(rewards))\n", 458 | " rewards = []\n", 459 | "\n", 460 | " \n", 461 | " def plot_train_stats(self):\n", 462 | " if len(self.train_stats)==0:\n", 463 | " print(\"first train to print train stats\")\n", 464 | " for i in self.train_stats:\n", 465 | " plt.plot(self.train_stats[i])\n", 466 | " plt.xlabel(\"Epoch\")\n", 467 | " plt.ylabel(i)\n", 468 | " plt.show()\n", 469 | " return\n", 470 | "\n" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "### Creating Agent's instance and using it to train" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "scrolled": true 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "print(\"GPU available:\", torch.cuda.is_available())\n", 489 | "torch.autograd.set_detect_anomaly(True)\n", 490 | "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", 491 | "\n", 492 | "env = (gym.make(env_to_use))\n", 493 | "\n", 494 | "agent = Agent(env, batch_size=512, device=device)\n", 495 | "# agent.train_ac(100000)\n", 496 | "agent.train(100)\n" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "agent.plot_train_stats()" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "# watch the trained agent\n", 515 | "env = gym.make(env_to_use)\n", 516 | "if using_colab:\n", 517 | " env = wrap_env(env)\n", 518 | "state = env.reset()\n", 519 | "done=False\n", 520 | "while not done:\n", 521 | " action = agent.get_action(state)\n", 522 | " env.render()\n", 523 | " state, reward, done, _ = env.step(action)\n", 524 | " if done:\n", 525 | " break\n", 526 | " \n", 527 | "env.close()\n", 528 | "if using_colab:\n", 529 | " show_video()" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [] 538 | } 539 | ], 540 | "metadata": { 541 | "kernelspec": { 542 | "display_name": "Python [conda env:rl] *", 543 | "language": "python", 544 | "name": "conda-env-rl-py" 545 | }, 546 | "language_info": { 547 | "codemirror_mode": { 548 | "name": "ipython", 549 | "version": 3 550 | }, 551 | "file_extension": ".py", 552 | "mimetype": "text/x-python", 553 | "name": "python", 554 | "nbconvert_exporter": "python", 555 | 
"pygments_lexer": "ipython3", 556 | "version": "3.8.5" 557 | } 558 | }, 559 | "nbformat": 4, 560 | "nbformat_minor": 4 561 | } 562 | -------------------------------------------------------------------------------- /Policy Gradients and Actor Critic/PolicyGradient_torch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Colab specific cells" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!apt update\n", 17 | "!pip install pyvirtualdisplay\n", 18 | "!apt-get install -y xvfb python-opengl ffmpeg\n", 19 | "!pip install gym[atari,box2d,classic_control]" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# from google.colab import drive\n", 29 | "# drive.mount('/content/gdrive')\n", 30 | "\n", 31 | "# root_path = 'gdrive/My Drive/Colab Notebooks/RL/'\n", 32 | "# import os\n", 33 | "# os.chdir(root_path)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# %tensorflow_version 2.x\n", 43 | "# %tensorflow_version 1.x\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "from pyvirtualdisplay import Display\n", 53 | "display = Display(visible=0, size=(1400, 900))\n", 54 | "display.start()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import glob\n", 64 | "import io\n", 65 | "from IPython.display import HTML\n", 66 | "from IPython import display as ipythondisplay\n", 67 | "import base64\n", 68 | "\n", 69 | "\"\"\"\n", 70 | "Utility functions to enable video recording of gym environment and displaying it\n", 71 | "To enable video, just do \"env = wrap_env(env)\"\"\n", 72 | "\"\"\"\n", 73 | "\n", 74 | "def show_video():\n", 75 | " mp4list = glob.glob('video/*.mp4')\n", 76 | " if len(mp4list) > 0:\n", 77 | " mp4 = mp4list[-1]\n", 78 | " video = io.open(mp4, 'r+b').read()\n", 79 | " encoded = base64.b64encode(video)\n", 80 | "\n", 81 | " # you can add \"loop\" after autoplay to keep the video looping after it ends\n", 82 | " ipythondisplay.display(HTML(data=''''''.format(encoded.decode('ascii'))))\n", 86 | " else: \n", 87 | " print(\"Could not find video\")\n", 88 | "\n", 89 | " \n", 90 | "\n", 91 | "from gym.wrappers import Monitor\n", 92 | "\n", 93 | "def wrap_env(env):\n", 94 | " env = Monitor(env, './video', force=True)\n", 95 | " return env" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## If not using colab change the value to false" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "using_colab = False\n", 112 | "\n", 113 | "import torch\n", 114 | "\n", 115 | "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", 116 | "\n", 117 | "if \"cuda\" in str(device):\n", 118 | " print(\"Using GPU\")\n", 119 | "else:\n", 120 | " print(\"Not using GPU\")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Policy Gradient Algorithm (REINFORCE)" 128 | ] 129 | }, 130 | { 131 | "cell_type": 
"code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "import gym\n", 137 | "from gym import logger as gymlogger\n", 138 | "gymlogger.set_level(40) #error only\n", 139 | "\n", 140 | "import numpy as np\n", 141 | "import random\n", 142 | "import math\n", 143 | "\n", 144 | "import matplotlib\n", 145 | "import matplotlib.pyplot as plt\n", 146 | "%matplotlib inline" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "env_list = [\"LunarLander-v2\", \"MsPacman-ram-v0\", \"CartPole-v0\", \"MountainCar-v0\",\n", 156 | " \"Breakout-ram-v4\", \"Acrobot-v1\"]\n", 157 | "\n", 158 | "env_to_use = env_list[2]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "### Selecting the environment and exploring it" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "env = gym.make(env_to_use)\n", 175 | "if using_colab:\n", 176 | " env = wrap_env(env)\n", 177 | "\n", 178 | "#check out the environment's action and observation space!\n", 179 | "print(env.action_space)\n", 180 | "print(env.observation_space)\n", 181 | "\n", 182 | "from gym import spaces\n", 183 | "assert type(env.observation_space)==spaces.Box, print(\"State space should be continuous\")\n", 184 | "assert len(env.observation_space.shape)==1, print(\"State space should be 1-D\")\n", 185 | "assert type(env.action_space)==spaces.Discrete, print(\"Action space should be discrete\")\n", 186 | "\n", 187 | "observation = env.reset()\n", 188 | "while True:\n", 189 | " env.render()\n", 190 | " #your agent goes here\n", 191 | " action = env.action_space.sample() # selecting a random action from the action space\n", 192 | " observation, reward, done, info = env.step(action) \n", 193 | " if done: \n", 194 | " break\n", 195 | " # break\n", 196 | "\n", 197 | "env.close()\n", 198 | "if using_colab:\n", 199 | " show_video()\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "### Defining the Network to be used" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "import torch\n", 216 | "import torch.nn as nn\n", 217 | "import torch.nn.functional as F\n", 218 | "\n", 219 | "class PolicyNet(nn.Module):\n", 220 | " \"\"\"Actor (Policy) Model.\"\"\"\n", 221 | "\n", 222 | " def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):\n", 223 | " \"\"\"Initialize parameters and build model.\n", 224 | " Params\n", 225 | " ======\n", 226 | " state_size (int): Dimension of each state\n", 227 | " action_size (int): Dimension of each action\n", 228 | " seed (int): Random seed\n", 229 | " fc1_units (int): Number of nodes in first hidden layer\n", 230 | " fc2_units (int): Number of nodes in second hidden layer\n", 231 | " \"\"\"\n", 232 | " super(PolicyNet, self).__init__()\n", 233 | " self.seed = torch.manual_seed(seed)\n", 234 | " self.fc1 = nn.Linear(state_size, fc1_units)\n", 235 | " self.fc2 = nn.Linear(fc1_units, fc2_units)\n", 236 | " self.fc3 = nn.Linear(fc2_units, action_size)\n", 237 | "\n", 238 | " def forward(self, state):\n", 239 | " \"\"\"Build a network that maps state -> action values.\"\"\"\n", 240 | " x = F.relu(self.fc1(state))\n", 241 | " x = F.relu(self.fc2(x))\n", 242 | " return 
self.fc3(x)\n", 243 | "\n", 244 | "class PGLoss(nn.Module):\n", 245 | " def forward(self, policy, act, rew_wt):\n", 246 | " logp = policy.log_prob(act)\n", 247 | " return -(logp * rew_wt).mean()\n" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "### Defining the agent" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "import torch\n", 264 | "import torch.optim as optim\n", 265 | "\n", 266 | "from torch.distributions.categorical import Categorical\n", 267 | "\n", 268 | "from collections import defaultdict\n", 269 | "\n", 270 | "class Agent:\n", 271 | " def __init__(self, env, batch_size, device='cpu'):\n", 272 | " self.env = env\n", 273 | " self.device = device\n", 274 | " \n", 275 | " self.state_size = self.env.observation_space.shape[0]\n", 276 | " self.action_size = self.env.action_space.n\n", 277 | "\n", 278 | " self.policy_network = PolicyNet(self.state_size, self.action_size, 4).to(device)\n", 279 | " self.loss = PGLoss()\n", 280 | " self.batch_size = batch_size\n", 281 | " self.train_stats = defaultdict(list)\n", 282 | "\n", 283 | " def get_policy(self, obs):\n", 284 | " net_op = self.policy_network(torch.as_tensor(obs, dtype=torch.float32).to(self.device))\n", 285 | " return Categorical(logits=net_op.cpu())\n", 286 | " def get_action(self, obs):\n", 287 | " return self.get_policy(obs).sample().item()\n", 288 | "\n", 289 | " def get_episode_batch(self):\n", 290 | " batch_obs = []\n", 291 | " batch_action = []\n", 292 | " batch_weights = []\n", 293 | " batch_returns = []\n", 294 | " batch_lengths = []\n", 295 | "\n", 296 | " cur_obs = self.env.reset()\n", 297 | " episode_rewards = []\n", 298 | " done = False\n", 299 | "\n", 300 | " while True:\n", 301 | " batch_obs.append(cur_obs.copy())\n", 302 | " \n", 303 | " cur_action = self.get_action(cur_obs)\n", 304 | " cur_obs, cur_reward, done, _ = self.env.step(cur_action)\n", 305 | "\n", 306 | " batch_action.append(cur_action)\n", 307 | " episode_rewards.append(cur_reward)\n", 308 | " \n", 309 | " if done:\n", 310 | " episode_return, episode_length = sum(episode_rewards), len(episode_rewards)\n", 311 | " \n", 312 | " batch_returns.append(episode_return)\n", 313 | " batch_lengths.append(episode_length)\n", 314 | " \n", 315 | " def get_reward_to_go(rewards_list):\n", 316 | " rtg = []\n", 317 | " sum_rewards = 0\n", 318 | " for i in rewards_list[::-1]:\n", 319 | " sum_rewards+=i\n", 320 | " rtg.append(sum_rewards)\n", 321 | " return rtg[::-1]\n", 322 | " batch_weights += get_reward_to_go(episode_rewards)\n", 323 | " \n", 324 | " if len(batch_obs) > self.batch_size:\n", 325 | " break\n", 326 | " \n", 327 | " cur_obs, done, episode_rewards = self.env.reset(), False, []\n", 328 | " return batch_obs, batch_action, batch_weights, batch_returns, batch_lengths\n", 329 | "\n", 330 | " def train(self, epochs):\n", 331 | " optimizer = optim.Adam(self.policy_network.parameters(), lr=1e-2)\n", 332 | " for i in range(epochs):\n", 333 | " obs, act, rew_wt, eps_rew, eps_len = self.get_episode_batch()\n", 334 | " obs = torch.as_tensor(obs)\n", 335 | " act = torch.as_tensor(act)\n", 336 | " rew_wt = torch.as_tensor(rew_wt)\n", 337 | " \n", 338 | " optimizer.zero_grad()\n", 339 | " loss = self.loss(self.get_policy(obs), act, rew_wt)\n", 340 | " loss.backward()\n", 341 | " optimizer.step()\n", 342 | "\n", 343 | " self.train_stats[\"total_loss\"] += [loss.item()]\n", 344 | " self.train_stats[\"rewards\"] += 
[np.mean(eps_rew)]\n", 345 | "\n", 346 | " print(\"Epoch:\", i, loss.item(), np.mean(eps_rew), np.mean(eps_len))\n", 347 | " \n", 348 | " def plot_train_stats(self):\n", 349 | " if len(self.train_stats)==0:\n", 350 | " print(\"first train to print train stats\")\n", 351 | " for i in self.train_stats:\n", 352 | " plt.plot(self.train_stats[i])\n", 353 | " plt.xlabel(\"Epoch\")\n", 354 | " plt.ylabel(i)\n", 355 | " plt.show()\n", 356 | " return\n" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "### Creating Agent's instance and using it to train" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": { 370 | "scrolled": true 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "print(\"GPU available:\", torch.cuda.is_available())\n", 375 | "\n", 376 | "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", 377 | "\n", 378 | "env = gym.make(env_to_use)\n", 379 | "if using_colab:\n", 380 | " env = wrap_env(env)\n", 381 | "\n", 382 | "agent = Agent(env, batch_size=5000, device=device)\n", 383 | "agent.train(100)\n" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "agent.plot_train_stats()" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "# watch the trained agent\n", 402 | "env = gym.make(env_to_use)\n", 403 | "if using_colab:\n", 404 | " env = wrap_env(env)\n", 405 | "state = env.reset()\n", 406 | "done=False\n", 407 | "while not done:\n", 408 | " action = agent.get_action(state)\n", 409 | " env.render()\n", 410 | " state, reward, done, _ = env.step(action)\n", 411 | " if done:\n", 412 | " break\n", 413 | " \n", 414 | "env.close()\n", 415 | "if using_colab:\n", 416 | " show_video()" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [] 425 | } 426 | ], 427 | "metadata": { 428 | "kernelspec": { 429 | "display_name": "Python [conda env:rl] *", 430 | "language": "python", 431 | "name": "conda-env-rl-py" 432 | }, 433 | "language_info": { 434 | "codemirror_mode": { 435 | "name": "ipython", 436 | "version": 3 437 | }, 438 | "file_extension": ".py", 439 | "mimetype": "text/x-python", 440 | "name": "python", 441 | "nbconvert_exporter": "python", 442 | "pygments_lexer": "ipython3", 443 | "version": "3.8.5" 444 | } 445 | }, 446 | "nbformat": 4, 447 | "nbformat_minor": 4 448 | } 449 | -------------------------------------------------------------------------------- /Policy Gradients and Actor Critic/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Policy Gradients and Actor Critic 3 | 4 | Here I have implemented incremental changes from Vanilla Policy Gradient or REINFORCE to Actor-Critic and other latest developments as mentioned in [3]. 5 | 6 | #### To-Do 7 | - [x] REINFORCE or Policy Gradient(PG) [1,2,3] [notebook](PolicyGradient_torch.ipynb) 8 | - [x] PG with reward-to-go [2] [notebook](PolicyGradient_torch.ipynb) 9 | - [x] PG with learned Baseline (why not call it Actor-Critic) [notebook](ActorCritic_torch.ipynb) 10 | - [ ] A3C/Synchronous A2C [3,4] 11 | 12 | #### References 13 | 1. https://arxiv.org/pdf/1604.06778.pdf 14 | 2. https://spinningup.openai.com/en/latest/spinningup/rl_intro3.html 15 | 3. 
https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html 16 | 4. https://danieltakeshi.github.io/2018/06/28/a2c-a3c/ 17 | -------------------------------------------------------------------------------- /Policy Gradients and Actor Critic/Synchronous_A2C_torch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "nwjab7Q4l0r1" 8 | }, 9 | "source": [ 10 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kinalmehta/Reinforcement-Learning-Notebooks/blob/master/Policy%20Gradients%20and%20Actor%20Critic/Synchronous_A2C_torch.ipynb)" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "colab_type": "text", 17 | "id": "lheJ2JosRp2z" 18 | }, 19 | "source": [ 20 | "### Basic Setup step in **Colab**" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "colab": {}, 28 | "colab_type": "code", 29 | "id": "YEKcwzMBQbzD" 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "#remove \" > /dev/null 2>&1\" to see what is going on under the hood\n", 34 | "!pip install gym pyvirtualdisplay > /dev/null 2>&1\n", 35 | "!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1\n", 36 | "\n", 37 | "!apt-get update > /dev/null 2>&1\n", 38 | "!apt-get install cmake > /dev/null 2>&1\n", 39 | "!apt-get install libopenmpi-dev > /dev/null 2>&1\n", 40 | "!apt-get install zlib1g-dev > /dev/null 2>&1\n", 41 | "\n", 42 | "!pip install --upgrade setuptools 2>&1\n", 43 | "!pip install ez_setup > /dev/null 2>&1\n", 44 | "!pip install gym[atari,box2d,classic_control] > /dev/null 2>&1 # change to gym[atari,box2d,classic_control]\n", 45 | "!pip install stable-baselines[mpi] > /dev/null 2>&1" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "colab": {}, 53 | "colab_type": "code", 54 | "id": "RFFRRTAYSWea" 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "# %tensorflow_version 2.x\n", 59 | "%tensorflow_version 1.x\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "#### Adding a virtual display for rendering" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "from pyvirtualdisplay import Display\n", 76 | "display = Display(visible=0, size=(1400, 900))\n", 77 | "display.start()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": { 83 | "colab_type": "text", 84 | "id": "2_d4hG0Xp8fG" 85 | }, 86 | "source": [ 87 | "#### Uncomment below to connect to drive to save model and video outputs" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "colab": {}, 95 | "colab_type": "code", 96 | "id": "OgZYXNoWpSVI" 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "\n", 101 | "# from google.colab import drive\n", 102 | "# drive.mount('/content/gdrive')\n", 103 | "\n", 104 | "# root_path = 'gdrive/My Drive/Colab Notebooks/RL/'\n", 105 | "# import os\n", 106 | "# os.chdir(root_path)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "colab_type": "text", 113 | "id": "XntD8FMRRumy" 114 | }, 115 | "source": [ 116 | "### Standard imports and notebook setup" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": 
{}, 123 | "outputs": [], 124 | "source": [ 125 | "import gym\n", 126 | "from gym import logger as gymlogger\n", 127 | "from gym.wrappers import Monitor\n", 128 | "gymlogger.set_level(40) #error only\n", 129 | "import tensorflow as tf\n", 130 | "import numpy as np\n", 131 | "import random\n", 132 | "import matplotlib\n", 133 | "import matplotlib.pyplot as plt\n", 134 | "%matplotlib inline\n", 135 | "import math\n", 136 | "import glob\n", 137 | "import io\n", 138 | "import base64\n", 139 | "from IPython.display import HTML\n", 140 | "\n", 141 | "from IPython import display as ipythondisplay" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "\n", 151 | "from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv, VecVideoRecorder\n", 152 | "" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "\"\"\"\n", 162 | "Utility functions to enable video recording of gym environment and displaying it\n", 163 | "To enable video, just do \"env = wrap_env(env)\"\"\n", 164 | "\"\"\"\n", 165 | "\n", 166 | "def show_video():\n", 167 | " mp4list = glob.glob('video/*.mp4')\n", 168 | " if len(mp4list) > 0:\n", 169 | " mp4 = mp4list[-1]\n", 170 | " video = io.open(mp4, 'r+b').read()\n", 171 | " encoded = base64.b64encode(video)\n", 172 | "\n", 173 | " # you can add \"loop\" after autoplay to keep the video looping after it ends\n", 174 | " ipythondisplay.display(HTML(data=''''''.format(encoded.decode('ascii'))))\n", 178 | " else:\n", 179 | " print(\"Could not find video\")\n", 180 | "\n", 181 | "\n", 182 | "def wrap_env(env):\n", 183 | " env = VecVideoRecorder(env, './video')\n", 184 | " return env" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": { 190 | "colab_type": "text", 191 | "id": "JX2E1ceMcN-j" 192 | }, 193 | "source": [ 194 | "## Synchronous A2C Algorithm" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "\n", 204 | "def make_env(env_id, rank, seed=0):\n", 205 | " \"\"\"\n", 206 | " Utility function for multiprocessed env.\n", 207 | " \n", 208 | " :param env_id: (str) the environment ID\n", 209 | " :param num_env: (int) the number of environment you wish to have in subprocesses\n", 210 | " :param seed: (int) the inital seed for RNG\n", 211 | " :param rank: (int) index of the subprocess\n", 212 | " \"\"\"\n", 213 | " def _init():\n", 214 | " env = gym.make(env_id)\n", 215 | " env.seed(seed + rank)\n", 216 | " return env\n", 217 | " return _init\n", 218 | "" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "num_cpu = 6 # Number of processes to use\n", 228 | "\n", 229 | "env_list = [\"CartPole-v0\", \"LunarLander-v2\", \"MsPacman-ram-v0\", \"CartPole-v0\", \"MountainCar-v0\", \"Breakout-ram-v4\", \"Acrobot-v1\"]\n", 230 | "\n", 231 | "env_to_use = env_list[0]\n", 232 | "" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "tags": [ 240 | "outputPrepend" 241 | ] 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "\n", 246 | "# # Create the vectorized environment\n", 247 | "# env = SubprocVecEnv([make_env(env_to_use, i) for i in range(num_cpu)])\n", 248 | "\n", 249 | "# s0 = env.reset()\n", 250 | "# 
print(s0.shape)\n", 251 | "# actions = [env.action_space.sample() for i in range(num_cpu)]\n", 252 | "# print(actions)\n", 253 | "# env.step(actions)\n", 254 | "\n", 255 | "# for i in range(1):\n", 256 | "# actions = [env.action_space.sample() for i in range(num_cpu)]\n", 257 | "# obs, ret, done, info = env.step(actions)\n", 258 | "# print(obs, done, info)\n" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "import torch\n", 268 | "import torch.nn as nn\n", 269 | "import torch.nn.functional as F\n", 270 | "\n", 271 | "class Actor(nn.Module):\n", 272 | " \"\"\"Actor (Policy) Model.\"\"\"\n", 273 | "\n", 274 | " def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):\n", 275 | " \"\"\"Initialize parameters and build model.\n", 276 | " Params\n", 277 | " ======\n", 278 | " state_size (int): Dimension of each state\n", 279 | " action_size (int): Dimension of each action\n", 280 | " seed (int): Random seed\n", 281 | " fc1_units (int): Number of nodes in first hidden layer\n", 282 | " fc2_units (int): Number of nodes in second hidden layer\n", 283 | " \"\"\"\n", 284 | " super(Actor, self).__init__()\n", 285 | " torch.manual_seed(seed)\n", 286 | " self.fc1 = nn.Linear(state_size, fc1_units)\n", 287 | " self.fc2 = nn.Linear(fc1_units, fc2_units)\n", 288 | " self.actor = nn.Linear(fc2_units, action_size)\n", 289 | "\n", 290 | " def forward(self, state):\n", 291 | " \"\"\"Build a network that maps state -> action values.\"\"\"\n", 292 | " x = F.relu(self.fc1(state))\n", 293 | " x = F.relu(self.fc2(x))\n", 294 | " return self.actor(x)\n", 295 | "\n", 296 | "class Critic(nn.Module):\n", 297 | " \"\"\"Actor (Policy) Model.\"\"\"\n", 298 | "\n", 299 | " def __init__(self, state_size, seed, fc1_units=64, fc2_units=64):\n", 300 | " \"\"\"Initialize parameters and build model.\n", 301 | " Params\n", 302 | " ======\n", 303 | " state_size (int): Dimension of each state\n", 304 | " action_size (int): Dimension of each action\n", 305 | " seed (int): Random seed\n", 306 | " fc1_units (int): Number of nodes in first hidden layer\n", 307 | " fc2_units (int): Number of nodes in second hidden layer\n", 308 | " \"\"\"\n", 309 | " super(Critic, self).__init__()\n", 310 | " torch.manual_seed(seed)\n", 311 | " self.fc1 = nn.Linear(state_size, fc1_units)\n", 312 | " self.fc2 = nn.Linear(fc1_units, fc2_units)\n", 313 | " self.critic = nn.Linear(fc2_units, 1)\n", 314 | "\n", 315 | " def forward(self, state):\n", 316 | " \"\"\"Build a network that maps state -> action values.\"\"\"\n", 317 | " x = F.relu(self.fc1(state))\n", 318 | " x = F.relu(self.fc2(x))\n", 319 | " return self.critic(x)\n", 320 | "\n", 321 | "class PGLoss(nn.Module):\n", 322 | " def forward(self, policy, act, rew_wt):\n", 323 | " logp = policy.log_prob(act)\n", 324 | " return -(logp * rew_wt).mean()\n", 325 | "\n", 326 | "" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "import torch\n", 336 | "import torch.optim as optim\n", 337 | "\n", 338 | "from torch.distributions.categorical import Categorical\n", 339 | "\n", 340 | "from collections import defaultdict\n", 341 | "\n", 342 | "class Agent:\n", 343 | " def __init__(self, env, device='cpu'):\n", 344 | " self.env = env\n", 345 | " self.device = device\n", 346 | "\n", 347 | " self.state_size = self.env.observation_space.shape[0]\n", 348 | " self.action_size = 
self.env.action_space.n\n", 349 | "\n", 350 | " # self.policy_network = ACNet(self.state_size, self.action_size, 4)\n", 351 | " self.actor_network = Actor(self.state_size, self.action_size, 4).to(self.device)\n", 352 | " self.critic_network = Critic(self.state_size, 4).to(self.device)\n", 353 | "\n", 354 | " self.actor_loss = PGLoss()\n", 355 | " self.critic_loss = nn.MSELoss()\n", 356 | "\n", 357 | " self.train_stats = defaultdict(list)\n", 358 | " self.gamma=0.9\n", 359 | "\n", 360 | " def get_policy(self, obs):\n", 361 | " net_op = self.actor_network(torch.as_tensor(obs, dtype=torch.float32).to(self.device))\n", 362 | " return Categorical(logits=net_op.cpu())\n", 363 | " def get_critic_value(self,obs):\n", 364 | " return torch.squeeze(self.critic_network(torch.as_tensor(obs, dtype=torch.float32).to(self.device)).cpu())\n", 365 | " def get_action(self, policy):\n", 366 | " return policy.sample()\n", 367 | "\n", 368 | " def train(self, epochs):\n", 369 | " actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=1e-2)\n", 370 | " critic_optimizer = optim.Adam(self.critic_network.parameters(), lr=1e-2)\n", 371 | "\n", 372 | " cur_obs = self.env.reset()\n", 373 | "\n", 374 | " cur_policy = self.get_policy(cur_obs)\n", 375 | " cur_ret = self.get_critic_value(cur_obs)\n", 376 | "\n", 377 | " total_rewards = 0\n", 378 | " total_episodes = 0\n", 379 | " batch_rewards = np.zeros(num_cpu)\n", 380 | "\n", 381 | " for i in range(epochs):\n", 382 | "\n", 383 | " cur_action = self.get_action(cur_policy)\n", 384 | " # print(\"[INFO 1] action/return shape\", cur_action.shape, cur_ret.shape)\n", 385 | " # next_obs, cur_reward, done, _ = self.env.step(list(cur_actions))\n", 386 | " # print(\"a\", cur_actions, cur_action)\n", 387 | " next_obs, cur_reward, done, _ = self.env.step(list(cur_action.numpy()))\n", 388 | " \n", 389 | " next_policy = self.get_policy(next_obs)\n", 390 | " next_ret = self.get_critic_value(next_obs)\n", 391 | "\n", 392 | " # print(\"[INFO 2] reward shape\", cur_reward.shape, next_ret.shape, done.shape)\n", 393 | " target_ret = torch.tensor(cur_reward, dtype=torch.float32) + self.gamma*next_ret.detach()*(1-torch.tensor(done, dtype=torch.float32))\n", 394 | "\n", 395 | " actor_optimizer.zero_grad()\n", 396 | " critic_optimizer.zero_grad()\n", 397 | " \n", 398 | " actor_loss_val = self.actor_loss(cur_policy, cur_action, target_ret-cur_ret.detach())\n", 399 | " critic_loss_val = self.critic_loss(cur_ret, target_ret)\n", 400 | " entropy_loss_val = cur_policy.entropy().mean()\n", 401 | " actor_loss = actor_loss_val + entropy_loss_val*0.01\n", 402 | " # print(\"[INFO 3]\",actor_loss_val, entropy_loss_val, critic_loss_val)\n", 403 | "\n", 404 | " actor_loss_val.backward()\n", 405 | " actor_optimizer.step()\n", 406 | "\n", 407 | " critic_loss_val.backward()\n", 408 | " critic_optimizer.step()\n", 409 | "\n", 410 | " cur_policy = next_policy\n", 411 | " cur_ret = next_ret\n", 412 | "\n", 413 | " if np.any(done):\n", 414 | " indxes = np.squeeze(np.argwhere(done), axis=-1)\n", 415 | " total_episodes += len(indxes)\n", 416 | " total_rewards += np.sum(batch_rewards[indxes])\n", 417 | " batch_rewards[indxes] = 0\n", 418 | " else:\n", 419 | " batch_rewards += cur_reward\n", 420 | "\n", 421 | " if (i+1)%5000==0:\n", 422 | " self.train_stats[\"actor_loss\"] += [actor_loss_val.item()]\n", 423 | " self.train_stats[\"critic_loss\"] += [critic_loss_val.item()]\n", 424 | " self.train_stats[\"returns\"] += [total_rewards/(total_episodes+1e-8)]\n", 425 | " total_episodes = 0\n", 426 | " 
total_rewards = 0\n", 427 | " print(\"Epoch:\", i, actor_loss_val.item(), critic_loss_val.item(), entropy_loss_val.item(), self.train_stats[\"returns\"][-1])\n", 428 | " \n", 429 | " def plot_train_stats(self):\n", 430 | " if len(self.train_stats)==0:\n", 431 | " print(\"first train to print train stats\")\n", 432 | " for i in self.train_stats:\n", 433 | " plt.plot(self.train_stats[i])\n", 434 | " plt.xlabel(\"Epoch\")\n", 435 | " plt.ylabel(i)\n", 436 | " plt.show()\n", 437 | " return\n" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": { 444 | "tags": [ 445 | "outputPrepend", 446 | "outputPrepend", 447 | "outputPrepend", 448 | "outputPrepend", 449 | "outputPrepend", 450 | "outputPrepend", 451 | "outputPrepend", 452 | "outputPrepend", 453 | "outputPrepend", 454 | "outputPrepend", 455 | "outputPrepend", 456 | "outputPrepend", 457 | "outputPrepend", 458 | "outputPrepend", 459 | "outputPrepend", 460 | "outputPrepend", 461 | "outputPrepend", 462 | "outputPrepend", 463 | "outputPrepend", 464 | "outputPrepend", 465 | "outputPrepend", 466 | "outputPrepend", 467 | "outputPrepend", 468 | "outputPrepend", 469 | "outputPrepend", 470 | "outputPrepend", 471 | "outputPrepend", 472 | "outputPrepend", 473 | "outputPrepend", 474 | "outputPrepend", 475 | "outputPrepend", 476 | "outputPrepend", 477 | "outputPrepend", 478 | "outputPrepend", 479 | "outputPrepend", 480 | "outputPrepend", 481 | "outputPrepend", 482 | "outputPrepend", 483 | "outputPrepend", 484 | "outputPrepend", 485 | "outputPrepend", 486 | "outputPrepend", 487 | "outputPrepend", 488 | "outputPrepend", 489 | "outputPrepend", 490 | "outputPrepend", 491 | "outputPrepend", 492 | "outputPrepend", 493 | "outputPrepend", 494 | "outputPrepend", 495 | "outputPrepend", 496 | "outputPrepend", 497 | "outputPrepend", 498 | "outputPrepend", 499 | "outputPrepend", 500 | "outputPrepend", 501 | "outputPrepend", 502 | "outputPrepend", 503 | "outputPrepend", 504 | "outputPrepend", 505 | "outputPrepend", 506 | "outputPrepend", 507 | "outputPrepend", 508 | "outputPrepend", 509 | "outputPrepend", 510 | "outputPrepend", 511 | "outputPrepend", 512 | "outputPrepend", 513 | "outputPrepend", 514 | "outputPrepend", 515 | "outputPrepend", 516 | "outputPrepend", 517 | "outputPrepend", 518 | "outputPrepend", 519 | "outputPrepend", 520 | "outputPrepend", 521 | "outputPrepend", 522 | "outputPrepend", 523 | "outputPrepend", 524 | "outputPrepend", 525 | "outputPrepend", 526 | "outputPrepend", 527 | "outputPrepend", 528 | "outputPrepend", 529 | "outputPrepend", 530 | "outputPrepend", 531 | "outputPrepend", 532 | "outputPrepend", 533 | "outputPrepend", 534 | "outputPrepend", 535 | "outputPrepend", 536 | "outputPrepend" 537 | ] 538 | }, 539 | "outputs": [], 540 | "source": [ 541 | "\n", 542 | "print(\"GPU available:\", torch.cuda.is_available())\n", 543 | "num_cpu = 1\n", 544 | "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", 545 | "\n", 546 | "env = SubprocVecEnv([make_env(env_to_use, i) for i in range(num_cpu)])\n", 547 | "\n", 548 | "agent = Agent(env, device=device)\n", 549 | "agent.train(20*5000)\n", 550 | "\n" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "agent.plot_train_stats()" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "# watch the trained agent\n", 569 | "env = 
wrap_env(gym.make(env_to_use))\n", 570 | "state = env.reset()\n", 571 | "done=False\n", 572 | "while not done:\n", 573 | " policy = agent.get_policy(state)\n", 574 | " action = agent.get_action(policy).item()\n", 575 | " env.render()\n", 576 | " state, reward, done, _ = env.step(action)\n", 577 | " if done:\n", 578 | " break\n", 579 | " \n", 580 | "env.close()\n", 581 | "show_video()" 582 | ] 583 | } 584 | ], 585 | "metadata": { 586 | "accelerator": "GPU", 587 | "colab": { 588 | "collapsed_sections": [ 589 | "XntD8FMRRumy", 590 | "JX2E1ceMcN-j", 591 | "-RIHPnzwR85a", 592 | "HidE0wI2bqT3", 593 | "92MuHGLIdZJE", 594 | "N2ZkiiMNeE6M" 595 | ], 596 | "name": "Synchronous_A2C_torch.ipynb", 597 | "provenance": [] 598 | }, 599 | "kernelspec": { 600 | "display_name": "Python 3.6.10 64-bit ('rlenv': conda)", 601 | "language": "python", 602 | "name": "python361064bitrlenvcondad65df3249e4e44c0b0e4c9cdcaffb4d6" 603 | }, 604 | "language_info": { 605 | "codemirror_mode": { 606 | "name": "ipython", 607 | "version": 3 608 | }, 609 | "file_extension": ".py", 610 | "mimetype": "text/x-python", 611 | "name": "python", 612 | "nbconvert_exporter": "python", 613 | "pygments_lexer": "ipython3", 614 | "version": "3.6.10-final" 615 | } 616 | }, 617 | "nbformat": 4, 618 | "nbformat_minor": 1 619 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning Notebooks 2 | 3 | While going through [SpinningUp as Deep RL Researcher](https://spinningup.openai.com/en/latest/spinningup/spinningup.html), I found it challenging to connect and understand the papers (theory) and the available implementations (practice) of several algorithms.

4 | 5 | ## Introduction 6 | In this repository, I try to implement some of the most important, foundational Deep RL algorithms, each in a single self-contained notebook with no dependencies on shared code. Because each notebook stands on its own, some code is inevitably repeated across notebooks.
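To give a sense of the style used throughout, below is a minimal sketch of the REINFORCE-style policy-gradient loss that the policy-gradient and actor-critic notebooks are built around (PyTorch; the argument names are illustrative):

```python
import torch.nn as nn

class PGLoss(nn.Module):
    """Policy-gradient (REINFORCE) loss."""
    def forward(self, policy, actions, weights):
        # policy: a torch.distributions.Categorical produced by the policy network
        # actions: the actions that were actually taken
        # weights: episode returns, rewards-to-go, or advantages
        logp = policy.log_prob(actions)
        # negative sign because optimizers minimize, while we want to maximize E[log-prob * weight]
        return -(logp * weights).mean()
```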
7 | You can run these notebooks directly in [Google Colab](https://colab.research.google.com/). 8 | 9 | Each folder's README lists the references and other resources used for the implementations in that folder. 10 | 11 | --- 12 | 13 | **You can read about my journey in RL [here](./my_path_in_RL.md).**
14 | **[Here](./suggested_path_in_RL.md) is the path I would suggest to a beginner in RL.** 15 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: rl 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - argon2-cffi=20.1.0=py38h7b6447c_1 8 | - attrs=19.3.0=py_0 9 | - backcall=0.2.0=py_0 10 | - blas=1.0=mkl 11 | - bleach=3.1.5=py_0 12 | - ca-certificates=2020.6.24=0 13 | - certifi=2020.6.20=py38_0 14 | - cffi=1.14.1=py38he30daa8_0 15 | - cudatoolkit=10.2.89=hfd86e86_1 16 | - decorator=4.4.2=py_0 17 | - defusedxml=0.6.0=py_0 18 | - entrypoints=0.3=py38_0 19 | - freetype=2.10.2=h5ab3b9f_0 20 | - importlib-metadata=1.7.0=py38_0 21 | - importlib_metadata=1.7.0=0 22 | - intel-openmp=2020.1=217 23 | - ipykernel=5.3.4=py38h5ca1d4c_0 24 | - ipython=7.16.1=py38h5ca1d4c_0 25 | - ipython_genutils=0.2.0=py38_0 26 | - jedi=0.17.0=py38_0 27 | - jinja2=2.11.2=py_0 28 | - jpeg=9b=h024ee3a_2 29 | - jsonschema=3.2.0=py38_0 30 | - jupyter_client=6.1.6=py_0 31 | - jupyter_core=4.6.3=py38_0 32 | - lcms2=2.11=h396b838_0 33 | - ld_impl_linux-64=2.33.1=h53a641e_7 34 | - libedit=3.1.20191231=h14c3975_1 35 | - libffi=3.3=he6710b0_2 36 | - libgcc-ng=9.1.0=hdf63c60_0 37 | - libpng=1.6.37=hbc83047_0 38 | - libsodium=1.0.18=h7b6447c_0 39 | - libstdcxx-ng=9.1.0=hdf63c60_0 40 | - libtiff=4.1.0=h2733197_1 41 | - lz4-c=1.9.2=he6710b0_1 42 | - markupsafe=1.1.1=py38h7b6447c_0 43 | - mistune=0.8.4=py38h7b6447c_1000 44 | - mkl=2020.1=217 45 | - mkl-service=2.3.0=py38he904b0f_0 46 | - mkl_fft=1.1.0=py38h23d657b_0 47 | - mkl_random=1.1.1=py38h0573a6f_0 48 | - nb_conda=2.2.1=py38_1 49 | - nb_conda_kernels=2.2.3=py38_0 50 | - nbconvert=5.6.1=py38_0 51 | - nbformat=5.0.7=py_0 52 | - ncurses=6.2=he6710b0_1 53 | - ninja=1.10.0=py38hfd86e86_0 54 | - notebook=6.1.1=py38_0 55 | - numpy=1.19.1=py38hbc911f0_0 56 | - numpy-base=1.19.1=py38hfa32c7d_0 57 | - olefile=0.46=py_0 58 | - openssl=1.1.1g=h7b6447c_0 59 | - packaging=20.4=py_0 60 | - pandoc=2.10=0 61 | - pandocfilters=1.4.2=py38_1 62 | - parso=0.8.0=py_0 63 | - pexpect=4.8.0=py38_0 64 | - pickleshare=0.7.5=py38_1000 65 | - pillow=7.2.0=py38hb39fc2d_0 66 | - pip=20.2.1=py38_0 67 | - prometheus_client=0.8.0=py_0 68 | - prompt-toolkit=3.0.5=py_0 69 | - ptyprocess=0.6.0=py38_0 70 | - pycparser=2.20=py_2 71 | - pygments=2.6.1=py_0 72 | - pyparsing=2.4.7=py_0 73 | - pyrsistent=0.16.0=py38h7b6447c_0 74 | - python=3.8.5=hcff3b4d_0 75 | - python-dateutil=2.8.1=py_0 76 | - pytorch=1.6.0=py3.8_cuda10.2.89_cudnn7.6.5_0 77 | - pyzmq=19.0.1=py38he6710b0_1 78 | - readline=8.0=h7b6447c_0 79 | - send2trash=1.5.0=py38_0 80 | - setuptools=49.2.1=py38_0 81 | - six=1.15.0=py_0 82 | - sqlite=3.32.3=h62c20be_0 83 | - terminado=0.8.3=py38_0 84 | - testpath=0.4.4=py_0 85 | - tk=8.6.10=hbc83047_0 86 | - torchvision=0.7.0=py38_cu102 87 | - tornado=6.0.4=py38h7b6447c_1 88 | - traitlets=4.3.3=py38_0 89 | - wcwidth=0.2.5=py_0 90 | - webencodings=0.5.1=py38_1 91 | - wheel=0.34.2=py38_0 92 | - xz=5.2.5=h7b6447c_0 93 | - zeromq=4.3.2=he6710b0_2 94 | - zipp=3.1.0=py_0 95 | - zlib=1.2.11=h7b6447c_3 96 | - zstd=1.4.5=h9ceee32_0 97 | - pip: 98 | - astroid==2.4.2 99 | - atari-py==0.2.6 100 | - box2d-py==2.3.8 101 | - cloudpickle==1.3.0 102 | - cycler==0.10.0 103 | - cython==0.29.21 104 | - future==0.18.2 105 | - glfw==1.12.0 106 | - gym==0.17.2 107 | - imageio==2.9.0 108 | - isort==5.4.2 109 | - kiwisolver==1.2.0 110 | - 
lazy-object-proxy==1.4.3 111 | - lockfile==0.12.2 112 | - matplotlib==3.3.0 113 | - mccabe==0.6.1 114 | - mujoco-py==1.50.1.68 115 | - opencv-python==4.3.0.36 116 | - pybullet==2.8.5 117 | - pyglet==1.5.0 118 | - pylint==2.6.0 119 | - scipy==1.5.2 120 | - toml==0.10.1 121 | - tqdm==4.48.2 122 | - wrapt==1.12.1 123 | prefix: /home/kinal/miniconda3/envs/rl 124 | -------------------------------------------------------------------------------- /my_path_in_RL.md: -------------------------------------------------------------------------------- 1 | 2 | # My path in RL 3 | 4 | Starting with reinforcement learning gets really overwhelming for a beginner. There is no clear visible [path to follow](./suggested_path_in_RL.md) and hence you have to experiment a lot to carve out your own learning path.
5 | Here I'll share my own experience and the steps I took to get started, build a foundation, and move on to advanced topics, whether for application development or research.
6 | 7 | #### 1. Introduction to Reinforcement Learning - David Silver's UCL lectures [link](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ) (October 2018) 8 | 9 | - This is the first course I started with. It gives a great introduction to classical RL in tabular form. This course is enough to equip you with all the **conceptual and theoretical** knowledge, and it is a good place to develop your understanding. 10 | - I personally didn't work on any code with this course. 11 | 12 | #### 2. UC Berkeley Deep RL Course [link](http://rail.eecs.berkeley.edu/deeprlcourse/) (January 2019) 13 | 14 | - After finishing the first course, I was confident that I could go through this course for Deep RL. I was terribly mistaken. 15 | - I started with the videos and worked through the assignments alongside them. 16 | - Though I was able to understand the videos, it was really challenging for me to implement the 2nd assignment. 17 | - This was probably because I didn't implement anything in the previous course. This was a great eye-opener. 18 | 19 | **BREAK:** For the next 6 months I took a break from RL and focused my attention on Deep Learning. Was I scared of failing? 20 | 21 | #### 3. Udacity's Deep Reinforcement Learning Nanodegree [link](https://www.udacity.com/course/deep-reinforcement-learning-nanodegree--nd893) (August 2019) 22 | 23 | - I had participated in one of the scholarship challenges on Udacity and was awarded this course. It is a fantastic course: it helped me revise a lot of concepts and gave me a better understanding of the classical control algorithms. This is more of a **practical** course; you implement the various algorithms explained in the videos right away. Though not compulsory, it's worth investing your time in the implementations, as it gives you confidence in your own understanding. 24 | - In case you **cannot afford to purchase** this course, you can try for scholarships. You can also use their [GitHub repository](https://github.com/udacity/deep-reinforcement-learning) and try working through the code samples and finishing the projects even without the videos. The repository and the notebooks are well documented and should suffice. 25 | - Keep in mind that this course has a prerequisite of decent knowledge of Deep Learning/Neural Networks. I was able to glide through it because I had already done the [Deep Learning Nanodegree](https://www.udacity.com/course/deep-learning-nanodegree--nd101) as part of the same scholarship, and I also had 1 year of industry experience in DL. 26 | 27 | #### 4. Reinforcement Learning Specialization by University of Alberta [link](https://www.coursera.org/specializations/reinforcement-learning) (January 2020) 28 | 29 | - This course, along with the [Reinforcement Learning Textbook](http://incompleteideas.net/book/RLbook2018.pdf), is the perfect combination for a beginner. 30 | - I started with this course as I was applying for the MS CS at the University of Alberta and thought it would be good to have completed their own certification. 31 | - As I had already gone through David Silver's course, my basics were already pretty strong. This course worked as a great revision, and I also implemented several algorithms as assignments. 32 | - It helped me strengthen my fundamentals and gain a better understanding through implementations in various settings. 33 | 34 | #### 5.
Reinforcement Learning Textbook by Richard Sutton and Andrew Barto [link](http://incompleteideas.net/book/RLbook2018.pdf) (January 2020) 35 | 36 | - This is the only book you need when beginning in RL. 37 | - As there is no similar book for Deep RL, it has to be learned through blogs, online lectures, and papers. 38 | - I followed this book along with the RL Specialization. That was the best combination for clearing up the fundamentals. 39 | 40 | #### 6. OpenAI SpinningUp [link](https://spinningup.openai.com/en/latest/index.html) (February 2020) 41 | 42 | - After clearing up the basic concepts, I started with Deep RL again through SpinningUp. 43 | - Going through the introduction gave me a better understanding of the key concepts, Policy Gradients, and their derivation. 44 | - Next, I started with Spinning Up as a Deep RL Researcher. One of the most important sections here is "Learn by Doing". As part of "Learn by Doing", I implemented minimalistic versions of various algorithms. This is where this repository was born. 45 | 46 | #### 7. Neuro-Dynamic Programming by Dimitri Bertsekas and John Tsitsiklis [link](http://athenasc.com/ndpbook.html) (March 2020) 47 | 48 | - I started reading this book alongside SpinningUp, as it was mentioned in David Silver's class. 49 | - It gives a different perspective, but the concepts are quite similar to those in Sutton and Barto's book. 50 | - I am still reading this book, so I will add more details about what I gained/learned from it later. 51 | -------------------------------------------------------------------------------- /suggested_path_in_RL.md: -------------------------------------------------------------------------------- 1 | 2 | # Suggested Path for a beginner in Reinforcement Learning 3 | 4 | Based on [my learning experience](./my_path_in_RL.md) and the resources I used, the following is the path I would suggest to someone new to Reinforcement Learning. 5 | 6 | **NOTE:** This is my personal opinion; take it with a pinch of salt. 7 | 8 | #### 1. Reinforcement Learning Specialization by University of Alberta [link](https://www.coursera.org/specializations/reinforcement-learning) 9 | 10 | - One of the best courses to get you started in RL. Follow the RL book along with the course, and finish all of the assigned readings. 11 | - It'll help you clear the concepts in tabular settings first and then extend those concepts to the function approximation case. 12 | - Alternatively, you can also refer to [David Silver's course](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ). 13 | 14 | #### 2. Reinforcement Learning Textbook by Richard Sutton and Andrew Barto [link](http://incompleteideas.net/book/RLbook2018.pdf) 15 | 16 | - Read the first chapter of this book before doing anything in RL. It gives a really nice introduction to the RL framework. 17 | - The first part of the book covers tabular methods. The second part covers function approximation with RL. The third part discusses various case studies, including AlphaGo and AlphaZero. 18 | - The first part is almost completely covered by the course. Some sections of the second part are covered in the course; you can read the remaining sections to build better intuition and understanding. 19 | - Going through the case studies will give you a nice overview of how RL and Deep RL are applied in practical scenarios. 20 | 21 | #### 3.
OpenAI SpinningUp [link](https://spinningup.openai.com/en/latest/index.html) 22 | 23 | - Spinning Up is one of the best resources for getting started as a researcher in RL. 24 | - It contains a list of key papers in RL and Deep RL research. 25 | - OpenAI has also released [baselines](https://github.com/openai/baselines), which contains implementations of various algorithms. 26 | - One of the most important things is writing your own implementations of various algorithms. It helps you solidify your understanding and connect the theoretical concepts to their implementations. 27 | - [This repository](https://github.com/kinalmehta/Reinforcement-Learning-Notebooks) contains my implementations of various algorithms. I have tried to keep the implementations as simple as possible. 28 | - Alternatively, you can also refer to [UC Berkeley's Deep RL Course](http://rail.eecs.berkeley.edu/deeprlcourse/). 29 | 30 | --- 31 | 32 | **NOTE:** If you have any suggestions or improvements to the above-mentioned path, please do contribute. 33 | --------------------------------------------------------------------------------