├── .devcontainer ├── Dockerfile └── devcontainer.json ├── .gitignore ├── README.md ├── docs ├── 0 - Reinforcement Learning Intro.ipynb ├── 1 - Bellman Equation and the State Value.ipynb ├── 2 - Bellman Equation and the Action-State Value.ipynb ├── 3 - Q-Learning.ipynb └── 4 - Intro to Neural Networks.ipynb └── requirements.txt /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/minimal-notebook:notebook-6.4.3 2 | 3 | # Needed dependencies to run gym rendering 4 | USER root 5 | RUN apt-get update && apt-get install -y python-opengl graphviz xvfb 6 | USER ${NB_USER} 7 | 8 | # Install python dependencies 9 | COPY requirements.txt requirements.txt 10 | RUN conda install --file requirements.txt 11 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "OpenAI", 3 | "dockerFile": "Dockerfile", 4 | "context": "../", 5 | "extensions": [ 6 | "dbaeumer.vscode-eslint", 7 | "ms-python.python", 8 | "njpwerner.autodocstring", 9 | "eamodio.gitlens", 10 | "mhutchie.git-graph", 11 | "zhuangtongfa.material-theme", 12 | "pkief.material-icon-theme", 13 | "ms-azuretools.vscode-docker", 14 | "yzhang.markdown-all-in-one", 15 | "ms-vsliveshare.vsliveshare", 16 | "Vtrois.gitmoji-vscode", 17 | "GitHub.vscode-pull-request-github" 18 | ], 19 | "settings": { 20 | "python.pythonPath": "/opt/conda/bin/python", 21 | "python.languageServer": "Pylance", 22 | "python.linting.pylintEnabled": false, 23 | "python.linting.flake8Enabled": true, 24 | "python.linting.enabled": true, 25 | "python.formatting.blackPath": "/usr/local/bin/black", 26 | "python.linting.flake8Path": "/usr/local/bin/flake8", 27 | "python.linting.mypyPath": "/usr/local/bin/mypy" 28 | }, 29 | "postAttachCommand": ["xvfb-run", "-s", "-screen 0 1400x900x24", "jupyter", "notebook"], 30 | "forwardPorts": 
[8888] 31 | } 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb_checkpoints 2 | logs -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 👋 Curso sobre Reinforcement Learning 2 | 3 | ## 🙂 Descripción 4 | 5 | En este curso se incluye **todo lo básico relacionado con el Aprendizaje por Refuerzo**. Desde los conceptos más básicos, como qué es una cadena de Markov, hasta conceptos más interesantes como la **inclusión de Redes Neuronales** en algoritmos de RL. Para ello, cada una de las clases cuenta con un Jupyter Notebook ejecutable con toda la teoría y con imágenes de esquemas que ayudan a una mejor comprensión. Esta pensado para que sea un **curso desde 0**, por ello, tanto si eres principiante, como si quieres refrescar conocimientos, este curso es para ti. 6 | 7 | ## 🔖 Tabla de Contenidos 8 | 9 | - [👋 Curso sobre Reinforcement Learning](#-curso-sobre-reinforcement-learning) 10 | - [🙂 Descripción](#-descripción) 11 | - [🔖 Tabla de Contenidos](#-tabla-de-contenidos) 12 | - [📜 Temas](#-temas) 13 | - [0️⃣ Introducción a Reinforcement Learning](#0️⃣-introducción-a-reinforcement-learning) 14 | - [1️⃣ Ecuación de Bellman: El valor de los estados](#1️⃣-ecuación-de-bellman-el-valor-de-los-estados) 15 | - [2️⃣ Ecuación de Bellman: El valor de las acciones](#2️⃣-ecuación-de-bellman-el-valor-de-las-acciones) 16 | - [3️⃣ Q Learning](#3️⃣-q-learning) 17 | - [⚡ Quick-Start: usando remote containers](#-quick-start-usando-remote-containers) 18 | 19 | Este curso está dividido en varias partes: 20 | 21 | # 📜 Temas 22 | 23 | Para ejecutar los notebooks la mejor forma es [usar docker](#-quick-start-usando-remote-containers). En apenas **unos minutos y sin instalar nada** tendrás acceso a todos los notebooks. 
🤯 24 | 25 | ## 0️⃣ Introducción a Reinforcement Learning 26 | - Agente y Entorno 27 | - Recompensas, Observaciones y Acciones 28 | - Equilibrio Exploración Explotación 29 | - Maximizar la Recompensa a largo plazo 30 | - Descubriendo Gym: Creando mi primer entorno 31 | - Descubriendo Gym: Creando mi primer agente 32 | 33 | ![Reinforcement Learning Intro](https://user-images.githubusercontent.com/44867923/139915800-8224bede-c52b-47d1-bb22-2e9624687831.jpg) 34 | 35 | ## 1️⃣ Ecuación de Bellman: El valor de los estados 36 | - V-table: asignando un valor a cada estado 37 | - Ecuación de Bellman: calculando V para cada estado 38 | - Cálculo de la Política usando la V-table 39 | 40 | 41 | ![Bellman_equation State_Value](https://user-images.githubusercontent.com/44867923/140994794-51d739af-eb70-4e6a-9036-b925f23ab7fd.jpg) 42 | 43 | ## 2️⃣ Ecuación de Bellman: El valor de las acciones 44 | - Las acciones en los Procesos de decisión de Markov 45 | - Q: El valor de las acciones 46 | - Programación Dinámica: Iteración de Valores 47 | 48 | ![Q-value](https://user-images.githubusercontent.com/44867923/141012134-09ff0d88-4ce9-43af-8b04-d535cf24d897.jpg) 49 | 50 | ## 3️⃣ Q Learning 51 | - Diferencias Temporales: Q-learning 52 | - Alpha: aprender más de lo nuevo o de lo viejo 53 | - Gamma: cuanto más lejos en el futuro menos confianza 54 | - La política Óptima 55 | 56 | ![Q-learning](https://user-images.githubusercontent.com/44867923/141012234-257d26af-bf05-4dad-b402-96b54c735f41.jpg) 57 | 58 | 59 | # ⚡ Quick-Start: usando remote containers 60 | 61 | **1. Instala el Plugin de VSCode de [Remote Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)** 62 | 63 | ``` 64 | # Presiona Ctrl + shift + p 65 | # Pega ext install ms-vscode-remote.remote-containers 66 | # Presiona Enter 67 | ``` 68 | 69 | **2. 
Abre el entorno de desarrollo** 70 | 71 | ``` 72 | # Presiona Ctrl + shift + p 73 | # Busca: Remote-Containers: Rebuild and Reopen in container 74 | # Presiona Enter (y espera, la primera vez tarda unos minutos) 75 | ``` 76 | 77 | **3. Abre los Notebooks** 78 | 79 | Abre el buscador y ve a [http://127.0.0.1:8888/](http://127.0.0.1:8888/) 80 | -------------------------------------------------------------------------------- /docs/0 - Reinforcement Learning Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d7425342", 6 | "metadata": {}, 7 | "source": [ 8 | "# 1. Introducción al Reinforcement Learning" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "d99f640b", 14 | "metadata": {}, 15 | "source": [ 16 | "### 1.1 Los campos del Machine Learning" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "24522b87", 22 | "metadata": {}, 23 | "source": [ 24 | "Existen 3 grandes campos:\n", 25 | "\n", 26 | "- **Aprendizaje no Supervisado**: tenemos datos pero **NO** sabemos a qué grupo pertenece cada dato.\n", 27 | "- **Aprendizaje Supervisado**: tenemos datos y **SÍ** sabemos a qué grupo pertenece cada dato (etiquetas).\n", 28 | "- **Aprendizaje por Refuerzo**: no tenemos datos, los datos se obtienen explorando un entorno.\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "54c3048d", 34 | "metadata": {}, 35 | "source": [ 36 | "![esquema_ML](https://la.mathworks.com/discovery/reinforcement-learning/_jcr_content/mainParsys3/discoverysubsection/mainParsys/image.adapt.full.medium.png/1630398182247.png)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "f4b822ee", 42 | "metadata": {}, 43 | "source": [ 44 | "### 1.2 Reinforcement Learning" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "7e46eff1", 50 | "metadata": {}, 51 | "source": [ 52 | "![Reinforcement Learning simple 
schema](https://la.mathworks.com/discovery/reinforcement-learning/_jcr_content/mainParsys3/discoverysubsection_603098216/mainParsys3/image.adapt.full.medium.png/1630398182451.png)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "f3f3b0fb", 58 | "metadata": {}, 59 | "source": [ 60 | "1. El **agente** obtiene unas **observaciones** del entorno\n", 61 | "2. En función de esas **observaciones** decide realizar una **acción**\n", 62 | "3. Esa **acción** le lleva a obtener una **recompensa** y nuevas **observaciones**\n", 63 | "4. Vuelve al paso 2." 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "7c8cd723", 69 | "metadata": {}, 70 | "source": [ 71 | "#### ¿Cuál sería el Agente, el entorno, las acciones, la recompensa y las observaciones en este clásico ejemplo?" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "a1ba0272", 77 | "metadata": {}, 78 | "source": [ 79 | "![mice in a maze](https://user-images.githubusercontent.com/44867923/139915800-8224bede-c52b-47d1-bb22-2e9624687831.jpg)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "id": "a802dc34", 85 | "metadata": {}, 86 | "source": [ 87 | "### 1.3 Obtener datos: Equilibrio entre exploración y explotación" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "id": "5f74ecea", 93 | "metadata": {}, 94 | "source": [ 95 | "1. Te acabas de mudar. No conoces nada de tu barrio.\n", 96 | "2. Tienes hambre (**recompensa** negativa) y quieres comer.\n", 97 | "3. Observas que existen varios restaurantes en tu calle (**observaciones**)\n", 98 | "4. Eliges un restaurante (tomas una **acción**), no conoces ninguno así que, ¿cómo lo haces? **De forma aleatoria**\n", 99 | "5. Te gusta (**recompensa positiva**) y repites porque vas a lo seguro (**Explotación**).\n", 100 | "6. Cierto día decides arriesgar con un nuevo sitio (**Exploración**).\n", 101 | "7. 
Puede ser que te guste más y vuelvas (**Explotación**) o puede ser que no te guste y pruebes nuevos sitios (**Exploración**) o vuelvas al anterior (**Explotación**)." 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "id": "5a025ca0", 107 | "metadata": {}, 108 | "source": [ 109 | "![explotationvsexploration](https://steemitimages.com/640x0/https://steemitimages.com/DQmXH5tjBiS41iNtcyvh7s7Rj5z3SqGkcwoaV2otRJNx3FT/Exploration_vs._Exploitation.png)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "id": "0fbb7185", 115 | "metadata": {}, 116 | "source": [ 117 | "### 1.4 Objetivo del RL: Maximizar la recompensa a largo plazo" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "id": "5ae07579", 123 | "metadata": {}, 124 | "source": [ 125 | "1. Madrugas (recompensa negativa)\n", 126 | "2. Vas a clase, *con un profesor aburrido* (recompensa negativa)\n", 127 | "3. Te fuerzas a estudiar cientos de diapositivas con 30 kg de tinta en texto (recompensa negativa).\n", 128 | "4. ¿Por qué? -> **Para maximizar la recompensa a largo plazo**.\n", 129 | "5. Si estudio trabajaré sentado y bajo un techo (recompensa positiva).\n", 130 | "6. Puedo aspirar a trabajos mejor remunerados (recompensa positiva).\n", 131 | "7. Con ese dinero puedo comprar comida a mi gato flurfils que es lo que más quiero en este mundo (recompensa positiva)." 
132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "9136f538", 137 | "metadata": {}, 138 | "source": [ 139 | "![Maximizar_recompensa](https://user-images.githubusercontent.com/44867923/139920150-ed9884b2-1340-4753-97b8-e3e1b91845b0.jpg)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "id": "6527ebfa", 145 | "metadata": {}, 146 | "source": [ 147 | "### 1.5 Reinforcement Learning en la práctica" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 1, 153 | "id": "2d5dd3c4", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "image/jpeg": "\n", 159 | "text/html": [ 160 | "\n", 161 | " \n", 168 | " " 169 | ], 170 | "text/plain": [ 171 | "" 172 | ] 173 | }, 174 | "execution_count": 1, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "from IPython.display import YouTubeVideo\n", 181 | "YouTubeVideo('WXuK6gekU1Y', width=720, height=480)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "id": "6baf2144", 187 | "metadata": {}, 188 | "source": [ 189 | "#### Import Dependencies" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 2, 195 | "id": "eef65dcf", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "import gym" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "id": "bd9fe062", 205 | "metadata": {}, 206 | "source": [ 207 | "#### Useful functions to show the environment " 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 3, 213 | "id": "9fb0f43c", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "import matplotlib.pyplot as plt\n", 218 | "from IPython import display\n", 219 | "import numpy as np\n", 220 | "%matplotlib inline\n", 221 | "\n", 222 | "def get_env_image(env: gym.Env) -> np.ndarray:\n", 223 | " return env.render(mode='rgb_array')\n", 224 | "\n", 225 | "def start_render(env: gym.Env) -> None:\n", 226 | " global render\n", 
227 | " img = get_env_image(env)\n", 228 | " render = plt.imshow(img) # only call this once\n", 229 | "\n", 230 | "def update_render(env: gym.Env) -> None:\n", 231 | " global render\n", 232 | " img = get_env_image(env)\n", 233 | " render.set_data(img) # just update the data\n", 234 | " display.display(plt.gcf())\n", 235 | " display.clear_output(wait=True)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "id": "8fe58e4f", 241 | "metadata": {}, 242 | "source": [ 243 | "#### Create an Environment and play randomly" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 4, 249 | "id": "a3b7a57d", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "ENV_NAME = \"MountainCar-v0\"\n", 254 | "ENV_NAME = \"LunarLander-v2\"\n", 255 | "ENV_NAME = \"CarRacing-v0\"\n", 256 | "ENV_NAME = \"BipedalWalker-v3\"\n", 257 | "ENV_NAME = \"CartPole-v1\"" 258 | ] 259 | } 260 | ], 261 | "metadata": { 262 | "kernelspec": { 263 | "display_name": "Python 3 (ipykernel)", 264 | "language": "python", 265 | "name": "python3" 266 | }, 267 | "language_info": { 268 | "codemirror_mode": { 269 | "name": "ipython", 270 | "version": 3 271 | }, 272 | "file_extension": ".py", 273 | "mimetype": "text/x-python", 274 | "name": "python", 275 | "nbconvert_exporter": "python", 276 | "pygments_lexer": "ipython3", 277 | "version": "3.9.6" 278 | } 279 | }, 280 | "nbformat": 4, 281 | "nbformat_minor": 5 282 | } 283 | -------------------------------------------------------------------------------- /docs/1 - Bellman Equation and the State Value.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "018813cb", 6 | "metadata": {}, 7 | "source": [ 8 | "# Bellman Equation: the State Value" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "6d170d38", 14 | "metadata": {}, 15 | "source": [ 16 | 
"![Bellman_1](https://user-images.githubusercontent.com/44867923/140994838-db45e51e-d92b-4b37-9df6-99e1a7e2def3.jpg)\n", 17 | "![Bellman_2](https://user-images.githubusercontent.com/44867923/140994794-51d739af-eb70-4e6a-9036-b925f23ab7fd.jpg)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "id": "dc907e39", 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import gym\n", 28 | "import numpy as np\n", 29 | "import seaborn as sns" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "df1bb9d0", 35 | "metadata": {}, 36 | "source": [ 37 | "### Create the env: Deterministic Frozen-Lake" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "id": "aa1ca95b", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "env = gym.make(\"FrozenLake-v1\", is_slippery=True)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "36d386e2", 53 | "metadata": {}, 54 | "source": [ 55 | "### Create V Table: A value for every state" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "id": "18eb737d", 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])" 68 | ] 69 | }, 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "V = np.zeros(env.observation_space.n)\n", 77 | "V" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "c9849f3e", 83 | "metadata": {}, 84 | "source": [ 85 | "### Agent: play the environment and update the V table" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "id": "4652f224", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "gamma = 0.9\n", 96 | "for ep in range(1000):\n", 97 | " obs = env.reset()\n", 98 | " done = False\n", 99 | "\n", 100 | " while not done:\n", 101 | "\n", 102 | " action = 
env.action_space.sample()\n", 103 | " next_obs, reward, done, _ = env.step(action)\n", 104 | "\n", 105 | " previous_obs_value = V[obs]\n", 106 | " actual_obs_value = reward + gamma*V[next_obs]\n", 107 | "\n", 108 | " if actual_obs_value > previous_obs_value:\n", 109 | " V[obs] = actual_obs_value\n", 110 | "\n", 111 | " obs = next_obs" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "95161d3c", 117 | "metadata": {}, 118 | "source": [ 119 | "### Plot V Table: The value of every State" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "id": "df3b34ea", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "" 132 | ] 133 | }, 134 | "execution_count": 5, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | }, 138 | { 139 | "data": { 140 | "image/png": "\n", 141 | "text/plain": [ 142 | "
" 143 | ] 144 | }, 145 | "metadata": { 146 | "needs_background": "light" 147 | }, 148 | "output_type": "display_data" 149 | } 150 | ], 151 | "source": [ 152 | "sns.heatmap(\n", 153 | " data=V.reshape(4,4),\n", 154 | " annot=True\n", 155 | ")" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "b4b2a99b", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3 (ipykernel)", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.9.6" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 5 188 | } 189 | -------------------------------------------------------------------------------- /docs/2 - Bellman Equation and the Action-State Value.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "57982c04", 6 | "metadata": {}, 7 | "source": [ 8 | "# 2 - Bellman Equation and the Action-State Value" 9 | ] 10 | }, 11 | { 12 | "attachments": {}, 13 | "cell_type": "markdown", 14 | "id": "44583299", 15 | "metadata": {}, 16 | "source": [ 17 | "![Bellman_1](https://user-images.githubusercontent.com/44867923/141012209-d05f4699-fd8f-48b9-9e9a-a574687e7d75.jpg)\n", 18 | "![Bellman_2](https://user-images.githubusercontent.com/44867923/141012134-09ff0d88-4ce9-43af-8b04-d535cf24d897.jpg)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 6, 24 | "id": "dc907e39", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import gym\n", 29 | "import numpy as np\n", 30 | "import seaborn as sns\n", 31 | "import matplotlib.pyplot as 
plt" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "ed208f47", 37 | "metadata": {}, 38 | "source": [ 39 | "### Create Environment: Deterministic Frozen-Lake" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 12, 45 | "id": "3739f7df", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "env = gym.make(\"FrozenLake-v1\", is_slippery=True)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "eea1b671", 55 | "metadata": {}, 56 | "source": [ 57 | "### Create Q Table: Value of every action in every state" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 13, 63 | "id": "8885f380", 64 | "metadata": { 65 | "scrolled": true 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "array([[0., 0., 0., 0.],\n", 72 | " [0., 0., 0., 0.],\n", 73 | " [0., 0., 0., 0.],\n", 74 | " [0., 0., 0., 0.],\n", 75 | " [0., 0., 0., 0.],\n", 76 | " [0., 0., 0., 0.],\n", 77 | " [0., 0., 0., 0.],\n", 78 | " [0., 0., 0., 0.],\n", 79 | " [0., 0., 0., 0.],\n", 80 | " [0., 0., 0., 0.],\n", 81 | " [0., 0., 0., 0.],\n", 82 | " [0., 0., 0., 0.],\n", 83 | " [0., 0., 0., 0.],\n", 84 | " [0., 0., 0., 0.],\n", 85 | " [0., 0., 0., 0.],\n", 86 | " [0., 0., 0., 0.]])" 87 | ] 88 | }, 89 | "execution_count": 13, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "Q = np.zeros((env.observation_space.n, env.action_space.n))\n", 96 | "Q" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "399cac64", 102 | "metadata": {}, 103 | "source": [ 104 | "### Agent: play the environment and update the Q table" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 14, 110 | "id": "94adb64b", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "gamma = 0.9\n", 115 | "for ep in range(1000):\n", 116 | " obs = env.reset()\n", 117 | " done = False\n", 118 | "\n", 119 | " while not done:\n", 120 | "\n", 121 | " action = 
env.action_space.sample()\n", 122 | " next_obs, reward, done, _ = env.step(action)\n", 123 | "\n", 124 | "\n", 125 | " actual_obs_action_value = reward + gamma*Q[next_obs].max()\n", 126 | " Q[obs, action] = actual_obs_action_value\n", 127 | "\n", 128 | " obs = next_obs\n" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "8faa0a8f", 134 | "metadata": {}, 135 | "source": [ 136 | "### Plot Q Table: action value for every state" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 15, 142 | "id": "8adb7b26", 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "image/png": "\n", 148 | "text/plain": [ 149 | "
" 150 | ] 151 | }, 152 | "metadata": { 153 | "needs_background": "light" 154 | }, 155 | "output_type": "display_data" 156 | } 157 | ], 158 | "source": [ 159 | "fig, ax = plt.subplots(2,2, figsize=(15,15))\n", 160 | "\n", 161 | "plot = ax[0,0]\n", 162 | "plot.set_title(\"izquierda\")\n", 163 | "values = Q.T[0].reshape(4,4)\n", 164 | "sns.heatmap(values, cmap=\"RdYlGn\", annot=True, ax=plot, vmax=1)\n", 165 | "\n", 166 | "plot = ax[0,1]\n", 167 | "plot.set_title(\"abajo\")\n", 168 | "values = Q.T[1].reshape(4,4)\n", 169 | "sns.heatmap(values, cmap=\"RdYlGn\", annot=True, ax=plot, vmax=1)\n", 170 | "\n", 171 | "plot = ax[1,0]\n", 172 | "plot.set_title(\"Derecha\")\n", 173 | "values = Q.T[2].reshape(4,4)\n", 174 | "sns.heatmap(values, cmap=\"RdYlGn\", annot=True, ax=plot, vmax=1)\n", 175 | "\n", 176 | "plot = ax[1,1]\n", 177 | "plot.set_title(\"arriba\")\n", 178 | "values = Q.T[3].reshape(4,4)\n", 179 | "sns.heatmap(values, cmap=\"RdYlGn\", annot=True, ax=plot, vmax=1)\n", 180 | "\n", 181 | "_ = plt.plot()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "id": "bcec23a8", 187 | "metadata": {}, 188 | "source": [ 189 | "### Plot Q* Table: action with highest value for every state" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 16, 195 | "id": "15ecc762", 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "" 202 | ] 203 | }, 204 | "execution_count": 16, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | }, 208 | { 209 | "data": { 210 | "image/png": "\n", 211 | "text/plain": [ 212 | "
" 213 | ] 214 | }, 215 | "metadata": { 216 | "needs_background": "light" 217 | }, 218 | "output_type": "display_data" 219 | } 220 | ], 221 | "source": [ 222 | "sns.heatmap(data=Q.max(axis=1).reshape(4,4), annot=True)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "bd58a002", 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 3 (ipykernel)", 237 | "language": "python", 238 | "name": "python3" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 3 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython3", 250 | "version": "3.9.6" 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 5 255 | } 256 | -------------------------------------------------------------------------------- /docs/3 - Q-Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f4336a2c", 6 | "metadata": {}, 7 | "source": [ 8 | "# Q-Learning" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "3313e760", 14 | "metadata": {}, 15 | "source": [ 16 | "![Bellman_1](https://user-images.githubusercontent.com/44867923/141013004-bde1b971-fcdc-445e-bf08-9bc89311891c.jpg)\n", 17 | "![Bellman_2](https://user-images.githubusercontent.com/44867923/141012134-09ff0d88-4ce9-43af-8b04-d535cf24d897.jpg)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 13, 23 | "id": "dc907e39", 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import gym\n", 28 | "import numpy as np\n", 29 | "import seaborn as sns\n", 30 | "import matplotlib.pyplot as plt" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "7e5a2184", 36 | "metadata": {}, 37 | "source": [ 
38 | "### Create Environment: Stochastic Frozen-Lake" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 14, 44 | "id": "3739f7df", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "env = gym.make(\"FrozenLake-v1\", is_slippery=True)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 15, 54 | "id": "42bd0ad5", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "ALPHA = 0.1\n", 59 | "GAMMA = 0.9\n", 60 | "EPISODES = 10000\n", 61 | "EPSILON = 0.9\n", 62 | "DELTA_EPSILON = EPSILON / EPISODES\n", 63 | "MIN_EPSILON = 0.1" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "1c57bcfe", 69 | "metadata": {}, 70 | "source": [ 71 | "### Create Q Table: Value of every action in every state" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 16, 77 | "id": "8885f380", 78 | "metadata": { 79 | "scrolled": true 80 | }, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "array([[0., 0., 0., 0.],\n", 86 | " [0., 0., 0., 0.],\n", 87 | " [0., 0., 0., 0.],\n", 88 | " [0., 0., 0., 0.],\n", 89 | " [0., 0., 0., 0.],\n", 90 | " [0., 0., 0., 0.],\n", 91 | " [0., 0., 0., 0.],\n", 92 | " [0., 0., 0., 0.],\n", 93 | " [0., 0., 0., 0.],\n", 94 | " [0., 0., 0., 0.],\n", 95 | " [0., 0., 0., 0.],\n", 96 | " [0., 0., 0., 0.],\n", 97 | " [0., 0., 0., 0.],\n", 98 | " [0., 0., 0., 0.],\n", 99 | " [0., 0., 0., 0.],\n", 100 | " [0., 0., 0., 0.]])" 101 | ] 102 | }, 103 | "execution_count": 16, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "Q = np.zeros((env.observation_space.n, env.action_space.n))\n", 110 | "Q" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "c570b4aa", 116 | "metadata": {}, 117 | "source": [ 118 | "### Agent: play the environment and update the Q table" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 17, 124 | "id": "94adb64b", 125 | "metadata": {}, 126 | "outputs": [], 
127 | "source": [ 128 | "for ep in range(EPISODES):\n", 129 | " obs = env.reset()\n", 130 | " done = False\n", 131 | " EPSILON = max(EPSILON - DELTA_EPSILON, MIN_EPSILON)\n", 132 | " while not done:\n", 133 | " \n", 134 | " # Select Action\n", 135 | " if EPSILON < np.random.rand():\n", 136 | " action = env.action_space.sample() # Random action: Exploration\n", 137 | " else:\n", 138 | " action = np.argmax(Q[obs]) # Best action: Exploitation\n", 139 | "\n", 140 | " next_obs, reward, done, _ = env.step(action)\n", 141 | " \n", 142 | " # Update Q value for this action in this observation\n", 143 | " actual_obs_action_value = Q[obs, action]\n", 144 | " best_next_obs_action_value = reward + GAMMA * Q[next_obs].max() - actual_obs_action_value\n", 145 | " Q[obs, action] = actual_obs_action_value + ALPHA * best_next_obs_action_value\n", 146 | " \n", 147 | " obs = next_obs" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "id": "847c484d", 153 | "metadata": {}, 154 | "source": [ 155 | "### Plot Q* Table: highest action value for every state" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 18, 161 | "id": "15ecc762", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "" 168 | ] 169 | }, 170 | "execution_count": 18, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | }, 174 | { 175 | "data": { 176 | "image/png": "\n", 177 | "text/plain": [ 178 | "
" 179 | ] 180 | }, 181 | "metadata": { 182 | "needs_background": "light" 183 | }, 184 | "output_type": "display_data" 185 | } 186 | ], 187 | "source": [ 188 | "sns.heatmap(data=Q.max(axis=1).reshape(4,4), annot=True, vmax=1)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "id": "75f94a24", 194 | "metadata": {}, 195 | "source": [ 196 | "### Plot Q* Table: action with highest value for every state" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 19, 202 | "id": "29bf608f", 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "data": { 207 | "text/plain": [ 208 | "" 209 | ] 210 | }, 211 | "execution_count": 19, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | }, 215 | { 216 | "data": { 217 | "image/png": "\n", 218 | "text/plain": [ 219 | "
" 220 | ] 221 | }, 222 | "metadata": { 223 | "needs_background": "light" 224 | }, 225 | "output_type": "display_data" 226 | } 227 | ], 228 | "source": [ 229 | "sns.heatmap(data=np.argmax(Q, axis=1).reshape(4,4), annot=True)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "id": "394df416", 235 | "metadata": {}, 236 | "source": [ 237 | "### Plot Q Table: state value for every action" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 7, 243 | "id": "8adb7b26", 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "image/png": "\n", 249 | "text/plain": [ 250 | "
" 251 | ] 252 | }, 253 | "metadata": { 254 | "needs_background": "light" 255 | }, 256 | "output_type": "display_data" 257 | } 258 | ], 259 | "source": [ 260 | "fig, ax = plt.subplots(2,2, figsize=(15,15))\n", 261 | "\n", 262 | "plot = ax[0,0]\n", 263 | "plot.set_title(\"izquierda\")\n", 264 | "values = Q.T[0].reshape(4,4)\n", 265 | "sns.heatmap(values, cmap=\"RdYlGn\", annot=True, ax=plot, vmax=1)\n", 266 | "\n", 267 | "plot = ax[0,1]\n", 268 | "plot.set_title(\"abajo\")\n", 269 | "values = Q.T[1].reshape(4,4)\n", 270 | "sns.heatmap(values, cmap=\"RdYlGn\", annot=True, ax=plot, vmax=1)\n", 271 | "\n", 272 | "plot = ax[1,0]\n", 273 | "plot.set_title(\"Derecha\")\n", 274 | "values = Q.T[2].reshape(4,4)\n", 275 | "sns.heatmap(values, cmap=\"RdYlGn\", annot=True, ax=plot, vmax=1)\n", 276 | "\n", 277 | "plot = ax[1,1]\n", 278 | "plot.set_title(\"arriba\")\n", 279 | "values = Q.T[3].reshape(4,4)\n", 280 | "sns.heatmap(values, cmap=\"RdYlGn\", annot=True, ax=plot, vmax=1)\n", 281 | "\n", 282 | "_ = plt.plot()" 283 | ] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "Python 3 (ipykernel)", 289 | "language": "python", 290 | "name": "python3" 291 | }, 292 | "language_info": { 293 | "codemirror_mode": { 294 | "name": "ipython", 295 | "version": 3 296 | }, 297 | "file_extension": ".py", 298 | "mimetype": "text/x-python", 299 | "name": "python", 300 | "nbconvert_exporter": "python", 301 | "pygments_lexer": "ipython3", 302 | "version": "3.9.6" 303 | } 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 5 307 | } 308 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym[Box_2D] 2 | matplotlib 3 | box2d-py 4 | pyglet 5 | seaborn 6 | tensorflow 7 | pydot 8 | graphviz --------------------------------------------------------------------------------