├── .github └── FUNDING.yml ├── .gitignore ├── LICENSE ├── README.md ├── Week2 ├── frozenlake_Qlearning.ipynb └── img │ ├── Q_function.png │ ├── frozenlake_v0.png │ └── short_diag.jpg ├── Week3 ├── README.md ├── agent.py ├── atari_wrappers.py ├── buffers.py ├── central_control.py ├── imgs │ ├── DQN_variations.png │ ├── Dueling_img.png │ ├── double_Qlearning_formula.png │ ├── multistep_formula.png │ ├── noisenet_formula.png │ └── pong_gif.gif ├── main.py ├── neural_net.py └── utils.py ├── Week4 ├── A2C.ipynb ├── PolicyGradient.ipynb └── imgs │ ├── Advantage_actor_critic.png │ ├── Vanilla_policy_gradient.png │ ├── actions_plot_a2c.png │ ├── loss_plot_a2c.png │ ├── nn_ac.png │ ├── reward_pg.png │ └── reward_plot_a2c.png ├── Week5 ├── PPO.py ├── README.md └── imgs │ ├── rew_walker.png │ └── walker_gif.gif ├── Week6 ├── ES.py ├── README.md └── imgs │ ├── LunarLanderContinuous.gif │ └── plot_rewards.PNG ├── Week7 ├── README.md ├── imgs │ ├── animation.gif │ └── pseudocode.png └── model_based.py ├── _config.yml └── images ├── GitHub-Mark-32px.png ├── GitHub-Mark-64px.png ├── frontcover2.jpg ├── logo5.png ├── logo6.png ├── title3.png ├── youtube_social_icon_dark.png └── youtube_social_icon_red.png /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: ['paypal.me/andrealonza'] 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Andrea Lonza 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![](images/title3.png) 3 | 4 | ## Course in Deep Reinforcement Learning 5 | 6 | ### Explore the combination of neural networks and reinforcement learning. Algorithms and examples in Python & PyTorch 7 | 8 | 9 | Have you heard about the amazing results achieved by [Deepmind with AlphaGo Zero](https://www.youtube.com/watch?time_continue=24&v=tXlM99xPQC8) and by [OpenAI in Dota 2](https://www.youtube.com/watch?v=l92J1UvHf6M)? It's all about deep neural networks and reinforcement learning. Do you want to know more about it? 10 | This is the right opportunity for you to finally learn Deep RL and use it on new and exciting projects and applications. 11 | 12 | Here you'll find an in-depth introduction to these algorithms. Among other things, you'll learn Q-learning, deep Q-learning, PPO, and actor-critic, and implement them using Python and PyTorch. 13 | 14 | > The ultimate aim is to use these general-purpose technologies and apply them to all sorts of important real world problems. 15 | > **Demis Hassabis** 16 | 17 | 18 | This repository contains: 19 | 20 |
21 | 22 | drawing **Lectures (& other content) primarily from DeepMind's and Berkeley's YouTube channels.** 23 | 24 |
25 | 26 | drawing **Algorithms (like DQN, A2C, and PPO) implemented in PyTorch and tested on OpenAI Gym: RoboSchool & Atari.** 27 | 28 | 29 |
30 |
31 | 32 | **Stay tuned and follow me on** [![Twitter Follow](https://img.shields.io/twitter/follow/espadrine.svg?style=social&label=Follow)](https://twitter.com/andri27_it) and [![GitHub followers](https://img.shields.io/github/followers/espadrine.svg?style=social&label=Follow)](https://github.com/andri27-ts) **#60DaysRLChallenge** 33 | 34 | We also now have a [**Slack channel**](https://60daysrlchallenge.slack.com/). To get an invitation, email me at andrea.lonza@gmail.com. Also, email me if you have any ideas, suggestions or improvements. 35 | 36 | To learn Deep Learning, Computer Vision or Natural Language Processing, check out my **[1-Year-ML-Journey](https://github.com/andri27-ts/1-Year-ML-Journey)** 37 | 38 | 39 | ### Before starting.. Prerequisites 40 | * Basic level of Python and PyTorch 41 | * [Machine Learning](https://github.com/andri27-ts/1-Year-ML-Journey) 42 | * [Basic knowledge of Deep Learning (MLP, CNN and RNN)](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2N3AIlp?tag=andreaaffilia-20) 43 | 44 |
45 |
46 | 47 | 48 | ## Quick Note: my NEW BOOK is out! 49 | To learn Reinforcement Learning and Deep RL more in depth, check out my book [**Reinforcement Learning Algorithms with Python**](https://www.amazon.com/Reinforcement-Learning-Algorithms-Python-understand/dp/1789131111)!! 50 | 51 | 52 | drawing 53 | 54 | 55 | **Table of Contents** 56 | 1. The Landscape of Reinforcement Learning 57 | 2. Implementing RL Cycle and OpenAI Gym 58 | 3. Solving Problems with Dynamic Programming 59 | 4. Q learning and SARSA Applications 60 | 5. Deep Q-Network 61 | 6. Learning Stochastic and DDPG optimization 62 | 7. TRPO and PPO implementation 63 | 8. DDPG and TD3 Applications 64 | 9. Model-Based RL 65 | 10. Imitation Learning with the DAgger Algorithm 66 | 11. Understanding Black-Box Optimization Algorithms 67 | 12. Developing the ESBAS Algorithm 68 | 13. Practical Implementation for Resolving RL Challenges 69 | 70 | 71 |
72 |
73 |
74 | 75 | 76 | ## Index - Reinforcement Learning 77 | 78 | - [Week 1 - **Introduction**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-1---introduction) 79 | - [Week 2 - **RL Basics**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-2---rl-basics-mdp-dynamic-programming-and-model-free-control) 80 | - [Week 3 - **Value based algorithms - DQN**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-3---value-function-approximation-and-dqn) 81 | - [Week 4 - **Policy gradient algorithms - REINFORCE & A2C**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-4---policy-gradient-methods-and-a2c) 82 | - [Week 5 - **Advanced Policy Gradients - PPO**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-5---advanced-policy-gradients---trpo--ppo) 83 | - [Week 6 - **Evolution Strategies and Genetic Algorithms - ES**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-6---evolution-strategies-and-genetic-algorithms) 84 | - [Week 7 - **Model-Based reinforcement learning - MB-MF**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-7---model-based-reinforcement-learning) 85 | - [Week 8 - **Advanced Concepts and Project Of Your Choice**](https://github.com/andri27-ts/60_Days_RL_Challenge/blob/master/README.md#week-8---advanced-concepts-and-project-of-your-choice) 86 | - [Last 4 days - **Review + Sharing**](https://github.com/andri27-ts/60_Days_RL_Challenge/blob/master/README.md#last-4-days---review--sharing) 87 | - [Best resources](https://github.com/andri27-ts/60_Days_RL_Challenge#best-resources) 88 | - [Additional resources](https://github.com/andri27-ts/60_Days_RL_Challenge#additional-resources) 89 |
90 | 91 | ## Week 1 - Introduction 92 | 93 | - **[Why is Reinforcement Learning such an important learning method - A simple explanation](https://medium.com/@andrea.lonzats/the-learning-machines-fb922e539335)** 94 | - **[Introduction and course overview](https://www.youtube.com/watch?v=Q4kF8sfggoI&index=1&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3) - CS294 by Levine, Berkley** 95 | - **[Deep Reinforcement Learning: Pong from Pixels](http://karpathy.github.io/2016/05/31/rl/) by Karpathy** 96 | 97 | ## 98 | 99 | #### Other Resources 100 | 101 | - :books: [The "Bible" of Reinforcement Learning: Chapter 1](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2HRSSmh?tag=andreaaffilia-20) - Sutton & Barto 102 | - Great introductory paper: [Deep Reinforcement Learning: An Overview](https://www.groundai.com/project/deep-reinforcement-learning-an-overview/) 103 | - Start coding: [From Scratch: AI Balancing Act in 50 Lines of Python](https://towardsdatascience.com/from-scratch-ai-balancing-act-in-50-lines-of-python-7ea67ef717) 104 | 105 |
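As a quick hands-on companion to the "balancing act" article above, here is a minimal sketch of the standard Gym interaction loop on CartPole with a random policy. It assumes the classic `gym` API (`reset()` returning an observation and `step()` returning a 4-tuple); newer `gymnasium` releases use a slightly different signature.

```python
import gym

# Create the CartPole environment and run a few random-policy episodes.
env = gym.make("CartPole-v1")

for episode in range(5):
    obs = env.reset()
    total_reward = 0.0
    done = False
    while not done:
        action = env.action_space.sample()          # random action
        obs, reward, done, info = env.step(action)  # advance the environment
        total_reward += reward
    print(f"Episode {episode}: reward = {total_reward}")

env.close()
```

Every algorithm in the following weeks is built around this same loop; only the way the action is chosen (and how the experience is used to learn) changes.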
106 | 107 | ## Week 2 - RL Basics: *MDP, Dynamic Programming and Model-Free Control* 108 | 109 | > Those who cannot remember the past are condemned to repeat it - **George Santayana** 110 | 111 | 112 | This week, we will learn about the basic blocks of reinforcement learning, starting from the definition of the problem all the way through the estimation and optimization of the functions that are used to express the quality of a policy or state. 113 | 114 | ## 115 | 116 | ### Lectures - Theory drawing 117 | 118 | 119 | * **[Markov Decision Process](https://www.youtube.com/watch?v=lfHX2hHRMVQ&list=PLzuuYNsE1EZAXYR4FJ75jcJseBmo4KQ9-&index=2) - David Silver (DeepMind)** 120 | * Markov Processes 121 | * Markov Decision Processes 122 | 123 | - **[Planning by Dynamic Programming](https://www.youtube.com/watch?v=Nd1-UUMVfz4&list=PLzuuYNsE1EZAXYR4FJ75jcJseBmo4KQ9-&index=3) - David Silver (DeepMind)** 124 | * Policy iteration 125 | * Value iteration 126 | 127 | * **[Model-Free Prediction](https://www.youtube.com/watch?v=PnHCvfgC_ZA&index=4&list=PLzuuYNsE1EZAXYR4FJ75jcJseBmo4KQ9-) - David Silver (DeepMind)** 128 | * Monte Carlo Learning 129 | * Temporal Difference Learning 130 | * TD(λ) 131 | 132 | - **[Model-Free Control](https://www.youtube.com/watch?v=0g4j2k_Ggc4&list=PLzuuYNsE1EZAXYR4FJ75jcJseBmo4KQ9-&index=5) - David Silver (DeepMind)** 133 | * Ɛ-greedy policy iteration 134 | * GLIE Monte Carlo Search 135 | * SARSA 136 | * Importance Sampling 137 | 138 | ## 139 | 140 | ### Project of the Week - [**Q-learning**](Week2/frozenlake_Qlearning.ipynb) drawing 141 | 142 | [**Q-learning applied to FrozenLake**](Week2/frozenlake_Qlearning.ipynb) - For exercise, you can solve the game using SARSA or implement Q-learning by yourself. In the former case, only few changes are needed. 143 | 144 | ## 145 | 146 | #### Other Resources 147 | - :books: [The "Bible" of Reinforcement Learning: Chapters 3 and 4](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2HRSSmh?tag=andreaaffilia-20) - Sutton & Barto 148 | - :tv: [Value functions introduction](https://www.youtube.com/watch?v=k1vNh4rNYec&index=6&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3) - DRL UC Berkley by Sergey Levine 149 | 150 |
151 | 152 | ## Week 3 - Value based algorithms - DQN 153 | 154 | This week we'll learn more advanced concepts and apply deep neural network to Q-learning algorithms. 155 | 156 | ## 157 | 158 | ### Lectures - Theory drawing 159 | 160 | - **[Value functions approximation](https://www.youtube.com/watch?v=UoPei5o4fps&list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ&index=6) - David Silver (DeepMind)** 161 | - Differentiable function approximators 162 | - Incremental methods 163 | - Batch methods (DQN) 164 | 165 | * **[Advanced Q-learning algorithms](https://www.youtube.com/watch?v=nZXC5OdDfs4&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&index=7) - Sergey Levine (UC Berkley)** 166 | - Replay Buffer 167 | - Double Q-learning 168 | - Continous actions (NAF,DDPG) 169 | - Pratical tips 170 | 171 | ## 172 | 173 | ### Project of the Week - [**DQN and variants**](Week3) drawing 174 | 175 | 176 | drawing 177 | 178 | [**DQN and some variants applied to Pong**](Week3) - This week the goal is to develop a DQN algorithm to play an Atari game. To make it more interesting I developed three extensions of DQN: **Double Q-learning**, **Multi-step learning**, **Dueling networks** and **Noisy Nets**. Play with them, and if you feel confident, you can implement Prioritized replay, Dueling networks or Distributional RL. To know more about these improvements read the papers! 179 | 180 |
181 | 182 | ## 183 | 184 | 185 | #### Papers 186 | 187 | ##### Must Read 188 | - [Playing Atari with Deep Reinforcement Learning](https://arxiv.org/pdf/1312.5602.pdf) - 2013 189 | - [Human-level control through deep reinforcement learning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) - 2015 190 | - [Rainbow: Combining Improvements in Deep Reinforcement Learning](https://arxiv.org/pdf/1710.02298.pdf) - 2017 191 | 192 | ##### Extensions of DQN 193 | - [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/pdf/1509.06461.pdf) - 2015 194 | - [Prioritized Experience Replay](https://arxiv.org/pdf/1511.05952.pdf) - 2015 195 | - [Dueling Network Architectures for Deep Reinforcement Learning](http://proceedings.mlr.press/v48/wangf16.pdf) - 2016 196 | - [Noisy networks for exploration](https://arxiv.org/pdf/1706.10295.pdf) - 2017 197 | - [Distributional Reinforcement Learning with Quantile Regression](https://arxiv.org/pdf/1710.10044.pdf) - 2017 198 | 199 | #### Other Resources 200 | - :books: [The "Bible" of Reinforcement Learning: Chapters 5 and 6](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2HRSSmh?tag=andreaaffilia-20) - Sutton & Barto 201 | - :tv: [Deep Reinforcement Learning in the Enterprise: Bridging the Gap from Games to Industry](https://www.youtube.com/watch?v=GOsUHlr4DKE) 202 | 203 |
204 | 205 | ## Week 4 - Policy gradient algorithms - REINFORCE & A2C 206 | 207 | Week 4 introduce Policy Gradient methods, a class of algorithms that optimize directly the policy. Also, you'll learn about Actor-Critic algorithms. These algorithms combine both policy gradient (the actor) and value function (the critic). 208 | 209 | ## 210 | 211 | ### Lectures - Theory drawing 212 | 213 | * **[Policy gradient Methods](https://www.youtube.com/watch?v=KHZVXao4qXs&list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ&index=7) - David Silver (DeepMind)** 214 | - Finite Difference Policy Gradient 215 | - Monte-Carlo Policy Gradient 216 | - Actor-Critic Policy Gradient 217 | 218 | - **[Policy gradient intro](https://www.youtube.com/watch?v=XGmd3wcyDg8&t=0s&list=PLkFD6_40KJIxJMR-j5A1mkxK26gh_qg37&index=3) - Sergey Levine (RECAP, optional)** 219 | - Policy Gradient (REINFORCE and Vanilla PG) 220 | - Variance reduction 221 | 222 | * **[Actor-Critic](https://www.youtube.com/watch?v=Tol_jw5hWnI&list=PLkFD6_40KJIxJMR-j5A1mkxK26gh_qg37&index=4) - Sergey Levine (More in depth)** 223 | - Actor-Critic 224 | - Discout factor 225 | - Actor-Critic algorithm design (batch mode or online) 226 | - state-dependent baseline 227 | 228 | ## 229 | 230 | ### Project of the Week - [**Vanilla PG and A2C**](Week4) drawing 231 | 232 | [**Vanilla PG and A2C applied to CartPole**](Week4) - The exercise of this week is to implement a policy gradient method or a more sophisticated actor-critic. In the repository you can find an implemented version of [PG and A2C](Week4). Bug Alert! Pay attention that A2C give me strange result. 233 | If you find the implementation of PG and A2C easy, you can try with the [asynchronous version of A2C (A3C)](https://arxiv.org/pdf/1602.01783.pdf). 234 | 235 | ## 236 | 237 | #### Papers 238 | 239 | - [Policy Gradient methods for reinforcement learning with function approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) 240 | - [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf) 241 | 242 | #### Other Resources 243 | - :books: [The "Bible" of Reinforcement Learning: Chapters 9 and 10](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2HRSSmh?tag=andreaaffilia-20) - Sutton & Barto 244 | - :books: [Intuitive RL: Intro to Advantage-Actor-Critic (A2C)](https://hackernoon.com/intuitive-rl-intro-to-advantage-actor-critic-a2c-4ff545978752) 245 | - :books: [Asynchronous Actor-Critic Agents (A3C)](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2) 246 | 247 |
248 | 249 | ## Week 5 - Advanced Policy Gradients - PPO 250 | 251 | This week is about advanced policy gradient methods that improve the stability and the convergence of the "Vanilla" policy gradient methods. You'll learn and implement PPO, a RL algorithm developed by OpenAI and adopted in [OpenAI Five](https://blog.openai.com/openai-five/). 252 | 253 | ## 254 | 255 | ### Lectures - Theory drawing 256 | 257 | - **[Advanced policy gradients](https://www.youtube.com/watch?v=ycCtmp4hcUs&t=0s&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&index=15) - Sergey Levine (UC Berkley)** 258 | - Problems with "Vanilla" Policy Gradient Methods 259 | - Policy Performance Bounds 260 | - Monotonic Improvement Theory 261 | - Algorithms: NPO, TRPO, PPO 262 | 263 | * **[Natural Policy Gradients, TRPO, PPO](https://www.youtube.com/watch?v=xvRrgxcpaHY) - John Schulman (Berkey DRL Bootcamp)** - (RECAP, optional) 264 | * Limitations of "Vanilla" Policy Gradient Methods 265 | * Natural Policy Gradient 266 | * Trust Region Policy Optimization, TRPO 267 | * Proximal Policy Optimization, PPO 268 | 269 | ## 270 | 271 | ### Project of the Week - [**PPO**](Week5) drawing 272 | 273 | drawing 274 | 275 | [**PPO applied to BipedalWalker**](Week5) - This week, you have to implement PPO or TRPO. I suggest PPO given its simplicity (compared to TRPO). In the project folder Week5 you find an implementation of [**PPO that learn to play BipedalWalker**](Week5). 276 | Furthermore, in the folder you can find other resources that will help you in the development of the project. Have fun! 277 | 278 |
279 | 280 | To learn more about PPO read the [paper](https://arxiv.org/pdf/1707.06347.pdf) and take a look at the [Arxiv Insights's video](https://www.youtube.com/watch?v=5P7I-xPq8u8) 281 | 282 | ## 283 | 284 | #### Papers 285 | 286 | - [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf) - 2015 287 | - [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf) - 2017 288 | 289 | #### Other Resources 290 | - :books: To better understand PPO and TRPO: [The Pursuit of (Robotic) Happiness](https://towardsdatascience.com/the-pursuit-of-robotic-happiness-how-trpo-and-ppo-stabilize-policy-gradient-methods-545784094e3b) 291 | - :tv: [Nuts and Bolts of Deep RL](https://www.youtube.com/watch?v=8EcdaCk9KaQ&) 292 | - :books: PPO best practice: [Training with Proximal Policy Optimization](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-PPO.md) 293 | - :tv: [Explanation of the PPO algorithm by Arxiv Insights](https://www.youtube.com/watch?v=5P7I-xPq8u8) 294 | 295 |
296 | 297 | ## Week 6 - Evolution Strategies and Genetic Algorithms - ES 298 | 299 | In the last year, Evolution strategies (ES) and Genetic Algorithms (GA) has been shown to achieve comparable results to RL methods. They are derivate-free black-box algorithms that require more data than RL to learn but are able to scale up across thousands of CPUs. This week we'll look at this black-box algorithms. 300 | 301 | ## 302 | 303 | ### Lectures & Articles - Theory drawing 304 | 305 | - **Evolution Strategies** 306 | - [Intro to ES: A Visual Guide to Evolution Strategies](http://blog.otoro.net/2017/10/29/visual-evolution-strategies/) 307 | - [ES for RL: Evolving Stable Strategies](http://blog.otoro.net/2017/11/12/evolving-stable-strategies/) 308 | - [Derivative-free Methods - Lecture](https://www.youtube.com/watch?v=SQtOI9jsrJ0&feature=youtu.be) 309 | - [Evolution Strategies (paper discussion)](https://blog.openai.com/evolution-strategies/) 310 | - **Genetic Algorithms** 311 | - [Introduction to Genetic Algorithms — Including Example Code](https://towardsdatascience.com/introduction-to-genetic-algorithms-including-example-code-e396e98d8bf3) 312 | 313 | ## 314 | 315 | ### Project of the Week - [**ES**](Week6) drawing 316 | 317 | drawing 318 | 319 | [**Evolution Strategies applied to LunarLander**](Week6) - This week the project is to implement a ES or GA. 320 | In the [**Week6 folder**](Week6) you can find a basic implementation of the paper [Evolution Strategies as a 321 | Scalable Alternative to Reinforcement Learning](https://arxiv.org/pdf/1703.03864.pdf) to solve LunarLanderContinuous. You can modify it to play more difficult environments or add your ideas. 322 | 323 |
324 | 325 | ## 326 | 327 | #### Papers 328 | 329 | - [Deep Neuroevolution: Genetic Algorithms are a Competitive Alternative for Training Deep Neural Networks for Reinforcement Learning](https://arxiv.org/pdf/1712.06567.pdf) 330 | - [Evolution Strategies as a Scalable Alternative to Reinforcement Learning](https://arxiv.org/pdf/1703.03864.pdf) 331 | 332 | #### Other Resources 333 | - :books: [Evolutionary Optimization Algorithms](https://assoc-redirect.amazon.com/g/r/https://amzn.to/34EphXc?tag=andreaaffilia-20) - Dan Simon 334 | 335 |
336 | 337 | ## Week 7 - Model-Based reinforcement learning - MB-MF 338 | 339 | The algorithms studied up to now are model-free, meaning that they only choose the better action given a state. These algorithms achieve very good performance but require a lot of training data. Instead, model-based algorithms, learn the environment and plan the next actions accordingly to the model learned. These methods are more sample efficient than model-free but overall achieve worst performance. In this week you'll learn the theory behind these methods and implement one of the last algorithms. 340 | 341 | ## 342 | 343 | ### Lectures - Theory drawing 344 | 345 | - **Model-Based RL, David Silver (DeepMind) (concise version)** 346 | - [Integrating Learning and Planning](https://www.youtube.com/watch?v=ItMutbeOHtc&index=8&list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ) 347 | - Model-Based RL Overview 348 | - Integrated architectures 349 | - Simulation-Based search 350 | - **Model-Based RL, Sergey Levine (UC Berkley) (in depth version)** 351 | - [Learning dynamical systems from data](https://www.youtube.com/watch?v=yap_g0d7iBQ&index=9&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3) 352 | - Overview of model-based RL 353 | - Global and local models 354 | - Learning with local models and trust regions 355 | - [Learning policies by imitating optimal controllers](https://www.youtube.com/watch?v=AwdauFLan7M&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&index=10) 356 | - Backpropagation into a policy with learned models 357 | - Guided policy search algorithm 358 | - Imitating optimal control with DAgger 359 | - [Advanced model learning and images](https://www.youtube.com/watch?v=vRkIwM4GktE&index=11&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3) 360 | - Models in latent space 361 | - Models directly in image space 362 | - Inverse models 363 | 364 | 365 | ## 366 | 367 | ### Project of the Week - [**MB-MF**](Week7) drawing 368 | 369 | drawing 370 | 371 | [**MB-MF applied to RoboschoolAnt**](Week7) - This week I chose to implement the model-based algorithm described in this [paper](https://arxiv.org/pdf/1708.02596.pdf). 372 | You can find my implementation [here](Week7). 373 | NB: Instead of implementing it on Mujoco as in the paper, I used [RoboSchool](https://github.com/openai/roboschool), an open-source simulator for robot, integrated with OpenAI Gym. 374 | 375 |
376 | 377 | ## 378 | 379 | #### Papers 380 | 381 | - [Imagination-Augmented Agents for Deep Reinforcement Learning - 2017](https://arxiv.org/pdf/1707.06203.pdf) 382 | - [Reinforcement learning with unsupervised auxiliary tasks - 2016](https://arxiv.org/pdf/1611.05397.pdf) 383 | - [Neural Network Dynamics for Model-Based Deep Reinforcement Learning with Model-Free Fine-Tuning - 2018](https://arxiv.org/pdf/1708.02596.pdf) 384 | 385 | #### Other Resources 386 | - :books: [The "Bible" of Reinforcement Learning: Chapter 8](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2HRSSmh?tag=andreaaffilia-20) - Sutton & Barto 387 | - :books: [World Models - Can agents learn inside of their own dreams?](https://worldmodels.github.io/) 388 | 389 |
390 | 391 | ## Week 8 - Advanced Concepts and Project Of Your Choice 392 | 393 | This last week is about advanced RL concepts and a project of your choice. 394 | 395 | ## 396 | 397 | ### Lectures - Theory drawing 398 | 399 | - Sergey Levine (Berkley) 400 | - [Connection between inference and control](https://www.youtube.com/watch?v=iOYiPhu5GEk&index=13&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&t=0s) 401 | - [Inverse reinforcement learning](https://www.youtube.com/watch?v=-3BcZwgmZLk&index=14&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&t=0s) 402 | - [Exploration (part 1)](https://www.youtube.com/watch?v=npi6B4VQ-7s&index=16&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&t=0s) 403 | - [Exploration (part 2) and transfer learning](https://www.youtube.com/watch?v=0WbVUvKJpg4&index=17&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&t=0s) 404 | - [Multi-task learning and transfer](https://www.youtube.com/watch?v=UqSx23W9RYE&index=18&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&t=0s) 405 | - [Meta-learning and parallelism](https://www.youtube.com/watch?v=Xe9bktyYB34&index=18&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3) 406 | - [Advanced imitation learning and open problems](https://www.youtube.com/watch?v=mc-DtbhhiKA&index=20&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&t=0s) 407 | - David Silver (DeepMind) 408 | - [Classic Games](https://www.youtube.com/watch?v=N1LKLc6ufGY&feature=youtu.be) 409 | 410 | 411 | ## 412 | 413 | ### The final project 414 | Here you can find some project ideas. 415 | - [Pommerman](https://www.pommerman.com/) (Multiplayer) 416 | - [AI for Prosthetics Challenge](https://www.crowdai.org/challenges/nips-2018-ai-for-prosthetics-challenge) (Challenge) 417 | - [Word Models](https://worldmodels.github.io/) (Paper implementation) 418 | - [Request for research OpenAI](https://blog.openai.com/requests-for-research-2/) (Research) 419 | - [Retro Contest](https://blog.openai.com/retro-contest/) (Transfer learning) 420 | 421 | ## 422 | 423 | #### Other Resources 424 | * AlphaGo Zero 425 | - [Paper](https://www.nature.com/articles/nature24270.epdf?author_access_token=VJXbVjaSHxFoctQQ4p2k4tRgN0jAjWel9jnR3ZoTv0PVW4gB86EEpGqTRDtpIz-2rmo8-KG06gqVobU5NSCFeHILHcVFUeMsbvwS-lxjqQGg98faovwjxeTUgZAUMnRQ) 426 | - DeepMind blog post: [AlphaGo Zero: Learning from scratch](https://deepmind.com/blog/alphago-zero-learning-scratch/) 427 | - Arxiv Insights video: [How AlphaGo Zero works - Google DeepMind](https://www.youtube.com/watch?v=MgowR4pq3e8) 428 | * OpenAI Five 429 | - OpenAI blog post: [OpenAI Five](https://blog.openai.com/openai-five/) 430 | - Arxiv Insights video: [OpenAI Five: Facing Human Pro's in Dota II](https://www.youtube.com/watch?v=0eO2TSVVP1Y) 431 | 432 |
433 | 434 | ## Last 4 days - Review + Sharing 435 | 436 | Congratulation for completing the 60 Days RL Challenge!! Let me know if you enjoyed it and share it! 437 | 438 | See you! 439 | 440 | ## Best resources 441 | 442 | :books: [Reinforcement Learning: An Introduction](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2HRSSmh?tag=andreaaffilia-20) - by Sutton & Barto. The "Bible" of reinforcement learning. [Here](https://drive.google.com/file/d/1opPSz5AZ_kVa1uWOdOiveNiBFiEOHjkG/view) you can find the PDF draft of the second version. 443 | 444 | :books: [Deep Reinforcement Learning Hands-On](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2PRxKD7?tag=andreaaffilia-20) - by Maxim Lapan 445 | 446 | :books: [Deep Learning](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2N3AIlp?tag=andreaaffilia-20) - Ian Goodfellow 447 | 448 | :tv: [Deep Reinforcement Learning](https://www.youtube.com/playlist?list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3) - UC Berkeley class by Levine, check [here](http://rail.eecs.berkeley.edu/deeprlcourse/) their site. 449 | 450 | :tv: [Reinforcement Learning course](https://www.youtube.com/watch?v=2pWv7GOvuf0&list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ) - by David Silver, DeepMind. Great introductory lectures by Silver, a lead researcher on AlphaGo. They follow the book Reinforcement Learning by Sutton & Barto. 451 | 452 | 453 | 454 | ## Additional resources 455 | 456 | :books: [Awesome Reinforcement Learning](https://github.com/aikorea/awesome-rl). A curated list of resources dedicated to reinforcement learning 457 | 458 | :books: [GroundAI on RL](https://www.groundai.com/?text=reinforcement+learning). Papers on reinforcement learning 459 | 460 | 461 | ## A cup of Coffe :coffee: 462 | 463 | Any contribution is higly appreciated! Cheers! 464 | 465 | [![paypal](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=NKSNP93CNY4KN) 466 | -------------------------------------------------------------------------------- /Week2/frozenlake_Qlearning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Q-learning applied to FrozenLake " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### **Remember**: Q-learning is a model free, off-policy algorithm that can be used to find an optimal action using a Q function. 
Q can be represented as a table that contains a value for each state-action pair\n", 15 | " \n", 16 | "To review Q-learning, watch [Q learning explained by Siraj](https://www.youtube.com/watch?v=aCEvtRtNO-M)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "#### The Q-learning pipeline is quite easy and can be summarised in 5 blocks:\n", 24 | "\n", 25 | "![as](img/short_diag.jpg)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## WHAT'S THE ENVIRONMENT?\n", 33 | "\n", 34 | "#### We'll apply Q-learning to a [Gym](http://gym.openai.com/) game called [FrozenLake](https://gym.openai.com/envs/FrozenLake-v0/)\n", 35 | "\n", 36 | "![](img/frozenlake_v0.png)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## LET'S START TO CODE" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 1, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import gym\n", 53 | "import random\n", 54 | "from collections import namedtuple\n", 55 | "import collections\n", 56 | "import numpy as np\n", 57 | "import matplotlib.pyplot as plt\n", 58 | "%matplotlib inline " 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "### BASIC FUNCTIONS TO CHOOSE AN ACTION FOLLOWING DIFFERENT POLICIES" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 2, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "def select_eps_greedy_action(table, obs, n_actions):\n", 75 | " '''\n", 76 | " Select the action using an ε-greedy policy (with probability ε, pick a random action instead of the greedy one)\n", 77 | " '''\n", 78 | " value, action = best_action_value(table, obs)\n", 79 | "\n", 80 | " if random.random() < epsilon:\n", 81 | " return random.randint(0,n_actions-1)\n", 82 | " else:\n", 83 | " return action\n", 84 | "\n", 85 | "def select_greedy_action(table, obs, n_actions):\n", 86 | " '''\n", 87 | " Select the action using a greedy policy (take the best action according to the Q-table)\n", 88 | " '''\n", 89 | " value, action = best_action_value(table, obs)\n", 90 | " return action\n", 91 | "\n", 92 | "\n", 93 | "def best_action_value(table, state):\n", 94 | " '''\n", 95 | " Exploring the table, take the best action that maximizes Q(s,a)\n", 96 | " '''\n", 97 | " best_action = 0\n", 98 | " max_value = 0\n", 99 | " for action in range(n_actions):\n", 100 | " if table[(state, action)] > max_value:\n", 101 | " best_action = action\n", 102 | " max_value = table[(state, action)]\n", 103 | "\n", 104 | " return max_value, best_action" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "![](img/Q_function.png)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 3, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "def Q_learning(table, obs0, obs1, reward, action):\n", 121 | " '''\n", 122 | " Q-learning. 
Update Q(obs0,action) according to Q(obs1,*) and the reward just obtained\n", 123 | " '''\n", 124 | " \n", 125 | " # Take the best value reachable from the state obs1\n", 126 | " best_value, _ = best_action_value(table, obs1)\n", 127 | "\n", 128 | " # Calculate Q-target value \n", 129 | " Q_target = reward + GAMMA * best_value\n", 130 | "\n", 131 | " # Calculate the Q-error between the target and the previous value\n", 132 | " Q_error = Q_target - table[(obs0, action)]\n", 133 | "\n", 134 | " # Update Q(obs0,action)\n", 135 | " table[(obs0, action)] += LEARNING_RATE * Q_error" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "### TEST THE POLICY" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 4, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "def test_game(env, table):\n", 152 | " '''\n", 153 | " Test the new table playing TEST_EPISODES games\n", 154 | " '''\n", 155 | " \n", 156 | " n_actions = env.action_space.n\n", 157 | " \n", 158 | " reward_games = []\n", 159 | " for _ in range(TEST_EPISODES):\n", 160 | " obs = env.reset()\n", 161 | " rewards = 0\n", 162 | " while True:\n", 163 | " # Act greedly \n", 164 | " next_obs, reward, done, _ = env.step(select_greedy_action(table, obs, n_actions))\n", 165 | " obs = next_obs\n", 166 | " rewards += reward\n", 167 | "\n", 168 | " if done:\n", 169 | " reward_games.append(rewards)\n", 170 | " break\n", 171 | "\n", 172 | " return np.mean(reward_games)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### MAIN PROCEDURE" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 5, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# Some hyperparameters..\n", 189 | "GAMMA = 0.95\n", 190 | "\n", 191 | "# NB: the decay rate allow to regulate the Exploration - Exploitation trade-off\n", 192 | "# start with a EPSILON of 1 and decay until reach 0\n", 193 | "EPS_DECAY_RATE = 0.9993\n", 194 | "\n", 195 | "LEARNING_RATE = 0.8\n", 196 | "\n", 197 | "# .. 
and constants\n", 198 | "TEST_EPISODES = 100\n", 199 | "MAX_GAMES = 15000" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 8, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "\tEp: 999 Test reward: 0.3 0.5\n", 212 | "\tEp: 1999 Test reward: 0.56 0.25\n", 213 | "\tEp: 2999 Test reward: 0.71 0.12\n", 214 | "\tEp: 3999 Test reward: 0.7 0.06\n", 215 | "\tEp: 4999 Test reward: 0.19 0.03\n", 216 | "\tEp: 5999 Test reward: 0.0 0.01\n", 217 | "\tEp: 6999 Test reward: 0.78 0.01\n", 218 | "\tEp: 7999 Test reward: 0.74 0.0\n", 219 | "\tEp: 8999 Test reward: 0.8 0.0\n", 220 | "\tEp: 9999 Test reward: 0.77 0.0\n", 221 | "\tEp: 10999 Test reward: 0.77 0.0\n", 222 | "\tEp: 11999 Test reward: 0.74 0.0\n", 223 | "\tEp: 12999 Test reward: 0.7 0.0\n", 224 | "\tEp: 13999 Test reward: 0.75 0.0\n", 225 | "\tEp: 14999 Test reward: 0.75 0.0\n" 226 | ] 227 | }, 228 | { 229 | "data": { 230 | "image/png": "iVBORw0KGgoAAAANSUhEUgAABB8AAAIWCAYAAAAf5yVaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nOzdeXzdZZ3+/+vOvicnW7OfLN2XlDYB2kLZlyIt6qgIgqKjuI77jKOzqOP8dGZ+LiPO4Ci4gIIi7rSyg0ihBZp0SRfa0p4sbZM0Odlzsp5z7u8fSWMpKU3bnHxOktfz8eiDLIfkKiRtcuW+329jrRUAAAAAAECoRDgdAAAAAAAAzGyUDwAAAAAAIKQoHwAAAAAAQEhRPgAAAAAAgJCifAAAAAAAACFF+QAAAAAAAEIqyukAZyszM9MWFxc7HQMAAAAAAJykurraa63NGu910658KC4uVlVVldMxAAAAAADASYwx9ad7HdcuAAAAAABASFE+AAAAAACAkKJ8AAAAAAAAIUX5AAAAAAAAQoryAQAAAAAAhBTlAwAAAAAACCnKBwAAAAAAEFKUDwAAAAAAIKQoHwAAAAAAQEhRPgAAAAAAgJCifAAAAAAAACFF+QAAAAAAAEKK8gEAAAAAAIQU5QMAAAAAAAgpygcAAAAAABBSlA8AAAAAACCkQlo+GGPWGWMOGGMOGWO+OM7ri4wxfzbG7DDG1Bhj3hLKPAAAAAAAYOqFrHwwxkRKulvSDZIWS7rVGLP4lIf9i6SHrbUrJN0i6fuhygMAAAAAAJwRypMPF0k6ZK31WGuHJD0k6a2nPMZKShl9OlVSYwjzAAAAAAAAB0SF8G3nSzpy0vNHJV18ymO+KulJY8wnJSVKuiaEeQAAAMJGW++gfvi8R4/sbNS8OUlaU5ap1WUZWpqXoqhIxnIBAGaWUJYPZpyX2VOev1XSfdbabxtjVkv6uTFmqbU2+Lo3ZMyHJX1YkoqKikISFgAAYCp09g3p3s0e3fdinfqGA7pyQbaOdvTpvx7fL0lKjo3SRSXpWl2WodVlGVqUk6KIiPG+rAIAYPoIZflwVFLhSc8X6I3XKj4oaZ0kWWu3GmPiJGVKajn5QdbaeyTdI0mVlZWnFhgAAABhr3tgWD95oVY/3lyrnkG/bizP1Wevmae52cmSpNaeQb3kadNWT5u2Hm7TM/tHvhxKS4jWqpKRImJNWYbmZifJGMoIAMD0EsryYZukecaYEknHNDJQ8j2nPKZB0tWS7jPGLJIUJ6k1hJkAAACmlG/Qr/u21Ome5z3q6h/WdYvn6LPXztei3JTXPS4rOVYbludpw/I8SVJTV7+2Hh4pIrYcbtPje5slSZlJsSOnIkpHygh3RgJlBAAg7BlrQ3eQYHR15nclRUr6ibX268aYr0mqstY+Mrr94l5JSRq5kvEFa+2Tb/Y2KysrbVVVVcgyAwAATIb+oYAeeKleP/jLYbX5hnTlgix97toFWlaQek5v70h732gR4dVWT5uOdw9KknJT48bKiNVlGSpwJUzmbwMAgAkzxlRbayvHfV0oy4dQoHwAAADhbGA4oIdeadDdzx1Wa8+g1s7L1Geuma8Kt2vS3oe1Vh6vb+xkxFZPm9p9Q5KkovSEkVMRc0cKieyUuEl7vwAAvBnKBwAAgBAb8gf16+oj+t9nD6mpa0AXlaTr89fO18WlGSF/38Gg1cGWnrErGi972tQ94JcklWUljs6LyNSq0gylJ8aEPA8AYHaifAAAAAgRfyCo3+04pu8985qOdvRrZVGaPn/dAq0py3BsFkMgaLWvsVtbPV5tOdymbbXt8g0FJEkLc5LHrmlcXJqh1PhoRzICAGYeygcAAIBJFghaPbLrmO56+jXVtfVpWX6qPnfdfF0xPyvsBkAOB4KqOdo1sk3jcJu21bVr0B9UhJGW5KVqTVmGVpVl6KLidCXGhnIeOQBgJqN8AAAAmCTBoNWje5r03adf06GWXi3MSdbnrp2vaxfPCbvS4XQG/QHtbOjUltF5ETsaOjQcsIqKMCovSNWaskytLstQhduluOhIp+MCAKYJygcAAIDzZK3Vk/uO67+fOqj9zT2am52kz14zXzcszVFExPQoHU6nfyig6vqOsU0aNUe7FAhaxURGaEVR2lgZcUFhmmKiIpyOCwAIU5QPAACEmZ9vrdOz+1u0bmmOrl+So7QEhgCGK2utnjvQqu88dVC7j3WpJDNRn756njYsz1PkNC8dTqdnYFhVdX8tI/Y2dstaKT46UpXFrrGZEcvyUxUVSRkBABhB+QAAQJjZ8D8vaE9jl6yVoiON1s7L0vryXF27eI6S4xgAGA6stXrxUJu+/dQB7WjoVIErXp+6ep7+ZkX+rPuGu7NvSC/Xto+t9jxwvEeSlBQbpYtK0rW6NEOryzK0ODdl2p8CAQCcuzcrH5goBADAFLPWqtbr03tXufXOigJtqmnSpl2NenZ/i2KiInTlgiytL8/T1YuylRDDX9VOeNnTpm8/dVCv1LYrNzVOX3/7Ur2ronDWXjlIS4jR9Ut
GTulIkrd3cGx45dbDbXp2f4skKTU+WqtKR8qINXMzNS87adrMwQAAhBZf0QAAMMVaewbVO+hXaWaiygvSVF6Qpi+uW6gdRzq0cVeTHt3dpCf2Hld8dKSuXpSt9eV5umJBFoP/psD2hg5958mDeuGQV1nJsfrqhsW65aIi/tufIjMpVuvL87S+PE+S1Nw1oK0er7YebtOWw216Yu/x0cfFaNXoqYg1ZZkqzkigjACAWYprFwAATLGXPG265Z6X9LO/vUiXzc96w+sDQatXatu1qaZRj+1pVrtvSEmxUbpu8RytX56rS+dmzdqfwIfK7qNd+s5TB/TnA63KSIzRx64o020XuxUfQ+lwLo60942civC0acthr453D0qSclLixtZ6rinLUIErweGkAIDJxMwHAADCyC9ebtA//X63Nn/hShWmv/k3X/5AUFsOt2lTTaMe39Os7gG/UuOjdf2SOdqwPE+rSzNm3fyByfRqU7e+89RBPbXvuFLjo/WRy0t1x+piJcZyOHSynLhmdGKt50uH29TmG5IkFabHj82LyEiMdTip8/LS4lWWlcjpEADTFuUDAABh5Ot/2qf7t9Zr/9fWndVwviF/UJtfa9WmmiY9te+4egf9ykiM0bqlOdqwPE8XFqfP2O0Lk+214z367tOv6U+7m5QcG6UPrS3V315azLDPKWCt1cHjvdp62Ksth9v0kqdN3QN+p2OFjezk2LFtImvKMlWYHk8ZAWDaoHwAACCMfPC+bTra0a8nPnvZOb+NgeGAnjvQqo01jXrm1eMaGA4qOzlWb1mWqw3L87SyKI1vWMZR6/XprqcP6o+7GpUQHakPXFKiO9eWKjWB0sEpgaDVay098g3O7gLCWum1lt6xuRne3pGrKvlp8WNlxOqyDOWlxTucFABOj/IBAIAwctW3ntOCnGT93+0Vk/L2+ob8eubVFm3c1ajnDrZqyB9Uflq8bizP1fryXC3LT531RcSR9j5975nX9LsdxxQdaXTH6mJ95PIypSfGOB0NeANrrQ639o5cVRm9rtLZNyxJKs5I0OqyzLFCIiuZ6yoAwgflAwAAYWI4ENSif31cH76sVF9Yt3DS337PwLCe2ndcG3c1avNrXvmDVu6MBK0vz9X68jwtzEmeVUVEY2e//vfPh/TwtiOKiDC67eIifeyKMmUnxzkdDZiwYNBqf3OPthz26iVPm172tKtn9KTIvOwkrSkbORVxcUmGXBRqABxE+QAAQJjwtPbqqm//Rd9613K9s6IgpO+rs29IT+xt1qaaJr14yKugleZmJ40VEXOzk0L6/p3U0j2g7z93WL94uUFWVu++sFCfuHKuclM5so7pzx8Iam9j9+g2kTZtq21X/3BAxkiLclLGyoiLStKZYwJgSlE+AAAQJp7ed1wf+lmVfvuxNapwu6bs/Xp7B/XYnmZt2tWoV+raZa20KDdF68tztaE8T0UZM2PlYVvvoH7wl8P62dZ6+YNW71xZoE9ePZeVjpjRhvxB1RztHJsXUd3QoSF/UJERRkvzU0eHV2aostilhBg2uQAIHcoHAADCxL3Pe/T1R1/Vzi9fq7QEZ45HH+8e0J9qmrSpplHbGzolSeUFqdpQnqcby3On5UC7zr4h3fO8R/dtqdPAcEBvW5GvT101T8WZiU5HA6bcwHBA2xs69NJoGbHzSKf8QavoSKMLCtNGh1dmakVRmuKiI52OC2AGoXwAACBMfOl3NXp8T7N2fPk6p6NIko529I0WEU3afaxLklThdmlDea7esixX2SnhPRuhe2BYP95cqx+/UCvfkF/ry/P06avnzegrJcDZ8g36VVXfMTK88rBXu491KWil2KgIVbhdIycj5maovCBN0ZERTscFMI1RPgAAECbe/cOt8getfvuxNU5HeYM6r0+bahq1qaZJ+5t7ZIx0cUm6NizP0w1Lc8NqM0TvoF/3vVire573qHvAr3VLcvSZa+dpYU6K09GAsNc9MKxtte1j2zT2NXVLkhJiInVhcbpWl41c01iSl6rIiNkzoBbA+aN8AAAgTFz49ad1+fwsfetdy52O8qYOtfRo464mbaxplKfVp8gIozVlGdpQnqfrl+QoNcGZIXb9QwH9bGudfvi8R+2+IV29MFufvXa+luanOpIHmAk6fEN6ubZtrIx4raVXkpQcF6WLSzLG1nouzElWBGUEgDdB+QAAQBjoGRjWsq8+qS+sW6CPXzHX6TgTYq3Vq0092ljTqE01jTrS3q/oSKPL5mVp/fJcXbNozpRM0x8YDugXLzfo+88dlrd3UGvnZepz187XiqKpG9oJzBYtPQN6ydOurYe92nq4TXVtfZIkV0K0Vo0Or1xdlqGyrKRZtboXU6epq199QwGnYzguJS5aWcmxTsc4K29WPjDuFgCAKVLr9UmSSqfREERjjBbnpWhxXoq+cP0C1RztGrua8cz+FsVEReiqBdlavzxXVy+co/iYyR1eN+QP6ldVR3T3s4fU3D2gVaXp+r/bV+rC4vRJfT8A/io7OU43Lc/TTcvzJEmNnf1jmzRe8rTpsT3NkqSs5NixTRqryzJUlJ5AGYFz5g8E9eS+47pvS51eqW13Ok5YeP+aYn31piVOx5g0lA8AAEyRsfIha3oOQzTGaHlhmpYXpulLNyzS9oYObapp0p92N+nxvc1KiInU1YvmaH15ri6fn3VeU/SHA0H9bvtRfe+ZQzrW2a8Kt0vfuXm51szNnMTfEYCJyEuL1zsqCvSOigJZa9XQ3jcyvNIzUkg8sqtRkpSfFv+6kxHTcXMOpl5b76Ae2nZED7xUr6auARW44vWFdQuUz8ePSjOn59cLp8O1CwAApsh3njqo/3n2Nb36tXUzar1dIGj1cm2bNtU06bHdTeroG1ZybJSuXTJHG8rzdMncTMVETWyCfiBo9cedx3TXM6+pvq1P5QWp+ty183X5/Cx+ogqEIWutDrf6Rq5oeEZmRnT0DUuSijMSRuZFlGVqVWm6spPDe3sOptbuo126b0udNtY0asgf1CVzM3TH6mJdvWgOg06nMWY+AAAQBj75yx3aeaRDm79wldNRQmY4ENSWw23atKtRT+xtVveAX6nx0Vq3JEcbludpVWm6osZZ5RcMWv1pd5O++/RBHW71aVFuij537Xxdsyib0gGYRoJBqwPHe8aGV75c26aeAb8kaW520sipiNIMrSrNkCuMNuhgagz5g3psT5Pu31Kn7Q2dSoiJ1N+szNcdq4s1b06y0/EwCSgfAAAIAzd+b7MykmL1s7+9yOkoU2LQH9Dmg15tqmnUU/uOyzcUUEZijG5YlqMN5Xm6sDhdxkhP7D2u7z59UPubezR/TpI+e818Xb8kh6n6wAwQCFrtbewaKyO21bWrbyggY6SFOSljZcRFpelKmYLhtXBGS8+AfvnyET34cr1aegblzkjQ+1YX650VBUqN5//7TEL5AACAw6y1WvKVJ3RzZeGMGh41UQPDAT13oEUbdzXpmf3HNTAc1JyUWLkSYrS/uUelmYn69DXztL48j+O2wAw2HAiq5mjn2ADL6voODfqDijDSsvxUrZ
2XpRvLc7UwJ5lTTzPAjoYO3b+lTn/a3aThgNXl87P0/jXFunx+FgXzDEX5AACAw5q7BrTqP57R1966RO9bXex0HEf5Bv16Zn+LNu5qVFNXv96/pkRvuyBv3OsYAGa2geGAdjR0js6L8Gp7Q6cCQauyrEStL8/ThuW5mpvNcfzpZNAf0J9qRq5W7DrapaTYKL2zokDvXe1W2TQduIyJo3wAAMBhWw579Z57X9YDH7xYl85jYwMAjKetd1CP7WnWpppGvVzbLmulhTnJ2rA8T+vLc+XOmD6rimeb5q4BPfhyvX75SoO8vUMqzUrUHauL9Y6KAiXFsmRxtniz8oGPAgAApoCndWTNZkkWXzgDwOlkJMXq9lVu3b7KrePdA3p0d5M21TTpm08c0DefOKDyglStL8/VjeV5rGIMA9ZaVdd36L4tdXp8T7MC1uqqBdm6Y02xLp2bydUKvA7lAwAAU6DW61NcdIRyU1g1BwATMSclTh+4pEQfuKRExzr79WhNkzbWNOobj+7XNx7drwq3a6SIWJarbP5snVIDwwE9sqtR92+p097GbiXHRen9a4r13tVuTqfgtLh2AQDAFPjAT19RU9eAHv/MZU5HAYBprb7Np001IyciXm3qljHSxSXpWl+epxuW5igjKdbpiDPWsc5+PfBSvR56pUEdfcOaPydJd6wp1ttX5Cshhp9rg5kPAAA47opv/llL8lJ1920rnY4CADPGoZZebapp1MZdjTrc6lNkhNGasgxtKM/T9UtylJrAGsfzZa3VS5523b+lTk/ua5YkXbNojt6/pliryzLYSoLXoXwAAMBBQ/6gFn35cX3s8jL9/fULnI4DADOOtVb7m3u0qaZRm2qaVN/Wp+hIo7XzsrS+PFfXLp6j5DiKiLPRPxTQ73cc08+21ml/c4/SEqL17gsL9d5VbhW4EpyOhzDFwEkAABzU0N6nQNCqlGGTABASxhgtyk3RotwU/f11C7TnWLc21jTqTzVNenZ/i2KiInTlgiytL8/T1YuyuSLwJo609+nnL9XrV9uOqKt/WItyU/Rf71imm5bnKz4m0ul4mMb4rAMAIMQ8rb2SpJJMygcACDVjjJYVpGpZQaq+uG6hdhzp1MZdjXp0d5Oe2Htc8dGRunpRttaX5+mKBVmKi+YbamutXjzUpvu21OmZ/ccVYYzWLcnRHWuKdWGxi6sVmBSUDwAAhFitd2TNZmlmksNJAGB2iYgwqnC7VOF26V/XL9a2unZt3NWox/Y0a1NNk5Jio3Td4jlavzxXl87NUkxUhNORp5Rv0K/fbT+q+7fW61BLrzISY/SJK+bqtlVFyk1llSkmF+UDAAAh5mn1KSMxhsFnAOCgyAijVaUZWlWaoX+7aYm2etq0aVeTHtvTpN/tOKbU+Ghdv2SONizP0+rSDEVFztwios7r08+21uvXVUfUM+jXsvxUfetdy7W+PJeTIAgZygcAAEKs1utj3gMAhJGoyAitnZeltfOy9O9vW6oXDrVq064mPbq7WQ9XHVVGYozWLc3R+vI8XVSSrsiI6X/tIBi0ev61Vt2/pU7PHWxVpDF6y7Jc3bGmWCuL0rhagZCjfAAAIMQ83l5dtTDb6RgAgHHEREXoqoVzdNXCORoYDui5A63aVNOo320/pgdfblB2cqzesixXG5bnTctv0nsGhvWb6qP62dZ61Xp9ykyK1aeumqfbLi5Sdkqc0/Ewi1A+AAAQQl39w/L2Dqk0i3kPABDu4qIjtW5pjtYtzVHfkF/P7m/Rxl2N+sUrDbpvS53y0+J1Y3mu1pfnall+algXEYdaevWzrXX6bfVR+YYCWlGUprtuuUA3LM2ddbMtEB4oHwAACKETwybZdAEA00tCTJTWl+dpfXmeegaG9fSrx7VxV5N++mKt7nneI3dGgtaX52p9eZ4W5iSHRRERCFo9d6BF922p0+bXvIqJjND68pGrFcsL05yOh1mO8gEAgBCq9Y6s2Sxj5gMATFvJcdF6+4oCvX1Fgbr6hvXE3mZtrGnUD/7i0d1/PqyyrERtWD5SVMzNnvqTbl39w/p11RH9bGu9Gtr7NCclVp+/dr5uvbhImUmxU54HGA/lAzBFfviXw/rZ1nqlxkcrIylGmUmxykiMUXpSjDITY5WRFKOM0ZdlJMUoIYZPT2Am8LT6FGGkwvQEp6MAACZBakK0br6wUDdfWKi23sHRtZ2NuuuZ1/Tdp1/Twpzk0SIiV+6M0BbPB4/36L4tdfr99mPqHw7owmKXvrBuga5fkqPoGbytA9MT390AU+DJvc36j8f2q9LtUmp8tLy+IdV6fWrrHVL/cGDcfyc+OnKskMgcLSTSE2OVmTTydMaJwiIxVumJMdzdA8KUx+tTYXqCYqNYXQYAM01GUqxuX+XW7avcOt49oEd3N2lTTZO++cQBffOJAyovSNX68lzdWJ6n/LT4SXmfgaDVU/uO6/4tddrqaVNsVITeekGe3re6WEvzUyflfQChYKy1Tmc4K5WVlbaqqsrpGMCEeVp79db/fVElWYl6+COr37A7uW/Ir7beIbX5htTWO/j6p31D8o69bFDtviENB8b/nE2Jixo5TZEUo/TEk0uL2FPKihilJcTMiJVRwHRww12bNSclVvd94CKnowAApsixzn79qaZRm2qaVHO0S5JU4XaNFBHLcs9py0SHb0gPbTuiB16q17HOfuWlxum9q4t1y4WFciXGTPZvATgnxphqa23luK8LZflgjFkn6S5JkZJ+ZK39z1Ne/9+Srhx9NkFStrX2TSehUD5gOukb8uttd7+o1p5BbfzkpSpwnd+xa2utugf8Y8XEX/858rR39GXtoy9r7xvSeJ/iEUYjBcXoqYmTr4H8taz469PJsVFhMUQJmG6CQaslX3lCt15UpC9vWOx0HACAA+rbfNpU06SNuxq1v7lHxkgXl6RrfXmebliao4wzzGTY19it+7fU6Q87j2nQH9Sq0nS9f02xrlk0R1FcrUCYebPyIWTXLowxkZLulnStpKOSthljHrHW7jvxGGvtZ096/CclrQhVHmCqWWv1xd/uHllz9LcXn3fxIEnGGKXGRys1PlqlWWd+fCBo1dH313Li9Scq/lpU7G3slrd3UD0D/nHfTkxkxDgnKl4/o+LEyYrMpNg3nO4AZqvm7gH1DwdUwrBJAJi13BmJ+sSVc/WJK+fqUEuvNtU0auOuRv3LH/boK4/s1ZqyDG0oz9P1S3KUmhAtSRoOBPXk3pGrFa/UtSsuOkJ/s7JAd6xxa2FOisO/I+DchHLmw0WSDllrPZJkjHlI0lsl7TvN42+V9JUQ5gGm1E9frNMjuxr1D9cv0KXzMh3JEBlhlJkUOzrlOPmMjx/0B8ZOTZx8DcTrG/ln++jLDrf0yts7qEF/cNy3kxgTqfTRQiLz5CsfSbEqyUzQVQvnTPLvFAhPJ9ZslrFmEwAgaW52kj5zzXx9+up52t/cM1pENOkLv63RP/9ht9bOy9LCnGT9bvsxNXcPqDA9Xv/8lkW6ubJwrJgApqtQlg/5ko6c9PxRSReP90BjjFtSiaRnT/P6D0v6s
CQVFRVNbkogBF6pbdc3Hn1V1y6eo49dXuZ0nAmLjYpUbmq8clPPPBDJWqu+ocDYPIoT/xw5UTGkdt/ICYtjnQOqOdqldt+Q/MGROyCPf2YtrT1mBU/ryJpNTj4AAE5mjNGi3BQtyk3R31+3QLuPdWlTTZM27WrUs/tbdOncTP3725bqqoXZzOnCjBHK8mG8z5LTDZi4RdJvrLXjjv231t4j6R5pZObD5MQDQqOle0Cf+MV2Fbji9e2blytihv6FYYxRYmyUEmOjVJRx5isl1lq91tKr6/77eW2rbad8wKzg8foUHx2pnHMYLAYAmB2MMSovSFN5QZq+uG6hugeGlZbAAEnMPKGcUHJUUuFJzxdIajzNY2+R9MsQZgGmxHAgqE/8Yrt6B/z64XsrlRLH8bgTjDGal52kOSmxqqrvcDoOMCU8rT6VZCYysBUAMCEREYbiATNWKMuHbZLmGWNKjDExGikYHjn1QcaYBZJckraGMAswJf7j0f3aVteh/3zHMi3IOfOMhdnGGKNKd7qq6igfMDvUen0q5coFAABA6MoHa61f0t9JekLSq5IettbuNcZ8zRhz00kPvVXSQzaUOz+BKfDHncf0kxdr9YFLivXWC/KdjhO2KtwuHevsV3PXgNNRgJAa9Ad0tKNPpQybBAAACOnMB1lrH5X06Ckv+/Ipz381lBmAqXCguUdf/O1uVbpd+qe3LHI6TlirLHZJkqrrO3Rjea7DaYDQaWjrU9BKpVlJTkcBAABwXCivXQCzQvfAsD76QLWS4qL0/dtWKjqST6s3syg3RfHRkaqqb3c6ChBSh1tH1myWcPIBAACA8gE4H8Gg1ecf3qUj7X36/m0rlc1E+zOKjozQ8sJUVTN0EjNcrXe0fGDmAwAAAOUDcD5+8PxhPbXvuP7pLYt0YXG603GmjUp3uvY2dqtvyO90FCBkPK29ykyKZesNAACAKB+Ac/bCa15964kD2rA8Tx+4pNjpONNKRbFLgaDVziOdTkcBQoZNFwAAAH9F+QCcg2Od/frkL7drbnaS/vNvlskY43SkaWVlkUvGSNWs3MQM5vH62HQBAAAwivIBOEsDwwF97IFq+QNWP7i9QomxIV0aMyOlxkdrfnayqpj7gBmqs29I7b4hTj4AAACMonwAztK/bdynmqNd+vbNy1mhdx4qil3a3tChYNA6HQWYdJ4TwyYz+TMCAABAonwAzsrD247ol6806ONXlOm6JTlOx5nWKt0u9Qz4dbClx+kowKSrHV2zyckHAACAEZQPwATtPtqlf/njHl06N1Ofv26B03GmvQq3S5JYuYkZyePtVWSEUaErwekoAAAAYYHyAZiADt+QPvpAtTITY3TXLRcoMoIBk+erKD1BmUmxDJ3EjFTr9akoPUExUfw1CwAAIElMygPOIBC0+vSvdqq1Z1C//uhqZSTFOh1pRjDGqNLtYugkZiuKvq4AACAASURBVCRPq08lbLoAAAAYw49kgDO46+mDev5gq/7trUu0vDDN6TgzSmWxSw3tfWrpGXA6CjBpgkGrWtZsAgAAvA7lA/Amnt53XN979pBurizQLRcWOh1nxhmb+8DVC8wgjV39GvQHVcKwSQAAgDGUD8Bp1Hl9+uzDO7U0P0Vfe+tSGcOch8m2JC9VsVERXL3AjFI7umazlDWbAAAAYygfgHH0DwX00QeqFRlh9H+3VSguOtLpSDNSTFSElhemUT5gRvGwZhMAAOANKB+AU1hr9U+/360Dx3t01y0rVJjOqrxQqnS7tPdYl/qHAk5HASZFrdenxJhIZScznBYAAOAEygfgFD9/qV6/33FMn7tmvi6fn+V0nBmvstglf9Bq19FOp6MAk+Jwa69KshK5qgUAAHASygfgJNX17fraxn26emG2PnHlXKfjzAori0aHTnL1AjPEyKYL5j0AAACcjPIBGNXSM6CPP7hd+a54fefdFygigp9aToW0hBjNzU6ifMCMMDAc0LHOfpWwZhMAAOB1KB8AScOBoP7uFzvU1T+sH9xeodT4aKcjzSqVbpeq6zsUDFqnowDnpb6tT9YybBIAAOBUlA+ApP//8f16pbZd//E3y7QoN8XpOLNOhdulrv5hHW7tdToKcF48ox/DXLsAAAB4PcoHzHqbahp17+Za3bHarbevKHA6zqxUWZwuSazcxLTn8Y6s2Szh5AMAAMDrUD5gVnvteI++8JsarSxK0z/fuNjpOLNWcUaCMhJjVFVH+YDpzdPqU3ZyrJJio5yOAgAAEFYoHzBr9QwM6yMPVCshJlLfv61CMVF8OjjFGKMKt0vV9e1ORwHOS623l3kPAAAA4+C7LcxK1lr9w69rVN/Wp/99z0rlpMY5HWnWqyx2qa6tT609g05HAc6Zx+tTCfMeAAAA3oDyAbPSPc979PjeZn3phoVaVZrhdBxIqnCPzH1g5Samqw7fkDr7hlXGyQcAAIA3oHzArLPlkFf/9fh+3bgsVx+8tMTpOBi1ND9FMVER2t5A+YDpyeMd3XRB+QAAAPAGlA+YVRo7+/XJX+5QaVaS/uud5TLGOB0Jo2KjIlWen6qqOuY+YHrytI5uuuDaBQAAwBtQPmDWGPQH9PEHt2vQH9QPbq9gGn0Yqih2ac+xbg0MB5yOApw1j9enqAijQle801EAAADCDuUDZo1/37RPO4906lvvKtfcbH4yGY4q3ekaCgS1+1iX01GAs1bb6lNRRoKiIvmrFQAA4FR8hYRZ4TfVR/XASw36yOWlWrc01+k4OI0Kt0uSVFXH3AdMPx5vr0q5cgEAADAuygfMeHuOdemff79bq0sz9A/XLXA6Dt5EemKMSrMSVV3P3AdML4GgVV1bH8MmAQAAToPyATNaZ9+QPvZgtVwJMfqf96zgOPQ0UOl2qbq+Q9Zap6MAE9bY2a8hf1ClmZQPAAAA4+E7McxYwaDVZ361U81dA/r+7SuVmRTrdCRMQKU7XR19wzo8ujkAmA483hObLigfAAAAxkP5gBnre8++pucOtOorG5ZoZZHL6TiYoIrikf9XXL3AdOJp7ZUklWYx8wEAAGA8lA+Ykf68v0V3PfOa3rGyQLddXOR0HJyF0sxEuRKiGTqJaaXW61NybJQyk2KcjgIAABCWKB8w4zS09enTD+3QopwUff3tS2WMcToSzoIxRhVul6obKB8wfXhafSrNSuTPGwAAgNOgfMCM0j8U0EceqJYxRj+4vUJx0ZFOR8I5qHCny9PqU7tvyOkowITUen3MewAAAHgTlA+YMay1+uc/7Nb+5m5995YLVJSR4HQknKPKsbkPnH5A+OsfCuhYZz/zHgAAAN4E5QNmjAdfbtDvth/Tp6+epysXZDsdB+dhWX6qYiIjVMXQSUwDdW1sugAAADgTygfMCNsbOvRvG/fqygVZ+tRV85yOg/MUFx2ppfkpqmboJKYBz+ha2NIsygcAAIDToXzAtOftHdTHH9iunNQ4/fe7L1BEBAPfZoLK4nTVHOvSoD/gdBTgTdV6R9ZscvIBAADg9CgfMK35A0F98hc71NE3pP+7rUJpCay5mykq3C4N+YPac6zL6SjA
m/K0+pSbGqeEmCinowAAAIQtygdMa9988oC2etr0jbcv09L8VKfjYBJVuEeGTlZx9QJhzsOmCwAAgDOifMC09djuJv3wLx7dvqpI76gocDoOJllmUqyKMxJUxcYLhDFrrTytvcx7AAAAOAPKB0xLh1p69fe/3qULCtP0r+sXOx0HIVLhTtf2+g5Za52OAoyr3Tek7gG/SjJZswkAAPBmKB8w7fQO+vXRB6oVFx2p/7t9pWKjIp2OhBCpLHapzTekurY+p6MA4/J42XQBAAAwESEtH4wx64wxB4wxh4wxXzzNY242xuwzxuw1xvwilHkw/Vlr9Y+/qZGntVf/854Vyk2NdzoSQqhybO5Du8NJgPHVnlizycwHAACANxWy8sEYEynpbkk3SFos6VZjzOJTHjNP0pckXWKtXSLpM6HKg5nhxy/U6k+7m/SP6xZqTVmm03EQYmVZSUqNj1Y1cx8Qpg57exUdaVTgSnA6CgAAQFgL5cmHiyQdstZ6rLVDkh6S9NZTHnOnpLuttR2SZK1tCWEeTHNbD7fpPx7br3VLcvThy0qdjoMpEBFhVOF2MXQSYau21Sd3RqIiI4zTUQAAAMJaKMuHfElHTnr+6OjLTjZf0nxjzIvGmJeMMevGe0PGmA8bY6qMMVWtra0hiotw1tw1oE/+crvcGQn65rvKZQxf6M8WFW6XDrX0qrNvyOkowBt4vD6uXAAAAExAKMuH8b47PHVkfZSkeZKukHSrpB8ZY9Le8C9Ze4+1ttJaW5mVlTXpQRHehvxBffzBavUNBfTD2yuUHBftdCRMoRNzH7h6gXATCFrVt/lUwrBJAACAMwpl+XBUUuFJzxdIahznMX+01g5ba2slHdBIGQGM+fqf9ml7Q6e++c7lmjcn2ek4mGLlBWmKijBcvUDYOdrRp+GAVRlrNgEAAM4olOXDNknzjDElxpgYSbdIeuSUx/xB0pWSZIzJ1Mg1DE8IM2Ga+f2Oo7p/a73uXFuiG8tznY4DB8THRGpJfqqq6ygfEF5OrNnk5AMAAMCZhax8sNb6Jf2dpCckvSrpYWvtXmPM14wxN40+7AlJbcaYfZL+LOkfrLVtocqE6WVfY7e+9LvdurgkXf+4bqHTceCgSrdLu452asgfdDoKMMbDmk0AAIAJiwrlG7fWPirp0VNe9uWTnraSPjf6CxjT1Tesjz5QrdT4aP3ve1YqKjKUh3QQ7irdLv34hVrtbezSiiKX03EASVKtt1cpcVFKT4xxOgoAAEDY4zs6hJ1g0OpzD+9UU1e/vn/bSmUlxzodCQ6rKGboJMKPp9Wn0qwktu8AAABMAOUDws7dfz6kZ/a36F/XL1aFO93pOAgD2clxKkpPUBVzHxBGalmzCQAAMGGUDwgrzx1o0XeePqi3r8jXe1e5nY6DMFLpdqmqvkMjt7UAZ/UN+dXUNaBShk0CAABMCOUDwsaR9j59+qGdWjAnWd94+zKOMuN1Kopd8vYOqqG9z+kogGpPbLpgzSYAAMCEUD4gLAwMB/SxB6sVtFY/uL1C8TGRTkdCmKkcvYLD1QuEg7FNF5x8AAAAmBDKBzjOWqt//cMe7TnWre+++wIVc4ca45iXnaTkuChVMXQSYeDEyYfiDP68AgAAmAjKBzjuoW1H9Ovqo/rUVXN19aI5TsdBmIqIMFpZ5FJ1fbvTUQB5WnuVnxbPKS0AAIAJonyAo3Ye6dRX/rhXl83P0qevme90HIS5SrdLB4/3qqt/2OkomOVqvT6VcEoLAABgwigf4Ji23kF9/IFqZSXH6q53X6DICAZM4s1VFLskSdsbuHoB51hr5Wn1Me8BAADgLFA+wBGBoNWnHtohr29IP3xvhVyJMU5HwjRwQWGaIiOMqhk6CQd5e4fUM+jn5AMAAMBZoHyAI7795AG9eKhN/9/blmppfqrTcTBNJMREaUleiqqY+wAHeVp7JUmlWazZBAAAmCjKB0y5J/Y26/vPHdatFxXp5spCp+Ngmqlwu7TzSKeGA0Gno2CWOrHpopSTDwAAABNG+YAp5Wnt1ecf3qXlBan66k2LnY6DaajSna6B4aD2NXY7HQWzlMfrU0xUhPLS4p2OAgAAMG1QPmDK+Ab9+ugD1YqONPr+7RWKjWJFHc5e5ejQyap65j7AGZ5Wn4ozEhiSCwAAcBYoHzAlrLX6x9/W6FBLr/7n1pXK5yeGOEdzUuKUnxavauY+wCEeb69KM5n3AAAAcDYoHzAl7ttSp001Tfr76xfo0nmZTsfBNFdZ7FJVXYestU5HwSzjDwTV0NanEtZsAgAAnBXKB4Rc98Cwvv3kQV2xIEsfu7zM6TiYASrdLrX0DOpoR7/TUTDLHOnolz9oGTYJAABwligfEHK/euWIegf9+vy1C2QMd6Rx/irc6ZKkauY+YIrVek+s2aR8AAAAOBuUDwip4UBQP3mxVqtK07WsINXpOJghFuQkKzk2SlXMfcAU87SeWLPJzAcAAICzQfmAkHp0d5OaugZ059pSp6NgBomMMLqgKE1VdZx8wNTyeH1KS4iWKzHG6SgAAADTCuUDQsZaq3ue96gsK1FXLsh2Og5mmEp3ug4c71H3wLDTUTCLeFp7mfcAAABwDigfEDJbPW3a29itD60tVUQEsx4wuSqLXbJW2tHQ6XQUzCK1Xp9KuHIBAABw1igfEDL3Pu9RZlKM3r4i3+komIEuKExThJGq65j7gKnRO+jX8e5Bhk0CAACcA8oHhMRrx3v05wOteu+qYsVFRzodBzNQYmyUFuWmqIqNF5gidd4TwyYpHwAAAM4W5QNC4kebaxUbFaH3rnY7HQUzWKXbpZ1HOuUPBJ2OglngcOuJNZtcuwAAADhblA+YdC09A/r9jmN6Z0WB0pkIjxCqKE5X31BA+5t7nI6CWcDT6pMxkjsjwekoAAAA0w7lAybdz7fWazgY1AcvLXE6Cma4SrdLklTF3AdMgVqvT/lp8VwlAwAAOAeUD5hUfUN+/fylel2zaA5HkxFyeWnxykuNY+4DpoTH26sS5j0AAACcE8oHTKrfVh9VZ9+wPnxZqdNRMEtUFKermvIBIWatVW2rT2WUqgAAAOeE8gGTJhC0+tELtVpemDZ2HB4ItUq3S01dAzrW2e90FMxgLT2D8g0FOPkAAABwjigfMGme2ndc9W19+vDaUhljnI6DWaKCuQ+YAp7W0TWbWZQPAAAA54LyAZPm3s0eFbjidf2SOU5HwSyyMCdZiTGRXL1ASHm8I2s2OfkAAABwbigfMCmq6ztUXd+hD15aoqhIPqwwdaIiI3RBUZqq6igfEDq1rT7FRkUoLzXe6SgAAADTEt8lYlL8aLNHKXFRurmy0OkomIUq3Ona39yt3kG/01EwQ3m8PpVkJioigitlAAAA54LyAeetvs2nJ/Y267ZVbiXGRjkdB7NQpduloJV2NnQ6HQUzVK3Xx7wHAACA80D5gPP2kxdqFRlh9P41xU5HwSy1oihNEUaqqmfoJCbfkD+ohvY+5j0AAACcB8oHnJfOviE9XHVUNy3P15yUOKfjYJZKjovWgpwUhk4iJI509CkQtCrNTHI
[... remainder of the base64-encoded PNG output omitted: this output cell holds the matplotlib plot of the test reward recorded every 1,000 games, produced by the training cell whose source follows ...]\n", 231 | "text/plain": [ 232 | "
" 233 | ] 234 | }, 235 | "metadata": { 236 | "needs_background": "light" 237 | }, 238 | "output_type": "display_data" 239 | } 240 | ], 241 | "source": [ 242 | "# Create the environment\n", 243 | "#env = gym.make('Taxi-v2')\n", 244 | "env = gym.make(\"FrozenLake-v0\")\n", 245 | "obs = env.reset()\n", 246 | "\n", 247 | "obs_length = env.observation_space.n\n", 248 | "n_actions = env.action_space.n\n", 249 | "\n", 250 | "reward_count = 0\n", 251 | "games_count = 0\n", 252 | "\n", 253 | "# Create and initialize the table with 0.0\n", 254 | "table = collections.defaultdict(float)\n", 255 | " \n", 256 | "test_rewards_list = []\n", 257 | "\n", 258 | "# Reinitialize epsilon after each session\n", 259 | "epsilon = 1.0\n", 260 | "\n", 261 | "while games_count < MAX_GAMES:\n", 262 | "\n", 263 | " # Select the action following an ε-greedy policy\n", 264 | " action = select_eps_greedy_action(table, obs, n_actions)\n", 265 | " next_obs, reward, done, _ = env.step(action)\n", 266 | "\n", 267 | " # Update the Q-table\n", 268 | " Q_learning(table, obs, next_obs, reward, action)\n", 269 | "\n", 270 | " reward_count += reward\n", 271 | " obs = next_obs\n", 272 | "\n", 273 | " if done:\n", 274 | " epsilon *= EPS_DECAY_RATE\n", 275 | "\n", 276 | " # Test the new table every 1k games\n", 277 | " if (games_count + 1) % 1000 == 0:\n", 278 | " test_reward = test_game(env, table)\n", 279 | " print('\\tEp:', games_count, 'Test reward:', test_reward, np.round(epsilon,2))\n", 280 | "\n", 281 | " test_rewards_list.append(test_reward)\n", 282 | "\n", 283 | " obs = env.reset()\n", 284 | " reward_count = 0\n", 285 | " games_count += 1 \n", 286 | "\n", 287 | "# Plot the accuracy over the number of steps\n", 288 | "plt.figure(figsize=(18,9))\n", 289 | "plt.xlabel('Steps')\n", 290 | "plt.ylabel('Accurracy')\n", 291 | "plt.plot(test_rewards_list)\n", 292 | "plt.show()" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "#### NB: in case you want to apply Q-learning to continuous state and actions games, you have to quantize the state and action spaces" 300 | ] 301 | } 302 | ], 303 | "metadata": { 304 | "kernelspec": { 305 | "display_name": "Python 3", 306 | "language": "python", 307 | "name": "python3" 308 | }, 309 | "language_info": { 310 | "codemirror_mode": { 311 | "name": "ipython", 312 | "version": 3 313 | }, 314 | "file_extension": ".py", 315 | "mimetype": "text/x-python", 316 | "name": "python", 317 | "nbconvert_exporter": "python", 318 | "pygments_lexer": "ipython3", 319 | "version": "3.7.6" 320 | } 321 | }, 322 | "nbformat": 4, 323 | "nbformat_minor": 2 324 | } 325 | -------------------------------------------------------------------------------- /Week2/img/Q_function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week2/img/Q_function.png -------------------------------------------------------------------------------- /Week2/img/frozenlake_v0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week2/img/frozenlake_v0.png -------------------------------------------------------------------------------- /Week2/img/short_diag.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week2/img/short_diag.jpg -------------------------------------------------------------------------------- /Week3/README.md: -------------------------------------------------------------------------------- 1 | # DQN, Double Q-learning, Deuling Networks, Multi-step learning and Noisy Nets applied to Pong 2 | 3 | This week we will apply Deep Q-Networks (DQN) to [Pong](https://gym.openai.com/envs/Pong-v0/). 4 | 5 | ![Pong Gif](imgs/pong_gif.gif) 6 | 7 | For the DQN implementation and the choose of the hyperparameters, I mostly followed [Mnih et al.](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf). (In the last page there is a table with all the hyperparameters.) 8 | 9 | To make things more interesting, I improved the basic DQN, implementing some variations like **Double Q-learning**, **Dueling networks**, **Multi-step learning** and **Noisy Nets**. You can find them summarized by [Hessel et al.](https://arxiv.org/pdf/1710.02298.pdf) 10 | 11 | ### [Learn the theory](../README.md) 12 | 13 | --- 14 | 15 | ### Double Q-learning - [Paper](https://arxiv.org/pdf/1509.06461.pdf) 16 | 17 | Minimize the overestimation bias introduced by the conventional Q-learning. 18 | 19 | drawing 20 | 21 | To use it, in *main.py*, set 22 | ```python 23 | DQN_HYPERPARAMS = { 24 | 'double_DQN': True, 25 | ... 26 | } 27 | ``` 28 | 29 | --- 30 | 31 | ### Dueling networks - [Paper](http://proceedings.mlr.press/v48/wangf16.pdf) 32 | 33 | It uses two different neural networks, one outputs the value of the state and the other the advantage of each action. 34 | The two NNs share the convolutional encoder. 35 | 36 | drawing 37 | 38 | To use it, in *main.py*, set 39 | ```python 40 | DQN_HYPERPARAMS = { 41 | 'dueling': True, 42 | ... 43 | } 44 | ``` 45 | 46 | --- 47 | 48 | ### NoisyNet - [Paper](https://arxiv.org/pdf/1706.10295.pdf) 49 | 50 | An idea to overcome the ε-greedy limitations is to introduce noise linear layers. The network will manage the noise stream to balance the exploration. 51 | 52 | drawing 53 | 54 | To use it, in *main.py*, set 55 | ```python 56 | DQN_HYPERPARAMS = { 57 | 'noisy_net': True, 58 | ... 59 | } 60 | ``` 61 | 62 | --- 63 | 64 | ### Multi-step 65 | 66 | Introduce a forward-view multi-step. Similar to TD(λ) 67 | 68 | drawing 69 | 70 | 71 | To use it, in *main.py*, set 72 | ```python 73 | DQN_HYPERPARAMS = { 74 | 'n_multi_step': 2, # or 3 75 | ... 76 | } 77 | ``` 78 | 79 | NB: From today's on, because we will train deep neural networks, I suggest to run the code on GPUs. If you don't have it, you can use [Google Colab](https://colab.research.google.com/). 80 | Also, to track the networks' results, we'll use [TensorboardX](https://github.com/lanpa/tensorboardX) (tensorboard for PyTorch). In case you use Google Colab to run TensorBoard on your pc, execute the commands in the section below. 81 | 82 | NB: If you use GPUs remember to change DEVICE from 'cpu' to 'cuda' in *main.py*. 83 | 84 | 85 | ## To make the code more clear, it's structured in 6 files: 86 | - **main.py** contains the main body. It creates the agent, the environment and plays N games. For each step, it updates the agent 87 | - **agents.py** has the Agent class that control the central control, the replay buffer and basic functions 88 | - **central_control.py** contains CentralControl class. It is responsible to instantiate the DQN (or its variants), optimize it, calculate the loss ecc.. 
89 | - **buffers.py** contains the ReplayBuffer class to keep the agent's memories inside a deque list and sample from it. 90 | - **neural_net.py** contains the deep neural nets for the agent namely DQN, DuelingDQN and a NoisyLinear Layer for the Noisy DQN. 91 | - **atari_wrappers.py** include some Atari wrappers. https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py 92 | - **utils.py**, for now, contains only a testing function. 93 | 94 | 95 | ## Results 96 | 97 | In the image below are shown the rewards mean of the last 10 games and the last 40 games for three different DQN variations. 98 | The x-axis is the number of games. You can see that only 120 games are enough to learn the game pretty well. 99 | 100 | ![results](imgs/DQN_variations.png) 101 | 102 | - ![#00ac77](https://placehold.it/15/00ac77/000000?text=+) `Basic DQN` 103 | - ![#628ced](https://placehold.it/15/628ced/000000?text=+) `2-step DQN` 104 | - ![#df1515](https://placehold.it/15/df1515/000000?text=+) `2-step Dueling DQN` 105 | 106 | May seem strange that 2-step Dueling DQN performs worst than 2-step DQN but it's important to keep in mind that the NNs are stochastic and that I tested only on one game. The authors of the DuelingDQN paper, reported better results when applied to other games. 107 | 108 | 109 | ## Install 110 | 111 | ``` 112 | !pip install gym 113 | !pip install torch torchvision 114 | !pip install tensorboardX 115 | !apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb ffmpeg xorg-dev python-opengl libboost-all-dev libsdl2-dev swig 116 | ``` 117 | 118 | Install gym 119 | ``` 120 | !git clone https://github.com/openai/gym.git 121 | import os 122 | os.chdir('gym') 123 | !ls 124 | !pip install -e . 125 | os.chdir('..') 126 | ``` 127 | 128 | Install gym 129 | ``` 130 | !pip install gym[atari] 131 | ``` 132 | 133 | 134 | ## To run TensorBoard in Google Colab 135 | 136 | Instructions from https://www.dlology.com/blog/quick-guide-to-run-tensorboard-in-google-colab/ 137 | 138 | Download and install ngrok 139 | ``` 140 | !wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip 141 | !unzip ngrok-stable-linux-amd64.zip 142 | ``` 143 | 144 | run ngrok and tensorboard 145 | ``` 146 | LOG_DIR = 'content/runs' 147 | 148 | get_ipython().system_raw( 149 | 'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'.format(LOG_DIR) 150 | ) 151 | 152 | get_ipython().system_raw('./ngrok http 6006 &') 153 | 154 | !curl -s http://localhost:4040/api/tunnels | python3 -c \ 155 | "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])" 156 | ``` 157 | -------------------------------------------------------------------------------- /Week3/agent.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from collections import namedtuple 4 | import collections 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | 9 | import time 10 | 11 | from neural_net import DQN 12 | from central_control import CentralControl 13 | from buffers import ReplayBuffer 14 | 15 | 16 | class DQNAgent(): 17 | ''' 18 | Agent class. 
It control all the agent functionalities 19 | ''' 20 | rewards = [] 21 | total_reward = 0 22 | birth_time = 0 23 | n_iter = 0 24 | n_games = 0 25 | ts_frame = 0 26 | ts = time.time() 27 | 28 | Memory = namedtuple('Memory', ['obs', 'action', 'new_obs', 'reward', 'done'], verbose=False, rename=False) 29 | 30 | def __init__(self, env, device, hyperparameters, summary_writer=None): 31 | ''' 32 | Agent initialization. It create the CentralControl that control all the low 33 | ''' 34 | 35 | # The CentralControl is the 'brain' of the agent 36 | self.cc = CentralControl(env.observation_space.shape, env.action_space.n, hyperparameters['gamma'], hyperparameters['n_multi_step'], hyperparameters['double_DQN'], 37 | hyperparameters['noisy_net'], hyperparameters['dueling'], device) 38 | 39 | self.cc.set_optimizer(hyperparameters['learning_rate']) 40 | 41 | self.birth_time = time.time() 42 | 43 | self.iter_update_target = hyperparameters['n_iter_update_target'] 44 | self.buffer_start_size = hyperparameters['buffer_start_size'] 45 | 46 | self.epsilon_start = hyperparameters['epsilon_start'] 47 | self.epsilon = hyperparameters['epsilon_start'] 48 | self.epsilon_decay = hyperparameters['epsilon_decay'] 49 | self.epsilon_final = hyperparameters['epsilon_final'] 50 | 51 | self.accumulated_loss = [] 52 | self.device = device 53 | 54 | # initialize the replay buffer (i.e. the memory) of the agent 55 | self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'], hyperparameters['n_multi_step'], hyperparameters['gamma']) 56 | self.summary_writer = summary_writer 57 | 58 | self.noisy_net = hyperparameters['noisy_net'] 59 | 60 | self.env = env 61 | 62 | def act(self, obs): 63 | ''' 64 | Greedy action outputted by the NN in the CentralControl 65 | ''' 66 | return self.cc.get_max_action(obs) 67 | 68 | def act_eps_greedy(self, obs): 69 | ''' 70 | E-greedy action 71 | ''' 72 | 73 | # In case of a noisy net, it takes a greedy action 74 | if self.noisy_net: 75 | return self.act(obs) 76 | 77 | if np.random.random() < self.epsilon: 78 | return self.env.action_space.sample() 79 | else: 80 | return self.act(obs) 81 | 82 | def add_env_feedback(self, obs, action, new_obs, reward, done): 83 | ''' 84 | Acquire a new feedback from the environment. The feedback is constituted by the new observation, the reward and the done boolean. 
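        Besides storing the memory, this method advances the step counter and decays
        epsilon linearly: epsilon = max(epsilon_final, epsilon_start - n_iter/epsilon_decay).
        A quick worked example with the defaults from main.py (epsilon_start=1.0,
        epsilon_final=0.02, epsilon_decay=10**5): epsilon hits its floor of 0.02 after
        roughly 98,000 environment steps, since 1.0 - 98000/10**5 = 0.02.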
85 | ''' 86 | 87 | # Create the new memory and update the buffer 88 | new_memory = self.Memory(obs=obs, action=action, new_obs=new_obs, reward=reward, done=done) 89 | self.replay_buffer.append(new_memory) 90 | 91 | # update the variables 92 | self.n_iter += 1 93 | # decrease epsilon 94 | self.epsilon = max(self.epsilon_final, self.epsilon_start - self.n_iter/self.epsilon_decay) 95 | self.total_reward += reward 96 | 97 | def sample_and_optimize(self, batch_size): 98 | ''' 99 | Sample batch_size memories from the buffer and optimize them 100 | ''' 101 | 102 | if len(self.replay_buffer) > self.buffer_start_size: 103 | # sample 104 | mini_batch = self.replay_buffer.sample(batch_size) 105 | # optimize 106 | l_loss = self.cc.optimize(mini_batch) 107 | self.accumulated_loss.append(l_loss) 108 | 109 | # update target NN 110 | if self.n_iter % self.iter_update_target == 0: 111 | self.cc.update_target() 112 | 113 | def reset_stats(self): 114 | ''' 115 | Reset the agent's statistics 116 | ''' 117 | self.rewards.append(self.total_reward) 118 | self.total_reward = 0 119 | self.accumulated_loss = [] 120 | self.n_games += 1 121 | 122 | 123 | def print_info(self): 124 | ''' 125 | Print information about the agent 126 | ''' 127 | fps = (self.n_iter-self.ts_frame)/(time.time()-self.ts) 128 | print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' % (self.n_iter, self.n_games, self.total_reward, np.mean(self.rewards[-40:]), self.epsilon, fps, np.mean(self.accumulated_loss))) 129 | 130 | self.ts_frame = self.n_iter 131 | self.ts = time.time() 132 | 133 | if self.summary_writer != None: 134 | self.summary_writer.add_scalar('reward', self.total_reward, self.n_games) 135 | self.summary_writer.add_scalar('mean_reward', np.mean(self.rewards[-40:]), self.n_games) 136 | self.summary_writer.add_scalar('10_mean_reward', np.mean(self.rewards[-10:]), self.n_games) 137 | self.summary_writer.add_scalar('esilon', self.epsilon, self.n_games) 138 | self.summary_writer.add_scalar('loss', np.mean(self.accumulated_loss), self.n_games) 139 | -------------------------------------------------------------------------------- /Week3/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | import cv2 7 | 8 | ''' 9 | Atari Wrapper copied from https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py 10 | ''' 11 | 12 | 13 | class LazyFrames(object): 14 | def __init__(self, frames): 15 | """This object ensures that common frames between the observations are only stored once. 16 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 17 | buffers. 18 | This object should only be converted to numpy array before being passed to the model. 
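        A rough back-of-the-envelope figure for why this matters: one stacked 84x84x4 uint8
        observation is about 28 KB, so a 1M-transition buffer that stored obs and next_obs as
        independent arrays would need well over 50 GB; sharing the underlying frames keeps a
        single copy of each raw frame instead.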
19 | You'd not believe how complex the previous solution was.""" 20 | self._frames = frames 21 | self._out = None 22 | 23 | def _force(self): 24 | if self._out is None: 25 | self._out = np.concatenate(self._frames, axis=2) 26 | self._frames = None 27 | return self._out 28 | 29 | def __array__(self, dtype=None): 30 | out = self._force() 31 | if dtype is not None: 32 | out = out.astype(dtype) 33 | return out 34 | 35 | def __len__(self): 36 | return len(self._force()) 37 | 38 | def __getitem__(self, i): 39 | return self._force()[i] 40 | 41 | class FireResetEnv(gym.Wrapper): 42 | def __init__(self, env): 43 | """Take action on reset for environments that are fixed until firing.""" 44 | gym.Wrapper.__init__(self, env) 45 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 46 | assert len(env.unwrapped.get_action_meanings()) >= 3 47 | 48 | def reset(self, **kwargs): 49 | self.env.reset(**kwargs) 50 | obs, _, done, _ = self.env.step(1) 51 | if done: 52 | self.env.reset(**kwargs) 53 | obs, _, done, _ = self.env.step(2) 54 | if done: 55 | self.env.reset(**kwargs) 56 | return obs 57 | 58 | def step(self, ac): 59 | return self.env.step(ac) 60 | 61 | 62 | class MaxAndSkipEnv(gym.Wrapper): 63 | def __init__(self, env, skip=4): 64 | """Return only every `skip`-th frame""" 65 | gym.Wrapper.__init__(self, env) 66 | # most recent raw observations (for max pooling across time steps) 67 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 68 | self._skip = skip 69 | 70 | def step(self, action): 71 | """Repeat action, sum reward, and max over last observations.""" 72 | total_reward = 0.0 73 | done = None 74 | for i in range(self._skip): 75 | obs, reward, done, info = self.env.step(action) 76 | if i == self._skip - 2: self._obs_buffer[0] = obs 77 | if i == self._skip - 1: self._obs_buffer[1] = obs 78 | total_reward += reward 79 | if done: 80 | break 81 | # Note that the observation on the done=True frame 82 | # doesn't matter 83 | max_frame = self._obs_buffer.max(axis=0) 84 | 85 | return max_frame, total_reward, done, info 86 | 87 | def reset(self, **kwargs): 88 | return self.env.reset(**kwargs) 89 | 90 | 91 | 92 | class WarpFrame(gym.ObservationWrapper): 93 | def __init__(self, env): 94 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 95 | gym.ObservationWrapper.__init__(self, env) 96 | self.width = 84 97 | self.height = 84 98 | self.observation_space = spaces.Box(low=0, high=255, 99 | shape=(self.height, self.width, 1), dtype=np.uint8) 100 | 101 | def observation(self, frame): 102 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 103 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 104 | return frame[:, :, None] 105 | 106 | 107 | 108 | class FrameStack(gym.Wrapper): 109 | def __init__(self, env, k): 110 | """Stack k last frames. 111 | Returns lazy array, which is much more memory efficient. 
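        Note on how make_env (at the bottom of this file) wires things up: ImageToPyTorch
        is applied before FrameStack, so each incoming frame already has shape (1, 84, 84)
        and the k=4 frames are concatenated along the last axis, giving observations of
        shape (1, 84, 336) rather than the (4, 84, 84) channel stack used in the DeepMind
        paper; the network's first conv layer therefore sees a single input channel.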
112 | See Also 113 | -------- 114 | baselines.common.atari_wrappers.LazyFrames 115 | """ 116 | gym.Wrapper.__init__(self, env) 117 | self.k = k 118 | self.frames = deque([], maxlen=k) 119 | shp = env.observation_space.shape 120 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype) 121 | 122 | def reset(self): 123 | ob = self.env.reset() 124 | for _ in range(self.k): 125 | self.frames.append(ob) 126 | return self._get_ob() 127 | 128 | def step(self, action): 129 | ob, reward, done, info = self.env.step(action) 130 | self.frames.append(ob) 131 | return self._get_ob(), reward, done, info 132 | 133 | def _get_ob(self): 134 | assert len(self.frames) == self.k 135 | return LazyFrames(list(self.frames)) 136 | 137 | 138 | class ImageToPyTorch(gym.ObservationWrapper): 139 | def __init__(self, env): 140 | super(ImageToPyTorch, self).__init__(env) 141 | old_shape = self.observation_space.shape 142 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), dtype=np.float32) 143 | 144 | def observation(self, observation): 145 | return np.moveaxis(observation, 2, 0) 146 | 147 | 148 | 149 | class ScaledFloatFrame(gym.ObservationWrapper): 150 | def __init__(self, env): 151 | gym.ObservationWrapper.__init__(self, env) 152 | self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) 153 | 154 | def observation(self, observation): 155 | # careful! This undoes the memory optimization, use 156 | # with smaller replay buffers only. 157 | return np.array(observation).astype(np.float32) / 255.0 158 | 159 | 160 | def make_env(env_name, fire=True): 161 | env = gym.make(env_name) 162 | env = MaxAndSkipEnv(env) ## Return only every `skip`-th frame 163 | if fire: 164 | env = FireResetEnv(env) ## Fire at the beginning 165 | env = WarpFrame(env) ## Reshape image 166 | env = ImageToPyTorch(env) ## Invert shape 167 | env = FrameStack(env, 4) ## Stack last 4 frames 168 | env = ScaledFloatFrame(env) ## Scale frames 169 | return env -------------------------------------------------------------------------------- /Week3/buffers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import collections 3 | 4 | 5 | 6 | class ReplayBuffer(): 7 | ''' 8 | Replay Buffer class to keep the agent memories memorized in a deque structure. 9 | ''' 10 | def __init__(self, size, n_multi_step, gamma): 11 | self.buffer = collections.deque(maxlen=size) 12 | self.n_multi_step = n_multi_step 13 | self.gamma = gamma 14 | 15 | def __len__(self): 16 | return len(self.buffer) 17 | 18 | def append(self, memory): 19 | ''' 20 | append a new 'memory' to the buffer 21 | ''' 22 | self.buffer.append(memory) 23 | 24 | def sample(self, batch_size): 25 | ''' 26 | Sample batch_size memories from the buffer. 
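        A concrete example of the look-ahead loop below, with the defaults from main.py
        (n_multi_step=2, gamma=0.99): for a sampled index i the reward returned is
        r_i + 0.99 * r_{i+1}, while next_state and done are taken from the transition
        at i+1 (two environment steps after obs_i), stopping earlier if an episode ends.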
27 | NB: It deals the N-step DQN 28 | ''' 29 | # randomly pick batch_size elements from the buffer 30 | indices = np.random.choice(len(self.buffer), batch_size, replace=False) 31 | 32 | states = [] 33 | actions = [] 34 | next_states = [] 35 | rewards = [] 36 | dones = [] 37 | 38 | # for each indices 39 | for i in indices: 40 | sum_reward = 0 41 | states_look_ahead = self.buffer[i].new_obs 42 | done_look_ahead = self.buffer[i].done 43 | 44 | # N-step look ahead loop to compute the reward and pick the new 'next_state' (of the n-th state) 45 | for n in range(self.n_multi_step): 46 | if len(self.buffer) > i+n: 47 | # compute the n-th reward 48 | sum_reward += (self.gamma**n) * self.buffer[i+n].reward 49 | if self.buffer[i+n].done: 50 | states_look_ahead = self.buffer[i+n].new_obs 51 | done_look_ahead = True 52 | break 53 | else: 54 | states_look_ahead = self.buffer[i+n].new_obs 55 | done_look_ahead = False 56 | 57 | # Populate the arrays with the next_state, reward and dones just computed 58 | states.append(self.buffer[i].obs) 59 | actions.append(self.buffer[i].action) 60 | next_states.append(states_look_ahead) 61 | rewards.append(sum_reward) 62 | dones.append(done_look_ahead) 63 | 64 | return (np.array(states, dtype=np.float32), np.array(actions, dtype=np.int64), np.array(next_states, dtype=np.float32), np.array(rewards, dtype=np.float32), np.array(dones, dtype=np.uint8)) 65 | -------------------------------------------------------------------------------- /Week3/central_control.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import namedtuple 3 | import collections 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | import time 9 | 10 | from neural_net import DQN, DuelingDQN 11 | 12 | 13 | class CentralControl(): 14 | 15 | def __init__(self, observation_space_shape, action_space_shape, gamma, n_multi_step, double_DQN, noisy_net, dueling, device): 16 | if dueling: 17 | # Dueling NN 18 | self.target_nn = DuelingDQN(observation_space_shape, action_space_shape).to(device) 19 | self.moving_nn = DuelingDQN(observation_space_shape, action_space_shape).to(device) 20 | else: 21 | # Normal NN 22 | self.target_nn = DQN(observation_space_shape, action_space_shape, noisy_net).to(device) 23 | self.moving_nn = DQN(observation_space_shape, action_space_shape, noisy_net).to(device) 24 | 25 | self.device = device 26 | self.gamma = gamma 27 | self.n_multi_step = n_multi_step 28 | self.double_DQN = double_DQN 29 | 30 | def set_optimizer(self, learning_rate): 31 | self.optimizer = optim.Adam(self.moving_nn.parameters(), lr=learning_rate) 32 | 33 | def optimize(self, mini_batch): 34 | ''' 35 | Optimize the NN 36 | ''' 37 | # reset the grads 38 | self.optimizer.zero_grad() 39 | # caluclate the loss of the mini batch 40 | loss = self._calulate_loss(mini_batch) 41 | loss_v = loss.item() 42 | 43 | # do backpropagation 44 | loss.backward() 45 | # one step of optimization 46 | self.optimizer.step() 47 | 48 | return loss_v 49 | 50 | def update_target(self): 51 | ''' 52 | Copy the moving NN in the target NN 53 | ''' 54 | self.target_nn.load_state_dict(self.moving_nn.state_dict()) 55 | self.target_nn = self.moving_nn 56 | 57 | def get_max_action(self, obs): 58 | ''' 59 | Forward pass of the NN to obtain the action of the given observations 60 | ''' 61 | # convert the observation in tensor 62 | state_t = torch.tensor(np.array([obs])).to(self.device) 63 | # forawrd pass 64 | q_values_t = self.moving_nn(state_t) 65 
| # get the maximum value of the output (i.e. the best action to take) 66 | _, act_t = torch.max(q_values_t, dim=1) 67 | return int(act_t.item()) 68 | 69 | 70 | def _calulate_loss(self, mini_batch): 71 | ''' 72 | Calculate mini batch's MSE loss. 73 | It support also the double DQN version 74 | ''' 75 | 76 | states, actions, next_states, rewards, dones = mini_batch 77 | 78 | # convert the data in tensors 79 | states_t = torch.as_tensor(states, device=self.device) 80 | next_states_t = torch.as_tensor(next_states, device=self.device) 81 | actions_t = torch.as_tensor(actions, device=self.device) 82 | rewards_t = torch.as_tensor(rewards, dtype=torch.float32, device=self.device) 83 | done_t = torch.as_tensor(dones, dtype=torch.uint8, device=self.device) 84 | 85 | # Value of the action taken previously (recorded in actions_v) in the state_t 86 | state_action_values = self.moving_nn(states_t).gather(1, actions_t[:,None]).squeeze(-1) 87 | # NB gather is a differentiable function 88 | 89 | # Next state value with Double DQN. (i.e. get the value predicted by the target nn, of the best action predicted by the moving nn) 90 | if self.double_DQN: 91 | double_max_action = self.moving_nn(next_states_t).max(1)[1] 92 | double_max_action = double_max_action.detach() 93 | target_output = self.target_nn(next_states_t) 94 | next_state_values = torch.gather(target_output, 1, double_max_action[:,None]).squeeze(-1) # NB: [:,None] add an extra dimension 95 | 96 | # Next state value in the normal configuration 97 | else: 98 | next_state_values = self.target_nn(next_states_t).max(1)[0] 99 | 100 | next_state_values = next_state_values.detach() # No backprop 101 | 102 | # Use the Bellman equation 103 | expected_state_action_values = rewards_t + (self.gamma**self.n_multi_step) * next_state_values 104 | # compute the loss 105 | return nn.MSELoss()(state_action_values, expected_state_action_values) 106 | -------------------------------------------------------------------------------- /Week3/imgs/DQN_variations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week3/imgs/DQN_variations.png -------------------------------------------------------------------------------- /Week3/imgs/Dueling_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week3/imgs/Dueling_img.png -------------------------------------------------------------------------------- /Week3/imgs/double_Qlearning_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week3/imgs/double_Qlearning_formula.png -------------------------------------------------------------------------------- /Week3/imgs/multistep_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week3/imgs/multistep_formula.png -------------------------------------------------------------------------------- /Week3/imgs/noisenet_formula.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week3/imgs/noisenet_formula.png -------------------------------------------------------------------------------- /Week3/imgs/pong_gif.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week3/imgs/pong_gif.gif -------------------------------------------------------------------------------- /Week3/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from collections import namedtuple 4 | import collections 5 | import time 6 | import math 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | from torch.nn import Parameter, init 12 | from torch.nn import functional as F 13 | 14 | from tensorboardX import SummaryWriter 15 | 16 | import atari_wrappers 17 | from agent import DQNAgent 18 | import utils 19 | 20 | DQN_HYPERPARAMS = { 21 | 'dueling': False, 22 | 'noisy_net': False, 23 | 'double_DQN': False, 24 | 'n_multi_step': 2, 25 | 'buffer_start_size': 10001, 26 | 'buffer_capacity': 15000, 27 | 'epsilon_start': 1.0, 28 | 'epsilon_decay': 10**5, 29 | 'epsilon_final': 0.02, 30 | 'learning_rate': 5e-5, 31 | 'gamma': 0.99, 32 | 'n_iter_update_target': 1000 33 | } 34 | 35 | 36 | BATCH_SIZE = 32 37 | MAX_N_GAMES = 3000 38 | TEST_FREQUENCY = 10 39 | 40 | ENV_NAME = "PongNoFrameskip-v4" 41 | SAVE_VIDEO = True 42 | DEVICE = 'cpu' # or 'cuda' 43 | SUMMARY_WRITER = True 44 | 45 | LOG_DIR = 'content/runs' 46 | name = '_'.join([str(k)+'.'+str(v) for k,v in DQN_HYPERPARAMS.items()]) 47 | name = 'prv' 48 | 49 | if __name__ == '__main__': 50 | 51 | # create the environment 52 | env = atari_wrappers.make_env(ENV_NAME) 53 | if SAVE_VIDEO: 54 | # save the video of the games 55 | env = gym.wrappers.Monitor(env, "main-"+ENV_NAME, force=True) 56 | obs = env.reset() 57 | 58 | # TensorBoard 59 | writer = SummaryWriter(log_dir=LOG_DIR+'/'+name + str(time.time())) if SUMMARY_WRITER else None 60 | 61 | print('Hyperparams:', DQN_HYPERPARAMS) 62 | 63 | # create the agent 64 | agent = DQNAgent(env, device=DEVICE, summary_writer=writer, hyperparameters=DQN_HYPERPARAMS) 65 | 66 | n_games = 0 67 | n_iter = 0 68 | 69 | # Play MAX_N_GAMES games 70 | while n_games < MAX_N_GAMES: 71 | # act greedly 72 | action = agent.act_eps_greedy(obs) 73 | 74 | # one step on the environment 75 | new_obs, reward, done, _ = env.step(action) 76 | 77 | # add the environment feedback to the agent 78 | agent.add_env_feedback(obs, action, new_obs, reward, done) 79 | 80 | # sample and optimize NB: the agent could wait to have enough memories 81 | agent.sample_and_optimize(BATCH_SIZE) 82 | 83 | obs = new_obs 84 | if done: 85 | n_games += 1 86 | 87 | # print info about the agent and reset the stats 88 | agent.print_info() 89 | agent.reset_stats() 90 | 91 | #if n_games % TEST_FREQUENCY == 0: 92 | # print('Test mean:', utils.test_game(env, agent, 1)) 93 | 94 | obs = env.reset() 95 | 96 | writer.close() 97 | 98 | # tensorboard --logdir content/runs --host localhost 99 | -------------------------------------------------------------------------------- /Week3/neural_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.optim as optim 5 | from torch.nn import Parameter, init 6 | from 
torch.nn import functional as F 7 | import math 8 | 9 | class NoisyLinear(nn.Linear): 10 | ''' 11 | Noisy Linear layer -> NOISY NETWORKS FOR EXPLORATION https://arxiv.org/pdf/1706.10295.pdf 12 | 13 | NB: IT DOESN T WORKS. PROBLEMS WITH THE EPSILON PARAMETERES INITIALIZATION 14 | ''' 15 | 16 | 17 | def __init__(self, in_features, out_features, sigma_init=0.017, bias=True): 18 | super(NoisyLinear, self).__init__(in_features, out_features, bias=bias) 19 | self.sigma_init = sigma_init 20 | 21 | self.sigma_weight = Parameter(torch.Tensor(out_features, in_features)) 22 | self.register_buffer('epsilon_weight', torch.zeros(out_features, in_features)) 23 | if bias: 24 | self.sigma_bias = Parameter(torch.Tensor(out_features)) 25 | self.register_buffer('epsilon_bias', torch.zeros(out_features)) 26 | self.reset_parameters() 27 | 28 | def reset_parameters(self): 29 | ''' 30 | Initialize the biases and weights 31 | ''' 32 | if hasattr(self, 'sigma_bias'): 33 | init.constant_(self.sigma_bias, self.sigma_init) 34 | init.constant_(self.sigma_weight, self.sigma_init) 35 | 36 | std = math.sqrt(3/self.in_features) 37 | init.uniform_(self.weight, -std, std) 38 | init.uniform_(self.bias, -std, std) 39 | 40 | def forward(self, input): 41 | if self.bias is not None: 42 | ## NB: in place operation. PyTorch is not happy with that!! CHANGE IT 43 | self.epsilon_bias.data.normal_() 44 | 45 | # new bias with noise 46 | bias = self.bias + self.sigma_bias*self.epsilon_bias 47 | else: 48 | bias = self.bias 49 | 50 | ## NB: in place operation. PyTorch is not happy with that!! CHANGE IT 51 | self.epsilon_weight.data.normal_() 52 | # new weight with noise 53 | weight = self.weight + self.sigma_weight*self.epsilon_weight 54 | # create the linear layer it the added noise 55 | return F.linear(input, weight, bias) 56 | 57 | 58 | class DuelingDQN(nn.Module): 59 | ''' 60 | Dueling DQN -> http://proceedings.mlr.press/v48/wangf16.pdf 61 | ''' 62 | 63 | def __init__(self, input_shape, n_actions): 64 | super(DuelingDQN, self).__init__() 65 | 66 | self.conv = nn.Sequential( 67 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 68 | nn.BatchNorm2d(32), 69 | nn.ReLU(), 70 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 71 | nn.BatchNorm2d(64), 72 | nn.ReLU(), 73 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 74 | nn.BatchNorm2d(64), 75 | nn.ReLU()) 76 | 77 | conv_out_size = self._get_conv_out(input_shape) 78 | # Predict the actions advantage 79 | self.fc_a = nn.Sequential( 80 | nn.Linear(conv_out_size, 512), 81 | nn.ReLU(), 82 | nn.Linear(512, n_actions)) 83 | 84 | # Predict the state value 85 | self.fc_v = nn.Sequential( 86 | nn.Linear(conv_out_size, 512), 87 | nn.ReLU(), 88 | nn.Linear(512, 1)) 89 | 90 | def _get_conv_out(self, shape): 91 | o = self.conv(torch.zeros(1, *shape)) # apply convolution layers.. 92 | return int(np.prod(o.size())) # ..to obtain the output shape 93 | 94 | def forward(self, x): 95 | batch_size = x.size()[0] 96 | conv_out = self.conv(x).view(batch_size, -1) # apply convolution layers and flatten the results 97 | 98 | adv = self.fc_a(conv_out) 99 | val = self.fc_v(conv_out) 100 | 101 | # Sum the state value with the advantage of each action (NB: the mean has been subtracted from the advantage. 
It is used in the paper) 102 | return val + adv - torch.mean(adv, dim=1, keepdim=True) 103 | 104 | 105 | class DQN(nn.Module): 106 | ''' 107 | Deep Q newtork following the architecture used in the DeepMind paper (https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) 108 | ''' 109 | 110 | def __init__(self, input_shape, n_actions, noisy_net): 111 | super(DQN, self).__init__() 112 | 113 | # 3 convolutional layers. Take an image as input (NB: the BatchNorm layers aren't in the paper but they increase the convergence) 114 | self.conv = nn.Sequential( 115 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 116 | nn.BatchNorm2d(32), 117 | nn.ReLU(), 118 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 119 | nn.BatchNorm2d(64), 120 | nn.ReLU(), 121 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 122 | nn.BatchNorm2d(64), 123 | nn.ReLU()) 124 | 125 | # Compute the output shape of the conv layers 126 | conv_out_size = self._get_conv_out(input_shape) 127 | 128 | # 2 fully connected layers 129 | if noisy_net: 130 | # In case of NoisyNet use noisy linear layers 131 | self.fc = nn.Sequential( 132 | NoisyLinear(conv_out_size, 512), 133 | nn.ReLU(), 134 | NoisyLinear(512, n_actions)) 135 | else: 136 | self.fc = nn.Sequential( 137 | nn.Linear(conv_out_size, 512), 138 | nn.ReLU(), 139 | nn.Linear(512, n_actions)) 140 | 141 | def _get_conv_out(self, shape): 142 | # Compute the output shape of the conv layers 143 | o = self.conv(torch.zeros(1, *shape)) # apply convolution layers.. 144 | return int(np.prod(o.size())) # ..to obtain the output shape 145 | 146 | def forward(self, x): 147 | batch_size = x.size()[0] 148 | conv_out = self.conv(x).view(batch_size, -1) # apply convolution layers and flatten the results 149 | return self.fc(conv_out) # apply fc layers 150 | -------------------------------------------------------------------------------- /Week3/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | def test_game(env, agent, test_episodes): 5 | reward_games = [] 6 | for _ in range(test_episodes): 7 | obs = env.reset() 8 | rewards = 0 9 | while True: 10 | action = agent.act(obs) 11 | next_obs, reward, done, _ = env.step(action) 12 | obs = next_obs 13 | rewards += reward 14 | 15 | if done: 16 | reward_games.append(rewards) 17 | obs = env.reset() 18 | break 19 | 20 | return np.mean(reward_games) 21 | -------------------------------------------------------------------------------- /Week4/A2C.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Advantage Actor-Critic (A2C) on CartPole" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Actor-critic is an algorithm that combines both policy gradient (the actor) and value function (the critic)." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "![A2C](imgs/Advantage_actor_critic.png)\n", 22 | "Credit: Sergey Levine" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "A2C is a more sophisticated version of the actor-critic that use the advantage, n-step return and a policy is run in multiple (synchronous) environments. \n", 30 | "[A3C](https://arxiv.org/pdf/1602.01783.pdf) is an asynchronous A2C with the environments that are run in parallel. 
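In the implementation below each Env is created with n_steps=1, and when an episode has not terminated the n-step return is bootstrapped from the critic's value estimate of the last observed state (see run_add in discounted_rewards).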
" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "The Actor and Critic can share the same neural network or have two separate network design. In this example, I used a shared network.\n", 38 | "\"drawing\"\n", 39 | "Credit: Sergey Levine" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import numpy as np\n", 49 | "import gym\n", 50 | "from tensorboardX import SummaryWriter\n", 51 | "\n", 52 | "import datetime\n", 53 | "from collections import namedtuple\n", 54 | "from collections import deque\n", 55 | "\n", 56 | "import torch\n", 57 | "import torch.nn as nn\n", 58 | "import torch.nn.functional as F\n", 59 | "import torch.optim as optim\n", 60 | "from torch.nn.utils.clip_grad import clip_grad_norm_" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "class A2C_nn(nn.Module):\n", 70 | " '''\n", 71 | " Advantage actor-critic neural net\n", 72 | " '''\n", 73 | "\n", 74 | " def __init__(self, input_shape, n_actions):\n", 75 | " super(A2C_nn, self).__init__()\n", 76 | "\n", 77 | " self.lp = nn.Sequential(\n", 78 | " nn.Linear(input_shape[0], 64),\n", 79 | " nn.ReLU())\n", 80 | "\n", 81 | " self.policy = nn.Linear(64, n_actions)\n", 82 | " self.value = nn.Linear(64, 1)\n", 83 | "\n", 84 | " def forward(self, x):\n", 85 | " l = self.lp(x.float())\n", 86 | " # return the actor and the critic\n", 87 | " return self.policy(l), self.value(l)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "The total loss contains:\n", 95 | "- actor loss $\\partial\\theta_v\\leftarrow\\partial\\theta_v + \\dfrac{\\partial(R-V_\\theta(s))^2}{\\partial\\theta_v}$\n", 96 | "- policy loss $\\partial\\theta_\\pi\\leftarrow\\partial\\theta_\\pi + \\alpha\\triangledown_\\theta log\\pi_\\theta(a|s)(R-V_\\theta(s))$\n", 97 | "- entropy loss $\\beta\\sum_i\\pi_\\theta(s)log\\pi_\\theta(s)$" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "def calculate_loss(memories, nn, writer):\n", 107 | " '''\n", 108 | " Calculate the loss of the memories\n", 109 | " '''\n", 110 | "\n", 111 | " #batch_mem = np.random.choice(len(memories), size=32)\n", 112 | "\n", 113 | " rewards = torch.tensor(np.array([m.reward for m in memories], dtype=np.float32))\n", 114 | " log_val = nn(torch.tensor(np.array([m.obs for m in memories], dtype=np.float32)))\n", 115 | "\n", 116 | " act_log_softmax = F.log_softmax(log_val[0], dim=1)[:,np.array([m.action for m in memories])]\n", 117 | " # Calculate the advantage\n", 118 | " adv = (rewards - log_val[1].detach())\n", 119 | "\n", 120 | " # actor loss (policy gradient)\n", 121 | " pg_loss = - torch.mean(act_log_softmax * adv)\n", 122 | " # critic loss (value loss)\n", 123 | " vl_loss = F.mse_loss(log_val[1].squeeze(-1), rewards)\n", 124 | " # entropy loss\n", 125 | " entropy_loss = ENTROPY_BETA * torch.mean(torch.sum(F.softmax(log_val[0], dim=1) * F.log_softmax(log_val[0], dim=1), dim=1))\n", 126 | "\n", 127 | " # total loss\n", 128 | " loss = pg_loss + vl_loss - entropy_loss\n", 129 | "\n", 130 | " # add scalar to the writer\n", 131 | " writer.add_scalar('loss', float(loss), n_iter)\n", 132 | " writer.add_scalar('pg_loss', float(pg_loss), n_iter)\n", 133 | " writer.add_scalar('vl_loss', float(vl_loss), n_iter)\n", 134 | " 
writer.add_scalar('entropy_loss', float(entropy_loss), n_iter)\n", 135 | " writer.add_scalar('actions', np.mean([m.action for m in memories]), n_iter)\n", 136 | " writer.add_scalar('adv', float(torch.mean(adv)), n_iter)\n", 137 | " writer.add_scalar('act_lgsoft', float(torch.mean(act_log_softmax)), n_iter)\n", 138 | "\n", 139 | " return loss" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "class Env:\n", 149 | " '''\n", 150 | " Environment class. Used to deal with multiple environments\n", 151 | " '''\n", 152 | "\n", 153 | " game_rew = 0\n", 154 | " last_game_rew = 0\n", 155 | "\n", 156 | " def __init__(self, env_name, n_steps, gamma):\n", 157 | " super(Env, self).__init__()\n", 158 | "\n", 159 | " # create the new environment\n", 160 | " self.env = gym.make(env_name)\n", 161 | " self.obs = self.env.reset()\n", 162 | "\n", 163 | " self.n_steps = n_steps\n", 164 | " self.action_n = self.env.action_space.n\n", 165 | " self.observation_n = self.env.observation_space.shape[0]\n", 166 | " self.gamma = gamma\n", 167 | "\n", 168 | " def step(self, agent):\n", 169 | " '''\n", 170 | " Execute the agent n_steps in the environment\n", 171 | " '''\n", 172 | " memories = []\n", 173 | " for s in range(self.n_steps):\n", 174 | "\n", 175 | " # get the agent policy\n", 176 | " pol_val = agent(torch.tensor(self.obs))\n", 177 | " s_act = F.softmax(pol_val[0])\n", 178 | "\n", 179 | " # get an action following the policy distribution\n", 180 | " action = int(np.random.choice(np.arange(self.action_n), p=s_act.detach().numpy(), size=1))\n", 181 | "\n", 182 | " # Perform a step in the environment\n", 183 | " new_obs, reward, done, _ = self.env.step(action)\n", 184 | "\n", 185 | " # update the memory\n", 186 | " memories.append(Memory(obs=self.obs, action=action, new_obs=new_obs, reward=reward, done=done))\n", 187 | "\n", 188 | " self.game_rew += reward\n", 189 | " self.obs = new_obs\n", 190 | "\n", 191 | " if done:\n", 192 | " # if done reset the env and the variables\n", 193 | " self.done = True\n", 194 | " # if the game is over, run_add take the 0 value\n", 195 | " self.run_add = 0\n", 196 | " self.obs = self.env.reset()\n", 197 | "\n", 198 | " self.last_game_rew = self.game_rew\n", 199 | " self.game_rew = 0\n", 200 | " break\n", 201 | " else:\n", 202 | " self.done = False\n", 203 | "\n", 204 | " if not self.done:\n", 205 | " # if the game isn't over, run_add take the value of the last state\n", 206 | " self.run_add = float(agent(torch.tensor(self.obs))[1])\n", 207 | "\n", 208 | " # compute the discount reward of the memories and return it\n", 209 | " return self.discounted_rewards(memories)\n", 210 | "\n", 211 | "\n", 212 | " def discounted_rewards(self, memories):\n", 213 | " '''\n", 214 | " Compute the discounted reward backward\n", 215 | " '''\n", 216 | " upd_memories = []\n", 217 | "\n", 218 | " for t in reversed(range(len(memories))):\n", 219 | " if memories[t].done: self.run_add = 0\n", 220 | " self.run_add = self.run_add * self.gamma + memories[t].reward\n", 221 | "\n", 222 | " # Update the memories with the discounted reward\n", 223 | " upd_memories.append(Memory(obs=memories[t].obs, action=memories[t].action, new_obs=memories[t].new_obs, reward=self.run_add, done=memories[t].done))\n", 224 | "\n", 225 | " return upd_memories[::-1]\n" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "Memory = 
namedtuple('Memory', ['obs', 'action', 'new_obs', 'reward', 'done'], verbose=False, rename=False)\n", 235 | "\n", 236 | "# Hyperparameters\n", 237 | "GAMMA = 0.95\n", 238 | "LEARNING_RATE = 0.003\n", 239 | "ENTROPY_BETA = 0.01\n", 240 | "ENV_NAME = 'CartPole-v0'\n", 241 | "\n", 242 | "MAX_ITER = 100000\n", 243 | "# Number of the env\n", 244 | "N_ENVS = 40\n", 245 | "\n", 246 | "# Max normalized gradient\n", 247 | "CLIP_GRAD = 0.1\n", 248 | "\n", 249 | "device = 'cpu'\n", 250 | "\n", 251 | "now = datetime.datetime.now()\n", 252 | "date_time = \"{}_{}.{}.{}\".format(now.day, now.hour, now.minute, now.second)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# create N_ENVS environments\n", 262 | "envs = [Env(ENV_NAME, 1, GAMMA) for _ in range(N_ENVS)]\n", 263 | "\n", 264 | "writer = SummaryWriter(log_dir='content/runs/A2C'+ENV_NAME+'_'+date_time)\n", 265 | "\n", 266 | "# initialize the actor-critic NN\n", 267 | "agent_nn = A2C_nn(gym.make(ENV_NAME).observation_space.shape, gym.make(ENV_NAME).action_space.n).to(device)\n", 268 | "\n", 269 | "# Adam optimizer\n", 270 | "optimizer = optim.Adam(agent_nn.parameters(), lr=LEARNING_RATE, eps=1e-3)\n", 271 | "\n", 272 | "experience = []\n", 273 | "n_iter = 0\n", 274 | "\n", 275 | "while n_iter < MAX_ITER:\n", 276 | " n_iter += 1\n", 277 | "\n", 278 | " # list containing all the memories\n", 279 | " memories = [mem for env in envs for mem in env.step(agent_nn)]\n", 280 | "\n", 281 | " # calculate the loss\n", 282 | " losses = calculate_loss(memories, agent_nn, writer)\n", 283 | "\n", 284 | " # optimizer step\n", 285 | " optimizer.zero_grad()\n", 286 | " losses.backward()\n", 287 | " # clip the gradient\n", 288 | " clip_grad_norm_(agent_nn.parameters(), CLIP_GRAD)\n", 289 | " optimizer.step()\n", 290 | "\n", 291 | "\n", 292 | " writer.add_scalar('rew', np.mean([env.last_game_rew for env in envs]), n_iter)\n", 293 | " print(n_iter, np.round(float(losses),2), 'rew:', np.round(np.mean([env.last_game_rew for env in envs]),2))\n", 294 | "\n", 295 | "writer.close()" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "### ATTENTION! the model is not working, look at the graph below. Why this strange behavior? I tried to tune the hyperparameters but the results are the same.\n", 303 | "![Reward plot](imgs/reward_plot_a2c.png)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "#### Why is the loss decreasing so fast? 
\n", 311 | "![Reward plot](imgs/loss_plot_a2c.png)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "#### In some cases, the model start preferring always the same action..\n", 319 | "![Reward plot](imgs/actions_plot_a2c.png)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "Some idea:\n", 327 | " - Use two different neural networks and optimizer" 328 | ] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "Python 3", 334 | "language": "python", 335 | "name": "python3" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 3 341 | }, 342 | "file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython3", 347 | "version": "3.5.2" 348 | } 349 | }, 350 | "nbformat": 4, 351 | "nbformat_minor": 2 352 | } 353 | -------------------------------------------------------------------------------- /Week4/PolicyGradient.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## POLICY GRADIENT on CartPole" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Policy Gradient algorithms find an optimal behavior strategy optimizing directly the policy. \n", 15 | "The policy is a parametrized function respect to $\\theta$ $\\pi_\\theta(a|s)$\n", 16 | "\n", 17 | "The reward function is defined as \n", 18 | "$$J(\\theta) = \\sum_{s}d^\\pi(s)\\sum_{a}\\pi_\\theta(a|s)Q^\\pi(s,a)$$\n", 19 | "\n", 20 | "In Vanilla Policy Gradient, we estimate the return $R_t$ (REINFORCE algorithm) and update the policy subtracting a baseline value from $R_t$ to reduce the variance." 
21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "\"drawing\"\n", 28 | "Credit: John Schulman" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import numpy as np\n", 38 | "import gym\n", 39 | "from tensorboardX import SummaryWriter\n", 40 | "\n", 41 | "import time\n", 42 | "from collections import namedtuple\n", 43 | "from collections import deque\n", 44 | "import datetime\n", 45 | "\n", 46 | "import torch\n", 47 | "import torch.nn as nn\n", 48 | "import torch.nn.functional as F\n", 49 | "import torch.optim as optim" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "class PG_nn(nn.Module):\n", 59 | " '''\n", 60 | " Policy neural net\n", 61 | " '''\n", 62 | " def __init__(self, input_shape, n_actions):\n", 63 | " super(PG_nn, self).__init__()\n", 64 | "\n", 65 | " self.mlp = nn.Sequential(\n", 66 | " nn.Linear(input_shape[0], 64),\n", 67 | " nn.ReLU(),\n", 68 | " nn.Linear(64, n_actions))\n", 69 | "\n", 70 | " def forward(self, x):\n", 71 | " return self.mlp(x.float())" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "def discounted_rewards(memories, gamma):\n", 81 | " '''\n", 82 | " Compute the discounted reward backward\n", 83 | " '''\n", 84 | "\n", 85 | " disc_rew = np.zeros(len(memories))\n", 86 | " run_add = 0\n", 87 | "\n", 88 | " for t in reversed(range(len(memories))):\n", 89 | " if memories[t].done: run_add = 0\n", 90 | " run_add = run_add * gamma + memories[t].reward\n", 91 | " disc_rew[t] = run_add\n", 92 | "\n", 93 | " return disc_rew" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "Memory = namedtuple('Memory', ['obs', 'action', 'new_obs', 'reward', 'done'], verbose=False, rename=False)\n", 103 | "\n", 104 | "GAMMA = 0.99\n", 105 | "LEARNING_RATE = 0.002\n", 106 | "ENTROPY_BETA = 0.01\n", 107 | "ENV_NAME = 'CartPole-v0'\n", 108 | "\n", 109 | "MAX_N_GAMES = 10000\n", 110 | "n_games = 0\n", 111 | "\n", 112 | "device = 'cpu'\n", 113 | "\n", 114 | "now = datetime.datetime.now()\n", 115 | "date_time = \"{}_{}.{}.{}\".format(now.day, now.hour, now.minute, now.second)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "env = gym.make(ENV_NAME)\n", 125 | "obs = env.reset()\n", 126 | "\n", 127 | "# Initialize the writer\n", 128 | "writer = SummaryWriter(log_dir='content/runs/A2C'+ENV_NAME+'_'+date_time)\n", 129 | "\n", 130 | "# create the agent neural net\n", 131 | "action_n = env.action_space.n\n", 132 | "agent_nn = PG_nn(env.observation_space.shape, action_n).to(device)\n", 133 | "\n", 134 | "# Adam optimizer\n", 135 | "optimizer = optim.Adam(agent_nn.parameters(), lr=LEARNING_RATE)\n", 136 | "\n", 137 | "experience = []\n", 138 | "tot_reward = 0\n", 139 | "n_iter = 0\n", 140 | "# deque list to keep the baseline\n", 141 | "baseline = deque(maxlen=30000)\n", 142 | "game_rew = 0\n", 143 | "\n", 144 | "## MAIN BODY\n", 145 | "while n_games < MAX_N_GAMES:\n", 146 | "\n", 147 | " n_iter += 1\n", 148 | "\n", 149 | " # execute the agent\n", 150 | " act = agent_nn(torch.tensor(obs))\n", 151 | " act_soft = F.softmax(act)\n", 152 | " # get an action following the policy 
distribution\n", 153 | " action = int(np.random.choice(np.arange(action_n), p=act_soft.detach().numpy(), size=1))\n", 154 | "\n", 155 | " # make a step in the env\n", 156 | " new_obs, reward, done, _ = env.step(action)\n", 157 | "\n", 158 | " game_rew += reward\n", 159 | " # update the experience list with the last memory\n", 160 | " experience.append(Memory(obs=obs, action=action, new_obs=new_obs, reward=reward, done=done))\n", 161 | "\n", 162 | " obs = new_obs\n", 163 | "\n", 164 | " if done:\n", 165 | " # Calculate the discounted rewards\n", 166 | " disc_rewards = discounted_rewards(experience, GAMMA)\n", 167 | "\n", 168 | " # update the baseline\n", 169 | " baseline.extend(disc_rewards)\n", 170 | " # subtract the baseline mean from the discounted reward.\n", 171 | " disc_rewards -= np.mean(baseline)\n", 172 | "\n", 173 | " # run the agent NN on the obs in the experience list\n", 174 | " acts = agent_nn(torch.tensor([e.obs for e in experience]))\n", 175 | "\n", 176 | " # take the log softmax of the action taken previously\n", 177 | " game_act_log_softmax_t = F.log_softmax(acts, dim=1)[:,[e.action for e in experience]]\n", 178 | "\n", 179 | " disc_rewards_t = torch.tensor(disc_rewards, dtype=torch.float32).to(device)\n", 180 | "\n", 181 | " # compute the loss entropy\n", 182 | " l_entropy = ENTROPY_BETA * torch.mean(torch.sum(F.softmax(acts, dim=1) * F.log_softmax(acts, dim=1), dim=1))\n", 183 | "\n", 184 | " # compute the loss\n", 185 | " loss = - torch.mean(disc_rewards_t * game_act_log_softmax_t)\n", 186 | " loss = loss + l_entropy\n", 187 | "\n", 188 | " # optimize\n", 189 | " optimizer.zero_grad()\n", 190 | " loss.backward()\n", 191 | " optimizer.step()\n", 192 | "\n", 193 | " # print the stats\n", 194 | " writer.add_scalar('loss', loss, n_iter)\n", 195 | " writer.add_scalar('reward', game_rew, n_iter)\n", 196 | "\n", 197 | " print(n_games, loss.detach().numpy(), game_rew, np.mean(disc_rewards), np.mean(baseline))\n", 198 | "\n", 199 | " # reset the variables and env\n", 200 | " experience = []\n", 201 | " game_rew = 0\n", 202 | " obs = env.reset()\n", 203 | " n_games += 1\n", 204 | "\n", 205 | "\n", 206 | "writer.close()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "![Reward](imgs/reward_pg.png)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [] 222 | } 223 | ], 224 | "metadata": { 225 | "kernelspec": { 226 | "display_name": "Python 3", 227 | "language": "python", 228 | "name": "python3" 229 | }, 230 | "language_info": { 231 | "codemirror_mode": { 232 | "name": "ipython", 233 | "version": 3 234 | }, 235 | "file_extension": ".py", 236 | "mimetype": "text/x-python", 237 | "name": "python", 238 | "nbconvert_exporter": "python", 239 | "pygments_lexer": "ipython3", 240 | "version": "3.5.2" 241 | } 242 | }, 243 | "nbformat": 4, 244 | "nbformat_minor": 2 245 | } 246 | -------------------------------------------------------------------------------- /Week4/imgs/Advantage_actor_critic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/Advantage_actor_critic.png -------------------------------------------------------------------------------- /Week4/imgs/Vanilla_policy_gradient.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/Vanilla_policy_gradient.png -------------------------------------------------------------------------------- /Week4/imgs/actions_plot_a2c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/actions_plot_a2c.png -------------------------------------------------------------------------------- /Week4/imgs/loss_plot_a2c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/loss_plot_a2c.png -------------------------------------------------------------------------------- /Week4/imgs/nn_ac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/nn_ac.png -------------------------------------------------------------------------------- /Week4/imgs/reward_pg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/reward_pg.png -------------------------------------------------------------------------------- /Week4/imgs/reward_plot_a2c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/reward_plot_a2c.png -------------------------------------------------------------------------------- /Week5/PPO.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from tensorboardX import SummaryWriter 4 | 5 | import datetime 6 | from collections import namedtuple 7 | from collections import deque 8 | import math 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torch.nn.utils.clip_grad import clip_grad_norm_ 15 | 16 | class A2C_policy(nn.Module): 17 | ''' 18 | Policy neural network 19 | ''' 20 | def __init__(self, input_shape, n_actions): 21 | super(A2C_policy, self).__init__() 22 | 23 | self.lp = nn.Sequential( 24 | nn.Linear(input_shape[0], 32), 25 | nn.ReLU(), 26 | nn.Linear(32, 32), 27 | nn.ReLU()) 28 | 29 | self.mean_l = nn.Linear(32, n_actions[0]) 30 | self.mean_l.weight.data.mul_(0.1) 31 | 32 | self.var_l = nn.Linear(32, n_actions[0]) 33 | self.var_l.weight.data.mul_(0.1) 34 | 35 | self.logstd = nn.Parameter(torch.zeros(n_actions[0])) 36 | 37 | def forward(self, x): 38 | ot_n = self.lp(x.float()) 39 | return F.tanh(self.mean_l(ot_n)) 40 | 41 | class A2C_value(nn.Module): 42 | ''' 43 | Actor neural network 44 | ''' 45 | def __init__(self, input_shape): 46 | super(A2C_value, self).__init__() 47 | 48 | self.lp = nn.Sequential( 49 | nn.Linear(input_shape[0], 32), 50 | nn.ReLU(), 51 | nn.Linear(32, 32), 52 | nn.ReLU(), 53 | nn.Linear(32, 1)) 54 | 55 | 56 | def forward(self, x): 57 | return self.lp(x.float()) 58 | 59 | 60 | class Env: 61 | ''' 62 | Environment class 63 | ''' 64 | game_rew = 0 65 | last_game_rew = 0 66 | game_n = 0 67 | last_games_rews = [-200] 68 | n_iter = 0 69 | 70 | def 
__init__(self, env_name, n_steps, gamma, gae_lambda, save_video=False): 71 | super(Env, self).__init__() 72 | 73 | # create the new environment 74 | self.env = gym.make(env_name) 75 | self.obs = self.env.reset() 76 | 77 | self.n_steps = n_steps 78 | self.action_n = self.env.action_space.shape 79 | self.observation_n = self.env.observation_space.shape[0] 80 | self.gamma = gamma 81 | self.gae_lambda = gae_lambda 82 | 83 | # CHANGED 84 | def steps(self, agent_policy, agent_value): 85 | ''' 86 | Execute the agent n_steps in the environment 87 | ''' 88 | memories = [] 89 | for s in range(self.n_steps): 90 | self.n_iter += 1 91 | 92 | # get the agent policy 93 | ag_mean = agent_policy(torch.tensor(self.obs)) 94 | 95 | # get an action following the policy distribution 96 | logstd = agent_policy.logstd.data.cpu().numpy() 97 | action = ag_mean.data.cpu().numpy() + np.exp(logstd) * np.random.normal(size=logstd.shape) 98 | #action = np.random.normal(loc=ag_mean.data.cpu().numpy(), scale=torch.sqrt(ag_var).data.cpu().numpy()) 99 | action = np.clip(action, -1, 1) 100 | 101 | state_value = float(agent_value(torch.tensor(self.obs))) 102 | 103 | # Perform a step in the environment 104 | new_obs, reward, done, _ = self.env.step(action) 105 | 106 | # Update the memories with the last interaction 107 | if done: 108 | # change the reward to 0 in case the episode is end 109 | memories.append(Memory(obs=self.obs, action=action, new_obs=new_obs, reward=0, done=done, value=state_value, adv=0)) 110 | else: 111 | memories.append(Memory(obs=self.obs, action=action, new_obs=new_obs, reward=reward, done=done, value=state_value, adv=0)) 112 | 113 | 114 | self.game_rew += reward 115 | self.obs = new_obs 116 | 117 | if done: 118 | print('#####',self.game_n, 'rew:', int(self.game_rew), int(np.mean(self.last_games_rews[-100:])), np.round(reward,2), self.n_iter) 119 | 120 | # reset the environment 121 | self.obs = self.env.reset() 122 | self.last_game_rew = self.game_rew 123 | self.game_rew = 0 124 | self.game_n += 1 125 | self.n_iter = 0 126 | self.last_games_rews.append(self.last_game_rew) 127 | 128 | # compute the discount reward of the memories and return it 129 | return self.generalized_advantage_estimation(memories) 130 | 131 | def generalized_advantage_estimation(self, memories): 132 | ''' 133 | Calculate the advantage diuscounted reward as in the paper 134 | ''' 135 | upd_memories = [] 136 | run_add = 0 137 | 138 | for t in reversed(range(len(memories)-1)): 139 | if memories[t].done: 140 | run_add = memories[t].reward 141 | else: 142 | sigma = memories[t].reward + self.gamma * memories[t+1].value - memories[t].value 143 | run_add = sigma + run_add * self.gamma * self.gae_lambda 144 | 145 | ## NB: the last memoy is missing 146 | # Update the memories with the discounted reward 147 | upd_memories.append(Memory(obs=memories[t].obs, action=memories[t].action, new_obs=memories[t].new_obs, reward=run_add + memories[t].value, done=memories[t].done, value=memories[t].value, adv=run_add)) 148 | 149 | return upd_memories[::-1] 150 | 151 | 152 | def log_policy_prob(mean, std, actions): 153 | # policy log probability 154 | act_log_softmax = -((mean-actions)**2)/(2*torch.exp(std).clamp(min=1e-4)) - torch.log(torch.sqrt(2*math.pi*torch.exp(std))) 155 | return act_log_softmax 156 | 157 | def compute_log_policy_prob(memories, nn_policy, device): 158 | ''' 159 | Run the policy on the observation in the memory and compute the policy log probability 160 | ''' 161 | n_mean = nn_policy(torch.tensor(np.array([m.obs for m in memories], 
dtype=np.float32)).to(device)) 162 | n_mean = n_mean.type(torch.DoubleTensor) 163 | logstd = agent_policy.logstd.type(torch.DoubleTensor) 164 | 165 | actions = torch.DoubleTensor(np.array([m.action for m in memories])).to(device) 166 | 167 | return log_policy_prob(n_mean, logstd, actions) 168 | 169 | def clipped_PPO_loss(memories, nn_policy, nn_value, old_log_policy, adv, epsilon, writer, device): 170 | ''' 171 | Clipped PPO loss as in the paperself. 172 | It return the clipped policy loss and the value loss 173 | ''' 174 | 175 | # state value 176 | rewards = torch.tensor(np.array([m.reward for m in memories], dtype=np.float32)).to(device) 177 | value = nn_value(torch.tensor(np.array([m.obs for m in memories], dtype=np.float32)).to(device)) 178 | # Value loss 179 | vl_loss = F.mse_loss(value.squeeze(-1), rewards) 180 | 181 | new_log_policy = compute_log_policy_prob(memories, nn_policy, device) 182 | rt_theta = torch.exp(new_log_policy - old_log_policy.detach()) 183 | 184 | adv = adv.unsqueeze(-1) # add a dimension because rt_theta has shape: [batch_size, n_actions] 185 | pg_loss = -torch.mean(torch.min(rt_theta*adv, torch.clamp(rt_theta, 1-epsilon, 1+epsilon)*adv)) 186 | 187 | return pg_loss, vl_loss 188 | 189 | def test_game(tst_env, agent_policy, test_episodes): 190 | ''' 191 | Execute test episodes on the test environment 192 | ''' 193 | 194 | reward_games = [] 195 | steps_games = [] 196 | for _ in range(test_episodes): 197 | obs = tst_env.reset() 198 | rewards = 0 199 | steps = 0 200 | while True: 201 | ag_mean = agent_policy(torch.tensor(obs)) 202 | action = np.clip(ag_mean.data.cpu().numpy().squeeze(), -1, 1) 203 | 204 | next_obs, reward, done, _ = tst_env.step(action) 205 | steps += 1 206 | obs = next_obs 207 | rewards += reward 208 | 209 | if done: 210 | reward_games.append(rewards) 211 | steps_games.append(steps) 212 | obs = tst_env.reset() 213 | break 214 | 215 | return np.mean(reward_games), np.mean(steps_games) 216 | 217 | 218 | Memory = namedtuple('Memory', ['obs', 'action', 'new_obs', 'reward', 'done', 'value', 'adv'], verbose=False, rename=False) 219 | 220 | # Hyperparameters 221 | ENV_NAME = 'BipedalWalker-v2' 222 | #ENV_NAME = 'BipedalWalkerHardcore-v2' 223 | 224 | MAX_ITER = 500000 225 | 226 | BATCH_SIZE = 64 227 | PPO_EPOCHS = 7 228 | device = 'cpu' 229 | CLIP_GRADIENT = 0.2 230 | CLIP_EPS = 0.2 231 | 232 | TRAJECTORY_SIZE = 2049 233 | GAE_LAMBDA = 0.95 234 | GAMMA = 0.99 235 | 236 | ## Test Hyperparameters 237 | test_episodes = 5 238 | best_test_result = -1e5 239 | save_video_test = True 240 | N_ITER_TEST = 100 241 | 242 | POLICY_LR = 0.0004 243 | VALUE_LR = 0.001 244 | now = datetime.datetime.now() 245 | date_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 246 | 247 | load_model = False 248 | checkpoint_name = "checkpoints/..." 
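# ----------------------------------------------------------------------
# Reference note on the math implemented above, written with the same
# symbols used by generalized_advantage_estimation and clipped_PPO_loss:
#   GAE:    delta_t = r_t + GAMMA * V(s_{t+1}) - V(s_t)
#           A_t     = delta_t + GAMMA * GAE_LAMBDA * A_{t+1}
#   The 'reward' stored in each Memory is the value target A_t + V(s_t),
#   later regressed by the value network with an MSE loss.
#   Clipped surrogate: rt = exp(log_pi_new(a|s) - log_pi_old(a|s))
#           pg_loss = -mean( min(rt * A_t, clip(rt, 1 - CLIP_EPS, 1 + CLIP_EPS) * A_t) )
# ----------------------------------------------------------------------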
249 | 250 | if __name__ == '__main__': 251 | # Create the environment 252 | env = Env(ENV_NAME, TRAJECTORY_SIZE, GAMMA, GAE_LAMBDA) 253 | 254 | writer_name = 'PPO_'+ENV_NAME+'_'+date_time+'_'+str(POLICY_LR)+'_'+str(VALUE_LR)+'_'+str(TRAJECTORY_SIZE)+'_'+str(BATCH_SIZE) 255 | writer = SummaryWriter(log_dir='content/runs/'+writer_name) 256 | 257 | # create the test environment 258 | test_env = gym.make(ENV_NAME) 259 | if save_video_test: 260 | test_env = gym.wrappers.Monitor(test_env, "VIDEOS/TEST_VIDEOS_"+writer_name, video_callable=lambda episode_id: episode_id%10==0) 261 | 262 | # initialize the actor-critic NN 263 | agent_policy = A2C_policy(test_env.observation_space.shape, test_env.action_space.shape).to(device) 264 | agent_value = A2C_value(test_env.observation_space.shape).to(device) 265 | 266 | # initialize policy and value optimizer 267 | optimizer_policy = optim.Adam(agent_policy.parameters(), lr=POLICY_LR) 268 | optimizer_value = optim.Adam(agent_value.parameters(), lr=VALUE_LR) 269 | 270 | # Do you want to load a trained model? 271 | if load_model: 272 | print('> Loading checkpoint {}'.format(checkpoint_name)) 273 | checkpoint = torch.load(checkpoint_name) 274 | agent_policy.load_state_dict(checkpoint['agent_policy']) 275 | agent_value.load_state_dict(checkpoint['agent_value']) 276 | optimizer_policy.load_state_dict(checkpoint['optimizer_policy']) 277 | optimizer_value.load_state_dict(checkpoint['optimizer_value']) 278 | 279 | 280 | experience = [] 281 | n_iter = 0 282 | 283 | while n_iter < MAX_ITER: 284 | n_iter += 1 285 | 286 | batch = env.steps(agent_policy, agent_value) 287 | 288 | # Compute the policy probability with the old policy network 289 | old_log_policy = compute_log_policy_prob(batch, agent_policy, device) 290 | 291 | # Gather the advantage from the memory.. 292 | batch_adv = np.array([m.adv for m in batch]) 293 | # .. 
and normalize it to stabilize network 294 | batch_adv = (batch_adv - np.mean(batch_adv)) / (np.std(batch_adv) + 1e-7) 295 | batch_adv = torch.tensor(batch_adv).to(device) 296 | 297 | # variables to accumulate losses 298 | pol_loss_acc = [] 299 | val_loss_acc = [] 300 | 301 | # execute PPO_EPOCHS epochs 302 | for s in range(PPO_EPOCHS): 303 | # compute the loss and optimize over mini batches of size BATCH_SIZE 304 | for mb in range(0, len(batch), BATCH_SIZE): 305 | mini_batch = batch[mb:mb+BATCH_SIZE] 306 | minib_old_log_policy = old_log_policy[mb:mb+BATCH_SIZE] 307 | minib_adv = batch_adv[mb:mb+BATCH_SIZE] 308 | 309 | # Compute the PPO clipped loss and the value loss 310 | pol_loss, val_loss = clipped_PPO_loss(mini_batch, agent_policy, agent_value, minib_old_log_policy, minib_adv, CLIP_EPS, writer, device) 311 | 312 | # optimize the policy network 313 | optimizer_policy.zero_grad() 314 | pol_loss.backward() 315 | optimizer_policy.step() 316 | 317 | # optimize the value network 318 | optimizer_value.zero_grad() 319 | val_loss.backward() 320 | optimizer_value.step() 321 | 322 | pol_loss_acc.append(float(pol_loss)) 323 | val_loss_acc.append(float(val_loss)) 324 | 325 | # add scalars to the tensorboard 326 | writer.add_scalar('pg_loss', np.mean(pol_loss_acc), n_iter) 327 | writer.add_scalar('vl_loss', np.mean(val_loss_acc), n_iter) 328 | writer.add_scalar('rew', env.last_game_rew, n_iter) 329 | writer.add_scalar('10rew', np.mean(env.last_games_rews[-100:]), n_iter) 330 | 331 | # Test the agent 332 | if n_iter % N_ITER_TEST == 0: 333 | test_rews, test_stps = test_game(test_env, agent_policy, test_episodes) 334 | print(' > Testing..', n_iter,test_rews, test_stps) 335 | # if it achieve the best results so far, save the models 336 | if test_rews > best_test_result: 337 | torch.save({ 338 | 'agent_policy': agent_policy.state_dict(), 339 | 'agent_value': agent_value.state_dict(), 340 | 'optimizer_policy': optimizer_policy.state_dict(), 341 | 'optimizer_value': optimizer_value.state_dict(), 342 | 'test_reward': test_rews 343 | }, 'checkpoints/checkpoint_'+writer_name+'.pth.tar') 344 | best_test_result = test_rews 345 | print('=> Best test!! Reward:{:.2f} Steps:{}'.format(test_rews, test_stps)) 346 | 347 | writer.add_scalar('test_rew', test_rews, n_iter) 348 | 349 | 350 | writer.close() 351 | -------------------------------------------------------------------------------- /Week5/README.md: -------------------------------------------------------------------------------- 1 | # Let's solve BipedalWalker with PPO 2 | 3 | This is an implementation of [PPO](https://blog.openai.com/openai-baselines-ppo/) with continuous actions, a new algorithm developed by OpenAI that has been used in [OpenAI Five to play Dota 2](https://blog.openai.com/openai-five/). 4 | 5 | PPO is a policy gradient method that differently from the vanilla implementation, it combines the sampling data through interaction with the environment and the optimization of a surrogate objective function. Read the [paper](https://arxiv.org/pdf/1707.06347.pdf) to learn more about it. 6 | 7 | For the DQN implementation and the choose of the hyperparameters, I mostly followed the [paper](https://arxiv.org/pdf/1707.06347.pdf). (In the last page there is a table with all the hyperparameters.). 
In case you want to fine-tune them, check out [Training with Proximal Policy Optimization](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-PPO.md) 8 | 9 | ### [Learn the theory behind PPO](https://github.com/andri27-ts/60_Days_RL_Challenge/blob/master/README.md#week-5---advanced-policy-gradients---trpo--ppo) 10 | 11 | 12 | ## Results 13 | 14 | ![walker gif](imgs/walker_gif.gif) 15 | 16 | In the plot below are shown the rewards. The game defines "solving" as getting an average reward of 300 over 100 consecutive trials. We aren't at that level yet, but is possible to reach that goal tuning the hyperparameters and playing more episodes. 17 | 18 | ![results](imgs/rew_walker.png) 19 | 20 | 21 | ## Install 22 | 23 | ``` 24 | pip install gym 25 | pip install torch torchvision 26 | pip install tensorboardX 27 | apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb ffmpeg xorg-dev python-opengl libboost-all-dev libsdl2-dev swig 28 | 29 | git clone https://github.com/pybox2d/pybox2d 30 | cd pybox2d 31 | !pip install -e . 32 | ``` 33 | -------------------------------------------------------------------------------- /Week5/imgs/rew_walker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week5/imgs/rew_walker.png -------------------------------------------------------------------------------- /Week5/imgs/walker_gif.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week5/imgs/walker_gif.gif -------------------------------------------------------------------------------- /Week6/ES.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorboardX 3 | import time 4 | import datetime 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.multiprocessing as mp 10 | from torch import optim 11 | 12 | import scipy.stats as ss 13 | from tensorboardX import SummaryWriter 14 | import gym 15 | 16 | 17 | class NeuralNetwork(nn.Module): 18 | ''' 19 | Neural network for continuous action space 20 | ''' 21 | def __init__(self, input_shape, n_actions): 22 | super(NeuralNetwork, self).__init__() 23 | 24 | self.mlp = nn.Sequential( 25 | nn.Linear(input_shape, 32), 26 | nn.Tanh(), 27 | nn.Linear(32, 32), 28 | nn.Tanh()) 29 | 30 | self.mean_l = nn.Linear(32, n_actions) 31 | self.mean_l.weight.data.mul_(0.1) 32 | 33 | self.var_l = nn.Linear(32, n_actions) 34 | self.var_l.weight.data.mul_(0.1) 35 | 36 | self.logstd = nn.Parameter(torch.zeros(n_actions)) 37 | 38 | def forward(self, x): 39 | ot_n = self.mlp(x.float()) 40 | return torch.tanh(self.mean_l(ot_n)) 41 | 42 | 43 | def sample_noise(neural_net): 44 | ''' 45 | Sample noise for each parameter of the neural net 46 | ''' 47 | nn_noise = [] 48 | for n in neural_net.parameters(): 49 | noise = np.random.normal(size=n.data.numpy().shape) 50 | nn_noise.append(noise) 51 | return np.array(nn_noise) 52 | 53 | def evaluate_neuralnet(nn, env): 54 | ''' 55 | Evaluate an agent running it in the environment and computing the total reward 56 | ''' 57 | obs = env.reset() 58 | game_reward = 0 59 | 60 | while True: 61 | # Output of the neural net 62 | net_output = nn(torch.tensor(obs)) 63 | # the action is the value clipped returned by the nn 64 | 
action = np.clip(net_output.data.cpu().numpy().squeeze(), -1, 1) 65 | new_obs, reward, done, _ = env.step(action) 66 | obs = new_obs 67 | 68 | game_reward += reward 69 | 70 | if done: 71 | break 72 | 73 | return game_reward 74 | 75 | def evaluate_noisy_net(noise, neural_net, env): 76 | ''' 77 | Evaluate a noisy agent by adding the noise to the plain agent 78 | ''' 79 | old_dict = neural_net.state_dict() 80 | 81 | # add the noise to each parameter of the NN 82 | for n, p in zip(noise, neural_net.parameters()): 83 | p.data += torch.FloatTensor(n * STD_NOISE) 84 | 85 | # evaluate the agent with the noise 86 | reward = evaluate_neuralnet(neural_net, env) 87 | # load the previous paramater (the ones without the noise) 88 | neural_net.load_state_dict(old_dict) 89 | 90 | return reward 91 | 92 | def worker(params_queue, output_queue): 93 | ''' 94 | Function execute by each worker: get the agent' NN, sample noise and evaluate the agent adding the noise. Then return the seed and the rewards to the central unit 95 | ''' 96 | 97 | env = gym.make(ENV_NAME) 98 | actor = NeuralNetwork(env.observation_space.shape[0], env.action_space.shape[0]) 99 | 100 | while True: 101 | # get the new actor's params 102 | act_params = params_queue.get() 103 | if act_params != None: 104 | # load the actor params 105 | actor.load_state_dict(act_params) 106 | 107 | # get a random seed 108 | seed = np.random.randint(1e6) 109 | # set the new seed 110 | np.random.seed(seed) 111 | 112 | noise = sample_noise(actor) 113 | 114 | pos_rew = evaluate_noisy_net(noise, actor, env) 115 | # Mirrored sampling 116 | neg_rew = evaluate_noisy_net(-noise, actor, env) 117 | 118 | output_queue.put([[pos_rew, neg_rew], seed]) 119 | else: 120 | break 121 | 122 | 123 | def normalized_rank(rewards): 124 | ''' 125 | Rank the rewards and normalize them. 
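    E.g. with no ties: rewards [3.0, -1.0, 2.0] -> ranks [3, 1, 2] -> zero-centred values [0.5, -0.5, 0.0].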
126 | ''' 127 | ranked = ss.rankdata(rewards) 128 | norm = (ranked - 1) / (len(ranked) - 1) 129 | norm -= 0.5 130 | return norm 131 | 132 | 133 | ENV_NAME = 'LunarLanderContinuous-v2' 134 | 135 | # Hyperparameters 136 | STD_NOISE = 0.05 137 | BATCH_SIZE = 100 138 | LEARNING_RATE = 0.01 139 | MAX_ITERATIONS = 10000 140 | 141 | MAX_WORKERS = 4 142 | 143 | save_video_test = True 144 | VIDEOS_INTERVAL = 100 145 | 146 | now = datetime.datetime.now() 147 | date_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 148 | 149 | if __name__ == '__main__': 150 | # Writer name 151 | writer_name = 'ASY_ES_{}_{}_{}_{}_{}_{}'.format(ENV_NAME, date_time, str(STD_NOISE), str(BATCH_SIZE), str(LEARNING_RATE), str(MAX_ITERATIONS), str(MAX_WORKERS)) 152 | print('Name:', writer_name) 153 | 154 | # Create the test environment 155 | env = gym.make(ENV_NAME) 156 | if save_video_test: 157 | env = gym.wrappers.Monitor(env, "VIDEOS/TEST_VIDEOS_"+writer_name, video_callable=lambda episode_id: True) 158 | 159 | # Initialize the agent 160 | actor = NeuralNetwork(env.observation_space.shape[0], env.action_space.shape[0]) 161 | # Initialize the optimizer 162 | optimizer = optim.Adam(actor.parameters(), lr=LEARNING_RATE) 163 | 164 | writer = SummaryWriter(log_dir='content/runs/'+writer_name) 165 | 166 | # Queues to pass and get the variables to and from each processe 167 | output_queue = mp.Queue(maxsize=BATCH_SIZE) 168 | params_queue = mp.Queue(maxsize=BATCH_SIZE) 169 | 170 | processes = [] 171 | 172 | # Create and start the processes 173 | for _ in range(MAX_WORKERS): 174 | p = mp.Process(target=worker, args=(params_queue, output_queue)) 175 | p.start() 176 | processes.append(p) 177 | 178 | 179 | # Execute the main loop MAX_ITERATIONS times 180 | for n_iter in range(MAX_ITERATIONS): 181 | it_time = time.time() 182 | 183 | batch_noise = [] 184 | batch_reward = [] 185 | 186 | # create the queue with the actor parameters 187 | for _ in range(BATCH_SIZE): 188 | params_queue.put(actor.state_dict()) 189 | 190 | # receive from each worker the results (the seed and the rewards) 191 | for i in range(BATCH_SIZE): 192 | p_rews, p_seed = output_queue.get() 193 | 194 | np.random.seed(p_seed) 195 | noise = sample_noise(actor) 196 | batch_noise.append(noise) 197 | batch_noise.append(-noise) 198 | 199 | batch_reward.append(p_rews[0]) # reward of the positive noise 200 | batch_reward.append(p_rews[1]) # reward of the negative noise 201 | 202 | # Print some stats 203 | print(n_iter, 'Mean:',np.round(np.mean(batch_reward), 2), 'Max:', np.round(np.max(batch_reward), 2), 'Time:', np.round(time.time()-it_time, 2)) 204 | writer.add_scalar('reward', np.mean(batch_reward), n_iter) 205 | 206 | # Rank the reward and normalize it 207 | batch_reward = normalized_rank(batch_reward) 208 | 209 | 210 | th_update = [] 211 | optimizer.zero_grad() 212 | # for each actor's parameter, and for each noise in the batch, update it by the reward * the noise value 213 | for idx, p in enumerate(actor.parameters()): 214 | upd_weights = np.zeros(p.data.shape) 215 | 216 | for n,r in zip(batch_noise, batch_reward): 217 | upd_weights += r*n[idx] 218 | 219 | upd_weights = upd_weights / (BATCH_SIZE*STD_NOISE) 220 | # put the updated weight on the gradient variable so that afterwards the optimizer will use it 221 | p.grad = torch.FloatTensor( -upd_weights) 222 | th_update.append(np.mean(upd_weights)) 223 | 224 | # Optimize the actor's NN 225 | optimizer.step() 226 | 227 | writer.add_scalar('loss', np.mean(th_update), n_iter) 228 | 229 | if n_iter % 
VIDEOS_INTERVAL == 0: 230 | print('Test reward:',evaluate_neuralnet(actor, env)) 231 | 232 | # quit the processes 233 | for _ in range(MAX_WORKERS): 234 | params_queue.put(None) 235 | 236 | for p in processes: 237 | p.join() 238 | 239 | # tensorboard --logdir content/runs --host localhost 240 | -------------------------------------------------------------------------------- /Week6/README.md: -------------------------------------------------------------------------------- 1 | # Scalable Evolution Strategies on LunarLander 2 | 3 | Evolution Strategies is a valid alternative to the most popular MDP-based RL techniques. Here is provided an implementation of the OpenAI paper [Evolution Strategies as a 4 | Scalable Alternative to Reinforcement Learning](https://arxiv.org/pdf/1703.03864.pdf). I decided to test it on [LunarLander](https://gym.openai.com/envs/LunarLanderContinuous-v2/) Gym environment to show the applicability and competitiveness of this category of algorithms. 5 | 6 | The following are the key parts of this implementation: 7 | - Novel communication strategy based on common random number 8 | - Mirrored sampling 9 | - Normalized rank 10 | 11 | 12 | ### [Learn more about Evolution Strategies](https://github.com/andri27-ts/60_Days_RL_Challenge#week-6---evolution-strategies-and-genetic-algorithms) 13 | 14 | 15 | 16 | ## Results 17 | 18 | ![LunarLander](imgs/LunarLanderContinuous.gif) 19 | 20 | The following plot shows the reward for each iteration. ES is able to solve the game after 650 iterations. Keep in mind that in this version, for each iteration, 100 games are played. This means that the algorithm solved the gamed after having played about 65.000 games. 21 | 22 | ![results](imgs/plot_rewards.PNG) 23 | 24 | 25 | ## Install 26 | 27 | ``` 28 | pip install gym 29 | pip install torch torchvision 30 | pip install tensorboardX 31 | apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb ffmpeg xorg-dev python-opengl libboost-all-dev libsdl2-dev swig 32 | 33 | git clone https://github.com/pybox2d/pybox2d 34 | cd pybox2d 35 | pip install -e . 36 | ``` 37 | -------------------------------------------------------------------------------- /Week6/imgs/LunarLanderContinuous.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week6/imgs/LunarLanderContinuous.gif -------------------------------------------------------------------------------- /Week6/imgs/plot_rewards.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week6/imgs/plot_rewards.PNG -------------------------------------------------------------------------------- /Week7/README.md: -------------------------------------------------------------------------------- 1 | # Model-Based Reinforcement Learning 2 | 3 | The strength of model-based reinforcement learning algorithms is that, once they learned the environment, they can plan the next actions to take. This allows the agent to transfer the knowledge of the environment it has acquired to other tasks. Model-based methods generally are more sample efficient than model-free to the detriment of performance. 4 | Better and more efficient RL algorithms can be obtained merging these two techniques. 
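To make the planning step concrete, here is a minimal sketch (in plain NumPy, under simplifying assumptions) of the random-shooting controller idea used in `model_based.py` below: sample many random action sequences, roll them forward with the learned models, and execute only the first action of the best-scoring sequence. The `dynamics_model` and `reward_model` callables are stand-ins for the trained networks and their scalers in the actual code.

```
import numpy as np

def random_shooting_action(obs, dynamics_model, reward_model, sample_action,
                           n_sequences=1000, horizon=10):
    '''Return the first action of the best random action sequence.
    Assumes dynamics_model(obs, act) predicts the state *delta* and
    reward_model(obs, act) the one-step reward, both for a batch of inputs.'''
    obs_batch = np.repeat(obs[None, :], n_sequences, axis=0)  # one copy of the state per candidate sequence
    total_rew = np.zeros(n_sequences)
    first_actions = None
    for t in range(horizon):
        actions = np.stack([sample_action() for _ in range(n_sequences)])
        if t == 0:
            first_actions = actions                           # remember the first action of every sequence
        total_rew += reward_model(obs_batch, actions)         # accumulate the predicted rewards
        obs_batch = obs_batch + dynamics_model(obs_batch, actions)  # next state = state + predicted delta
    return first_actions[np.argmax(total_rew)]                # first action of the highest-scoring sequence
```

Since no gradients flow through the controller, the quality of the chosen action depends entirely on how accurate the learned models stay over the planning horizon; this is why the data collected by the controller itself can be aggregated back into the training set in later iterations.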
5 | This repository contains an implementation of the model-based algorithm proposed in section IV of [this paper](https://arxiv.org/pdf/1708.02596.pdf) with some differences: 6 | - used [Roboschool](https://github.com/openai/roboschool) instead of [Mujoco](http://www.mujoco.org/). 7 | - along with the next environment state, also the reward is learned. To do that another neural network has been used. 8 | - hyperparameters have been adapted to the new environment and problem reformulation (i.e. the reward has to be learned). 9 | 10 | The pseudocode of the main loop is the following: 11 | 12 | ![pseudocode](imgs/pseudocode.png) 13 | 14 | 15 | 16 | ### [Learn more about Model-Based Reinforcement Learning](https://github.com/andri27-ts/60_Days_RL_Challenge#week-7---model-based-reinforcement-learning) 17 | 18 | ## Results 19 | 20 | ![animation](imgs/animation.gif) 21 | 22 | To train RoboschoolAnt-v1, no aggregation steps has been used. 23 | On RoboschoolAnt, playing 10.000 games it achieves a mean reward of about 800. These games have been played only taking random actions. 24 | 25 | 26 | ## Install 27 | 28 | Roboschool installation: 29 | ``` 30 | apt install cmake ffmpeg pkg-config qtbase5-dev libqt5opengl5-dev libassimp-dev libpython3.6-dev libboost-python-dev libtinyxml-dev 31 | 32 | git clone https://github.com/openai/gym 33 | pip install -e gym 34 | 35 | git clone https://github.com/openai/roboschool 36 | 37 | cd roboschool 38 | ROBOSCHOOL_PATH=`pwd` 39 | git clone https://github.com/olegklimov/bullet3 -b roboschool_self_collision 40 | mkdir bullet3/build 41 | cd bullet3/build 42 | cmake -DBUILD_SHARED_LIBS=ON -DUSE_DOUBLE_PRECISION=1 -DCMAKE_INSTALL_PREFIX:PATH=$ROBOSCHOOL_PATH/roboschool/cpp-household/bullet_local_install -DBUILD_CPU_DEMOS=OFF -DBUILD_BULLET2_DEMOS=OFF -DBUILD_EXTRAS=OFF -DBUILD_UNIT_TESTS=OFF -DBUILD_CLSOCKET=OFF -DBUILD_ENET=OFF -DBUILD_OPENGL3_DEMOS=OFF .. 43 | make -j4 44 | make install 45 | cd ../.. 46 | 47 | 48 | pip3 install -e $ROBOSCHOOL_PATH 49 | ``` 50 | 51 | Torch installation: 52 | ``` 53 | pip install torch torchvision 54 | ``` 55 | 56 | In case you use Google Colab, run 57 | 58 | ``` 59 | # Install Chainer, ChainerRL and CuPy! 
60 | 61 | %%script bash 62 | 63 | apt-get -qq -y install libcusparse8.0 libnvrtc8.0 libnvtoolsext1 > /dev/null 64 | ln -snf /usr/lib/x86_64-linux-gnu/libnvrtc-builtins.so.8.0 /usr/lib/x86_64-linux-gnu/libnvrtc-builtins.so 65 | pip -q install https://github.com/kmaehashi/chainer-colab/releases/download/2018-02-06/cupy_cuda80-4.0.0b3-cp36-cp36m-linux_x86_64.whl 66 | pip -q install 'chainer==4.0.0b3' 67 | apt-get -qq -y install xvfb freeglut3-dev ffmpeg> /dev/null 68 | pip -q install chainerrl 69 | pip -q install gym 70 | pip -q install pyglet 71 | pip -q install pyopengl 72 | pip -q install pyvirtualdisplay 73 | ``` 74 | -------------------------------------------------------------------------------- /Week7/imgs/animation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week7/imgs/animation.gif -------------------------------------------------------------------------------- /Week7/imgs/pseudocode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week7/imgs/pseudocode.png -------------------------------------------------------------------------------- /Week7/model_based.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | # Needed only if you run it on Google Colab 4 | from pyvirtualdisplay import Display 5 | display = Display(visible=0, size=(1024, 768)) 6 | display.start() 7 | import os 8 | os.environ["DISPLAY"] = ":" + str(display.display) + "." + str(display.screen)''' 9 | 10 | 11 | from sklearn.preprocessing import StandardScaler 12 | import roboschool 13 | 14 | import torch 15 | import torch.nn as nn 16 | import torch.optim as optim 17 | from torch.nn import functional as F 18 | 19 | from tqdm import tqdm 20 | import datetime 21 | import time 22 | 23 | import gym 24 | import numpy as np 25 | 26 | class NNDynamicModel(nn.Module): 27 | ''' 28 | Model that predict the next state, given the current state and action 29 | ''' 30 | def __init__(self, input_dim, obs_output_dim): 31 | super(NNDynamicModel, self).__init__() 32 | 33 | self.mlp = nn.Sequential( 34 | nn.Linear(input_dim, 512), 35 | nn.BatchNorm1d(num_features=512), 36 | nn.ReLU(), 37 | nn.Linear(512,256), 38 | nn.BatchNorm1d(num_features=256), 39 | nn.ReLU(), 40 | nn.Linear(256, obs_output_dim) 41 | ) 42 | 43 | def forward(self, x): 44 | return self.mlp(x.float()) 45 | 46 | 47 | class NNRewardModel(nn.Module): 48 | ''' 49 | Model that predict the reward given the current state and action 50 | ''' 51 | def __init__(self, input_dim, reward_output_dim): 52 | super(NNRewardModel, self).__init__() 53 | 54 | self.mlp = nn.Sequential( 55 | nn.Linear(input_dim, 512), 56 | nn.BatchNorm1d(num_features=512), 57 | nn.ReLU(), 58 | nn.Linear(512,256), 59 | nn.BatchNorm1d(num_features=256), 60 | nn.ReLU(), 61 | nn.Linear(256, reward_output_dim) 62 | ) 63 | 64 | def forward(self, x): 65 | return self.mlp(x.float()) 66 | 67 | def gather_random_trajectories(num_traj, env_name): 68 | ''' 69 | Run num_traj random trajectories to gather information about the next state and reward. 70 | Data used to train the models in a supervised way. 
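    Each element of the returned dataset is a list [obs, new_obs, reward, done, action].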
71 | ''' 72 | dataset_random = [] 73 | env = gym.make(env_name) 74 | 75 | game_rewards = [] 76 | for n in range(num_traj): 77 | 78 | obs = env.reset() 79 | while True: 80 | sampled_action = env.action_space.sample() 81 | new_obs, reward, done, _ = env.step(sampled_action) 82 | 83 | dataset_random.append([obs, new_obs, reward, done, sampled_action]) 84 | 85 | obs = new_obs 86 | game_rewards.append(reward) 87 | 88 | if done: 89 | break 90 | 91 | # print some stats 92 | print('Mean R:',np.round(np.sum(game_rewards)/num_traj,2), 'Max R:', np.round(np.max(game_rewards),2), np.round(len(game_rewards)/num_traj)) 93 | 94 | return dataset_random 95 | 96 | def model_MSEloss(y_truth, y_pred, device): 97 | ''' 98 | Compute the MSE (Mean Squared Error) 99 | ''' 100 | y_truth = torch.FloatTensor(np.array(y_truth)).to(device) 101 | return F.mse_loss(y_pred.view(-1).float(), y_truth.view(-1)) 102 | 103 | 104 | def train_dyna_model(random_dataset, rl_dataset, env_model, rew_model, batch_size, max_model_iter, num_examples_added, ENV_LEARNING_RATE, REW_LEARNING_RATE, device): 105 | ''' 106 | Train the two models that predict the next state and the expected reward 107 | ''' 108 | 109 | env_optimizer = optim.Adam(env_model.parameters(), lr=ENV_LEARNING_RATE) 110 | rew_optimizer = optim.Adam(rew_model.parameters(), lr=REW_LEARNING_RATE) 111 | 112 | if len(rl_dataset) > 0: 113 | ''' 114 | # To use only a fraction of the random dataset 115 | rand = np.arange(len(random_dataset)) 116 | np.random.shuffle(rand) 117 | rand = rand[:int(len(rl_dataset)*0.8)] # 80% of rl dataset 118 | 119 | d_concat = np.concatenate([np.array(random_dataset)[rand], rl_dataset], axis=0)''' 120 | 121 | # Concatenate the random dataset with the RL dataset. Used only in the aggregation iterations 122 | d_concat = np.concatenate([random_dataset, rl_dataset], axis=0) 123 | else: 124 | d_concat = np.array(random_dataset) 125 | 126 | # Split the dataset into train(80%) and test(20%) 127 | D_train = d_concat[:int(-num_examples_added*1/5)] 128 | D_valid = d_concat[int(-num_examples_added*1/5):] 129 | 130 | print("len(D):", len(d_concat), 'len(Dtrain)', len(D_train)) 131 | 132 | # Shuffle the dataset 133 | sff = np.arange(len(D_train)) 134 | np.random.shuffle(sff) 135 | D_train = D_train[sff] 136 | 137 | 138 | # Create the input and output for the train 139 | X_train = np.array([np.concatenate([obs,act]) for obs,_,_,_,act in D_train]) # Takes obs and action 140 | # Reward's output 141 | y_rew_train = np.array([[rw] for _,_,rw,_,_ in D_train]) 142 | # Next state output 143 | y_env_train = np.array([no for _,no,_,_,_ in D_train]) 144 | y_env_train = y_env_train - np.array([obs for obs,_,_,_,_ in D_train]) # y(state) = s(t+1) - s(t) 145 | 146 | # Create the input and output array for the validation 147 | X_valid = np.array([np.concatenate([obs,act]) for obs,_,_,_,act in D_valid]) # Takes obs and action 148 | # Reward output 149 | y_rew_valid = np.array([[rw] for _,_,rw,_,_ in D_valid]) 150 | # Next state output 151 | y_env_valid = np.array([no for _,no,_,_,_ in D_valid]) 152 | y_env_valid = y_env_valid - np.array([obs for obs,_,_,_,_ in D_valid]) # y(state) = s(t+1) - s(t) 153 | 154 | # Standardize the input features by removing the mean and scaling to unit variance 155 | input_scaler = StandardScaler() 156 | X_train = input_scaler.fit_transform(X_train) 157 | X_valid = input_scaler.transform(X_valid) 158 | 159 | # Standardize the outputs by removing the mean and scaling to unit variance 160 | 161 | env_output_scaler = StandardScaler() 162 | 
y_env_train = env_output_scaler.fit_transform(y_env_train) 163 | y_env_valid = env_output_scaler.transform(y_env_valid) 164 | 165 | rew_output_scaler = StandardScaler() 166 | y_rew_train = rew_output_scaler.fit_transform(y_rew_train) 167 | y_rew_valid = rew_output_scaler.transform(y_rew_valid) 168 | 169 | # store all the scalers in a variable to later uses 170 | norm = (input_scaler, env_output_scaler, rew_output_scaler) 171 | 172 | losses_env = [] 173 | losses_rew = [] 174 | 175 | # go through max_model_iter supervised iterations 176 | for it in tqdm(range(max_model_iter)): 177 | # create mini batches of size batch_size 178 | for mb in range(0, len(X_train), batch_size): 179 | 180 | if len(X_train) > mb+BATCH_SIZE: 181 | X_mb = X_train[mb:mb+BATCH_SIZE] 182 | 183 | y_env_mb = y_env_train[mb:mb+BATCH_SIZE] 184 | y_rew_mb = y_rew_train[mb:mb+BATCH_SIZE] 185 | 186 | # Add gaussian noise with mean 0 and variance 0.0001 as in the paper 187 | X_mb += np.random.normal(loc=0, scale=0.001, size=X_mb.shape) 188 | 189 | ## Optimization of the 'env_model' neural net 190 | 191 | env_optimizer.zero_grad() 192 | # forward pass of the model to compute the output 193 | pred_state = env_model(torch.tensor(X_mb).to(device)) 194 | # compute the MSE loss 195 | loss = model_MSEloss(y_env_mb, pred_state, device) 196 | 197 | if it == (max_model_iter - 1): 198 | losses_env.append(loss.cpu().detach().numpy()) 199 | 200 | # backward pass 201 | loss.backward() 202 | # optimization step 203 | env_optimizer.step() 204 | 205 | 206 | ## Optimization of the 'rew_model' neural net 207 | rew_optimizer.zero_grad() 208 | # forward pass of the model to compute the output 209 | pred_rew = rew_model(torch.tensor(X_mb).to(device)) 210 | # compute the MSE loss 211 | loss = model_MSEloss(y_rew_mb, pred_rew, device) 212 | 213 | if it == (max_model_iter - 1): 214 | losses_rew.append(loss.cpu().detach().numpy()) 215 | # backward pass 216 | loss.backward() 217 | # optimization step 218 | rew_optimizer.step() 219 | 220 | # Evalute the models every 10 iterations and print the losses 221 | if it % 10 == 0: 222 | env_model.eval() 223 | rew_model.eval() 224 | 225 | pred_state = env_model(torch.tensor(X_valid).to(device)) 226 | pred_rew = rew_model(torch.tensor(X_valid).to(device)) 227 | env_model.train(True) 228 | rew_model.train(True) 229 | 230 | valid_env_loss = model_MSEloss(y_env_valid, pred_state, device) 231 | valid_rew_loss = model_MSEloss(y_rew_valid, pred_rew, device) 232 | 233 | print('..', it, valid_env_loss.cpu().detach().numpy(), valid_rew_loss.cpu().detach().numpy()) 234 | 235 | 236 | ## Evaluate the MSE losses 237 | 238 | env_model.eval() 239 | rew_model.eval() 240 | 241 | pred_state = env_model(torch.tensor(X_valid).to(device)) 242 | pred_rew = rew_model(torch.tensor(X_valid).to(device)) 243 | env_model.train(True) 244 | rew_model.train(True) 245 | 246 | valid_env_loss = model_MSEloss(y_env_valid, pred_state, device) 247 | valid_rew_loss = model_MSEloss(y_rew_valid, pred_rew, device) 248 | 249 | return np.mean(losses_env), np.mean(losses_rew), valid_env_loss.cpu().detach().numpy(), valid_rew_loss.cpu().detach().numpy(), norm 250 | 251 | 252 | def multi_model_based_control(env_model, rew_model, real_obs, num_sequences, horizon_length, sample_action, norm, device): 253 | ''' 254 | Use a random-sampling shooting method, generating random action sequences. 
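    All the candidate sequences are rolled forward in a single batch through the learned dynamics and reward models for horizon_length steps.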
The first action with the highest reward of the entire sequence is returned 255 | ''' 256 | best_reward = -1e9 257 | best_next_action = [] 258 | 259 | input_scaler, env_output_scaler, rew_output_scaler = norm 260 | 261 | m_obs = np.array([real_obs for _ in range(num_sequences)]) 262 | 263 | # array that contains the rewards for all the sequence 264 | unroll_rewards = np.zeros((num_sequences, 1)) 265 | first_sampled_actions = [] 266 | 267 | env_model.eval() 268 | rew_model.eval() 269 | 270 | ## Create a batch of size 'num_sequences' (number of trajectories) to roll the models 'horizon_length' times. 271 | ## i.e. roll a given number of trajectories in a single batch (to increase speed) 272 | 273 | for t in range(horizon_length): 274 | # sampled actions for each sequence 275 | sampled_actions = [sample_action() for _ in range(num_sequences)] 276 | # scale the input 277 | models_input = input_scaler.transform(np.concatenate([m_obs, sampled_actions], axis=1)) 278 | # compute the next state for each sequence 279 | pred_obs = env_model(torch.tensor(models_input).to(device)) 280 | # and the reward 281 | pred_rew = rew_model(torch.tensor(models_input).to(device)) 282 | 283 | # inverse scaler transofrmation 284 | pred_obs = env_output_scaler.inverse_transform(pred_obs.cpu().detach().numpy()) 285 | # and add previous observation 286 | m_obs = pred_obs + m_obs 287 | 288 | assert(pred_rew.cpu().detach().numpy().shape == unroll_rewards.shape) 289 | 290 | # sum of the expected rewards 291 | unroll_rewards += pred_rew.cpu().detach().numpy() 292 | 293 | if t == 0: 294 | first_sampled_actions = sampled_actions 295 | 296 | env_model.train(True) 297 | rew_model.train(True) 298 | 299 | # Best the position of the sequence with the higher reward 300 | arg_best_reward = np.argmax(unroll_rewards) 301 | best_sum_reward = unroll_rewards[arg_best_reward].squeeze() 302 | # take the first action of this sequence 303 | best_action = first_sampled_actions[arg_best_reward] 304 | 305 | return best_action, best_sum_reward 306 | 307 | 308 | ENV_NAME = 'RoboschoolAnt-v1' 309 | 310 | # Main loop hyperp 311 | AGGR_ITER = 3 312 | STEPS_PER_AGGR = 20000 313 | 314 | # Random MB hyperp 315 | NUM_RAND_TRAJECTORIES = 1000 316 | 317 | # 'cuda' or 'cpu' 318 | device = 'cuda' 319 | 320 | # Supervised Model Hyperp 321 | ENV_LEARNING_RATE = 1e-3 322 | REW_LEARNING_RATE = 1e-3 323 | BATCH_SIZE = 512 324 | TRAIN_ITER_MODEL = 55 325 | 326 | # Controller Hyperp 327 | HORIZION_LENGTH = 10 328 | NUM_ACTIONS_SEQUENCES = 20000 329 | 330 | save_video_test = True 331 | 332 | now = datetime.datetime.now() 333 | date_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second) 334 | 335 | if __name__ == '__main__': 336 | writer_name = 'MB_RL_'+ENV_NAME+'_'+date_time 337 | print('Name:',writer_name, device) 338 | 339 | # create the environment 340 | env = gym.make(ENV_NAME) 341 | if save_video_test: 342 | env = gym.wrappers.Monitor(env, "VIDEOS/TEST_VIDEOS_"+writer_name, video_callable=lambda episode_id: True) 343 | obs = env.reset() 344 | 345 | # gather the dataset of random sequences 346 | rand_dataset = gather_random_trajectories(NUM_RAND_TRAJECTORIES, ENV_NAME) 347 | 348 | rl_dataset = [] 349 | 350 | # Initialize the models 351 | env_model = NNDynamicModel(env.action_space.shape[0] + env.observation_space.shape[0], env.observation_space.shape[0]).to(device) 352 | rew_model = NNRewardModel(env.action_space.shape[0] + env.observation_space.shape[0], 1).to(device) 353 | 354 | 355 | game_reward = 0 356 | num_examples_added = 
308 | ENV_NAME = 'RoboschoolAnt-v1'
309 |
310 | # Main loop hyperparameters
311 | AGGR_ITER = 3
312 | STEPS_PER_AGGR = 20000
313 |
314 | # Random data-gathering hyperparameters
315 | NUM_RAND_TRAJECTORIES = 1000
316 |
317 | # 'cuda' or 'cpu'
318 | device = 'cuda'
319 |
320 | # Supervised model hyperparameters
321 | ENV_LEARNING_RATE = 1e-3
322 | REW_LEARNING_RATE = 1e-3
323 | BATCH_SIZE = 512
324 | TRAIN_ITER_MODEL = 55
325 |
326 | # Controller hyperparameters
327 | HORIZION_LENGTH = 10
328 | NUM_ACTIONS_SEQUENCES = 20000
329 |
330 | save_video_test = True
331 |
332 | now = datetime.datetime.now()
333 | date_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
334 |
335 | if __name__ == '__main__':
336 |     writer_name = 'MB_RL_'+ENV_NAME+'_'+date_time
337 |     print('Name:', writer_name, device)
338 |
339 |     # create the environment
340 |     env = gym.make(ENV_NAME)
341 |     if save_video_test:
342 |         env = gym.wrappers.Monitor(env, "VIDEOS/TEST_VIDEOS_"+writer_name, video_callable=lambda episode_id: True)
343 |     obs = env.reset()
344 |
345 |     # gather the dataset of random trajectories
346 |     rand_dataset = gather_random_trajectories(NUM_RAND_TRAJECTORIES, ENV_NAME)
347 |
348 |     rl_dataset = []
349 |
350 |     # Initialize the models
351 |     env_model = NNDynamicModel(env.action_space.shape[0] + env.observation_space.shape[0], env.observation_space.shape[0]).to(device)
352 |     rew_model = NNRewardModel(env.action_space.shape[0] + env.observation_space.shape[0], 1).to(device)
353 |
354 |
355 |     game_reward = 0
356 |     num_examples_added = len(rand_dataset)
357 |
358 |     for n_iter in range(AGGR_ITER):
359 |
360 |         # supervised training on the dataset (random trajectories, plus the RL ones when available)
361 |         train_env_loss, train_rew_loss, valid_env_loss, valid_rew_loss, norm = train_dyna_model(rand_dataset, rl_dataset, env_model, rew_model, BATCH_SIZE, TRAIN_ITER_MODEL, num_examples_added, ENV_LEARNING_RATE, REW_LEARNING_RATE, device)
362 |         print('{} >> Eloss:{:.4f} EV loss:{:.4f} -- Rloss:{:.4f} RV loss:{:.4f}'.format(n_iter, train_env_loss, valid_env_loss, train_rew_loss, valid_rew_loss))
363 |
364 |         obs = env.reset()
365 |
366 |         num_examples_added = 0
367 |         game_reward = 0
368 |         game_pred_rews = []
369 |         rews = []
370 |
371 |         while num_examples_added < STEPS_PER_AGGR:
372 |             while True:
373 |
374 |                 tt = time.time()
375 |                 # run the controller: roll the random sequences and pick the first action of the sequence with the highest predicted reward
376 |                 action, pred_rew = multi_model_based_control(env_model, rew_model, obs, NUM_ACTIONS_SEQUENCES, HORIZION_LENGTH, env.action_space.sample, norm, device)
377 |                 game_pred_rews.append(pred_rew)
378 |
379 |                 # one step in the environment with the action returned by the controller
380 |                 new_obs, reward, done, _ = env.step(action)
381 |
382 |                 input_scaler, env_output_scaler, rew_output_scaler = norm
383 |
384 |                 ## Compute the model's predicted reward for this single step and print some stats
385 |                 models_input = input_scaler.transform([np.concatenate([obs, action])])
386 |                 rew_model.eval()
387 |                 p_rew = rew_model(torch.tensor(models_input).to(device))
388 |                 rew_model.train(True)
389 |                 unnorm_rew = rew_output_scaler.inverse_transform([[float(p_rew.cpu().data[0])]]).squeeze()
390 |                 print(' >> ', len(game_pred_rews), 'gt:', np.round(reward, 3), 'pred:', np.round(unnorm_rew, 3),
391 |                       'sum:', np.round(pred_rew, 3), '|', game_reward, np.round(time.time()-tt, 4), HORIZION_LENGTH)
392 |
393 |                 # add the last step to the RL dataset
394 |                 rl_dataset.append([obs, new_obs, reward, done, action])
395 |
396 |
397 |                 num_examples_added += 1
398 |                 obs = new_obs
399 |                 game_reward += reward
400 |
401 |                 # if the episode is over, reset the environment and print some stats
402 |                 if done:
403 |                     obs = env.reset()
404 |                     print(' >> R: {:.2f}, Mean sum:{:.2f}, {}'.format(game_reward, np.mean(game_pred_rews), num_examples_added))
405 |
406 |                     rews.append(game_reward)
407 |                     game_reward = 0
408 |                     game_pred_rews = []
409 |                     break
410 |
411 |             print(' >> Mean: {:.2f}'.format(np.mean(rews)))
412 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-architect
--------------------------------------------------------------------------------
/images/GitHub-Mark-32px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/GitHub-Mark-32px.png
--------------------------------------------------------------------------------
/images/GitHub-Mark-64px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/GitHub-Mark-64px.png
--------------------------------------------------------------------------------
/images/frontcover2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/frontcover2.jpg
--------------------------------------------------------------------------------
/images/logo5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/logo5.png
--------------------------------------------------------------------------------
/images/logo6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/logo6.png
--------------------------------------------------------------------------------
/images/title3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/title3.png
--------------------------------------------------------------------------------
/images/youtube_social_icon_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/youtube_social_icon_dark.png
--------------------------------------------------------------------------------
/images/youtube_social_icon_red.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/youtube_social_icon_red.png
--------------------------------------------------------------------------------