├── .github
└── FUNDING.yml
├── .gitignore
├── LICENSE
├── README.md
├── Week2
├── frozenlake_Qlearning.ipynb
└── img
│ ├── Q_function.png
│ ├── frozenlake_v0.png
│ └── short_diag.jpg
├── Week3
├── README.md
├── agent.py
├── atari_wrappers.py
├── buffers.py
├── central_control.py
├── imgs
│ ├── DQN_variations.png
│ ├── Dueling_img.png
│ ├── double_Qlearning_formula.png
│ ├── multistep_formula.png
│ ├── noisenet_formula.png
│ └── pong_gif.gif
├── main.py
├── neural_net.py
└── utils.py
├── Week4
├── A2C.ipynb
├── PolicyGradient.ipynb
└── imgs
│ ├── Advantage_actor_critic.png
│ ├── Vanilla_policy_gradient.png
│ ├── actions_plot_a2c.png
│ ├── loss_plot_a2c.png
│ ├── nn_ac.png
│ ├── reward_pg.png
│ └── reward_plot_a2c.png
├── Week5
├── PPO.py
├── README.md
└── imgs
│ ├── rew_walker.png
│ └── walker_gif.gif
├── Week6
├── ES.py
├── README.md
└── imgs
│ ├── LunarLanderContinuous.gif
│ └── plot_rewards.PNG
├── Week7
├── README.md
├── imgs
│ ├── animation.gif
│ └── pseudocode.png
└── model_based.py
├── _config.yml
└── images
├── GitHub-Mark-32px.png
├── GitHub-Mark-64px.png
├── frontcover2.jpg
├── logo5.png
├── logo6.png
├── title3.png
├── youtube_social_icon_dark.png
└── youtube_social_icon_red.png
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: ['paypal.me/andrealonza']
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | cover/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Andrea Lonza
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | 
3 |
4 | ## Course in Deep Reinforcement Learning
5 |
6 | ### Explore the combination of neural networks and reinforcement learning. Algorithms and examples in Python & PyTorch
7 |
8 |
9 | Have you heard about the amazing results achieved by [Deepmind with AlphaGo Zero](https://www.youtube.com/watch?time_continue=24&v=tXlM99xPQC8) and by [OpenAI in Dota 2](https://www.youtube.com/watch?v=l92J1UvHf6M)? It's all about deep neural networks and reinforcement learning. Do you want to know more about it?
10 | This is the right opportunity for you to finally learn Deep RL and use it on new and exciting projects and applications.
11 |
12 | Here you'll find an in-depth introduction to these algorithms. Among others, you'll learn Q-learning, deep Q-learning, PPO, and actor-critic, and implement them using Python and PyTorch.
13 |
14 | > The ultimate aim is to use these general-purpose technologies and apply them to all sorts of important real world problems.
15 | > **Demis Hassabis**
16 |
17 |
18 | This repository contains:
19 |
20 |
21 |
22 |
**Lectures (& other content) primarily from the DeepMind and Berkeley YouTube channels.**
23 |
24 |
25 |
26 |
**Algorithms (like DQN, A2C, and PPO) implemented in PyTorch and tested on OpenAI Gym: RoboSchool & Atari.**
27 |
28 |
29 |
30 |
31 |
32 | **Stay tuned and follow me on** [](https://twitter.com/andri27_it) and [](https://github.com/andri27-ts) **#60DaysRLChallenge**
33 |
34 | We now also have a [**Slack channel**](https://60daysrlchallenge.slack.com/). To get an invitation, email me at andrea.lonza@gmail.com. Also, email me if you have any ideas, suggestions or improvements.
35 |
36 | To learn Deep Learning, Computer Vision or Natural Language Processing check my **[1-Year-ML-Journey](https://github.com/andri27-ts/1-Year-ML-Journey)**
37 |
38 |
39 | ### Before starting.. Prerequisites
40 | * Basic level of Python and PyTorch
41 | * [Machine Learning](https://github.com/andri27-ts/1-Year-ML-Journey)
42 | * [Basic knowledge in Deep Learning (MLP, CNN and RNN)](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2N3AIlp?tag=andreaaffilia-20)
43 |
44 |
45 |
46 |
47 |
48 | ## Quick Note: my NEW BOOK is out!
49 | To learn Reinforcement Learning and Deep RL more in depth, check out my book [**Reinforcement Learning Algorithms with Python**](https://www.amazon.com/Reinforcement-Learning-Algorithms-Python-understand/dp/1789131111)!!
50 |
51 |
52 |
53 |
54 |
55 | **Table of Contents**
56 | 1. The Landscape of Reinforcement Learning
57 | 2. Implementing RL Cycle and OpenAI Gym
58 | 3. Solving Problems with Dynamic Programming
59 | 4. Q learning and SARSA Applications
60 | 5. Deep Q-Network
61 | 6. Learning Stochastic and PG Optimization
62 | 7. TRPO and PPO implementation
63 | 8. DDPG and TD3 Applications
64 | 9. Model-Based RL
65 | 10. Imitation Learning with the DAgger Algorithm
66 | 11. Understanding Black-Box Optimization Algorithms
67 | 12. Developing the ESBAS Algorithm
68 | 13. Practical Implementation for Resolving RL Challenges
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 | ## Index - Reinforcement Learning
77 |
78 | - [Week 1 - **Introduction**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-1---introduction)
79 | - [Week 2 - **RL Basics**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-2---rl-basics-mdp-dynamic-programming-and-model-free-control)
80 | - [Week 3 - **Value based algorithms - DQN**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-3---value-based-algorithms---dqn)
81 | - [Week 4 - **Policy gradient algorithms - REINFORCE & A2C**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-4---policy-gradient-algorithms---reinforce--a2c)
82 | - [Week 5 - **Advanced Policy Gradients - PPO**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-5---advanced-policy-gradients---ppo)
83 | - [Week 6 - **Evolution Strategies and Genetic Algorithms - ES**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-6---evolution-strategies-and-genetic-algorithms---es)
84 | - [Week 7 - **Model-Based reinforcement learning - MB-MF**](https://github.com/andri27-ts/60_Days_RL_Challenge#week-7---model-based-reinforcement-learning---mb-mf)
85 | - [Week 8 - **Advanced Concepts and Project Of Your Choice**](https://github.com/andri27-ts/60_Days_RL_Challenge/blob/master/README.md#week-8---advanced-concepts-and-project-of-your-choice)
86 | - [Last 4 days - **Review + Sharing**](https://github.com/andri27-ts/60_Days_RL_Challenge/blob/master/README.md#last-4-days---review--sharing)
87 | - [Best resources](https://github.com/andri27-ts/60_Days_RL_Challenge#best-resources)
88 | - [Additional resources](https://github.com/andri27-ts/60_Days_RL_Challenge#additional-resources)
89 |
90 |
91 | ## Week 1 - Introduction
92 |
93 | - **[Why is Reinforcement Learning such an important learning method - A simple explanation](https://medium.com/@andrea.lonzats/the-learning-machines-fb922e539335)**
94 | - **[Introduction and course overview](https://www.youtube.com/watch?v=Q4kF8sfggoI&index=1&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3) - CS294 by Levine, Berkeley**
95 | - **[Deep Reinforcement Learning: Pong from Pixels](http://karpathy.github.io/2016/05/31/rl/) by Karpathy**
96 |
97 | ##
98 |
99 | #### Other Resources
100 |
101 | - :books: [The "Bible" of Reinforcement Learning: Chapter 1](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2HRSSmh?tag=andreaaffilia-20) - Sutton & Barto
102 | - Great introductory paper: [Deep Reinforcement Learning: An Overview](https://www.groundai.com/project/deep-reinforcement-learning-an-overview/)
103 | - Start coding: [From Scratch: AI Balancing Act in 50 Lines of Python](https://towardsdatascience.com/from-scratch-ai-balancing-act-in-50-lines-of-python-7ea67ef717)
104 |
105 |
106 |
107 | ## Week 2 - RL Basics: *MDP, Dynamic Programming and Model-Free Control*
108 |
109 | > Those who cannot remember the past are condemned to repeat it - **George Santayana**
110 |
111 |
112 | This week, we will learn about the basic blocks of reinforcement learning, starting from the definition of the problem all the way through the estimation and optimization of the functions that are used to express the quality of a policy or state.
113 |
114 | ##
115 |
116 | ### Lectures - Theory
117 |
118 |
119 | * **[Markov Decision Process](https://www.youtube.com/watch?v=lfHX2hHRMVQ&list=PLzuuYNsE1EZAXYR4FJ75jcJseBmo4KQ9-&index=2) - David Silver (DeepMind)**
120 | * Markov Processes
121 | * Markov Decision Processes
122 |
123 | - **[Planning by Dynamic Programming](https://www.youtube.com/watch?v=Nd1-UUMVfz4&list=PLzuuYNsE1EZAXYR4FJ75jcJseBmo4KQ9-&index=3) - David Silver (DeepMind)**
124 | * Policy iteration
125 | * Value iteration
126 |
127 | * **[Model-Free Prediction](https://www.youtube.com/watch?v=PnHCvfgC_ZA&index=4&list=PLzuuYNsE1EZAXYR4FJ75jcJseBmo4KQ9-) - David Silver (DeepMind)**
128 | * Monte Carlo Learning
129 | * Temporal Difference Learning
130 | * TD(λ)
131 |
132 | - **[Model-Free Control](https://www.youtube.com/watch?v=0g4j2k_Ggc4&list=PLzuuYNsE1EZAXYR4FJ75jcJseBmo4KQ9-&index=5) - David Silver (DeepMind)**
133 | * Ɛ-greedy policy iteration
134 | * GLIE Monte Carlo Search
135 | * SARSA
136 | * Importance Sampling
137 |
138 | ##
139 |
140 | ### Project of the Week - [**Q-learning**](Week2/frozenlake_Qlearning.ipynb)
141 |
142 | [**Q-learning applied to FrozenLake**](Week2/frozenlake_Qlearning.ipynb) - As an exercise, you can solve the game using SARSA or implement Q-learning by yourself. In the former case, only a few changes are needed.
143 |
144 | ##
145 |
146 | #### Other Resources
147 | - :books: [The "Bible" of Reinforcement Learning: Chapters 3 and 4](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2HRSSmh?tag=andreaaffilia-20) - Sutton & Barto
148 | - :tv: [Value functions introduction](https://www.youtube.com/watch?v=k1vNh4rNYec&index=6&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3) - DRL UC Berkeley by Sergey Levine
149 |
150 |
151 |
152 | ## Week 3 - Value based algorithms - DQN
153 |
154 | This week we'll learn more advanced concepts and apply deep neural networks to Q-learning algorithms.
155 |
156 | ##
157 |
158 | ### Lectures - Theory
159 |
160 | - **[Value functions approximation](https://www.youtube.com/watch?v=UoPei5o4fps&list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ&index=6) - David Silver (DeepMind)**
161 | - Differentiable function approximators
162 | - Incremental methods
163 | - Batch methods (DQN)
164 |
165 | * **[Advanced Q-learning algorithms](https://www.youtube.com/watch?v=nZXC5OdDfs4&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&index=7) - Sergey Levine (UC Berkeley)**
166 | - Replay Buffer
167 | - Double Q-learning
168 | - Continuous actions (NAF,DDPG)
169 | - Practical tips
170 |
171 | ##
172 |
173 | ### Project of the Week - [**DQN and variants**](Week3)
174 |
175 |
176 |
177 |
178 | [**DQN and some variants applied to Pong**](Week3) - This week the goal is to develop a DQN algorithm to play an Atari game. To make it more interesting I developed four extensions of DQN: **Double Q-learning**, **Multi-step learning**, **Dueling networks** and **Noisy Nets**. Play with them, and if you feel confident, you can implement Prioritized replay, Dueling networks or Distributional RL. To know more about these improvements read the papers!
179 |
180 |
181 |
182 | ##
183 |
184 |
185 | #### Papers
186 |
187 | ##### Must Read
188 | - [Playing Atari with Deep Reinforcement Learning](https://arxiv.org/pdf/1312.5602.pdf) - 2013
189 | - [Human-level control through deep reinforcement learning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) - 2015
190 | - [Rainbow: Combining Improvements in Deep Reinforcement Learning](https://arxiv.org/pdf/1710.02298.pdf) - 2017
191 |
192 | ##### Extensions of DQN
193 | - [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/pdf/1509.06461.pdf) - 2015
194 | - [Prioritized Experience Replay](https://arxiv.org/pdf/1511.05952.pdf) - 2015
195 | - [Dueling Network Architectures for Deep Reinforcement Learning](http://proceedings.mlr.press/v48/wangf16.pdf) - 2016
196 | - [Noisy networks for exploration](https://arxiv.org/pdf/1706.10295.pdf) - 2017
197 | - [Distributional Reinforcement Learning with Quantile Regression](https://arxiv.org/pdf/1710.10044.pdf) - 2017
198 |
199 | #### Other Resources
200 | - :books: [The "Bible" of Reinforcement Learning: Chapters 5 and 6](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2HRSSmh?tag=andreaaffilia-20) - Sutton & Barto
201 | - :tv: [Deep Reinforcement Learning in the Enterprise: Bridging the Gap from Games to Industry](https://www.youtube.com/watch?v=GOsUHlr4DKE)
202 |
203 |
204 |
205 | ## Week 4 - Policy gradient algorithms - REINFORCE & A2C
206 |
207 | Week 4 introduces Policy Gradient methods, a class of algorithms that directly optimize the policy. Also, you'll learn about Actor-Critic algorithms. These algorithms combine both policy gradient (the actor) and value function (the critic).
208 |
209 | ##
210 |
211 | ### Lectures - Theory
212 |
213 | * **[Policy gradient Methods](https://www.youtube.com/watch?v=KHZVXao4qXs&list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ&index=7) - David Silver (DeepMind)**
214 | - Finite Difference Policy Gradient
215 | - Monte-Carlo Policy Gradient
216 | - Actor-Critic Policy Gradient
217 |
218 | - **[Policy gradient intro](https://www.youtube.com/watch?v=XGmd3wcyDg8&t=0s&list=PLkFD6_40KJIxJMR-j5A1mkxK26gh_qg37&index=3) - Sergey Levine (RECAP, optional)**
219 | - Policy Gradient (REINFORCE and Vanilla PG)
220 | - Variance reduction
221 |
222 | * **[Actor-Critic](https://www.youtube.com/watch?v=Tol_jw5hWnI&list=PLkFD6_40KJIxJMR-j5A1mkxK26gh_qg37&index=4) - Sergey Levine (More in depth)**
223 | - Actor-Critic
224 | - Discount factor
225 | - Actor-Critic algorithm design (batch mode or online)
226 | - state-dependent baseline
227 |
228 | ##
229 |
230 | ### Project of the Week - [**Vanilla PG and A2C**](Week4)
231 |
232 | [**Vanilla PG and A2C applied to CartPole**](Week4) - The exercise of this week is to implement a policy gradient method or a more sophisticated actor-critic. In the repository you can find an implemented version of [PG and A2C](Week4). Bug alert! Be aware that my A2C implementation gives strange results.
233 | If you find the implementation of PG and A2C easy, you can try with the [asynchronous version of A2C (A3C)](https://arxiv.org/pdf/1602.01783.pdf).
234 |
235 | ##
236 |
237 | #### Papers
238 |
239 | - [Policy Gradient methods for reinforcement learning with function approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf)
240 | - [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf)
241 |
242 | #### Other Resources
243 | - :books: [The "Bible" of Reinforcement Learning: Chapters 9 and 10](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2HRSSmh?tag=andreaaffilia-20) - Sutton & Barto
244 | - :books: [Intuitive RL: Intro to Advantage-Actor-Critic (A2C)](https://hackernoon.com/intuitive-rl-intro-to-advantage-actor-critic-a2c-4ff545978752)
245 | - :books: [Asynchronous Actor-Critic Agents (A3C)](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2)
246 |
247 |
248 |
249 | ## Week 5 - Advanced Policy Gradients - PPO
250 |
251 | This week is about advanced policy gradient methods that improve the stability and the convergence of the "Vanilla" policy gradient methods. You'll learn and implement PPO, an RL algorithm developed by OpenAI and adopted in [OpenAI Five](https://blog.openai.com/openai-five/).
252 |
253 | ##
254 |
255 | ### Lectures - Theory
256 |
257 | - **[Advanced policy gradients](https://www.youtube.com/watch?v=ycCtmp4hcUs&t=0s&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&index=15) - Sergey Levine (UC Berkeley)**
258 | - Problems with "Vanilla" Policy Gradient Methods
259 | - Policy Performance Bounds
260 | - Monotonic Improvement Theory
261 | - Algorithms: NPO, TRPO, PPO
262 |
263 | * **[Natural Policy Gradients, TRPO, PPO](https://www.youtube.com/watch?v=xvRrgxcpaHY) - John Schulman (Berkeley DRL Bootcamp)** - (RECAP, optional)
264 | * Limitations of "Vanilla" Policy Gradient Methods
265 | * Natural Policy Gradient
266 | * Trust Region Policy Optimization, TRPO
267 | * Proximal Policy Optimization, PPO
268 |
269 | ##
270 |
271 | ### Project of the Week - [**PPO**](Week5)
272 |
273 |
274 |
275 | [**PPO applied to BipedalWalker**](Week5) - This week, you have to implement PPO or TRPO. I suggest PPO given its simplicity (compared to TRPO). In the project folder Week5 you'll find an implementation of [**PPO that learns to play BipedalWalker**](Week5).
276 | Furthermore, in the folder you can find other resources that will help you in the development of the project. Have fun!
277 |
278 |
279 |
280 | To learn more about PPO read the [paper](https://arxiv.org/pdf/1707.06347.pdf) and take a look at the [Arxiv Insights video](https://www.youtube.com/watch?v=5P7I-xPq8u8)
281 |
282 | ##
283 |
284 | #### Papers
285 |
286 | - [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf) - 2015
287 | - [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf) - 2017
288 |
289 | #### Other Resources
290 | - :books: To better understand PPO and TRPO: [The Pursuit of (Robotic) Happiness](https://towardsdatascience.com/the-pursuit-of-robotic-happiness-how-trpo-and-ppo-stabilize-policy-gradient-methods-545784094e3b)
291 | - :tv: [Nuts and Bolts of Deep RL](https://www.youtube.com/watch?v=8EcdaCk9KaQ&)
292 | - :books: PPO best practice: [Training with Proximal Policy Optimization](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-PPO.md)
293 | - :tv: [Explanation of the PPO algorithm by Arxiv Insights](https://www.youtube.com/watch?v=5P7I-xPq8u8)
294 |
295 |
296 |
297 | ## Week 6 - Evolution Strategies and Genetic Algorithms - ES
298 |
299 | In the last year, Evolution Strategies (ES) and Genetic Algorithms (GA) have been shown to achieve comparable results to RL methods. They are derivative-free black-box algorithms that require more data than RL to learn but are able to scale up across thousands of CPUs. This week we'll look at these black-box algorithms.
300 |
301 | ##
302 |
303 | ### Lectures & Articles - Theory
304 |
305 | - **Evolution Strategies**
306 | - [Intro to ES: A Visual Guide to Evolution Strategies](http://blog.otoro.net/2017/10/29/visual-evolution-strategies/)
307 | - [ES for RL: Evolving Stable Strategies](http://blog.otoro.net/2017/11/12/evolving-stable-strategies/)
308 | - [Derivative-free Methods - Lecture](https://www.youtube.com/watch?v=SQtOI9jsrJ0&feature=youtu.be)
309 | - [Evolution Strategies (paper discussion)](https://blog.openai.com/evolution-strategies/)
310 | - **Genetic Algorithms**
311 | - [Introduction to Genetic Algorithms — Including Example Code](https://towardsdatascience.com/introduction-to-genetic-algorithms-including-example-code-e396e98d8bf3)
312 |
313 | ##
314 |
315 | ### Project of the Week - [**ES**](Week6)
316 |
317 |
318 |
319 | [**Evolution Strategies applied to LunarLander**](Week6) - This week the project is to implement an ES or GA.
320 | In the [**Week6 folder**](Week6) you can find a basic implementation of the paper [Evolution Strategies as a
321 | Scalable Alternative to Reinforcement Learning](https://arxiv.org/pdf/1703.03864.pdf) to solve LunarLanderContinuous. You can modify it to play more difficult environments or add your ideas.
322 |
323 |
324 |
325 | ##
326 |
327 | #### Papers
328 |
329 | - [Deep Neuroevolution: Genetic Algorithms are a Competitive Alternative for Training Deep Neural Networks for Reinforcement Learning](https://arxiv.org/pdf/1712.06567.pdf)
330 | - [Evolution Strategies as a Scalable Alternative to Reinforcement Learning](https://arxiv.org/pdf/1703.03864.pdf)
331 |
332 | #### Other Resources
333 | - :books: [Evolutionary Optimization Algorithms](https://assoc-redirect.amazon.com/g/r/https://amzn.to/34EphXc?tag=andreaaffilia-20) - Dan Simon
334 |
335 |
336 |
337 | ## Week 7 - Model-Based reinforcement learning - MB-MF
338 |
339 | The algorithms studied up to now are model-free, meaning that they only choose the best action given a state. These algorithms achieve very good performance but require a lot of training data. Model-based algorithms, instead, learn a model of the environment and plan the next actions according to the learned model. These methods are more sample-efficient than model-free ones but overall achieve worse performance. This week you'll learn the theory behind these methods and implement one of the latest algorithms.
340 |
341 | ##
342 |
343 | ### Lectures - Theory
344 |
345 | - **Model-Based RL, David Silver (DeepMind) (concise version)**
346 | - [Integrating Learning and Planning](https://www.youtube.com/watch?v=ItMutbeOHtc&index=8&list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ)
347 | - Model-Based RL Overview
348 | - Integrated architectures
349 | - Simulation-Based search
350 | - **Model-Based RL, Sergey Levine (UC Berkeley) (in depth version)**
351 | - [Learning dynamical systems from data](https://www.youtube.com/watch?v=yap_g0d7iBQ&index=9&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3)
352 | - Overview of model-based RL
353 | - Global and local models
354 | - Learning with local models and trust regions
355 | - [Learning policies by imitating optimal controllers](https://www.youtube.com/watch?v=AwdauFLan7M&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&index=10)
356 | - Backpropagation into a policy with learned models
357 | - Guided policy search algorithm
358 | - Imitating optimal control with DAgger
359 | - [Advanced model learning and images](https://www.youtube.com/watch?v=vRkIwM4GktE&index=11&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3)
360 | - Models in latent space
361 | - Models directly in image space
362 | - Inverse models
363 |
364 |
365 | ##
366 |
367 | ### Project of the Week - [**MB-MF**](Week7)
368 |
369 |
370 |
371 | [**MB-MF applied to RoboschoolAnt**](Week7) - This week I chose to implement the model-based algorithm described in this [paper](https://arxiv.org/pdf/1708.02596.pdf).
372 | You can find my implementation [here](Week7).
373 | NB: Instead of implementing it on Mujoco as in the paper, I used [RoboSchool](https://github.com/openai/roboschool), an open-source simulator for robots, integrated with OpenAI Gym.
374 |
375 |
376 |
377 | ##
378 |
379 | #### Papers
380 |
381 | - [Imagination-Augmented Agents for Deep Reinforcement Learning - 2017](https://arxiv.org/pdf/1707.06203.pdf)
382 | - [Reinforcement learning with unsupervised auxiliary tasks - 2016](https://arxiv.org/pdf/1611.05397.pdf)
383 | - [Neural Network Dynamics for Model-Based Deep Reinforcement Learning with Model-Free Fine-Tuning - 2018](https://arxiv.org/pdf/1708.02596.pdf)
384 |
385 | #### Other Resources
386 | - :books: [The "Bible" of Reinforcement Learning: Chapter 8](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2HRSSmh?tag=andreaaffilia-20) - Sutton & Barto
387 | - :books: [World Models - Can agents learn inside of their own dreams?](https://worldmodels.github.io/)
388 |
389 |
390 |
391 | ## Week 8 - Advanced Concepts and Project Of Your Choice
392 |
393 | This last week is about advanced RL concepts and a project of your choice.
394 |
395 | ##
396 |
397 | ### Lectures - Theory
398 |
399 | - Sergey Levine (Berkeley)
400 | - [Connection between inference and control](https://www.youtube.com/watch?v=iOYiPhu5GEk&index=13&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&t=0s)
401 | - [Inverse reinforcement learning](https://www.youtube.com/watch?v=-3BcZwgmZLk&index=14&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&t=0s)
402 | - [Exploration (part 1)](https://www.youtube.com/watch?v=npi6B4VQ-7s&index=16&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&t=0s)
403 | - [Exploration (part 2) and transfer learning](https://www.youtube.com/watch?v=0WbVUvKJpg4&index=17&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&t=0s)
404 | - [Multi-task learning and transfer](https://www.youtube.com/watch?v=UqSx23W9RYE&index=18&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&t=0s)
405 | - [Meta-learning and parallelism](https://www.youtube.com/watch?v=Xe9bktyYB34&index=18&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3)
406 | - [Advanced imitation learning and open problems](https://www.youtube.com/watch?v=mc-DtbhhiKA&index=20&list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3&t=0s)
407 | - David Silver (DeepMind)
408 | - [Classic Games](https://www.youtube.com/watch?v=N1LKLc6ufGY&feature=youtu.be)
409 |
410 |
411 | ##
412 |
413 | ### The final project
414 | Here you can find some project ideas.
415 | - [Pommerman](https://www.pommerman.com/) (Multiplayer)
416 | - [AI for Prosthetics Challenge](https://www.crowdai.org/challenges/nips-2018-ai-for-prosthetics-challenge) (Challenge)
417 | - [World Models](https://worldmodels.github.io/) (Paper implementation)
418 | - [Request for research OpenAI](https://blog.openai.com/requests-for-research-2/) (Research)
419 | - [Retro Contest](https://blog.openai.com/retro-contest/) (Transfer learning)
420 |
421 | ##
422 |
423 | #### Other Resources
424 | * AlphaGo Zero
425 | - [Paper](https://www.nature.com/articles/nature24270.epdf?author_access_token=VJXbVjaSHxFoctQQ4p2k4tRgN0jAjWel9jnR3ZoTv0PVW4gB86EEpGqTRDtpIz-2rmo8-KG06gqVobU5NSCFeHILHcVFUeMsbvwS-lxjqQGg98faovwjxeTUgZAUMnRQ)
426 | - DeepMind blog post: [AlphaGo Zero: Learning from scratch](https://deepmind.com/blog/alphago-zero-learning-scratch/)
427 | - Arxiv Insights video: [How AlphaGo Zero works - Google DeepMind](https://www.youtube.com/watch?v=MgowR4pq3e8)
428 | * OpenAI Five
429 | - OpenAI blog post: [OpenAI Five](https://blog.openai.com/openai-five/)
430 | - Arxiv Insights video: [OpenAI Five: Facing Human Pro's in Dota II](https://www.youtube.com/watch?v=0eO2TSVVP1Y)
431 |
432 |
433 |
434 | ## Last 4 days - Review + Sharing
435 |
436 | Congratulations on completing the 60 Days RL Challenge!! Let me know if you enjoyed it and share it!
437 |
438 | See you!
439 |
440 | ## Best resources
441 |
442 | :books: [Reinforcement Learning: An Introduction](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2HRSSmh?tag=andreaaffilia-20) - by Sutton & Barto. The "Bible" of reinforcement learning. [Here](https://drive.google.com/file/d/1opPSz5AZ_kVa1uWOdOiveNiBFiEOHjkG/view) you can find the PDF draft of the second edition.
443 |
444 | :books: [Deep Reinforcement Learning Hands-On](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2PRxKD7?tag=andreaaffilia-20) - by Maxim Lapan
445 |
446 | :books: [Deep Learning](https://assoc-redirect.amazon.com/g/r/https://amzn.to/2N3AIlp?tag=andreaaffilia-20) - Ian Goodfellow
447 |
448 | :tv: [Deep Reinforcement Learning](https://www.youtube.com/playlist?list=PLkFD6_40KJIznC9CDbVTjAF2oyt8_VAe3) - UC Berkeley class by Levine, check [here](http://rail.eecs.berkeley.edu/deeprlcourse/) their site.
449 |
450 | :tv: [Reinforcement Learning course](https://www.youtube.com/watch?v=2pWv7GOvuf0&list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ) - by David Silver, DeepMind. Great introductory lectures by Silver, a lead researcher on AlphaGo. They follow the book Reinforcement Learning by Sutton & Barto.
451 |
452 |
453 |
454 | ## Additional resources
455 |
456 | :books: [Awesome Reinforcement Learning](https://github.com/aikorea/awesome-rl). A curated list of resources dedicated to reinforcement learning
457 |
458 | :books: [GroundAI on RL](https://www.groundai.com/?text=reinforcement+learning). Papers on reinforcement learning
459 |
460 |
461 | ## A cup of Coffee :coffee:
462 |
463 | Any contribution is highly appreciated! Cheers!
464 |
465 | [](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=NKSNP93CNY4KN)
466 |
--------------------------------------------------------------------------------
/Week2/frozenlake_Qlearning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Q-learning applied to FrozenLake "
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "#### **Remember**: Q-learning is a model free, off-policy algorithm that can be used to find an optimal action using a Q function. Q can be represented as a table that contains a value for each pair state-action\n",
15 | " \n",
16 | "To review Q-learning watch [Q learning explained by Siraj](https://www.youtube.com/watch?v=aCEvtRtNO-M)"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "#### Q-learning pipeline is quite easy and can be summarised in 5 blocks:\n",
24 | "\n",
25 | ""
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "## WHAT'S THE ENVIRONMENT?\n",
33 | "\n",
34 | "#### We'll apply Q-learning on a [Gym](http://gym.openai.com/) game called [FrozenLake](https://gym.openai.com/envs/FrozenLake-v0/)\n",
35 | "\n",
36 | ""
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "## LET'S START TO CODE"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 1,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "import gym\n",
53 | "import random\n",
54 | "from collections import namedtuple\n",
55 | "import collections\n",
56 | "import numpy as np\n",
57 | "import matplotlib.pyplot as plt\n",
58 | "%matplotlib inline "
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "### BASIC FUNCTION TO CHOOSE AN ACTION FOLLOWING DIFFERENT POLICIES"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 2,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "def select_eps_greedy_action(table, obs, n_actions):\n",
75 | " '''\n",
76 | " Select the action using a ε-greedy policy (add a randomness ε for the choice of the action)\n",
77 | " '''\n",
78 | " value, action = best_action_value(table, obs)\n",
79 | "\n",
80 | " if random.random() < epsilon:\n",
81 | " return random.randint(0,n_actions-1)\n",
82 | " else:\n",
83 | " return action\n",
84 | "\n",
85 | "def select_greedy_action(table, obs, n_actions):\n",
86 | " '''\n",
87 | " Select the action using a greedy policy (take the best action according to the policy)\n",
88 | " '''\n",
89 | " value, action = best_action_value(table, obs)\n",
90 | " return action\n",
91 | "\n",
92 | "\n",
93 | "def best_action_value(table, state):\n",
94 | " '''\n",
95 | " Exploring the table, take the best action that maximize Q(s,a)\n",
96 | " '''\n",
97 | " best_action = 0\n",
98 | " max_value = 0\n",
99 | " for action in range(n_actions):\n",
100 | " if table[(state, action)] > max_value:\n",
101 | " best_action = action\n",
102 | " max_value = table[(state, action)]\n",
103 | "\n",
104 | " return max_value, best_action"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | ""
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 3,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "def Q_learning(table, obs0, obs1, reward, action):\n",
121 | " '''\n",
122 | " Q-learning. Update Q(obs0,action) according to Q(obs1,*) and the reward just obtained\n",
123 | " '''\n",
124 | " \n",
125 | " # Take the best value reachable from the state obs1\n",
126 | " best_value, _ = best_action_value(table, obs1)\n",
127 | "\n",
128 | " # Calculate Q-target value \n",
129 | " Q_target = reward + GAMMA * best_value\n",
130 | "\n",
131 | " # Calculate the Q-error between the target and the previous value\n",
132 | " Q_error = Q_target - table[(obs0, action)]\n",
133 | "\n",
134 | " # Update Q(obs0,action)\n",
135 | " table[(obs0, action)] += LEARNING_RATE * Q_error"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "### TEST THE POLICY"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 4,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "def test_game(env, table):\n",
152 | " '''\n",
153 | " Test the new table playing TEST_EPISODES games\n",
154 | " '''\n",
155 | " \n",
156 | " n_actions = env.action_space.n\n",
157 | " \n",
158 | " reward_games = []\n",
159 | " for _ in range(TEST_EPISODES):\n",
160 | " obs = env.reset()\n",
161 | " rewards = 0\n",
162 | " while True:\n",
163 | " # Act greedly \n",
164 | " next_obs, reward, done, _ = env.step(select_greedy_action(table, obs, n_actions))\n",
165 | " obs = next_obs\n",
166 | " rewards += reward\n",
167 | "\n",
168 | " if done:\n",
169 | " reward_games.append(rewards)\n",
170 | " break\n",
171 | "\n",
172 | " return np.mean(reward_games)"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "### MAIN PROCEDURE"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 5,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "# Some hyperparameters..\n",
189 | "GAMMA = 0.95\n",
190 | "\n",
191 | "# NB: the decay rate allow to regulate the Exploration - Exploitation trade-off\n",
192 | "# start with a EPSILON of 1 and decay until reach 0\n",
193 | "EPS_DECAY_RATE = 0.9993\n",
194 | "\n",
195 | "LEARNING_RATE = 0.8\n",
196 | "\n",
197 | "# .. and constants\n",
198 | "TEST_EPISODES = 100\n",
199 | "MAX_GAMES = 15000"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 8,
205 | "metadata": {},
206 | "outputs": [
207 | {
208 | "name": "stdout",
209 | "output_type": "stream",
210 | "text": [
211 | "\tEp: 999 Test reward: 0.3 0.5\n",
212 | "\tEp: 1999 Test reward: 0.56 0.25\n",
213 | "\tEp: 2999 Test reward: 0.71 0.12\n",
214 | "\tEp: 3999 Test reward: 0.7 0.06\n",
215 | "\tEp: 4999 Test reward: 0.19 0.03\n",
216 | "\tEp: 5999 Test reward: 0.0 0.01\n",
217 | "\tEp: 6999 Test reward: 0.78 0.01\n",
218 | "\tEp: 7999 Test reward: 0.74 0.0\n",
219 | "\tEp: 8999 Test reward: 0.8 0.0\n",
220 | "\tEp: 9999 Test reward: 0.77 0.0\n",
221 | "\tEp: 10999 Test reward: 0.77 0.0\n",
222 | "\tEp: 11999 Test reward: 0.74 0.0\n",
223 | "\tEp: 12999 Test reward: 0.7 0.0\n",
224 | "\tEp: 13999 Test reward: 0.75 0.0\n",
225 | "\tEp: 14999 Test reward: 0.75 0.0\n"
226 | ]
227 | },
228 | {
229 | "data": {
230 | "image/png": "\n",
231 | "text/plain": [
232 | ""
233 | ]
234 | },
235 | "metadata": {
236 | "needs_background": "light"
237 | },
238 | "output_type": "display_data"
239 | }
240 | ],
241 | "source": [
242 | "# Create the environment\n",
243 | "#env = gym.make('Taxi-v2')\n",
244 | "env = gym.make(\"FrozenLake-v0\")\n",
245 | "obs = env.reset()\n",
246 | "\n",
247 | "obs_length = env.observation_space.n\n",
248 | "n_actions = env.action_space.n\n",
249 | "\n",
250 | "reward_count = 0\n",
251 | "games_count = 0\n",
252 | "\n",
253 | "# Create and initialize the table with 0.0\n",
254 | "table = collections.defaultdict(float)\n",
255 | " \n",
256 | "test_rewards_list = []\n",
257 | "\n",
258 | "# Reinitialize epsilon after each session\n",
259 | "epsilon = 1.0\n",
260 | "\n",
261 | "while games_count < MAX_GAMES:\n",
262 | "\n",
263 | " # Select the action following an ε-greedy policy\n",
264 | " action = select_eps_greedy_action(table, obs, n_actions)\n",
265 | " next_obs, reward, done, _ = env.step(action)\n",
266 | "\n",
267 | " # Update the Q-table\n",
268 | " Q_learning(table, obs, next_obs, reward, action)\n",
269 | "\n",
270 | " reward_count += reward\n",
271 | " obs = next_obs\n",
272 | "\n",
273 | " if done:\n",
274 | " epsilon *= EPS_DECAY_RATE\n",
275 | "\n",
276 | " # Test the new table every 1k games\n",
277 | " if (games_count + 1) % 1000 == 0:\n",
278 | " test_reward = test_game(env, table)\n",
279 | " print('\\tEp:', games_count, 'Test reward:', test_reward, np.round(epsilon,2))\n",
280 | "\n",
281 | " test_rewards_list.append(test_reward)\n",
282 | "\n",
283 | " obs = env.reset()\n",
284 | " reward_count = 0\n",
285 | " games_count += 1 \n",
286 | "\n",
287 | "# Plot the accuracy over the number of steps\n",
288 | "plt.figure(figsize=(18,9))\n",
289 | "plt.xlabel('Steps')\n",
290 | "plt.ylabel('Accurracy')\n",
291 | "plt.plot(test_rewards_list)\n",
292 | "plt.show()"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "#### NB: in case you want to apply Q-learning to continuous state and actions games, you have to quantize the state and action spaces"
300 | ]
301 | }
302 | ],
303 | "metadata": {
304 | "kernelspec": {
305 | "display_name": "Python 3",
306 | "language": "python",
307 | "name": "python3"
308 | },
309 | "language_info": {
310 | "codemirror_mode": {
311 | "name": "ipython",
312 | "version": 3
313 | },
314 | "file_extension": ".py",
315 | "mimetype": "text/x-python",
316 | "name": "python",
317 | "nbconvert_exporter": "python",
318 | "pygments_lexer": "ipython3",
319 | "version": "3.7.6"
320 | }
321 | },
322 | "nbformat": 4,
323 | "nbformat_minor": 2
324 | }
325 |
--------------------------------------------------------------------------------
/Week2/img/Q_function.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week2/img/Q_function.png
--------------------------------------------------------------------------------
/Week2/img/frozenlake_v0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week2/img/frozenlake_v0.png
--------------------------------------------------------------------------------
/Week2/img/short_diag.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week2/img/short_diag.jpg
--------------------------------------------------------------------------------
/Week3/README.md:
--------------------------------------------------------------------------------
1 | # DQN, Double Q-learning, Dueling Networks, Multi-step learning and Noisy Nets applied to Pong
2 |
3 | This week we will apply Deep Q-Networks (DQN) to [Pong](https://gym.openai.com/envs/Pong-v0/).
4 |
5 | 
6 |
7 | For the DQN implementation and the choice of the hyperparameters, I mostly followed [Mnih et al.](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf). (On the last page there is a table with all the hyperparameters.)
8 |
9 | To make things more interesting, I improved the basic DQN, implementing some variations like **Double Q-learning**, **Dueling networks**, **Multi-step learning** and **Noisy Nets**. You can find them summarized by [Hessel et al.](https://arxiv.org/pdf/1710.02298.pdf)
10 |
11 | ### [Learn the theory](../README.md)
12 |
13 | ---
14 |
15 | ### Double Q-learning - [Paper](https://arxiv.org/pdf/1509.06461.pdf)
16 |
17 | Minimize the overestimation bias introduced by the conventional Q-learning.
18 |
19 |
20 |
21 | To use it, in *main.py*, set
22 | ```python
23 | DQN_HYPERPARAMS = {
24 | 'double_DQN': True,
25 | ...
26 | }
27 | ```
28 |
29 | ---
30 |
31 | ### Dueling networks - [Paper](http://proceedings.mlr.press/v48/wangf16.pdf)
32 |
33 | It uses two different neural networks, one outputs the value of the state and the other the advantage of each action.
34 | The two NNs share the convolutional encoder.
35 |
36 |
37 |
38 | To use it, in *main.py*, set
39 | ```python
40 | DQN_HYPERPARAMS = {
41 | 'dueling': True,
42 | ...
43 | }
44 | ```
45 |
46 | ---
47 |
48 | ### NoisyNet - [Paper](https://arxiv.org/pdf/1706.10295.pdf)
49 |
50 | An idea to overcome the ε-greedy limitations is to introduce noisy linear layers. The network will manage the noise stream to balance the exploration.
51 |
52 |
53 |
54 | To use it, in *main.py*, set
55 | ```python
56 | DQN_HYPERPARAMS = {
57 | 'noisy_net': True,
58 | ...
59 | }
60 | ```
61 |
62 | ---
63 |
64 | ### Multi-step
65 |
66 | Introduce a forward-view multi-step. Similar to TD(λ)
67 |
68 |
69 |
70 |
71 | To use it, in *main.py*, set
72 | ```python
73 | DQN_HYPERPARAMS = {
74 | 'n_multi_step': 2, # or 3
75 | ...
76 | }
77 | ```
78 |
79 | NB: From now on, because we will train deep neural networks, I suggest running the code on a GPU. If you don't have one, you can use [Google Colab](https://colab.research.google.com/).
80 | Also, to track the networks' results, we'll use [TensorboardX](https://github.com/lanpa/tensorboardX) (tensorboard for PyTorch). In case you use Google Colab to run TensorBoard on your pc, execute the commands in the section below.
81 |
82 | NB: If you use GPUs remember to change DEVICE from 'cpu' to 'cuda' in *main.py*.
83 |
84 |
85 | ## To make the code clearer, it's structured in 7 files:
86 | - **main.py** contains the main body. It creates the agent, the environment and plays N games. For each step, it updates the agent
87 | - **agent.py** has the Agent class that controls the central control, the replay buffer and basic functions
88 | - **central_control.py** contains CentralControl class. It is responsible for instantiating the DQN (or its variants), optimizing it, calculating the loss, etc.
89 | - **buffers.py** contains the ReplayBuffer class to keep the agent's memories inside a deque list and sample from it.
90 | - **neural_net.py** contains the deep neural nets for the agent namely DQN, DuelingDQN and a NoisyLinear Layer for the Noisy DQN.
91 | - **atari_wrappers.py** include some Atari wrappers. https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py
92 | - **utils.py**, for now, contains only a testing function.
93 |
94 |
95 | ## Results
96 |
97 | In the image below are shown the rewards mean of the last 10 games and the last 40 games for three different DQN variations.
98 | The x-axis is the number of games. You can see that only 120 games are enough to learn the game pretty well.
99 |
100 | 
101 |
102 | -  `Basic DQN`
103 | -  `2-step DQN`
104 | -  `2-step Dueling DQN`
105 |
106 | It may seem strange that 2-step Dueling DQN performs worse than 2-step DQN but it's important to keep in mind that the NNs are stochastic and that I tested only on one game. The authors of the DuelingDQN paper reported better results when applied to other games.
107 |
108 |
109 | ## Install
110 |
111 | ```
112 | !pip install gym
113 | !pip install torch torchvision
114 | !pip install tensorboardX
115 | !apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb ffmpeg xorg-dev python-opengl libboost-all-dev libsdl2-dev swig
116 | ```
117 |
118 | Install gym
119 | ```
120 | !git clone https://github.com/openai/gym.git
121 | import os
122 | os.chdir('gym')
123 | !ls
124 | !pip install -e .
125 | os.chdir('..')
126 | ```
127 |
128 | Install the Atari environments for gym
129 | ```
130 | !pip install gym[atari]
131 | ```
132 |
133 |
134 | ## To run TensorBoard in Google Colab
135 |
136 | Instructions from https://www.dlology.com/blog/quick-guide-to-run-tensorboard-in-google-colab/
137 |
138 | Download and install ngrok
139 | ```
140 | !wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
141 | !unzip ngrok-stable-linux-amd64.zip
142 | ```
143 |
144 | run ngrok and tensorboard
145 | ```
146 | LOG_DIR = 'content/runs'
147 |
148 | get_ipython().system_raw(
149 | 'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'.format(LOG_DIR)
150 | )
151 |
152 | get_ipython().system_raw('./ngrok http 6006 &')
153 |
154 | !curl -s http://localhost:4040/api/tunnels | python3 -c \
155 | "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"
156 | ```
157 |
--------------------------------------------------------------------------------
/Week3/agent.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from collections import namedtuple
4 | import collections
5 | import torch
6 | import torch.nn as nn
7 | import torch.optim as optim
8 |
9 | import time
10 |
11 | from neural_net import DQN
12 | from central_control import CentralControl
13 | from buffers import ReplayBuffer
14 |
15 |
class DQNAgent():
    '''
    Agent class. It controls all the agent functionalities: action selection,
    storing transitions in the replay buffer, triggering optimization of the
    networks (through the CentralControl) and bookkeeping of the statistics.
    '''
    # Scalar defaults. They are immutable, so sharing them at class level is
    # harmless; every instance rebinds its own value on first assignment.
    total_reward = 0
    birth_time = 0
    n_iter = 0
    n_games = 0
    ts_frame = 0
    ts = time.time()

    # NB: the `verbose` keyword was removed from namedtuple in Python 3.7 and
    # passing it raises a TypeError, so it is no longer given.
    Memory = namedtuple('Memory', ['obs', 'action', 'new_obs', 'reward', 'done'], rename=False)

    def __init__(self, env, device, hyperparameters, summary_writer=None):
        '''
        Agent initialization. It creates the CentralControl that controls the
        neural networks, and the replay buffer.

        env: environment; its observation_space.shape and action_space.n size the networks
        device: device passed to the CentralControl
        hyperparameters: dict with the keys read below ('gamma', 'n_multi_step', 'double_DQN',
            'noisy_net', 'dueling', 'learning_rate', 'n_iter_update_target', 'buffer_start_size',
            'epsilon_start', 'epsilon_decay', 'epsilon_final', 'buffer_capacity')
        summary_writer: optional writer exposing add_scalar(tag, value, step)
        '''

        # Per-instance list of episode returns. It used to be a class attribute,
        # i.e. a single list shared (and mutated) by every agent instance.
        self.rewards = []

        # The CentralControl is the 'brain' of the agent
        self.cc = CentralControl(env.observation_space.shape, env.action_space.n, hyperparameters['gamma'], hyperparameters['n_multi_step'], hyperparameters['double_DQN'],
                hyperparameters['noisy_net'], hyperparameters['dueling'], device)

        self.cc.set_optimizer(hyperparameters['learning_rate'])

        self.birth_time = time.time()
        # Start the fps clock when the agent is created rather than at import time
        self.ts = self.birth_time
        self.ts_frame = 0

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

        self.accumulated_loss = []
        self.device = device

        # initialize the replay buffer (i.e. the memory) of the agent
        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'], hyperparameters['n_multi_step'], hyperparameters['gamma'])
        self.summary_writer = summary_writer

        self.noisy_net = hyperparameters['noisy_net']

        self.env = env

    @staticmethod
    def _mean_or_nan(values):
        '''
        Mean of `values`, or NaN when it is empty (what np.mean would return,
        but without emitting a RuntimeWarning).
        '''
        return np.mean(values) if len(values) > 0 else float('nan')

    def act(self, obs):
        '''
        Greedy action outputted by the NN in the CentralControl
        '''
        return self.cc.get_max_action(obs)

    def act_eps_greedy(self, obs):
        '''
        E-greedy action: random with probability epsilon, greedy otherwise
        '''

        # In case of a noisy net, it takes a greedy action
        if self.noisy_net:
            return self.act(obs)

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.act(obs)

    def add_env_feedback(self, obs, action, new_obs, reward, done):
        '''
        Acquire a new feedback from the environment. The feedback is constituted by the new observation, the reward and the done boolean.
        '''

        # Create the new memory and update the buffer
        new_memory = self.Memory(obs=obs, action=action, new_obs=new_obs, reward=reward, done=done)
        self.replay_buffer.append(new_memory)

        # update the variables
        self.n_iter += 1
        # decrease epsilon linearly with the number of iterations, down to epsilon_final
        self.epsilon = max(self.epsilon_final, self.epsilon_start - self.n_iter/self.epsilon_decay)
        self.total_reward += reward

    def sample_and_optimize(self, batch_size):
        '''
        Sample batch_size memories from the buffer and optimize on them.
        Nothing is optimized until the buffer holds more than buffer_start_size memories.
        '''

        if len(self.replay_buffer) > self.buffer_start_size:
            # sample
            mini_batch = self.replay_buffer.sample(batch_size)
            # optimize
            l_loss = self.cc.optimize(mini_batch)
            self.accumulated_loss.append(l_loss)

        # update target NN
        if self.n_iter % self.iter_update_target == 0:
            self.cc.update_target()

    def reset_stats(self):
        '''
        Reset the agent's statistics at the end of a game
        '''
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.accumulated_loss = []
        self.n_games += 1


    def print_info(self):
        '''
        Print information about the agent and, if a summary writer was given, log it
        '''
        fps = (self.n_iter-self.ts_frame)/(time.time()-self.ts)
        mean_reward_40 = self._mean_or_nan(self.rewards[-40:])
        mean_loss = self._mean_or_nan(self.accumulated_loss)
        print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' % (self.n_iter, self.n_games, self.total_reward, mean_reward_40, self.epsilon, fps, mean_loss))

        self.ts_frame = self.n_iter
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward, self.n_games)
            self.summary_writer.add_scalar('mean_reward', mean_reward_40, self.n_games)
            self.summary_writer.add_scalar('10_mean_reward', self._mean_or_nan(self.rewards[-10:]), self.n_games)
            # NB: the tag spelling is kept as-is so existing TensorBoard runs stay comparable
            self.summary_writer.add_scalar('esilon', self.epsilon, self.n_games)
            self.summary_writer.add_scalar('loss', mean_loss, self.n_games)
139 |
--------------------------------------------------------------------------------
/Week3/atari_wrappers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | from collections import deque
4 | import gym
5 | from gym import spaces
6 | import cv2
7 |
8 | '''
9 | Atari Wrapper copied from https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py
10 | '''
11 |
12 |
class LazyFrames(object):
    def __init__(self, frames):
        """Hold a list of frames and concatenate them only on first use.

        Consecutive observations share most of their frames, so keeping the
        list instead of a concatenated array saves memory in large replay
        buffers. Convert to a numpy array only right before feeding the model."""
        self._frames = frames
        self._out = None

    def _force(self):
        # Concatenate along axis 2 once, cache the result and drop the list
        if self._out is not None:
            return self._out
        self._out = np.concatenate(self._frames, axis=2)
        self._frames = None
        return self._out

    def __array__(self, dtype=None):
        merged = self._force()
        return merged if dtype is None else merged.astype(dtype)

    def __len__(self):
        merged = self._force()
        return len(merged)

    def __getitem__(self, i):
        merged = self._force()
        return merged[i]
40 |
class FireResetEnv(gym.Wrapper):
    def __init__(self, env):
        """Wrapper that presses FIRE after every reset, for games that stay frozen until then."""
        gym.Wrapper.__init__(self, env)
        # Action 1 must be FIRE and at least three actions must exist
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def reset(self, **kwargs):
        self.env.reset(**kwargs)
        # Play action 1 then action 2; reset again whenever one of them ends the episode
        for start_action in (1, 2):
            obs, _, done, _ = self.env.step(start_action)
            if done:
                self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        # Plain pass-through to the wrapped environment
        return self.env.step(ac)
60 |
61 |
class MaxAndSkipEnv(gym.Wrapper):
    def __init__(self, env, skip=4):
        """Repeat each action `skip` times and return only the resulting observation."""
        gym.Wrapper.__init__(self, env)
        # Holds the two most recent raw observations (max-pooled in step)
        buffer_shape = (2,) + env.observation_space.shape
        self._obs_buffer = np.zeros(buffer_shape, dtype=np.uint8)
        self._skip = skip

    def step(self, action):
        """Apply `action` up to `skip` times, summing rewards and max-pooling the last two frames."""
        total_reward = 0.0
        done = None
        second_to_last = self._skip - 2
        last = self._skip - 1
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            if i == second_to_last:
                self._obs_buffer[0] = obs
            elif i == last:
                self._obs_buffer[1] = obs
            total_reward += reward
            if done:
                break
        # The observation returned on the done=True frame is not meaningful
        return self._obs_buffer.max(axis=0), total_reward, done, info

    def reset(self, **kwargs):
        return self.env.reset(**kwargs)
89 |
90 |
91 |
class WarpFrame(gym.ObservationWrapper):
    def __init__(self, env):
        """Convert frames to 84x84 grayscale, as done in the Nature paper and later work."""
        gym.ObservationWrapper.__init__(self, env)
        self.width = 84
        self.height = 84
        warped_shape = (self.height, self.width, 1)
        self.observation_space = spaces.Box(low=0, high=255, shape=warped_shape, dtype=np.uint8)

    def observation(self, frame):
        # RGB -> gray, resize, then add a trailing channel axis
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        resized = cv2.resize(gray, (self.width, self.height), interpolation=cv2.INTER_AREA)
        return resized[:, :, None]
105 |
106 |
107 |
class FrameStack(gym.Wrapper):
    def __init__(self, env, k):
        """Stack the k last frames.

        Observations are returned as LazyFrames, which concatenates the
        stacked frames only when they are actually accessed.

        env: environment to wrap; its observation space shape must have 3 entries
        k: number of frames to stack
        """
        gym.Wrapper.__init__(self, env)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        # The advertised shape multiplies the third axis by k
        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype)

    def reset(self, **kwargs):
        """Reset the wrapped env and fill the stack with k copies of the first frame.

        Keyword arguments are forwarded to the wrapped environment, like the
        other wrappers in this file do.
        """
        ob = self.env.reset(**kwargs)
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        """Step the wrapped env, push the new frame and return the stacked observation."""
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        # The deque must be full: reset() has to be called before step()
        assert len(self.frames) == self.k
        return LazyFrames(list(self.frames))
136 |
137 |
class ImageToPyTorch(gym.ObservationWrapper):
    def __init__(self, env):
        super(ImageToPyTorch, self).__init__(env)
        # Advertise the observation shape with the last axis moved to the front
        src_shape = self.observation_space.shape
        dst_shape = (src_shape[-1], src_shape[0], src_shape[1])
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=dst_shape, dtype=np.float32)

    def observation(self, observation):
        # Move axis 2 to position 0
        return np.moveaxis(observation, source=2, destination=0)
146 |
147 |
148 |
class ScaledFloatFrame(gym.ObservationWrapper):
    def __init__(self, env):
        gym.ObservationWrapper.__init__(self, env)
        same_shape = env.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=same_shape, dtype=np.float32)

    def observation(self, observation):
        # Warning: materializing the array as float32 undoes the LazyFrames
        # memory optimization, so use this only with smaller replay buffers.
        as_float = np.array(observation).astype(np.float32)
        return as_float / 255.0
158 |
159 |
def make_env(env_name, fire=True):
    '''
    Build the gym environment `env_name` and apply the wrappers in this order:
    MaxAndSkipEnv, FireResetEnv (only when `fire` is true), WarpFrame,
    ImageToPyTorch, FrameStack (4 frames) and ScaledFloatFrame.
    '''
    wrapped = MaxAndSkipEnv(gym.make(env_name))  # keep only every `skip`-th frame
    if fire:
        wrapped = FireResetEnv(wrapped)          # fire at the beginning
    wrapped = WarpFrame(wrapped)                 # reshape image
    wrapped = ImageToPyTorch(wrapped)            # invert shape
    wrapped = FrameStack(wrapped, 4)             # stack last 4 frames
    return ScaledFloatFrame(wrapped)             # scale frames
--------------------------------------------------------------------------------
/Week3/buffers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import collections
3 |
4 |
5 |
class ReplayBuffer():
    '''
    Bounded memory of the agent's transitions, backed by a deque.
    Sampling returns n-step transitions.
    '''
    def __init__(self, size, n_multi_step, gamma):
        self.buffer = collections.deque(maxlen=size)
        self.n_multi_step = n_multi_step
        self.gamma = gamma

    def __len__(self):
        return len(self.buffer)

    def append(self, memory):
        '''
        Store a new 'memory' in the buffer
        '''
        self.buffer.append(memory)

    def _look_ahead(self, start):
        '''
        Walk up to n_multi_step memories forward from index `start`.
        Returns (discounted reward sum, last new_obs reached, done flag).
        '''
        first = self.buffer[start]
        discounted_sum = 0
        last_obs = first.new_obs
        reached_done = first.done

        for offset in range(self.n_multi_step):
            position = start + offset
            # Past the end of the buffer: nothing more to accumulate
            if position >= len(self.buffer):
                break
            memory = self.buffer[position]
            discounted_sum += (self.gamma**offset) * memory.reward
            last_obs = memory.new_obs
            if memory.done:
                # Episode ended: stop looking ahead
                reached_done = True
                break
            reached_done = False

        return discounted_sum, last_obs, reached_done

    def sample(self, batch_size):
        '''
        Draw batch_size distinct memories at random and return five numpy arrays:
        states, actions, next states, rewards and dones (n-step version).
        '''
        picked = np.random.choice(len(self.buffer), batch_size, replace=False)

        states, actions, next_states, rewards, dones = [], [], [], [], []

        for idx in picked:
            n_step_reward, n_step_obs, n_step_done = self._look_ahead(idx)
            memory = self.buffer[idx]

            states.append(memory.obs)
            actions.append(memory.action)
            next_states.append(n_step_obs)
            rewards.append(n_step_reward)
            dones.append(n_step_done)

        states_arr = np.array(states, dtype=np.float32)
        actions_arr = np.array(actions, dtype=np.int64)
        next_states_arr = np.array(next_states, dtype=np.float32)
        rewards_arr = np.array(rewards, dtype=np.float32)
        dones_arr = np.array(dones, dtype=np.uint8)
        return (states_arr, actions_arr, next_states_arr, rewards_arr, dones_arr)
65 |
--------------------------------------------------------------------------------
/Week3/central_control.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from collections import namedtuple
3 | import collections
4 | import torch
5 | import torch.nn as nn
6 | import torch.optim as optim
7 |
8 | import time
9 |
10 | from neural_net import DQN, DuelingDQN
11 |
12 |
class CentralControl():
    '''
    'Brain' of the agent. It owns the online ('moving') network and the target
    network, the optimizer, and computes the DQN / Double DQN loss.
    '''

    def __init__(self, observation_space_shape, action_space_shape, gamma, n_multi_step, double_DQN, noisy_net, dueling, device):
        '''
        observation_space_shape, action_space_shape: forwarded to the network constructors
        gamma: discount factor
        n_multi_step: number of steps of the multi-step return (exponent applied to gamma)
        double_DQN: if true, use the Double DQN target
        noisy_net: forwarded to DQN (not used by DuelingDQN)
        dueling: if true, build DuelingDQN networks instead of DQN
        device: device the networks and the batches are moved to
        '''
        if dueling:
            # Dueling NN
            self.target_nn = DuelingDQN(observation_space_shape, action_space_shape).to(device)
            self.moving_nn = DuelingDQN(observation_space_shape, action_space_shape).to(device)
        else:
            # Normal NN
            self.target_nn = DQN(observation_space_shape, action_space_shape, noisy_net).to(device)
            self.moving_nn = DQN(observation_space_shape, action_space_shape, noisy_net).to(device)

        self.device = device
        self.gamma = gamma
        self.n_multi_step = n_multi_step
        self.double_DQN = double_DQN

    def set_optimizer(self, learning_rate):
        '''
        Create the Adam optimizer over the moving NN parameters
        '''
        self.optimizer = optim.Adam(self.moving_nn.parameters(), lr=learning_rate)

    def optimize(self, mini_batch):
        '''
        Optimize the NN on one mini batch and return the loss as a float
        '''
        # reset the grads
        self.optimizer.zero_grad()
        # calculate the loss of the mini batch
        loss = self._calulate_loss(mini_batch)
        loss_v = loss.item()

        # do backpropagation
        loss.backward()
        # one step of optimization
        self.optimizer.step()

        return loss_v

    def update_target(self):
        '''
        Copy the weights of the moving NN into the target NN.
        The two networks must stay distinct objects: rebinding target_nn to
        moving_nn would make the target follow every optimization step.
        '''
        self.target_nn.load_state_dict(self.moving_nn.state_dict())

    def get_max_action(self, obs):
        '''
        Forward pass of the NN to obtain the greedy action for the given observation
        '''
        # convert the observation in a tensor with a leading batch dimension
        state_t = torch.tensor(np.array([obs])).to(self.device)
        # forward pass
        q_values_t = self.moving_nn(state_t)
        # get the index of the maximum value of the output (i.e. the best action to take)
        _, act_t = torch.max(q_values_t, dim=1)
        return int(act_t.item())


    def _calulate_loss(self, mini_batch):
        '''
        Calculate the mini batch's MSE loss.
        It also supports the double DQN version.

        mini_batch: tuple (states, actions, next_states, rewards, dones)
        '''

        states, actions, next_states, rewards, dones = mini_batch

        # convert the data in tensors
        states_t = torch.as_tensor(states, device=self.device)
        next_states_t = torch.as_tensor(next_states, device=self.device)
        actions_t = torch.as_tensor(actions, device=self.device)
        rewards_t = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        # 1.0 where the transition ended the episode, 0.0 otherwise
        done_t = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # Value of the action taken previously (recorded in actions_t) in the states_t
        state_action_values = self.moving_nn(states_t).gather(1, actions_t[:,None]).squeeze(-1)
        # NB gather is a differentiable function

        # Next state value with Double DQN. (i.e. get the value predicted by the target nn, of the best action predicted by the moving nn)
        if self.double_DQN:
            double_max_action = self.moving_nn(next_states_t).max(1)[1]
            double_max_action = double_max_action.detach()
            target_output = self.target_nn(next_states_t)
            next_state_values = torch.gather(target_output, 1, double_max_action[:,None]).squeeze(-1) # NB: [:,None] add an extra dimension

        # Next state value in the normal configuration
        else:
            next_state_values = self.target_nn(next_states_t).max(1)[0]

        next_state_values = next_state_values.detach() # No backprop

        # A terminal transition has no future: do not bootstrap from its next state
        next_state_values = next_state_values * (1.0 - done_t)

        # Use the Bellman equation
        expected_state_action_values = rewards_t + (self.gamma**self.n_multi_step) * next_state_values
        # compute the loss
        return nn.MSELoss()(state_action_values, expected_state_action_values)
106 |
--------------------------------------------------------------------------------
/Week3/imgs/DQN_variations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week3/imgs/DQN_variations.png
--------------------------------------------------------------------------------
/Week3/imgs/Dueling_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week3/imgs/Dueling_img.png
--------------------------------------------------------------------------------
/Week3/imgs/double_Qlearning_formula.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week3/imgs/double_Qlearning_formula.png
--------------------------------------------------------------------------------
/Week3/imgs/multistep_formula.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week3/imgs/multistep_formula.png
--------------------------------------------------------------------------------
/Week3/imgs/noisenet_formula.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week3/imgs/noisenet_formula.png
--------------------------------------------------------------------------------
/Week3/imgs/pong_gif.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week3/imgs/pong_gif.gif
--------------------------------------------------------------------------------
/Week3/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from collections import namedtuple
4 | import collections
5 | import time
6 | import math
7 |
8 | import torch
9 | import torch.nn as nn
10 | import torch.optim as optim
11 | from torch.nn import Parameter, init
12 | from torch.nn import functional as F
13 |
14 | from tensorboardX import SummaryWriter
15 |
16 | import atari_wrappers
17 | from agent import DQNAgent
18 | import utils
19 |
# Hyperparameters handed to DQNAgent when the agent is created below.
# NOTE(review): the meaning of each key is defined by DQNAgent, which is
# not part of this file -- confirm there before changing a value.
DQN_HYPERPARAMS = {
    'dueling': False,
    'noisy_net': False,
    'double_DQN': False,
    'n_multi_step': 2,
    'buffer_start_size': 10001,
    'buffer_capacity': 15000,
    'epsilon_start': 1.0,
    'epsilon_decay': 10**5,
    'epsilon_final': 0.02,
    'learning_rate': 5e-5,
    'gamma': 0.99,
    'n_iter_update_target': 1000
}


# Value passed to agent.sample_and_optimize() after every environment step
BATCH_SIZE = 32
# The main loop stops once this many games have been completed
MAX_N_GAMES = 3000
# Only referenced by the commented-out test code in the main loop
TEST_FREQUENCY = 10

# Environment id given to atari_wrappers.make_env()
ENV_NAME = "PongNoFrameskip-v4"
# When True the environment is wrapped in gym.wrappers.Monitor
SAVE_VIDEO = True
# Device passed to the agent
DEVICE = 'cpu' # or 'cuda'
# When True a tensorboardX SummaryWriter is created, otherwise the writer is None
SUMMARY_WRITER = True

# Base directory of the SummaryWriter logs
LOG_DIR = 'content/runs'
# Run name built from the hyperparameters...
name = '_'.join([str(k)+'.'+str(v) for k,v in DQN_HYPERPARAMS.items()])
# ...which is immediately overwritten: the log directory actually uses 'prv'
name = 'prv'
48 |
if __name__ == '__main__':
    # Training script: play MAX_N_GAMES games, feeding every transition to
    # the agent and letting it optimize after each environment step.

    # create the environment
    env = atari_wrappers.make_env(ENV_NAME)
    if SAVE_VIDEO:
        # save the video of the games
        env = gym.wrappers.Monitor(env, "main-"+ENV_NAME, force=True)
    obs = env.reset()

    # TensorBoard (writer is None when SUMMARY_WRITER is disabled)
    writer = SummaryWriter(log_dir=LOG_DIR+'/'+name + str(time.time())) if SUMMARY_WRITER else None

    print('Hyperparams:', DQN_HYPERPARAMS)

    # create the agent
    agent = DQNAgent(env, device=DEVICE, summary_writer=writer, hyperparameters=DQN_HYPERPARAMS)

    n_games = 0
    n_iter = 0

    # Play MAX_N_GAMES games
    while n_games < MAX_N_GAMES:
        # act epsilon-greedily
        action = agent.act_eps_greedy(obs)

        # one step on the environment
        new_obs, reward, done, _ = env.step(action)

        # add the environment feedback to the agent
        agent.add_env_feedback(obs, action, new_obs, reward, done)

        # sample and optimize NB: the agent could wait to have enough memories
        agent.sample_and_optimize(BATCH_SIZE)

        obs = new_obs
        if done:
            n_games += 1

            # print info about the agent and reset the stats
            agent.print_info()
            agent.reset_stats()

            #if n_games % TEST_FREQUENCY == 0:
            #    print('Test mean:', utils.test_game(env, agent, 1))

            obs = env.reset()

    # the writer only exists when SUMMARY_WRITER is enabled
    if writer is not None:
        writer.close()

# tensorboard --logdir content/runs --host localhost
99 |
--------------------------------------------------------------------------------
/Week3/neural_net.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | import torch.optim as optim
5 | from torch.nn import Parameter, init
6 | from torch.nn import functional as F
7 | import math
8 |
class NoisyLinear(nn.Linear):
    '''
    Noisy Linear layer -> NOISY NETWORKS FOR EXPLORATION https://arxiv.org/pdf/1706.10295.pdf

    A linear layer whose effective parameters are
        weight + sigma_weight * epsilon_weight
        bias   + sigma_bias   * epsilon_bias      (only when bias=True)
    where the sigma_* tensors are learnable and the epsilon_* buffers hold
    standard-normal noise that is re-sampled on every forward pass.

    in_features, out_features, bias: as in nn.Linear.
    sigma_init: constant used to initialize the sigma_* parameters.

    NB: the original author flagged this layer as not working (problems
    with the epsilon parameters initialization); treat it as experimental.
    '''

    def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
        super(NoisyLinear, self).__init__(in_features, out_features, bias=bias)
        self.sigma_init = sigma_init

        self.sigma_weight = Parameter(torch.empty(out_features, in_features))
        self.register_buffer('epsilon_weight', torch.zeros(out_features, in_features))
        if bias:
            self.sigma_bias = Parameter(torch.empty(out_features))
            self.register_buffer('epsilon_bias', torch.zeros(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        '''
        Initialize the biases and weights
        '''
        # nn.Linear.__init__ already calls this method, before the sigma
        # parameters exist: hence the hasattr guards. Each sigma tensor is
        # initialized independently so that sigma_weight is also set when
        # the layer has no bias.
        if hasattr(self, 'sigma_weight'):
            init.constant_(self.sigma_weight, self.sigma_init)
        if hasattr(self, 'sigma_bias'):
            init.constant_(self.sigma_bias, self.sigma_init)

        std = math.sqrt(3/self.in_features)
        init.uniform_(self.weight, -std, std)
        # bias is None when the layer was built with bias=False
        if self.bias is not None:
            init.uniform_(self.bias, -std, std)

    def forward(self, input):
        # The noise is drawn into NEW tensors (the buffers are rebound, not
        # modified in place). An in-place resample would overwrite the noise
        # saved by a previous forward pass whose backward has not run yet,
        # giving wrong gradients for the sigma parameters.
        if self.bias is not None:
            self.epsilon_bias = torch.randn_like(self.epsilon_bias)
            # new bias with noise
            bias = self.bias + self.sigma_bias*self.epsilon_bias
        else:
            bias = None

        self.epsilon_weight = torch.randn_like(self.epsilon_weight)
        # new weight with noise
        weight = self.weight + self.sigma_weight*self.epsilon_weight
        # apply the linear layer with the added noise
        return F.linear(input, weight, bias)
56 |
57 |
class DuelingDQN(nn.Module):
    '''
    Dueling DQN -> http://proceedings.mlr.press/v48/wangf16.pdf

    A shared convolutional trunk feeds two fully connected heads: one
    predicts the advantage of every action, the other the state value.
    '''

    def __init__(self, input_shape, n_actions):
        super().__init__()

        in_channels = input_shape[0]
        # convolutional trunk: three conv layers, each followed by BatchNorm and ReLU
        trunk = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*trunk)

        n_features = self._get_conv_out(input_shape)

        # advantage head: one output per action
        self.fc_a = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions))

        # value head: a single output for the state
        self.fc_v = nn.Sequential(
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, 1))

    def _get_conv_out(self, shape):
        # run a dummy batch of one through the trunk and count the output elements
        dummy_out = self.conv(torch.zeros(1, *shape))
        return int(np.prod(dummy_out.size()))

    def forward(self, x):
        # convolutional features, flattened per sample
        n_samples = x.size()[0]
        features = self.conv(x).view(n_samples, -1)

        advantage = self.fc_a(features)
        value = self.fc_v(features)

        # state value plus the advantage of each action, minus the mean
        # advantage (as in the paper)
        return value + advantage - torch.mean(advantage, dim=1, keepdim=True)
103 |
104 |
class DQN(nn.Module):
    '''
    Deep Q network following the architecture used in the DeepMind paper (https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf)

    input_shape: shape of one observation, channels first.
    n_actions: number of outputs (one value per action).
    noisy_net: when true the fully connected layers are NoisyLinear layers.
    '''

    def __init__(self, input_shape, n_actions, noisy_net):
        super().__init__()

        in_channels = input_shape[0]
        # 3 convolutional layers taking an image as input
        # (NB: the BatchNorm layers aren't in the paper; the original author
        # added them to speed up convergence)
        trunk = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*trunk)

        # number of features coming out of the conv layers
        n_features = self._get_conv_out(input_shape)

        # 2 fully connected layers; NoisyNet swaps in the noisy linear layer
        layer_cls = NoisyLinear if noisy_net else nn.Linear
        self.fc = nn.Sequential(
            layer_cls(n_features, 512),
            nn.ReLU(),
            layer_cls(512, n_actions))

    def _get_conv_out(self, shape):
        # run a dummy batch of one through the conv layers and count the output elements
        dummy_out = self.conv(torch.zeros(1, *shape))
        return int(np.prod(dummy_out.size()))

    def forward(self, x):
        # convolutional features flattened per sample, then the fc layers
        n_samples = x.size()[0]
        features = self.conv(x).view(n_samples, -1)
        return self.fc(features)
150 |
--------------------------------------------------------------------------------
/Week3/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 |
4 | def test_game(env, agent, test_episodes):
5 | reward_games = []
6 | for _ in range(test_episodes):
7 | obs = env.reset()
8 | rewards = 0
9 | while True:
10 | action = agent.act(obs)
11 | next_obs, reward, done, _ = env.step(action)
12 | obs = next_obs
13 | rewards += reward
14 |
15 | if done:
16 | reward_games.append(rewards)
17 | obs = env.reset()
18 | break
19 |
20 | return np.mean(reward_games)
21 |
--------------------------------------------------------------------------------
/Week4/A2C.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Advantage Actor-Critic (A2C) on CartPole"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Actor-critic is an algorithm that combines both policy gradient (the actor) and value function (the critic)."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "\n",
22 | "Credit: Sergey Levine"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "A2C is a more sophisticated version of the actor-critic algorithm: it uses the advantage and n-step returns, and runs the policy in multiple (synchronous) environments. \n",
30 | "[A3C](https://arxiv.org/pdf/1602.01783.pdf) is the asynchronous counterpart of A2C, in which the environments are run in parallel. "
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "The actor and the critic can either share the same neural network or use two separate networks. In this example, I used a shared network.\n",
38 | "
\n",
39 | "Credit: Sergey Levine"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "import numpy as np\n",
49 | "import gym\n",
50 | "from tensorboardX import SummaryWriter\n",
51 | "\n",
52 | "import datetime\n",
53 | "from collections import namedtuple\n",
54 | "from collections import deque\n",
55 | "\n",
56 | "import torch\n",
57 | "import torch.nn as nn\n",
58 | "import torch.nn.functional as F\n",
59 | "import torch.optim as optim\n",
60 | "from torch.nn.utils.clip_grad import clip_grad_norm_"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "class A2C_nn(nn.Module):\n",
70 | " '''\n",
71 | " Advantage actor-critic neural net\n",
72 | " '''\n",
73 | "\n",
74 | " def __init__(self, input_shape, n_actions):\n",
75 | " super(A2C_nn, self).__init__()\n",
76 | "\n",
77 | " self.lp = nn.Sequential(\n",
78 | " nn.Linear(input_shape[0], 64),\n",
79 | " nn.ReLU())\n",
80 | "\n",
81 | " self.policy = nn.Linear(64, n_actions)\n",
82 | " self.value = nn.Linear(64, 1)\n",
83 | "\n",
84 | " def forward(self, x):\n",
85 | " l = self.lp(x.float())\n",
86 | " # return the actor and the critic\n",
87 | " return self.policy(l), self.value(l)"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "The total loss contains:\n",
95 | "- critic (value) loss $\\partial\\theta_v\\leftarrow\\partial\\theta_v + \\dfrac{\\partial(R-V_\\theta(s))^2}{\\partial\\theta_v}$\n",
96 | "- actor (policy) loss $\\partial\\theta_\\pi\\leftarrow\\partial\\theta_\\pi + \\alpha\\triangledown_\\theta \\log\\pi_\\theta(a|s)(R-V_\\theta(s))$\n",
97 | "- entropy loss $\\beta\\sum_i\\pi_\\theta(s)\\log\\pi_\\theta(s)$"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "def calculate_loss(memories, nn, writer):\n",
107 | " '''\n",
108 | " Calculate the loss of the memories\n",
109 | " '''\n",
110 | "\n",
111 | " #batch_mem = np.random.choice(len(memories), size=32)\n",
112 | "\n",
113 | " rewards = torch.tensor(np.array([m.reward for m in memories], dtype=np.float32))\n",
114 | " log_val = nn(torch.tensor(np.array([m.obs for m in memories], dtype=np.float32)))\n",
115 | "\n",
116 | " act_log_softmax = F.log_softmax(log_val[0], dim=1)[:,np.array([m.action for m in memories])]\n",
117 | " # Calculate the advantage\n",
118 | " adv = (rewards - log_val[1].detach())\n",
119 | "\n",
120 | " # actor loss (policy gradient)\n",
121 | " pg_loss = - torch.mean(act_log_softmax * adv)\n",
122 | " # critic loss (value loss)\n",
123 | " vl_loss = F.mse_loss(log_val[1].squeeze(-1), rewards)\n",
124 | " # entropy loss\n",
125 | " entropy_loss = ENTROPY_BETA * torch.mean(torch.sum(F.softmax(log_val[0], dim=1) * F.log_softmax(log_val[0], dim=1), dim=1))\n",
126 | "\n",
127 | " # total loss\n",
128 | " loss = pg_loss + vl_loss - entropy_loss\n",
129 | "\n",
130 | " # add scalar to the writer\n",
131 | " writer.add_scalar('loss', float(loss), n_iter)\n",
132 | " writer.add_scalar('pg_loss', float(pg_loss), n_iter)\n",
133 | " writer.add_scalar('vl_loss', float(vl_loss), n_iter)\n",
134 | " writer.add_scalar('entropy_loss', float(entropy_loss), n_iter)\n",
135 | " writer.add_scalar('actions', np.mean([m.action for m in memories]), n_iter)\n",
136 | " writer.add_scalar('adv', float(torch.mean(adv)), n_iter)\n",
137 | " writer.add_scalar('act_lgsoft', float(torch.mean(act_log_softmax)), n_iter)\n",
138 | "\n",
139 | " return loss"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "class Env:\n",
149 | " '''\n",
150 | " Environment class. Used to deal with multiple environments\n",
151 | " '''\n",
152 | "\n",
153 | " game_rew = 0\n",
154 | " last_game_rew = 0\n",
155 | "\n",
156 | " def __init__(self, env_name, n_steps, gamma):\n",
157 | " super(Env, self).__init__()\n",
158 | "\n",
159 | " # create the new environment\n",
160 | " self.env = gym.make(env_name)\n",
161 | " self.obs = self.env.reset()\n",
162 | "\n",
163 | " self.n_steps = n_steps\n",
164 | " self.action_n = self.env.action_space.n\n",
165 | " self.observation_n = self.env.observation_space.shape[0]\n",
166 | " self.gamma = gamma\n",
167 | "\n",
168 | " def step(self, agent):\n",
169 | " '''\n",
170 | " Execute the agent n_steps in the environment\n",
171 | " '''\n",
172 | " memories = []\n",
173 | " for s in range(self.n_steps):\n",
174 | "\n",
175 | " # get the agent policy\n",
176 | " pol_val = agent(torch.tensor(self.obs))\n",
177 | " s_act = F.softmax(pol_val[0])\n",
178 | "\n",
179 | " # get an action following the policy distribution\n",
180 | " action = int(np.random.choice(np.arange(self.action_n), p=s_act.detach().numpy(), size=1))\n",
181 | "\n",
182 | " # Perform a step in the environment\n",
183 | " new_obs, reward, done, _ = self.env.step(action)\n",
184 | "\n",
185 | " # update the memory\n",
186 | " memories.append(Memory(obs=self.obs, action=action, new_obs=new_obs, reward=reward, done=done))\n",
187 | "\n",
188 | " self.game_rew += reward\n",
189 | " self.obs = new_obs\n",
190 | "\n",
191 | " if done:\n",
192 | " # if done reset the env and the variables\n",
193 | " self.done = True\n",
194 | " # if the game is over, run_add take the 0 value\n",
195 | " self.run_add = 0\n",
196 | " self.obs = self.env.reset()\n",
197 | "\n",
198 | " self.last_game_rew = self.game_rew\n",
199 | " self.game_rew = 0\n",
200 | " break\n",
201 | " else:\n",
202 | " self.done = False\n",
203 | "\n",
204 | " if not self.done:\n",
205 | " # if the game isn't over, run_add take the value of the last state\n",
206 | " self.run_add = float(agent(torch.tensor(self.obs))[1])\n",
207 | "\n",
208 | " # compute the discount reward of the memories and return it\n",
209 | " return self.discounted_rewards(memories)\n",
210 | "\n",
211 | "\n",
212 | " def discounted_rewards(self, memories):\n",
213 | " '''\n",
214 | " Compute the discounted reward backward\n",
215 | " '''\n",
216 | " upd_memories = []\n",
217 | "\n",
218 | " for t in reversed(range(len(memories))):\n",
219 | " if memories[t].done: self.run_add = 0\n",
220 | " self.run_add = self.run_add * self.gamma + memories[t].reward\n",
221 | "\n",
222 | " # Update the memories with the discounted reward\n",
223 | " upd_memories.append(Memory(obs=memories[t].obs, action=memories[t].action, new_obs=memories[t].new_obs, reward=self.run_add, done=memories[t].done))\n",
224 | "\n",
225 | " return upd_memories[::-1]\n"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": null,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "Memory = namedtuple('Memory', ['obs', 'action', 'new_obs', 'reward', 'done'], verbose=False, rename=False)\n",
235 | "\n",
236 | "# Hyperparameters\n",
237 | "GAMMA = 0.95\n",
238 | "LEARNING_RATE = 0.003\n",
239 | "ENTROPY_BETA = 0.01\n",
240 | "ENV_NAME = 'CartPole-v0'\n",
241 | "\n",
242 | "MAX_ITER = 100000\n",
243 | "# Number of the env\n",
244 | "N_ENVS = 40\n",
245 | "\n",
246 | "# Max normalized gradient\n",
247 | "CLIP_GRAD = 0.1\n",
248 | "\n",
249 | "device = 'cpu'\n",
250 | "\n",
251 | "now = datetime.datetime.now()\n",
252 | "date_time = \"{}_{}.{}.{}\".format(now.day, now.hour, now.minute, now.second)"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "# create N_ENVS environments\n",
262 | "envs = [Env(ENV_NAME, 1, GAMMA) for _ in range(N_ENVS)]\n",
263 | "\n",
264 | "writer = SummaryWriter(log_dir='content/runs/A2C'+ENV_NAME+'_'+date_time)\n",
265 | "\n",
266 | "# initialize the actor-critic NN\n",
267 | "agent_nn = A2C_nn(gym.make(ENV_NAME).observation_space.shape, gym.make(ENV_NAME).action_space.n).to(device)\n",
268 | "\n",
269 | "# Adam optimizer\n",
270 | "optimizer = optim.Adam(agent_nn.parameters(), lr=LEARNING_RATE, eps=1e-3)\n",
271 | "\n",
272 | "experience = []\n",
273 | "n_iter = 0\n",
274 | "\n",
275 | "while n_iter < MAX_ITER:\n",
276 | " n_iter += 1\n",
277 | "\n",
278 | " # list containing all the memories\n",
279 | " memories = [mem for env in envs for mem in env.step(agent_nn)]\n",
280 | "\n",
281 | " # calculate the loss\n",
282 | " losses = calculate_loss(memories, agent_nn, writer)\n",
283 | "\n",
284 | " # optimizer step\n",
285 | " optimizer.zero_grad()\n",
286 | " losses.backward()\n",
287 | " # clip the gradient\n",
288 | " clip_grad_norm_(agent_nn.parameters(), CLIP_GRAD)\n",
289 | " optimizer.step()\n",
290 | "\n",
291 | "\n",
292 | " writer.add_scalar('rew', np.mean([env.last_game_rew for env in envs]), n_iter)\n",
293 | " print(n_iter, np.round(float(losses),2), 'rew:', np.round(np.mean([env.last_game_rew for env in envs]),2))\n",
294 | "\n",
295 | "writer.close()"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {},
301 | "source": [
302 | "### ATTENTION! The model is not working; look at the graph below. Why does it behave so strangely? I tried tuning the hyperparameters, but the results are the same.\n",
303 | ""
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "metadata": {},
309 | "source": [
310 | "#### Why is the loss decreasing so fast? \n",
311 | ""
312 | ]
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "#### In some cases, the model starts to always prefer the same action.\n",
319 | ""
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {},
325 | "source": [
326 | "Some ideas:\n",
327 | " - Use two different neural networks and optimizers"
328 | ]
329 | }
330 | ],
331 | "metadata": {
332 | "kernelspec": {
333 | "display_name": "Python 3",
334 | "language": "python",
335 | "name": "python3"
336 | },
337 | "language_info": {
338 | "codemirror_mode": {
339 | "name": "ipython",
340 | "version": 3
341 | },
342 | "file_extension": ".py",
343 | "mimetype": "text/x-python",
344 | "name": "python",
345 | "nbconvert_exporter": "python",
346 | "pygments_lexer": "ipython3",
347 | "version": "3.5.2"
348 | }
349 | },
350 | "nbformat": 4,
351 | "nbformat_minor": 2
352 | }
353 |
--------------------------------------------------------------------------------
/Week4/PolicyGradient.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## POLICY GRADIENT on CartPole"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Policy Gradient algorithms find an optimal behavior strategy by optimizing the policy directly. \n",
15 | "The policy is a function parametrized by $\\theta$: $\\pi_\\theta(a|s)$\n",
16 | "\n",
17 | "The reward function is defined as \n",
18 | "$$J(\\theta) = \\sum_{s}d^\\pi(s)\\sum_{a}\\pi_\\theta(a|s)Q^\\pi(s,a)$$\n",
19 | "\n",
20 | "In Vanilla Policy Gradient, we estimate the return $R_t$ (REINFORCE algorithm) and update the policy subtracting a baseline value from $R_t$ to reduce the variance."
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "
\n",
28 | "Credit: John Schulman"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "import numpy as np\n",
38 | "import gym\n",
39 | "from tensorboardX import SummaryWriter\n",
40 | "\n",
41 | "import time\n",
42 | "from collections import namedtuple\n",
43 | "from collections import deque\n",
44 | "import datetime\n",
45 | "\n",
46 | "import torch\n",
47 | "import torch.nn as nn\n",
48 | "import torch.nn.functional as F\n",
49 | "import torch.optim as optim"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "class PG_nn(nn.Module):\n",
59 | " '''\n",
60 | " Policy neural net\n",
61 | " '''\n",
62 | " def __init__(self, input_shape, n_actions):\n",
63 | " super(PG_nn, self).__init__()\n",
64 | "\n",
65 | " self.mlp = nn.Sequential(\n",
66 | " nn.Linear(input_shape[0], 64),\n",
67 | " nn.ReLU(),\n",
68 | " nn.Linear(64, n_actions))\n",
69 | "\n",
70 | " def forward(self, x):\n",
71 | " return self.mlp(x.float())"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "def discounted_rewards(memories, gamma):\n",
81 | " '''\n",
82 | " Compute the discounted reward backward\n",
83 | " '''\n",
84 | "\n",
85 | " disc_rew = np.zeros(len(memories))\n",
86 | " run_add = 0\n",
87 | "\n",
88 | " for t in reversed(range(len(memories))):\n",
89 | " if memories[t].done: run_add = 0\n",
90 | " run_add = run_add * gamma + memories[t].reward\n",
91 | " disc_rew[t] = run_add\n",
92 | "\n",
93 | " return disc_rew"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "Memory = namedtuple('Memory', ['obs', 'action', 'new_obs', 'reward', 'done'], verbose=False, rename=False)\n",
103 | "\n",
104 | "GAMMA = 0.99\n",
105 | "LEARNING_RATE = 0.002\n",
106 | "ENTROPY_BETA = 0.01\n",
107 | "ENV_NAME = 'CartPole-v0'\n",
108 | "\n",
109 | "MAX_N_GAMES = 10000\n",
110 | "n_games = 0\n",
111 | "\n",
112 | "device = 'cpu'\n",
113 | "\n",
114 | "now = datetime.datetime.now()\n",
115 | "date_time = \"{}_{}.{}.{}\".format(now.day, now.hour, now.minute, now.second)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "env = gym.make(ENV_NAME)\n",
125 | "obs = env.reset()\n",
126 | "\n",
127 | "# Initialize the writer\n",
128 | "writer = SummaryWriter(log_dir='content/runs/A2C'+ENV_NAME+'_'+date_time)\n",
129 | "\n",
130 | "# create the agent neural net\n",
131 | "action_n = env.action_space.n\n",
132 | "agent_nn = PG_nn(env.observation_space.shape, action_n).to(device)\n",
133 | "\n",
134 | "# Adam optimizer\n",
135 | "optimizer = optim.Adam(agent_nn.parameters(), lr=LEARNING_RATE)\n",
136 | "\n",
137 | "experience = []\n",
138 | "tot_reward = 0\n",
139 | "n_iter = 0\n",
140 | "# deque list to keep the baseline\n",
141 | "baseline = deque(maxlen=30000)\n",
142 | "game_rew = 0\n",
143 | "\n",
144 | "## MAIN BODY\n",
145 | "while n_games < MAX_N_GAMES:\n",
146 | "\n",
147 | " n_iter += 1\n",
148 | "\n",
149 | " # execute the agent\n",
150 | " act = agent_nn(torch.tensor(obs))\n",
151 | " act_soft = F.softmax(act)\n",
152 | " # get an action following the policy distribution\n",
153 | " action = int(np.random.choice(np.arange(action_n), p=act_soft.detach().numpy(), size=1))\n",
154 | "\n",
155 | " # make a step in the env\n",
156 | " new_obs, reward, done, _ = env.step(action)\n",
157 | "\n",
158 | " game_rew += reward\n",
159 | " # update the experience list with the last memory\n",
160 | " experience.append(Memory(obs=obs, action=action, new_obs=new_obs, reward=reward, done=done))\n",
161 | "\n",
162 | " obs = new_obs\n",
163 | "\n",
164 | " if done:\n",
165 | " # Calculate the discounted rewards\n",
166 | " disc_rewards = discounted_rewards(experience, GAMMA)\n",
167 | "\n",
168 | " # update the baseline\n",
169 | " baseline.extend(disc_rewards)\n",
170 | " # subtract the baseline mean from the discounted reward.\n",
171 | " disc_rewards -= np.mean(baseline)\n",
172 | "\n",
173 | " # run the agent NN on the obs in the experience list\n",
174 | " acts = agent_nn(torch.tensor([e.obs for e in experience]))\n",
175 | "\n",
176 | " # take the log softmax of the action taken previously\n",
177 | " game_act_log_softmax_t = F.log_softmax(acts, dim=1)[:,[e.action for e in experience]]\n",
178 | "\n",
179 | " disc_rewards_t = torch.tensor(disc_rewards, dtype=torch.float32).to(device)\n",
180 | "\n",
181 | " # compute the loss entropy\n",
182 | " l_entropy = ENTROPY_BETA * torch.mean(torch.sum(F.softmax(acts, dim=1) * F.log_softmax(acts, dim=1), dim=1))\n",
183 | "\n",
184 | " # compute the loss\n",
185 | " loss = - torch.mean(disc_rewards_t * game_act_log_softmax_t)\n",
186 | " loss = loss + l_entropy\n",
187 | "\n",
188 | " # optimize\n",
189 | " optimizer.zero_grad()\n",
190 | " loss.backward()\n",
191 | " optimizer.step()\n",
192 | "\n",
193 | " # print the stats\n",
194 | " writer.add_scalar('loss', loss, n_iter)\n",
195 | " writer.add_scalar('reward', game_rew, n_iter)\n",
196 | "\n",
197 | " print(n_games, loss.detach().numpy(), game_rew, np.mean(disc_rewards), np.mean(baseline))\n",
198 | "\n",
199 | " # reset the variables and env\n",
200 | " experience = []\n",
201 | " game_rew = 0\n",
202 | " obs = env.reset()\n",
203 | " n_games += 1\n",
204 | "\n",
205 | "\n",
206 | "writer.close()"
207 | ]
208 | },
209 | {
210 | "cell_type": "markdown",
211 | "metadata": {},
212 | "source": [
213 | ""
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": []
222 | }
223 | ],
224 | "metadata": {
225 | "kernelspec": {
226 | "display_name": "Python 3",
227 | "language": "python",
228 | "name": "python3"
229 | },
230 | "language_info": {
231 | "codemirror_mode": {
232 | "name": "ipython",
233 | "version": 3
234 | },
235 | "file_extension": ".py",
236 | "mimetype": "text/x-python",
237 | "name": "python",
238 | "nbconvert_exporter": "python",
239 | "pygments_lexer": "ipython3",
240 | "version": "3.5.2"
241 | }
242 | },
243 | "nbformat": 4,
244 | "nbformat_minor": 2
245 | }
246 |
--------------------------------------------------------------------------------
/Week4/imgs/Advantage_actor_critic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/Advantage_actor_critic.png
--------------------------------------------------------------------------------
/Week4/imgs/Vanilla_policy_gradient.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/Vanilla_policy_gradient.png
--------------------------------------------------------------------------------
/Week4/imgs/actions_plot_a2c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/actions_plot_a2c.png
--------------------------------------------------------------------------------
/Week4/imgs/loss_plot_a2c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/loss_plot_a2c.png
--------------------------------------------------------------------------------
/Week4/imgs/nn_ac.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/nn_ac.png
--------------------------------------------------------------------------------
/Week4/imgs/reward_pg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/reward_pg.png
--------------------------------------------------------------------------------
/Week4/imgs/reward_plot_a2c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week4/imgs/reward_plot_a2c.png
--------------------------------------------------------------------------------
/Week5/PPO.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from tensorboardX import SummaryWriter
4 |
5 | import datetime
6 | from collections import namedtuple
7 | from collections import deque
8 | import math
9 |
10 | import torch
11 | import torch.nn as nn
12 | import torch.nn.functional as F
13 | import torch.optim as optim
14 | from torch.nn.utils.clip_grad import clip_grad_norm_
15 |
class A2C_policy(nn.Module):
    '''
    Policy (actor) neural network for continuous actions.

    A two-layer ReLU MLP feeds a linear head whose output is squashed by tanh,
    so forward() returns the action mean with every component in [-1, 1].
    The log standard deviation is a learnable parameter (`logstd`) that does
    not depend on the observation and starts at zero.

    input_shape: shape of the observation; only input_shape[0] is used
    n_actions: shape of the action space; only n_actions[0] is used
    '''
    def __init__(self, input_shape, n_actions):
        super(A2C_policy, self).__init__()

        self.lp = nn.Sequential(
            nn.Linear(input_shape[0], 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU())

        self.mean_l = nn.Linear(32, n_actions[0])
        # scale down the initial weights of the mean head
        self.mean_l.weight.data.mul_(0.1)

        # not used by forward(); kept so that the state_dict keys (and therefore
        # previously saved checkpoints) stay compatible
        self.var_l = nn.Linear(32, n_actions[0])
        self.var_l.weight.data.mul_(0.1)

        self.logstd = nn.Parameter(torch.zeros(n_actions[0]))

    def forward(self, x):
        '''
        Return the tanh-squashed action mean for the observation(s) x
        '''
        ot_n = self.lp(x.float())
        # torch.tanh replaces the deprecated F.tanh alias
        return torch.tanh(self.mean_l(ot_n))
40 |
class A2C_value(nn.Module):
    '''
    Value neural network: maps an observation to a single scalar estimate
    '''
    def __init__(self, input_shape):
        super(A2C_value, self).__init__()

        hidden = 32
        # same layer order as before, so the state_dict keys stay lp.0 / lp.2 / lp.4
        layers = [
            nn.Linear(input_shape[0], hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        ]
        self.lp = nn.Sequential(*layers)

    def forward(self, x):
        # the input is cast to float32 before entering the MLP
        as_float = x.float()
        return self.lp(as_float)
58 |
59 |
class Env:
    '''
    Environment class

    Wraps a gym environment, runs the agent in it for a fixed number of steps,
    keeps per-episode reward statistics and converts the collected transitions
    into advantages and value targets (generalized advantage estimation).
    '''
    # Immutable defaults; an instance shadows them as soon as it assigns to them.
    game_rew = 0
    last_game_rew = 0
    game_n = 0
    n_iter = 0
    # Class-level default kept for backward compatibility only. Every instance
    # gets its OWN list in __init__, otherwise the append() in steps() would
    # mutate one list shared by all the instances.
    last_games_rews = [-200]

    def __init__(self, env_name, n_steps, gamma, gae_lambda, save_video=False):
        '''
        env_name: id passed to gym.make
        n_steps: number of transitions collected by every call to steps()
        gamma: discount factor
        gae_lambda: lambda coefficient of the generalized advantage estimation
        save_video: accepted for interface compatibility; not used by this class
        '''
        super(Env, self).__init__()

        # create the new environment
        self.env = gym.make(env_name)
        self.obs = self.env.reset()

        self.n_steps = n_steps
        self.action_n = self.env.action_space.shape
        self.observation_n = self.env.observation_space.shape[0]
        self.gamma = gamma
        self.gae_lambda = gae_lambda

        # per-instance reward history (seeded with the same value as before)
        self.last_games_rews = [-200]

    def steps(self, agent_policy, agent_value):
        '''
        Execute the agent n_steps in the environment

        agent_policy: policy network; called on the observation, must expose `logstd`
        agent_value: value network; called on the observation
        Returns the list of Memory produced by generalized_advantage_estimation.
        '''
        memories = []
        for s in range(self.n_steps):
            self.n_iter += 1

            # No gradient is needed while collecting experience
            with torch.no_grad():
                # get the agent policy
                ag_mean = agent_policy(torch.tensor(self.obs))
                state_value = float(agent_value(torch.tensor(self.obs)))

            # get an action following the policy distribution:
            # mean + exp(logstd) * standard normal noise, clipped to [-1, 1]
            logstd = agent_policy.logstd.data.cpu().numpy()
            action = ag_mean.data.cpu().numpy() + np.exp(logstd) * np.random.normal(size=logstd.shape)
            action = np.clip(action, -1, 1)

            # Perform a step in the environment
            new_obs, reward, done, _ = self.env.step(action)

            # Update the memories with the last interaction.
            # The stored reward is set to 0 when the episode has ended.
            stored_reward = 0 if done else reward
            memories.append(Memory(obs=self.obs, action=action, new_obs=new_obs, reward=stored_reward, done=done, value=state_value, adv=0))

            # the statistics still use the real reward returned by the environment
            self.game_rew += reward
            self.obs = new_obs

            if done:
                print('#####',self.game_n, 'rew:', int(self.game_rew), int(np.mean(self.last_games_rews[-100:])), np.round(reward,2), self.n_iter)

                # reset the environment
                self.obs = self.env.reset()
                self.last_game_rew = self.game_rew
                self.game_rew = 0
                self.game_n += 1
                self.n_iter = 0
                self.last_games_rews.append(self.last_game_rew)

        # compute the discounted reward of the memories and return it
        return self.generalized_advantage_estimation(memories)

    def generalized_advantage_estimation(self, memories):
        '''
        Calculate the advantage and the discounted reward as in the paper

        memories: list of Memory in chronological order
        Returns a new chronological list with len(memories)-1 entries, where
        `adv` holds the advantage and `reward` holds advantage + value
        (the target used to train the value network).
        '''
        upd_memories = []
        run_add = 0

        # The last memory is only used to bootstrap memories[t+1].value,
        # so it is not part of the returned list.
        for t in reversed(range(len(memories)-1)):
            if memories[t].done:
                # restart the accumulation at an episode boundary
                # NOTE(review): textbook GAE would use reward - value for a
                # terminal step; behavior left unchanged here - confirm intent.
                run_add = memories[t].reward
            else:
                sigma = memories[t].reward + self.gamma * memories[t+1].value - memories[t].value
                run_add = sigma + run_add * self.gamma * self.gae_lambda

            # Update the memories with the discounted reward
            upd_memories.append(Memory(obs=memories[t].obs, action=memories[t].action, new_obs=memories[t].new_obs, reward=run_add + memories[t].value, done=memories[t].done, value=memories[t].value, adv=run_add))

        # the loop ran backwards: restore the chronological order
        return upd_memories[::-1]
150 |
151 |
def log_policy_prob(mean, std, actions):
    '''
    Per-dimension log probability of `actions` under a Gaussian policy.

    mean: action means
    std: LOG standard deviation of the policy (the callers pass the `logstd`
         parameter; the argument name is kept for compatibility)
    actions: actions whose log probability is computed

    Actions are sampled as mean + exp(logstd) * noise, i.e. exp(logstd) is the
    standard deviation, so the variance used here must be exp(2 * logstd).
    '''
    # the clamp only protects the division against a vanishing variance
    var = torch.exp(2 * std).clamp(min=1e-4)
    # log N(a | mean, sigma) = -(a - mean)^2 / (2 sigma^2) - log(sigma) - 0.5 * log(2 pi)
    act_log_softmax = -((mean - actions) ** 2) / (2 * var) - std - 0.5 * math.log(2 * math.pi)
    return act_log_softmax
156 |
def compute_log_policy_prob(memories, nn_policy, device):
    '''
    Run the policy on the observation in the memory and compute the policy log probability

    memories: sequence of items exposing `.obs` and `.action`
    nn_policy: policy network; called on the batch of observations and read for `.logstd`
    device: torch device the input tensors are moved to
    Returns whatever log_policy_prob returns for (mean, logstd, actions), all in float64.
    '''
    obs = torch.tensor(np.array([m.obs for m in memories], dtype=np.float32)).to(device)
    # .double() changes the dtype but, unlike .type(torch.DoubleTensor), keeps the device
    n_mean = nn_policy(obs).double()
    # use the network that was passed in, not a module-level global
    logstd = nn_policy.logstd.double()

    actions = torch.tensor(np.array([m.action for m in memories]), dtype=torch.float64).to(device)

    return log_policy_prob(n_mean, logstd, actions)
168 |
def clipped_PPO_loss(memories, nn_policy, nn_value, old_log_policy, adv, epsilon, writer, device):
    '''
    Clipped PPO loss as in the paper.
    It returns the clipped policy loss and the value loss.

    `writer` is accepted but not used by this function.
    '''
    obs_batch = torch.tensor(np.array([m.obs for m in memories], dtype=np.float32)).to(device)
    value_targets = torch.tensor(np.array([m.reward for m in memories], dtype=np.float32)).to(device)

    # Value loss: MSE between the predicted state values and the stored targets
    predicted = nn_value(obs_batch)
    vl_loss = F.mse_loss(predicted.squeeze(-1), value_targets)

    # ratio between the current policy and the (detached) old one
    new_log_policy = compute_log_policy_prob(memories, nn_policy, device)
    ratio = torch.exp(new_log_policy - old_log_policy.detach())

    # add a dimension because the ratio has shape: [batch_size, n_actions]
    adv_col = adv.unsqueeze(-1)
    unclipped = ratio * adv_col
    clipped = torch.clamp(ratio, 1-epsilon, 1+epsilon) * adv_col
    pg_loss = -torch.mean(torch.min(unclipped, clipped))

    return pg_loss, vl_loss
188 |
189 | def test_game(tst_env, agent_policy, test_episodes):
190 | '''
191 | Execute test episodes on the test environment
192 | '''
193 |
194 | reward_games = []
195 | steps_games = []
196 | for _ in range(test_episodes):
197 | obs = tst_env.reset()
198 | rewards = 0
199 | steps = 0
200 | while True:
201 | ag_mean = agent_policy(torch.tensor(obs))
202 | action = np.clip(ag_mean.data.cpu().numpy().squeeze(), -1, 1)
203 |
204 | next_obs, reward, done, _ = tst_env.step(action)
205 | steps += 1
206 | obs = next_obs
207 | rewards += reward
208 |
209 | if done:
210 | reward_games.append(rewards)
211 | steps_games.append(steps)
212 | obs = tst_env.reset()
213 | break
214 |
215 | return np.mean(reward_games), np.mean(steps_games)
216 |
217 |
# One transition collected from the environment. After the advantage
# estimation `reward` holds the value target and `adv` the advantage.
# (The `verbose` argument of namedtuple was removed in Python 3.7, and
# rename=False is the default, so neither is passed.)
Memory = namedtuple('Memory', ['obs', 'action', 'new_obs', 'reward', 'done', 'value', 'adv'])

# Hyperparameters
ENV_NAME = 'BipedalWalker-v2'
#ENV_NAME = 'BipedalWalkerHardcore-v2'

# maximum number of iterations of the main loop
MAX_ITER = 500000

# size of the mini batches and number of optimization epochs per trajectory
BATCH_SIZE = 64
PPO_EPOCHS = 7
device = 'cpu'
# defined but not referenced by the code of this script
CLIP_GRADIENT = 0.2
# epsilon of the clipped PPO objective
CLIP_EPS = 0.2

# number of environment steps collected on each iteration
TRAJECTORY_SIZE = 2049
GAE_LAMBDA = 0.95
GAMMA = 0.99

## Test Hyperparameters
test_episodes = 5
best_test_result = -1e5
save_video_test = True
# the agent is tested once every N_ITER_TEST iterations
N_ITER_TEST = 100

POLICY_LR = 0.0004
VALUE_LR = 0.001
now = datetime.datetime.now()
# timestamp used to build the run name
date_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)

load_model = False
checkpoint_name = "checkpoints/..."
249 |
if __name__ == '__main__':
    # only the script entry point needs it (creation of the checkpoint folder)
    import os

    # Create the environment
    env = Env(ENV_NAME, TRAJECTORY_SIZE, GAMMA, GAE_LAMBDA)

    writer_name = 'PPO_'+ENV_NAME+'_'+date_time+'_'+str(POLICY_LR)+'_'+str(VALUE_LR)+'_'+str(TRAJECTORY_SIZE)+'_'+str(BATCH_SIZE)
    writer = SummaryWriter(log_dir='content/runs/'+writer_name)

    # create the test environment
    test_env = gym.make(ENV_NAME)
    if save_video_test:
        test_env = gym.wrappers.Monitor(test_env, "VIDEOS/TEST_VIDEOS_"+writer_name, video_callable=lambda episode_id: episode_id%10==0)

    # initialize the actor-critic NN
    agent_policy = A2C_policy(test_env.observation_space.shape, test_env.action_space.shape).to(device)
    agent_value = A2C_value(test_env.observation_space.shape).to(device)

    # initialize policy and value optimizer
    optimizer_policy = optim.Adam(agent_policy.parameters(), lr=POLICY_LR)
    optimizer_value = optim.Adam(agent_value.parameters(), lr=VALUE_LR)

    # Do you want to load a trained model?
    if load_model:
        print('> Loading checkpoint {}'.format(checkpoint_name))
        # map_location lets a checkpoint saved on another device load on `device`
        checkpoint = torch.load(checkpoint_name, map_location=device)
        agent_policy.load_state_dict(checkpoint['agent_policy'])
        agent_value.load_state_dict(checkpoint['agent_value'])
        optimizer_policy.load_state_dict(checkpoint['optimizer_policy'])
        optimizer_value.load_state_dict(checkpoint['optimizer_value'])

    n_iter = 0

    while n_iter < MAX_ITER:
        n_iter += 1

        # collect a trajectory (with advantages and value targets already computed)
        batch = env.steps(agent_policy, agent_value)

        # Compute the policy probability with the old policy network
        old_log_policy = compute_log_policy_prob(batch, agent_policy, device)

        # Gather the advantage from the memory..
        batch_adv = np.array([m.adv for m in batch])
        # .. and normalize it to stabilize network
        batch_adv = (batch_adv - np.mean(batch_adv)) / (np.std(batch_adv) + 1e-7)
        batch_adv = torch.tensor(batch_adv).to(device)

        # variables to accumulate losses
        pol_loss_acc = []
        val_loss_acc = []

        # execute PPO_EPOCHS epochs
        for s in range(PPO_EPOCHS):
            # compute the loss and optimize over mini batches of size BATCH_SIZE
            for mb in range(0, len(batch), BATCH_SIZE):
                mini_batch = batch[mb:mb+BATCH_SIZE]
                minib_old_log_policy = old_log_policy[mb:mb+BATCH_SIZE]
                minib_adv = batch_adv[mb:mb+BATCH_SIZE]

                # Compute the PPO clipped loss and the value loss
                pol_loss, val_loss = clipped_PPO_loss(mini_batch, agent_policy, agent_value, minib_old_log_policy, minib_adv, CLIP_EPS, writer, device)

                # optimize the policy network
                optimizer_policy.zero_grad()
                pol_loss.backward()
                optimizer_policy.step()

                # optimize the value network
                optimizer_value.zero_grad()
                val_loss.backward()
                optimizer_value.step()

                pol_loss_acc.append(float(pol_loss))
                val_loss_acc.append(float(val_loss))

        # add scalars to the tensorboard
        writer.add_scalar('pg_loss', np.mean(pol_loss_acc), n_iter)
        writer.add_scalar('vl_loss', np.mean(val_loss_acc), n_iter)
        writer.add_scalar('rew', env.last_game_rew, n_iter)
        writer.add_scalar('10rew', np.mean(env.last_games_rews[-100:]), n_iter)

        # Test the agent
        if n_iter % N_ITER_TEST == 0:
            test_rews, test_stps = test_game(test_env, agent_policy, test_episodes)
            print(' > Testing..', n_iter,test_rews, test_stps)
            # if it achieves the best results so far, save the models
            if test_rews > best_test_result:
                checkpoint_path = 'checkpoints/checkpoint_'+writer_name+'.pth.tar'
                # torch.save does not create missing folders: without this the
                # first checkpoint would crash the run if 'checkpoints' is absent
                os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
                torch.save({
                    'agent_policy': agent_policy.state_dict(),
                    'agent_value': agent_value.state_dict(),
                    'optimizer_policy': optimizer_policy.state_dict(),
                    'optimizer_value': optimizer_value.state_dict(),
                    'test_reward': test_rews
                }, checkpoint_path)
                best_test_result = test_rews
                print('=> Best test!! Reward:{:.2f} Steps:{}'.format(test_rews, test_stps))

            writer.add_scalar('test_rew', test_rews, n_iter)

    writer.close()
351 |
--------------------------------------------------------------------------------
/Week5/README.md:
--------------------------------------------------------------------------------
1 | # Let's solve BipedalWalker with PPO
2 |
3 | This is an implementation of [PPO](https://blog.openai.com/openai-baselines-ppo/) with continuous actions, a new algorithm developed by OpenAI that has been used in [OpenAI Five to play Dota 2](https://blog.openai.com/openai-five/).
4 |
PPO is a policy gradient method that, unlike the vanilla implementation, alternates between sampling data through interaction with the environment and optimizing a surrogate objective function. Read the [paper](https://arxiv.org/pdf/1707.06347.pdf) to learn more about it.
6 |
For the PPO implementation and the choice of the hyperparameters, I mostly followed the [paper](https://arxiv.org/pdf/1707.06347.pdf) (the last page has a table with all the hyperparameters). In case you want to fine-tune them, check out [Training with Proximal Policy Optimization](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-PPO.md)
8 |
9 | ### [Learn the theory behind PPO](https://github.com/andri27-ts/60_Days_RL_Challenge/blob/master/README.md#week-5---advanced-policy-gradients---trpo--ppo)
10 |
11 |
12 | ## Results
13 |
14 | 
15 |
The plot below shows the rewards. The game defines "solving" as getting an average reward of 300 over 100 consecutive trials. We aren't at that level yet, but it is possible to reach that goal by tuning the hyperparameters and playing more episodes.
17 |
18 | 
19 |
20 |
21 | ## Install
22 |
23 | ```
24 | pip install gym
25 | pip install torch torchvision
26 | pip install tensorboardX
27 | apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb ffmpeg xorg-dev python-opengl libboost-all-dev libsdl2-dev swig
28 |
29 | git clone https://github.com/pybox2d/pybox2d
30 | cd pybox2d
pip install -e .
32 | ```
33 |
--------------------------------------------------------------------------------
/Week5/imgs/rew_walker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week5/imgs/rew_walker.png
--------------------------------------------------------------------------------
/Week5/imgs/walker_gif.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week5/imgs/walker_gif.gif
--------------------------------------------------------------------------------
/Week6/ES.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorboardX
3 | import time
4 | import datetime
5 |
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | import torch.multiprocessing as mp
10 | from torch import optim
11 |
12 | import scipy.stats as ss
13 | from tensorboardX import SummaryWriter
14 | import gym
15 |
16 |
class NeuralNetwork(nn.Module):
    '''
    Neural network for continuous action space.
    forward() returns the tanh-squashed output of the mean head.
    '''
    def __init__(self, input_shape, n_actions):
        super(NeuralNetwork, self).__init__()

        hidden = 32
        body = [
            nn.Linear(input_shape, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
        ]
        self.mlp = nn.Sequential(*body)

        # the heads are created in the same order as before (mean, then var)
        self.mean_l = self._scaled_linear(hidden, n_actions)
        # var_l is not used by forward() but is still one of the parameters
        self.var_l = self._scaled_linear(hidden, n_actions)

        self.logstd = nn.Parameter(torch.zeros(n_actions))

    @staticmethod
    def _scaled_linear(n_in, n_out):
        # linear layer whose initial weights are multiplied by 0.1
        layer = nn.Linear(n_in, n_out)
        layer.weight.data.mul_(0.1)
        return layer

    def forward(self, x):
        features = self.mlp(x.float())
        mean = self.mean_l(features)
        return torch.tanh(mean)
41 |
42 |
def sample_noise(neural_net):
    '''
    Sample noise for each parameter of the neural net

    Returns a 1-D NumPy array of dtype object with one standard-normal array
    per parameter, in the order of neural_net.parameters(). The noise comes
    from the global NumPy RNG, so re-seeding it reproduces the same noise.
    '''
    shapes = [tuple(n.shape) for n in neural_net.parameters()]

    # The parameters have different shapes: np.array() on such a ragged list
    # raises on recent NumPy versions, so the object array is built explicitly.
    # An object array still supports `-noise`, `noise[i]` and iteration.
    nn_noise = np.empty(len(shapes), dtype=object)
    for i, shape in enumerate(shapes):
        nn_noise[i] = np.random.normal(size=shape)
    return nn_noise
52 |
def evaluate_neuralnet(nn, env):
    '''
    Play one full episode with the agent and return the total reward collected
    '''
    obs = env.reset()
    game_reward = 0
    done = False

    while not done:
        # Output of the neural net for the current observation
        net_output = nn(torch.tensor(obs))
        # the action is the network output clipped to [-1, 1]
        action = np.clip(net_output.data.cpu().numpy().squeeze(), -1, 1)
        obs, reward, done, _ = env.step(action)

        game_reward += reward

    return game_reward
74 |
def evaluate_noisy_net(noise, neural_net, env):
    '''
    Evaluate a noisy agent by adding the noise to the plain agent

    noise: one array per parameter, in the order of neural_net.parameters()
    neural_net: the agent; its parameters are perturbed only during the call
    env: environment passed to evaluate_neuralnet
    Returns the reward obtained by the perturbed agent.
    '''
    # state_dict() returns tensors that share memory with the live parameters,
    # so they have to be cloned: otherwise the in-place `+=` below would also
    # change the "saved" values and restoring them would do nothing.
    old_dict = {name: tensor.clone() for name, tensor in neural_net.state_dict().items()}

    try:
        # add the noise to each parameter of the NN
        for n, p in zip(noise, neural_net.parameters()):
            p.data += torch.FloatTensor(n * STD_NOISE)

        # evaluate the agent with the noise
        reward = evaluate_neuralnet(neural_net, env)
    finally:
        # load the previous parameters (the ones without the noise),
        # even if the evaluation raised
        neural_net.load_state_dict(old_dict)

    return reward
91 |
def worker(params_queue, output_queue):
    '''
    Function executed by each worker: get the agent's NN parameters, sample noise and evaluate the
    agent with the noise added (and subtracted). Then return the rewards and the seed to the central unit

    params_queue: queue of actor state dicts; a None item tells the worker to stop
    output_queue: queue that receives [[positive_reward, negative_reward], seed]
    '''

    env = gym.make(ENV_NAME)
    actor = NeuralNetwork(env.observation_space.shape[0], env.action_space.shape[0])

    # Private generator used only to pick the seeds. Drawing them from the global
    # RNG would make every worker that starts with the same global RNG state
    # (e.g. processes created by fork) produce the same sequence of seeds.
    seed_rng = np.random.RandomState()

    while True:
        # get the new actor's params
        act_params = params_queue.get()
        if act_params is None:
            # stop signal
            break

        # load the actor params
        actor.load_state_dict(act_params)

        # get a random seed
        seed = int(seed_rng.randint(int(1e6)))
        # set the new seed: the central unit rebuilds the same noise from it
        np.random.seed(seed)

        noise = sample_noise(actor)

        pos_rew = evaluate_noisy_net(noise, actor, env)
        # Mirrored sampling
        neg_rew = evaluate_noisy_net(-noise, actor, env)

        output_queue.put([[pos_rew, neg_rew], seed])
121 |
122 |
def normalized_rank(rewards):
    '''
    Rank the rewards and normalize them.

    The ranks are scaled to [0, 1] and then shifted to [-0.5, 0.5], so the
    lowest reward maps to -0.5 and the highest to 0.5.
    With fewer than two rewards there is nothing to rank against: an array
    of zeros of the same length is returned instead of dividing by zero.
    '''
    ranked = ss.rankdata(rewards)
    if len(ranked) < 2:
        return np.zeros(len(ranked))
    norm = (ranked - 1) / (len(ranked) - 1)
    norm -= 0.5
    return norm
131 |
132 |
# Gym environment id used by the workers and by the test environment
ENV_NAME = 'LunarLanderContinuous-v2'

# Hyperparameters
# scale applied to the sampled noise before it is added to the parameters
STD_NOISE = 0.05
# number of parameter sets queued (and results collected) on each iteration
BATCH_SIZE = 100
# learning rate of the Adam optimizer
LEARNING_RATE = 0.01
# number of iterations of the main loop
MAX_ITERATIONS = 10000

# number of worker processes
MAX_WORKERS = 4

# record the test episodes, which are played every VIDEOS_INTERVAL iterations
save_video_test = True
VIDEOS_INTERVAL = 100

# timestamp (day_hour.minute.second) used in the run name
now = datetime.datetime.now()
date_time = "%s_%s.%s.%s" % (now.day, now.hour, now.minute, now.second)
148 |
if __name__ == '__main__':
    # Writer name (one placeholder per argument, so MAX_WORKERS is part of the name too)
    writer_name = 'ASY_ES_{}_{}_{}_{}_{}_{}_{}'.format(ENV_NAME, date_time, str(STD_NOISE), str(BATCH_SIZE), str(LEARNING_RATE), str(MAX_ITERATIONS), str(MAX_WORKERS))
    print('Name:', writer_name)

    # Create the test environment
    env = gym.make(ENV_NAME)
    if save_video_test:
        env = gym.wrappers.Monitor(env, "VIDEOS/TEST_VIDEOS_"+writer_name, video_callable=lambda episode_id: True)

    # Initialize the agent
    actor = NeuralNetwork(env.observation_space.shape[0], env.action_space.shape[0])
    # Initialize the optimizer
    optimizer = optim.Adam(actor.parameters(), lr=LEARNING_RATE)

    writer = SummaryWriter(log_dir='content/runs/'+writer_name)

    # Queues to pass and get the variables to and from each process
    output_queue = mp.Queue(maxsize=BATCH_SIZE)
    params_queue = mp.Queue(maxsize=BATCH_SIZE)

    processes = []

    # Create and start the processes
    for _ in range(MAX_WORKERS):
        p = mp.Process(target=worker, args=(params_queue, output_queue))
        p.start()
        processes.append(p)

    # Execute the main loop MAX_ITERATIONS times
    for n_iter in range(MAX_ITERATIONS):
        it_time = time.time()

        batch_noise = []
        batch_reward = []

        # create the queue with the actor parameters
        for _ in range(BATCH_SIZE):
            params_queue.put(actor.state_dict())

        # receive from each worker the results (the seed and the rewards)
        for i in range(BATCH_SIZE):
            p_rews, p_seed = output_queue.get()

            # rebuild the noise used by the worker from its seed
            np.random.seed(p_seed)
            noise = sample_noise(actor)
            batch_noise.append(noise)
            batch_noise.append(-noise)

            batch_reward.append(p_rews[0]) # reward of the positive noise
            batch_reward.append(p_rews[1]) # reward of the negative noise

        # Print some stats
        print(n_iter, 'Mean:',np.round(np.mean(batch_reward), 2), 'Max:', np.round(np.max(batch_reward), 2), 'Time:', np.round(time.time()-it_time, 2))
        writer.add_scalar('reward', np.mean(batch_reward), n_iter)

        # Rank the reward and normalize it
        batch_reward = normalized_rank(batch_reward)

        th_update = []
        optimizer.zero_grad()
        # for each actor's parameter, and for each noise in the batch, update it by the reward * the noise value
        for idx, p in enumerate(actor.parameters()):
            upd_weights = np.zeros(p.data.shape)

            for n,r in zip(batch_noise, batch_reward):
                upd_weights += r*n[idx]

            upd_weights = upd_weights / (BATCH_SIZE*STD_NOISE)
            # put the (negated) update on the gradient variable so that afterwards the optimizer will use it
            p.grad = torch.FloatTensor( -upd_weights)
            th_update.append(np.mean(upd_weights))

        # Optimize the actor's NN
        optimizer.step()

        writer.add_scalar('loss', np.mean(th_update), n_iter)

        if n_iter % VIDEOS_INTERVAL == 0:
            print('Test reward:',evaluate_neuralnet(actor, env))

    # quit the processes
    for _ in range(MAX_WORKERS):
        params_queue.put(None)

    for p in processes:
        p.join()

    # flush the tensorboard logs and release the test environment
    writer.close()
    env.close()
238 |
239 | # tensorboard --logdir content/runs --host localhost
240 |
--------------------------------------------------------------------------------
/Week6/README.md:
--------------------------------------------------------------------------------
1 | # Scalable Evolution Strategies on LunarLander
2 |
Evolution Strategies is a valid alternative to the most popular MDP-based RL techniques. This folder provides an implementation of the OpenAI paper [Evolution Strategies as a Scalable Alternative to Reinforcement Learning](https://arxiv.org/pdf/1703.03864.pdf). I decided to test it on the [LunarLander](https://gym.openai.com/envs/LunarLanderContinuous-v2/) Gym environment to show the applicability and competitiveness of this category of algorithms.
5 |
6 | The following are the key parts of this implementation:
7 | - Novel communication strategy based on common random number
8 | - Mirrored sampling
9 | - Normalized rank
10 |
11 |
12 | ### [Learn more about Evolution Strategies](https://github.com/andri27-ts/60_Days_RL_Challenge#week-6---evolution-strategies-and-genetic-algorithms)
13 |
14 |
15 |
16 | ## Results
17 |
18 | 
19 |
The following plot shows the reward for each iteration. ES is able to solve the game after 650 iterations. Keep in mind that in this version, for each iteration, 100 games are played. This means that the algorithm solved the game after having played about 65,000 games.
21 |
22 | 
23 |
24 |
25 | ## Install
26 |
27 | ```
28 | pip install gym
29 | pip install torch torchvision
30 | pip install tensorboardX
31 | apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb ffmpeg xorg-dev python-opengl libboost-all-dev libsdl2-dev swig
32 |
33 | git clone https://github.com/pybox2d/pybox2d
34 | cd pybox2d
35 | pip install -e .
36 | ```
37 |
--------------------------------------------------------------------------------
/Week6/imgs/LunarLanderContinuous.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week6/imgs/LunarLanderContinuous.gif
--------------------------------------------------------------------------------
/Week6/imgs/plot_rewards.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week6/imgs/plot_rewards.PNG
--------------------------------------------------------------------------------
/Week7/README.md:
--------------------------------------------------------------------------------
1 | # Model-Based Reinforcement Learning
2 |
The strength of model-based reinforcement learning algorithms is that, once they have learned a model of the environment, they can plan the next actions to take. This allows the agent to transfer the knowledge of the environment it has acquired to other tasks. Model-based methods are generally more sample efficient than model-free ones, to the detriment of performance.
4 | Better and more efficient RL algorithms can be obtained merging these two techniques.
5 | This repository contains an implementation of the model-based algorithm proposed in section IV of [this paper](https://arxiv.org/pdf/1708.02596.pdf) with some differences:
6 | - used [Roboschool](https://github.com/openai/roboschool) instead of [Mujoco](http://www.mujoco.org/).
7 | - along with the next environment state, also the reward is learned. To do that another neural network has been used.
8 | - hyperparameters have been adapted to the new environment and problem reformulation (i.e. the reward has to be learned).
9 |
10 | The pseudocode of the main loop is the following:
11 |
12 | 
13 |
14 |
15 |
16 | ### [Learn more about Model-Based Reinforcement Learning](https://github.com/andri27-ts/60_Days_RL_Challenge#week-7---model-based-reinforcement-learning)
17 |
18 | ## Results
19 |
20 | 
21 |
To train RoboschoolAnt-v1, no aggregation steps have been used.
23 | On RoboschoolAnt, playing 10.000 games it achieves a mean reward of about 800. These games have been played only taking random actions.
24 |
25 |
26 | ## Install
27 |
28 | Roboschool installation:
29 | ```
30 | apt install cmake ffmpeg pkg-config qtbase5-dev libqt5opengl5-dev libassimp-dev libpython3.6-dev libboost-python-dev libtinyxml-dev
31 |
32 | git clone https://github.com/openai/gym
33 | pip install -e gym
34 |
35 | git clone https://github.com/openai/roboschool
36 |
37 | cd roboschool
38 | ROBOSCHOOL_PATH=`pwd`
39 | git clone https://github.com/olegklimov/bullet3 -b roboschool_self_collision
40 | mkdir bullet3/build
41 | cd bullet3/build
42 | cmake -DBUILD_SHARED_LIBS=ON -DUSE_DOUBLE_PRECISION=1 -DCMAKE_INSTALL_PREFIX:PATH=$ROBOSCHOOL_PATH/roboschool/cpp-household/bullet_local_install -DBUILD_CPU_DEMOS=OFF -DBUILD_BULLET2_DEMOS=OFF -DBUILD_EXTRAS=OFF -DBUILD_UNIT_TESTS=OFF -DBUILD_CLSOCKET=OFF -DBUILD_ENET=OFF -DBUILD_OPENGL3_DEMOS=OFF ..
43 | make -j4
44 | make install
45 | cd ../..
46 |
47 |
48 | pip3 install -e $ROBOSCHOOL_PATH
49 | ```
50 |
51 | Torch installation:
52 | ```
53 | pip install torch torchvision
54 | ```
55 |
56 | In case you use Google Colab, run
57 |
58 | ```
59 | # Install Chainer, ChainerRL and CuPy!
60 |
61 | %%script bash
62 |
63 | apt-get -qq -y install libcusparse8.0 libnvrtc8.0 libnvtoolsext1 > /dev/null
64 | ln -snf /usr/lib/x86_64-linux-gnu/libnvrtc-builtins.so.8.0 /usr/lib/x86_64-linux-gnu/libnvrtc-builtins.so
65 | pip -q install https://github.com/kmaehashi/chainer-colab/releases/download/2018-02-06/cupy_cuda80-4.0.0b3-cp36-cp36m-linux_x86_64.whl
66 | pip -q install 'chainer==4.0.0b3'
67 | apt-get -qq -y install xvfb freeglut3-dev ffmpeg> /dev/null
68 | pip -q install chainerrl
69 | pip -q install gym
70 | pip -q install pyglet
71 | pip -q install pyopengl
72 | pip -q install pyvirtualdisplay
73 | ```
74 |
--------------------------------------------------------------------------------
/Week7/imgs/animation.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week7/imgs/animation.gif
--------------------------------------------------------------------------------
/Week7/imgs/pseudocode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/Week7/imgs/pseudocode.png
--------------------------------------------------------------------------------
/Week7/model_based.py:
--------------------------------------------------------------------------------
1 |
2 | '''
3 | # Needed only if you run it on Google Colab
4 | from pyvirtualdisplay import Display
5 | display = Display(visible=0, size=(1024, 768))
6 | display.start()
7 | import os
8 | os.environ["DISPLAY"] = ":" + str(display.display) + "." + str(display.screen)'''
9 |
10 |
11 | from sklearn.preprocessing import StandardScaler
12 | import roboschool
13 |
14 | import torch
15 | import torch.nn as nn
16 | import torch.optim as optim
17 | from torch.nn import functional as F
18 |
19 | from tqdm import tqdm
20 | import datetime
21 | import time
22 |
23 | import gym
24 | import numpy as np
25 |
class NNDynamicModel(nn.Module):
    '''
    Model that predicts the next state, given the current state and action
    '''
    def __init__(self, input_dim, obs_output_dim):
        super(NNDynamicModel, self).__init__()

        # two hidden blocks (Linear -> BatchNorm1d -> ReLU) and a linear output layer
        layers = []
        width_in = input_dim
        for width_out in (512, 256):
            layers.append(nn.Linear(width_in, width_out))
            layers.append(nn.BatchNorm1d(num_features=width_out))
            layers.append(nn.ReLU())
            width_in = width_out
        layers.append(nn.Linear(width_in, obs_output_dim))

        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        # the input is cast to float32 before entering the MLP
        as_float = x.float()
        return self.mlp(as_float)
45 |
46 |
class NNRewardModel(nn.Module):
    '''
    Model that predicts the reward, given the current state and action
    '''
    def __init__(self, input_dim, reward_output_dim):
        super(NNRewardModel, self).__init__()

        # two hidden blocks (Linear -> BatchNorm1d -> ReLU) and a linear output layer
        layers = []
        width_in = input_dim
        for width_out in (512, 256):
            layers.append(nn.Linear(width_in, width_out))
            layers.append(nn.BatchNorm1d(num_features=width_out))
            layers.append(nn.ReLU())
            width_in = width_out
        layers.append(nn.Linear(width_in, reward_output_dim))

        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        # the input is cast to float32 before entering the MLP
        as_float = x.float()
        return self.mlp(as_float)
66 |
def gather_random_trajectories(num_traj, env_name):
    '''
    Run num_traj episodes with uniformly sampled actions and record every transition.
    The collected data is used to train the models in a supervised way.

    Parameters:
        num_traj: number of episodes to play. If it is <= 0, no environment is
            created and an empty list is returned.
        env_name: id passed to gym.make

    Returns:
        A list with one entry per environment step, each entry being
        [obs, new_obs, reward, done, sampled_action].
    '''
    dataset_random = []
    if num_traj <= 0:
        # Nothing to collect; also avoids np.max() on an empty list below.
        return dataset_random

    env = gym.make(env_name)

    # per-step rewards of all the episodes
    game_rewards = []
    try:
        for _ in range(num_traj):
            obs = env.reset()
            done = False
            while not done:
                sampled_action = env.action_space.sample()
                new_obs, reward, done, _ = env.step(sampled_action)

                dataset_random.append([obs, new_obs, reward, done, sampled_action])

                obs = new_obs
                game_rewards.append(reward)
    finally:
        # Release the environment even if a step fails
        env.close()

    # print some stats: mean episode return, max single-step reward, mean episode length
    print('Mean R:',np.round(np.sum(game_rewards)/num_traj,2), 'Max R:', np.round(np.max(game_rewards),2), np.round(len(game_rewards)/num_traj))

    return dataset_random
95 |
def model_MSEloss(y_truth, y_pred, device):
    '''
    Mean Squared Error between the targets and the predictions.

    y_truth is converted to a float tensor on `device`; both operands are
    flattened to one dimension before the loss is computed.
    '''
    target = torch.FloatTensor(np.array(y_truth)).to(device)
    flat_target = target.view(-1)
    flat_pred = y_pred.view(-1).float()
    return F.mse_loss(flat_pred, flat_target)
102 |
103 |
def _transitions_to_arrays(transitions):
    '''
    Turn rows of [obs, new_obs, reward, done, action] into (X, y_rew, y_env) arrays.
    '''
    # Model input: observation concatenated with the action
    X = np.array([np.concatenate([obs, act]) for obs, _, _, _, act in transitions])
    # Reward target, one column
    y_rew = np.array([[rw] for _, _, rw, _, _ in transitions])
    # State target is the difference: y(state) = s(t+1) - s(t)
    y_env = np.array([no for _, no, _, _, _ in transitions]) - np.array([obs for obs, _, _, _, _ in transitions])
    return X, y_rew, y_env


def _validation_losses(env_model, rew_model, X_valid, y_env_valid, y_rew_valid, device):
    '''
    Compute the validation MSE of both models in eval mode, then switch them back to train mode.
    '''
    env_model.eval()
    rew_model.eval()

    # No gradients are needed for validation
    with torch.no_grad():
        valid_input = torch.tensor(X_valid).to(device)
        pred_state = env_model(valid_input)
        pred_rew = rew_model(valid_input)

    env_model.train(True)
    rew_model.train(True)

    valid_env_loss = model_MSEloss(y_env_valid, pred_state, device)
    valid_rew_loss = model_MSEloss(y_rew_valid, pred_rew, device)
    return valid_env_loss, valid_rew_loss


def train_dyna_model(random_dataset, rl_dataset, env_model, rew_model, batch_size, max_model_iter, num_examples_added, ENV_LEARNING_RATE, REW_LEARNING_RATE, device):
    '''
    Train the two models that predict the next state and the expected reward

    Parameters:
        random_dataset: transitions [obs, new_obs, reward, done, action] gathered with random actions
        rl_dataset: transitions gathered by the controller (may be empty)
        env_model: network trained to predict the standardized state difference s(t+1) - s(t)
        rew_model: network trained to predict the standardized reward
        batch_size: mini-batch size; only full mini-batches are used
        max_model_iter: number of passes over the training set
        num_examples_added: the last num_examples_added/5 rows of the
            concatenated dataset are held out for validation
        ENV_LEARNING_RATE, REW_LEARNING_RATE: Adam learning rates of the two models
        device: torch device the batches are moved to

    Returns:
        (mean train env loss of the last pass, mean train reward loss of the last pass,
         validation env loss, validation reward loss,
         (input_scaler, env_output_scaler, rew_output_scaler))
    '''

    env_optimizer = optim.Adam(env_model.parameters(), lr=ENV_LEARNING_RATE)
    rew_optimizer = optim.Adam(rew_model.parameters(), lr=REW_LEARNING_RATE)

    # Concatenate the random dataset with the RL dataset (the latter exists only in the aggregation iterations).
    # Rows hold arrays of different sizes next to scalars, so the array is built
    # explicitly with dtype=object (recent NumPy versions refuse to infer it).
    rows = list(random_dataset)
    if len(rl_dataset) > 0:
        rows = rows + list(rl_dataset)
    d_concat = np.array(rows, dtype=object)

    # Hold out the last num_examples_added/5 rows for validation; everything before is used for training
    D_train = d_concat[:int(-num_examples_added*1/5)]
    D_valid = d_concat[int(-num_examples_added*1/5):]

    print("len(D):", len(d_concat), 'len(Dtrain)', len(D_train))

    # Shuffle the training set
    sff = np.arange(len(D_train))
    np.random.shuffle(sff)
    D_train = D_train[sff]

    # Create the inputs and outputs for train and validation
    X_train, y_rew_train, y_env_train = _transitions_to_arrays(D_train)
    X_valid, y_rew_valid, y_env_valid = _transitions_to_arrays(D_valid)

    # Standardize the input features by removing the mean and scaling to unit variance
    input_scaler = StandardScaler()
    X_train = input_scaler.fit_transform(X_train)
    X_valid = input_scaler.transform(X_valid)

    # Standardize the outputs by removing the mean and scaling to unit variance
    env_output_scaler = StandardScaler()
    y_env_train = env_output_scaler.fit_transform(y_env_train)
    y_env_valid = env_output_scaler.transform(y_env_valid)

    rew_output_scaler = StandardScaler()
    y_rew_train = rew_output_scaler.fit_transform(y_rew_train)
    y_rew_valid = rew_output_scaler.transform(y_rew_valid)

    # store all the scalers in a variable to later uses
    norm = (input_scaler, env_output_scaler, rew_output_scaler)

    losses_env = []
    losses_rew = []

    # go through max_model_iter supervised iterations
    for it in tqdm(range(max_model_iter)):
        last_iter = it == (max_model_iter - 1)

        # create mini batches of size batch_size; a trailing incomplete batch is skipped
        for mb in range(0, len(X_train) - batch_size + 1, batch_size):
            y_env_mb = y_env_train[mb:mb+batch_size]
            y_rew_mb = y_rew_train[mb:mb+batch_size]

            # Add zero-mean gaussian noise (standard deviation 0.001) to the inputs.
            # A new array is created so that X_train itself is never modified
            # and the noise does not accumulate from one pass to the next.
            X_slice = X_train[mb:mb+batch_size]
            X_mb = X_slice + np.random.normal(loc=0, scale=0.001, size=X_slice.shape)
            mb_input = torch.tensor(X_mb).to(device)

            ## Optimization of the 'env_model' neural net

            env_optimizer.zero_grad()
            # forward pass of the model to compute the output
            pred_state = env_model(mb_input)
            # compute the MSE loss
            loss = model_MSEloss(y_env_mb, pred_state, device)

            if last_iter:
                losses_env.append(loss.cpu().detach().numpy())

            # backward pass
            loss.backward()
            # optimization step
            env_optimizer.step()

            ## Optimization of the 'rew_model' neural net

            rew_optimizer.zero_grad()
            # forward pass of the model to compute the output
            pred_rew = rew_model(mb_input)
            # compute the MSE loss
            loss = model_MSEloss(y_rew_mb, pred_rew, device)

            if last_iter:
                losses_rew.append(loss.cpu().detach().numpy())

            # backward pass
            loss.backward()
            # optimization step
            rew_optimizer.step()

        # Evaluate the models every 10 iterations and print the losses
        if it % 10 == 0:
            valid_env_loss, valid_rew_loss = _validation_losses(env_model, rew_model, X_valid, y_env_valid, y_rew_valid, device)
            print('..', it, valid_env_loss.cpu().detach().numpy(), valid_rew_loss.cpu().detach().numpy())

    ## Evaluate the final MSE losses
    valid_env_loss, valid_rew_loss = _validation_losses(env_model, rew_model, X_valid, y_env_valid, y_rew_valid, device)

    return np.mean(losses_env), np.mean(losses_rew), valid_env_loss.cpu().detach().numpy(), valid_rew_loss.cpu().detach().numpy(), norm
250 |
251 |
def multi_model_based_control(env_model, rew_model, real_obs, num_sequences, horizon_length, sample_action, norm, device):
    '''
    Use a random-sampling shooting method, generating random action sequences.
    The first action of the sequence with the highest sum of predicted rewards is returned.

    Parameters:
        env_model: model predicting the (standardized) state difference
        rew_model: model predicting the reward; its raw output is what gets summed
        real_obs: current observation, used as the start of every sequence
        num_sequences: number of random action sequences rolled in parallel
        horizon_length: number of model steps per sequence
        sample_action: callable without arguments returning one action
        norm: tuple (input_scaler, env_output_scaler, rew_output_scaler)
        device: torch device the model inputs are moved to

    Returns:
        (best_action, best_sum_reward): first action of the best sequence and the
        summed reward-model output of that sequence.
    '''
    # rew_output_scaler is unpacked but not used: the rewards are compared in the model's output scale
    input_scaler, env_output_scaler, rew_output_scaler = norm

    # every sequence starts from the same real observation
    m_obs = np.array([real_obs for _ in range(num_sequences)])

    # array that contains the rewards for all the sequence
    unroll_rewards = np.zeros((num_sequences, 1))
    first_sampled_actions = []

    env_model.eval()
    rew_model.eval()

    ## Create a batch of size 'num_sequences' (number of trajectories) to roll the models 'horizon_length' times.
    ## i.e. roll a given number of trajectories in a single batch (to increase speed)

    # Pure inference: gradients are not needed
    with torch.no_grad():
        for t in range(horizon_length):
            # sampled actions for each sequence
            sampled_actions = [sample_action() for _ in range(num_sequences)]
            # scale the input (built once and shared by both models)
            models_input = input_scaler.transform(np.concatenate([m_obs, sampled_actions], axis=1))
            models_input = torch.tensor(models_input).to(device)
            # compute the next state for each sequence
            pred_obs = env_model(models_input)
            # and the reward
            pred_rew = rew_model(models_input).cpu().detach().numpy()

            # inverse scaler transformation
            pred_obs = env_output_scaler.inverse_transform(pred_obs.cpu().detach().numpy())
            # and add previous observation
            m_obs = pred_obs + m_obs

            assert(pred_rew.shape == unroll_rewards.shape)

            # sum of the expected rewards
            unroll_rewards += pred_rew

            if t == 0:
                first_sampled_actions = sampled_actions

    env_model.train(True)
    rew_model.train(True)

    # Position of the sequence with the higher reward
    arg_best_reward = np.argmax(unroll_rewards)
    best_sum_reward = unroll_rewards[arg_best_reward].squeeze()
    # take the first action of this sequence
    best_action = first_sampled_actions[arg_best_reward]

    return best_action, best_sum_reward
306 |
307 |
# Id of the gym environment to solve
ENV_NAME = 'RoboschoolAnt-v1'

# Main loop hyperp
AGGR_ITER = 3            # number of train -> collect (aggregation) iterations
STEPS_PER_AGGR = 20000   # minimum number of controller steps collected per iteration

# Random MB hyperp
NUM_RAND_TRAJECTORIES = 1000   # episodes played with random actions to build the initial dataset

# 'cuda' or 'cpu'. CUDA is used when available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Supervised Model Hyperp
ENV_LEARNING_RATE = 1e-3
REW_LEARNING_RATE = 1e-3
BATCH_SIZE = 512
TRAIN_ITER_MODEL = 55    # passes over the training set at every model training

# Controller Hyperp
HORIZION_LENGTH = 10            # model steps rolled for each candidate action sequence
NUM_ACTIONS_SEQUENCES = 20000   # candidate action sequences evaluated at every control step

# Record the episodes played by the controller
save_video_test = True

# Timestamp (day_hour.minute.second) used to name the run
now = datetime.datetime.now()
date_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
334 |
if __name__ == '__main__':
    # Script entry point: alternate between (1) supervised training of the
    # dynamics/reward models and (2) collecting new transitions with the
    # model-based controller, for AGGR_ITER iterations.
    writer_name = 'MB_RL_'+ENV_NAME+'_'+date_time
    print('Name:',writer_name, device)

    # create the environment
    env = gym.make(ENV_NAME)
    if save_video_test:
        env = gym.wrappers.Monitor(env, "VIDEOS/TEST_VIDEOS_"+writer_name, video_callable=lambda episode_id: True)
    obs = env.reset()

    # gather the dataset of random sequences
    rand_dataset = gather_random_trajectories(NUM_RAND_TRAJECTORIES, ENV_NAME)

    rl_dataset = []

    # Initialize the models
    env_model = NNDynamicModel(env.action_space.shape[0] + env.observation_space.shape[0], env.observation_space.shape[0]).to(device)
    rew_model = NNRewardModel(env.action_space.shape[0] + env.observation_space.shape[0], 1).to(device)


    game_reward = 0
    # on the first iteration the validation split is taken from the random dataset
    num_examples_added = len(rand_dataset)

    for n_iter in range(AGGR_ITER):

        # supervised training of the dataset (random and rl if it exists)
        train_env_loss, train_rew_loss, valid_env_loss, valid_rew_loss, norm = train_dyna_model(rand_dataset, rl_dataset, env_model, rew_model, BATCH_SIZE, TRAIN_ITER_MODEL, num_examples_added, ENV_LEARNING_RATE, REW_LEARNING_RATE, device)
        print('{} >> Eloss:{:.4f} EV loss:{:.4f} -- Rloss:{:.4f} RV loss:{:.4f}'.format(n_iter, train_env_loss, valid_env_loss, train_rew_loss, valid_rew_loss))

        # the scalers do not change until the next training
        input_scaler, env_output_scaler, rew_output_scaler = norm

        obs = env.reset()

        num_examples_added = 0
        game_reward = 0
        game_pred_rews = []
        rews = []

        # Episodes are always played until 'done', so slightly more than STEPS_PER_AGGR steps can be added
        while num_examples_added < STEPS_PER_AGGR:
            while True:

                tt = time.time()
                # Execute the control to roll the sequences and pick the first action of the sequence with the higher reward
                action, pred_rew = multi_model_based_control(env_model, rew_model, obs, NUM_ACTIONS_SEQUENCES, HORIZION_LENGTH, env.action_space.sample, norm, device)
                game_pred_rews.append(pred_rew)

                # one step in the environment with the action returned by the controller
                new_obs, reward, done, _ = env.step(action)

                ## Compute the reward and print some stats
                models_input = input_scaler.transform([np.concatenate([obs, action])])
                rew_model.eval()
                with torch.no_grad():
                    p_rew = rew_model(torch.tensor(models_input).to(device))
                rew_model.train(True)
                # the scaler expects a 2-D (n_samples, n_features) input
                unnorm_rew = rew_output_scaler.inverse_transform([[float(p_rew.cpu().data[0])]]).squeeze()
                print(' >> ',len(game_pred_rews), 'gt:',np.round(reward,3), 'pred:',np.round(unnorm_rew, 3),
                      'sum:', np.round(pred_rew,3), '|', game_reward, np.round(time.time()-tt, 4), HORIZION_LENGTH)

                # add the last step to the RL dataset
                rl_dataset.append([obs, new_obs, reward, done, action])


                num_examples_added += 1
                obs = new_obs
                game_reward += reward

                # if the environment is done, reset it and print some stats
                if done:
                    obs = env.reset()
                    print(' >> R: {:.2f}, Mean sum:{:.2f}, {}'.format(game_reward, np.mean(game_pred_rews), num_examples_added))

                    rews.append(game_reward)
                    game_reward = 0
                    game_pred_rews = []
                    break

        # mean return of the episodes played in this aggregation iteration
        print(' >> Mean: {:.2f}'.format(np.mean(rews)))

    # close the environment so that the Monitor wrapper can finalize its recordings
    env.close()
412 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-architect
--------------------------------------------------------------------------------
/images/GitHub-Mark-32px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/GitHub-Mark-32px.png
--------------------------------------------------------------------------------
/images/GitHub-Mark-64px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/GitHub-Mark-64px.png
--------------------------------------------------------------------------------
/images/frontcover2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/frontcover2.jpg
--------------------------------------------------------------------------------
/images/logo5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/logo5.png
--------------------------------------------------------------------------------
/images/logo6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/logo6.png
--------------------------------------------------------------------------------
/images/title3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/title3.png
--------------------------------------------------------------------------------
/images/youtube_social_icon_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/youtube_social_icon_dark.png
--------------------------------------------------------------------------------
/images/youtube_social_icon_red.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andri27-ts/Reinforcement-Learning/c57064f747f51d1c495639c7413f5a2be01acd5f/images/youtube_social_icon_red.png
--------------------------------------------------------------------------------