├── .gitignore
├── README.md
├── code
│ ├── ddpg
│ │ ├── __init__.py
│ │ └── ddpg.py
│ ├── mjpro131.tar.gz
│ ├── modified_gravity_hopper.py
│ ├── plot_results.py
│ ├── run_ddpg.py
│ ├── run_trpo.py
│ ├── sampling_utils.py
│ ├── test_manual.py
│ └── test_modified_hopper_env_manually.py
└── slides.pdf
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning Summer School: Practical Tutorial on RL for Continuous Control
2 |
3 | Here we go over:
4 |
5 | + How to set up MuJoCo and openai/rllab
6 | + How to run basic TRPO and DDPG code
7 | + The core code snippets in TRPO and DDPG so you can build on top of these algorithms
8 | + How to create your own modified MuJoCo environment (multi-task modifications can be pull-requested into gym-extensions); a minimal registration sketch is shown below
9 |
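A minimal sketch of the custom-environment workflow, reusing the registration from `code/test_modified_hopper_env_manually.py` and the `GravityEnv` class in `code/modified_gravity_hopper.py`:

```python
import gym

# GravityEnv subclasses gym's HopperEnv and overrides gravity in its constructor.
# Run this from within code/ so the module path below is importable.
gym.envs.register(
    id="HopperHalfGravity-v0",
    entry_point="modified_gravity_hopper:GravityEnv",
    max_episode_steps=1000,
    kwargs={"gravity": -1.0},  # much weaker than the default -9.81
)

env = gym.make("HopperHalfGravity-v0")
```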
10 |
11 | ## How to run examples
12 |
13 | ### Run TRPO
14 |
15 | ```bash
16 | cd code; source activate rllab3; python run_trpo.py Hopper-v1
17 | ```
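Optional flags are defined in `run_trpo.py`; for example, to run fewer iterations and log to a custom directory (the values here are illustrative):

```bash
cd code; source activate rllab3; python run_trpo.py Hopper-v1 --num_epochs 100 --data_dir ./data_trpo/
```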
18 |
19 | ### Run DDPG
20 |
21 | ```bash
22 | cd code; source activate rllab3; python run_ddpg.py Hopper-v1
23 | ```
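`run_ddpg.py` additionally exposes a `--reward_scale` flag, since DDPG can be sensitive to reward scaling; for example (an illustrative value):

```bash
cd code; source activate rllab3; python run_ddpg.py Hopper-v1 --reward_scale 0.1
```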
24 |
25 | ### Plotting Results
26 |
27 | ```bash
28 | cd code; python plot_results.py data_trpo/progress.csv Hopper-v1 --labels "trpo"
29 | ```
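`plot_results.py` accepts multiple progress CSVs and labels, so runs can be compared on one figure; for example, assuming a TRPO and a DDPG run have both finished with the default data directories:

```bash
cd code; python plot_results.py data_trpo/progress.csv data_ddpg/progress.csv Hopper-v1 --labels "trpo" "ddpg" --save
```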
30 |
31 |
32 | ### Manual testing of an env and a custom env
33 | ```bash
34 | cd code; python test_manual.py Hopper-v1
35 | ```
36 |
37 | ```bash
38 | cd code; python test_modified_hopper_env_manually.py
39 | ```
40 |
--------------------------------------------------------------------------------
/code/ddpg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Breakend/RLSSContinuousControlTutorial/19901507fd037b1ba6a37232c537654f1ae70ce4/code/ddpg/__init__.py
--------------------------------------------------------------------------------
/code/ddpg/ddpg.py:
--------------------------------------------------------------------------------
1 | # MODIFIED FROM: https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/sandbox/rocky/tf/algos/ddpg.py
2 | import gc
3 | import time
4 |
5 | #import pickle as pickle
6 | import numpy as np
7 | import tensorflow as tf
8 |
9 | import pyprind
10 | import rllab.misc.logger as logger
11 | from rllab.algos.base import RLAlgorithm
12 | from rllab.core.serializable import Serializable
13 | from rllab.misc import ext, special
14 | from rllab.misc.overrides import overrides
15 | from rllab.plotter import plotter
16 | from rllab.sampler import parallel_sampler
17 | from sampling_utils import SimpleReplayPool
18 | from sandbox.rocky.tf.misc import tensor_utils
19 | from sandbox.rocky.tf.optimizers.first_order_optimizer import \
20 | FirstOrderOptimizer
21 |
22 |
23 | class DDPG(RLAlgorithm):
24 | """
25 | Deep Deterministic Policy Gradient.
26 | """
27 |
28 | def __init__(
29 | self,
30 | env,
31 | policy,
32 | qf,
33 | es,
34 | batch_size=32,
35 | n_epochs=200,
36 | epoch_length=1000,
37 | min_pool_size=10000,
38 | replay_pool_size=1000000,
39 | replacement_prob=1.0,
40 | discount=0.99,
41 | max_path_length=250,
42 | qf_weight_decay=0.,
43 | qf_update_method='adam',
44 | qf_learning_rate=1e-3,
45 | policy_weight_decay=0,
46 | policy_update_method='adam',
47 | policy_learning_rate=1e-3,
48 | policy_updates_ratio=1.0,
49 | eval_samples=10000,
50 | soft_target=True,
51 | soft_target_tau=0.001,
52 | n_updates_per_sample=1,
53 | scale_reward=1.0,
54 | include_horizon_terminal_transitions=False,
55 | plot=False,
56 | pause_for_plot=False,
57 | **kwargs):
58 | """
59 | :param env: Environment
60 | :param policy: Policy
61 | :param qf: Q function
62 | :param es: Exploration strategy
63 | :param batch_size: Number of samples for each minibatch.
64 | :param n_epochs: Number of epochs. Policy will be evaluated after each epoch.
65 | :param epoch_length: How many timesteps for each epoch.
66 | :param min_pool_size: Minimum size of the pool to start training.
67 | :param replay_pool_size: Size of the experience replay pool.
68 | :param discount: Discount factor for the cumulative return.
69 | :param max_path_length: Maximum length of a single rollout (path).
70 | :param qf_weight_decay: Weight decay factor for parameters of the Q function.
71 | :param qf_update_method: Online optimization method for training Q function.
72 | :param qf_learning_rate: Learning rate for training Q function.
73 | :param policy_weight_decay: Weight decay factor for parameters of the policy.
74 | :param policy_update_method: Online optimization method for training the policy.
75 | :param policy_learning_rate: Learning rate for training the policy.
76 | :param eval_samples: Number of samples (timesteps) for evaluating the policy.
77 | :param soft_target_tau: Interpolation parameter for doing the soft target update.
78 | :param n_updates_per_sample: Number of Q function and policy updates per new sample obtained
79 | :param scale_reward: The scaling factor applied to the rewards when training
80 | :param include_horizon_terminal_transitions: whether to include transitions with terminal=True because the
81 | horizon was reached. This might make the Q-value backup less stable for certain tasks.
82 | :param plot: Whether to visualize the policy performance after each eval_interval.
83 | :param pause_for_plot: Whether to pause before continuing when plotting.
84 | :return:
85 | """
86 | self.env = env
87 | self.policy = policy
88 | self.qf = qf
89 | self.es = es
90 | self.batch_size = batch_size
91 | self.n_epochs = n_epochs
92 | self.epoch_length = epoch_length
93 | self.min_pool_size = min_pool_size
94 | self.replay_pool_size = replay_pool_size
95 | self.replacement_prob = replacement_prob
96 | self.discount = discount
97 | self.max_path_length = max_path_length
98 | self.qf_weight_decay = qf_weight_decay
99 | self.qf_update_method = \
100 | FirstOrderOptimizer(
101 | update_method=qf_update_method,
102 | learning_rate=qf_learning_rate,
103 | )
104 | self.qf_learning_rate = qf_learning_rate
105 | self.policy_weight_decay = policy_weight_decay
106 | self.policy_update_method = \
107 | FirstOrderOptimizer(
108 | update_method=policy_update_method,
109 | learning_rate=policy_learning_rate,
110 | )
111 | self.policy_learning_rate = policy_learning_rate
112 | self.policy_updates_ratio = policy_updates_ratio
113 | self.eval_samples = eval_samples
114 | self.soft_target_tau = soft_target_tau
115 | self.n_updates_per_sample = n_updates_per_sample
116 | self.include_horizon_terminal_transitions = include_horizon_terminal_transitions
117 | self.plot = plot
118 | self.pause_for_plot = pause_for_plot
119 |
120 | self.qf_loss_averages = []
121 | self.policy_surr_averages = []
122 | self.q_averages = []
123 | self.y_averages = []
124 | self.paths = []
125 | self.es_path_returns = []
126 | self.paths_samples_cnt = 0
127 |
128 | self.scale_reward = scale_reward
129 |
130 | self.train_policy_itr = 0
131 |
132 | self.opt_info = None
133 |
134 | def start_worker(self):
135 | parallel_sampler.populate_task(self.env, self.policy)
136 | if self.plot:
137 | plotter.init_plot(self.env, self.policy)
138 |
139 | @overrides
140 | def train(self):
141 | gc_dump_time = time.time()
142 | with tf.Session() as sess:
143 | sess.run(tf.global_variables_initializer())
144 | # This seems like a rather sequential method
145 | pool = SimpleReplayPool(
146 | max_pool_size=self.replay_pool_size,
147 | observation_dim=self.env.observation_space.flat_dim,
148 | action_dim=self.env.action_space.flat_dim,
149 | replacement_prob=self.replacement_prob,
150 | )
151 | self.start_worker()
152 |
153 | self.init_opt()
154 | # This initializes the optimizer parameters
155 | sess.run(tf.global_variables_initializer())
156 | itr = 0
157 | path_length = 0
158 | path_return = 0
159 | terminal = False
160 | initial = False
161 | observation = self.env.reset()
162 |
163 | with tf.variable_scope("sample_policy"):
164 | sample_policy = Serializable.clone(self.policy)
165 |
166 | for epoch in range(self.n_epochs):
167 | logger.push_prefix('epoch #%d | ' % epoch)
168 | logger.log("Training started")
169 | train_qf_itr, train_policy_itr = 0, 0
170 | for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
171 | # Execute policy
172 | if terminal:
173 | # Note that if the last time step ends an episode, the very
174 | # last state and observation will be ignored and not added
175 | # to the replay pool
176 | observation = self.env.reset()
177 | sample_policy.reset()
178 | self.es_path_returns.append(path_return)
179 | path_length = 0
180 | path_return = 0
181 | initial = True
182 | else:
183 | initial = False
184 | action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf)
185 |
186 | next_observation, reward, terminal, _ = self.env.step(action)
187 | path_length += 1
188 | path_return += reward
189 |
190 | if not terminal and path_length >= self.max_path_length:
191 | terminal = True
192 | # only include the terminal transition in this case if the flag was set
193 | if self.include_horizon_terminal_transitions:
194 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial)
195 | else:
196 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial)
197 |
198 | observation = next_observation
199 |
200 | if pool.size >= self.min_pool_size:
201 | for update_itr in range(self.n_updates_per_sample):
202 | # Train policy
203 | batch = pool.random_batch(self.batch_size)
204 | itrs = self.do_training(itr, batch)
205 | train_qf_itr += itrs[0]
206 | train_policy_itr += itrs[1]
207 | sample_policy.set_param_values(self.policy.get_param_values())
208 |
209 | itr += 1
210 | if time.time() - gc_dump_time > 100:
211 | gc.collect()
212 | gc_dump_time = time.time()
213 |
214 | logger.log("Training finished")
215 | logger.log("Trained qf %d steps, policy %d steps"%(train_qf_itr, train_policy_itr))
216 | if pool.size >= self.min_pool_size:
217 | self.evaluate(epoch, pool)
218 | params = self.get_epoch_snapshot(epoch)
219 | logger.save_itr_params(epoch, params)
220 | logger.dump_tabular(with_prefix=False)
221 | logger.pop_prefix()
222 | if self.plot:
223 | self.update_plot()
224 | if self.pause_for_plot:
225 | input("Plotting evaluation run: Press Enter to "
226 | "continue...")
227 | self.env.terminate()
228 | self.policy.terminate()
229 |
230 | def init_opt(self):
231 |
232 | # First, create "target" policy and Q functions
233 | with tf.variable_scope("target_policy"):
234 | target_policy = Serializable.clone(self.policy)
235 | with tf.variable_scope("target_qf"):
236 | target_qf = Serializable.clone(self.qf)
237 |
238 | # The y values need to be computed first
239 | obs = self.env.observation_space.new_tensor_variable(
240 | 'obs',
241 | extra_dims=1,
242 | )
243 |
244 | # The yi values are computed separately as above and then passed to
245 | # the training functions below
246 | action = self.env.action_space.new_tensor_variable(
247 | 'action',
248 | extra_dims=1,
249 | )
250 |
251 | yvar = tensor_utils.new_tensor(
252 | 'ys',
253 | ndim=1,
254 | dtype=tf.float32,
255 | )
256 |
257 | qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
258 | sum([tf.reduce_sum(tf.square(param)) for param in
259 | self.qf.get_params(regularizable=True)])
260 |
261 | qval = self.qf.get_qval_sym(obs, action)
262 |
263 | qf_loss = tf.reduce_mean(tf.square(yvar - qval))
264 | qf_reg_loss = qf_loss + qf_weight_decay_term
265 |
266 | policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
267 | sum([tf.reduce_sum(tf.square(param))
268 | for param in self.policy.get_params(regularizable=True)])
269 | policy_qval = self.qf.get_qval_sym(
270 | obs, self.policy.get_action_sym(obs),
271 | deterministic=True
272 | )
273 | policy_surr = -tf.reduce_mean(policy_qval)
274 |
275 | policy_reg_surr = policy_surr + policy_weight_decay_term
276 |
277 | qf_input_list = [yvar, obs, action]
278 | policy_input_list = [obs]
279 |
280 | self.qf_update_method.update_opt(
281 | loss=qf_reg_loss, target=self.qf, inputs=qf_input_list)
282 | self.policy_update_method.update_opt(
283 | loss=policy_reg_surr, target=self.policy, inputs=policy_input_list)
284 |
285 | f_train_qf = tensor_utils.compile_function(
286 | inputs=qf_input_list,
287 | outputs=[qf_loss, qval, self.qf_update_method._train_op],
288 | )
289 |
290 | f_train_policy = tensor_utils.compile_function(
291 | inputs=policy_input_list,
292 | outputs=[policy_surr, self.policy_update_method._train_op],
293 | )
294 |
295 | self.opt_info = dict(
296 | f_train_qf=f_train_qf,
297 | f_train_policy=f_train_policy,
298 | target_qf=target_qf,
299 | target_policy=target_policy,
300 | )
301 |
302 | def do_training(self, itr, batch):
303 |
304 | obs, actions, rewards, next_obs, terminals = ext.extract(
305 | batch,
306 | "observations", "actions", "rewards", "next_observations",
307 | "terminals"
308 | )
309 |
310 | # compute the on-policy y values
311 | target_qf = self.opt_info["target_qf"]
312 | target_policy = self.opt_info["target_policy"]
313 |
314 | next_actions, _ = target_policy.get_actions(next_obs)
315 | next_qvals = target_qf.get_qval(next_obs, next_actions)
316 |
317 | ys = rewards + (1. - terminals) * self.discount * next_qvals.reshape(-1)
318 |
319 | f_train_qf = self.opt_info["f_train_qf"]
320 | qf_loss, qval, _ = f_train_qf(ys, obs, actions)
321 | target_qf.set_param_values(
322 | target_qf.get_param_values() * (1.0 - self.soft_target_tau) +
323 | self.qf.get_param_values() * self.soft_target_tau)
324 | self.qf_loss_averages.append(qf_loss)
325 | self.q_averages.append(qval)
326 | self.y_averages.append(ys)
327 |
328 | self.train_policy_itr += self.policy_updates_ratio
329 | train_policy_itr = 0
330 | while self.train_policy_itr > 0:
331 | f_train_policy = self.opt_info["f_train_policy"]
332 | policy_surr, _ = f_train_policy(obs)
333 | target_policy.set_param_values(
334 | target_policy.get_param_values() * (1.0 - self.soft_target_tau) +
335 | self.policy.get_param_values() * self.soft_target_tau)
336 | self.policy_surr_averages.append(policy_surr)
337 | self.train_policy_itr -= 1
338 | train_policy_itr += 1
339 | return 1, train_policy_itr # number of itrs qf, policy are trained
340 |
341 | def evaluate(self, epoch, pool):
342 | logger.log("Collecting samples for evaluation")
343 |
344 | paths = parallel_sampler.sample_paths(
345 | policy_params=self.policy.get_param_values(),
346 | max_samples=self.eval_samples,
347 | max_path_length=self.max_path_length,
348 | )
349 |
350 | self.env.reset()
351 |
352 | average_discounted_return = np.mean(
353 | [special.discount_return(path["rewards"], self.discount) for path in paths]
354 | )
355 |
356 | returns = [sum(path["rewards"]) for path in paths]
357 |
358 | all_qs = np.concatenate(self.q_averages)
359 | all_ys = np.concatenate(self.y_averages)
360 |
361 | average_q_loss = np.mean(self.qf_loss_averages)
362 | average_policy_surr = np.mean(self.policy_surr_averages)
363 | average_action = np.mean(np.square(np.concatenate(
364 | [path["actions"] for path in paths]
365 | )))
366 |
367 | policy_reg_param_norm = np.linalg.norm(
368 | self.policy.get_param_values(regularizable=True)
369 | )
370 | qfun_reg_param_norm = np.linalg.norm(
371 | self.qf.get_param_values(regularizable=True)
372 | )
373 |
374 | logger.record_tabular('Epoch', epoch)
375 | logger.record_tabular('Iteration', epoch)
376 | logger.record_tabular('AverageReturn', np.mean(returns))
377 | logger.record_tabular('StdReturn',
378 | np.std(returns))
379 | logger.record_tabular('MaxReturn',
380 | np.max(returns))
381 | logger.record_tabular('MinReturn',
382 | np.min(returns))
383 | if len(self.es_path_returns) > 0:
384 | logger.record_tabular('AverageEsReturn',
385 | np.mean(self.es_path_returns))
386 | logger.record_tabular('StdEsReturn',
387 | np.std(self.es_path_returns))
388 | logger.record_tabular('MaxEsReturn',
389 | np.max(self.es_path_returns))
390 | logger.record_tabular('MinEsReturn',
391 | np.min(self.es_path_returns))
392 | logger.record_tabular('AverageDiscountedReturn',
393 | average_discounted_return)
394 | logger.record_tabular('AverageQLoss', average_q_loss)
395 | logger.record_tabular('AveragePolicySurr', average_policy_surr)
396 | logger.record_tabular('AverageQ', np.mean(all_qs))
397 | logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
398 | logger.record_tabular('AverageY', np.mean(all_ys))
399 | logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
400 | logger.record_tabular('AverageAbsQYDiff',
401 | np.mean(np.abs(all_qs - all_ys)))
402 | logger.record_tabular('AverageAction', average_action)
403 |
404 | logger.record_tabular('PolicyRegParamNorm',
405 | policy_reg_param_norm)
406 | logger.record_tabular('QFunRegParamNorm',
407 | qfun_reg_param_norm)
408 |
409 | self.env.log_diagnostics(paths)
410 | self.policy.log_diagnostics(paths)
411 |
412 | self.qf_loss_averages = []
413 | self.policy_surr_averages = []
414 |
415 | self.q_averages = []
416 | self.y_averages = []
417 | self.es_path_returns = []
418 |
419 | def update_plot(self):
420 | if self.plot:
421 | plotter.update_plot(self.policy, self.max_path_length)
422 |
423 | def get_epoch_snapshot(self, epoch):
424 | return dict(
425 | env=self.env,
426 | epoch=epoch,
427 | qf=self.qf,
428 | policy=self.policy,
429 | target_qf=self.opt_info["target_qf"],
430 | target_policy=self.opt_info["target_policy"],
431 | es=self.es,
432 | )
433 |
--------------------------------------------------------------------------------
/code/mjpro131.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Breakend/RLSSContinuousControlTutorial/19901507fd037b1ba6a37232c537654f1ae70ce4/code/mjpro131.tar.gz
--------------------------------------------------------------------------------
/code/modified_gravity_hopper.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import os.path as osp
4 | import random
5 | import tempfile
6 | import xml.etree.ElementTree as ET
7 |
8 | import gym
9 | import mujoco_py
10 | import numpy as np
11 | from gym import utils
12 | from gym.envs.mujoco import mujoco_env
13 | from gym.envs.mujoco.hopper import HopperEnv
14 |
15 |
16 | class GravityEnv(HopperEnv, utils.EzPickle):
17 | """
18 | Allows the gravity of the Hopper environment to be changed via the `gravity` constructor argument.
19 | """
20 | def __init__(
21 | self,
22 | gravity=-9.81,
23 | *args,
24 | **kwargs):
25 | HopperEnv.__init__(self)
26 | utils.EzPickle.__init__(self)
27 |
28 | # make sure we're using a proper OpenAI gym Mujoco Env
29 | assert isinstance(self, mujoco_env.MujocoEnv)
30 |
31 | self.model.opt.gravity = (mujoco_py.mjtypes.c_double * 3)(*[0., 0., gravity])
32 | self.model._compute_subtree()
33 | self.model.forward()
34 |
--------------------------------------------------------------------------------
/code/plot_results.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import time
3 | from itertools import cycle
4 |
5 | import matplotlib.pyplot as plt
6 | import numpy as np
7 | import pandas as pd
8 | from numpy import genfromtxt
9 | from numpy.random import choice
10 |
11 |
12 | def multiple_plot(average_vals_list, std_dev_list, traj_list, other_labels, env_name, smoothing_window=5, no_show=False, ignore_std=False, limit=None, extra_lines=None):
13 | fig = plt.figure(figsize=(15, 10))
14 | colors = ["k", "red", "blue", "green", "magenta", "cyan", "brown", "purple"]
15 | color_index = 0
16 | ax = plt.subplot() # Defines ax variable by creating an empty plot
17 |
18 | # Set the tick labels font
19 | for label in (ax.get_xticklabels() + ax.get_yticklabels()):
20 | label.set_fontname('Arial')
21 | label.set_fontsize(22)
22 |
23 | index = 0
24 | for average_vals, std_dev, label, trajs in zip(average_vals_list, std_dev_list, other_labels[:len(average_vals_list)], traj_list):
25 | index += 1
26 | rewards_smoothed_1 = pd.Series(average_vals).rolling(smoothing_window, min_periods=smoothing_window).mean()[:limit]
27 | if limit is None:
28 | limit = len(rewards_smoothed_1)
29 | rewards_smoothed_1 = rewards_smoothed_1[:limit]
30 | std_dev = std_dev[:limit]
31 |
32 | fill_color = colors[color_index]#choice(colors, 1)
33 | color_index += 1
34 | cum_rwd_1, = plt.plot(range(len(rewards_smoothed_1)), rewards_smoothed_1, label=label, color=fill_color)
35 | if not ignore_std:
36 | plt.fill_between(range(len(rewards_smoothed_1)), rewards_smoothed_1 + std_dev, rewards_smoothed_1 - std_dev, alpha=0.3, edgecolor=fill_color, facecolor=fill_color)
37 |
38 | if extra_lines:
39 | for lin in extra_lines:
40 | plt.plot(range(len(rewards_smoothed_1)), np.repeat(lin, len(rewards_smoothed_1)), linestyle='-.', color = colors[color_index], linewidth=2.5, label=other_labels[index])
41 | color_index += 1
42 | index += 1
43 |
44 | axis_font = {'fontname':'Arial', 'size':'28'}
45 | #plt.legend(loc='upper left', prop={'size' : 16})
46 | plt.legend(loc='lower right', prop={'size' : 16})
47 | plt.xlabel("Iterations", **axis_font)
48 | plt.ylabel("Average Return", **axis_font)
49 | plt.title("%s Environment"% env_name, **axis_font)
50 |
51 | if no_show:
52 | fig.savefig('%s.png' % env_name, dpi=fig.dpi)
53 | else:
54 | plt.show()
55 |
56 | return fig
57 |
58 |
59 | parser = argparse.ArgumentParser()
60 | parser.add_argument("paths_to_progress_csvs", nargs="+", help="All the csvs")
61 | parser.add_argument("env_name")
62 | parser.add_argument("--save", action="store_true")
63 | parser.add_argument("--ignore_std", action="store_true")
64 | parser.add_argument('--labels', nargs='+', help='List of labels to go along with the paths', required=False)
65 | parser.add_argument('--smoothing_window', default=5, type=int)
66 | parser.add_argument('--limit', default=None, type=int)
67 | parser.add_argument('--extra_lines', nargs="+", type=float)
68 |
69 | args = parser.parse_args()
70 |
71 | avg_rets = []
72 | std_dev_rets = []
73 | trajs = []
74 |
75 | for o in args.paths_to_progress_csvs:
76 | data = pd.read_csv(o)
77 | avg_ret = np.array(data["AverageReturn"])
78 | std_dev_ret = np.array(data["StdReturn"])
79 | if "NumTrajs" in data:
80 | trajs.append(np.cumsum(np.array(data["NumTrajs"])))
81 | else:
82 | trajs.append(np.cumsum(np.array([25]*len(data["AverageReturn"]))))
83 | avg_rets.append(avg_ret)
84 | std_dev_rets.append(std_dev_ret)
85 |
86 | multiple_plot(avg_rets, std_dev_rets, trajs, args.labels, args.env_name, smoothing_window=args.smoothing_window, no_show=args.save, ignore_std=args.ignore_std, limit=args.limit, extra_lines=args.extra_lines)
87 |
--------------------------------------------------------------------------------
/code/run_ddpg.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os.path as osp
3 | import pickle
4 |
5 | import tensorflow as tf
6 |
7 | from ddpg.ddpg import DDPG
8 | from rllab.envs.gym_env import GymEnv
9 | from rllab.envs.normalized_env import normalize
10 | from rllab.exploration_strategies.ou_strategy import OUStrategy
11 | from rllab.misc import ext
12 | from rllab.misc.instrument import run_experiment_lite, stub
13 | from sandbox.rocky.tf.envs.base import TfEnv
14 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import \
15 | DeterministicMLPPolicy
16 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import \
17 | ContinuousMLPQFunction
18 |
19 | parser = argparse.ArgumentParser()
20 | parser.add_argument("env", help="The environment name from OpenAIGym environments")
21 | parser.add_argument("--num_epochs", default=250, type=int)
22 | parser.add_argument("--data_dir", default="./data_ddpg/")
23 | parser.add_argument("--reward_scale", default=1.0, type=float)
24 | parser.add_argument("--use_ec2", action="store_true", help="Use your ec2 instances if configured")
25 | parser.add_argument("--dont_terminate_machine", action="store_false", help="Whether to terminate your spot instance or not. Be careful.")
26 | args = parser.parse_args()
27 |
28 | stub(globals())
29 | ext.set_seed(1)
30 |
31 | gymenv = GymEnv(args.env, force_reset=True, record_video=True, record_log=True)
32 |
33 | env = TfEnv(normalize(gymenv))
34 |
35 | policy = DeterministicMLPPolicy(
36 | env_spec=env.spec,
37 | name="policy",
38 | # A three-layer MLP policy with hidden layers of 100, 50, and 25 units.
39 | hidden_sizes=(100, 50, 25),
40 | hidden_nonlinearity=tf.nn.relu,
41 | )
42 |
43 | es = OUStrategy(env_spec=env.spec)
44 |
45 | qf = ContinuousMLPQFunction(env_spec=env.spec,
46 | hidden_sizes=(100,100),
47 | hidden_nonlinearity=tf.nn.relu,)
48 |
49 | algo = DDPG(
50 | env=env,
51 | policy=policy,
52 | es=es,
53 | qf=qf,
54 | batch_size=64,
55 | max_path_length=env.horizon,
56 | epoch_length=1000,
57 | min_pool_size=10000,
58 | n_epochs=args.num_epochs,
59 | discount=0.99,
60 | scale_reward=args.reward_scale,
61 | qf_learning_rate=1e-3,
62 | policy_learning_rate=1e-4,
63 | plot=False
64 | )
65 |
66 |
67 | run_experiment_lite(
68 | algo.train(),
69 | log_dir=None if args.use_ec2 else args.data_dir,
70 | # Number of parallel workers for sampling
71 | n_parallel=1,
72 | # Only keep the snapshot parameters for the last iteration
73 | snapshot_mode="last",
74 | # Specifies the seed for the experiment. If this is not provided, a random seed
75 | # will be used
76 | exp_prefix="DDPG_" + args.env,
77 | seed=1,
78 | mode="ec2" if args.use_ec2 else "local",
79 | plot=False,
80 | # dry=True,
81 | terminate_machine=args.dont_terminate_machine,
82 | added_project_directories=[osp.abspath(osp.join(osp.dirname(__file__), '.'))]
83 | )
84 |
--------------------------------------------------------------------------------
/code/run_trpo.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os.path as osp
3 | import pickle
4 |
5 | import tensorflow as tf
6 |
7 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
8 | from rllab.envs.gym_env import GymEnv
9 | from rllab.envs.normalized_env import normalize
10 | from rllab.misc import ext
11 | from rllab.misc.instrument import run_experiment_lite, stub
12 | from sandbox.rocky.tf.algos.trpo import TRPO
13 | from sandbox.rocky.tf.envs.base import TfEnv
14 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (ConjugateGradientOptimizer,
15 | FiniteDifferenceHvp)
16 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
17 |
18 | parser = argparse.ArgumentParser()
19 | parser.add_argument("env", help="The environment name from OpenAIGym environments")
20 | parser.add_argument("--num_epochs", default=250, type=int)
21 | parser.add_argument("--data_dir", default="./data_trpo/")
22 | parser.add_argument("--use_ec2", action="store_true", help="Use your ec2 instances if configured")
23 | parser.add_argument("--dont_terminate_machine", action="store_false", help="Whether to terminate your spot instance or not. Be careful.")
24 | args = parser.parse_args()
25 |
26 | stub(globals())
27 | ext.set_seed(1)
28 |
29 | gymenv = GymEnv(args.env, force_reset=True, record_video=True, record_log=True)
30 |
31 | env = TfEnv(normalize(gymenv))
32 |
33 | policy = GaussianMLPPolicy(
34 | name="policy",
35 | env_spec=env.spec,
36 | # A three-layer MLP policy with hidden layers of 100, 50, and 25 units.
37 | hidden_sizes=(100, 50, 25),
38 | hidden_nonlinearity=tf.nn.relu,
39 | )
40 |
41 | baseline = LinearFeatureBaseline(env_spec=env.spec)
42 |
43 | algo = TRPO(
44 | env=env,
45 | policy=policy,
46 | baseline=baseline,
47 | batch_size=5000,
48 | max_path_length=env.horizon,
49 | n_itr=args.num_epochs,
50 | discount=0.99,
51 | step_size=0.01,
52 | optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
53 | )
54 |
55 | run_experiment_lite(
56 | algo.train(),
57 | log_dir=None if args.use_ec2 else args.data_dir,
58 | # Number of parallel workers for sampling
59 | n_parallel=1,
60 | # Only keep the snapshot parameters for the last iteration
61 | snapshot_mode="last",
62 | # Specifies the seed for the experiment. If this is not provided, a random seed
63 | # will be used
64 | exp_prefix="TRPO_" + args.env,
65 | seed=1,
66 | mode="ec2" if args.use_ec2 else "local",
67 | plot=False,
68 | terminate_machine=args.dont_terminate_machine,
69 | added_project_directories=[osp.abspath(osp.join(osp.dirname(__file__), '.'))]
70 | )
71 |
--------------------------------------------------------------------------------
/code/sampling_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import rllab.misc.logger as logger
3 |
4 | class SimpleReplayPool(object):
5 | """
6 | Used from https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/rllab/pool/simple_pool.py
7 | """
8 | def __init__(
9 | self, max_pool_size, observation_dim, action_dim,
10 | replacement_policy='stochastic', replacement_prob=1.0,
11 | max_skip_episode=10):
12 | self._observation_dim = observation_dim
13 | self._action_dim = action_dim
14 | self._max_pool_size = max_pool_size
15 | self._replacement_policy = replacement_policy
16 | self._replacement_prob = replacement_prob
17 | self._max_skip_episode = max_skip_episode
18 | self._observations = np.zeros(
19 | (max_pool_size, observation_dim),
20 | )
21 | self._actions = np.zeros(
22 | (max_pool_size, action_dim),
23 | )
24 | self._rewards = np.zeros(max_pool_size)
25 | self._terminals = np.zeros(max_pool_size, dtype='uint8')
26 | self._initials = np.zeros(max_pool_size, dtype='uint8')
27 | self._bottom = 0
28 | self._top = 0
29 | self._size = 0
30 |
31 | def add_sample(self, observation, action, reward, terminal, initial):
32 | self.check_replacement()
33 | self._observations[self._top] = observation
34 | self._actions[self._top] = action
35 | self._rewards[self._top] = reward
36 | self._terminals[self._top] = terminal
37 | self._initials[self._top] = initial
38 | self.advance()
39 |
40 | def check_replacement(self):
41 | if self._replacement_prob < 1.0:
42 | if self._size < self._max_pool_size or \
43 | not self._initials[self._top]: return
44 | self.advance_until_terminate()
45 |
46 | def get_skip_flag(self):
47 | if self._replacement_policy == 'full': skip = False
48 | elif self._replacement_policy == 'stochastic':
49 | skip = np.random.uniform() > self._replacement_prob
50 | else: raise NotImplementedError
51 | return skip
52 |
53 | def advance_until_terminate(self):
54 | skip = self.get_skip_flag()
55 | n_skips = 0
56 | old_top = self._top
57 | new_top = (old_top + 1) % self._max_pool_size
58 | while skip and old_top != new_top and n_skips < self._max_skip_episode:
59 | n_skips += 1
60 | self.advance()
61 | while not self._initials[self._top]:
62 | self.advance()
63 | skip = self.get_skip_flag()
64 | new_top = self._top
65 | logger.log("add_sample, skipped %d episodes, top=%d->%d"%(
66 | n_skips, old_top, new_top))
67 |
68 | def advance(self):
69 | self._top = (self._top + 1) % self._max_pool_size
70 | if self._size >= self._max_pool_size:
71 | self._bottom = (self._bottom + 1) % self._max_pool_size
72 | else:
73 | self._size += 1
74 |
75 | def random_batch(self, batch_size):
76 | assert self._size > batch_size
77 | indices = np.zeros(batch_size, dtype='uint64')
78 | transition_indices = np.zeros(batch_size, dtype='uint64')
79 | count = 0
80 | while count < batch_size:
81 | index = np.random.randint(self._bottom, self._bottom + self._size) % self._max_pool_size
82 | # make sure that the transition is valid: if we are at the end of the pool, we need to discard
83 | # this sample
84 | if index == self._size - 1 and self._size <= self._max_pool_size:
85 | continue
86 |
87 | transition_index = (index + 1) % self._max_pool_size
88 |
89 | # make sure that the transition is valid: discard the transition if it crosses horizon-triggered resets
90 | if not self._terminals[index] and self._initials[transition_index]:
91 | continue
92 | indices[count] = index
93 | transition_indices[count] = transition_index
94 | count += 1
95 | return dict(
96 | observations=self._observations[indices],
97 | actions=self._actions[indices],
98 | rewards=self._rewards[indices],
99 | terminals=self._terminals[indices],
100 | initials=self._initials[indices],
101 | next_observations=self._observations[transition_indices]
102 | )
103 |
104 | @property
105 | def size(self):
106 | return self._size
107 |
--------------------------------------------------------------------------------
/code/test_manual.py:
--------------------------------------------------------------------------------
1 | import gym, gym.spaces, gym.utils, gym.utils.seeding
2 | import numpy as np
3 | import sys
4 | from gym.envs.mujoco.mujoco_env import MujocoEnv
5 |
6 | # From https://raw.githubusercontent.com/openai/roboschool/master/roboschool/test_manual.py
7 |
8 | #
9 | # Run this file to test environments using manual control:
10 | #
11 | # python test_manual.py Hopper-v1
12 | #
13 |
14 | class TestKeyboardControl:
15 | def __init__(self):
16 | self.keys = {}
17 | self.control = np.zeros(9)
18 | self.human_pause = False
19 | self.human_done = False
20 | def key(self, event_type, key, modifiers):
21 | self.keys[key] = +1 if event_type==6 else 0
22 | #print ("event_type", event_type, "key", key, "modifiers", modifiers)
23 | self.control[0] = self.keys.get(0x1000014, 0) - self.keys.get(0x1000012, 0)
24 | self.control[1] = self.keys.get(0x1000013, 0) - self.keys.get(0x1000015, 0)
25 | self.control[2] = self.keys.get(ord('A'), 0) - self.keys.get(ord('Z'), 0)
26 | self.control[3] = self.keys.get(ord('S'), 0) - self.keys.get(ord('X'), 0)
27 | self.control[4] = self.keys.get(ord('D'), 0) - self.keys.get(ord('C'), 0)
28 | self.control[5] = self.keys.get(ord('F'), 0) - self.keys.get(ord('V'), 0)
29 | self.control[6] = self.keys.get(ord('G'), 0) - self.keys.get(ord('B'), 0)
30 | self.control[7] = self.keys.get(ord('H'), 0) - self.keys.get(ord('N'), 0)
31 | self.control[8] = self.keys.get(ord('J'), 0) - self.keys.get(ord('M'), 0)
32 | if event_type==6 and key==32: # press Space to pause
33 | self.human_pause = 1 - self.human_pause
34 | if event_type==6 and key==0x1000004: # press Enter to restart
35 | self.human_done = True
36 |
37 |
38 | class TestKeyboardControlMuj:
39 | def __init__(self):
40 | self.keys = {}
41 | self.control = np.zeros(9)
42 | self.human_pause = False
43 | self.human_done = False
44 |
45 | def key(self, window, key, scancode, event_type, modifiers):
46 | self.keys[key] = +1 if event_type==1 else 0
47 | # print(key)
48 | #print ("event_type", event_type, "key", key, "modifiers", modifiers)
49 | self.control[0] = self.keys.get(265, 0) - self.keys.get(264, 0)
50 | self.control[1] = self.keys.get(262, 0) - self.keys.get(263, 0)
51 | self.control[2] = self.keys.get(ord('A'), 0) - self.keys.get(ord('Z'), 0)
52 | self.control[3] = self.keys.get(ord('S'), 0) - self.keys.get(ord('X'), 0)
53 | self.control[4] = self.keys.get(ord('D'), 0) - self.keys.get(ord('C'), 0)
54 | self.control[5] = self.keys.get(ord('F'), 0) - self.keys.get(ord('V'), 0)
55 | self.control[6] = self.keys.get(ord('G'), 0) - self.keys.get(ord('B'), 0)
56 | self.control[7] = self.keys.get(ord('H'), 0) - self.keys.get(ord('N'), 0)
57 | self.control[8] = self.keys.get(ord('J'), 0) - self.keys.get(ord('M'), 0)
58 | if event_type==1 and key==32: # press Space to pause
59 | self.human_pause = 1 - self.human_pause
60 | if event_type==1 and key==257: # press Enter to restart
61 | self.human_done = True
62 |
63 |
64 | usage = """
65 | This is a manual test. Usage:
66 | %s
67 |
68 | Keyboard shortcuts:
69 | * F1 toggle slow motion
70 | * F2 toggle captions
71 | * F3 toggle HUD: observations, actions, reward
72 | * ENTER to restart episode (works only in this test)
73 | * SPACE to pause (works only in this test)
74 | * Up/down, left/right, a/z, s/x, d/c, f/v, g/b, h/n, j/m to control robot (works only in this test)
75 | """
76 |
77 | def test(env_id):
78 | print(usage % sys.argv[0])
79 |
80 | env = gym.make(env_id)
81 | # import pdb; pdb.set_trace()
82 | env.reset() # This creates default single player scene
83 | if isinstance(env.unwrapped, MujocoEnv):
84 | ctrl = TestKeyboardControlMuj()
85 | from mujoco_py.glfw import set_key_callback
86 | set_key_callback(env.unwrapped._get_viewer().window, ctrl.key)
87 | else:
88 | raise NotImplementedError
89 |
90 | a = np.zeros(env.action_space.shape)
91 | copy_n = min(len(a), len(ctrl.control))
92 | ctrl.human_pause = False
93 |
94 | while 1:
95 | ctrl.human_done = False
96 | sn = env.reset()
97 | frame = 0
98 | reward = 0.0
99 | episode_over = False
100 | while 1:
101 | s = sn
102 | a[:copy_n] = ctrl.control[:copy_n]
103 | # import pdb; pdb.set_trace()
104 | sn, rplus, done, info = env.step(a)
105 | reward += rplus
106 | #env.render("rgb_array")
107 | episode_over |= done
108 | still_visible = True
109 | # import pdb; pdb.set_trace()
110 | while True:
111 | env.render("human")
112 | #env.unwrapped.camera.test_window()
113 | if not ctrl.human_pause: break
114 | if ctrl.human_done: break
115 | if not still_visible: break
116 | frame += 1
117 | if not still_visible: break
118 |
119 | if __name__=="__main__":
120 | env_id = "RoboschoolHumanoid-v0" if len(sys.argv) <= 1 else sys.argv[1]
121 | test(env_id)
122 |
--------------------------------------------------------------------------------
/code/test_modified_hopper_env_manually.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from test_manual import test
3 |
4 | arg_dict = dict(id="HopperHalfGravity-v0",
5 | entry_point="modified_gravity_hopper:GravityEnv",
6 | max_episode_steps=1000,
7 | kwargs={"gravity" : -1.0})
8 |
9 | gym.envs.register(**arg_dict)
10 |
11 | test("HopperHalfGravity-v0")
12 |
--------------------------------------------------------------------------------
/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Breakend/RLSSContinuousControlTutorial/19901507fd037b1ba6a37232c537654f1ae70ce4/slides.pdf
--------------------------------------------------------------------------------