├── .gitignore
├── README.md
├── code
│   ├── ddpg
│   │   ├── __init__.py
│   │   └── ddpg.py
│   ├── mjpro131.tar.gz
│   ├── modified_gravity_hopper.py
│   ├── plot_results.py
│   ├── run_ddpg.py
│   ├── run_trpo.py
│   ├── sampling_utils.py
│   ├── test_manual.py
│   └── test_modified_hopper_env_manually.py
└── slides.pdf

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | 
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 | 
61 | # Scrapy stuff:
62 | .scrapy
63 | 
64 | # Sphinx documentation
65 | docs/_build/
66 | 
67 | # PyBuilder
68 | target/
69 | 
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 | 
73 | # pyenv
74 | .python-version
75 | 
76 | # celery beat schedule file
77 | celerybeat-schedule
78 | 
79 | # SageMath parsed files
80 | *.sage.py
81 | 
82 | # dotenv
83 | .env
84 | 
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 | 
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 | 
94 | # Rope project settings
95 | .ropeproject
96 | 
97 | # mkdocs documentation
98 | /site
99 | 
100 | # mypy
101 | .mypy_cache/
102 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning Summer School: Practical Tutorial on RL for Continuous Control
2 | 
3 | Here we go over:
4 | 
5 | + How to set up MuJoCo and openai/rllab
6 | + How to run basic TRPO and DDPG code
7 | + The core code snippets in TRPO and DDPG so you can build on top of these algorithms
8 | + How to create your own modified MuJoCo environment (multi-task modifications can be contributed to gym-extensions via pull request)
9 | 
10 | 
11 | ## How to run examples
12 | 
13 | ### Run TRPO
14 | 
15 | ```bash
16 | cd code; source activate rllab3; python run_trpo.py Hopper-v1
17 | ```
18 | 
19 | ### Run DDPG
20 | 
21 | ```bash
22 | cd code; source activate rllab3; python run_ddpg.py Hopper-v1
23 | ```
24 | 
25 | ### Plotting Results
26 | 
27 | ```bash
28 | cd code; python plot_results.py data/progress.csv Hopper-v1 --labels "trpo"
29 | ```
30 | 
31 | 
32 | ### Manual testing of an env and a custom env
33 | ```bash
34 | cd code; python test_manual.py Hopper-v1
35 | ```
36 | 
37 | ```bash
38 | cd code; python test_modified_hopper_env_manually.py
39 | ```
40 | 
--------------------------------------------------------------------------------
/code/ddpg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Breakend/RLSSContinuousControlTutorial/19901507fd037b1ba6a37232c537654f1ae70ce4/code/ddpg/__init__.py
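Before reading `code/ddpg/ddpg.py` below, it can help to see the two updates it performs stripped of the rllab plumbing. The sketch that follows is not the rllab implementation: `ddpg_targets`, `soft_update`, and the random linear stand-in networks are names and shapes invented for this note. It only illustrates the critic regression target y = r + gamma * (1 - done) * Q'(s', mu'(s')) and the Polyak soft target update theta_target <- tau * theta + (1 - tau) * theta_target that `do_training` implements.

```python
import numpy as np

# Minimal NumPy sketch of the two DDPG updates in do_training() (illustrative only).

def ddpg_targets(rewards, terminals, next_obs, target_policy, target_qf, discount=0.99):
    """Critic targets: y = r + gamma * (1 - done) * Q'(s', mu'(s'))."""
    next_actions = target_policy(next_obs)
    next_q = target_qf(next_obs, next_actions)
    return rewards + (1.0 - terminals) * discount * next_q

def soft_update(target_params, source_params, tau=0.001):
    """Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target."""
    return target_params * (1.0 - tau) + source_params * tau

# Toy usage with random linear "networks" standing in for the target policy / Q-function.
rng = np.random.RandomState(0)
obs_dim, act_dim, batch = 11, 3, 32
W_pi = rng.randn(obs_dim, act_dim)              # stand-in target-policy weights
W_q = rng.randn(obs_dim + act_dim)              # stand-in target-critic weights
target_policy = lambda o: np.tanh(o @ W_pi)
target_qf = lambda o, a: np.concatenate([o, a], axis=1) @ W_q

next_obs = rng.randn(batch, obs_dim)
rewards = rng.randn(batch)
terminals = rng.binomial(1, 0.05, size=batch).astype(np.float64)

ys = ddpg_targets(rewards, terminals, next_obs, target_policy, target_qf)  # shape (32,)
W_q = soft_update(W_q, rng.randn(*W_q.shape))   # in ddpg.py this is done on the flat parameter vectors
```

In `ddpg.py` itself these two steps appear in `do_training` as `ys = rewards + (1. - terminals) * self.discount * next_qvals.reshape(-1)` and the `set_param_values(target * (1 - tau) + source * tau)` calls for the target Q-function and target policy.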
-------------------------------------------------------------------------------- /code/ddpg/ddpg.py: -------------------------------------------------------------------------------- 1 | # MODIFIED FROM: https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/sandbox/rocky/tf/algos/ddpg.py 2 | import gc 3 | import time 4 | 5 | #import pickle as pickle 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | import pyprind 10 | import rllab.misc.logger as logger 11 | from rllab.algos.base import RLAlgorithm 12 | from rllab.core.serializable import Serializable 13 | from rllab.misc import ext, special 14 | from rllab.misc.overrides import overrides 15 | from rllab.plotter import plotter 16 | from rllab.sampler import parallel_sampler 17 | from sampling_utils import SimpleReplayPool 18 | from sandbox.rocky.tf.misc import tensor_utils 19 | from sandbox.rocky.tf.optimizers.first_order_optimizer import \ 20 | FirstOrderOptimizer 21 | 22 | 23 | class DDPG(RLAlgorithm): 24 | """ 25 | Deep Deterministic Policy Gradient. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | env, 31 | policy, 32 | qf, 33 | es, 34 | batch_size=32, 35 | n_epochs=200, 36 | epoch_length=1000, 37 | min_pool_size=10000, 38 | replay_pool_size=1000000, 39 | replacement_prob=1.0, 40 | discount=0.99, 41 | max_path_length=250, 42 | qf_weight_decay=0., 43 | qf_update_method='adam', 44 | qf_learning_rate=1e-3, 45 | policy_weight_decay=0, 46 | policy_update_method='adam', 47 | policy_learning_rate=1e-3, 48 | policy_updates_ratio=1.0, 49 | eval_samples=10000, 50 | soft_target=True, 51 | soft_target_tau=0.001, 52 | n_updates_per_sample=1, 53 | scale_reward=1.0, 54 | include_horizon_terminal_transitions=False, 55 | plot=False, 56 | pause_for_plot=False, 57 | **kwargs): 58 | """ 59 | :param env: Environment 60 | :param policy: Policy 61 | :param qf: Q function 62 | :param es: Exploration strategy 63 | :param batch_size: Number of samples for each minibatch. 64 | :param n_epochs: Number of epochs. Policy will be evaluated after each epoch. 65 | :param epoch_length: How many timesteps for each epoch. 66 | :param min_pool_size: Minimum size of the pool to start training. 67 | :param replay_pool_size: Size of the experience replay pool. 68 | :param discount: Discount factor for the cumulative return. 69 | :param max_path_length: Discount factor for the cumulative return. 70 | :param qf_weight_decay: Weight decay factor for parameters of the Q function. 71 | :param qf_update_method: Online optimization method for training Q function. 72 | :param qf_learning_rate: Learning rate for training Q function. 73 | :param policy_weight_decay: Weight decay factor for parameters of the policy. 74 | :param policy_update_method: Online optimization method for training the policy. 75 | :param policy_learning_rate: Learning rate for training the policy. 76 | :param eval_samples: Number of samples (timesteps) for evaluating the policy. 77 | :param soft_target_tau: Interpolation parameter for doing the soft target update. 78 | :param n_updates_per_sample: Number of Q function and policy updates per new sample obtained 79 | :param scale_reward: The scaling factor applied to the rewards when training 80 | :param include_horizon_terminal_transitions: whether to include transitions with terminal=True because the 81 | horizon was reached. This might make the Q value back up less stable for certain tasks. 82 | :param plot: Whether to visualize the policy performance after each eval_interval. 
83 | :param pause_for_plot: Whether to pause before continuing when plotting. 84 | :return: 85 | """ 86 | self.env = env 87 | self.policy = policy 88 | self.qf = qf 89 | self.es = es 90 | self.batch_size = batch_size 91 | self.n_epochs = n_epochs 92 | self.epoch_length = epoch_length 93 | self.min_pool_size = min_pool_size 94 | self.replay_pool_size = replay_pool_size 95 | self.replacement_prob = replacement_prob 96 | self.discount = discount 97 | self.max_path_length = max_path_length 98 | self.qf_weight_decay = qf_weight_decay 99 | self.qf_update_method = \ 100 | FirstOrderOptimizer( 101 | update_method=qf_update_method, 102 | learning_rate=qf_learning_rate, 103 | ) 104 | self.qf_learning_rate = qf_learning_rate 105 | self.policy_weight_decay = policy_weight_decay 106 | self.policy_update_method = \ 107 | FirstOrderOptimizer( 108 | update_method=policy_update_method, 109 | learning_rate=policy_learning_rate, 110 | ) 111 | self.policy_learning_rate = policy_learning_rate 112 | self.policy_updates_ratio = policy_updates_ratio 113 | self.eval_samples = eval_samples 114 | self.soft_target_tau = soft_target_tau 115 | self.n_updates_per_sample = n_updates_per_sample 116 | self.include_horizon_terminal_transitions = include_horizon_terminal_transitions 117 | self.plot = plot 118 | self.pause_for_plot = pause_for_plot 119 | 120 | self.qf_loss_averages = [] 121 | self.policy_surr_averages = [] 122 | self.q_averages = [] 123 | self.y_averages = [] 124 | self.paths = [] 125 | self.es_path_returns = [] 126 | self.paths_samples_cnt = 0 127 | 128 | self.scale_reward = scale_reward 129 | 130 | self.train_policy_itr = 0 131 | 132 | self.opt_info = None 133 | 134 | def start_worker(self): 135 | parallel_sampler.populate_task(self.env, self.policy) 136 | if self.plot: 137 | plotter.init_plot(self.env, self.policy) 138 | 139 | @overrides 140 | def train(self): 141 | gc_dump_time = time.time() 142 | with tf.Session() as sess: 143 | sess.run(tf.global_variables_initializer()) 144 | # This seems like a rather sequential method 145 | pool = SimpleReplayPool( 146 | max_pool_size=self.replay_pool_size, 147 | observation_dim=self.env.observation_space.flat_dim, 148 | action_dim=self.env.action_space.flat_dim, 149 | replacement_prob=self.replacement_prob, 150 | ) 151 | self.start_worker() 152 | 153 | self.init_opt() 154 | # This initializes the optimizer parameters 155 | sess.run(tf.global_variables_initializer()) 156 | itr = 0 157 | path_length = 0 158 | path_return = 0 159 | terminal = False 160 | initial = False 161 | observation = self.env.reset() 162 | 163 | with tf.variable_scope("sample_policy"): 164 | sample_policy = Serializable.clone(self.policy) 165 | 166 | for epoch in range(self.n_epochs): 167 | logger.push_prefix('epoch #%d | ' % epoch) 168 | logger.log("Training started") 169 | train_qf_itr, train_policy_itr = 0, 0 170 | for epoch_itr in pyprind.prog_bar(range(self.epoch_length)): 171 | # Execute policy 172 | if terminal: 173 | # Note that if the last time step ends an episode, the very 174 | # last state and observation will be ignored and not added 175 | # to the replay pool 176 | observation = self.env.reset() 177 | sample_policy.reset() 178 | self.es_path_returns.append(path_return) 179 | path_length = 0 180 | path_return = 0 181 | initial = True 182 | else: 183 | initial = False 184 | action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf) 185 | 186 | next_observation, reward, terminal, _ = self.env.step(action) 187 | path_length += 1 188 | path_return += reward 189 | 190 | 
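# If the rollout reaches max_path_length without the env signalling termination, it is forced terminal below; that final transition is stored only when include_horizon_terminal_transitions is set, since bootstrapping through an artificial horizon cut-off can make the Q-value backup less stable (see the class docstring).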
if not terminal and path_length >= self.max_path_length: 191 | terminal = True 192 | # only include the terminal transition in this case if the flag was set 193 | if self.include_horizon_terminal_transitions: 194 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 195 | else: 196 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 197 | 198 | observation = next_observation 199 | 200 | if pool.size >= self.min_pool_size: 201 | for update_itr in range(self.n_updates_per_sample): 202 | # Train policy 203 | batch = pool.random_batch(self.batch_size) 204 | itrs = self.do_training(itr, batch) 205 | train_qf_itr += itrs[0] 206 | train_policy_itr += itrs[1] 207 | sample_policy.set_param_values(self.policy.get_param_values()) 208 | 209 | itr += 1 210 | if time.time() - gc_dump_time > 100: 211 | gc.collect() 212 | gc_dump_time = time.time() 213 | 214 | logger.log("Training finished") 215 | logger.log("Trained qf %d steps, policy %d steps"%(train_qf_itr, train_policy_itr)) 216 | if pool.size >= self.min_pool_size: 217 | self.evaluate(epoch, pool) 218 | params = self.get_epoch_snapshot(epoch) 219 | logger.save_itr_params(epoch, params) 220 | logger.dump_tabular(with_prefix=False) 221 | logger.pop_prefix() 222 | if self.plot: 223 | self.update_plot() 224 | if self.pause_for_plot: 225 | input("Plotting evaluation run: Press Enter to " 226 | "continue...") 227 | self.env.terminate() 228 | self.policy.terminate() 229 | 230 | def init_opt(self): 231 | 232 | # First, create "target" policy and Q functions 233 | with tf.variable_scope("target_policy"): 234 | target_policy = Serializable.clone(self.policy) 235 | with tf.variable_scope("target_qf"): 236 | target_qf = Serializable.clone(self.qf) 237 | 238 | # y need to be computed first 239 | obs = self.env.observation_space.new_tensor_variable( 240 | 'obs', 241 | extra_dims=1, 242 | ) 243 | 244 | # The yi values are computed separately as above and then passed to 245 | # the training functions below 246 | action = self.env.action_space.new_tensor_variable( 247 | 'action', 248 | extra_dims=1, 249 | ) 250 | 251 | yvar = tensor_utils.new_tensor( 252 | 'ys', 253 | ndim=1, 254 | dtype=tf.float32, 255 | ) 256 | 257 | qf_weight_decay_term = 0.5 * self.qf_weight_decay * \ 258 | sum([tf.reduce_sum(tf.square(param)) for param in 259 | self.qf.get_params(regularizable=True)]) 260 | 261 | qval = self.qf.get_qval_sym(obs, action) 262 | 263 | qf_loss = tf.reduce_mean(tf.square(yvar - qval)) 264 | qf_reg_loss = qf_loss + qf_weight_decay_term 265 | 266 | policy_weight_decay_term = 0.5 * self.policy_weight_decay * \ 267 | sum([tf.reduce_sum(tf.square(param)) 268 | for param in self.policy.get_params(regularizable=True)]) 269 | policy_qval = self.qf.get_qval_sym( 270 | obs, self.policy.get_action_sym(obs), 271 | deterministic=True 272 | ) 273 | policy_surr = -tf.reduce_mean(policy_qval) 274 | 275 | policy_reg_surr = policy_surr + policy_weight_decay_term 276 | 277 | qf_input_list = [yvar, obs, action] 278 | policy_input_list = [obs] 279 | 280 | self.qf_update_method.update_opt( 281 | loss=qf_reg_loss, target=self.qf, inputs=qf_input_list) 282 | self.policy_update_method.update_opt( 283 | loss=policy_reg_surr, target=self.policy, inputs=policy_input_list) 284 | 285 | f_train_qf = tensor_utils.compile_function( 286 | inputs=qf_input_list, 287 | outputs=[qf_loss, qval, self.qf_update_method._train_op], 288 | ) 289 | 290 | f_train_policy = tensor_utils.compile_function( 291 | inputs=policy_input_list, 292 | 
outputs=[policy_surr, self.policy_update_method._train_op], 293 | ) 294 | 295 | self.opt_info = dict( 296 | f_train_qf=f_train_qf, 297 | f_train_policy=f_train_policy, 298 | target_qf=target_qf, 299 | target_policy=target_policy, 300 | ) 301 | 302 | def do_training(self, itr, batch): 303 | 304 | obs, actions, rewards, next_obs, terminals = ext.extract( 305 | batch, 306 | "observations", "actions", "rewards", "next_observations", 307 | "terminals" 308 | ) 309 | 310 | # compute the on-policy y values 311 | target_qf = self.opt_info["target_qf"] 312 | target_policy = self.opt_info["target_policy"] 313 | 314 | next_actions, _ = target_policy.get_actions(next_obs) 315 | next_qvals = target_qf.get_qval(next_obs, next_actions) 316 | 317 | ys = rewards + (1. - terminals) * self.discount * next_qvals.reshape(-1) 318 | 319 | f_train_qf = self.opt_info["f_train_qf"] 320 | qf_loss, qval, _ = f_train_qf(ys, obs, actions) 321 | target_qf.set_param_values( 322 | target_qf.get_param_values() * (1.0 - self.soft_target_tau) + 323 | self.qf.get_param_values() * self.soft_target_tau) 324 | self.qf_loss_averages.append(qf_loss) 325 | self.q_averages.append(qval) 326 | self.y_averages.append(ys) 327 | 328 | self.train_policy_itr += self.policy_updates_ratio 329 | train_policy_itr = 0 330 | while self.train_policy_itr > 0: 331 | f_train_policy = self.opt_info["f_train_policy"] 332 | policy_surr, _ = f_train_policy(obs) 333 | target_policy.set_param_values( 334 | target_policy.get_param_values() * (1.0 - self.soft_target_tau) + 335 | self.policy.get_param_values() * self.soft_target_tau) 336 | self.policy_surr_averages.append(policy_surr) 337 | self.train_policy_itr -= 1 338 | train_policy_itr += 1 339 | return 1, train_policy_itr # number of itrs qf, policy are trained 340 | 341 | def evaluate(self, epoch, pool): 342 | logger.log("Collecting samples for evaluation") 343 | 344 | paths = parallel_sampler.sample_paths( 345 | policy_params=self.policy.get_param_values(), 346 | max_samples=self.eval_samples, 347 | max_path_length=self.max_path_length, 348 | ) 349 | 350 | self.env.reset() 351 | 352 | average_discounted_return = np.mean( 353 | [special.discount_return(path["rewards"], self.discount) for path in paths] 354 | ) 355 | 356 | returns = [sum(path["rewards"]) for path in paths] 357 | 358 | all_qs = np.concatenate(self.q_averages) 359 | all_ys = np.concatenate(self.y_averages) 360 | 361 | average_q_loss = np.mean(self.qf_loss_averages) 362 | average_policy_surr = np.mean(self.policy_surr_averages) 363 | average_action = np.mean(np.square(np.concatenate( 364 | [path["actions"] for path in paths] 365 | ))) 366 | 367 | policy_reg_param_norm = np.linalg.norm( 368 | self.policy.get_param_values(regularizable=True) 369 | ) 370 | qfun_reg_param_norm = np.linalg.norm( 371 | self.qf.get_param_values(regularizable=True) 372 | ) 373 | 374 | logger.record_tabular('Epoch', epoch) 375 | logger.record_tabular('Iteration', epoch) 376 | logger.record_tabular('AverageReturn', np.mean(returns)) 377 | logger.record_tabular('StdReturn', 378 | np.std(returns)) 379 | logger.record_tabular('MaxReturn', 380 | np.max(returns)) 381 | logger.record_tabular('MinReturn', 382 | np.min(returns)) 383 | if len(self.es_path_returns) > 0: 384 | logger.record_tabular('AverageEsReturn', 385 | np.mean(self.es_path_returns)) 386 | logger.record_tabular('StdEsReturn', 387 | np.std(self.es_path_returns)) 388 | logger.record_tabular('MaxEsReturn', 389 | np.max(self.es_path_returns)) 390 | logger.record_tabular('MinEsReturn', 391 | 
np.min(self.es_path_returns)) 392 | logger.record_tabular('AverageDiscountedReturn', 393 | average_discounted_return) 394 | logger.record_tabular('AverageQLoss', average_q_loss) 395 | logger.record_tabular('AveragePolicySurr', average_policy_surr) 396 | logger.record_tabular('AverageQ', np.mean(all_qs)) 397 | logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) 398 | logger.record_tabular('AverageY', np.mean(all_ys)) 399 | logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) 400 | logger.record_tabular('AverageAbsQYDiff', 401 | np.mean(np.abs(all_qs - all_ys))) 402 | logger.record_tabular('AverageAction', average_action) 403 | 404 | logger.record_tabular('PolicyRegParamNorm', 405 | policy_reg_param_norm) 406 | logger.record_tabular('QFunRegParamNorm', 407 | qfun_reg_param_norm) 408 | 409 | self.env.log_diagnostics(paths) 410 | self.policy.log_diagnostics(paths) 411 | 412 | self.qf_loss_averages = [] 413 | self.policy_surr_averages = [] 414 | 415 | self.q_averages = [] 416 | self.y_averages = [] 417 | self.es_path_returns = [] 418 | 419 | def update_plot(self): 420 | if self.plot: 421 | plotter.update_plot(self.policy, self.max_path_length) 422 | 423 | def get_epoch_snapshot(self, epoch): 424 | return dict( 425 | env=self.env, 426 | epoch=epoch, 427 | qf=self.qf, 428 | policy=self.policy, 429 | target_qf=self.opt_info["target_qf"], 430 | target_policy=self.opt_info["target_policy"], 431 | es=self.es, 432 | ) 433 | -------------------------------------------------------------------------------- /code/mjpro131.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Breakend/RLSSContinuousControlTutorial/19901507fd037b1ba6a37232c537654f1ae70ce4/code/mjpro131.tar.gz -------------------------------------------------------------------------------- /code/modified_gravity_hopper.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import os.path as osp 4 | import random 5 | import tempfile 6 | import xml.etree.ElementTree as ET 7 | 8 | import gym 9 | import mujoco_py 10 | import numpy as np 11 | from gym import utils 12 | from gym.envs.mujoco import mujoco_env 13 | from gym.envs.mujoco.hopper import HopperEnv 14 | 15 | 16 | class GravityEnv(HopperEnv, utils.EzPickle): 17 | """ 18 | Allows the gravity to be changed by the 19 | """ 20 | def __init__( 21 | self, 22 | gravity=-9.81, 23 | *args, 24 | **kwargs): 25 | HopperEnv.__init__(self) 26 | utils.EzPickle.__init__(self) 27 | 28 | # make sure we're using a proper OpenAI gym Mujoco Env 29 | assert isinstance(self, mujoco_env.MujocoEnv) 30 | 31 | self.model.opt.gravity = (mujoco_py.mjtypes.c_double * 3)(*[0., 0., gravity]) 32 | self.model._compute_subtree() 33 | self.model.forward() 34 | -------------------------------------------------------------------------------- /code/plot_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | from itertools import cycle 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import pandas as pd 8 | from numpy import genfromtxt 9 | from numpy.random import choice 10 | 11 | 12 | def multiple_plot(average_vals_list, std_dev_list, traj_list, other_labels, env_name, smoothing_window=5, no_show=False, ignore_std=False, limit=None, extra_lines=None): 13 | fig = plt.figure(figsize=(15, 10)) 14 | colors = ["k", "red", "blue", "green", "magenta", "cyan", "brown", "purple"] 15 | 
color_index = 0 16 | ax = plt.subplot() # Defines ax variable by creating an empty plot 17 | 18 | # Set the tick labels font 19 | for label in (ax.get_xticklabels() + ax.get_yticklabels()): 20 | label.set_fontname('Arial') 21 | label.set_fontsize(22) 22 | 23 | index = 0 24 | for average_vals, std_dev, label, trajs in zip(average_vals_list, std_dev_list, other_labels[:len(average_vals_list)], traj_list): 25 | index += 1 26 | rewards_smoothed_1 = pd.Series(average_vals).rolling(smoothing_window, min_periods=smoothing_window).mean()[:limit] 27 | if limit is None: 28 | limit = len(rewards_smoothed_1) 29 | rewards_smoothed_1 = rewards_smoothed_1[:limit] 30 | std_dev = std_dev[:limit] 31 | 32 | fill_color = colors[color_index]#choice(colors, 1) 33 | color_index += 1 34 | cum_rwd_1, = plt.plot(range(len(rewards_smoothed_1)), rewards_smoothed_1, label=label, color=fill_color[0]) 35 | if not ignore_std: 36 | plt.fill_between(range(len(rewards_smoothed_1)), rewards_smoothed_1 + std_dev, rewards_smoothed_1 - std_dev, alpha=0.3, edgecolor=fill_color, facecolor=fill_color) 37 | 38 | if extra_lines: 39 | for lin in extra_lines: 40 | plt.plot(range(len(rewards_smoothed_1)), np.repeat(lin, len(rewards_smoothed_1)), linestyle='-.', color = colors[color_index], linewidth=2.5, label=other_labels[index]) 41 | color_index += 1 42 | index += 1 43 | 44 | axis_font = {'fontname':'Arial', 'size':'28'} 45 | #plt.legend(loc='upper left', prop={'size' : 16}) 46 | plt.legend(loc='lower right', prop={'size' : 16}) 47 | plt.xlabel("Iterations", **axis_font) 48 | plt.ylabel("Average Return", **axis_font) 49 | plt.title("%s Environment"% env_name, **axis_font) 50 | 51 | if no_show: 52 | fig.savefig('%s.png' % env_name, dpi=fig.dpi) 53 | else: 54 | plt.show() 55 | 56 | return fig 57 | 58 | 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("paths_to_progress_csvs", nargs="+", help="All the csvs") 61 | parser.add_argument("env_name") 62 | parser.add_argument("--save", action="store_true") 63 | parser.add_argument("--ignore_std", action="store_true") 64 | parser.add_argument('--labels', nargs='+', help='List of labels to go along with the paths', required=False) 65 | parser.add_argument('--smoothing_window', default=5, type=int) 66 | parser.add_argument('--limit', default=None, type=int) 67 | parser.add_argument('--extra_lines', nargs="+", type=float) 68 | 69 | args = parser.parse_args() 70 | 71 | avg_rets = [] 72 | std_dev_rets = [] 73 | trajs = [] 74 | 75 | for o in args.paths_to_progress_csvs: 76 | data = pd.read_csv(o) 77 | avg_ret = np.array(data["AverageReturn"]) 78 | std_dev_ret = np.array(data["StdReturn"]) 79 | if "NumTrajs" in data: 80 | trajs.append(np.cumsum(np.array(data["NumTrajs"]))) 81 | else: 82 | trajs.append(np.cumsum(np.array([25]*len(data["AverageReturn"])))) 83 | avg_rets.append(avg_ret) 84 | std_dev_rets.append(std_dev_ret) 85 | 86 | multiple_plot(avg_rets, std_dev_rets, trajs, args.labels, args.env_name, smoothing_window=args.smoothing_window, no_show=args.save, ignore_std=args.ignore_std, limit=args.limit, extra_lines=args.extra_lines) 87 | -------------------------------------------------------------------------------- /code/run_ddpg.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path as osp 3 | import pickle 4 | 5 | import tensorflow as tf 6 | 7 | from ddpg.ddpg import DDPG 8 | from rllab.envs.gym_env import GymEnv 9 | from rllab.envs.normalized_env import normalize 10 | from 
rllab.exploration_strategies.ou_strategy import OUStrategy 11 | from rllab.misc import ext 12 | from rllab.misc.instrument import run_experiment_lite, stub 13 | from sandbox.rocky.tf.envs.base import TfEnv 14 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import \ 15 | DeterministicMLPPolicy 16 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import \ 17 | ContinuousMLPQFunction 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 21 | parser.add_argument("--num_epochs", default=250, type=int) 22 | parser.add_argument("--data_dir", default="./data_ddpg/") 23 | parser.add_argument("--reward_scale", default=1.0, type=float) 24 | parser.add_argument("--use_ec2", action="store_true", help="Use your ec2 instances if configured") 25 | parser.add_argument("--dont_terminate_machine", action="store_false", help="Whether to terminate your spot instance or not. Be careful.") 26 | args = parser.parse_args() 27 | 28 | stub(globals()) 29 | ext.set_seed(1) 30 | 31 | gymenv = GymEnv(args.env, force_reset=True, record_video=True, record_log=True) 32 | 33 | env = TfEnv(normalize(gymenv)) 34 | 35 | policy = DeterministicMLPPolicy( 36 | env_spec=env.spec, 37 | name="policy", 38 | # The neural network policy should have two hidden layers, each with 32 hidden units. 39 | hidden_sizes=(100, 50, 25), 40 | hidden_nonlinearity=tf.nn.relu, 41 | ) 42 | 43 | es = OUStrategy(env_spec=env.spec) 44 | 45 | qf = ContinuousMLPQFunction(env_spec=env.spec, 46 | hidden_sizes=(100,100), 47 | hidden_nonlinearity=tf.nn.relu,) 48 | 49 | algo = DDPG( 50 | env=env, 51 | policy=policy, 52 | es=es, 53 | qf=qf, 54 | batch_size=64, 55 | max_path_length=env.horizon, 56 | epoch_length=1000, 57 | min_pool_size=10000, 58 | n_epochs=args.num_epochs, 59 | discount=0.99, 60 | scale_reward=args.reward_scale, 61 | qf_learning_rate=1e-3, 62 | policy_learning_rate=1e-4, 63 | plot=False 64 | ) 65 | 66 | 67 | run_experiment_lite( 68 | algo.train(), 69 | log_dir=None if args.use_ec2 else args.data_dir, 70 | # Number of parallel workers for sampling 71 | n_parallel=1, 72 | # Only keep the snapshot parameters for the last iteration 73 | snapshot_mode="last", 74 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 75 | # will be used 76 | exp_prefix="DDPG_" + args.env, 77 | seed=1, 78 | mode="ec2" if args.use_ec2 else "local", 79 | plot=False, 80 | # dry=True, 81 | terminate_machine=args.dont_terminate_machine, 82 | added_project_directories=[osp.abspath(osp.join(osp.dirname(__file__), '.'))] 83 | ) 84 | -------------------------------------------------------------------------------- /code/run_trpo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os.path as osp 3 | import pickle 4 | 5 | import tensorflow as tf 6 | 7 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 8 | from rllab.envs.gym_env import GymEnv 9 | from rllab.envs.normalized_env import normalize 10 | from rllab.misc import ext 11 | from rllab.misc.instrument import run_experiment_lite, stub 12 | from sandbox.rocky.tf.algos.trpo import TRPO 13 | from sandbox.rocky.tf.envs.base import TfEnv 14 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (ConjugateGradientOptimizer, 15 | FiniteDifferenceHvp) 16 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 20 | parser.add_argument("--num_epochs", default=250, type=int) 21 | parser.add_argument("--data_dir", default="./data_trpo/") 22 | parser.add_argument("--use_ec2", action="store_true", help="Use your ec2 instances if configured") 23 | parser.add_argument("--dont_terminate_machine", action="store_false", help="Whether to terminate your spot instance or not. Be careful.") 24 | args = parser.parse_args() 25 | 26 | stub(globals()) 27 | ext.set_seed(1) 28 | 29 | gymenv = GymEnv(args.env, force_reset=True, record_video=True, record_log=True) 30 | 31 | env = TfEnv(normalize(gymenv)) 32 | 33 | policy = GaussianMLPPolicy( 34 | name="policy", 35 | env_spec=env.spec, 36 | # The neural network policy should have two hidden layers, each with 32 hidden units. 37 | hidden_sizes=(100, 50, 25), 38 | hidden_nonlinearity=tf.nn.relu, 39 | ) 40 | 41 | baseline = LinearFeatureBaseline(env_spec=env.spec) 42 | 43 | algo = TRPO( 44 | env=env, 45 | policy=policy, 46 | baseline=baseline, 47 | batch_size=5000, 48 | max_path_length=env.horizon, 49 | n_itr=args.num_epochs, 50 | discount=0.99, 51 | step_size=0.01, 52 | optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) 53 | ) 54 | 55 | run_experiment_lite( 56 | algo.train(), 57 | log_dir=None if args.use_ec2 else args.data_dir, 58 | # Number of parallel workers for sampling 59 | n_parallel=1, 60 | # Only keep the snapshot parameters for the last iteration 61 | snapshot_mode="last", 62 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 63 | # will be used 64 | exp_prefix="TRPO_" + args.env, 65 | seed=1, 66 | mode="ec2" if args.use_ec2 else "local", 67 | plot=False, 68 | terminate_machine=args.dont_terminate_machine, 69 | added_project_directories=[osp.abspath(osp.join(osp.dirname(__file__), '.'))] 70 | ) 71 | -------------------------------------------------------------------------------- /code/sampling_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import rllab.misc.logger as logger 3 | 4 | class SimpleReplayPool(object): 5 | """ 6 | Used from https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/rllab/pool/simple_pool.py 7 | """ 8 | def __init__( 9 | self, max_pool_size, observation_dim, action_dim, 10 | replacement_policy='stochastic', replacement_prob=1.0, 11 | max_skip_episode=10): 12 | self._observation_dim = observation_dim 13 | self._action_dim = action_dim 14 | self._max_pool_size = max_pool_size 15 | self._replacement_policy = replacement_policy 16 | self._replacement_prob = replacement_prob 17 | self._max_skip_episode = max_skip_episode 18 | self._observations = np.zeros( 19 | (max_pool_size, observation_dim), 20 | ) 21 | self._actions = np.zeros( 22 | (max_pool_size, action_dim), 23 | ) 24 | self._rewards = np.zeros(max_pool_size) 25 | self._terminals = np.zeros(max_pool_size, dtype='uint8') 26 | self._initials = np.zeros(max_pool_size, dtype='uint8') 27 | self._bottom = 0 28 | self._top = 0 29 | self._size = 0 30 | 31 | def add_sample(self, observation, action, reward, terminal, initial): 32 | self.check_replacement() 33 | self._observations[self._top] = observation 34 | self._actions[self._top] = action 35 | self._rewards[self._top] = reward 36 | self._terminals[self._top] = terminal 37 | self._initials[self._top] = initial 38 | self.advance() 39 | 40 | def check_replacement(self): 41 | if self._replacement_prob < 1.0: 42 | if self._size < self._max_pool_size or \ 43 | not self._initials[self._top]: return 44 | self.advance_until_terminate() 45 | 46 | def get_skip_flag(self): 47 | if self._replacement_policy == 'full': skip = False 48 | elif self._replacement_policy == 'stochastic': 49 | skip = np.random.uniform() > self._replacement_prob 50 | else: raise NotImplementedError 51 | return skip 52 | 53 | def advance_until_terminate(self): 54 | skip = self.get_skip_flag() 55 | n_skips = 0 56 | old_top = self._top 57 | new_top = (old_top + 1) % self._max_pool_size 58 | while skip and old_top != new_top and n_skips < self._max_skip_episode: 59 | n_skips += 1 60 | self.advance() 61 | while not self._initials[self._top]: 62 | self.advance() 63 | skip = self.get_skip_flag() 64 | new_top = self._top 65 | logger.log("add_sample, skipped %d episodes, top=%d->%d"%( 66 | n_skips, old_top, new_top)) 67 | 68 | def advance(self): 69 | self._top = (self._top + 1) % self._max_pool_size 70 | if self._size >= self._max_pool_size: 71 | self._bottom = (self._bottom + 1) % self._max_pool_size 72 | else: 73 | self._size += 1 74 | 75 | def random_batch(self, batch_size): 76 | assert self._size > batch_size 77 | indices = np.zeros(batch_size, dtype='uint64') 78 | transition_indices = np.zeros(batch_size, dtype='uint64') 79 | count = 0 80 | while count < batch_size: 81 | index = np.random.randint(self._bottom, self._bottom + self._size) % self._max_pool_size 82 | # make sure that the transition is valid: if we are at the end of the pool, we need to discard 83 | # this sample 84 | if index == self._size - 1 and 
self._size <= self._max_pool_size: 85 | continue 86 | 87 | transition_index = (index + 1) % self._max_pool_size 88 | 89 | # make sure that the transition is valid: discard the transition if it crosses horizon-triggered resets 90 | if not self._terminals[index] and self._initials[transition_index]: 91 | continue 92 | indices[count] = index 93 | transition_indices[count] = transition_index 94 | count += 1 95 | return dict( 96 | observations=self._observations[indices], 97 | actions=self._actions[indices], 98 | rewards=self._rewards[indices], 99 | terminals=self._terminals[indices], 100 | initials=self._initials[indices], 101 | next_observations=self._observations[transition_indices] 102 | ) 103 | 104 | @property 105 | def size(self): 106 | return self._size 107 | -------------------------------------------------------------------------------- /code/test_manual.py: -------------------------------------------------------------------------------- 1 | import gym, gym.spaces, gym.utils, gym.utils.seeding 2 | import numpy as np 3 | import sys 4 | from gym.envs.mujoco.mujoco_env import MujocoEnv 5 | 6 | # From https://raw.githubusercontent.com/openai/roboschool/master/roboschool/test_manual.py 7 | 8 | # 9 | # Run this file to test environments using manual control: 10 | # 11 | # python test_manual.py RoboschoolHopper-v0 12 | # 13 | 14 | class TestKeyboardControl: 15 | def __init__(self): 16 | self.keys = {} 17 | self.control = np.zeros(9) 18 | self.human_pause = False 19 | self.human_done = False 20 | def key(self, event_type, key, modifiers): 21 | self.keys[key] = +1 if event_type==6 else 0 22 | #print ("event_type", event_type, "key", key, "modifiers", modifiers) 23 | self.control[0] = self.keys.get(0x1000014, 0) - self.keys.get(0x1000012, 0) 24 | self.control[1] = self.keys.get(0x1000013, 0) - self.keys.get(0x1000015, 0) 25 | self.control[2] = self.keys.get(ord('A'), 0) - self.keys.get(ord('Z'), 0) 26 | self.control[3] = self.keys.get(ord('S'), 0) - self.keys.get(ord('X'), 0) 27 | self.control[4] = self.keys.get(ord('D'), 0) - self.keys.get(ord('C'), 0) 28 | self.control[5] = self.keys.get(ord('F'), 0) - self.keys.get(ord('V'), 0) 29 | self.control[6] = self.keys.get(ord('G'), 0) - self.keys.get(ord('B'), 0) 30 | self.control[7] = self.keys.get(ord('H'), 0) - self.keys.get(ord('N'), 0) 31 | self.control[8] = self.keys.get(ord('J'), 0) - self.keys.get(ord('M'), 0) 32 | if event_type==6 and key==32: # press Space to pause 33 | self.human_pause = 1 - self.human_pause 34 | if event_type==6 and key==0x1000004: # press Enter to restart 35 | self.human_done = True 36 | 37 | 38 | class TestKeyboardControlMuj: 39 | def __init__(self): 40 | self.keys = {} 41 | self.control = np.zeros(9) 42 | self.human_pause = False 43 | self.human_done = False 44 | 45 | def key(self, window, key, scancode, event_type, modifiers): 46 | self.keys[key] = +1 if event_type==1 else 0 47 | # print(key) 48 | #print ("event_type", event_type, "key", key, "modifiers", modifiers) 49 | self.control[0] = self.keys.get(265, 0) - self.keys.get(264, 0) 50 | self.control[1] = self.keys.get(262, 0) - self.keys.get(263, 0) 51 | self.control[2] = self.keys.get(ord('A'), 0) - self.keys.get(ord('Z'), 0) 52 | self.control[3] = self.keys.get(ord('S'), 0) - self.keys.get(ord('X'), 0) 53 | self.control[4] = self.keys.get(ord('D'), 0) - self.keys.get(ord('C'), 0) 54 | self.control[5] = self.keys.get(ord('F'), 0) - self.keys.get(ord('V'), 0) 55 | self.control[6] = self.keys.get(ord('G'), 0) - self.keys.get(ord('B'), 0) 56 | self.control[7] = 
self.keys.get(ord('H'), 0) - self.keys.get(ord('N'), 0) 57 | self.control[8] = self.keys.get(ord('J'), 0) - self.keys.get(ord('M'), 0) 58 | if event_type==1 and key==32: # press Space to pause 59 | self.human_pause = 1 - self.human_pause 60 | if event_type==1 and key==257: # press Enter to restart 61 | self.human_done = True 62 | 63 | 64 | usage = """ 65 | This is manual test. Usage: 66 | %s 67 | 68 | Keyboard shortcuts: 69 | * F1 toggle slow motion 70 | * F2 toggle captions 71 | * F3 toggle HUD: observations, actions, reward 72 | * ENTER to restart episode (works only in this test) 73 | * SPACE to pause (works only in this test) 74 | * Up/down, left/right, a/z, s/x, d/c, f/v, g/b, h/n, j/m to control robot (works only in this test) 75 | """ 76 | 77 | def test(env_id): 78 | print(usage % sys.argv[0]) 79 | 80 | env = gym.make(env_id) 81 | # import pdb; pdb.set_trace() 82 | env.reset() # This creates default single player scene 83 | if isinstance(env.unwrapped, MujocoEnv): 84 | ctrl = TestKeyboardControlMuj() 85 | from mujoco_py.glfw import set_key_callback 86 | set_key_callback(env.unwrapped._get_viewer().window, ctrl.key) 87 | else: 88 | raise NotImplementedError 89 | 90 | a = np.zeros(env.action_space.shape) 91 | copy_n = min(len(a), len(ctrl.control)) 92 | ctrl.human_pause = False 93 | 94 | while 1: 95 | ctrl.human_done = False 96 | sn = env.reset() 97 | frame = 0 98 | reward = 0.0 99 | episode_over = False 100 | while 1: 101 | s = sn 102 | a[:copy_n] = ctrl.control[:copy_n] 103 | # import pdb; pdb.set_trace() 104 | sn, rplus, done, info = env.step(a) 105 | reward += rplus 106 | #env.render("rgb_array") 107 | episode_over |= done 108 | still_visible = True 109 | # import pdb; pdb.set_trace() 110 | while True: 111 | env.render("human") 112 | #env.unwrapped.camera.test_window() 113 | if not ctrl.human_pause: break 114 | if ctrl.human_done: break 115 | if not still_visible: break 116 | frame += 1 117 | if not still_visible: break 118 | 119 | if __name__=="__main__": 120 | env_id = "RoboschoolHumanoid-v0" if len(sys.argv) <= 1 else sys.argv[1] 121 | test(env_id) 122 | -------------------------------------------------------------------------------- /code/test_modified_hopper_env_manually.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from test_manual import test 3 | 4 | arg_dict = dict(id="HopperHalfGravity-v0", 5 | entry_point="modified_gravity_hopper:GravityEnv", 6 | max_episode_steps=1000, 7 | kwargs={"gravity" : -1.0}) 8 | 9 | gym.envs.register(**arg_dict) 10 | 11 | test("HopperHalfGravity-v0") 12 | -------------------------------------------------------------------------------- /slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Breakend/RLSSContinuousControlTutorial/19901507fd037b1ba6a37232c537654f1ae70ce4/slides.pdf --------------------------------------------------------------------------------
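As a closing note on the "create your own modified MuJoCo environment" part of the tutorial: the snippet below is a minimal sketch of how `GravityEnv` from `modified_gravity_hopper.py` can be registered under a new id and sanity-checked with random actions, following the same pattern as `test_modified_hopper_env_manually.py`. The id `HopperCustomGravity-v0`, the gravity value, and the random-rollout loop are illustrative choices for this note, not definitions that exist in the repo, and it assumes the old-style gym API used throughout (reset returning the observation, step returning a 4-tuple).

```python
import gym

# Register a gravity-modified Hopper, using the same registration pattern as
# test_modified_hopper_env_manually.py. Run this from the code/ directory so
# that modified_gravity_hopper is importable. Id and gravity are example values.
gym.envs.register(
    id="HopperCustomGravity-v0",
    entry_point="modified_gravity_hopper:GravityEnv",
    max_episode_steps=1000,
    kwargs={"gravity": -4.905},  # roughly half of the default -9.81
)

# Quick sanity check with random actions before spending compute on training.
env = gym.make("HopperCustomGravity-v0")
obs = env.reset()
total_reward, done, steps = 0.0, False, 0
while not done and steps < 200:
    obs, reward, done, _ = env.step(env.action_space.sample())
    total_reward += reward
    steps += 1
print("random-policy return after %d steps: %.2f" % (steps, total_reward))
```

The environment has to be registered in whatever process actually constructs it, so the direct `gym.make` route above (the same one `test_manual.py` and `test_modified_hopper_env_manually.py` use) is the easiest way to verify the gravity change before wiring the new id into the rllab runners.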