├── .gitignore ├── README.md ├── ave_results.py ├── ddpg_tensorflow └── ddpg.py ├── from_same_dist.py ├── plot_results.py ├── reproducibility_ML_DDPG ├── HalfCheetah_Scripts │ ├── run_ddpg_halfcheetah_batch_size.py │ ├── run_ddpg_halfcheetah_learning_rates.py │ ├── run_ddpg_halfcheetah_network_structure.py │ └── run_ddpg_halfcheetah_reward_scale.py ├── Hopper_Scripts │ ├── run_ddpg_hopper_batch_size.py │ ├── run_ddpg_hopper_learning_rates.py │ ├── run_ddpg_hopper_network_structure.py │ └── run_ddpg_hopper_reward_scale.py ├── InvertedPendulum_Scripts │ ├── run_ddpg_invpendulum_batch_size.py │ ├── run_ddpg_invpendulum_learning_rates.py │ ├── run_ddpg_invpendulum_network_structure.py │ └── run_ddpg_invpendulum_reward_scale.py └── Walker_Scripts │ ├── run_ddpg_walker_batch_size.py │ ├── run_ddpg_walker_learning_rates.py │ ├── run_ddpg_walker_network_structure.py │ └── run_ddpg_walker_reward_scale.py ├── run_trpo.py └── sampling_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reproducibility of Benchmarked Deep Reinforcement Learning Tasks for Continuous Control 2 | 3 | Policy gradient methods in reinforcement learning have become increasingly prevalent 4 | for state-of-the-art performance in continuous control tasks. Novel methods 5 | typically benchmark against a few key algorithms such as deep deterministic policy 6 | gradients and trust region policy optimization. As such, it is important to 7 | present and use consistent baselines experiments. However, this can be difficult 8 | due to general variance in the algorithms, hyper-parameter tuning, and environment 9 | stochasticity. 
We investigate and discuss: the significance of hyper-parameters in 10 | policy gradients for continuous control, general variance in the algorithms, and 11 | reproducibility of reported results. We provide guidelines on reporting novel results 12 | as comparisons against baseline methods such that future researchers can make 13 | informed decisions when investigating novel methods. 14 | 15 | ## Citation 16 | 17 | ``` 18 | @article{islam2017reproducibility, 19 | title={Reproducibility of Benchmarked Deep Reinforcement Learning Tasks for Continuous Control}, 20 | author={Islam*, Riashat and Henderson*, Peter and Gomrokchi, Maziar and Precup, Doina}, 21 | journal={ICML 2017 Reproducibility in Machine Learning Workshop}, 22 | year={2017}, 23 | url={https://arxiv.org/pdf/1708.04133.pdf} 24 | } 25 | ``` 26 | 27 | ## References 28 | 29 | Here, we use the rllab implementation of various benchmark algorithms. 30 | -------------------------------------------------------------------------------- /ave_results.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import time 3 | import numpy as np 4 | import pandas as pd 5 | from itertools import cycle 6 | 7 | from numpy import genfromtxt 8 | 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("csvs_to_compile", nargs='+', help="The csvs to compile") 13 | parser.add_argument("ave_out", help="the output file") 14 | 15 | args = parser.parse_args() 16 | 17 | data_frames = [] 18 | for f in args.csvs_to_compile: 19 | 20 | data = pd.read_csv(f) 21 | data_frames.append(data) 22 | 23 | 24 | df = pd.concat(data_frames, axis=1) 25 | # import pdb; pdb.set_trace() 26 | 27 | # df = df.swaplevel(0, 1, axis=1).sortlevel(axis=1) 28 | foo = df.groupby(level=0, axis=1).mean() 29 | 30 | foo.to_csv(args.ave_out) 31 | -------------------------------------------------------------------------------- /ddpg_tensorflow/ddpg.py: -------------------------------------------------------------------------------- 1 | # FROM: https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/sandbox/rocky/tf/algos/ddpg.py 2 | from rllab.algos.base import RLAlgorithm 3 | from rllab.misc.overrides import overrides 4 | from rllab.misc import special 5 | from sandbox.rocky.tf.misc import tensor_utils 6 | from rllab.sampler import parallel_sampler 7 | from rllab.plotter import plotter 8 | from rllab.misc import ext 9 | import rllab.misc.logger as logger 10 | #import pickle as pickle 11 | import numpy as np 12 | import pyprind 13 | import tensorflow as tf 14 | from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer 15 | #from sandbox.rocky.tf.core.parameterized import suppress_params_loading 16 | from rllab.core.serializable import Serializable 17 | from sampling_utils import SimpleReplayPool 18 | 19 | class DDPG(RLAlgorithm): 20 | """ 21 | Deep Deterministic Policy Gradient. 
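An off-policy actor-critic method (Lillicrap et al., 2015): a deterministic policy (actor) is trained to maximize a learned Q-function (critic), using minibatches drawn from an experience replay pool and separate target networks for both the policy and the Q-function. After each training step the target parameters are moved toward the learned parameters by a soft update, target <- (1 - soft_target_tau) * target + soft_target_tau * learned.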
22 | """ 23 | 24 | def __init__( 25 | self, 26 | env, 27 | policy, 28 | qf, 29 | es, 30 | batch_size=32, 31 | n_epochs=200, 32 | epoch_length=1000, 33 | min_pool_size=10000, 34 | replay_pool_size=1000000, 35 | replacement_prob=1.0, 36 | discount=0.99, 37 | max_path_length=250, 38 | qf_weight_decay=0., 39 | qf_update_method='adam', 40 | qf_learning_rate=1e-3, 41 | policy_weight_decay=0, 42 | policy_update_method='adam', 43 | policy_learning_rate=1e-3, 44 | policy_updates_ratio=1.0, 45 | eval_samples=10000, 46 | soft_target=True, 47 | soft_target_tau=0.001, 48 | n_updates_per_sample=1, 49 | scale_reward=1.0, 50 | include_horizon_terminal_transitions=False, 51 | plot=False, 52 | pause_for_plot=False): 53 | """ 54 | :param env: Environment 55 | :param policy: Policy 56 | :param qf: Q function 57 | :param es: Exploration strategy 58 | :param batch_size: Number of samples for each minibatch. 59 | :param n_epochs: Number of epochs. Policy will be evaluated after each epoch. 60 | :param epoch_length: How many timesteps for each epoch. 61 | :param min_pool_size: Minimum size of the pool to start training. 62 | :param replay_pool_size: Size of the experience replay pool. 63 | :param discount: Discount factor for the cumulative return. 64 | :param max_path_length: Discount factor for the cumulative return. 65 | :param qf_weight_decay: Weight decay factor for parameters of the Q function. 66 | :param qf_update_method: Online optimization method for training Q function. 67 | :param qf_learning_rate: Learning rate for training Q function. 68 | :param policy_weight_decay: Weight decay factor for parameters of the policy. 69 | :param policy_update_method: Online optimization method for training the policy. 70 | :param policy_learning_rate: Learning rate for training the policy. 71 | :param eval_samples: Number of samples (timesteps) for evaluating the policy. 72 | :param soft_target_tau: Interpolation parameter for doing the soft target update. 73 | :param n_updates_per_sample: Number of Q function and policy updates per new sample obtained 74 | :param scale_reward: The scaling factor applied to the rewards when training 75 | :param include_horizon_terminal_transitions: whether to include transitions with terminal=True because the 76 | horizon was reached. This might make the Q value back up less stable for certain tasks. 77 | :param plot: Whether to visualize the policy performance after each eval_interval. 78 | :param pause_for_plot: Whether to pause before continuing when plotting. 
79 | :return: 80 | """ 81 | self.env = env 82 | self.policy = policy 83 | self.qf = qf 84 | self.es = es 85 | self.batch_size = batch_size 86 | self.n_epochs = n_epochs 87 | self.epoch_length = epoch_length 88 | self.min_pool_size = min_pool_size 89 | self.replay_pool_size = replay_pool_size 90 | self.replacement_prob = replacement_prob 91 | self.discount = discount 92 | self.max_path_length = max_path_length 93 | self.qf_weight_decay = qf_weight_decay 94 | self.qf_update_method = \ 95 | FirstOrderOptimizer( 96 | update_method=qf_update_method, 97 | learning_rate=qf_learning_rate, 98 | ) 99 | self.qf_learning_rate = qf_learning_rate 100 | self.policy_weight_decay = policy_weight_decay 101 | self.policy_update_method = \ 102 | FirstOrderOptimizer( 103 | update_method=policy_update_method, 104 | learning_rate=policy_learning_rate, 105 | ) 106 | self.policy_learning_rate = policy_learning_rate 107 | self.policy_updates_ratio = policy_updates_ratio 108 | self.eval_samples = eval_samples 109 | self.soft_target_tau = soft_target_tau 110 | self.n_updates_per_sample = n_updates_per_sample 111 | self.include_horizon_terminal_transitions = include_horizon_terminal_transitions 112 | self.plot = plot 113 | self.pause_for_plot = pause_for_plot 114 | 115 | self.qf_loss_averages = [] 116 | self.policy_surr_averages = [] 117 | self.q_averages = [] 118 | self.y_averages = [] 119 | self.paths = [] 120 | self.es_path_returns = [] 121 | self.paths_samples_cnt = 0 122 | 123 | self.scale_reward = scale_reward 124 | 125 | self.train_policy_itr = 0 126 | 127 | self.opt_info = None 128 | 129 | def start_worker(self): 130 | parallel_sampler.populate_task(self.env, self.policy) 131 | if self.plot: 132 | plotter.init_plot(self.env, self.policy) 133 | 134 | @overrides 135 | def train(self): 136 | with tf.Session() as sess: 137 | sess.run(tf.global_variables_initializer()) 138 | # This seems like a rather sequential method 139 | pool = SimpleReplayPool( 140 | max_pool_size=self.replay_pool_size, 141 | observation_dim=self.env.observation_space.flat_dim, 142 | action_dim=self.env.action_space.flat_dim, 143 | replacement_prob=self.replacement_prob, 144 | ) 145 | self.start_worker() 146 | 147 | self.init_opt() 148 | # This initializes the optimizer parameters 149 | sess.run(tf.global_variables_initializer()) 150 | itr = 0 151 | path_length = 0 152 | path_return = 0 153 | terminal = False 154 | initial = False 155 | observation = self.env.reset() 156 | 157 | with tf.variable_scope("sample_policy"): 158 | sample_policy = Serializable.clone(self.policy) 159 | 160 | for epoch in range(self.n_epochs): 161 | logger.push_prefix('epoch #%d | ' % epoch) 162 | logger.log("Training started") 163 | train_qf_itr, train_policy_itr = 0, 0 164 | 165 | # updated_q_network, updated_policy_network, _, _, end_trajectory_action, end_trajectory_state = self.lp.lp_exploration() 166 | 167 | # Don't need to set the values because we're actually using the same policy/qf already 168 | # self.qf.set_param_values(updated_q_network.get_param_values()) 169 | # self.policy.set_param_values(updated_policy_network.get_param_values()) 170 | 171 | # observation = end_trajectory_state 172 | 173 | for epoch_itr in pyprind.prog_bar(range(self.epoch_length)): 174 | # Execute policy 175 | if terminal: # or path_length > self.max_path_length: 176 | # Note that if the last time step ends an episode, the very 177 | # last state and observation will be ignored and not added 178 | # to the replay pool 179 | observation = self.env.reset() 180 | self.es.reset() 181 | 
sample_policy.reset() 182 | self.es_path_returns.append(path_return) 183 | path_length = 0 184 | path_return = 0 185 | initial = True 186 | else: 187 | initial = False 188 | 189 | action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf) 190 | 191 | next_observation, reward, terminal, _ = self.env.step(action) 192 | path_length += 1 193 | path_return += reward 194 | 195 | if not terminal and path_length >= self.max_path_length: 196 | terminal = True 197 | # only include the terminal transition in this case if the flag was set 198 | if self.include_horizon_terminal_transitions: 199 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 200 | else: 201 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 202 | 203 | observation = next_observation 204 | 205 | if pool.size >= self.min_pool_size: 206 | for update_itr in range(self.n_updates_per_sample): 207 | # Train policy 208 | batch = pool.random_batch(self.batch_size) 209 | itrs = self.do_training(itr, batch) 210 | train_qf_itr += itrs[0] 211 | train_policy_itr += itrs[1] 212 | sample_policy.set_param_values(self.policy.get_param_values()) 213 | 214 | itr += 1 215 | 216 | logger.log("Training finished") 217 | logger.log("Trained qf %d steps, policy %d steps"%(train_qf_itr, train_policy_itr)) 218 | if pool.size >= self.min_pool_size: 219 | self.evaluate(epoch, pool) 220 | params = self.get_epoch_snapshot(epoch) 221 | logger.save_itr_params(epoch, params) 222 | logger.dump_tabular(with_prefix=False) 223 | logger.pop_prefix() 224 | if self.plot: 225 | self.update_plot() 226 | if self.pause_for_plot: 227 | input("Plotting evaluation run: Press Enter to " 228 | "continue...") 229 | self.env.terminate() 230 | self.policy.terminate() 231 | 232 | def init_opt(self): 233 | 234 | # First, create "target" policy and Q functions 235 | with tf.variable_scope("target_policy"): 236 | target_policy = Serializable.clone(self.policy) 237 | with tf.variable_scope("target_qf"): 238 | target_qf = Serializable.clone(self.qf) 239 | 240 | # y need to be computed first 241 | obs = self.env.observation_space.new_tensor_variable( 242 | 'obs', 243 | extra_dims=1, 244 | ) 245 | 246 | # The yi values are computed separately as above and then passed to 247 | # the training functions below 248 | action = self.env.action_space.new_tensor_variable( 249 | 'action', 250 | extra_dims=1, 251 | ) 252 | 253 | yvar = tensor_utils.new_tensor( 254 | 'ys', 255 | ndim=1, 256 | dtype=tf.float32, 257 | ) 258 | 259 | qf_weight_decay_term = 0.5 * self.qf_weight_decay * \ 260 | sum([tf.reduce_sum(tf.square(param)) for param in 261 | self.qf.get_params(regularizable=True)]) 262 | 263 | qval = self.qf.get_qval_sym(obs, action) 264 | 265 | qf_loss = tf.reduce_mean(tf.square(yvar - qval)) 266 | qf_reg_loss = qf_loss + qf_weight_decay_term 267 | 268 | policy_weight_decay_term = 0.5 * self.policy_weight_decay * \ 269 | sum([tf.reduce_sum(tf.square(param)) 270 | for param in self.policy.get_params(regularizable=True)]) 271 | policy_qval = self.qf.get_qval_sym( 272 | obs, self.policy.get_action_sym(obs), 273 | deterministic=True 274 | ) 275 | policy_surr = -tf.reduce_mean(policy_qval) 276 | 277 | policy_reg_surr = policy_surr + policy_weight_decay_term 278 | 279 | qf_input_list = [yvar, obs, action] 280 | policy_input_list = [obs] 281 | 282 | self.qf_update_method.update_opt( 283 | loss=qf_reg_loss, target=self.qf, inputs=qf_input_list) 284 | self.policy_update_method.update_opt( 285 | loss=policy_reg_surr, 
target=self.policy, inputs=policy_input_list) 286 | 287 | f_train_qf = tensor_utils.compile_function( 288 | inputs=qf_input_list, 289 | outputs=[qf_loss, qval, self.qf_update_method._train_op], 290 | ) 291 | 292 | f_train_policy = tensor_utils.compile_function( 293 | inputs=policy_input_list, 294 | outputs=[policy_surr, self.policy_update_method._train_op], 295 | ) 296 | 297 | self.opt_info = dict( 298 | f_train_qf=f_train_qf, 299 | f_train_policy=f_train_policy, 300 | target_qf=target_qf, 301 | target_policy=target_policy, 302 | ) 303 | 304 | def do_training(self, itr, batch): 305 | 306 | obs, actions, rewards, next_obs, terminals = ext.extract( 307 | batch, 308 | "observations", "actions", "rewards", "next_observations", 309 | "terminals" 310 | ) 311 | 312 | # compute the on-policy y values 313 | target_qf = self.opt_info["target_qf"] 314 | target_policy = self.opt_info["target_policy"] 315 | 316 | next_actions, _ = target_policy.get_actions(next_obs) 317 | next_qvals = target_qf.get_qval(next_obs, next_actions) 318 | 319 | ys = rewards + (1. - terminals) * self.discount * next_qvals.reshape(-1) 320 | 321 | f_train_qf = self.opt_info["f_train_qf"] 322 | qf_loss, qval, _ = f_train_qf(ys, obs, actions) 323 | target_qf.set_param_values( 324 | target_qf.get_param_values() * (1.0 - self.soft_target_tau) + 325 | self.qf.get_param_values() * self.soft_target_tau) 326 | self.qf_loss_averages.append(qf_loss) 327 | self.q_averages.append(qval) 328 | self.y_averages.append(ys) 329 | 330 | self.train_policy_itr += self.policy_updates_ratio 331 | train_policy_itr = 0 332 | while self.train_policy_itr > 0: 333 | f_train_policy = self.opt_info["f_train_policy"] 334 | policy_surr, _ = f_train_policy(obs) 335 | target_policy.set_param_values( 336 | target_policy.get_param_values() * (1.0 - self.soft_target_tau) + 337 | self.policy.get_param_values() * self.soft_target_tau) 338 | self.policy_surr_averages.append(policy_surr) 339 | self.train_policy_itr -= 1 340 | train_policy_itr += 1 341 | return 1, train_policy_itr # number of itrs qf, policy are trained 342 | 343 | def evaluate(self, epoch, pool): 344 | logger.log("Collecting samples for evaluation") 345 | paths = parallel_sampler.sample_paths( 346 | policy_params=self.policy.get_param_values(), 347 | max_samples=self.eval_samples, 348 | max_path_length=self.max_path_length, 349 | ) 350 | 351 | average_discounted_return = np.mean( 352 | [special.discount_return(path["rewards"], self.discount) for path in paths] 353 | ) 354 | 355 | returns = [sum(path["rewards"]) for path in paths] 356 | 357 | all_qs = np.concatenate(self.q_averages) 358 | all_ys = np.concatenate(self.y_averages) 359 | 360 | average_q_loss = np.mean(self.qf_loss_averages) 361 | average_policy_surr = np.mean(self.policy_surr_averages) 362 | average_action = np.mean(np.square(np.concatenate( 363 | [path["actions"] for path in paths] 364 | ))) 365 | 366 | policy_reg_param_norm = np.linalg.norm( 367 | self.policy.get_param_values(regularizable=True) 368 | ) 369 | qfun_reg_param_norm = np.linalg.norm( 370 | self.qf.get_param_values(regularizable=True) 371 | ) 372 | 373 | logger.record_tabular('Epoch', epoch) 374 | logger.record_tabular('Iteration', epoch) 375 | logger.record_tabular('AverageReturn', np.mean(returns)) 376 | logger.record_tabular('StdReturn', 377 | np.std(returns)) 378 | logger.record_tabular('MaxReturn', 379 | np.max(returns)) 380 | logger.record_tabular('MinReturn', 381 | np.min(returns)) 382 | if len(self.es_path_returns) > 0: 383 | logger.record_tabular('AverageEsReturn', 
384 | np.mean(self.es_path_returns)) 385 | logger.record_tabular('StdEsReturn', 386 | np.std(self.es_path_returns)) 387 | logger.record_tabular('MaxEsReturn', 388 | np.max(self.es_path_returns)) 389 | logger.record_tabular('MinEsReturn', 390 | np.min(self.es_path_returns)) 391 | logger.record_tabular('AverageDiscountedReturn', 392 | average_discounted_return) 393 | logger.record_tabular('AverageQLoss', average_q_loss) 394 | logger.record_tabular('AveragePolicySurr', average_policy_surr) 395 | logger.record_tabular('AverageQ', np.mean(all_qs)) 396 | logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) 397 | logger.record_tabular('AverageY', np.mean(all_ys)) 398 | logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) 399 | logger.record_tabular('AverageAbsQYDiff', 400 | np.mean(np.abs(all_qs - all_ys))) 401 | logger.record_tabular('AverageAction', average_action) 402 | 403 | logger.record_tabular('PolicyRegParamNorm', 404 | policy_reg_param_norm) 405 | logger.record_tabular('QFunRegParamNorm', 406 | qfun_reg_param_norm) 407 | 408 | self.env.log_diagnostics(paths) 409 | self.policy.log_diagnostics(paths) 410 | 411 | self.qf_loss_averages = [] 412 | self.policy_surr_averages = [] 413 | 414 | self.q_averages = [] 415 | self.y_averages = [] 416 | self.es_path_returns = [] 417 | 418 | def update_plot(self): 419 | if self.plot: 420 | plotter.update_plot(self.policy, self.max_path_length) 421 | 422 | def get_epoch_snapshot(self, epoch): 423 | return dict( 424 | env=self.env, 425 | epoch=epoch, 426 | qf=self.qf, 427 | policy=self.policy, 428 | target_qf=self.opt_info["target_qf"], 429 | target_policy=self.opt_info["target_policy"], 430 | es=self.es, 431 | ) 432 | -------------------------------------------------------------------------------- /from_same_dist.py: -------------------------------------------------------------------------------- 1 | import scipy.stats as stats 2 | import pandas as pd 3 | import numpy as np 4 | import argparse 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("paths_to_progress_csvs", nargs="+", help="All the csvs") 7 | parser.add_argument("--range_start", type=int, default=-1) 8 | parser.add_argument("--range_end", type=int, default=100000000) 9 | 10 | 11 | args = parser.parse_args() 12 | assert len(args.paths_to_progress_csvs) == 2 13 | 14 | avg_rets = [] 15 | std_dev_rets = [] 16 | trajs = [] 17 | 18 | data = pd.read_csv(args.paths_to_progress_csvs[0]) 19 | 20 | a_means = data["AverageReturn"][max(args.range_start,0):min(args.range_end, len(data["AverageReturn"]))] 21 | a_stds = data["StdReturn"][max(args.range_start,0):min(args.range_end, len(data["AverageReturn"]))] 22 | n_as = data["NumTrajs"][max(args.range_start,0):min(args.range_end, len(data["AverageReturn"]))] 23 | 24 | args.paths_to_progress_csvs 25 | data = pd.read_csv(args.paths_to_progress_csvs[1]) 26 | 27 | b_means = data["AverageReturn"][max(args.range_start,0):min(args.range_end, len(data["AverageReturn"]))] 28 | b_stds = data["StdReturn"][max(args.range_start,0):min(args.range_end, len(data["AverageReturn"]))] 29 | n_bs = data["NumTrajs"][max(args.range_start,0):min(args.range_end, len(data["AverageReturn"]))] 30 | 31 | # Do a T - test 32 | ts, ps = [],[] 33 | 34 | for a_mean, a_std, n_a, b_mean, b_std, n_b in zip(a_means, a_stds, n_as, b_means, b_stds, n_bs): 35 | t, p = stats.ttest_ind_from_stats(a_mean, a_std, n_a, b_mean, b_std, n_b, equal_var=False) 36 | ts.append(t) 37 | ps.append(p) 38 | 39 | print("t=%f,p=%f" % (np.mean(ts), np.mean(ps))) 40 | 
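The script above (from_same_dist.py) compares two training runs purely from the per-iteration summary statistics logged by rllab (AverageReturn, StdReturn, NumTrajs), running Welch's t-test at each iteration and reporting the mean t and p values over the chosen range. A minimal, self-contained sketch of the per-iteration test is given below; the statistics and file paths are illustrative placeholders, not values from the paper.

```python
# Minimal sketch of the per-iteration significance test used in
# from_same_dist.py. All numbers below are illustrative placeholders.
import scipy.stats as stats

# Summary statistics for one logged iteration of two independent runs:
# mean return, std of returns, and number of trajectories in that iteration.
a_mean, a_std, n_a = 1520.0, 310.0, 24   # run A (hypothetical)
b_mean, b_std, n_b = 1395.0, 280.0, 25   # run B (hypothetical)

# Welch's t-test computed from summary statistics (no raw returns needed).
t, p = stats.ttest_ind_from_stats(a_mean, a_std, n_a,
                                  b_mean, b_std, n_b,
                                  equal_var=False)
print("t=%f,p=%f" % (t, p))

# Typical invocation of the full script (hypothetical paths):
#   python from_same_dist.py runA/progress.csv runB/progress.csv \
#       --range_start 0 --range_end 50
```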
-------------------------------------------------------------------------------- /plot_results.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import time 3 | import numpy as np 4 | import pandas as pd 5 | from itertools import cycle 6 | 7 | from numpy import genfromtxt 8 | from numpy.random import choice 9 | 10 | 11 | 12 | def multiple_plot(average_vals_list, std_dev_list, traj_list, other_labels, env_name, smoothing_window=5, no_show=False, ignore_std=False, limit=None, extra_lines=None): 13 | fig = plt.figure(figsize=(15, 10)) 14 | colors = ["k", "red", "blue", "green", "magenta", "cyan", "brown", "purple"] 15 | color_index = 0 16 | ax = plt.subplot() # Defines ax variable by creating an empty plot 17 | 18 | # Set the tick labels font 19 | for label in (ax.get_xticklabels() + ax.get_yticklabels()): 20 | label.set_fontname('Arial') 21 | label.set_fontsize(22) 22 | 23 | index = 0 24 | for average_vals, std_dev, label, trajs in zip(average_vals_list, std_dev_list, other_labels[:len(average_vals_list)], traj_list): 25 | index += 1 26 | rewards_smoothed_1 = pd.Series(average_vals).rolling(smoothing_window, min_periods=smoothing_window).mean()[:limit] 27 | if limit is None: 28 | limit = len(rewards_smoothed_1) 29 | rewards_smoothed_1 = rewards_smoothed_1[:limit] 30 | std_dev = std_dev[:limit] 31 | 32 | fill_color = colors[color_index]#choice(colors, 1) 33 | color_index += 1 34 | cum_rwd_1, = plt.plot(range(len(rewards_smoothed_1)), rewards_smoothed_1, label=label, color=fill_color[0]) 35 | if not ignore_std: 36 | plt.fill_between(range(len(rewards_smoothed_1)), rewards_smoothed_1 + std_dev, rewards_smoothed_1 - std_dev, alpha=0.3, edgecolor=fill_color, facecolor=fill_color) 37 | 38 | if extra_lines: 39 | for lin in extra_lines: 40 | plt.plot(range(len(rewards_smoothed_1)), np.repeat(lin, len(rewards_smoothed_1)), linestyle='-.', color = colors[color_index], linewidth=2.5, label=other_labels[index]) 41 | color_index += 1 42 | index += 1 43 | 44 | axis_font = {'fontname':'Arial', 'size':'28'} 45 | #plt.legend(loc='upper left', prop={'size' : 16}) 46 | plt.legend(loc='lower right', prop={'size' : 16}) 47 | plt.xlabel("Iterations", **axis_font) 48 | plt.ylabel("Average Return", **axis_font) 49 | plt.title("%s Environment"% env_name, **axis_font) 50 | 51 | if no_show: 52 | fig.savefig('%s.png' % env_name, dpi=fig.dpi) 53 | else: 54 | plt.show() 55 | 56 | return fig 57 | 58 | # def multipe_plot(stats1, stats2, smoothing_window=50, noshow=False): 59 | # 60 | # fig = plt.figure(figsize=(30, 20)) 61 | # rewards_smoothed_1 = pd.Series(stats1).rolling(smoothing_window, min_periods=smoothing_window).mean() 62 | # 63 | # rewards_smoothed_2 = pd.Series(stats2).rolling(smoothing_window, min_periods=smoothing_window).mean() 64 | # 65 | # cum_rwd_1, = plt.plot(eps, rewards_smoothed_1, label="DDPG") 66 | # plt.fill_between( eps, rewards_smoothed_1 + ddpg_walker_std_return, rewards_smoothed_1 - ddpg_walker_std_return, alpha=0.3, edgecolor='blue', facecolor='blue') 67 | # 68 | # cum_rwd_2, = plt.plot(eps2, rewards_smoothed_2, label="Unified DDPG") 69 | # plt.fill_between( eps2, rewards_smoothed_2 + unified_ddpg_walker_std_return, rewards_smoothed_2 - unified_ddpg_walker_std_return, alpha=0.3, edgecolor='blue', facecolor='red') 70 | # 71 | # plt.legend(handles=[cum_rwd_1, cum_rwd_2]) 72 | # plt.xlabel("Epsiode") 73 | # plt.ylabel("Average Return") 74 | # plt.title("Walker Environment") 75 | # 76 | # plt.show() 77 | # 78 | # return fig 79 
| 80 | 81 | 82 | 83 | 84 | 85 | import argparse 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument("paths_to_progress_csvs", nargs="+", help="All the csvs") 88 | parser.add_argument("env_name") 89 | parser.add_argument("--save", action="store_true") 90 | parser.add_argument("--ignore_std", action="store_true") 91 | parser.add_argument('--labels', nargs='+', help='List of labels to go along with the paths', required=False) 92 | parser.add_argument('--smoothing_window', default=5, type=int) 93 | parser.add_argument('--limit', default=None, type=int) 94 | parser.add_argument('--extra_lines', nargs="+", type=float) 95 | 96 | args = parser.parse_args() 97 | 98 | avg_rets = [] 99 | std_dev_rets = [] 100 | trajs = [] 101 | 102 | for o in args.paths_to_progress_csvs: 103 | data = pd.read_csv(o) 104 | avg_ret = np.array(data["AverageReturn"]) 105 | std_dev_ret = np.array(data["StdReturn"]) 106 | trajs.append(np.cumsum(np.array(data["NumTrajs"]))) 107 | avg_rets.append(avg_ret) 108 | std_dev_rets.append(std_dev_ret) 109 | 110 | multiple_plot(avg_rets, std_dev_rets, trajs, args.labels, args.env_name, smoothing_window=args.smoothing_window, no_show=args.save, ignore_std=args.ignore_std, limit=args.limit, extra_lines=args.extra_lines) 111 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/HalfCheetah_Scripts/run_ddpg_halfcheetah_batch_size.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 | 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | batch_size_values = [32, 64, 128] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for b in range(len(batch_size_values)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=batch_size_values[b], 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=1.0, 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "HalfCheetah/" + "Batch_Size_Tune/" + "Batch_Size_" + str(batch_size_values[b]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/HalfCheetah_Scripts/run_ddpg_halfcheetah_learning_rates.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = 
{"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | critic_rate = [1e-3, 1e-4, 1e-5] 56 | actor_rate = [1e-4, 1e-5, 1e-6] 57 | 58 | learning_rate_size = len(critic_rate) 59 | 60 | 61 | 62 | for r in range(learning_rate_size): 63 | 64 | policy = DeterministicMLPPolicy( 65 | env_spec=env.spec, 66 | name="policy", 67 | # The neural network policy should have two hidden layers, each with 32 hidden units. 68 | hidden_sizes=(100, 50, 25), 69 | hidden_nonlinearity=tf.nn.relu, 70 | ) 71 | 72 | es = OUStrategy(env_spec=env.spec) 73 | 74 | qf = ContinuousMLPQFunction(env_spec=env.spec, 75 | hidden_sizes=(100, 50, 25), 76 | hidden_nonlinearity=tf.nn.relu,) 77 | 78 | 79 | for e in range(num_experiments): 80 | 81 | algo = ddpg_class( 82 | env=env, 83 | policy=policy, 84 | es=es, 85 | qf=qf, 86 | batch_size=32, 87 | max_path_length=env.horizon, 88 | epoch_length=1000, 89 | min_pool_size=10000, 90 | n_epochs=args.num_epochs, 91 | discount=0.99, 92 | scale_reward=0.1, 93 | qf_learning_rate=critic_rate[r], 94 | policy_learning_rate=actor_rate[r], 95 | # Uncomment both lines (this and the plot parameter below) to enable plotting 96 | plot=args.plot, 97 | ) 98 | 99 | 100 | run_experiment_lite( 101 | algo.train(), 102 | # log_dir=args.data_dir, 103 | # Number of parallel workers for sampling 104 | n_parallel=1, 105 | # Only keep the snapshot parameters for the last iteration 106 | snapshot_mode="last", 107 | # Specifies the seed for the experiment. If this is not provided, a random seed 108 | # will be used 109 | exp_name="reproducibility_ML/" + "DDPG/" + "HalfCheetah/" + "Learning_Rate_Tune/" + "Learning_Rate_Combo_" + str(r) + "_Experiment_" + str(e), 110 | seed=1, 111 | plot=args.plot, 112 | ) 113 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/HalfCheetah_Scripts/run_ddpg_halfcheetah_network_structure.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = 
GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | layer_1 = [400, 100, 100] 56 | layer_2 = [300, 100, 50] 57 | 58 | layer_size = 3 59 | 60 | 61 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 62 | 63 | 64 | for l in range(layer_size): 65 | 66 | policy = DeterministicMLPPolicy( 67 | env_spec=env.spec, 68 | name="policy", 69 | # The neural network policy should have two hidden layers, each with 32 hidden units. 70 | hidden_sizes=(layer_1[l], layer_2[l]), 71 | hidden_nonlinearity=tf.nn.relu, 72 | ) 73 | 74 | es = OUStrategy(env_spec=env.spec) 75 | 76 | qf = ContinuousMLPQFunction(env_spec=env.spec, 77 | hidden_sizes=(layer_1[l], layer_2[l]), 78 | hidden_nonlinearity=tf.nn.relu,) 79 | 80 | 81 | for e in range(num_experiments): 82 | 83 | algo = ddpg_class( 84 | env=env, 85 | policy=policy, 86 | es=es, 87 | qf=qf, 88 | batch_size=32, 89 | max_path_length=env.horizon, 90 | epoch_length=1000, 91 | min_pool_size=10000, 92 | n_epochs=args.num_epochs, 93 | discount=0.99, 94 | scale_reward=0.1, 95 | qf_learning_rate=1e-3, 96 | policy_learning_rate=1e-4, 97 | # Uncomment both lines (this and the plot parameter below) to enable plotting 98 | plot=args.plot, 99 | ) 100 | 101 | 102 | run_experiment_lite( 103 | algo.train(), 104 | # log_dir=args.data_dir, 105 | # Number of parallel workers for sampling 106 | n_parallel=1, 107 | # Only keep the snapshot parameters for the last iteration 108 | snapshot_mode="last", 109 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 110 | # will be used 111 | exp_name="reproducibility_ML/" + "DDPG/" + "HalfCheetah/" + "Network_Structure_Tune/" + "Layer_Size_" + str(l) + "_Experiment_" + str(e), 112 | seed=1, 113 | plot=args.plot, 114 | ) 115 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/HalfCheetah_Scripts/run_ddpg_halfcheetah_reward_scale.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 | 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | reward_scaling = [0.001, 0.1, 1.0] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for r in range(len(reward_scaling)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=32, 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=reward_scaling[r], 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "HalfCheetah/" + "Reward_Scale_Tune/" + "Reward_Scale_" + str(reward_scaling[r]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Hopper_Scripts/run_ddpg_hopper_batch_size.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 | 42 
| policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | batch_size_values = [32, 64, 128] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for b in range(len(batch_size_values)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=batch_size_values[b], 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=1.0, 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "Hopper/" + "Batch_Size_Tune/" + "Batch_Size_" + str(batch_size_values[b]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Hopper_Scripts/run_ddpg_hopper_learning_rates.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | 
# gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | critic_rate = [1e-3, 1e-4, 1e-5] 56 | actor_rate = [1e-4, 1e-5, 1e-6] 57 | 58 | learning_rate_size = len(critic_rate) 59 | 60 | 61 | 62 | for r in range(learning_rate_size): 63 | 64 | policy = DeterministicMLPPolicy( 65 | env_spec=env.spec, 66 | name="policy", 67 | # The neural network policy should have two hidden layers, each with 32 hidden units. 68 | hidden_sizes=(100, 50, 25), 69 | hidden_nonlinearity=tf.nn.relu, 70 | ) 71 | 72 | es = OUStrategy(env_spec=env.spec) 73 | 74 | qf = ContinuousMLPQFunction(env_spec=env.spec, 75 | hidden_sizes=(100, 50, 25), 76 | hidden_nonlinearity=tf.nn.relu,) 77 | 78 | 79 | for e in range(num_experiments): 80 | 81 | algo = ddpg_class( 82 | env=env, 83 | policy=policy, 84 | es=es, 85 | qf=qf, 86 | batch_size=32, 87 | max_path_length=env.horizon, 88 | epoch_length=1000, 89 | min_pool_size=10000, 90 | n_epochs=args.num_epochs, 91 | discount=0.99, 92 | scale_reward=0.1, 93 | qf_learning_rate=critic_rate[r], 94 | policy_learning_rate=actor_rate[r], 95 | # Uncomment both lines (this and the plot parameter below) to enable plotting 96 | plot=args.plot, 97 | ) 98 | 99 | 100 | run_experiment_lite( 101 | algo.train(), 102 | # log_dir=args.data_dir, 103 | # Number of parallel workers for sampling 104 | n_parallel=1, 105 | # Only keep the snapshot parameters for the last iteration 106 | snapshot_mode="last", 107 | # Specifies the seed for the experiment. If this is not provided, a random seed 108 | # will be used 109 | exp_name="reproducibility_ML/" + "DDPG/" + "Hopper/" + "Learning_Rate_Tune/" + "Learning_Rate_Combo_" + str(r) + "_Experiment_" + str(e), 110 | seed=1, 111 | plot=args.plot, 112 | ) 113 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Hopper_Scripts/run_ddpg_hopper_network_structure.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", 
"HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | layer_1 = [400, 100, 100] 56 | layer_2 = [300, 100, 50] 57 | 58 | layer_size = 3 59 | 60 | 61 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 62 | 63 | 64 | for l in range(layer_size): 65 | 66 | policy = DeterministicMLPPolicy( 67 | env_spec=env.spec, 68 | name="policy", 69 | # The neural network policy should have two hidden layers, each with 32 hidden units. 70 | hidden_sizes=(layer_1[l], layer_2[l]), 71 | hidden_nonlinearity=tf.nn.relu, 72 | ) 73 | 74 | es = OUStrategy(env_spec=env.spec) 75 | 76 | qf = ContinuousMLPQFunction(env_spec=env.spec, 77 | hidden_sizes=(layer_1[l], layer_2[l]), 78 | hidden_nonlinearity=tf.nn.relu,) 79 | 80 | 81 | for e in range(num_experiments): 82 | 83 | algo = ddpg_class( 84 | env=env, 85 | policy=policy, 86 | es=es, 87 | qf=qf, 88 | batch_size=32, 89 | max_path_length=env.horizon, 90 | epoch_length=1000, 91 | min_pool_size=10000, 92 | n_epochs=args.num_epochs, 93 | discount=0.99, 94 | scale_reward=0.1, 95 | qf_learning_rate=1e-3, 96 | policy_learning_rate=1e-4, 97 | # Uncomment both lines (this and the plot parameter below) to enable plotting 98 | plot=args.plot, 99 | ) 100 | 101 | 102 | run_experiment_lite( 103 | algo.train(), 104 | # log_dir=args.data_dir, 105 | # Number of parallel workers for sampling 106 | n_parallel=1, 107 | # Only keep the snapshot parameters for the last iteration 108 | snapshot_mode="last", 109 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 110 | # will be used 111 | exp_name="reproducibility_ML/" + "DDPG/" + "Hopper/" + "Network_Structure_Tune/" + "Layer_Size_" + str(l) + "_Experiment_" + str(e), 112 | seed=1, 113 | plot=args.plot, 114 | ) 115 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Hopper_Scripts/run_ddpg_hopper_reward_scale.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 | 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | reward_scaling = [0.001, 0.1, 1.0] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for r in range(len(reward_scaling)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=32, 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=reward_scaling[r], 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "Hopper/" + "Reward_Scale_Tune/" + "Reward_Scale_" + str(reward_scaling[r]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/InvertedPendulum_Scripts/run_ddpg_invpendulum_batch_size.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Pendulum-v0", "InvertedPendulum-v1", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = 
TfEnv(normalize(gymenv)) 41 | 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | batch_size_values = [32, 64, 128] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for b in range(len(batch_size_values)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=batch_size_values[b], 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=1.0, 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "InvertedPendulum/" + "Batch_Size_Tune/" + "Batch_Size_" + str(batch_size_values[b]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/InvertedPendulum_Scripts/run_ddpg_invpendulum_learning_rates.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "InvertedPendulum-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | 
gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | critic_rate = [1e-3, 1e-4, 1e-5] 56 | actor_rate = [1e-4, 1e-5, 1e-6] 57 | 58 | learning_rate_size = len(critic_rate) 59 | 60 | 61 | 62 | for r in range(learning_rate_size): 63 | 64 | policy = DeterministicMLPPolicy( 65 | env_spec=env.spec, 66 | name="policy", 67 | # The neural network policy should have two hidden layers, each with 32 hidden units. 68 | hidden_sizes=(100, 50, 25), 69 | hidden_nonlinearity=tf.nn.relu, 70 | ) 71 | 72 | es = OUStrategy(env_spec=env.spec) 73 | 74 | qf = ContinuousMLPQFunction(env_spec=env.spec, 75 | hidden_sizes=(100, 50, 25), 76 | hidden_nonlinearity=tf.nn.relu,) 77 | 78 | 79 | for e in range(num_experiments): 80 | 81 | algo = ddpg_class( 82 | env=env, 83 | policy=policy, 84 | es=es, 85 | qf=qf, 86 | batch_size=32, 87 | max_path_length=env.horizon, 88 | epoch_length=1000, 89 | min_pool_size=10000, 90 | n_epochs=args.num_epochs, 91 | discount=0.99, 92 | scale_reward=0.1, 93 | qf_learning_rate=critic_rate[r], 94 | policy_learning_rate=actor_rate[r], 95 | # Uncomment both lines (this and the plot parameter below) to enable plotting 96 | plot=args.plot, 97 | ) 98 | 99 | 100 | run_experiment_lite( 101 | algo.train(), 102 | # log_dir=args.data_dir, 103 | # Number of parallel workers for sampling 104 | n_parallel=1, 105 | # Only keep the snapshot parameters for the last iteration 106 | snapshot_mode="last", 107 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 108 | # will be used 109 | exp_name="reproducibility_ML/" + "DDPG/" + "InvertedPendulum/" + "Learning_Rate_Tune/" + "Learning_Rate_Combo_" + str(r) + "_Experiment_" + str(e), 110 | seed=1, 111 | plot=args.plot, 112 | ) 113 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/InvertedPendulum_Scripts/run_ddpg_invpendulum_network_structure.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "InvertedPendulum-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | layer_1 = [400, 100, 100] 56 | layer_2 = [300, 100, 50] 57 | 58 | layer_size = 3 59 | 60 | 61 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 62 | 63 | 64 | for l in range(layer_size): 65 | 66 | policy = DeterministicMLPPolicy( 67 | env_spec=env.spec, 68 | name="policy", 69 | # The policy network has two hidden layers; their sizes are swept over (400, 300), (100, 100), and (100, 50).
70 | hidden_sizes=(layer_1[l], layer_2[l]), 71 | hidden_nonlinearity=tf.nn.relu, 72 | ) 73 | 74 | es = OUStrategy(env_spec=env.spec) 75 | 76 | qf = ContinuousMLPQFunction(env_spec=env.spec, 77 | hidden_sizes=(layer_1[l], layer_2[l]), 78 | hidden_nonlinearity=tf.nn.relu,) 79 | 80 | 81 | for e in range(num_experiments): 82 | 83 | algo = ddpg_class( 84 | env=env, 85 | policy=policy, 86 | es=es, 87 | qf=qf, 88 | batch_size=32, 89 | max_path_length=env.horizon, 90 | epoch_length=1000, 91 | min_pool_size=10000, 92 | n_epochs=args.num_epochs, 93 | discount=0.99, 94 | scale_reward=0.1, 95 | qf_learning_rate=1e-3, 96 | policy_learning_rate=1e-4, 97 | # Plotting during training is enabled by passing the --plot flag 98 | plot=args.plot, 99 | ) 100 | 101 | 102 | run_experiment_lite( 103 | algo.train(), 104 | # log_dir=args.data_dir, 105 | # Number of parallel workers for sampling 106 | n_parallel=1, 107 | # Only keep the snapshot parameters for the last iteration 108 | snapshot_mode="last", 109 | # Specifies the seed for the experiment. If this is not provided, a random seed 110 | # will be used 111 | exp_name="reproducibility_ML/" + "DDPG/" + "InvertedPendulum/" + "Network_Structure_Tune/" + "Layer_Size_" + str(l) + "_Experiment_" + str(e), 112 | seed=1, 113 | plot=args.plot, 114 | ) 115 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/InvertedPendulum_Scripts/run_ddpg_invpendulum_reward_scale.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "InvertedPendulum-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 | 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy has three hidden layers with 100, 50, and 25 hidden units.
46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | reward_scaling = [0.001, 0.1, 1.0] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for r in range(len(reward_scaling)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=32, 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=reward_scaling[r], 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "InvertedPendulum/" + "Reward_Scale_Tune/" + "Reward_Scale_" + str(reward_scaling[r]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Walker_Scripts/run_ddpg_walker_batch_size.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 
| 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | batch_size_values = [32, 64, 128] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for b in range(len(batch_size_values)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=batch_size_values[b], 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=1.0, 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "Walker/" + "Batch_Size_Tune/" + "Batch_Size_" + str(batch_size_values[b]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Walker_Scripts/run_ddpg_walker_learning_rates.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 
33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | critic_rate = [1e-3, 1e-4, 1e-5] 56 | actor_rate = [1e-4, 1e-5, 1e-6] 57 | 58 | learning_rate_size = len(critic_rate) 59 | 60 | 61 | 62 | for r in range(learning_rate_size): 63 | 64 | policy = DeterministicMLPPolicy( 65 | env_spec=env.spec, 66 | name="policy", 67 | # The neural network policy should have two hidden layers, each with 32 hidden units. 68 | hidden_sizes=(100, 50, 25), 69 | hidden_nonlinearity=tf.nn.relu, 70 | ) 71 | 72 | es = OUStrategy(env_spec=env.spec) 73 | 74 | qf = ContinuousMLPQFunction(env_spec=env.spec, 75 | hidden_sizes=(100, 50, 25), 76 | hidden_nonlinearity=tf.nn.relu,) 77 | 78 | 79 | for e in range(num_experiments): 80 | 81 | algo = ddpg_class( 82 | env=env, 83 | policy=policy, 84 | es=es, 85 | qf=qf, 86 | batch_size=32, 87 | max_path_length=env.horizon, 88 | epoch_length=1000, 89 | min_pool_size=10000, 90 | n_epochs=args.num_epochs, 91 | discount=0.99, 92 | scale_reward=0.1, 93 | qf_learning_rate=critic_rate[r], 94 | policy_learning_rate=actor_rate[r], 95 | # Uncomment both lines (this and the plot parameter below) to enable plotting 96 | plot=args.plot, 97 | ) 98 | 99 | 100 | run_experiment_lite( 101 | algo.train(), 102 | # log_dir=args.data_dir, 103 | # Number of parallel workers for sampling 104 | n_parallel=1, 105 | # Only keep the snapshot parameters for the last iteration 106 | snapshot_mode="last", 107 | # Specifies the seed for the experiment. If this is not provided, a random seed 108 | # will be used 109 | exp_name="reproducibility_ML/" + "DDPG/" + "Walker/" + "Learning_Rate_Tune/" + "Learning_Rate_Combo_" + str(r) + "_Experiment_" + str(e), 110 | seed=1, 111 | plot=args.plot, 112 | ) 113 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Walker_Scripts/run_ddpg_walker_network_structure.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", 
"HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | layer_1 = [400, 100, 100] 56 | layer_2 = [300, 100, 50] 57 | 58 | layer_size = 3 59 | 60 | 61 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 62 | 63 | 64 | for l in range(layer_size): 65 | 66 | policy = DeterministicMLPPolicy( 67 | env_spec=env.spec, 68 | name="policy", 69 | # The neural network policy should have two hidden layers, each with 32 hidden units. 70 | hidden_sizes=(layer_1[l], layer_2[l]), 71 | hidden_nonlinearity=tf.nn.relu, 72 | ) 73 | 74 | es = OUStrategy(env_spec=env.spec) 75 | 76 | qf = ContinuousMLPQFunction(env_spec=env.spec, 77 | hidden_sizes=(layer_1[l], layer_2[l]), 78 | hidden_nonlinearity=tf.nn.relu,) 79 | 80 | 81 | for e in range(num_experiments): 82 | 83 | algo = ddpg_class( 84 | env=env, 85 | policy=policy, 86 | es=es, 87 | qf=qf, 88 | batch_size=32, 89 | max_path_length=env.horizon, 90 | epoch_length=1000, 91 | min_pool_size=10000, 92 | n_epochs=args.num_epochs, 93 | discount=0.99, 94 | scale_reward=0.1, 95 | qf_learning_rate=1e-3, 96 | policy_learning_rate=1e-4, 97 | # Uncomment both lines (this and the plot parameter below) to enable plotting 98 | plot=args.plot, 99 | ) 100 | 101 | 102 | run_experiment_lite( 103 | algo.train(), 104 | # log_dir=args.data_dir, 105 | # Number of parallel workers for sampling 106 | n_parallel=1, 107 | # Only keep the snapshot parameters for the last iteration 108 | snapshot_mode="last", 109 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 110 | # will be used 111 | exp_name="reproducibility_ML/" + "DDPG/" + "Walker/" + "Network_Structure_Tune/" + "Layer_Size_" + str(l) + "_Experiment_" + str(e), 112 | seed=1, 113 | plot=args.plot, 114 | ) 115 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Walker_Scripts/run_ddpg_walker_reward_scale.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 | 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | reward_scaling = [0.001, 0.1, 1.0] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for r in range(len(reward_scaling)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=32, 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=reward_scaling[r], 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "Walker/" + "Reward_Scale_Tune/" + "Reward_Scale_" + str(reward_scaling[r]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /run_trpo.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 2 | from rllab.envs.normalized_env import normalize 3 | from rllab.misc.instrument import stub, run_experiment_lite 4 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 5 | from rllab.envs.gym_env import GymEnv 6 | 7 | from sandbox.rocky.tf.envs.base import TfEnv 8 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 9 | from sandbox.rocky.tf.algos.trpo import TRPO 10 | from rllab.misc import ext 11 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 12 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp 13 | 14 | import pickle 15 | import os.path as osp 16 | 17 | import tensorflow as tf 18 | 19 | import argparse 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 22 | parser.add_argument("--num_epochs", default=100, type=int) 23 | parser.add_argument("--batch_size", default=5000, type=int) 24 | parser.add_argument("--step_size", default=0.01, type=float) 25 | parser.add_argument("--reg_coeff", default=1e-5, type=float) 26 | parser.add_argument("--gae_lambda", default=1.0, type=float) 27 | parser.add_argument("--network_architecture", default=[100,50,25], type=int, nargs='*') 28 | parser.add_argument("--data_dir", default="./data/") 29 | parser.add_argument("--use_ec2", action="store_true", help="Use your ec2 instances if configured") 30 | parser.add_argument("--dont_terminate_machine", action="store_false", help="Whether to terminate your spot instance or not. 
Be careful.") 31 | parser.add_argument("--random_seed", default=1, type=int) 32 | args = parser.parse_args() 33 | 34 | stub(globals()) 35 | ext.set_seed(args.random_seed) 36 | 37 | supported_gym_envs = ["MountainCarContinuous-v0", "InvertedPendulum-v1", "InvertedDoublePendulum-v1", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 38 | 39 | other_env_class_map = { "Cartpole" : CartpoleEnv} 40 | 41 | if args.env in supported_gym_envs: 42 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 43 | else: 44 | gymenv = other_env_class_map[args.env]() 45 | 46 | #TODO: assert continuous space 47 | 48 | env = TfEnv(normalize(gymenv)) 49 | 50 | print("Using network arch: %s" % ", ".join([str(x) for x in args.network_architecture])) 51 | 52 | policy = GaussianMLPPolicy( 53 | name="policy", 54 | env_spec=env.spec, 55 | # The neural network policy should have two hidden layers, each with 32 hidden units. 56 | hidden_sizes=tuple([int(x) for x in args.network_architecture]), 57 | hidden_nonlinearity=tf.nn.relu, 58 | ) 59 | 60 | baseline = LinearFeatureBaseline(env_spec=env.spec) 61 | 62 | algo = TRPO( 63 | env=env, 64 | policy=policy, 65 | baseline=baseline, 66 | batch_size=args.batch_size, 67 | max_path_length=env.horizon, 68 | n_itr=args.num_epochs, 69 | discount=0.99, 70 | step_size=args.step_size, 71 | gae_lambda=args.gae_lambda, 72 | optimizer=ConjugateGradientOptimizer(reg_coeff=args.reg_coeff, hvp_approach=FiniteDifferenceHvp(base_eps=args.reg_coeff)) 73 | ) 74 | 75 | arch_name="_".join([str(x) for x in args.network_architecture]) 76 | pref = "TRPO_" + args.env + "_bs_" + str(args.batch_size) + "_sp_" + str(args.step_size) + "_regc_" + str(args.reg_coeff) + "_gael_" + str(args.gae_lambda) + "_na_" + arch_name + "_seed_" + str(args.random_seed) 77 | pref = pref.replace(".", "_") 78 | print("Using prefix %s" % pref) 79 | 80 | run_experiment_lite( 81 | algo.train(), 82 | log_dir=None if args.use_ec2 else args.data_dir, 83 | # Number of parallel workers for sampling 84 | n_parallel=1, 85 | # Only keep the snapshot parameters for the last iteration 86 | snapshot_mode="none", 87 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 88 | # will be used 89 | exp_prefix=pref, 90 | seed=args.random_seed, 91 | mode="ec2" if args.use_ec2 else "local", 92 | plot=False, 93 | # dry=True, 94 | terminate_machine=args.dont_terminate_machine, 95 | added_project_directories=[osp.abspath(osp.join(osp.dirname(__file__), '.'))] 96 | ) 97 | -------------------------------------------------------------------------------- /sampling_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | import numpy as np 5 | import rllab.misc.logger as logger 6 | 7 | class SimpleReplayPool(object): 8 | """ 9 | Used from https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/rllab/pool/simple_pool.py 10 | """ 11 | def __init__( 12 | self, max_pool_size, observation_dim, action_dim, 13 | replacement_policy='stochastic', replacement_prob=1.0, 14 | max_skip_episode=10): 15 | self._observation_dim = observation_dim 16 | self._action_dim = action_dim 17 | self._max_pool_size = max_pool_size 18 | self._replacement_policy = replacement_policy 19 | self._replacement_prob = replacement_prob 20 | self._max_skip_episode = max_skip_episode 21 | self._observations = np.zeros( 22 | (max_pool_size, observation_dim), 23 | ) 24 | self._actions = np.zeros( 25 | (max_pool_size, action_dim), 26 | ) 27 | self._rewards = np.zeros(max_pool_size) 28 | self._terminals = np.zeros(max_pool_size, dtype='uint8') 29 | self._initials = np.zeros(max_pool_size, dtype='uint8') 30 | self._bottom = 0 31 | self._top = 0 32 | self._size = 0 33 | 34 | def add_sample(self, observation, action, reward, terminal, initial): 35 | self.check_replacement() 36 | self._observations[self._top] = observation 37 | self._actions[self._top] = action 38 | self._rewards[self._top] = reward 39 | self._terminals[self._top] = terminal 40 | self._initials[self._top] = initial 41 | self.advance() 42 | 43 | def check_replacement(self): 44 | if self._replacement_prob < 1.0: 45 | if self._size < self._max_pool_size or \ 46 | not self._initials[self._top]: return 47 | self.advance_until_terminate() 48 | 49 | def get_skip_flag(self): 50 | if self._replacement_policy == 'full': skip = False 51 | elif self._replacement_policy == 'stochastic': 52 | skip = np.random.uniform() > self._replacement_prob 53 | else: raise NotImplementedError 54 | return skip 55 | 56 | def advance_until_terminate(self): 57 | skip = self.get_skip_flag() 58 | n_skips = 0 59 | old_top = self._top 60 | new_top = (old_top + 1) % self._max_pool_size 61 | while skip and old_top != new_top and n_skips < self._max_skip_episode: 62 | n_skips += 1 63 | self.advance() 64 | while not self._initials[self._top]: 65 | self.advance() 66 | skip = self.get_skip_flag() 67 | new_top = self._top 68 | logger.log("add_sample, skipped %d episodes, top=%d->%d"%( 69 | n_skips, old_top, new_top)) 70 | 71 | def advance(self): 72 | self._top = (self._top + 1) % self._max_pool_size 73 | if self._size >= self._max_pool_size: 74 | self._bottom = (self._bottom + 1) % self._max_pool_size 75 | else: 76 | self._size += 1 77 | 78 | def random_batch(self, batch_size): 79 | assert self._size > batch_size 80 | indices = np.zeros(batch_size, dtype='uint64') 81 | transition_indices = np.zeros(batch_size, dtype='uint64') 82 | count = 0 83 | while count < batch_size: 84 | index = np.random.randint(self._bottom, self._bottom + self._size) % self._max_pool_size 85 | # make sure that the transition is valid: if we are at the end of the pool, we need to discard 86 | # 
this sample 87 | if index == self._size - 1 and self._size <= self._max_pool_size: 88 | continue 89 | # if self._terminals[index]: 90 | # continue 91 | transition_index = (index + 1) % self._max_pool_size 92 | # make sure that the transition is valid: discard the transition if it crosses horizon-triggered resets 93 | if not self._terminals[index] and self._initials[transition_index]: 94 | continue 95 | indices[count] = index 96 | transition_indices[count] = transition_index 97 | count += 1 98 | return dict( 99 | observations=self._observations[indices], 100 | actions=self._actions[indices], 101 | rewards=self._rewards[indices], 102 | terminals=self._terminals[indices], 103 | initials=self._initials[indices], 104 | next_observations=self._observations[transition_indices] 105 | ) 106 | 107 | @property 108 | def size(self): 109 | return self._size 110 | --------------------------------------------------------------------------------
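Note on the replay buffer: `SimpleReplayPool` above is the experience pool used by the DDPG implementation in `ddpg_tensorflow/ddpg.py`; the `min_pool_size` and `batch_size` settings in the run scripts govern when it starts being sampled and how large each training batch is. The snippet below is a rough standalone usage sketch, not part of the repository: it assumes rllab is installed (so the `rllab.misc.logger` import in `sampling_utils.py` resolves), that it is run from the repository root, and it uses made-up toy dimensions and dynamics purely for illustration.

```
# Hypothetical standalone sketch of SimpleReplayPool (not part of this repository).
# Assumes rllab is importable and uses toy dimensions (observation_dim=3, action_dim=1)
# with a made-up environment step and a pretend episode length of 200.
import numpy as np
from sampling_utils import SimpleReplayPool

pool = SimpleReplayPool(max_pool_size=10000, observation_dim=3, action_dim=1)

obs = np.zeros(3)
for t in range(2000):
    action = np.random.uniform(-1.0, 1.0, size=1)
    next_obs = obs + 0.01 * action              # stand-in for an environment step
    reward = float(-np.sum(next_obs ** 2))      # stand-in reward
    terminal = (t % 200 == 199)                 # pretend episodes last 200 steps
    initial = (t % 200 == 0)                    # mark the first step of each episode
    pool.add_sample(obs, action, reward, terminal, initial)
    obs = np.zeros(3) if terminal else next_obs

# Once the pool holds more samples than the requested batch size, random_batch
# returns a dict of observations, actions, rewards, terminals, initials, and
# next_observations for a DDPG update.
batch = pool.random_batch(32)
print(batch["observations"].shape, batch["next_observations"].shape)  # (32, 3) (32, 3)
```

The `initial` flags matter because `random_batch` discards transitions where a non-terminal step is followed by an initial state, i.e. samples that straddle a horizon-triggered reset, which is why both `terminal` and `initial` markers are recorded per step.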