├── .gitignore ├── README.md ├── ave_results.py ├── ddpg_tensorflow └── ddpg.py ├── from_same_dist.py ├── plot_results.py ├── reproducibility_ML_DDPG ├── HalfCheetah_Scripts │ ├── run_ddpg_halfcheetah_batch_size.py │ ├── run_ddpg_halfcheetah_learning_rates.py │ ├── run_ddpg_halfcheetah_network_structure.py │ └── run_ddpg_halfcheetah_reward_scale.py ├── Hopper_Scripts │ ├── run_ddpg_hopper_batch_size.py │ ├── run_ddpg_hopper_learning_rates.py │ ├── run_ddpg_hopper_network_structure.py │ └── run_ddpg_hopper_reward_scale.py ├── InvertedPendulum_Scripts │ ├── run_ddpg_invpendulum_batch_size.py │ ├── run_ddpg_invpendulum_learning_rates.py │ ├── run_ddpg_invpendulum_network_structure.py │ └── run_ddpg_invpendulum_reward_scale.py └── Walker_Scripts │ ├── run_ddpg_walker_batch_size.py │ ├── run_ddpg_walker_learning_rates.py │ ├── run_ddpg_walker_network_structure.py │ └── run_ddpg_walker_reward_scale.py ├── run_trpo.py └── sampling_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reproducibility of Benchmarked Deep Reinforcement Learning Tasks for Continuous Control 2 | 3 | Policy gradient methods in reinforcement learning have become increasingly prevalent 4 | for state-of-the-art performance in continuous control tasks. Novel methods 5 | typically benchmark against a few key algorithms such as deep deterministic policy 6 | gradients and trust region policy optimization. As such, it is important to 7 | present and use consistent baselines experiments. However, this can be difficult 8 | due to general variance in the algorithms, hyper-parameter tuning, and environment 9 | stochasticity. 
We investigate and discuss: the significance of hyper-parameters in 10 | policy gradients for continuous control, general variance in the algorithms, and 11 | reproducibility of reported results. We provide guidelines on reporting novel results 12 | as comparisons against baseline methods such that future researchers can make 13 | informed decisions when investigating novel methods. 14 | 15 | ## Citation 16 | 17 | ``` 18 | @article{islam2017reproducibility, 19 | title={Reproducibility of Benchmarked Deep Reinforcement Learning Tasks for Continuous Control}, 20 | author={Islam*, Riashat and Henderson*, Peter and Gomrokchi, Maziar and Precup, Doina}, 21 | journal={ICML 2017 Reproducibility in Machine Learning Workshop}, 22 | year={2017}, 23 | url={https://arxiv.org/pdf/1708.04133.pdf} 24 | } 25 | ``` 26 | 27 | ## References 28 | 29 | Here, we use the rllab implementation of various benchmark algorithms. 30 | -------------------------------------------------------------------------------- /ave_results.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import time 3 | import numpy as np 4 | import pandas as pd 5 | from itertools import cycle 6 | 7 | from numpy import genfromtxt 8 | 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("csvs_to_compile", nargs='+', help="The csvs to compile") 13 | parser.add_argument("ave_out", help="the output file") 14 | 15 | args = parser.parse_args() 16 | 17 | data_frames = [] 18 | for f in args.csvs_to_compile: 19 | 20 | data = pd.read_csv(f) 21 | data_frames.append(data) 22 | 23 | 24 | df = pd.concat(data_frames, axis=1) 25 | # import pdb; pdb.set_trace() 26 | 27 | # df = df.swaplevel(0, 1, axis=1).sortlevel(axis=1) 28 | foo = df.groupby(level=0, axis=1).mean() 29 | 30 | foo.to_csv(args.ave_out) 31 | -------------------------------------------------------------------------------- /ddpg_tensorflow/ddpg.py: -------------------------------------------------------------------------------- 1 | # FROM: https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/sandbox/rocky/tf/algos/ddpg.py 2 | from rllab.algos.base import RLAlgorithm 3 | from rllab.misc.overrides import overrides 4 | from rllab.misc import special 5 | from sandbox.rocky.tf.misc import tensor_utils 6 | from rllab.sampler import parallel_sampler 7 | from rllab.plotter import plotter 8 | from rllab.misc import ext 9 | import rllab.misc.logger as logger 10 | #import pickle as pickle 11 | import numpy as np 12 | import pyprind 13 | import tensorflow as tf 14 | from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer 15 | #from sandbox.rocky.tf.core.parameterized import suppress_params_loading 16 | from rllab.core.serializable import Serializable 17 | from sampling_utils import SimpleReplayPool 18 | 19 | class DDPG(RLAlgorithm): 20 | """ 21 | Deep Deterministic Policy Gradient. 
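An off-policy actor-critic method (Lillicrap et al., 2015): a deterministic policy (actor) is trained to maximize a learned Q-function (critic), using minibatches drawn from an experience replay pool and separate target networks for both the policy and the Q-function. After each training step the target parameters are moved toward the learned parameters by a soft update, target <- (1 - soft_target_tau) * target + soft_target_tau * learned.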
22 | """ 23 | 24 | def __init__( 25 | self, 26 | env, 27 | policy, 28 | qf, 29 | es, 30 | batch_size=32, 31 | n_epochs=200, 32 | epoch_length=1000, 33 | min_pool_size=10000, 34 | replay_pool_size=1000000, 35 | replacement_prob=1.0, 36 | discount=0.99, 37 | max_path_length=250, 38 | qf_weight_decay=0., 39 | qf_update_method='adam', 40 | qf_learning_rate=1e-3, 41 | policy_weight_decay=0, 42 | policy_update_method='adam', 43 | policy_learning_rate=1e-3, 44 | policy_updates_ratio=1.0, 45 | eval_samples=10000, 46 | soft_target=True, 47 | soft_target_tau=0.001, 48 | n_updates_per_sample=1, 49 | scale_reward=1.0, 50 | include_horizon_terminal_transitions=False, 51 | plot=False, 52 | pause_for_plot=False): 53 | """ 54 | :param env: Environment 55 | :param policy: Policy 56 | :param qf: Q function 57 | :param es: Exploration strategy 58 | :param batch_size: Number of samples for each minibatch. 59 | :param n_epochs: Number of epochs. Policy will be evaluated after each epoch. 60 | :param epoch_length: How many timesteps for each epoch. 61 | :param min_pool_size: Minimum size of the pool to start training. 62 | :param replay_pool_size: Size of the experience replay pool. 63 | :param discount: Discount factor for the cumulative return. 64 | :param max_path_length: Discount factor for the cumulative return. 65 | :param qf_weight_decay: Weight decay factor for parameters of the Q function. 66 | :param qf_update_method: Online optimization method for training Q function. 67 | :param qf_learning_rate: Learning rate for training Q function. 68 | :param policy_weight_decay: Weight decay factor for parameters of the policy. 69 | :param policy_update_method: Online optimization method for training the policy. 70 | :param policy_learning_rate: Learning rate for training the policy. 71 | :param eval_samples: Number of samples (timesteps) for evaluating the policy. 72 | :param soft_target_tau: Interpolation parameter for doing the soft target update. 73 | :param n_updates_per_sample: Number of Q function and policy updates per new sample obtained 74 | :param scale_reward: The scaling factor applied to the rewards when training 75 | :param include_horizon_terminal_transitions: whether to include transitions with terminal=True because the 76 | horizon was reached. This might make the Q value back up less stable for certain tasks. 77 | :param plot: Whether to visualize the policy performance after each eval_interval. 78 | :param pause_for_plot: Whether to pause before continuing when plotting. 
79 | :return: 80 | """ 81 | self.env = env 82 | self.policy = policy 83 | self.qf = qf 84 | self.es = es 85 | self.batch_size = batch_size 86 | self.n_epochs = n_epochs 87 | self.epoch_length = epoch_length 88 | self.min_pool_size = min_pool_size 89 | self.replay_pool_size = replay_pool_size 90 | self.replacement_prob = replacement_prob 91 | self.discount = discount 92 | self.max_path_length = max_path_length 93 | self.qf_weight_decay = qf_weight_decay 94 | self.qf_update_method = \ 95 | FirstOrderOptimizer( 96 | update_method=qf_update_method, 97 | learning_rate=qf_learning_rate, 98 | ) 99 | self.qf_learning_rate = qf_learning_rate 100 | self.policy_weight_decay = policy_weight_decay 101 | self.policy_update_method = \ 102 | FirstOrderOptimizer( 103 | update_method=policy_update_method, 104 | learning_rate=policy_learning_rate, 105 | ) 106 | self.policy_learning_rate = policy_learning_rate 107 | self.policy_updates_ratio = policy_updates_ratio 108 | self.eval_samples = eval_samples 109 | self.soft_target_tau = soft_target_tau 110 | self.n_updates_per_sample = n_updates_per_sample 111 | self.include_horizon_terminal_transitions = include_horizon_terminal_transitions 112 | self.plot = plot 113 | self.pause_for_plot = pause_for_plot 114 | 115 | self.qf_loss_averages = [] 116 | self.policy_surr_averages = [] 117 | self.q_averages = [] 118 | self.y_averages = [] 119 | self.paths = [] 120 | self.es_path_returns = [] 121 | self.paths_samples_cnt = 0 122 | 123 | self.scale_reward = scale_reward 124 | 125 | self.train_policy_itr = 0 126 | 127 | self.opt_info = None 128 | 129 | def start_worker(self): 130 | parallel_sampler.populate_task(self.env, self.policy) 131 | if self.plot: 132 | plotter.init_plot(self.env, self.policy) 133 | 134 | @overrides 135 | def train(self): 136 | with tf.Session() as sess: 137 | sess.run(tf.global_variables_initializer()) 138 | # This seems like a rather sequential method 139 | pool = SimpleReplayPool( 140 | max_pool_size=self.replay_pool_size, 141 | observation_dim=self.env.observation_space.flat_dim, 142 | action_dim=self.env.action_space.flat_dim, 143 | replacement_prob=self.replacement_prob, 144 | ) 145 | self.start_worker() 146 | 147 | self.init_opt() 148 | # This initializes the optimizer parameters 149 | sess.run(tf.global_variables_initializer()) 150 | itr = 0 151 | path_length = 0 152 | path_return = 0 153 | terminal = False 154 | initial = False 155 | observation = self.env.reset() 156 | 157 | with tf.variable_scope("sample_policy"): 158 | sample_policy = Serializable.clone(self.policy) 159 | 160 | for epoch in range(self.n_epochs): 161 | logger.push_prefix('epoch #%d | ' % epoch) 162 | logger.log("Training started") 163 | train_qf_itr, train_policy_itr = 0, 0 164 | 165 | # updated_q_network, updated_policy_network, _, _, end_trajectory_action, end_trajectory_state = self.lp.lp_exploration() 166 | 167 | # Don't need to set the values because we're actually using the same policy/qf already 168 | # self.qf.set_param_values(updated_q_network.get_param_values()) 169 | # self.policy.set_param_values(updated_policy_network.get_param_values()) 170 | 171 | # observation = end_trajectory_state 172 | 173 | for epoch_itr in pyprind.prog_bar(range(self.epoch_length)): 174 | # Execute policy 175 | if terminal: # or path_length > self.max_path_length: 176 | # Note that if the last time step ends an episode, the very 177 | # last state and observation will be ignored and not added 178 | # to the replay pool 179 | observation = self.env.reset() 180 | self.es.reset() 181 | 
sample_policy.reset() 182 | self.es_path_returns.append(path_return) 183 | path_length = 0 184 | path_return = 0 185 | initial = True 186 | else: 187 | initial = False 188 | 189 | action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf) 190 | 191 | next_observation, reward, terminal, _ = self.env.step(action) 192 | path_length += 1 193 | path_return += reward 194 | 195 | if not terminal and path_length >= self.max_path_length: 196 | terminal = True 197 | # only include the terminal transition in this case if the flag was set 198 | if self.include_horizon_terminal_transitions: 199 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 200 | else: 201 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 202 | 203 | observation = next_observation 204 | 205 | if pool.size >= self.min_pool_size: 206 | for update_itr in range(self.n_updates_per_sample): 207 | # Train policy 208 | batch = pool.random_batch(self.batch_size) 209 | itrs = self.do_training(itr, batch) 210 | train_qf_itr += itrs[0] 211 | train_policy_itr += itrs[1] 212 | sample_policy.set_param_values(self.policy.get_param_values()) 213 | 214 | itr += 1 215 | 216 | logger.log("Training finished") 217 | logger.log("Trained qf %d steps, policy %d steps"%(train_qf_itr, train_policy_itr)) 218 | if pool.size >= self.min_pool_size: 219 | self.evaluate(epoch, pool) 220 | params = self.get_epoch_snapshot(epoch) 221 | logger.save_itr_params(epoch, params) 222 | logger.dump_tabular(with_prefix=False) 223 | logger.pop_prefix() 224 | if self.plot: 225 | self.update_plot() 226 | if self.pause_for_plot: 227 | input("Plotting evaluation run: Press Enter to " 228 | "continue...") 229 | self.env.terminate() 230 | self.policy.terminate() 231 | 232 | def init_opt(self): 233 | 234 | # First, create "target" policy and Q functions 235 | with tf.variable_scope("target_policy"): 236 | target_policy = Serializable.clone(self.policy) 237 | with tf.variable_scope("target_qf"): 238 | target_qf = Serializable.clone(self.qf) 239 | 240 | # y need to be computed first 241 | obs = self.env.observation_space.new_tensor_variable( 242 | 'obs', 243 | extra_dims=1, 244 | ) 245 | 246 | # The yi values are computed separately as above and then passed to 247 | # the training functions below 248 | action = self.env.action_space.new_tensor_variable( 249 | 'action', 250 | extra_dims=1, 251 | ) 252 | 253 | yvar = tensor_utils.new_tensor( 254 | 'ys', 255 | ndim=1, 256 | dtype=tf.float32, 257 | ) 258 | 259 | qf_weight_decay_term = 0.5 * self.qf_weight_decay * \ 260 | sum([tf.reduce_sum(tf.square(param)) for param in 261 | self.qf.get_params(regularizable=True)]) 262 | 263 | qval = self.qf.get_qval_sym(obs, action) 264 | 265 | qf_loss = tf.reduce_mean(tf.square(yvar - qval)) 266 | qf_reg_loss = qf_loss + qf_weight_decay_term 267 | 268 | policy_weight_decay_term = 0.5 * self.policy_weight_decay * \ 269 | sum([tf.reduce_sum(tf.square(param)) 270 | for param in self.policy.get_params(regularizable=True)]) 271 | policy_qval = self.qf.get_qval_sym( 272 | obs, self.policy.get_action_sym(obs), 273 | deterministic=True 274 | ) 275 | policy_surr = -tf.reduce_mean(policy_qval) 276 | 277 | policy_reg_surr = policy_surr + policy_weight_decay_term 278 | 279 | qf_input_list = [yvar, obs, action] 280 | policy_input_list = [obs] 281 | 282 | self.qf_update_method.update_opt( 283 | loss=qf_reg_loss, target=self.qf, inputs=qf_input_list) 284 | self.policy_update_method.update_opt( 285 | loss=policy_reg_surr, 
target=self.policy, inputs=policy_input_list) 286 | 287 | f_train_qf = tensor_utils.compile_function( 288 | inputs=qf_input_list, 289 | outputs=[qf_loss, qval, self.qf_update_method._train_op], 290 | ) 291 | 292 | f_train_policy = tensor_utils.compile_function( 293 | inputs=policy_input_list, 294 | outputs=[policy_surr, self.policy_update_method._train_op], 295 | ) 296 | 297 | self.opt_info = dict( 298 | f_train_qf=f_train_qf, 299 | f_train_policy=f_train_policy, 300 | target_qf=target_qf, 301 | target_policy=target_policy, 302 | ) 303 | 304 | def do_training(self, itr, batch): 305 | 306 | obs, actions, rewards, next_obs, terminals = ext.extract( 307 | batch, 308 | "observations", "actions", "rewards", "next_observations", 309 | "terminals" 310 | ) 311 | 312 | # compute the on-policy y values 313 | target_qf = self.opt_info["target_qf"] 314 | target_policy = self.opt_info["target_policy"] 315 | 316 | next_actions, _ = target_policy.get_actions(next_obs) 317 | next_qvals = target_qf.get_qval(next_obs, next_actions) 318 | 319 | ys = rewards + (1. - terminals) * self.discount * next_qvals.reshape(-1) 320 | 321 | f_train_qf = self.opt_info["f_train_qf"] 322 | qf_loss, qval, _ = f_train_qf(ys, obs, actions) 323 | target_qf.set_param_values( 324 | target_qf.get_param_values() * (1.0 - self.soft_target_tau) + 325 | self.qf.get_param_values() * self.soft_target_tau) 326 | self.qf_loss_averages.append(qf_loss) 327 | self.q_averages.append(qval) 328 | self.y_averages.append(ys) 329 | 330 | self.train_policy_itr += self.policy_updates_ratio 331 | train_policy_itr = 0 332 | while self.train_policy_itr > 0: 333 | f_train_policy = self.opt_info["f_train_policy"] 334 | policy_surr, _ = f_train_policy(obs) 335 | target_policy.set_param_values( 336 | target_policy.get_param_values() * (1.0 - self.soft_target_tau) + 337 | self.policy.get_param_values() * self.soft_target_tau) 338 | self.policy_surr_averages.append(policy_surr) 339 | self.train_policy_itr -= 1 340 | train_policy_itr += 1 341 | return 1, train_policy_itr # number of itrs qf, policy are trained 342 | 343 | def evaluate(self, epoch, pool): 344 | logger.log("Collecting samples for evaluation") 345 | paths = parallel_sampler.sample_paths( 346 | policy_params=self.policy.get_param_values(), 347 | max_samples=self.eval_samples, 348 | max_path_length=self.max_path_length, 349 | ) 350 | 351 | average_discounted_return = np.mean( 352 | [special.discount_return(path["rewards"], self.discount) for path in paths] 353 | ) 354 | 355 | returns = [sum(path["rewards"]) for path in paths] 356 | 357 | all_qs = np.concatenate(self.q_averages) 358 | all_ys = np.concatenate(self.y_averages) 359 | 360 | average_q_loss = np.mean(self.qf_loss_averages) 361 | average_policy_surr = np.mean(self.policy_surr_averages) 362 | average_action = np.mean(np.square(np.concatenate( 363 | [path["actions"] for path in paths] 364 | ))) 365 | 366 | policy_reg_param_norm = np.linalg.norm( 367 | self.policy.get_param_values(regularizable=True) 368 | ) 369 | qfun_reg_param_norm = np.linalg.norm( 370 | self.qf.get_param_values(regularizable=True) 371 | ) 372 | 373 | logger.record_tabular('Epoch', epoch) 374 | logger.record_tabular('Iteration', epoch) 375 | logger.record_tabular('AverageReturn', np.mean(returns)) 376 | logger.record_tabular('StdReturn', 377 | np.std(returns)) 378 | logger.record_tabular('MaxReturn', 379 | np.max(returns)) 380 | logger.record_tabular('MinReturn', 381 | np.min(returns)) 382 | if len(self.es_path_returns) > 0: 383 | logger.record_tabular('AverageEsReturn', 
384 | np.mean(self.es_path_returns)) 385 | logger.record_tabular('StdEsReturn', 386 | np.std(self.es_path_returns)) 387 | logger.record_tabular('MaxEsReturn', 388 | np.max(self.es_path_returns)) 389 | logger.record_tabular('MinEsReturn', 390 | np.min(self.es_path_returns)) 391 | logger.record_tabular('AverageDiscountedReturn', 392 | average_discounted_return) 393 | logger.record_tabular('AverageQLoss', average_q_loss) 394 | logger.record_tabular('AveragePolicySurr', average_policy_surr) 395 | logger.record_tabular('AverageQ', np.mean(all_qs)) 396 | logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) 397 | logger.record_tabular('AverageY', np.mean(all_ys)) 398 | logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) 399 | logger.record_tabular('AverageAbsQYDiff', 400 | np.mean(np.abs(all_qs - all_ys))) 401 | logger.record_tabular('AverageAction', average_action) 402 | 403 | logger.record_tabular('PolicyRegParamNorm', 404 | policy_reg_param_norm) 405 | logger.record_tabular('QFunRegParamNorm', 406 | qfun_reg_param_norm) 407 | 408 | self.env.log_diagnostics(paths) 409 | self.policy.log_diagnostics(paths) 410 | 411 | self.qf_loss_averages = [] 412 | self.policy_surr_averages = [] 413 | 414 | self.q_averages = [] 415 | self.y_averages = [] 416 | self.es_path_returns = [] 417 | 418 | def update_plot(self): 419 | if self.plot: 420 | plotter.update_plot(self.policy, self.max_path_length) 421 | 422 | def get_epoch_snapshot(self, epoch): 423 | return dict( 424 | env=self.env, 425 | epoch=epoch, 426 | qf=self.qf, 427 | policy=self.policy, 428 | target_qf=self.opt_info["target_qf"], 429 | target_policy=self.opt_info["target_policy"], 430 | es=self.es, 431 | ) 432 | -------------------------------------------------------------------------------- /from_same_dist.py: -------------------------------------------------------------------------------- 1 | import scipy.stats as stats 2 | import pandas as pd 3 | import numpy as np 4 | import argparse 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("paths_to_progress_csvs", nargs="+", help="All the csvs") 7 | parser.add_argument("--range_start", type=int, default=-1) 8 | parser.add_argument("--range_end", type=int, default=100000000) 9 | 10 | 11 | args = parser.parse_args() 12 | assert len(args.paths_to_progress_csvs) == 2 13 | 14 | avg_rets = [] 15 | std_dev_rets = [] 16 | trajs = [] 17 | 18 | data = pd.read_csv(args.paths_to_progress_csvs[0]) 19 | 20 | a_means = data["AverageReturn"][max(args.range_start,0):min(args.range_end, len(data["AverageReturn"]))] 21 | a_stds = data["StdReturn"][max(args.range_start,0):min(args.range_end, len(data["AverageReturn"]))] 22 | n_as = data["NumTrajs"][max(args.range_start,0):min(args.range_end, len(data["AverageReturn"]))] 23 | 24 | args.paths_to_progress_csvs 25 | data = pd.read_csv(args.paths_to_progress_csvs[1]) 26 | 27 | b_means = data["AverageReturn"][max(args.range_start,0):min(args.range_end, len(data["AverageReturn"]))] 28 | b_stds = data["StdReturn"][max(args.range_start,0):min(args.range_end, len(data["AverageReturn"]))] 29 | n_bs = data["NumTrajs"][max(args.range_start,0):min(args.range_end, len(data["AverageReturn"]))] 30 | 31 | # Do a T - test 32 | ts, ps = [],[] 33 | 34 | for a_mean, a_std, n_a, b_mean, b_std, n_b in zip(a_means, a_stds, n_as, b_means, b_stds, n_bs): 35 | t, p = stats.ttest_ind_from_stats(a_mean, a_std, n_a, b_mean, b_std, n_b, equal_var=False) 36 | ts.append(t) 37 | ps.append(p) 38 | 39 | print("t=%f,p=%f" % (np.mean(ts), np.mean(ps))) 40 | 
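The script above (from_same_dist.py) compares two training runs purely from the per-iteration summary statistics logged by rllab (AverageReturn, StdReturn, NumTrajs), running Welch's t-test at each iteration and reporting the mean t and p values over the chosen range. A minimal, self-contained sketch of the per-iteration test is given below; the statistics and file paths are illustrative placeholders, not values from the paper.

```python
# Minimal sketch of the per-iteration significance test used in
# from_same_dist.py. All numbers below are illustrative placeholders.
import scipy.stats as stats

# Summary statistics for one logged iteration of two independent runs:
# mean return, std of returns, and number of trajectories in that iteration.
a_mean, a_std, n_a = 1520.0, 310.0, 24   # run A (hypothetical)
b_mean, b_std, n_b = 1395.0, 280.0, 25   # run B (hypothetical)

# Welch's t-test computed from summary statistics (no raw returns needed).
t, p = stats.ttest_ind_from_stats(a_mean, a_std, n_a,
                                  b_mean, b_std, n_b,
                                  equal_var=False)
print("t=%f,p=%f" % (t, p))

# Typical invocation of the full script (hypothetical paths):
#   python from_same_dist.py runA/progress.csv runB/progress.csv \
#       --range_start 0 --range_end 50
```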
-------------------------------------------------------------------------------- /plot_results.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import time 3 | import numpy as np 4 | import pandas as pd 5 | from itertools import cycle 6 | 7 | from numpy import genfromtxt 8 | from numpy.random import choice 9 | 10 | 11 | 12 | def multiple_plot(average_vals_list, std_dev_list, traj_list, other_labels, env_name, smoothing_window=5, no_show=False, ignore_std=False, limit=None, extra_lines=None): 13 | fig = plt.figure(figsize=(15, 10)) 14 | colors = ["k", "red", "blue", "green", "magenta", "cyan", "brown", "purple"] 15 | color_index = 0 16 | ax = plt.subplot() # Defines ax variable by creating an empty plot 17 | 18 | # Set the tick labels font 19 | for label in (ax.get_xticklabels() + ax.get_yticklabels()): 20 | label.set_fontname('Arial') 21 | label.set_fontsize(22) 22 | 23 | index = 0 24 | for average_vals, std_dev, label, trajs in zip(average_vals_list, std_dev_list, other_labels[:len(average_vals_list)], traj_list): 25 | index += 1 26 | rewards_smoothed_1 = pd.Series(average_vals).rolling(smoothing_window, min_periods=smoothing_window).mean()[:limit] 27 | if limit is None: 28 | limit = len(rewards_smoothed_1) 29 | rewards_smoothed_1 = rewards_smoothed_1[:limit] 30 | std_dev = std_dev[:limit] 31 | 32 | fill_color = colors[color_index]#choice(colors, 1) 33 | color_index += 1 34 | cum_rwd_1, = plt.plot(range(len(rewards_smoothed_1)), rewards_smoothed_1, label=label, color=fill_color[0]) 35 | if not ignore_std: 36 | plt.fill_between(range(len(rewards_smoothed_1)), rewards_smoothed_1 + std_dev, rewards_smoothed_1 - std_dev, alpha=0.3, edgecolor=fill_color, facecolor=fill_color) 37 | 38 | if extra_lines: 39 | for lin in extra_lines: 40 | plt.plot(range(len(rewards_smoothed_1)), np.repeat(lin, len(rewards_smoothed_1)), linestyle='-.', color = colors[color_index], linewidth=2.5, label=other_labels[index]) 41 | color_index += 1 42 | index += 1 43 | 44 | axis_font = {'fontname':'Arial', 'size':'28'} 45 | #plt.legend(loc='upper left', prop={'size' : 16}) 46 | plt.legend(loc='lower right', prop={'size' : 16}) 47 | plt.xlabel("Iterations", **axis_font) 48 | plt.ylabel("Average Return", **axis_font) 49 | plt.title("%s Environment"% env_name, **axis_font) 50 | 51 | if no_show: 52 | fig.savefig('%s.png' % env_name, dpi=fig.dpi) 53 | else: 54 | plt.show() 55 | 56 | return fig 57 | 58 | # def multipe_plot(stats1, stats2, smoothing_window=50, noshow=False): 59 | # 60 | # fig = plt.figure(figsize=(30, 20)) 61 | # rewards_smoothed_1 = pd.Series(stats1).rolling(smoothing_window, min_periods=smoothing_window).mean() 62 | # 63 | # rewards_smoothed_2 = pd.Series(stats2).rolling(smoothing_window, min_periods=smoothing_window).mean() 64 | # 65 | # cum_rwd_1, = plt.plot(eps, rewards_smoothed_1, label="DDPG") 66 | # plt.fill_between( eps, rewards_smoothed_1 + ddpg_walker_std_return, rewards_smoothed_1 - ddpg_walker_std_return, alpha=0.3, edgecolor='blue', facecolor='blue') 67 | # 68 | # cum_rwd_2, = plt.plot(eps2, rewards_smoothed_2, label="Unified DDPG") 69 | # plt.fill_between( eps2, rewards_smoothed_2 + unified_ddpg_walker_std_return, rewards_smoothed_2 - unified_ddpg_walker_std_return, alpha=0.3, edgecolor='blue', facecolor='red') 70 | # 71 | # plt.legend(handles=[cum_rwd_1, cum_rwd_2]) 72 | # plt.xlabel("Epsiode") 73 | # plt.ylabel("Average Return") 74 | # plt.title("Walker Environment") 75 | # 76 | # plt.show() 77 | # 78 | # return fig 79 
| 80 | 81 | 82 | 83 | 84 | 85 | import argparse 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument("paths_to_progress_csvs", nargs="+", help="All the csvs") 88 | parser.add_argument("env_name") 89 | parser.add_argument("--save", action="store_true") 90 | parser.add_argument("--ignore_std", action="store_true") 91 | parser.add_argument('--labels', nargs='+', help='List of labels to go along with the paths', required=False) 92 | parser.add_argument('--smoothing_window', default=5, type=int) 93 | parser.add_argument('--limit', default=None, type=int) 94 | parser.add_argument('--extra_lines', nargs="+", type=float) 95 | 96 | args = parser.parse_args() 97 | 98 | avg_rets = [] 99 | std_dev_rets = [] 100 | trajs = [] 101 | 102 | for o in args.paths_to_progress_csvs: 103 | data = pd.read_csv(o) 104 | avg_ret = np.array(data["AverageReturn"]) 105 | std_dev_ret = np.array(data["StdReturn"]) 106 | trajs.append(np.cumsum(np.array(data["NumTrajs"]))) 107 | avg_rets.append(avg_ret) 108 | std_dev_rets.append(std_dev_ret) 109 | 110 | multiple_plot(avg_rets, std_dev_rets, trajs, args.labels, args.env_name, smoothing_window=args.smoothing_window, no_show=args.save, ignore_std=args.ignore_std, limit=args.limit, extra_lines=args.extra_lines) 111 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/HalfCheetah_Scripts/run_ddpg_halfcheetah_batch_size.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 | 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | batch_size_values = [32, 64, 128] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for b in range(len(batch_size_values)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=batch_size_values[b], 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=1.0, 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "HalfCheetah/" + "Batch_Size_Tune/" + "Batch_Size_" + str(batch_size_values[b]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/HalfCheetah_Scripts/run_ddpg_halfcheetah_learning_rates.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = 
{"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | critic_rate = [1e-3, 1e-4, 1e-5] 56 | actor_rate = [1e-4, 1e-5, 1e-6] 57 | 58 | learning_rate_size = len(critic_rate) 59 | 60 | 61 | 62 | for r in range(learning_rate_size): 63 | 64 | policy = DeterministicMLPPolicy( 65 | env_spec=env.spec, 66 | name="policy", 67 | # The neural network policy should have two hidden layers, each with 32 hidden units. 68 | hidden_sizes=(100, 50, 25), 69 | hidden_nonlinearity=tf.nn.relu, 70 | ) 71 | 72 | es = OUStrategy(env_spec=env.spec) 73 | 74 | qf = ContinuousMLPQFunction(env_spec=env.spec, 75 | hidden_sizes=(100, 50, 25), 76 | hidden_nonlinearity=tf.nn.relu,) 77 | 78 | 79 | for e in range(num_experiments): 80 | 81 | algo = ddpg_class( 82 | env=env, 83 | policy=policy, 84 | es=es, 85 | qf=qf, 86 | batch_size=32, 87 | max_path_length=env.horizon, 88 | epoch_length=1000, 89 | min_pool_size=10000, 90 | n_epochs=args.num_epochs, 91 | discount=0.99, 92 | scale_reward=0.1, 93 | qf_learning_rate=critic_rate[r], 94 | policy_learning_rate=actor_rate[r], 95 | # Uncomment both lines (this and the plot parameter below) to enable plotting 96 | plot=args.plot, 97 | ) 98 | 99 | 100 | run_experiment_lite( 101 | algo.train(), 102 | # log_dir=args.data_dir, 103 | # Number of parallel workers for sampling 104 | n_parallel=1, 105 | # Only keep the snapshot parameters for the last iteration 106 | snapshot_mode="last", 107 | # Specifies the seed for the experiment. If this is not provided, a random seed 108 | # will be used 109 | exp_name="reproducibility_ML/" + "DDPG/" + "HalfCheetah/" + "Learning_Rate_Tune/" + "Learning_Rate_Combo_" + str(r) + "_Experiment_" + str(e), 110 | seed=1, 111 | plot=args.plot, 112 | ) 113 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/HalfCheetah_Scripts/run_ddpg_halfcheetah_network_structure.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = 
GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | layer_1 = [400, 100, 100] 56 | layer_2 = [300, 100, 50] 57 | 58 | layer_size = 3 59 | 60 | 61 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 62 | 63 | 64 | for l in range(layer_size): 65 | 66 | policy = DeterministicMLPPolicy( 67 | env_spec=env.spec, 68 | name="policy", 69 | # The neural network policy should have two hidden layers, each with 32 hidden units. 70 | hidden_sizes=(layer_1[l], layer_2[l]), 71 | hidden_nonlinearity=tf.nn.relu, 72 | ) 73 | 74 | es = OUStrategy(env_spec=env.spec) 75 | 76 | qf = ContinuousMLPQFunction(env_spec=env.spec, 77 | hidden_sizes=(layer_1[l], layer_2[l]), 78 | hidden_nonlinearity=tf.nn.relu,) 79 | 80 | 81 | for e in range(num_experiments): 82 | 83 | algo = ddpg_class( 84 | env=env, 85 | policy=policy, 86 | es=es, 87 | qf=qf, 88 | batch_size=32, 89 | max_path_length=env.horizon, 90 | epoch_length=1000, 91 | min_pool_size=10000, 92 | n_epochs=args.num_epochs, 93 | discount=0.99, 94 | scale_reward=0.1, 95 | qf_learning_rate=1e-3, 96 | policy_learning_rate=1e-4, 97 | # Uncomment both lines (this and the plot parameter below) to enable plotting 98 | plot=args.plot, 99 | ) 100 | 101 | 102 | run_experiment_lite( 103 | algo.train(), 104 | # log_dir=args.data_dir, 105 | # Number of parallel workers for sampling 106 | n_parallel=1, 107 | # Only keep the snapshot parameters for the last iteration 108 | snapshot_mode="last", 109 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 110 | # will be used 111 | exp_name="reproducibility_ML/" + "DDPG/" + "HalfCheetah/" + "Network_Structure_Tune/" + "Layer_Size_" + str(l) + "_Experiment_" + str(e), 112 | seed=1, 113 | plot=args.plot, 114 | ) 115 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/HalfCheetah_Scripts/run_ddpg_halfcheetah_reward_scale.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 | 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | reward_scaling = [0.001, 0.1, 1.0] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for r in range(len(reward_scaling)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=32, 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=reward_scaling[r], 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "HalfCheetah/" + "Reward_Scale_Tune/" + "Reward_Scale_" + str(reward_scaling[r]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Hopper_Scripts/run_ddpg_hopper_batch_size.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 | 42 
| policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | batch_size_values = [32, 64, 128] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for b in range(len(batch_size_values)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=batch_size_values[b], 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=1.0, 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "Hopper/" + "Batch_Size_Tune/" + "Batch_Size_" + str(batch_size_values[b]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Hopper_Scripts/run_ddpg_hopper_learning_rates.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | 
# gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | critic_rate = [1e-3, 1e-4, 1e-5] 56 | actor_rate = [1e-4, 1e-5, 1e-6] 57 | 58 | learning_rate_size = len(critic_rate) 59 | 60 | 61 | 62 | for r in range(learning_rate_size): 63 | 64 | policy = DeterministicMLPPolicy( 65 | env_spec=env.spec, 66 | name="policy", 67 | # The neural network policy should have two hidden layers, each with 32 hidden units. 68 | hidden_sizes=(100, 50, 25), 69 | hidden_nonlinearity=tf.nn.relu, 70 | ) 71 | 72 | es = OUStrategy(env_spec=env.spec) 73 | 74 | qf = ContinuousMLPQFunction(env_spec=env.spec, 75 | hidden_sizes=(100, 50, 25), 76 | hidden_nonlinearity=tf.nn.relu,) 77 | 78 | 79 | for e in range(num_experiments): 80 | 81 | algo = ddpg_class( 82 | env=env, 83 | policy=policy, 84 | es=es, 85 | qf=qf, 86 | batch_size=32, 87 | max_path_length=env.horizon, 88 | epoch_length=1000, 89 | min_pool_size=10000, 90 | n_epochs=args.num_epochs, 91 | discount=0.99, 92 | scale_reward=0.1, 93 | qf_learning_rate=critic_rate[r], 94 | policy_learning_rate=actor_rate[r], 95 | # Uncomment both lines (this and the plot parameter below) to enable plotting 96 | plot=args.plot, 97 | ) 98 | 99 | 100 | run_experiment_lite( 101 | algo.train(), 102 | # log_dir=args.data_dir, 103 | # Number of parallel workers for sampling 104 | n_parallel=1, 105 | # Only keep the snapshot parameters for the last iteration 106 | snapshot_mode="last", 107 | # Specifies the seed for the experiment. If this is not provided, a random seed 108 | # will be used 109 | exp_name="reproducibility_ML/" + "DDPG/" + "Hopper/" + "Learning_Rate_Tune/" + "Learning_Rate_Combo_" + str(r) + "_Experiment_" + str(e), 110 | seed=1, 111 | plot=args.plot, 112 | ) 113 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Hopper_Scripts/run_ddpg_hopper_network_structure.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", 
"HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | layer_1 = [400, 100, 100] 56 | layer_2 = [300, 100, 50] 57 | 58 | layer_size = 3 59 | 60 | 61 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 62 | 63 | 64 | for l in range(layer_size): 65 | 66 | policy = DeterministicMLPPolicy( 67 | env_spec=env.spec, 68 | name="policy", 69 | # The neural network policy should have two hidden layers, each with 32 hidden units. 70 | hidden_sizes=(layer_1[l], layer_2[l]), 71 | hidden_nonlinearity=tf.nn.relu, 72 | ) 73 | 74 | es = OUStrategy(env_spec=env.spec) 75 | 76 | qf = ContinuousMLPQFunction(env_spec=env.spec, 77 | hidden_sizes=(layer_1[l], layer_2[l]), 78 | hidden_nonlinearity=tf.nn.relu,) 79 | 80 | 81 | for e in range(num_experiments): 82 | 83 | algo = ddpg_class( 84 | env=env, 85 | policy=policy, 86 | es=es, 87 | qf=qf, 88 | batch_size=32, 89 | max_path_length=env.horizon, 90 | epoch_length=1000, 91 | min_pool_size=10000, 92 | n_epochs=args.num_epochs, 93 | discount=0.99, 94 | scale_reward=0.1, 95 | qf_learning_rate=1e-3, 96 | policy_learning_rate=1e-4, 97 | # Uncomment both lines (this and the plot parameter below) to enable plotting 98 | plot=args.plot, 99 | ) 100 | 101 | 102 | run_experiment_lite( 103 | algo.train(), 104 | # log_dir=args.data_dir, 105 | # Number of parallel workers for sampling 106 | n_parallel=1, 107 | # Only keep the snapshot parameters for the last iteration 108 | snapshot_mode="last", 109 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 110 | # will be used 111 | exp_name="reproducibility_ML/" + "DDPG/" + "Hopper/" + "Network_Structure_Tune/" + "Layer_Size_" + str(l) + "_Experiment_" + str(e), 112 | seed=1, 113 | plot=args.plot, 114 | ) 115 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Hopper_Scripts/run_ddpg_hopper_reward_scale.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 | 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | reward_scaling = [0.001, 0.1, 1.0] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for r in range(len(reward_scaling)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=32, 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=reward_scaling[r], 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "Hopper/" + "Reward_Scale_Tune/" + "Reward_Scale_" + str(reward_scaling[r]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/InvertedPendulum_Scripts/run_ddpg_invpendulum_batch_size.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Pendulum-v0", "InvertedPendulum-v1", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = 
TfEnv(normalize(gymenv)) 41 | 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | batch_size_values = [32, 64, 128] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for b in range(len(batch_size_values)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=batch_size_values[b], 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=1.0, 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "InvertedPendulum/" + "Batch_Size_Tune/" + "Batch_Size_" + str(batch_size_values[b]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/InvertedPendulum_Scripts/run_ddpg_invpendulum_learning_rates.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "InvertedPendulum-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | 
gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | critic_rate = [1e-3, 1e-4, 1e-5] 56 | actor_rate = [1e-4, 1e-5, 1e-6] 57 | 58 | learning_rate_size = len(critic_rate) 59 | 60 | 61 | 62 | for r in range(learning_rate_size): 63 | 64 | policy = DeterministicMLPPolicy( 65 | env_spec=env.spec, 66 | name="policy", 67 | # The neural network policy should have two hidden layers, each with 32 hidden units. 68 | hidden_sizes=(100, 50, 25), 69 | hidden_nonlinearity=tf.nn.relu, 70 | ) 71 | 72 | es = OUStrategy(env_spec=env.spec) 73 | 74 | qf = ContinuousMLPQFunction(env_spec=env.spec, 75 | hidden_sizes=(100, 50, 25), 76 | hidden_nonlinearity=tf.nn.relu,) 77 | 78 | 79 | for e in range(num_experiments): 80 | 81 | algo = ddpg_class( 82 | env=env, 83 | policy=policy, 84 | es=es, 85 | qf=qf, 86 | batch_size=32, 87 | max_path_length=env.horizon, 88 | epoch_length=1000, 89 | min_pool_size=10000, 90 | n_epochs=args.num_epochs, 91 | discount=0.99, 92 | scale_reward=0.1, 93 | qf_learning_rate=critic_rate[r], 94 | policy_learning_rate=actor_rate[r], 95 | # Uncomment both lines (this and the plot parameter below) to enable plotting 96 | plot=args.plot, 97 | ) 98 | 99 | 100 | run_experiment_lite( 101 | algo.train(), 102 | # log_dir=args.data_dir, 103 | # Number of parallel workers for sampling 104 | n_parallel=1, 105 | # Only keep the snapshot parameters for the last iteration 106 | snapshot_mode="last", 107 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 108 | # will be used 109 | exp_name="reproducibility_ML/" + "DDPG/" + "InvertedPendulum/" + "Learning_Rate_Tune/" + "Learning_Rate_Combo_" + str(r) + "_Experiment_" + str(e), 110 | seed=1, 111 | plot=args.plot, 112 | ) 113 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/InvertedPendulum_Scripts/run_ddpg_invpendulum_network_structure.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "InvertedPendulum-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | layer_1 = [400, 100, 100] 56 | layer_2 = [300, 100, 50] 57 | 58 | layer_size = 3 59 | 60 | 61 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 62 | 63 | 64 | for l in range(layer_size): 65 | 66 | policy = DeterministicMLPPolicy( 67 | env_spec=env.spec, 68 | name="policy", 69 | # The policy network has two hidden layers; their sizes are swept over (400, 300), (100, 100), and (100, 50).
70 | hidden_sizes=(layer_1[l], layer_2[l]), 71 | hidden_nonlinearity=tf.nn.relu, 72 | ) 73 | 74 | es = OUStrategy(env_spec=env.spec) 75 | 76 | qf = ContinuousMLPQFunction(env_spec=env.spec, 77 | hidden_sizes=(layer_1[l], layer_2[l]), 78 | hidden_nonlinearity=tf.nn.relu,) 79 | 80 | 81 | for e in range(num_experiments): 82 | 83 | algo = ddpg_class( 84 | env=env, 85 | policy=policy, 86 | es=es, 87 | qf=qf, 88 | batch_size=32, 89 | max_path_length=env.horizon, 90 | epoch_length=1000, 91 | min_pool_size=10000, 92 | n_epochs=args.num_epochs, 93 | discount=0.99, 94 | scale_reward=0.1, 95 | qf_learning_rate=1e-3, 96 | policy_learning_rate=1e-4, 97 | # Plotting during training is enabled by passing the --plot flag 98 | plot=args.plot, 99 | ) 100 | 101 | 102 | run_experiment_lite( 103 | algo.train(), 104 | # log_dir=args.data_dir, 105 | # Number of parallel workers for sampling 106 | n_parallel=1, 107 | # Only keep the snapshot parameters for the last iteration 108 | snapshot_mode="last", 109 | # Specifies the seed for the experiment. If this is not provided, a random seed 110 | # will be used 111 | exp_name="reproducibility_ML/" + "DDPG/" + "InvertedPendulum/" + "Network_Structure_Tune/" + "Layer_Size_" + str(l) + "_Experiment_" + str(e), 112 | seed=1, 113 | plot=args.plot, 114 | ) 115 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/InvertedPendulum_Scripts/run_ddpg_invpendulum_reward_scale.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "InvertedPendulum-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 | 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy has three hidden layers with 100, 50, and 25 hidden units.
46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | reward_scaling = [0.001, 0.1, 1.0] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for r in range(len(reward_scaling)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=32, 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=reward_scaling[r], 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "InvertedPendulum/" + "Reward_Scale_Tune/" + "Reward_Scale_" + str(reward_scaling[r]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Walker_Scripts/run_ddpg_walker_batch_size.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 
| 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | batch_size_values = [32, 64, 128] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for b in range(len(batch_size_values)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=batch_size_values[b], 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=1.0, 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "Walker/" + "Batch_Size_Tune/" + "Batch_Size_" + str(batch_size_values[b]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Walker_Scripts/run_ddpg_walker_learning_rates.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 
33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | critic_rate = [1e-3, 1e-4, 1e-5] 56 | actor_rate = [1e-4, 1e-5, 1e-6] 57 | 58 | learning_rate_size = len(critic_rate) 59 | 60 | 61 | 62 | for r in range(learning_rate_size): 63 | 64 | policy = DeterministicMLPPolicy( 65 | env_spec=env.spec, 66 | name="policy", 67 | # The neural network policy should have two hidden layers, each with 32 hidden units. 68 | hidden_sizes=(100, 50, 25), 69 | hidden_nonlinearity=tf.nn.relu, 70 | ) 71 | 72 | es = OUStrategy(env_spec=env.spec) 73 | 74 | qf = ContinuousMLPQFunction(env_spec=env.spec, 75 | hidden_sizes=(100, 50, 25), 76 | hidden_nonlinearity=tf.nn.relu,) 77 | 78 | 79 | for e in range(num_experiments): 80 | 81 | algo = ddpg_class( 82 | env=env, 83 | policy=policy, 84 | es=es, 85 | qf=qf, 86 | batch_size=32, 87 | max_path_length=env.horizon, 88 | epoch_length=1000, 89 | min_pool_size=10000, 90 | n_epochs=args.num_epochs, 91 | discount=0.99, 92 | scale_reward=0.1, 93 | qf_learning_rate=critic_rate[r], 94 | policy_learning_rate=actor_rate[r], 95 | # Uncomment both lines (this and the plot parameter below) to enable plotting 96 | plot=args.plot, 97 | ) 98 | 99 | 100 | run_experiment_lite( 101 | algo.train(), 102 | # log_dir=args.data_dir, 103 | # Number of parallel workers for sampling 104 | n_parallel=1, 105 | # Only keep the snapshot parameters for the last iteration 106 | snapshot_mode="last", 107 | # Specifies the seed for the experiment. If this is not provided, a random seed 108 | # will be used 109 | exp_name="reproducibility_ML/" + "DDPG/" + "Walker/" + "Learning_Rate_Tune/" + "Learning_Rate_Combo_" + str(r) + "_Experiment_" + str(e), 110 | seed=1, 111 | plot=args.plot, 112 | ) 113 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Walker_Scripts/run_ddpg_walker_network_structure.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", 
"HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | ddpg_type_map = {"regular" : DDPG} 41 | 42 | 43 | ddpg_class = ddpg_type_map[args.type] 44 | 45 | 46 | 47 | env = TfEnv(normalize(gymenv)) 48 | 49 | 50 | 51 | 52 | ## loops: 53 | num_experiments = 5 54 | 55 | layer_1 = [400, 100, 100] 56 | layer_2 = [300, 100, 50] 57 | 58 | layer_size = 3 59 | 60 | 61 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 62 | 63 | 64 | for l in range(layer_size): 65 | 66 | policy = DeterministicMLPPolicy( 67 | env_spec=env.spec, 68 | name="policy", 69 | # The neural network policy should have two hidden layers, each with 32 hidden units. 70 | hidden_sizes=(layer_1[l], layer_2[l]), 71 | hidden_nonlinearity=tf.nn.relu, 72 | ) 73 | 74 | es = OUStrategy(env_spec=env.spec) 75 | 76 | qf = ContinuousMLPQFunction(env_spec=env.spec, 77 | hidden_sizes=(layer_1[l], layer_2[l]), 78 | hidden_nonlinearity=tf.nn.relu,) 79 | 80 | 81 | for e in range(num_experiments): 82 | 83 | algo = ddpg_class( 84 | env=env, 85 | policy=policy, 86 | es=es, 87 | qf=qf, 88 | batch_size=32, 89 | max_path_length=env.horizon, 90 | epoch_length=1000, 91 | min_pool_size=10000, 92 | n_epochs=args.num_epochs, 93 | discount=0.99, 94 | scale_reward=0.1, 95 | qf_learning_rate=1e-3, 96 | policy_learning_rate=1e-4, 97 | # Uncomment both lines (this and the plot parameter below) to enable plotting 98 | plot=args.plot, 99 | ) 100 | 101 | 102 | run_experiment_lite( 103 | algo.train(), 104 | # log_dir=args.data_dir, 105 | # Number of parallel workers for sampling 106 | n_parallel=1, 107 | # Only keep the snapshot parameters for the last iteration 108 | snapshot_mode="last", 109 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 110 | # will be used 111 | exp_name="reproducibility_ML/" + "DDPG/" + "Walker/" + "Network_Structure_Tune/" + "Layer_Size_" + str(l) + "_Experiment_" + str(e), 112 | seed=1, 113 | plot=args.plot, 114 | ) 115 | -------------------------------------------------------------------------------- /reproducibility_ML_DDPG/Walker_Scripts/run_ddpg_walker_reward_scale.py: -------------------------------------------------------------------------------- 1 | from ddpg_tensorflow.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from sandbox.rocky.tf.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from sandbox.rocky.tf.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.envs.gym_env import GymEnv 11 | from rllab.misc import ext 12 | import pickle 13 | import tensorflow as tf 14 | 15 | import argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 18 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 19 | parser.add_argument("--num_epochs", default=100, type=int) 20 | parser.add_argument("--plot", action="store_true") 21 | # parser.add_argument("--data_dir", default="./data/") 22 | args = parser.parse_args() 23 | 24 | stub(globals()) 25 | ext.set_seed(1) 26 | 27 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 28 | 29 | other_env_class_map = { "Cartpole" : CartpoleEnv} 30 | 31 | if args.env in supported_gym_envs: 32 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 33 | # gymenv.env.seed(1) 34 | else: 35 | gymenv = other_env_class_map[args.env]() 36 | 37 | #TODO: assert continuous space 38 | 39 | 40 | env = TfEnv(normalize(gymenv)) 41 | 42 | policy = DeterministicMLPPolicy( 43 | env_spec=env.spec, 44 | name="policy", 45 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
46 | hidden_sizes=(100, 50, 25), 47 | hidden_nonlinearity=tf.nn.relu, 48 | ) 49 | 50 | es = OUStrategy(env_spec=env.spec) 51 | 52 | qf = ContinuousMLPQFunction(env_spec=env.spec, 53 | hidden_sizes=(100,100), 54 | hidden_nonlinearity=tf.nn.relu,) 55 | 56 | 57 | ddpg_type_map = {"regular" : DDPG} 58 | 59 | 60 | ddpg_class = ddpg_type_map[args.type] 61 | 62 | 63 | ## loops: 64 | num_experiments = 5 65 | reward_scaling = [0.001, 0.1, 1.0] 66 | 67 | 68 | # n_itr = int(np.ceil(float(n_episodes*max_path_length)/flags['batch_size'])) 69 | 70 | for r in range(len(reward_scaling)): 71 | 72 | for e in range(num_experiments): 73 | 74 | algo = ddpg_class( 75 | env=env, 76 | policy=policy, 77 | es=es, 78 | qf=qf, 79 | batch_size=32, 80 | max_path_length=env.horizon, 81 | epoch_length=1000, 82 | min_pool_size=10000, 83 | n_epochs=args.num_epochs, 84 | discount=0.99, 85 | scale_reward=reward_scaling[r], 86 | qf_learning_rate=1e-3, 87 | policy_learning_rate=1e-4, 88 | # Uncomment both lines (this and the plot parameter below) to enable plotting 89 | plot=args.plot, 90 | ) 91 | 92 | 93 | run_experiment_lite( 94 | algo.train(), 95 | # log_dir=args.data_dir, 96 | # Number of parallel workers for sampling 97 | n_parallel=1, 98 | # Only keep the snapshot parameters for the last iteration 99 | snapshot_mode="last", 100 | # Specifies the seed for the experiment. If this is not provided, a random seed 101 | # will be used 102 | exp_name="reproducibility_ML/" + "DDPG/" + "Walker/" + "Reward_Scale_Tune/" + "Reward_Scale_" + str(reward_scaling[r]) + "_Experiment_" + str(e), 103 | seed=1, 104 | plot=args.plot, 105 | ) 106 | -------------------------------------------------------------------------------- /run_trpo.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 2 | from rllab.envs.normalized_env import normalize 3 | from rllab.misc.instrument import stub, run_experiment_lite 4 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 5 | from rllab.envs.gym_env import GymEnv 6 | 7 | from sandbox.rocky.tf.envs.base import TfEnv 8 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 9 | from sandbox.rocky.tf.algos.trpo import TRPO 10 | from rllab.misc import ext 11 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 12 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp 13 | 14 | import pickle 15 | import os.path as osp 16 | 17 | import tensorflow as tf 18 | 19 | import argparse 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 22 | parser.add_argument("--num_epochs", default=100, type=int) 23 | parser.add_argument("--batch_size", default=5000, type=int) 24 | parser.add_argument("--step_size", default=0.01, type=float) 25 | parser.add_argument("--reg_coeff", default=1e-5, type=float) 26 | parser.add_argument("--gae_lambda", default=1.0, type=float) 27 | parser.add_argument("--network_architecture", default=[100,50,25], type=int, nargs='*') 28 | parser.add_argument("--data_dir", default="./data/") 29 | parser.add_argument("--use_ec2", action="store_true", help="Use your ec2 instances if configured") 30 | parser.add_argument("--dont_terminate_machine", action="store_false", help="Whether to terminate your spot instance or not. 
Be careful.") 31 | parser.add_argument("--random_seed", default=1, type=int) 32 | args = parser.parse_args() 33 | 34 | stub(globals()) 35 | ext.set_seed(args.random_seed) 36 | 37 | supported_gym_envs = ["MountainCarContinuous-v0", "InvertedPendulum-v1", "InvertedDoublePendulum-v1", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 38 | 39 | other_env_class_map = { "Cartpole" : CartpoleEnv} 40 | 41 | if args.env in supported_gym_envs: 42 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 43 | else: 44 | gymenv = other_env_class_map[args.env]() 45 | 46 | #TODO: assert continuous space 47 | 48 | env = TfEnv(normalize(gymenv)) 49 | 50 | print("Using network arch: %s" % ", ".join([str(x) for x in args.network_architecture])) 51 | 52 | policy = GaussianMLPPolicy( 53 | name="policy", 54 | env_spec=env.spec, 55 | # The neural network policy should have two hidden layers, each with 32 hidden units. 56 | hidden_sizes=tuple([int(x) for x in args.network_architecture]), 57 | hidden_nonlinearity=tf.nn.relu, 58 | ) 59 | 60 | baseline = LinearFeatureBaseline(env_spec=env.spec) 61 | 62 | algo = TRPO( 63 | env=env, 64 | policy=policy, 65 | baseline=baseline, 66 | batch_size=args.batch_size, 67 | max_path_length=env.horizon, 68 | n_itr=args.num_epochs, 69 | discount=0.99, 70 | step_size=args.step_size, 71 | gae_lambda=args.gae_lambda, 72 | optimizer=ConjugateGradientOptimizer(reg_coeff=args.reg_coeff, hvp_approach=FiniteDifferenceHvp(base_eps=args.reg_coeff)) 73 | ) 74 | 75 | arch_name="_".join([str(x) for x in args.network_architecture]) 76 | pref = "TRPO_" + args.env + "_bs_" + str(args.batch_size) + "_sp_" + str(args.step_size) + "_regc_" + str(args.reg_coeff) + "_gael_" + str(args.gae_lambda) + "_na_" + arch_name + "_seed_" + str(args.random_seed) 77 | pref = pref.replace(".", "_") 78 | print("Using prefix %s" % pref) 79 | 80 | run_experiment_lite( 81 | algo.train(), 82 | log_dir=None if args.use_ec2 else args.data_dir, 83 | # Number of parallel workers for sampling 84 | n_parallel=1, 85 | # Only keep the snapshot parameters for the last iteration 86 | snapshot_mode="none", 87 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 88 | # will be used 89 | exp_prefix=pref, 90 | seed=args.random_seed, 91 | mode="ec2" if args.use_ec2 else "local", 92 | plot=False, 93 | # dry=True, 94 | terminate_machine=args.dont_terminate_machine, 95 | added_project_directories=[osp.abspath(osp.join(osp.dirname(__file__), '.'))] 96 | ) 97 | -------------------------------------------------------------------------------- /sampling_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | import numpy as np 5 | import rllab.misc.logger as logger 6 | 7 | class SimpleReplayPool(object): 8 | """ 9 | Used from https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/rllab/pool/simple_pool.py 10 | """ 11 | def __init__( 12 | self, max_pool_size, observation_dim, action_dim, 13 | replacement_policy='stochastic', replacement_prob=1.0, 14 | max_skip_episode=10): 15 | self._observation_dim = observation_dim 16 | self._action_dim = action_dim 17 | self._max_pool_size = max_pool_size 18 | self._replacement_policy = replacement_policy 19 | self._replacement_prob = replacement_prob 20 | self._max_skip_episode = max_skip_episode 21 | self._observations = np.zeros( 22 | (max_pool_size, observation_dim), 23 | ) 24 | self._actions = np.zeros( 25 | (max_pool_size, action_dim), 26 | ) 27 | self._rewards = np.zeros(max_pool_size) 28 | self._terminals = np.zeros(max_pool_size, dtype='uint8') 29 | self._initials = np.zeros(max_pool_size, dtype='uint8') 30 | self._bottom = 0 31 | self._top = 0 32 | self._size = 0 33 | 34 | def add_sample(self, observation, action, reward, terminal, initial): 35 | self.check_replacement() 36 | self._observations[self._top] = observation 37 | self._actions[self._top] = action 38 | self._rewards[self._top] = reward 39 | self._terminals[self._top] = terminal 40 | self._initials[self._top] = initial 41 | self.advance() 42 | 43 | def check_replacement(self): 44 | if self._replacement_prob < 1.0: 45 | if self._size < self._max_pool_size or \ 46 | not self._initials[self._top]: return 47 | self.advance_until_terminate() 48 | 49 | def get_skip_flag(self): 50 | if self._replacement_policy == 'full': skip = False 51 | elif self._replacement_policy == 'stochastic': 52 | skip = np.random.uniform() > self._replacement_prob 53 | else: raise NotImplementedError 54 | return skip 55 | 56 | def advance_until_terminate(self): 57 | skip = self.get_skip_flag() 58 | n_skips = 0 59 | old_top = self._top 60 | new_top = (old_top + 1) % self._max_pool_size 61 | while skip and old_top != new_top and n_skips < self._max_skip_episode: 62 | n_skips += 1 63 | self.advance() 64 | while not self._initials[self._top]: 65 | self.advance() 66 | skip = self.get_skip_flag() 67 | new_top = self._top 68 | logger.log("add_sample, skipped %d episodes, top=%d->%d"%( 69 | n_skips, old_top, new_top)) 70 | 71 | def advance(self): 72 | self._top = (self._top + 1) % self._max_pool_size 73 | if self._size >= self._max_pool_size: 74 | self._bottom = (self._bottom + 1) % self._max_pool_size 75 | else: 76 | self._size += 1 77 | 78 | def random_batch(self, batch_size): 79 | assert self._size > batch_size 80 | indices = np.zeros(batch_size, dtype='uint64') 81 | transition_indices = np.zeros(batch_size, dtype='uint64') 82 | count = 0 83 | while count < batch_size: 84 | index = np.random.randint(self._bottom, self._bottom + self._size) % self._max_pool_size 85 | # make sure that the transition is valid: if we are at the end of the pool, we need to discard 86 | # 
this sample 87 | if index == self._size - 1 and self._size <= self._max_pool_size: 88 | continue 89 | # if self._terminals[index]: 90 | # continue 91 | transition_index = (index + 1) % self._max_pool_size 92 | # make sure that the transition is valid: discard the transition if it crosses horizon-triggered resets 93 | if not self._terminals[index] and self._initials[transition_index]: 94 | continue 95 | indices[count] = index 96 | transition_indices[count] = transition_index 97 | count += 1 98 | return dict( 99 | observations=self._observations[indices], 100 | actions=self._actions[indices], 101 | rewards=self._rewards[indices], 102 | terminals=self._terminals[indices], 103 | initials=self._initials[indices], 104 | next_observations=self._observations[transition_indices] 105 | ) 106 | 107 | @property 108 | def size(self): 109 | return self._size 110 | --------------------------------------------------------------------------------
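Note on the replay buffer: `SimpleReplayPool` above is the experience pool used by the DDPG implementation in `ddpg_tensorflow/ddpg.py`; the `min_pool_size` and `batch_size` settings in the run scripts govern when it starts being sampled and how large each training batch is. The snippet below is a rough standalone usage sketch, not part of the repository: it assumes rllab is installed (so the `rllab.misc.logger` import in `sampling_utils.py` resolves), that it is run from the repository root, and it uses made-up toy dimensions and dynamics purely for illustration.

```
# Hypothetical standalone sketch of SimpleReplayPool (not part of this repository).
# Assumes rllab is importable and uses toy dimensions (observation_dim=3, action_dim=1)
# with a made-up environment step and a pretend episode length of 200.
import numpy as np
from sampling_utils import SimpleReplayPool

pool = SimpleReplayPool(max_pool_size=10000, observation_dim=3, action_dim=1)

obs = np.zeros(3)
for t in range(2000):
    action = np.random.uniform(-1.0, 1.0, size=1)
    next_obs = obs + 0.01 * action              # stand-in for an environment step
    reward = float(-np.sum(next_obs ** 2))      # stand-in reward
    terminal = (t % 200 == 199)                 # pretend episodes last 200 steps
    initial = (t % 200 == 0)                    # mark the first step of each episode
    pool.add_sample(obs, action, reward, terminal, initial)
    obs = np.zeros(3) if terminal else next_obs

# Once the pool holds more samples than the requested batch size, random_batch
# returns a dict of observations, actions, rewards, terminals, initials, and
# next_observations for a DDPG update.
batch = pool.random_batch(32)
print(batch["observations"].shape, batch["next_observations"].shape)  # (32, 3) (32, 3)
```

The `initial` flags matter because `random_batch` discards transitions where a non-terminal step is followed by an initial state, i.e. samples that straddle a horizon-triggered reset, which is why both `terminal` and `initial` markers are recorded per step.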