├── .gitignore ├── LICENSE ├── README.md ├── evaluation ├── README.md ├── main_algo.py ├── models │ ├── policy_models │ │ ├── checkpoint │ │ ├── policy.ckpt.data-00000-of-00001 │ │ ├── policy.ckpt.index │ │ └── policy.ckpt.meta │ ├── scaler │ │ └── scaler.pkl │ └── val_models │ │ ├── checkpoint │ │ ├── value.ckpt.data-00000-of-00001 │ │ ├── value.ckpt.index │ │ └── value.ckpt.meta ├── phi_functions │ ├── ContinousMLPPhiFunction.py │ └── __init__.py ├── policy.py ├── run.py ├── tb_logger.py ├── traj_visualize.py ├── utils.py ├── value_function.py └── walker2d_train_eval.sh └── optimization ├── main_algo.py ├── phi_functions ├── ContinousMLPPhiFunction.py └── __init__.py ├── policy.py ├── scripts ├── test_FitQ.sh └── test_MinVar.sh ├── tb_logger.py ├── train.py ├── utils.py └── value_function.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | **/__pycache__ 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # log files 105 | log-files 106 | bak 107 | fun 108 | .vscode 109 | eval_data/ 110 | 111 | # exp directory 112 | 500_200 113 | 500_1000_fitq 114 | 500500 115 | fitq_1000_500 116 | wybie 117 | plot 118 | 119 | # misc 120 | walker2d* 121 | max_timesteps* 122 | aft 123 | pre 124 | */com 125 | com 126 | fitq.sh 127 | results* 128 | sandbox 129 | minvar.sh 130 | 131 | # os 132 | .DS_Store 133 | *.DS_Store 134 | ._.DS_Store 135 | **/.DS_Store 136 | **/._.DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Dartmouth Machine Learning Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PPO With Stein Control Variate 2 | 3 | In this work, we propose a control variate method, motivated by Stein's identity, that effectively reduces the variance of policy gradient methods. 4 | 5 | 6 | This repository contains the code for Proximal Policy Optimization (PPO) with [Stein control variates](https://arxiv.org/pdf/1710.11198.pdf) on MuJoCo environments. 7 | 8 | The code is based on the excellent [PPO](https://github.com/pat-coady/trpo) implementation. 9 | 10 | 11 | ## Dependencies 12 | 13 | * `Python 3.5` 14 | * [`MuJoCo`](http://www.mujoco.org/) 15 | * `TensorFlow 1.3` 16 | * `Gym` - [Installation instructions](https://gym.openai.com/docs). 17 | 18 | ## Running Experiments 19 | 20 | You can run the following commands to reproduce our results: 21 | 22 | ```Shell 23 | cd optimization 24 | 25 | # For MinVar optimization 26 | python train.py HalfCheetah-v1 -b 10000 -ps large -po MinVar -p 500 27 | python train.py Walker2d-v1 -b 10000 -ps large -po MinVar -p 500 28 | python train.py Hopper-v1 -b 10000 -ps large -po MinVar -p 500 29 | 30 | python train.py Ant-v1 -b 10000 -ps small -po MinVar -p 500 31 | python train.py Humanoid-v1 -b 10000 -ps small -po MinVar -p 500 32 | python train.py HumanoidStandup-v1 -b 10000 -ps small -po MinVar -p 500 33 | 34 | 35 | # For FitQ optimization 36 | python train.py HalfCheetah-v1 -b 10000 -ps large -po FitQ -p 500 37 | python train.py Walker2d-v1 -b 10000 -ps large -po FitQ -p 500 38 | python train.py Hopper-v1 -b 10000 -ps large -po FitQ -p 500 39 | 40 | python train.py Ant-v1 -b 10000 -ps small -po FitQ -p 500 41 | python train.py Humanoid-v1 -b 10000 -ps small -po FitQ -p 500 42 | python train.py HumanoidStandup-v1 -b 10000 -ps small -po FitQ -p 500 43 | 44 | 45 | # For baseline PPO 46 | python train.py HalfCheetah-v1 -b 10000 -ps large -c 0 47 | python train.py Walker2d-v1 -b 10000 -ps large -c 0 48 | python train.py Hopper-v1 -b 10000 -ps large -c 0 49 | 50 | python train.py Ant-v1 -b 10000 -ps small -c 0 51 | python train.py Humanoid-v1 -b 10000 -ps small -c 0 52 | python train.py HumanoidStandup-v1 -b 10000 -ps small -c 0 53 | ``` 54 | The log files are written to `optimization/dartml_data`. Further, we provide two shell scripts for tuning the hyperparameters of the Stein control variates in the [scripts](optimization/scripts) folder. 55 | 56 | For evaluation of PPO with/without the Stein control variate, please see [here](evaluation). 
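## How the Stein Control Variate Reduces Variance (toy illustration)

For a Gaussian policy, Stein's identity gives `E[ d/da log pi(a|s) * phi(s,a) + d/da phi(s,a) ] = 0` for any smooth function `phi`, so an action-dependent baseline `phi(s, a)` can be subtracted from the advantage as long as the correction term `d/da phi` is added back. The snippet below is a self-contained 1-D illustration of this idea; it is **not** part of the repository's code and uses a hand-picked `phi`, whereas the actual implementation fits a neural-network `phi` with the `FitQ` or `MinVar` objective.

```python
import numpy as np

np.random.seed(0)
mu, sigma, n = 3.0, 1.0, 100000

a = np.random.normal(mu, sigma, size=n)   # actions from a 1-D Gaussian "policy"
f = a ** 2                                # stand-in for Q(s, a); d/dmu E[f] = 2 * mu
score = (a - mu) / sigma ** 2             # d/dmu log N(a; mu, sigma^2)

g_mc = score * f                          # plain score-function (REINFORCE) estimator
phi, dphi_da = a ** 2, 2 * a              # baseline phi(a) and its gradient w.r.t. the action
g_stein = score * (f - phi) + dphi_da     # Stein-controlled estimator: same mean, much lower variance

print("true gradient : %.3f" % (2 * mu))
print("MC estimate   : %.3f (per-sample std %.2f)" % (g_mc.mean(), g_mc.std()))
print("Stein estimate: %.3f (per-sample std %.2f)" % (g_stein.mean(), g_stein.std()))
```

Both estimators average to the true gradient (about 6.0 here), but the per-sample standard deviation drops from roughly 15 to 2 once the control variate is applied, which is the effect the repository exploits at scale.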
57 | 58 | ## Citation 59 | If you find Stein control variates helpful, please cite the following paper: 60 | 61 | >[Sample-efficient Policy Optimization with Stein Control Variate.](https://arxiv.org/pdf/1710.11198.pdf) 62 | >Hao Liu\*, Yihao Feng\*, Yi Mao, Dengyong Zhou, Jian Peng, Qiang Liu (*: equal contribution). 63 | >Preprint 2017 64 | 65 | ## Feedback 66 | 67 | If you have any questions about the code or the paper, please feel free to [contact us](mailto:yihaof95@gmail.com). 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation of PPO with/without the Stein control variate 2 | This is the code for the evaluation part of the [Stein control variate](https://arxiv.org/pdf/1710.11198.pdf) paper. It evaluates the different variance-reduction methods introduced in the paper. 3 | 4 | 5 | ## Running Examples 6 | 7 | Take the Walker2d-v1 environment as an example. 8 | 9 | Train a policy and generate evaluation data: 10 | ```shell 11 | # Evaluate the policy with or without Stein control variates 12 | bash walker2d_train_eval.sh 13 | ``` 14 | Different values of max-timesteps lead to different scales of variance. 15 | NB: max-timesteps can be set with the `-m` option; a larger max-timesteps produces a larger batch size, which requires more iterations to fit. 16 | 17 | Visualize the variance of the different Phi-function optimization methods: 18 | 19 | ```shell 20 | # plot the variance figures 21 | python traj_visualize.py 22 | ``` 23 | -------------------------------------------------------------------------------- /evaluation/main_algo.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | import os 3 | import gym 4 | import pickle 5 | import random 6 | 7 | import numpy as np 8 | import tb_logger as logger 9 | 10 | import scipy.signal 11 | from gym import wrappers 12 | from utils import Scaler 13 | from policy import Policy 14 | from datetime import datetime 15 | from value_function import NNValueFunction 16 | from utils import Dataset 17 | import copy 18 | 19 | def init_gym(env_name): 20 | 21 | env = gym.make(env_name) 22 | obs_dim = env.observation_space.shape[0] 23 | act_dim = env.action_space.shape[0] 24 | 25 | return env, obs_dim, act_dim 26 | 27 | def set_global_seeds(i): 28 | try: 29 | import tensorflow as tf 30 | except ImportError: 31 | pass 32 | else: 33 | tf.set_random_seed(i) 34 | np.random.seed(i) 35 | random.seed(i) 36 | 37 | def run_episode(env, policy, scaler, max_timesteps, animate=False): 38 | 39 | obs = env.reset() 40 | observes, actions, rewards, unscaled_obs = [], [], [], [] 41 | done = False 42 | step = 0.0 43 | scale, offset = scaler.get() 44 | scale[-1] = 1.0 # don't scale time step feature 45 | offset[-1] = 0.0 # don't offset time step feature 46 | for _ in range(max_timesteps): 47 | if animate: 48 | env.render() 49 | obs = obs.astype(np.float32).reshape((1, -1)) 50 | obs = np.append(obs, [[step]], axis=1) # add time step feature 51 | unscaled_obs.append(obs) 52 | obs = (obs - offset) * scale # center and scale observations 53 | observes.append(obs) 54 | action = policy.sample(obs).reshape((1, -1)).astype(np.float32) 55 | actions.append(action) 56 | obs, reward, done, _ = env.step(np.squeeze(action, axis=0)) 57 | if not isinstance(reward, float): 58 | reward = np.asscalar(reward) 59 | rewards.append(reward) 60 | step += 1e-3 # increment time step feature 61 | if done: 62 | break 63 | 64 | return (np.concatenate(observes), np.concatenate(actions), 65 | np.array(rewards, dtype=np.float64), np.concatenate(unscaled_obs)) 66 | 67 | 68 | def run_policy(env, policy, scaler, num_episodes, max_timesteps, mode): 69 | 70 | total_steps = 0 71 | trajectories = [] 72 | traj_len_list = [] 73 | 74 | for itr in range(num_episodes): 75 | observes, actions, rewards, unscaled_obs = run_episode(env, \ 76 | policy, scaler, 77 | max_timesteps=max_timesteps) 78 | 79 | total_steps += observes.shape[0] 80 | 81 | traj_len_list.append(len(observes)) 82 | 83 | trajectory = {'observes': observes, 84 | 'actions': actions, 85 | 'rewards': rewards, 86 | 'unscaled_obs': unscaled_obs} 87 | trajectories.append(trajectory) 88 | 89 | unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories]) 90 | if mode == 'save': # only update scaler when training policy, get rid of possible bias when evaluating 91 | scaler.update(unscaled) 92 | logger.record_dicts({ 93 | "_MeanReward":np.mean([t['rewards'].sum() for t in trajectories]), 94 | 'Steps': total_steps,}) 95 | 96 | return trajectories, traj_len_list 97 | 98 | 99 | def discount(x, gamma): 100 | """ Calculate discounted forward sum of a sequence at each point """ 101 | return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1] 102 | 103 | 104 | def add_disc_sum_rew(trajectories, gamma): 105 | 106 | for trajectory in trajectories: 107 | if gamma < 0.999: # don't scale for gamma ~= 1 108 | rewards = trajectory['rewards'] * (1 - gamma) 109 | else: 110 | rewards = trajectory['rewards'] 111 | disc_sum_rew = discount(rewards, gamma) 112 | trajectory['disc_sum_rew'] = disc_sum_rew 113 | 114 | 115 | def add_value(trajectories, val_func): 116 | 117 | for trajectory in trajectories: 118 | observes = 
trajectory['observes'] 119 | values = val_func.predict(observes) 120 | trajectory['values'] = values 121 | 122 | 123 | def add_gae(trajectories, gamma, lam): 124 | 125 | for trajectory in trajectories: 126 | if gamma < 0.999: # don't scale for gamma ~= 1 127 | rewards = trajectory['rewards'] * (1 - gamma) 128 | else: 129 | rewards = trajectory['rewards'] 130 | values = trajectory['values'] 131 | # temporal differences 132 | tds = rewards - values + np.append(values[1:] * gamma, 0) 133 | advantages = discount(tds, gamma * lam) 134 | trajectory['advantages'] = advantages 135 | 136 | 137 | def build_train_set(trajectories): 138 | 139 | observes = np.concatenate([t['observes'] for t in trajectories]) 140 | actions = np.concatenate([t['actions'] for t in trajectories]) 141 | disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories]) 142 | advantages = np.concatenate([t['advantages'] for t in trajectories]) 143 | # normalize advantages 144 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) 145 | 146 | return observes, actions, advantages, disc_sum_rew 147 | 148 | 149 | def log_batch_stats(observes, actions, advantages, disc_sum_rew, episode): 150 | 151 | logger.record_dicts({ 152 | '_mean_obs': np.mean(observes), 153 | '_min_obs': np.min(observes), 154 | '_max_obs': np.max(observes), 155 | '_mean_act': np.mean(actions), 156 | '_max_act': np.max(actions), 157 | '_std_act': np.mean(np.var(actions, axis=0)), 158 | '_mean_adv': np.mean(advantages), 159 | '_min_adv': np.min(advantages), 160 | '_max_adv': np.max(advantages), 161 | '_std_adv': np.var(advantages), 162 | '_mean_discrew': np.mean(disc_sum_rew), 163 | '_min_discrew': np.min(disc_sum_rew), 164 | '_max_discrew': np.max(disc_sum_rew), 165 | '_std_discrew': np.var(disc_sum_rew)}) 166 | 167 | logger.dump_tabular() 168 | 169 | 170 | 171 | def train_models(env_name, num_episodes, 172 | gamma, lam, kl_targ, 173 | coef, use_lr_adjust, 174 | ada_kl_penalty, seed, 175 | epochs, phi_epochs, 176 | max_timesteps, reg_scale, 177 | phi_lr, phi_hs, 178 | policy_size, 179 | phi_obj, load_model): 180 | 181 | env, obs_dim, act_dim = init_gym(env_name) 182 | set_global_seeds(seed) 183 | env.seed(seed) 184 | env._max_episode_steps = max_timesteps 185 | obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) 186 | now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") # create unique directories 187 | aigym_path = os.path.join('log-files/', env_name, now) 188 | env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False) 189 | scaler = Scaler(obs_dim) 190 | val_func = NNValueFunction(obs_dim) 191 | policy = Policy(obs_dim, act_dim, 192 | kl_targ,epochs, 193 | phi_epochs, 194 | policy_size=policy_size, 195 | phi_hidden_sizes=phi_hs, 196 | reg_scale=reg_scale, 197 | lr_phi=phi_lr, 198 | phi_obj=phi_obj) 199 | 200 | 201 | run_policy(env, policy, 202 | scaler, num_episodes, 203 | max_timesteps=max_timesteps, mode=load_model) # run a few to init scaler 204 | 205 | episode = 0 206 | for i in range(2000): 207 | print("sampling and training at %s iteration\n"%(i)) 208 | trajectories, traj_len_list = run_policy(env, policy, scaler, 209 | num_episodes, max_timesteps=max_timesteps, mode=load_model) 210 | 211 | num_traj = len(trajectories) 212 | 213 | episode += len(trajectories) 214 | add_value(trajectories, val_func) 215 | add_disc_sum_rew(trajectories, gamma) 216 | add_gae(trajectories, gamma, lam) 217 | 218 | observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) 219 | 220 | 
policy.update(load_model, observes, actions, advantages, 221 | use_lr_adjust, ada_kl_penalty, c=0.) # update policy 222 | val_func.fit(observes, disc_sum_rew) 223 | 224 | # Save models 225 | policy.save_policy() 226 | val_func.save_val_func() 227 | refine_scaler = False 228 | if refine_scaler == True: 229 | run_policy(env, policy, 230 | scaler, num_episodes, 231 | max_timesteps=max_timesteps, mode=load_model) # run a few to refine scaler 232 | with open('models/scaler/scaler.pkl', 'wb') as output: 233 | pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL) 234 | logger.log("saved model") 235 | 236 | 237 | def eval_models(env_name, num_episodes, 238 | gamma, lam, kl_targ, 239 | coef, use_lr_adjust, 240 | ada_kl_penalty, seed, 241 | epochs, phi_epochs, 242 | max_timesteps, reg_scale, 243 | phi_lr, phi_hs, 244 | policy_size, 245 | phi_obj, load_model): 246 | 247 | env, obs_dim, act_dim = init_gym(env_name) 248 | set_global_seeds(seed) 249 | env.seed(seed) 250 | env._max_episode_steps = max_timesteps 251 | obs_dim += 1 252 | now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") 253 | aigym_path = os.path.join('log-files/', env_name, now) 254 | env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False) 255 | # scaler = Scaler(obs_dim) 256 | logger.log("loading scaler") 257 | with open('models/scaler/scaler.pkl', 'rb') as input: 258 | scaler = pickle.load(input) 259 | val_func = NNValueFunction(obs_dim) 260 | policy = Policy(obs_dim, act_dim, 261 | kl_targ,epochs, 262 | phi_epochs, 263 | policy_size=policy_size, 264 | phi_hidden_sizes=phi_hs, 265 | reg_scale=reg_scale, 266 | lr_phi=phi_lr, 267 | phi_obj=phi_obj) 268 | 269 | logger.log("loading model") 270 | load_dir = "models/" 271 | policy.load_model(load_dir) 272 | load_v = False #whether load value function baseline or train from scratch; no big impact on stein 273 | if load_v == True: 274 | val_func.load_val_model(load_dir) 275 | 276 | episode = 0 277 | 278 | trajectories, traj_len_list = run_policy(env, policy, scaler, 279 | num_episodes, max_timesteps=max_timesteps, mode=load_model) 280 | 281 | num_traj = len(trajectories) 282 | logger.log("Avg Length %d total Length %d"%( \ 283 | np.mean(traj_len_list), \ 284 | np.sum(traj_len_list))) 285 | 286 | episode += len(trajectories) 287 | 288 | #Split data into validation and training data 289 | random.shuffle(trajectories) 290 | t_trajectories = trajectories[:int(len(trajectories)/2)] 291 | v_trajectories = trajectories[int(len(trajectories)/2):] 292 | 293 | refit_v = True # if fit value function baseline once again before evaluating; no big impact on stein 294 | if refit_v == True: 295 | tt_trajectories = copy.deepcopy(t_trajectories) 296 | add_value(tt_trajectories, val_func) 297 | add_disc_sum_rew(tt_trajectories, gamma) 298 | add_gae(tt_trajectories, gamma, lam) 299 | tt_observes, tt_actions, tt_advantages, tt_disc_sum_rew = build_train_set(tt_trajectories) 300 | logger.log("refit value function baseline") 301 | val_func.fit(tt_observes, tt_disc_sum_rew) # update value function 302 | logger.log("done") 303 | 304 | # build training data after refit v 305 | add_value(t_trajectories, val_func) 306 | add_disc_sum_rew(t_trajectories, gamma) 307 | add_gae(t_trajectories, gamma, lam) 308 | t_observes, t_actions, t_advantages, t_disc_sum_rew = build_train_set(t_trajectories) 309 | 310 | # build validation data after refit v 311 | add_value(v_trajectories, val_func) 312 | add_disc_sum_rew(v_trajectories, gamma) 313 | add_gae(v_trajectories, gamma, lam) 314 | v_observes, v_actions, 
v_advantages, v_disc_sum_rew = build_train_set(v_trajectories) 315 | 316 | sub_folder = "max_timesteps=%s_eval_data/%s_%s_data_seed=%d_max-steps=%d"%(\ 317 | max_timesteps, env_name, phi_obj, 318 | seed, max_timesteps) 319 | if not os.path.exists(sub_folder): 320 | os.mkdir(sub_folder) 321 | 322 | # save original gradient 323 | mc_grad_info = policy.get_batch_gradient(v_observes, v_actions, v_advantages, c=0.) 324 | mc_grad_info['traj_lens'] = traj_len_list 325 | with open(sub_folder+'/mc_num_episode=%d.pkl'%(num_episodes), 'wb') as fp: 326 | pickle.dump(mc_grad_info, fp) 327 | 328 | d = Dataset(dict(ob=t_observes, ac=t_actions, atarg=t_advantages, vtarg=t_disc_sum_rew), shuffle=True) 329 | for _ in range(phi_epochs): # optim_epochs 330 | for batch in d.iterate_once(128): # optim_batchsize 331 | policy.update(load_model, batch['ob'], batch['ac'], batch['atarg'], 332 | use_lr_adjust, ada_kl_penalty, c=1) # update policy 333 | 334 | stein_grad_info = policy.get_batch_gradient(v_observes, \ 335 | v_actions, v_advantages, c=1.) 336 | 337 | 338 | stein_grad_info['traj_lens'] = traj_len_list 339 | with open(sub_folder+'/stein_num_episode=%d.pkl'%(num_episodes), 'wb') as fp: 340 | pickle.dump(stein_grad_info, fp) 341 | -------------------------------------------------------------------------------- /evaluation/models/policy_models/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "policy.ckpt" 2 | all_model_checkpoint_paths: "policy.ckpt" 3 | -------------------------------------------------------------------------------- /evaluation/models/policy_models/policy.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DartML/PPO-Stein-Control-Variate/f87aab6fc52907cda575ef19d0f4ff8400b233a6/evaluation/models/policy_models/policy.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /evaluation/models/policy_models/policy.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DartML/PPO-Stein-Control-Variate/f87aab6fc52907cda575ef19d0f4ff8400b233a6/evaluation/models/policy_models/policy.ckpt.index -------------------------------------------------------------------------------- /evaluation/models/policy_models/policy.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DartML/PPO-Stein-Control-Variate/f87aab6fc52907cda575ef19d0f4ff8400b233a6/evaluation/models/policy_models/policy.ckpt.meta -------------------------------------------------------------------------------- /evaluation/models/scaler/scaler.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DartML/PPO-Stein-Control-Variate/f87aab6fc52907cda575ef19d0f4ff8400b233a6/evaluation/models/scaler/scaler.pkl -------------------------------------------------------------------------------- /evaluation/models/val_models/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "value.ckpt" 2 | all_model_checkpoint_paths: "value.ckpt" 3 | -------------------------------------------------------------------------------- /evaluation/models/val_models/value.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DartML/PPO-Stein-Control-Variate/f87aab6fc52907cda575ef19d0f4ff8400b233a6/evaluation/models/val_models/value.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /evaluation/models/val_models/value.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DartML/PPO-Stein-Control-Variate/f87aab6fc52907cda575ef19d0f4ff8400b233a6/evaluation/models/val_models/value.ckpt.index -------------------------------------------------------------------------------- /evaluation/models/val_models/value.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DartML/PPO-Stein-Control-Variate/f87aab6fc52907cda575ef19d0f4ff8400b233a6/evaluation/models/val_models/value.ckpt.meta -------------------------------------------------------------------------------- /evaluation/phi_functions/ContinousMLPPhiFunction.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Stein control variate 3 | ''' 4 | import tensorflow as tf 5 | import numpy as np 6 | import math 7 | 8 | class ContinousMLPPhiFunction(object): 9 | 10 | def __init__(self, obs_dim, 11 | act_dim, name='phi_nn', 12 | hidden_sizes=[100, 100], 13 | regular_scale=0., fn_type='relu'): 14 | 15 | self.obs_dim = obs_dim 16 | self.act_dim = act_dim 17 | self.name=name 18 | self.hidden_sizes=hidden_sizes 19 | self.fn_type = fn_type 20 | 21 | if regular_scale == 0.: 22 | kernel_regularizer = None 23 | else: 24 | kernel_regularizer = tf.contrib.layers.l2_regularizer(scale=regular_scale) 25 | 26 | self.kernel_regularizer = kernel_regularizer 27 | 28 | 29 | # f fan-in size 30 | def variable(self,shape,f): 31 | return tf.Variable(tf.random_uniform(shape,-1/math.sqrt(f),1/math.sqrt(f))) 32 | 33 | 34 | def __call__(self, obs_ph, act_ph, reuse=True): 35 | with tf.variable_scope(self.name) as vs: 36 | if reuse: 37 | vs.reuse_variables() 38 | 39 | hid1_size = self.hidden_sizes[0] 40 | hid2_size = self.hidden_sizes[1] 41 | 42 | obs_dim = self.obs_dim 43 | act_dim = self.act_dim 44 | 45 | W1 = self.variable([obs_dim,hid1_size],obs_dim) 46 | b1 = self.variable([hid1_size],obs_dim) 47 | W2 = self.variable([hid1_size,hid2_size],hid1_size+act_dim) 48 | W2_action = self.variable([act_dim,hid2_size],hid1_size+act_dim) 49 | b2 = self.variable([hid2_size],hid1_size+act_dim) 50 | W3 = tf.Variable(tf.random_uniform([hid2_size,1],-3e-3,3e-3)) 51 | b3 = tf.Variable(tf.random_uniform([1],-3e-3,3e-3)) 52 | 53 | if self.fn_type == 'relu': 54 | layer1 = tf.nn.relu(tf.matmul(obs_ph, W1) + b1) 55 | layer2 = tf.nn.relu(tf.matmul(layer1,W2) + tf.matmul(act_ph, W2_action) + b2) 56 | out = tf.identity(tf.matmul(layer2,W3) + b3) 57 | elif self.fn_type == 'tanh': 58 | layer1 = tf.tanh(tf.matmul(obs_ph, W1) + b1) 59 | layer2 = tf.tanh(tf.matmul(layer1,W2) + tf.matmul(act_ph, W2_action) + b2) 60 | out = tf.identity(tf.matmul(layer2,W3) + b3) 61 | 62 | phi_value = tf.squeeze(out) 63 | phi_act_g= tf.gradients(phi_value, act_ph)[0] 64 | 65 | return phi_value, phi_act_g 66 | 67 | @property 68 | def phi_vars(self): 69 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name) 70 | 71 | -------------------------------------------------------------------------------- /evaluation/phi_functions/__init__.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 
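The `ContinousMLPPhiFunction` above is the phi(s, a) network that `policy.py` plugs into the Stein control variate: calling the object returns both the scalar phi value and its gradient with respect to the action. Below is a minimal usage sketch (editorial illustration only, TF 1.x graph mode, arbitrary dimensions; this is not a file from the repository):

```python
import numpy as np
import tensorflow as tf
from phi_functions.ContinousMLPPhiFunction import ContinousMLPPhiFunction

obs_dim, act_dim = 12, 3   # arbitrary example sizes
obs_ph = tf.placeholder(tf.float32, (None, obs_dim), 'obs')
act_ph = tf.placeholder(tf.float32, (None, act_dim), 'act')

# build the phi network; policy.py calls it once with reuse=False
phi = ContinousMLPPhiFunction(obs_dim, act_dim, hidden_sizes=[100, 100])
phi_value, phi_act_grad = phi(obs_ph, act_ph, reuse=False)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    v, g = sess.run([phi_value, phi_act_grad],
                    feed_dict={obs_ph: np.zeros((4, obs_dim), np.float32),
                               act_ph: np.zeros((4, act_dim), np.float32)})
    print(v.shape, g.shape)   # (4,) and (4, 3)
```

`policy.py` below does essentially the same thing inside its own graph (`phi_value, phi_act_g = self.phi(self.obs_ph, self.act_ph, reuse=False)`) and then uses both outputs in the Stein-corrected gradient terms.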
-------------------------------------------------------------------------------- /evaluation/policy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Policy Optimization Policy with Stein control variates 3 | """ 4 | 5 | import os 6 | import pickle 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | import tb_logger as logger 11 | from phi_functions.ContinousMLPPhiFunction import ContinousMLPPhiFunction 12 | 13 | 14 | class Policy(object): 15 | """ NN-based policy approximation """ 16 | def __init__(self, 17 | obs_dim, 18 | act_dim, 19 | kl_targ, 20 | epochs, 21 | phi_epochs, 22 | policy_size='large', 23 | phi_hidden_sizes='100x50', 24 | reg_scale=.0, 25 | lr_phi=0.0005, 26 | phi_obj='MinVar'): 27 | 28 | self.beta = 1.0 # dynamically adjusted D_KL loss multiplier 29 | self.eta = 50 # multiplier for D_KL-kl_targ hinge-squared loss 30 | self.kl_targ = kl_targ 31 | self.epochs = epochs 32 | self.phi_epochs = phi_epochs 33 | self.lr = None # lr for policy neural network 34 | self.lr_phi = None # lr for phi function neural network 35 | self.lr_multiplier = 1.0 # dynamically adjust policy's lr when D_KL out of control 36 | self.obs_dim = obs_dim 37 | self.act_dim = act_dim 38 | self.policy_size=policy_size 39 | self.phi_obj=phi_obj 40 | 41 | # create Phi networks 42 | self.reg_scale = reg_scale 43 | phi_hidden_sizes = [int(x) for x in phi_hidden_sizes.split("x")] 44 | self.phi = ContinousMLPPhiFunction(obs_dim, act_dim, 45 | hidden_sizes=phi_hidden_sizes, regular_scale=reg_scale) 46 | 47 | self.lr_phi = lr_phi 48 | 49 | self._build_graph() 50 | 51 | def _build_graph(self): 52 | """ Build and initialize TensorFlow graph """ 53 | self.g = tf.Graph() 54 | with self.g.as_default(): 55 | self._placeholders() 56 | self._policy_nn() 57 | 58 | self._logprob() 59 | self._kl_entropy() 60 | self._sample() 61 | self._loss_train_op() 62 | self.init = tf.global_variables_initializer() 63 | 64 | # Save only policy parameters 65 | policy_vars = tf.get_collection(\ 66 | tf.GraphKeys.TRAINABLE_VARIABLES, 67 | scope='policy_nn') 68 | 69 | var_dict = {} 70 | for var in policy_vars: 71 | logger.log(var.name) 72 | var_dict[var.name]= var 73 | 74 | self._init_session() 75 | self.saver = tf.train.Saver(var_dict) 76 | 77 | 78 | def load_model(self, log_dir='log_dir/'): 79 | saver = tf.train.import_meta_graph( 80 | os.path.join(log_dir, 'policy_models/', 81 | 'policy.ckpt.meta')) 82 | 83 | saver.restore(self.sess, 84 | tf.train.latest_checkpoint( 85 | os.path.join(log_dir, 'policy_models/'))) 86 | 87 | def _placeholders(self): 88 | """ Input placeholders""" 89 | # observations, actions and advantages: 90 | self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs') 91 | self.act_ph = tf.placeholder(tf.float32, (None, self.act_dim), 'act') 92 | self.advantages_ph = tf.placeholder(tf.float32, (None,), 'advantages') 93 | # strength of D_KL loss terms: 94 | self.beta_ph = tf.placeholder(tf.float32, (), 'beta') 95 | self.eta_ph = tf.placeholder(tf.float32, (), 'eta') 96 | # learning rate: 97 | self.lr_ph = tf.placeholder(tf.float32, (), 'lr') 98 | self.c_ph = tf.placeholder(tf.float32, (), 'c_ph') 99 | 100 | self.lr_phi_ph = tf.placeholder(tf.float32, (), 'eta_phi') 101 | 102 | self.old_log_vars_ph = tf.placeholder(tf.float32, (self.act_dim,), 'old_log_vars') 103 | self.old_means_ph = tf.placeholder(tf.float32, (None, self.act_dim), 'old_means') 104 | 105 | 106 | 107 | def _policy_nn(self): 108 | 109 | with tf.variable_scope("policy_nn"): 110 | # hidden layer 
sizes determined by obs_dim and act_dim (hid2 is geometric mean) 111 | if self.policy_size == 'small': 112 | logger.log("using small structure") 113 | hid1_size = self.obs_dim # * 10 114 | hid3_size = self.act_dim # * 10 115 | hid2_size = int(np.sqrt(hid1_size * hid3_size)) 116 | logvar_speed = (10 * hid3_size) // 48 117 | elif self.policy_size == 'large': 118 | logger.log('Using large structure ') 119 | hid1_size = self.obs_dim * 10 120 | hid3_size = self.act_dim * 10 121 | hid2_size = int(np.sqrt(hid1_size * hid3_size)) 122 | logvar_speed = (hid3_size) // 48 123 | else: 124 | raise NotImplementedError 125 | 126 | self.lr = 9e-4 / np.sqrt(hid2_size) # 9e-4 empirically determined 127 | 128 | # 3 hidden layers with tanh activations 129 | out = tf.layers.dense(self.obs_ph, 130 | hid1_size, tf.tanh, 131 | kernel_initializer=tf.random_normal_initializer( 132 | stddev=np.sqrt(1 / self.obs_dim)), name="h1") 133 | 134 | out = tf.layers.dense(out, 135 | hid2_size, tf.tanh, 136 | kernel_initializer= \ 137 | tf.random_normal_initializer( \ 138 | stddev=np.sqrt(1 / hid1_size)), 139 | name="h2") 140 | 141 | out = tf.layers.dense(out, 142 | hid3_size, tf.tanh, 143 | kernel_initializer= \ 144 | tf.random_normal_initializer( \ 145 | stddev=np.sqrt(1 / hid2_size)), 146 | name="h3") 147 | 148 | self.means = tf.layers.dense(out, self.act_dim, 149 | kernel_initializer= \ 150 | tf.random_normal_initializer( \ 151 | stddev=np.sqrt(1 / hid3_size)), 152 | name="means") 153 | 154 | logvar_speed = (10 * hid3_size) // 48 155 | 156 | log_vars = tf.get_variable('logvars', 157 | (logvar_speed, self.act_dim), 158 | tf.float32, 159 | tf.constant_initializer(0.0)) 160 | 161 | self.log_vars = tf.reduce_sum(log_vars, axis=0) - 1.0 162 | 163 | self.policy_nn_vars = tf.get_collection(\ 164 | tf.GraphKeys.TRAINABLE_VARIABLES, 165 | scope='policy_nn') 166 | 167 | logger.log('Policy Params -- h1: {}, h2: {},\ 168 | h3: {}, lr: {:.3g}, logvar_speed: {}' 169 | .format(hid1_size, hid2_size, 170 | hid3_size, self.lr, logvar_speed)) 171 | 172 | 173 | def _logprob(self): 174 | 175 | """ 176 | Calculate log probabilities 177 | of a batch of observations & actions 178 | """ 179 | 180 | logp = -0.5 * tf.reduce_sum(self.log_vars) 181 | logp += -0.5 * tf.reduce_sum( 182 | tf.square(self.act_ph - self.means) / 183 | tf.exp(self.log_vars), axis=1) 184 | self.logp = logp 185 | 186 | logp_old = -0.5 * tf.reduce_sum(self.old_log_vars_ph) 187 | logp_old += -0.5 * tf.reduce_sum( 188 | tf.square(self.act_ph - self.old_means_ph) / 189 | tf.exp(self.old_log_vars_ph), axis=1) 190 | 191 | self.logp_old = logp_old 192 | 193 | def _kl_entropy(self): 194 | """ 195 | Add to Graph: 196 | 1. KL divergence between old and new distributions 197 | 2. 
Entropy of present policy given states and actions 198 | 199 | """ 200 | log_det_cov_old = tf.reduce_sum(self.old_log_vars_ph) 201 | log_det_cov_new = tf.reduce_sum(self.log_vars) 202 | tr_old_new = tf.reduce_sum(tf.exp(self.old_log_vars_ph - self.log_vars)) 203 | 204 | self.kl = 0.5 * tf.reduce_mean(log_det_cov_new - \ 205 | log_det_cov_old + tr_old_new + \ 206 | tf.reduce_sum(tf.square(self.means - \ 207 | self.old_means_ph) / \ 208 | tf.exp(self.log_vars), \ 209 | axis=1) - self.act_dim) 210 | 211 | self.entropy = 0.5 * (self.act_dim * \ 212 | (np.log(2 * np.pi) + 1) + \ 213 | tf.reduce_sum(self.log_vars)) 214 | 215 | 216 | def _sample(self): 217 | """ Sample from distribution, given observation """ 218 | self.sampled_act = (self.means + 219 | tf.exp(self.log_vars / 2.0) * 220 | tf.random_normal(shape=(self.act_dim,))) 221 | 222 | 223 | def _loss_train_op(self): 224 | 225 | 226 | # get Phi function and its derivatives 227 | phi_value, phi_act_g = self.phi(self.obs_ph, self.act_ph, reuse=False) 228 | self.phi_value = phi_value 229 | self.phi_act_g = phi_act_g 230 | self.phi_nn_vars = self.phi.phi_vars 231 | 232 | ll_mean_g = 1/tf.exp(self.log_vars) * (self.act_ph - self.means) 233 | ll_log_vars_g = -1/2 * ( 1/tf.exp(self.log_vars) \ 234 | - 1/tf.exp(self.log_vars) * \ 235 | (self.act_ph - self.means) * \ 236 | (self.act_ph - self.means) * \ 237 | 1 / tf.exp(self.log_vars)) 238 | 239 | self.phi_value.set_shape((None,)) 240 | 241 | log_vars_inner = tf.expand_dims(tf.exp(self.logp - self.logp_old), 1) \ 242 | * (ll_log_vars_g * tf.expand_dims(self.advantages_ph 243 | - self.c_ph * self.phi_value, 1) \ 244 | + 1/2 * self.c_ph * ll_mean_g * self.phi_act_g ) 245 | 246 | means_inner = tf.expand_dims(tf.exp(self.logp - self.logp_old), 1) \ 247 | * (ll_mean_g * tf.expand_dims(self.advantages_ph - 248 | self.c_ph * self.phi_value, 1) \ 249 | + self.c_ph * self.phi_act_g) 250 | 251 | loss1_log_vars = - tf.reduce_mean( 252 | tf.stop_gradient(log_vars_inner) * \ 253 | tf.exp(self.log_vars)) 254 | 255 | loss1_mean = -tf.reduce_mean( 256 | tf.stop_gradient(means_inner) * \ 257 | self.means) 258 | 259 | loss1 = loss1_log_vars + loss1_mean 260 | 261 | loss2 = tf.reduce_mean(self.beta_ph * self.kl) 262 | 263 | loss3 = self.eta_ph * tf.square(\ 264 | tf.maximum(0.0, \ 265 | self.kl - 2.0 * self.kl_targ)) 266 | 267 | self.loss = loss1 + loss2 + loss3 268 | 269 | optimizer = tf.train.AdamOptimizer(self.lr_ph) 270 | self.train_op = optimizer.minimize(self.loss, 271 | var_list= self.policy_nn_vars) 272 | 273 | 274 | # phi loss train op 275 | if self.phi_obj == 'MinVar': 276 | means_mse = tf.reduce_sum(\ 277 | tf.reduce_mean( \ 278 | tf.square(means_inner - \ 279 | tf.reduce_mean(means_inner, \ 280 | axis=0)), axis = 0)) 281 | 282 | logstd_vars_mse = tf.reduce_sum(\ 283 | tf.reduce_mean(\ 284 | tf.square(log_vars_inner - \ 285 | tf.reduce_mean(log_vars_inner,\ 286 | axis=0)), axis = 0)) 287 | 288 | gradient = tf.concat([means_inner, log_vars_inner], axis=1) 289 | 290 | est_A = tf.gather(gradient, tf.range(0, tf.shape(gradient)[0] //2)) 291 | 292 | est_B = tf.gather(gradient, 293 | tf.range(tf.shape(gradient)[0] //2, 294 | tf.shape(gradient)[0])) 295 | 296 | # calculate loss 297 | est_var = tf.reduce_sum(\ 298 | tf.square(tf.reduce_mean(\ 299 | est_A, axis=0) - \ 300 | tf.reduce_mean(est_B, axis=0))) 301 | 302 | 303 | if self.reg_scale > 0.: 304 | reg_variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) 305 | reg_term = tf.contrib.layers.apply_regularization( 306 | self.phi.kernel_regularizer, 
reg_variables) 307 | 308 | for var in reg_variables: 309 | logger.log("regularized, ", var.name, var.shape) 310 | else: 311 | reg_term = 0. 312 | 313 | if self.phi_obj == 'FitQ': 314 | self.phi_loss = tf.reduce_mean(\ 315 | tf.square(self.advantages_ph - \ 316 | self.phi_value), axis=0) + reg_term 317 | 318 | logger.log('phi_with FitQ as objective function') 319 | 320 | elif self.phi_obj == 'MinVar': 321 | 322 | self.phi_loss = means_mse + logstd_vars_mse + reg_term 323 | logger.log('phi with MinVar as objecive function') 324 | 325 | else: 326 | raise NotImplementedError 327 | 328 | 329 | phi_optimizer = tf.train.AdamOptimizer(self.lr_phi_ph) 330 | self.phi_train_op = phi_optimizer.minimize(self.phi_loss, var_list=self.phi_nn_vars) 331 | 332 | self.means_inner = means_inner 333 | self.log_vars_inner = log_vars_inner 334 | 335 | def get_batch_gradient(self, observes, actions, advantages, c): 336 | feed_dict = {self.obs_ph: observes, 337 | self.act_ph: actions, 338 | self.advantages_ph: advantages, 339 | self.beta_ph: self.beta, 340 | self.eta_ph: self.eta, 341 | self.lr_ph: self.lr * self.lr_multiplier, 342 | self.lr_phi_ph: self.lr_phi, 343 | self.c_ph:c} 344 | old_means_np, old_log_vars_np = self.sess.run([self.means, self.log_vars], 345 | feed_dict) 346 | feed_dict[self.old_log_vars_ph] = old_log_vars_np 347 | feed_dict[self.old_means_ph] = old_means_np 348 | 349 | means_gradient, vars_gradient, phi_loss = self.sess.run( 350 | [self.means_inner, 351 | self.log_vars_inner, self.phi_loss], 352 | feed_dict=feed_dict) 353 | 354 | return {"mu_grad":means_gradient, 355 | 'sigma_grad':vars_gradient, 356 | 'phi_loss':phi_loss} 357 | 358 | 359 | def _init_session(self): 360 | """Launch TensorFlow session and initialize variables""" 361 | self.sess = tf.Session(graph=self.g) 362 | self.sess.run(self.init) 363 | 364 | def sample(self, obs): 365 | """Draw sample from policy distribution""" 366 | feed_dict = {self.obs_ph: obs} 367 | 368 | return self.sess.run(self.sampled_act, feed_dict=feed_dict) 369 | 370 | def update(self, load_policy, 371 | observes, actions, 372 | advantages, use_lr_adjust, 373 | ada_kl_penalty, c=1): 374 | 375 | feed_dict = {self.obs_ph: observes, 376 | self.act_ph: actions, 377 | self.advantages_ph: advantages, 378 | self.beta_ph: self.beta, 379 | self.eta_ph: self.eta, 380 | self.lr_ph: self.lr * self.lr_multiplier, 381 | self.lr_phi_ph: self.lr_phi, 382 | self.c_ph:c} 383 | 384 | old_means_np, old_log_vars_np = self.sess.run([self.means, self.log_vars], 385 | feed_dict) 386 | feed_dict[self.old_log_vars_ph] = old_log_vars_np 387 | feed_dict[self.old_means_ph] = old_means_np 388 | loss, kl, entropy = 0, 0, 0 389 | 390 | # mini batch training 391 | self.sess.run(self.phi_train_op, feed_dict) 392 | 393 | if load_policy == 'save': 394 | 395 | for e in range(self.epochs): 396 | self.sess.run(self.train_op, feed_dict) 397 | loss, kl, entropy = self.sess.run([self.loss, 398 | self.kl, self.entropy], feed_dict) 399 | if kl > self.kl_targ * 4: 400 | break 401 | 402 | if (ada_kl_penalty): 403 | if kl > self.kl_targ * 2: # servo beta to reach D_KL target 404 | self.beta = np.minimum(35, 1.5 * self.beta) # max clip beta 405 | if (use_lr_adjust): 406 | if self.beta > 30 and self.lr_multiplier > 0.1: 407 | self.lr_multiplier /= 1.5 408 | elif kl < self.kl_targ / 2: 409 | self.beta = np.maximum(1 / 35, self.beta / 1.5) # min clip beta 410 | if (use_lr_adjust): 411 | if self.beta < (1 / 30) and self.lr_multiplier < 10: 412 | self.lr_multiplier *= 1.5 413 | 414 | logger.record_dicts({ 415 | 
'PolicyLoss': loss, 416 | 'PolicyEntropy': entropy, 417 | 'KL': kl, 418 | 'Beta': self.beta, 419 | '_lr_multiplier': self.lr_multiplier}) 420 | 421 | 422 | def save_policy(self, model_dir="models/policy_models"): 423 | self.saver.save(self.sess, 424 | os.path.join(model_dir, 425 | "policy.ckpt")) 426 | -------------------------------------------------------------------------------- /evaluation/run.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import os 4 | import argparse 5 | import tb_logger as logger 6 | 7 | from main_algo import train_models, eval_models 8 | from datetime import datetime 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser(description=('Train policy on OpenAI Gym environment ' 12 | 'using Proximal Policy Optimizer with Stein Control Variates')) 13 | parser.add_argument('env_name', type=str, help='OpenAI Gym environment name') 14 | parser.add_argument('-n', '--num_episodes', type=int, help='Number of episodes to run', 15 | default=20) 16 | parser.add_argument('-g', '--gamma', type=float, help='Discount factor', default=0.995) 17 | parser.add_argument('-l', '--lam', type=float, help='Lambda for Generalized Advantage Estimation', 18 | default=0.98) 19 | parser.add_argument('-k', '--kl_targ', type=float, help='D_KL target value', 20 | default=0.003) 21 | 22 | parser.add_argument('-c', '--coef', type=float, help='Coefficient value', 23 | default=1.0) 24 | parser.add_argument('-u', '--use_lr_adjust', help='whether adaptively adjust lr', type=int, default=0) 25 | parser.add_argument('-a', '--ada_kl_penalty', help='whether add kl adaptive penalty', type=int, default=1) 26 | parser.add_argument('-s','--seed', help='RNG seed', type=int, default=0) 27 | parser.add_argument('-e', '--epochs', help='epochs', type=int, default=20) 28 | parser.add_argument('-p', '--phi_epochs', help='phi epochs', type=int, default=500) 29 | parser.add_argument('-m', '--max_timesteps', help='Max timesteps', type=int, default=1000) 30 | parser.add_argument('-r', '--reg_scale', help='regularization scale on phi function', type=float, default=.0) 31 | parser.add_argument('-lr', '--phi_lr', help='phi learning_rate', type=float, default=1e-3)#1e-2/np.sqrt(300) 32 | parser.add_argument('-ph', '--phi_hs', help='phi structure', type=str, default='100x100') 33 | 34 | parser.add_argument('-ps', '--policy_size', help='large or small policy size to use', type=str, default='large') 35 | parser.add_argument('-po', '--phi_obj', help='phi objective function FitQ or MinVar', type=str, default='MinVar') 36 | parser.add_argument('-sha', '--load_model', 37 | help='if load, save or without doing anything', type=str, default='none') 38 | args = parser.parse_args() 39 | 40 | if args.load_model == 'save': 41 | if not os.path.exists('models'): 42 | os.makedirs('models') 43 | 44 | train_models(**vars(args)) 45 | 46 | elif args.load_model == 'load': 47 | if not os.path.exists('max_timesteps=%s_eval_data'%(args.max_timesteps)): 48 | os.makedirs('max_timesteps=%s_eval_data'%(args.max_timesteps)) 49 | 50 | eval_models(**vars(args)) 51 | 52 | else: 53 | raise NotImplementedError 54 | 55 | -------------------------------------------------------------------------------- /evaluation/tb_logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import os.path as osp 5 | import json 6 | import time 7 | import datetime 8 | import tempfile 9 | 10 | 
LOG_OUTPUT_FORMATS = ['log', 'json', 'tensorboard', 'stdout'] 11 | 12 | DEBUG = 10 13 | INFO = 20 14 | WARN = 30 15 | ERROR = 40 16 | 17 | DISABLED = 50 18 | LOG_DIR='' 19 | 20 | class OutputFormat(object): 21 | def writekvs(self, kvs): 22 | """ 23 | Write key-value pairs 24 | """ 25 | raise NotImplementedError 26 | 27 | def writeseq(self, args): 28 | """ 29 | Write a sequence of other data (e.g. a logging message) 30 | """ 31 | pass 32 | 33 | def close(self): 34 | return 35 | 36 | 37 | class HumanOutputFormat(OutputFormat): 38 | def __init__(self, file): 39 | self.file = file 40 | 41 | def writekvs(self, kvs): 42 | # Create strings for printing 43 | key2str = {} 44 | for (key, val) in sorted(kvs.items()): 45 | if isinstance(val, float): 46 | valstr = '%-8.3g' % (val,) 47 | else: 48 | valstr = str(val) 49 | key2str[self._truncate(key)] = self._truncate(valstr) 50 | 51 | # Find max widths 52 | keywidth = max(map(len, key2str.keys())) 53 | valwidth = max(map(len, key2str.values())) 54 | 55 | # Write out the data 56 | dashes = '-' * (keywidth + valwidth + 7) 57 | lines = [dashes] 58 | for (key, val) in sorted(key2str.items()): 59 | lines.append('| %s%s | %s%s |' % ( 60 | key, 61 | ' ' * (keywidth - len(key)), 62 | val, 63 | ' ' * (valwidth - len(val)), 64 | )) 65 | lines.append(dashes) 66 | self.file.write('\n'.join(lines) + '\n') 67 | 68 | # Flush the output to the file 69 | self.file.flush() 70 | 71 | def _truncate(self, s): 72 | return s[:20] + '...' if len(s) > 23 else s 73 | 74 | def writeseq(self, args): 75 | for arg in args: 76 | self.file.write(arg) 77 | self.file.write('\n') 78 | self.file.flush() 79 | 80 | class JSONOutputFormat(OutputFormat): 81 | def __init__(self, file): 82 | self.file = file 83 | 84 | def writekvs(self, kvs): 85 | for k, v in sorted(kvs.items()): 86 | if hasattr(v, 'dtype'): 87 | v = v.tolist() 88 | kvs[k] = float(v) 89 | self.file.write(json.dumps(kvs) + '\n') 90 | self.file.flush() 91 | 92 | class TensorBoardOutputFormat(OutputFormat): 93 | """ 94 | Dumps key/value pairs into TensorBoard's numeric format. 95 | """ 96 | def __init__(self, dir): 97 | os.makedirs(dir, exist_ok=True) 98 | self.dir = dir 99 | self.step = 1 100 | prefix = 'events' 101 | path = osp.join(osp.abspath(dir), prefix) 102 | import tensorflow as tf 103 | from tensorflow.python import pywrap_tensorflow 104 | from tensorflow.core.util import event_pb2 105 | from tensorflow.python.util import compat 106 | self.tf = tf 107 | self.event_pb2 = event_pb2 108 | self.pywrap_tensorflow = pywrap_tensorflow 109 | self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) 110 | 111 | def writekvs(self, kvs): 112 | def summary_val(k, v): 113 | kwargs = {'tag': k, 'simple_value': float(v)} 114 | return self.tf.Summary.Value(**kwargs) 115 | summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) 116 | event = self.event_pb2.Event(wall_time=time.time(), summary=summary) 117 | event.step = self.step # is there any reason why you'd want to specify the step? 
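        # NOTE: self.step supplies the event's global step (TensorBoard's x-axis);
        # it is incremented once per writekvs() call below.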
118 | self.writer.WriteEvent(event) 119 | self.writer.Flush() 120 | self.step += 1 121 | 122 | def close(self): 123 | if self.writer: 124 | self.writer.Close() 125 | self.writer = None 126 | 127 | 128 | def make_output_format(format, ev_dir): 129 | os.makedirs(ev_dir, exist_ok=True) 130 | if format == 'stdout': 131 | return HumanOutputFormat(sys.stdout) 132 | elif format == 'log': 133 | log_file = open(osp.join(ev_dir, 'log.txt'), 'wt') 134 | return HumanOutputFormat(log_file) 135 | elif format == 'json': 136 | json_file = open(osp.join(ev_dir, 'progress.json'), 'wt') 137 | return JSONOutputFormat(json_file) 138 | elif format == 'tensorboard': 139 | return TensorBoardOutputFormat(osp.join(ev_dir, 'tb')) 140 | else: 141 | raise ValueError('Unknown format specified: %s' % (format,)) 142 | 143 | # ================================================================ 144 | # API 145 | # ================================================================ 146 | 147 | def logkv(key, val): 148 | """ 149 | Log a value of some diagnostic 150 | Call this once for each diagnostic quantity, each iteration 151 | """ 152 | Logger.CURRENT.logkv(key, val) 153 | 154 | def logkvs(d): 155 | """ 156 | Log a dictionary of key-value pairs 157 | """ 158 | for (k, v) in d.items(): 159 | logkv(k, v) 160 | 161 | def dumpkvs(): 162 | """ 163 | Write all of the diagnostics from the current iteration 164 | 165 | level: int. (see logger.py docs) If the global logger level is higher than 166 | the level argument here, don't print to stdout. 167 | """ 168 | Logger.CURRENT.dumpkvs() 169 | 170 | def getkvs(): 171 | return Logger.CURRENT.name2val 172 | 173 | 174 | def log(*args, level=INFO): 175 | """ 176 | Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). 177 | """ 178 | Logger.CURRENT.log(*args, level=level) 179 | 180 | 181 | def debug(*args): 182 | log(*args, level=DEBUG) 183 | 184 | 185 | def info(*args): 186 | log(*args, level=INFO) 187 | 188 | 189 | def warn(*args): 190 | log(*args, level=WARN) 191 | 192 | 193 | def error(*args): 194 | log(*args, level=ERROR) 195 | 196 | 197 | def set_level(level): 198 | """ 199 | Set logging threshold on current logger. 200 | """ 201 | Logger.CURRENT.set_level(level) 202 | 203 | def get_dir(): 204 | """ 205 | Get directory that log files are being written to. 206 | will be None if there is no output directory (i.e., if you didn't call start) 207 | """ 208 | return Logger.CURRENT.get_dir() 209 | 210 | record_tabular = logkv 211 | dump_tabular = dumpkvs 212 | record_dicts = logkvs 213 | 214 | # ================================================================ 215 | # Backend 216 | # ================================================================ 217 | 218 | class Logger(object): 219 | DEFAULT = None # A logger with no output files. 
(See right below class definition) 220 | # So that you can still log to the terminal without setting up any output files 221 | CURRENT = None # Current logger being used by the free functions above 222 | 223 | def __init__(self, dir, output_formats): 224 | self.name2val = {} # values this iteration 225 | self.level = INFO 226 | self.dir = dir 227 | self.output_formats = output_formats 228 | 229 | # Logging API, forwarded 230 | # ---------------------------------------- 231 | def logkv(self, key, val): 232 | self.name2val[key] = val 233 | 234 | def dumpkvs(self): 235 | if self.level == DISABLED: return 236 | for fmt in self.output_formats: 237 | fmt.writekvs(self.name2val) 238 | self.name2val.clear() 239 | 240 | def log(self, *args, level=INFO): 241 | if self.level <= level: 242 | self._do_log(args) 243 | 244 | # Configuration 245 | # ---------------------------------------- 246 | def set_level(self, level): 247 | self.level = level 248 | 249 | def get_dir(self): 250 | return self.dir 251 | 252 | def close(self): 253 | for fmt in self.output_formats: 254 | fmt.close() 255 | 256 | # Misc 257 | # ---------------------------------------- 258 | def _do_log(self, args): 259 | for fmt in self.output_formats: 260 | fmt.writeseq(args) 261 | 262 | Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) 263 | 264 | def configure(dir=None, format_strs=None): 265 | print("has been called with dir=%s"%dir) 266 | assert Logger.CURRENT is Logger.DEFAULT,\ 267 | "Only call logger.configure() when it's in the default state. Try calling logger.reset() first." 268 | prevlogger = Logger.CURRENT 269 | if dir is None: 270 | dir = os.getenv('DARTML_LOGDIR') 271 | if dir is None: 272 | dir = osp.join(tempfile.gettempdir(), 273 | datetime.datetime.now().strftime("dartml-%Y-%m-%d-%H-%M-%S-%f")) 274 | if format_strs is None: 275 | format_strs = LOG_OUTPUT_FORMATS 276 | output_formats = [make_output_format(f, dir) for f in format_strs] 277 | Logger.CURRENT = Logger(dir=dir, output_formats=output_formats) 278 | log('Logging to %s'%dir) 279 | print("set up down..") 280 | #time.sleep(10) 281 | 282 | 283 | def set_logdir(dir_name): 284 | if not os.path.exists(dir_name): 285 | os.mkdir(dir_name) 286 | print("logger set dir has been called with %s"%(dir_name)) 287 | LOG_DIR=dir_name 288 | 289 | configure(dir=dir_name) 290 | 291 | def reset(): 292 | Logger.CURRENT = Logger.DEFAULT 293 | log('Reset logger') 294 | 295 | # ================================================================ 296 | -------------------------------------------------------------------------------- /evaluation/traj_visualize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | import errno 5 | import matplotlib 6 | matplotlib.use('Agg') 7 | import matplotlib.pyplot as plt 8 | 9 | def mkdir_p(path): 10 | try: 11 | os.makedirs(path) 12 | except OSError as exc: 13 | if exc.errno == errno.EEXIST and os.path.isdir(path): 14 | pass 15 | else: 16 | raise 17 | 18 | def load_sample_grads(batch_range, prefix_dir): 19 | file_dir = prefix_dir 20 | 21 | # load mc traj 22 | stein_phi_loss = [] 23 | mc_phi_loss = [] 24 | mc_grads = [] 25 | for i in batch_range: 26 | file_path = os.path.join(file_dir, 'mc_num_episode=%d.pkl'%i) 27 | with open(file_path, 'rb') as f: 28 | traj_data = pickle.load(f) 29 | sample_grads = np.concatenate([traj_data['mu_grad'], 30 | traj_data['sigma_grad']], axis=1) 31 | 32 | 33 | mc_grads.append(sample_grads) 34 
| mc_phi_loss.append(traj_data['phi_loss']) 35 | 36 | stein_grads = [] 37 | for i in batch_range: 38 | file_path = os.path.join(file_dir, 'stein_num_episode=%d.pkl'%i) 39 | with open(file_path, 'rb') as f: 40 | traj_data = pickle.load(f) 41 | sample_grads = np.concatenate([traj_data['mu_grad'], 42 | traj_data['sigma_grad']], axis=1) 43 | 44 | stein_grads.append(sample_grads) 45 | stein_phi_loss.append(traj_data['phi_loss']) 46 | 47 | return mc_grads, stein_grads, mc_phi_loss, stein_phi_loss 48 | 49 | 50 | def gen_index(indices, max_length): 51 | total_indices = [] 52 | 53 | for index in indices: 54 | total_indices.append(np.arange(index*max_length, (index+1)* max_length)) 55 | 56 | return np.concatenate(total_indices, axis=0) 57 | 58 | 59 | if __name__ == '__main__': 60 | 61 | batch_range= range(10, 70, 10) 62 | 63 | env_name = 'Walker2d-v1' 64 | 65 | try: 66 | seeds = [int(x) for x in input("Enter seeds of evaluation saved data, seperated by space\n").split()] 67 | except (SyntaxError, ValueError): 68 | seeds = list(range(13, 253, 30)) 69 | if not seeds: 70 | seeds = list(range(13, 253, 30)) 71 | print(seeds) 72 | 73 | try: 74 | phi_obj = input("Enter type of evaluation, FitQ or MinVar: \n") 75 | except (SyntaxError, ValueError): 76 | print("no choice about evaluation type") 77 | phi_obj = 'FitQ' 78 | 79 | try: 80 | max_timesteps = input("Enter max_timesteps: \n") 81 | except (SyntaxError, ValueError): 82 | print("no choice about max_timesteps") 83 | max_timesteps = 50 84 | 85 | k = 20000 86 | plot_stein_loss = [] 87 | plot_mc_loss = [] 88 | 89 | for seed in seeds: 90 | 91 | prefix_dir = 'max_timesteps=%s_eval_data/%s_%s_data_seed=%d_max-steps=%s'%(max_timesteps, env_name, phi_obj, seed, max_timesteps) 92 | 93 | print(prefix_dir) 94 | 95 | # This is gradient for each trajectory 96 | mc_x = [] 97 | stein_x = [] 98 | plot_stein_vars = [] 99 | plot_mc_vars = [] 100 | 101 | mc_grads, stein_grads, mc_phi_loss, \ 102 | stein_phi_loss = load_sample_grads(batch_range, prefix_dir) 103 | 104 | for mc_grad, stein_grad in zip(mc_grads, stein_grads): 105 | 106 | mc_x.append(len(mc_grad)) 107 | stein_x.append(len(stein_grad)) 108 | print(len(mc_grad)) 109 | # Calculate MSE/Variance 110 | mc_vars = [] 111 | mc_num_traj = len(mc_grad) 112 | for kk in range(k): 113 | 114 | indices = np.random.choice(mc_num_traj, int(mc_num_traj/2), replace=False) 115 | total_indices = np.arange(0, mc_num_traj) 116 | mask = np.zeros(total_indices.shape, dtype=bool) 117 | mask[indices] = True 118 | 119 | mc_grad = np.array(mc_grad) 120 | mc_var = (np.mean(mc_grad[total_indices[mask]], axis=0) - \ 121 | np.mean(mc_grad[total_indices[~mask]], axis=0)) ** 2 122 | mc_vars.append(np.sum(mc_var)) 123 | 124 | plot_mc_vars.append(np.mean(mc_vars)) 125 | 126 | 127 | stein_vars = [] 128 | stein_num_traj = len(stein_grad) 129 | 130 | for kk in range(k): 131 | 132 | indices = np.random.choice(stein_num_traj, int(stein_num_traj/2), replace=False) 133 | total_indices = np.arange(0, stein_num_traj) 134 | mask = np.zeros(total_indices.shape, dtype=bool) 135 | mask[indices] = True 136 | stein_grad = np.array(stein_grad) 137 | stein_var = (np.mean(stein_grad[total_indices[mask]], axis=0) - \ 138 | np.mean(stein_grad[total_indices[~mask]], axis=0)) ** 2 139 | stein_vars.append(np.sum(stein_var)) 140 | 141 | plot_stein_vars.append(np.mean(stein_vars)) 142 | 143 | print (seed) 144 | print (mc_x) 145 | print (stein_x) 146 | print (np.log(plot_stein_vars)) 147 | print (np.log(plot_mc_vars)) 148 | plt.plot(np.log(mc_x), np.log(plot_mc_vars), 
label='mc') 149 | plt.plot(np.log(stein_x), np.log(plot_stein_vars), label='stein') 150 | plt.legend() 151 | mkdir_p('results') 152 | plt.savefig('results/' + '%s_avg_variance_seed=%s_max-steps=%s_phi_obj=%s.pdf'%(env_name, seed, max_timesteps, phi_obj)) 153 | plt.gcf().clear() 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /evaluation/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging and Data Scaling Utilities 3 | 4 | """ 5 | import numpy as np 6 | import os 7 | import shutil 8 | import glob 9 | import csv 10 | 11 | class Scaler(object): 12 | """ Generate scale and offset based on running mean and stddev along axis=0 13 | 14 | offset = running mean 15 | scale = 1 / (stddev + 0.1) / 3 (i.e. 3x stddev = +/- 1.0) 16 | """ 17 | 18 | def __init__(self, obs_dim): 19 | """ 20 | Args: 21 | obs_dim: dimension of axis=1 22 | """ 23 | self.vars = np.zeros(obs_dim) 24 | self.means = np.zeros(obs_dim) 25 | self.m = 0 26 | self.n = 0 27 | self.first_pass = True 28 | 29 | def update(self, x): 30 | """ Update running mean and variance (this is an exact method) 31 | Args: 32 | x: NumPy array, shape = (N, obs_dim) 33 | 34 | see: https://stats.stackexchange.com/questions/43159/how-to-calculate-pooled- 35 | variance-of-two-groups-given-known-group-variances-mean 36 | """ 37 | if self.first_pass: 38 | self.means = np.mean(x, axis=0) 39 | self.vars = np.var(x, axis=0) 40 | self.m = x.shape[0] 41 | self.first_pass = False 42 | else: 43 | n = x.shape[0] 44 | new_data_var = np.var(x, axis=0) 45 | new_data_mean = np.mean(x, axis=0) 46 | new_data_mean_sq = np.square(new_data_mean) 47 | new_means = ((self.means * self.m) + (new_data_mean * n)) / (self.m + n) 48 | self.vars = (((self.m * (self.vars + np.square(self.means))) + 49 | (n * (new_data_var + new_data_mean_sq))) / (self.m + n) - 50 | np.square(new_means)) 51 | self.vars = np.maximum(0.0, self.vars) # occasionally goes negative, clip 52 | self.means = new_means 53 | self.m += n 54 | 55 | def get(self): 56 | """ returns 2-tuple: (scale, offset) """ 57 | return 1/(np.sqrt(self.vars) + 0.1)/3, self.means 58 | 59 | 60 | class Dataset(object): 61 | def __init__(self, data_map, deterministic=False, shuffle=True): 62 | self.data_map = data_map 63 | self.deterministic = deterministic 64 | self.enable_shuffle = shuffle 65 | self.n = next(iter(data_map.values())).shape[0] 66 | self._next_id = 0 67 | self.shuffle() 68 | 69 | def shuffle(self): 70 | if self.deterministic: 71 | return 72 | perm = np.arange(self.n) 73 | np.random.shuffle(perm) 74 | 75 | for key in self.data_map: 76 | self.data_map[key] = self.data_map[key][perm] 77 | 78 | self._next_id = 0 79 | 80 | def next_batch(self, batch_size): 81 | if self._next_id >= self.n and self.enable_shuffle: 82 | self.shuffle() 83 | 84 | cur_id = self._next_id 85 | cur_batch_size = min(batch_size, self.n - self._next_id) 86 | self._next_id += cur_batch_size 87 | 88 | data_map = dict() 89 | for key in self.data_map: 90 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 91 | return data_map 92 | 93 | def iterate_once(self, batch_size): 94 | if self.enable_shuffle: self.shuffle() 95 | 96 | while self._next_id <= self.n - batch_size: 97 | yield self.next_batch(batch_size) 98 | self._next_id = 0 99 | 100 | def subset(self, num_elements, deterministic=True): 101 | data_map = dict() 102 | for key in self.data_map: 103 | data_map[key] = self.data_map[key][:num_elements] 
104 | return Dataset(data_map, deterministic) 105 | 106 | 107 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 108 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 109 | arrays = tuple(map(np.asarray, arrays)) 110 | n = arrays[0].shape[0] 111 | assert all(a.shape[0] == n for a in arrays[1:]) 112 | inds = np.arange(n) 113 | if shuffle: np.random.shuffle(inds) 114 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 115 | for batch_inds in np.array_split(inds, sections): 116 | if include_final_partial_batch or len(batch_inds) == batch_size: 117 | yield tuple(a[batch_inds] for a in arrays) -------------------------------------------------------------------------------- /evaluation/value_function.py: -------------------------------------------------------------------------------- 1 | """ 2 | State-Value Function 3 | 4 | """ 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | from sklearn.utils import shuffle 9 | import tb_logger as logger 10 | import os 11 | 12 | class NNValueFunction(object): 13 | """ NN-based state-value function """ 14 | def __init__(self, obs_dim): 15 | """ 16 | Args: 17 | obs_dim: number of dimensions in observation vector (int) 18 | """ 19 | self.replay_buffer_x = None 20 | self.replay_buffer_y = None 21 | self.obs_dim = obs_dim 22 | self.epochs = 10 23 | self.lr = None # learning rate set in _build_graph() 24 | self.g = tf.Graph() 25 | self._build_graph() 26 | self.sess = tf.Session(graph=self.g) 27 | self.sess.run(self.init) 28 | 29 | def _build_graph(self): #NOTE:experimental results show that quartic function works also well 30 | """ Construct TensorFlow graph, including loss function, init op and train op """ 31 | self.g = tf.Graph() 32 | with self.g.as_default(): 33 | self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs_valfunc') 34 | self.val_ph = tf.placeholder(tf.float32, (None,), 'val_valfunc') 35 | # hid1 layer size is 10x obs_dim, hid3 size is 5, and hid2 is geometric mean 36 | hid1_size = self.obs_dim 37 | hid3_size = 5 # 5 chosen empirically on 'Hopper-v1' 38 | hid2_size = int(np.sqrt(hid1_size * hid3_size)) 39 | # heuristic to set learning rate based on NN size (tuned on 'Hopper-v1') 40 | self.lr = 1e-3 / np.sqrt(hid2_size) # 1e-3 empirically determined 41 | print('Value Params -- h1: {}, h2: {}, h3: {}, lr: {:.3g}' 42 | .format(hid1_size, hid2_size, hid3_size, self.lr)) 43 | # 3 hidden layers with tanh activations 44 | out = tf.layers.dense(self.obs_ph, hid1_size, tf.tanh, 45 | kernel_initializer=tf.random_normal_initializer( 46 | stddev=np.sqrt(1 / self.obs_dim)), name="h1") 47 | out = tf.layers.dense(out, hid2_size, tf.tanh, 48 | kernel_initializer=tf.random_normal_initializer( 49 | stddev=np.sqrt(1 / hid1_size)), name="h2") 50 | out = tf.layers.dense(out, hid3_size, tf.tanh, 51 | kernel_initializer=tf.random_normal_initializer( 52 | stddev=np.sqrt(1 / hid2_size)), name="h3") 53 | out = tf.layers.dense(out, 1, 54 | kernel_initializer=tf.random_normal_initializer( 55 | stddev=np.sqrt(1 / hid3_size)), name='output') 56 | self.out = tf.squeeze(out) 57 | self.loss = tf.reduce_mean(tf.square(self.out - self.val_ph)) # squared loss 58 | optimizer = tf.train.AdamOptimizer(self.lr) 59 | self.train_op = optimizer.minimize(self.loss) 60 | self.init = tf.global_variables_initializer() 61 | 62 | # Add ops to save and restore all the variables. 
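            # Descriptive note (editor's comment, not in the original source): the
            # saver built below is keyed by variable name ({var.name: var}) rather
            # than created from the whole graph, so a checkpoint written elsewhere
            # can be matched variable-by-variable when restored into this freshly
            # built graph, provided the layer names ("h1".."h3", "output") agree.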
63 | # self.saver = tf.train.Saver() 64 | val_params = tf.trainable_variables() 65 | val_dict = {} 66 | for var in val_params: 67 | print(var.name) 68 | val_dict[var.name] = var 69 | 70 | self.saver = tf.train.Saver(val_dict) 71 | # # Restore variables from disk. 72 | #self.saver = tf.train.Saver() 73 | #self.saver.restore(self.sess, "../data/value.ckpt.data-00000-of-00001") 74 | 75 | #self.saver = tf.train.import_meta_graph('./value.ckpt.meta') 76 | #self.saver.restore(self.sess, tf.train.latest_checkpoint('./')) 77 | 78 | def load_val_model(self, log_dir='log_dir/'): 79 | saver = tf.train.import_meta_graph( 80 | os.path.join(log_dir, 'val_models/', 81 | 'value.ckpt.meta')) 82 | 83 | saver.restore(self.sess, 84 | tf.train.latest_checkpoint( 85 | os.path.join(log_dir, 'val_models/'))) 86 | 87 | 88 | def fit(self, x, y): 89 | """ Fit model to current data batch + previous data batch 90 | 91 | Args: 92 | x: features 93 | y: target 94 | """ 95 | num_batches = max(x.shape[0] // 256, 1) 96 | batch_size = x.shape[0] // num_batches 97 | y_hat = self.predict(x) # check explained variance prior to update 98 | old_exp_var = 1 - np.var(y - y_hat)/np.var(y) 99 | if self.replay_buffer_x is None: 100 | x_train, y_train = x, y 101 | else: 102 | x_train = np.concatenate([x, self.replay_buffer_x]) 103 | y_train = np.concatenate([y, self.replay_buffer_y]) 104 | self.replay_buffer_x = x 105 | self.replay_buffer_y = y 106 | for e in range(self.epochs): 107 | x_train, y_train = shuffle(x_train, y_train) 108 | for j in range(num_batches): 109 | start = j * batch_size 110 | end = (j + 1) * batch_size 111 | feed_dict = {self.obs_ph: x_train[start:end, :], 112 | self.val_ph: y_train[start:end]} 113 | _, l = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict) 114 | y_hat = self.predict(x) 115 | loss = np.mean(np.square(y_hat - y)) # explained variance after update 116 | exp_var = 1 - np.var(y - y_hat) / np.var(y) # diagnose over-fitting of val func 117 | 118 | logger.record_dicts({ 119 | 'VarFuncLoss': loss, 120 | 'ExplainedVarNew': exp_var, 121 | 'ExplainedVarOld': old_exp_var}) 122 | 123 | 124 | def predict(self, x): 125 | """ Predict method """ 126 | feed_dict = {self.obs_ph: x} 127 | y_hat = self.sess.run(self.out, feed_dict=feed_dict) 128 | 129 | return np.squeeze(y_hat) 130 | 131 | def save_val_func(self, model_dir="models/val_models"): 132 | self.saver.save(self.sess, 133 | os.path.join(model_dir, 134 | "value.ckpt")) 135 | -------------------------------------------------------------------------------- /evaluation/walker2d_train_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # save a PPO policy 4 | # python run.py Walker2d-v1 -c 0.0 -p 0 -sha save -s 13 5 | 6 | # eval on a saved policy 7 | for ((s=13; s<=33; s+=30)) # evaluate on one seed to save time 8 | do 9 | for ((i=10; i<=80; i+=10)) # few number of trajectories to save time 10 | do 11 | # load Stein PPO and compute variance and save 12 | 13 | # FitQ 14 | j=$((50*$i)) 15 | echo $j 16 | python run.py Walker2d-v1 -ps large -p $j -c 1 -n $i -sha load -m 500 -s $s -po FitQ & 17 | sleep 1.5s 18 | 19 | # MinVar 20 | k=$((50*$i)) 21 | echo $k 22 | python run.py Walker2d-v1 -ps large -p $k -c 1 -n $i -sha load -m 500 -s $s -po MinVar & 23 | sleep 1.5s 24 | done 25 | done -------------------------------------------------------------------------------- /optimization/main_algo.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | """ 3 | Stein PPO: Sample-efficient Policy Optimization with Stein Control Variate 4 | 5 | Motivated by the Stein’s identity, Stein PPO extends the previous 6 | control variate methods used in REINFORCE and advantage actor-critic 7 | by introducing more general action-dependent baseline functions. 8 | Details see the following papers: 9 | 10 | Stein PPO: 11 | https://arxiv.org/pdf/1710.11198.pdf 12 | 13 | Distributed PPO: 14 | https://arxiv.org/abs/1707.02286 15 | 16 | Proximal Policy Optimization Algorithms 17 | https://arxiv.org/pdf/1707.06347.pdf 18 | 19 | Generalized Advantage Estimation: 20 | https://arxiv.org/pdf/1506.02438.pdf 21 | 22 | Code modified from this Github repo: https://github.com/pat-coady/trpo 23 | 24 | This GitHub repo is also helpful. 25 | https://github.com/joschu/modular_rl 26 | 27 | This implementation learns policies for continuous environments 28 | in the OpenAI Gym (https://gym.openai.com/). Testing was focused on 29 | the MuJoCo control tasks. 30 | """ 31 | import os 32 | import gym 33 | import random 34 | 35 | import numpy as np 36 | import tb_logger as logger 37 | 38 | import scipy.signal 39 | from gym import wrappers 40 | from utils import Scaler 41 | from policy import Policy 42 | from datetime import datetime 43 | from value_function import NNValueFunction 44 | 45 | 46 | def set_global_seeds(i): 47 | try: 48 | import tensorflow as tf 49 | except ImportError: 50 | pass 51 | else: 52 | tf.set_random_seed(i) 53 | np.random.seed(i) 54 | random.seed(i) 55 | 56 | def init_gym(env_name): 57 | """ 58 | Initialize gym environment, return dimension of observation 59 | and action spaces. 60 | 61 | Args: 62 | env_name: str environment name (e.g. "Humanoid-v1") 63 | 64 | Returns: 3-tuple 65 | gym environment (object) 66 | number of observation dimensions (int) 67 | number of action dimensions (int) 68 | """ 69 | env = gym.make(env_name) 70 | obs_dim = env.observation_space.shape[0] 71 | act_dim = env.action_space.shape[0] 72 | 73 | return env, obs_dim, act_dim 74 | 75 | 76 | def run_episode(env, policy, scaler, max_timesteps, animate=False): 77 | """ Run single episode with option to animate 78 | 79 | Args: 80 | env: ai gym environment 81 | policy: policy object with sample() method 82 | scaler: scaler object, used to scale/offset each observation dimension 83 | to a similar range 84 | animate: boolean, True uses env.render() method to animate episode 85 | 86 | Returns: 4-tuple of NumPy arrays 87 | observes: shape = (episode len, obs_dim) 88 | actions: shape = (episode len, act_dim) 89 | rewards: shape = (episode len,) 90 | unscaled_obs: useful for training scaler, shape = (episode len, obs_dim) 91 | """ 92 | obs = env.reset() 93 | observes, actions, rewards, unscaled_obs = [], [], [], [] 94 | done = False 95 | step = 0.0 96 | scale, offset = scaler.get() 97 | scale[-1] = 1.0 # don't scale time step feature 98 | offset[-1] = 0.0 # don't offset time step feature 99 | for _ in range(max_timesteps): 100 | 101 | if animate: 102 | env.render() 103 | obs = obs.astype(np.float32).reshape((1, -1)) 104 | obs = np.append(obs, [[step]], axis=1) # add time step feature 105 | unscaled_obs.append(obs) 106 | obs = (obs - offset) * scale # center and scale observations 107 | observes.append(obs) 108 | action = policy.sample(obs).reshape((1, -1)).astype(np.float32) 109 | actions.append(action) 110 | obs, reward, done, _ = env.step(np.squeeze(action, axis=0)) 111 | if not isinstance(reward, float): 112 | reward = np.asscalar(reward) 113 | rewards.append(reward) 
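        # The time-step feature added to each observation advances by 1e-3 per step,
        # so it stays on a roughly [0, 1] scale over a ~1000-step episode; scale[-1]
        # and offset[-1] above are pinned to 1.0 / 0.0 so the Scaler leaves it alone.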
114 | step += 1e-3 # increment time step feature 115 | if done: 116 | break 117 | 118 | return (np.concatenate(observes), np.concatenate(actions), 119 | np.array(rewards, dtype=np.float64), np.concatenate(unscaled_obs)) 120 | 121 | 122 | def run_policy(env, policy, scaler, batch_size, max_timesteps): 123 | """ Run policy and collect data for a minimum of min_steps and min_episodes 124 | 125 | Args: 126 | env: ai gym environment 127 | policy: policy object with sample() method 128 | scaler: scaler object, used to scale/offset each observation dimension 129 | to a similar range 130 | episodes: total episodes to run 131 | max_timesteps: max timesteps per episode to run 132 | 133 | Returns: list of trajectory dictionaries, list length = number of episodes 134 | 'observes' : NumPy array of states from episode 135 | 'actions' : NumPy array of actions from episode 136 | 'rewards' : NumPy array of (un-discounted) rewards from episode 137 | 'unscaled_obs' : NumPy array of (un-discounted) rewards from episode 138 | """ 139 | total_steps = 0 140 | trajectories = [] 141 | 142 | while total_steps < batch_size: 143 | observes, actions, rewards, unscaled_obs = run_episode(env, \ 144 | policy, scaler, max_timesteps=max_timesteps) 145 | total_steps += observes.shape[0] 146 | trajectory = {'observes': observes, 147 | 'actions': actions, 148 | 'rewards': rewards, 149 | 'unscaled_obs': unscaled_obs} 150 | trajectories.append(trajectory) 151 | 152 | 153 | unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories]) 154 | scaler.update(unscaled) # update running statistics for scaling observations 155 | 156 | logger.record_dicts({ 157 | "_MeanReward":np.mean([t['rewards'].sum() for t in trajectories]), 158 | 'Steps': total_steps,}) 159 | 160 | return trajectories 161 | 162 | 163 | def discount(x, gamma): 164 | """ Calculate discounted forward sum of a sequence at each point """ 165 | return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1] 166 | 167 | 168 | def add_disc_sum_rew(trajectories, gamma): 169 | """ Adds discounted sum of rewards to all time steps of all trajectories 170 | 171 | Args: 172 | trajectories: as returned by run_policy() 173 | gamma: discount 174 | 175 | Returns: 176 | None (mutates trajectories dictionary to add 'disc_sum_rew') 177 | """ 178 | for trajectory in trajectories: 179 | if gamma < 0.999: # don't scale for gamma ~= 1 180 | rewards = trajectory['rewards'] * (1 - gamma) 181 | else: 182 | rewards = trajectory['rewards'] 183 | disc_sum_rew = discount(rewards, gamma) 184 | trajectory['disc_sum_rew'] = disc_sum_rew 185 | 186 | 187 | def add_value(trajectories, val_func): 188 | """ Adds estimated value to all time steps of all trajectories 189 | 190 | Args: 191 | trajectories: as returned by run_policy() 192 | val_func: object with predict() method, takes observations 193 | and returns predicted state value 194 | 195 | Returns: 196 | None (mutates trajectories dictionary to add 'values') 197 | """ 198 | for trajectory in trajectories: 199 | observes = trajectory['observes'] 200 | values = val_func.predict(observes) 201 | trajectory['values'] = values 202 | 203 | 204 | def add_gae(trajectories, gamma, lam): 205 | """ Add generalized advantage estimator. 206 | https://arxiv.org/pdf/1506.02438.pdf 207 | 208 | Args: 209 | trajectories: as returned by run_policy(), must include 'values' 210 | key from add_value(). 211 | gamma: reward discount 212 | lam: lambda (see paper). 
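                  The estimator sums exponentially weighted TD residuals,
                  delta_t = r_t + gamma * V(s_{t+1}) - V(s_t):
                  A_t = sum_{l>=0} (gamma * lam)^l * delta_{t+l}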
213 | lam=0 : use TD residuals 214 | lam=1 : A = Sum Discounted Rewards - V_hat(s) 215 | 216 | Returns: 217 | None (mutates trajectories dictionary to add 'advantages') 218 | """ 219 | for trajectory in trajectories: 220 | if gamma < 0.999: # don't scale for gamma ~= 1 221 | rewards = trajectory['rewards'] * (1 - gamma) 222 | else: 223 | rewards = trajectory['rewards'] 224 | values = trajectory['values'] 225 | # temporal differences 226 | tds = rewards - values + np.append(values[1:] * gamma, 0) 227 | advantages = discount(tds, gamma * lam) 228 | trajectory['advantages'] = advantages 229 | 230 | 231 | def build_train_set(trajectories): 232 | """ 233 | 234 | Args: 235 | trajectories: trajectories after processing by add_disc_sum_rew(), 236 | add_value(), and add_gae() 237 | 238 | Returns: 4-tuple of NumPy arrays 239 | observes: shape = (N, obs_dim) 240 | actions: shape = (N, act_dim) 241 | advantages: shape = (N,) 242 | disc_sum_rew: shape = (N,) 243 | """ 244 | observes = np.concatenate([t['observes'] for t in trajectories]) 245 | actions = np.concatenate([t['actions'] for t in trajectories]) 246 | disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories]) 247 | advantages = np.concatenate([t['advantages'] for t in trajectories]) 248 | # normalize advantages 249 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) 250 | 251 | return observes, actions, advantages, disc_sum_rew 252 | 253 | 254 | def log_batch_stats(observes, actions, advantages, disc_sum_rew): 255 | """ Log batch statistics """ 256 | 257 | logger.record_dicts({ 258 | '_mean_obs': np.mean(observes), 259 | '_min_obs': np.min(observes), 260 | '_max_obs': np.max(observes), 261 | '_mean_act': np.mean(actions), 262 | '_max_act': np.max(actions), 263 | '_std_act': np.mean(np.var(actions, axis=0)), 264 | '_mean_adv': np.mean(advantages), 265 | '_min_adv': np.min(advantages), 266 | '_max_adv': np.max(advantages), 267 | '_std_adv': np.var(advantages), 268 | '_mean_discrew': np.mean(disc_sum_rew), 269 | '_min_discrew': np.min(disc_sum_rew), 270 | '_max_discrew': np.max(disc_sum_rew), 271 | '_std_discrew': np.var(disc_sum_rew)}) 272 | 273 | logger.dump_tabular() 274 | 275 | 276 | def main(env_name, num_iterations, gamma, lam, kl_targ, 277 | batch_size,hid1_mult, policy_logvar, coef, use_lr_adjust, ada_kl_penalty, 278 | seed, epochs, phi_epochs, max_timesteps, 279 | reg_scale, phi_lr, 280 | phi_hs, 281 | policy_size, 282 | phi_obj): 283 | """ Main training loop 284 | 285 | Args: 286 | env_name: OpenAI Gym environment name, e.g. 
'Hopper-v1' 287 | num_iterations: maximum number of iterations to run 288 | gamma: reward discount factor (float) 289 | lam: lambda from Generalized Advantage Estimate 290 | kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) 291 | batch_size: number of episodes per policy training batch 292 | hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) 293 | policy_logvar: natural log of initial policy variance 294 | coef: coefficient of Stein control variate 295 | use_lr_adjust: whether adjust lr based on kl 296 | ada_kl_penalty: whether adjust kl penalty 297 | max_timesteps: maximum time steps per trajectory 298 | reg_scale: regularization coefficient 299 | policy_size: policy network size 300 | phi_obj: FitQ or MinVar 301 | """ 302 | 303 | env, obs_dim, act_dim = init_gym(env_name) 304 | set_global_seeds(seed) 305 | env.seed(seed) 306 | env._max_episode_steps = max_timesteps 307 | obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) 308 | 309 | now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") 310 | aigym_path = os.path.join('log-files/', env_name, now) 311 | env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False) 312 | 313 | scaler = Scaler(obs_dim) 314 | val_func = NNValueFunction(obs_dim, hid1_mult) 315 | 316 | policy = Policy(obs_dim, act_dim, kl_targ, 317 | hid1_mult, policy_logvar, 318 | epochs, phi_epochs, 319 | policy_size=policy_size, 320 | phi_hidden_sizes=phi_hs, 321 | c_ph=coef, 322 | reg_scale=reg_scale, 323 | lr_phi=phi_lr, 324 | phi_obj=phi_obj) 325 | 326 | # run a few episodes of untrained policy to initialize scaler: 327 | run_policy(env, policy, scaler, batch_size=1000, max_timesteps=max_timesteps) 328 | 329 | for _ in range(num_iterations): 330 | logger.log("\n#Training Iter %d"%(_)) 331 | logger.log("Draw Samples..") 332 | 333 | trajectories = run_policy(env, policy, scaler, 334 | batch_size=batch_size, max_timesteps=max_timesteps) 335 | 336 | add_value(trajectories, val_func) # add estimated values to episodes 337 | add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs 338 | add_gae(trajectories, gamma, lam) # calculate advantage 339 | 340 | # concatenate all episodes into single NumPy arrays 341 | observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) 342 | 343 | # add various stats to training log: 344 | log_batch_stats(observes, actions, advantages, disc_sum_rew) 345 | 346 | logger.log("Starting Training...") 347 | policy.update(observes, actions, advantages, \ 348 | use_lr_adjust, ada_kl_penalty) # update policy 349 | 350 | val_func.fit(observes, disc_sum_rew) # update value function 351 | 352 | logger.log('--------------------------------\n') 353 | 354 | policy.close_sess() 355 | val_func.close_sess() -------------------------------------------------------------------------------- /optimization/phi_functions/ContinousMLPPhiFunction.py: -------------------------------------------------------------------------------- 1 | """ 2 | MLP Stein control variate 3 | """ 4 | 5 | import math 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | 10 | class ContinousMLPPhiFunction(object): 11 | 12 | def __init__(self, obs_dim, act_dim, 13 | name='phi_nn', 14 | hidden_sizes=[100, 100], 15 | regular_scale=0., fn_type='relu'): 16 | self.obs_dim = obs_dim 17 | self.act_dim = act_dim 18 | self.name=name 19 | self.hidden_sizes=hidden_sizes 20 | 21 | if fn_type == 'relu': 22 | self.activation = tf.nn.relu 23 | elif fn_type == 'relu': 24 | self.activation = tf.tanh 25 | 
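        # NOTE: as written, the second branch above also tests fn_type == 'relu',
        # so the tanh activation is unreachable; checking for 'tanh' appears to be
        # the intent. Either way, 'relu' (the default) is what gets used.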
26 | if regular_scale == 0.: 27 | kernel_regularizer = None 28 | else: 29 | kernel_regularizer = tf.contrib.layers.l2_regularizer(scale=regular_scale) 30 | 31 | self.kernel_regularizer = kernel_regularizer 32 | 33 | 34 | def __call__(self, obs_ph, act_ph, reuse=True): 35 | with tf.variable_scope(self.name) as vs: 36 | if reuse: 37 | vs.reuse_variables() 38 | 39 | hid1_size = self.hidden_sizes[0] 40 | hid2_size = self.hidden_sizes[1] 41 | 42 | obs_dim = self.obs_dim 43 | act_dim = self.act_dim 44 | 45 | W1 = self.variable([obs_dim,hid1_size],obs_dim) 46 | b1 = self.variable([hid1_size],obs_dim) 47 | 48 | W2 = self.variable([hid1_size,hid2_size],hid1_size+act_dim) 49 | W2_action = self.variable([act_dim,hid2_size],hid1_size+act_dim) 50 | b2 = self.variable([hid2_size],hid1_size+act_dim) 51 | 52 | W3 = tf.Variable(tf.random_uniform([hid2_size,1],-3e-3,3e-3)) 53 | b3 = tf.Variable(tf.random_uniform([1],-3e-3,3e-3)) 54 | 55 | 56 | layer1 = self.activation(tf.matmul(obs_ph, W1) + b1) 57 | layer2 = self.activation(tf.matmul(layer1,W2) + \ 58 | tf.matmul(act_ph, W2_action) + b2) 59 | out = tf.identity(tf.matmul(layer2,W3) + b3) 60 | 61 | phi_value = tf.squeeze(out) 62 | phi_act_g= tf.gradients(phi_value, act_ph)[0] 63 | 64 | return phi_value, phi_act_g 65 | 66 | 67 | def variable(self,shape, f): 68 | return tf.Variable(tf.random_uniform(shape, \ 69 | -1/math.sqrt(f),1/math.sqrt(f))) 70 | 71 | 72 | 73 | @property 74 | def phi_vars(self): 75 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name) 76 | 77 | -------------------------------------------------------------------------------- /optimization/phi_functions/__init__.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf -------------------------------------------------------------------------------- /optimization/policy.py: -------------------------------------------------------------------------------- 1 | """ 2 | NN Policy with KL Divergence 3 | Constraint and Stein control variates 4 | """ 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | import tb_logger as logger 9 | from utils import progressbar 10 | from phi_functions.ContinousMLPPhiFunction import ContinousMLPPhiFunction 11 | 12 | 13 | class Policy(object): 14 | """ NN-based policy approximation """ 15 | def __init__(self, obs_dim, 16 | act_dim, 17 | kl_targ, 18 | hid1_mult, 19 | policy_logvar, 20 | epochs, 21 | phi_epochs, 22 | policy_size='large', 23 | phi_hidden_sizes='100x100', 24 | c_ph=1, 25 | reg_scale=.0, 26 | lr_phi=0.0005, 27 | phi_obj='MinVar'): 28 | """ 29 | Args: 30 | obs_dim: num observation dimensions (int) 31 | act_dim: num action dimensions (int) 32 | kl_targ: target KL divergence between pi_old and pi_new 33 | hid1_mult: size of first hidden layer, multiplier of obs_dim 34 | policy_logvar: natural log of initial policy variance 35 | epochs: PPO update epochs 36 | phi_epochs: stein control variates update epochs 37 | policy_size: policy_size according to environments 38 | phi_hidden_sizes: control variate Phi network size 39 | c_ph: whether use control varaite 40 | reg_scale: regularization term 41 | lr_phi: Phi learning rate 42 | phi_obj: Updating Phi methods FitQ or MinVar 43 | """ 44 | 45 | self.beta = 1.0 # dynamically adjusted D_KL loss multiplier 46 | self.eta = 50 # multiplier for D_KL-kl_targ hinge-squared loss 47 | self.kl_targ = kl_targ 48 | self.epochs = epochs 49 | self.hid1_mult = hid1_mult 50 | self.policy_logvar = policy_logvar 51 | self.phi_epochs = phi_epochs 52 | 
self.lr = None # lr for policy neural network 53 | self.lr_phi = None # lr for phi function neural network 54 | self.lr_multiplier = 1.0 # dynamically adjust policy's lr 55 | self.obs_dim = obs_dim 56 | self.act_dim = act_dim 57 | self.c_ph = c_ph 58 | self.policy_size=policy_size 59 | self.phi_obj = phi_obj 60 | 61 | # create Phi networks 62 | self.reg_scale = reg_scale 63 | phi_hidden_sizes = [int(x) for x in phi_hidden_sizes.split("x")] 64 | self.phi = ContinousMLPPhiFunction( 65 | obs_dim, act_dim, 66 | hidden_sizes=phi_hidden_sizes, 67 | regular_scale=reg_scale) 68 | 69 | self.lr_phi = lr_phi 70 | 71 | self._build_graph() 72 | self._init_session() 73 | 74 | def _build_graph(self): 75 | """ Build and initialize TensorFlow graph """ 76 | self.g = tf.Graph() 77 | with self.g.as_default(): 78 | self._placeholders() 79 | self._policy_nn() 80 | 81 | self._logprob() 82 | self._kl_entropy() 83 | self._sample() 84 | self._loss_train_op() 85 | self.init = tf.global_variables_initializer() 86 | 87 | def _placeholders(self): 88 | """ Input placeholders""" 89 | # observations, actions and advantages: 90 | self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs') 91 | self.act_ph = tf.placeholder(tf.float32, (None, self.act_dim), 'act') 92 | self.advantages_ph = tf.placeholder(tf.float32, (None,), 'advantages') 93 | 94 | # strength of D_KL loss terms: 95 | self.beta_ph = tf.placeholder(tf.float32, (), 'beta') 96 | self.eta_ph = tf.placeholder(tf.float32, (), 'eta') 97 | 98 | # learning rate: 99 | self.lr_ph = tf.placeholder(tf.float32, (), 'eta') 100 | self.lr_phi_ph = tf.placeholder(tf.float32, (), 'lr_phi') 101 | 102 | self.old_log_vars_ph = tf.placeholder(tf.float32, (self.act_dim,), 'old_log_vars') 103 | self.old_means_ph = tf.placeholder(tf.float32, (None, self.act_dim), 'old_means') 104 | 105 | 106 | def _policy_nn(self): 107 | """ 108 | Neural net for policy 109 | approximation function 110 | """ 111 | 112 | with tf.variable_scope("policy_nn"): 113 | # hidden layer sizes determined by obs_dim 114 | # and act_dim (hid2 is geometric mean) 115 | if self.policy_size == 'small': 116 | logger.log("using small structure") 117 | 118 | hid1_size = self.obs_dim # * 10 119 | hid3_size = self.act_dim # * 10 120 | hid2_size = int(np.sqrt(hid1_size * hid3_size)) 121 | 122 | elif self.policy_size == 'large': 123 | logger.log('Using large structure ') 124 | 125 | hid1_size = self.obs_dim * self.hid1_mult 126 | hid3_size = self.act_dim * 10 127 | hid2_size = int(np.sqrt(hid1_size * hid3_size)) 128 | else: 129 | raise NotImplementedError 130 | 131 | # heuristic to set learning rate based on NN size 132 | self.lr = 9e-4 / np.sqrt(hid2_size) # 9e-4 empirically determined 133 | 134 | # 3 hidden layers with tanh activations 135 | out = tf.layers.dense(self.obs_ph, 136 | hid1_size, tf.tanh, 137 | kernel_initializer=tf.random_normal_initializer( 138 | stddev=np.sqrt(1 / self.obs_dim)), name="h1") 139 | 140 | out = tf.layers.dense(out, 141 | hid2_size, tf.tanh, 142 | kernel_initializer= \ 143 | tf.random_normal_initializer( \ 144 | stddev=np.sqrt(1 / hid1_size)), 145 | name="h2") 146 | 147 | out = tf.layers.dense(out, 148 | hid3_size, tf.tanh, 149 | kernel_initializer= \ 150 | tf.random_normal_initializer( \ 151 | stddev=np.sqrt(1 / hid2_size)), 152 | name="h3") 153 | 154 | self.means = tf.layers.dense(out, self.act_dim, 155 | kernel_initializer= \ 156 | tf.random_normal_initializer( \ 157 | stddev=np.sqrt(1 / hid3_size)), 158 | name="means") 159 | 160 | logvar_speed = (10 * hid3_size) // 48 161 | 
log_vars = tf.get_variable('logvars', 162 | (logvar_speed, self.act_dim), 163 | tf.float32, 164 | tf.constant_initializer(0.0)) 165 | 166 | 167 | self.log_vars = tf.reduce_sum(log_vars, axis=0) + self.policy_logvar 168 | 169 | self.policy_nn_vars = tf.get_collection(\ 170 | tf.GraphKeys.TRAINABLE_VARIABLES, 171 | scope='policy_nn') 172 | 173 | logger.log('Policy Params -- h1: {}, h2: {}, \ 174 | h3: {}, lr: {:.3g}, logvar_speed: {}' 175 | .format(hid1_size, hid2_size, hid3_size, 176 | self.lr, logvar_speed)) 177 | 178 | 179 | def _logprob(self): 180 | 181 | """ 182 | Calculate log probabilities 183 | of a batch of observations & actions 184 | """ 185 | 186 | logp = -0.5 * tf.reduce_sum(self.log_vars) 187 | logp += -0.5 * tf.reduce_sum( 188 | tf.square(self.act_ph - self.means) / 189 | tf.exp(self.log_vars), axis=1) 190 | self.logp = logp 191 | 192 | logp_old = -0.5 * tf.reduce_sum(self.old_log_vars_ph) 193 | logp_old += -0.5 * tf.reduce_sum( 194 | tf.square(self.act_ph - self.old_means_ph) / 195 | tf.exp(self.old_log_vars_ph), axis=1) 196 | 197 | self.logp_old = logp_old 198 | 199 | def _kl_entropy(self): 200 | """ 201 | Add to Graph: 202 | 1. KL divergence between old and new distributions 203 | 2. Entropy of present policy given states and actions 204 | 205 | """ 206 | log_det_cov_old = tf.reduce_sum(self.old_log_vars_ph) 207 | log_det_cov_new = tf.reduce_sum(self.log_vars) 208 | tr_old_new = tf.reduce_sum(tf.exp(self.old_log_vars_ph - self.log_vars)) 209 | 210 | self.kl = 0.5 * tf.reduce_mean(log_det_cov_new - \ 211 | log_det_cov_old + tr_old_new + \ 212 | tf.reduce_sum(tf.square(self.means - \ 213 | self.old_means_ph) / \ 214 | tf.exp(self.log_vars), \ 215 | axis=1) - self.act_dim) 216 | 217 | self.entropy = 0.5 * (self.act_dim * \ 218 | (np.log(2 * np.pi) + 1) + \ 219 | tf.reduce_sum(self.log_vars)) 220 | 221 | def _sample(self): 222 | """ 223 | Sample from distribution, 224 | given observation 225 | """ 226 | 227 | self.sampled_act = (self.means + 228 | tf.exp(self.log_vars / 2.0) * 229 | tf.random_normal(shape=(self.act_dim,))) 230 | 231 | def _loss_train_op(self): 232 | 233 | # get Phi function and its derivatives 234 | phi_value, phi_act_g = self.phi(self.obs_ph, self.act_ph, reuse=False) 235 | self.phi_value = phi_value 236 | self.phi_act_g = phi_act_g 237 | self.phi_nn_vars = self.phi.phi_vars 238 | 239 | ll_mean_g = 1/tf.exp(self.log_vars) * (self.act_ph - self.means) 240 | ll_log_vars_g = -1/2 * ( 1/tf.exp(self.log_vars) \ 241 | - 1/tf.exp(self.log_vars) * \ 242 | (self.act_ph - self.means) * \ 243 | (self.act_ph - self.means) * \ 244 | 1 / tf.exp(self.log_vars)) 245 | 246 | self.phi_value.set_shape((None,)) 247 | 248 | log_vars_inner = tf.expand_dims(tf.exp(self.logp - self.logp_old), 1) \ 249 | * (ll_log_vars_g * tf.expand_dims(self.advantages_ph 250 | - self.c_ph * self.phi_value, 1) \ 251 | + 1/2 * self.c_ph * ll_mean_g * self.phi_act_g ) 252 | 253 | means_inner = tf.expand_dims(tf.exp(self.logp - self.logp_old), 1) \ 254 | * (ll_mean_g * tf.expand_dims(self.advantages_ph - 255 | self.c_ph * self.phi_value, 1) \ 256 | + self.c_ph * self.phi_act_g) 257 | 258 | loss1_log_vars = - tf.reduce_mean( 259 | tf.stop_gradient(log_vars_inner) * \ 260 | tf.exp(self.log_vars)) 261 | 262 | loss1_mean = -tf.reduce_mean( 263 | tf.stop_gradient(means_inner) * \ 264 | self.means) 265 | 266 | loss1 = loss1_log_vars + loss1_mean 267 | 268 | loss2 = tf.reduce_mean(self.beta_ph * self.kl) 269 | 270 | loss3 = self.eta_ph * tf.square(\ 271 | tf.maximum(0.0, \ 272 | self.kl - 2.0 * self.kl_targ)) 
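        # loss1 is the score-function surrogate with the Stein control variate:
        # stop_gradient freezes the per-sample weights (advantage - c * phi, plus
        # the phi gradient-correction term), so gradients flow only through
        # self.means and self.log_vars. loss2 is the adaptive KL penalty
        # (beta * KL), and loss3 is a hinge-squared penalty that activates once
        # KL exceeds 2 * kl_targ, as in the adaptive-KL variant of PPO.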
273 | 274 | self.loss = loss1 + loss2 + loss3 275 | 276 | optimizer = tf.train.AdamOptimizer(self.lr_ph) 277 | self.train_op = optimizer.minimize(self.loss, 278 | var_list= self.policy_nn_vars) 279 | 280 | 281 | if self.reg_scale > 0.: 282 | reg_variables = tf.get_collection(\ 283 | tf.GraphKeys.REGULARIZATION_LOSSES) 284 | 285 | reg_term = tf.contrib.layers.apply_regularization( 286 | self.phi.kernel_regularizer, 287 | reg_variables) 288 | else: 289 | reg_term = 0. 290 | 291 | if self.c_ph == 1.: 292 | if self.phi_obj == 'FitQ': 293 | self.phi_loss = tf.reduce_mean(\ 294 | tf.square(self.advantages_ph - \ 295 | self.phi_value), axis=0) + reg_term 296 | 297 | logger.log('phi_with FitQ as objective function') 298 | 299 | elif self.phi_obj == 'MinVar': 300 | self.means_mse = tf.reduce_sum(\ 301 | tf.reduce_mean( \ 302 | tf.square(means_inner - \ 303 | tf.reduce_mean(means_inner, \ 304 | axis=0)), axis = 0)) 305 | 306 | self.logstd_vars_mse = tf.reduce_sum(\ 307 | tf.reduce_mean( \ 308 | tf.square(log_vars_inner - \ 309 | tf.reduce_mean(log_vars_inner, \ 310 | axis=0)), axis = 0)) 311 | 312 | self.phi_loss = self.means_mse + self.logstd_vars_mse + reg_term 313 | logger.log('phi with MinVar as objecive function') 314 | 315 | else: 316 | raise NotImplementedError 317 | 318 | phi_optimizer = tf.train.AdamOptimizer(self.lr_phi_ph) 319 | self.phi_train_op = phi_optimizer.minimize(\ 320 | self.phi_loss, 321 | var_list=self.phi_nn_vars) 322 | 323 | elif self.c_ph == 0.: 324 | logger.log("Training with PPO") 325 | self.phi_train_op = tf.no_op 326 | 327 | 328 | 329 | def _init_session(self): 330 | """Launch TensorFlow session and initialize variables""" 331 | self.sess = tf.Session(graph=self.g) 332 | self.sess.run(self.init) 333 | 334 | def sample(self, obs): 335 | """Draw sample from policy distribution""" 336 | feed_dict = {self.obs_ph: obs} 337 | 338 | return self.sess.run(self.sampled_act, feed_dict=feed_dict) 339 | 340 | def update(self, observes, actions, advantages, use_lr_adjust, ada_kl_penalty): 341 | """ Update policy based on observations, actions and advantages 342 | 343 | Args: 344 | observes: observations, shape = (N, obs_dim) 345 | actions: actions, shape = (N, act_dim) 346 | advantages: advantages, shape = (N,) 347 | phi_value: phi_value, shape = (N,) 348 | phi_act_g: phi_act_g, shape = (N, act_dim) 349 | """ 350 | feed_dict = {self.obs_ph: observes, 351 | self.act_ph: actions, 352 | self.advantages_ph: advantages, 353 | self.beta_ph: self.beta, 354 | self.eta_ph: self.eta, 355 | self.lr_ph: self.lr * self.lr_multiplier, 356 | self.lr_phi_ph: self.lr_phi} 357 | old_means_np, old_log_vars_np = self.sess.run([self.means, self.log_vars], 358 | feed_dict) 359 | feed_dict[self.old_log_vars_ph] = old_log_vars_np 360 | feed_dict[self.old_means_ph] = old_means_np 361 | loss, kl, entropy = 0, 0, 0 362 | 363 | if self.c_ph == 1.: 364 | # Update phi function & policy network 365 | logger.log("Training Phi for %d epochs"%self.phi_epochs) 366 | 367 | for _ in progressbar(range(self.phi_epochs), "Train Phi:", 25): 368 | self.sess.run(self.phi_train_op, feed_dict) 369 | phi_loss = self.sess.run(self.phi_loss, feed_dict) 370 | 371 | logger.record_tabular("Phi_loss", phi_loss) 372 | 373 | # Training policy 374 | logger.log("Training Policy for %d epochs"%self.epochs) 375 | for _ in progressbar(range(self.epochs), "Train Policy", 25): 376 | self.sess.run(self.train_op, feed_dict) 377 | loss, kl, entropy = self.sess.run([self.loss, self.kl, self.entropy], feed_dict) 378 | if kl > self.kl_targ * 4: # 
early stopping if D_KL diverges badly 379 | break 380 | 381 | if (ada_kl_penalty): 382 | if kl > self.kl_targ * 2: # servo beta to reach D_KL target 383 | self.beta = np.minimum(35, 1.5 * self.beta) # max clip beta 384 | if (use_lr_adjust): 385 | if self.beta > 30 and self.lr_multiplier > 0.1: 386 | self.lr_multiplier /= 1.5 387 | elif kl < self.kl_targ / 2: 388 | self.beta = np.maximum(1 / 35, self.beta / 1.5) # min clip beta 389 | if (use_lr_adjust): 390 | if self.beta < (1 / 30) and self.lr_multiplier < 10: 391 | self.lr_multiplier *= 1.5 392 | 393 | logger.record_dicts({ 394 | 'PolicyLoss': loss, 395 | 'PolicyEntropy': entropy, 396 | 'KL': kl, 397 | 'Beta': self.beta, 398 | '_lr_multiplier': self.lr_multiplier}) 399 | 400 | 401 | def close_sess(self): 402 | """ Close TensorFlow session """ 403 | self.sess.close() 404 | -------------------------------------------------------------------------------- /optimization/scripts/test_FitQ.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Optimizing Phi by FitQ method 3 | cd .. 4 | 5 | for ((s=1; s<=5; s+=1)) 6 | do 7 | echo $s 8 | 9 | # Walker2d-v1 with different training phi iterations 10 | python train.py Walker2d-v1 -po FitQ -p 500 -s $s -n 500& 11 | sleep 1.5s 12 | python train.py Walker2d-v1 -po FitQ -p 1000 -s $s -n 500& 13 | sleep 1.5s 14 | python train.py Walker2d-v1 -c 0 -p 0 -s $s -n 500& 15 | sleep 1.5s 16 | 17 | # HalfCheetah-v1 with different training phi iterations 18 | python train.py HalfCheetah-v1 -c 0 -p 0 -s $s -n 500& 19 | sleep 1.5s 20 | python train.py HalfCheetah-v1 -po FitQ -p 500 -s $s -n 500& 21 | sleep 1.5s 22 | python train.py HalfCheetah-v1 -po FitQ -p 1000 -s $s -n 500& 23 | sleep 1.5s 24 | done 25 | -------------------------------------------------------------------------------- /optimization/scripts/test_MinVar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Optimizing Phi by MinVar method 3 | cd .. 4 | 5 | for ((s=1; s<=5; s+=1)) 6 | do 7 | echo $s 8 | 9 | # Walker2d-v1 10 | python train.py Walker2d-v1 -po MinVar -p 500 -s $s -n 500& 11 | sleep 1.5s 12 | python train.py Walker2d-v1 -po MinVar -p 1000 -s $s -n 500& 13 | sleep 1.5s 14 | python train.py Walker2d-v1 -c 0 -p 0 -s $s -n 500& 15 | sleep 1.5s 16 | 17 | # HalfCheetah-v1 18 | python train.py HalfCheetah-v1 -c 0 -p 0 -s $s -n 500& 19 | sleep 1.5s 20 | python train.py HalfCheetah-v1 -po MinVar -p 500 -s $s -n 500& 21 | sleep 1.5s 22 | python train.py HalfCheetah-v1 -po MinVar -p 1000 -s $s -n 500& 23 | sleep 1.5s 24 | done 25 | -------------------------------------------------------------------------------- /optimization/tb_logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import os.path as osp 5 | import json 6 | import time 7 | import datetime 8 | import tempfile 9 | 10 | LOG_OUTPUT_FORMATS = ['log', 'json', 'tensorboard', 'stdout'] 11 | 12 | DEBUG = 10 13 | INFO = 20 14 | WARN = 30 15 | ERROR = 40 16 | 17 | DISABLED = 50 18 | LOG_DIR='' 19 | 20 | class OutputFormat(object): 21 | def writekvs(self, kvs): 22 | """ 23 | Write key-value pairs 24 | """ 25 | raise NotImplementedError 26 | 27 | def writeseq(self, args): 28 | """ 29 | Write a sequence of other data (e.g. 
a logging message) 30 | """ 31 | pass 32 | 33 | def close(self): 34 | return 35 | 36 | 37 | class HumanOutputFormat(OutputFormat): 38 | def __init__(self, file): 39 | self.file = file 40 | 41 | def writekvs(self, kvs): 42 | # Create strings for printing 43 | key2str = {} 44 | for (key, val) in sorted(kvs.items()): 45 | if isinstance(val, float): 46 | valstr = '%-8.3g' % (val,) 47 | else: 48 | valstr = str(val) 49 | key2str[self._truncate(key)] = self._truncate(valstr) 50 | 51 | # Find max widths 52 | keywidth = max(map(len, key2str.keys())) 53 | valwidth = max(map(len, key2str.values())) 54 | 55 | # Write out the data 56 | dashes = '-' * (keywidth + valwidth + 7) 57 | lines = [dashes] 58 | for (key, val) in sorted(key2str.items()): 59 | lines.append('| %s%s | %s%s |' % ( 60 | key, 61 | ' ' * (keywidth - len(key)), 62 | val, 63 | ' ' * (valwidth - len(val)), 64 | )) 65 | lines.append(dashes) 66 | self.file.write('\n'.join(lines) + '\n') 67 | 68 | # Flush the output to the file 69 | self.file.flush() 70 | 71 | def _truncate(self, s): 72 | return s[:20] + '...' if len(s) > 23 else s 73 | 74 | def writeseq(self, args): 75 | for arg in args: 76 | self.file.write(arg) 77 | self.file.write('\n') 78 | self.file.flush() 79 | 80 | class JSONOutputFormat(OutputFormat): 81 | def __init__(self, file): 82 | self.file = file 83 | 84 | def writekvs(self, kvs): 85 | for k, v in sorted(kvs.items()): 86 | if hasattr(v, 'dtype'): 87 | v = v.tolist() 88 | kvs[k] = float(v) 89 | self.file.write(json.dumps(kvs) + '\n') 90 | self.file.flush() 91 | 92 | class TensorBoardOutputFormat(OutputFormat): 93 | """ 94 | Dumps key/value pairs into TensorBoard's numeric format. 95 | """ 96 | def __init__(self, dir): 97 | os.makedirs(dir, exist_ok=True) 98 | self.dir = dir 99 | self.step = 1 100 | prefix = 'events' 101 | path = osp.join(osp.abspath(dir), prefix) 102 | import tensorflow as tf 103 | from tensorflow.python import pywrap_tensorflow 104 | from tensorflow.core.util import event_pb2 105 | from tensorflow.python.util import compat 106 | self.tf = tf 107 | self.event_pb2 = event_pb2 108 | self.pywrap_tensorflow = pywrap_tensorflow 109 | self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) 110 | 111 | def writekvs(self, kvs): 112 | def summary_val(k, v): 113 | kwargs = {'tag': k, 'simple_value': float(v)} 114 | return self.tf.Summary.Value(**kwargs) 115 | summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) 116 | event = self.event_pb2.Event(wall_time=time.time(), summary=summary) 117 | event.step = self.step # is there any reason why you'd want to specify the step? 
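        # The step gives TensorBoard its x-axis: each dumpkvs() call emits one
        # event, so logged scalars are plotted against the iteration index.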
118 | self.writer.WriteEvent(event) 119 | self.writer.Flush() 120 | self.step += 1 121 | 122 | def close(self): 123 | if self.writer: 124 | self.writer.Close() 125 | self.writer = None 126 | 127 | 128 | def make_output_format(format, ev_dir): 129 | os.makedirs(ev_dir, exist_ok=True) 130 | if format == 'stdout': 131 | return HumanOutputFormat(sys.stdout) 132 | elif format == 'log': 133 | log_file = open(osp.join(ev_dir, 'log.txt'), 'wt') 134 | return HumanOutputFormat(log_file) 135 | elif format == 'json': 136 | json_file = open(osp.join(ev_dir, 'progress.json'), 'wt') 137 | return JSONOutputFormat(json_file) 138 | elif format == 'tensorboard': 139 | return TensorBoardOutputFormat(osp.join(ev_dir, 'tb')) 140 | else: 141 | raise ValueError('Unknown format specified: %s' % (format,)) 142 | 143 | # ================================================================ 144 | # API 145 | # ================================================================ 146 | 147 | def logkv(key, val): 148 | """ 149 | Log a value of some diagnostic 150 | Call this once for each diagnostic quantity, each iteration 151 | """ 152 | Logger.CURRENT.logkv(key, val) 153 | 154 | def logkvs(d): 155 | """ 156 | Log a dictionary of key-value pairs 157 | """ 158 | for (k, v) in d.items(): 159 | logkv(k, v) 160 | 161 | def dumpkvs(): 162 | """ 163 | Write all of the diagnostics from the current iteration 164 | 165 | level: int. (see logger.py docs) If the global logger level is higher than 166 | the level argument here, don't print to stdout. 167 | """ 168 | Logger.CURRENT.dumpkvs() 169 | 170 | def getkvs(): 171 | return Logger.CURRENT.name2val 172 | 173 | 174 | def log(*args, level=INFO): 175 | """ 176 | Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). 177 | """ 178 | Logger.CURRENT.log(*args, level=level) 179 | 180 | 181 | def debug(*args): 182 | log(*args, level=DEBUG) 183 | 184 | 185 | def info(*args): 186 | log(*args, level=INFO) 187 | 188 | 189 | def warn(*args): 190 | log(*args, level=WARN) 191 | 192 | 193 | def error(*args): 194 | log(*args, level=ERROR) 195 | 196 | 197 | def set_level(level): 198 | """ 199 | Set logging threshold on current logger. 200 | """ 201 | Logger.CURRENT.set_level(level) 202 | 203 | def get_dir(): 204 | """ 205 | Get directory that log files are being written to. 206 | will be None if there is no output directory (i.e., if you didn't call start) 207 | """ 208 | return Logger.CURRENT.get_dir() 209 | 210 | record_tabular = logkv 211 | dump_tabular = dumpkvs 212 | record_dicts = logkvs 213 | 214 | # ================================================================ 215 | # Backend 216 | # ================================================================ 217 | 218 | class Logger(object): 219 | DEFAULT = None # A logger with no output files. 
(See right below class definition) 220 | # So that you can still log to the terminal without setting up any output files 221 | CURRENT = None # Current logger being used by the free functions above 222 | 223 | def __init__(self, dir, output_formats): 224 | self.name2val = {} # values this iteration 225 | self.level = INFO 226 | self.dir = dir 227 | self.output_formats = output_formats 228 | 229 | # Logging API, forwarded 230 | # ---------------------------------------- 231 | def logkv(self, key, val): 232 | self.name2val[key] = val 233 | 234 | def dumpkvs(self): 235 | if self.level == DISABLED: return 236 | for fmt in self.output_formats: 237 | fmt.writekvs(self.name2val) 238 | self.name2val.clear() 239 | 240 | def log(self, *args, level=INFO): 241 | if self.level <= level: 242 | self._do_log(args) 243 | 244 | # Configuration 245 | # ---------------------------------------- 246 | def set_level(self, level): 247 | self.level = level 248 | 249 | def get_dir(self): 250 | return self.dir 251 | 252 | def close(self): 253 | for fmt in self.output_formats: 254 | fmt.close() 255 | 256 | # Misc 257 | # ---------------------------------------- 258 | def _do_log(self, args): 259 | for fmt in self.output_formats: 260 | fmt.writeseq(args) 261 | 262 | Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) 263 | 264 | def configure(dir=None, format_strs=None): 265 | print("has been called with dir=%s"%dir) 266 | assert Logger.CURRENT is Logger.DEFAULT,\ 267 | "Only call logger.configure() when it's in the default state. Try calling logger.reset() first." 268 | prevlogger = Logger.CURRENT 269 | if dir is None: 270 | dir = os.getenv('DARTML_LOGDIR') 271 | if dir is None: 272 | dir = osp.join(tempfile.gettempdir(), 273 | datetime.datetime.now().strftime("dartml-%Y-%m-%d-%H-%M-%S-%f")) 274 | if format_strs is None: 275 | format_strs = LOG_OUTPUT_FORMATS 276 | output_formats = [make_output_format(f, dir) for f in format_strs] 277 | Logger.CURRENT = Logger(dir=dir, output_formats=output_formats) 278 | log('Logging to %s'%dir) 279 | print("set up down..") 280 | #time.sleep(10) 281 | 282 | 283 | def set_logdir(dir_name): 284 | if not os.path.exists(dir_name): 285 | os.mkdir(dir_name) 286 | print("logger set dir has been called with %s"%(dir_name)) 287 | LOG_DIR=dir_name 288 | 289 | configure(dir=dir_name) 290 | 291 | def reset(): 292 | Logger.CURRENT = Logger.DEFAULT 293 | log('Reset logger') 294 | 295 | # ================================================================ 296 | -------------------------------------------------------------------------------- /optimization/train.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | import os 4 | import argparse 5 | import tb_logger as logger 6 | 7 | from main_algo import main 8 | from datetime import datetime 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser(description=('Train policy on OpenAI Gym environment ' 13 | 'using Proximal Policy Optimizer with Stein Control Variates')) 14 | parser.add_argument('env_name', type=str, help='OpenAI Gym environment name') 15 | parser.add_argument('-n', '--num_iterations', type=int, help='Number of iterations to run', 16 | default=1000) 17 | parser.add_argument('-g', '--gamma', type=float, help='Discount factor', default=0.995) 18 | parser.add_argument('-l', '--lam', type=float, help='Lambda for Generalized Advantage Estimation', 19 | default=0.98) 20 | parser.add_argument('-k', '--kl_targ', type=float, help='D_KL target value', 21 | default=0.003) 22 | parser.add_argument('-b', '--batch_size', type=int, 23 | help='Number of batch_size per training batch', 24 | default=10000) 25 | parser.add_argument('-m', '--hid1_mult', type=int, 26 | help='Size of first hidden layer for value and policy NNs' 27 | '(integer multiplier of observation dimension)', 28 | default=10) 29 | parser.add_argument('-v', '--policy_logvar', type=float, 30 | help='Initial policy log-variance (natural log of variance)', 31 | default=-1.0) 32 | parser.add_argument('-c', '--coef', type=float, help='Stein control variate coefficient value', 33 | default=1.0) 34 | parser.add_argument('-u', '--use_lr_adjust', help='whether adaptively adjust lr', type=int, default=0) 35 | parser.add_argument('-a', '--ada_kl_penalty', help='whether add kl adaptive penalty', type=int, default=1) 36 | parser.add_argument('-s','--seed', help='RNG seed', type=int, default=0) 37 | parser.add_argument('-e', '--epochs', help='epochs', type=int, default=20) 38 | parser.add_argument('-p', '--phi_epochs', help='phi epochs', type=int, default=500) 39 | parser.add_argument('-mt', '--max_timesteps', help='Max timesteps', type=int, default=1000) 40 | parser.add_argument('-r', '--reg_scale', help='regularization scale on phi function', type=float, default=.0) 41 | parser.add_argument('-lr', '--phi_lr', help='phi learning_rate', type=float, default=0.0005) 42 | parser.add_argument('-ph', '--phi_hs', 43 | help='phi structure, default 100x100 for mlp', 44 | type=str, default='100x100') 45 | 46 | parser.add_argument('-ps', '--policy_size', 47 | help='large or small policy size to use, \ 48 | use small for Ant, Humanoid and HumanoidStandup', 49 | type=str, default='large') 50 | parser.add_argument('-po', '--phi_obj', help='phi objective \ 51 | function FitQ or MinVar', type=str, default='MinVar') 52 | 53 | args = parser.parse_args() 54 | 55 | # logs 56 | dir_name = os.path.join('dartml_data', 'env=%s/'%(args.env_name)) 57 | 58 | if args.coef == 0.: 59 | dir_name += 'PPO-%s'%(datetime.now().strftime('%m_%d_%H:%M:%S')) 60 | else: 61 | dir_name += 'Stein-PPO_Phi_obj=%s-%s'%(args.phi_obj, \ 62 | datetime.now().strftime('%m_%d_%H:%M:%S')) 63 | 64 | 65 | if not os.path.exists(dir_name): 66 | os.makedirs(dir_name) 67 | os.environ["DARTML_LOGDIR"]=dir_name 68 | logger.set_logdir(dir_name) 69 | 70 | args = parser.parse_args() 71 | main(**vars(args)) 72 | -------------------------------------------------------------------------------- /optimization/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data Scaling Utilities 3 | """ 4 | import os 5 | import sys 6 | import glob 7 | import shutil 8 | import 
numpy as np 9 | 10 | class Scaler(object): 11 | """ Generate scale and offset based on running mean and stddev along axis=0 12 | 13 | offset = running mean 14 | scale = 1 / (stddev + 0.1) / 3 (i.e. 3x stddev = +/- 1.0) 15 | """ 16 | 17 | def __init__(self, obs_dim): 18 | """ 19 | Args: 20 | obs_dim: dimension of axis=1 21 | """ 22 | self.vars = np.zeros(obs_dim) 23 | self.means = np.zeros(obs_dim) 24 | self.m = 0 25 | self.n = 0 26 | self.first_pass = True 27 | 28 | def update(self, x): 29 | """ Update running mean and variance (this is an exact method) 30 | Args: 31 | x: NumPy array, shape = (N, obs_dim) 32 | 33 | see: https://stats.stackexchange.com/questions/43159/how-to-calculate-pooled- 34 | variance-of-two-groups-given-known-group-variances-mean 35 | """ 36 | if self.first_pass: 37 | self.means = np.mean(x, axis=0) 38 | self.vars = np.var(x, axis=0) 39 | self.m = x.shape[0] 40 | self.first_pass = False 41 | else: 42 | n = x.shape[0] 43 | new_data_var = np.var(x, axis=0) 44 | new_data_mean = np.mean(x, axis=0) 45 | new_data_mean_sq = np.square(new_data_mean) 46 | new_means = ((self.means * self.m) + (new_data_mean * n)) / (self.m + n) 47 | self.vars = (((self.m * (self.vars + np.square(self.means))) + 48 | (n * (new_data_var + new_data_mean_sq))) / (self.m + n) - 49 | np.square(new_means)) 50 | self.vars = np.maximum(0.0, self.vars) # occasionally goes negative, clip 51 | self.means = new_means 52 | self.m += n 53 | 54 | def get(self): 55 | """ returns 2-tuple: (scale, offset) """ 56 | return 1/(np.sqrt(self.vars) + 0.1)/3, self.means 57 | 58 | 59 | def progressbar(it, prefix = "", size = 60): 60 | count = len(it) 61 | def _show(_i): 62 | x = int(size*_i/count) 63 | sys.stdout.write("%s[%s%s] %i/%i\r" % (prefix, "#"*x, "."*(size-x), _i, count)) 64 | sys.stdout.flush() 65 | 66 | _show(0) 67 | for i, item in enumerate(it): 68 | yield item 69 | _show(i+1) 70 | sys.stdout.write("\n") 71 | sys.stdout.flush() -------------------------------------------------------------------------------- /optimization/value_function.py: -------------------------------------------------------------------------------- 1 | """ 2 | State-Value Function 3 | 4 | """ 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | from sklearn.utils import shuffle 9 | import tb_logger as logger 10 | 11 | class NNValueFunction(object): 12 | """ NN-based state-value function """ 13 | def __init__(self, obs_dim, hid1_mult): 14 | """ 15 | Args: 16 | obs_dim: number of dimensions in observation vector (int) 17 | hid1_mult: size of first hidden layer, multiplier of obs_dim 18 | """ 19 | self.replay_buffer_x = None 20 | self.replay_buffer_y = None 21 | self.obs_dim = obs_dim 22 | self.hid1_mult = hid1_mult 23 | self.epochs = 10 24 | self.lr = None # learning rate set in _build_graph() 25 | self._build_graph() 26 | self.sess = tf.Session(graph=self.g) 27 | self.sess.run(self.init) 28 | 29 | def _build_graph(self): #NOTE:experimental results show that quartic function works also well 30 | """ Construct TensorFlow graph, including loss function, init op and train op """ 31 | self.g = tf.Graph() 32 | with self.g.as_default(): 33 | self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs_valfunc') 34 | self.val_ph = tf.placeholder(tf.float32, (None,), 'val_valfunc') 35 | 36 | hid1_size = self.obs_dim * self.hid1_mult 37 | hid3_size = 5 38 | hid2_size = int(np.sqrt(hid1_size * hid3_size)) 39 | 40 | self.lr = 1e-3 / np.sqrt(hid2_size) # 1e-3 empirically determined 41 | print('Value Params -- h1: {}, h2: {}, h3: {}, lr: 
{:.3g}' 42 | .format(hid1_size, hid2_size, hid3_size, self.lr)) 43 | # 3 hidden layers with tanh activations 44 | out = tf.layers.dense(self.obs_ph, hid1_size, tf.tanh, 45 | kernel_initializer=tf.random_normal_initializer( 46 | stddev=np.sqrt(1 / self.obs_dim)), name="h1") 47 | out = tf.layers.dense(out, hid2_size, tf.tanh, 48 | kernel_initializer=tf.random_normal_initializer( 49 | stddev=np.sqrt(1 / hid1_size)), name="h2") 50 | out = tf.layers.dense(out, hid3_size, tf.tanh, 51 | kernel_initializer=tf.random_normal_initializer( 52 | stddev=np.sqrt(1 / hid2_size)), name="h3") 53 | out = tf.layers.dense(out, 1, 54 | kernel_initializer=tf.random_normal_initializer( 55 | stddev=np.sqrt(1 / hid3_size)), name='output') 56 | self.out = tf.squeeze(out) 57 | self.loss = tf.reduce_mean(tf.square(self.out - self.val_ph)) # squared loss 58 | optimizer = tf.train.AdamOptimizer(self.lr) 59 | self.train_op = optimizer.minimize(self.loss) 60 | self.init = tf.global_variables_initializer() 61 | self.sess = tf.Session(graph=self.g) 62 | self.sess.run(self.init) 63 | 64 | def fit(self, x, y): 65 | """ Fit model to current data batch + previous data batch 66 | 67 | Args: 68 | x: features 69 | y: target 70 | """ 71 | num_batches = max(x.shape[0] // 256, 1) 72 | batch_size = x.shape[0] // num_batches 73 | y_hat = self.predict(x) # check explained variance prior to update 74 | old_exp_var = 1 - np.var(y - y_hat)/np.var(y) 75 | if self.replay_buffer_x is None: 76 | x_train, y_train = x, y 77 | else: 78 | x_train = np.concatenate([x, self.replay_buffer_x]) 79 | y_train = np.concatenate([y, self.replay_buffer_y]) 80 | self.replay_buffer_x = x 81 | self.replay_buffer_y = y 82 | for e in range(self.epochs): 83 | x_train, y_train = shuffle(x_train, y_train) 84 | for j in range(num_batches): 85 | start = j * batch_size 86 | end = (j + 1) * batch_size 87 | feed_dict = {self.obs_ph: x_train[start:end, :], 88 | self.val_ph: y_train[start:end]} 89 | _, l = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict) 90 | y_hat = self.predict(x) 91 | loss = np.mean(np.square(y_hat - y)) # explained variance after update 92 | exp_var = 1 - np.var(y - y_hat) / np.var(y) # diagnose over-fitting of val func 93 | 94 | logger.record_dicts({ 95 | 'VarFuncLoss': loss, 96 | 'ExplainedVarNew':exp_var, 97 | 'ExplainedVarOld': old_exp_var}) 98 | 99 | def predict(self, x): 100 | """ Predict method """ 101 | feed_dict = {self.obs_ph: x} 102 | y_hat = self.sess.run(self.out, feed_dict=feed_dict) 103 | 104 | return np.squeeze(y_hat) 105 | 106 | def close_sess(self): 107 | """ Close TensorFlow session """ 108 | self.sess.close() 109 | --------------------------------------------------------------------------------
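The evaluation script earlier in this dump (evaluation/traj_visualize.py) estimates gradient variance by repeatedly splitting the per-trajectory gradients into two random halves and averaging the squared distance between the half-means. Below is a minimal, self-contained sketch of that estimator on dummy data; it is not part of the repository, and the function name half_split_variance, the array shapes, and the seed are illustrative assumptions only.

import numpy as np

def half_split_variance(grads, k=1000):
    """grads: (num_traj, grad_dim) per-trajectory gradients; returns a scalar estimate."""
    grads = np.asarray(grads)
    n = len(grads)
    estimates = []
    for _ in range(k):
        # random half/half split of the trajectories
        indices = np.random.choice(n, n // 2, replace=False)
        mask = np.zeros(n, dtype=bool)
        mask[indices] = True
        # squared distance between the two half-means, summed over gradient dimensions
        diff = np.mean(grads[mask], axis=0) - np.mean(grads[~mask], axis=0)
        estimates.append(np.sum(diff ** 2))
    return float(np.mean(estimates))

if __name__ == '__main__':
    np.random.seed(0)
    fake_grads = np.random.randn(60, 8)   # e.g. 60 trajectories, 8-dim gradient
    print(half_split_variance(fake_grads, k=2000))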