├── .gitignore ├── Dockerfile ├── README.md ├── __init__.py ├── demo_scripts ├── PETS.sh ├── POPLINA_INIT.sh ├── POPLINA_REPLAN.sh ├── POPLINP_AVG.sh ├── POPLINP_BC.sh └── POPLINP_UNI.sh ├── dmbrl ├── __init__.py ├── config │ ├── __init__.py │ ├── default.py │ ├── gym_acrobot.py │ ├── gym_ant.py │ ├── gym_cartpole.py │ ├── gym_cheetah.py │ ├── gym_fhopper.py │ ├── gym_fswimmer.py │ ├── gym_hopper.py │ ├── gym_invertedPendulum.py │ ├── gym_pendulum.py │ ├── gym_reacher.py │ ├── gym_swimmer.py │ ├── gym_walker2d.py │ ├── halfcheetah.py │ ├── pusher.py │ ├── reacher.py │ ├── reward_util.py │ ├── template.py │ └── view_humanoid.py ├── controllers │ ├── Controller.py │ ├── MPC.py │ └── __init__.py ├── env │ ├── __init__.py │ ├── assets │ │ ├── cartpole.xml │ │ ├── half_cheetah.xml │ │ ├── pusher.xml │ │ └── reacher3d.xml │ ├── cartpole.py │ ├── half_cheetah.py │ ├── pusher.py │ └── reacher.py ├── misc │ ├── Agent.py │ ├── DotmapUtils.py │ ├── MBExp.py │ ├── __init__.py │ ├── logger.py │ └── optimizers │ │ ├── POPLIN_A.py │ │ ├── POPLIN_P.py │ │ ├── __init__.py │ │ ├── cem.py │ │ ├── gbp_cem.py │ │ ├── gbp_rs.py │ │ ├── optimizer.py │ │ ├── pgcem.py │ │ ├── policy_network │ │ ├── BC_A_policy.py │ │ ├── BC_WA_policy.py │ │ ├── BC_WD_policy.py │ │ ├── __init__.py │ │ ├── base_policy.py │ │ ├── gan_policy.py │ │ ├── gmm_policy.py │ │ ├── gmm_util.py │ │ ├── tf_networks.py │ │ ├── tf_norm.py │ │ ├── tf_utils.py │ │ ├── wgan_policy.py │ │ └── whitening_util.py │ │ └── random.py └── modeling │ ├── layers │ ├── FC.py │ └── __init__.py │ ├── models │ ├── BNN.py │ ├── GT_dynamics.py │ ├── NN.py │ ├── TFGP.py │ └── __init__.py │ └── utils │ ├── TensorStandardScaler.py │ └── __init__.py ├── img ├── curve.png ├── policy_control.png ├── reward.png └── table.png ├── mbexp.py ├── requirements.txt ├── scripts ├── mbexp.py └── render.py ├── show_result.py └── show_with_test_result.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | tags 3 | *.pyc 4 | log/ 5 | *.swp 6 | *.swo 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04 2 | 3 | RUN rm /bin/sh && ln -s /bin/bash /bin/sh 4 | 5 | # Install pip 6 | RUN apt-get update 7 | RUN apt-get -y install python3 python3-pip python3-dev python3-tk 8 | RUN apt-get -y install libglu1-mesa libxi-dev libxmu-dev libglu1-mesa-dev 9 | 10 | # Install basic libraries 11 | RUN pip3 install --upgrade pip 12 | RUN pip3 install numpy tensorflow-gpu==1.9 matplotlib scipy scikit-learn future 13 | 14 | # Install MuJoCo + OpenAI gym 15 | RUN pip3 install gym==0.9.4 16 | RUN apt-get update 17 | RUN apt-get -y install unzip unetbootin wget 18 | RUN mkdir -p /.mujoco && cd /.mujoco && wget https://www.roboti.us/download/mjpro131_linux.zip && unzip mjpro131_linux.zip 19 | ENV MUJOCO_PY_MJKEY_PATH="/root/.mujoco/mjkey.txt" 20 | ENV MUJOCO_PY_MJPRO_PATH="/root/.mujoco/mjpro131" 21 | RUN pip3 install mujoco-py==0.5.7 22 | 23 | # Install additional requirements 24 | RUN pip3 install datetime gitpython h5py tqdm dotmap cython 25 | 26 | # GPFlow 27 | RUN apt-get -y install git 28 | RUN git clone https://github.com/GPflow/GPflow.git 29 | RUN pip3 install pandas multipledispatch pytest 30 | RUN cd GPflow/ && pip install . 
--no-deps 31 | 32 | # Create copy of Deep MBRL repo and place in ~/handful-of-trials 33 | RUN cd ~ && git clone https://github.com/kchua/handful-of-trials.git 34 | 35 | # Environment setup 36 | RUN echo 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib/x86_64-linux-gnu' >> /root/.bashrc 37 | RUN echo 'alias python=python3' >> /root/.bashrc 38 | 39 | CMD /bin/bash 40 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WilsonWangTHU/POPLIN/edd8dba50f9049c6164eda774602bef0c299cb51/__init__.py -------------------------------------------------------------------------------- /demo_scripts/PETS.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python mbexp.py -logdir ./log/PETS \ 3 | -env halfcheetah \ 4 | -o exp_cfg.exp_cfg.ntrain_iters 50 \ 5 | -ca opt-type CEM \ 6 | -ca model-type PE \ 7 | -ca prop-type E 8 | -------------------------------------------------------------------------------- /demo_scripts/POPLINA_INIT.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # The following script will run POPLIN-A with INIT methods on halfcheetah 3 | 4 | python mbexp.py -logdir ./log/POPLIN_A \ 5 | -env halfcheetah \ 6 | -o exp_cfg.exp_cfg.ntrain_iters 50 \ 7 | -o ctrl_cfg.cem_cfg.cem_type POPLINA-INIT \ 8 | -o ctrl_cfg.cem_cfg.training_scheme BC-AI \ 9 | -o ctrl_cfg.cem_cfg.test_policy 1 \ 10 | -ca model-type PE -ca prop-type E \ 11 | -ca opt-type POPLIN-A 12 | -------------------------------------------------------------------------------- /demo_scripts/POPLINA_REPLAN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # The following script will run POPLIN-A with REPLAN methods on halfcheetah 3 | 4 | python mbexp.py -logdir ./log/POPLINA_REPLAN \ 5 | -env halfcheetah \ 6 | -o exp_cfg.exp_cfg.ntrain_iters 50 \ 7 | -o ctrl_cfg.cem_cfg.cem_type POPLINA-REPLAN \ 8 | -o ctrl_cfg.cem_cfg.training_scheme BC-AI \ 9 | -o ctrl_cfg.cem_cfg.test_policy 1 \ 10 | -ca model-type PE -ca prop-type E \ 11 | -ca opt-type POPLIN-A 12 | -------------------------------------------------------------------------------- /demo_scripts/POPLINP_AVG.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # The following script will run POPLIN-P using the AVG training methods on halfcheetah 3 | 4 | python mbexp.py -logdir ./log/POPLINP_AVG -env halfcheetah \ 5 | -o exp_cfg.exp_cfg.ntrain_iters 50 \ 6 | -o ctrl_cfg.cem_cfg.cem_type POPLINP-SEP \ 7 | -o ctrl_cfg.cem_cfg.training_scheme AVG-R \ 8 | -o ctrl_cfg.cem_cfg.policy_network_shape [32] \ 9 | -o ctrl_cfg.opt_cfg.init_var 0.1 \ 10 | -o ctrl_cfg.cem_cfg.test_policy 1 \ 11 | -ca model-type PE -ca prop-type E \ 12 | -ca opt-type POPLIN-P 13 | -------------------------------------------------------------------------------- /demo_scripts/POPLINP_BC.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # The following script will run POPLIN-P using the BC training methods. 
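# Compared to the AVG variant above, it keeps the same dynamics ensemble (PE), propagation
# method (E) and POPLIN-P optimizer; only the training scheme (BC-PR instead of AVG-R) and
# the smaller initial CEM variance (init_var 0.03 instead of 0.1) differ.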
3 | 4 | python mbexp.py -logdir ./log/POPLINP_BC -env halfcheetah \ 5 | -o exp_cfg.exp_cfg.ntrain_iters 50 \ 6 | -o ctrl_cfg.cem_cfg.cem_type POPLINP-SEP \ 7 | -o ctrl_cfg.cem_cfg.training_scheme BC-PR \ 8 | -o ctrl_cfg.cem_cfg.policy_network_shape [32] \ 9 | -o ctrl_cfg.opt_cfg.init_var 0.03 \ 10 | -o ctrl_cfg.cem_cfg.test_policy 1 \ 11 | -ca model-type PE -ca prop-type E \ 12 | -ca opt-type POPLIN-P 13 | -------------------------------------------------------------------------------- /demo_scripts/POPLINP_UNI.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python mbexp.py -logdir ./log/POPLINP_UNI -env halfcheetah \ 4 | -o exp_cfg.exp_cfg.ntrain_iters 50 \ 5 | -o ctrl_cfg.cem_cfg.cem_type POPLINP-UNI\ 6 | -o ctrl_cfg.cem_cfg.training_scheme AVG-R \ 7 | -o ctrl_cfg.cem_cfg.policy_network_shape [32] \ 8 | -o ctrl_cfg.opt_cfg.init_var 0.1 \ 9 | -o ctrl_cfg.cem_cfg.test_policy 1 \ 10 | -ca model-type PE -ca prop-type E \ 11 | -ca opt-type POPLIN-P 12 | -------------------------------------------------------------------------------- /dmbrl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WilsonWangTHU/POPLIN/edd8dba50f9049c6164eda774602bef0c299cb51/dmbrl/__init__.py -------------------------------------------------------------------------------- /dmbrl/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .default import create_config -------------------------------------------------------------------------------- /dmbrl/config/gym_acrobot.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | 9 | from dmbrl.misc.DotmapUtils import get_required_argument 10 | from dmbrl.modeling.layers import FC 11 | """ 12 | Module name, 13 | MODEL_IN, MODEL_OUT, 14 | import env, env_name 15 | """ 16 | 17 | 18 | class GymAcrobotConfigModule: 19 | ENV_NAME = "MBRLGYM_acrobot-v0" 20 | TASK_HORIZON = 1000 21 | NTRAIN_ITERS = 300 22 | NROLLOUTS_PER_ITER = 1 23 | PLAN_HOR = 30 24 | INIT_VAR = 0.25 25 | MODEL_IN, MODEL_OUT = 7, 6 # obs -> 6, action -> 1 26 | GP_NINDUCING_POINTS = 300 27 | 28 | def __init__(self): 29 | # self.ENV = gym.make(self.ENV_NAME) 30 | from mbbl.env.gym_env import acrobot 31 | self.ENV = acrobot.env(env_name='gym_acrobot', rand_seed=1234, 32 | misc_info={'reset_type': 'gym'}) 33 | cfg = tf.ConfigProto() 34 | cfg.gpu_options.allow_growth = True 35 | self.SESS = tf.Session(config=cfg) 36 | self.NN_TRAIN_CFG = {"epochs": 5} 37 | self.OPT_CFG = { 38 | "Random": { 39 | "popsize": 2500 40 | }, 41 | "GBPRandom": { 42 | "popsize": 2500 43 | }, 44 | "GBPCEM": { 45 | "popsize": 500, 46 | "num_elites": 50, 47 | "max_iters": 5, 48 | "alpha": 0.1 49 | }, 50 | "CEM": { 51 | "popsize": 500, 52 | "num_elites": 50, 53 | "max_iters": 5, 54 | "alpha": 0.1 55 | }, 56 | "POPLIN-P": { 57 | "popsize": 500, 58 | "num_elites": 50, 59 | "max_iters": 5, 60 | "alpha": 0.1 61 | }, 62 | "POPLIN-A": { 63 | "popsize": 500, 64 | "num_elites": 50, 65 | "max_iters": 5, 66 | "alpha": 0.1 67 | } 68 | } 69 | 70 | @staticmethod 71 | def obs_preproc(obs): 72 | """ @brief: no cheating of the observation function 73 | """ 74 | if isinstance(obs, np.ndarray): 75 | return obs 76 | else: 77 | return obs 78 | 
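    # obs_postproc and targ_proc below implement the delta-dynamics convention shared
    # by all config modules in this repo: the network is trained on
    # targ_proc(obs, next_obs) = next_obs - obs, and planning rollouts recover the
    # predicted next state with obs_postproc(obs, pred) = obs + pred.
    # A small numeric illustration with hypothetical values:
    #
    #   >>> obs      = np.array([[0.0, 1.0]])
    #   >>> next_obs = np.array([[0.5, 1.0]])
    #   >>> delta = GymAcrobotConfigModule.targ_proc(obs, next_obs)     # [[0.5, 0.0]]
    #   >>> GymAcrobotConfigModule.obs_postproc(obs, delta)             # [[0.5, 1.0]]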
79 | @staticmethod 80 | def obs_postproc(obs, pred): 81 | if isinstance(obs, np.ndarray): 82 | return obs + pred 83 | else: 84 | return obs + pred 85 | 86 | @staticmethod 87 | def targ_proc(obs, next_obs): 88 | return next_obs - obs 89 | 90 | @staticmethod 91 | def obs_cost_fn(obs): 92 | """ @brief: 93 | 94 | def reward(data_dict): 95 | def height(obs): 96 | h1 = obs[0] # Height of first arm 97 | h2 = obs[0] * obs[2] - obs[1] * obs[3] # Height of second arm 98 | return -(h1 + h2) # total height 99 | 100 | start_height = height(data_dict['start_state']) 101 | 102 | reward = { 103 | 'gym_acrobot': start_height, 104 | 'gym_acrobot_sparse': (start_height > 1) - 1 105 | }[self._env_name] # gets gt reward based on sparse/dense 106 | return reward 107 | self.reward = reward 108 | """ 109 | return obs[:, 0] + obs[:, 0] * obs[:, 2] - obs[:, 1] * obs[:, 3] 110 | 111 | @staticmethod 112 | def ac_cost_fn(acs): 113 | if isinstance(acs, np.ndarray): 114 | return np.sum(np.square(acs), axis=1) * 0.0 115 | else: 116 | return tf.reduce_sum(tf.square(acs), axis=1) * 0.0 117 | 118 | def nn_constructor(self, model_init_cfg, misc=None): 119 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 120 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 121 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 122 | model_dir=model_init_cfg.get("model_dir", None), 123 | misc=misc 124 | )) 125 | if not model_init_cfg.get("load_model", False): 126 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 127 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 128 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 129 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 130 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 131 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 132 | return model 133 | 134 | def gp_constructor(self, model_init_cfg): 135 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 136 | name="model", 137 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 138 | kernel_args=model_init_cfg.get("kernel_args", {}), 139 | num_inducing_points=get_required_argument( 140 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 
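                # Number of inducing points for the sparse-GP dynamics model
                # (GP_NINDUCING_POINTS = 300 above); this constructor is only used
                # when the GP model class (dmbrl/modeling/models/TFGP.py) is selected
                # instead of the neural-network ensemble.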
141 | ), 142 | sess=self.SESS 143 | )) 144 | return model 145 | 146 | 147 | CONFIG_MODULE = GymAcrobotConfigModule 148 | -------------------------------------------------------------------------------- /dmbrl/config/gym_ant.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | 9 | from dmbrl.misc.DotmapUtils import get_required_argument 10 | from dmbrl.modeling.layers import FC 11 | 12 | 13 | class AntConfigModule: 14 | """ 15 | @brief: migrate the gym module from the mbbl repo 16 | 'gym_cheetah': { 17 | 'path': 'mbbl.env.gym_env.walker', 18 | 'ob_size': 17, 'action_size': 6, 'max_length': 1000 19 | } 20 | """ 21 | ENV_NAME = "MBRLGYM_ANT-v0" 22 | TASK_HORIZON = 1000 23 | NTRAIN_ITERS = 300 24 | NROLLOUTS_PER_ITER = 1 25 | PLAN_HOR = 30 26 | INIT_VAR = 0.25 27 | MODEL_IN, MODEL_OUT = 35, 27 # obs - > 27, action 8 28 | GP_NINDUCING_POINTS = 300 29 | 30 | def __init__(self): 31 | # self.ENV = gym.make(self.ENV_NAME) 32 | from mbbl.env.gym_env import walker 33 | self.ENV = walker.env(env_name='gym_ant', rand_seed=1234, 34 | misc_info={'reset_type': 'gym'}) 35 | cfg = tf.ConfigProto() 36 | cfg.gpu_options.allow_growth = True 37 | self.SESS = tf.Session(config=cfg) 38 | self.NN_TRAIN_CFG = {"epochs": 5} 39 | self.OPT_CFG = { 40 | "Random": { 41 | "popsize": 2500 42 | }, 43 | "GBPRandom": { 44 | "popsize": 2500 45 | }, 46 | "GBPCEM": { 47 | "popsize": 500, 48 | "num_elites": 50, 49 | "max_iters": 5, 50 | "alpha": 0.1 51 | }, 52 | "CEM": { 53 | "popsize": 500, 54 | "num_elites": 50, 55 | "max_iters": 5, 56 | "alpha": 0.1 57 | }, 58 | "POPLIN-P": { 59 | "popsize": 500, 60 | "num_elites": 50, 61 | "max_iters": 5, 62 | "alpha": 0.1 63 | }, 64 | "POPLIN-A": { 65 | "popsize": 500, 66 | "num_elites": 50, 67 | "max_iters": 5, 68 | "alpha": 0.1 69 | } 70 | } 71 | 72 | @staticmethod 73 | def obs_preproc(obs): 74 | """ @brief: no cheating of the observation function 75 | """ 76 | if isinstance(obs, np.ndarray): 77 | return obs 78 | else: 79 | return obs 80 | 81 | @staticmethod 82 | def obs_postproc(obs, pred): 83 | if isinstance(obs, np.ndarray): 84 | return obs + pred 85 | else: 86 | return obs + pred 87 | 88 | @staticmethod 89 | def targ_proc(obs, next_obs): 90 | return next_obs - obs 91 | 92 | @staticmethod 93 | def obs_cost_fn(obs): 94 | """ @brief: 95 | see mbbl.env.gym_env.walker.py for reward details 96 | """ 97 | if isinstance(obs, np.ndarray): 98 | velocity_cost = -obs[:, 13] # the qvel for the root-x joint 99 | height_cost = 3 * np.square(obs[:, 0] - 0.57) # the height 100 | return velocity_cost + height_cost 101 | else: 102 | velocity_cost = -obs[:, 13] # the qvel for the root-x joint 103 | height_cost = 3 * tf.square(obs[:, 0] - 0.57) # the height 104 | return velocity_cost + height_cost 105 | 106 | @staticmethod 107 | def ac_cost_fn(acs): 108 | if isinstance(acs, np.ndarray): 109 | return 0.1 * np.sum(np.square(acs), axis=1) 110 | else: 111 | return 0.1 * tf.reduce_sum(tf.square(acs), axis=1) 112 | 113 | def nn_constructor(self, model_init_cfg, misc=None): 114 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 115 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 116 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 117 | 
model_dir=model_init_cfg.get("model_dir", None), 118 | misc=misc 119 | )) 120 | if not model_init_cfg.get("load_model", False): 121 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 122 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 123 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 124 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 125 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 126 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 127 | return model 128 | 129 | def gp_constructor(self, model_init_cfg): 130 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 131 | name="model", 132 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 133 | kernel_args=model_init_cfg.get("kernel_args", {}), 134 | num_inducing_points=get_required_argument( 135 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 136 | ), 137 | sess=self.SESS 138 | )) 139 | return model 140 | 141 | 142 | CONFIG_MODULE = AntConfigModule 143 | -------------------------------------------------------------------------------- /dmbrl/config/gym_cartpole.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | 9 | from dmbrl.misc.DotmapUtils import get_required_argument 10 | from dmbrl.modeling.layers import FC 11 | """ 12 | Module name, 13 | MODEL_IN, MODEL_OUT, 14 | import env, env_name 15 | """ 16 | 17 | 18 | class GymCartpoleConfigModule: 19 | ENV_NAME = "MBRLGYM_cartpole-v0" 20 | TASK_HORIZON = 1000 21 | NTRAIN_ITERS = 300 22 | NROLLOUTS_PER_ITER = 1 23 | PLAN_HOR = 30 24 | INIT_VAR = 0.25 25 | MODEL_IN, MODEL_OUT = 5, 4 # obs -> 3, action -> 1 26 | GP_NINDUCING_POINTS = 300 27 | 28 | def __init__(self): 29 | # self.ENV = gym.make(self.ENV_NAME) 30 | from mbbl.env.gym_env import cartpole 31 | self.ENV = cartpole.env(env_name='gym_cartpole', rand_seed=1234, 32 | misc_info={'reset_type': 'gym'}) 33 | cfg = tf.ConfigProto() 34 | cfg.gpu_options.allow_growth = True 35 | self.SESS = tf.Session(config=cfg) 36 | self.NN_TRAIN_CFG = {"epochs": 5} 37 | self.OPT_CFG = { 38 | "Random": { 39 | "popsize": 2500 40 | }, 41 | "GBPRandom": { 42 | "popsize": 2500 43 | }, 44 | "GBPCEM": { 45 | "popsize": 500, 46 | "num_elites": 50, 47 | "max_iters": 5, 48 | "alpha": 0.1 49 | }, 50 | "CEM": { 51 | "popsize": 500, 52 | "num_elites": 50, 53 | "max_iters": 5, 54 | "alpha": 0.1 55 | }, 56 | "POPLIN-P": { 57 | "popsize": 500, 58 | "num_elites": 50, 59 | "max_iters": 5, 60 | "alpha": 0.1 61 | }, 62 | "POPLIN-A": { 63 | "popsize": 500, 64 | "num_elites": 50, 65 | "max_iters": 5, 66 | "alpha": 0.1 67 | } 68 | } 69 | 70 | @staticmethod 71 | def obs_preproc(obs): 72 | """ @brief: no cheating of the observation function 73 | """ 74 | if isinstance(obs, np.ndarray): 75 | return obs 76 | else: 77 | return obs 78 | 79 | @staticmethod 80 | def obs_postproc(obs, pred): 81 | if isinstance(obs, np.ndarray): 82 | return obs + pred 83 | else: 84 | return obs + pred 85 | 86 | @staticmethod 87 | def targ_proc(obs, next_obs): 88 | return next_obs - obs 89 | 90 | @staticmethod 91 | def obs_cost_fn(obs): 92 | """ @brief: 93 | 94 | x, _, theta, _ = data_dict['start_state'] 95 | up_reward = np.cos(theta) 
96 | distance_penalty_reward = -0.01 * (x ** 2) 97 | return up_reward + distance_penalty_reward 98 | """ 99 | x = obs[:, 0] 100 | theta = obs[:, 2] 101 | if isinstance(obs, np.ndarray): 102 | return -(np.cos(theta) - 0.01 * (x ** 2)) 103 | else: 104 | return -(tf.cos(theta) - 0.01 * (x ** 2)) 105 | 106 | @staticmethod 107 | def ac_cost_fn(acs): 108 | if isinstance(acs, np.ndarray): 109 | return np.sum(np.square(acs), axis=1) * 0.0 110 | else: 111 | return tf.reduce_sum(tf.square(acs), axis=1) * 0.0 112 | 113 | def nn_constructor(self, model_init_cfg, misc=None): 114 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 115 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 116 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 117 | model_dir=model_init_cfg.get("model_dir", None), 118 | misc=misc 119 | )) 120 | if not model_init_cfg.get("load_model", False): 121 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 122 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 123 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 124 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 125 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 126 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 127 | return model 128 | 129 | def gp_constructor(self, model_init_cfg): 130 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 131 | name="model", 132 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 133 | kernel_args=model_init_cfg.get("kernel_args", {}), 134 | num_inducing_points=get_required_argument( 135 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 
136 | ), 137 | sess=self.SESS 138 | )) 139 | return model 140 | 141 | 142 | CONFIG_MODULE = GymCartpoleConfigModule 143 | -------------------------------------------------------------------------------- /dmbrl/config/gym_cheetah.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | 9 | from dmbrl.misc.DotmapUtils import get_required_argument 10 | from dmbrl.modeling.layers import FC 11 | 12 | 13 | class HalfCheetahConfigModule: 14 | """ 15 | @brief: migrate the gym module from the mbbl repo 16 | 'gym_cheetah': { 17 | 'path': 'mbbl.env.gym_env.walker', 18 | 'ob_size': 17, 'action_size': 6, 'max_length': 1000 19 | } 20 | """ 21 | ENV_NAME = "MBRLGYM_HalfCheetah-v0" 22 | TASK_HORIZON = 1000 23 | NTRAIN_ITERS = 300 24 | NROLLOUTS_PER_ITER = 1 25 | PLAN_HOR = 30 26 | INIT_VAR = 0.25 27 | MODEL_IN, MODEL_OUT = 23, 17 # obs - > 17, action 6 28 | GP_NINDUCING_POINTS = 300 29 | 30 | def __init__(self): 31 | # self.ENV = gym.make(self.ENV_NAME) 32 | from mbbl.env.gym_env import walker 33 | self.ENV = walker.env(env_name='gym_cheetah', rand_seed=1234, 34 | misc_info={'reset_type': 'gym'}) 35 | cfg = tf.ConfigProto() 36 | cfg.gpu_options.allow_growth = True 37 | self.SESS = tf.Session(config=cfg) 38 | self.NN_TRAIN_CFG = {"epochs": 5} 39 | self.OPT_CFG = { 40 | "Random": { 41 | "popsize": 2500 42 | }, 43 | "GBPRandom": { 44 | "popsize": 2500 45 | }, 46 | "GBPCEM": { 47 | "popsize": 500, 48 | "num_elites": 50, 49 | "max_iters": 5, 50 | "alpha": 0.1 51 | }, 52 | "CEM": { 53 | "popsize": 500, 54 | "num_elites": 50, 55 | "max_iters": 5, 56 | "alpha": 0.1 57 | }, 58 | "POPLIN-P": { 59 | "popsize": 500, 60 | "num_elites": 50, 61 | "max_iters": 5, 62 | "alpha": 0.1 63 | }, 64 | "POPLIN-A": { 65 | "popsize": 500, 66 | "num_elites": 50, 67 | "max_iters": 5, 68 | "alpha": 0.1 69 | } 70 | } 71 | 72 | @staticmethod 73 | def obs_preproc(obs): 74 | """ @brief: no cheating of the observation function 75 | """ 76 | if isinstance(obs, np.ndarray): 77 | return obs 78 | else: 79 | return obs 80 | 81 | @staticmethod 82 | def obs_postproc(obs, pred): 83 | if isinstance(obs, np.ndarray): 84 | return obs + pred 85 | else: 86 | return obs + pred 87 | 88 | @staticmethod 89 | def targ_proc(obs, next_obs): 90 | return next_obs - obs 91 | 92 | @staticmethod 93 | def obs_cost_fn(obs): 94 | """ @brief: 95 | see mbbl.env.gym_env.walker.py for reward details 96 | """ 97 | return -obs[:, 8] # the qvel for the root-x joint 98 | 99 | @staticmethod 100 | def ac_cost_fn(acs): 101 | if isinstance(acs, np.ndarray): 102 | return 0.1 * np.sum(np.square(acs), axis=1) 103 | else: 104 | return 0.1 * tf.reduce_sum(tf.square(acs), axis=1) 105 | 106 | def nn_constructor(self, model_init_cfg, misc=None): 107 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 108 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 109 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 110 | model_dir=model_init_cfg.get("model_dir", None), 111 | misc=misc 112 | )) 113 | if not model_init_cfg.get("load_model", False): 114 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 115 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 116 | model.add(FC(200, 
activation="swish", weight_decay=0.000075)) 117 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 118 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 119 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 120 | return model 121 | 122 | def gp_constructor(self, model_init_cfg): 123 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 124 | name="model", 125 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 126 | kernel_args=model_init_cfg.get("kernel_args", {}), 127 | num_inducing_points=get_required_argument( 128 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 129 | ), 130 | sess=self.SESS 131 | )) 132 | return model 133 | 134 | 135 | CONFIG_MODULE = HalfCheetahConfigModule 136 | -------------------------------------------------------------------------------- /dmbrl/config/gym_fhopper.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | 9 | from dmbrl.misc.DotmapUtils import get_required_argument 10 | from dmbrl.modeling.layers import FC 11 | 12 | 13 | class FixedHopperConfigModule: 14 | ENV_NAME = "MBRLGYM_Hopper-v0" 15 | TASK_HORIZON = 1000 16 | NTRAIN_ITERS = 300 17 | NROLLOUTS_PER_ITER = 1 18 | PLAN_HOR = 30 19 | INIT_VAR = 0.25 20 | MODEL_IN, MODEL_OUT = 14, 11 # obs - > 11, action 3 21 | GP_NINDUCING_POINTS = 300 22 | 23 | def __init__(self): 24 | # self.ENV = gym.make(self.ENV_NAME) 25 | from mbbl.env.gym_env import fixed_walker 26 | self.ENV = fixed_walker.env(env_name='gym_fhopper', rand_seed=1234, 27 | misc_info={'reset_type': 'gym'}) 28 | cfg = tf.ConfigProto() 29 | cfg.gpu_options.allow_growth = True 30 | self.SESS = tf.Session(config=cfg) 31 | self.NN_TRAIN_CFG = {"epochs": 5} 32 | self.OPT_CFG = { 33 | "Random": { 34 | "popsize": 2500 35 | }, 36 | "GBPRandom": { 37 | "popsize": 2500 38 | }, 39 | "GBPCEM": { 40 | "popsize": 500, 41 | "num_elites": 50, 42 | "max_iters": 5, 43 | "alpha": 0.1 44 | }, 45 | "CEM": { 46 | "popsize": 500, 47 | "num_elites": 50, 48 | "max_iters": 5, 49 | "alpha": 0.1 50 | }, 51 | "POPLIN-P": { 52 | "popsize": 500, 53 | "num_elites": 50, 54 | "max_iters": 5, 55 | "alpha": 0.1 56 | }, 57 | "POPLIN-A": { 58 | "popsize": 500, 59 | "num_elites": 50, 60 | "max_iters": 5, 61 | "alpha": 0.1 62 | } 63 | } 64 | 65 | @staticmethod 66 | def obs_preproc(obs): 67 | """ @brief: no cheating of the observation function 68 | """ 69 | if isinstance(obs, np.ndarray): 70 | return obs 71 | else: 72 | return obs 73 | 74 | @staticmethod 75 | def obs_postproc(obs, pred): 76 | if isinstance(obs, np.ndarray): 77 | return obs + pred 78 | else: 79 | return obs + pred 80 | 81 | @staticmethod 82 | def targ_proc(obs, next_obs): 83 | return next_obs - obs 84 | 85 | @staticmethod 86 | def obs_cost_fn(obs): 87 | """ @brief: 88 | see mbbl.env.gym_env.walker.py for reward details 89 | """ 90 | if isinstance(obs, np.ndarray): 91 | velocity_cost = -obs[:, 5] # the qvel for the root-x joint 92 | height_cost = 3 * np.square(obs[:, 0] - 1.3) # the height 93 | # height, ang = ob[0], ob[1] 94 | done = (obs[:, 0] <= 0.7) or (abs(obs[:, 1]) >= 0.2) 95 | alive_reward = 1.0 - np.array(done, dtype=np.float) 96 | return velocity_cost + height_cost - alive_reward 97 | else: 98 | velocity_cost = -obs[:, 
5] # the qvel for the root-x joint 99 | height_cost = 3 * tf.square(obs[:, 0] - 1.3) # the height 100 | done = tf.logical_or(obs[:, 0] <= 0.7, tf.abs(obs[:, 1]) >= 0.2) 101 | alive_reward = 1.0 - tf.cast(done, dtype=velocity_cost.dtype) 102 | return velocity_cost + height_cost - alive_reward 103 | 104 | @staticmethod 105 | def ac_cost_fn(acs): 106 | if isinstance(acs, np.ndarray): 107 | return 0.1 * np.sum(np.square(acs), axis=1) 108 | else: 109 | return 0.1 * tf.reduce_sum(tf.square(acs), axis=1) 110 | 111 | def nn_constructor(self, model_init_cfg, misc=None): 112 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 113 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 114 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 115 | model_dir=model_init_cfg.get("model_dir", None), 116 | misc=misc 117 | )) 118 | if not model_init_cfg.get("load_model", False): 119 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 120 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 121 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 122 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 123 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 124 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 125 | return model 126 | 127 | def gp_constructor(self, model_init_cfg): 128 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 129 | name="model", 130 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 131 | kernel_args=model_init_cfg.get("kernel_args", {}), 132 | num_inducing_points=get_required_argument( 133 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 
134 | ), 135 | sess=self.SESS 136 | )) 137 | return model 138 | 139 | 140 | CONFIG_MODULE = FixedHopperConfigModule 141 | -------------------------------------------------------------------------------- /dmbrl/config/gym_fswimmer.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | 9 | from dmbrl.misc.DotmapUtils import get_required_argument 10 | from dmbrl.modeling.layers import FC 11 | 12 | 13 | class FixedSwimmerConfigModule: 14 | """ 15 | """ 16 | ENV_NAME = "MBRLGYM_SWIMMER-v0" 17 | TASK_HORIZON = 1000 18 | NTRAIN_ITERS = 300 19 | NROLLOUTS_PER_ITER = 1 20 | PLAN_HOR = 30 21 | INIT_VAR = 0.25 22 | MODEL_IN, MODEL_OUT = 11, 9 # obs - > 8 + 1, action 2 23 | GP_NINDUCING_POINTS = 300 24 | 25 | def __init__(self): 26 | # self.ENV = gym.make(self.ENV_NAME) 27 | from mbbl.env.gym_env import fixed_swimmer 28 | self.ENV = fixed_swimmer.env(env_name='gym_fswimmer', rand_seed=1234, 29 | misc_info={'reset_type': 'gym'}) 30 | cfg = tf.ConfigProto() 31 | cfg.gpu_options.allow_growth = True 32 | self.SESS = tf.Session(config=cfg) 33 | self.NN_TRAIN_CFG = {"epochs": 5} 34 | self.OPT_CFG = { 35 | "Random": { 36 | "popsize": 2500 37 | }, 38 | "GBPRandom": { 39 | "popsize": 2500 40 | }, 41 | "GBPCEM": { 42 | "popsize": 500, 43 | "num_elites": 50, 44 | "max_iters": 5, 45 | "alpha": 0.1 46 | }, 47 | "CEM": { 48 | "popsize": 500, 49 | "num_elites": 50, 50 | "max_iters": 5, 51 | "alpha": 0.1 52 | }, 53 | "POPLIN-P": { 54 | "popsize": 500, 55 | "num_elites": 50, 56 | "max_iters": 5, 57 | "alpha": 0.1 58 | }, 59 | "POPLIN-A": { 60 | "popsize": 500, 61 | "num_elites": 50, 62 | "max_iters": 5, 63 | "alpha": 0.1 64 | } 65 | } 66 | 67 | @staticmethod 68 | def obs_preproc(obs): 69 | """ @brief: no cheating of the observation function 70 | """ 71 | if isinstance(obs, np.ndarray): 72 | return obs 73 | else: 74 | return obs 75 | 76 | @staticmethod 77 | def obs_postproc(obs, pred): 78 | if isinstance(obs, np.ndarray): 79 | return obs + pred 80 | else: 81 | return obs + pred 82 | 83 | @staticmethod 84 | def targ_proc(obs, next_obs): 85 | return next_obs - obs 86 | 87 | @staticmethod 88 | def obs_cost_fn(obs): 89 | """ @brief: 90 | see mbbl.env.gym_env.walker.py for reward details 91 | """ 92 | if isinstance(obs, np.ndarray): 93 | velocity_cost = -obs[:, -1] # the qvel for the root-x joint 94 | return velocity_cost 95 | else: 96 | velocity_cost = -obs[:, -1] # the qvel for the root-x joint 97 | return velocity_cost 98 | 99 | @staticmethod 100 | def ac_cost_fn(acs): 101 | if isinstance(acs, np.ndarray): 102 | return 0.0001 * np.sum(np.square(acs), axis=1) 103 | else: 104 | return 0.0001 * tf.reduce_sum(tf.square(acs), axis=1) 105 | 106 | def nn_constructor(self, model_init_cfg, misc=None): 107 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 108 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 109 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 110 | model_dir=model_init_cfg.get("model_dir", None), 111 | misc=misc 112 | )) 113 | if not model_init_cfg.get("load_model", False): 114 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 115 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 116 | 
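            # Two more 200-unit swish layers follow, then a linear head of width
            # MODEL_OUT (here 9: the 8 swimmer observation dims plus the appended
            # root-x velocity), matching the architecture of the other config modules.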
model.add(FC(200, activation="swish", weight_decay=0.000075)) 117 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 118 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 119 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 120 | return model 121 | 122 | def gp_constructor(self, model_init_cfg): 123 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 124 | name="model", 125 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 126 | kernel_args=model_init_cfg.get("kernel_args", {}), 127 | num_inducing_points=get_required_argument( 128 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 129 | ), 130 | sess=self.SESS 131 | )) 132 | return model 133 | 134 | 135 | CONFIG_MODULE = FixedSwimmerConfigModule 136 | -------------------------------------------------------------------------------- /dmbrl/config/gym_hopper.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | 9 | from dmbrl.misc.DotmapUtils import get_required_argument 10 | from dmbrl.modeling.layers import FC 11 | 12 | 13 | class HopperConfigModule: 14 | ENV_NAME = "MBRLGYM_Hopper-v0" 15 | TASK_HORIZON = 1000 16 | NTRAIN_ITERS = 300 17 | NROLLOUTS_PER_ITER = 1 18 | PLAN_HOR = 30 19 | INIT_VAR = 0.25 20 | MODEL_IN, MODEL_OUT = 14, 11 # obs - > 11, action 3 21 | GP_NINDUCING_POINTS = 300 22 | 23 | def __init__(self): 24 | # self.ENV = gym.make(self.ENV_NAME) 25 | from mbbl.env.gym_env import walker 26 | self.ENV = walker.env(env_name='gym_hopper', rand_seed=1234, 27 | misc_info={'reset_type': 'gym'}) 28 | cfg = tf.ConfigProto() 29 | cfg.gpu_options.allow_growth = True 30 | self.SESS = tf.Session(config=cfg) 31 | self.NN_TRAIN_CFG = {"epochs": 5} 32 | self.OPT_CFG = { 33 | "Random": { 34 | "popsize": 2500 35 | }, 36 | "GBPRandom": { 37 | "popsize": 2500 38 | }, 39 | "GBPCEM": { 40 | "popsize": 500, 41 | "num_elites": 50, 42 | "max_iters": 5, 43 | "alpha": 0.1 44 | }, 45 | "CEM": { 46 | "popsize": 500, 47 | "num_elites": 50, 48 | "max_iters": 5, 49 | "alpha": 0.1 50 | }, 51 | "POPLIN-P": { 52 | "popsize": 500, 53 | "num_elites": 50, 54 | "max_iters": 5, 55 | "alpha": 0.1 56 | }, 57 | "POPLIN-A": { 58 | "popsize": 500, 59 | "num_elites": 50, 60 | "max_iters": 5, 61 | "alpha": 0.1 62 | } 63 | } 64 | 65 | @staticmethod 66 | def obs_preproc(obs): 67 | """ @brief: no cheating of the observation function 68 | """ 69 | if isinstance(obs, np.ndarray): 70 | return obs 71 | else: 72 | return obs 73 | 74 | @staticmethod 75 | def obs_postproc(obs, pred): 76 | if isinstance(obs, np.ndarray): 77 | return obs + pred 78 | else: 79 | return obs + pred 80 | 81 | @staticmethod 82 | def targ_proc(obs, next_obs): 83 | return next_obs - obs 84 | 85 | @staticmethod 86 | def obs_cost_fn(obs): 87 | """ @brief: 88 | see mbbl.env.gym_env.walker.py for reward details 89 | """ 90 | if isinstance(obs, np.ndarray): 91 | velocity_cost = -obs[:, 5] # the qvel for the root-x joint 92 | height_cost = 3 * np.square(obs[:, 0] - 1.3) # the height 93 | return velocity_cost + height_cost 94 | else: 95 | velocity_cost = -obs[:, 5] # the qvel for the root-x joint 96 | height_cost = 3 * tf.square(obs[:, 0] - 1.3) # the height 97 | return velocity_cost + height_cost 98 | 99 | @staticmethod 
100 | def ac_cost_fn(acs): 101 | if isinstance(acs, np.ndarray): 102 | return 0.1 * np.sum(np.square(acs), axis=1) 103 | else: 104 | return 0.1 * tf.reduce_sum(tf.square(acs), axis=1) 105 | 106 | def nn_constructor(self, model_init_cfg, misc=None): 107 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 108 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 109 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 110 | model_dir=model_init_cfg.get("model_dir", None), 111 | misc=misc 112 | )) 113 | if not model_init_cfg.get("load_model", False): 114 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 115 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 116 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 117 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 118 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 119 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 120 | return model 121 | 122 | def gp_constructor(self, model_init_cfg): 123 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 124 | name="model", 125 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 126 | kernel_args=model_init_cfg.get("kernel_args", {}), 127 | num_inducing_points=get_required_argument( 128 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 129 | ), 130 | sess=self.SESS 131 | )) 132 | return model 133 | 134 | 135 | CONFIG_MODULE = HopperConfigModule 136 | -------------------------------------------------------------------------------- /dmbrl/config/gym_invertedPendulum.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | 9 | from dmbrl.misc.DotmapUtils import get_required_argument 10 | from dmbrl.modeling.layers import FC 11 | """ 12 | Module name, ENV_NAME 13 | MODEL_IN, MODEL_OUT, 14 | import env, env_name 15 | """ 16 | 17 | 18 | class GymINVPendulumConfigModule: 19 | ENV_NAME = "MBRLGYM_invpendulum-v0" 20 | TASK_HORIZON = 1000 21 | NTRAIN_ITERS = 300 22 | NROLLOUTS_PER_ITER = 1 23 | PLAN_HOR = 30 24 | INIT_VAR = 0.25 25 | MODEL_IN, MODEL_OUT = 5, 4 # obs -> 4, action -> 1 26 | GP_NINDUCING_POINTS = 300 27 | 28 | def __init__(self): 29 | # self.ENV = gym.make(self.ENV_NAME) 30 | from mbbl.env.gym_env import invertedPendulum 31 | self.ENV = invertedPendulum.env( 32 | env_name='gym_invertedPendulum', rand_seed=1234, 33 | misc_info={'reset_type': 'gym'} 34 | ) 35 | cfg = tf.ConfigProto() 36 | cfg.gpu_options.allow_growth = True 37 | self.SESS = tf.Session(config=cfg) 38 | self.NN_TRAIN_CFG = {"epochs": 5} 39 | self.OPT_CFG = { 40 | "Random": { 41 | "popsize": 2500 42 | }, 43 | "GBPRandom": { 44 | "popsize": 2500 45 | }, 46 | "GBPCEM": { 47 | "popsize": 500, 48 | "num_elites": 50, 49 | "max_iters": 5, 50 | "alpha": 0.1 51 | }, 52 | "CEM": { 53 | "popsize": 500, 54 | "num_elites": 50, 55 | "max_iters": 5, 56 | "alpha": 0.1 57 | }, 58 | "POPLIN-P": { 59 | "popsize": 500, 60 | "num_elites": 50, 61 | "max_iters": 5, 62 | "alpha": 0.1 63 | }, 64 | "POPLIN-A": { 65 | "popsize": 500, 66 | "num_elites": 50, 67 | "max_iters": 5, 68 | 
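                # alpha is the smoothing coefficient of the CEM-style update: after each
                # of the max_iters iterations, the sampling mean and variance move toward
                # the statistics of the num_elites best candidates while retaining an
                # alpha-weighted fraction of the previous distribution.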
"alpha": 0.1 69 | } 70 | } 71 | 72 | @staticmethod 73 | def obs_preproc(obs): 74 | """ @brief: no cheating of the observation function 75 | """ 76 | if isinstance(obs, np.ndarray): 77 | return obs 78 | else: 79 | return obs 80 | 81 | @staticmethod 82 | def obs_postproc(obs, pred): 83 | if isinstance(obs, np.ndarray): 84 | return obs + pred 85 | else: 86 | return obs + pred 87 | 88 | @staticmethod 89 | def targ_proc(obs, next_obs): 90 | return next_obs - obs 91 | 92 | @staticmethod 93 | def obs_cost_fn(obs): 94 | """ @brief: 95 | see mbbl.env.gym_env.walker.py for reward details 96 | 97 | # ypos penalty 98 | ypos = data_dict['start_state'][ypos_ob_pos] 99 | ypos_reward = -(ypos - ypos_target) ** 2 100 | """ 101 | return obs[:, 1] ** 2 102 | 103 | @staticmethod 104 | def ac_cost_fn(acs): 105 | if isinstance(acs, np.ndarray): 106 | return np.sum(np.square(acs), axis=1) * 0.0 107 | else: 108 | return tf.reduce_sum(tf.square(acs), axis=1) * 0.0 109 | 110 | def nn_constructor(self, model_init_cfg, misc=None): 111 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 112 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 113 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 114 | model_dir=model_init_cfg.get("model_dir", None), 115 | misc=misc 116 | )) 117 | if not model_init_cfg.get("load_model", False): 118 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 119 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 120 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 121 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 122 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 123 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 124 | return model 125 | 126 | def gp_constructor(self, model_init_cfg): 127 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 128 | name="model", 129 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 130 | kernel_args=model_init_cfg.get("kernel_args", {}), 131 | num_inducing_points=get_required_argument( 132 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 
133 | ), 134 | sess=self.SESS 135 | )) 136 | return model 137 | 138 | 139 | CONFIG_MODULE = GymINVPendulumConfigModule 140 | -------------------------------------------------------------------------------- /dmbrl/config/gym_pendulum.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | 9 | from dmbrl.misc.DotmapUtils import get_required_argument 10 | from dmbrl.modeling.layers import FC 11 | """ 12 | Module name, 13 | MODEL_IN, MODEL_OUT, 14 | import env, env_name 15 | """ 16 | 17 | 18 | class GymPendulumConfigModule: 19 | ENV_NAME = "MBRLGYM_pendulum-v0" 20 | TASK_HORIZON = 1000 21 | NTRAIN_ITERS = 300 22 | NROLLOUTS_PER_ITER = 1 23 | PLAN_HOR = 30 24 | INIT_VAR = 0.25 25 | MODEL_IN, MODEL_OUT = 4, 3 # obs -> 3, action -> 1 26 | GP_NINDUCING_POINTS = 300 27 | 28 | def __init__(self): 29 | # self.ENV = gym.make(self.ENV_NAME) 30 | from mbbl.env.gym_env import pendulum 31 | self.ENV = pendulum.env(env_name='gym_pendulum', rand_seed=1234, 32 | misc_info={'reset_type': 'gym'}) 33 | cfg = tf.ConfigProto() 34 | cfg.gpu_options.allow_growth = True 35 | self.SESS = tf.Session(config=cfg) 36 | self.NN_TRAIN_CFG = {"epochs": 5} 37 | self.OPT_CFG = { 38 | "Random": { 39 | "popsize": 2500 40 | }, 41 | "GBPRandom": { 42 | "popsize": 2500 43 | }, 44 | "GBPCEM": { 45 | "popsize": 500, 46 | "num_elites": 50, 47 | "max_iters": 5, 48 | "alpha": 0.1 49 | }, 50 | "CEM": { 51 | "popsize": 500, 52 | "num_elites": 50, 53 | "max_iters": 5, 54 | "alpha": 0.1 55 | }, 56 | "POPLIN-P": { 57 | "popsize": 500, 58 | "num_elites": 50, 59 | "max_iters": 5, 60 | "alpha": 0.1 61 | }, 62 | "POPLIN-A": { 63 | "popsize": 500, 64 | "num_elites": 50, 65 | "max_iters": 5, 66 | "alpha": 0.1 67 | } 68 | } 69 | 70 | @staticmethod 71 | def obs_preproc(obs): 72 | """ @brief: no cheating of the observation function 73 | """ 74 | if isinstance(obs, np.ndarray): 75 | return obs 76 | else: 77 | return obs 78 | 79 | @staticmethod 80 | def obs_postproc(obs, pred): 81 | if isinstance(obs, np.ndarray): 82 | return obs + pred 83 | else: 84 | return obs + pred 85 | 86 | @staticmethod 87 | def targ_proc(obs, next_obs): 88 | return next_obs - obs 89 | 90 | @staticmethod 91 | def obs_cost_fn(obs): 92 | """ @brief: 93 | see mbbl.env.gym_env.walker.py for reward details 94 | 95 | def reward(data_dict): 96 | action = data_dict['action'] 97 | true_action = action * self._env.env.max_torque 98 | 99 | max_torque = self._env.env.max_torque 100 | torque = np.clip(true_action, -max_torque, max_torque)[0] 101 | 102 | y, x, thetadot = data_dict['start_state'] 103 | 104 | costs = y + .1 * x + .1 * (thetadot ** 2) + .001 * (torque ** 2) 105 | # note: reward is the negative cost 106 | return -costs 107 | """ 108 | y = obs[:, 0] 109 | x = obs[:, 1] 110 | thetadot = obs[:, 2] 111 | cost = y + tf.abs(0.1 * x) + 0.1 * (thetadot ** 2) 112 | return cost 113 | 114 | @staticmethod 115 | def ac_cost_fn(acs): 116 | max_torque = 2.0 117 | 118 | if isinstance(acs, np.ndarray): 119 | clip_torque = np.clip(acs, -max_torque, max_torque) 120 | return 0.001 * np.sum(np.square(clip_torque), axis=1) 121 | else: 122 | clip_torque = tf.clip_by_value(acs, -max_torque, max_torque) 123 | return 0.001 * tf.reduce_sum(tf.square(clip_torque), axis=1) 124 | 125 | def nn_constructor(self, model_init_cfg, misc=None): 126 | model = 
get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 127 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 128 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 129 | model_dir=model_init_cfg.get("model_dir", None), 130 | misc=misc 131 | )) 132 | if not model_init_cfg.get("load_model", False): 133 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 134 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 135 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 136 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 137 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 138 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 139 | return model 140 | 141 | def gp_constructor(self, model_init_cfg): 142 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 143 | name="model", 144 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 145 | kernel_args=model_init_cfg.get("kernel_args", {}), 146 | num_inducing_points=get_required_argument( 147 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 148 | ), 149 | sess=self.SESS 150 | )) 151 | return model 152 | 153 | 154 | CONFIG_MODULE = GymPendulumConfigModule 155 | -------------------------------------------------------------------------------- /dmbrl/config/gym_reacher.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | 9 | from dmbrl.misc.DotmapUtils import get_required_argument 10 | from dmbrl.modeling.layers import FC 11 | 12 | 13 | class ReacherConfigModule: 14 | ENV_NAME = "MBRLGYM_Reacher-v0" 15 | TASK_HORIZON = 1000 16 | NTRAIN_ITERS = 300 17 | NROLLOUTS_PER_ITER = 1 18 | PLAN_HOR = 30 19 | INIT_VAR = 0.25 20 | MODEL_IN, MODEL_OUT = 13, 11 # obs - > 11, action 2 21 | GP_NINDUCING_POINTS = 300 22 | 23 | def __init__(self): 24 | # self.ENV = gym.make(self.ENV_NAME) 25 | from mbbl.env.gym_env import reacher 26 | self.ENV = reacher.env(env_name='gym_reacher', rand_seed=1234, 27 | misc_info={'reset_type': 'gym'}) 28 | cfg = tf.ConfigProto() 29 | cfg.gpu_options.allow_growth = True 30 | self.SESS = tf.Session(config=cfg) 31 | self.NN_TRAIN_CFG = {"epochs": 5} 32 | self.OPT_CFG = { 33 | "Random": { 34 | "popsize": 2500 35 | }, 36 | "GBPRandom": { 37 | "popsize": 2500 38 | }, 39 | "GBPCEM": { 40 | "popsize": 500, 41 | "num_elites": 50, 42 | "max_iters": 5, 43 | "alpha": 0.1 44 | }, 45 | "CEM": { 46 | "popsize": 500, 47 | "num_elites": 50, 48 | "max_iters": 5, 49 | "alpha": 0.1 50 | }, 51 | "POPLIN-P": { 52 | "popsize": 500, 53 | "num_elites": 50, 54 | "max_iters": 5, 55 | "alpha": 0.1 56 | }, 57 | "POPLIN-A": { 58 | "popsize": 500, 59 | "num_elites": 50, 60 | "max_iters": 5, 61 | "alpha": 0.1 62 | } 63 | } 64 | 65 | @staticmethod 66 | def obs_preproc(obs): 67 | """ @brief: no cheating of the observation function 68 | """ 69 | if isinstance(obs, np.ndarray): 70 | return obs 71 | else: 72 | return obs 73 | 74 | @staticmethod 75 | def obs_postproc(obs, pred): 76 | if isinstance(obs, np.ndarray): 77 | return obs + pred 78 | else: 79 | return obs + pred 80 | 81 | @staticmethod 82 | def 
targ_proc(obs, next_obs): 83 | return next_obs - obs 84 | 85 | @staticmethod 86 | def obs_cost_fn(obs): 87 | """ @brief: 88 | see mbbl.env.gym_env.reacher.py for reward details 89 | """ 90 | if isinstance(obs, np.ndarray): 91 | return np.linalg.norm(obs[:, -3:], axis=1) 92 | else: 93 | return tf.linalg.norm(obs[:, -3:], axis=1) 94 | 95 | @staticmethod 96 | def ac_cost_fn(acs): 97 | if isinstance(acs, np.ndarray): 98 | return np.sum(np.square(acs), axis=1) 99 | else: 100 | return tf.reduce_sum(tf.square(acs), axis=1) 101 | 102 | def nn_constructor(self, model_init_cfg, misc=None): 103 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 104 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 105 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 106 | model_dir=model_init_cfg.get("model_dir", None), 107 | misc=misc 108 | )) 109 | if not model_init_cfg.get("load_model", False): 110 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 111 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 112 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 113 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 114 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 115 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 116 | return model 117 | 118 | def gp_constructor(self, model_init_cfg): 119 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 120 | name="model", 121 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 122 | kernel_args=model_init_cfg.get("kernel_args", {}), 123 | num_inducing_points=get_required_argument( 124 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points."
125 | ), 126 | sess=self.SESS 127 | )) 128 | return model 129 | 130 | 131 | CONFIG_MODULE = ReacherConfigModule 132 | -------------------------------------------------------------------------------- /dmbrl/config/gym_swimmer.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | 9 | from dmbrl.misc.DotmapUtils import get_required_argument 10 | from dmbrl.modeling.layers import FC 11 | 12 | 13 | class SwimmerConfigModule: 14 | """ 15 | @brief: migrate the gym module from the mbbl repo 16 | 'gym_cheetah': { 17 | 'path': 'mbbl.env.gym_env.walker', 18 | 'ob_size': 17, 'action_size': 6, 'max_length': 1000 19 | } 20 | """ 21 | ENV_NAME = "MBRLGYM_SWIMMER-v0" 22 | TASK_HORIZON = 1000 23 | NTRAIN_ITERS = 300 24 | NROLLOUTS_PER_ITER = 1 25 | PLAN_HOR = 30 26 | INIT_VAR = 0.25 27 | MODEL_IN, MODEL_OUT = 10, 8 # obs - > 8, action 2 28 | GP_NINDUCING_POINTS = 300 29 | 30 | def __init__(self): 31 | # self.ENV = gym.make(self.ENV_NAME) 32 | from mbbl.env.gym_env import walker 33 | self.ENV = walker.env(env_name='gym_swimmer', rand_seed=1234, 34 | misc_info={'reset_type': 'gym'}) 35 | cfg = tf.ConfigProto() 36 | cfg.gpu_options.allow_growth = True 37 | self.SESS = tf.Session(config=cfg) 38 | self.NN_TRAIN_CFG = {"epochs": 5} 39 | self.OPT_CFG = { 40 | "Random": { 41 | "popsize": 2500 42 | }, 43 | "GBPRandom": { 44 | "popsize": 2500 45 | }, 46 | "GBPCEM": { 47 | "popsize": 500, 48 | "num_elites": 50, 49 | "max_iters": 5, 50 | "alpha": 0.1 51 | }, 52 | "CEM": { 53 | "popsize": 500, 54 | "num_elites": 50, 55 | "max_iters": 5, 56 | "alpha": 0.1 57 | }, 58 | "POPLIN-P": { 59 | "popsize": 500, 60 | "num_elites": 50, 61 | "max_iters": 5, 62 | "alpha": 0.1 63 | }, 64 | "POPLIN-A": { 65 | "popsize": 500, 66 | "num_elites": 50, 67 | "max_iters": 5, 68 | "alpha": 0.1 69 | } 70 | } 71 | 72 | @staticmethod 73 | def obs_preproc(obs): 74 | """ @brief: no cheating of the observation function 75 | """ 76 | if isinstance(obs, np.ndarray): 77 | return obs 78 | else: 79 | return obs 80 | 81 | @staticmethod 82 | def obs_postproc(obs, pred): 83 | if isinstance(obs, np.ndarray): 84 | return obs + pred 85 | else: 86 | return obs + pred 87 | 88 | @staticmethod 89 | def targ_proc(obs, next_obs): 90 | return next_obs - obs 91 | 92 | @staticmethod 93 | def obs_cost_fn(obs): 94 | """ @brief: 95 | see mbbl.env.gym_env.walker.py for reward details 96 | """ 97 | if isinstance(obs, np.ndarray): 98 | velocity_cost = -obs[:, 3] # the qvel for the root-x joint 99 | return velocity_cost 100 | else: 101 | velocity_cost = -obs[:, 3] # the qvel for the root-x joint 102 | return velocity_cost 103 | 104 | @staticmethod 105 | def ac_cost_fn(acs): 106 | if isinstance(acs, np.ndarray): 107 | return 0.0001 * np.sum(np.square(acs), axis=1) 108 | else: 109 | return 0.0001 * tf.reduce_sum(tf.square(acs), axis=1) 110 | 111 | def nn_constructor(self, model_init_cfg, misc=None): 112 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 113 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 114 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 115 | model_dir=model_init_cfg.get("model_dir", None), 116 | misc=misc 117 | )) 118 | if not model_init_cfg.get("load_model", False): 119 | model.add(FC(200, 
input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 120 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 121 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 122 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 123 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 124 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 125 | return model 126 | 127 | def gp_constructor(self, model_init_cfg): 128 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 129 | name="model", 130 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 131 | kernel_args=model_init_cfg.get("kernel_args", {}), 132 | num_inducing_points=get_required_argument( 133 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 134 | ), 135 | sess=self.SESS 136 | )) 137 | return model 138 | 139 | 140 | CONFIG_MODULE = SwimmerConfigModule 141 | -------------------------------------------------------------------------------- /dmbrl/config/gym_walker2d.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | 9 | from dmbrl.misc.DotmapUtils import get_required_argument 10 | from dmbrl.modeling.layers import FC 11 | 12 | 13 | class WalkerConfigModule: 14 | """ 15 | @brief: migrate the gym module from the mbbl repo 16 | 'gym_cheetah': { 17 | 'path': 'mbbl.env.gym_env.walker', 18 | 'ob_size': 17, 'action_size': 6, 'max_length': 1000 19 | } 20 | """ 21 | ENV_NAME = "MBRLGYM_Walker-v0" 22 | TASK_HORIZON = 1000 23 | NTRAIN_ITERS = 300 24 | NROLLOUTS_PER_ITER = 1 25 | PLAN_HOR = 30 26 | INIT_VAR = 0.25 27 | MODEL_IN, MODEL_OUT = 23, 17 # obs - > 17, action 6 28 | GP_NINDUCING_POINTS = 300 29 | 30 | def __init__(self): 31 | # self.ENV = gym.make(self.ENV_NAME) 32 | from mbbl.env.gym_env import walker 33 | self.ENV = walker.env(env_name='gym_walker2d', rand_seed=1234, 34 | misc_info={'reset_type': 'gym'}) 35 | cfg = tf.ConfigProto() 36 | cfg.gpu_options.allow_growth = True 37 | self.SESS = tf.Session(config=cfg) 38 | self.NN_TRAIN_CFG = {"epochs": 5} 39 | self.OPT_CFG = { 40 | "Random": { 41 | "popsize": 2500 42 | }, 43 | "GBPRandom": { 44 | "popsize": 2500 45 | }, 46 | "GBPCEM": { 47 | "popsize": 500, 48 | "num_elites": 50, 49 | "max_iters": 5, 50 | "alpha": 0.1 51 | }, 52 | "CEM": { 53 | "popsize": 500, 54 | "num_elites": 50, 55 | "max_iters": 5, 56 | "alpha": 0.1 57 | }, 58 | "POPLIN-P": { 59 | "popsize": 500, 60 | "num_elites": 50, 61 | "max_iters": 5, 62 | "alpha": 0.1 63 | }, 64 | "POPLIN-A": { 65 | "popsize": 500, 66 | "num_elites": 50, 67 | "max_iters": 5, 68 | "alpha": 0.1 69 | } 70 | } 71 | 72 | @staticmethod 73 | def obs_preproc(obs): 74 | """ @brief: no cheating of the observation function 75 | """ 76 | if isinstance(obs, np.ndarray): 77 | return obs 78 | else: 79 | return obs 80 | 81 | @staticmethod 82 | def obs_postproc(obs, pred): 83 | if isinstance(obs, np.ndarray): 84 | return obs + pred 85 | else: 86 | return obs + pred 87 | 88 | @staticmethod 89 | def targ_proc(obs, next_obs): 90 | return next_obs - obs 91 | 92 | @staticmethod 93 | def obs_cost_fn(obs): 94 | """ @brief: 95 | see mbbl.env.gym_env.walker.py for reward details 96 | """ 97 | if isinstance(obs, np.ndarray): 98 | velocity_cost = -obs[:, 
8] # the qvel for the root-x joint 99 | height_cost = 3 * np.square(obs[:, 0] - 1.3) # the height 100 | return velocity_cost + height_cost 101 | else: 102 | velocity_cost = -obs[:, 8] # the qvel for the root-x joint 103 | height_cost = 3 * tf.square(obs[:, 0] - 1.3) # the height 104 | return velocity_cost + height_cost 105 | 106 | @staticmethod 107 | def ac_cost_fn(acs): 108 | if isinstance(acs, np.ndarray): 109 | return 0.1 * np.sum(np.square(acs), axis=1) 110 | else: 111 | return 0.1 * tf.reduce_sum(tf.square(acs), axis=1) 112 | 113 | def nn_constructor(self, model_init_cfg, misc=None): 114 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 115 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 116 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 117 | model_dir=model_init_cfg.get("model_dir", None), 118 | misc=misc 119 | )) 120 | if not model_init_cfg.get("load_model", False): 121 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 122 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 123 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 124 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 125 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 126 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 127 | return model 128 | 129 | def gp_constructor(self, model_init_cfg): 130 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 131 | name="model", 132 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 133 | kernel_args=model_init_cfg.get("kernel_args", {}), 134 | num_inducing_points=get_required_argument( 135 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 
136 | ), 137 | sess=self.SESS 138 | )) 139 | return model 140 | 141 | 142 | CONFIG_MODULE = WalkerConfigModule 143 | -------------------------------------------------------------------------------- /dmbrl/config/halfcheetah.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | import gym 9 | 10 | from dmbrl.misc.DotmapUtils import get_required_argument 11 | from dmbrl.modeling.layers import FC 12 | import dmbrl.env 13 | 14 | 15 | class HalfCheetahConfigModule: 16 | ENV_NAME = "MBRLHalfCheetah-v0" 17 | TASK_HORIZON = 1000 18 | NTRAIN_ITERS = 300 19 | NROLLOUTS_PER_ITER = 1 20 | PLAN_HOR = 30 21 | INIT_VAR = 0.25 22 | MODEL_IN, MODEL_OUT = 24, 18 # obs - > 18, action 6 23 | GP_NINDUCING_POINTS = 300 24 | 25 | def __init__(self): 26 | self.ENV = gym.make(self.ENV_NAME) 27 | cfg = tf.ConfigProto() 28 | cfg.gpu_options.allow_growth = True 29 | self.SESS = tf.Session(config=cfg) 30 | self.NN_TRAIN_CFG = {"epochs": 5} 31 | self.OPT_CFG = { 32 | "Random": { 33 | "popsize": 2500 34 | }, 35 | "GBPRandom": { 36 | "popsize": 2500 37 | }, 38 | "GBPCEM": { 39 | "popsize": 500, 40 | "num_elites": 50, 41 | "max_iters": 5, 42 | "alpha": 0.1 43 | }, 44 | "CEM": { 45 | "popsize": 500, 46 | "num_elites": 50, 47 | "max_iters": 5, 48 | "alpha": 0.1 49 | }, 50 | "POPLIN-P": { 51 | "popsize": 500, 52 | "num_elites": 50, 53 | "max_iters": 5, 54 | "alpha": 0.1 55 | }, 56 | "POPLIN-A": { 57 | "popsize": 500, 58 | "num_elites": 50, 59 | "max_iters": 5, 60 | "alpha": 0.1 61 | } 62 | } 63 | 64 | @staticmethod 65 | def obs_preproc(obs): 66 | if isinstance(obs, np.ndarray): 67 | return np.concatenate([obs[:, 1:2], np.sin(obs[:, 2:3]), np.cos(obs[:, 2:3]), obs[:, 3:]], axis=1) 68 | else: 69 | return tf.concat([obs[:, 1:2], tf.sin(obs[:, 2:3]), tf.cos(obs[:, 2:3]), obs[:, 3:]], axis=1) 70 | 71 | @staticmethod 72 | def obs_postproc(obs, pred): 73 | if isinstance(obs, np.ndarray): 74 | return np.concatenate([pred[:, :1], obs[:, 1:] + pred[:, 1:]], axis=1) 75 | else: 76 | return tf.concat([pred[:, :1], obs[:, 1:] + pred[:, 1:]], axis=1) 77 | 78 | @staticmethod 79 | def targ_proc(obs, next_obs): 80 | return np.concatenate([next_obs[:, :1], next_obs[:, 1:] - obs[:, 1:]], axis=1) 81 | 82 | @staticmethod 83 | def obs_cost_fn(obs): 84 | return -obs[:, 0] 85 | 86 | @staticmethod 87 | def ac_cost_fn(acs): 88 | if isinstance(acs, np.ndarray): 89 | return 0.1 * np.sum(np.square(acs), axis=1) 90 | else: 91 | return 0.1 * tf.reduce_sum(tf.square(acs), axis=1) 92 | 93 | def nn_constructor(self, model_init_cfg, misc=None): 94 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 95 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 96 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 97 | model_dir=model_init_cfg.get("model_dir", None), 98 | misc=misc 99 | )) 100 | if not model_init_cfg.get("load_model", False): 101 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.000025)) 102 | model.add(FC(200, activation="swish", weight_decay=0.00005)) 103 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 104 | model.add(FC(200, activation="swish", weight_decay=0.000075)) 105 | model.add(FC(self.MODEL_OUT, weight_decay=0.0001)) 106 | 
model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 107 | return model 108 | 109 | def gp_constructor(self, model_init_cfg): 110 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 111 | name="model", 112 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 113 | kernel_args=model_init_cfg.get("kernel_args", {}), 114 | num_inducing_points=get_required_argument( 115 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 116 | ), 117 | sess=self.SESS 118 | )) 119 | return model 120 | 121 | 122 | CONFIG_MODULE = HalfCheetahConfigModule 123 | -------------------------------------------------------------------------------- /dmbrl/config/pusher.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | import gym 9 | 10 | from dmbrl.misc.DotmapUtils import get_required_argument 11 | from dmbrl.modeling.layers import FC 12 | import dmbrl.env 13 | 14 | 15 | class PusherConfigModule: 16 | ENV_NAME = "MBRLPusher-v0" 17 | TASK_HORIZON = 150 18 | NTRAIN_ITERS = 100 19 | NROLLOUTS_PER_ITER = 1 20 | PLAN_HOR = 25 21 | INIT_VAR = 0.25 22 | MODEL_IN, MODEL_OUT = 27, 20 23 | GP_NINDUCING_POINTS = 200 24 | 25 | def __init__(self): 26 | self.ENV = gym.make(self.ENV_NAME) 27 | cfg = tf.ConfigProto() 28 | cfg.gpu_options.allow_growth = True 29 | self.SESS = tf.Session(config=cfg) 30 | self.NN_TRAIN_CFG = {"epochs": 5} 31 | self.OPT_CFG = { 32 | "Random": { 33 | "popsize": 2500 34 | }, 35 | "CEM": { 36 | "popsize": 500, 37 | "num_elites": 50, 38 | "max_iters": 5, 39 | "alpha": 0.1 40 | }, 41 | "GBPRandom": { 42 | "popsize": 2500 43 | }, 44 | "GBPCEM": { 45 | "popsize": 500, 46 | "num_elites": 50, 47 | "max_iters": 5, 48 | "alpha": 0.1 49 | }, 50 | "POPLIN-P": { 51 | "popsize": 500, 52 | "num_elites": 50, 53 | "max_iters": 5, 54 | "alpha": 0.1 55 | }, 56 | "POPLIN-A": { 57 | "popsize": 500, 58 | "num_elites": 50, 59 | "max_iters": 5, 60 | "alpha": 0.1 61 | } 62 | } 63 | 64 | @staticmethod 65 | def obs_postproc(obs, pred): 66 | return obs + pred 67 | 68 | @staticmethod 69 | def targ_proc(obs, next_obs): 70 | return next_obs - obs 71 | 72 | def obs_cost_fn(self, obs): 73 | to_w, og_w = 0.5, 1.25 74 | tip_pos, obj_pos, goal_pos = obs[:, 14:17], obs[:, 17:20], self.ENV.ac_goal_pos 75 | 76 | if isinstance(obs, np.ndarray): 77 | tip_obj_dist = np.sum(np.abs(tip_pos - obj_pos), axis=1) 78 | obj_goal_dist = np.sum(np.abs(goal_pos - obj_pos), axis=1) 79 | return to_w * tip_obj_dist + og_w * obj_goal_dist 80 | else: 81 | tip_obj_dist = tf.reduce_sum(tf.abs(tip_pos - obj_pos), axis=1) 82 | obj_goal_dist = tf.reduce_sum(tf.abs(goal_pos - obj_pos), axis=1) 83 | return to_w * tip_obj_dist + og_w * obj_goal_dist 84 | 85 | @staticmethod 86 | def ac_cost_fn(acs): 87 | if isinstance(acs, np.ndarray): 88 | return 0.1 * np.sum(np.square(acs), axis=1) 89 | else: 90 | return 0.1 * tf.reduce_sum(tf.square(acs), axis=1) 91 | 92 | def nn_constructor(self, model_init_cfg, misc): 93 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 94 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 95 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 96 | 
model_dir=model_init_cfg.get("model_dir", None), 97 | misc=misc 98 | )) 99 | if not model_init_cfg.get("load_model", False): 100 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.00025)) 101 | model.add(FC(200, activation="swish", weight_decay=0.0005)) 102 | model.add(FC(200, activation="swish", weight_decay=0.0005)) 103 | model.add(FC(self.MODEL_OUT, weight_decay=0.00075)) 104 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 105 | return model 106 | 107 | def gp_constructor(self, model_init_cfg): 108 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 109 | name="model", 110 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 111 | kernel_args=model_init_cfg.get("kernel_args", {}), 112 | num_inducing_points=get_required_argument( 113 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 114 | ), 115 | sess=self.SESS 116 | )) 117 | return model 118 | 119 | 120 | CONFIG_MODULE = PusherConfigModule 121 | -------------------------------------------------------------------------------- /dmbrl/config/reacher.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | import gym 9 | 10 | from dmbrl.misc.DotmapUtils import get_required_argument 11 | from dmbrl.modeling.layers import FC 12 | import dmbrl.env 13 | 14 | 15 | class ReacherConfigModule: 16 | ENV_NAME = "MBRLReacher3D-v0" 17 | TASK_HORIZON = 150 18 | NTRAIN_ITERS = 100 19 | NROLLOUTS_PER_ITER = 1 20 | PLAN_HOR = 25 21 | INIT_VAR = 0.25 22 | MODEL_IN, MODEL_OUT = 24, 17 23 | GP_NINDUCING_POINTS = 200 24 | 25 | def __init__(self): 26 | self.ENV = gym.make(self.ENV_NAME) 27 | self.ENV.reset() 28 | cfg = tf.ConfigProto() 29 | cfg.gpu_options.allow_growth = True 30 | self.SESS = tf.Session(config=cfg) 31 | self.NN_TRAIN_CFG = {"epochs": 5} 32 | self.OPT_CFG = { 33 | "Random": { 34 | "popsize": 2000 35 | }, 36 | "CEM": { 37 | "popsize": 400, 38 | "num_elites": 40, 39 | "max_iters": 5, 40 | "alpha": 0.1 41 | }, 42 | "GBPRandom": { 43 | "popsize": 2000 44 | }, 45 | "GBPCEM": { 46 | "popsize": 400, 47 | "num_elites": 40, 48 | "max_iters": 5, 49 | "alpha": 0.1 50 | }, 51 | "POPLIN-P": { 52 | "popsize": 400, 53 | "num_elites": 40, 54 | "max_iters": 5, 55 | "alpha": 0.1 56 | }, 57 | "POPLIN-A": { 58 | "popsize": 400, 59 | "num_elites": 40, 60 | "max_iters": 5, 61 | "alpha": 0.1 62 | } 63 | } 64 | self.UPDATE_FNS = [self.update_goal] 65 | 66 | self.goal = tf.Variable(self.ENV.goal, dtype=tf.float32) 67 | self.SESS.run(self.goal.initializer) 68 | 69 | @staticmethod 70 | def obs_postproc(obs, pred): 71 | return obs + pred 72 | 73 | @staticmethod 74 | def targ_proc(obs, next_obs): 75 | return next_obs - obs 76 | 77 | def update_goal(self, sess=None): 78 | if sess is not None: 79 | self.goal.load(self.ENV.goal, sess) 80 | 81 | def obs_cost_fn(self, obs): 82 | if isinstance(obs, np.ndarray): 83 | return np.sum(np.square(ReacherConfigModule.get_ee_pos(obs, are_tensors=False) - self.ENV.goal), axis=1) 84 | else: 85 | return tf.reduce_sum(tf.square(ReacherConfigModule.get_ee_pos(obs, are_tensors=True) - self.goal), axis=1) 86 | 87 | @staticmethod 88 | def ac_cost_fn(acs): 89 | if isinstance(acs, np.ndarray): 90 | return 0.01 * np.sum(np.square(acs), axis=1) 
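            # Rough arithmetic, purely illustrative: the reacher action is
            # 7-dimensional (MODEL_IN - MODEL_OUT = 24 - 17 = 7), so an action
            # of all ones would add only 0.01 * 7 = 0.07 to the per-step cost.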
91 | else: 92 | return 0.01 * tf.reduce_sum(tf.square(acs), axis=1) 93 | 94 | def nn_constructor(self, model_init_cfg, misc=None): 95 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 96 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 97 | sess=self.SESS, load_model=model_init_cfg.get("load_model", False), 98 | model_dir=model_init_cfg.get("model_dir", None), 99 | misc=misc 100 | )) 101 | if not model_init_cfg.get("load_model", False): 102 | model.add(FC(200, input_dim=self.MODEL_IN, activation="swish", weight_decay=0.00025)) 103 | model.add(FC(200, activation="swish", weight_decay=0.0005)) 104 | model.add(FC(200, activation="swish", weight_decay=0.0005)) 105 | model.add(FC(200, activation="swish", weight_decay=0.0005)) 106 | model.add(FC(self.MODEL_OUT, weight_decay=0.00075)) 107 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.00075}) 108 | return model 109 | 110 | def gp_constructor(self, model_init_cfg): 111 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 112 | name="model", 113 | kernel_class=get_required_argument(model_init_cfg, "kernel_class", "Must provide kernel class"), 114 | kernel_args=model_init_cfg.get("kernel_args", {}), 115 | num_inducing_points=get_required_argument( 116 | model_init_cfg, "num_inducing_points", "Must provide number of inducing points." 117 | ), 118 | sess=self.SESS 119 | )) 120 | return model 121 | 122 | @staticmethod 123 | def get_ee_pos(states, are_tensors=False): 124 | theta1, theta2, theta3, theta4, theta5, theta6, theta7 = \ 125 | states[:, :1], states[:, 1:2], states[:, 2:3], states[:, 3:4], states[:, 4:5], states[:, 5:6], states[:, 6:] 126 | if are_tensors: 127 | rot_axis = tf.concat([tf.cos(theta2) * tf.cos(theta1), tf.cos(theta2) * tf.sin(theta1), -tf.sin(theta2)], 128 | axis=1) 129 | rot_perp_axis = tf.concat([-tf.sin(theta1), tf.cos(theta1), tf.zeros(tf.shape(theta1))], axis=1) 130 | cur_end = tf.concat([ 131 | 0.1 * tf.cos(theta1) + 0.4 * tf.cos(theta1) * tf.cos(theta2), 132 | 0.1 * tf.sin(theta1) + 0.4 * tf.sin(theta1) * tf.cos(theta2) - 0.188, 133 | -0.4 * tf.sin(theta2) 134 | ], axis=1) 135 | 136 | for length, hinge, roll in [(0.321, theta4, theta3), (0.16828, theta6, theta5)]: 137 | perp_all_axis = tf.cross(rot_axis, rot_perp_axis) 138 | x = tf.cos(hinge) * rot_axis 139 | y = tf.sin(hinge) * tf.sin(roll) * rot_perp_axis 140 | z = -tf.sin(hinge) * tf.cos(roll) * perp_all_axis 141 | new_rot_axis = x + y + z 142 | new_rot_perp_axis = tf.cross(new_rot_axis, rot_axis) 143 | new_rot_perp_axis = tf.where(tf.less(tf.norm(new_rot_perp_axis, axis=1), 1e-30), 144 | rot_perp_axis, new_rot_perp_axis) 145 | new_rot_perp_axis /= tf.norm(new_rot_perp_axis, axis=1, keepdims=True) 146 | rot_axis, rot_perp_axis, cur_end = new_rot_axis, new_rot_perp_axis, cur_end + length * new_rot_axis 147 | else: 148 | rot_axis = np.concatenate([np.cos(theta2) * np.cos(theta1), np.cos(theta2) * np.sin(theta1), -np.sin(theta2)], 149 | axis=1) 150 | rot_perp_axis = np.concatenate([-np.sin(theta1), np.cos(theta1), np.zeros(theta1.shape)], axis=1) 151 | cur_end = np.concatenate([ 152 | 0.1 * np.cos(theta1) + 0.4 * np.cos(theta1) * np.cos(theta2), 153 | 0.1 * np.sin(theta1) + 0.4 * np.sin(theta1) * np.cos(theta2) - 0.188, 154 | -0.4 * np.sin(theta2) 155 | ], axis=1) 156 | 157 | for length, hinge, roll in [(0.321, theta4, theta3), (0.16828, theta6, theta5)]: 158 | perp_all_axis = np.cross(rot_axis, 
rot_perp_axis) 159 | x = np.cos(hinge) * rot_axis 160 | y = np.sin(hinge) * np.sin(roll) * rot_perp_axis 161 | z = -np.sin(hinge) * np.cos(roll) * perp_all_axis 162 | new_rot_axis = x + y + z 163 | new_rot_perp_axis = np.cross(new_rot_axis, rot_axis) 164 | new_rot_perp_axis[np.linalg.norm(new_rot_perp_axis, axis=1) < 1e-30] = \ 165 | rot_perp_axis[np.linalg.norm(new_rot_perp_axis, axis=1) < 1e-30] 166 | new_rot_perp_axis /= np.linalg.norm(new_rot_perp_axis, axis=1, keepdims=True) 167 | rot_axis, rot_perp_axis, cur_end = new_rot_axis, new_rot_perp_axis, cur_end + length * new_rot_axis 168 | 169 | return cur_end 170 | 171 | 172 | CONFIG_MODULE = ReacherConfigModule 173 | -------------------------------------------------------------------------------- /dmbrl/config/reward_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | import tensorflow as tf 5 | 6 | import numpy as np 7 | 8 | # The value returned by tolerance() at `margin` distance from `bounds` interval. 9 | _DEFAULT_VALUE_AT_MARGIN = 0.1 10 | 11 | 12 | def _sigmoids(x, value_at_1, sigmoid): 13 | """Returns 1 when `x` == 0, between 0 and 1 otherwise. 14 | Args: 15 | x: A scalar or numpy array. 16 | value_at_1: A float between 0 and 1 specifying the output when `x` == 1. 17 | sigmoid: String, choice of sigmoid type. 18 | Returns: 19 | A numpy array with values between 0.0 and 1.0. 20 | Raises: 21 | ValueError: If not 0 < `value_at_1` < 1, except for `linear`, `cosine` and 22 | `quadratic` sigmoids which allow `value_at_1` == 0. 23 | ValueError: If `sigmoid` is of an unknown type. 24 | """ 25 | if sigmoid in ('cosine', 'linear', 'quadratic'): 26 | if not 0 <= value_at_1 < 1: 27 | raise ValueError('`value_at_1` must be nonnegative and smaller than 1, ' 28 | 'got {}.'.format(value_at_1)) 29 | else: 30 | if not 0 < value_at_1 < 1: 31 | raise ValueError('`value_at_1` must be strictly between 0 and 1, ' 32 | 'got {}.'.format(value_at_1)) 33 | 34 | if sigmoid == 'gaussian': 35 | scale = tf.sqrt(-2 * tf.log(value_at_1)) 36 | return tf.exp(-0.5 * (x * scale) ** 2) 37 | 38 | elif sigmoid == 'hyperbolic': 39 | scale = tf.acosh(1 / value_at_1) 40 | return 1 / tf.cosh(x * scale) 41 | 42 | elif sigmoid == 'long_tail': 43 | scale = tf.sqrt(1 / value_at_1 - 1) 44 | return 1 / ((x * scale) ** 2 + 1) 45 | 46 | elif sigmoid == 'cosine': 47 | scale = tf.acos(2 * value_at_1 - 1) / np.pi 48 | scaled_x = x * scale 49 | return tf.where(abs(scaled_x) < 1, 50 | (1 + tf.cos(np.pi * scaled_x)) / 2, 0.0 * scaled_x) 51 | 52 | elif sigmoid == 'linear': 53 | scale = 1.0 - value_at_1 54 | scaled_x = x * scale 55 | return tf.where(abs(scaled_x) < 1, 1 - scaled_x, 0.0 * scaled_x) 56 | 57 | elif sigmoid == 'quadratic': 58 | scale = tf.sqrt(1.0 - value_at_1) 59 | scaled_x = x * scale 60 | return tf.where(abs(scaled_x) < 1, 1 - scaled_x ** 2, 0.0 * scaled_x) 61 | 62 | elif sigmoid == 'tanh_squared': 63 | scale = tf.arctanh(tf.sqrt(1 - value_at_1)) 64 | return 1 - tf.tanh(x * scale) ** 2 65 | 66 | else: 67 | raise ValueError('Unknown sigmoid type {!r}.'.format(sigmoid)) 68 | 69 | 70 | def tolerance(x, bounds=(0.0, 0.0), margin=0.0, sigmoid='gaussian', 71 | value_at_margin=_DEFAULT_VALUE_AT_MARGIN): 72 | """Returns 1 when `x` falls inside the bounds, between 0 and 1 otherwise. 73 | Args: 74 | x: A scalar or numpy array. 
75 | bounds: A tuple of floats specifying inclusive `(lower, upper)` bounds for 76 | the target interval. These can be infinite if the interval is unbounded 77 | at one or both ends, or they can be equal to one another if the target 78 | value is exact. 79 | margin: Float. Parameter that controls how steeply the output decreases as 80 | `x` moves out-of-bounds. 81 | * If `margin == 0` then the output will be 0 for all values of `x` 82 | outside of `bounds`. 83 | * If `margin > 0` then the output will decrease sigmoidally with 84 | increasing distance from the nearest bound. 85 | sigmoid: String, choice of sigmoid type. Valid values are: 'gaussian', 86 | 'linear', 'hyperbolic', 'long_tail', 'cosine', 'tanh_squared'. 87 | value_at_margin: A float between 0 and 1 specifying the output value when 88 | the distance from `x` to the nearest bound is equal to `margin`. Ignored 89 | if `margin == 0`. 90 | Returns: 91 | A float or numpy array with values between 0.0 and 1.0. 92 | Raises: 93 | ValueError: If `bounds[0] > bounds[1]`. 94 | ValueError: If `margin` is negative. 95 | """ 96 | lower, upper = bounds 97 | if lower > upper: 98 | raise ValueError('Lower bound must be <= upper bound.') 99 | if margin < 0: 100 | raise ValueError('`margin` must be non-negative.') 101 | 102 | in_bounds = tf.logical_and(lower <= x, x <= upper) 103 | if margin == 0: 104 | value = tf.where(in_bounds, 1.0, 0.0) 105 | else: 106 | d = tf.where(x < lower, lower - x, x - upper) / margin 107 | value = tf.where(in_bounds, 108 | 1.0 + d * 0.0, 109 | _sigmoids(d, value_at_margin, sigmoid)) 110 | 111 | return value 112 | -------------------------------------------------------------------------------- /dmbrl/config/template.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from dotmap import DotMap 8 | import gym 9 | 10 | from dmbrl.misc.DotmapUtils import get_required_argument 11 | from dmbrl.modeling.layers import FC 12 | 13 | 14 | class EnvConfigModule: 15 | ENV_NAME = None 16 | TASK_HORIZON = None 17 | NTRAIN_ITERS = None 18 | NROLLOUTS_PER_ITER = None 19 | PLAN_HOR = None 20 | 21 | def __init__(self): 22 | self.ENV = gym.make(self.ENV_NAME) 23 | cfg = tf.ConfigProto() 24 | cfg.gpu_options.allow_growth = True 25 | self.SESS = tf.Session(config=cfg) 26 | self.NN_TRAIN_CFG = {"epochs": None} 27 | self.OPT_CFG = { 28 | "Random": { 29 | "popsize": None 30 | }, 31 | "CEM": { 32 | "popsize": None, 33 | "num_elites": None, 34 | "max_iters": None, 35 | "alpha": None 36 | } 37 | } 38 | self.UPDATE_FNS = [] 39 | 40 | # Fill in other things to be done here. 41 | 42 | @staticmethod 43 | def obs_preproc(obs): 44 | # Note: Must be able to process both NumPy and Tensorflow arrays. 45 | if isinstance(obs, np.ndarray): 46 | raise NotImplementedError() 47 | else: 48 | raise NotImplementedError 49 | 50 | @staticmethod 51 | def obs_postproc(obs, pred): 52 | # Note: Must be able to process both NumPy and Tensorflow arrays. 53 | if isinstance(obs, np.ndarray): 54 | raise NotImplementedError() 55 | else: 56 | raise NotImplementedError() 57 | 58 | @staticmethod 59 | def targ_proc(obs, next_obs): 60 | # Note: Only needs to process NumPy arrays. 61 | raise NotImplementedError() 62 | 63 | @staticmethod 64 | def obs_cost_fn(obs): 65 | # Note: Must be able to process both NumPy and Tensorflow arrays. 
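        # A minimal sketch of a filled-in cost (purely illustrative; it mirrors
        # the halfcheetah config above, where the first observation entry holds
        # the forward velocity being maximized):
        #     if isinstance(obs, np.ndarray):
        #         return -obs[:, 0]
        #     else:
        #         return -obs[:, 0]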
66 | if isinstance(obs, np.ndarray): 67 | raise NotImplementedError() 68 | else: 69 | raise NotImplementedError() 70 | 71 | @staticmethod 72 | def ac_cost_fn(acs): 73 | # Note: Must be able to process both NumPy and Tensorflow arrays. 74 | if isinstance(acs, np.ndarray): 75 | raise NotImplementedError() 76 | else: 77 | raise NotImplementedError() 78 | 79 | def nn_constructor(self, model_init_cfg): 80 | model = get_required_argument(model_init_cfg, "model_class", "Must provide model class")(DotMap( 81 | name="model", num_networks=get_required_argument(model_init_cfg, "num_nets", "Must provide ensemble size"), 82 | sess=self.SESS 83 | )) 84 | # Construct model below. For example: 85 | # model.add(FC(*args)) 86 | # ... 87 | # model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 88 | return model 89 | 90 | 91 | CONFIG_MODULE = EnvConfigModule 92 | 93 | -------------------------------------------------------------------------------- /dmbrl/config/view_humanoid.py: -------------------------------------------------------------------------------- 1 | if __name__ == '__main__': 2 | ''' 3 | from dm_control import suite 4 | from dm_control import viewer 5 | import numpy as np 6 | 7 | test_env = suite.load(domain_name="humanoid", task_name="stand") 8 | action_spec = test_env.action_spec() 9 | 10 | def initialize_episode(physics): 11 | with physics.reset_context(): 12 | physics.data.qpos[:] = 0.0 13 | physics.data.qpos[2] = 1.33 14 | physics.data.qvel[:] = 0.0 15 | print(physics.head_height()) 16 | print(physics.head_height()) 17 | print(physics.head_height()) 18 | test_env.task.initialize_episode = initialize_episode 19 | 20 | # Define a uniform random policy. 21 | def random_policy(time_step): 22 | del time_step # Unused. 23 | return np.random.uniform(low=action_spec.minimum, 24 | high=action_spec.maximum, 25 | size=action_spec.shape) 26 | 27 | # Launch the viewer application. 28 | viewer.launch(test_env, policy=random_policy) 29 | ''' 30 | from dm_control import suite 31 | import matplotlib.pyplot as plt 32 | import numpy as np 33 | 34 | max_frame = 90 35 | 36 | width = 480 37 | height = 480 38 | video = np.zeros((90, height, 2 * width, 3), dtype=np.uint8) 39 | 40 | # Load one task: 41 | env = suite.load(domain_name="humanoid", task_name="walk") 42 | 43 | # Step through an episode and print out reward, discount and observation. 44 | action_spec = env.action_spec() 45 | time_step = env.reset() 46 | 47 | with env.physics.reset_context(): 48 | env.physics.data.qpos[:] = 0.0 49 | env.physics.data.qpos[2] = 1.33 50 | env.physics.data.qvel[:] = 0.0 51 | head_pos = [] 52 | while not time_step.last(): 53 | for i in range(max_frame): 54 | action = np.random.uniform(action_spec.minimum, 55 | action_spec.maximum, 56 | size=action_spec.shape) 57 | time_step = env.step(action) 58 | 59 | head_pos.append(env.physics.head_height()) 60 | video[i] = np.hstack([env.physics.render(height, width, camera_id=0), 61 | env.physics.render(height, width, camera_id=1)]) 62 | # print(time_step.reward, time_step.discount, time_step.observation) 63 | for i in range(max_frame): 64 | print(head_pos[i]) 65 | img = plt.imshow(video[i]) 66 | plt.pause(1) # Need min display time > 0.0. 
67 | plt.draw() 68 | -------------------------------------------------------------------------------- /dmbrl/controllers/Controller.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | 6 | class Controller: 7 | def __init__(self, *args, **kwargs): 8 | """Creates class instance. 9 | """ 10 | self._policy_network = None 11 | pass 12 | 13 | def train(self, obs_trajs, acs_trajs, rews_trajs): 14 | """Trains this controller using lists of trajectories. 15 | """ 16 | raise NotImplementedError("Must be implemented in subclass.") 17 | 18 | def reset(self): 19 | """Resets this controller. 20 | """ 21 | raise NotImplementedError("Must be implemented in subclass.") 22 | 23 | def act(self, obs, t, get_pred_cost=False): 24 | """Performs an action. 25 | """ 26 | raise NotImplementedError("Must be implemented in subclass.") 27 | 28 | def dump_logs(self, primary_logdir, iter_logdir): 29 | """Dumps logs into primary log directory and per-train iteration log directory. 30 | """ 31 | raise NotImplementedError("Must be implemented in subclass.") 32 | 33 | def get_policy_network(self): 34 | return self._policy_network 35 | 36 | def train_policy_network(self): 37 | return False 38 | -------------------------------------------------------------------------------- /dmbrl/controllers/__init__.py: -------------------------------------------------------------------------------- 1 | from .MPC import MPC 2 | -------------------------------------------------------------------------------- /dmbrl/env/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | 4 | register( 5 | id='MBRLCartpole-v0', 6 | entry_point='dmbrl.env.cartpole:CartpoleEnv' 7 | ) 8 | 9 | 10 | register( 11 | id='MBRLReacher3D-v0', 12 | entry_point='dmbrl.env.reacher:Reacher3DEnv' 13 | ) 14 | 15 | 16 | register( 17 | id='MBRLPusher-v0', 18 | entry_point='dmbrl.env.pusher:PusherEnv' 19 | ) 20 | 21 | 22 | register( 23 | id='MBRLHalfCheetah-v0', 24 | entry_point='dmbrl.env.half_cheetah:HalfCheetahEnv' 25 | ) 26 | -------------------------------------------------------------------------------- /dmbrl/env/assets/cartpole.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 35 | 36 | -------------------------------------------------------------------------------- /dmbrl/env/assets/half_cheetah.xml: -------------------------------------------------------------------------------- 1 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 96 | -------------------------------------------------------------------------------- /dmbrl/env/assets/pusher.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 102 | -------------------------------------------------------------------------------- /dmbrl/env/cartpole.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import os 6 | 7 | import numpy as np 8 | from gym import utils 9 | from gym.envs.mujoco import mujoco_env 10 | 11 | 12 | class CartpoleEnv(mujoco_env.MujocoEnv, utils.EzPickle): 13 | PENDULUM_LENGTH = 0.6 14 | 15 | def __init__(self): 16 | 
utils.EzPickle.__init__(self) 17 | dir_path = os.path.dirname(os.path.realpath(__file__)) 18 | mujoco_env.MujocoEnv.__init__(self, '%s/assets/cartpole.xml' % dir_path, 2) 19 | 20 | def _step(self, a): 21 | self.do_simulation(a, self.frame_skip) 22 | ob = self._get_obs() 23 | 24 | cost_lscale = CartpoleEnv.PENDULUM_LENGTH 25 | reward = np.exp( 26 | -np.sum(np.square(self._get_ee_pos(ob) - np.array([0.0, CartpoleEnv.PENDULUM_LENGTH]))) / (cost_lscale ** 2) 27 | ) 28 | reward -= 0.01 * np.sum(np.square(a)) 29 | 30 | done = False 31 | return ob, reward, done, {} 32 | 33 | def reset_model(self): 34 | qpos = self.init_qpos + np.random.normal(0, 0.1, np.shape(self.init_qpos)) 35 | qvel = self.init_qvel + np.random.normal(0, 0.1, np.shape(self.init_qvel)) 36 | self.set_state(qpos, qvel) 37 | return self._get_obs() 38 | 39 | def _get_obs(self): 40 | return np.concatenate([self.model.data.qpos, self.model.data.qvel]).ravel() 41 | 42 | @staticmethod 43 | def _get_ee_pos(x): 44 | x0, theta = x[0], x[1] 45 | return np.array([ 46 | x0 - CartpoleEnv.PENDULUM_LENGTH * np.sin(theta), 47 | -CartpoleEnv.PENDULUM_LENGTH * np.cos(theta) 48 | ]) 49 | 50 | def viewer_setup(self): 51 | v = self.viewer 52 | v.cam.trackbodyid = 0 53 | v.cam.distance = v.model.stat.extent 54 | -------------------------------------------------------------------------------- /dmbrl/env/half_cheetah.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import os 6 | 7 | import numpy as np 8 | from gym import utils 9 | from gym.envs.mujoco import mujoco_env 10 | 11 | 12 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 13 | 14 | def __init__(self): 15 | self.prev_qpos = None 16 | dir_path = os.path.dirname(os.path.realpath(__file__)) 17 | mujoco_env.MujocoEnv.__init__(self, '%s/assets/half_cheetah.xml' % dir_path, 5) 18 | utils.EzPickle.__init__(self) 19 | 20 | def _step(self, action): 21 | self.prev_qpos = np.copy(self.model.data.qpos.flat) 22 | self.do_simulation(action, self.frame_skip) 23 | ob = self._get_obs() 24 | 25 | reward_ctrl = -0.1 * np.square(action).sum() 26 | reward_run = ob[0] - 0.0 * np.square(ob[2]) 27 | reward = reward_run + reward_ctrl 28 | 29 | done = False 30 | return ob, reward, done, {} 31 | 32 | def _get_obs(self): 33 | return np.concatenate([ 34 | (self.model.data.qpos.flat[:1] - self.prev_qpos[:1]) / self.dt, 35 | self.model.data.qpos.flat[1:], 36 | self.model.data.qvel.flat, 37 | ]) 38 | 39 | def reset_model(self): 40 | qpos = self.init_qpos + np.random.normal(loc=0, scale=0.001, size=self.model.nq) 41 | qvel = self.init_qvel + np.random.normal(loc=0, scale=0.001, size=self.model.nv) 42 | self.set_state(qpos, qvel) 43 | self.prev_qpos = np.copy(self.model.data.qpos.flat) 44 | return self._get_obs() 45 | 46 | def viewer_setup(self): 47 | self.viewer.cam.distance = self.model.stat.extent * 0.25 48 | self.viewer.cam.elevation = -55 49 | -------------------------------------------------------------------------------- /dmbrl/env/pusher.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import os 6 | 7 | import numpy as np 8 | from gym import utils 9 | from gym.envs.mujoco import mujoco_env 10 | 11 | 12 | class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle): 13 | def __init__(self): 14 
| dir_path = os.path.dirname(os.path.realpath(__file__)) 15 | mujoco_env.MujocoEnv.__init__(self, '%s/assets/pusher.xml' % dir_path, 4) 16 | utils.EzPickle.__init__(self) 17 | self.reset_model() 18 | 19 | def _step(self, a): 20 | obj_pos = self.get_body_com("object"), 21 | vec_1 = obj_pos - self.get_body_com("tips_arm") 22 | vec_2 = obj_pos - self.get_body_com("goal") 23 | 24 | reward_near = -np.sum(np.abs(vec_1)) 25 | reward_dist = -np.sum(np.abs(vec_2)) 26 | reward_ctrl = -np.square(a).sum() 27 | reward = 1.25 * reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near 28 | 29 | self.do_simulation(a, self.frame_skip) 30 | ob = self._get_obs() 31 | done = False 32 | return ob, reward, done, {} 33 | 34 | def viewer_setup(self): 35 | self.viewer.cam.trackbodyid = -1 36 | self.viewer.cam.distance = 4.0 37 | 38 | def reset_model(self): 39 | qpos = self.init_qpos 40 | 41 | self.goal_pos = np.asarray([0, 0]) 42 | self.cylinder_pos = np.array([-0.25, 0.15]) + np.random.normal(0, 0.025, [2]) 43 | 44 | qpos[-4:-2] = self.cylinder_pos 45 | qpos[-2:] = self.goal_pos 46 | qvel = self.init_qvel + self.np_random.uniform(low=-0.005, 47 | high=0.005, size=self.model.nv) 48 | qvel[-4:] = 0 49 | self.set_state(qpos, qvel) 50 | self.ac_goal_pos = self.get_body_com("goal") 51 | 52 | return self._get_obs() 53 | 54 | def _get_obs(self): 55 | return np.concatenate([ 56 | self.model.data.qpos.flat[:7], 57 | self.model.data.qvel.flat[:7], 58 | self.get_body_com("tips_arm"), 59 | self.get_body_com("object"), 60 | ]) 61 | -------------------------------------------------------------------------------- /dmbrl/env/reacher.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import os 6 | 7 | import numpy as np 8 | from gym import utils 9 | from gym.envs.mujoco import mujoco_env 10 | 11 | 12 | class Reacher3DEnv(mujoco_env.MujocoEnv, utils.EzPickle): 13 | def __init__(self): 14 | self.viewer = None 15 | utils.EzPickle.__init__(self) 16 | dir_path = os.path.dirname(os.path.realpath(__file__)) 17 | self.goal = np.zeros(3) 18 | mujoco_env.MujocoEnv.__init__(self, os.path.join(dir_path, 'assets/reacher3d.xml'), 2) 19 | 20 | def _step(self, a): 21 | self.do_simulation(a, self.frame_skip) 22 | ob = self._get_obs() 23 | reward = -np.sum(np.square(self.get_EE_pos(ob[None]) - self.goal)) 24 | reward -= 0.01 * np.square(a).sum() 25 | done = False 26 | return ob, reward, done, dict(reward_dist=0, reward_ctrl=0) 27 | 28 | def viewer_setup(self): 29 | self.viewer.cam.trackbodyid = 1 30 | self.viewer.cam.distance = 2.5 31 | self.viewer.cam.elevation = -30 32 | self.viewer.cam.azimuth = 270 33 | 34 | def reset_model(self): 35 | qpos, qvel = np.copy(self.init_qpos), np.copy(self.init_qvel) 36 | qpos[-3:] += np.random.normal(loc=0, scale=0.1, size=[3]) 37 | qvel[-3:] = 0 38 | self.goal = qpos[-3:] 39 | self.set_state(qpos, qvel) 40 | return self._get_obs() 41 | 42 | def _get_obs(self): 43 | return np.concatenate([ 44 | self.model.data.qpos.flat, 45 | self.model.data.qvel.flat[:-3], 46 | ]) 47 | 48 | def get_EE_pos(self, states): 49 | theta1, theta2, theta3, theta4, theta5, theta6, theta7 = \ 50 | states[:, :1], states[:, 1:2], states[:, 2:3], states[:, 3:4], states[:, 4:5], states[:, 5:6], states[:, 6:] 51 | 52 | rot_axis = np.concatenate([np.cos(theta2) * np.cos(theta1), np.cos(theta2) * np.sin(theta1), -np.sin(theta2)], 53 | axis=1) 54 | rot_perp_axis = 
np.concatenate([-np.sin(theta1), np.cos(theta1), np.zeros(theta1.shape)], axis=1) 55 | cur_end = np.concatenate([ 56 | 0.1 * np.cos(theta1) + 0.4 * np.cos(theta1) * np.cos(theta2), 57 | 0.1 * np.sin(theta1) + 0.4 * np.sin(theta1) * np.cos(theta2) - 0.188, 58 | -0.4 * np.sin(theta2) 59 | ], axis=1) 60 | 61 | for length, hinge, roll in [(0.321, theta4, theta3), (0.16828, theta6, theta5)]: 62 | perp_all_axis = np.cross(rot_axis, rot_perp_axis) 63 | x = np.cos(hinge) * rot_axis 64 | y = np.sin(hinge) * np.sin(roll) * rot_perp_axis 65 | z = -np.sin(hinge) * np.cos(roll) * perp_all_axis 66 | new_rot_axis = x + y + z 67 | new_rot_perp_axis = np.cross(new_rot_axis, rot_axis) 68 | new_rot_perp_axis[np.linalg.norm(new_rot_perp_axis, axis=1) < 1e-30] = \ 69 | rot_perp_axis[np.linalg.norm(new_rot_perp_axis, axis=1) < 1e-30] 70 | new_rot_perp_axis /= np.linalg.norm(new_rot_perp_axis, axis=1, keepdims=True) 71 | rot_axis, rot_perp_axis, cur_end = new_rot_axis, new_rot_perp_axis, cur_end + length * new_rot_axis 72 | 73 | return cur_end 74 | 75 | -------------------------------------------------------------------------------- /dmbrl/misc/Agent.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import numpy as np 6 | from gym.monitoring import VideoRecorder 7 | from dotmap import DotMap 8 | from dmbrl.misc import logger 9 | 10 | import time 11 | 12 | 13 | class Agent: 14 | """An general class for RL agents. 15 | """ 16 | 17 | def __init__(self, params): 18 | """Initializes an agent. 19 | 20 | Arguments: 21 | params: (DotMap) A DotMap of agent parameters. 22 | .env: (OpenAI gym environment) The environment for this agent. 23 | .noisy_actions: (bool) Indicates whether random Gaussian noise will 24 | be added to the actions of this agent. 25 | .noise_stddev: (float) The standard deviation to be used for the 26 | action noise if params.noisy_actions is True. 27 | """ 28 | self.env = params.env 29 | 30 | # load the imitation data if needed 31 | if hasattr(self.env, '_expert_data_loaded') and \ 32 | (not self.env._expert_data_loaded): 33 | self.env.load_expert_data( 34 | params.params.misc.ctrl_cfg.il_cfg.expert_amc_dir 35 | ) 36 | 37 | self.noise_stddev = params.noise_stddev if params.get("noisy_actions", False) else None 38 | 39 | if isinstance(self.env, DotMap): 40 | raise ValueError("Environment must be provided to the agent at initialization.") 41 | if (not isinstance(self.noise_stddev, float)) and params.get("noisy_actions", False): 42 | raise ValueError("Must provide standard deviation for noise for noisy actions.") 43 | 44 | if self.noise_stddev is not None: 45 | self.dU = self.env.action_space.shape[0] 46 | self._debug = 1 47 | 48 | def sample(self, horizon, policy, record_fname=None, test_policy=False, average=False): 49 | """Samples a rollout from the agent. 50 | 51 | Arguments: 52 | horizon: (int) The length of the rollout to generate from the agent. 53 | policy: (policy) The policy that the agent will use for actions. 54 | record_fname: (str/None) The name of the file to which a recording of the rollout 55 | will be saved. If None, the rollout will not be recorded. 56 | 57 | Returns: (dict) A dictionary containing data from the rollout. 58 | The keys of the dictionary are 'obs', 'ac', and 'reward_sum'. 
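            The dictionary also contains a per-step 'rewards' array. For a
            rollout that runs the full horizon, 'obs' holds horizon + 1
            observations (the initial observation plus one per step), 'ac' and
            'rewards' hold one entry per step, and 'reward_sum' is the scalar
            sum of 'rewards'.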
59 | """ 60 | if test_policy: 61 | logger.info('Testing the policy') 62 | video_record = record_fname is not None 63 | recorder = None if not video_record else VideoRecorder(self.env, record_fname) 64 | 65 | times, rewards = [], [] 66 | O, A, reward_sum, done = [self.env.reset()], [], 0, False 67 | self._debug += 1 68 | 69 | policy.reset() 70 | # for t in range(20): 71 | for t in range(horizon): 72 | if hasattr(self.env, 'render_imitation'): 73 | self.env.render_imitation() 74 | if t % 50 == 10 and t > 1: 75 | logger.info('Current timesteps: %d / %d, average time: %.5f' 76 | % (t, horizon, np.mean(times))) 77 | if video_record: 78 | recorder.capture_frame() 79 | start = time.time() 80 | if test_policy: 81 | A.append(policy.act(O[t], t, test_policy=test_policy, average=average)) 82 | else: 83 | A.append(policy.act(O[t], t)) 84 | times.append(time.time() - start) 85 | 86 | if self.noise_stddev is None: 87 | obs, reward, done, info = self.env.step(A[t]) 88 | else: 89 | action = A[t] + np.random.normal(loc=0, scale=self.noise_stddev, 90 | size=[self.dU]) 91 | action = np.minimum(np.maximum(action, 92 | self.env.action_space.low), 93 | self.env.action_space.high) 94 | obs, reward, done, info = self.env.step(action) 95 | O.append(obs) 96 | reward_sum += reward 97 | rewards.append(reward) 98 | if done: 99 | break 100 | 101 | if video_record: 102 | recorder.capture_frame() 103 | recorder.close() 104 | 105 | logger.info("Average action selection time: %.4f" % np.mean(times)) 106 | logger.info("Rollout length: %d" % len(A)) 107 | 108 | return { 109 | "obs": np.array(O), 110 | "ac": np.array(A), 111 | "reward_sum": reward_sum, 112 | "rewards": np.array(rewards), 113 | } 114 | -------------------------------------------------------------------------------- /dmbrl/misc/DotmapUtils.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | 6 | def get_required_argument(dotmap, key, message, default=None): 7 | val = dotmap.get(key, default) 8 | if val is default: 9 | raise ValueError(message) 10 | return val 11 | -------------------------------------------------------------------------------- /dmbrl/misc/MBExp.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import os 6 | from time import localtime, strftime 7 | 8 | from scipy.io import savemat 9 | from dotmap import DotMap 10 | 11 | from dmbrl.misc.DotmapUtils import get_required_argument 12 | from dmbrl.misc.Agent import Agent 13 | from dmbrl.misc import logger 14 | import copy 15 | import numpy as np 16 | 17 | 18 | class MBExperiment: 19 | 20 | def __init__(self, params): 21 | """Initializes class instance. 22 | 23 | Argument: 24 | params (DotMap): A DotMap containing the following: 25 | .sim_cfg: 26 | .env (gym.env): Environment for this experiment 27 | .task_hor (int): Task horizon 28 | .stochastic (bool): (optional) If True, agent adds noise to its actions. 29 | Must provide noise_std (see below). Defaults to False. 30 | .noise_std (float): for stochastic agents, noise of the form N(0, noise_std^2I) 31 | will be added. 32 | 33 | .exp_cfg: 34 | .ntrain_iters (int): Number of training iterations to be performed. 35 | .nrollouts_per_iter (int): (optional) Number of rollouts done between training 36 | iterations. Defaults to 1. 
37 | .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1. 38 | .policy (controller): Policy that will be trained. 39 | 40 | .log_cfg: 41 | .logdir (str): Parent of directory path where experiment data will be saved. 42 | Experiment will be saved in logdir/ 43 | .nrecord (int): (optional) Number of rollouts to record for every iteration. 44 | Defaults to 0. 45 | .neval (int): (optional) Number of rollouts for performance evaluation. 46 | Defaults to 1. 47 | """ 48 | self.env = get_required_argument(params.sim_cfg, "env", "Must provide environment.") 49 | self.task_hor = get_required_argument(params.sim_cfg, "task_hor", "Must provide task horizon.") 50 | self._params = params 51 | params.sim_cfg.misc = copy.copy(params) 52 | if params.sim_cfg.get("stochastic", False): 53 | self.agent = Agent(DotMap( 54 | env=self.env, noisy_actions=True, 55 | noise_stddev=get_required_argument( 56 | params.sim_cfg, 57 | "noise_std", 58 | "Must provide noise standard deviation in the case of a stochastic environment." 59 | ), 60 | params=params 61 | )) 62 | else: 63 | self.agent = Agent(DotMap(env=self.env, noisy_actions=False, params=params)) 64 | 65 | self.ntrain_iters = get_required_argument( 66 | params.exp_cfg, "ntrain_iters", "Must provide number of training iterations." 67 | ) 68 | self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1) 69 | self.ninit_rollouts = params.exp_cfg.get("ninit_rollouts", 1) 70 | self.policy = get_required_argument(params.exp_cfg, "policy", "Must provide a policy.") 71 | 72 | self.logdir = os.path.join( 73 | get_required_argument(params.log_cfg, "logdir", "Must provide log parent directory."), 74 | strftime("%Y-%m-%d--%H:%M:%S", localtime()) 75 | ) 76 | logger.set_file_handler(path=self.logdir) 77 | logger.info('Starting the experiments') 78 | self.nrecord = params.log_cfg.get("nrecord", 0) 79 | self.neval = params.log_cfg.get("neval", 1) 80 | 81 | def run_experiment(self): 82 | """Perform experiment. 83 | """ 84 | os.makedirs(self.logdir, exist_ok=True) 85 | 86 | traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], [] 87 | test_traj_obs, test_traj_acs, test_traj_rets = [], [], [] 88 | episode_iter_id = [] 89 | 90 | # Perform initial rollouts 91 | samples = [] 92 | needed_num_steps = self.ninit_rollouts * self.task_hor 93 | finished_num_steps = 0 94 | """ 95 | # TODO DEBUG 96 | needed_num_steps = 64 97 | self.task_hor = 64 98 | """ 99 | while True: 100 | samples.append( 101 | self.agent.sample( 102 | self.task_hor, self.policy 103 | ) 104 | ) 105 | traj_obs.append(samples[-1]["obs"]) 106 | traj_acs.append(samples[-1]["ac"]) 107 | traj_rews.append(samples[-1]["rewards"]) 108 | finished_num_steps += len(samples[-1]["ac"]) 109 | 110 | if finished_num_steps >= needed_num_steps: 111 | break 112 | 113 | if self.ninit_rollouts > 0: 114 | self.policy.train( 115 | [sample["obs"] for sample in samples], 116 | [sample["ac"] for sample in samples], 117 | [sample["rewards"] for sample in samples] 118 | ) 119 | 120 | # Training loop 121 | for i in range(self.ntrain_iters): 122 | 123 | logger.info("####################################################################") 124 | logger.info("Starting training iteration %d." 
% (i + 1)) 125 | 126 | iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1)) 127 | os.makedirs(iter_dir, exist_ok=True) 128 | 129 | samples = [] 130 | assert self.nrecord == 0 131 | 132 | needed_num_steps = self.task_hor * \ 133 | (max(self.neval, self.nrollouts_per_iter) - self.nrecord) 134 | finished_num_steps = 0 135 | while True: 136 | samples.append( 137 | self.agent.sample( 138 | self.task_hor, self.policy 139 | ) 140 | ) 141 | finished_num_steps += len(samples[-1]["ac"]) 142 | 143 | if finished_num_steps >= needed_num_steps: 144 | break 145 | logger.info("Rewards obtained: {}".format( 146 | [sample["reward_sum"] for sample in samples[:self.neval]]) 147 | ) 148 | # test the policy if needed 149 | if self._params.misc.ctrl_cfg.cem_cfg.test_policy > 0: 150 | test_data = [] 151 | for _ in range(5): 152 | test_data.append( 153 | self.agent.sample(self.task_hor, self.policy, 154 | test_policy=True, average=False) 155 | ) 156 | test_traj_rets.extend([ 157 | np.mean([i_test_data["reward_sum"] for i_test_data in test_data]) 158 | ]) 159 | test_traj_obs.extend( 160 | [i_test_data["obs"] for i_test_data in test_data] 161 | ) 162 | test_traj_acs.extend( 163 | [i_test_data["ac"] for i_test_data in test_data] 164 | ) 165 | 166 | traj_obs.extend([sample["obs"] for sample in samples]) 167 | traj_acs.extend([sample["ac"] for sample in samples]) 168 | traj_rets.extend([sample["reward_sum"] for sample in samples]) 169 | traj_rews.extend([sample["rewards"] for sample in samples]) 170 | episode_iter_id.extend([i] * len(samples)) 171 | samples = samples[:self.nrollouts_per_iter] 172 | 173 | self.policy.dump_logs(self.logdir, iter_dir) 174 | savemat( 175 | os.path.join(self.logdir, "logs.mat"), 176 | { 177 | "observations": traj_obs, 178 | "actions": traj_acs, 179 | "returns": traj_rets, 180 | "rewards": traj_rews, 181 | "test_returns": test_traj_rets, 182 | "test_obs": test_traj_obs, 183 | "test_acs": test_traj_acs, 184 | 'episode_iter_id': episode_iter_id 185 | } 186 | ) 187 | # Delete iteration directory if not used 188 | if len(os.listdir(iter_dir)) == 0: 189 | os.rmdir(iter_dir) 190 | 191 | if i < self.ntrain_iters - 1: 192 | self.policy.train( 193 | [sample["obs"] for sample in samples], 194 | [sample["ac"] for sample in samples], 195 | [sample["rewards"] for sample in samples] 196 | ) 197 | 198 | # TODO: train the policy network 199 | -------------------------------------------------------------------------------- /dmbrl/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WilsonWangTHU/POPLIN/edd8dba50f9049c6164eda774602bef0c299cb51/dmbrl/misc/__init__.py -------------------------------------------------------------------------------- /dmbrl/misc/logger.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # @brief: 3 | # The logger here will be called all across the project. 
It is inspired 4 | # by Yuxin Wu (ppwwyyxx@gmail.com) 5 | # 6 | # @author: 7 | # Tingwu Wang, 2017, Feb, 20th 8 | # ----------------------------------------------------------------------------- 9 | 10 | import logging 11 | import sys 12 | import os 13 | import datetime 14 | from termcolor import colored 15 | 16 | __all__ = ['set_file_handler'] # the actual worker is the '_logger' 17 | 18 | 19 | class _MyFormatter(logging.Formatter): 20 | ''' 21 | @brief: 22 | a class to make sure the format could be used 23 | ''' 24 | 25 | def format(self, record): 26 | date = colored('[%(asctime)s @%(filename)s:%(lineno)d]', 'green') 27 | msg = '%(message)s' 28 | 29 | if record.levelno == logging.WARNING: 30 | fmt = date + ' ' + \ 31 | colored('WRN', 'red', attrs=[]) + ' ' + msg 32 | elif record.levelno == logging.ERROR or \ 33 | record.levelno == logging.CRITICAL: 34 | fmt = date + ' ' + \ 35 | colored('ERR', 'red', attrs=['underline']) + ' ' + msg 36 | else: 37 | fmt = date + ' ' + msg 38 | 39 | if hasattr(self, '_style'): 40 | # Python3 compatibilty 41 | self._style._fmt = fmt 42 | self._fmt = fmt 43 | 44 | return super(self.__class__, self).format(record) 45 | 46 | 47 | _logger = logging.getLogger('joint_embedding') 48 | _logger.propagate = False 49 | _logger.setLevel(logging.INFO) 50 | 51 | # set the console output handler 52 | con_handler = logging.StreamHandler(sys.stdout) 53 | con_handler.setFormatter(_MyFormatter(datefmt='%m%d %H:%M:%S')) 54 | _logger.addHandler(con_handler) 55 | 56 | 57 | class GLOBAL_PATH(object): 58 | 59 | def __init__(self, path=None): 60 | if path is None: 61 | path = os.getcwd() 62 | self.path = path 63 | 64 | def _set_path(self, path): 65 | self.path = path 66 | 67 | def _get_path(self): 68 | return self.path 69 | 70 | 71 | PATH = GLOBAL_PATH() 72 | 73 | 74 | # set the file output handler 75 | def set_file_handler(path=None, prefix='', time_str=''): 76 | if time_str == '': 77 | file_name = prefix + \ 78 | datetime.datetime.now().strftime("%A_%d_%B_%Y_%I:%M%p") + '.log' 79 | else: 80 | file_name = prefix + time_str + '.log' 81 | 82 | path = os.path.abspath(path) 83 | 84 | path = os.path.join(path, file_name) 85 | if not os.path.exists(path): 86 | os.makedirs(path) 87 | 88 | PATH._set_path(path) 89 | # from tensorboard_logger import configure 90 | # configure(path) 91 | 92 | file_handler = logging.FileHandler( 93 | filename=os.path.join(path, 'logger.log'), encoding='utf-8', mode='w' 94 | ) 95 | file_handler.setFormatter(_MyFormatter(datefmt='%m%d %H:%M:%S')) 96 | _logger.addHandler(file_handler) 97 | 98 | _logger.info('Log file set to {}'.format(path)) 99 | return 100 | 101 | 102 | def _get_path(): 103 | return PATH._get_path() 104 | 105 | 106 | _LOGGING_METHOD = ['info', 'warning', 'error', 'critical', 107 | 'warn', 'exception', 'debug'] 108 | 109 | # export logger functions 110 | for func in _LOGGING_METHOD: 111 | locals()[func] = getattr(_logger, func) 112 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .cem import CEMOptimizer 2 | from .random import RandomOptimizer 3 | from .gbp_rs import GBPRandomOptimizer 4 | from .gbp_cem import GBPCEMOptimizer 5 | from .POPLIN_A import POPLINAOptimizer 6 | from .POPLIN_P import POPLINPOptimizer 7 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/cem.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | import scipy.stats as stats 8 | from dmbrl.misc import logger 9 | 10 | from .optimizer import Optimizer 11 | 12 | 13 | class CEMOptimizer(Optimizer): 14 | """A Tensorflow-compatible CEM optimizer. 15 | """ 16 | 17 | def __init__(self, sol_dim, max_iters, popsize, num_elites, tf_session=None, 18 | upper_bound=None, lower_bound=None, epsilon=0.001, alpha=0.25, 19 | params=None): 20 | """Creates an instance of this class. 21 | 22 | Arguments: 23 | sol_dim (int): The dimensionality of the problem space 24 | max_iters (int): The maximum number of iterations to perform during optimization 25 | popsize (int): The number of candidate solutions to be sampled at every iteration 26 | num_elites (int): The number of top solutions that will be used to obtain the distribution 27 | at the next iteration. 28 | tf_session (tf.Session): (optional) Session to be used for this optimizer. Defaults to None, 29 | in which case any functions passed in cannot be tf.Tensor-valued. 30 | upper_bound (np.array): An array of upper bounds 31 | lower_bound (np.array): An array of lower bounds 32 | epsilon (float): A minimum variance. If the maximum variance drops below epsilon, optimization is 33 | stopped. 34 | alpha (float): Controls how much of the previous mean and variance is used for the next iteration. 35 | next_mean = alpha * old_mean + (1 - alpha) * elite_mean, and similarly for variance. 36 | """ 37 | from dmbrl.modeling.models import GT_dynamics 38 | self._gt_compile_cost = GT_dynamics.compile_cost 39 | super().__init__() 40 | self.sol_dim, self.max_iters, self.popsize, self.num_elites = \ 41 | sol_dim, max_iters, popsize, num_elites 42 | self.ub, self.lb = upper_bound, lower_bound 43 | self.epsilon, self.alpha = epsilon, alpha 44 | self.tf_sess = tf_session 45 | self.debug = False 46 | 47 | self._params = params 48 | 49 | if num_elites > popsize: 50 | raise ValueError("Number of elites must be at most the population size.") 51 | 52 | if self.tf_sess is not None: 53 | with self.tf_sess.graph.as_default(): 54 | with tf.variable_scope("CEMSolver"): 55 | self.init_mean = tf.placeholder(dtype=tf.float32, shape=[sol_dim]) 56 | self.init_var = tf.placeholder(dtype=tf.float32, shape=[sol_dim]) 57 | 58 | self.num_opt_iters, self.mean, self.var = None, None, None 59 | self.tf_compatible, self.cost_function = None, None 60 | 61 | if self._params.il_cfg.use_gt_dynamics: 62 | self._dynamics = GT_dynamics.GT(self._params) 63 | 64 | def setup(self, cost_function, tf_compatible): 65 | """Sets up this optimizer using a given cost function. 66 | 67 | Arguments: 68 | cost_function (func): A function for computing costs over a batch of candidate solutions. 69 | tf_compatible (bool): True if the cost function provided is tf.Tensor-valued. 
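                    The cost function is expected to map a [popsize, sol_dim]
                    batch of candidate action sequences to a [popsize] vector
                    of costs; this is what the elite selection below assumes.
                    In this codebase it is presumably constructed by the MPC
                    controller from the learned dynamics model.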
70 | 71 | Returns: None 72 | """ 73 | if tf_compatible and self.tf_sess is None: 74 | raise RuntimeError("Cannot pass in a tf.Tensor-valued cost function without passing in a TensorFlow " 75 | "session into the constructor") 76 | 77 | self.tf_compatible = tf_compatible 78 | 79 | if not tf_compatible: 80 | self.cost_function = cost_function 81 | else: 82 | def continue_optimization(t, mean, var, best_val, best_sol): 83 | return tf.logical_and(tf.less(t, self.max_iters), tf.reduce_max(var) > self.epsilon) 84 | 85 | def iteration(t, mean, var, best_val, best_sol): 86 | lb_dist, ub_dist = mean - self.lb, self.ub - mean 87 | constrained_var = tf.minimum(tf.minimum(tf.square(lb_dist / 2), tf.square(ub_dist / 2)), var) 88 | samples = tf.truncated_normal([self.popsize, self.sol_dim], mean, tf.sqrt(constrained_var)) 89 | 90 | costs = cost_function(samples) 91 | values, indices = tf.nn.top_k(-costs, k=self.num_elites, sorted=True) 92 | 93 | best_val, best_sol = tf.cond( 94 | tf.less(-values[0], best_val), 95 | lambda: (-values[0], samples[indices[0]]), 96 | lambda: (best_val, best_sol) 97 | ) 98 | 99 | elites = tf.gather(samples, indices) 100 | new_mean = tf.reduce_mean(elites, axis=0) 101 | new_var = tf.reduce_mean(tf.square(elites - new_mean), axis=0) 102 | 103 | mean = self.alpha * mean + (1 - self.alpha) * new_mean 104 | var = self.alpha * var + (1 - self.alpha) * new_var 105 | 106 | return t + 1, mean, var, best_val, best_sol 107 | 108 | with self.tf_sess.graph.as_default(): 109 | self.num_opt_iters, self.mean, self.var, self.best_val, self.best_sol = tf.while_loop( 110 | cond=continue_optimization, body=iteration, 111 | loop_vars=[0, self.init_mean, self.init_var, float("inf"), self.init_mean] 112 | ) 113 | 114 | def reset(self): 115 | pass 116 | 117 | def obtain_solution(self, init_mean, init_var, per, dU, obs=None): 118 | """Optimizes the cost function using the provided initial candidate distribution 119 | 120 | Arguments: 121 | init_mean (np.ndarray): The mean of the initial candidate distribution. 122 | init_var (np.ndarray): The variance of the initial candidate distribution. 
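                per, dU: Forwarded to update_prev_sol() when building the
                    warm-start mean for the next timestep; dU is the action
                    dimensionality.
                obs (np.ndarray): Current observation; only used by the
                    non-TF branch, which plans against the ground-truth
                    dynamics.

            Returns: (sol, prev_sol), where sol is the optimized flattened
                action-sequence mean and prev_sol is the re-initialized mean
                (from update_prev_sol) to be used at the next timestep.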
123 | """ 124 | if self.tf_compatible: 125 | sol, solvar = self.tf_sess.run( 126 | [self.mean, self.var], 127 | feed_dict={self.init_mean: init_mean, self.init_var: init_var} 128 | ) 129 | else: 130 | assert self._params.il_cfg.use_gt_dynamics 131 | mean, var, t = init_mean, init_var, 0 132 | X = stats.truncnorm(-2, 2, loc=np.zeros_like(mean), scale=np.ones_like(mean)) 133 | 134 | cfg = {'plan_hor': self._params.opt_cfg.plan_hor, 135 | 'dU': self._params.env.action_space.shape[0]} 136 | while (t < self.max_iters) and np.max(var) > self.epsilon: 137 | lb_dist, ub_dist = mean - self.lb, self.ub - mean 138 | constrained_var = np.minimum(np.minimum(np.square(lb_dist / 2), np.square(ub_dist / 2)), var) 139 | 140 | samples = X.rvs(size=[self.popsize, self.sol_dim]) * np.sqrt(constrained_var) + mean 141 | costs = self._gt_compile_cost( 142 | obs, samples, cfg, self._dynamics, 143 | self._dynamics._numpy_reward_function 144 | ) 145 | costs = np.reshape(costs, [-1]) 146 | elites = samples[np.argsort(costs)][:self.num_elites] 147 | 148 | new_mean = np.mean(elites, axis=0) 149 | new_var = np.var(elites, axis=0) 150 | 151 | mean = self.alpha * mean + (1 - self.alpha) * new_mean 152 | var = self.alpha * var + (1 - self.alpha) * new_var 153 | logger.info('variance of elite: {}'.format(np.var(elites))) 154 | logger.info('Mean perforamnce: {}'.format( 155 | np.mean(costs[np.argsort(costs)][:self.num_elites])) 156 | ) 157 | 158 | t += 1 159 | sol, solvar = mean, var 160 | sol = np.reshape(sol, [-1]) 161 | 162 | # prev_sol is going to be used next timestep 163 | prev_sol = self.update_prev_sol(per, dU, sol) 164 | return sol, prev_sol 165 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/gbp_rs.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from .optimizer import Optimizer 9 | from dmbrl.misc import logger 10 | 11 | 12 | class GBPRandomOptimizer(Optimizer): 13 | """ @brief: use gradient based planning to update the policy network 14 | """ 15 | 16 | def __init__(self, sol_dim, popsize, tf_session, 17 | upper_bound=None, lower_bound=None, params=None): 18 | """Creates an instance of this class. 19 | 20 | Arguments: 21 | sol_dim (int): The dimensionality of the problem space 22 | popsize (int): The number of candidate solutions to be sampled at every iteration 23 | num_elites (int): The number of top solutions that will be used to obtain the distribution 24 | at the next iteration. 25 | tf_session (tf.Session): (optional) Session to be used for this optimizer. Defaults to None, 26 | in which case any functions passed in cannot be tf.Tensor-valued. 27 | upper_bound (np.array): An array of upper bounds 28 | lower_bound (np.array): An array of lower bounds 29 | """ 30 | super().__init__() 31 | self._params = params 32 | self._print_count = 0 33 | 34 | self.sol_dim = sol_dim 35 | self.popsize = popsize 36 | self.ub, self.lb = upper_bound, lower_bound 37 | self.tf_sess = tf_session 38 | self.solution = None 39 | self.tf_compatible, self.cost_function = None, None 40 | 41 | self._debug = {} 42 | self._debug['old_sol'] = 0.0 43 | self._debug_start = False 44 | 45 | def setup(self, cost_function, tf_compatible): 46 | """Sets up this optimizer using a given cost function. 
47 | 48 | Arguments: 49 | cost_function (func): A function for computing costs over a batch of candidate solutions. 50 | tf_compatible (bool): True if the cost function provided is tf.Tensor-valued. 51 | 52 | Returns: None 53 | """ 54 | if tf_compatible and self.tf_sess is None: 55 | raise RuntimeError("Cannot pass in a tf.Tensor-valued cost function without passing in a TensorFlow " 56 | "session into the constructor") 57 | 58 | if not tf_compatible: 59 | self.tf_compatible = False 60 | self.cost_function = cost_function 61 | else: 62 | with self.tf_sess.graph.as_default(): 63 | self.tf_compatible = True 64 | self._candidate_solutions = tf.Variable( 65 | np.random.uniform(self.lb, self.ub, [self.popsize, self.sol_dim]), 66 | dtype=tf.float32 67 | ) 68 | self.tf_sess.run( 69 | tf.variables_initializer([self._candidate_solutions]) 70 | ) 71 | 72 | self._costs = costs = cost_function(self._candidate_solutions) 73 | self._choice = tf.argmin(costs) 74 | self.solution = \ 75 | self._candidate_solutions[tf.cast(self._choice, tf.int32)] 76 | 77 | # the update loss 78 | self._adam_optimizer = \ 79 | tf.train.AdamOptimizer(learning_rate=self._params.gbp_cfg.lr) 80 | self._planning_optimizer = self._adam_optimizer.minimize( 81 | costs, var_list=[self._candidate_solutions] 82 | ) 83 | self.tf_sess.run( 84 | tf.variables_initializer(self._adam_optimizer.variables()) 85 | ) 86 | self._average_cost = tf.reduce_mean(costs) 87 | self._min_cost = tf.reduce_min(costs) 88 | self._values, self._indices = tf.nn.top_k(-costs, k=10, sorted=True) 89 | 90 | # debug information 91 | self._debug_actions = self.solution 92 | 93 | def reset(self): 94 | pass 95 | 96 | def obtain_solution(self, init_mean, init_var, per, dU, obs=None): 97 | """Optimizes the cost function provided in setup(). 98 | do gradient based planning 99 | 100 | Arguments: 101 | init_mean (np.ndarray): The mean of the initial candidate distribution. 102 | init_var (np.ndarray): The variance of the initial candidate distribution. 
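            Note: the routine below warm-starts the candidate pool by shifting the previous solutions
            forward by one control step and refilling the freed trailing slots with uniform noise (the
            hard-coded width 6 appears to assume the halfcheetah action dimension), then refines all
            candidates with gbp_cfg.plan_iter Adam steps directly on the planning cost. A rough sketch,
            writing dU for the action dimension and adam_step/cost as stand-in names:

                pool = np.concatenate(
                    [old_pool[:, dU:], np.random.uniform(lb, ub, [popsize, dU])], axis=1)
                for _ in range(plan_iter):
                    pool = adam_step(cost, pool)      # one gradient step on the planning cost
                sol = pool[np.argmin(cost(pool))]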
103 | """ 104 | assert self.tf_compatible 105 | self._print_count = (self._print_count + 1) % 20 106 | self._print = self._print_count == 0 107 | 108 | # step 1: initialize the action candidates TODO: use init_mean 109 | self._old_solutions = np.concatenate( 110 | [self.tf_sess.run(self._candidate_solutions)[:, 6:], 111 | np.random.uniform(self.lb[0], self.ub[0], [self.popsize, 6])], 112 | axis=1 113 | ) 114 | self._candidate_solutions.load(self._old_solutions, self.tf_sess) 115 | 116 | avg_cost, min_cost = self.tf_sess.run( 117 | [self._average_cost, self._min_cost] 118 | ) 119 | if self._print: 120 | logger.info('Init -> Avg_cost: %.3f, Min_cost: %.3f' % 121 | (avg_cost, min_cost)) 122 | 123 | # step 2: do gradient based planning 124 | for gbp_iteration in range(self._params.gbp_cfg.plan_iter): 125 | _, avg_cost, min_cost = self.tf_sess.run( 126 | [self._planning_optimizer, self._average_cost, self._min_cost] 127 | ) 128 | avg_cost, min_cost = self.tf_sess.run( 129 | [self._average_cost, self._min_cost] 130 | ) 131 | if self._print: 132 | logger.info('Iter %d > Avg_cost: %.3f, Min_cost: %.3f' % 133 | (self._params.gbp_cfg.plan_iter, avg_cost, min_cost)) 134 | 135 | sol = self.tf_sess.run(self.solution) 136 | prev_sol = self.update_prev_sol(per, dU, sol) 137 | 138 | return sol, prev_sol 139 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/optimizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | from __future__ import division 4 | import numpy as np 5 | 6 | 7 | class Optimizer: 8 | 9 | def __init__(self, *args, **kwargs): 10 | self.sy_cur_obs = None 11 | self._proposed_act_seqs_ph = None 12 | pass 13 | 14 | def setup(self, cost_function, tf_compatible): 15 | raise NotImplementedError("Must be implemented in subclass.") 16 | 17 | def reset(self): 18 | raise NotImplementedError("Must be implemented in subclass.") 19 | 20 | def obtain_solution(self, *args, **kwargs): 21 | raise NotImplementedError("Must be implemented in subclass.") 22 | 23 | def get_policy_network(self): 24 | return None 25 | 26 | def train_policy_network(self): 27 | return False 28 | 29 | def set_sy_cur_obs(self, sy_cur_obs): 30 | # NOTE: it is a hack! be careful 31 | self.sy_cur_obs = sy_cur_obs 32 | 33 | def forward_policy_propose(self, predict_next_obs, sy_cur_obs): 34 | pass 35 | 36 | def reset_prev_sol(self, prev_sol): 37 | return prev_sol 38 | 39 | def update_prev_sol(self, per, dU, soln): 40 | prev_sol = np.concatenate([np.copy(soln)[per * dU:], 41 | np.zeros(per * dU)]) 42 | return prev_sol 43 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/policy_network/BC_A_policy.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # @author: 3 | # Tingwu Wang 4 | # ----------------------------------------------------------------------------- 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | from . import base_policy 9 | from . import tf_networks 10 | from dmbrl.misc import logger 11 | 12 | 13 | class policy_network(base_policy.base_policy_network): 14 | ''' @brief: 15 | In this object class, we define the network structure, the restore 16 | function and save function. 
17 | 18 | @self.args.training_scheme 19 | @BC-AR: (action space) behavior cloning with the real data 20 | @BC-AI: (action space) behavior cloning using imaginary dataset. 21 | 22 | @AVG-R: (weight space) behavior cloning by setting the weight to 23 | the average of the weights selected during sampling 24 | @BC-PR: (weight space) behavior cloning by distilling the policy 25 | produced by the weights during sampling 26 | @AVG-I: (weight space) AVG-R but with imaginary dataset 27 | @BC-PI: (weight space) BC-PR but with imaginary dataset 28 | ''' 29 | 30 | def __init__(self, args, session, name_scope, 31 | observation_size, action_size): 32 | 33 | super(policy_network, self).__init__( 34 | args, session, name_scope, observation_size, action_size 35 | ) 36 | assert self.args.training_scheme in ['BC-AR', 'BC-AI'] 37 | assert self.args.cem_type in ['POPLINA-INIT', 'POPLINA-REPLAN'] 38 | 39 | def build_network(self): 40 | """ @brief: Note that build_network is only needed for the training 41 | """ 42 | network_shape = [self._observation_size] + \ 43 | self.args.policy_network_shape + [self._action_size] 44 | num_layer = len(network_shape) - 1 45 | act_type = ['tanh'] * (num_layer - 1) + [None] 46 | norm_type = [None] * (num_layer - 1) + [None] 47 | init_data = [] 48 | for _ in range(num_layer): 49 | init_data.append( 50 | {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0}, 51 | 'b_init_method': 'constant', 'b_init_para': {'val': 0.0}} 52 | ) 53 | init_data[-1]['w_init_para']['stddev'] = 0.01 # the output layer std 54 | 55 | self._MLP = tf_networks.MLP( 56 | dims=network_shape, scope='policy_mlp', train=True, 57 | activation_type=act_type, normalizer_type=norm_type, 58 | init_data=init_data 59 | ) 60 | 61 | # fetch all the trainable variables 62 | self._set_var_list() 63 | 64 | def build_loss(self): 65 | 66 | self._build_ph() 67 | self._tensor, self._update_operator = {}, {} 68 | 69 | # construct the input to the forward network, we normalize the state 70 | # input, and concatenate with the action 71 | self._tensor['normalized_start_state'] = ( 72 | self._input_ph['start_state'] - 73 | self._whitening_operator['state_mean'] 74 | ) / self._whitening_operator['state_std'] 75 | self._tensor['net_input'] = self._tensor['normalized_start_state'] 76 | 77 | # the output policy of the network 78 | self._tensor['action'] = self._MLP(self._tensor['net_input']) 79 | 80 | self._input_ph['target_action'] = tf.placeholder( 81 | tf.float32, [None, self._action_size], name='target_action' 82 | ) 83 | 84 | self._update_operator['loss'] = tf.reduce_mean( 85 | tf.square(self._input_ph['target_action'] - 86 | self._tensor['action']) 87 | ) 88 | 89 | self._update_operator['update_op'] = tf.train.AdamOptimizer( 90 | learning_rate=self.args.policy_lr, 91 | ).minimize(self._update_operator['loss']) 92 | logger.info("policy training learning rate: {}".format( 93 | self.args.policy_lr) 94 | ) 95 | 96 | def train(self, data_dict, training_info={}): 97 | 98 | # Step 1: update the running mean 99 | imaginary_dataset = training_info['imaginary_dataset'] 100 | 101 | # Step 2: data processing 102 | if self.args.training_scheme == 'BC-AR': 103 | data_dict['target_action'] = data_dict['action'] # for training 104 | elif self.args.training_scheme == 'BC-AI': 105 | # add imaginary data to the dataset 106 | for key in ['start_state', 'action']: 107 | data_dict[key] = \ 108 | np.concatenate([data_dict[key], imaginary_dataset[key]]) 109 | data_dict['target_action'] = data_dict['action'] # for training 110 | 111 | else: 112 | 
raise NotImplementedError 113 | 114 | self._set_whitening_var(data_dict['whitening_stats']) 115 | self.optimize_weights(data_dict, ['start_state', 'target_action']) 116 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/policy_network/BC_WA_policy.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # @author: 3 | # Tingwu Wang 4 | # ----------------------------------------------------------------------------- 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | from . import base_policy 9 | from . import tf_networks 10 | from . import tf_utils 11 | from dmbrl.misc import logger 12 | 13 | 14 | class policy_network(base_policy.base_policy_network): 15 | ''' @brief: 16 | In this object class, we define the network structure, the restore 17 | function and save function. 18 | 19 | @self.args.training_scheme 20 | @BC-AR: (action space) behavior cloning with the real data 21 | @BC-AI: (action space) behavior cloning using imaginary dataset. 22 | 23 | @AVG-R: (weight space) behavior cloning by setting the weight to 24 | the average of the weights selected during sampling 25 | @BC-PR: (weight space) behavior cloning by distilling the policy 26 | produced by the weights during sampling 27 | @AVG-I: (weight space) AVG-R but with imaginary dataset 28 | @BC-PI: (weight space) BC-PR but with imaginary dataset 29 | ''' 30 | 31 | def __init__(self, args, session, name_scope, 32 | observation_size, action_size): 33 | 34 | super(policy_network, self).__init__( 35 | args, session, name_scope, observation_size, action_size 36 | ) 37 | assert self.args.training_scheme in ['AVG-R', 'AVG-I'] 38 | assert self.args.cem_type in ['POPLINP-SEP', 'POPLINP-UNI'] 39 | 40 | def build_network(self): 41 | """ @brief: Note that build_network is only needed for the training 42 | """ 43 | network_shape = [self._observation_size] + \ 44 | self.args.policy_network_shape + [self._action_size] 45 | num_layer = len(network_shape) - 1 46 | act_type = ['tanh'] * (num_layer - 1) + [None] 47 | norm_type = [None] * (num_layer - 1) + [None] 48 | init_data = [] 49 | for _ in range(num_layer): 50 | init_data.append( 51 | {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0}, 52 | 'b_init_method': 'constant', 'b_init_para': {'val': 0.0}} 53 | ) 54 | init_data[-1]['w_init_para']['stddev'] = 0.01 # the output layer std 55 | 56 | self._MLP = tf_networks.W_MLP( 57 | dims=network_shape, scope='policy_mlp', train=True, 58 | activation_type=act_type, normalizer_type=norm_type, 59 | init_data=init_data 60 | ) 61 | 62 | # fetch all the trainable variables 63 | self._set_var_list() 64 | 65 | def build_loss(self): 66 | 67 | self._build_ph() 68 | self._tensor, self._update_operator = {}, {} 69 | 70 | self._MLP_var_list = self._MLP.get_variable_list() 71 | self._set_weight = tf_utils.set_network_weights( 72 | self._session, self._MLP_var_list, '' 73 | ) 74 | logger.info("policy training learning rate: {}".format( 75 | self.args.policy_lr) 76 | ) 77 | 78 | self._session.run(tf.variables_initializer(tf.global_variables())) 79 | 80 | # synchronize the two networks if needed 81 | if self.args.cem_type in ['POPLINP-SEP', 'POPLINP-UNI'] and \ 82 | self.args.training_scheme in ['BC-PR', 'BC-PI']: 83 | weight_dict = self._get_weight() # get from MLP 84 | self._set_weight(weight_dict) # set the target MLP 85 | 86 | def train(self, data_dict, training_info={}): 87 | 88 | # Step 
1: update the running mean 89 | imaginary_dataset = training_info['imaginary_dataset'] 90 | 91 | # Step 2: data processing 92 | if self.args.training_scheme in ['AVG-R']: 93 | data_dict['target_weight'] = data_dict['weight'] # for training 94 | data_dict['weight'] = data_dict['target_weight'] # for training 95 | 96 | elif self.args.training_scheme in ['AVG-I']: 97 | for key in ['start_state', 'weight']: 98 | data_dict[key] = \ 99 | np.concatenate([data_dict[key], imaginary_dataset[key]]) 100 | data_dict['target_weight'] = data_dict['weight'] # for training 101 | data_dict['weight'] = data_dict['target_weight'] # for training 102 | 103 | else: 104 | raise NotImplementedError 105 | 106 | # Step 3: parse the test set and train the network 107 | # get the average of the weights 108 | self._set_whitening_var(data_dict['whitening_stats']) 109 | average_weights = \ 110 | np.reshape(np.mean(data_dict['target_weight'], axis=0), [1, -1]) 111 | 112 | if self.args.zero_weight == 'yes': 113 | average_weights *= 0.0 114 | logger.warning('Using Zero Weights') 115 | weight_dict = \ 116 | self._MLP.parse_np_weight_vec_into_dict(average_weights) 117 | 118 | # set the weights 119 | self._set_weight(weight_dict) 120 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/policy_network/BC_WD_policy.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # @author: 3 | # Tingwu Wang 4 | # ----------------------------------------------------------------------------- 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | from . import base_policy 9 | from . import tf_networks 10 | from . import tf_utils 11 | from . import whitening_util 12 | from dmbrl.misc import logger 13 | 14 | 15 | class policy_network(base_policy.base_policy_network): 16 | ''' @brief: 17 | In this object class, we define the network structure, the restore 18 | function and save function. 19 | 20 | @self.args.training_scheme 21 | @BC-AR: (action space) behavior cloning with the real data 22 | @BC-AI: (action space) behavior cloning using imaginary dataset. 
23 | 24 | @AVG-R: (weight space) behavior cloning by setting the weight to 25 | the average of the weights selected during sampling 26 | @BC-PR: (weight space) behavior cloning by distilling the policy 27 | produced by the weights during sampling 28 | @AVG-I: (weight space) AVG-R but with imaginary dataset 29 | @BC-PI: (weight space) BC-PR but with imaginary dataset 30 | ''' 31 | 32 | def __init__(self, args, session, name_scope, 33 | observation_size, action_size): 34 | 35 | super(policy_network, self).__init__( 36 | args, session, name_scope, observation_size, action_size 37 | ) 38 | assert self.args.cem_type in ['POPLINP-SEP', 'POPLINP-UNI'] 39 | assert self.args.training_scheme in ['BC-PR', 'BC-PI'] 40 | 41 | def build_network(self): 42 | """ @brief: Note that build_network is only needed for the training 43 | """ 44 | network_shape = [self._observation_size] + \ 45 | self.args.policy_network_shape + [self._action_size] 46 | num_layer = len(network_shape) - 1 47 | act_type = ['tanh'] * (num_layer - 1) + [None] 48 | norm_type = [None] * (num_layer - 1) + [None] 49 | init_data = [] 50 | # TODO: be careful when it comes to batchnorm 51 | assert norm_type[0] is not 'batchnorm' and \ 52 | norm_type[0] is not 'batch_norm' 53 | 54 | for _ in range(num_layer): 55 | init_data.append( 56 | {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0}, 57 | 'b_init_method': 'constant', 'b_init_para': {'val': 0.0}} 58 | ) 59 | init_data[-1]['w_init_para']['stddev'] = 0.01 # the output layer std 60 | 61 | self._MLP = tf_networks.W_MLP( 62 | dims=network_shape, scope='policy_mlp', train=True, 63 | activation_type=act_type, normalizer_type=norm_type, 64 | init_data=init_data 65 | ) 66 | self._target_MLP = tf_networks.W_MLP( 67 | dims=network_shape, scope='target_policy_mlp', train=True, 68 | activation_type=act_type, normalizer_type=norm_type, 69 | init_data=init_data 70 | ) 71 | 72 | # fetch all the trainable variables 73 | self._set_var_list() 74 | 75 | def build_loss(self): 76 | """ @brief: the MLP is used to generate samples, 77 | while the target_MLP is used during the training. target_MLP is 78 | always older than the MLP, and we feed the dataset into target_MLP 79 | to train MLP. 80 | 81 | After each update, we synchronize target_MLP by copying weights from 82 | MLP. 
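            Loosely, the distillation objective assembled below is (illustrative notation, not the
            exact graph code):

                loss = mean || MLP(s; 0) - target_MLP(s; w_planned) ||^2

            i.e. the live MLP, queried with zero weight noise, is regressed onto the actions that
            target_MLP produces under the weight noise w_planned found during planning; the Adam
            update only touches the MLP variables, and train() then copies the MLP weights (and
            whitening statistics) back into target_MLP.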
83 | """ 84 | 85 | self._build_ph() 86 | self._tensor, self._update_operator = {}, {} 87 | whitening_util.add_whitening_operator( 88 | self._whitening_operator, self._whitening_variable, 89 | 'target_state', self._observation_size 90 | ) 91 | 92 | # the weight input_ph is always set to 0.0 93 | self._input_ph['weight'] = tf.placeholder( 94 | shape=[None, self._MLP.get_weight_size()], 95 | dtype=tf.float32, name='weight_noise' 96 | ) 97 | # the actual weight generated from the planning 98 | self._input_ph['target_weight'] = tf.placeholder( 99 | shape=[None, self._MLP.get_weight_size()], dtype=tf.float32, 100 | name='target_weight_noise' 101 | ) 102 | self._tensor['net_input'] = ( 103 | self._input_ph['start_state'] - 104 | self._whitening_operator['state_mean'] 105 | ) / self._whitening_operator['state_std'] 106 | self._tensor['target_net_input'] = ( 107 | self._input_ph['start_state'] - 108 | self._whitening_operator['target_state_mean'] 109 | ) / self._whitening_operator['target_state_std'] 110 | 111 | # the output policy of the network 112 | self._tensor['action'] = self._MLP( 113 | self._tensor['net_input'], self._input_ph['weight'] 114 | ) 115 | self._tensor['target_action'] = self._target_MLP( 116 | self._tensor['target_net_input'], 117 | self._input_ph['target_weight'] 118 | ) 119 | 120 | # the distillation loss 121 | self._update_operator['loss'] = tf.reduce_mean( 122 | tf.square(self._tensor['target_action'] - 123 | self._tensor['action']) 124 | ) 125 | self._target_MLP_var_list = self._target_MLP.get_variable_list() 126 | self._MLP_var_list = self._MLP.get_variable_list() 127 | 128 | self._update_operator['update_op'] = tf.train.AdamOptimizer( 129 | learning_rate=self.args.policy_lr, 130 | ).minimize(self._update_operator['loss'], 131 | var_list=self._MLP_var_list) 132 | logger.info("policy training learning rate: {}".format( 133 | self.args.policy_lr) 134 | ) 135 | 136 | # synchronize the weights 137 | self._get_weight = tf_utils.get_network_weights( 138 | self._session, self._MLP_var_list, 'policy_mlp' 139 | ) 140 | self._set_weight = tf_utils.set_network_weights( 141 | self._session, self._target_MLP_var_list, 'target_policy_mlp' 142 | ) 143 | 144 | self._session.run(tf.variables_initializer(tf.global_variables())) 145 | 146 | # synchronize the two networks if needed 147 | self._set_weight(self._get_weight()) # set the target MLP 148 | 149 | def train(self, data_dict, training_info={}): 150 | 151 | # Step 1: update the running mean 152 | imaginary_dataset = training_info['imaginary_dataset'] 153 | 154 | # Step 2: data processing 155 | if self.args.training_scheme in ['BC-PR']: 156 | data_dict['target_weight'] = data_dict['weight'] # for training 157 | data_dict['weight'] = 0.0 * data_dict['weight'] # for training 158 | 159 | elif self.args.training_scheme in ['BC-PI']: 160 | for key in ['start_state', 'weight']: 161 | data_dict[key] = \ 162 | np.concatenate([data_dict[key], imaginary_dataset[key]]) 163 | data_dict['target_weight'] = data_dict['weight'] # for training 164 | data_dict['weight'] = 0.0 * data_dict['weight'] # for training 165 | 166 | else: 167 | raise NotImplementedError 168 | 169 | self._set_whitening_var(data_dict['whitening_stats']) 170 | self.optimize_weights(data_dict, 171 | ['start_state', 'target_weight', 'weight']) 172 | 173 | # synchronize the networks 174 | whitening_util.copy_whitening_var(data_dict['whitening_stats'], 175 | 'state', 'target_state') 176 | whitening_util.set_whitening_var( 177 | self._session, self._whitening_operator, 178 | 
data_dict['whitening_stats'], ['target_state'] 179 | ) 180 | if self.args.zero_weight == 'yes': 181 | logger.warning('Using Random Weights') 182 | else: 183 | self._set_weight(self._get_weight()) 184 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/policy_network/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WilsonWangTHU/POPLIN/edd8dba50f9049c6164eda774602bef0c299cb51/dmbrl/misc/optimizers/policy_network/__init__.py -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/policy_network/base_policy.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # @author: 3 | # Tingwu Wang 4 | # ----------------------------------------------------------------------------- 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from . import whitening_util 9 | from . import tf_utils 10 | from dmbrl.misc import logger 11 | 12 | 13 | def limit_action(action, lb=-1, ub=1): 14 | 15 | return tf.minimum(tf.maximum(action, lb), ub) 16 | 17 | 18 | class base_policy_network(object): 19 | ''' 20 | @brief: 21 | In this object class, we define the network structure, the restore 22 | function and save function. 23 | It will only be called in the agent/agent.py 24 | ''' 25 | 26 | def __init__(self, args, session, name_scope, 27 | observation_size, action_size): 28 | self.args = args 29 | 30 | self._session = session 31 | self._name_scope = name_scope 32 | 33 | self._observation_size = observation_size 34 | self._action_size = action_size 35 | 36 | # self._task_name = args.task_name 37 | self._network_shape = args.policy_network_shape 38 | 39 | self._npr = np.random.RandomState(args.seed) 40 | 41 | self._whitening_operator = {} 42 | self._whitening_variable = [] 43 | 44 | def build_network(self): 45 | raise NotImplementedError 46 | 47 | def build_loss(self): 48 | raise NotImplementedError 49 | 50 | def _build_ph(self): 51 | 52 | # initialize the running mean and std (whitening) 53 | whitening_util.add_whitening_operator( 54 | self._whitening_operator, self._whitening_variable, 55 | 'state', self._observation_size 56 | ) 57 | 58 | # initialize the input placeholder 59 | self._input_ph = { 60 | 'start_state': tf.placeholder( 61 | tf.float32, [None, self._observation_size], name='start_state' 62 | ) 63 | } 64 | 65 | def get_input_placeholder(self): 66 | return self._input_ph 67 | 68 | def get_weights(self): 69 | return None 70 | 71 | def set_weights(self, weights_dict): 72 | pass 73 | 74 | def forward_network(self, observation, weight_vec=None): 75 | normalized_start_state = ( 76 | observation - self._whitening_operator['state_mean'] 77 | ) / self._whitening_operator['state_std'] 78 | 79 | # the output policy of the network 80 | if weight_vec is None: 81 | action = self._MLP(normalized_start_state) 82 | else: 83 | action = self._MLP(normalized_start_state, weight_vec) 84 | 85 | action = limit_action(action) 86 | 87 | return action 88 | 89 | def _set_var_list(self): 90 | # collect the tf variable and the trainable tf variable 91 | self._trainable_var_list = [var for var in tf.trainable_variables() 92 | if self._name_scope in var.name] 93 | 94 | self._all_var_list = [var for var in tf.global_variables() 95 | if self._name_scope in var.name] 96 | 97 | # the weights that actually matter 98 | self._network_var_list = \ 
99 | self._trainable_var_list + self._whitening_variable 100 | 101 | self._set_network_weights = tf_utils.set_network_weights( 102 | self._session, self._network_var_list, self._name_scope 103 | ) 104 | 105 | self._get_network_weights = tf_utils.get_network_weights( 106 | self._session, self._network_var_list, self._name_scope 107 | ) 108 | 109 | def load_checkpoint(self, ckpt_path): 110 | pass 111 | 112 | def save_checkpoint(self, ckpt_path): 113 | pass 114 | 115 | def get_whitening_operator(self): 116 | return self._whitening_operator 117 | 118 | def _set_whitening_var(self, whitening_stats): 119 | whitening_util.set_whitening_var( 120 | self._session, self._whitening_operator, whitening_stats, ['state'] 121 | ) 122 | 123 | def train(self, data_dict, replay_buffer, training_info={}): 124 | raise NotImplementedError 125 | 126 | def eval(self, data_dict): 127 | raise NotImplementedError 128 | 129 | def act(self, data_dict): 130 | raise NotImplementedError 131 | 132 | def optimize_weights(self, data_dict, training_keys): 133 | 134 | test_set_id = np.arange(len(data_dict['start_state'])) 135 | num_test_data = int(len(test_set_id) * self.args.pct_testset) 136 | self._npr.shuffle(test_set_id) 137 | test_set = {key: data_dict[key][test_set_id][:num_test_data] 138 | for key in training_keys} 139 | train_set = {key: data_dict[key][test_set_id][num_test_data:] 140 | for key in training_keys} 141 | test_error = old_test_error = np.inf 142 | 143 | # supervised training the behavior (behavior cloning) 144 | for epoch in range(self.args.policy_epochs): 145 | total_batch_len = len(train_set['start_state']) 146 | total_batch_inds = np.arange(total_batch_len) 147 | self._npr.shuffle(total_batch_inds) 148 | num_minibatch = \ 149 | max(total_batch_len // self.args.minibatch_size, 1) 150 | train_error = [] 151 | 152 | for start in range(num_minibatch): 153 | start = start * self.args.minibatch_size 154 | end = min(start + self.args.minibatch_size, total_batch_len) 155 | batch_inds = total_batch_inds[start: end] 156 | feed_dict = {self._input_ph[key]: data_dict[key][batch_inds] 157 | for key in training_keys} 158 | 159 | error, _ = self._session.run( 160 | [self._update_operator['loss'], 161 | self._update_operator['update_op']], feed_dict=feed_dict 162 | ) 163 | train_error.append(error) 164 | 165 | # see the test error 166 | feed_dict = {self._input_ph[key]: test_set[key] 167 | for key in training_keys} 168 | 169 | test_error = self._session.run( 170 | self._update_operator['loss'], feed_dict=feed_dict 171 | ) 172 | logger.info('Epoch %d; Train Error: %.6f; Test Error: %.6f' % 173 | (epoch, np.mean(train_error), test_error)) 174 | 175 | if test_error > old_test_error and epoch % 5 == 0: 176 | # TODO: MAKE A COUNTER HERE 177 | logger.info('Early stoping') 178 | break 179 | else: 180 | old_test_error = test_error 181 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/policy_network/gmm_util.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # @brief: 3 | # ----------------------------------------------------------------------------- 4 | 5 | import numpy as np 6 | 7 | 8 | def get_conditional_gaussian(mean, cov, observation_size): 9 | """ @brief: see the function with the same name in mbbl 10 | 11 | y = f_c + f_d.dot(x) 12 | cov(y) = pi_cov 13 | """ 14 | 15 | condition_size = observation_size 16 | pi_x = 
np.linalg.solve(cov[:condition_size, :condition_size], 17 | cov[:condition_size, condition_size:]).T 18 | pi_c = mean[condition_size:] - pi_x.dot(mean[:condition_size]) 19 | pi_cov = cov[condition_size:, condition_size:] - \ 20 | pi_x.dot(cov[:condition_size, :condition_size]).dot(pi_x.T) 21 | pi_cov = 0.5 * (pi_cov + pi_cov.T) 22 | 23 | # return {'pol_k': pi_c, 'pol_K': pi_x, 'pol_S': pi_cov} 24 | return {'f_c': pi_c, 'f_d': pi_x, 'cov': pi_cov} 25 | 26 | 27 | def get_gmm_posterior(gmm, gmm_weights, data): 28 | """ @brief: see the function with the same name in mbbl 29 | """ 30 | 31 | # posterior mean of gmm (C --> num_cluster, N --> num_data) 32 | response = gmm.predict_proba(np.reshape(data, [1, -1])) # (N, C) 33 | # (C, 1) 34 | avg_response = np.reshape(np.mean(np.array(response), axis=0), [-1, 1]) 35 | pos_mean = np.mean(avg_response * gmm_weights['mean'], axis=0) # (Vec) 36 | 37 | # posterior cov = (sum_i) res_i * (cov_i + \mu_i(\mu_i - \mu)^T) 38 | diff_mu = gmm_weights['mean'] - np.expand_dims(pos_mean, axis=0) # (C, Vec) 39 | mui_mui_muT = np.expand_dims(gmm_weights['mean'], axis=1) * \ 40 | np.expand_dims(diff_mu, axis=2) # (C, Vec, Vec), the outer product 41 | response_expand = np.expand_dims(avg_response, axis=2) 42 | pos_cov = np.sum((gmm_weights['cov'] + mui_mui_muT) * 43 | response_expand, axis=0) 44 | 45 | return pos_mean, pos_cov 46 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/policy_network/tf_norm.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # @brief: define the batchnorm and layernorm in this function 3 | # ------------------------------------------------------------------------------ 4 | 5 | import tensorflow as tf 6 | 7 | 8 | def layer_norm(x, name_scope, epsilon=1e-5, use_bias=True, 9 | use_scale=True, gamma_init=None, data_format='NHWC'): 10 | """ 11 | @Brief: code modified from ppwwyyxx github.com/ppwwyyxx/tensorpack/, 12 | under layer_norm.py. 13 | Layer Normalization layer, as described in the paper: 14 | https://arxiv.org/abs/1607.06450. 15 | @input: 16 | x (tf.Tensor): a 4D or 2D tensor. When 4D, the layout should 17 | match data_format. 
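        Roughly, the op below computes, per example and with the statistics taken over all
        non-batch axes (illustrative formula):

            y = gamma * (x - mean(x)) / sqrt(var(x) + epsilon) + beta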
18 | """ 19 | with tf.variable_scope(name_scope): 20 | shape = x.get_shape().as_list() 21 | ndims = len(shape) 22 | assert ndims in [2, 4] 23 | 24 | mean, var = tf.nn.moments(x, list(range(1, len(shape))), keep_dims=True) 25 | 26 | if data_format == 'NCHW': 27 | chan = shape[1] 28 | new_shape = [1, chan, 1, 1] 29 | else: 30 | chan = shape[-1] 31 | new_shape = [1, 1, 1, chan] 32 | if ndims == 2: 33 | new_shape = [1, chan] 34 | 35 | if use_bias: 36 | beta = tf.get_variable( 37 | 'beta', [chan], initializer=tf.constant_initializer() 38 | ) 39 | beta = tf.reshape(beta, new_shape) 40 | else: 41 | beta = tf.zeros([1] * ndims, name='beta') 42 | if use_scale: 43 | if gamma_init is None: 44 | gamma_init = tf.constant_initializer(1.0) 45 | gamma = tf.get_variable('gamma', [chan], initializer=gamma_init) 46 | gamma = tf.reshape(gamma, new_shape) 47 | else: 48 | gamma = tf.ones([1] * ndims, name='gamma') 49 | 50 | ret = tf.nn.batch_normalization( 51 | x, mean, var, beta, gamma, epsilon, name='output' 52 | ) 53 | return ret 54 | 55 | 56 | def batch_norm_with_train(x, name_scope, epsilon=1e-5, momentum=0.9): 57 | ret = tf.contrib.layers.batch_norm( 58 | x, decay=momentum, updates_collections=None, epsilon=epsilon, 59 | scale=True, is_training=True, scope=name_scope 60 | ) 61 | return ret 62 | 63 | 64 | def batch_norm_without_train(x, name_scope, epsilon=1e-5, momentum=0.9): 65 | ret = tf.contrib.layers.batch_norm( 66 | x, decay=momentum, updates_collections=None, epsilon=epsilon, 67 | scale=True, is_training=False, scope=name_scope 68 | ) 69 | return ret 70 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/policy_network/tf_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # @brief: 3 | # ----------------------------------------------------------------------------- 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | 9 | def get_weight_decay_loss(var_list): 10 | weight_decay_dict = {} 11 | weight_decay_sum = 0.0 12 | for var in var_list: 13 | i_weight_decay = tf.nn.l2_loss(var) 14 | weight_decay_dict[var.name] = i_weight_decay 15 | weight_decay_sum += i_weight_decay  # add this variable's L2 penalty to the total 16 | return weight_decay_sum, weight_decay_dict 17 | 18 | 19 | def logsigmoid(x): 20 | return -tf.nn.softplus(-x) 21 | 22 | 23 | def logit_bernoulli_entropy(logits): 24 | ent = (1.
- tf.nn.sigmoid(logits)) * logits - logsigmoid(logits) 25 | return ent 26 | 27 | 28 | def gauss_selfKL_firstfixed(mu, logstd): 29 | ''' 30 | @brief: 31 | KL divergence with itself, holding first argument fixed 32 | Use stop gradient to cut the gradient flows 33 | ''' 34 | mu1, logstd1 = map(tf.stop_gradient, [mu, logstd]) 35 | mu2, logstd2 = mu, logstd 36 | 37 | return gauss_KL(mu1, logstd1, mu2, logstd2) 38 | 39 | 40 | def gauss_log_prob(mu, logstd, x): 41 | # log-probability of action x under the parameterized Gaussian distribution 42 | var = tf.exp(2 * logstd) 43 | gp = - tf.square(x - mu) / (2 * var) \ 44 | - .5 * tf.log(tf.constant(2 * np.pi)) \ 45 | - logstd 46 | return tf.reduce_sum(gp, [1]) 47 | 48 | 49 | def gauss_KL(mu1, logstd1, mu2, logstd2): 50 | # KL divergence between two parameterized Gaussian distributions 51 | var1 = tf.exp(2 * logstd1) 52 | var2 = tf.exp(2 * logstd2) 53 | 54 | kl = tf.reduce_sum( 55 | logstd2 - logstd1 + (var1 + tf.square(mu1 - mu2)) / (2 * var2) - 0.5 56 | ) 57 | return kl 58 | 59 | 60 | def gauss_ent(mu, logstd): 61 | # Shannon entropy of a parameterized Gaussian distribution 62 | h = tf.reduce_sum( 63 | logstd + tf.constant(0.5 * np.log(2 * np.pi * np.e), tf.float32) 64 | ) 65 | return h 66 | 67 | 68 | def slice_2d(x, inds0, inds1): 69 | inds0 = tf.cast(inds0, tf.int64) 70 | inds1 = tf.cast(inds1, tf.int64) 71 | shape = tf.cast(tf.shape(x), tf.int64) 72 | ncols = shape[1] 73 | x_flat = tf.reshape(x, [-1]) 74 | return tf.gather(x_flat, inds0 * ncols + inds1) 75 | 76 | 77 | def var_shape(x): 78 | out = [k.value for k in x.get_shape()] 79 | assert all(isinstance(a, int) for a in out), \ 80 | "shape function assumes that shape is fully known" 81 | return out 82 | 83 | 84 | def numel(x): 85 | return np.prod(var_shape(x)) 86 | 87 | 88 | def l2_loss(var_list): 89 | l2_norm = tf.constant(0.)
90 | for var in var_list: 91 | l2_norm += tf.nn.l2_loss(var) 92 | return l2_norm 93 | 94 | 95 | def flatgrad(loss, var_list): 96 | grads = tf.gradients(loss, var_list) 97 | return tf.concat( 98 | [tf.reshape(grad, [numel(v)]) for (v, grad) in zip(var_list, grads)], 0 99 | ) 100 | 101 | 102 | class SetFromFlat(object): 103 | 104 | def __init__(self, session, var_list): 105 | self.session = session 106 | assigns = [] 107 | shapes = list(map(var_shape, var_list))  # materialize: the shapes are iterated twice below 108 | total_size = sum(np.prod(shape) for shape in shapes) 109 | self.theta = theta = tf.placeholder(tf.float32, [total_size]) 110 | start = 0 111 | assigns = [] 112 | for (shape, v) in zip(shapes, var_list): 113 | size = np.prod(shape) 114 | assigns.append( 115 | tf.assign(v, tf.reshape(theta[start:start + size], shape)) 116 | ) 117 | start += size 118 | self.op = tf.group(*assigns) 119 | 120 | def __call__(self, theta): 121 | self.session.run(self.op, feed_dict={self.theta: theta}) 122 | 123 | 124 | class GetFlat(object): 125 | 126 | def __init__(self, session, var_list): 127 | self.session = session 128 | self.op = tf.concat([tf.reshape(v, [numel(v)]) for v in var_list], 0) 129 | 130 | def __call__(self): 131 | return self.op.eval(session=self.session) 132 | 133 | 134 | class get_network_weights(object): 135 | """ @brief: 136 | call this function to get the weights in the policy network 137 | """ 138 | 139 | def __init__(self, session, var_list, base_namescope): 140 | self._session = session 141 | self._base_namescope = base_namescope 142 | # self._op is a dict; note that the base namescope is removed, as the 143 | # worker and the trainer have different base_namescopes 144 | self._op = { 145 | var.name.replace(self._base_namescope, ''): var 146 | for var in var_list 147 | } 148 | 149 | def __call__(self): 150 | return self._session.run(self._op) 151 | 152 | 153 | class set_network_weights(object): 154 | """ @brief: 155 | Call this function to set the weights in the policy network 156 | """ 157 | 158 | def __init__(self, session, var_list, base_namescope): 159 | self._session = session 160 | self._base_namescope = base_namescope 161 | 162 | self._var_list = var_list 163 | self._placeholders = {} 164 | self._assigns = [] 165 | 166 | with tf.get_default_graph().as_default(): 167 | for var in self._var_list: 168 | var_name = var.name.replace(self._base_namescope, '') 169 | self._placeholders[var_name] = tf.placeholder( 170 | tf.float32, var.get_shape() 171 | ) 172 | self._assigns.append( 173 | tf.assign(var, self._placeholders[var_name]) 174 | ) 175 | 176 | def __call__(self, weight_dict): 177 | assert len(weight_dict) == len(self._var_list) 178 | 179 | feed_dict = {} 180 | for var in self._var_list: 181 | var_name = var.name.replace(self._base_namescope, '') 182 | assert var_name in weight_dict 183 | feed_dict[self._placeholders[var_name]] = weight_dict[var_name] 184 | 185 | self._session.run(self._assigns, feed_dict) 186 | 187 | 188 | def xavier_initializer(shape): 189 | dim_sum = np.sum(shape) 190 | if len(shape) == 1: 191 | dim_sum += 1 192 | bound = np.sqrt(6.0 / dim_sum) 193 | return tf.random_uniform(shape, minval=-bound, maxval=bound) 194 | 195 | 196 | def fully_connected(input_layer, input_size, output_size, weight_init, 197 | bias_init, scope, trainable): 198 | with tf.variable_scope(scope): 199 | w = tf.get_variable( 200 | "w", [input_size, output_size], 201 | initializer=weight_init, trainable=trainable 202 | ) 203 | b = tf.get_variable( 204 | "b", [output_size], initializer=bias_init, trainable=trainable 205 | ) 206 |
return tf.matmul(input_layer, w) + b 207 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/policy_network/whitening_util.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # @author: 3 | # Tingwu Wang 4 | # ----------------------------------------------------------------------------- 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | _ALLOW_KEY = ['state', 'diff_state', 'action'] 10 | 11 | 12 | def init_whitening_stats(key_list): 13 | whitening_stats = {} 14 | for key in key_list: 15 | whitening_stats[key] = {'mean': 0.0, 'variance': 1, 'step': 0.01, 16 | 'square_sum': 0.01, 'sum': 0.0, 'std': np.nan} 17 | return whitening_stats 18 | 19 | 20 | def update_whitening_stats(whitening_stats, rollout_data, key): 21 | # collect the info 22 | new_sum, new_step_sum, new_sq_sum = 0.0, 0.0, 0.0 23 | 24 | if type(rollout_data) is dict: 25 | new_sum += rollout_data[key].sum(axis=0) 26 | new_sq_sum += (np.square(rollout_data[key])).sum(axis=0) 27 | new_step_sum += rollout_data[key].shape[0] 28 | else: 29 | assert type(rollout_data) is list 30 | for i_episode in rollout_data: 31 | if key == 'state': 32 | i_data = i_episode['obs'] 33 | elif key == 'action': 34 | i_data = i_episode['actions'] 35 | else: 36 | assert key == 'diff_state' 37 | i_data = i_episode['obs'][1:] - i_episode['obs'][:-1] 38 | 39 | new_sum += i_data.sum(axis=0) 40 | new_sq_sum += (np.square(i_data)).sum(axis=0) 41 | new_step_sum += i_data.shape[0] 42 | 43 | # update the whitening info 44 | whitening_stats[key]['step'] += new_step_sum 45 | whitening_stats[key]['sum'] += new_sum 46 | whitening_stats[key]['square_sum'] += new_sq_sum 47 | whitening_stats[key]['mean'] = \ 48 | whitening_stats[key]['sum'] / whitening_stats[key]['step'] 49 | whitening_stats[key]['variance'] = np.maximum( 50 | whitening_stats[key]['square_sum'] / whitening_stats[key]['step'] - 51 | np.square(whitening_stats[key]['mean']), 1e-2 52 | ) 53 | whitening_stats[key]['std'] = \ 54 | (whitening_stats[key]['variance'] + 1e-6) ** .5 55 | 56 | 57 | def add_whitening_operator(whitening_operator, whitening_variable, name, size): 58 | 59 | with tf.variable_scope('whitening_' + name): 60 | whitening_operator[name + '_mean'] = tf.Variable( 61 | np.zeros([1, size], np.float32), 62 | name=name + "_mean", trainable=False 63 | ) 64 | whitening_operator[name + '_std'] = tf.Variable( 65 | np.ones([1, size], np.float32), 66 | name=name + "_std", trainable=False 67 | ) 68 | whitening_variable.append(whitening_operator[name + '_mean']) 69 | whitening_variable.append(whitening_operator[name + '_std']) 70 | 71 | # the reset placeholders 72 | whitening_operator[name + '_mean_ph'] = tf.placeholder( 73 | tf.float32, shape=(1, size), name=name + '_reset_mean_ph' 74 | ) 75 | whitening_operator[name + '_std_ph'] = tf.placeholder( 76 | tf.float32, shape=(1, size), name=name + '_reset_std_ph' 77 | ) 78 | 79 | # the tensorflow operators 80 | whitening_operator[name + '_mean_op'] = \ 81 | whitening_operator[name + '_mean'].assign( 82 | whitening_operator[name + '_mean_ph'] 83 | ) 84 | 85 | whitening_operator[name + '_std_op'] = \ 86 | whitening_operator[name + '_std'].assign( 87 | whitening_operator[name + '_std_ph'] 88 | ) 89 | 90 | 91 | def copy_whitening_var(whitening_stats, input_name, output_name): 92 | whitening_stats[output_name] = {} 93 | whitening_stats[output_name]['mean'] = 
whitening_stats[input_name]['mean'] 94 | whitening_stats[output_name]['std'] = whitening_stats[input_name]['std'] 95 | 96 | 97 | def set_whitening_var(session, whitening_operator, whitening_stats, key_list): 98 | 99 | for i_key in key_list: 100 | for i_item in ['mean', 'std']: 101 | session.run( 102 | whitening_operator[i_key + '_' + i_item + '_op'], 103 | feed_dict={whitening_operator[i_key + '_' + i_item + '_ph']: 104 | np.reshape(whitening_stats[i_key][i_item], [1, -1])} 105 | ) 106 | 107 | 108 | def append_normalized_data_dict(data_dict, whitening_stats, 109 | target=['start_state', 'diff_state', 110 | 'end_state']): 111 | data_dict['n_start_state'] = \ 112 | (data_dict['start_state'] - whitening_stats['state']['mean']) / \ 113 | whitening_stats['state']['std'] 114 | data_dict['n_end_state'] = \ 115 | (data_dict['end_state'] - whitening_stats['state']['mean']) / \ 116 | whitening_stats['state']['std'] 117 | data_dict['n_diff_state'] = \ 118 | (data_dict['end_state'] - data_dict['start_state'] - 119 | whitening_stats['diff_state']['mean']) / \ 120 | whitening_stats['diff_state']['std'] 121 | data_dict['diff_state'] = \ 122 | data_dict['end_state'] - data_dict['start_state'] 123 | -------------------------------------------------------------------------------- /dmbrl/misc/optimizers/random.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from .optimizer import Optimizer 9 | 10 | 11 | class RandomOptimizer(Optimizer): 12 | 13 | def __init__(self, sol_dim, popsize, tf_session, 14 | upper_bound=None, lower_bound=None, params=None): 15 | """Creates an instance of this class. 16 | 17 | Arguments: 18 | sol_dim (int): The dimensionality of the problem space 19 | popsize (int): The number of candidate solutions to be sampled at every iteration 20 | num_elites (int): The number of top solutions that will be used to obtain the distribution 21 | at the next iteration. 22 | tf_session (tf.Session): (optional) Session to be used for this optimizer. Defaults to None, 23 | in which case any functions passed in cannot be tf.Tensor-valued. 24 | upper_bound (np.array): An array of upper bounds 25 | lower_bound (np.array): An array of lower bounds 26 | """ 27 | super().__init__() 28 | self.sol_dim = sol_dim 29 | self.popsize = popsize 30 | self.ub, self.lb = upper_bound, lower_bound 31 | self.tf_sess = tf_session 32 | self.solution = None 33 | self.tf_compatible, self.cost_function = None, None 34 | 35 | def setup(self, cost_function, tf_compatible): 36 | """Sets up this optimizer using a given cost function. 37 | 38 | Arguments: 39 | cost_function (func): A function for computing costs over a batch of candidate solutions. 40 | tf_compatible (bool): True if the cost function provided is tf.Tensor-valued. 
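                Note: random shooting has no inner optimization loop; setup() simply draws popsize
                candidates uniformly from [lb, ub] and keeps a handle to the cheapest one. A minimal
                NumPy-style sketch of the same computation:

                    solutions = np.random.uniform(lb, ub, [popsize, sol_dim])
                    best = solutions[np.argmin(cost_function(solutions))]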
41 | 42 | Returns: None 43 | """ 44 | if tf_compatible and self.tf_sess is None: 45 | raise RuntimeError("Cannot pass in a tf.Tensor-valued cost function without passing in a TensorFlow " 46 | "session into the constructor") 47 | 48 | if not tf_compatible: 49 | self.tf_compatible = False 50 | self.cost_function = cost_function 51 | else: 52 | with self.tf_sess.graph.as_default(): 53 | self.tf_compatible = True 54 | solutions = tf.random_uniform([self.popsize, self.sol_dim], self.lb, self.ub) 55 | costs = cost_function(solutions) 56 | self.solution = solutions[tf.cast(tf.argmin(costs), tf.int32)] 57 | 58 | def reset(self): 59 | pass 60 | 61 | def obtain_solution(self, init_mean, init_var, per, dU, obs=None): 62 | """Optimizes the cost function provided in setup(). 63 | 64 | Arguments: 65 | init_mean (np.ndarray): The mean of the initial candidate distribution. 66 | init_var (np.ndarray): The variance of the initial candidate distribution. 67 | """ 68 | if self.tf_compatible: 69 | sol = self.tf_sess.run(self.solution) 70 | return sol, self.update_prev_sol(per, dU, sol) 71 | else: 72 | solutions = np.random.uniform(self.lb, self.ub, [self.popsize, self.sol_dim]) 73 | costs = self.cost_function(solutions) 74 | return solutions[np.argmin(costs)], \ 75 | self.update_prev_sol(per, dU, solutions[np.argmin(costs)]) 76 | -------------------------------------------------------------------------------- /dmbrl/modeling/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .FC import FC -------------------------------------------------------------------------------- /dmbrl/modeling/models/GT_dynamics.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | 6 | import numpy as np 7 | 8 | 9 | def none_constructor(model_init_cfg, misc=None): 10 | return GT(None) 11 | 12 | 13 | def compile_cost(init_obs, ac_seqs, cfg, gt_dynamics, numpy_reward_function, 14 | traj_id=0, cem_type=None, tf_data_dict=None): 15 | assert cem_type is None 16 | assert tf_data_dict is None 17 | 18 | t, nopt = 0, ac_seqs.shape[0] 19 | init_costs = np.zeros([nopt, 1]) 20 | ac_seqs = np.reshape(ac_seqs, [-1, cfg['plan_hor'], cfg['dU']]) 21 | ac_seqs = np.transpose(ac_seqs, [1, 0, 2]) 22 | init_obs = np.tile(init_obs[None], [nopt, 1]) 23 | cur_obs = init_obs 24 | total_cost = init_costs 25 | 26 | expert_obs = gt_dynamics.expert_obs(traj_id) 27 | timestep_left = int(len(expert_obs) - init_obs[0, -1] - 1) 28 | 29 | plan_depth = min(cfg['plan_hor'], timestep_left) 30 | 31 | for i_iter in range(plan_depth): 32 | cur_acs = ac_seqs[i_iter]  # actions planned for this step of the rollout 33 | next_obs, _ = gt_dynamics.predict(cur_obs, cur_acs) 34 | 35 | ''' 36 | if i_iter == plan_depth - 1: 37 | delta_cost = -numpy_reward_function(next_obs, cur_acs, expert_obs) 38 | total_cost += delta_cost.reshape(total_cost.shape) 39 | else: 40 | delta_cost = 0.0 41 | ''' 42 | delta_cost = -numpy_reward_function(next_obs, cur_acs, expert_obs) 43 | total_cost += delta_cost.reshape(total_cost.shape) 44 | cur_obs = next_obs 45 | 46 | return total_cost 47 | 48 | 49 | class GT: 50 | """ @brief: groundtruth dynamics 51 | """ 52 | 53 | def __init__(self, params): 54 | """Initializes a class instance. 55 | 56 | Arguments: 57 | params (DotMap): A dotmap of model parameters. 58 | .name (str): Model name, used for logging/use in variable scopes. 59 | Warning: Models with the same name will overwrite each other.
60 | .num_networks (int): (optional) The number of networks in the ensemble. Defaults to 1. 61 | Ignored if model is being loaded. 62 | .model_dir (str/None): (optional) Path to directory from which model will be loaded, and 63 | saved by default. Defaults to None. 64 | .load_model (bool): (optional) If True, model will be loaded from the model directory, 65 | assuming that the files are generated by a model of the same name. Defaults to False. 66 | .sess (tf.Session/None): The session that this model will use. 67 | If None, creates a session with its own associated graph. Defaults to None. 68 | """ 69 | # Instance variables 70 | self.finalized = False 71 | self.layers, self.decays, self.optvars, self.nonoptvars = [], [], [], [] 72 | self.scaler = None 73 | 74 | # Training objects 75 | self.optimizer = None 76 | self.sy_train_in, self.sy_train_targ = None, None 77 | self.train_op, self.mse_loss = None, None 78 | 79 | # Prediction objects 80 | self.sy_pred_in2d, self.sy_pred_mean2d_fac = None, None 81 | self.sy_pred_mean2d, self.sy_pred_var2d = None, None 82 | self.sy_pred_in3d, self.sy_pred_mean3d_fac = None, None 83 | self.num_nets = 1 84 | 85 | # the groundtruth dynamics environment 86 | if params is not None: 87 | self.name = 'non_tensorflow' 88 | self.model_dir = params.get('model_dir', None) 89 | 90 | self._misc_args = params.misc 91 | misc_info = {'reset_type': 'gym', 'groundtruth_model': True, 92 | 'expert_amc_dir': params.il_cfg.expert_amc_dir, 93 | 'add_timestep_into_ob': True} 94 | 95 | # TODO: 96 | from dmbrl.env import im_dmhumanoid 97 | self._dynamics_env = im_dmhumanoid.IMDMHumanoid( 98 | 'cmu-humanoid-imitation', 1234, misc_info 99 | ) 100 | self._numpy_reward_function = im_dmhumanoid.numpy_reward_function 101 | self._dynamics_env.reset() 102 | 103 | def expert_obs(self, traj_id): 104 | return self._dynamics_env.expert_obs(traj_id) 105 | 106 | @property 107 | def is_probabilistic(self): 108 | return True if self.num_nets > 1 else False 109 | 110 | @property 111 | def is_tf_model(self): 112 | return False 113 | 114 | @property 115 | def sess(self): 116 | return None 117 | 118 | ################################### 119 | # Network Structure Setup Methods # 120 | ################################### 121 | 122 | def add(self, layer): 123 | pass 124 | 125 | def pop(self): 126 | pass 127 | 128 | def finalize(self, optimizer, optimizer_args=None, *args, **kwargs): 129 | self.finalized = True 130 | 131 | ################# 132 | # Model Methods # 133 | ################# 134 | 135 | def train(self, inputs, targets, batch_size=32, epochs=100, 136 | hide_progress=False, holdout_ratio=0.0, max_logging=5000): 137 | pass 138 | 139 | def predict(self, observations, actions): 140 | num_data = observations.shape[0] 141 | end_state = [] 142 | for i_data in range(num_data): 143 | i_end_state = self._dynamics_env.fdynamics( 144 | {'start_state': observations[i_data], 'action': actions[i_data]} 145 | ) 146 | end_state.append(i_end_state) 147 | return np.array(end_state), None 148 | 149 | def save(self, savedir=None): 150 | pass 151 | 152 | def _load_structure(self): 153 | pass 154 | 155 | ####################### 156 | # Compilation methods # 157 | ####################### 158 | 159 | def _compile_outputs(self, inputs): 160 | return None 161 | 162 | def _compile_losses(self, inputs, targets): 163 | return None 164 | -------------------------------------------------------------------------------- /dmbrl/modeling/models/TFGP.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | import gpflow 9 | 10 | from dmbrl.misc.DotmapUtils import get_required_argument 11 | from dmbrl.misc import logger 12 | 13 | 14 | class TFGP: 15 | def __init__(self, params): 16 | """Initializes class instance. 17 | 18 | Arguments: 19 | params 20 | .name (str): Model name 21 | .kernel_class (class): Kernel class 22 | .kernel_args (args): Kernel args 23 | .num_inducing_points (int): Number of inducing points 24 | .sess (tf.Session): Tensorflow session 25 | """ 26 | self.name = params.get("name", "GP") 27 | self.kernel_class = get_required_argument(params, "kernel_class", "Must provide kernel class.") 28 | self.kernel_args = params.get("kernel_args", {}) 29 | self.num_inducing_points = get_required_argument( 30 | params, "num_inducing_points", "Must provide number of inducing points." 31 | ) 32 | 33 | if params.get("sess", None) is None: 34 | config = tf.ConfigProto() 35 | config.gpu_options.allow_growth = True 36 | self._sess = tf.Session(config=config) 37 | else: 38 | self._sess = params.get("sess") 39 | 40 | with self._sess.as_default(): 41 | with tf.variable_scope(self.name): 42 | output_dim = self.kernel_args["output_dim"] 43 | del self.kernel_args["output_dim"] 44 | self.model = gpflow.models.SGPR( 45 | np.zeros([1, self.kernel_args["input_dim"]]), 46 | np.zeros([1, output_dim]), 47 | kern=self.kernel_class(**self.kernel_args), 48 | Z=np.zeros([self.num_inducing_points, self.kernel_args["input_dim"]]) 49 | ) 50 | self.model.initialize() 51 | 52 | @property 53 | def is_probabilistic(self): 54 | return True 55 | 56 | @property 57 | def sess(self): 58 | return self._sess 59 | 60 | @property 61 | def is_tf_model(self): 62 | return True 63 | 64 | def train(self, inputs, targets, 65 | *args, **kwargs): 66 | """Optimizes the parameters of the internal GP model. 67 | 68 | Arguments: 69 | inputs: (np.ndarray) An array of inputs. 70 | targets: (np.ndarray) An array of targets. 71 | num_restarts: (int) The number of times that the optimization of 72 | the GP will be restarted to obtain a good set of parameters. 73 | 74 | Returns: None. 75 | """ 76 | perm = np.random.permutation(inputs.shape[0]) 77 | inputs, targets = inputs[perm], targets[perm] 78 | Z = np.copy(inputs[:self.num_inducing_points]) 79 | if Z.shape[0] < self.num_inducing_points: 80 | Z = np.concatenate([Z, np.zeros([self.num_inducing_points - Z.shape[0], Z.shape[1]])]) 81 | self.model.X = inputs 82 | self.model.Y = targets 83 | self.model.feature.Z = Z 84 | with self.sess.as_default(): 85 | self.model.compile() 86 | logger.info("Optimizing model... ", end="") 87 | gpflow.train.ScipyOptimizer().minimize(self.model) 88 | logger.info("Done.") 89 | 90 | def predict(self, inputs, *args, **kwargs): 91 | """Returns the predictions of this model on inputs. 92 | 93 | Arguments: 94 | inputs: (np.ndarray) The inputs on which predictions will be returned. 95 | ign_var: (bool) If True, only returns the mean prediction 96 | 97 | Returns: (np.ndarrays) The mean and variance of the model on the new points. 
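            Note: prediction is delegated to GPflow's SGPR.predict_y, so a minimal usage sketch
            (assuming gp is a TFGP instance already trained on data of matching dimensionality):

                mean, var = gp.predict(new_inputs)   # each of shape [N, output_dim]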
98 | """ 99 | if self.model is None: 100 | raise RuntimeError("Cannot make predictions without initial batch of data.") 101 | 102 | with self.sess.as_default(): 103 | mean, var = self.model.predict_y(inputs) 104 | return mean, var 105 | 106 | def create_prediction_tensors(self, inputs, *args, **kwargs): 107 | "" 108 | if self.model is None: 109 | raise RuntimeError("Cannot make predictions without initial batch of data.") 110 | 111 | inputs = tf.cast(inputs, tf.float64) 112 | mean, var = self.model._build_predict(inputs, full_cov=False) 113 | return tf.cast(mean, dtype=tf.float32), tf.cast(var, tf.float32) 114 | 115 | def save(self, *args, **kwargs): 116 | pass 117 | -------------------------------------------------------------------------------- /dmbrl/modeling/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .BNN import BNN 2 | from .NN import NN 3 | from .TFGP import TFGP 4 | -------------------------------------------------------------------------------- /dmbrl/modeling/utils/TensorStandardScaler.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | 9 | class TensorStandardScaler: 10 | """Helper class for automatically normalizing inputs into the network. 11 | """ 12 | def __init__(self, x_dim): 13 | """Initializes a scaler. 14 | 15 | Arguments: 16 | x_dim (int): The dimensionality of the inputs into the scaler. 17 | 18 | Returns: None. 19 | """ 20 | self.fitted = False 21 | with tf.variable_scope("Scaler"): 22 | self.mu = tf.get_variable( 23 | name="scaler_mu", shape=[1, x_dim], initializer=tf.constant_initializer(0.0), 24 | trainable=False 25 | ) 26 | self.sigma = tf.get_variable( 27 | name="scaler_std", shape=[1, x_dim], initializer=tf.constant_initializer(1.0), 28 | trainable=False 29 | ) 30 | 31 | self.cached_mu, self.cached_sigma = np.zeros([0, x_dim]), np.ones([1, x_dim]) 32 | 33 | def fit(self, data): 34 | """Runs two ops, one for assigning the mean of the data to the internal mean, and 35 | another for assigning the standard deviation of the data to the internal standard deviation. 36 | This function must be called within a 'with .as_default()' block. 37 | 38 | Arguments: 39 | data (np.ndarray): A numpy array containing the input 40 | 41 | Returns: None. 42 | """ 43 | mu = np.mean(data, axis=0, keepdims=True) 44 | sigma = np.std(data, axis=0, keepdims=True) 45 | sigma[sigma < 1e-12] = 1.0 46 | 47 | self.mu.load(mu) 48 | self.sigma.load(sigma) 49 | self.fitted = True 50 | self.cache() 51 | 52 | def transform(self, data): 53 | """Transforms the input matrix data using the parameters of this scaler. 54 | 55 | Arguments: 56 | data (np.array): A numpy array containing the points to be transformed. 57 | 58 | Returns: (np.array) The transformed dataset. 59 | """ 60 | return (data - self.mu) / self.sigma 61 | 62 | def inverse_transform(self, data): 63 | """Undoes the transformation performed by this scaler. 64 | 65 | Arguments: 66 | data (np.array): A numpy array containing the points to be transformed. 67 | 68 | Returns: (np.array) The transformed dataset. 69 | """ 70 | return self.sigma * data + self.mu 71 | 72 | def get_vars(self): 73 | """Returns a list of variables managed by this object. 74 | 75 | Returns: (list) The list of variables. 
76 | """ 77 | return [self.mu, self.sigma] 78 | 79 | def cache(self): 80 | """Caches current values of this scaler. 81 | 82 | Returns: None. 83 | """ 84 | self.cached_mu = self.mu.eval() 85 | self.cached_sigma = self.sigma.eval() 86 | 87 | def load_cache(self): 88 | """Loads values from the cache 89 | 90 | Returns: None. 91 | """ 92 | self.mu.load(self.cached_mu) 93 | self.sigma.load(self.cached_sigma) 94 | -------------------------------------------------------------------------------- /dmbrl/modeling/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .TensorStandardScaler import TensorStandardScaler -------------------------------------------------------------------------------- /img/curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WilsonWangTHU/POPLIN/edd8dba50f9049c6164eda774602bef0c299cb51/img/curve.png -------------------------------------------------------------------------------- /img/policy_control.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WilsonWangTHU/POPLIN/edd8dba50f9049c6164eda774602bef0c299cb51/img/policy_control.png -------------------------------------------------------------------------------- /img/reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WilsonWangTHU/POPLIN/edd8dba50f9049c6164eda774602bef0c299cb51/img/reward.png -------------------------------------------------------------------------------- /img/table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WilsonWangTHU/POPLIN/edd8dba50f9049c6164eda774602bef0c299cb51/img/table.png -------------------------------------------------------------------------------- /mbexp.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import os 6 | import argparse 7 | import pprint 8 | import copy 9 | 10 | from dotmap import DotMap 11 | 12 | from dmbrl.misc.MBExp import MBExperiment 13 | from dmbrl.controllers.MPC import MPC 14 | from dmbrl.config import create_config 15 | from dmbrl.misc import logger 16 | 17 | 18 | def main(env, ctrl_type, ctrl_args, overrides, logdir, args): 19 | ctrl_args = DotMap(**{key: val for (key, val) in ctrl_args}) 20 | cfg = create_config(env, ctrl_type, ctrl_args, overrides, logdir) 21 | logger.info('\n' + pprint.pformat(cfg)) 22 | 23 | # add the part of popsize 24 | if ctrl_type == "MPC": 25 | cfg.exp_cfg.exp_cfg.policy = MPC(cfg.ctrl_cfg) 26 | 27 | cfg.exp_cfg.misc = copy.copy(cfg) 28 | exp = MBExperiment(cfg.exp_cfg) 29 | 30 | if not os.path.exists(exp.logdir): 31 | os.makedirs(exp.logdir) 32 | with open(os.path.join(exp.logdir, "config.txt"), "w") as f: 33 | f.write(pprint.pformat(cfg.toDict())) 34 | 35 | exp.run_experiment() 36 | 37 | 38 | if __name__ == "__main__": 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument('-env', type=str, required=True, 41 | help='Environment name: select from [cartpole, reacher, pusher, halfcheetah]') 42 | parser.add_argument('-ca', '--ctrl_arg', action='append', nargs=2, default=[], 43 | help='Controller arguments, see https://github.com/kchua/handful-of-trials#controller-arguments') 44 | parser.add_argument('-o', '--override', 
action='append', nargs=2, default=[], 45 | help='Override default parameters, see https://github.com/kchua/handful-of-trials#overrides') 46 | parser.add_argument('-logdir', type=str, default='log', 47 | help='Directory to which results will be logged (default: ./log)') 48 | parser.add_argument('-e_popsize', type=int, default=500, 49 | help='different popsize to use') 50 | args = parser.parse_args() 51 | 52 | main(args.env, "MPC", args.ctrl_arg, args.override, args.logdir, args) 53 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dotmap==1.2.20 2 | future==0.16.0 3 | gpflow 4 | gym==0.9.4 5 | mujoco-py==0.5.7 6 | numpy==1.14.0 7 | scipy==0.19.0 8 | tensorflow-gpu==1.9.0 9 | tqdm==4.19.4 10 | termcolor 11 | -------------------------------------------------------------------------------- /scripts/mbexp.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import os 6 | import argparse 7 | import pprint 8 | import copy 9 | 10 | from dotmap import DotMap 11 | 12 | from dmbrl.misc.MBExp import MBExperiment 13 | from dmbrl.controllers.MPC import MPC 14 | from dmbrl.config import create_config 15 | from dmbrl.misc import logger 16 | 17 | 18 | def main(env, ctrl_type, ctrl_args, overrides, logdir, args): 19 | ctrl_args = DotMap(**{key: val for (key, val) in ctrl_args}) 20 | cfg = create_config(env, ctrl_type, ctrl_args, overrides, logdir) 21 | logger.info('\n' + pprint.pformat(cfg)) 22 | 23 | # add the part of popsize 24 | if ctrl_type == "MPC": 25 | cfg.exp_cfg.exp_cfg.policy = MPC(cfg.ctrl_cfg) 26 | 27 | cfg.exp_cfg.misc = copy.copy(cfg) 28 | exp = MBExperiment(cfg.exp_cfg) 29 | 30 | if not os.path.exists(exp.logdir): 31 | os.makedirs(exp.logdir) 32 | with open(os.path.join(exp.logdir, "config.txt"), "w") as f: 33 | f.write(pprint.pformat(cfg.toDict())) 34 | 35 | exp.run_experiment() 36 | 37 | 38 | if __name__ == "__main__": 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument('-env', type=str, required=True, 41 | help='Environment name: select from [cartpole, reacher, pusher, halfcheetah]') 42 | parser.add_argument('-ca', '--ctrl_arg', action='append', nargs=2, default=[], 43 | help='Controller arguments, see https://github.com/kchua/handful-of-trials#controller-arguments') 44 | parser.add_argument('-o', '--override', action='append', nargs=2, default=[], 45 | help='Override default parameters, see https://github.com/kchua/handful-of-trials#overrides') 46 | parser.add_argument('-logdir', type=str, default='log', 47 | help='Directory to which results will be logged (default: ./log)') 48 | parser.add_argument('-e_popsize', type=int, default=500, 49 | help='different popsize to use') 50 | args = parser.parse_args() 51 | 52 | main(args.env, "MPC", args.ctrl_arg, args.override, args.logdir, args) 53 | -------------------------------------------------------------------------------- /scripts/render.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import os 6 | import argparse 7 | import pprint 8 | 9 | from dotmap import DotMap 10 | 11 | from dmbrl.misc.MBExp import MBExperiment 12 | from dmbrl.controllers.MPC import MPC 13 | from 
dmbrl.config import create_config 14 | 15 | 16 | def main(env, ctrl_type, ctrl_args, overrides, model_dir, logdir): 17 | ctrl_args = DotMap(**{key: val for (key, val) in ctrl_args}) 18 | 19 | overrides.append(["ctrl_cfg.prop_cfg.model_init_cfg.model_dir", model_dir]) 20 | overrides.append(["ctrl_cfg.prop_cfg.model_init_cfg.load_model", "True"]) 21 | overrides.append(["ctrl_cfg.prop_cfg.model_pretrained", "True"]) 22 | overrides.append(["exp_cfg.exp_cfg.ninit_rollouts", "0"]) 23 | overrides.append(["exp_cfg.exp_cfg.ntrain_iters", "1"]) 24 | overrides.append(["exp_cfg.log_cfg.nrecord", "1"]) 25 | 26 | cfg = create_config(env, ctrl_type, ctrl_args, overrides, logdir) 27 | cfg.pprint() 28 | 29 | if ctrl_type == "MPC": 30 | cfg.exp_cfg.exp_cfg.policy = MPC(cfg.ctrl_cfg) 31 | exp = MBExperiment(cfg.exp_cfg) 32 | 33 | os.makedirs(exp.logdir) 34 | with open(os.path.join(exp.logdir, "config.txt"), "w") as f: 35 | f.write(pprint.pformat(cfg.toDict())) 36 | 37 | exp.run_experiment() 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument('-env', type=str, required=True) 43 | parser.add_argument('-ca', '--ctrl_arg', action='append', nargs=2, default=[]) 44 | parser.add_argument('-o', '--override', action='append', nargs=2, default=[]) 45 | parser.add_argument('-model-dir', type=str, required=True) 46 | parser.add_argument('-logdir', type=str, required=True) 47 | args = parser.parse_args() 48 | 49 | main(args.env, "MPC", args.ctrl_arg, args.override, args.model_dir, args.logdir) 50 | -------------------------------------------------------------------------------- /show_result.py: -------------------------------------------------------------------------------- 1 | import glob 2 | from scipy.io import loadmat 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | file_list = glob.glob('./log/*/*/logs.mat') 7 | file_list = [name for name in file_list if 'old' not in name] 8 | file_list = [name for name in file_list if '2500' in name] 9 | legend_label = [] 10 | 11 | colormap = plt.cm.gist_ncar 12 | # plt.gca().set_color_cycle([colormap(i) for i in np.linspace(0, 0.9, len(file_list))]) 13 | 14 | for name in file_list: 15 | returns = loadmat(name)['returns'] 16 | print(name + '\n') 17 | print(returns) 18 | print('\n\n') 19 | # import pdb; pdb.set_trace() 20 | plt.plot(returns.reshape([-1])) 21 | legend_label.append(name.split('/')[2]) 22 | 23 | plt.legend(legend_label) 24 | plt.show() 25 | -------------------------------------------------------------------------------- /show_with_test_result.py: -------------------------------------------------------------------------------- 1 | import glob 2 | from scipy.io import loadmat 3 | import matplotlib.pyplot as plt 4 | 5 | file_list = glob.glob('./log/*/*/logs.mat') 6 | # file_list = [name for name in file_list if 'WRA' in name] 7 | file_list = [name for name in file_list if 'GAN-I' in name] 8 | # file_list = [name for name in file_list if 'R_0.1__' in name] 9 | # mode = 'full' # full, all 10 | mode = 'test' # full, all 11 | legend_label = [] 12 | 13 | colormap = plt.cm.gist_ncar 14 | # plt.gca().set_color_cycle([colormap(i) for i in np.linspace(0, 0.9, len(file_list))]) 15 | 16 | for name in file_list: 17 | returns = loadmat(name)['test_returns'] 18 | print(name + '\n') 19 | print(returns) 20 | print('\n\n') 21 | # import pdb; pdb.set_trace() 22 | if mode in ['test', 'all']: 23 | plt.plot(returns.reshape([-1])) 24 | legend_label.append('test_' + name.split('/')[2]) 25 | 26 | returns = 
loadmat(name)['returns'] 27 | if mode in ['full', 'all']: 28 | plt.plot(returns.reshape([-1])) 29 | legend_label.append('full_' + name.split('/')[2]) 30 | 31 | plt.legend(legend_label) 32 | plt.show() 33 | --------------------------------------------------------------------------------
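A minimal usage sketch for the TensorStandardScaler helper shown above. It is not part of the repository; the 4-dimensional input batch and the session handling are illustrative assumptions, written in the TensorFlow 1.x graph-mode style the rest of the codebase uses.

import numpy as np
import tensorflow as tf
from dmbrl.modeling.utils import TensorStandardScaler

# Hypothetical batch: 1000 samples of a 4-dimensional model input.
data = np.random.randn(1000, 4).astype(np.float32)
scaler = TensorStandardScaler(x_dim=4)  # creates the non-trainable mu/sigma variables under the "Scaler" scope

with tf.Session() as sess:  # becomes the default session, which fit() relies on to load the variables
    sess.run(tf.global_variables_initializer())
    scaler.fit(data)                                   # assigns the empirical mean/std and caches them
    normalized = scaler.transform(tf.constant(data))   # symbolic (data - mu) / sigma
    restored = scaler.inverse_transform(normalized)    # undoes the normalization symbolically
    print(sess.run(normalized).mean(axis=0))           # roughly zero after fitting

Since transform() and inverse_transform() only combine the stored variables with their argument, they can be applied to symbolic inputs as well; the sketch wraps the batch in tf.constant so the result can be evaluated with sess.run.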