├── .gitignore
├── README.md
├── agent.py
├── envs
│   ├── 2Dpusher_env.py
│   └── __init__.py
├── model.py
├── mpc.py
├── opencv_draw.py
├── run.py
└── util.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # MacOS files
7 | .DS_Store
8 | .AppleDouble
9 | .LSOverride
10 |
11 | # Other files
12 | *.pdf
13 | *.tex
14 | *.log
15 | *.npy
16 | *.png
17 | *.jpg
18 |
19 | # VSCODE files
20 | .vscode/
21 |
22 | # Project files
23 | /*.project
24 | /*.pydevproject
25 | /.settings
26 |
27 | # Sub-directories
28 | template/
29 | writeup/
30 | src/
31 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Model-based Reinforcement Learning
2 | ```
3 | |-- envs
4 |   |-- 2Dpusher_env.py # Implementation of the pusher environment
5 |   |-- __init__.py # Register two pusher environments
6 | |-- run.py # Entry point for all the experiments. Also check out the commented code in this file.
7 | |-- mpc.py # MPC that uses the model to optimize action sequences; call CEM from this file
8 | |-- model.py # Create and train the ensemble dynamics model
9 | |-- agent.py # An agent that can be used to interact with the environment
10 |             # and a random policy
11 | |-- util.py # Some utility functions we kindly provide. You do not have to use them.
12 | |-- opencv_draw.py # Contains features for rendering the Box2D environments. You don't have to do anything to this file.
13 | ```
14 |
15 | # Prerequisites
16 | Run `pip install opencv-python`
17 |
18 | # Guidelines for Implementation
19 | * When implementing the `MPC` class, use the `mpc_params` that is passed into this class.
20 | * One tip is to write a separate `CEMOptimizer` and `RandomOptimizer`, which optimize a cost function over
21 |   action sequences.
22 | * We provide a state-based cost function. To get the cost of a trajectory, you need to sum/average over the cost of all
23 |   the states.
24 | * Sometimes the cost function can give `nan` values. Make sure to deal with such cases in your code.
25 |
--------------------------------------------------------------------------------
/agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np 2 | 3 | 4 | class Agent: 5 | def __init__(self, env): 6 | self.env = env 7 | 8 | def sample(self, horizon, policy): 9 | """ 10 | Sample a rollout from the agent.
11 | 12 | Arguments: 13 | horizon (int): the length of the rollout 14 | policy: the policy that the agent will use for actions 15 | """ 16 | rewards = [] 17 | states, actions, reward_sum, done = [self.env.reset()], [], 0, False 18 | 19 | policy.reset() 20 | for t in range(horizon): 21 | print('time step: {}/{}'.format(t + 1, horizon), end='\r', 22 | flush=True) 23 | actions.append(policy.act(states[t], t)) 24 | state, reward, done, info = self.env.step(actions[t]) 25 | states.append(state) 26 | reward_sum += reward 27 | rewards.append(reward) 28 | if done: 29 | # print(info['done']) 30 | break 31 | # print("Rollout length: ", len(actions)) 32 | 33 | return { 34 | "obs": np.array(states), 35 | "ac": np.array(actions), 36 | "reward_sum": reward_sum, 37 | "rewards": np.array(rewards), 38 | } 39 | 40 | 41 | class RandomPolicy: 42 | def __init__(self, action_dim): 43 | self.action_dim = action_dim 44 | 45 | def reset(self): 46 | pass 47 | 48 | def act(self, arg1, arg2): 49 | return np.random.uniform(size=self.action_dim) * 2 - 1 50 | -------------------------------------------------------------------------------- /envs/2Dpusher_env.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import gym 3 | import copy 4 | import Box2D 5 | import numpy as np 6 | 7 | from Box2D.b2 import (circleShape, fixtureDef, polygonShape) 8 | from opencv_draw import OpencvDrawFuncs 9 | 10 | from gym import spaces 11 | from gym.utils import seeding 12 | 13 | MIN_COORD = 0 14 | MAX_COORD = 5 15 | PUSHER_START = np.array([1.0, 1.0]) 16 | BOX_START = np.array([2.0, 2.0]) 17 | FORCE_MULT = 1 18 | RAD = 0.2 19 | SIDE_GAP_MULT = 2 20 | BOX_RAD = 0.2 21 | GOAL_RAD = 0.5 22 | MAX_STEPS = 40 23 | FPS = 4 24 | 25 | 26 | class Pusher2d(gym.Env): 27 | def __init__(self, control_noise=0.): 28 | self.control_noise = control_noise 29 | self.seed() 30 | self.world = Box2D.b2World(gravity=(0, 0)) 31 | self.pusher = None 32 | self.box = None 33 | # Actions: x-movement, y-movement (clipped -1 to 1) 34 | self.action_space = spaces.Box(np.ones(2) * -1, np.ones(2), dtype=np.float32) 35 | # State: pusher xy position, box xy position, pusher xy velocity, box xy velocity, goal xy position 36 | self.observation_space = spaces.Box(np.ones(10) * MIN_COORD, np.ones(10) * MAX_COORD, dtype=np.float32) 37 | self.reset() 38 | self.drawer = OpencvDrawFuncs(w=240, h=180, ppm=40) 39 | self.drawer.install() 40 | 41 | def seed(self, seed=None): 42 | self.np_random, seed = seeding.np_random(seed) 43 | return [seed] 44 | 45 | def random_place(self): 46 | """ returns [x, y] within an area slightly away from the initial box position """ 47 | return [self.np_random.uniform(BOX_START[0] + BOX_RAD + GOAL_RAD, MAX_COORD - RAD * SIDE_GAP_MULT), 48 | self.np_random.uniform(BOX_START[1] + BOX_RAD + GOAL_RAD, MAX_COORD - RAD * SIDE_GAP_MULT)] 49 | 50 | def _destroy(self): 51 | """ removes instantiated Box2D entities """ 52 | if not self.box: 53 | return 54 | self.world.DestroyBody(self.box) 55 | self.world.DestroyBody(self.pusher) 56 | 57 | def reset(self): 58 | """ standard Gym method; returns first state of episode """ 59 | self._destroy() 60 | self.pusher = self.world.CreateDynamicBody( 61 | position=PUSHER_START[:], 62 | fixtures=fixtureDef( 63 | shape=circleShape(radius=RAD, pos=(0, 0)), 64 | density=1.0 65 | ) 66 | ) 67 | self.box = self.world.CreateDynamicBody( 68 | position=BOX_START[:], 69 | fixtures=fixtureDef( 70 | shape=circleShape(radius=BOX_RAD, pos=(0, 0)), 71 | density=1.0 72 | ) 73 | ) 74 | 
self.goal_pos = self.random_place() 75 | self.elapsed_steps = 0 76 | return self._get_obs() 77 | 78 | def step(self, action, render=False): 79 | """ standard Gym method; returns s, r, d, i """ 80 | if render: 81 | self.drawer.clear_screen() 82 | self.drawer.draw_world(self.world) 83 | 84 | action = np.clip(action, -1, 1).astype(np.float32) 85 | if self.control_noise > 0.: 86 | action += np.random.normal(0., scale=self.control_noise, size=action.shape) 87 | 88 | self.elapsed_steps += 1 89 | self.pusher._b2Body__SetLinearVelocity((FORCE_MULT * action[0], FORCE_MULT * action[1])) 90 | self.box._b2Body__SetActive(True) 91 | self.world.Step(1.0 / FPS, 6 * 30, 2 * 30) 92 | 93 | if render: 94 | cv2.imshow("world", self.drawer.screen) 95 | cv2.waitKey(20) 96 | done = False 97 | reward = -1 98 | obj_coords = np.concatenate([self.pusher.position.tuple, self.box.position.tuple]) 99 | info = {"done": None} 100 | # check if out of bounds 101 | if np.min(obj_coords) < MIN_COORD or np.max(obj_coords) > MAX_COORD: 102 | reward = -1 * (MAX_STEPS - self.elapsed_steps + 2) 103 | done = True 104 | info['done'] = 'unstable simulation' 105 | # check if out of time 106 | elif self.elapsed_steps >= MAX_STEPS: 107 | done = True 108 | info["done"] = "max_steps_reached" 109 | # check if goal reached 110 | elif np.linalg.norm(np.array(self.box.position.tuple) - self.goal_pos) < RAD + GOAL_RAD: 111 | done = True 112 | reward = 0 113 | info["done"] = "goal reached" 114 | 115 | return self._get_obs(), reward, done, info 116 | 117 | def _get_obs(self): 118 | """ returns current state of environment """ 119 | state = np.concatenate([self.pusher.position.tuple, 120 | self.box.position.tuple, 121 | self.pusher.linearVelocity.tuple, 122 | self.box.linearVelocity.tuple, 123 | self.goal_pos]) 124 | return state 125 | 126 | def apply_hindsight(self, states, actions, goal_state): 127 | """ returns list of new states and list of new rewards for use with HER """ 128 | goal = goal_state[2:4] # get new goal location (last location of box) 129 | states.append(goal_state) 130 | num_tuples = len(actions) 131 | her_states, her_rewards = [], [] 132 | states[0][-2:] = goal.copy() 133 | her_states.append(states[0]) 134 | # for each state, adjust goal and calculate reward obtained 135 | for i in range(1, num_tuples + 1): 136 | state = states[i] 137 | state[-2:] = goal.copy() 138 | reward = self._HER_calc_reward(state) 139 | her_states.append(state) 140 | her_rewards.append(reward) 141 | return her_states, her_rewards 142 | 143 | def _HER_calc_reward(self, state): 144 | """ given state, returns reward for transitioning to this state (used by HER) """ 145 | if np.linalg.norm(state[2:4] - state[4:6]) < RAD + GOAL_RAD: 146 | return 0 147 | else: 148 | return -1 149 | 150 | def set_state(self, state): 151 | self.pusher.position = state[:2] 152 | self.box.position = state[2:4] 153 | self.pusher.linearVelocity = state[4:6] 154 | self.box.linearVelocity = state[6:8] 155 | if len(state) == 10: # The state can also be observation only, which does not include the goal 156 | self.goal_pos = state[8:10] 157 | 158 | def get_state(self): 159 | return copy.copy(self._get_obs()) 160 | 161 | def get_nxt_state(self, state, action): 162 | original_state = self.get_state() 163 | original_elapsed_steps = self.elapsed_steps 164 | self.set_state(state) 165 | nxt_state, _, _, _ = self.step(action) 166 | nxt_state = nxt_state[:8] 167 | 168 | # Make sure there is no side effect 169 | self.set_state(original_state) 170 | self.elapsed_steps = original_elapsed_steps 171 
| return nxt_state 172 | -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='Pushing2D-v1', 5 | entry_point='envs.2Dpusher_env:Pusher2d', 6 | kwargs={'control_noise': 0} 7 | ) 8 | 9 | # Control noise is added to the action 10 | register( 11 | id='Pushing2DNoisyControl-v1', 12 | entry_point='envs.2Dpusher_env:Pusher2d', 13 | kwargs={'control_noise': 0.3} 14 | ) 15 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import tensorflow as tf 4 | import tensorflow.keras.backend as K 5 | 6 | from util import ZFilter 7 | from tensorflow.keras.models import Model 8 | from tensorflow.keras.optimizers import Adam 9 | from tensorflow.keras.regularizers import l2 10 | from tensorflow.keras.layers import Dense, Input 11 | 12 | 13 | HIDDEN1_UNITS = 400 14 | HIDDEN2_UNITS = 400 15 | HIDDEN3_UNITS = 400 16 | 17 | 18 | class PENN: 19 | """ 20 | (P)robabilistic (E)nsemble of (N)eural (N)etworks 21 | """ 22 | 23 | def __init__(self, num_nets, state_dim, action_dim, learning_rate): 24 | """ 25 | :param num_nets: number of networks in the ensemble 26 | :param state_dim: state dimension 27 | :param action_dim: action dimension 28 | :param learning_rate: 29 | """ 30 | 31 | self.sess = tf.Session() 32 | self.num_nets = num_nets 33 | self.state_dim = state_dim 34 | self.action_dim = action_dim 35 | self.learning_rate = learning_rate 36 | K.set_session(self.sess) 37 | 38 | # Log variance bounds 39 | self.max_logvar = tf.Variable(-3 * np.ones([1, self.state_dim]), 40 | dtype=tf.float32) 41 | self.min_logvar = tf.Variable(-7 * np.ones([1, self.state_dim]), 42 | dtype=tf.float32) 43 | # Define ops for model output and optimization 44 | self.inputs = list() 45 | self.losses = list() 46 | self.means = list() 47 | self.logvars = list() 48 | self.models = list() 49 | self.outputs = list() 50 | self.targets = list() 51 | self.optimizations = list() 52 | for model in range(self.num_nets): 53 | model, inp = self.create_network() 54 | self.inputs.append(inp) 55 | self.models.append(model) 56 | output = self.get_output(model.output) 57 | mean, logvar = output 58 | self.means.append(mean) 59 | self.logvars.append(logvar) 60 | self.outputs.append(output) 61 | target = tf.placeholder(tf.float32, shape=(None, self.state_dim)) 62 | self.targets.append(target) 63 | var = tf.exp(logvar) 64 | inv_var = tf.divide(1, var) 65 | norm_output = mean - target 66 | # Calculate loss: Mahalanobis distance + log(det(cov)) 67 | loss = tf.multiply(tf.multiply(norm_output, inv_var), norm_output) 68 | loss = tf.reduce_sum(loss, axis=1) 69 | loss += tf.math.log(tf.math.reduce_prod(var, axis=1)) 70 | self.losses.append(loss) 71 | optimizer = Adam(lr=learning_rate) 72 | weights = model.trainable_weights 73 | gradients = tf.gradients(loss, weights) 74 | optimize = optimizer.apply_gradients(zip(gradients, weights)) 75 | self.optimizations.append(optimize) 76 | self.sess.run(tf.initialize_all_variables()) 77 | 78 | def predict(self, states, actions, idxs): 79 | """Predicts the next states from the ensemble given states and actions 80 | Args: 81 | states: input states 82 | actions: actions to be taken 83 | idxs: indices of the models to be used for generating the next 84 | state 85 | Returns: 86 
| next_states: resulting states 87 | """ 88 | next_states = np.zeros_like(states) 89 | input = np.concatenate((states, actions), axis=1) 90 | assert input.shape[1] == (self.action_dim + self.state_dim) 91 | feed_dict = {inp: input for inp in self.inputs} 92 | outs = self.sess.run([self.means, self.logvars], feed_dict=feed_dict) 93 | means = np.array(outs[0]) 94 | logvars = np.array(outs[1]) 95 | assert means.shape[0] == self.num_nets == logvars.shape[0] 96 | means = means[idxs, range(means.shape[1]), :] 97 | logvars = logvars[idxs, range(logvars.shape[1]), :] 98 | sigma = np.sqrt(np.exp(logvars)) 99 | next_states = np.random.normal(means, sigma, size=means.shape) 100 | return next_states 101 | 102 | def get_output(self, output): 103 | """ 104 | Args: 105 | output: tf variable representing the output of the keras models, 106 | i.e., model.output 107 | Returns: 108 | mean and log variance tf tensors 109 | """ 110 | mean = output[:, :self.state_dim] 111 | raw_v = output[:, self.state_dim:] 112 | logvar = self.max_logvar - tf.nn.softplus(self.max_logvar - raw_v) 113 | logvar = self.min_logvar + tf.nn.softplus(logvar - self.min_logvar) 114 | return mean, logvar 115 | 116 | def create_network(self): 117 | inp = Input(shape=[self.state_dim + self.action_dim]) 118 | h1 = Dense(HIDDEN1_UNITS, activation='relu', 119 | kernel_regularizer=l2(0.0001))(inp) 120 | h2 = Dense(HIDDEN2_UNITS, activation='relu', 121 | kernel_regularizer=l2(0.0001))(h1) 122 | h3 = Dense(HIDDEN3_UNITS, activation='relu', 123 | kernel_regularizer=l2(0.0001))(h2) 124 | out = Dense(2 * self.state_dim, activation='linear', 125 | kernel_regularizer=l2(0.0001))(h3) 126 | model = Model(inputs=inp, outputs=out) 127 | return model, inp 128 | 129 | def get_indices(self, n): 130 | return np.random.choice(range(n), size=n, replace=True) 131 | 132 | def train(self, inputs, targets, batch_size=128, epochs=5): 133 | """ 134 | Args: 135 | inputs: state and action inputs. Assumes that inputs are 136 | standardized. 
137 | targets: resulting states 138 | """ 139 | # Shuffle the data 140 | rows = inputs.shape[0] 141 | shuffled_indices = np.random.permutation(rows) 142 | inputs = inputs[shuffled_indices, :] 143 | targets = targets[shuffled_indices, :] 144 | # Sample data indices for different models 145 | indices = [self.get_indices(rows) for _ in range(self.num_nets)] 146 | total_loss = list() 147 | rmse_loss = list() 148 | rmse_name = 'rmse_pets.npy' if self.num_nets > 1 else 'rmse_smd.npy' 149 | loss_name = 'loss_pets.npy' if self.num_nets > 1 else 'loss_smd.npy' 150 | for epoch in range(epochs): 151 | print('Epoch {}/{}'.format(epoch + 1, epochs)) 152 | num_batches = math.ceil(rows / batch_size) 153 | for batch in range(num_batches): 154 | # Sample a batch and get inputs and targets 155 | idx = batch * batch_size 156 | inps = [inputs[indices[x][idx:idx + batch_size]] 157 | for x in range(self.num_nets)] 158 | targs = [targets[indices[x][idx:idx + batch_size]] 159 | for x in range(self.num_nets)] 160 | # RMSE 161 | feed_dict = {inp: input for inp, input in zip( 162 | self.inputs, inps)} 163 | outputs = self.sess.run(self.outputs, feed_dict=feed_dict) 164 | gaussians = [(mean, np.sqrt(np.exp(logvar))) 165 | for mean, logvar in outputs] 166 | next_states = [np.random.normal(mean, logvar, mean.shape) 167 | for mean, logvar in gaussians] 168 | rmse = [np.sqrt(np.square(targs[i] - next_states[i]).mean( 169 | axis=1)) for i in range(self.num_nets)] 170 | rmse = [np.mean(r) for r in rmse] 171 | feed_dict_2 = {targ: target for targ, target in zip( 172 | self.targets, targs)} 173 | feed_dict.update(feed_dict_2) 174 | # Loss corresponding to all models 175 | losses = self.sess.run(self.losses, feed_dict=feed_dict) 176 | loss = [np.mean(model_loss) for model_loss in losses] 177 | self.sess.run(self.optimizations, feed_dict=feed_dict) 178 | print('Batch {}/{} | Loss: {:.5f} | RMSE: {:.5f}'.format( 179 | batch + 1, num_batches, np.mean(loss), np.mean(rmse)), 180 | end='\r', flush=True) 181 | total_loss.append(loss) 182 | rmse_loss.append(rmse) 183 | print('\n', '*'*40) 184 | np.save(rmse_name, rmse_loss) 185 | np.save(loss_name, total_loss) 186 | -------------------------------------------------------------------------------- /mpc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class MPC: 5 | def __init__(self, 6 | env, 7 | plan_horizon, 8 | model, 9 | popsize, 10 | num_elites, 11 | max_iters, 12 | num_particles=6, 13 | use_mpc=True, 14 | use_gt_dynamics=True, 15 | use_random_optimizer=False): 16 | """ 17 | :param env: 18 | :param plan_horizon: 19 | :param model: The learned dynamics model to use, which can be None if 20 | use_gt_dynamics is True 21 | :param popsize: Population size 22 | :param num_elites: CEM parameter 23 | :param max_iters: CEM parameter 24 | :param num_particles: Number of trajectories for TS1 25 | :param use_gt_dynamics: Whether to use the ground truth dynamics from 26 | the environment 27 | :param use_mpc: Whether to use only the first action of a planned 28 | trajectory 29 | :param use_random_optimizer: Whether to use CEM or take random actions 30 | """ 31 | 32 | self.env = env 33 | 34 | self.popsize = popsize 35 | self.max_iters = max_iters 36 | self.num_elites = num_elites 37 | self.plan_horizon = plan_horizon 38 | self.num_particles = num_particles 39 | 40 | self.use_mpc = use_mpc 41 | self.use_gt_dynamics = use_gt_dynamics 42 | self.use_random_optimizer = use_random_optimizer 43 | 44 | self.num_nets = None if model 
is None else model.num_nets 45 | 46 | self.state_dim, self.action_dim = 8, env.action_space.shape[0] 47 | self.ac_ub, self.ac_lb = env.action_space.high, env.action_space.low 48 | 49 | self.goal = self.env.goal_pos 50 | # Set up optimizer 51 | self.model = model 52 | 53 | if use_gt_dynamics: 54 | self.predict_next_state = self.predict_next_state_gt 55 | assert num_particles == 1 56 | else: 57 | self.predict_next_state = self.predict_next_state_model 58 | 59 | # Initialize your planner with the relevant arguments. 60 | # Write different optimizers for cem and random actions respectively 61 | if self.use_random_optimizer: 62 | self.opt = self.random_optimizer 63 | else: 64 | self.opt = self.cem_optimizer 65 | 66 | def obs_cost_fn(self, state): 67 | """ Cost function of the current state """ 68 | # Weights for different terms 69 | W_PUSHER = 1 70 | W_GOAL = 2 71 | W_DIFF = 5 72 | 73 | pusher_x, pusher_y = state[0], state[1] 74 | box_x, box_y = state[2], state[3] 75 | goal_x, goal_y = self.goal[0], self.goal[1] 76 | 77 | pusher_box = np.array([box_x - pusher_x, box_y - pusher_y]) 78 | box_goal = np.array([goal_x - box_x, goal_y - box_y]) 79 | d_box = np.sqrt(np.dot(pusher_box, pusher_box)) 80 | d_goal = np.sqrt(np.dot(box_goal, box_goal)) 81 | diff_coord = np.abs(box_x / box_y - goal_x / goal_y) 82 | # the -0.4 is to adjust for the radius of the box and pusher 83 | x = max(d_box - 0.4, 0) 84 | cost = W_PUSHER * x + W_GOAL * d_goal + W_DIFF * diff_coord 85 | return cost 86 | 87 | def predict_next_state_model(self, states, actions): 88 | """Given a list of state action pairs, use the learned model to 89 | predict the next state. 90 | 91 | Returns: 92 | cost: cost of the given action sequence. 93 | """ 94 | # Initial cost is the same for all sequences 95 | # TODO: Remove cost calculation from this function 96 | rows = actions.shape[0] # M*P 97 | cost = np.array([self.obs_cost_fn(states[0, :])] * rows) 98 | sampler = self.ts1sampling(rows) 99 | for i in range(self.plan_horizon): 100 | idx = i * self.action_dim 101 | action = actions[:, idx:idx + self.action_dim] 102 | idxs = sampler[:, i] 103 | next_states = self.model.predict(states, action, idxs) 104 | states = next_states 105 | cost += np.apply_along_axis(self.obs_cost_fn, axis=1, arr=states) 106 | return cost 107 | 108 | def predict_next_state_gt(self, states, actions): 109 | """Given a list of state action pairs, use the ground truth dynamics 110 | to predict the next state. 111 | """ 112 | for i in range(self.plan_horizon): 113 | idx = i * self.action_dim 114 | action = actions[idx:idx + self.action_dim] 115 | next_state = self.env.get_nxt_state(states[i], action) 116 | states.append(next_state) 117 | return states 118 | 119 | def random_optimizer(self, state): 120 | """Implements the random optimizer. It gives the best action sequence 121 | for a certain initial state. 
122 | """ 123 | # Generate M*I action sequences of length T according to N(0, 0.5I) 124 | total_sequences = self.popsize * self.max_iters 125 | shape = (total_sequences, self.plan_horizon * self.action_dim) 126 | self.reset() # resets mu and sigma 127 | actions = np.random.normal(self.mu, self.sigma, size=shape) 128 | actions = np.clip(actions, a_min=-1, a_max=1) 129 | if not self.use_gt_dynamics: 130 | repeated_actions = np.tile(actions, reps=(self.num_particles, 1)) 131 | rows = repeated_actions.shape[0] 132 | states = np.tile(state, reps=(rows, 1)) 133 | costs = self.predict_next_state_model(states, repeated_actions) 134 | costs = costs.reshape(self.num_particles, -1) 135 | costs = np.mean(costs, axis=0) # these are M*I costs 136 | assert costs.shape[0] == self.popsize * self.max_iters 137 | min_cost_idx = np.argmin(costs) 138 | return actions[min_cost_idx] 139 | else: 140 | best_cost = np.inf 141 | best_action_sequence = np.zeros_like(self.mu) 142 | for i in range(total_sequences): 143 | states = self.predict_next_state_gt([state], actions[i, :]) 144 | assert len(states) == self.plan_horizon + 1 145 | cost = sum(self.obs_cost_fn(x) for x in states) 146 | cost /= self.num_particles 147 | if cost < best_cost: 148 | best_cost = cost 149 | best_action_sequence = actions[i, :] 150 | return best_action_sequence 151 | 152 | def ts1sampling(self, n): 153 | s = (n, self.plan_horizon) 154 | return np.random.choice(range(self.num_nets), size=s, replace=True) 155 | 156 | def cem_optimizer(self, state): 157 | """Implements the Cross Entropy Method optimizer. It gives the action 158 | sequence for a certain initial state by choosing elite sequences and 159 | using their mean. 160 | """ 161 | mu = self.mu 162 | sigma = self.sigma 163 | for i in range(self.max_iters): 164 | # Generate M action sequences of length T according to N(mu, std) 165 | shape = (self.popsize, self.plan_horizon * self.action_dim) 166 | actions = np.random.normal(mu, sigma, size=shape) 167 | actions = np.clip(actions, a_min=-1, a_max=1) 168 | costs = None 169 | if not self.use_gt_dynamics: 170 | reps = (self.num_particles, 1) 171 | repeated_actions = np.tile(actions, reps=reps) 172 | rows = repeated_actions.shape[0] 173 | states = np.tile(state, reps=(rows, 1)) 174 | costs = self.predict_next_state_model(states, repeated_actions) 175 | costs = costs.reshape(self.num_particles, -1) 176 | costs = np.mean(costs, axis=0) # these are M costs 177 | else: 178 | costs = list() 179 | for m in range(self.popsize): 180 | states = self.predict_next_state_gt([state], actions[m, :]) 181 | assert len(states) == self.plan_horizon + 1 182 | cost = sum(self.obs_cost_fn(x) for x in states) 183 | cost /= self.num_particles 184 | costs.append(cost) 185 | # Calculate mean and std using the elite action sequences 186 | costs = np.argsort(costs) 187 | elite_sequences = costs[:self.num_elites] 188 | elite_actions = actions[elite_sequences, :] 189 | assert elite_actions.shape[0] == self.num_elites 190 | mu = np.mean(elite_actions, axis=0) 191 | sigma = np.std(elite_actions, axis=0) 192 | return mu 193 | 194 | def train(self, obs_trajs, acs_trajs, rews_trajs, epochs=5): 195 | """ 196 | Take the input obs, acs, rews and append to existing transitions the 197 | train model. 
198 | 199 | Args: 200 | obs_trajs: states 201 | acs_trajs: actions 202 | rews_trajs: rewards (NOTE: this may not be used) 203 | epochs: number of epochs to train for 204 | """ 205 | assert len(obs_trajs) == len(acs_trajs) 206 | input_states = [traj[:-1, :self.state_dim] for traj in obs_trajs] 207 | input_states = np.concatenate(input_states, axis=0) 208 | assert input_states.shape[1] == self.state_dim 209 | targets = [traj[1:, :self.state_dim] for traj in obs_trajs] 210 | targets = np.concatenate(targets, axis=0) 211 | assert targets.shape[1] == self.state_dim 212 | actions = [acs for acs in acs_trajs] 213 | actions = np.concatenate(actions, axis=0) 214 | assert actions.shape[1] == self.action_dim 215 | inputs = np.concatenate((input_states, actions), axis=1) 216 | assert inputs.shape[1] == (self.state_dim + self.action_dim) 217 | self.model.train(inputs, targets, epochs=epochs) 218 | 219 | def reset(self): 220 | """Initializes variables mu and sigma. 221 | """ 222 | self.mu = np.zeros(self.plan_horizon * self.action_dim) 223 | self.reset_sigma() 224 | 225 | def act(self, state, t): 226 | """ 227 | Find the action for current state. 228 | 229 | Arguments: 230 | state: current state 231 | t: current timestep 232 | """ 233 | self.goal = state[self.state_dim:] 234 | assert len(self.goal) == 2 235 | state = state[:self.state_dim] 236 | if self.use_mpc: 237 | mu = self.opt(state) 238 | action = mu[:self.action_dim] # Get the first action 239 | action = action.copy() 240 | mu[:-self.action_dim] = mu[self.action_dim:] 241 | mu[-self.action_dim:] = 0 242 | self.mu = mu 243 | else: 244 | if t % self.plan_horizon == 0: 245 | self.mu = self.opt(state) 246 | idx = (t % self.plan_horizon) * self.action_dim 247 | action = self.mu[idx:idx + self.action_dim] 248 | return action 249 | 250 | def reset_sigma(self): 251 | """Resets/initializes the value of sigma. 252 | """ 253 | sigma = [0.5 ** 0.5] * (self.plan_horizon * self.action_dim) 254 | self.sigma = np.array(sigma) 255 | -------------------------------------------------------------------------------- /opencv_draw.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Python version Copyright (c) 2015 John Stowers 4 | # 5 | # This software is provided 'as-is', without any express or implied 6 | # warranty. In no event will the authors be held liable for any damages 7 | # arising from the use of this software. 8 | # Permission is granted to anyone to use this software for any purpose, 9 | # including commercial applications, and to alter it and redistribute it 10 | # freely, subject to the following restrictions: 11 | # 1. The origin of this software must not be misrepresented; you must not 12 | # claim that you wrote the original software. If you use this software 13 | # in a product, an acknowledgment in the product documentation would be 14 | # appreciated but is not required. 15 | # 2. Altered source versions must be plainly marked as such, and must not be 16 | # misrepresented as being the original software. 17 | # 3. This notice may not be removed or altered from any source distribution. 
18 | 19 | import cv2 20 | import numpy as np 21 | 22 | from Box2D import (b2Color, b2DistanceJoint, b2MouseJoint, b2PulleyJoint) 23 | from Box2D.b2 import (staticBody, dynamicBody, kinematicBody, polygonShape, 24 | circleShape, loopShape, edgeShape) 25 | 26 | 27 | def cvcolor(color): 28 | return int(255.0 * color[2]), int(255.0 * color[1]), int(255.0 * color[0]) 29 | 30 | 31 | def cvcoord(pos): 32 | return tuple(map(int, pos)) 33 | 34 | 35 | class OpencvDrawFuncs(object): 36 | 37 | def __init__(self, w, h, ppm, fill_polygon=True, flip_y=True): 38 | self._w = w 39 | self._h = h 40 | self._ppm = ppm 41 | self._colors = { 42 | staticBody: (255, 255, 255), 43 | dynamicBody: (127, 127, 127), 44 | kinematicBody: (127, 127, 230), 45 | } 46 | self._fill_polygon = fill_polygon 47 | self._flip_y = flip_y 48 | self.screen = np.zeros((self._h, self._w, 3), np.uint8) 49 | 50 | def install(self): 51 | polygonShape.draw = self._draw_polygon 52 | circleShape.draw = self._draw_circle 53 | loopShape.draw = self._draw_loop 54 | edgeShape.draw = self._draw_edge 55 | 56 | def draw_world(self, world): 57 | for body in world.bodies: 58 | for fixture in body.fixtures: 59 | fixture.shape.draw(body, fixture) 60 | for joint in world.joints: 61 | self._draw_joint(joint) 62 | 63 | def clear_screen(self, screen=None): 64 | if screen is None: 65 | self.screen.fill(0) 66 | else: 67 | self.screen = screen 68 | 69 | def _fix_vertices(self, vertices): 70 | if self._flip_y: 71 | return [(v[0], self._h - v[1]) for v in vertices] 72 | else: 73 | return [(v[0], v[1]) for v in vertices] 74 | 75 | def _draw_joint(self, joint): 76 | bodyA, bodyB = joint.bodyA, joint.bodyB 77 | xf1, xf2 = bodyA.transform, bodyB.transform 78 | x1, x2 = xf1.position, xf2.position 79 | p1, p2 = joint.anchorA, joint.anchorB 80 | color = b2Color(0.5, 0.8, 0.8) 81 | 82 | x1, x2, p1, p2 = self._fix_vertices((x1 * self._ppm, x2 * self._ppm, 83 | p1 * self._ppm, p2 * self._ppm)) 84 | 85 | if isinstance(joint, b2DistanceJoint): 86 | cv2.line(self.screen, cvcoord(p1), cvcoord(p2), cvcolor(color), 1) 87 | elif isinstance(joint, b2PulleyJoint): 88 | s1, s2 = joint.groundAnchorA, joint.groundAnchorB 89 | s1, s2 = self._fix_vertices((s1 * self._ppm, s2 * self._ppm)) 90 | cv2.line(self.screen, cvcoord(s1), cvcoord(p1), cvcolor(color), 1) 91 | cv2.line(self.screen, cvcoord(s2), cvcoord(p2), cvcolor(color), 1) 92 | cv2.line(self.screen, cvcoord(s1), cvcoord(s2), cvcolor(color), 1) 93 | elif isinstance(joint, b2MouseJoint): 94 | pass # don't draw it here 95 | else: 96 | cv2.line(self.screen, cvcoord(x1), cvcoord(p1), cvcolor(color), 1) 97 | cv2.line(self.screen, cvcoord(p1), cvcoord(p2), cvcolor(color), 1) 98 | cv2.line(self.screen, cvcoord(x2), cvcoord(p2), cvcolor(color), 1) 99 | 100 | def _draw_polygon(self, body, fixture): 101 | polygon = fixture.shape 102 | 103 | transform = body.transform 104 | vertices = self._fix_vertices([transform * v * self._ppm 105 | for v in polygon.vertices]) 106 | 107 | pts = np.array(vertices, np.int32) 108 | pts = pts.reshape((-1, 1, 2)) 109 | cv2.polylines(self.screen, [pts], True, self._colors[body.type]) 110 | 111 | if self._fill_polygon: 112 | lightc = np.array(self._colors[body.type], dtype=int) * 0.5 113 | cv2.fillPoly(self.screen, [pts], lightc) 114 | 115 | def _draw_circle(self, body, fixture): 116 | circle = fixture.shape 117 | position = self._fix_vertices( 118 | [body.transform * circle.pos * self._ppm])[0] 119 | cv2.circle(self.screen, cvcoord(position), int( 120 | circle.radius * self._ppm), self._colors[body.type], 1) 
121 | 122 | def _draw_edge(self, body, fixture): 123 | edge = fixture.shape 124 | v = [body.transform * edge.vertex1 * self._ppm, 125 | body.transform * edge.vertex2 * self._ppm] 126 | vertices = self._fix_vertices(v) 127 | cv2.line(self.screen, cvcoord(vertices[0]), 128 | cvcoord(vertices[1]), self._colors[body.type], 1) 129 | 130 | def _draw_loop(self, body, fixture): 131 | loop = fixture.shape 132 | transform = body.transform 133 | vertices = self._fix_vertices([transform * v * self._ppm 134 | for v in loop.vertices]) 135 | v1 = vertices[-1] 136 | for v2 in vertices: 137 | cv2.line(self.screen, cvcoord(v1), cvcoord(v2), 138 | self._colors[body.type], 1) 139 | v1 = v2 140 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import envs 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from collections import deque 8 | from mpc import MPC 9 | from model import PENN 10 | from agent import Agent, RandomPolicy 11 | 12 | # Training params 13 | TASK_HORIZON = 40 14 | PLAN_HORIZON = 5 15 | # CEM params 16 | POPSIZE = 200 17 | NUM_ELITES = 20 18 | MAX_ITERS = 5 19 | # Model params 20 | LR = 1e-3 21 | # Dims 22 | STATE_DIM = 8 23 | 24 | 25 | class ExperimentGTDynamics(object): 26 | def __init__(self, env_name='Pushing2D-v1', mpc_params=None): 27 | self.env = gym.make(env_name) 28 | self.task_horizon = TASK_HORIZON 29 | self.agent = Agent(self.env) 30 | self.warmup = False # Does not need model 31 | mpc_params['use_gt_dynamics'] = True 32 | self.cem_policy = MPC(self.env, PLAN_HORIZON, None, POPSIZE, 33 | NUM_ELITES, MAX_ITERS, **mpc_params, 34 | use_random_optimizer=False) 35 | self.random_policy = MPC(self.env, PLAN_HORIZON, None, POPSIZE, 36 | NUM_ELITES, MAX_ITERS, **mpc_params, 37 | use_random_optimizer=True) 38 | 39 | def test(self, num_episodes, optimizer='cem'): 40 | samples = [] 41 | print('Optimizer:', optimizer) 42 | for j in range(num_episodes): 43 | print('Test episode {}'.format(j + 1)) 44 | samples.append( 45 | self.agent.sample( 46 | self.task_horizon, self.cem_policy if 47 | optimizer == 'cem' else self.random_policy 48 | ) 49 | ) 50 | avg_return = np.mean([sample["reward_sum"] for sample in samples]) 51 | avg_success = np.mean([sample["rewards"][-1] == 0 52 | for sample in samples]) 53 | return avg_return, avg_success 54 | 55 | 56 | class ExperimentModelDynamics: 57 | def __init__(self, env_name='Pushing2D-v1', num_nets=1, mpc_params=None): 58 | self.env = gym.make(env_name) 59 | self.task_horizon = TASK_HORIZON 60 | self.agent = Agent(self.env) 61 | mpc_params['use_gt_dynamics'] = False 62 | self.model = PENN(num_nets, STATE_DIM, 63 | len(self.env.action_space.sample()), LR) 64 | self.cem_policy = MPC(self.env, PLAN_HORIZON, self.model, POPSIZE, 65 | NUM_ELITES, MAX_ITERS, **mpc_params, 66 | use_random_optimizer=False) 67 | self.random_policy = MPC(self.env, PLAN_HORIZON, self.model, POPSIZE, 68 | NUM_ELITES, MAX_ITERS, **mpc_params, 69 | use_random_optimizer=True) 70 | self.random_policy_no_mpc = RandomPolicy( 71 | len(self.env.action_space.sample())) 72 | 73 | def test(self, num_episodes, optimizer='cem'): 74 | samples = list() 75 | for j in range(num_episodes): 76 | print('Test episode {}'.format(j + 1)) 77 | samples.append( 78 | self.agent.sample( 79 | self.task_horizon, self.cem_policy 80 | if optimizer == 'cem' else self.random_policy 81 | ) 82 | ) 83 | avg_return = np.mean([sample["reward_sum"] for sample in 
samples]) 84 | avg_success = np.mean([sample["rewards"][-1] == 0 85 | for sample in samples]) 86 | return avg_return, avg_success 87 | 88 | def model_warmup(self, num_episodes, num_epochs): 89 | """Train a single probabilistic model using a random policy 90 | """ 91 | samples = list() 92 | for i in range(num_episodes): 93 | samples.append(self.agent.sample( 94 | self.task_horizon, self.random_policy_no_mpc)) 95 | print('') 96 | print('*'*20, 'TRAINING', '*'*20) 97 | self.cem_policy.train( 98 | [sample["obs"] for sample in samples], 99 | [sample["ac"] for sample in samples], 100 | [sample["rewards"] for sample in samples], 101 | epochs=num_epochs 102 | ) 103 | 104 | def train(self, 105 | num_train_epochs, 106 | num_episodes_per_epoch, 107 | evaluation_interval, 108 | use_buffer=True): 109 | """Jointly training the model and the policy 110 | """ 111 | 112 | def save(filename, arr): 113 | np.save(filename, arr) 114 | 115 | # Buffer has a max size of 100 episodes worth of data 116 | obs_buffer = deque(maxlen=40 * 100) 117 | ac_buffer = deque(maxlen=40 * 100) 118 | rewards_buffer = deque(maxlen=40 * 100) 119 | success_cem = list() 120 | success_random = list() 121 | success_name_cem = 'success_pets_cem.npy' 122 | success_name_ran = 'success_pets_random.npy' 123 | for i in range(num_train_epochs): 124 | print("Starting training epoch %d." % (i + 1)) 125 | # Collect data using the current dynamics model 126 | samples = list() 127 | for j in range(num_episodes_per_epoch): 128 | samples.append( 129 | self.agent.sample(self.task_horizon, self.cem_policy)) 130 | # Update replay buffers 131 | new_obs = [sample["obs"] for sample in samples] 132 | new_ac = [sample["ac"] for sample in samples] 133 | new_rewards = [sample["rewards"] for sample in samples] 134 | obs_buffer.extend(new_obs) 135 | ac_buffer.extend(new_ac) 136 | rewards_buffer.extend(new_rewards) 137 | print("Rewards obtained:", [sample["reward_sum"] 138 | for sample in samples]) 139 | # Train the dynamics model using the data 140 | if use_buffer: 141 | self.cem_policy.train(obs_buffer, ac_buffer, rewards_buffer, 142 | epochs=5) 143 | else: 144 | self.cem_policy.train(new_obs, new_ac, new_rewards, epochs=5) 145 | # Test 146 | num_test_episodes = 20 147 | if (i + 1) % evaluation_interval == 0: 148 | avg_return, avg_success_cem = self.test( 149 | num_test_episodes, optimizer='cem') 150 | print('Test success CEM + MPC:', avg_success_cem) 151 | success_cem.append(avg_success_cem) 152 | save(success_name_cem, success_cem) 153 | avg_return, avg_success_ran = self.test( 154 | num_test_episodes, optimizer='random') 155 | print('Test success Random + MPC:', avg_success_ran) 156 | success_random.append(avg_success_ran) 157 | save(success_name_ran, success_random) 158 | 159 | 160 | def test_cem_gt_dynamics(num_episode=10): 161 | mpc_params = {'use_mpc': False, 'num_particles': 1} 162 | exp = ExperimentGTDynamics(env_name='Pushing2D-v1', mpc_params=mpc_params) 163 | avg_reward, avg_success = exp.test(num_episode) 164 | print('CEM PushingEnv: avg_reward: {}, avg_success: {}'.format( 165 | avg_reward, avg_success)) 166 | 167 | mpc_params = {'use_mpc': True, 'num_particles': 1} 168 | exp = ExperimentGTDynamics(env_name='Pushing2D-v1', mpc_params=mpc_params) 169 | avg_reward, avg_success = exp.test(num_episode) 170 | print('MPC PushingEnv: avg_reward: {}, avg_success: {}'.format( 171 | avg_reward, avg_success)) 172 | 173 | mpc_params = {'use_mpc': False, 'num_particles': 1} 174 | exp = ExperimentGTDynamics(env_name='Pushing2DNoisyControl-v1', 175 | 
mpc_params=mpc_params) 176 | avg_reward, avg_success = exp.test(num_episode) 177 | print('CEM PushingEnv Noisy: avg_reward: {}, avg_success: {}'.format( 178 | avg_reward, avg_success)) 179 | 180 | mpc_params = {'use_mpc': True, 'num_particles': 1} 181 | exp = ExperimentGTDynamics(env_name='Pushing2DNoisyControl-v1', 182 | mpc_params=mpc_params) 183 | avg_reward, avg_success = exp.test(num_episode) 184 | print('CEM+MPC PushingEnv Noisy: avg_reward: {}, avg_success: {}'.format( 185 | avg_reward, avg_success)) 186 | 187 | mpc_params = {'use_mpc': False, 'num_particles': 1} 188 | exp = ExperimentGTDynamics(env_name='Pushing2D-v1', mpc_params=mpc_params) 189 | avg_reward, avg_success = exp.test(num_episode, optimizer='random') 190 | print('Random PushingEnv: avg_reward: {}, avg_success: {}'.format( 191 | avg_reward, avg_success)) 192 | 193 | mpc_params = {'use_mpc': True, 'num_particles': 1} 194 | exp = ExperimentGTDynamics(env_name='Pushing2D-v1', mpc_params=mpc_params) 195 | avg_reward, avg_success = exp.test(num_episode, optimizer='random') 196 | print('Random+MPC PushingEnv: avg_reward: {}, avg_success: {}'.format( 197 | avg_reward, avg_success)) 198 | 199 | mpc_params = {'use_mpc': False, 'num_particles': 1} 200 | exp = ExperimentGTDynamics(env_name='Pushing2DNoisyControl-v1', 201 | mpc_params=mpc_params) 202 | avg_reward, avg_success = exp.test(num_episode, optimizer='random') 203 | print('Random PushingEnv Noisy: avg_reward: {}, avg_success: {}'.format( 204 | avg_reward, avg_success)) 205 | 206 | mpc_params = {'use_mpc': True, 'num_particles': 1} 207 | exp = ExperimentGTDynamics(env_name='Pushing2DNoisyControl-v1', 208 | mpc_params=mpc_params) 209 | avg_reward, avg_success = exp.test(num_episode, optimizer='random') 210 | print('Random+MPC PushingEnv Noisy: avg_reward: {}, avg_success: {}'.format( 211 | avg_reward, avg_success)) 212 | 213 | 214 | def train_single_dynamics(num_test_episode=50): 215 | num_nets = 1 216 | num_episodes = 1000 217 | num_epochs = 100 218 | mpc_params = {'use_mpc': True, 'num_particles': 1} 219 | exp = ExperimentModelDynamics(env_name='Pushing2D-v1', num_nets=num_nets, 220 | mpc_params=mpc_params) 221 | print('*'*20, 'WARMUP', '*'*20) 222 | exp.model_warmup(num_episodes=num_episodes, num_epochs=num_epochs) 223 | avg_reward, avg_success = exp.test(num_test_episode, optimizer='random') 224 | print('MPC PushingEnv: avg_reward: {}, avg_success: {}'.format( 225 | avg_reward, avg_success)) 226 | 227 | 228 | def train_pets(): 229 | num_nets = 2 230 | num_epochs = 500 231 | evaluation_interval = 50 232 | num_episodes_per_epoch = 1 233 | 234 | mpc_params = {'use_mpc': True, 'num_particles': 6} 235 | exp = ExperimentModelDynamics(env_name='Pushing2D-v1', num_nets=num_nets, 236 | mpc_params=mpc_params) 237 | print('*'*20, 'WARMUP', '*'*20) 238 | exp.model_warmup(num_episodes=100, num_epochs=10) 239 | exp.train(num_train_epochs=num_epochs, 240 | num_episodes_per_epoch=num_episodes_per_epoch, 241 | evaluation_interval=evaluation_interval) 242 | 243 | 244 | if __name__ == "__main__": 245 | # Disable AVX/FMA and TF warnings 246 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 247 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 248 | 249 | test_cem_gt_dynamics(50) 250 | train_single_dynamics(50) 251 | train_pets() 252 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | # Some of the code from the following sources: 2 | # 
https://github.com/joschu/modular_rl 3 | # https://github.com/Khrylx/PyTorch-RL 4 | # http://www.johndcook.com/blog/standard_deviation 5 | 6 | import numpy as np 7 | 8 | 9 | def estimate_advantages(rewards, masks, values, gamma, tau): 10 | deltas = np.zeros(len(rewards)) 11 | advantages = np.zeros(len(rewards)) 12 | 13 | prev_value = 0 14 | prev_advantage = 0 15 | for i in reversed(range(len(rewards))): 16 | deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values[i] 17 | advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i] 18 | 19 | prev_value = values[i] 20 | prev_advantage = advantages[i] 21 | 22 | returns = values + advantages 23 | advantages = (advantages - np.mean(advantages)) / np.std(advantages) 24 | 25 | return advantages, returns 26 | 27 | 28 | class RunningStat(object): 29 | def __init__(self, shape): 30 | self._n = 0 31 | self._M = np.zeros(shape) 32 | self._S = np.zeros(shape) 33 | 34 | def push(self, x): 35 | x = np.asarray(x) 36 | assert x.shape == self._M.shape 37 | self._n += 1 38 | if self._n == 1: 39 | self._M[...] = x 40 | else: 41 | oldM = self._M.copy() 42 | self._M[...] = oldM + (x - oldM) / self._n 43 | self._S[...] = self._S + (x - oldM) * (x - self._M) 44 | 45 | @property 46 | def n(self): 47 | return self._n 48 | 49 | @property 50 | def mean(self): 51 | return self._M 52 | 53 | @property 54 | def var(self): 55 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M) 56 | 57 | @property 58 | def std(self): 59 | return np.sqrt(self.var) 60 | 61 | @property 62 | def shape(self): 63 | return self._M.shape 64 | 65 | 66 | class ZFilter: 67 | """ 68 | y = (x-mean)/std 69 | using running estimates of mean,std 70 | """ 71 | 72 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 73 | self.demean = demean 74 | self.destd = destd 75 | self.clip = clip 76 | self.shape = shape 77 | 78 | self.rs = RunningStat(shape) 79 | 80 | def reset(self): 81 | self.rs = RunningStat(self.shape) 82 | 83 | def fit(self, X): 84 | for x in X: 85 | self.rs.push(x) 86 | 87 | def __call__(self, x, update=True): 88 | if update: 89 | self.rs.push(x) 90 | if self.demean: 91 | x = x - self.rs.mean 92 | if self.destd: 93 | x = x / (self.rs.std + 1e-8) 94 | if self.clip: 95 | x = np.clip(x, -self.clip, self.clip) 96 | return x 97 | --------------------------------------------------------------------------------
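
A note on input standardization: the `PENN.train` docstring in `model.py` assumes standardized inputs, and `util.py` provides `ZFilter` for running mean/std normalization (the README notes that using it is optional). The sketch below is one possible, non-authoritative wiring, not part of the handout: the rollout arrays are made up stand-ins for `Agent.sample()` output, the dimensions follow `run.py` (`STATE_DIM = 8`, two-dimensional actions), and the commented-out `PENN` calls only illustrate where the normalized batch would go.

```python
# Illustrative sketch only: standardizing (state, action) inputs with
# util.ZFilter before handing them to PENN.train, which assumes
# standardized inputs. Dummy arrays stand in for real rollout data.
import numpy as np

from util import ZFilter

STATE_DIM, ACTION_DIM = 8, 2

# Pretend rollout: 41 observations (goal dims already dropped) and 40 actions.
states = np.random.uniform(0.0, 5.0, size=(41, STATE_DIM))
actions = np.random.uniform(-1.0, 1.0, size=(40, ACTION_DIM))

inputs = np.concatenate([states[:-1], actions], axis=1)  # (s_t, a_t) pairs
targets = states[1:]                                     # s_{t+1}

input_filter = ZFilter(shape=(STATE_DIM + ACTION_DIM,))
input_filter.fit(inputs)  # update the running mean/std with this batch
norm_inputs = np.array([input_filter(x, update=False) for x in inputs])

# model = PENN(num_nets=2, state_dim=STATE_DIM,
#              action_dim=ACTION_DIM, learning_rate=1e-3)
# model.train(norm_inputs, targets, epochs=5)
```

Calling the filter with `update=False` after `fit` keeps the statistics fixed while transforming the same batch; whether to normalize the targets as well is left open, as in the original code.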