├── .gitignore
├── README.md
├── agent.py
├── envs
│   ├── 2Dpusher_env.py
│   └── __init__.py
├── model.py
├── mpc.py
├── opencv_draw.py
├── run.py
└── util.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # MacOS files
7 | .DS_Store
8 | .AppleDouble
9 | .LSOverride
10 |
11 | # Other files
12 | *.pdf
13 | *.tex
14 | *.log
15 | *.npy
16 | *.png
17 | *.jpg
18 |
19 | # VSCODE files
20 | .vscode/
21 |
22 | # Project files
23 | /*.project
24 | /*.pydevproject
25 | /.settings
26 |
27 | # Sub-directories
28 | template/
29 | writeup/
30 | src/
31 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Model-based Reinforcement Learning
2 | ```
3 | |-- envs
4 |   |-- 2Dpusher_env.py # Implementation of the pusher environment
5 |   |-- __init__.py # Register two pusher environments
6 | |-- run.py # Entry point for all the experiments. Also check out the commented code in this file.
7 | |-- mpc.py # MPC that uses the model to optimize action sequences; call CEM from this file
8 | |-- model.py # Create and train the ensemble dynamics model
9 | |-- agent.py # An agent that can be used to interact with the environment
10 |             # and a random policy
11 | |-- util.py # Some utility functions we kindly provide. You do not have to use them.
12 | |-- opencv_draw.py # Contains features for rendering the Box2D environments. You don't have to do anything to this file.
13 | ```
14 |
15 | # Prerequisites
16 | Run `pip install opencv-python`
17 |
18 | # Guidelines for Implementation
19 | * When implementing the `MPC` class, use the `mpc_params` that is passed into this class.
20 | * One tip is to write a separate `CEMOptimizer` and `RandomOptimizer`, which optimize a cost function over
21 |   action sequences.
22 | * We provide a state-based cost function. To get the cost of a trajectory, you need to sum/average over the cost of all
23 |   the states.
24 | * Sometimes the cost function can give `nan` values. Make sure to deal with such cases in your code.
25 |
--------------------------------------------------------------------------------
/agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np 2 | 3 | 4 | class Agent: 5 | def __init__(self, env): 6 | self.env = env 7 | 8 | def sample(self, horizon, policy): 9 | """ 10 | Sample a rollout from the agent.
11 | 12 | Arguments: 13 | horizon (int): the length of the rollout 14 | policy: the policy that the agent will use for actions 15 | """ 16 | rewards = [] 17 | states, actions, reward_sum, done = [self.env.reset()], [], 0, False 18 | 19 | policy.reset() 20 | for t in range(horizon): 21 | print('time step: {}/{}'.format(t + 1, horizon), end='\r', 22 | flush=True) 23 | actions.append(policy.act(states[t], t)) 24 | state, reward, done, info = self.env.step(actions[t]) 25 | states.append(state) 26 | reward_sum += reward 27 | rewards.append(reward) 28 | if done: 29 | # print(info['done']) 30 | break 31 | # print("Rollout length: ", len(actions)) 32 | 33 | return { 34 | "obs": np.array(states), 35 | "ac": np.array(actions), 36 | "reward_sum": reward_sum, 37 | "rewards": np.array(rewards), 38 | } 39 | 40 | 41 | class RandomPolicy: 42 | def __init__(self, action_dim): 43 | self.action_dim = action_dim 44 | 45 | def reset(self): 46 | pass 47 | 48 | def act(self, arg1, arg2): 49 | return np.random.uniform(size=self.action_dim) * 2 - 1 50 | -------------------------------------------------------------------------------- /envs/2Dpusher_env.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import gym 3 | import copy 4 | import Box2D 5 | import numpy as np 6 | 7 | from Box2D.b2 import (circleShape, fixtureDef, polygonShape) 8 | from opencv_draw import OpencvDrawFuncs 9 | 10 | from gym import spaces 11 | from gym.utils import seeding 12 | 13 | MIN_COORD = 0 14 | MAX_COORD = 5 15 | PUSHER_START = np.array([1.0, 1.0]) 16 | BOX_START = np.array([2.0, 2.0]) 17 | FORCE_MULT = 1 18 | RAD = 0.2 19 | SIDE_GAP_MULT = 2 20 | BOX_RAD = 0.2 21 | GOAL_RAD = 0.5 22 | MAX_STEPS = 40 23 | FPS = 4 24 | 25 | 26 | class Pusher2d(gym.Env): 27 | def __init__(self, control_noise=0.): 28 | self.control_noise = control_noise 29 | self.seed() 30 | self.world = Box2D.b2World(gravity=(0, 0)) 31 | self.pusher = None 32 | self.box = None 33 | # Actions: x-movement, y-movement (clipped -1 to 1) 34 | self.action_space = spaces.Box(np.ones(2) * -1, np.ones(2), dtype=np.float32) 35 | # State: pusher xy position, box xy position, pusher xy velocity, box xy velocity, goal xy position 36 | self.observation_space = spaces.Box(np.ones(10) * MIN_COORD, np.ones(10) * MAX_COORD, dtype=np.float32) 37 | self.reset() 38 | self.drawer = OpencvDrawFuncs(w=240, h=180, ppm=40) 39 | self.drawer.install() 40 | 41 | def seed(self, seed=None): 42 | self.np_random, seed = seeding.np_random(seed) 43 | return [seed] 44 | 45 | def random_place(self): 46 | """ returns [x, y] within an area slightly away from the initial box position """ 47 | return [self.np_random.uniform(BOX_START[0] + BOX_RAD + GOAL_RAD, MAX_COORD - RAD * SIDE_GAP_MULT), 48 | self.np_random.uniform(BOX_START[1] + BOX_RAD + GOAL_RAD, MAX_COORD - RAD * SIDE_GAP_MULT)] 49 | 50 | def _destroy(self): 51 | """ removes instantiated Box2D entities """ 52 | if not self.box: 53 | return 54 | self.world.DestroyBody(self.box) 55 | self.world.DestroyBody(self.pusher) 56 | 57 | def reset(self): 58 | """ standard Gym method; returns first state of episode """ 59 | self._destroy() 60 | self.pusher = self.world.CreateDynamicBody( 61 | position=PUSHER_START[:], 62 | fixtures=fixtureDef( 63 | shape=circleShape(radius=RAD, pos=(0, 0)), 64 | density=1.0 65 | ) 66 | ) 67 | self.box = self.world.CreateDynamicBody( 68 | position=BOX_START[:], 69 | fixtures=fixtureDef( 70 | shape=circleShape(radius=BOX_RAD, pos=(0, 0)), 71 | density=1.0 72 | ) 73 | ) 74 | 
self.goal_pos = self.random_place() 75 | self.elapsed_steps = 0 76 | return self._get_obs() 77 | 78 | def step(self, action, render=False): 79 | """ standard Gym method; returns s, r, d, i """ 80 | if render: 81 | self.drawer.clear_screen() 82 | self.drawer.draw_world(self.world) 83 | 84 | action = np.clip(action, -1, 1).astype(np.float32) 85 | if self.control_noise > 0.: 86 | action += np.random.normal(0., scale=self.control_noise, size=action.shape) 87 | 88 | self.elapsed_steps += 1 89 | self.pusher._b2Body__SetLinearVelocity((FORCE_MULT * action[0], FORCE_MULT * action[1])) 90 | self.box._b2Body__SetActive(True) 91 | self.world.Step(1.0 / FPS, 6 * 30, 2 * 30) 92 | 93 | if render: 94 | cv2.imshow("world", self.drawer.screen) 95 | cv2.waitKey(20) 96 | done = False 97 | reward = -1 98 | obj_coords = np.concatenate([self.pusher.position.tuple, self.box.position.tuple]) 99 | info = {"done": None} 100 | # check if out of bounds 101 | if np.min(obj_coords) < MIN_COORD or np.max(obj_coords) > MAX_COORD: 102 | reward = -1 * (MAX_STEPS - self.elapsed_steps + 2) 103 | done = True 104 | info['done'] = 'unstable simulation' 105 | # check if out of time 106 | elif self.elapsed_steps >= MAX_STEPS: 107 | done = True 108 | info["done"] = "max_steps_reached" 109 | # check if goal reached 110 | elif np.linalg.norm(np.array(self.box.position.tuple) - self.goal_pos) < RAD + GOAL_RAD: 111 | done = True 112 | reward = 0 113 | info["done"] = "goal reached" 114 | 115 | return self._get_obs(), reward, done, info 116 | 117 | def _get_obs(self): 118 | """ returns current state of environment """ 119 | state = np.concatenate([self.pusher.position.tuple, 120 | self.box.position.tuple, 121 | self.pusher.linearVelocity.tuple, 122 | self.box.linearVelocity.tuple, 123 | self.goal_pos]) 124 | return state 125 | 126 | def apply_hindsight(self, states, actions, goal_state): 127 | """ returns list of new states and list of new rewards for use with HER """ 128 | goal = goal_state[2:4] # get new goal location (last location of box) 129 | states.append(goal_state) 130 | num_tuples = len(actions) 131 | her_states, her_rewards = [], [] 132 | states[0][-2:] = goal.copy() 133 | her_states.append(states[0]) 134 | # for each state, adjust goal and calculate reward obtained 135 | for i in range(1, num_tuples + 1): 136 | state = states[i] 137 | state[-2:] = goal.copy() 138 | reward = self._HER_calc_reward(state) 139 | her_states.append(state) 140 | her_rewards.append(reward) 141 | return her_states, her_rewards 142 | 143 | def _HER_calc_reward(self, state): 144 | """ given state, returns reward for transitioning to this state (used by HER) """ 145 | if np.linalg.norm(state[2:4] - state[4:6]) < RAD + GOAL_RAD: 146 | return 0 147 | else: 148 | return -1 149 | 150 | def set_state(self, state): 151 | self.pusher.position = state[:2] 152 | self.box.position = state[2:4] 153 | self.pusher.linearVelocity = state[4:6] 154 | self.box.linearVelocity = state[6:8] 155 | if len(state) == 10: # The state can also be observation only, which does not include the goal 156 | self.goal_pos = state[8:10] 157 | 158 | def get_state(self): 159 | return copy.copy(self._get_obs()) 160 | 161 | def get_nxt_state(self, state, action): 162 | original_state = self.get_state() 163 | original_elapsed_steps = self.elapsed_steps 164 | self.set_state(state) 165 | nxt_state, _, _, _ = self.step(action) 166 | nxt_state = nxt_state[:8] 167 | 168 | # Make sure there is no side effect 169 | self.set_state(original_state) 170 | self.elapsed_steps = original_elapsed_steps 171 
| return nxt_state 172 | -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='Pushing2D-v1', 5 | entry_point='envs.2Dpusher_env:Pusher2d', 6 | kwargs={'control_noise': 0} 7 | ) 8 | 9 | # Control noise is added to the action 10 | register( 11 | id='Pushing2DNoisyControl-v1', 12 | entry_point='envs.2Dpusher_env:Pusher2d', 13 | kwargs={'control_noise': 0.3} 14 | ) 15 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import tensorflow as tf 4 | import tensorflow.keras.backend as K 5 | 6 | from util import ZFilter 7 | from tensorflow.keras.models import Model 8 | from tensorflow.keras.optimizers import Adam 9 | from tensorflow.keras.regularizers import l2 10 | from tensorflow.keras.layers import Dense, Input 11 | 12 | 13 | HIDDEN1_UNITS = 400 14 | HIDDEN2_UNITS = 400 15 | HIDDEN3_UNITS = 400 16 | 17 | 18 | class PENN: 19 | """ 20 | (P)robabilistic (E)nsemble of (N)eural (N)etworks 21 | """ 22 | 23 | def __init__(self, num_nets, state_dim, action_dim, learning_rate): 24 | """ 25 | :param num_nets: number of networks in the ensemble 26 | :param state_dim: state dimension 27 | :param action_dim: action dimension 28 | :param learning_rate: 29 | """ 30 | 31 | self.sess = tf.Session() 32 | self.num_nets = num_nets 33 | self.state_dim = state_dim 34 | self.action_dim = action_dim 35 | self.learning_rate = learning_rate 36 | K.set_session(self.sess) 37 | 38 | # Log variance bounds 39 | self.max_logvar = tf.Variable(-3 * np.ones([1, self.state_dim]), 40 | dtype=tf.float32) 41 | self.min_logvar = tf.Variable(-7 * np.ones([1, self.state_dim]), 42 | dtype=tf.float32) 43 | # Define ops for model output and optimization 44 | self.inputs = list() 45 | self.losses = list() 46 | self.means = list() 47 | self.logvars = list() 48 | self.models = list() 49 | self.outputs = list() 50 | self.targets = list() 51 | self.optimizations = list() 52 | for model in range(self.num_nets): 53 | model, inp = self.create_network() 54 | self.inputs.append(inp) 55 | self.models.append(model) 56 | output = self.get_output(model.output) 57 | mean, logvar = output 58 | self.means.append(mean) 59 | self.logvars.append(logvar) 60 | self.outputs.append(output) 61 | target = tf.placeholder(tf.float32, shape=(None, self.state_dim)) 62 | self.targets.append(target) 63 | var = tf.exp(logvar) 64 | inv_var = tf.divide(1, var) 65 | norm_output = mean - target 66 | # Calculate loss: Mahalanobis distance + log(det(cov)) 67 | loss = tf.multiply(tf.multiply(norm_output, inv_var), norm_output) 68 | loss = tf.reduce_sum(loss, axis=1) 69 | loss += tf.math.log(tf.math.reduce_prod(var, axis=1)) 70 | self.losses.append(loss) 71 | optimizer = Adam(lr=learning_rate) 72 | weights = model.trainable_weights 73 | gradients = tf.gradients(loss, weights) 74 | optimize = optimizer.apply_gradients(zip(gradients, weights)) 75 | self.optimizations.append(optimize) 76 | self.sess.run(tf.initialize_all_variables()) 77 | 78 | def predict(self, states, actions, idxs): 79 | """Predicts the next states from the ensemble given states and actions 80 | Args: 81 | states: input states 82 | actions: actions to be taken 83 | idxs: indices of the models to be used for generating the next 84 | state 85 | Returns: 86 
| next_states: resulting states 87 | """ 88 | next_states = np.zeros_like(states) 89 | input = np.concatenate((states, actions), axis=1) 90 | assert input.shape[1] == (self.action_dim + self.state_dim) 91 | feed_dict = {inp: input for inp in self.inputs} 92 | outs = self.sess.run([self.means, self.logvars], feed_dict=feed_dict) 93 | means = np.array(outs[0]) 94 | logvars = np.array(outs[1]) 95 | assert means.shape[0] == self.num_nets == logvars.shape[0] 96 | means = means[idxs, range(means.shape[1]), :] 97 | logvars = logvars[idxs, range(logvars.shape[1]), :] 98 | sigma = np.sqrt(np.exp(logvars)) 99 | next_states = np.random.normal(means, sigma, size=means.shape) 100 | return next_states 101 | 102 | def get_output(self, output): 103 | """ 104 | Args: 105 | output: tf variable representing the output of the keras models, 106 | i.e., model.output 107 | Returns: 108 | mean and log variance tf tensors 109 | """ 110 | mean = output[:, :self.state_dim] 111 | raw_v = output[:, self.state_dim:] 112 | logvar = self.max_logvar - tf.nn.softplus(self.max_logvar - raw_v) 113 | logvar = self.min_logvar + tf.nn.softplus(logvar - self.min_logvar) 114 | return mean, logvar 115 | 116 | def create_network(self): 117 | inp = Input(shape=[self.state_dim + self.action_dim]) 118 | h1 = Dense(HIDDEN1_UNITS, activation='relu', 119 | kernel_regularizer=l2(0.0001))(inp) 120 | h2 = Dense(HIDDEN2_UNITS, activation='relu', 121 | kernel_regularizer=l2(0.0001))(h1) 122 | h3 = Dense(HIDDEN3_UNITS, activation='relu', 123 | kernel_regularizer=l2(0.0001))(h2) 124 | out = Dense(2 * self.state_dim, activation='linear', 125 | kernel_regularizer=l2(0.0001))(h3) 126 | model = Model(inputs=inp, outputs=out) 127 | return model, inp 128 | 129 | def get_indices(self, n): 130 | return np.random.choice(range(n), size=n, replace=True) 131 | 132 | def train(self, inputs, targets, batch_size=128, epochs=5): 133 | """ 134 | Args: 135 | inputs: state and action inputs. Assumes that inputs are 136 | standardized. 
137 | targets: resulting states 138 | """ 139 | # Shuffle the data 140 | rows = inputs.shape[0] 141 | shuffled_indices = np.random.permutation(rows) 142 | inputs = inputs[shuffled_indices, :] 143 | targets = targets[shuffled_indices, :] 144 | # Sample data indices for different models 145 | indices = [self.get_indices(rows) for _ in range(self.num_nets)] 146 | total_loss = list() 147 | rmse_loss = list() 148 | rmse_name = 'rmse_pets.npy' if self.num_nets > 1 else 'rmse_smd.npy' 149 | loss_name = 'loss_pets.npy' if self.num_nets > 1 else 'loss_smd.npy' 150 | for epoch in range(epochs): 151 | print('Epoch {}/{}'.format(epoch + 1, epochs)) 152 | num_batches = math.ceil(rows / batch_size) 153 | for batch in range(num_batches): 154 | # Sample a batch and get inputs and targets 155 | idx = batch * batch_size 156 | inps = [inputs[indices[x][idx:idx + batch_size]] 157 | for x in range(self.num_nets)] 158 | targs = [targets[indices[x][idx:idx + batch_size]] 159 | for x in range(self.num_nets)] 160 | # RMSE 161 | feed_dict = {inp: input for inp, input in zip( 162 | self.inputs, inps)} 163 | outputs = self.sess.run(self.outputs, feed_dict=feed_dict) 164 | gaussians = [(mean, np.sqrt(np.exp(logvar))) 165 | for mean, logvar in outputs] 166 | next_states = [np.random.normal(mean, logvar, mean.shape) 167 | for mean, logvar in gaussians] 168 | rmse = [np.sqrt(np.square(targs[i] - next_states[i]).mean( 169 | axis=1)) for i in range(self.num_nets)] 170 | rmse = [np.mean(r) for r in rmse] 171 | feed_dict_2 = {targ: target for targ, target in zip( 172 | self.targets, targs)} 173 | feed_dict.update(feed_dict_2) 174 | # Loss corresponding to all models 175 | losses = self.sess.run(self.losses, feed_dict=feed_dict) 176 | loss = [np.mean(model_loss) for model_loss in losses] 177 | self.sess.run(self.optimizations, feed_dict=feed_dict) 178 | print('Batch {}/{} | Loss: {:.5f} | RMSE: {:.5f}'.format( 179 | batch + 1, num_batches, np.mean(loss), np.mean(rmse)), 180 | end='\r', flush=True) 181 | total_loss.append(loss) 182 | rmse_loss.append(rmse) 183 | print('\n', '*'*40) 184 | np.save(rmse_name, rmse_loss) 185 | np.save(loss_name, total_loss) 186 | -------------------------------------------------------------------------------- /mpc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class MPC: 5 | def __init__(self, 6 | env, 7 | plan_horizon, 8 | model, 9 | popsize, 10 | num_elites, 11 | max_iters, 12 | num_particles=6, 13 | use_mpc=True, 14 | use_gt_dynamics=True, 15 | use_random_optimizer=False): 16 | """ 17 | :param env: 18 | :param plan_horizon: 19 | :param model: The learned dynamics model to use, which can be None if 20 | use_gt_dynamics is True 21 | :param popsize: Population size 22 | :param num_elites: CEM parameter 23 | :param max_iters: CEM parameter 24 | :param num_particles: Number of trajectories for TS1 25 | :param use_gt_dynamics: Whether to use the ground truth dynamics from 26 | the environment 27 | :param use_mpc: Whether to use only the first action of a planned 28 | trajectory 29 | :param use_random_optimizer: Whether to use CEM or take random actions 30 | """ 31 | 32 | self.env = env 33 | 34 | self.popsize = popsize 35 | self.max_iters = max_iters 36 | self.num_elites = num_elites 37 | self.plan_horizon = plan_horizon 38 | self.num_particles = num_particles 39 | 40 | self.use_mpc = use_mpc 41 | self.use_gt_dynamics = use_gt_dynamics 42 | self.use_random_optimizer = use_random_optimizer 43 | 44 | self.num_nets = None if model 
is None else model.num_nets 45 | 46 | self.state_dim, self.action_dim = 8, env.action_space.shape[0] 47 | self.ac_ub, self.ac_lb = env.action_space.high, env.action_space.low 48 | 49 | self.goal = self.env.goal_pos 50 | # Set up optimizer 51 | self.model = model 52 | 53 | if use_gt_dynamics: 54 | self.predict_next_state = self.predict_next_state_gt 55 | assert num_particles == 1 56 | else: 57 | self.predict_next_state = self.predict_next_state_model 58 | 59 | # Initialize your planner with the relevant arguments. 60 | # Write different optimizers for cem and random actions respectively 61 | if self.use_random_optimizer: 62 | self.opt = self.random_optimizer 63 | else: 64 | self.opt = self.cem_optimizer 65 | 66 | def obs_cost_fn(self, state): 67 | """ Cost function of the current state """ 68 | # Weights for different terms 69 | W_PUSHER = 1 70 | W_GOAL = 2 71 | W_DIFF = 5 72 | 73 | pusher_x, pusher_y = state[0], state[1] 74 | box_x, box_y = state[2], state[3] 75 | goal_x, goal_y = self.goal[0], self.goal[1] 76 | 77 | pusher_box = np.array([box_x - pusher_x, box_y - pusher_y]) 78 | box_goal = np.array([goal_x - box_x, goal_y - box_y]) 79 | d_box = np.sqrt(np.dot(pusher_box, pusher_box)) 80 | d_goal = np.sqrt(np.dot(box_goal, box_goal)) 81 | diff_coord = np.abs(box_x / box_y - goal_x / goal_y) 82 | # the -0.4 is to adjust for the radius of the box and pusher 83 | x = max(d_box - 0.4, 0) 84 | cost = W_PUSHER * x + W_GOAL * d_goal + W_DIFF * diff_coord 85 | return cost 86 | 87 | def predict_next_state_model(self, states, actions): 88 | """Given a list of state action pairs, use the learned model to 89 | predict the next state. 90 | 91 | Returns: 92 | cost: cost of the given action sequence. 93 | """ 94 | # Initial cost is the same for all sequences 95 | # TODO: Remove cost calculation from this function 96 | rows = actions.shape[0] # M*P 97 | cost = np.array([self.obs_cost_fn(states[0, :])] * rows) 98 | sampler = self.ts1sampling(rows) 99 | for i in range(self.plan_horizon): 100 | idx = i * self.action_dim 101 | action = actions[:, idx:idx + self.action_dim] 102 | idxs = sampler[:, i] 103 | next_states = self.model.predict(states, action, idxs) 104 | states = next_states 105 | cost += np.apply_along_axis(self.obs_cost_fn, axis=1, arr=states) 106 | return cost 107 | 108 | def predict_next_state_gt(self, states, actions): 109 | """Given a list of state action pairs, use the ground truth dynamics 110 | to predict the next state. 111 | """ 112 | for i in range(self.plan_horizon): 113 | idx = i * self.action_dim 114 | action = actions[idx:idx + self.action_dim] 115 | next_state = self.env.get_nxt_state(states[i], action) 116 | states.append(next_state) 117 | return states 118 | 119 | def random_optimizer(self, state): 120 | """Implements the random optimizer. It gives the best action sequence 121 | for a certain initial state. 
122 | """ 123 | # Generate M*I action sequences of length T according to N(0, 0.5I) 124 | total_sequences = self.popsize * self.max_iters 125 | shape = (total_sequences, self.plan_horizon * self.action_dim) 126 | self.reset() # resets mu and sigma 127 | actions = np.random.normal(self.mu, self.sigma, size=shape) 128 | actions = np.clip(actions, a_min=-1, a_max=1) 129 | if not self.use_gt_dynamics: 130 | repeated_actions = np.tile(actions, reps=(self.num_particles, 1)) 131 | rows = repeated_actions.shape[0] 132 | states = np.tile(state, reps=(rows, 1)) 133 | costs = self.predict_next_state_model(states, repeated_actions) 134 | costs = costs.reshape(self.num_particles, -1) 135 | costs = np.mean(costs, axis=0) # these are M*I costs 136 | assert costs.shape[0] == self.popsize * self.max_iters 137 | min_cost_idx = np.argmin(costs) 138 | return actions[min_cost_idx] 139 | else: 140 | best_cost = np.inf 141 | best_action_sequence = np.zeros_like(self.mu) 142 | for i in range(total_sequences): 143 | states = self.predict_next_state_gt([state], actions[i, :]) 144 | assert len(states) == self.plan_horizon + 1 145 | cost = sum(self.obs_cost_fn(x) for x in states) 146 | cost /= self.num_particles 147 | if cost < best_cost: 148 | best_cost = cost 149 | best_action_sequence = actions[i, :] 150 | return best_action_sequence 151 | 152 | def ts1sampling(self, n): 153 | s = (n, self.plan_horizon) 154 | return np.random.choice(range(self.num_nets), size=s, replace=True) 155 | 156 | def cem_optimizer(self, state): 157 | """Implements the Cross Entropy Method optimizer. It gives the action 158 | sequence for a certain initial state by choosing elite sequences and 159 | using their mean. 160 | """ 161 | mu = self.mu 162 | sigma = self.sigma 163 | for i in range(self.max_iters): 164 | # Generate M action sequences of length T according to N(mu, std) 165 | shape = (self.popsize, self.plan_horizon * self.action_dim) 166 | actions = np.random.normal(mu, sigma, size=shape) 167 | actions = np.clip(actions, a_min=-1, a_max=1) 168 | costs = None 169 | if not self.use_gt_dynamics: 170 | reps = (self.num_particles, 1) 171 | repeated_actions = np.tile(actions, reps=reps) 172 | rows = repeated_actions.shape[0] 173 | states = np.tile(state, reps=(rows, 1)) 174 | costs = self.predict_next_state_model(states, repeated_actions) 175 | costs = costs.reshape(self.num_particles, -1) 176 | costs = np.mean(costs, axis=0) # these are M costs 177 | else: 178 | costs = list() 179 | for m in range(self.popsize): 180 | states = self.predict_next_state_gt([state], actions[m, :]) 181 | assert len(states) == self.plan_horizon + 1 182 | cost = sum(self.obs_cost_fn(x) for x in states) 183 | cost /= self.num_particles 184 | costs.append(cost) 185 | # Calculate mean and std using the elite action sequences 186 | costs = np.argsort(costs) 187 | elite_sequences = costs[:self.num_elites] 188 | elite_actions = actions[elite_sequences, :] 189 | assert elite_actions.shape[0] == self.num_elites 190 | mu = np.mean(elite_actions, axis=0) 191 | sigma = np.std(elite_actions, axis=0) 192 | return mu 193 | 194 | def train(self, obs_trajs, acs_trajs, rews_trajs, epochs=5): 195 | """ 196 | Take the input obs, acs, rews and append to existing transitions the 197 | train model. 
198 | 199 | Args: 200 | obs_trajs: states 201 | acs_trajs: actions 202 | rews_trajs: rewards (NOTE: this may not be used) 203 | epochs: number of epochs to train for 204 | """ 205 | assert len(obs_trajs) == len(acs_trajs) 206 | input_states = [traj[:-1, :self.state_dim] for traj in obs_trajs] 207 | input_states = np.concatenate(input_states, axis=0) 208 | assert input_states.shape[1] == self.state_dim 209 | targets = [traj[1:, :self.state_dim] for traj in obs_trajs] 210 | targets = np.concatenate(targets, axis=0) 211 | assert targets.shape[1] == self.state_dim 212 | actions = [acs for acs in acs_trajs] 213 | actions = np.concatenate(actions, axis=0) 214 | assert actions.shape[1] == self.action_dim 215 | inputs = np.concatenate((input_states, actions), axis=1) 216 | assert inputs.shape[1] == (self.state_dim + self.action_dim) 217 | self.model.train(inputs, targets, epochs=epochs) 218 | 219 | def reset(self): 220 | """Initializes variables mu and sigma. 221 | """ 222 | self.mu = np.zeros(self.plan_horizon * self.action_dim) 223 | self.reset_sigma() 224 | 225 | def act(self, state, t): 226 | """ 227 | Find the action for current state. 228 | 229 | Arguments: 230 | state: current state 231 | t: current timestep 232 | """ 233 | self.goal = state[self.state_dim:] 234 | assert len(self.goal) == 2 235 | state = state[:self.state_dim] 236 | if self.use_mpc: 237 | mu = self.opt(state) 238 | action = mu[:self.action_dim] # Get the first action 239 | action = action.copy() 240 | mu[:-self.action_dim] = mu[self.action_dim:] 241 | mu[-self.action_dim:] = 0 242 | self.mu = mu 243 | else: 244 | if t % self.plan_horizon == 0: 245 | self.mu = self.opt(state) 246 | idx = (t % self.plan_horizon) * self.action_dim 247 | action = self.mu[idx:idx + self.action_dim] 248 | return action 249 | 250 | def reset_sigma(self): 251 | """Resets/initializes the value of sigma. 252 | """ 253 | sigma = [0.5 ** 0.5] * (self.plan_horizon * self.action_dim) 254 | self.sigma = np.array(sigma) 255 | -------------------------------------------------------------------------------- /opencv_draw.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Python version Copyright (c) 2015 John Stowers 4 | # 5 | # This software is provided 'as-is', without any express or implied 6 | # warranty. In no event will the authors be held liable for any damages 7 | # arising from the use of this software. 8 | # Permission is granted to anyone to use this software for any purpose, 9 | # including commercial applications, and to alter it and redistribute it 10 | # freely, subject to the following restrictions: 11 | # 1. The origin of this software must not be misrepresented; you must not 12 | # claim that you wrote the original software. If you use this software 13 | # in a product, an acknowledgment in the product documentation would be 14 | # appreciated but is not required. 15 | # 2. Altered source versions must be plainly marked as such, and must not be 16 | # misrepresented as being the original software. 17 | # 3. This notice may not be removed or altered from any source distribution. 
18 | 19 | import cv2 20 | import numpy as np 21 | 22 | from Box2D import (b2Color, b2DistanceJoint, b2MouseJoint, b2PulleyJoint) 23 | from Box2D.b2 import (staticBody, dynamicBody, kinematicBody, polygonShape, 24 | circleShape, loopShape, edgeShape) 25 | 26 | 27 | def cvcolor(color): 28 | return int(255.0 * color[2]), int(255.0 * color[1]), int(255.0 * color[0]) 29 | 30 | 31 | def cvcoord(pos): 32 | return tuple(map(int, pos)) 33 | 34 | 35 | class OpencvDrawFuncs(object): 36 | 37 | def __init__(self, w, h, ppm, fill_polygon=True, flip_y=True): 38 | self._w = w 39 | self._h = h 40 | self._ppm = ppm 41 | self._colors = { 42 | staticBody: (255, 255, 255), 43 | dynamicBody: (127, 127, 127), 44 | kinematicBody: (127, 127, 230), 45 | } 46 | self._fill_polygon = fill_polygon 47 | self._flip_y = flip_y 48 | self.screen = np.zeros((self._h, self._w, 3), np.uint8) 49 | 50 | def install(self): 51 | polygonShape.draw = self._draw_polygon 52 | circleShape.draw = self._draw_circle 53 | loopShape.draw = self._draw_loop 54 | edgeShape.draw = self._draw_edge 55 | 56 | def draw_world(self, world): 57 | for body in world.bodies: 58 | for fixture in body.fixtures: 59 | fixture.shape.draw(body, fixture) 60 | for joint in world.joints: 61 | self._draw_joint(joint) 62 | 63 | def clear_screen(self, screen=None): 64 | if screen is None: 65 | self.screen.fill(0) 66 | else: 67 | self.screen = screen 68 | 69 | def _fix_vertices(self, vertices): 70 | if self._flip_y: 71 | return [(v[0], self._h - v[1]) for v in vertices] 72 | else: 73 | return [(v[0], v[1]) for v in vertices] 74 | 75 | def _draw_joint(self, joint): 76 | bodyA, bodyB = joint.bodyA, joint.bodyB 77 | xf1, xf2 = bodyA.transform, bodyB.transform 78 | x1, x2 = xf1.position, xf2.position 79 | p1, p2 = joint.anchorA, joint.anchorB 80 | color = b2Color(0.5, 0.8, 0.8) 81 | 82 | x1, x2, p1, p2 = self._fix_vertices((x1 * self._ppm, x2 * self._ppm, 83 | p1 * self._ppm, p2 * self._ppm)) 84 | 85 | if isinstance(joint, b2DistanceJoint): 86 | cv2.line(self.screen, cvcoord(p1), cvcoord(p2), cvcolor(color), 1) 87 | elif isinstance(joint, b2PulleyJoint): 88 | s1, s2 = joint.groundAnchorA, joint.groundAnchorB 89 | s1, s2 = self._fix_vertices((s1 * self._ppm, s2 * self._ppm)) 90 | cv2.line(self.screen, cvcoord(s1), cvcoord(p1), cvcolor(color), 1) 91 | cv2.line(self.screen, cvcoord(s2), cvcoord(p2), cvcolor(color), 1) 92 | cv2.line(self.screen, cvcoord(s1), cvcoord(s2), cvcolor(color), 1) 93 | elif isinstance(joint, b2MouseJoint): 94 | pass # don't draw it here 95 | else: 96 | cv2.line(self.screen, cvcoord(x1), cvcoord(p1), cvcolor(color), 1) 97 | cv2.line(self.screen, cvcoord(p1), cvcoord(p2), cvcolor(color), 1) 98 | cv2.line(self.screen, cvcoord(x2), cvcoord(p2), cvcolor(color), 1) 99 | 100 | def _draw_polygon(self, body, fixture): 101 | polygon = fixture.shape 102 | 103 | transform = body.transform 104 | vertices = self._fix_vertices([transform * v * self._ppm 105 | for v in polygon.vertices]) 106 | 107 | pts = np.array(vertices, np.int32) 108 | pts = pts.reshape((-1, 1, 2)) 109 | cv2.polylines(self.screen, [pts], True, self._colors[body.type]) 110 | 111 | if self._fill_polygon: 112 | lightc = np.array(self._colors[body.type], dtype=int) * 0.5 113 | cv2.fillPoly(self.screen, [pts], lightc) 114 | 115 | def _draw_circle(self, body, fixture): 116 | circle = fixture.shape 117 | position = self._fix_vertices( 118 | [body.transform * circle.pos * self._ppm])[0] 119 | cv2.circle(self.screen, cvcoord(position), int( 120 | circle.radius * self._ppm), self._colors[body.type], 1) 
121 | 122 | def _draw_edge(self, body, fixture): 123 | edge = fixture.shape 124 | v = [body.transform * edge.vertex1 * self._ppm, 125 | body.transform * edge.vertex2 * self._ppm] 126 | vertices = self._fix_vertices(v) 127 | cv2.line(self.screen, cvcoord(vertices[0]), 128 | cvcoord(vertices[1]), self._colors[body.type], 1) 129 | 130 | def _draw_loop(self, body, fixture): 131 | loop = fixture.shape 132 | transform = body.transform 133 | vertices = self._fix_vertices([transform * v * self._ppm 134 | for v in loop.vertices]) 135 | v1 = vertices[-1] 136 | for v2 in vertices: 137 | cv2.line(self.screen, cvcoord(v1), cvcoord(v2), 138 | self._colors[body.type], 1) 139 | v1 = v2 140 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import envs 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from collections import deque 8 | from mpc import MPC 9 | from model import PENN 10 | from agent import Agent, RandomPolicy 11 | 12 | # Training params 13 | TASK_HORIZON = 40 14 | PLAN_HORIZON = 5 15 | # CEM params 16 | POPSIZE = 200 17 | NUM_ELITES = 20 18 | MAX_ITERS = 5 19 | # Model params 20 | LR = 1e-3 21 | # Dims 22 | STATE_DIM = 8 23 | 24 | 25 | class ExperimentGTDynamics(object): 26 | def __init__(self, env_name='Pushing2D-v1', mpc_params=None): 27 | self.env = gym.make(env_name) 28 | self.task_horizon = TASK_HORIZON 29 | self.agent = Agent(self.env) 30 | self.warmup = False # Does not need model 31 | mpc_params['use_gt_dynamics'] = True 32 | self.cem_policy = MPC(self.env, PLAN_HORIZON, None, POPSIZE, 33 | NUM_ELITES, MAX_ITERS, **mpc_params, 34 | use_random_optimizer=False) 35 | self.random_policy = MPC(self.env, PLAN_HORIZON, None, POPSIZE, 36 | NUM_ELITES, MAX_ITERS, **mpc_params, 37 | use_random_optimizer=True) 38 | 39 | def test(self, num_episodes, optimizer='cem'): 40 | samples = [] 41 | print('Optimizer:', optimizer) 42 | for j in range(num_episodes): 43 | print('Test episode {}'.format(j + 1)) 44 | samples.append( 45 | self.agent.sample( 46 | self.task_horizon, self.cem_policy if 47 | optimizer == 'cem' else self.random_policy 48 | ) 49 | ) 50 | avg_return = np.mean([sample["reward_sum"] for sample in samples]) 51 | avg_success = np.mean([sample["rewards"][-1] == 0 52 | for sample in samples]) 53 | return avg_return, avg_success 54 | 55 | 56 | class ExperimentModelDynamics: 57 | def __init__(self, env_name='Pushing2D-v1', num_nets=1, mpc_params=None): 58 | self.env = gym.make(env_name) 59 | self.task_horizon = TASK_HORIZON 60 | self.agent = Agent(self.env) 61 | mpc_params['use_gt_dynamics'] = False 62 | self.model = PENN(num_nets, STATE_DIM, 63 | len(self.env.action_space.sample()), LR) 64 | self.cem_policy = MPC(self.env, PLAN_HORIZON, self.model, POPSIZE, 65 | NUM_ELITES, MAX_ITERS, **mpc_params, 66 | use_random_optimizer=False) 67 | self.random_policy = MPC(self.env, PLAN_HORIZON, self.model, POPSIZE, 68 | NUM_ELITES, MAX_ITERS, **mpc_params, 69 | use_random_optimizer=True) 70 | self.random_policy_no_mpc = RandomPolicy( 71 | len(self.env.action_space.sample())) 72 | 73 | def test(self, num_episodes, optimizer='cem'): 74 | samples = list() 75 | for j in range(num_episodes): 76 | print('Test episode {}'.format(j + 1)) 77 | samples.append( 78 | self.agent.sample( 79 | self.task_horizon, self.cem_policy 80 | if optimizer == 'cem' else self.random_policy 81 | ) 82 | ) 83 | avg_return = np.mean([sample["reward_sum"] for sample in 
samples]) 84 | avg_success = np.mean([sample["rewards"][-1] == 0 85 | for sample in samples]) 86 | return avg_return, avg_success 87 | 88 | def model_warmup(self, num_episodes, num_epochs): 89 | """Train a single probabilistic model using a random policy 90 | """ 91 | samples = list() 92 | for i in range(num_episodes): 93 | samples.append(self.agent.sample( 94 | self.task_horizon, self.random_policy_no_mpc)) 95 | print('') 96 | print('*'*20, 'TRAINING', '*'*20) 97 | self.cem_policy.train( 98 | [sample["obs"] for sample in samples], 99 | [sample["ac"] for sample in samples], 100 | [sample["rewards"] for sample in samples], 101 | epochs=num_epochs 102 | ) 103 | 104 | def train(self, 105 | num_train_epochs, 106 | num_episodes_per_epoch, 107 | evaluation_interval, 108 | use_buffer=True): 109 | """Jointly training the model and the policy 110 | """ 111 | 112 | def save(filename, arr): 113 | np.save(filename, arr) 114 | 115 | # Buffer has a max size of 100 episodes worth of data 116 | obs_buffer = deque(maxlen=40 * 100) 117 | ac_buffer = deque(maxlen=40 * 100) 118 | rewards_buffer = deque(maxlen=40 * 100) 119 | success_cem = list() 120 | success_random = list() 121 | success_name_cem = 'success_pets_cem.npy' 122 | success_name_ran = 'success_pets_random.npy' 123 | for i in range(num_train_epochs): 124 | print("Starting training epoch %d." % (i + 1)) 125 | # Collect data using the current dynamics model 126 | samples = list() 127 | for j in range(num_episodes_per_epoch): 128 | samples.append( 129 | self.agent.sample(self.task_horizon, self.cem_policy)) 130 | # Update replay buffers 131 | new_obs = [sample["obs"] for sample in samples] 132 | new_ac = [sample["ac"] for sample in samples] 133 | new_rewards = [sample["rewards"] for sample in samples] 134 | obs_buffer.extend(new_obs) 135 | ac_buffer.extend(new_ac) 136 | rewards_buffer.extend(new_rewards) 137 | print("Rewards obtained:", [sample["reward_sum"] 138 | for sample in samples]) 139 | # Train the dynamics model using the data 140 | if use_buffer: 141 | self.cem_policy.train(obs_buffer, ac_buffer, rewards_buffer, 142 | epochs=5) 143 | else: 144 | self.cem_policy.train(new_obs, new_ac, new_rewards, epochs=5) 145 | # Test 146 | num_test_episodes = 20 147 | if (i + 1) % evaluation_interval == 0: 148 | avg_return, avg_success_cem = self.test( 149 | num_test_episodes, optimizer='cem') 150 | print('Test success CEM + MPC:', avg_success_cem) 151 | success_cem.append(avg_success_cem) 152 | save(success_name_cem, success_cem) 153 | avg_return, avg_success_ran = self.test( 154 | num_test_episodes, optimizer='random') 155 | print('Test success Random + MPC:', avg_success_ran) 156 | success_random.append(avg_success_ran) 157 | save(success_name_ran, success_random) 158 | 159 | 160 | def test_cem_gt_dynamics(num_episode=10): 161 | mpc_params = {'use_mpc': False, 'num_particles': 1} 162 | exp = ExperimentGTDynamics(env_name='Pushing2D-v1', mpc_params=mpc_params) 163 | avg_reward, avg_success = exp.test(num_episode) 164 | print('CEM PushingEnv: avg_reward: {}, avg_success: {}'.format( 165 | avg_reward, avg_success)) 166 | 167 | mpc_params = {'use_mpc': True, 'num_particles': 1} 168 | exp = ExperimentGTDynamics(env_name='Pushing2D-v1', mpc_params=mpc_params) 169 | avg_reward, avg_success = exp.test(num_episode) 170 | print('MPC PushingEnv: avg_reward: {}, avg_success: {}'.format( 171 | avg_reward, avg_success)) 172 | 173 | mpc_params = {'use_mpc': False, 'num_particles': 1} 174 | exp = ExperimentGTDynamics(env_name='Pushing2DNoisyControl-v1', 175 | 
mpc_params=mpc_params) 176 | avg_reward, avg_success = exp.test(num_episode) 177 | print('CEM PushingEnv Noisy: avg_reward: {}, avg_success: {}'.format( 178 | avg_reward, avg_success)) 179 | 180 | mpc_params = {'use_mpc': True, 'num_particles': 1} 181 | exp = ExperimentGTDynamics(env_name='Pushing2DNoisyControl-v1', 182 | mpc_params=mpc_params) 183 | avg_reward, avg_success = exp.test(num_episode) 184 | print('CEM+MPC PushingEnv Noisy: avg_reward: {}, avg_success: {}'.format( 185 | avg_reward, avg_success)) 186 | 187 | mpc_params = {'use_mpc': False, 'num_particles': 1} 188 | exp = ExperimentGTDynamics(env_name='Pushing2D-v1', mpc_params=mpc_params) 189 | avg_reward, avg_success = exp.test(num_episode, optimizer='random') 190 | print('Random PushingEnv: avg_reward: {}, avg_success: {}'.format( 191 | avg_reward, avg_success)) 192 | 193 | mpc_params = {'use_mpc': True, 'num_particles': 1} 194 | exp = ExperimentGTDynamics(env_name='Pushing2D-v1', mpc_params=mpc_params) 195 | avg_reward, avg_success = exp.test(num_episode, optimizer='random') 196 | print('Random+MPC PushingEnv: avg_reward: {}, avg_success: {}'.format( 197 | avg_reward, avg_success)) 198 | 199 | mpc_params = {'use_mpc': False, 'num_particles': 1} 200 | exp = ExperimentGTDynamics(env_name='Pushing2DNoisyControl-v1', 201 | mpc_params=mpc_params) 202 | avg_reward, avg_success = exp.test(num_episode, optimizer='random') 203 | print('Random PushingEnv Noisy: avg_reward: {}, avg_success: {}'.format( 204 | avg_reward, avg_success)) 205 | 206 | mpc_params = {'use_mpc': True, 'num_particles': 1} 207 | exp = ExperimentGTDynamics(env_name='Pushing2DNoisyControl-v1', 208 | mpc_params=mpc_params) 209 | avg_reward, avg_success = exp.test(num_episode, optimizer='random') 210 | print('Random+MPC PushingEnv Noisy: avg_reward: {}, avg_success: {}'.format( 211 | avg_reward, avg_success)) 212 | 213 | 214 | def train_single_dynamics(num_test_episode=50): 215 | num_nets = 1 216 | num_episodes = 1000 217 | num_epochs = 100 218 | mpc_params = {'use_mpc': True, 'num_particles': 1} 219 | exp = ExperimentModelDynamics(env_name='Pushing2D-v1', num_nets=num_nets, 220 | mpc_params=mpc_params) 221 | print('*'*20, 'WARMUP', '*'*20) 222 | exp.model_warmup(num_episodes=num_episodes, num_epochs=num_epochs) 223 | avg_reward, avg_success = exp.test(num_test_episode, optimizer='random') 224 | print('MPC PushingEnv: avg_reward: {}, avg_success: {}'.format( 225 | avg_reward, avg_success)) 226 | 227 | 228 | def train_pets(): 229 | num_nets = 2 230 | num_epochs = 500 231 | evaluation_interval = 50 232 | num_episodes_per_epoch = 1 233 | 234 | mpc_params = {'use_mpc': True, 'num_particles': 6} 235 | exp = ExperimentModelDynamics(env_name='Pushing2D-v1', num_nets=num_nets, 236 | mpc_params=mpc_params) 237 | print('*'*20, 'WARMUP', '*'*20) 238 | exp.model_warmup(num_episodes=100, num_epochs=10) 239 | exp.train(num_train_epochs=num_epochs, 240 | num_episodes_per_epoch=num_episodes_per_epoch, 241 | evaluation_interval=evaluation_interval) 242 | 243 | 244 | if __name__ == "__main__": 245 | # Disable AVX/FMA and TF warnings 246 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 247 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 248 | 249 | test_cem_gt_dynamics(50) 250 | train_single_dynamics(50) 251 | train_pets() 252 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | # Some of the code from the following sources: 2 | # 
https://github.com/joschu/modular_rl 3 | # https://github.com/Khrylx/PyTorch-RL 4 | # http://www.johndcook.com/blog/standard_deviation 5 | 6 | import numpy as np 7 | 8 | 9 | def estimate_advantages(rewards, masks, values, gamma, tau): 10 | deltas = np.zeros(len(rewards)) 11 | advantages = np.zeros(len(rewards)) 12 | 13 | prev_value = 0 14 | prev_advantage = 0 15 | for i in reversed(range(len(rewards))): 16 | deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values[i] 17 | advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i] 18 | 19 | prev_value = values[i] 20 | prev_advantage = advantages[i] 21 | 22 | returns = values + advantages 23 | advantages = (advantages - np.mean(advantages)) / np.std(advantages) 24 | 25 | return advantages, returns 26 | 27 | 28 | class RunningStat(object): 29 | def __init__(self, shape): 30 | self._n = 0 31 | self._M = np.zeros(shape) 32 | self._S = np.zeros(shape) 33 | 34 | def push(self, x): 35 | x = np.asarray(x) 36 | assert x.shape == self._M.shape 37 | self._n += 1 38 | if self._n == 1: 39 | self._M[...] = x 40 | else: 41 | oldM = self._M.copy() 42 | self._M[...] = oldM + (x - oldM) / self._n 43 | self._S[...] = self._S + (x - oldM) * (x - self._M) 44 | 45 | @property 46 | def n(self): 47 | return self._n 48 | 49 | @property 50 | def mean(self): 51 | return self._M 52 | 53 | @property 54 | def var(self): 55 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M) 56 | 57 | @property 58 | def std(self): 59 | return np.sqrt(self.var) 60 | 61 | @property 62 | def shape(self): 63 | return self._M.shape 64 | 65 | 66 | class ZFilter: 67 | """ 68 | y = (x-mean)/std 69 | using running estimates of mean,std 70 | """ 71 | 72 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 73 | self.demean = demean 74 | self.destd = destd 75 | self.clip = clip 76 | self.shape = shape 77 | 78 | self.rs = RunningStat(shape) 79 | 80 | def reset(self): 81 | self.rs = RunningStat(self.shape) 82 | 83 | def fit(self, X): 84 | for x in X: 85 | self.rs.push(x) 86 | 87 | def __call__(self, x, update=True): 88 | if update: 89 | self.rs.push(x) 90 | if self.demean: 91 | x = x - self.rs.mean 92 | if self.destd: 93 | x = x / (self.rs.std + 1e-8) 94 | if self.clip: 95 | x = np.clip(x, -self.clip, self.clip) 96 | return x 97 | --------------------------------------------------------------------------------
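
A note on input standardization: the `PENN.train` docstring in `model.py` assumes standardized inputs, and `util.py` provides `ZFilter` for running mean/std normalization (the README notes that using it is optional). The sketch below is one possible, non-authoritative wiring, not part of the handout: the rollout arrays are made up stand-ins for `Agent.sample()` output, the dimensions follow `run.py` (`STATE_DIM = 8`, two-dimensional actions), and the commented-out `PENN` calls only illustrate where the normalized batch would go.

```python
# Illustrative sketch only: standardizing (state, action) inputs with
# util.ZFilter before handing them to PENN.train, which assumes
# standardized inputs. Dummy arrays stand in for real rollout data.
import numpy as np

from util import ZFilter

STATE_DIM, ACTION_DIM = 8, 2

# Pretend rollout: 41 observations (goal dims already dropped) and 40 actions.
states = np.random.uniform(0.0, 5.0, size=(41, STATE_DIM))
actions = np.random.uniform(-1.0, 1.0, size=(40, ACTION_DIM))

inputs = np.concatenate([states[:-1], actions], axis=1)  # (s_t, a_t) pairs
targets = states[1:]                                     # s_{t+1}

input_filter = ZFilter(shape=(STATE_DIM + ACTION_DIM,))
input_filter.fit(inputs)  # update the running mean/std with this batch
norm_inputs = np.array([input_filter(x, update=False) for x in inputs])

# model = PENN(num_nets=2, state_dim=STATE_DIM,
#              action_dim=ACTION_DIM, learning_rate=1e-3)
# model.train(norm_inputs, targets, epochs=5)
```

Calling the filter with `update=False` after `fit` keeps the statistics fixed while transforming the same batch; whether to normalize the targets as well is left open, as in the original code.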