├── data ├── mb_mpc_HalfCheetah-v1_10-01-2018_19-26-31 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_10-01-2018_19-27-37 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_10-01-2018_19-28-13 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-16-05 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-18-18 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-21-23 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-21-56 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-43-58 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-51-29 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-51-45 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-51-58 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-54-06 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-00-39 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-01-58 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-17-09 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-18-25 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-30-29 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-31-54 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-32-14 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-33-12 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-34-21 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_13-04-22 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_13-04-49 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_13-05-08 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_13-05-24 │ └── log.txt └── mb_mpc_HalfCheetah-v1_11-01-2018_13-06-02 │ └── log.txt ├── __pycache__ ├── logz.cpython-35.pyc ├── dynamics.cpython-35.pyc ├── cheetah_env.cpython-35.pyc ├── controllers.cpython-35.pyc └── cost_functions.cpython-35.pyc ├── README.md ├── cheetah_env.py ├── cost_functions.py ├── policy.py ├── controllers.py ├── dynamics.py ├── logz.py ├── plot.py ├── iLQR.py └── main.py /data/mb_mpc_HalfCheetah-v1_10-01-2018_19-26-31/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_10-01-2018_19-27-37/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_10-01-2018_19-28-13/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-16-05/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-18-18/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-21-23/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-21-56/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-43-58/log.txt: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-51-29/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-51-45/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-51-58/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-54-06/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-00-39/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-01-58/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-17-09/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-18-25/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-30-29/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-31-54/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-32-14/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-33-12/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-34-21/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_13-04-22/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_13-04-49/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/data/mb_mpc_HalfCheetah-v1_11-01-2018_13-05-08/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_13-05-24/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_13-06-02/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /__pycache__/logz.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/MPC/HEAD/__pycache__/logz.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/dynamics.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/MPC/HEAD/__pycache__/dynamics.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/cheetah_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/MPC/HEAD/__pycache__/cheetah_env.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/controllers.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/MPC/HEAD/__pycache__/controllers.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/cost_functions.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/MPC/HEAD/__pycache__/cost_functions.cpython-35.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### About 2 | 3 | This work implements a simple MPC controller for Gym's MuJoCo environments, as described in **Neural Network Dynamics for Model-Based Deep Reinforcement Learning with Model-Free Fine-Tuning**, 4 | and builds on it by adding LQR-based (iLQR) controllers in place of simple shooting methods. These controllers are then run in parallel, and the stored trajectories are used to learn a general 5 | neural network policy. 6 | 7 | 8 | ### Dependencies 9 | 10 | This code has been tested with Python 3 and requires [mujoco_py](https://github.com/openai/mujoco-py), `gym`, and TensorFlow; it also uses `mpi4py` (parallel sampling in `main.py`), `autograd` (`iLQR.py`), and `seaborn`/`pandas`/`matplotlib` (`plot.py`). 11 | 12 | ### How to Run 13 | 14 | Run `python3 main.py`. Passing `--load_model True` restores previously stored policy parameters from the path given by `--model_path`.
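Since `main.py` collects on-policy trajectories in parallel over `MPI.COMM_WORLD`, multi-worker runs are presumably launched under an MPI runtime, for example `mpirun -np 4 python3 main.py --env_name HalfCheetah-v1` (the worker count and environment here are illustrative). The remaining hyperparameters (`--onpol_iters`, `--mpc_horizon`, `--simulated_paths`, `--ep_len`, and so on) are defined in the argument parser at the bottom of `main.py`.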
15 | 16 | -------------------------------------------------------------------------------- /cheetah_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnvNew(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 8 | utils.EzPickle.__init__(self) 9 | 10 | def _step(self, action): 11 | xposbefore = self.model.data.qpos[0, 0] 12 | self.do_simulation(action, self.frame_skip) 13 | xposafter = self.model.data.qpos[0, 0] 14 | ob = self._get_obs() 15 | reward_ctrl = - 0.1 * np.square(action).sum() 16 | reward_run = (xposafter - xposbefore)/self.dt 17 | reward = reward_ctrl + reward_run 18 | done = False 19 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 20 | 21 | def _get_obs(self): 22 | return np.concatenate([ 23 | self.model.data.qpos.flat[1:], 24 | self.model.data.qvel.flat, 25 | self.get_body_com("torso").flat, 26 | # self.get_body_comvel("torso").flat, 27 | ]) 28 | 29 | def reset_model(self): 30 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 31 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 32 | self.set_state(qpos, qvel) 33 | return self._get_obs() 34 | 35 | def viewer_setup(self): 36 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /cost_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | #======================================================== 5 | # 6 | # Environment-specific cost functions: 7 | # 8 | def reacher_cost_fn(state, action, next_state): 9 | 10 | if len(state.shape) > 1: 11 | 12 | scores=np.zeros((state.shape[0],)) 13 | 14 | scores += np.linalg.norm(next_state[:,-1]) 15 | return scores 16 | 17 | score = np.linalg.norm(next_state[-1]) 18 | return score 19 | 20 | def pendulum_cost_fn(state, action, next_state): 21 | 22 | if len(state.shape) > 1: 23 | 24 | scores=np.zeros((state.shape[0],)) 25 | 26 | scores += np.abs(next_state[:,1]) #+ 0.001 * action**2 27 | return scores 28 | 29 | score = np.abs(next_state[1]) #+ 0.001 * action**2 30 | 31 | return score 32 | 33 | def cheetah_cost_fn(state, action, next_state): 34 | if len(state.shape) > 1: 35 | 36 | heading_penalty_factor=10 37 | scores=np.zeros((state.shape[0],)) 38 | 39 | #dont move front shin back so far that you tilt forward 40 | front_leg = state[:,5] 41 | my_range = 0.2 42 | scores[front_leg>=my_range] += heading_penalty_factor 43 | 44 | front_shin = state[:,6] 45 | my_range = 0 46 | scores[front_shin>=my_range] += heading_penalty_factor 47 | 48 | front_foot = state[:,7] 49 | my_range = 0 50 | scores[front_foot>=my_range] += heading_penalty_factor 51 | 52 | scores-= (next_state[:,17] - state[:,17]) / 0.01 #+ 0.1 * (np.sum(action**2, axis=1)) 53 | return scores 54 | 55 | heading_penalty_factor=10 56 | score = 0 57 | 58 | #dont move front shin back so far that you tilt forward 59 | front_leg = state[5] 60 | my_range = 0.2 61 | if front_leg>=my_range: 62 | score += heading_penalty_factor 63 | 64 | front_shin = state[6] 65 | my_range = 0 66 | if front_shin>=my_range: 67 | score += heading_penalty_factor 68 | 69 | front_foot = state[7] 70 | my_range = 0 71 | if front_foot>=my_range: 72 | score += heading_penalty_factor 73 | 74 | score -= 
(next_state[17] - state[17]) / 0.01 #+ 0.1 * (np.sum(action**2)) 75 | return score 76 | 77 | #======================================================== 78 | # 79 | # Cost function for a whole trajectory: 80 | # 81 | 82 | def trajectory_cost_fn(cost_fn, states, actions, next_states): 83 | trajectory_cost = 0 84 | for i in range(len(actions)): 85 | trajectory_cost += cost_fn(states[i], actions[i], next_states[i]) 86 | return trajectory_cost 87 | -------------------------------------------------------------------------------- /policy.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | # Predefined function to build a feedforward neural network 5 | def build_mlp(input_placeholder, 6 | output_size, 7 | scope, 8 | n_layers=1, 9 | size=64, 10 | activation=tf.tanh, 11 | output_activation=None 12 | ): 13 | out = input_placeholder 14 | with tf.variable_scope(scope): 15 | for _ in range(n_layers): 16 | out = tf.layers.dense(out, size, activation=activation) 17 | out = tf.layers.dense(out, output_size, activation=output_activation) 18 | return out 19 | 20 | class NNPolicy(): 21 | def __init__(self, 22 | env, 23 | normalization, 24 | batch_size, 25 | iterations, 26 | learning_rate, 27 | sess, 28 | model_path, 29 | save_path, 30 | load_model 31 | ): 32 | """ YOUR CODE HERE """ 33 | """ Note: Be careful about normalization """ 34 | 35 | self.normalization = normalization 36 | self.batch_size = batch_size 37 | self.iterations = iterations 38 | 39 | self.states = tf.placeholder(shape = [None, env.observation_space.shape[0]], dtype = tf.float32) 40 | self.actions = tf.placeholder(shape = [None, env.action_space.shape[0]], dtype = tf.float32) 41 | 42 | self.mean = build_mlp(self.states, env.action_space.shape[0], "policy")#, n_layers, size, activation, output_activation) 43 | 44 | self.loss = tf.reduce_mean(tf.square((self.actions) - self.mean)) 45 | self.update_op = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) 46 | 47 | self.sess = sess 48 | self.model_path = model_path 49 | self.save_path = save_path 50 | self.saver = tf.train.Saver(max_to_keep=1) 51 | if load_model: 52 | self.saver.restore(self.sess, self.model_path) 53 | 54 | def fit(self, data): 55 | """ 56 | Write a function to take in a dataset of (unnormalized)states, (unnormalized)actions and imitate the actions taken by an expert. 
57 | """ 58 | 59 | """YOUR CODE HERE """ 60 | observations = np.concatenate([path['observations'] for path in data]) 61 | actions = np.concatenate([path['actions'] for path in data]) 62 | 63 | "Normalize the data" 64 | observations = (observations - self.normalization[0]) / (self.normalization[1] + 1e-10) 65 | actions = (actions - self.normalization[4]) / (self.normalization[5] + 1e-10) 66 | 67 | for i in range(self.iterations): 68 | 69 | batch_id = np.random.choice(observations.shape[0], self.batch_size)#, replace = False) 70 | _ = tf.get_default_session().run(self.update_op, feed_dict = {self.states : observations[batch_id], self.actions : actions[batch_id]}) 71 | print("saving now...") 72 | self.saver.save(self.sess, self.save_path, global_step=0) 73 | 74 | def get_action(self, states): 75 | """ Write a function to take in a batch of (unnormalized) states and return the (unnormalized) actions as predicted by using the model """ 76 | """ YOUR CODE HERE """ 77 | 78 | mean = tf.get_default_session().run(self.mean, feed_dict = {self.states : states}) 79 | std = np.ones(mean.shape[1], dtype=np.float32) 80 | 81 | return tf.get_default_session().run(tf.contrib.distributions.MultivariateNormalDiag(mean, std).sample()) 82 | -------------------------------------------------------------------------------- /controllers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from cost_functions import trajectory_cost_fn 3 | import time 4 | from iLQR import iLQR 5 | 6 | class Controller(): 7 | def __init__(self): 8 | pass 9 | 10 | # Get the appropriate action(s) for this state(s) 11 | def get_action(self, state, steps): 12 | pass 13 | 14 | 15 | class RandomController(Controller): 16 | def __init__(self, env): 17 | """ YOUR CODE HERE """ 18 | self.env = env 19 | 20 | def get_action(self, state, steps): 21 | """ YOUR CODE HERE """ 22 | """ Your code should randomly sample an action uniformly from the action space """ 23 | action = self.env.action_space.sample() 24 | return action 25 | 26 | 27 | class MPCcontroller(Controller): 28 | """ Controller built using the MPC method outlined in https://arxiv.org/abs/1708.02596 """ 29 | def __init__(self, 30 | env, 31 | dyn_model, 32 | horizon=5, 33 | cost_fn=None, 34 | num_simulated_paths=10, 35 | ): 36 | self.env = env 37 | self.dyn_model = dyn_model 38 | self.horizon = horizon 39 | self.cost_fn = cost_fn 40 | self.num_simulated_paths = num_simulated_paths 41 | 42 | def get_action(self, state, steps): 43 | """ YOUR CODE HERE """ 44 | """ Note: be careful to batch your simulations through the model for speed """ 45 | 46 | # sample sequeces of action_space 47 | action_set = np.reshape([self.env.action_space.sample() for i in range(self.num_simulated_paths * self.horizon)], (self.num_simulated_paths, self.horizon, self.env.action_space.shape[0])) 48 | 49 | state = np.repeat(state.reshape([1,-1]), self.num_simulated_paths, axis=0) 50 | cost = np.zeros([self.num_simulated_paths], dtype = np.float32) 51 | 52 | # predict next next_states 53 | for i in range(self.horizon): 54 | next_state = self.dyn_model.predict(state, action_set[:,i,:]) 55 | 56 | cost += self.cost_fn(state[:,:], action_set[:,i,:], next_state[:,:]) 57 | 58 | state = next_state 59 | 60 | # calculate cost and choose optimal path 61 | act = np.argmin(cost) 62 | return action_set[act, 0] 63 | 64 | class LQRcontroller(iLQR): 65 | 66 | def __init__(self, 67 | env, 68 | delta, 69 | T, 70 | dyn_model, 71 | cost_fn, 72 | iterations, 73 | ): 74 | 
iLQR.__init__(self, env, delta, T, dyn_model, cost_fn) 75 | self.iterations = iterations 76 | 77 | def get_action(self, state, steps): 78 | 79 | if not steps: 80 | self.U_hat = np.reshape([np.zeros(self.env.action_space.sample().shape) for i in range(self.T - 1)], (self.T - 1, self.env.action_space.shape[0])) 81 | 82 | self.X_hat = [] 83 | self.X_hat.append(state) 84 | x = state 85 | 86 | for i in range(self.T - 1): 87 | next_state = self.dyn_model.predict(x, self.U_hat[i,:]) 88 | 89 | self.X_hat.append(next_state[0]) 90 | x = next_state 91 | 92 | self.X_hat = np.asarray(self.X_hat) 93 | 94 | for i in range(self.iterations): 95 | self.backward(self.X_hat, self.U_hat) 96 | 97 | x = state #X_hat[0] 98 | 99 | U = np.zeros(self.U_hat.shape) 100 | X = np.zeros(self.X_hat.shape) 101 | 102 | for t in range(self.T - 1): 103 | u = self.get_action_one_step(x, t, self.X_hat[t], self.U_hat[t]) 104 | #print("control", u) 105 | X[t] = x 106 | U[t] = u 107 | 108 | x = self.dyn_model.predict(x, u)[0] 109 | 110 | X[-1] = x 111 | #print("X_hat for iteration {} is {}".format(i, X_hat)) 112 | #print("U_hat for iteration {} is {}".format(i, self.U_hat)) 113 | self.X_hat = X 114 | self.U_hat = U 115 | 116 | return self.U_hat[0] 117 | -------------------------------------------------------------------------------- /dynamics.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | # Predefined function to build a feedforward neural network 5 | def build_mlp(input_placeholder, 6 | output_size, 7 | scope, 8 | n_layers=2, 9 | size=500, 10 | activation=tf.nn.relu, 11 | output_activation=None 12 | ): 13 | out = input_placeholder 14 | with tf.variable_scope(scope): 15 | for _ in range(n_layers): 16 | out = tf.layers.dense(out, size, activation=activation) 17 | out = tf.layers.dense(out, output_size, activation=output_activation) 18 | return out 19 | 20 | class NNDynamicsModel(): 21 | def __init__(self, 22 | env, 23 | n_layers, 24 | size, 25 | activation, 26 | output_activation, 27 | normalization, 28 | batch_size, 29 | iterations, 30 | learning_rate, 31 | sess 32 | ): 33 | """ YOUR CODE HERE """ 34 | """ Note: Be careful about normalization """ 35 | 36 | self.normalization = normalization 37 | self.batch_size = batch_size 38 | self.iterations = iterations 39 | 40 | self.states = tf.placeholder(shape = [None, env.observation_space.shape[0]], dtype = tf.float32) 41 | self.actions = tf.placeholder(shape = [None, env.action_space.shape[0]], dtype = tf.float32) 42 | self.deltas = tf.placeholder(shape = [None, env.observation_space.shape[0]], dtype = tf.float32) 43 | state_action_pair = tf.concat([self.states, self.actions], 1) 44 | 45 | self.model = build_mlp(state_action_pair, env.observation_space.shape[0], "model", n_layers, size, activation, output_activation) 46 | 47 | self.loss = tf.reduce_mean(tf.square((self.deltas) - self.model)) 48 | self.update_op = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) 49 | 50 | def fit(self, data): 51 | """ 52 | Write a function to take in a dataset of (unnormalized)states, (unnormalized)actions, (unnormalized)next_states and fit the dynamics model going from normalized states, normalized actions to normalized state differences (s_t+1 - s_t) 53 | """ 54 | 55 | """YOUR CODE HERE """ 56 | observations = np.concatenate([path['observations'] for path in data]) 57 | actions = np.concatenate([path['actions'] for path in data]) 58 | next_observations = np.concatenate([path['next_observations'] 
for path in data]) 59 | deltas = next_observations - observations 60 | 61 | "Normalize the data" 62 | observations = (observations - self.normalization[0]) / (self.normalization[1] + 1e-10) 63 | actions = (actions - self.normalization[4]) / (self.normalization[5] + 1e-10) 64 | deltas = (deltas - self.normalization[2]) / (self.normalization[3] + 1e-10) 65 | 66 | for i in range(self.iterations): 67 | 68 | batch_id = np.random.choice(observations.shape[0], self.batch_size)#, replace = False) 69 | _ = tf.get_default_session().run(self.update_op, feed_dict = {self.states : observations[batch_id], self.actions : actions[batch_id], self.deltas : deltas[batch_id]}) 70 | 71 | 72 | def predict(self, states, actions): 73 | """ Write a function to take in a batch of (unnormalized) states and (unnormalized) actions and return the (unnormalized) next states as predicted by using the model """ 74 | """ YOUR CODE HERE """ 75 | 76 | if len(states.shape) == 1: 77 | states = states.reshape((1, states.shape[0])) 78 | if len(actions.shape) == 1: 79 | actions = actions.reshape((1, actions.shape[0])) 80 | 81 | # Normalize inputs with the same statistics used in fit(), since the network is trained on normalized states and actions 82 | norm_states = (states - self.normalization[0]) / (self.normalization[1] + 1e-10) 83 | norm_actions = (actions - self.normalization[4]) / (self.normalization[5] + 1e-10) 84 | 85 | # The network predicts normalized state deltas, so denormalize them before adding to the unnormalized states 86 | norm_deltas = tf.get_default_session().run(self.model, feed_dict = {self.states : norm_states, self.actions : norm_actions}) 87 | next_observations = states + norm_deltas * (self.normalization[3] + 1e-10) + self.normalization[2] 88 | return next_observations 89 | -------------------------------------------------------------------------------- /logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | if osp.exists(G.output_dir): 55 | print("Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir) 56 | else: 57 | os.makedirs(G.output_dir) 58 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 59 | atexit.register(G.output_file.close) 60 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 61 | 62 | def log_tabular(key, val): 63 | """ 64 | Log a value of some diagnostic 65 | Call this once for each diagnostic quantity, each iteration 66 | """ 67 | if G.first_row: 68 | G.log_headers.append(key) 69 | else: 70 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 71 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 72 | G.log_current_row[key] = val 73 | 74 | def save_params(params): 75 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 76 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 77 | 78 | def pickle_tf_vars(): 79 | """ 80 | Saves tensorflow variables 81 | Requires them to be initialized first, also a default session must exist 82 | """ 83 | _dict = {v.name : v.eval() for v in tf.global_variables()} 84 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 85 | pickle.dump(_dict, f) 86 | 87 | 88 | def dump_tabular(): 89 | """ 90 | Write all of the diagnostics from the current iteration 91 | """ 92 | vals = [] 93 | key_lens = [len(key) for key in G.log_headers] 94 | max_key_len = max(15,max(key_lens)) 95 | keystr = '%'+'%d'%max_key_len 96 | fmt = "| " + keystr + "s | %15s |" 97 | n_slashes = 22 + max_key_len 98 | print("-"*n_slashes) 99 | for key in G.log_headers: 100 | val = G.log_current_row.get(key, "") 101 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 102 | else: valstr = val 103 | print(fmt%(key, valstr)) 104 | vals.append(val) 105 | print("-"*n_slashes) 106 | if G.output_file is not None: 107 | if G.first_row: 108 | G.output_file.write("\t".join(G.log_headers)) 109 | G.output_file.write("\n") 110 | G.output_file.write("\t".join(map(str,vals))) 111 | G.output_file.write("\n") 112 | G.output_file.flush() 113 | G.log_current_row.clear() 114 | G.first_row=False 115 | -------------------------------------------------------------------------------- /plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. 
Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | sns.set(style="darkgrid", font_scale=1.5) 55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 56 | plt.legend(loc='best').draggable() 57 | plt.show() 58 | 59 | 60 | def get_datasets(fpath, condition=None): 61 | unit = 0 62 | datasets = [] 63 | for root, dir, files in os.walk(fpath): 64 | if 'log.txt' in files: 65 | param_path = open(os.path.join(root,'params.json')) 66 | params = json.load(param_path) 67 | exp_name = params['exp_name'] 68 | 69 | log_path = os.path.join(root,'log.txt') 70 | experiment_data = pd.read_table(log_path) 71 | 72 | experiment_data.insert( 73 | len(experiment_data.columns), 74 | 'Unit', 75 | unit 76 | ) 77 | experiment_data.insert( 78 | len(experiment_data.columns), 79 | 'Condition', 80 | condition or exp_name 81 | ) 82 | 83 | datasets.append(experiment_data) 84 | unit += 1 85 | 86 | return datasets 87 | 88 | 89 | def main(): 90 | import argparse 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('logdir', nargs='*') 93 | parser.add_argument('--legend', nargs='*') 94 | parser.add_argument('--value', default='AverageReturn', nargs='*') 95 | args = parser.parse_args() 96 | 97 | use_legend = False 98 | if args.legend is not None: 99 | assert len(args.legend) == len(args.logdir), \ 100 | "Must give a legend title for each set of experiments." 
101 | use_legend = True 102 | 103 | data = [] 104 | if use_legend: 105 | for logdir, legend_title in zip(args.logdir, args.legend): 106 | data += get_datasets(logdir, legend_title) 107 | else: 108 | for logdir in args.logdir: 109 | data += get_datasets(logdir) 110 | 111 | if isinstance(args.value, list): 112 | values = args.value 113 | else: 114 | values = [args.value] 115 | for value in values: 116 | plot_data(data, value=value) 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /iLQR.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | from scipy.linalg import inv 4 | from scipy.optimize import approx_fprime 5 | import time 6 | from autograd import jacobian 7 | 8 | class iLQR: 9 | 10 | def __init__(self, env, delta, T, dyn_model, cost_fn): 11 | 12 | self.env = env 13 | self.min_factor = 2 14 | self.factor = self.min_factor 15 | self.min_mu = 1e-6 16 | self.mu = self.min_mu 17 | self.delta = delta 18 | self.T = T 19 | self.dyn_model = dyn_model 20 | self.cost_fn = cost_fn 21 | self.control_low = self.env.action_space.low 22 | self.control_high = self.env.action_space.high 23 | 24 | def increase(self, mu): 25 | self.factor = np.maximum(self.factor, self.factor * self.min_factor) 26 | self.mu = np.maximum(self.min_mu, self.mu * self.factor) 27 | 28 | def decrease(self, mu): 29 | self.factor = np.minimum(1 / self.min_factor, self.factor / self.min_factor) 30 | if self.mu * self.factor > self.min_mu: 31 | self.mu = self.mu * self.factor 32 | else: 33 | self.mu = 0 34 | 35 | def simulate_step(self, x): 36 | 37 | xu = [x[:self.env.observation_space.shape[0]], x[self.env.observation_space.shape[0]:]] 38 | 39 | next_x = self.dyn_model.predict(xu[0], xu[1]) 40 | "get cost" 41 | cost = self.cost_fn(xu[0], xu[1], next_x[0]) 42 | 43 | return next_x[0], cost 44 | 45 | def simulate_next_state(self, x, i): 46 | return self.simulate_step(x)[0][i] 47 | 48 | def simulate_cost(self, x): 49 | return self.simulate_step(x)[1] 50 | 51 | def approx_fdoubleprime(self, x, i): 52 | return approx_fprime(x, self.simulate_cost, self.delta)[i] 53 | 54 | def finite_difference(self, x, u): 55 | 56 | "calling finite difference for delta perturbation" 57 | xu = np.concatenate((x, u)) 58 | 59 | F = np.zeros((x.shape[0], xu.shape[0])) 60 | 61 | for i in range(x.shape[0]): 62 | F[i,:] = approx_fprime(xu, self.simulate_next_state, self.delta, i) 63 | 64 | c = approx_fprime(xu, self.simulate_cost, self.delta) 65 | 66 | C = np.zeros((len(xu), len(xu))) 67 | 68 | for i in range(xu.shape[0]): 69 | C[i,:] = approx_fprime(xu, self.approx_fdoubleprime, self.delta, i) 70 | 71 | f = np.zeros((len(x))) 72 | 73 | return C, F, c, f 74 | 75 | def differentiate(self, x_seq, u_seq): 76 | 77 | "get gradient values using finite difference" 78 | 79 | C, F, c, f = [], [], [], [] 80 | 81 | for t in range(self.T - 1): 82 | 83 | Ct, Ft, ct, ft = self.finite_difference(x_seq[t], u_seq[t]) 84 | 85 | C.append(Ct) 86 | F.append(Ft) 87 | c.append(ct) 88 | f.append(ft) 89 | 90 | "TODO : C, F, c, f for time step T are different. Why ?" 
91 | 92 | u = np.zeros((u_seq[0].shape)) 93 | 94 | Ct, Ft, ct, ft = self.finite_difference(x_seq[-1], u) 95 | 96 | C.append(Ct) 97 | F.append(Ft) 98 | c.append(ct) 99 | f.append(ft) 100 | 101 | return C, F, c, f 102 | 103 | def backward(self, x_seq, u_seq): 104 | 105 | "initialize F_t, C_t, f_t, c_t, V_t, v_t" 106 | C, F, c, f = self.differentiate(x_seq, u_seq) 107 | 108 | n = x_seq[0].shape[0] 109 | 110 | "initialize V_t1 and v_t1" 111 | 112 | c_x = c[-1][:n] 113 | c_u = c[-1][n:] 114 | 115 | C_xx = C[-1][:n,:n] 116 | C_xu = C[-1][:n,n:] 117 | C_ux = C[-1][n:,:n] 118 | C_uu = C[-1][n:,n:] 119 | 120 | #C_uu = C_uu + self.mu * np.eye(C_uu.shape[0]) 121 | 122 | K = np.zeros((self.T+1, u_seq[0].shape[0], x_seq[0].shape[0])) 123 | k = np.zeros((self.T+1, u_seq[0].shape[0])) 124 | 125 | V = np.zeros((self.T+1, x_seq[0].shape[0], x_seq[0].shape[0])) 126 | v = np.zeros((self.T+1, x_seq[0].shape[0])) 127 | 128 | #K[-1] = -np.dot(inv(C_uu), C_ux) 129 | #k[-1] = -np.dot(inv(C_uu), c_u) 130 | 131 | #V[-1] = C_xx + np.dot(C_xu, K[-1]) + np.dot(K[-1].T, C_ux) + np.dot(np.dot(K[-1].T, C_uu), K[-1]) 132 | #v[-1] = c_x + np.dot(C_xu, k[-1]) + np.dot(K[-1].T, c_u) + np.dot(np.dot(K[-1].T, C_uu), k[-1]) 133 | V[-1] = C_xx 134 | v[-1] = c_x 135 | 136 | "initialize Q_t1 and q_t1" 137 | 138 | Q = list(np.zeros((self.T))) 139 | q = list(np.zeros((self.T))) 140 | 141 | self.std = [] 142 | "loop till horizon" 143 | 144 | t = self.T-1 145 | while t >= 0: 146 | #for t in range(self.T-1, -1, -1): 147 | "update Q" 148 | 149 | Q[t] = C[t] + np.dot(np.dot(F[t].T, V[t+1] + self.mu * np.eye(V[t+1].shape[0])), F[t]) #+ 0.01 * np.eye(V[t+1].shape[0])), 150 | q[t] = c[t] + np.dot(F[t].T, v[t+1]) #+ np.dot(np.dot(F[t].T, V[t+1] + 0.00001 * np.eye(V[t+1].shape[0])), f[t]) 151 | 152 | "differentiate Q to get Q_uu, Q_xx, Q_ux, Q_u, Q_x" 153 | 154 | q_x = q[t][:n] 155 | q_u = q[t][n:] 156 | 157 | Q_xx = Q[t][:n,:n] 158 | Q_xu = Q[t][:n,n:] 159 | Q_ux = Q[t][n:,:n] 160 | Q_uu = Q[t][n:,n:] 161 | 162 | #Q_uu = Q_uu + 100 * np.eye(Q_uu.shape[0]) 163 | try: 164 | np.linalg.cholesky(Q_uu) 165 | except: 166 | self.increase(self.mu) 167 | continue 168 | 169 | "update K, k, V, v" 170 | #print("q_uu", Q_uu) 171 | K[t] = -np.dot(inv(Q_uu), Q_ux) 172 | k[t] = -np.dot(inv(Q_uu), q_u) 173 | 174 | V[t] = Q_xx + np.dot(Q_xu, K[t]) + np.dot(K[t].T, Q_ux) + np.dot(np.dot(K[t].T, Q_uu), K[t]) 175 | v[t] = q_x + np.dot(Q_xu, k[t]) + np.dot(K[t].T, q_u) + np.dot(np.dot(K[t].T, Q_uu), k[t]) 176 | 177 | self.std.append(inv(Q_uu)) 178 | self.decrease(self.mu) 179 | t -= 1 180 | 181 | self.K = K 182 | self.k = k 183 | 184 | def get_action_one_step(self, state, t, x, u): 185 | 186 | "TODO : Add delta U's to given action array" 187 | #print("Q_uu ", self.std[t]) 188 | mean = np.dot(self.K[t], (state - x)) + self.k[t] + u 189 | return np.clip(np.random.normal(mean, self.std[t]), self.control_low, self.control_high) 190 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from dynamics import NNDynamicsModel 5 | from policy import NNPolicy 6 | from controllers import MPCcontroller, RandomController, LQRcontroller 7 | from cost_functions import cheetah_cost_fn, trajectory_cost_fn, pendulum_cost_fn, reacher_cost_fn 8 | import time 9 | import logz 10 | import os 11 | import copy 12 | #import matplotlib.pyplot as plt 13 | from cheetah_env import HalfCheetahEnvNew 14 | from 
mpi4py import MPI 15 | 16 | def sample(env, 17 | controller, 18 | num_paths=10, 19 | horizon=1000, 20 | render=False, 21 | verbose=False): 22 | """ 23 | Write a sampler function which takes in an environment, a controller (either random or the MPC controller), 24 | and returns rollouts by running on the env. 25 | Each path can have elements for observations, next_observations, rewards, returns, actions, etc. 26 | """ 27 | paths = [] 28 | """ YOUR CODE HERE """ 29 | for i in range(num_paths): 30 | 31 | ob = env.reset() 32 | obs, acs, rewards, next_obs = [], [], [], [] 33 | steps = 0 34 | #print("sampling trajectory %d"%i) 35 | 36 | while True: 37 | obs.append(ob) 38 | ac = controller.get_action(ob, steps) 39 | env.render() 40 | print("control", ac) 41 | acs.append(ac) 42 | next_ob, rew, done, _ = env.step(ac) 43 | steps += 1 44 | ob = next_ob 45 | rewards.append(rew) 46 | next_obs.append(next_ob) 47 | if done or steps > 100: 48 | print("steps", steps) 49 | break 50 | path = {"observations" : np.array(obs), 51 | "rewards" : np.array(rewards), 52 | "actions" : np.array(acs), 53 | "next_observations" : np.array(next_obs)} 54 | paths.append(path) 55 | 56 | return paths 57 | 58 | # Utility to compute cost a path for a given cost function 59 | def path_cost(cost_fn, path): 60 | return trajectory_cost_fn(cost_fn, path['observations'], path['actions'], path['next_observations']) 61 | 62 | def compute_normalization(data): 63 | """ 64 | Write a function to take in a dataset and compute the means, and stds. 65 | Return 6 elements: mean of s_t, std of s_t, mean of (s_t+1 - s_t), std of (s_t+1 - s_t), mean of actions, std of actions 66 | """ 67 | 68 | """ YOUR CODE HERE """ 69 | 70 | mean_obs = np.mean(np.concatenate([path['observations'] for path in data]), axis=0) 71 | std_obs = np.std(np.concatenate([path['observations'] for path in data]), axis=0) 72 | mean_action = np.mean(np.concatenate([path['actions'] for path in data]), axis=0) 73 | std_action = np.std(np.concatenate([path['actions'] for path in data]), axis=0) 74 | mean_deltas = np.mean(np.concatenate([path['next_observations'] - path['observations'] for path in data]), axis=0) 75 | std_deltas = np.std(np.concatenate([path['next_observations'] - path['observations'] for path in data]), axis=0) 76 | 77 | return mean_obs, std_obs, mean_deltas, std_deltas, mean_action, std_action 78 | 79 | 80 | def plot_comparison(env, dyn_model): 81 | """ 82 | Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions. 83 | """ 84 | """ YOUR CODE HERE """ 85 | pass 86 | 87 | def train(env, 88 | cost_fn, 89 | load_model, 90 | model_path, 91 | logdir=None, 92 | render=False, 93 | learning_rate_dyn=1e-3, 94 | learning_rate_policy=1e-4, 95 | onpol_iters=10, 96 | dynamics_iters=60, 97 | policy_iters=100, 98 | batch_size=512, 99 | num_paths_random=10, 100 | num_paths_onpol=5, 101 | num_simulated_paths=10000, 102 | env_horizon=1000, 103 | mpc_horizon=15, 104 | n_layers=2, 105 | size=500, 106 | activation=tf.nn.relu, 107 | output_activation=None, 108 | ): 109 | 110 | """ 111 | 112 | Arguments: 113 | 114 | onpol_iters Number of iterations of onpolicy aggregation for the loop to run. 115 | 116 | dynamics_iters Number of iterations of training for the dynamics model 117 | |_ which happen per iteration of the aggregation loop. 118 | 119 | batch_size Batch size for dynamics training. 
120 | 121 | num_paths_random Number of paths/trajectories/rollouts generated 122 | | by a random agent. We use these to train our 123 | |_ initial dynamics model. 124 | 125 | num_paths_onpol Number of paths to collect at each iteration of 126 | |_ aggregation, using the Model Predictive Control policy. 127 | 128 | num_simulated_paths How many fictitious rollouts the MPC policy 129 | | should generate each time it is asked for an 130 | |_ action. 131 | 132 | env_horizon Number of timesteps in each path. 133 | 134 | mpc_horizon The MPC policy generates actions by imagining 135 | | fictitious rollouts, and picking the first action 136 | | of the best fictitious rollout. This argument is 137 | | how many timesteps should be in each fictitious 138 | |_ rollout. 139 | 140 | n_layers/size/activations Neural network architecture arguments. 141 | 142 | """ 143 | 144 | #logz.configure_output_dir(logdir) 145 | 146 | #======================================================== 147 | # 148 | # First, we need a lot of data generated by a random 149 | # agent, with which we'll begin to train our dynamics 150 | # model. 151 | 152 | random_controller = RandomController(env) 153 | 154 | """ YOUR CODE HERE """ 155 | data = sample(env, random_controller, num_paths_random, env_horizon) 156 | 157 | 158 | #======================================================== 159 | # 160 | # The random data will be used to get statistics (mean 161 | # and std) for the observations, actions, and deltas 162 | # (where deltas are o_{t+1} - o_t). These will be used 163 | # for normalizing inputs and denormalizing outputs 164 | # from the dynamics network. 165 | # 166 | """ YOUR CODE HERE """ 167 | normalization = compute_normalization(data) 168 | 169 | 170 | #======================================================== 171 | # 172 | # Build dynamics model and MPC controllers. 173 | # 174 | sess = tf.Session() 175 | 176 | dyn_model = NNDynamicsModel(env=env, 177 | n_layers=n_layers, 178 | size=size, 179 | activation=activation, 180 | output_activation=output_activation, 181 | normalization=normalization, 182 | batch_size=batch_size, 183 | iterations=dynamics_iters, 184 | learning_rate=learning_rate_dyn, 185 | sess=sess) 186 | 187 | policy = NNPolicy(env=env, 188 | normalization=normalization, 189 | batch_size=batch_size, 190 | iterations=policy_iters, 191 | learning_rate=learning_rate_policy, 192 | sess=sess, 193 | model_path=model_path, 194 | save_path="./policy/", 195 | load_model=load_model) 196 | 197 | mpc_controller = MPCcontroller(env=env, 198 | dyn_model=dyn_model, 199 | horizon=mpc_horizon, 200 | cost_fn=cost_fn, 201 | num_simulated_paths=num_simulated_paths) 202 | 203 | lqr_controller = LQRcontroller(env=env, 204 | delta=0.005, 205 | T=50, 206 | dyn_model=dyn_model, 207 | cost_fn=cost_fn, 208 | iterations=1) 209 | 210 | comm = MPI.COMM_WORLD 211 | size = comm.Get_size() 212 | rank = comm.Get_rank() 213 | 214 | #======================================================== 215 | # 216 | # Tensorflow session building. 217 | # 218 | 219 | sess.__enter__() 220 | tf.global_variables_initializer().run() 221 | 222 | #======================================================== 223 | # 224 | # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
225 | # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 226 | # 227 | 228 | # training the MPC controller as well as dynamics 229 | for itr in range(onpol_iters): 230 | 231 | print("fitting dynamics for worker ", rank) 232 | dyn_model.fit(data) 233 | print("sampling new trajectories from worker ", rank) 234 | new_data = sample(env, lqr_controller, num_paths_onpol, env_horizon) 235 | 236 | data += new_data 237 | comm.send(new_data, 0) 238 | 239 | if rank == 0: 240 | costs, returns = [], [] 241 | 242 | for path in data: 243 | 244 | costs.append(path_cost(cost_fn, path)) 245 | returns.append(np.sum(path['rewards'])) 246 | 247 | print("returns ",returns) 248 | 249 | for i in range(1, size): 250 | data += comm.recv(source=i) 251 | 252 | print("fitting policy...") 253 | policy.fit(data) 254 | 255 | # LOGGING 256 | # Statistics for performance of MPC policy using 257 | # our learned dynamics model 258 | logz.log_tabular('Iteration', itr) 259 | # In terms of cost function which your MPC controller uses to plan 260 | logz.log_tabular('AverageCost', np.mean(costs)) 261 | logz.log_tabular('StdCost', np.std(costs)) 262 | logz.log_tabular('MinimumCost', np.min(costs)) 263 | logz.log_tabular('MaximumCost', np.max(costs)) 264 | # In terms of true environment reward of your rolled out trajectory using the MPC controller 265 | logz.log_tabular('AverageReturn', np.mean(returns)) 266 | logz.log_tabular('StdReturn', np.std(returns)) 267 | logz.log_tabular('MinimumReturn', np.min(returns)) 268 | logz.log_tabular('MaximumReturn', np.max(returns)) 269 | 270 | logz.dump_tabular() 271 | 272 | # applying the learned neural policy 273 | if rank == 0: 274 | ob = env.reset() 275 | 276 | while True: 277 | a = policy.get_action(ob.reshape((1, ob.shape[0]))) 278 | 279 | # control clipping to be added 280 | 281 | next_ob, reward, done, info = env.step(a[0]) 282 | print("action", a) 283 | print("predicted ob", dyn_model.predict(ob, a)) 284 | print("actual ob", (next_ob - normalization[0]) / (normalization[1] + 1e-10)) 285 | env.render() 286 | ob = next_ob 287 | if done: 288 | ob = env.reset() 289 | 290 | def main(): 291 | 292 | import argparse 293 | parser = argparse.ArgumentParser() 294 | parser.add_argument('--env_name', type=str, default='Pendulum-v1') 295 | # Experiment meta-params 296 | parser.add_argument('--exp_name', type=str, default='mb_mpc') 297 | parser.add_argument('--seed', type=int, default=5) 298 | parser.add_argument('--render', action='store_true') 299 | parser.add_argument('--model_path', '-mp', type=str, default="./policy/-0") 300 | parser.add_argument('--load_model', '-lm', type=str, default=False) 301 | # Training args 302 | parser.add_argument('--learning_rate_dyn', '-lr', type=float, default=1e-3) 303 | parser.add_argument('--learning_rate_policy', '-lrp', type=float, default=1e-4) 304 | parser.add_argument('--onpol_iters', '-n', type=int, default=5) 305 | parser.add_argument('--dyn_iters', '-nd', type=int, default=100) 306 | parser.add_argument('--policy_iters', '-ndp', type=int, default=100) 307 | parser.add_argument('--batch_size', '-b', type=int, default=512) 308 | # Data collection 309 | parser.add_argument('--random_paths', '-r', type=int, default=10) 310 | parser.add_argument('--onpol_paths', '-d', type=int, default=10) 311 | parser.add_argument('--simulated_paths', '-sp', type=int, default=1000) 312 | parser.add_argument('--ep_len', '-ep', type=int, default=1000) 313 | # Neural network architecture args 314 | 
parser.add_argument('--n_layers', '-l', type=int, default=2) 315 | parser.add_argument('--size', '-s', type=int, default=500) 316 | # MPC Controller 317 | parser.add_argument('--mpc_horizon', '-m', type=int, default=20) 318 | args = parser.parse_args() 319 | 320 | # Set seed 321 | np.random.seed(args.seed) 322 | tf.set_random_seed(args.seed) 323 | 324 | # Make data directory if it does not already exist 325 | #if not(os.path.exists('data')): 326 | # os.makedirs('data') 327 | #logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 328 | #logdir = os.path.join('data', logdir) 329 | #if not(os.path.exists(logdir)): 330 | # os.makedirs(logdir) 331 | 332 | # Make env 333 | if args.env_name == "HalfCheetah-v1": 334 | env = HalfCheetahEnvNew() 335 | cost_fn = cheetah_cost_fn 336 | 337 | elif args.env_name == "Pendulum-v1": 338 | env = gym.make('InvertedPendulum-v1') 339 | cost_fn = pendulum_cost_fn 340 | 341 | elif args.env_name == "Reacher-v1": 342 | env = gym.make('Reacher-v1') 343 | cost_fn = reacher_cost_fn 344 | 345 | else: 346 | raise ValueError("Unknown env_name: %s" % args.env_name) 347 | train(env=env, 348 | cost_fn=cost_fn, 349 | load_model=args.load_model, 350 | model_path=args.model_path, 351 | logdir=None, 352 | render=args.render, 353 | learning_rate_dyn=args.learning_rate_dyn, 354 | learning_rate_policy=args.learning_rate_policy, 355 | onpol_iters=args.onpol_iters, 356 | dynamics_iters=args.dyn_iters, 357 | policy_iters=args.policy_iters, 358 | batch_size=args.batch_size, 359 | num_paths_random=args.random_paths, 360 | num_paths_onpol=args.onpol_paths, 361 | num_simulated_paths=args.simulated_paths, 362 | env_horizon=args.ep_len, 363 | mpc_horizon=args.mpc_horizon, 364 | n_layers=args.n_layers, 365 | size=args.size, 366 | activation=tf.nn.relu, 367 | output_activation=None, 368 | ) 369 | 370 | if __name__ == "__main__": 371 | main() 372 | --------------------------------------------------------------------------------
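The `plot_comparison` function in `main.py` is left as a stub. A minimal sketch of one way to fill it in is shown below, assuming `matplotlib` is available; the rollout horizon, the open-loop use of random actions, and the output file name `model_vs_truth.png` are illustrative choices rather than part of the original code, and `dyn_model.predict` is assumed to return unnormalized next states as its docstring specifies.

import numpy as np
import matplotlib.pyplot as plt

def plot_comparison(env, dyn_model, horizon=100):
    """Roll out random actions and compare the model's open-loop predictions
    for each state dimension against the true environment states."""
    pred_ob = env.reset().copy()
    true_states, pred_states = [], []
    for _ in range(horizon):
        ac = env.action_space.sample()
        next_ob, _, done, _ = env.step(ac)
        # predict() accepts a single (unbatched) state/action pair and returns a batch of size 1
        pred_ob = dyn_model.predict(pred_ob, ac)[0]
        true_states.append(next_ob)
        pred_states.append(pred_ob)
        if done:
            break
    true_states = np.array(true_states)
    pred_states = np.array(pred_states)

    # One subplot per state dimension: ground truth vs. model prediction
    n_dims = true_states.shape[1]
    fig, axes = plt.subplots(n_dims, 1, figsize=(8, 2 * n_dims), sharex=True, squeeze=False)
    for i in range(n_dims):
        axes[i, 0].plot(true_states[:, i], label='ground truth')
        axes[i, 0].plot(pred_states[:, i], label='model prediction')
        axes[i, 0].set_ylabel('state[%d]' % i)
    axes[0, 0].legend()
    axes[-1, 0].set_xlabel('timestep')
    fig.tight_layout()
    fig.savefig('model_vs_truth.png')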