├── data ├── mb_mpc_HalfCheetah-v1_10-01-2018_19-26-31 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_10-01-2018_19-27-37 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_10-01-2018_19-28-13 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-16-05 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-18-18 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-21-23 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-21-56 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-43-58 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-51-29 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-51-45 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-51-58 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_11-54-06 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-00-39 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-01-58 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-17-09 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-18-25 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-30-29 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-31-54 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-32-14 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-33-12 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_12-34-21 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_13-04-22 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_13-04-49 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_13-05-08 │ └── log.txt ├── mb_mpc_HalfCheetah-v1_11-01-2018_13-05-24 │ └── log.txt └── mb_mpc_HalfCheetah-v1_11-01-2018_13-06-02 │ └── log.txt ├── __pycache__ ├── logz.cpython-35.pyc ├── dynamics.cpython-35.pyc ├── cheetah_env.cpython-35.pyc ├── controllers.cpython-35.pyc └── cost_functions.cpython-35.pyc ├── README.md ├── cheetah_env.py ├── cost_functions.py ├── policy.py ├── controllers.py ├── dynamics.py ├── logz.py ├── plot.py ├── iLQR.py └── main.py /data/mb_mpc_HalfCheetah-v1_10-01-2018_19-26-31/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_10-01-2018_19-27-37/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_10-01-2018_19-28-13/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-16-05/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-18-18/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-21-23/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-21-56/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-43-58/log.txt: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-51-29/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-51-45/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-51-58/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_11-54-06/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-00-39/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-01-58/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-17-09/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-18-25/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-30-29/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-31-54/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-32-14/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-33-12/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_12-34-21/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_13-04-22/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_13-04-49/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/data/mb_mpc_HalfCheetah-v1_11-01-2018_13-05-08/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_13-05-24/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/mb_mpc_HalfCheetah-v1_11-01-2018_13-06-02/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /__pycache__/logz.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/MPC/HEAD/__pycache__/logz.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/dynamics.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/MPC/HEAD/__pycache__/dynamics.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/cheetah_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/MPC/HEAD/__pycache__/cheetah_env.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/controllers.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/MPC/HEAD/__pycache__/controllers.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/cost_functions.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/MPC/HEAD/__pycache__/cost_functions.cpython-35.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### About 2 | 3 | This work implements a simple MPC controller for Gym's MuJoCo environments, as described in **Neural Network Dynamics for Model-Based Deep Reinforcement Learning with Model-Free Fine-Tuning**, 4 | and builds on it by adding LQR-based (iLQR) controllers in place of simple shooting methods. These controllers are then run in parallel, and the stored trajectories are used to learn a general 5 | neural network policy. 6 | 7 | 8 | ### Dependencies 9 | 10 | This code has been tested with Python 3 and requires [mujoco_py](https://github.com/openai/mujoco-py), `gym`, and TensorFlow; it also uses `mpi4py` (parallel sampling in `main.py`), `autograd` (`iLQR.py`), and `seaborn`/`pandas`/`matplotlib` (`plot.py`). 11 | 12 | ### How to Run 13 | 14 | Run `python3 main.py`. Passing `--load_model True` restores previously stored policy parameters from the path given by `--model_path`.
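Since `main.py` collects on-policy trajectories in parallel over `MPI.COMM_WORLD`, multi-worker runs are presumably launched under an MPI runtime, for example `mpirun -np 4 python3 main.py --env_name HalfCheetah-v1` (the worker count and environment here are illustrative). The remaining hyperparameters (`--onpol_iters`, `--mpc_horizon`, `--simulated_paths`, `--ep_len`, and so on) are defined in the argument parser at the bottom of `main.py`.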
15 | 16 | -------------------------------------------------------------------------------- /cheetah_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnvNew(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 8 | utils.EzPickle.__init__(self) 9 | 10 | def _step(self, action): 11 | xposbefore = self.model.data.qpos[0, 0] 12 | self.do_simulation(action, self.frame_skip) 13 | xposafter = self.model.data.qpos[0, 0] 14 | ob = self._get_obs() 15 | reward_ctrl = - 0.1 * np.square(action).sum() 16 | reward_run = (xposafter - xposbefore)/self.dt 17 | reward = reward_ctrl + reward_run 18 | done = False 19 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 20 | 21 | def _get_obs(self): 22 | return np.concatenate([ 23 | self.model.data.qpos.flat[1:], 24 | self.model.data.qvel.flat, 25 | self.get_body_com("torso").flat, 26 | # self.get_body_comvel("torso").flat, 27 | ]) 28 | 29 | def reset_model(self): 30 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 31 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 32 | self.set_state(qpos, qvel) 33 | return self._get_obs() 34 | 35 | def viewer_setup(self): 36 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /cost_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | #======================================================== 5 | # 6 | # Environment-specific cost functions: 7 | # 8 | def reacher_cost_fn(state, action, next_state): 9 | 10 | if len(state.shape) > 1: 11 | 12 | scores=np.zeros((state.shape[0],)) 13 | 14 | scores += np.linalg.norm(next_state[:,-1]) 15 | return scores 16 | 17 | score = np.linalg.norm(next_state[-1]) 18 | return score 19 | 20 | def pendulum_cost_fn(state, action, next_state): 21 | 22 | if len(state.shape) > 1: 23 | 24 | scores=np.zeros((state.shape[0],)) 25 | 26 | scores += np.abs(next_state[:,1]) #+ 0.001 * action**2 27 | return scores 28 | 29 | score = np.abs(next_state[1]) #+ 0.001 * action**2 30 | 31 | return score 32 | 33 | def cheetah_cost_fn(state, action, next_state): 34 | if len(state.shape) > 1: 35 | 36 | heading_penalty_factor=10 37 | scores=np.zeros((state.shape[0],)) 38 | 39 | #dont move front shin back so far that you tilt forward 40 | front_leg = state[:,5] 41 | my_range = 0.2 42 | scores[front_leg>=my_range] += heading_penalty_factor 43 | 44 | front_shin = state[:,6] 45 | my_range = 0 46 | scores[front_shin>=my_range] += heading_penalty_factor 47 | 48 | front_foot = state[:,7] 49 | my_range = 0 50 | scores[front_foot>=my_range] += heading_penalty_factor 51 | 52 | scores-= (next_state[:,17] - state[:,17]) / 0.01 #+ 0.1 * (np.sum(action**2, axis=1)) 53 | return scores 54 | 55 | heading_penalty_factor=10 56 | score = 0 57 | 58 | #dont move front shin back so far that you tilt forward 59 | front_leg = state[5] 60 | my_range = 0.2 61 | if front_leg>=my_range: 62 | score += heading_penalty_factor 63 | 64 | front_shin = state[6] 65 | my_range = 0 66 | if front_shin>=my_range: 67 | score += heading_penalty_factor 68 | 69 | front_foot = state[7] 70 | my_range = 0 71 | if front_foot>=my_range: 72 | score += heading_penalty_factor 73 | 74 | score -= 
(next_state[17] - state[17]) / 0.01 #+ 0.1 * (np.sum(action**2)) 75 | return score 76 | 77 | #======================================================== 78 | # 79 | # Cost function for a whole trajectory: 80 | # 81 | 82 | def trajectory_cost_fn(cost_fn, states, actions, next_states): 83 | trajectory_cost = 0 84 | for i in range(len(actions)): 85 | trajectory_cost += cost_fn(states[i], actions[i], next_states[i]) 86 | return trajectory_cost 87 | -------------------------------------------------------------------------------- /policy.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | # Predefined function to build a feedforward neural network 5 | def build_mlp(input_placeholder, 6 | output_size, 7 | scope, 8 | n_layers=1, 9 | size=64, 10 | activation=tf.tanh, 11 | output_activation=None 12 | ): 13 | out = input_placeholder 14 | with tf.variable_scope(scope): 15 | for _ in range(n_layers): 16 | out = tf.layers.dense(out, size, activation=activation) 17 | out = tf.layers.dense(out, output_size, activation=output_activation) 18 | return out 19 | 20 | class NNPolicy(): 21 | def __init__(self, 22 | env, 23 | normalization, 24 | batch_size, 25 | iterations, 26 | learning_rate, 27 | sess, 28 | model_path, 29 | save_path, 30 | load_model 31 | ): 32 | """ YOUR CODE HERE """ 33 | """ Note: Be careful about normalization """ 34 | 35 | self.normalization = normalization 36 | self.batch_size = batch_size 37 | self.iterations = iterations 38 | 39 | self.states = tf.placeholder(shape = [None, env.observation_space.shape[0]], dtype = tf.float32) 40 | self.actions = tf.placeholder(shape = [None, env.action_space.shape[0]], dtype = tf.float32) 41 | 42 | self.mean = build_mlp(self.states, env.action_space.shape[0], "policy")#, n_layers, size, activation, output_activation) 43 | 44 | self.loss = tf.reduce_mean(tf.square((self.actions) - self.mean)) 45 | self.update_op = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) 46 | 47 | self.sess = sess 48 | self.model_path = model_path 49 | self.save_path = save_path 50 | self.saver = tf.train.Saver(max_to_keep=1) 51 | if load_model: 52 | self.saver.restore(self.sess, self.model_path) 53 | 54 | def fit(self, data): 55 | """ 56 | Write a function to take in a dataset of (unnormalized)states, (unnormalized)actions and imitate the actions taken by an expert. 
57 | """ 58 | 59 | """YOUR CODE HERE """ 60 | observations = np.concatenate([path['observations'] for path in data]) 61 | actions = np.concatenate([path['actions'] for path in data]) 62 | 63 | "Normalize the data" 64 | observations = (observations - self.normalization[0]) / (self.normalization[1] + 1e-10) 65 | actions = (actions - self.normalization[4]) / (self.normalization[5] + 1e-10) 66 | 67 | for i in range(self.iterations): 68 | 69 | batch_id = np.random.choice(observations.shape[0], self.batch_size)#, replace = False) 70 | _ = tf.get_default_session().run(self.update_op, feed_dict = {self.states : observations[batch_id], self.actions : actions[batch_id]}) 71 | print("saving now...") 72 | self.saver.save(self.sess, self.save_path, global_step=0) 73 | 74 | def get_action(self, states): 75 | """ Write a function to take in a batch of (unnormalized) states and return the (unnormalized) actions as predicted by using the model """ 76 | """ YOUR CODE HERE """ 77 | 78 | mean = tf.get_default_session().run(self.mean, feed_dict = {self.states : states}) 79 | std = np.ones(mean.shape[1], dtype=np.float32) 80 | 81 | return tf.get_default_session().run(tf.contrib.distributions.MultivariateNormalDiag(mean, std).sample()) 82 | -------------------------------------------------------------------------------- /controllers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from cost_functions import trajectory_cost_fn 3 | import time 4 | from iLQR import iLQR 5 | 6 | class Controller(): 7 | def __init__(self): 8 | pass 9 | 10 | # Get the appropriate action(s) for this state(s) 11 | def get_action(self, state, steps): 12 | pass 13 | 14 | 15 | class RandomController(Controller): 16 | def __init__(self, env): 17 | """ YOUR CODE HERE """ 18 | self.env = env 19 | 20 | def get_action(self, state, steps): 21 | """ YOUR CODE HERE """ 22 | """ Your code should randomly sample an action uniformly from the action space """ 23 | action = self.env.action_space.sample() 24 | return action 25 | 26 | 27 | class MPCcontroller(Controller): 28 | """ Controller built using the MPC method outlined in https://arxiv.org/abs/1708.02596 """ 29 | def __init__(self, 30 | env, 31 | dyn_model, 32 | horizon=5, 33 | cost_fn=None, 34 | num_simulated_paths=10, 35 | ): 36 | self.env = env 37 | self.dyn_model = dyn_model 38 | self.horizon = horizon 39 | self.cost_fn = cost_fn 40 | self.num_simulated_paths = num_simulated_paths 41 | 42 | def get_action(self, state, steps): 43 | """ YOUR CODE HERE """ 44 | """ Note: be careful to batch your simulations through the model for speed """ 45 | 46 | # sample sequeces of action_space 47 | action_set = np.reshape([self.env.action_space.sample() for i in range(self.num_simulated_paths * self.horizon)], (self.num_simulated_paths, self.horizon, self.env.action_space.shape[0])) 48 | 49 | state = np.repeat(state.reshape([1,-1]), self.num_simulated_paths, axis=0) 50 | cost = np.zeros([self.num_simulated_paths], dtype = np.float32) 51 | 52 | # predict next next_states 53 | for i in range(self.horizon): 54 | next_state = self.dyn_model.predict(state, action_set[:,i,:]) 55 | 56 | cost += self.cost_fn(state[:,:], action_set[:,i,:], next_state[:,:]) 57 | 58 | state = next_state 59 | 60 | # calculate cost and choose optimal path 61 | act = np.argmin(cost) 62 | return action_set[act, 0] 63 | 64 | class LQRcontroller(iLQR): 65 | 66 | def __init__(self, 67 | env, 68 | delta, 69 | T, 70 | dyn_model, 71 | cost_fn, 72 | iterations, 73 | ): 74 | 
iLQR.__init__(self, env, delta, T, dyn_model, cost_fn) 75 | self.iterations = iterations 76 | 77 | def get_action(self, state, steps): 78 | 79 | if not steps: 80 | self.U_hat = np.reshape([np.zeros(self.env.action_space.sample().shape) for i in range(self.T - 1)], (self.T - 1, self.env.action_space.shape[0])) 81 | 82 | self.X_hat = [] 83 | self.X_hat.append(state) 84 | x = state 85 | 86 | for i in range(self.T - 1): 87 | next_state = self.dyn_model.predict(x, self.U_hat[i,:]) 88 | 89 | self.X_hat.append(next_state[0]) 90 | x = next_state 91 | 92 | self.X_hat = np.asarray(self.X_hat) 93 | 94 | for i in range(self.iterations): 95 | self.backward(self.X_hat, self.U_hat) 96 | 97 | x = state #X_hat[0] 98 | 99 | U = np.zeros(self.U_hat.shape) 100 | X = np.zeros(self.X_hat.shape) 101 | 102 | for t in range(self.T - 1): 103 | u = self.get_action_one_step(x, t, self.X_hat[t], self.U_hat[t]) 104 | #print("control", u) 105 | X[t] = x 106 | U[t] = u 107 | 108 | x = self.dyn_model.predict(x, u)[0] 109 | 110 | X[-1] = x 111 | #print("X_hat for iteration {} is {}".format(i, X_hat)) 112 | #print("U_hat for iteration {} is {}".format(i, self.U_hat)) 113 | self.X_hat = X 114 | self.U_hat = U 115 | 116 | return self.U_hat[0] 117 | -------------------------------------------------------------------------------- /dynamics.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | # Predefined function to build a feedforward neural network 5 | def build_mlp(input_placeholder, 6 | output_size, 7 | scope, 8 | n_layers=2, 9 | size=500, 10 | activation=tf.nn.relu, 11 | output_activation=None 12 | ): 13 | out = input_placeholder 14 | with tf.variable_scope(scope): 15 | for _ in range(n_layers): 16 | out = tf.layers.dense(out, size, activation=activation) 17 | out = tf.layers.dense(out, output_size, activation=output_activation) 18 | return out 19 | 20 | class NNDynamicsModel(): 21 | def __init__(self, 22 | env, 23 | n_layers, 24 | size, 25 | activation, 26 | output_activation, 27 | normalization, 28 | batch_size, 29 | iterations, 30 | learning_rate, 31 | sess 32 | ): 33 | """ YOUR CODE HERE """ 34 | """ Note: Be careful about normalization """ 35 | 36 | self.normalization = normalization 37 | self.batch_size = batch_size 38 | self.iterations = iterations 39 | 40 | self.states = tf.placeholder(shape = [None, env.observation_space.shape[0]], dtype = tf.float32) 41 | self.actions = tf.placeholder(shape = [None, env.action_space.shape[0]], dtype = tf.float32) 42 | self.deltas = tf.placeholder(shape = [None, env.observation_space.shape[0]], dtype = tf.float32) 43 | state_action_pair = tf.concat([self.states, self.actions], 1) 44 | 45 | self.model = build_mlp(state_action_pair, env.observation_space.shape[0], "model", n_layers, size, activation, output_activation) 46 | 47 | self.loss = tf.reduce_mean(tf.square((self.deltas) - self.model)) 48 | self.update_op = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) 49 | 50 | def fit(self, data): 51 | """ 52 | Write a function to take in a dataset of (unnormalized)states, (unnormalized)actions, (unnormalized)next_states and fit the dynamics model going from normalized states, normalized actions to normalized state differences (s_t+1 - s_t) 53 | """ 54 | 55 | """YOUR CODE HERE """ 56 | observations = np.concatenate([path['observations'] for path in data]) 57 | actions = np.concatenate([path['actions'] for path in data]) 58 | next_observations = np.concatenate([path['next_observations'] 
for path in data]) 59 | deltas = next_observations - observations 60 | 61 | "Normalize the data" 62 | observations = (observations - self.normalization[0]) / (self.normalization[1] + 1e-10) 63 | actions = (actions - self.normalization[4]) / (self.normalization[5] + 1e-10) 64 | deltas = (deltas - self.normalization[2]) / (self.normalization[3] + 1e-10) 65 | 66 | for i in range(self.iterations): 67 | 68 | batch_id = np.random.choice(observations.shape[0], self.batch_size)#, replace = False) 69 | _ = tf.get_default_session().run(self.update_op, feed_dict = {self.states : observations[batch_id], self.actions : actions[batch_id], self.deltas : deltas[batch_id]}) 70 | 71 | 72 | def predict(self, states, actions): 73 | """ Write a function to take in a batch of (unnormalized) states and (unnormalized) actions and return the (unnormalized) next states as predicted by using the model """ 74 | """ YOUR CODE HERE """ 75 | 76 | if len(states.shape) == 1: 77 | states = states.reshape((1, states.shape[0])) 78 | if len(actions.shape) == 1: 79 | actions = actions.reshape((1, actions.shape[0])) 80 | 81 | # Normalize inputs with the same statistics used in fit(), since the network is trained on normalized states and actions 82 | norm_states = (states - self.normalization[0]) / (self.normalization[1] + 1e-10) 83 | norm_actions = (actions - self.normalization[4]) / (self.normalization[5] + 1e-10) 84 | 85 | # The network predicts normalized state deltas, so denormalize them before adding to the unnormalized states 86 | norm_deltas = tf.get_default_session().run(self.model, feed_dict = {self.states : norm_states, self.actions : norm_actions}) 87 | next_observations = states + norm_deltas * (self.normalization[3] + 1e-10) + self.normalization[2] 88 | return next_observations 89 | -------------------------------------------------------------------------------- /logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | if osp.exists(G.output_dir): 55 | print("Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir) 56 | else: 57 | os.makedirs(G.output_dir) 58 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 59 | atexit.register(G.output_file.close) 60 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 61 | 62 | def log_tabular(key, val): 63 | """ 64 | Log a value of some diagnostic 65 | Call this once for each diagnostic quantity, each iteration 66 | """ 67 | if G.first_row: 68 | G.log_headers.append(key) 69 | else: 70 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 71 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 72 | G.log_current_row[key] = val 73 | 74 | def save_params(params): 75 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 76 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 77 | 78 | def pickle_tf_vars(): 79 | """ 80 | Saves tensorflow variables 81 | Requires them to be initialized first, also a default session must exist 82 | """ 83 | _dict = {v.name : v.eval() for v in tf.global_variables()} 84 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 85 | pickle.dump(_dict, f) 86 | 87 | 88 | def dump_tabular(): 89 | """ 90 | Write all of the diagnostics from the current iteration 91 | """ 92 | vals = [] 93 | key_lens = [len(key) for key in G.log_headers] 94 | max_key_len = max(15,max(key_lens)) 95 | keystr = '%'+'%d'%max_key_len 96 | fmt = "| " + keystr + "s | %15s |" 97 | n_slashes = 22 + max_key_len 98 | print("-"*n_slashes) 99 | for key in G.log_headers: 100 | val = G.log_current_row.get(key, "") 101 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 102 | else: valstr = val 103 | print(fmt%(key, valstr)) 104 | vals.append(val) 105 | print("-"*n_slashes) 106 | if G.output_file is not None: 107 | if G.first_row: 108 | G.output_file.write("\t".join(G.log_headers)) 109 | G.output_file.write("\n") 110 | G.output_file.write("\t".join(map(str,vals))) 111 | G.output_file.write("\n") 112 | G.output_file.flush() 113 | G.log_current_row.clear() 114 | G.first_row=False 115 | -------------------------------------------------------------------------------- /plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. 
Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | sns.set(style="darkgrid", font_scale=1.5) 55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 56 | plt.legend(loc='best').draggable() 57 | plt.show() 58 | 59 | 60 | def get_datasets(fpath, condition=None): 61 | unit = 0 62 | datasets = [] 63 | for root, dir, files in os.walk(fpath): 64 | if 'log.txt' in files: 65 | param_path = open(os.path.join(root,'params.json')) 66 | params = json.load(param_path) 67 | exp_name = params['exp_name'] 68 | 69 | log_path = os.path.join(root,'log.txt') 70 | experiment_data = pd.read_table(log_path) 71 | 72 | experiment_data.insert( 73 | len(experiment_data.columns), 74 | 'Unit', 75 | unit 76 | ) 77 | experiment_data.insert( 78 | len(experiment_data.columns), 79 | 'Condition', 80 | condition or exp_name 81 | ) 82 | 83 | datasets.append(experiment_data) 84 | unit += 1 85 | 86 | return datasets 87 | 88 | 89 | def main(): 90 | import argparse 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('logdir', nargs='*') 93 | parser.add_argument('--legend', nargs='*') 94 | parser.add_argument('--value', default='AverageReturn', nargs='*') 95 | args = parser.parse_args() 96 | 97 | use_legend = False 98 | if args.legend is not None: 99 | assert len(args.legend) == len(args.logdir), \ 100 | "Must give a legend title for each set of experiments." 
101 | use_legend = True 102 | 103 | data = [] 104 | if use_legend: 105 | for logdir, legend_title in zip(args.logdir, args.legend): 106 | data += get_datasets(logdir, legend_title) 107 | else: 108 | for logdir in args.logdir: 109 | data += get_datasets(logdir) 110 | 111 | if isinstance(args.value, list): 112 | values = args.value 113 | else: 114 | values = [args.value] 115 | for value in values: 116 | plot_data(data, value=value) 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /iLQR.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | from scipy.linalg import inv 4 | from scipy.optimize import approx_fprime 5 | import time 6 | from autograd import jacobian 7 | 8 | class iLQR: 9 | 10 | def __init__(self, env, delta, T, dyn_model, cost_fn): 11 | 12 | self.env = env 13 | self.min_factor = 2 14 | self.factor = self.min_factor 15 | self.min_mu = 1e-6 16 | self.mu = self.min_mu 17 | self.delta = delta 18 | self.T = T 19 | self.dyn_model = dyn_model 20 | self.cost_fn = cost_fn 21 | self.control_low = self.env.action_space.low 22 | self.control_high = self.env.action_space.high 23 | 24 | def increase(self, mu): 25 | self.factor = np.maximum(self.factor, self.factor * self.min_factor) 26 | self.mu = np.maximum(self.min_mu, self.mu * self.factor) 27 | 28 | def decrease(self, mu): 29 | self.factor = np.minimum(1 / self.min_factor, self.factor / self.min_factor) 30 | if self.mu * self.factor > self.min_mu: 31 | self.mu = self.mu * self.factor 32 | else: 33 | self.mu = 0 34 | 35 | def simulate_step(self, x): 36 | 37 | xu = [x[:self.env.observation_space.shape[0]], x[self.env.observation_space.shape[0]:]] 38 | 39 | next_x = self.dyn_model.predict(xu[0], xu[1]) 40 | "get cost" 41 | cost = self.cost_fn(xu[0], xu[1], next_x[0]) 42 | 43 | return next_x[0], cost 44 | 45 | def simulate_next_state(self, x, i): 46 | return self.simulate_step(x)[0][i] 47 | 48 | def simulate_cost(self, x): 49 | return self.simulate_step(x)[1] 50 | 51 | def approx_fdoubleprime(self, x, i): 52 | return approx_fprime(x, self.simulate_cost, self.delta)[i] 53 | 54 | def finite_difference(self, x, u): 55 | 56 | "calling finite difference for delta perturbation" 57 | xu = np.concatenate((x, u)) 58 | 59 | F = np.zeros((x.shape[0], xu.shape[0])) 60 | 61 | for i in range(x.shape[0]): 62 | F[i,:] = approx_fprime(xu, self.simulate_next_state, self.delta, i) 63 | 64 | c = approx_fprime(xu, self.simulate_cost, self.delta) 65 | 66 | C = np.zeros((len(xu), len(xu))) 67 | 68 | for i in range(xu.shape[0]): 69 | C[i,:] = approx_fprime(xu, self.approx_fdoubleprime, self.delta, i) 70 | 71 | f = np.zeros((len(x))) 72 | 73 | return C, F, c, f 74 | 75 | def differentiate(self, x_seq, u_seq): 76 | 77 | "get gradient values using finite difference" 78 | 79 | C, F, c, f = [], [], [], [] 80 | 81 | for t in range(self.T - 1): 82 | 83 | Ct, Ft, ct, ft = self.finite_difference(x_seq[t], u_seq[t]) 84 | 85 | C.append(Ct) 86 | F.append(Ft) 87 | c.append(ct) 88 | f.append(ft) 89 | 90 | "TODO : C, F, c, f for time step T are different. Why ?" 
91 | 92 | u = np.zeros((u_seq[0].shape)) 93 | 94 | Ct, Ft, ct, ft = self.finite_difference(x_seq[-1], u) 95 | 96 | C.append(Ct) 97 | F.append(Ft) 98 | c.append(ct) 99 | f.append(ft) 100 | 101 | return C, F, c, f 102 | 103 | def backward(self, x_seq, u_seq): 104 | 105 | "initialize F_t, C_t, f_t, c_t, V_t, v_t" 106 | C, F, c, f = self.differentiate(x_seq, u_seq) 107 | 108 | n = x_seq[0].shape[0] 109 | 110 | "initialize V_t1 and v_t1" 111 | 112 | c_x = c[-1][:n] 113 | c_u = c[-1][n:] 114 | 115 | C_xx = C[-1][:n,:n] 116 | C_xu = C[-1][:n,n:] 117 | C_ux = C[-1][n:,:n] 118 | C_uu = C[-1][n:,n:] 119 | 120 | #C_uu = C_uu + self.mu * np.eye(C_uu.shape[0]) 121 | 122 | K = np.zeros((self.T+1, u_seq[0].shape[0], x_seq[0].shape[0])) 123 | k = np.zeros((self.T+1, u_seq[0].shape[0])) 124 | 125 | V = np.zeros((self.T+1, x_seq[0].shape[0], x_seq[0].shape[0])) 126 | v = np.zeros((self.T+1, x_seq[0].shape[0])) 127 | 128 | #K[-1] = -np.dot(inv(C_uu), C_ux) 129 | #k[-1] = -np.dot(inv(C_uu), c_u) 130 | 131 | #V[-1] = C_xx + np.dot(C_xu, K[-1]) + np.dot(K[-1].T, C_ux) + np.dot(np.dot(K[-1].T, C_uu), K[-1]) 132 | #v[-1] = c_x + np.dot(C_xu, k[-1]) + np.dot(K[-1].T, c_u) + np.dot(np.dot(K[-1].T, C_uu), k[-1]) 133 | V[-1] = C_xx 134 | v[-1] = c_x 135 | 136 | "initialize Q_t1 and q_t1" 137 | 138 | Q = list(np.zeros((self.T))) 139 | q = list(np.zeros((self.T))) 140 | 141 | self.std = [] 142 | "loop till horizon" 143 | 144 | t = self.T-1 145 | while t >= 0: 146 | #for t in range(self.T-1, -1, -1): 147 | "update Q" 148 | 149 | Q[t] = C[t] + np.dot(np.dot(F[t].T, V[t+1] + self.mu * np.eye(V[t+1].shape[0])), F[t]) #+ 0.01 * np.eye(V[t+1].shape[0])), 150 | q[t] = c[t] + np.dot(F[t].T, v[t+1]) #+ np.dot(np.dot(F[t].T, V[t+1] + 0.00001 * np.eye(V[t+1].shape[0])), f[t]) 151 | 152 | "differentiate Q to get Q_uu, Q_xx, Q_ux, Q_u, Q_x" 153 | 154 | q_x = q[t][:n] 155 | q_u = q[t][n:] 156 | 157 | Q_xx = Q[t][:n,:n] 158 | Q_xu = Q[t][:n,n:] 159 | Q_ux = Q[t][n:,:n] 160 | Q_uu = Q[t][n:,n:] 161 | 162 | #Q_uu = Q_uu + 100 * np.eye(Q_uu.shape[0]) 163 | try: 164 | np.linalg.cholesky(Q_uu) 165 | except: 166 | self.increase(self.mu) 167 | continue 168 | 169 | "update K, k, V, v" 170 | #print("q_uu", Q_uu) 171 | K[t] = -np.dot(inv(Q_uu), Q_ux) 172 | k[t] = -np.dot(inv(Q_uu), q_u) 173 | 174 | V[t] = Q_xx + np.dot(Q_xu, K[t]) + np.dot(K[t].T, Q_ux) + np.dot(np.dot(K[t].T, Q_uu), K[t]) 175 | v[t] = q_x + np.dot(Q_xu, k[t]) + np.dot(K[t].T, q_u) + np.dot(np.dot(K[t].T, Q_uu), k[t]) 176 | 177 | self.std.append(inv(Q_uu)) 178 | self.decrease(self.mu) 179 | t -= 1 180 | 181 | self.K = K 182 | self.k = k 183 | 184 | def get_action_one_step(self, state, t, x, u): 185 | 186 | "TODO : Add delta U's to given action array" 187 | #print("Q_uu ", self.std[t]) 188 | mean = np.dot(self.K[t], (state - x)) + self.k[t] + u 189 | return np.clip(np.random.normal(mean, self.std[t]), self.control_low, self.control_high) 190 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | from dynamics import NNDynamicsModel 5 | from policy import NNPolicy 6 | from controllers import MPCcontroller, RandomController, LQRcontroller 7 | from cost_functions import cheetah_cost_fn, trajectory_cost_fn, pendulum_cost_fn, reacher_cost_fn 8 | import time 9 | import logz 10 | import os 11 | import copy 12 | #import matplotlib.pyplot as plt 13 | from cheetah_env import HalfCheetahEnvNew 14 | from 
mpi4py import MPI 15 | 16 | def sample(env, 17 | controller, 18 | num_paths=10, 19 | horizon=1000, 20 | render=False, 21 | verbose=False): 22 | """ 23 | Write a sampler function which takes in an environment, a controller (either random or the MPC controller), 24 | and returns rollouts by running on the env. 25 | Each path can have elements for observations, next_observations, rewards, returns, actions, etc. 26 | """ 27 | paths = [] 28 | """ YOUR CODE HERE """ 29 | for i in range(num_paths): 30 | 31 | ob = env.reset() 32 | obs, acs, rewards, next_obs = [], [], [], [] 33 | steps = 0 34 | #print("sampling trajectory %d"%i) 35 | 36 | while True: 37 | obs.append(ob) 38 | ac = controller.get_action(ob, steps) 39 | env.render() 40 | print("control", ac) 41 | acs.append(ac) 42 | next_ob, rew, done, _ = env.step(ac) 43 | steps += 1 44 | ob = next_ob 45 | rewards.append(rew) 46 | next_obs.append(next_ob) 47 | if done or steps > 100: 48 | print("steps", steps) 49 | break 50 | path = {"observations" : np.array(obs), 51 | "rewards" : np.array(rewards), 52 | "actions" : np.array(acs), 53 | "next_observations" : np.array(next_obs)} 54 | paths.append(path) 55 | 56 | return paths 57 | 58 | # Utility to compute cost a path for a given cost function 59 | def path_cost(cost_fn, path): 60 | return trajectory_cost_fn(cost_fn, path['observations'], path['actions'], path['next_observations']) 61 | 62 | def compute_normalization(data): 63 | """ 64 | Write a function to take in a dataset and compute the means, and stds. 65 | Return 6 elements: mean of s_t, std of s_t, mean of (s_t+1 - s_t), std of (s_t+1 - s_t), mean of actions, std of actions 66 | """ 67 | 68 | """ YOUR CODE HERE """ 69 | 70 | mean_obs = np.mean(np.concatenate([path['observations'] for path in data]), axis=0) 71 | std_obs = np.std(np.concatenate([path['observations'] for path in data]), axis=0) 72 | mean_action = np.mean(np.concatenate([path['actions'] for path in data]), axis=0) 73 | std_action = np.std(np.concatenate([path['actions'] for path in data]), axis=0) 74 | mean_deltas = np.mean(np.concatenate([path['next_observations'] - path['observations'] for path in data]), axis=0) 75 | std_deltas = np.std(np.concatenate([path['next_observations'] - path['observations'] for path in data]), axis=0) 76 | 77 | return mean_obs, std_obs, mean_deltas, std_deltas, mean_action, std_action 78 | 79 | 80 | def plot_comparison(env, dyn_model): 81 | """ 82 | Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions. 83 | """ 84 | """ YOUR CODE HERE """ 85 | pass 86 | 87 | def train(env, 88 | cost_fn, 89 | load_model, 90 | model_path, 91 | logdir=None, 92 | render=False, 93 | learning_rate_dyn=1e-3, 94 | learning_rate_policy=1e-4, 95 | onpol_iters=10, 96 | dynamics_iters=60, 97 | policy_iters=100, 98 | batch_size=512, 99 | num_paths_random=10, 100 | num_paths_onpol=5, 101 | num_simulated_paths=10000, 102 | env_horizon=1000, 103 | mpc_horizon=15, 104 | n_layers=2, 105 | size=500, 106 | activation=tf.nn.relu, 107 | output_activation=None, 108 | ): 109 | 110 | """ 111 | 112 | Arguments: 113 | 114 | onpol_iters Number of iterations of onpolicy aggregation for the loop to run. 115 | 116 | dynamics_iters Number of iterations of training for the dynamics model 117 | |_ which happen per iteration of the aggregation loop. 118 | 119 | batch_size Batch size for dynamics training. 
120 | 121 | num_paths_random Number of paths/trajectories/rollouts generated 122 | | by a random agent. We use these to train our 123 | |_ initial dynamics model. 124 | 125 | num_paths_onpol Number of paths to collect at each iteration of 126 | |_ aggregation, using the Model Predictive Control policy. 127 | 128 | num_simulated_paths How many fictitious rollouts the MPC policy 129 | | should generate each time it is asked for an 130 | |_ action. 131 | 132 | env_horizon Number of timesteps in each path. 133 | 134 | mpc_horizon The MPC policy generates actions by imagining 135 | | fictitious rollouts, and picking the first action 136 | | of the best fictitious rollout. This argument is 137 | | how many timesteps should be in each fictitious 138 | |_ rollout. 139 | 140 | n_layers/size/activations Neural network architecture arguments. 141 | 142 | """ 143 | 144 | #logz.configure_output_dir(logdir) 145 | 146 | #======================================================== 147 | # 148 | # First, we need a lot of data generated by a random 149 | # agent, with which we'll begin to train our dynamics 150 | # model. 151 | 152 | random_controller = RandomController(env) 153 | 154 | """ YOUR CODE HERE """ 155 | data = sample(env, random_controller, num_paths_random, env_horizon) 156 | 157 | 158 | #======================================================== 159 | # 160 | # The random data will be used to get statistics (mean 161 | # and std) for the observations, actions, and deltas 162 | # (where deltas are o_{t+1} - o_t). These will be used 163 | # for normalizing inputs and denormalizing outputs 164 | # from the dynamics network. 165 | # 166 | """ YOUR CODE HERE """ 167 | normalization = compute_normalization(data) 168 | 169 | 170 | #======================================================== 171 | # 172 | # Build dynamics model and MPC controllers. 173 | # 174 | sess = tf.Session() 175 | 176 | dyn_model = NNDynamicsModel(env=env, 177 | n_layers=n_layers, 178 | size=size, 179 | activation=activation, 180 | output_activation=output_activation, 181 | normalization=normalization, 182 | batch_size=batch_size, 183 | iterations=dynamics_iters, 184 | learning_rate=learning_rate_dyn, 185 | sess=sess) 186 | 187 | policy = NNPolicy(env=env, 188 | normalization=normalization, 189 | batch_size=batch_size, 190 | iterations=policy_iters, 191 | learning_rate=learning_rate_policy, 192 | sess=sess, 193 | model_path=model_path, 194 | save_path="./policy/", 195 | load_model=load_model) 196 | 197 | mpc_controller = MPCcontroller(env=env, 198 | dyn_model=dyn_model, 199 | horizon=mpc_horizon, 200 | cost_fn=cost_fn, 201 | num_simulated_paths=num_simulated_paths) 202 | 203 | lqr_controller = LQRcontroller(env=env, 204 | delta=0.005, 205 | T=50, 206 | dyn_model=dyn_model, 207 | cost_fn=cost_fn, 208 | iterations=1) 209 | 210 | comm = MPI.COMM_WORLD 211 | size = comm.Get_size() 212 | rank = comm.Get_rank() 213 | 214 | #======================================================== 215 | # 216 | # Tensorflow session building. 217 | # 218 | 219 | sess.__enter__() 220 | tf.global_variables_initializer().run() 221 | 222 | #======================================================== 223 | # 224 | # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
225 | # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 226 | # 227 | 228 | # training the MPC controller as well as dynamics 229 | for itr in range(onpol_iters): 230 | 231 | print("fitting dynamics for worker ", rank) 232 | dyn_model.fit(data) 233 | print("sampling new trajectories from worker ", rank) 234 | new_data = sample(env, lqr_controller, num_paths_onpol, env_horizon) 235 | 236 | data += new_data 237 | comm.send(new_data, 0) 238 | 239 | if rank == 0: 240 | costs, returns = [], [] 241 | 242 | for path in data: 243 | 244 | costs.append(path_cost(cost_fn, path)) 245 | returns.append(np.sum(path['rewards'])) 246 | 247 | print("returns ",returns) 248 | 249 | for i in range(1, size): 250 | data += comm.recv(source=i) 251 | 252 | print("fitting policy...") 253 | policy.fit(data) 254 | 255 | # LOGGING 256 | # Statistics for performance of MPC policy using 257 | # our learned dynamics model 258 | logz.log_tabular('Iteration', itr) 259 | # In terms of cost function which your MPC controller uses to plan 260 | logz.log_tabular('AverageCost', np.mean(costs)) 261 | logz.log_tabular('StdCost', np.std(costs)) 262 | logz.log_tabular('MinimumCost', np.min(costs)) 263 | logz.log_tabular('MaximumCost', np.max(costs)) 264 | # In terms of true environment reward of your rolled out trajectory using the MPC controller 265 | logz.log_tabular('AverageReturn', np.mean(returns)) 266 | logz.log_tabular('StdReturn', np.std(returns)) 267 | logz.log_tabular('MinimumReturn', np.min(returns)) 268 | logz.log_tabular('MaximumReturn', np.max(returns)) 269 | 270 | logz.dump_tabular() 271 | 272 | # applying the learned neural policy 273 | if rank == 0: 274 | ob = env.reset() 275 | 276 | while True: 277 | a = policy.get_action(ob.reshape((1, ob.shape[0]))) 278 | 279 | # control clipping to be added 280 | 281 | next_ob, reward, done, info = env.step(a[0]) 282 | print("action", a) 283 | print("predicted ob", dyn_model.predict(ob, a)) 284 | print("actual ob", (next_ob - normalization[0]) / (normalization[1] + 1e-10)) 285 | env.render() 286 | ob = next_ob 287 | if done: 288 | ob = env.reset() 289 | 290 | def main(): 291 | 292 | import argparse 293 | parser = argparse.ArgumentParser() 294 | parser.add_argument('--env_name', type=str, default='Pendulum-v1') 295 | # Experiment meta-params 296 | parser.add_argument('--exp_name', type=str, default='mb_mpc') 297 | parser.add_argument('--seed', type=int, default=5) 298 | parser.add_argument('--render', action='store_true') 299 | parser.add_argument('--model_path', '-mp', type=str, default="./policy/-0") 300 | parser.add_argument('--load_model', '-lm', type=str, default=False) 301 | # Training args 302 | parser.add_argument('--learning_rate_dyn', '-lr', type=float, default=1e-3) 303 | parser.add_argument('--learning_rate_policy', '-lrp', type=float, default=1e-4) 304 | parser.add_argument('--onpol_iters', '-n', type=int, default=5) 305 | parser.add_argument('--dyn_iters', '-nd', type=int, default=100) 306 | parser.add_argument('--policy_iters', '-ndp', type=int, default=100) 307 | parser.add_argument('--batch_size', '-b', type=int, default=512) 308 | # Data collection 309 | parser.add_argument('--random_paths', '-r', type=int, default=10) 310 | parser.add_argument('--onpol_paths', '-d', type=int, default=10) 311 | parser.add_argument('--simulated_paths', '-sp', type=int, default=1000) 312 | parser.add_argument('--ep_len', '-ep', type=int, default=1000) 313 | # Neural network architecture args 314 | 
parser.add_argument('--n_layers', '-l', type=int, default=2) 315 | parser.add_argument('--size', '-s', type=int, default=500) 316 | # MPC Controller 317 | parser.add_argument('--mpc_horizon', '-m', type=int, default=20) 318 | args = parser.parse_args() 319 | 320 | # Set seed 321 | np.random.seed(args.seed) 322 | tf.set_random_seed(args.seed) 323 | 324 | # Make data directory if it does not already exist 325 | #if not(os.path.exists('data')): 326 | # os.makedirs('data') 327 | #logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 328 | #logdir = os.path.join('data', logdir) 329 | #if not(os.path.exists(logdir)): 330 | # os.makedirs(logdir) 331 | 332 | # Make env 333 | if args.env_name == "HalfCheetah-v1": 334 | env = HalfCheetahEnvNew() 335 | cost_fn = cheetah_cost_fn 336 | 337 | elif args.env_name == "Pendulum-v1": 338 | env = gym.make('InvertedPendulum-v1') 339 | cost_fn = pendulum_cost_fn 340 | 341 | elif args.env_name == "Reacher-v1": 342 | env = gym.make('Reacher-v1') 343 | cost_fn = reacher_cost_fn 344 | 345 | else: 346 | raise ValueError("Unknown env_name: %s" % args.env_name) 347 | train(env=env, 348 | cost_fn=cost_fn, 349 | load_model=args.load_model, 350 | model_path=args.model_path, 351 | logdir=None, 352 | render=args.render, 353 | learning_rate_dyn=args.learning_rate_dyn, 354 | learning_rate_policy=args.learning_rate_policy, 355 | onpol_iters=args.onpol_iters, 356 | dynamics_iters=args.dyn_iters, 357 | policy_iters=args.policy_iters, 358 | batch_size=args.batch_size, 359 | num_paths_random=args.random_paths, 360 | num_paths_onpol=args.onpol_paths, 361 | num_simulated_paths=args.simulated_paths, 362 | env_horizon=args.ep_len, 363 | mpc_horizon=args.mpc_horizon, 364 | n_layers=args.n_layers, 365 | size=args.size, 366 | activation=tf.nn.relu, 367 | output_activation=None, 368 | ) 369 | 370 | if __name__ == "__main__": 371 | main() 372 | --------------------------------------------------------------------------------
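The `plot_comparison` function in `main.py` is left as a stub. A minimal sketch of one way to fill it in is shown below, assuming `matplotlib` is available; the rollout horizon, the open-loop use of random actions, and the output file name `model_vs_truth.png` are illustrative choices rather than part of the original code, and `dyn_model.predict` is assumed to return unnormalized next states as its docstring specifies.

import numpy as np
import matplotlib.pyplot as plt

def plot_comparison(env, dyn_model, horizon=100):
    """Roll out random actions and compare the model's open-loop predictions
    for each state dimension against the true environment states."""
    pred_ob = env.reset().copy()
    true_states, pred_states = [], []
    for _ in range(horizon):
        ac = env.action_space.sample()
        next_ob, _, done, _ = env.step(ac)
        # predict() accepts a single (unbatched) state/action pair and returns a batch of size 1
        pred_ob = dyn_model.predict(pred_ob, ac)[0]
        true_states.append(next_ob)
        pred_states.append(pred_ob)
        if done:
            break
    true_states = np.array(true_states)
    pred_states = np.array(pred_states)

    # One subplot per state dimension: ground truth vs. model prediction
    n_dims = true_states.shape[1]
    fig, axes = plt.subplots(n_dims, 1, figsize=(8, 2 * n_dims), sharex=True, squeeze=False)
    for i in range(n_dims):
        axes[i, 0].plot(true_states[:, i], label='ground truth')
        axes[i, 0].plot(pred_states[:, i], label='model prediction')
        axes[i, 0].set_ylabel('state[%d]' % i)
    axes[0, 0].legend()
    axes[-1, 0].set_xlabel('timestep')
    fig.tight_layout()
    fig.savefig('model_vs_truth.png')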