├── Docs ├── plant.gif ├── simulink_model.png └── tracking.gif ├── IQL_conventional ├── __init__.py ├── agents │ ├── __init__.py │ ├── models.py │ ├── policies.py │ └── utils.py ├── config │ ├── config_ford.ini │ └── config_gym.ini ├── env │ ├── __init__.py │ ├── env_ford.py │ ├── test.py │ └── utils.py ├── main.py ├── trainer.py └── utils.py ├── README.md └── examples ├── PV ├── parameters.m ├── pv_inverter.slx ├── pv_inverter_pid.slx └── pv_inverter_pv.slx ├── plant_ex ├── plant.py └── plant.slx └── tracking ├── linear_controller.m ├── tracking.py ├── tracking.slx └── tracking1.slx /Docs/plant.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DongChen06/Python2Simulink/f7660e4d2ef656e74e0cde6837b2652c432ef288/Docs/plant.gif -------------------------------------------------------------------------------- /Docs/simulink_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DongChen06/Python2Simulink/f7660e4d2ef656e74e0cde6837b2652c432ef288/Docs/simulink_model.png -------------------------------------------------------------------------------- /Docs/tracking.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DongChen06/Python2Simulink/f7660e4d2ef656e74e0cde6837b2652c432ef288/Docs/tracking.gif -------------------------------------------------------------------------------- /IQL_conventional/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DongChen06/Python2Simulink/f7660e4d2ef656e74e0cde6837b2652c432ef288/IQL_conventional/__init__.py -------------------------------------------------------------------------------- /IQL_conventional/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DongChen06/Python2Simulink/f7660e4d2ef656e74e0cde6837b2652c432ef288/IQL_conventional/agents/__init__.py -------------------------------------------------------------------------------- /IQL_conventional/agents/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .utils import * 3 | from .policies import * 4 | import logging 5 | import multiprocessing as mp 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | 10 | class A2C: 11 | def __init__(self, n_s, n_a, total_step, model_config, seed=0, n_f=None): 12 | # load parameters 13 | self.name = 'a2c' 14 | self.n_agent = 1 15 | # init reward norm/clip 16 | self.reward_clip = model_config.getfloat('reward_clip') 17 | self.reward_norm = model_config.getfloat('reward_norm') 18 | self.n_s = n_s 19 | self.n_a = n_a 20 | self.n_step = model_config.getint('batch_size') 21 | # init tf 22 | tf.reset_default_graph() 23 | tf.set_random_seed(seed) 24 | config = tf.ConfigProto(allow_soft_placement=True) 25 | self.sess = tf.Session(config=config) 26 | self.policy = self._init_policy(n_s, n_a, n_f, model_config) 27 | self.saver = tf.train.Saver(max_to_keep=15) 28 | if total_step: 29 | # training 30 | self.total_step = total_step 31 | self._init_scheduler(model_config) 32 | self._init_train(model_config) 33 | self.sess.run(tf.global_variables_initializer()) 34 | 35 | def _init_policy(self, n_s, n_a, n_f, model_config, agent_name=None): 36 | n_fw = model_config.getint('num_fw') 37 | n_ft = model_config.getint('num_ft') 38 | n_lstm = 
model_config.getint('num_lstm') 39 | policy = None 40 | return policy 41 | 42 | def _init_scheduler(self, model_config): 43 | lr_init = model_config.getfloat('lr_init') 44 | lr_decay = model_config.get('lr_decay') 45 | beta_init = model_config.getfloat('entropy_coef_init') 46 | beta_decay = model_config.get('entropy_decay') 47 | if lr_decay == 'constant': 48 | self.lr_scheduler = Scheduler(lr_init, decay=lr_decay) 49 | if beta_decay == 'constant': 50 | self.beta_scheduler = Scheduler(beta_init, decay=beta_decay) 51 | 52 | def _init_train(self, model_config): 53 | # init loss 54 | v_coef = model_config.getfloat('value_coef') 55 | max_grad_norm = model_config.getfloat('max_grad_norm') 56 | alpha = model_config.getfloat('rmsp_alpha') 57 | epsilon = model_config.getfloat('rmsp_epsilon') 58 | self.policy.prepare_loss(v_coef, max_grad_norm, alpha, epsilon) 59 | 60 | # init replay buffer 61 | gamma = model_config.getfloat('gamma') 62 | self.trans_buffer = OnPolicyBuffer(gamma) 63 | 64 | def save(self, model_dir, global_step): 65 | self.saver.save(self.sess, model_dir + 'checkpoint', 66 | global_step=global_step) 67 | 68 | def load(self, model_dir, checkpoint=None): 69 | save_file = None 70 | save_step = 0 71 | if os.path.exists(model_dir): 72 | if checkpoint is None: 73 | for file in os.listdir(model_dir): 74 | if file.startswith('checkpoint'): 75 | prefix = file.split('.')[0] 76 | tokens = prefix.split('-') 77 | if len(tokens) != 2: 78 | continue 79 | cur_step = int(tokens[1]) 80 | if cur_step > save_step: 81 | save_file = prefix 82 | save_step = cur_step 83 | else: 84 | save_file = 'checkpoint-' + str(int(checkpoint)) 85 | if save_file is not None: 86 | self.saver.restore(self.sess, model_dir + save_file) 87 | logging.info('Checkpoint loaded: %s' % save_file) 88 | return True 89 | logging.error('Can not find old checkpoint for %s' % model_dir) 90 | return False 91 | 92 | def reset(self): 93 | self.policy._reset() 94 | 95 | def backward(self, R, summary_writer=None, global_step=None): 96 | cur_lr = self.lr_scheduler.get(self.n_step) 97 | cur_beta = self.beta_scheduler.get(self.n_step) 98 | obs, acts, dones, Rs, Advs = self.trans_buffer.sample_transition(R) 99 | self.policy.backward(self.sess, obs, acts, dones, Rs, Advs, cur_lr, cur_beta, 100 | summary_writer=summary_writer, global_step=global_step) 101 | 102 | def forward(self, ob, done, out_type='pv'): 103 | return self.policy.forward(self.sess, ob, done, out_type) 104 | 105 | def add_transition(self, ob, action, reward, value, done): 106 | # Hard code the reward norm for negative reward only 107 | if (self.reward_norm): 108 | reward /= self.reward_norm 109 | if self.reward_clip: 110 | reward = np.clip(reward, -self.reward_clip, self.reward_clip) 111 | self.trans_buffer.add_transition(ob, action, reward, value, done) 112 | 113 | 114 | class IQL(A2C): 115 | def __init__(self, n_s_ls, n_a_ls, total_step, model_config, seed=0, model_type='dqn'): 116 | self.name = 'iql' 117 | self.model_type = model_type 118 | self.agents = [] 119 | self.n_agent = len(n_s_ls) 120 | self.reward_clip = model_config.getfloat('reward_clip') 121 | self.reward_norm = model_config.getfloat('reward_norm') 122 | self.n_s_ls = n_s_ls 123 | self.n_a_ls = n_a_ls 124 | self.n_step = model_config.getint('batch_size') 125 | # init tf 126 | tf.reset_default_graph() 127 | tf.set_random_seed(seed) 128 | config = tf.ConfigProto(allow_soft_placement=True) 129 | self.sess = tf.Session(config=config) 130 | self.policy_ls = [] 131 | for i, (n_s, n_a) in enumerate(zip(self.n_s_ls, 
self.n_a_ls)): 132 | # agent_name is needed to differentiate multi-agents 133 | self.policy_ls.append(self._init_policy(n_s, n_a, model_config, 134 | agent_name='{:d}a'.format(i))) 135 | self.saver = tf.train.Saver(max_to_keep=5) 136 | if total_step: 137 | # training 138 | self.total_step = total_step 139 | self._init_scheduler(model_config) 140 | self._init_train(model_config) 141 | self.cur_step = 0 142 | self.sess.run(tf.global_variables_initializer()) 143 | 144 | def _init_policy(self, n_s, n_a, model_config, agent_name=None): 145 | if self.model_type == 'dqn': 146 | n_h = model_config.getint('num_h') 147 | n_fc = model_config.getint('num_fc') 148 | policy = DeepQPolicy(n_s, n_a, self.n_step, n_fc0=n_fc, n_fc=n_h, 149 | name=agent_name) 150 | return policy 151 | 152 | def _init_scheduler(self, model_config): 153 | lr_init = model_config.getfloat('lr_init') 154 | lr_decay = model_config.get('lr_decay') 155 | eps_init = model_config.getfloat('epsilon_init') 156 | eps_decay = model_config.get('epsilon_decay') 157 | if lr_decay == 'constant': 158 | self.lr_scheduler = Scheduler(lr_init, decay=lr_decay) 159 | else: 160 | lr_min = model_config.getfloat('lr_min') 161 | self.lr_scheduler = Scheduler( 162 | lr_init, lr_min, self.total_step, decay=lr_decay) 163 | if eps_decay == 'constant': 164 | self.eps_scheduler = Scheduler(eps_init, decay=eps_decay) 165 | else: 166 | eps_min = model_config.getfloat('epsilon_min') 167 | eps_ratio = model_config.getfloat('epsilon_ratio') 168 | self.eps_scheduler = Scheduler(eps_init, eps_min, self.total_step * eps_ratio, 169 | decay=eps_decay) 170 | 171 | def _init_train(self, model_config): 172 | # init loss 173 | max_grad_norm = model_config.getfloat('max_grad_norm') 174 | gamma = model_config.getfloat('gamma') 175 | buffer_size = model_config.getfloat('buffer_size') 176 | self.trans_buffer_ls = [] 177 | for i in range(self.n_agent): 178 | self.policy_ls[i].prepare_loss(max_grad_norm, gamma) 179 | self.trans_buffer_ls.append(ReplayBuffer(buffer_size, self.n_step)) 180 | 181 | def backward(self, summary_writer=None, global_step=None): 182 | # update networks 183 | cur_lr = self.lr_scheduler.get(self.n_step) 184 | if self.trans_buffer_ls[0].size < self.trans_buffer_ls[0].batch_size: 185 | return 186 | for i in range(self.n_agent): 187 | for k in range(10): # update network 10 times 188 | obs, acts, next_obs, rs, dones = self.trans_buffer_ls[i].sample_transition() 189 | if i == 0: 190 | self.policy_ls[i].backward(self.sess, obs, np.squeeze(acts), next_obs, dones, rs, cur_lr, 191 | summary_writer=summary_writer, 192 | global_step=global_step + k) 193 | else: 194 | self.policy_ls[i].backward( 195 | self.sess, obs, acts, next_obs, dones, rs, cur_lr) 196 | 197 | def forward(self, obs, mode='act', stochastic=False): 198 | # get actions and policies 199 | if mode == 'explore': 200 | eps = self.eps_scheduler.get(1) 201 | action = [] 202 | qs_ls = [] 203 | for i in range(self.n_agent): 204 | qs = self.policy_ls[i].forward(self.sess, obs) # here we only have one agent, ori = obs[i] 205 | if (mode == 'explore') and (np.random.random() < eps): 206 | action.append(np.random.randint(self.n_a_ls[i])) 207 | else: 208 | if not stochastic: 209 | action.append(np.argmax(qs)) 210 | else: 211 | qs = qs / np.sum(qs) 212 | action.append(np.random.choice(np.arange(len(qs)), p=qs)) 213 | qs_ls.append(qs) 214 | return action, qs_ls 215 | 216 | def reset(self): 217 | # do nothing 218 | return 219 | 220 | def add_transition(self, obs, actions, rewards, next_obs, done): 221 | # add 
experiences to buffers accordingly 222 | if (self.reward_norm): 223 | rewards = rewards / self.reward_norm 224 | if self.reward_clip: 225 | rewards = np.clip(rewards, -self.reward_clip, self.reward_clip) 226 | for i in range(self.n_agent): 227 | self.trans_buffer_ls[i].add_transition(obs, actions, 228 | rewards, next_obs, done) -------------------------------------------------------------------------------- /IQL_conventional/agents/policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from .utils import * 4 | 5 | 6 | class QPolicy: 7 | def __init__(self, n_a, n_s, n_step, policy_name, agent_name): 8 | self.name = policy_name 9 | if agent_name is not None: 10 | # for multi-agent system 11 | self.name += '_' + str(agent_name) 12 | self.n_a = n_a 13 | self.n_s = n_s 14 | self.n_step = n_step 15 | 16 | def forward(self, ob, *_args, **_kwargs): 17 | raise NotImplementedError() 18 | 19 | def _build_fc_net(self, h, n_fc_ls): 20 | for i, n_fc in enumerate(n_fc_ls): 21 | h = fc(h, 'q_fc_%d' % i, n_fc) 22 | q = fc(h, 'q', self.n_a, act=lambda x: x) 23 | return tf.squeeze(q) 24 | 25 | def _build_net(self): 26 | raise NotImplementedError() 27 | 28 | def prepare_loss(self, max_grad_norm, gamma): 29 | self.A = tf.placeholder(tf.int32, [self.n_step]) 30 | self.S1 = tf.placeholder( 31 | tf.float32, [self.n_step, self.n_s]) 32 | self.R = tf.placeholder(tf.float32, [self.n_step]) 33 | self.DONE = tf.placeholder(tf.bool, [self.n_step]) 34 | A_sparse = tf.one_hot(self.A, self.n_a) 35 | 36 | # backward, calculate loss 37 | with tf.variable_scope(self.name + '_q', reuse=True): 38 | q0s = self._build_net(self.S) 39 | q0 = tf.reduce_sum(q0s * A_sparse, axis=1) 40 | with tf.variable_scope(self.name + '_q', reuse=True): 41 | q1s = self._build_net(self.S1) 42 | q1 = tf.reduce_max(q1s, axis=1) 43 | tq = tf.stop_gradient(tf.where(self.DONE, self.R, self.R + gamma * q1)) 44 | self.loss = tf.reduce_mean(tf.square(q0 - tq)) 45 | 46 | wts = tf.trainable_variables(scope=self.name) 47 | grads = tf.gradients(self.loss, wts) 48 | if max_grad_norm > 0: 49 | grads, self.grad_norm = tf.clip_by_global_norm( 50 | grads, max_grad_norm) 51 | self.lr = tf.placeholder(tf.float32, []) 52 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 53 | self._train = self.optimizer.apply_gradients(list(zip(grads, wts))) 54 | # monitor training 55 | if self.name.endswith('_0a'): 56 | summaries = [] 57 | summaries.append(tf.summary.scalar( 58 | 'train/%s_loss' % self.name, self.loss)) 59 | summaries.append(tf.summary.scalar('train/%s_q' % 60 | self.name, tf.reduce_mean(q0))) 61 | summaries.append(tf.summary.scalar('train/%s_tq' % 62 | self.name, tf.reduce_mean(tq))) 63 | summaries.append(tf.summary.scalar( 64 | 'train/%s_gradnorm' % self.name, self.grad_norm)) 65 | self.summary = tf.summary.merge(summaries) 66 | 67 | 68 | class DeepQPolicy(QPolicy): 69 | def __init__(self, n_s, n_a, n_step, n_fc0=128, n_fc=64, name=None): 70 | super().__init__(n_a, n_s, n_step, 'dqn', name) 71 | self.n_fc = n_fc 72 | self.n_fc0 = n_fc0 73 | self.S = tf.placeholder(tf.float32, [None, n_s]) 74 | with tf.variable_scope(self.name + '_q'): 75 | self.qvalues = self._build_net(self.S) 76 | 77 | def _build_net(self, S): 78 | h0 = fc(S[:, :self.n_s], 'q_fcw', self.n_fc0) 79 | h1 = fc(S[:, self.n_s:], 'q_fct', self.n_fc0 / 4) 80 | h = tf.concat([h0, h1], 1) 81 | return self._build_fc_net(h, [self.n_fc]) 82 | 83 | def forward(self, sess, ob): 84 | return 
sess.run(self.qvalues, {self.S: np.array([ob])}) 85 | 86 | def backward(self, sess, obs, acts, next_obs, dones, rs, cur_lr, 87 | summary_writer=None, global_step=None): 88 | # update networks 89 | if summary_writer is None: 90 | ops = self._train 91 | else: 92 | ops = [self.summary, self._train] 93 | outs = sess.run(ops, 94 | {self.S: obs, 95 | self.A: acts, 96 | self.S1: next_obs, 97 | self.DONE: dones, 98 | self.R: rs, 99 | self.lr: cur_lr}) 100 | if summary_writer is not None: 101 | summary_writer.add_summary(outs[0], global_step=global_step) -------------------------------------------------------------------------------- /IQL_conventional/agents/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | 5 | """ 6 | initializers 7 | """ 8 | DEFAULT_SCALE = np.sqrt(2) 9 | DEFAULT_MODE = 'fan_in' 10 | 11 | 12 | def ortho_init(scale=DEFAULT_SCALE, mode=None): 13 | def _ortho_init(shape, dtype, partition_info=None): 14 | # lasagne ortho init for tf 15 | shape = tuple(shape) 16 | if len(shape) == 2: # fc: in, out 17 | flat_shape = shape 18 | elif (len(shape) == 3) or (len(shape) == 4): # 1d/2dcnn: (in_h), in_w, in_c, out 19 | flat_shape = (np.prod(shape[:-1]), shape[-1]) 20 | a = np.random.standard_normal(flat_shape) 21 | u, _, v = np.linalg.svd(a, full_matrices=False) 22 | q = u if u.shape == flat_shape else v # pick the one with the correct shape 23 | q = q.reshape(shape) 24 | return (scale * q).astype(np.float32) 25 | return _ortho_init 26 | 27 | 28 | def norm_init(scale=DEFAULT_SCALE, mode=DEFAULT_MODE): 29 | def _norm_init(shape, dtype, partition_info=None): 30 | shape = tuple(shape) 31 | if len(shape) == 2: 32 | n_in = shape[0] 33 | elif (len(shape) == 3) or (len(shape) == 4): 34 | n_in = np.prod(shape[:-1]) 35 | a = np.random.standard_normal(shape) 36 | if mode == 'fan_in': 37 | n = n_in 38 | elif mode == 'fan_out': 39 | n = shape[-1] 40 | elif mode == 'fan_avg': 41 | n = 0.5 * (n_in + shape[-1]) 42 | return (scale * a / np.sqrt(n)).astype(np.float32) 43 | 44 | 45 | DEFAULT_METHOD = ortho_init 46 | """ 47 | layers 48 | """ 49 | 50 | 51 | def conv(x, scope, n_out, f_size, stride=1, pad='VALID', f_size_w=None, act=tf.nn.relu, 52 | conv_dim=1, init_scale=DEFAULT_SCALE, init_mode=None, init_method=DEFAULT_METHOD): 53 | with tf.variable_scope(scope): 54 | b = tf.get_variable( 55 | "b", [n_out], initializer=tf.constant_initializer(0.0)) 56 | if conv_dim == 1: 57 | n_c = x.shape[2].value 58 | w = tf.get_variable("w", [f_size, n_c, n_out], 59 | initializer=init_method(init_scale, init_mode)) 60 | z = tf.nn.conv1d(x, w, stride=stride, padding=pad) + b 61 | elif conv_dim == 2: 62 | n_c = x.shape[3].value 63 | if f_size_w is None: 64 | f_size_w = f_size 65 | w = tf.get_variable("w", [f_size, f_size_w, n_c, n_out], 66 | initializer=init_method(init_scale, init_mode)) 67 | z = tf.nn.conv2d( 68 | x, w, strides=[1, stride, stride, 1], padding=pad) + b 69 | return act(z) 70 | 71 | 72 | def fc(x, scope, n_out, act=tf.nn.relu, init_scale=DEFAULT_SCALE, 73 | init_mode=DEFAULT_MODE, init_method=DEFAULT_METHOD): 74 | with tf.variable_scope(scope): 75 | n_in = x.shape[1].value 76 | w = tf.get_variable("w", [n_in, n_out], 77 | initializer=init_method(init_scale, init_mode)) 78 | b = tf.get_variable( 79 | "b", [n_out], initializer=tf.constant_initializer(0.0)) 80 | z = tf.matmul(x, w) + b 81 | return act(z) 82 | 83 | 84 | def batch_to_seq(x): 85 | n_step = x.shape[0].value 86 | if len(x.shape) == 1: 87 | 
x = tf.expand_dims(x, -1) 88 | return tf.split(axis=0, num_or_size_splits=n_step, value=x) 89 | 90 | 91 | def seq_to_batch(x): 92 | return tf.concat(axis=0, values=x) 93 | 94 | 95 | def lstm(xs, dones, s, scope, init_scale=DEFAULT_SCALE, init_mode=DEFAULT_MODE, 96 | init_method=DEFAULT_METHOD): 97 | xs = batch_to_seq(xs) 98 | # need dones to reset states 99 | dones = batch_to_seq(dones) 100 | n_in = xs[0].shape[1].value 101 | n_out = s.shape[0] // 2 102 | with tf.variable_scope(scope): 103 | wx = tf.get_variable("wx", [n_in, n_out*4], 104 | initializer=init_method(init_scale, init_mode)) 105 | wh = tf.get_variable("wh", [n_out, n_out*4], 106 | initializer=init_method(init_scale, init_mode)) 107 | b = tf.get_variable( 108 | "b", [n_out*4], initializer=tf.constant_initializer(0.0)) 109 | s = tf.expand_dims(s, 0) 110 | c, h = tf.split(axis=1, num_or_size_splits=2, value=s) 111 | for ind, (x, done) in enumerate(zip(xs, dones)): 112 | c = c * (1-done) 113 | h = h * (1-done) 114 | z = tf.matmul(x, wx) + tf.matmul(h, wh) + b 115 | i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z) 116 | i = tf.nn.sigmoid(i) 117 | f = tf.nn.sigmoid(f) 118 | o = tf.nn.sigmoid(o) 119 | u = tf.tanh(u) 120 | c = f*c + i*u 121 | h = o*tf.tanh(c) 122 | xs[ind] = h 123 | s = tf.concat(axis=1, values=[c, h]) 124 | return seq_to_batch(xs), tf.squeeze(s) 125 | 126 | 127 | def test_layers(): 128 | print(tf.__version__) 129 | tf.reset_default_graph() 130 | sess = tf.Session() 131 | n_step = 5 132 | fc_x = tf.placeholder(tf.float32, [None, 10]) 133 | lstm_x = tf.placeholder(tf.float32, [n_step, 2]) 134 | lstm_done = tf.placeholder(tf.float32, [n_step]) 135 | lstm_s = tf.placeholder(tf.float32, [20]) 136 | conv1_x = tf.placeholder(tf.float32, [None, 8, 1]) 137 | conv2_x = tf.placeholder(tf.float32, [None, 8, 8, 1]) 138 | fc_out = fc(fc_x, 'fc', 10) 139 | lstm_out, lstm_ns = lstm(lstm_x, lstm_done, lstm_s, 'lstm') 140 | conv1_out = conv(conv1_x, 'conv1', 10, 4, conv_dim=1) 141 | conv2_out = conv(conv2_x, 'conv2', 10, 4, conv_dim=2) 142 | sess.run(tf.global_variables_initializer()) 143 | inputs = {'fc': {fc_x: np.random.randn(n_step, 10)}, 144 | 'lstm_done': {lstm_x: np.zeros((n_step, 2)), 145 | lstm_done: np.ones(n_step), 146 | lstm_s: np.random.randn(20)}, 147 | 'lstm': {lstm_x: np.random.randn(n_step, 2), 148 | lstm_done: np.zeros(n_step), 149 | lstm_s: np.random.randn(20)}, 150 | 'conv1': {conv1_x: np.random.randn(n_step, 8, 1)}, 151 | 'conv2': {conv2_x: np.random.randn(n_step, 8, 8, 1)}} 152 | outputs = {'fc': [fc_out], 'lstm_done': [lstm_out, lstm_ns], 153 | 'conv1': [conv1_out], 'conv2': [conv2_out], 154 | 'lstm': [lstm_out, lstm_ns]} 155 | for scope in ['fc', 'lstm', 'conv1', 'conv2']: 156 | print(scope) 157 | wts = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope) 158 | for wt in wts: 159 | wt_val = wt.eval(sess) 160 | print(wt_val.shape) 161 | print(np.mean(wt_val), np.std(wt_val), 162 | np.min(wt_val), np.max(wt_val)) 163 | print('=====================================') 164 | for x_name in inputs: 165 | print(x_name) 166 | out = sess.run(outputs[x_name], inputs[x_name]) 167 | if x_name.startswith('lstm'): 168 | print(out[0]) 169 | print(out[1]) 170 | else: 171 | print(out[0].shape) 172 | 173 | 174 | """ 175 | buffers 176 | """ 177 | 178 | 179 | class TransBuffer: 180 | def reset(self): 181 | self.buffer = [] 182 | 183 | @property 184 | def size(self): 185 | return len(self.buffer) 186 | 187 | def add_transition(self, ob, a, r, *_args, **_kwargs): 188 | raise NotImplementedError() 189 | 190 | 
def sample_transition(self, *_args, **_kwargs): 191 | raise NotImplementedError() 192 | 193 | 194 | class OnPolicyBuffer(TransBuffer): 195 | def __init__(self, gamma): 196 | self.gamma = gamma 197 | self.reset() 198 | 199 | def reset(self, done=False): 200 | # the done before each step is required 201 | self.obs = [] 202 | self.acts = [] 203 | self.rs = [] 204 | self.vs = [] 205 | self.dones = [done] 206 | 207 | def add_transition(self, ob, a, r, v, done): 208 | self.obs.append(ob) 209 | self.acts.append(a) 210 | self.rs.append(r) 211 | self.vs.append(v) 212 | self.dones.append(done) 213 | 214 | def _add_R_Adv(self, R): 215 | Rs = [] 216 | Advs = [] 217 | # use post-step dones here 218 | for r, v, done in zip(self.rs[::-1], self.vs[::-1], self.dones[:0:-1]): 219 | R = r + self.gamma * R * (1.-done) 220 | Adv = R - v 221 | Rs.append(R) 222 | Advs.append(Adv) 223 | Rs.reverse() 224 | Advs.reverse() 225 | self.Rs = Rs 226 | self.Advs = Advs 227 | 228 | def sample_transition(self, R, discrete=True): 229 | self._add_R_Adv(R) 230 | obs = np.array(self.obs, dtype=np.float32) 231 | if discrete: 232 | acts = np.array(self.acts, dtype=np.int32) 233 | else: 234 | acts = np.array(self.acts, dtype=np.float32) 235 | Rs = np.array(self.Rs, dtype=np.float32) 236 | Advs = np.array(self.Advs, dtype=np.float32) 237 | # use pre-step dones here 238 | dones = np.array(self.dones[:-1], dtype=np.bool) 239 | self.reset(self.dones[-1]) 240 | return obs, acts, dones, Rs, Advs 241 | 242 | 243 | class ReplayBuffer(TransBuffer): 244 | def __init__(self, buffer_size, batch_size): 245 | self.buffer_size = buffer_size 246 | self.batch_size = batch_size 247 | self.cum_size = 0 248 | self.buffer = [] 249 | 250 | def add_transition(self, ob, a, r, next_ob, done): 251 | experience = (ob, a, r, next_ob, done) 252 | if self.cum_size < self.buffer_size: 253 | self.buffer.append(experience) 254 | else: 255 | ind = int(self.cum_size % self.buffer_size) 256 | self.buffer[ind] = experience 257 | self.cum_size += 1 258 | 259 | def reset(self): 260 | self.buffer = [] 261 | self.cum_size = 0 262 | 263 | def sample_transition(self): 264 | # Randomly sample batch_size examples 265 | minibatch = random.sample(self.buffer, self.batch_size) 266 | state_batch = np.asarray([data[0] for data in minibatch]) 267 | action_batch = np.asarray([data[1] for data in minibatch]) 268 | next_state_batch = np.asarray([data[3] for data in minibatch]) 269 | reward_batch = np.asarray([data[2] for data in minibatch]) 270 | done_batch = np.asarray([data[4] for data in minibatch]) 271 | return state_batch, action_batch, next_state_batch, reward_batch, done_batch 272 | 273 | @property 274 | def size(self): 275 | return min(self.buffer_size, self.cum_size) 276 | 277 | 278 | """ 279 | util functions 280 | """ 281 | 282 | 283 | class Scheduler: 284 | def __init__(self, val_init, val_min=0, total_step=0, decay='linear'): 285 | self.val = val_init 286 | self.N = float(total_step) 287 | self.val_min = val_min 288 | self.decay = decay 289 | self.n = 0 290 | 291 | def get(self, n_step): 292 | self.n += n_step 293 | if self.decay == 'linear': 294 | return max(self.val_min, self.val * (1 - self.n / self.N)) 295 | else: 296 | return self.val 297 | 298 | 299 | if __name__ == '__main__': 300 | test_layers() 301 | -------------------------------------------------------------------------------- /IQL_conventional/config/config_ford.ini: -------------------------------------------------------------------------------- 1 | [MODEL_CONFIG] 2 | max_grad_norm = 40 3 | gamma = 0.99 4 | 
lr_init = 1e-4 5 | lr_decay = constant 6 | epsilon_init = 0.9 7 | epsilon_min = 0.1 8 | epsilon_decay = linear 9 | epsilon_ratio = 0.5 10 | num_fc = 128 11 | num_h = 64 12 | batch_size = 64 13 | buffer_size = 1e6 14 | reward_norm = 1.0 15 | reward_clip = 5.0 16 | 17 | 18 | [TRAIN_CONFIG] 19 | total_step = 1.5e6 20 | test_interval = 1e4 21 | log_interval = 10000 22 | 23 | 24 | [ENV_CONFIG] 25 | sample_time = 0.2 26 | episode_length = 765 27 | # 1 for True and 0 for False 28 | discrete = 1 29 | rendering = 0 30 | # objective is used to choose different reward functions 31 | objective = max_flow 32 | seed = 42 33 | test_seeds = 10000,20000,30000 -------------------------------------------------------------------------------- /IQL_conventional/config/config_gym.ini: -------------------------------------------------------------------------------- 1 | [MODEL_CONFIG] 2 | max_grad_norm = 40 3 | gamma = 0.99 4 | lr_init = 1e-4 5 | lr_decay = constant 6 | epsilon_init = 0.9 7 | epsilon_min = 0.1 8 | epsilon_decay = linear 9 | epsilon_ratio = 0.5 10 | num_fc = 128 11 | num_h = 64 12 | batch_size = 128 13 | buffer_size = 1e6 14 | reward_norm = 1.0 15 | reward_clip = 5.0 16 | 17 | [TRAIN_CONFIG] 18 | total_step = 1e6 19 | test_interval = 2e4 20 | log_interval = 1e4 21 | rendering = 0 22 | 23 | [ENV_CONFIG] 24 | scenario = Acrobot-v1 25 | seed = 0 -------------------------------------------------------------------------------- /IQL_conventional/env/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DongChen06/Python2Simulink/f7660e4d2ef656e74e0cde6837b2652c432ef288/IQL_conventional/env/__init__.py -------------------------------------------------------------------------------- /IQL_conventional/env/env_ford.py: -------------------------------------------------------------------------------- 1 | import matlab.engine 2 | import numpy as np 3 | 4 | import sys 5 | sys.path.append("../") 6 | 7 | import gym 8 | import argparse 9 | import configparser 10 | import time 11 | import random 12 | from collections import deque 13 | from gym.utils import seeding 14 | from env.utils import * 15 | 16 | 17 | discrete_resolution = 10 18 | 19 | 20 | def parse_args(): 21 | default_base_dir = '/home/derek/PycharmProjects/Python2Simulink/DDQN_Ford/Data' 22 | default_config_dir = 'DDQN_Ford\config\config_ford.ini' 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument('--base-dir', type=str, required=False, 25 | default=default_base_dir, help="experiment base dir") 26 | parser.add_argument('--config-dir', type=str, required=False, 27 | default=default_config_dir, help="experiment config dir") 28 | parser.add_argument('--is_training', type=str, required=False, 29 | default=True, help="True=train, False=evaluation") 30 | parser.add_argument('--test-mode', type=str, required=False, 31 | default='no_test', 32 | help="test mode during training", 33 | choices=['no_test', 'in_train_test', 'after_train_test', 'all_test']) 34 | 35 | args = parser.parse_args() 36 | return args 37 | 38 | 39 | class FordEnv(gym.Env): 40 | """ 41 | This is the environment for ford project which is built on Matlab and python. 
42 | 43 | Observation: 44 | Type: Box(7) 45 | Num Observation Min Max 46 | 0 VehicleSpd_mph 0 100 47 | 1 Engine_Spd_c__radps -1e4 1e4 48 | 2 MG1_Spd_radps -1e4 1e4 49 | 3 MG2_Spd_radps -1e4 1e4 50 | 4 Acc_pad 0 1 51 | 5 Dec_pad 0 1 52 | 6 WheelTqDemand_Nm -1e4 1e4 53 | 54 | Actions: 55 | Type: Discrete(discrete_resolution) 56 | Num Action 57 | 0 Push cart to the left 58 | 1 Push cart to the right 59 | """ 60 | 61 | def __init__(self, config, modelName='tracking', discrete=True, time_step=765): 62 | # Setup gym environment 63 | self.modelName = config.get('modelName') 64 | self.model_address = config.get('modelAddress') 65 | # file name of parameters, we need to run it first 66 | self.rendering = int(config.getfloat('rendering')) 67 | self.sample_time = config.getfloat('sample_time') 68 | self.episode_length = int(config.getfloat('episode_length')) 69 | self.seed(66) 70 | 71 | low = np.array([0, -1e4, -1e4, -1e4, 0, 0, -1e4]) 72 | high = np.array([100, 1e4, 1e4, 1e4, 1, 1, 1e4]) 73 | 74 | if discrete is True: 75 | self.action_space = gym.spaces.Discrete(discrete_resolution) 76 | self.observation_space = gym.spaces.Box( 77 | low, high, dtype=np.float32) 78 | else: 79 | self.action_space = gym.spaces.Box(np.array([-1, 0]), np.array([1, 1]), 80 | dtype=np.float32) 81 | self.observation_space = gym.spaces.Box( 82 | -high, high, dtype=np.float32) 83 | 84 | try: 85 | 86 | # initialize matlab and env 87 | self.engMAT = MatEng() 88 | 89 | except Exception as e: 90 | self.close() 91 | raise e 92 | 93 | def seed(self, seed=None): 94 | self.np_random, seed = seeding.np_random(seed) 95 | return [seed] 96 | 97 | def reset(self, ): 98 | self.steps = 0 99 | # reset the matlab model 100 | self.obs = self.engMAT.reset_env(self.rendering) 101 | 102 | def close(self): 103 | self.engMAT.disconnect() 104 | 105 | def render(self, ): 106 | self.engMAT.updateFig() 107 | 108 | def step(self, action): 109 | if action is not None: 110 | obs_new, self.last_reward, self.terminal_state, _ = self.engMAT.run_step( 111 | action) 112 | 113 | if self.rendering: 114 | self.render() 115 | 116 | if self.steps >= int(self.episode_length / self.sample_time) - 1: 117 | self.terminal_state = True 118 | 119 | self.steps += 1 120 | 121 | return obs_new, self.last_reward, self.terminal_state, _ 122 | 123 | 124 | if __name__ == "__main__": 125 | args = parse_args() 126 | base_dir = args.base_dir 127 | config_dir = args.config_dir 128 | config = configparser.ConfigParser() 129 | config.read(config_dir) 130 | epoch = 0 131 | # Example of using FordEnv with sample controller 132 | env = FordEnv(config['ENV_CONFIG']) 133 | action_size = env.action_space.n 134 | print('--------------') 135 | print("Simulation starting...") 136 | while True: 137 | env.reset() 138 | rewards = 0 139 | last_reward = 0 140 | while True: 141 | # print('--------------') 142 | # print("steps = ", env.steps) 143 | # print("rewards = ", last_reward) 144 | action = np.random.randint(action_size, size=1) 145 | # Take an action 146 | obs, last_reward, done, _ = env.step(4) # action[0], 4 147 | rewards += last_reward 148 | if done: 149 | break 150 | print('--------------') 151 | print("steps = ", env.steps) 152 | print("rewards = ", rewards) 153 | epoch += 1 154 | env.close() 155 | -------------------------------------------------------------------------------- /IQL_conventional/env/test.py: -------------------------------------------------------------------------------- 1 | import matlab.engine 2 | 3 | modelAddress = r'C:\Users\Dong\Google Drive\Dong 
Chen\Ford_proj\CX482_IVA_PDP_EncryptedSimulinkModel' 4 | modelName = 'Cx482_IVA_forPDP_wDriverModel_realtime_v27_ProtecModel' 5 | 6 | eng = matlab.engine.start_matlab() 7 | eng.cd(modelAddress, nargout=0) 8 | # eng.ls(nargout=0) 9 | eng.Run_Sim(nargout=0) 10 | try: 11 | print("Connected to Matlab") 12 | eng.eval("model = '{}'".format(modelName), nargout=0) 13 | eng.eval("load_system(model)", nargout=0) 14 | 15 | eng.set_param('{}/Optimal Controller/u1'.format(modelName), 16 | 'value', str(0), nargout=0) 17 | eng.set_param(modelName, 'SimulationCommand', 'start', 18 | 'SimulationCommand', 'pause', nargout=0) 19 | print('----') 20 | v_mph = eng.eval('v_mph') 21 | if(type(v_mph) == float): 22 | print(eng.eval('v_mph')) 23 | print(eng.eval('target_speed')) 24 | print(eng.eval('Fuel_kg')) 25 | print(eng.eval('Acc_pad')) 26 | else: 27 | target_speed = eng.eval('target_speed') 28 | print(v_mph[-1][0]) 29 | print(target_speed[-1][0]) 30 | Fuel_kg = eng.eval('Fuel_kg') 31 | print(Fuel_kg[-1][0]) 32 | Acc_pad = eng.eval('Acc_pad') 33 | print(Acc_pad[-1][0]) 34 | while (eng.get_param(modelName, 'SimulationStatus') != ('stopped' or 'terminating')): 35 | eng.set_param('{}/Optimal Controller/u1'.format(modelName), 36 | 'value', str(0), nargout=0) 37 | eng.set_param(modelName, 'SimulationCommand', 'continue', 38 | 'SimulationCommand', 'pause', nargout=0) 39 | print('----') 40 | v_mph = eng.eval('v_mph') 41 | if(type(v_mph) == float): 42 | print(eng.eval('v_mph')) 43 | print(eng.eval('target_speed')) 44 | print(eng.eval('Fuel_kg')) 45 | print(eng.eval('Acc_pad')) 46 | else: 47 | print(1) 48 | target_speed = eng.eval('target_speed') 49 | print(v_mph[-1][0]) 50 | print(target_speed[-1][0]) 51 | Fuel_kg = eng.eval('Fuel_kg') 52 | print(Fuel_kg[-1][0]) 53 | Acc_pad = eng.eval('Acc_pad') 54 | print(Acc_pad[-1][0]) 55 | 56 | except Exception as e: 57 | print("eng is closed") 58 | eng.set_param(modelName, 'SimulationCommand', 'stop', nargout=0) 59 | eng.quit() 60 | -------------------------------------------------------------------------------- /IQL_conventional/env/utils.py: -------------------------------------------------------------------------------- 1 | import matlab.engine 2 | import matplotlib.pyplot as plt 3 | import time 4 | 5 | 6 | class MatEng(): 7 | def __init__(self): 8 | self.model_address = r'C:\Users\Dong\Google Drive\Dong Chen\Ford_proj\CX482_IVA_PDP_EncryptedSimulinkModel' 9 | self.modelName = 'Cx482_IVA_forPDP_wDriverModel_realtime_v27_ProtecModel' 10 | self.eng = None 11 | 12 | def reset_env(self, rendering=False): 13 | self.terminal_state = False 14 | self.last_reward = 0 15 | self.t = 0 16 | self.tHist = [] 17 | self.x1Hist = [] 18 | self.x2Hist = [] 19 | # reuse last engine to save loading time 20 | if self.eng == None: 21 | print("Starting matlab") 22 | self.eng = matlab.engine.start_matlab() 23 | else: 24 | # reset matlab after one epoch 25 | self.eng.close("all", nargout=0) 26 | self.eng.bdclose("all", nargout=0) 27 | self.eng.clear("classes", nargout=0) 28 | if rendering: 29 | self.terminate_fig() 30 | 31 | # go to the model folder 32 | self.eng.cd(self.model_address, nargout=0) 33 | # run the simulation configurations (parameters) 34 | # eng.ls(nargout=0) 35 | self.eng.Run_Sim(nargout=0) 36 | # Load the model 37 | self.eng.eval("model = '{}'".format(self.modelName), nargout=0) 38 | self.eng.eval("load_system(model)", nargout=0) 39 | 40 | self.setControlAction(0) 41 | print("Initialized Model") 42 | # enable fast restart 43 | self.eng.set_param(self.modelName, 'FastRestart', 'on', 
nargout=0) 44 | # Start Simulation and then Instantly pause 45 | self.eng.set_param(self.modelName, 'SimulationCommand', 46 | 'start', 'SimulationCommand', 'pause', nargout=0) 47 | obs = self.getObservations() 48 | if rendering: 49 | # initialize plot 50 | self.initialize_plot() 51 | return obs 52 | 53 | def setControlAction(self, u1): 54 | # set value of control action 55 | self.eng.set_param( 56 | '{}/Optimal Controller/u1'.format(self.modelName), 'value', str(u1), nargout=0) 57 | 58 | def getObservations(self, ): 59 | # get system Output and Time History 60 | tHist = self.eng.eval('tHist') 61 | v_mph = self.eng.eval('v_mph') 62 | engine_spd = self.eng.eval('engine_spd') 63 | MG1_spd = self.eng.eval('MG1_spd') 64 | MG2_spd = self.eng.eval('MG2_spd') 65 | Acc_pad = self.eng.eval('Acc_pad') 66 | Dec_pad = self.eng.eval('Dec_pad') 67 | WheelTD = self.eng.eval('WheelTD') 68 | Fuel_kg = self.eng.eval('Fuel_kg') 69 | SOC_C = self.eng.eval('SOC_C') 70 | target_speed = self.eng.eval('target_speed') 71 | eng_ori = self.eng.eval('eng_ori') 72 | eng_new = self.eng.eval('eng_new') 73 | if(type(v_mph) == float): 74 | self.Fuel_kg = Fuel_kg 75 | self.SOC_C = SOC_C 76 | self.target_speed = target_speed 77 | # for plotting use 78 | self.tHist.append(tHist) 79 | # self.x1Hist.append(eng_ori) 80 | # self.x2Hist.append(eng_new) 81 | self.x1Hist.append(v_mph) 82 | self.x2Hist.append(target_speed * 0.621371192237334) 83 | # self.x1Hist.append(int(Fuel_kg) * 1000) 84 | return (v_mph, engine_spd, MG1_spd, MG2_spd, Acc_pad, Dec_pad, WheelTD) 85 | else: 86 | self.Fuel_kg = Fuel_kg[-1][0] 87 | self.SOC_C = SOC_C[-1][0] 88 | self.target_speed = target_speed[-1][0] 89 | # for plotting use 90 | self.tHist.append(tHist[-1][0]) 91 | # self.x1Hist.append(eng_ori[-1][0]) 92 | # self.x2Hist.append(eng_new[-1][0]) # target_speed[-1][0] * 0.621371192237334 93 | self.x1Hist.append(v_mph[-1][0]) 94 | self.x2Hist.append(target_speed[-1][0] * 0.621371192237334) 95 | # self.x1Hist.append(int(Fuel_kg[-1][0]) * 1000) 96 | return (v_mph[-1][0], engine_spd[-1][0], MG1_spd[-1][0], MG2_spd[-1][0], Acc_pad[-1][0], Dec_pad[-1][0], WheelTD[-1][0]) 97 | 98 | def run_step(self, action): 99 | u1 = -50 + (action + 1) * 10 100 | # u1 = -200 101 | # if u1 < 0: 102 | # u1 = 0 103 | # u1 = -10 + (action + 1) * 2 104 | # Set the Control Action 105 | self.setControlAction(u1) 106 | # start = time.time() 107 | # Pause the Simulation for each timestep 108 | # self.eng.workspace['Pause_time'] = self.t + 0.3 109 | self.eng.set_param(self.modelName, 'SimulationCommand', 110 | 'StepForward', nargout=0) 111 | # tHist = self.eng.eval('tHist') 112 | # if type(tHist) == float: 113 | # self.t = tHist 114 | # else: 115 | # self.t = tHist[-1][0] 116 | # print(self.t) 117 | # end = time.time() 118 | # print(end - start) 119 | # start = time.time() 120 | obs = self.getObservations() 121 | # end = time.time() 122 | # print(end - start) 123 | 124 | # compute the reward 125 | self.reward_fn() 126 | 127 | # if (self.eng.get_param(self.modelName, 'SimulationStatus') == ('stopped' or 'terminating')): 128 | # print(True) 129 | # self.terminal_state = True 130 | 131 | return obs, self.last_reward, self.terminal_state, True 132 | 133 | def reward_fn(self,): 134 | # reward = fuel_consumption + speed_tracking + SOC 135 | self.last_reward = self.Fuel_kg + self.SOC_C + self.target_speed 136 | 137 | def disconnect(self,): 138 | print("eng is closed") 139 | self.eng.set_param( 140 | self.modelName, 'SimulationCommand', 'stop', nargout=0) 141 | self.eng.quit() 142 | 143 | def 
initialize_plot(self, ): 144 | # Initialize the graph 145 | self.fig = plt.figure() 146 | self.fig1, = plt.plot(self.tHist, self.x1Hist, 147 | color='red', linewidth=1) 148 | self.fig2, = plt.plot(self.tHist, self.x2Hist, color='k', linewidth=1) 149 | # for speed tracking 150 | plt.xlim(0, 800) 151 | plt.ylim(-10, 100) 152 | # engine torque 153 | # plt.xlim(0, 800) 154 | # plt.ylim(-50, 400) 155 | # for fuel consumption 156 | # plt.xlim(0, 800) 157 | # plt.ylim(0, 3) 158 | plt.ylabel("Output") 159 | plt.xlabel("Time(s)") 160 | # plt.legend('x1', 'x2', loc='upper right') 161 | plt.title("System Response") 162 | 163 | def updateFig(self, ): 164 | # Update the Graph 165 | self.fig1.set_xdata(self.tHist) 166 | self.fig1.set_ydata(self.x1Hist) 167 | self.fig2.set_xdata(self.tHist) 168 | self.fig2.set_ydata(self.x2Hist) 169 | plt.ion() 170 | plt.pause(0.001) 171 | plt.show() 172 | 173 | def terminate_fig(self,): 174 | plt.close(self.fig) 175 | # plt.close(self.fig2) 176 | -------------------------------------------------------------------------------- /IQL_conventional/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | 4 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 6 | 7 | import argparse 8 | import configparser 9 | import gym 10 | import tensorflow as tf 11 | import tensorflow.contrib.layers as layers 12 | from env.env_ford import FordEnv 13 | from utils import * 14 | from trainer import * 15 | from agents.models import IQL 16 | 17 | 18 | def parse_args(): 19 | default_base_dir = 'Data' 20 | default_config_dir = 'config/config_gym.ini' 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--base-dir', type=str, required=False, 23 | default=default_base_dir, help="experiment base dir") 24 | parser.add_argument('--config-dir', type=str, required=False, 25 | default=default_config_dir, help="experiment config dir") 26 | parser.add_argument('--is_training', type=str, required=False, 27 | default=True, help="True=train, False=evaluation") 28 | parser.add_argument('--test-mode', type=str, required=False, 29 | default='no_test', 30 | help="test mode during training", 31 | choices=['no_test', 'in_train_test', 'after_train_test', 'all_test']) 32 | 33 | args = parser.parse_args() 34 | return args 35 | 36 | 37 | def train_fn(args): 38 | base_dir = args.base_dir 39 | dirs = init_dir(base_dir) 40 | init_log(dirs['log']) 41 | config_dir = args.config_dir 42 | # copy_file(config_dir, dirs['data']) 43 | config = configparser.ConfigParser() 44 | config.read(config_dir) 45 | 46 | # test during training, test after training 47 | in_test, post_test = init_test_flag(args.test_mode) 48 | 49 | # Initialize environment 50 | print("Initializing environment") 51 | # env = FordEnv(config['ENV_CONFIG']) 52 | env = gym.make("CartPole-v0") 53 | n_s = env.observation_space.shape 54 | logging.info('Training: s dim: %d, a dim %d' % 55 | (n_s[0], env.action_space.n)) 56 | n_s_ls = [n_s[0]] 57 | n_a_ls = [env.action_space.n] 58 | # init step counter 59 | total_step = int(config.getfloat('TRAIN_CONFIG', 'total_step')) 60 | test_step = int(config.getfloat('TRAIN_CONFIG', 'test_interval')) 61 | log_step = int(config.getfloat('TRAIN_CONFIG', 'log_interval')) 62 | global_counter = Counter(total_step, test_step, log_step) 63 | 64 | seed = config.getint('ENV_CONFIG', 'seed') 65 | 66 | model = IQL(n_s_ls, n_a_ls, total_step, config['MODEL_CONFIG'], 67 | seed=0, 
model_type='dqn') 68 | 69 | summary_writer = tf.summary.FileWriter(dirs['log']) 70 | trainer = Trainer(env, model, global_counter, 71 | summary_writer, in_test, output_path=dirs['data']) 72 | trainer.run() 73 | 74 | # post-training test 75 | if post_test: 76 | tester = Tester(env, model, global_counter, 77 | summary_writer, dirs['data']) 78 | tester.run_offline(dirs['data']) 79 | 80 | # save model 81 | final_step = global_counter.cur_step 82 | logging.info('Training: save final model at step %d ...' % final_step) 83 | model.save(dirs['model'], final_step) 84 | 85 | 86 | def evaluate_fn(args): 87 | pass 88 | 89 | 90 | if __name__ == '__main__': 91 | args = parse_args() 92 | if args.is_training is True: 93 | train_fn(args) 94 | else: 95 | evaluate_fn(args) 96 | -------------------------------------------------------------------------------- /IQL_conventional/trainer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import tensorflow as tf 4 | import pandas as pd 5 | import time 6 | 7 | 8 | class Trainer(): 9 | def __init__(self, env, model, global_counter, summary_writer, run_test, output_path=None, rendering=False): 10 | self.cur_step = 0 11 | self.rendering = rendering 12 | self.global_counter = global_counter 13 | self.env = env 14 | self.agent = 'iql' # TODO 15 | self.model = model 16 | self.sess = self.model.sess 17 | self.n_step = self.model.n_step # bacth size 18 | self.summary_writer = summary_writer 19 | self.run_test = run_test # ToDo 20 | # assert self.env.T % self.n_step == 0 21 | self.data = [] 22 | self.output_path = output_path 23 | if run_test: 24 | self.test_num = self.env.test_num 25 | logging.info('Testing: total test num: %d' % self.test_num) 26 | self._init_summary() 27 | 28 | def _init_summary(self): 29 | self.train_reward = tf.placeholder(tf.float32, []) 30 | self.train_summary = tf.summary.scalar( 31 | 'train_reward', self.train_reward) 32 | self.test_reward = tf.placeholder(tf.float32, []) 33 | self.test_summary = tf.summary.scalar('test_reward', self.test_reward) 34 | 35 | def _add_summary(self, reward, global_step, is_train=True): 36 | if is_train: 37 | summ = self.sess.run(self.train_summary, { 38 | self.train_reward: reward}) 39 | else: 40 | summ = self.sess.run(self.test_summary, {self.test_reward: reward}) 41 | self.summary_writer.add_summary(summ, global_step=global_step) 42 | 43 | def take_action(self, prev_ob, prev_done): 44 | # take actions for a batch size 45 | ob = prev_ob 46 | done = prev_done 47 | rewards = 0 # ori = [] 48 | for _ in range(self.n_step): 49 | if self.agent.endswith('a2c'): 50 | policy, value = self.model.forward(ob, done) 51 | action = [] 52 | for pi in policy: 53 | action.append(np.random.choice(np.arange(len(pi)), p=pi)) 54 | else: 55 | action, policy = self.model.forward(ob, mode='explore') 56 | next_ob, reward, done, _ = self.env.step(action[0]) # ori = action, global_reward 57 | if self.rendering: 58 | self.env.render() 59 | rewards += reward 60 | global_step = self.global_counter.next() 61 | self.cur_step += 1 62 | self.model.add_transition(ob, action, reward, next_ob, done) 63 | if done: 64 | break 65 | ob = next_ob 66 | return ob, done, _, rewards 67 | 68 | def evaluate(self, test_ind, demo=False, policy_type='default'): 69 | # test function 70 | ob = self.env.reset(gui=demo, test_ind=test_ind) 71 | # note this done is pre-decision to reset LSTM states! 
72 | done = True 73 | self.model.reset() 74 | rewards = [] 75 | while True: 76 | if self.agent == 'greedy': 77 | action = self.model.forward(ob) 78 | elif self.agent.endswith('a2c'): 79 | # policy-based on-poicy learning 80 | policy = self.model.forward(ob, done, 'p') 81 | if self.agent == 'ma2c': 82 | self.env.update_fingerprint(policy) 83 | if self.agent == 'a2c': 84 | if policy_type != 'deterministic': 85 | action = np.random.choice( 86 | np.arange(len(policy)), p=policy) 87 | else: 88 | action = np.argmax(np.array(policy)) 89 | else: 90 | action = [] 91 | for pi in policy: 92 | if policy_type != 'deterministic': 93 | action.append(np.random.choice( 94 | np.arange(len(pi)), p=pi)) 95 | else: 96 | action.append(np.argmax(np.array(pi))) 97 | else: 98 | # value-based off-policy learning 99 | if policy_type != 'stochastic': 100 | action, _ = self.model.forward(ob) 101 | else: 102 | action, _ = self.model.forward(ob, stochastic=True) 103 | next_ob, reward, done, global_reward = self.env.step(action) 104 | rewards.append(global_reward) 105 | if done: 106 | break 107 | ob = next_ob 108 | mean_reward = np.mean(np.array(rewards)) 109 | std_reward = np.std(np.array(rewards)) 110 | return mean_reward, std_reward 111 | 112 | def run(self): 113 | while not self.global_counter.should_stop(): 114 | # test or not 115 | if self.run_test and self.global_counter.should_test(): 116 | rewards = [] 117 | global_step = self.global_counter.cur_step 118 | for test_ind in range(self.test_num): 119 | mean_reward, std_reward = self.evaluate(test_ind) 120 | self.env.terminate() 121 | rewards.append(mean_reward) 122 | log = {'agent': self.agent, 123 | 'step': global_step, 124 | 'test_id': test_ind, 125 | 'avg_reward': mean_reward, 126 | 'std_reward': std_reward} 127 | self.data.append(log) 128 | avg_reward = np.mean(np.array(rewards)) 129 | self._add_summary(avg_reward, global_step, is_train=False) 130 | logging.info('Testing: global step %d, avg R: %.2f' % 131 | (global_step, avg_reward)) 132 | 133 | # train 134 | ob = self.env.reset() 135 | done = True 136 | self.model.reset() 137 | self.cur_step = 0 138 | rewards = [] 139 | while True: 140 | ob, done, _, cur_rewards = self.take_action(ob, done) 141 | rewards.append(cur_rewards) # ori 142 | global_step = self.global_counter.cur_step 143 | # update network for each bach size steps 144 | self.model.backward(self.summary_writer, global_step) 145 | # termination 146 | if done: 147 | break 148 | rewards = np.array(rewards) # reward for one epoch 149 | mean_reward = np.mean(rewards) 150 | std_reward = np.std(rewards) 151 | log = {'agent': self.agent, 152 | 'step': global_step, 153 | 'test_id': -1, 154 | 'avg_reward': mean_reward, 155 | 'std_reward': std_reward} 156 | self.data.append(log) 157 | self._add_summary(mean_reward, global_step) 158 | self.summary_writer.flush() 159 | df = pd.DataFrame(self.data) # data: dictionary 160 | df.to_csv(self.output_path + 'train_reward.csv') 161 | 162 | 163 | class Tester(Trainer): 164 | def __init__(self, env, model, global_counter, summary_writer, output_path): 165 | super().__init__(env, model, global_counter, summary_writer) 166 | self.env.train_mode = False 167 | self.test_num = self.env.test_num 168 | self.output_path = output_path 169 | self.data = [] 170 | logging.info('Testing: total test num: %d' % self.test_num) 171 | 172 | def _init_summary(self): 173 | self.reward = tf.placeholder(tf.float32, []) 174 | self.summary = tf.summary.scalar('test_reward', self.reward) 175 | 176 | def run_offline(self): 177 | # enable traffic 
measurments for offline test 178 | is_record = True 179 | record_stats = False 180 | self.env.cur_episode = 0 181 | self.env.init_data(is_record, record_stats, self.output_path) 182 | rewards = [] 183 | for test_ind in range(self.test_num): 184 | rewards.append(self.evaluate(test_ind)) 185 | self.env.terminate() 186 | time.sleep(2) 187 | self.env.collect_tripinfo() 188 | avg_reward = np.mean(np.array(rewards)) 189 | logging.info('Offline testing: avg R: %.2f' % avg_reward) 190 | self.env.output_data() 191 | 192 | def run_online(self, coord): 193 | self.env.cur_episode = 0 194 | while not coord.should_stop(): 195 | time.sleep(30) 196 | if self.global_counter.should_test(): 197 | rewards = [] 198 | global_step = self.global_counter.cur_step 199 | for test_ind in range(self.test_num): 200 | cur_reward = self.evaluate(test_ind) 201 | self.env.terminate() 202 | rewards.append(cur_reward) 203 | log = {'agent': self.agent, 204 | 'step': global_step, 205 | 'test_id': test_ind, 206 | 'reward': cur_reward} 207 | self.data.append(log) 208 | avg_reward = np.mean(np.array(rewards)) 209 | self._add_summary(avg_reward, global_step) 210 | logging.info('Testing: global step %d, avg R: %.2f' % 211 | (global_step, avg_reward)) 212 | # self.global_counter.update_test(avg_reward) 213 | df = pd.DataFrame(self.data) 214 | df.to_csv(self.output_path + 'train_reward.csv') 215 | 216 | 217 | class Evaluator(Tester): 218 | def __init__(self, env, model, output_path, demo=False, policy_type='default'): 219 | self.env = env 220 | self.model = model 221 | self.agent = self.env.agent 222 | self.env.train_mode = False 223 | self.test_num = self.env.test_num 224 | self.output_path = output_path 225 | self.demo = demo 226 | self.policy_type = policy_type 227 | 228 | def run(self): 229 | is_record = True 230 | record_stats = False 231 | self.env.cur_episode = 0 232 | self.env.init_data(is_record, record_stats, self.output_path) 233 | time.sleep(1) 234 | for test_ind in range(self.test_num): 235 | reward, _ = self.evaluate( 236 | test_ind, demo=self.demo, policy_type=self.policy_type) 237 | self.env.terminate() 238 | logging.info('test %i, avg reward %.2f' % (test_ind, reward)) 239 | time.sleep(2) 240 | self.env.collect_tripinfo() 241 | self.env.output_data() -------------------------------------------------------------------------------- /IQL_conventional/utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | import numpy as np 4 | import tensorflow as tf 5 | import time 6 | import os 7 | import pandas as pd 8 | import subprocess 9 | 10 | 11 | def check_dir(cur_dir): 12 | if not os.path.exists(cur_dir): 13 | return False 14 | return True 15 | 16 | 17 | def copy_file(src_dir, tar_dir): 18 | cmd = 'cp %s %s' % (src_dir, tar_dir) 19 | subprocess.check_call(cmd, shell=True) 20 | 21 | 22 | def find_file(cur_dir, suffix='.ini'): 23 | for file in os.listdir(cur_dir): 24 | if file.endswith(suffix): 25 | return cur_dir + '/' + file 26 | logging.error('Cannot find %s file' % suffix) 27 | return None 28 | 29 | 30 | def init_dir(base_dir, pathes=['log', 'data', 'model']): 31 | if not os.path.exists(base_dir): 32 | os.mkdir(base_dir) 33 | dirs = {} 34 | for path in pathes: 35 | cur_dir = base_dir + "/%s/" % path 36 | if not os.path.exists(cur_dir): 37 | os.mkdir(cur_dir) 38 | dirs[path] = cur_dir 39 | return dirs 40 | 41 | 42 | def init_log(log_dir): 43 | logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', 44 | level=logging.INFO, 45 | 
handlers=[ 46 | logging.FileHandler('%s/%d.log' % 47 | (log_dir, time.time())), 48 | logging.StreamHandler() 49 | ]) 50 | 51 | 52 | def init_test_flag(test_mode): 53 | if test_mode == 'no_test': 54 | return False, False 55 | if test_mode == 'in_train_test': 56 | return True, False 57 | if test_mode == 'after_train_test': 58 | return False, True 59 | if test_mode == 'all_test': 60 | return True, True 61 | return False, False 62 | 63 | 64 | def plot_train(data_dirs, labels): 65 | pass 66 | 67 | 68 | def plot_evaluation(data_dirs, labels): 69 | pass 70 | 71 | 72 | class Counter: 73 | def __init__(self, total_step, test_step, log_step): 74 | self.counter = itertools.count(1) 75 | self.cur_step = 0 76 | self.cur_test_step = 0 77 | self.total_step = total_step 78 | self.test_step = test_step 79 | self.log_step = log_step 80 | self.stop = False 81 | # self.init_test = True 82 | 83 | def next(self): 84 | self.cur_step = next(self.counter) 85 | return self.cur_step 86 | 87 | def should_test(self): 88 | # if self.init_test: 89 | # self.init_test = False 90 | # return True 91 | test = False 92 | if (self.cur_step - self.cur_test_step) >= self.test_step: 93 | test = True 94 | self.cur_test_step = self.cur_step 95 | return test 96 | 97 | # def update_test(self, reward): 98 | # if self.prev_reward is not None: 99 | # if abs(self.prev_reward - reward) <= self.delta_reward: 100 | # self.stop = True 101 | # self.prev_reward = reward 102 | 103 | def should_log(self): 104 | return (self.cur_step % self.log_step == 0) 105 | 106 | def should_stop(self): 107 | if self.cur_step >= self.total_step: 108 | return True 109 | return self.stop 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Python2Simulink 2 | =============== 3 | #### Robotics and Intelligent Vehicle Automation Lab (RIVAL) 4 | - Built by Dong Chen 5 | - Started on Jan.11, 2020 6 | 7 | A bridge between Python and Simulink. 8 | This repository builds a bridge between Python and Simulink. At each time step, the Python script sends a command (input) to the Simulink model; the Simulink model then executes one step and returns the results to the Python script for decision making. 9 | 10 | ## Install MATLAB Engine API for Python 11 | ------- 12 | 13 | Install the MATLAB Engine API by following the instructions in [Installation](https://www.mathworks.com/help/matlab/matlab_external/install-the-matlab-engine-for-python.html). 14 | 15 | ## Applications 16 | ------- 17 | 18 | 1. plant example 19 | 20 | In this example, we build a PI controller to regulate a second-order system to a reference value (10 here). The Python script computes the control input and sends the value to the Simulink model. The Simulink model then runs for one step and returns the output value to the Python script. 21 | 22 |
Fig.1 Regulation result of the plant example (`Docs/plant.gif`)
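For reference, the control law in this plant example is a standard discrete PI update computed on the Python side. The sketch below is illustrative only: the class name, gains, and sample time are assumptions and are not taken from `examples/plant_ex/plant.py`; only the setpoint of 10 comes from the description above.

```python
class PIController:
    """Minimal discrete PI controller (illustrative; gains and sample time are assumed)."""

    def __init__(self, kp=1.0, ki=0.5, dt=0.1, setpoint=10.0):
        self.kp, self.ki, self.dt, self.setpoint = kp, ki, dt, setpoint
        self.integral = 0.0

    def step(self, y):
        error = self.setpoint - y           # regulation error w.r.t. the reference value
        self.integral += error * self.dt    # accumulate the integral term
        return self.kp * error + self.ki * self.integral  # control input sent to Simulink
```

At each time step, the Python side would call `step()` with the latest plant output returned by the Simulink model and write the returned control input back into the model.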
Fig.2 Simulink model (`Docs/simulink_model.png`)
Fig.3 Tracking result, x1 (blue) and xd1 (orange) (`Docs/tracking.gif`)
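The stepping pattern that both the examples and `IQL_conventional/env/utils.py` rely on is the same throughout: load the model through the MATLAB Engine, write the control input to a block with `set_param`, advance the simulation one step, and read the logged outputs back with `eval`. The snippet below is a minimal sketch of that loop; the model name `tracking`, the block path `tracking/u`, and the workspace variable `yout` are placeholders for whatever your Simulink model actually exposes.

```python
import matlab.engine

eng = matlab.engine.start_matlab()
eng.eval("model = 'tracking'", nargout=0)        # placeholder model name
eng.eval("load_system(model)", nargout=0)

# Start the simulation and pause immediately so it can be stepped from Python.
eng.set_param('tracking', 'SimulationCommand', 'start',
              'SimulationCommand', 'pause', nargout=0)

while eng.get_param('tracking', 'SimulationStatus') != 'stopped':
    # Write the control input into a constant block (placeholder block path).
    eng.set_param('tracking/u', 'value', str(0.0), nargout=0)
    # Advance the model by one step, then read an output logged to the MATLAB workspace.
    eng.set_param('tracking', 'SimulationCommand', 'StepForward', nargout=0)
    y = eng.eval('yout')                         # placeholder workspace variable

eng.quit()
```

The `MatEng` helper in `IQL_conventional/env/utils.py` wraps this same pattern behind `reset_env()` and `run_step()`, adding live plotting and Simulink's fast-restart mode on top.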