├── Closed_loop_training.py
├── Environment_simulator.py
├── Human_exoskeleton_model.py
└── README.md

/Closed_loop_training.py:
--------------------------------------------------------------------------------
"""Pseudocode for PPO RL training."""

import torch
import torch.optim as optim
import numpy as np
from Environment_simulator import Environment

# Trajectory fields for the exoskeleton
TRAJECTORY_FIELDS_EXO = [
    'state',    # State of the agent.
    'action',   # Action taken by the agent.
    'reward',   # Reward for the agent after taking the step.
    'value',    # Value predicted by the value network.
    'logits',   # Logits of the action network.
]

# Trajectory fields for the human
TRAJECTORY_FIELDS_HUMAN = [
    'state',    # State of the agent.
    'action',   # Action taken by the agent.
    'reward',   # Reward for the agent after taking the step.
    'value',    # Value predicted by the value network.
    'logits',   # Logits of the action network.
]

# Trajectory fields for the muscles
TRAJECTORY_FIELDS_MUSCLES = [
    'JtA',      # Muscle-related joint torques (network input).
    'tau_des',  # Desired joint torques.
    'L',        # Linear coefficient of the linearized activation-to-torque map.
    'b',        # Constant offset of the linearized activation-to-torque map.
]

MAX_ITERATION = 100000
REPLAY_BUFFER_SIZE = 30000
NUM_AGENT = 16


class PPO(object):
    def __init__(self):
        # Create multiple simulation environments.
        self.env = EnvManager(Environment, NUM_AGENT)

        # Build models for the exoskeleton, the human, and the human muscles.
        # State/action/muscle dimensions are assumed to be provided by the environment (details elided).
        self.exo_model = SimulationExoNN(self.num_state, self.num_action)
        self.human_model = SimulationHumanNN(self.num_human_state, self.num_human_action)
        self.muscle_model = MuscleNN(self.MuscleRelatedDofs, self.num_human_dofs, self.num_muscles)

        # Create replay buffers for the exoskeleton, the human, and the human muscles.
        self.replay_exo_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, TRAJECTORY_FIELDS_EXO)
        self.replay_human_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, TRAJECTORY_FIELDS_HUMAN)
        self.replay_muscle_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, TRAJECTORY_FIELDS_MUSCLES)

        self.batch_size = 128
        self.learning_rate = 1e-4

        # Create one optimizer per model.
        self.optimizer_exo = optim.Adam(self.exo_model.parameters(), lr=self.learning_rate)
        self.optimizer_human = optim.Adam(self.human_model.parameters(), lr=self.learning_rate)
        self.optimizer_muscle = optim.Adam(self.muscle_model.parameters(), lr=self.learning_rate)

        self.episodes = [None] * NUM_AGENT
        for j in range(NUM_AGENT):
            self.episodes[j] = EpisodeBuffer()

        self.gamma = 0.99   # discount factor
        self.lb = 0.99      # GAE lambda
        self.w_entropy = -0.001

    def ComputeTDandGAE(self):
        self.replay_exo_buffer.Clear()
        self.replay_human_buffer.Clear()
        self.replay_muscle_buffer.Clear()

        for epi in self.total_episodes:
            data = epi.GetData()

            states_exo, actions_exo, rewards_exo, values_exo, logprobs_exo, \
                states_human, actions_human, rewards_human, values_human, logprobs_human = zip(*data)

            # The computation of the Temporal Difference (TD) targets and the
            # Generalized Advantage Estimation (GAE) is omitted here; we follow
            # the common approach described in the PPO paper
            # (https://arxiv.org/pdf/1707.06347.pdf). A standard implementation
            # is sketched in ComputeGAE below.

            for i in range(len(data)):
                self.replay_exo_buffer.Push(states_exo[i], actions_exo[i], logprobs_exo[i], TD_exo[i], advantages_exo[i])
                self.replay_human_buffer.Push(states_human[i], actions_human[i], logprobs_human[i], TD_human[i], advantages_human[i])

        muscle_tuples = self.env.GetMuscleTuples()
        for i in range(len(muscle_tuples)):
            self.replay_muscle_buffer.Push(muscle_tuples[i][0], muscle_tuples[i][1], muscle_tuples[i][2], muscle_tuples[i][3])
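
    def ComputeGAE(self, rewards, values):
        # Illustrative sketch only (this helper is not part of the original
        # pseudocode): one standard way to compute the TD targets and GAE
        # advantages referenced above, using the discount factor self.gamma
        # and the GAE parameter self.lb. `rewards` and `values` are per-step
        # arrays for one finished episode.
        advantages = np.zeros(len(rewards))
        TD = np.zeros(len(rewards))
        ad_t = 0.0
        for t in reversed(range(len(rewards))):
            # Value after the last step of a finished episode is taken as zero.
            next_value = values[t + 1] if t + 1 < len(rewards) else 0.0
            delta = rewards[t] + self.gamma * next_value - values[t]
            ad_t = delta + self.gamma * self.lb * ad_t
            advantages[t] = ad_t
            TD[t] = values[t] + ad_t
        return TD, advantages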

    def GenerateTransitions(self):
        self.total_episodes = []
        rewards_exo = [None] * NUM_AGENT
        rewards_human = [None] * NUM_AGENT

        local_step = 0
        # Collect transitions until enough steps have been gathered.
        while local_step < REPLAY_BUFFER_SIZE:
            # Get the observations/states of the exoskeleton and the human from the simulators.
            states_exo = self.env.GetExoObservations()
            states_human = self.env.GetHumanObservations()

            # Predict and apply the action of the human.
            action_dist_human, values_human = self.human_model(states_human)
            actions_human = action_dist_human.sample()
            logprobs_human = action_dist_human.log_prob(actions_human)
            self.env.SetHumanActions(actions_human)

            # Predict and apply the action of the exoskeleton.
            action_dist_exo, values_exo = self.exo_model(states_exo)
            actions_exo = action_dist_exo.sample()
            logprobs_exo = action_dist_exo.log_prob(actions_exo)
            self.env.SetExoActions(actions_exo)

            # Predict and apply the muscle activations.
            muscle_torque = self.env.GetMuscleTorques()
            desired_torque = self.env.GetDesiredTorquesHuman()
            activations = self.muscle_model(muscle_torque, desired_torque)
            self.env.SetActivationLevels(activations)
            self.env.Steps()

            for j in range(NUM_AGENT):
                # Check whether the episode of the j-th agent has ended.
                if not self.env.IsEndOfEpisode(j):
                    # Obtain the rewards after taking the actions in the simulator.
                    rewards_exo[j] = self.env.GetExoReward(j)
                    rewards_human[j] = self.env.GetHumanReward(j)

                    self.episodes[j].Push(states_exo[j], actions_exo[j], rewards_exo[j], values_exo[j], logprobs_exo[j],
                                          states_human[j], actions_human[j], rewards_human[j], values_human[j], logprobs_human[j])

                    local_step += 1
                else:
                    self.total_episodes.append(self.episodes[j])
                    self.episodes[j] = EpisodeBuffer()
                    self.env.Reset(j)

    def GetLoss(self, a_dist, value, action, lp, td, gae):
        '''Critic Loss'''
        loss_critic = ((value - td).pow(2)).mean()
        '''Actor Loss (PPO clipped surrogate)'''
        ratio = torch.exp(a_dist.log_prob(action) - lp)
        gae = (gae - gae.mean()) / (gae.std() + 1e-5)
        clip_eps = 0.2  # standard PPO clipping range
        surrogate1 = ratio * gae
        surrogate2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * gae
        loss_actor = -torch.min(surrogate1, surrogate2).mean()
        '''Entropy Loss'''
        loss_entropy = -self.w_entropy * a_dist.entropy().mean()
        loss = loss_actor + loss_critic + loss_entropy
        return loss

    def OptimizeSimulationExoNN(self):
        all_transitions = np.array(self.replay_exo_buffer.buffer)
        for j in range(self.num_epochs):
            np.random.shuffle(all_transitions)
            for i in range(len(all_transitions) // self.batch_size):
                transitions = all_transitions[i * self.batch_size:(i + 1) * self.batch_size]
                # Regroup the minibatch by field and stack into batched tensors.
                state, action, lp, td, gae = map(torch.stack, zip(*transitions))
                a_dist, value = self.exo_model(state)
                loss = self.GetLoss(a_dist, value, action, lp, td, gae)
                self.optimizer_exo.zero_grad()
                loss.backward()
                self.optimizer_exo.step()

    def OptimizeSimulationHumanNN(self):
        all_transitions = np.array(self.replay_human_buffer.buffer)
        for j in range(self.num_epochs):
            np.random.shuffle(all_transitions)
            for i in range(len(all_transitions) // self.batch_size):
                transitions = all_transitions[i * self.batch_size:(i + 1) * self.batch_size]
                state, action, lp, td, gae = map(torch.stack, zip(*transitions))
                a_dist, value = self.human_model(state)
                loss = self.GetLoss(a_dist, value, action, lp, td, gae)
                self.optimizer_human.zero_grad()
                loss.backward()
                self.optimizer_human.step()

    def OptimizeMuscleNN(self):
        muscle_transitions = np.array(self.replay_muscle_buffer.buffer)
        for j in range(self.num_epochs_muscle):
            np.random.shuffle(muscle_transitions)
            for i in range(len(muscle_transitions) // self.muscle_batch_size):
                tuples = muscle_transitions[i * self.muscle_batch_size:(i + 1) * self.muscle_batch_size]
                JtA, tau_des, L, b = map(torch.stack, zip(*tuples))
                activation = self.muscle_model(JtA, tau_des)
                # Reconstruct joint torques from the linearized muscle model: tau = L a + b.
                tau = torch.einsum('ijk,ik->ij', (L, activation)) + b
                loss_reg = activation.pow(2).mean()
                loss_target = (((tau - tau_des) / 100.0).pow(2)).mean()
                loss = 0.01 * loss_reg + loss_target
                self.optimizer_muscle.zero_grad()
                loss.backward()
                self.optimizer_muscle.step()

    def Train(self):
        # Generate transitions.
        self.GenerateTransitions()
        # Prepare the training data (TD targets and advantages).
        self.ComputeTDandGAE()
        # Optimize each model.
        self.OptimizeSimulationExoNN()
        self.OptimizeSimulationHumanNN()
        self.OptimizeMuscleNN()
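
# --- Illustrative sketches only (not part of the original pseudocode) ---
# The EnvManager, ReplayBuffer, and EpisodeBuffer helpers used above are not
# included in this repo. The two minimal stand-ins below only document the
# container interface the PPO class assumes; the batched EnvManager wrapper
# around the Environment class is left out.

class EpisodeBuffer(object):
    def __init__(self):
        self.data = []

    def Push(self, *step):
        # Append one per-step transition tuple.
        self.data.append(step)

    def GetData(self):
        return self.data


class ReplayBuffer(object):
    def __init__(self, max_size, fields):
        self.max_size = max_size
        self.fields = fields   # field names, kept for documentation
        self.buffer = []

    def Push(self, *transition):
        # Drop the oldest entry once the buffer is full.
        if len(self.buffer) >= self.max_size:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def Clear(self):
        self.buffer = []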

if __name__ == "__main__":
    ppo = PPO()
    for i in range(MAX_ITERATION):
        ppo.Train()
--------------------------------------------------------------------------------
/Environment_simulator.py:
--------------------------------------------------------------------------------
"""Interface demonstration of the environment simulator."""

class Environment(object):

    def __init__(self):
        pass

    def GetMuscleTuples(self):
        """Return the collected (JtA, tau_des, L, b) tuples used for muscle training."""
        # Simulator details elided.
        return muscle_tuples

    def GetExoObservations(self):
        """Return the state of the exoskeleton in the simulator."""
        return exo_state

    def GetHumanObservations(self):
        """Return the state of the human in the simulator."""
        return human_state

    def SetHumanActions(self, actions):
        """Set the actions for the human."""
        return None

    def SetExoActions(self, actions):
        """Set the actions for the exoskeleton."""
        return None

    def GetMuscleTorques(self):
        """Return the muscle torques."""
        return muscle_torque

    def GetDesiredTorquesHuman(self):
        """Return the desired torques of the human as the target."""
        return desired_torque_human

    def SetActivationLevels(self, activations):
        """Set the activation levels of the muscles in the simulator."""
        return None

    def Steps(self):
        """Perform a simulation step in the simulator."""
        return None
--------------------------------------------------------------------------------
/Human_exoskeleton_model.py:
--------------------------------------------------------------------------------
"""Neural network architecture definitions for the exoskeleton, human, and muscles."""

import torch
import torch.nn as nn
import numpy as np

MultiVariateNormal = torch.distributions.Normal

class MuscleNN(nn.Module):
    def __init__(self, num_total_muscle_related_dofs, num_dofs, num_muscles):
        super(MuscleNN, self).__init__()
        self.num_total_muscle_related_dofs = num_total_muscle_related_dofs
        self.num_dofs = num_dofs
        self.num_muscles = num_muscles

        num_h1, num_h2, num_h3 = 1024, 512, 512
        self.fc = nn.Sequential(
            nn.Linear(num_total_muscle_related_dofs + num_dofs, num_h1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(num_h1, num_h2),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(num_h2, num_h3),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(num_h3, num_muscles),
            nn.Tanh(),
        )
        # Normalization scales for the inputs.
        self.std_muscle_tau = torch.zeros(self.num_total_muscle_related_dofs)
        self.std_tau = torch.zeros(self.num_dofs)

        for i in range(self.num_total_muscle_related_dofs):
            self.std_muscle_tau[i] = 200.0

        for i in range(self.num_dofs):
            self.std_tau[i] = 200.0

    def forward(self, muscle_tau, tau):
        muscle_tau = muscle_tau / self.std_muscle_tau
        tau = tau / self.std_tau
        out = self.fc(torch.cat([muscle_tau, tau], dim=1))
        return nn.ReLU()(out)

    def get_activation(self, muscle_tau, tau):
        act = self.forward(muscle_tau.reshape(1, -1), tau.reshape(1, -1))
        return act.cpu().detach().numpy()
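
# Note (added for clarity, not in the original file): MuscleNN maps the
# muscle-related joint torques (JtA) and the desired joint torques (tau_des)
# to muscle activation levels. During training, OptimizeMuscleNN in
# Closed_loop_training.py reconstructs joint torques through the simulator's
# linearized muscle model, tau = L a + b, and regresses them against tau_des,
# so the learned activations reproduce the desired torques.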

class SimulationExoNN(nn.Module):
    def __init__(self, num_states, num_actions):
        super(SimulationExoNN, self).__init__()

        num_h1 = 128
        num_h2 = 64
        self.num_actions = num_actions
        self.policy = nn.Sequential(
            nn.Linear(num_states, num_h1),
            nn.ReLU(),
            nn.Linear(num_h1, num_h2),
            nn.ReLU(),
            nn.Linear(num_h2, num_actions)
        )

        self.value = nn.Sequential(
            nn.Linear(num_states, num_h1),
            nn.ReLU(),
            nn.Linear(num_h1, num_h2),
            nn.ReLU(),
            nn.Linear(num_h2, 1)
        )
        self.log_std = nn.Parameter(torch.zeros(num_actions))

    def forward(self, x):
        p_out = self.policy(x)
        p_out = MultiVariateNormal(p_out, self.log_std.exp())
        v_out = self.value(x)
        return p_out, v_out

    def get_action(self, s):
        ts = torch.tensor(s)
        p, _ = self.forward(ts)
        return p.loc.cpu().detach().numpy()


class SimulationHumanNN(nn.Module):
    def __init__(self, num_states, num_actions):
        super(SimulationHumanNN, self).__init__()

        num_h1 = 256
        num_h2 = 256
        self.num_actions = num_actions
        self.policy = nn.Sequential(
            nn.Linear(num_states, num_h1),
            nn.ReLU(),
            nn.Linear(num_h1, num_h2),
            nn.ReLU(),
            nn.Linear(num_h2, num_actions)
        )
        self.value = nn.Sequential(
            nn.Linear(num_states, num_h1),
            nn.ReLU(),
            nn.Linear(num_h1, num_h2),
            nn.ReLU(),
            nn.Linear(num_h2, 1)
        )
        self.log_std = nn.Parameter(torch.zeros(num_actions))

    def forward(self, x):
        p_out = self.policy(x)
        p_out = MultiVariateNormal(p_out, self.log_std.exp())
        v_out = self.value(x)
        return p_out, v_out

    def get_action(self, s):
        ts = torch.tensor(s)
        p, _ = self.forward(ts)
        return p.loc.cpu().detach().numpy()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Learning_in_simulation

Environment_simulator.py -- Interface demonstration of the environment simulator.

Human_exoskeleton_model.py -- Neural network architecture definitions for the exoskeleton, human, and muscles.

Closed_loop_training.py -- Sample code (pseudocode) for PPO reinforcement learning training.
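
Below is a minimal usage sketch (not from the original code) showing how a trained exoskeleton policy from Human_exoskeleton_model.py could be queried for an action; the state and action dimensions (32 and 6) are placeholders, not values from this repo.

```python
import numpy as np
from Human_exoskeleton_model import SimulationExoNN

policy = SimulationExoNN(num_states=32, num_actions=6)  # placeholder dimensions
obs = np.zeros(32, dtype=np.float32)                    # observation from the simulator
action = policy.get_action(obs)                         # deterministic mean action
print(action.shape)                                     # -> (6,)
```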