├── Closed_loop_training.py
├── Environment_simulator.py
├── Human_exoskeleton_model.py
└── README.md

/Closed_loop_training.py:
--------------------------------------------------------------------------------
"""Pseudocode for PPO RL training."""

import torch
import torch.optim as optim
import numpy as np
from Environment_simulator import Environment

# Trajectory fields for the exoskeleton
TRAJECTORY_FIELDS_EXO = [
    'state',    # State of the agent.
    'action',   # Action taken by the agent.
    'reward',   # Reward for the agent after taking the step.
    'value',    # Value predicted by the value network.
    'logits',   # Logits of the action network.
]

# Trajectory fields for the human
TRAJECTORY_FIELDS_HUMAN = [
    'state',    # State of the agent.
    'action',   # Action taken by the agent.
    'reward',   # Reward for the agent after taking the step.
    'value',    # Value predicted by the value network.
    'logits',   # Logits of the action network.
]

# Trajectory fields for the muscles
TRAJECTORY_FIELDS_MUSCLES = [
    'JtA',      # Muscle-related joint torques (network input).
    'tau_des',  # Desired joint torques.
    'L',        # Linear coefficient of the linearized activation-to-torque map.
    'b',        # Constant offset of the linearized activation-to-torque map.
]

MAX_ITERATION = 100000
REPLAY_BUFFER_SIZE = 30000
NUM_AGENT = 16


class PPO(object):
    def __init__(self):
        # Create multiple simulation environments.
        self.env = EnvManager(Environment, NUM_AGENT)

        # Build models for the exoskeleton, the human, and the human muscles.
        # State/action/muscle dimensions are assumed to be provided by the environment (details elided).
        self.exo_model = SimulationExoNN(self.num_state, self.num_action)
        self.human_model = SimulationHumanNN(self.num_human_state, self.num_human_action)
        self.muscle_model = MuscleNN(self.MuscleRelatedDofs, self.num_human_dofs, self.num_muscles)

        # Create replay buffers for the exoskeleton, the human, and the human muscles.
        self.replay_exo_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, TRAJECTORY_FIELDS_EXO)
        self.replay_human_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, TRAJECTORY_FIELDS_HUMAN)
        self.replay_muscle_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, TRAJECTORY_FIELDS_MUSCLES)

        self.batch_size = 128
        self.learning_rate = 1e-4

        # Create one optimizer per model.
        self.optimizer_exo = optim.Adam(self.exo_model.parameters(), lr=self.learning_rate)
        self.optimizer_human = optim.Adam(self.human_model.parameters(), lr=self.learning_rate)
        self.optimizer_muscle = optim.Adam(self.muscle_model.parameters(), lr=self.learning_rate)

        self.episodes = [None] * NUM_AGENT
        for j in range(NUM_AGENT):
            self.episodes[j] = EpisodeBuffer()

        self.gamma = 0.99   # discount factor
        self.lb = 0.99      # GAE lambda
        self.w_entropy = -0.001

    def ComputeTDandGAE(self):
        self.replay_exo_buffer.Clear()
        self.replay_human_buffer.Clear()
        self.replay_muscle_buffer.Clear()

        for epi in self.total_episodes:
            data = epi.GetData()

            states_exo, actions_exo, rewards_exo, values_exo, logprobs_exo, \
                states_human, actions_human, rewards_human, values_human, logprobs_human = zip(*data)

            # The computation of the Temporal Difference (TD) targets and the
            # Generalized Advantage Estimation (GAE) is omitted here; we follow
            # the common approach described in the PPO paper
            # (https://arxiv.org/pdf/1707.06347.pdf). A standard implementation
            # is sketched in ComputeGAE below.

            for i in range(len(data)):
                self.replay_exo_buffer.Push(states_exo[i], actions_exo[i], logprobs_exo[i], TD_exo[i], advantages_exo[i])
                self.replay_human_buffer.Push(states_human[i], actions_human[i], logprobs_human[i], TD_human[i], advantages_human[i])

        muscle_tuples = self.env.GetMuscleTuples()
        for i in range(len(muscle_tuples)):
            self.replay_muscle_buffer.Push(muscle_tuples[i][0], muscle_tuples[i][1], muscle_tuples[i][2], muscle_tuples[i][3])
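
    def ComputeGAE(self, rewards, values):
        # Illustrative sketch only (this helper is not part of the original
        # pseudocode): one standard way to compute the TD targets and GAE
        # advantages referenced above, using the discount factor self.gamma
        # and the GAE parameter self.lb. `rewards` and `values` are per-step
        # arrays for one finished episode.
        advantages = np.zeros(len(rewards))
        TD = np.zeros(len(rewards))
        ad_t = 0.0
        for t in reversed(range(len(rewards))):
            # Value after the last step of a finished episode is taken as zero.
            next_value = values[t + 1] if t + 1 < len(rewards) else 0.0
            delta = rewards[t] + self.gamma * next_value - values[t]
            ad_t = delta + self.gamma * self.lb * ad_t
            advantages[t] = ad_t
            TD[t] = values[t] + ad_t
        return TD, advantages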

    def GenerateTransitions(self):
        self.total_episodes = []
        rewards_exo = [None] * NUM_AGENT
        rewards_human = [None] * NUM_AGENT

        local_step = 0
        # Collect transitions until enough steps have been gathered.
        while local_step < REPLAY_BUFFER_SIZE:
            # Get the observations/states of the exoskeleton and the human from the simulators.
            states_exo = self.env.GetExoObservations()
            states_human = self.env.GetHumanObservations()

            # Predict and apply the action of the human.
            action_dist_human, values_human = self.human_model(states_human)
            actions_human = action_dist_human.sample()
            logprobs_human = action_dist_human.log_prob(actions_human)
            self.env.SetHumanActions(actions_human)

            # Predict and apply the action of the exoskeleton.
            action_dist_exo, values_exo = self.exo_model(states_exo)
            actions_exo = action_dist_exo.sample()
            logprobs_exo = action_dist_exo.log_prob(actions_exo)
            self.env.SetExoActions(actions_exo)

            # Predict and apply the muscle activations.
            muscle_torque = self.env.GetMuscleTorques()
            desired_torque = self.env.GetDesiredTorquesHuman()
            activations = self.muscle_model(muscle_torque, desired_torque)
            self.env.SetActivationLevels(activations)
            self.env.Steps()

            for j in range(NUM_AGENT):
                # Check whether the episode of the j-th agent has ended.
                if not self.env.IsEndOfEpisode(j):
                    # Obtain the rewards after taking the actions in the simulator.
                    rewards_exo[j] = self.env.GetExoReward(j)
                    rewards_human[j] = self.env.GetHumanReward(j)

                    self.episodes[j].Push(states_exo[j], actions_exo[j], rewards_exo[j], values_exo[j], logprobs_exo[j],
                                          states_human[j], actions_human[j], rewards_human[j], values_human[j], logprobs_human[j])

                    local_step += 1
                else:
                    self.total_episodes.append(self.episodes[j])
                    self.episodes[j] = EpisodeBuffer()
                    self.env.Reset(j)

    def GetLoss(self, a_dist, value, action, lp, td, gae):
        '''Critic Loss'''
        loss_critic = ((value - td).pow(2)).mean()
        '''Actor Loss (PPO clipped surrogate)'''
        ratio = torch.exp(a_dist.log_prob(action) - lp)
        gae = (gae - gae.mean()) / (gae.std() + 1e-5)
        clip_eps = 0.2  # standard PPO clipping range
        surrogate1 = ratio * gae
        surrogate2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * gae
        loss_actor = -torch.min(surrogate1, surrogate2).mean()
        '''Entropy Loss'''
        loss_entropy = -self.w_entropy * a_dist.entropy().mean()
        loss = loss_actor + loss_critic + loss_entropy
        return loss

    def OptimizeSimulationExoNN(self):
        all_transitions = np.array(self.replay_exo_buffer.buffer)
        for j in range(self.num_epochs):
            np.random.shuffle(all_transitions)
            for i in range(len(all_transitions) // self.batch_size):
                transitions = all_transitions[i * self.batch_size:(i + 1) * self.batch_size]
                # Regroup the minibatch by field and stack into batched tensors.
                state, action, lp, td, gae = map(torch.stack, zip(*transitions))
                a_dist, value = self.exo_model(state)
                loss = self.GetLoss(a_dist, value, action, lp, td, gae)
                self.optimizer_exo.zero_grad()
                loss.backward()
                self.optimizer_exo.step()

    def OptimizeSimulationHumanNN(self):
        all_transitions = np.array(self.replay_human_buffer.buffer)
        for j in range(self.num_epochs):
            np.random.shuffle(all_transitions)
            for i in range(len(all_transitions) // self.batch_size):
                transitions = all_transitions[i * self.batch_size:(i + 1) * self.batch_size]
                state, action, lp, td, gae = map(torch.stack, zip(*transitions))
                a_dist, value = self.human_model(state)
                loss = self.GetLoss(a_dist, value, action, lp, td, gae)
                self.optimizer_human.zero_grad()
                loss.backward()
                self.optimizer_human.step()

    def OptimizeMuscleNN(self):
        muscle_transitions = np.array(self.replay_muscle_buffer.buffer)
        for j in range(self.num_epochs_muscle):
            np.random.shuffle(muscle_transitions)
            for i in range(len(muscle_transitions) // self.muscle_batch_size):
                tuples = muscle_transitions[i * self.muscle_batch_size:(i + 1) * self.muscle_batch_size]
                JtA, tau_des, L, b = map(torch.stack, zip(*tuples))
                activation = self.muscle_model(JtA, tau_des)
                # Reconstruct joint torques from the linearized muscle model: tau = L a + b.
                tau = torch.einsum('ijk,ik->ij', (L, activation)) + b
                loss_reg = activation.pow(2).mean()
                loss_target = (((tau - tau_des) / 100.0).pow(2)).mean()
                loss = 0.01 * loss_reg + loss_target
                self.optimizer_muscle.zero_grad()
                loss.backward()
                self.optimizer_muscle.step()

    def Train(self):
        # Generate transitions.
        self.GenerateTransitions()
        # Prepare the training data (TD targets and advantages).
        self.ComputeTDandGAE()
        # Optimize each model.
        self.OptimizeSimulationExoNN()
        self.OptimizeSimulationHumanNN()
        self.OptimizeMuscleNN()
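
# --- Illustrative sketches only (not part of the original pseudocode) ---
# The EnvManager, ReplayBuffer, and EpisodeBuffer helpers used above are not
# included in this repo. The two minimal stand-ins below only document the
# container interface the PPO class assumes; the batched EnvManager wrapper
# around the Environment class is left out.

class EpisodeBuffer(object):
    def __init__(self):
        self.data = []

    def Push(self, *step):
        # Append one per-step transition tuple.
        self.data.append(step)

    def GetData(self):
        return self.data


class ReplayBuffer(object):
    def __init__(self, max_size, fields):
        self.max_size = max_size
        self.fields = fields   # field names, kept for documentation
        self.buffer = []

    def Push(self, *transition):
        # Drop the oldest entry once the buffer is full.
        if len(self.buffer) >= self.max_size:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def Clear(self):
        self.buffer = []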

if __name__ == "__main__":
    ppo = PPO()
    for i in range(MAX_ITERATION):
        ppo.Train()
--------------------------------------------------------------------------------
/Environment_simulator.py:
--------------------------------------------------------------------------------
"""Interface demonstration of the environment simulator."""

class Environment(object):

    def __init__(self):
        pass

    def GetMuscleTuples(self):
        """Return the collected (JtA, tau_des, L, b) tuples used for muscle training."""
        # Simulator details elided.
        return muscle_tuples

    def GetExoObservations(self):
        """Return the state of the exoskeleton in the simulator."""
        return exo_state

    def GetHumanObservations(self):
        """Return the state of the human in the simulator."""
        return human_state

    def SetHumanActions(self, actions):
        """Set the actions for the human."""
        return None

    def SetExoActions(self, actions):
        """Set the actions for the exoskeleton."""
        return None

    def GetMuscleTorques(self):
        """Return the muscle torques."""
        return muscle_torque

    def GetDesiredTorquesHuman(self):
        """Return the desired torques of the human as the target."""
        return desired_torque_human

    def SetActivationLevels(self, activations):
        """Set the activation levels of the muscles in the simulator."""
        return None

    def Steps(self):
        """Perform a simulation step in the simulator."""
        return None
--------------------------------------------------------------------------------
/Human_exoskeleton_model.py:
--------------------------------------------------------------------------------
"""Neural network architecture definitions for the exoskeleton, human, and muscles."""

import torch
import torch.nn as nn
import numpy as np

MultiVariateNormal = torch.distributions.Normal

class MuscleNN(nn.Module):
    def __init__(self, num_total_muscle_related_dofs, num_dofs, num_muscles):
        super(MuscleNN, self).__init__()
        self.num_total_muscle_related_dofs = num_total_muscle_related_dofs
        self.num_dofs = num_dofs
        self.num_muscles = num_muscles

        num_h1, num_h2, num_h3 = 1024, 512, 512
        self.fc = nn.Sequential(
            nn.Linear(num_total_muscle_related_dofs + num_dofs, num_h1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(num_h1, num_h2),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(num_h2, num_h3),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(num_h3, num_muscles),
            nn.Tanh(),
        )
        # Normalization scales for the inputs.
        self.std_muscle_tau = torch.zeros(self.num_total_muscle_related_dofs)
        self.std_tau = torch.zeros(self.num_dofs)

        for i in range(self.num_total_muscle_related_dofs):
            self.std_muscle_tau[i] = 200.0

        for i in range(self.num_dofs):
            self.std_tau[i] = 200.0

    def forward(self, muscle_tau, tau):
        muscle_tau = muscle_tau / self.std_muscle_tau
        tau = tau / self.std_tau
        out = self.fc(torch.cat([muscle_tau, tau], dim=1))
        return nn.ReLU()(out)

    def get_activation(self, muscle_tau, tau):
        act = self.forward(muscle_tau.reshape(1, -1), tau.reshape(1, -1))
        return act.cpu().detach().numpy()
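
# Note (added for clarity, not in the original file): MuscleNN maps the
# muscle-related joint torques (JtA) and the desired joint torques (tau_des)
# to muscle activation levels. During training, OptimizeMuscleNN in
# Closed_loop_training.py reconstructs joint torques through the simulator's
# linearized muscle model, tau = L a + b, and regresses them against tau_des,
# so the learned activations reproduce the desired torques.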

class SimulationExoNN(nn.Module):
    def __init__(self, num_states, num_actions):
        super(SimulationExoNN, self).__init__()

        num_h1 = 128
        num_h2 = 64
        self.num_actions = num_actions
        self.policy = nn.Sequential(
            nn.Linear(num_states, num_h1),
            nn.ReLU(),
            nn.Linear(num_h1, num_h2),
            nn.ReLU(),
            nn.Linear(num_h2, num_actions)
        )

        self.value = nn.Sequential(
            nn.Linear(num_states, num_h1),
            nn.ReLU(),
            nn.Linear(num_h1, num_h2),
            nn.ReLU(),
            nn.Linear(num_h2, 1)
        )
        self.log_std = nn.Parameter(torch.zeros(num_actions))

    def forward(self, x):
        p_out = self.policy(x)
        p_out = MultiVariateNormal(p_out, self.log_std.exp())
        v_out = self.value(x)
        return p_out, v_out

    def get_action(self, s):
        ts = torch.tensor(s)
        p, _ = self.forward(ts)
        return p.loc.cpu().detach().numpy()


class SimulationHumanNN(nn.Module):
    def __init__(self, num_states, num_actions):
        super(SimulationHumanNN, self).__init__()

        num_h1 = 256
        num_h2 = 256
        self.num_actions = num_actions
        self.policy = nn.Sequential(
            nn.Linear(num_states, num_h1),
            nn.ReLU(),
            nn.Linear(num_h1, num_h2),
            nn.ReLU(),
            nn.Linear(num_h2, num_actions)
        )
        self.value = nn.Sequential(
            nn.Linear(num_states, num_h1),
            nn.ReLU(),
            nn.Linear(num_h1, num_h2),
            nn.ReLU(),
            nn.Linear(num_h2, 1)
        )
        self.log_std = nn.Parameter(torch.zeros(num_actions))

    def forward(self, x):
        p_out = self.policy(x)
        p_out = MultiVariateNormal(p_out, self.log_std.exp())
        v_out = self.value(x)
        return p_out, v_out

    def get_action(self, s):
        ts = torch.tensor(s)
        p, _ = self.forward(ts)
        return p.loc.cpu().detach().numpy()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Learning_in_simulation

Environment_simulator.py -- Interface demonstration of the environment simulator.

Human_exoskeleton_model.py -- Neural network architecture definitions for the exoskeleton, human, and muscles.

Closed_loop_training.py -- Sample code (pseudocode) for PPO reinforcement learning training.
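
Below is a minimal usage sketch (not from the original code) showing how a trained exoskeleton policy from Human_exoskeleton_model.py could be queried for an action; the state and action dimensions (32 and 6) are placeholders, not values from this repo.

```python
import numpy as np
from Human_exoskeleton_model import SimulationExoNN

policy = SimulationExoNN(num_states=32, num_actions=6)  # placeholder dimensions
obs = np.zeros(32, dtype=np.float32)                    # observation from the simulator
action = policy.get_action(obs)                         # deterministic mean action
print(action.shape)                                     # -> (6,)
```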