├── DDPG_Generalized_Training_Code.m
├── PPO_Generalized_Training_Code.m
├── README.md
├── SAC_Generalized_Training_Code.m
├── TD3_Generalized_Training_Code.m
└── TRPO_Generalized_Training_Code.m

/DDPG_Generalized_Training_Code.m:
--------------------------------------------------------------------------------
1 | %% DDPG Model Network and Code
2 | %% Flags / Settings
3 | parallelComputing_flag = 0; % Whether to use parallel computing
4 | load_Saved_Agent_flag = 0;  % Whether to load a previously saved agent
5 | %% Load Saved Agent
6 | if load_Saved_Agent_flag == 1
7 |     savedAgent_dir = 'saved_Agents01';
8 |     listing = dir(fullfile(savedAgent_dir, '*.mat'));
9 |     for i = 1:length(listing)
10 |         temp_String = string(listing(i).name);
11 |         temp_String = extractAfter(temp_String,5);      % strip the leading 'Agent'
12 |         temp_String = extractBefore(temp_String,'.mat');
13 |         agent_names(i,1) = str2double(temp_String);
14 |
15 |     end
16 |     sorted_agent_names = sort(agent_names,'ascend');
17 |     last_Agent = sorted_agent_names(end);
18 |     agent_Name = append('\Agent',num2str(last_Agent), '.mat');
19 |     load([savedAgent_dir agent_Name]);
20 |     [ep_reward, ep_no] = max(savedAgentResult.EpisodeReward); % episode with the highest reward
21 |     load([savedAgent_dir append('\Agent', num2str(ep_no), '.mat')]);
22 |     plot(savedAgentResult)
23 | end
24 | %% Model Initialization
25 | mdl = 'Your Simulink Mdl Name';
26 | open_system(mdl); % Opens the Simulink model
27 | agentblk = [mdl '/Location/To/RL Agent Block']; % Replace with the path to the RL Agent block in Simulink
28 |
29 | %% Sample Time & Simulation Duration
30 | T_Sample = 0.1; % Replace with your own sample time
31 | T_Total = 205;  % Replace with your own total simulation time
32 | set_param(mdl,'StartTime','0','StopTime',int2str(T_Total)); % Set start and stop time in Simulink
33 |
34 | %% Observation Info
35 | numObs = 5; % Enter the number of observations
36 | obsInfo = rlNumericSpec([numObs 1],'LowerLimit',-inf*ones(numObs,1),'UpperLimit',inf*ones(numObs,1)); % Lower and upper limits of -/+ Inf
37 | obsInfo.Name = 'Observations';
38 | obsInfo.Description = 'Describe your observation vector here';
39 | numOfObservations = obsInfo.Dimension(1);
40 |
41 | %% Action Info
42 |
43 | Act1_Min = -5;
44 | Act1_Max = 5;
45 |
46 | % theta Controller (second action range, unused while numAct = 1)
47 | Act2_Min = -0.04;
48 | Act2_Max = 0.04;
49 |
50 | % Action Object
51 | numAct = 1; % Number of actions
52 | actInfo = rlNumericSpec([numAct 1],'LowerLimit',Act1_Min*ones(numAct,1),'UpperLimit',Act1_Max*ones(numAct,1));
53 | actInfo.Name = 'Action';
54 | numActions = actInfo.Dimension(1);
55 |
56 | %% Create Environment
57 | %rl_env = rlSimulinkEnv(mdl, agentblk, obsInfo,
58 | %actInfo,'UseFastRestart','on'); % if you want to use Fast Restart
59 | rl_env = rlSimulinkEnv(mdl, agentblk, obsInfo, actInfo); % Creates the environment
60 | %% Environment Reset Function
61 | % To define the initial condition for a given variable, specify an environment reset function using an anonymous function handle.
62 | % The reset function localResetFcn is defined at the end of this script.
63 | rl_env.ResetFcn = @(in)localResetFcn(in);
64 | % Fix the random number generator seed for reproducibility.
65 | rng('default')
66 | %% Create Agent DDPG
67 | % A DDPG agent approximates the long-term reward given observations and actions using a critic value function representation.
68 | % To create the critic, first create a deep neural network with two inputs, the state and action, and one output.
69 | % For more information on creating a neural network value function representation, see Create Policy and Value Function Representations.
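% The critic defined below implements the action-value function Q(s,a). During
% training, the DDPG agent regresses it toward the target
% y = r + gamma*Q'(s',mu'(s')), where Q' and mu' are the target critic and target
% actor updated slowly via the TargetSmoothFactor set in the agent options below.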
70 | nI = obsInfo.Dimension(1); % number of inputs 71 | nL = 128; % number of neurons 72 | nO = actInfo.Dimension(1); % number of outputs 73 | 74 | statePath = [ 75 | featureInputLayer(nI,'Normalization','none','Name','observation') 76 | fullyConnectedLayer(nL,'Name','fc1') 77 | reluLayer('Name','relu1') 78 | fullyConnectedLayer(nL,'Name','fc2') 79 | additionLayer(2,'Name','add') 80 | reluLayer('Name','relu2') 81 | fullyConnectedLayer(nL,'Name','fc3') 82 | reluLayer('Name','relu3') 83 | fullyConnectedLayer(1,'Name','fc4')]; 84 | 85 | actionPath = [ 86 | featureInputLayer(nO,'Normalization','none','Name','action') 87 | fullyConnectedLayer(nL, 'Name', 'fc5')]; 88 | 89 | %% Critic Netwrok 90 | criticNetwork = layerGraph(statePath); 91 | criticNetwork = addLayers(criticNetwork, actionPath); 92 | 93 | criticNetwork = connectLayers(criticNetwork,'fc5','add/in2'); 94 | 95 | % Specify options for the critic representation using rlRepresentationOptions. 96 | criticOptions = rlRepresentationOptions('Optimizer','adam','LearnRate',1e-3,'GradientThreshold',1,'L2RegularizationFactor',2e-4,'UseDevice',"gpu"); %Use GPU for Training 97 | 98 | % Create the critic representation using the specified neural network and options. 99 | % You must also specify the action and observation info for the critic, which you obtain from the environment interface. 100 | % For more information, see rlQValueRepresentation. 101 | critic = rlQValueRepresentation(criticNetwork,obsInfo,actInfo,'Observation',{'observation'},'Action',{'action'},criticOptions); 102 | 103 | % Design, visualize, and train deep learning networks 104 | %deepNetworkDesigner(criticNetwork); 105 | % View the critic network configuration. 106 | figure('Name','Critic Network'); 107 | plot(criticNetwork); 108 | 109 | %% Actor Netwrok 110 | % A DDPG agent decides which action to take given observations by using an actor representation. 111 | % To create the actor, first create a deep neural network with one input, the observation, and one output, the action. 112 | % Construct the actor similarly to the critic. For more information, see rlDeterministicActorRepresentation. 113 | actorNetwork = [ 114 | featureInputLayer(nI,'Normalization','none','Name','observation') 115 | fullyConnectedLayer(nL,'Name','fc1') 116 | reluLayer('Name','relu1') 117 | fullyConnectedLayer(nL,'Name','fc2') 118 | reluLayer('Name','relu2') 119 | fullyConnectedLayer(nL,'Name','fc3') 120 | reluLayer('Name','relu3') 121 | fullyConnectedLayer(1,'Name','fc4') 122 | tanhLayer('Name','tanh1') 123 | scalingLayer('Name','ActorScaling1','Scale',Act1_Max,'Bias',-0.5)]; 124 | 125 | actorOptions = rlRepresentationOptions('Optimizer','adam','LearnRate',1e-4,'GradientThreshold',1,'L2RegularizationFactor',1e-5,'UseDevice',"gpu"); %Use GPU for Training 126 | 127 | actor = rlDeterministicActorRepresentation(actorNetwork,obsInfo,actInfo,'Observation',{'observation'},'Action',{'ActorScaling1'},actorOptions); 128 | 129 | % Design, visualize, and train deep learning networks 130 | %deepNetworkDesigner(actorNetwork); 131 | % View the actor network configuration. 132 | figure('Name','Actor Network'); 133 | plot(layerGraph(actorNetwork)); 134 | 135 | %% Agent Options 136 | % To create the DDPG agent, first specify the DDPG agent options using rlDDPGAgentOptions. 137 | agentOptions = rlDDPGAgentOptions(... 138 | 'SampleTime',T_Sample,... 139 | 'TargetSmoothFactor',1e-3,... 140 | 'SaveExperienceBufferWithAgent',true, ... 141 | 'ExperienceBufferLength',1e8,... 142 | 'DiscountFactor',0.99,... 
143 | 'MiniBatchSize',64); 144 | agentOptions.NoiseOptions.Variance = 0.3; 145 | agentOptions.NoiseOptions.VarianceDecayRate = 1e-5; 146 | agentOptions.ResetExperienceBufferBeforeTraining = false; 147 | agentOptions.SaveExperienceBufferWithAgent = true; 148 | 149 | % Then, create the DDPG agent using the specified actor representation, critic representation, and agent options. 150 | % For more information, see rlDDPGAgent. 151 | agent = rlDDPGAgent(actor,critic,agentOptions); 152 | 153 | %% Specify Training Options and Train Agent 154 | % For this example, the training options for the DDPG and TD3 agents are the same. 155 | % Run each training session for 5000 episodes with each episode lasting at most maxSteps time steps. 156 | % Display the training progress in the Episode Manager dialog box (set the Plots option) and disable the command line display (set the Verbose option). 157 | % Terminate the training only when it reaches the maximum number of episodes (maxEpisodes). Doing so allows the comparison of the learning curves for multiple agents over the entire training session. 158 | maxEpisodes = 1000000; 159 | maxSteps = floor(T_Total/T_Sample); 160 | trainingOptions = rlTrainingOptions(... 161 | 'MaxEpisodes',maxEpisodes,... 162 | 'MaxStepsPerEpisode',maxSteps,... 163 | 'ScoreAveragingWindowLength',100,... 164 | 'Verbose',true,... 165 | 'Plots','training-progress',... 166 | 'StopTrainingCriteria','EpisodeCount',... 167 | 'StopTrainingValue',maxEpisodes,... 168 | 'SaveAgentCriteria','EpisodeFrequency',... 169 | 'SaveAgentValue',100, ... 170 | 'SaveAgentDirectory','saved_Agents01' ... 171 | ); % saves every 100th episode for DDPG Agent 172 | 173 | % To train the agent in parallel, specify the following training options. 174 | % Training in parallel requires Parallel Computing Toolbox™. 175 | % If you do not have Parallel Computing Toolbox software installed, set UseParallel to false. 176 | % Set the UseParallel option to true. 177 | % Train the agent in parallel asynchronously. 178 | % After every 32 steps, have each worker send experiences to the parallel pool client (the MATLAB® process which starts the training). 179 | % DDPG and TD3 agents require workers to send experiences to the client. 180 | if parallelComputing_flag==1 181 | save_system(mdl); 182 | num_cores = feature('numcores'); % Get number of CPU Cores 183 | parpool(floor(num_cores*.75)); % Use 75% fo Available Cores 184 | trainingOptions.UseParallel = true; 185 | trainingOptions.ParallelizationOptions.Mode = 'async'; 186 | trainingOptions.ParallelizationOptions.StepsUntilDataIsSent = 32; 187 | trainingOptions.ParallelizationOptions.DataToSendFromWorkers = 'Experiences'; 188 | end 189 | 190 | %% Train the agent. 191 | trainingStats = train(agent,rl_env,trainingOptions) 192 | 193 | %% Simulate DDPG Agent 194 | % To validate the performance of the trained agent, simulate the agent within the Simulink environment by uncommenting the following commands. 195 | % For more information on agent simulation, see rlSimulationOptions and sim. 
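% Before simulating, the training run can be sanity-checked from the results
% object returned by train; a minimal sketch (field names follow the training
% results object and may differ slightly between releases):
% figure('Name','Training Progress');
% plot(trainingStats.EpisodeReward); hold on;
% plot(trainingStats.AverageReward);
% legend('Episode reward','Average reward');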
196 |
197 | simOptions = rlSimulationOptions('MaxSteps',maxSteps);
198 | experience = sim(rl_env,agent,simOptions);
199 |
200 | %% Reset Function Definition
201 | function in = localResetFcn(in)
202 |     mdl = 'Your Simulink Model Name';
203 |     in = Simulink.SimulationInput(mdl);
204 |
205 |     % LOGIC TO INITIALIZE A VARIABLE HERE
206 |     % alt = answer;
207 |
208 |     % Change the value in the model workspace
209 |     mdlWks = get_param(mdl,'ModelWorkspace');
210 |     assignin(mdlWks,'variable name',alt) % Assigns the value to the model workspace variable
211 | end
212 |
213 | %% Credits
214 | % Developed with the help of MathWorks and its documentation.
215 | % Talha Bin Riaz
--------------------------------------------------------------------------------
/PPO_Generalized_Training_Code.m:
--------------------------------------------------------------------------------
1 | %% PPO Model Network and Code
2 | %% Flags / Settings
3 | parallelComputing_flag = 0; % Whether to use parallel computing
4 | load_Saved_Agent_flag = 0;  % Whether to load a previously saved agent
5 | %% Load Saved Agent
6 | if load_Saved_Agent_flag == 1
7 |     savedAgent_dir = 'saved_Agents01';
8 |     listing = dir(fullfile(savedAgent_dir, '*.mat'));
9 |     for i = 1:length(listing)
10 |         temp_String = string(listing(i).name);
11 |         temp_String = extractAfter(temp_String,5);      % strip the leading 'Agent'
12 |         temp_String = extractBefore(temp_String,'.mat');
13 |         agent_names(i,1) = str2double(temp_String);
14 |
15 |     end
16 |     sorted_agent_names = sort(agent_names,'ascend');
17 |     last_Agent = sorted_agent_names(end);
18 |     agent_Name = append('\Agent',num2str(last_Agent), '.mat');
19 |     load([savedAgent_dir agent_Name]);
20 |     [ep_reward, ep_no] = max(savedAgentResult.EpisodeReward); % episode with the highest reward
21 |     load([savedAgent_dir append('\Agent', num2str(ep_no), '.mat')]);
22 |     plot(savedAgentResult)
23 | end
24 | %% Model Initialization
25 | mdl = 'Your Simulink Mdl Name';
26 | open_system(mdl); % Opens the Simulink model
27 | agentblk = [mdl '/Location/To/RL Agent Block']; % Replace with the path to the RL Agent block in Simulink
28 |
29 | %% Sample Time & Simulation Duration
30 | T_Sample = 0.1; % Replace with your own sample time
31 | T_Total = 205;  % Replace with your own total simulation time
32 | set_param(mdl,'StartTime','0','StopTime',int2str(T_Total)); % Set start and stop time in Simulink
33 |
34 | %% Observation Info
35 | numObs = 5; % Enter the number of observations
36 | obsInfo = rlNumericSpec([numObs 1],'LowerLimit',-inf*ones(numObs,1),'UpperLimit',inf*ones(numObs,1)); % Lower and upper limits of -/+ Inf
37 | obsInfo.Name = 'Observations';
38 | obsInfo.Description = 'Describe your observation vector here';
39 | numOfObservations = obsInfo.Dimension(1);
40 |
41 | %% Action Info
42 |
43 | Act1_Min = -5;
44 | Act1_Max = 5;
45 |
46 | % theta Controller (second action range, unused while numAct = 1)
47 | Act2_Min = -0.04;
48 | Act2_Max = 0.04;
49 |
50 | % Action Object
51 | numAct = 1; % Number of actions
52 | actInfo = rlNumericSpec([numAct 1],'LowerLimit',Act1_Min*ones(numAct,1),'UpperLimit',Act1_Max*ones(numAct,1));
53 | actInfo.Name = 'Action';
54 | numActions = actInfo.Dimension(1);
55 |
56 | %% Create Environment
57 | %rl_env = rlSimulinkEnv(mdl, agentblk, obsInfo,
58 | %actInfo,'UseFastRestart','on'); % if you want to use Fast Restart
59 | rl_env = rlSimulinkEnv(mdl, agentblk, obsInfo, actInfo); % Creates the environment
60 | %% Environment Reset Function
61 | % To define the initial condition for a given variable, specify an environment reset function using an anonymous function handle.
62 | % The reset function localResetFcn is defined at the end of this script.
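% A minimal sketch of a filled-in localResetFcn, shown commented out; the
% variable name 'init_alt' and its range are hypothetical placeholders:
%
% function in = localResetFcn(in)
%     init_alt = 100 + 20*rand;                    % randomized initial condition
%     in = setVariable(in,'init_alt',init_alt);    % set it on the SimulationInput
% end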
63 | rl_env.ResetFcn = @(in)localResetFcn(in); 64 | % Fix the random generator seed for reproducibility. 65 | rng('default') 66 | %% Create Agent PPO 67 | % A PPO agent approximates the long-term reward given observations and actions using a critic value function representation. 68 | % To create the critic, first create a deep neural network with two inputs, the state and action, and one output. 69 | % For more information on creating a neural network value function representation, see Create Policy and Value Function Representations. 70 | nI = obsInfo.Dimension(1); % number of inputs 71 | nL = 128; % number of neurons 72 | nO = actInfo.Dimension(1); % number of outputs 73 | 74 | criticNetwork = [ 75 | featureInputLayer(nI,'Normalization','none','Name','observation') 76 | fullyConnectedLayer(nL,'Name','fc1') 77 | reluLayer('Name','relu1') 78 | fullyConnectedLayer(nL,'Name','fc2') 79 | reluLayer('Name','relu2') 80 | fullyConnectedLayer(nL,'Name','fc3') 81 | reluLayer('Name','relu3') 82 | fullyConnectedLayer(1,'Name','fc4')]; 83 | 84 | %% Critic Netwrok 85 | 86 | criticNetwork = dlnetwork(criticNetwork); 87 | 88 | % Specify options for the critic representation using rlOptimizerOptions. 89 | criticOptions = rlOptimizerOptions('Optimizer','adam','LearnRate',1e-3,'GradientThreshold',1,'L2RegularizationFactor',2e-4); %Use GPU for Training 90 | 91 | % Create the critic representation using the specified neural network and options. 92 | % You must also specify the action and observation info for the critic, which you obtain from the environment interface. 93 | % For more information, see rlQValueRepresentation. 94 | critic = rlValueFunction(criticNetwork,obsInfo,'Observation',{'observation'},'UseDevice',"gpu"); 95 | 96 | % Design, visualize, and train deep learning networks 97 | % View the critic network configuration. 98 | figure('Name','Critic Network'); 99 | plot(criticNetwork); 100 | 101 | %% Actor Netwrok 102 | % A PPO agent decides which action to take given observations by using an actor representation. 103 | % To create the actor, first create a deep neural network with one input, the observation, and one output, the action. 104 | % Construct the actor similarly to the critic. For more information, see rlContinuousGaussianActor. 105 | 106 | 107 | commonPath = [ 108 | featureInputLayer(nI,'Normalization','none','Name','comPathIn') 109 | fullyConnectedLayer(nL,'Name','fc1_c') 110 | reluLayer('Name','relu1_c') 111 | fullyConnectedLayer(nL,'Name','fc2_c') 112 | reluLayer('Name','comPathOut') 113 | ]; 114 | 115 | meanPath = [ 116 | fullyConnectedLayer(1,'Name','meanPathIn') 117 | tanhLayer('Name','tanh1_m') 118 | scalingLayer('Name','meanPathOut','Scale',Act1_Max,'Bias',-0.5) 119 | ]; 120 | 121 | sdevPath = [ 122 | fullyConnectedLayer(1,'Name','stdPathIn') 123 | softplusLayer('Name','stdPathOut') 124 | ]; 125 | 126 | actorNetwork = layerGraph(commonPath); 127 | actorNetwork = addLayers(actorNetwork,meanPath); 128 | actorNetwork = addLayers(actorNetwork,sdevPath); 129 | 130 | actorNetwork = connectLayers(actorNetwork,"comPathOut","meanPathIn/in"); 131 | actorNetwork = connectLayers(actorNetwork,"comPathOut","stdPathIn/in"); 132 | 133 | actorOptions = rlOptimizerOptions('Optimizer','adam','LearnRate',1e-4,'GradientThreshold',1,'L2RegularizationFactor',1e-5); 134 | 135 | actor = rlContinuousGaussianActor(actorNetwork,obsInfo,actInfo,'ActionMeanOutputNames',{'meanPathOut'}, ... 
136 |     'ActionStandardDeviationOutputNames',{'stdPathOut'},'ObservationInputNames',{'comPathIn'},'UseDevice','gpu'); % Use GPU for training
137 |
138 | % Design, visualize, and train deep learning networks
139 | % View the actor network configuration.
140 | figure('Name','Actor Network');
141 | plot(actorNetwork);
142 |
143 | %% Agent Options
144 | % To create the PPO agent, first specify the PPO agent options using rlPPOAgentOptions.
145 | agentOptions = rlPPOAgentOptions(...
146 |     'SampleTime',T_Sample, ...
147 |     ExperienceHorizon=1024, ClipFactor=0.04, EntropyLossWeight=0.1, NumEpoch=3, ...
148 |     AdvantageEstimateMethod="gae", GAEFactor=0.5, ...
149 |     DiscountFactor=0.997, ActorOptimizerOptions=actorOptions, CriticOptimizerOptions=criticOptions);
150 |
151 | % Then, create the PPO agent using the specified actor representation, critic representation, and agent options.
152 | % For more information, see rlPPOAgent.
153 |
154 | agent = rlPPOAgent(actor,critic,agentOptions);
155 |
156 | %% Specify Training Options and Train Agent
157 | % The training options below are broadly similar across the agent scripts in this repository.
158 | % Run the training session for up to maxEpisodes episodes, with each episode lasting at most maxSteps time steps.
159 | % Display the training progress in the Episode Manager dialog box (set the Plots option) and enable the command-line display (set the Verbose option).
160 | % Terminate the training only when it reaches the maximum number of episodes (maxEpisodes). Doing so allows the comparison of the learning curves for multiple agents over the entire training session.
161 | maxEpisodes = 1000000;
162 | maxSteps = floor(T_Total / T_Sample);
163 |
164 | % Configure Parallelization Options
165 | parallelOptions = rl.option.ParallelTraining(...
166 |     'Mode', 'async'); % Asynchronous parallel training mode
167 |
168 | % Define training options for the reinforcement learning agent
169 | trainingOptions = rlTrainingOptions(...
170 |     'MaxEpisodes', maxEpisodes, ...
171 |     'MaxStepsPerEpisode', maxSteps, ...
172 |     'ScoreAveragingWindowLength', 100, ...
173 |     'Verbose', true, ...
174 |     'Plots', 'training-progress', ...
175 |     'StopTrainingCriteria', 'EpisodeCount', ...
176 |     'StopTrainingValue', maxEpisodes, ...
177 |     'SaveAgentCriteria', 'EpisodeSteps', ...
178 |     'SaveAgentValue', 900, ...
179 |     'SaveAgentDirectory', 'savedAgents_1', ...
180 |     'UseParallel', parallelComputing_flag==1, ... % Enable parallel training only when the flag is set
181 |     'ParallelizationOptions', parallelOptions);
182 |
183 |
184 | % To train the agent in parallel, specify the following training options.
185 | % Training in parallel requires Parallel Computing Toolbox™.
186 | % If you do not have Parallel Computing Toolbox software installed, set UseParallel to false.
187 | % Set the UseParallel option to true.
188 | % Train the agent in parallel asynchronously.
189 | if parallelComputing_flag==1
190 |     save_system(mdl);
191 |     % Set up the parallel pool using 50% of the available CPU cores
192 |     num_cores = feature('numcores');
193 |     parpool(floor(num_cores * 0.5));
194 |     % Ensure the GPU is selected on all workers
195 |     parfevalOnAll(@() gpuDevice(1), 0);
196 | end
197 |
198 | %% Train the agent.
199 | trainingStats = train(agent,rl_env,trainingOptions)
200 |
201 | %% Simulate PPO Agent
202 | % To validate the performance of the trained agent, simulate the agent within the Simulink environment using the following commands.
203 | % For more information on agent simulation, see rlSimulationOptions and sim.
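% The sim call below returns an experience structure; one assumed way to inspect
% the logged observations afterwards (the field name 'Observations' comes from
% obsInfo.Name above, and the exact layout may vary by release):
% obsLog = squeeze(experience.Observation.Observations.Data);
% figure; plot(obsLog');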
204 |
205 | simOptions = rlSimulationOptions('MaxSteps',maxSteps);
206 | experience = sim(rl_env,agent,simOptions);
207 |
208 | %% Reset Function Definition
209 | function in = localResetFcn(in)
210 |     mdl = 'Your Simulink Model Name';
211 |     in = Simulink.SimulationInput(mdl);
212 |
213 |     % LOGIC TO INITIALIZE A VARIABLE HERE
214 |     % alt = answer;
215 |
216 |     % Change the value in the model workspace
217 |     mdlWks = get_param(mdl,'ModelWorkspace');
218 |     assignin(mdlWks,'variable name',alt) % Assigns the value to the model workspace variable
219 | end
220 |
221 | %% Credits
222 | % Developed with the help of MathWorks and its documentation.
223 | % Talha Bin Riaz
224 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MATLAB_RL_Agent_Architecture
2 | This repository contains the network architectures and generalized MATLAB/Simulink training code for continuous-action RL agents: Deep Deterministic Policy Gradient (DDPG), Trust Region Policy Optimization (TRPO), Proximal Policy Optimization (PPO), Soft Actor-Critic (SAC), and Twin Delayed Deep Deterministic Policy Gradient (TD3).
3 |
--------------------------------------------------------------------------------
/SAC_Generalized_Training_Code.m:
--------------------------------------------------------------------------------
1 | %% SAC Model Network and Code
2 | %% Flags / Settings
3 | parallelComputing_flag = 0; % Whether to use parallel computing
4 | load_Saved_Agent_flag = 0;  % Whether to load a previously saved agent
5 | %% Load Saved Agent
6 | if load_Saved_Agent_flag == 1
7 |     savedAgent_dir = 'saved_Agents01';
8 |     listing = dir(fullfile(savedAgent_dir, '*.mat'));
9 |     for i = 1:length(listing)
10 |         temp_String = string(listing(i).name);
11 |         temp_String = extractAfter(temp_String,5);      % strip the leading 'Agent'
12 |         temp_String = extractBefore(temp_String,'.mat');
13 |         agent_names(i,1) = str2double(temp_String);
14 |
15 |     end
16 |     sorted_agent_names = sort(agent_names,'ascend');
17 |     last_Agent = sorted_agent_names(end);
18 |     agent_Name = append('\Agent',num2str(last_Agent), '.mat');
19 |     load([savedAgent_dir agent_Name]);
20 |     [ep_reward, ep_no] = max(savedAgentResult.EpisodeReward); % episode with the highest reward
21 |     load([savedAgent_dir append('\Agent', num2str(ep_no), '.mat')]);
22 |     plot(savedAgentResult)
23 | end
24 | %% Model Initialization
25 | mdl = 'Your Simulink Mdl Name';
26 | open_system(mdl); % Opens the Simulink model
27 | agentblk = [mdl '/Location/To/RL Agent Block']; % Replace with the path to the RL Agent block in Simulink
28 |
29 | %% Sample Time & Simulation Duration
30 | T_Sample = 0.1; % Replace with your own sample time
31 | T_Total = 205;  % Replace with your own total simulation time
32 | set_param(mdl,'StartTime','0','StopTime',int2str(T_Total)); % Set start and stop time in Simulink
33 |
34 | %% Observation Info
35 | numObs = 5; % Enter the number of observations
36 | obsInfo = rlNumericSpec([numObs 1],'LowerLimit',-inf*ones(numObs,1),'UpperLimit',inf*ones(numObs,1)); % Lower and upper limits of -/+ Inf
37 | obsInfo.Name = 'Observations';
38 | obsInfo.Description = 'Describe your observation vector here';
39 | numOfObservations = obsInfo.Dimension(1);
40 |
41 | %% Action Info
42 |
43 | Act1_Min = -5;
44 | Act1_Max = 5;
45 |
46 | % theta Controller (second action range, unused while numAct = 1)
47 | Act2_Min = -0.04;
48 | Act2_Max = 0.04;
49 |
50 | % Action Object
51 | numAct = 1; % Number of actions
52 | actInfo = rlNumericSpec([numAct 1],'LowerLimit',Act1_Min*ones(numAct,1),'UpperLimit',Act1_Max*ones(numAct,1));
53 | actInfo.Name = 'Action';
54 |
numActions = actInfo.Dimension(1); 55 | 56 | %% Create Environment 57 | %rl_env = rlSimulinkEnv(mdl, agentblk, obsInfo, 58 | %actInfo,'UseFastRestart','on'); % if you want to use Fast Restart 59 | rl_env = rlSimulinkEnv(mdl, agentblk, obsInfo, actInfo); % Creates Environment 60 | %% Environment Reset Function 61 | % To define the initial condition for the certain variable, specify an environment reset function using an anonymous function handle. 62 | % The reset function localResetFcn, which is defined at the end of the example. 63 | rl_env.ResetFcn = @(in)localResetFcn(in); 64 | % Fix the random generator seed for reproducibility. 65 | rng('default') 66 | %% Create Agent SAC 67 | % A SAC agent approximates the long-term reward given observations and actions using a critic value function representation. 68 | % To create the critic, first create a deep neural network with two inputs, the state and action, and one output. 69 | % For more information on creating a neural network value function representation, see Create Policy and Value Function Representations. 70 | nI = obsInfo.Dimension(1); % number of inputs 71 | nL = 128; % number of neurons 72 | nO = actInfo.Dimension(1); % number of outputs 73 | 74 | statePath = [ 75 | featureInputLayer(nI,'Normalization','none','Name','observation') 76 | fullyConnectedLayer(nL,'Name','fc1') 77 | reluLayer('Name','relu1') 78 | fullyConnectedLayer(nL,'Name','fc2') 79 | additionLayer(2,'Name','add') 80 | reluLayer('Name','relu2') 81 | fullyConnectedLayer(nL,'Name','fc3') 82 | reluLayer('Name','relu3') 83 | fullyConnectedLayer(1,'Name','fc4')]; 84 | 85 | actionPath = [ 86 | featureInputLayer(nO,'Normalization','none','Name','action') 87 | fullyConnectedLayer(nL, 'Name', 'fc5')]; 88 | 89 | %% Critic Netwrok 90 | criticNetwork = layerGraph(statePath); 91 | criticNetwork = addLayers(criticNetwork, actionPath); 92 | 93 | criticNetwork = connectLayers(criticNetwork,'fc5','add/in2'); 94 | 95 | criticNetwork = dlnetwork(criticNetwork,Initialize=false); 96 | % Specify options for the critic representation using rlRepresentationOptions. 97 | criticOptions = rlOptimizerOptions('Optimizer','adam','LearnRate',1e-3,'GradientThreshold',1,'L2RegularizationFactor',2e-4); %Use GPU for Training 98 | 99 | criticNetwork_1 = initialize(criticNetwork); 100 | criticNetwork_2 = initialize(criticNetwork); 101 | 102 | % Create the critic representation using the specified neural network and options. 103 | % You must also specify the action and observation info for the critic, which you obtain from the environment interface. 104 | % For more information, see rlQValueRepresentation. 105 | critic_1 = rlQValueFunction(criticNetwork_1,obsInfo,actInfo,'Observation',{'observation'},'Action',{'action'}); 106 | critic_2 = rlQValueFunction(criticNetwork_2,obsInfo,actInfo,'Observation',{'observation'},'Action',{'action'}); 107 | 108 | % Design, visualize, and train deep learning networks 109 | % View the critic network configuration. 110 | figure('Name','Critic Network'); 111 | plot(criticNetwork); 112 | 113 | %% Actor Netwrok 114 | % A SAC agent decides which action to take given observations by using an actor representation. 115 | % To create the actor, first create a deep neural network with one input, the observation, and one output, the action. 116 | % Construct the actor similarly to the critic. For more information, see rlDeterministicActorRepresentation. 
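% Note on the scaling layers used below: a scalingLayer computes y = Scale.*x + Bias,
% so with Scale = 5 and Bias = -0.5 the tanh output in [-1,1] is mapped to roughly
% [-5.5, 4.5]. Adjust Scale and Bias to match your own action limits (for a
% symmetric [-5, 5] range, Bias would be 0).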
117 | 118 | commonPath = [ 119 | featureInputLayer(nI,'Normalization','none','Name','obsInLyr') 120 | fullyConnectedLayer(nL,'Name','fc1_c') 121 | reluLayer('Name','relu1_c') 122 | fullyConnectedLayer(nL,'Name','fc2_c') 123 | reluLayer('Name','relu2_c') 124 | fullyConnectedLayer(nL,'Name','fc3_c') 125 | reluLayer('Name','relu3_c') 126 | fullyConnectedLayer(1,'Name','comPathOut') 127 | ]; 128 | 129 | meanPath = [ 130 | fullyConnectedLayer(2,'Name','meanFC') 131 | fullyConnectedLayer(nL,'Name','fc1_m') 132 | reluLayer('Name','relu1_m') 133 | fullyConnectedLayer(nL,'Name','fc2_m') 134 | reluLayer('Name','relu2_m') 135 | fullyConnectedLayer(nL,'Name','fc3_m') 136 | reluLayer('Name','relu3_m') 137 | fullyConnectedLayer(1,'Name','fc4_m') 138 | tanhLayer('Name','tanh1_m') 139 | scalingLayer('Name','meanPathOut','Scale',5,'Bias',-0.5) 140 | ]; 141 | 142 | sdevPath = [ 143 | fullyConnectedLayer(2,'Name','stdFC') 144 | reluLayer('Name','relu1_s') 145 | fullyConnectedLayer(nL,'Name','fc2_s') 146 | reluLayer('Name','relu2_s') 147 | fullyConnectedLayer(nL,'Name','fc3_s') 148 | reluLayer('Name','relu3_s') 149 | fullyConnectedLayer(1,'Name','fc4_s') 150 | reluLayer('Name','relu4_s') 151 | scalingLayer('Name','stdPathOut','Scale',5) 152 | ]; 153 | 154 | actorNetwork = layerGraph(commonPath); 155 | actorNetwork = addLayers(actorNetwork,meanPath); 156 | actorNetwork = addLayers(actorNetwork,sdevPath); 157 | 158 | actorNetwork = connectLayers(actorNetwork,"comPathOut","meanFC/in"); 159 | actorNetwork = connectLayers(actorNetwork,"comPathOut","stdFC/in"); 160 | actorNetwork = dlnetwork(actorNetwork); 161 | actorOptions = rlOptimizerOptions('Optimizer','adam','LearnRate',1e-4,'GradientThreshold',1,'L2RegularizationFactor',1e-5); %Use GPU for Training 162 | 163 | actor = rlContinuousGaussianActor(actorNetwork,obsInfo,actInfo,'ActionMeanOutputNames',{'meanPathOut'}, ... 164 | 'ActionStandardDeviationOutputNames',{'stdPathOut'},'ObservationInputNames',{'obsInLyr'},'UseDevice','gpu'); %Use GPU for Training 165 | 166 | % Design, visualize, and train deep learning networks 167 | % View the actor network configuration. 168 | figure('Name','Actor Network'); 169 | plot(layerGraph(actorNetwork)); 170 | 171 | %% Agent Options 172 | % To create the SAC agent, first specify the SAC agent options using rlSACAgentOptions. 173 | agentOptions = rlSACAgentOptions(... 174 | 'SampleTime',T_Sample,... 175 | 'TargetSmoothFactor',1e-3,... 176 | 'SaveExperienceBufferWithAgent',true, ... 177 | 'ExperienceBufferLength',1e8,... 178 | 'DiscountFactor',0.99,... 179 | 'MiniBatchSize',64, ... 180 | 'NumWarmStartSteps',1000); 181 | 182 | for ct =1:2 183 | 184 | agentOptions.CriticOptimizerOptions(ct) = criticOptions; 185 | 186 | end 187 | 188 | % Then, create the SAC agent using the specified actor representation, critic representation, and agent options. 189 | % For more information, see rlSACAgent. 190 | agent = rlSACAgent(actor,[critic_1,critic_2],agentOptions); 191 | 192 | %% Specify Training Options and Train Agent 193 | % For this example, the training options for the DDPG and TD3 agents are the same. 194 | % Run each training session for 5000 episodes with each episode lasting at most maxSteps time steps. 195 | % Display the training progress in the Episode Manager dialog box (set the Plots option) and disable the command line display (set the Verbose option). 196 | % Terminate the training only when it reaches the maximum number of episodes (maxEpisodes). 
Doing so allows the comparison of the learning curves for multiple agents over the entire training session.
197 | maxEpisodes = 1000000;
198 | maxSteps = floor(T_Total/T_Sample);
199 | trainingOptions = rlTrainingOptions(...
200 |     'MaxEpisodes',maxEpisodes,...
201 |     'MaxStepsPerEpisode',maxSteps,...
202 |     'ScoreAveragingWindowLength',100,...
203 |     'Verbose',true,...
204 |     'Plots','training-progress',...
205 |     'StopTrainingCriteria','EpisodeCount',...
206 |     'StopTrainingValue',maxEpisodes,...
207 |     'SaveAgentCriteria','EpisodeSteps',...
208 |     'SaveAgentValue',500, ...
209 |     'SaveAgentDirectory','saved_Agents01' ...
210 |     );
211 |
212 | % To train the agent in parallel, specify the following training options.
213 | % Training in parallel requires Parallel Computing Toolbox™.
214 | % If you do not have Parallel Computing Toolbox software installed, set UseParallel to false.
215 | % Set the UseParallel option to true.
216 | % Train the agent in parallel asynchronously.
217 | if parallelComputing_flag==1
218 |     %save_system(mdl);
219 |     num_cores = feature('numcores'); % Get number of CPU cores
220 |     parpool(floor(num_cores*.75));   % Use 75% of available cores
221 |     trainingOptions.UseParallel = true;
222 |     trainingOptions.ParallelizationOptions.Mode = 'async';
223 | end
224 |
225 | %% Train the agent.
226 | trainingStats = train(agent,rl_env,trainingOptions)
227 |
228 | %% Simulate SAC Agent
229 | % To validate the performance of the trained agent, simulate the agent within the Simulink environment using the following commands.
230 | % For more information on agent simulation, see rlSimulationOptions and sim.
231 |
232 | simOptions = rlSimulationOptions('MaxSteps',maxSteps);
233 | experience = sim(rl_env,agent,simOptions);
234 |
235 | %% Reset Function Definition
236 | function in = localResetFcn(in)
237 |     mdl = 'Your Simulink Model Name';
238 |     in = Simulink.SimulationInput(mdl);
239 |
240 |     % LOGIC TO INITIALIZE A VARIABLE HERE
241 |     % alt = answer;
242 |
243 |     % Change the value in the model workspace
244 |     mdlWks = get_param(mdl,'ModelWorkspace');
245 |     assignin(mdlWks,'variable name',alt) % Assigns the value to the model workspace variable
246 | end
247 |
248 | %% Credits
249 | % Developed with the help of MathWorks and its documentation.
250 | % Talha Bin Riaz
--------------------------------------------------------------------------------
/TD3_Generalized_Training_Code.m:
--------------------------------------------------------------------------------
1 | %% TD3 Model Network and Code
2 | %% Flags / Settings
3 | parallelComputing_flag = 0; % Whether to use parallel computing
4 | load_Saved_Agent_flag = 0;  % Whether to load a previously saved agent
5 | %% Load Saved Agent
6 | if load_Saved_Agent_flag == 1
7 |     savedAgent_dir = 'saved_Agents01';
8 |     listing = dir(fullfile(savedAgent_dir, '*.mat'));
9 |     for i = 1:length(listing)
10 |         temp_String = string(listing(i).name);
11 |         temp_String = extractAfter(temp_String,5);      % strip the leading 'Agent'
12 |         temp_String = extractBefore(temp_String,'.mat');
13 |         agent_names(i,1) = str2double(temp_String);
14 |
15 |     end
16 |     sorted_agent_names = sort(agent_names,'ascend');
17 |     last_Agent = sorted_agent_names(end);
18 |     agent_Name = append('\Agent',num2str(last_Agent), '.mat');
19 |     load([savedAgent_dir agent_Name]);
20 |     [ep_reward, ep_no] = max(savedAgentResult.EpisodeReward); % episode with the highest reward
21 |     load([savedAgent_dir append('\Agent', num2str(ep_no), '.mat')]);
22 |     plot(savedAgentResult)
23 | end
24 | %% Model Initialization
25 | mdl = 'Your Simulink Mdl Name';
26 | open_system(mdl); % Opens the Simulink model
27 | agentblk = [mdl '/Location/To/RL Agent Block']; % Replace with the path to the RL Agent block in Simulink
28 |
29 | %% Sample Time & Simulation Duration
30 | T_Sample = 0.1; % Replace with your own sample time
31 | T_Total = 205;  % Replace with your own total simulation time
32 | set_param(mdl,'StartTime','0','StopTime',int2str(T_Total)); % Set start and stop time in Simulink
33 |
34 | %% Observation Info
35 | numObs = 5; % Enter the number of observations
36 | obsInfo = rlNumericSpec([numObs 1],'LowerLimit',-inf*ones(numObs,1),'UpperLimit',inf*ones(numObs,1)); % Lower and upper limits of -/+ Inf
37 | obsInfo.Name = 'Observations';
38 | obsInfo.Description = 'Describe your observation vector here';
39 | numOfObservations = obsInfo.Dimension(1);
40 |
41 | %% Action Info
42 |
43 | Act1_Min = -5;
44 | Act1_Max = 5;
45 |
46 | % theta Controller (second action range, unused while numAct = 1)
47 | Act2_Min = -0.04;
48 | Act2_Max = 0.04;
49 |
50 | % Action Object
51 | numAct = 1; % Number of actions
52 | actInfo = rlNumericSpec([numAct 1],'LowerLimit',Act1_Min*ones(numAct,1),'UpperLimit',Act1_Max*ones(numAct,1));
53 | actInfo.Name = 'Action';
54 | numActions = actInfo.Dimension(1);
55 |
56 | %% Create Environment
57 | %rl_env = rlSimulinkEnv(mdl, agentblk, obsInfo,
58 | %actInfo,'UseFastRestart','on'); % if you want to use Fast Restart
59 | rl_env = rlSimulinkEnv(mdl, agentblk, obsInfo, actInfo); % Creates the environment
60 | %% Environment Reset Function
61 | % To define the initial condition for a given variable, specify an environment reset function using an anonymous function handle.
62 | % The reset function localResetFcn is defined at the end of this script.
63 | rl_env.ResetFcn = @(in)localResetFcn(in);
64 | % Fix the random number generator seed for reproducibility.
65 | rng('default')
66 |
67 | %% Create Agent TD3
68 | % A TD3 agent approximates the long-term reward given observations and actions using a critic value function representation.
69 | % To create the critic, first create a deep neural network with two inputs, the state and action, and one output.
70 | % For more information on creating a neural network value function representation, see Create Policy and Value Function Representations.
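% TD3 trains two critics with the same architecture. Both are regressed toward
% y = r + gamma*min(Q1'(s',a~), Q2'(s',a~)), where a~ is the target actor's action
% plus clipped noise (TargetPolicySmoothModel, configured further below); taking
% the minimum of the two target critics reduces overestimation bias.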
71 | nI = obsInfo.Dimension(1); % number of inputs 72 | nL = 128; % number of neurons 73 | nO = actInfo.Dimension(1); % number of outputs 74 | 75 | statePath = [ 76 | featureInputLayer(nI,'Normalization','none','Name','observation') 77 | fullyConnectedLayer(nL,'Name','fc1') 78 | reluLayer('Name','relu1') 79 | fullyConnectedLayer(nL,'Name','fc2') 80 | additionLayer(2,'Name','add') 81 | reluLayer('Name','relu2') 82 | fullyConnectedLayer(nL,'Name','fc3') 83 | reluLayer('Name','relu3') 84 | fullyConnectedLayer(1,'Name','fc4')]; 85 | 86 | actionPath = [ 87 | featureInputLayer(nO,'Normalization','none','Name','action') 88 | fullyConnectedLayer(nL, 'Name', 'fc5')]; 89 | 90 | %% Critic Netwrok 91 | criticNetwork = layerGraph(statePath); 92 | criticNetwork = addLayers(criticNetwork, actionPath); 93 | 94 | criticNetwork = connectLayers(criticNetwork,'fc5','add/in2'); 95 | 96 | criticNetwork = dlnetwork(criticNetwork,Initialize=false); 97 | % Specify options for the critic representation using rlRepresentationOptions. 98 | criticOptions = rlOptimizerOptions('Optimizer','adam','LearnRate',1e-3,'GradientThreshold',1,'L2RegularizationFactor',2e-4); %Use GPU for Training 99 | 100 | criticNetwork_1 = initialize(criticNetwork); 101 | criticNetwork_2 = initialize(criticNetwork); 102 | 103 | % Create the critic representation using the specified neural network and options. 104 | % You must also specify the action and observation info for the critic, which you obtain from the environment interface. 105 | % For more information, see rlQValueRepresentation. 106 | critic_1 = rlQValueFunction(criticNetwork_1,obsInfo,actInfo,'Observation',{'observation'},'Action',{'action'}); 107 | critic_2 = rlQValueFunction(criticNetwork_2,obsInfo,actInfo,'Observation',{'observation'},'Action',{'action'}); 108 | 109 | % Design, visualize, and train deep learning networks 110 | % View the critic network configuration. 111 | figure('Name','Critic Network'); 112 | plot(criticNetwork); 113 | 114 | %% Actor Netwrok 115 | % A TD3 agent decides which action to take given observations by using an actor representation. 116 | % To create the actor, first create a deep neural network with one input, the observation, and one output, the action. 117 | % Construct the actor similarly to the critic. For more information, see rlDeterministicActorRepresentation. 118 | 119 | actorNetwork = [ 120 | featureInputLayer(nI,'Normalization','none','Name','observation') 121 | fullyConnectedLayer(nL,'Name','fc1') 122 | reluLayer('Name','relu1') 123 | fullyConnectedLayer(nL,'Name','fc2') 124 | reluLayer('Name','relu2') 125 | fullyConnectedLayer(nL,'Name','fc3') 126 | reluLayer('Name','relu3') 127 | fullyConnectedLayer(1,'Name','fc4') 128 | tanhLayer('Name','tanh1') 129 | % scalingLayer('Name','ActorScaling1','Scale',5)]; 130 | scalingLayer('Name','ActorScaling1','Scale',5,'Bias',-0.5)]; 131 | 132 | actorNetwork = dlnetwork(actorNetwork); 133 | actorOptions = rlOptimizerOptions('Optimizer','adam','LearnRate',1e-4,'GradientThreshold',1,'L2RegularizationFactor',1e-5); %Use GPU for Training 134 | 135 | actor = rlContinuousDeterministicActor(actorNetwork,obsInfo,actInfo,'ObservationInputNames',{'observation'}); %Use GPU for Training 136 | 137 | % Design, visualize, and train deep learning networks 138 | % View the actor network configuration. 139 | figure('Name','Actor Network'); 140 | plot(layerGraph(actorNetwork)); 141 | 142 | %% Agent Options 143 | % To create the TD3 agent, first specify the TD3 agent options using rlTD3AgentOptions. 
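% Two separate noise models are configured below: ExplorationModel adds Gaussian
% noise to the action applied in the environment (exploration), while
% TargetPolicySmoothModel adds noise to the target actor's action when forming
% the critic targets (target policy smoothing).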
144 | agentOptions = rlTD3AgentOptions(... 145 | 'SampleTime',T_Sample,... 146 | 'TargetSmoothFactor',1e-3,... 147 | 'SaveExperienceBufferWithAgent',true, ... 148 | 'ExperienceBufferLength',1e8,... 149 | 'DiscountFactor',0.99,... 150 | 'MiniBatchSize',64); 151 | 152 | agentOptions.ActorOptimizerOptions = actorOptions; 153 | agentOptions.CriticOptimizerOptions = criticOptions; 154 | agentOptions.ExplorationModel.Variance = 0.05; 155 | agentOptions.ExplorationModel.VarianceDecayRate = 2e-04; 156 | agentOptions.ExplorationModel.VarianceMin = 0.001; 157 | agentOptions.TargetPolicySmoothModel.Variance = 0.1; 158 | agentOptions.TargetPolicySmoothModel.VarianceDecayRate = 1e-04; 159 | 160 | 161 | 162 | for ct =1:2 163 | 164 | agentOptions.CriticOptimizerOptions(ct) = criticOptions; 165 | 166 | end 167 | 168 | % Then, create the TD3 agent using the specified actor representation, critic representation, and agent options. 169 | % For more information, see rlTD3Agent. 170 | agent = rlTD3Agent(actor,[critic_1,critic_2],agentOptions); 171 | 172 | %% Specify Training Options and Train Agent 173 | % For this example, the training options for the DDPG and TD3 agents are the same. 174 | % Run each training session for 5000 episodes with each episode lasting at most maxSteps time steps. 175 | % Display the training progress in the Episode Manager dialog box (set the Plots option) and disable the command line display (set the Verbose option). 176 | % Terminate the training only when it reaches the maximum number of episodes (maxEpisodes). Doing so allows the comparison of the learning curves for multiple agents over the entire training session. 177 | % Define maximum episodes and steps 178 | % Define maximum episodes and steps 179 | maxEpisodes = 1000000; 180 | maxSteps = floor(T_Total / T_Sample); 181 | 182 | % Configure Parallelization Options 183 | parallelOptions = rl.option.ParallelTraining(... 184 | 'Mode', 'async'); 185 | 186 | % Define training options for the reinforcement learning agent 187 | trainingOptions = rlTrainingOptions(... 188 | 'MaxEpisodes', maxEpisodes, ... 189 | 'MaxStepsPerEpisode', maxSteps, ... 190 | 'ScoreAveragingWindowLength', 100, ... 191 | 'Verbose', true, ... 192 | 'Plots', 'training-progress', ... 193 | 'StopTrainingCriteria', 'EpisodeCount', ... 194 | 'StopTrainingValue', maxEpisodes, ... 195 | 'SaveAgentCriteria', 'EpisodeSteps', ... 196 | 'SaveAgentValue', 900, ... 197 | 'SaveAgentDirectory', 'savedAgents_1', ... 198 | 'UseParallel', true, ... % Enable parallel training 199 | 'ParallelizationOptions', parallelOptions); 200 | 201 | 202 | % To train the agent in parallel, specify the following training options. 203 | % Training in parallel requires Parallel Computing Toolbox™. 204 | % If you do not have Parallel Computing Toolbox software installed, set UseParallel to false. 205 | % Set the UseParallel option to true. 206 | % Train the agent in parallel asynchronously. 207 | % After every 32 steps, have each worker send experiences to the parallel pool client (the MATLAB® process which starts the training). 208 | % DDPG and TD3 agents require workers to send experiences to the client. 209 | if parallelComputing_flag==1 210 | save_system(mdl); 211 | % Set up the parallel pool using 75% of available CPU cores 212 | num_cores = feature('numcores'); 213 | parpool(floor(num_cores * 0.5)); 214 | % Ensure the GPU is selected on all workers 215 | parfevalOnAll(@() gpuDevice(1), 0); 216 | end 217 | 218 | %% Train the agent. 
219 | trainingStats = train(agent,rl_env,trainingOptions)
220 |
221 | %% Simulate TD3 Agent
222 | % To validate the performance of the trained agent, simulate the agent within the Simulink environment using the following commands.
223 | % For more information on agent simulation, see rlSimulationOptions and sim.
224 |
225 | simOptions = rlSimulationOptions('MaxSteps',maxSteps);
226 | experience = sim(rl_env,agent,simOptions);
227 |
228 | %% Reset Function Definition
229 | function in = localResetFcn(in)
230 |     mdl = 'Your Simulink Model Name';
231 |     in = Simulink.SimulationInput(mdl);
232 |
233 |     % LOGIC TO INITIALIZE A VARIABLE HERE
234 |     % alt = answer;
235 |
236 |     % Change the value in the model workspace
237 |     mdlWks = get_param(mdl,'ModelWorkspace');
238 |     assignin(mdlWks,'variable name',alt) % Assigns the value to the model workspace variable
239 | end
240 |
241 | %% Credits
242 | % Developed with the help of MathWorks and its documentation.
243 | % Talha Bin Riaz
244 |
245 |
--------------------------------------------------------------------------------
/TRPO_Generalized_Training_Code.m:
--------------------------------------------------------------------------------
1 | %% TRPO Model Network and Code
2 | %% Flags / Settings
3 | parallelComputing_flag = 0; % Whether to use parallel computing
4 | load_Saved_Agent_flag = 0;  % Whether to load a previously saved agent
5 | %% Load Saved Agent
6 | if load_Saved_Agent_flag == 1
7 |     savedAgent_dir = 'saved_Agents01';
8 |     listing = dir(fullfile(savedAgent_dir, '*.mat'));
9 |     for i = 1:length(listing)
10 |         temp_String = string(listing(i).name);
11 |         temp_String = extractAfter(temp_String,5);      % strip the leading 'Agent'
12 |         temp_String = extractBefore(temp_String,'.mat');
13 |         agent_names(i,1) = str2double(temp_String);
14 |
15 |     end
16 |     sorted_agent_names = sort(agent_names,'ascend');
17 |     last_Agent = sorted_agent_names(end);
18 |     agent_Name = append('\Agent',num2str(last_Agent), '.mat');
19 |     load([savedAgent_dir agent_Name]);
20 |     [ep_reward, ep_no] = max(savedAgentResult.EpisodeReward); % episode with the highest reward
21 |     load([savedAgent_dir append('\Agent', num2str(ep_no), '.mat')]);
22 |     plot(savedAgentResult)
23 | end
24 | %% Model Initialization
25 | mdl = 'Your Simulink Mdl Name';
26 | open_system(mdl); % Opens the Simulink model
27 | agentblk = [mdl '/Location/To/RL Agent Block']; % Replace with the path to the RL Agent block in Simulink
28 |
29 | %% Sample Time & Simulation Duration
30 | T_Sample = 0.1; % Replace with your own sample time
31 | T_Total = 205;  % Replace with your own total simulation time
32 | set_param(mdl,'StartTime','0','StopTime',int2str(T_Total)); % Set start and stop time in Simulink
33 |
34 | %% Observation Info
35 | numObs = 5; % Enter the number of observations
36 | obsInfo = rlNumericSpec([numObs 1],'LowerLimit',-inf*ones(numObs,1),'UpperLimit',inf*ones(numObs,1)); % Lower and upper limits of -/+ Inf
37 | obsInfo.Name = 'Observations';
38 | obsInfo.Description = 'Describe your observation vector here';
39 | numOfObservations = obsInfo.Dimension(1);
40 |
41 | %% Action Info
42 |
43 | Act1_Min = -5;
44 | Act1_Max = 5;
45 |
46 | % theta Controller (second action range, unused while numAct = 1)
47 | Act2_Min = -0.04;
48 | Act2_Max = 0.04;
49 |
50 | % Action Object
51 | numAct = 1; % Number of actions
52 | actInfo = rlNumericSpec([numAct 1],'LowerLimit',Act1_Min*ones(numAct,1),'UpperLimit',Act1_Max*ones(numAct,1));
53 | actInfo.Name = 'Action';
54 | numActions = actInfo.Dimension(1);
55 |
56 | %% Create Environment
57 | %rl_env = rlSimulinkEnv(mdl, agentblk, obsInfo,
58 | %actInfo,'UseFastRestart','on'); % if you want to use Fast Restart
59 | rl_env =
rlSimulinkEnv(mdl, agentblk, obsInfo, actInfo); % Creates Environment 60 | %% Environment Reset Function 61 | % To define the initial condition for the certain variable, specify an environment reset function using an anonymous function handle. 62 | % The reset function localResetFcn, which is defined at the end of the example. 63 | rl_env.ResetFcn = @(in)localResetFcn(in); 64 | % Fix the random generator seed for reproducibility. 65 | rng('default') 66 | 67 | %% Create Agent TRPO 68 | % A TRPO agent approximates the long-term reward given observations and actions using a critic value function representation. 69 | % To create the critic, first create a deep neural network with two inputs, the state and action, and one output. 70 | % For more information on creating a neural network value function representation, see Create Policy and Value Function Representations. 71 | nI = obsInfo.Dimension(1); % number of inputs 72 | nL = 128; % number of neurons 73 | nO = actInfo.Dimension(1); % number of outputs 74 | 75 | criticNet = [ 76 | featureInputLayer(nI,'Normalization','none','Name','observation') 77 | fullyConnectedLayer(nL,'Name','fc1') 78 | reluLayer('Name','relu1') 79 | fullyConnectedLayer(nL,'Name','fc2') 80 | reluLayer('Name','relu2') 81 | fullyConnectedLayer(nL,'Name','fc3') 82 | reluLayer('Name','relu3') 83 | fullyConnectedLayer(1,'Name','fc4')]; 84 | 85 | 86 | %% Critic Netwrok 87 | criticNetwork = layerGraph(criticNet); 88 | 89 | % criticNetwork = dlnetwork(criticNetwork,Initialize=false); 90 | % Specify options for the critic representation using rlRepresentationOptions. 91 | criticOptions = rlOptimizerOptions('Optimizer','adam','LearnRate',1e-3,'GradientThreshold',1,'L2RegularizationFactor',2e-4); %Use GPU for Training %LR: 1e-3; GT: 1 92 | 93 | % Create the critic representation using the specified neural network and options. 94 | % You must also specify the action and observation info for the critic, which you obtain from the environment interface. 95 | % For more information, see rlQValueRepresentation. 96 | critic = rlValueFunction(criticNetwork,obsInfo,'Observation',{'observation'},'UseDevice',"gpu"); 97 | 98 | % Design, visualize, and train deep learning networks 99 | % View the critic network configuration. 100 | figure('Name','Critic Network'); 101 | plot(criticNetwork); 102 | 103 | %% Actor Netwrok 104 | % A TRPO agent decides which action to take given observations by using an actor representation. 105 | % To create the actor, first create a deep neural network with one input, the observation, and one output, the action. 106 | % Construct the actor similarly to the critic. For more information, see rlDeterministicActorRepresentation. 
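% TRPO constrains each policy update so that the KL divergence between the old and
% new Gaussian policies stays within KLDivergenceLimit (set in the agent options
% further below); the update direction is computed with conjugate gradient and
% accepted via a backtracking line search.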
107 | 108 | 109 | commonPath = [ 110 | featureInputLayer(nI,'Normalization','none','Name','comPathIn') 111 | fullyConnectedLayer(nL,'Name','fc1_c') 112 | reluLayer('Name','relu1_c') 113 | fullyConnectedLayer(nL,'Name','fc2_c') 114 | reluLayer('Name','comPathOut') 115 | ]; 116 | 117 | meanPath = [ 118 | fullyConnectedLayer(1,'Name','meanPathIn') 119 | tanhLayer('Name','tanh1_m') 120 | scalingLayer('Name','meanPathOut','Scale',5,'Bias',-0.5) 121 | ]; 122 | 123 | sdevPath = [ 124 | fullyConnectedLayer(1,'Name','stdPathIn') 125 | softplusLayer('Name','stdPathOut') 126 | ]; 127 | 128 | actorNetwork = layerGraph(commonPath); 129 | actorNetwork = addLayers(actorNetwork,meanPath); 130 | actorNetwork = addLayers(actorNetwork,sdevPath); 131 | 132 | actorNetwork = connectLayers(actorNetwork,"comPathOut","meanPathIn/in"); 133 | actorNetwork = connectLayers(actorNetwork,"comPathOut","stdPathIn/in"); 134 | 135 | actorOptions = rlOptimizerOptions('Optimizer','adam','LearnRate',1e-4,'GradientThreshold',1,'L2RegularizationFactor',1e-5); 136 | 137 | actor = rlContinuousGaussianActor(actorNetwork,obsInfo,actInfo,'ActionMeanOutputNames',{'meanPathOut'}, ... 138 | 'ActionStandardDeviationOutputNames',{'stdPathOut'},'ObservationInputNames',{'comPathIn'},'UseDevice','gpu'); %Use GPU for Training 139 | 140 | % Design, visualize, and train deep learning networks 141 | % View the actor network configuration. 142 | figure('Name','Actor Network'); 143 | plot(layerGraph(dlnetwork(actorNetwork))); 144 | 145 | %% Agent Options 146 | % To create the TRPO agent, first specify the TRPO agent options using rlTRPOAgentOptions. 147 | agentOptions = rlTRPOAgentOptions(... 148 | 'SampleTime',T_Sample,... 149 | 'CriticOptimizerOptions',criticOptions, ... % 'UseDeterministicExploitation',1,... 150 | 'DiscountFactor',0.99,... 151 | 'MiniBatchSize',128,'AdvantageEstimateMethod',"gae", ... 152 | "ExperienceHorizon",512,"EntropyLossWeight",0.95, ... 153 | "KLDivergenceLimit",0.02,"NumIterationsConjugateGradient",10, ... 154 | "NumEpoch",3,"GAEFactor",0.95,"NumIterationsLineSearch",10, ... 155 | "ConjugateGradientDamping",0.01,"ConjugateGradientResidualTolerance",1e-08); 156 | 157 | % Then, create the TRPO agent using the specified actor representation, critic representation, and agent options. 158 | % For more information, see rlTRPOAgent. 159 | agent = rlTRPOAgent(actor,critic,agentOptions); 160 | 161 | %% Specify Training Options and Train Agent 162 | % For this example, the training options for the DDPG and TD3 agents are the same. 163 | % Run each training session for 5000 episodes with each episode lasting at most maxSteps time steps. 164 | % Display the training progress in the Episode Manager dialog box (set the Plots option) and disable the command line display (set the Verbose option). 165 | % Terminate the training only when it reaches the maximum number of episodes (maxEpisodes). Doing so allows the comparison of the learning curves for multiple agents over the entire training session. 166 | maxEpisodes = 1000000; 167 | maxSteps = floor(T_Total/T_Sample); 168 | trainingOptions = rlTrainingOptions(... 169 | 'MaxEpisodes',maxEpisodes,... 170 | 'MaxStepsPerEpisode',maxSteps,... 171 | 'ScoreAveragingWindowLength',100,... 172 | 'Verbose',true,... 173 | 'Plots','training-progress',... 174 | 'StopTrainingCriteria','EpisodeCount',... 175 | 'StopTrainingValue',maxEpisodes,... 176 | 'SaveAgentCriteria','EpisodeSteps',... 177 | 'SaveAgentValue',1000, ... 178 | 'SaveAgentDirectory','savedAgents_1' ,'UseParallel',1 ... 
179 |     );
180 |
181 | % To train the agent in parallel, specify the following training options.
182 | % Training in parallel requires Parallel Computing Toolbox™.
183 | % If you do not have Parallel Computing Toolbox software installed, set UseParallel to false.
184 | % Set the UseParallel option to true.
185 | % Train the agent in parallel asynchronously.
186 | % After every 32 steps, have each worker send experiences to the parallel pool client (the MATLAB® process which starts the training).
187 | % Like DDPG and TD3 agents, TRPO agents require workers to send experiences to the client.
188 | if parallelComputing_flag==1
189 |     save_system(mdl);
190 |     num_cores = feature('numcores'); % Get number of CPU cores
191 |     parpool(floor(num_cores*.25));   % Use 25% of available cores
192 |     trainingOptions.UseParallel = true;
193 |     trainingOptions.ParallelizationOptions.Mode = 'async';
194 |     trainingOptions.ParallelizationOptions.StepsUntilDataIsSent = 32;
195 |     trainingOptions.ParallelizationOptions.DataToSendFromWorkers = 'Experiences';
196 | end
197 |
198 | %% Train the agent.
199 | trainingStats = train(agent,rl_env,trainingOptions)
200 |
201 | %% Simulate TRPO Agent
202 | % To validate the performance of the trained agent, simulate the agent within the Simulink environment using the following commands.
203 | % For more information on agent simulation, see rlSimulationOptions and sim.
204 |
205 | simOptions = rlSimulationOptions('MaxSteps',maxSteps);
206 | experience = sim(rl_env,agent,simOptions);
207 |
208 | %% Reset Function Definition
209 | function in = localResetFcn(in)
210 |     mdl = 'Your Simulink Model Name';
211 |     in = Simulink.SimulationInput(mdl);
212 |
213 |     % LOGIC TO INITIALIZE A VARIABLE HERE
214 |     % alt = answer;
215 |
216 |     % Change the value in the model workspace
217 |     mdlWks = get_param(mdl,'ModelWorkspace');
218 |     assignin(mdlWks,'variable name',alt) % Assigns the value to the model workspace variable
219 | end
220 |
221 | %% Credits
222 | % Developed with the help of MathWorks and its documentation.
223 | % Talha Bin Riaz
224 |
225 |
--------------------------------------------------------------------------------