├── README.md
├── integral_reinforcement_learning
    ├── irl_main.m
    └── irl_ode.m
├── model_free_integral_reinforcement_learning
    ├── README.md
    ├── eg1
    │   ├── eg1_c1_main.m
    │   ├── eg1_c1_ode.m
    │   ├── eg1_c1_parallel_ode.m
    │   ├── eg1_c2_main.m
    │   ├── eg1_c2_nn_pf.m
    │   ├── eg1_c2_nn_pf_dphi.m
    │   └── eg1_c2_ode.m
    └── eg2
    │   ├── eg2_c1_main.m
    │   ├── eg2_c1_nn_pf.m
    │   ├── eg2_c1_nn_pf_dphi.m
    │   ├── eg2_c1_ode.m
    │   ├── eg2_c2_main.m
    │   └── eg2_c2_ode.m
├── model_free_nonzero_sum_games
    ├── README.md
    ├── eg_cacc
    │   └── cacc_model_free_learning.m
    └── eg_two_player_ndg
    │   └── tndg_model_free_learning.m
├── online_learning_policy_update
    └── puol_algorithm.m
├── online_learning_without_initial_admissible_control
    └── ol_algorithm.m
├── parallel_control_based_optimal_tracking
    ├── pc_ot_adp_main.m
    ├── pc_ot_adp_nn_pf_dphi.m
    └── pc_ot_adp_ode.m
├── policy_iteration
    ├── pi_algorithm.m
    ├── pi_icp.m
    ├── pi_state_data.m
    ├── pi_validation.m
    ├── training_data
    │   ├── actor_init.mat
    │   └── state_data.mat
    └── training_results
    │   └── actor_critic.mat
├── policy_iteration_continuous_time
    └── pi_algorithm.m
├── value_iteration
    ├── training_data
    │   └── state_data.mat
    ├── training_results
    │   └── actor_critic.mat
    ├── vi_algorithm.m
    ├── vi_state_data.m
    └── vi_validation.m
└── value_iteration_positive_semi_definite_initial_value_function
    ├── vi_controlled_system.m
    ├── vi_cost_function.m
    ├── vi_pf_actor.m
    ├── vi_pf_critic.m
    ├── vi_pf_main.m
    └── vi_state_data.m


/README.md:
--------------------------------------------------------------------------------
 1 | # Adaptive Dynamic Programming and Reinforcement Learning (ADPRL) for nonlinear optimal control
 2 | Adaptive dynamic programming (ADP), also known as approximate dynamic programming, neuro-dynamic programming, and reinforcement learning (RL), is a class of promising techniques to solve the problems of optimal control for discrete-time (DT) and continuous-time (CT) nonlinear systems.
 3 | 
 4 |  MATLAB codes of ADPRL, including iterative and online ADPRL, are provided. Currently added ADPRL algorithms include:
 5 |  - [Value iteraion for DT systems](https://ieeexplore.ieee.org/abstract/document/4554208)
 6 |  - [Value iteraion for DT systems (positive semi definite initial value function)](https://ieeexplore.ieee.org/abstract/document/7314890)
 7 |  - [Policy iteration for DT systems](https://ieeexplore.ieee.org/abstract/document/6609085)
 8 |  - [Policy iteration for CT systems](https://books.google.com/books?hl=zh-CN&lr=&id=U3Gtlot_hYEC&oi=fnd&pg=PR11&dq=optimal+control&ots=wd8jE5BAjv&sig=Gyy6RyPbILR_8hoybCd8etcX41o#v=onepage&q&f=false)
 9 |  - [Integral reinforcement learning for partially unknown CT systems](https://www.sciencedirect.com/science/article/abs/pii/S0893608009000446)
10 |  - [Model-free integral reinforcement learning for completely unknown nonaffine CT systems](https://www.sciencedirect.com/science/article/pii/S0925231224001929)
11 |  - [Model-free nonzero-sum games for completely unknown nonaffine CT systems](https://ieeexplore.ieee.org/document/10849990)
12 |  - [Online learning policy update for DT systems](https://ieeexplore.ieee.org/abstract/document/6208889)
13 |  - [Online learning without initial admissible control for CT systems](https://ieeexplore.ieee.org/document/5717676)
14 |  - [Parallel control-based optimal tracking for nonaffine CT systems](https://www.ieee-jas.net/en/article/doi/10.1109/JAS.2020.1003426)
15 | 
16 | ### Requirements
17 | ********
18 | - MATLAB
19 | 


--------------------------------------------------------------------------------
/integral_reinforcement_learning/irl_main.m:
--------------------------------------------------------------------------------
 1 | % This demo checks the feasibility of the integral reinforcement learning (IRL) algorithm
 2 | 
 3 | % By J. Lu
 4 | % Date: Nov. 23, 2020 
 5 | 
 6 | %-------------------------------- start -----------------------------------
 7 | clear; close all; clc;
 8 | 
 9 | % parameters
10 | global K; global u; global A; global B; global Q; global R;
11 | P = zeros(4,4);
12 | uu = 0; % record control signal u
13 | 
14 | % system matrices
15 | A = [-0.0665   11.5       0         0;
16 |          0     -2.5      2.5        0;
17 |        -9.5      0    -13.736  -13.736;
18 |         0.6      0       0         0];
19 | B = [0 0 13.736 0]';
20 | Q = diag([1,1,1,1]);
21 | R = 1;
22 | 
23 | % initial condition
24 | x0 = [0 0.1 0 0 0]; % x(5) is the integral of x'Qx + u'Ru
25 | 
26 | % a stable K
27 | K = [0.8267    1.7003    0.7049    0.4142];
28 | pole = eig(A - B*K)
29 | 
30 | % simulation parameters
31 | Fsamples = 120;
32 | T = 0.05; % sample time
33 | SS = 0.001; % step size for simulation
34 | j = 0;
35 | nop = 10;
36 | tol = 1e-4;
37 | 
38 | figure(1),hold on;
39 | xlabel('Time (s)');
40 | ylabel('State trajectories'); 
41 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
42 | grid on;
43 | for k=1:Fsamples
44 |     j = j + 1;
45 |     tspan = 0:SS:T;
46 |     [t,x]= ode23('irl_ode',tspan,x0);
47 |     uu = [uu u];
48 |     figure(1),plot(t + T*(k-1),x(:,1),'b',t + T*(k-1),x(:,2),'r',t + T*(k-1),x(:,3),'g',t + T*(k-1),x(:,4),'k','linewidth',1),hold on;
49 |     x1 = x(length(t),1:4);
50 |     X(j,:) = [ x0(1)^2 x0(1)*x0(2) x0(1)*x0(3) x0(1)*x0(4) ...
51 |                x0(2)^2 x0(2)*x0(3) x0(2)*x0(4)...
52 |                x0(3)^2 x0(3)*x0(4) x0(4)^2 ] - ...
53 |              [ x1(1)^2 x1(1)*x1(2) x1(1)*x1(3) x1(1)*x1(4) ...
54 |                x1(2)^2 x1(2)*x1(3) x1(2)*x1(4)...
55 |                x1(3)^2 x1(3)*x1(4) x1(4)^2 ];
56 |     
57 |     Y(j,:) = x(length(t),5) - x(1,5);
58 |     x0 = [x(length(t),1:4),x(length(t),5)];
59 | 
60 |     if mod(k,nop) == 0 && norm(X) > tol
61 |         weights = pinv(X)*Y;
62 |         P=[ weights(1)    weights(2)/2  weights(3)/2  weights(4)/2;...
63 |             weights(2)/2  weights(5)    weights(6)/2  weights(7)/2;...
64 |             weights(3)/2  weights(6)/2  weights(8)    weights(9)/2;...
65 |             weights(4)/2  weights(7)/2  weights(9)/2  weights(10)      ];
66 |         K = inv(R)*B'*P;
67 |         jj = 1;
68 |         X = zeros(nop,10);
69 |         Y = zeros(nop,1);
70 |         j = 0;
71 |     end
72 | end
73 | 
74 | P
75 | [~,Popt] = lqr(A,B,Q,R)
76 | 
77 | % plot result
78 | figure(1), hold off;
79 | 
80 | figure(2),plot(T*[0:Fsamples],uu,'b','linewidth',1)
81 | xlabel('Time (s)');
82 | ylabel('Control trajectory'); 
83 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
84 | grid on;
85 | 
86 | 
87 | 


--------------------------------------------------------------------------------
/integral_reinforcement_learning/irl_ode.m:
--------------------------------------------------------------------------------
 1 | function xdot = irl_ode(~,x)
 2 | % parameters
 3 | global K; global u; global A; global B; global Q; global R;
 4 | 
 5 | x = [x(1) x(2) x(3) x(4)]';
 6 | 
 7 | % calculate the control signal
 8 | u = -K*x;
 9 | 
10 | xdot = [A*x + B*u; x'*Q*x + u'*R*u];


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/README.md:
--------------------------------------------------------------------------------
 1 | # Model-free integral reinforcement learning (IRL)
 2 | ********
 3 | Source code for paper:   
 4 | J. Lu, X. Wang, Q. Wei, F.-Y. Wang. Nearly optimal stabilization of unknown continuous-time nonlinear systems: A new parallel control approach.
 5 | 
 6 | ### Description
 7 | ********
 8 | A novel online optimal control method for unknown continuous-time nonaffine nonlinear systems *without recovering unknown systems*.
 9 | 
10 | ### Requirements
11 | ********
12 | - MATLAB
13 | 
14 | ### How to run
15 | ********
16 | - Directly run eg1_c1_main.m, eg1_c2_main.m, eg2_c1_main.m, or eg2_c2_main.m.
17 | 
18 | ### Citation
19 | ********
20 | ```angular2html
21 | @article{lu2023nearly,
22 |   title = {Nearly Optimal Stabilization of Unknown Continuous-Time Nonlinear Systems: A New Parallel Control Approach},
23 |   author = {Lu, Jingwei and Wang, Xingxia and Wei, Qinglai and Wang, Fei-Yue},
24 |   doi = {10.1016/j.neucom.2024.127421},
25 |   year = {2024}
26 | }
27 | ```
28 | 


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg1/eg1_c1_main.m:
--------------------------------------------------------------------------------
  1 | % J. Lu, X. Wang, Q. Wei, F.-Y. Wang. Nearly optimal stabilization of unknown continuous-time nonlinear systems: A new parallel control approach.
  2 | % Code for Case I in Example 1
  3 | % By J. Lu
  4 | 
  5 | %-------------------------------- start -----------------------------------
  6 | clear; close all; clc;
  7 | 
  8 | global A; global B; 
  9 | global Aa; global Ba; 
 10 | global Q; global R;
 11 | global Qa; global Ra;
 12 | global Ta;
 13 | global Kopt; global Kaopt;
 14 | global u; 
 15 | global ua;
 16 | 
 17 | 
 18 | A = [0 1;-9 -3]; B = [0; 1];
 19 | 
 20 | Ta = 1; 
 21 | Aa = [A B; zeros(1,2) -1/Ta]; Ba = [zeros(2,1); (1/Ta)*diag([1])];
 22 | 
 23 | Q = 1*eye(2); R = 1*eye(1);
 24 | Qa = diag([diag(Q)' diag(R)']); Ra = 1e0*diag([1]);
 25 | 
 26 | [Kopt, Popt] = lqr(A,B,Q,R);
 27 | [Kaopt, Paopt] = lqr(Aa,Ba,Qa,Ra);
 28 | 
 29 | Fsamples = 400; 
 30 | T = 0.05; 
 31 | ss = T/10; 
 32 | 
 33 | % origin system
 34 | % state and control
 35 | x0 = [1;1];
 36 | x = [x0;0];
 37 | xx = [x(1:2)];
 38 | uu = [-Kopt*x0];
 39 | 
 40 | % performance
 41 | J = 0;
 42 | 
 43 | h = waitbar(0,'please wait');
 44 | for k = 1:Fsamples
 45 |     tspan = 0:ss:T;
 46 |     [t,x]= ode45(@eg1_c1_ode, tspan, x);
 47 |     J = J + x(length(t),3);
 48 |     x = [x(length(t),1:2),0];
 49 |     xx = [xx x(1:2)'];
 50 |     uu = [uu u];
 51 | 
 52 |     waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']);
 53 | end
 54 | close(h);
 55 | 
 56 | J
 57 | 
 58 | % aas
 59 | % state and control
 60 | xa = [x0;uu(:,1);0];
 61 | xxa = [xa(1:2)];
 62 | uua = [uu(:,1)];
 63 | 
 64 | % performance
 65 | Ja = 0;
 66 | 
 67 | h = waitbar(0,'please wait');
 68 | for k = 1:Fsamples
 69 |     tspan = 0:ss:T;
 70 |     [t,xa]= ode45(@eg1_c1_parallel_ode, tspan, xa);
 71 |     Ja = Ja + xa(length(t),4);
 72 |     xa = [xa(length(t),1:3),0];
 73 |     xxa = [xxa xa(1:2)'];
 74 |     uua = [uua xa(3)];
 75 |     
 76 |     waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']);
 77 | end
 78 | close(h);
 79 | 
 80 | Ja
 81 | 
 82 | Jopt = x0'*Popt*x0
 83 | 
 84 | fprintf('Ta: %d; Ra: %d; difference: %d.\n',Ta,Ra,Ja-Jopt)
 85 | 
 86 | 
 87 | % result
 88 | figure(1), % states 
 89 | subplot(2,1,1)
 90 | plot(T*((1:size(xx,2))-1),xx(1,:),'r',T*((1:size(xxa,2))-1),xxa(1,:),'b--','linewidth',1)
 91 | xlabel('Time (s)');
 92 | ylabel('$x_1$','Interpreter','latex');
 93 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
 94 | grid on;
 95 | subplot(2,1,2)
 96 | plot(T*((1:size(xx,2))-1),xx(2,:),'r',T*((1:size(xxa,2))-1),xxa(2,:),'b--','linewidth',1)
 97 | xlabel('Time (s)');
 98 | ylabel('$x_2$','Interpreter','latex');
 99 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
100 | grid on;
101 | 
102 | figure(2), % control
103 | plot(T*((1:size(uu,2))-1),uu(1,:),'r',T*((1:size(uua,2))-1),uua(1,:),'b--','linewidth',1)
104 | xlabel('Time (s)');
105 | ylabel('$u$','Interpreter','latex');
106 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
107 | grid on;
108 | 
109 | 
110 | 
111 | 


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg1/eg1_c1_ode.m:
--------------------------------------------------------------------------------
 1 | function xdot = eg1_c1_ode(~,x)
 2 | 
 3 | global A; global B;
 4 | global Q; global R;
 5 | global Kopt;
 6 | global u;
 7 | 
 8 | u = -Kopt*[x(1);x(2)];
 9 | 
10 | xdot = [A*[x(1);x(2)]+B*u; [x(1);x(2)]'*Q*[x(1);x(2)]+u'*R*u];
11 | 
12 | end 


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg1/eg1_c1_parallel_ode.m:
--------------------------------------------------------------------------------
 1 | function xdot = eg1_c1_parallel_ode(~,x)
 2 | 
 3 | global A; global B; 
 4 | global Q; global R;
 5 | global Ta;
 6 | global Kaopt;
 7 | global ua;
 8 | 
 9 | ua = x(3);
10 | 
11 | v = -Kaopt*[x(1);x(2);x(3)];
12 | 
13 | xdot = [A*[x(1);x(2)]+B*ua; (1/Ta)*(v - x(3)); [x(1);x(2)]'*Q*[x(1);x(2)]+ua'*R*ua];
14 | 
15 | end


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg1/eg1_c2_main.m:
--------------------------------------------------------------------------------
  1 | % J. Lu, X. Wang, Q. Wei, F.-Y. Wang. Nearly optimal stabilization of unknown continuous-time nonlinear systems: A new parallel control approach.
  2 | % Code for Case II in Example 1
  3 | % By J. Lu
  4 | 
  5 | %-------------------------------- start -----------------------------------
  6 | clear; close all; clc;
  7 | 
  8 | global A; global B;
  9 | global Ba;
 10 | global Qa; global Ra;
 11 | global Ta;
 12 | global dw;
 13 | global wa;
 14 | 
 15 | A = [0 1;-9 -3]; B = [0; 1];
 16 | 
 17 | Ta = 1;
 18 | Aa = [A B; zeros(1,2) -1/Ta];
 19 | Ba = [zeros(2,1); (1/Ta)*diag([1])];
 20 | 
 21 | Q = 1*eye(2); R = 1*eye(1);
 22 | Qa = diag([diag(Q)' diag(R)']); Ra = 1*diag([1]);
 23 | 
 24 | [Kaopt, Paopt] = lqr(Aa,Ba,Qa,Ra);
 25 | 
 26 | Fsamples = 10000;
 27 | T = 0.1; 
 28 | ss = T/10; 
 29 | 
 30 | % state, control, and critic and action NN weights
 31 | x00 = [1;-1];
 32 | x = [x00;0;0;1;0;0;1;0;1];
 33 | wa = x(5:end);
 34 | x0 = x;
 35 | xx = [x];
 36 | uu = [];
 37 | 
 38 | phi = eg1_c2_nn_pf(x(1:3));
 39 | pphi = [];
 40 | pp = [];
 41 | 
 42 | % NN learning law
 43 | dw = zeros(length(phi),1);
 44 | 
 45 | wwa = [wa];
 46 | wwc = [x(5:end)];
 47 | 
 48 | ac = 30;
 49 | aa = 0.1;
 50 | 
 51 | cw = 1e-2;
 52 | 
 53 | rpb_k = Fsamples - 100; % remove the probing noise at the last 100 steps
 54 | 
 55 | Gamma = eye(length(phi));
 56 | 
 57 | h = waitbar(0,'please wait');
 58 | for k = 1:Fsamples
 59 |     
 60 |     % probing noise
 61 |     if norm(x(1:2),2) <= 1 && k <= rpb_k
 62 |         x = [x(1);x(2);x(3)+10*(rand(1,1)-0.5);0;x(5:end)'];
 63 |         phi = eg1_c2_nn_pf(x(1:3));
 64 |     end
 65 |     
 66 |     tspan = 0:ss:T;
 67 |     [t,x]= ode45(@eg1_c2_ode, tspan, x);
 68 |     
 69 |     phi_next = eg1_c2_nn_pf(x(length(t),1:3));
 70 |     pphi = phi_next-phi;
 71 |     phi = phi_next;
 72 |     
 73 |     pp = x(length(t),4);
 74 |     
 75 |     % critic learning law
 76 |     dw = - ac*pphi/((1+pphi'*pphi)^2)*(pp+pphi'*x(length(t),5:end)');
 77 |    
 78 |     % action learning law
 79 |     if norm(dw,2) <= cw
 80 |         wa = wa + (-aa*Gamma*(wa - x(length(t),5:end)'));
 81 |         phi = eg1_c2_nn_pf(x(length(t),1:3));
 82 |     end
 83 |     
 84 |     x = [x(length(t),1:3),0,x(length(t),5:end)];
 85 |     xx = [xx x'];
 86 |     uu = [uu x(3)];
 87 |     wwa = [wwa wa];
 88 |     wwc = [wwc x(5:end)'];
 89 |     
 90 |     waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']);
 91 | end
 92 | close(h);
 93 | 
 94 | wc = xx(5:end,end)'
 95 | wa = wa'
 96 | wopt = [Paopt(1,1); 2*Paopt(1,2); 2*Paopt(1,3); Paopt(2,2); 2*Paopt(2,3); Paopt(3,3)]'
 97 | difference_initial = x0(5:end)' - wopt
 98 | difference_final = wc - wopt
 99 | 
100 | % result
101 | figure(1), % states 
102 | subplot(2,1,1)
103 | plot(T*((1:size(xx,2))-1),xx(1,:),'linewidth',1)
104 | xlabel('Time (s)');
105 | ylabel('$x_1$','Interpreter','latex');
106 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
107 | grid on;
108 | subplot(2,1,2)
109 | plot(T*((1:size(xx,2))-1),xx(2,:),'linewidth',1)
110 | xlabel('Time (s)');
111 | ylabel('$x_2$','Interpreter','latex');
112 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
113 | grid on;
114 | 
115 | figure(2), % control
116 | plot(T*((1:size(uu,2))-1),uu(1,:),'linewidth',1)
117 | xlabel('Time (s)');
118 | ylabel('$u$','Interpreter','latex');
119 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
120 | grid on;
121 | 
122 | figure(3), % wc
123 | plot(T*((1:size(xx,2))-1),xx(5:end,:),'linewidth',1)
124 | xlabel('Time (s)');
125 | ylabel('Critic newtork weights');
126 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
127 | grid on;
128 | 
129 | figure(4), % wa
130 | plot(T*((1:size(wwa,2))-1),wwa,'linewidth',1)
131 | xlabel('Time (s)');
132 | ylabel('Action newtork weights');
133 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
134 | grid on;
135 | 
136 | 
137 | 
138 | 


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg1/eg1_c2_nn_pf.m:
--------------------------------------------------------------------------------
1 | function y = eg1_c2_nn_pf(x)
2 | y = [x(1)^2; x(1)*x(2); x(1)*x(3); x(2)^2; x(2)*x(3); x(3)^2];
3 | end


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg1/eg1_c2_nn_pf_dphi.m:
--------------------------------------------------------------------------------
 1 | function dphi = eg1_c2_nn_pf_dphi(x)
 2 | dphi = [
 3 |          [  2*x(1),       0,         0   ];
 4 |          [    x(2),    x(1),         0   ];
 5 |          [    x(3),       0,      x(1)   ];
 6 |          [       0,  2*x(2),         0   ];
 7 |          [       0,    x(3),      x(2)   ];
 8 |          [       0,       0,    2*x(3)   ];
 9 |         ];
10 | end


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg1/eg1_c2_ode.m:
--------------------------------------------------------------------------------
 1 | function xdot = eg1_c2_ode(~,x)
 2 | 
 3 | global A; global B;
 4 | global Ba;
 5 | global Qa; global Ra;
 6 | global Ta;
 7 | global dw;
 8 | global wa;
 9 | 
10 | u = x(3);
11 | 
12 | dphi = eg1_c2_nn_pf_dphi(x(1:3));
13 |     
14 | v = -0.5*inv(Ra)*Ba'*dphi'*wa;
15 | 
16 | xdot = [A*[x(1);x(2)]+B*u; (1/Ta)*(v - x(3)); [x(1);x(2);x(3)]'*Qa*[x(1);x(2);x(3)]+v'*Ra*v;dw];
17 | 
18 | end 


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg2/eg2_c1_main.m:
--------------------------------------------------------------------------------
  1 | % J. Lu, X. Wang, Q. Wei, F.-Y. Wang. Nearly optimal stabilization of unknown continuous-time nonlinear systems: A new parallel control approach.
  2 | % Code for Case I in Example 2
  3 | % By J. Lu
  4 | 
  5 | %-------------------------------- start -----------------------------------
  6 | clear; close all; clc;
  7 | 
  8 | global M; global g; global L;
  9 | global k1; global k2; global k3;
 10 | global Ga;
 11 | global Q; global R; global Ra;
 12 | global Ta;
 13 | global dw;
 14 | global wa;
 15 | 
 16 | Ta = 1;
 17 | Ga = [zeros(2,1); (1/Ta)*diag([1])];
 18 | 
 19 | M = 0.3; L = 0.6; g = 10; k1 = 0.8; k2 = 0.2; k3 = 1; 
 20 | 
 21 | Q = 5*eye(2);R = 0.1*eye(1);
 22 | Qa = diag([diag(Q)' diag(R)']); Ra = 1*diag([1]);
 23 | 
 24 | Fsamples = 100000;
 25 | T = 0.1; 
 26 | ss = T/10; 
 27 | 
 28 | % state, control, and critic and action NN weights
 29 | x00 = 1*[1;0];
 30 | wc = [1;0;0;1;0;1;zeros(6,1)]; 
 31 | 
 32 | x = [x00;-1;0;0;wc]; 
 33 | wa = x(6:end);
 34 | x0 = x;
 35 | xx = [x];
 36 | uu = [];
 37 | 
 38 | phi = eg2_c1_nn_pf(x(1:3));
 39 | pphi = [];
 40 | pp = [];
 41 | 
 42 | % NN learning law
 43 | dw = zeros(length(phi),1);
 44 | 
 45 | 
 46 | wwa = [wa];
 47 | wwc = [x(6:end)];
 48 | 
 49 | ac = 100; 
 50 | aa = 0.08;
 51 | 
 52 | cw = 1e-2;
 53 | 
 54 | rpb_k = Fsamples - 200; % remove the probing noise at the last 200 steps
 55 | 
 56 | Gamma = eye(length(phi));
 57 | 
 58 | h = waitbar(0,'please wait');
 59 | for k = 1:Fsamples
 60 |       
 61 |     % probing noise
 62 |     if norm(x(1:2),2) <= 0.2 && k <= rpb_k
 63 |         x = [x(1);x(2);x(3)+2*(rand(1,1)-0.5);0;0;x(6:end)'];
 64 |         phi = eg2_c1_nn_pf(x(1:3));
 65 |     end
 66 |     
 67 |     tspan = 0:ss:T;
 68 |     [t,x]= ode45(@eg2_c1_ode, tspan, x);
 69 |     
 70 |     phi_next = eg2_c1_nn_pf(x(length(t),1:3));
 71 |     pphi = phi_next-phi;
 72 |     phi = phi_next;
 73 |     
 74 |     pp = x(length(t),4);
 75 |     
 76 |     % critic learning law
 77 |     dw = - ac*pphi/((1+pphi'*pphi)^2)*(pp+pphi'*x(length(t),6:end)');
 78 |    
 79 |     % action learning law
 80 |     if norm(dw,2) <= cw
 81 |         wa = wa + (-aa*Gamma*(wa - x(length(t),6:end)'));
 82 |         phi = eg2_c1_nn_pf(x(length(t),1:3));
 83 |     end
 84 |     
 85 |     x = [x(length(t),1:3),0,0,x(length(t),6:end)];
 86 |     xx = [xx x'];
 87 |     uu = [uu x(3)];
 88 |     wwa = [wwa wa];
 89 |     wwc = [wwc x(6:end)'];
 90 |     
 91 |     waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']);
 92 | end
 93 | close(h);
 94 | 
 95 | wc = xx(6:end,end)'
 96 | wa = wa'
 97 | 
 98 | % result
 99 | figure(1), % states 
100 | subplot(2,1,1)
101 | plot(T*((1:size(xx,2))-1),xx(1,:),'linewidth',1)
102 | xlabel('Time (s)');
103 | ylabel('$x_1$','Interpreter','latex');
104 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
105 | grid on;
106 | subplot(2,1,2)
107 | plot(T*((1:size(xx,2))-1),xx(2,:),'linewidth',1)
108 | xlabel('Time (s)');
109 | ylabel('$x_2$','Interpreter','latex');
110 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
111 | grid on;
112 | 
113 | figure(2), % control
114 | plot(T*((1:size(uu,2))-1),uu(1,:),'linewidth',1)
115 | xlabel('Time (s)');
116 | ylabel('$u$','Interpreter','latex');
117 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
118 | grid on;
119 | 
120 | figure(3), % wc
121 | plot(T*((1:size(xx,2))-1),xx(5:end,:),'linewidth',1)
122 | xlabel('Time (s)');
123 | ylabel('Critic newtork weights');
124 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
125 | grid on;
126 | 
127 | figure(4), % wa
128 | plot(T*((1:size(wwa,2))-1),wwa,'linewidth',1)
129 | xlabel('Time (s)');
130 | ylabel('Action newtork weights');
131 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
132 | grid on;
133 | 
134 | 
135 | 
136 | 


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg2/eg2_c1_nn_pf.m:
--------------------------------------------------------------------------------
1 | function y = eg2_c1_nn_pf(x)
2 | y = [ x(1)^2;  x(1)*x(2);  x(2)^2;  x(1)*x(3);  x(2)*x(3);  x(3)^2;...
3 |       x(1)^3;  x(2)^3;     x(3)^3;  x(1)^4;     x(2)^4;     x(3)^4  ];
4 | end


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg2/eg2_c1_nn_pf_dphi.m:
--------------------------------------------------------------------------------
 1 | function dphi = eg2_c1_nn_pf_dphi(x) 
 2 | dphi = [
 3 |          [  2*x(1),       0,         0   ];
 4 |          [    x(2),    x(1),         0   ];
 5 |          [    x(3),       0,      x(1)   ];
 6 |          [       0,  2*x(2),         0   ];
 7 |          [       0,    x(3),      x(2)   ];
 8 |          [       0,       0,    2*x(3)   ];
 9 |          [  3*x(1)^2,         0,         0   ];
10 |          [         0,  3*x(2)^2,         0   ];
11 |          [         0,         0,  3*x(3)^2   ];
12 |          [  4*x(1)^3,         0,         0   ];
13 |          [         0,  4*x(2)^3,         0   ];
14 |          [         0,         0,  4*x(3)^3   ];
15 |         ];
16 | end


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg2/eg2_c1_ode.m:
--------------------------------------------------------------------------------
 1 | function xdot = eg2_c1_ode(~,x)
 2 | 
 3 | global M; global g; global L;
 4 | global k1; global k2; global k3;
 5 | global Ga;
 6 | global Q; global R; global Ra;
 7 | global Ta;
 8 | global dw;
 9 | global wa;
10 | 
11 | u = x(3);
12 | 
13 | dphi = eg2_c1_nn_pf_dphi(x(1:3));
14 |     
15 | v = -0.5*inv(Ra)*Ga'*dphi'*wa;
16 | 
17 | f = [x(2);-M*g*L/k1*sin(x(1))-k2/k1*x(2)+k3/k1*(tanh(u)+u)];
18 | 
19 | xdot = [ f; (1/Ta)*(v - x(3));...
20 |          [x(1);x(2)]'*Q*[x(1);x(2)] + 1*log(1+[x(3)]'*R*[x(3)])+v'*Ra*v;...
21 |          [x(1);x(2)]'*Q*[x(1);x(2)] + 1*log(1+[x(3)]'*R*[x(3)]); dw ];
22 | 
23 | 
24 | 
25 | end 


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg2/eg2_c2_main.m:
--------------------------------------------------------------------------------
 1 | % J. Lu, X. Wang, Q. Wei, F.-Y. Wang. Nearly optimal stabilization of unknown continuous-time nonlinear systems: A new parallel control approach.
 2 | % Code for Case II in Example 2
 3 | % By J. Lu
 4 | 
 5 | %-------------------------------- start -----------------------------------
 6 | clear; close all; clc;
 7 | 
 8 | global M; global g; global L;
 9 | global k1; global k2; global k3;
10 | global Ga;
11 | global Ta;
12 | global wa;
13 | 
14 | Ta = 1;
15 | Ga = [zeros(2,1); (1/Ta)*diag([1])];
16 | 
17 | M = 0.3; L = 0.6; g = 10; k1 = 0.8; k2 = 0.2; k3 = 1; 
18 | 
19 | Fsamples = 150; 
20 | T = 0.05; 
21 | ss = T/10; 
22 | 
23 | wa = [10.4347996132347;4.74187769922673;4.59458795955741;-0.140646935292767;5.46611496181774;2.82980614212775;0.00266940936499440;0.0224520628430915;0.00480024500336202;2.63761282635573;-0.179632857958409;-0.150728264946674];
24 | 
25 | x00 = [1;0];
26 | u00 = 2;
27 | x = [x00;u00]; 
28 | xx0 = [x];
29 | uu0 = [u00];
30 | 
31 | for k = 1:Fsamples
32 |     tspan = 0:ss:T;
33 |     [t,x]= ode45(@eg2_c2_ode, tspan, x);
34 |    
35 |     x = x(length(t),:);
36 |     xx0 = [xx0 x'];
37 |     uu0 = [uu0 x(3)];
38 | end
39 | 
40 | 
41 | u01 = -2;
42 | x = [x00;u01]; 
43 | xx1 = [x];
44 | uu1 = [u01];
45 | 
46 | for k = 1:Fsamples
47 |     tspan = 0:ss:T;
48 |     [t,x]= ode45(@eg2_c2_ode, tspan, x);
49 |    
50 |     x = x(length(t),:);
51 |     xx1 = [xx1 x'];
52 |     uu1 = [uu1 x(3)];
53 | end
54 | 
55 | 
56 | figure(1), % 3d
57 | plot3(xx0(1,:),xx0(2,:),uu0,'r','linewidth',1),hold on;
58 | plot3(xx1(1,:),xx1(2,:),uu1,'b-.','linewidth',1),hold on;
59 | plot3(x00(1,:),x00(2,:),u00,'ro','MarkerFaceColor','r','linewidth',1,'MarkerSize',6),hold on;
60 | plot3(x00(1,:),x00(2,:),u01,'bo','MarkerFaceColor','b','linewidth',1,'MarkerSize',6),hold on;
61 | xlim([-0.5 1.1])
62 | xlabel('$x_1$','Interpreter','latex');
63 | ylabel('$x_2$','Interpreter','latex');
64 | zlabel('$u$','Interpreter','latex');
65 | h=legend('$u^1_0=2$','$u^2_0=-2$');
66 | set(h,'Interpreter','latex','Position',[0.154401278625391 0.266349206349207 0.236911728463504 0.106714288938613]);    
67 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
68 | grid on;
69 | view([426.299998823416 14.7740121081114])
70 | 
71 | 
72 | 
73 | 


--------------------------------------------------------------------------------
/model_free_integral_reinforcement_learning/eg2/eg2_c2_ode.m:
--------------------------------------------------------------------------------
 1 | function xdot = eg2_c2_ode(~,x)
 2 | 
 3 | global M; global g; global L;
 4 | global k1; global k2; global k3;
 5 | global Ga;
 6 | global Ra;
 7 | global Ta;
 8 | global wa;
 9 | 
10 | u = x(3);
11 | 
12 | dphi = eg2_c1_nn_pf_dphi(x(1:3));
13 |     
14 | v = -0.5*inv(Ra)*Ga'*dphi'*wa;
15 | 
16 | f = [x(2);-M*g*L/k1*sin(x(1))-k2/k1*x(2)+k3/k1*(tanh(u)+u)];
17 | 
18 | xdot = [f; (1/Ta)*(v - x(3)) ];
19 | 
20 | end 


--------------------------------------------------------------------------------
/model_free_nonzero_sum_games/README.md:
--------------------------------------------------------------------------------
 1 | # Model-free nonzero-sum games (NZSGs)
 2 | ********
 3 | Code for paper:  
 4 | [J. Lu, Q. Wei, F.-Y. Wang. Parallel control for nonzero-sum games with completely unknown nonlinear dynamics via reinforcement learning. *IEEE Transactions on Systems, Man, and Cybernetics: Systems*, 2025, doi: 10.1109/TSMC.2025.3526357.](https://ieeexplore.ieee.org/document/10849990)
 5 | 
 6 | ### Description
 7 | ********
 8 | A model-free optimal control method is developed for unknown continuous-time NZSGs *without reconstructing unknown dynamics or employing off-policy IRL*.
 9 | 
10 | ### Requirements
11 | ********
12 | - MATLAB
13 | 
14 | ### Citation
15 | ********
16 | ```angular2html
17 | @article{lu2025parallel,
18 |   title={Parallel control for nonzero-sum games with completely unknown nonlinear dynamics via reinforcement learning},
19 |   author={Lu, Jingwei and Wei, Qinglai and Wang, Fei-Yue},
20 |   journal={IEEE Transactions on Systems, Man, and Cybernetics: Systems},
21 |   doi = {10.1109/TSMC.2025.3526357},
22 |   year={2025}
23 | }
24 | ```
25 | 


--------------------------------------------------------------------------------
/model_free_nonzero_sum_games/eg_cacc/cacc_model_free_learning.m:
--------------------------------------------------------------------------------
  1 | function cacc_model_free_learning
  2 | % ''Parallel Control for Nonzero-Sum Games With Completely Unknown
  3 | %   Nonlinear Dynamics via Reinforcement Learning''
  4 | %
  5 | % Code for cooperative adaptive cruise control (CACC)
  6 | %
  7 | % By J. Lu
  8 | % Date: Aug. 12, 2024
  9 | 
 10 | %-------------------------------- start -----------------------------------
 11 | clear; close all; clc;
 12 | 
 13 | global Ga1; global Ga2;
 14 | global Q1; global R11; global R12;
 15 | global Q2; global R21; global R22;
 16 | global Qa1; global Ra11; global Ra12;
 17 | global Qa2; global Ra21; global Ra22;
 18 | global w1; global w2;
 19 | global dw1; global dw2;
 20 | 
 21 | % cacc information & performance indices
 22 | n = 6;
 23 | m1 = 1;
 24 | m2 = 1;
 25 | eta = n + m1 + m2;
 26 | 
 27 | Ga1 = [ zeros(n,m1); eye(m1); zeros(m2,m1) ]; % $\mathcal{G}_1$
 28 | Ga2 = [ zeros(n,m2); zeros(m1,m2); eye(m2) ]; % $\mathcal{G}_2$
 29 | 
 30 | Q1 = 2*eye(n); R11 = 2*eye(m1); R12 = 2*eye(m2);
 31 | Q2 = 1*eye(n); R21 = 1*eye(m1); R22 = 1*eye(m2);
 32 | Qa1 = [ Q1,zeros(n,m1+m2);zeros(m1,n),R11,zeros(m1,m2);zeros(m2,n+m1),R12 ]; Ra11 = 1*eye(m1); Ra12 = 0*eye(m2);
 33 | Qa2 = [ Q2,zeros(n,m1+m2);zeros(m1,n),R21,zeros(m1,m2);zeros(m2,n+m1),R22 ]; Ra21 = 0*eye(m1); Ra22 = 1*eye(m2);
 34 | 
 35 | Fsamples = 100000;
 36 | T = 0.5; 
 37 | ss = T/10; 
 38 | 
 39 | w1 = [  1   0   0   0   0   0   -1   0 ... 
 40 |         1   0   0   0   0  -1    0 ... 
 41 |         1   0   0   0   0   0 ...
 42 |         1   0   0   0  -1 ... 
 43 |         1   0   0  -1 ... 
 44 |         1   0   0 ... 
 45 |         1   0 ... 
 46 |         1 ]';  
 47 | w2 = w1;
 48 | 
 49 | x_ini = 5;
 50 | u_ini = 1;
 51 | 
 52 | x0 = x_ini*[ 1; 1; 0; 1; 1; 0 ];
 53 | u0 = u_ini*[ 0; 0 ];
 54 | x = [ x0; u0; 0; 0; 0; 0; w1; w2 ];
 55 | 
 56 | nn_length = length(w1);
 57 | 
 58 | xx = [ x ];
 59 | uu1 = [];
 60 | uu2 = [];
 61 | 
 62 | ell = length(w1); % number of hidden neurons
 63 | L = ell; %  need L >= ell to satisfy rank($\mathcal{D}_i$) = $\ell_i$
 64 | 
 65 | phi = cacc_nn_pf(x(1:eta));
 66 | pphi = zeros(ell,L+1); %  number of historical data (L) + real-time data (1)
 67 | pp1 = zeros(L+1,1);
 68 | pp2 = zeros(L+1,1);
 69 | 
 70 | dw1 = zeros(nn_length,1);
 71 | dw2 = zeros(nn_length,1);
 72 | 
 73 | ww1 = [ w1 ];
 74 | ww2 = [ w2 ];
 75 | 
 76 | l1 = 1; % learning rates
 77 | l2 = 1;
 78 | 
 79 | decay_interval = 1000;
 80 | 
 81 | l1_final = 1e-1; 
 82 | l2_final = 1e-1; 
 83 | l1_decay = (l1-l1_final)/(0.8*Fsamples/decay_interval);
 84 | l2_decay = (l2-l2_final)/(0.8*Fsamples/decay_interval);
 85 | 
 86 | i = 1;
 87 | 
 88 | h = waitbar(0,'Please wait');
 89 | for k = 1:Fsamples
 90 |     if norm(x(1:n),2) > 1e3  % the vehicular platoon is unstable and need to restart the learning
 91 |         break                % reducing the learning rates may provide a more stable learning process
 92 |     end
 93 | 
 94 |     if norm(x(1:n),2) <= 1  % reset controls
 95 |         u0 = 2*u_ini*[ (rand(1)-0.5); (rand(1)-0.5) ];
 96 |         x = [ x(1:n)'; u0; 0; 0; 0; 0; x(eta+5:end)' ];
 97 |         phi = cacc_nn_pf(x(1:eta));
 98 |     end
 99 | 
100 |     if mod(k,decay_interval) == 0
101 |         l1 = l1 - l1_decay;
102 |         l2 = l2 - l2_decay;
103 |     end
104 | 
105 |     if l1 < l1_final
106 |         l1 = l1_final;
107 |     end
108 | 
109 |     if l2 < l2_final
110 |         l2 = l2_final;
111 |     end
112 |     
113 |     tspan = 0:ss:T;
114 |     [t,x]= ode45(@cacc_ode, tspan, x);
115 |     
116 |     phi_next = cacc_nn_pf(x(length(t),1:eta));
117 | 
118 |     if i <= L + 1
119 |         pphi(:,i) = phi_next-phi;
120 |         pp1(i) = x(length(t),eta+3);
121 |         pp2(i) = x(length(t),eta+4);
122 | 
123 |         phi = phi_next;
124 | 
125 |         i = i + 1;
126 |     else
127 |         for j = 1:L
128 |             pphi(:,j) = pphi(:,j+1);
129 |             pp1(j) = pp1(j+1);
130 |             pp2(j) = pp2(j+1);
131 |         end
132 |         pphi(:,L+1) = phi_next-phi;
133 |         pp1(L+1) = x(length(t),eta+3);
134 |         pp2(L+1) = x(length(t),eta+4);
135 | 
136 |         phi = phi_next;
137 |     end
138 | 
139 |     w1 = x(length(t),eta+5:eta+4+nn_length)';
140 |     w2 = x(length(t),eta+5+nn_length:eta+4+2*nn_length)';   
141 |     
142 |     if i >= L + 1
143 |         dw1 = zeros(nn_length,1);
144 |         dw2 = zeros(nn_length,1);
145 |         for j = 1:L+1
146 |             dw1 = dw1 - l1*pphi(:,j)/((1+pphi(:,j)'*pphi(:,j))^2)*(pp1(j)+pphi(:,j)'*w1);
147 |             dw2 = dw2 - l2*pphi(:,j)/((1+pphi(:,j)'*pphi(:,j))^2)*(pp2(j)+pphi(:,j)'*w2);
148 |         end
149 |     end
150 | 
151 |     x = [x(length(t),1:eta),0,0,0,0,x(length(t),eta+5:end)];
152 | 
153 |     xx = [ xx x' ];
154 |     uu1 = [ uu1 x(7) ];
155 |     uu2 = [ uu2 x(8) ];
156 |     ww1 = [ ww1 w1 ];
157 |     ww2 = [ ww2 w2 ];
158 |     
159 |     waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']);
160 | end
161 | close(h);
162 | 
163 | w1_display = w1'
164 | w2_display = w2'
165 | 
166 | % result
167 | figure(1), % vehicle 1 states 
168 | subplot(3,1,1)
169 | plot(T*((1:size(xx,2))-1),xx(1,:),'linewidth',1)
170 | xlabel('Time (s)');
171 | ylabel('$e_{p,1}$','Interpreter','latex');
172 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
173 | grid on;
174 | subplot(3,1,2)
175 | plot(T*((1:size(xx,2))-1),xx(2,:),'linewidth',1)
176 | xlabel('Time (s)');
177 | ylabel('$e_{v,1}$','Interpreter','latex');
178 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
179 | grid on;
180 | subplot(3,1,3)
181 | plot(T*((1:size(xx,2))-1),xx(3,:),'linewidth',1)
182 | xlabel('Time (s)');
183 | ylabel('$a_1$','Interpreter','latex');
184 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
185 | grid on;
186 | 
187 | figure(2), % vehicle 2 states 
188 | subplot(3,1,1)
189 | plot(T*((1:size(xx,2))-1),xx(4,:),'linewidth',1)
190 | xlabel('Time (s)');
191 | ylabel('$e_{p,2}$','Interpreter','latex');
192 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
193 | grid on;
194 | subplot(3,1,2)
195 | plot(T*((1:size(xx,2))-1),xx(5,:),'linewidth',1)
196 | xlabel('Time (s)');
197 | ylabel('$e_{v,2}$','Interpreter','latex');
198 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
199 | grid on;
200 | subplot(3,1,3)
201 | plot(T*((1:size(xx,2))-1),xx(6,:),'linewidth',1)
202 | xlabel('Time (s)');
203 | ylabel('$a_2$','Interpreter','latex');
204 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
205 | grid on;
206 | 
207 | figure(3), % vehicle 1 control
208 | plot(T*((1:size(uu1,2))-1),uu1,'linewidth',1)
209 | xlabel('Time (s)');
210 | ylabel('$u_1$','Interpreter','latex');
211 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
212 | grid on;
213 | 
214 | figure(4), % vehicle 2 control
215 | plot(T*((1:size(uu2,2))-1),uu1,'linewidth',1)
216 | xlabel('Time (s)');
217 | ylabel('$u_2$','Interpreter','latex');
218 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
219 | grid on;
220 | 
221 | figure(5), % w1
222 | plot(T*((1:size(ww1,2))-1),ww1,'linewidth',1)
223 | xlabel('Time (s)');
224 | ylabel('$\mathcal{W}_{1}$','Interpreter','latex');
225 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
226 | grid on;
227 | 
228 | figure(6), % w2
229 | plot(T*((1:size(ww2,2))-1),ww2,'linewidth',1)
230 | xlabel('Time (s)');
231 | ylabel('$\mathcal{W}_{2}$','Interpreter','latex');
232 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
233 | grid on;
234 | 
235 | end
236 | %--------------------------- cacc dynamics ------------------------------
237 | function xdot = cacc_ode(~,x)
238 | global Ga1; global Ga2;
239 | global Q1; global R11; global R12;
240 | global Q2; global R21; global R22;
241 | global Qa1; global Ra11; global Ra12;
242 | global Qa2; global Ra21; global Ra22;
243 | global w1; global w2;
244 | global dw1; global dw2;
245 | 
246 | td = 1; tau = 0.1;
247 | 
248 | x_state = [ x(1); x(2); x(3); x(4); x(5); x(6) ];
249 | 
250 | u1 = x(7);
251 | u2 = x(8);
252 | 
253 | u = [ u1; u2 ];
254 | 
255 | s = [ x_state; u ];
256 | 
257 | dphi = cacc_nn_pf_dphi(s);
258 |     
259 | v1 = -0.5*inv(Ra11)*Ga1'*dphi'*w1;
260 | v2 = -0.5*inv(Ra22)*Ga2'*dphi'*w2;
261 | 
262 | cacc_dynamics = [   x(2) - td*x(3);...
263 |                    -x(3);...
264 |                   (-x(3)+u(1))/tau;...
265 |                     x(5) - td*x(6);...
266 |                     x(3) - x(6);...
267 |                   (-x(6)+u(2))/tau  ];
268 | 
269 | xdot = [ cacc_dynamics;...
270 |          v1;...
271 |          v2;...
272 |          x_state'*Q1*x_state + u1'*R11*u1 + u2'*R12*u2;...
273 |          x_state'*Q2*x_state + u1'*R21*u1 + u2'*R22*u2;...
274 |          s'*Qa1*s + v1'*Ra11*v1 + v2'*Ra12*v2;...
275 |          s'*Qa2*s + v1'*Ra21*v1 + v2'*Ra22*v2;...
276 |          dw1;...
277 |          dw2 ];
278 | end
279 | %------------------- activation function vector of critic ------------------
280 | function y = cacc_nn_pf(x)
281 | x1 = x(1);
282 | x2 = x(2);
283 | x3 = x(3);
284 | x4 = x(4);
285 | x5 = x(5);
286 | x6 = x(6);
287 | x7 = x(7);
288 | x8 = x(8);
289 | 
290 | y = [ x1^2; x1*x2; x1*x3; x1*x4; x1*x5; x1*x6; x1*x7; x1*x8;...
291 |       x2^2; x2*x3; x2*x4; x2*x5; x2*x6; x2*x7; x2*x8;...
292 |       x3^2; x3*x4; x3*x5; x3*x6; x3*x7; x3*x8;...
293 |       x4^2; x4*x5; x4*x6; x4*x7; x4*x8;...
294 |       x5^2; x5*x6; x5*x7; x5*x8;...
295 |       x6^2; x6*x7; x6*x8;...
296 |       x7^2; x7*x8;...
297 |       x8^2 ];
298 | end
299 | %-------------- derivative of critic activation function vector -----------
300 | function dphi = cacc_nn_pf_dphi(x)
301 | x1 = x(1);
302 | x2 = x(2);
303 | x3 = x(3);
304 | x4 = x(4);
305 | x5 = x(5);
306 | x6 = x(6);
307 | x7 = x(7);
308 | x8 = x(8);
309 | 
310 | dphi = [
311 |          [  2*x1,          0,          0,          0,          0,          0,          0,          0  ];
312 |          [    x2,         x1,          0,          0,          0,          0,          0,          0  ]; 
313 |          [    x3,          0,         x1,          0,          0,          0,          0,          0  ]; 
314 |          [    x4,          0,          0,         x1,          0,          0,          0,          0  ]; 
315 |          [    x5,          0,          0,          0,         x1,          0,          0,          0  ];
316 |          [    x6,          0,          0,          0,          0,         x1,          0,          0  ];
317 |          [    x7,          0,          0,          0,          0,          0,         x1,          0  ];
318 |          [    x8,          0,          0,          0,          0,          0,          0,         x1  ];
319 |          [     0,       2*x2,          0,          0,          0,          0,          0,          0  ];
320 |          [     0,         x3,         x2,          0,          0,          0,          0,          0  ];
321 |          [     0,         x4,          0,         x2,          0,          0,          0,          0  ];
322 |          [     0,         x5,          0,          0,         x2,          0,          0,          0  ];
323 |          [     0,         x6,          0,          0,          0,         x2,          0,          0  ];
324 |          [     0,         x7,          0,          0,          0,          0,         x2,          0  ];
325 |          [     0,         x8,          0,          0,          0,          0,          0,         x2  ];
326 |          [     0,          0,       2*x3,          0,          0,          0,          0,          0  ];
327 |          [     0,          0,         x4,         x3,          0,          0,          0,          0  ];
328 |          [     0,          0,         x5,          0,         x3,          0,          0,          0  ];
329 |          [     0,          0,         x6,          0,          0,         x3,          0,          0  ];
330 |          [     0,          0,         x7,          0,          0,          0,         x3,          0  ];
331 |          [     0,          0,         x8,          0,          0,          0,          0,         x3  ];
332 |          [     0,          0,          0,       2*x4,          0,          0,          0,          0  ]; 
333 |          [     0,          0,          0,         x5,         x4,          0,          0,          0  ];
334 |          [     0,          0,          0,         x6,          0,         x4,          0,          0  ];
335 |          [     0,          0,          0,         x7,          0,          0,         x4,          0  ];
336 |          [     0,          0,          0,         x8,          0,          0,          0,         x4  ];
337 |          [     0,          0,          0,          0,       2*x5,          0,          0,          0  ];
338 |          [     0,          0,          0,          0,         x6,         x5,          0,          0  ];
339 |          [     0,          0,          0,          0,         x7,          0,         x5,          0  ];
340 |          [     0,          0,          0,          0,         x8,          0,          0,         x5  ];
341 |          [     0,          0,          0,          0,          0,       2*x6,          0,          0  ];
342 |          [     0,          0,          0,          0,          0,         x7,         x6,          0  ];
343 |          [     0,          0,          0,          0,          0,         x8,          0,         x6  ];
344 |          [     0,          0,          0,          0,          0,         0,        2*x7,          0  ];
345 |          [     0,          0,          0,          0,          0,         0,          x8,         x7  ];
346 |          [     0,          0,          0,          0,          0,         0,           0,       2*x8  ];
347 |        ];
348 | end
349 | 
350 | 
351 | 
352 | 


--------------------------------------------------------------------------------
/model_free_nonzero_sum_games/eg_two_player_ndg/tndg_model_free_learning.m:
--------------------------------------------------------------------------------
  1 | function tndg_model_free_learning
  2 | % ''Parallel Control for Nonzero-Sum Games With Completely Unknown
  3 | %   Nonlinear Dynamics via Reinforcement Learning''
  4 | %
  5 | % Code for 2-player nonlinear differential game (TNDG)
  6 | %
  7 | % By J. Lu
  8 | % Date: Aug. 12, 2024
  9 | 
 10 | %-------------------------------- start -----------------------------------
 11 | clear; close all; clc;
 12 | 
 13 | global Ga1; global Ga2;
 14 | global Q1; global R11; global R12;
 15 | global Q2; global R21; global R22;
 16 | global Qa1; global Ra11; global Ra12;
 17 | global Qa2; global Ra21; global Ra22;
 18 | global w1; global w2;
 19 | global dw1; global dw2;
 20 | 
 21 | % system information & performance indices
 22 | n = 2;
 23 | m1 = 1;
 24 | m2 = 1;
 25 | eta = n + m1 + m2;
 26 | 
 27 | Ga1 = [ zeros(n,m1); eye(m1); zeros(m2,m1) ]; % $\mathcal{G}_1$
 28 | Ga2 = [ zeros(n,m2); zeros(m1,m2); eye(m2) ]; % $\mathcal{G}_2$
 29 | 
 30 | Q1 = 2*eye(n); R11 = 2*eye(m1); R12 = 2*eye(m2);
 31 | Q2 = 1*eye(n); R21 = 1*eye(m1); R22 = 1*eye(m2);
 32 | Qa1 = [ Q1,zeros(n,m1+m2);zeros(m1,n),R11,zeros(m1,m2);zeros(m2,n+m1),R12 ]; Ra11 = 1*eye(m1); Ra12 = 0*eye(m2);
 33 | Qa2 = [ Q2,zeros(n,m1+m2);zeros(m1,n),R21,zeros(m1,m2);zeros(m2,n+m1),R22 ]; Ra21 = 0*eye(m1); Ra22 = 1*eye(m2);
 34 | 
 35 | Fsamples = 50000;
 36 | T = 0.2; 
 37 | ss = T/10; 
 38 | 
 39 | w1 = [  1   0   1   1   ...
 40 |         1   1   1   ... 
 41 |         1  -1   ... 
 42 |         1 ]'; 
 43 | w2 = w1;
 44 | 
 45 | x_ini = 1;
 46 | u_ini = 1;
 47 | 
 48 | x0 = x_ini*[ 1; -1 ];
 49 | u0 = u_ini*[ 0; 0 ];
 50 | x = [ x0; u0; 0; 0; 0; 0; w1; w2 ];
 51 | 
 52 | nn_length = length(w1);
 53 | 
 54 | xx = [ x ];
 55 | uu1 = [];
 56 | uu2 = [];
 57 | 
 58 | ell = length(w1); % number of hidden neurons
 59 | L = ell; %  need L >= ell to satisfy rank($\mathcal{D}_i$) = $\ell_i$
 60 | 
 61 | phi = tndg_nn_pf(x(1:eta));
 62 | pphi = zeros(ell,L+1); %  number of historical data (L) + real-time data (1)
 63 | pp1 = zeros(L+1,1);
 64 | pp2 = zeros(L+1,1);
 65 | 
 66 | dw1 = zeros(nn_length,1);
 67 | dw2 = zeros(nn_length,1);
 68 | 
 69 | ww1 = [ w1 ];
 70 | ww2 = [ w2 ];
 71 | 
 72 | l1 = 0.5; % learning rates
 73 | l2 = 0.5;
 74 | 
 75 | l1_final = 1e-5;
 76 | l2_final = 1e-5;
 77 | 
 78 | l_decay_interval = 1000;
 79 | 
 80 | l1_decay = (l1-l1_final)/(0.8*Fsamples/l_decay_interval);
 81 | l2_decay = (l2-l2_final)/(0.8*Fsamples/l_decay_interval);
 82 | 
 83 | i = 1;
 84 | 
 85 | h = waitbar(0,'Please wait');
 86 | for k = 1:Fsamples
 87 |     if norm(x(1:n),2) > 1e3 % the system is unstable and need to restart the learning
 88 |         break
 89 |     end
 90 | 
 91 |     if mod(k,l_decay_interval) == 0
 92 |         l1 = l1 - l1_decay;
 93 |         l2 = l2 - l2_decay;
 94 |     end
 95 | 
 96 |     if l1 < l1_final
 97 |         l1 = l1_final;
 98 |     end
 99 |     
100 |     if l2 < l2_final
101 |         l2 = l2_final;
102 |     end
103 | 
104 |     if (norm(x(1:n),2) <= 0.1)  % reset controls
105 |         u0 = 2*u_ini*[ (rand(1)-0.5); (rand(1)-0.5) ];
106 |         x = [ x(1:n)'; u0; 0; 0; 0; 0; x(eta+5:end)' ];
107 |         phi = tndg_nn_pf(x(1:eta));
108 |     end
109 | 
110 |     tspan = 0:ss:T;
111 |     [t,x]= ode45(@tndg_ode, tspan, x);
112 |     
113 |     phi_next = tndg_nn_pf(x(length(t),1:eta));
114 | 
115 |     if i <= L + 1
116 |         pphi(:,i) = phi_next-phi;
117 |         pp1(i) = x(length(t),eta+3);
118 |         pp2(i) = x(length(t),eta+4);
119 | 
120 |         phi = phi_next;
121 | 
122 |         i = i + 1;
123 |     else
124 |         for j = 1:L
125 |             pphi(:,j) = pphi(:,j+1);
126 |             pp1(j) = pp1(j+1);
127 |             pp2(j) = pp2(j+1);
128 |         end
129 |         pphi(:,L+1) = phi_next-phi;
130 |         pp1(L+1) = x(length(t),eta+3);
131 |         pp2(L+1) = x(length(t),eta+4);
132 | 
133 |         phi = phi_next;
134 |     end
135 | 
136 |     w1 = x(length(t),eta+5:eta+4+nn_length)';
137 |     w2 = x(length(t),eta+5+nn_length:eta+4+2*nn_length)';   
138 |     
139 |     if i >= L + 1
140 |         dw1 = zeros(nn_length,1);
141 |         dw2 = zeros(nn_length,1);
142 |         for j = 1:L+1
143 |             dw1 = dw1 - l1*pphi(:,j)/((1+pphi(:,j)'*pphi(:,j))^2)*(pp1(j)+pphi(:,j)'*w1);
144 |             dw2 = dw2 - l2*pphi(:,j)/((1+pphi(:,j)'*pphi(:,j))^2)*(pp2(j)+pphi(:,j)'*w2);
145 |         end
146 |     end    
147 | 
148 |     x = [x(length(t),1:eta),0,0,0,0,x(length(t),eta+5:end)];
149 | 
150 |     xx = [ xx x' ];
151 |     uu1 = [ uu1 x(n+1) ];
152 |     uu2 = [ uu2 x(n+2) ];
153 |     ww1 = [ ww1 w1 ];
154 |     ww2 = [ ww2 w2 ];
155 |     
156 |     waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']);
157 | end
158 | close(h);
159 | 
160 | w1_display = w1'
161 | w2_display = w2'
162 | 
163 | % result
164 | figure(1), % states
165 | subplot(2,1,1)
166 | plot(T*((1:size(xx,2))-1),xx(1,:),'linewidth',1)
167 | xlabel('Time (s)');
168 | ylabel('$x_{1}$','Interpreter','latex');
169 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
170 | grid on;
171 | subplot(2,1,2)
172 | plot(T*((1:size(xx,2))-1),xx(2,:),'linewidth',1)
173 | xlabel('Time (s)');
174 | ylabel('$x_{2}$','Interpreter','latex');
175 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
176 | grid on;
177 | 
178 | figure(2), % control 1
179 | plot(T*((1:size(uu1,2))-1),uu1,'linewidth',1)
180 | xlabel('Time (s)');
181 | ylabel('$u_1$','Interpreter','latex');
182 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
183 | grid on;
184 | 
185 | figure(3), % control 2
186 | plot(T*((1:size(uu2,2))-1),uu2,'linewidth',1)
187 | xlabel('Time (s)');
188 | ylabel('$u_2$','Interpreter','latex');
189 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
190 | grid on;
191 | 
192 | figure(4), % w1
193 | plot(T*((1:size(ww1,2))-1),ww1,'linewidth',1)
194 | xlabel('Time (s)');
195 | ylabel('$\mathcal{W}_{1}$','Interpreter','latex');
196 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
197 | grid on;
198 | 
199 | figure(5), % w2
200 | plot(T*((1:size(ww2,2))-1),ww2,'linewidth',1)
201 | xlabel('Time (s)');
202 | ylabel('$\mathcal{W}_{2}$','Interpreter','latex');
203 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
204 | grid on;
205 | 
206 | end
207 | %---------------------------- tndg dynamics -------------------------------
208 | function xdot = tndg_ode(~,x)
209 | global Ga1; global Ga2;
210 | global Q1; global R11; global R12;
211 | global Q2; global R21; global R22;
212 | global Qa1; global Ra11; global Ra12;
213 | global Qa2; global Ra21; global Ra22;
214 | global w1; global w2;
215 | global dw1; global dw2;
216 | 
217 | x_state = [ x(1); x(2); ];
218 | 
219 | u1 = x(3);
220 | u2 = x(4);
221 | 
222 | u = [ u1; u2 ];
223 | 
224 | s = [ x_state; u ];
225 | 
226 | dphi = tndg_nn_pf_dphi(s);
227 |     
228 | v1 = -0.5*inv(Ra11)*Ga1'*dphi'*w1;
229 | v2 = -0.5*inv(Ra22)*Ga2'*dphi'*w2;
230 | 
231 | g1 = [ 0; cos(2*x(1))+2 ];
232 | g2 = [ 0; sin(4*x(1)^2)+2 ];  
233 | 
234 | tndg_dynamics = [  -2*x(1) + x(2);...
235 |                    -0.5*x(1) - x(2) + x(1)^2*x(2) + 0.05*x(2)*(cos(2*x(1))+2)^2 + 0.05*x(2)*(sin(4*x(1))+2)^2  ]...
236 |                 + g1*u1 + g2*u2; 
237 | 
238 | xdot = [ tndg_dynamics;...
239 |          v1;...
240 |          v2;...
241 |          x_state'*Q1*x_state + u1'*R11*u1 + u2'*R12*u2;...
242 |          x_state'*Q2*x_state + u1'*R21*u1 + u2'*R22*u2;...
243 |          s'*Qa1*s + v1'*Ra11*v1 + v2'*Ra12*v2;...
244 |          s'*Qa2*s + v1'*Ra21*v1 + v2'*Ra22*v2;...
245 |          dw1;...
246 |          dw2 ];
247 | end
248 | %------------------- activation function vector of critic ------------------
249 | function y = tndg_nn_pf(x)
250 | x1 = x(1);
251 | x2 = x(2);
252 | x3 = x(3);
253 | x4 = x(4);
254 | 
255 | y = [ x1^2; x1*x2; x1*x3; x1*x4;...
256 |       x2^2; x2*x3; x2*x4;...
257 |       x3^2; x3*x4;...
258 |       x4^2 ];
259 | end
260 | %-------------- derivative of critic activation function vector -----------
261 | function dphi = tndg_nn_pf_dphi(x)
262 | x1 = x(1);
263 | x2 = x(2);
264 | x3 = x(3);
265 | x4 = x(4);
266 | 
267 | dphi = [
268 |          [  2*x1,          0,          0,          0  ];
269 |          [    x2,         x1,          0,          0  ]; 
270 |          [    x3,          0,         x1,          0  ]; 
271 |          [    x4,          0,          0,         x1  ]; 
272 |          [     0,       2*x2,          0,          0  ];
273 |          [     0,         x3,         x2,          0  ];
274 |          [     0,         x4,          0,         x2  ];
275 |          [     0,          0,       2*x3,          0  ];
276 |          [     0,          0,         x4,         x3  ];
277 |          [     0,          0,          0,       2*x4  ];   
278 |        ];
279 | end
280 | 
281 | 
282 | 
283 | 
284 | 


--------------------------------------------------------------------------------
/online_learning_policy_update/puol_algorithm.m:
--------------------------------------------------------------------------------
  1 | function puol_algorithm
  2 | % This demo checks the feasibility of policy update online learning
  3 | 
  4 | % By J. Lu
  5 | % Date: Apr. 16, 2021
  6 | 
  7 | %-------------------------------- start -----------------------------------
  8 | clear; close all; clc;
  9 | 
 10 | A = [0 -0.8;0.8 1.8]; B = [0; -1];
 11 | Q = eye(2); R = 1;
 12 | 
 13 | ac = 10e-6;
 14 | aa = 0.1;
 15 | wc = zeros(8,1);
 16 | wa = [0.5;1.4;0;0;0;0];
 17 | wa_ini = wa;
 18 | Nc = 4;
 19 | 
 20 | x = 5*[(rand(1,1)-0.5);(rand(1,1)-0.5)];
 21 | xx = [x];
 22 | xxc = [];
 23 | rrc = [];
 24 | uu = [];
 25 | wwc = [wc];
 26 | wwa = [wa];
 27 | 
 28 | N = 10000;
 29 | 
 30 | h = waitbar(0,'please wait');
 31 | for k = 1:N
 32 |     if length(xxc) < Nc
 33 |         u = actor(wa,actor_activation_function(x));
 34 |     else
 35 |         if length(xxc) == Nc
 36 |             wa = actor_weights(x,actor(wa,actor_activation_function(x)),wa,aa,wc,A,B,Q,R);
 37 |             u = actor(wa,actor_activation_function(x));
 38 |         end
 39 |     end
 40 | 
 41 |     r = x'*Q*x+u'*R*u;
 42 | 
 43 |     % update system states
 44 |     x = controlled_system(x,u,A,B);
 45 |     
 46 |     xx = [xx x];
 47 |     uu = [uu u];
 48 |     
 49 |     if length(xxc) < Nc
 50 |         xxc = [xxc x];
 51 |         rrc = [rrc r];
 52 |     else
 53 |         if length(xxc) == Nc
 54 |             xxc_temp = xxc(:,2:end);
 55 |             xxc_temp = [xxc_temp x];
 56 |             xxc = xxc_temp;
 57 |             
 58 |             rrc_temp = rrc(:,2:end);
 59 |             rrc_temp = [rrc_temp r];
 60 |             rrc = rrc_temp;
 61 |         end
 62 |     end
 63 |     
 64 |     if length(xxc) == Nc 
 65 |         wc = critic_weights(xxc,rrc,wc,ac);
 66 |     end
 67 |     
 68 |     if norm(x,2)<10e-5  % states converge to 0 and reset the initial states
 69 |         % usually, it is necessary to change controls to change system states
 70 |         % since the paper does not explain how to excite the system
 71 |         % a simple reset of initial states is used
 72 |         x = 5*[(rand(1,1)-0.5);(rand(1,1)-0.5)];
 73 |         xxc = [];
 74 |         rrc = [];
 75 |     end
 76 | 
 77 |     wwc = [wwc wc];
 78 |     wwa = [wwa wa];
 79 |     
 80 |     waitbar(k/N,h,['Running...',num2str(k/N*100),'%']);
 81 | end
 82 | close(h);
 83 | 
 84 | % check results
 85 | wa_ini = wa_ini'
 86 | 
 87 | wc_final = wc'
 88 | wa_final = wa'
 89 | 
 90 | [Kopt, Popt] = dlqr(A,B,Q,R);
 91 | 
 92 | wc_opt = [ Popt(1,1) 2*Popt(1,2) Popt(2,2) ]
 93 | wa_opt = [-Kopt(1,1) -Kopt(1,2)]
 94 | 
 95 | 
 96 | figure(1), % states 
 97 | plot(((1:size(xx,2))-1),xx,'linewidth',1)
 98 | xlabel('Time steps');
 99 | ylabel('States');
100 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
101 | grid on;
102 | 
103 | figure(2), % control
104 | plot(((1:size(uu,2))-1),uu,'b','linewidth',1)
105 | xlabel('Time steps');
106 | ylabel('Control');
107 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
108 | grid on;
109 | 
110 | figure(3), % critic weights
111 | plot(((1:size(wwc,2))-1),wwc,'linewidth',1)
112 | xlabel('Time (s)');
113 | ylabel('Critic weights');
114 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
115 | grid on;
116 | 
117 | figure(4), % actors weights
118 | plot(((1:size(wwa,2))-1),wwa,'linewidth',1)
119 | xlabel('Time (s)');
120 | ylabel('Action weights');
121 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
122 | grid on;
123 | 
124 | end
125 | 
126 | %--------------------------- outpout of system ----------------------------
127 | function y = controlled_system(x,u,A,B)
128 | y = A*x + B*u; 
129 | end
130 | %-------------------------------- actor -----------------------------------
131 | function y = actor(wa,oa)
132 | y = wa'*oa;
133 | end
134 | %----------------------- critic activation function -----------------------
135 | function y = critic_activation_function(x)
136 | y = [ x(1)^2;             x(1)*x(2);      x(2)^2;...
137 |       x(1)^4;             x(1)^3*x(2);    x(1)^2*x(2)^2;...
138 |       x(1)*x(2)^3;        x(2)^4 ];
139 | end
140 | %----------------------- action activation function -----------------------
141 | function y = actor_activation_function(x)
142 | y = [ x(1) x(2) (x(1)^2)*x(2) x(1)^3 x(1)*x(2)^2 x(2)^3]';
143 | end
144 | %------------------------- update critic weights --------------------------
145 | function wc = critic_weights(xx,rr,w,ac)
146 | X = [];
147 | Y = [];
148 | 
149 | for k = length(xx):-1:2
150 |     X = [ X critic_activation_function(xx(:,k)) -...
151 |             critic_activation_function(xx(:,k-1)) ];
152 | end
153 | 
154 | for j = length(rr):-1:2
155 |     Y = [ Y rr(j)];
156 | end
157 | 
158 | E = Y + w'*X;
159 | wc = X*pinv(X'*X)*(ac*E'-Y');
160 | end
161 | %-------------------------- update actor weights --------------------------
162 | function wa = actor_weights(x,u0,w,aa,wc,A,B,Q,R)
163 | rho = actor_activation_function(x);
164 | 
165 | objective = @(u) x'*Q*x+u'*R*u + ...
166 |                  wc'*critic_activation_function(controlled_system(x,u,A,B));
167 | uopt = fminunc(objective,u0);
168 | 
169 | ue =  w'*rho - uopt;
170 | 
171 | wa = w - aa*rho*ue/(rho'*rho+1);
172 | end
173 | 
174 | 
175 | 
176 | 
177 | 
178 | 
179 | 
180 | 
181 | 
182 | 
183 | 
184 | 
185 | 
186 | 
187 | 


--------------------------------------------------------------------------------
/online_learning_without_initial_admissible_control/ol_algorithm.m:
--------------------------------------------------------------------------------
  1 | function ol_algorithm
  2 | % This demo checks the feasibility of online leranbing without 
  3 | % initial admissible control
  4 | 
  5 | % By J. Lu
  6 | % Date: Mar. 16, 2021
  7 | 
  8 | %-------------------------------- start -----------------------------------
  9 | clear; close all; clc;
 10 | 
 11 | global u;
 12 | global Q; global R; 
 13 | global probing_noise;
 14 | 
 15 | Q = 1*eye(2); R = 1;
 16 | 
 17 | wc_ini = [0.1;0;0.1];
 18 | wc = wc_ini;
 19 | 
 20 | length_nn = length(wc_ini);
 21 | 
 22 | x_ini = [1;-1];
 23 | x0 = [x_ini; wc]';
 24 | xx = [x0'];
 25 | uu = [];
 26 | wwc = [wc];
 27 | 
 28 | % simulation settings
 29 | Fsamples = 5000;
 30 | T = 0.1; 
 31 | ss = T/10; 
 32 | tspan = 0:ss:T;
 33 | 
 34 | % generate the probing noise
 35 | A = 10;
 36 | noise = sqrt(A)*randn(1,Fsamples/T);
 37 | 
 38 | N = length(noise);
 39 | n = 1;
 40 | 
 41 | h = waitbar(0,'please wait');
 42 | for k = 1:Fsamples
 43 |     probing_noise = noise(n);
 44 | 
 45 |     [t,x]= ode45(@spiol_ode, tspan, x0);
 46 | 
 47 |     x0 = x(length(t),:);
 48 |     xx = [xx x0'];
 49 |     uu = [uu u];
 50 |     wwc = [wwc, x0(3:end)'];
 51 | 
 52 |     n = n + 1;
 53 | 
 54 |     if n > N
 55 |         n = 1;
 56 |     end
 57 | 
 58 |     waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']);
 59 | end
 60 | close(h)
 61 | 
 62 | % check results
 63 | wc_ini = wc_ini'
 64 | wc_final = x0(3:end)
 65 | 
 66 | wc_opt = [0.5 0 1]
 67 | 
 68 | % results
 69 | figure(1), % states 
 70 | plot(T*((1:size(xx,2))-1),xx(1:2,:),'linewidth',1)
 71 | xlabel('Time (s)');
 72 | ylabel('$x$','Interpreter','latex');
 73 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
 74 | grid on;
 75 | 
 76 | figure(2), % control
 77 | plot(T*((1:size(uu,2))-1),uu,'b','linewidth',1)
 78 | xlabel('Time (s)');
 79 | ylabel('Control');
 80 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
 81 | grid on;
 82 | 
 83 | figure(3), % critic weights
 84 | plot(T*((1:size(wwc,2))-1),wwc,'linewidth',1)
 85 | xlabel('Time (s)');
 86 | ylabel('Critic weights');
 87 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
 88 | grid on;
 89 | 
 90 | end
 91 | 
 92 | %-------------------------------- ode -------------------------------------
 93 | function sdot = spiol_ode(~,s)
 94 | global u;
 95 | global Q; global R;
 96 | global probing_noise;
 97 | 
 98 | % learning parameters
 99 | a1 = 25;
100 | a2 = 0.01;
101 | 
102 | x = s(1:2);
103 | wc = s(3:end);
104 | 
105 | f = [ -x(1)+x(2);...
106 |       -x(1)/2-x(2)*(1-(cos(2*x(1)+2)^2))/2 ];
107 | 
108 | g = [ 0;...
109 |       cos(2*x(1)+2)];
110 | 
111 | pphi = pd_phi(x);
112 | 
113 | ux = -0.5*inv(R)*g'*pphi'*wc;
114 | 
115 | % candidate Lyapunov function V = 0.5*(x1^2+x2^2);
116 | pV = [ x(1); x(2) ];
117 | 
118 | dV = x'*(f+g*u);
119 | 
120 | if dV > 0
121 |     k = 0.5;
122 | else
123 |     k = 0;
124 | end
125 | 
126 | sigma = pphi*(f+g*ux);
127 | 
128 | dwc = -a1*sigma/((sigma'*sigma+1)^2)*(sigma'*wc+x'*Q*x+ux'*R*ux)...
129 |                  + k*a2*pphi*g*inv(R)*g'*pV;
130 | 
131 | u = ux + probing_noise;
132 | 
133 | sdot = [ f+g*u; dwc ];
134 | end 
135 | %------------- partial derivative of activation function vector -----------
136 | function pphi = pd_phi(x)
137 | pphi = [ 2*x(1)     0;...
138 |            x(2)     x(1);...
139 |            0      2*x(2)];
140 | end 
141 | 
142 | 
143 | 
144 | 
145 | 


--------------------------------------------------------------------------------
/parallel_control_based_optimal_tracking/pc_ot_adp_main.m:
--------------------------------------------------------------------------------
  1 | % Code for paper
  2 | % J. Lu, Q. Wei, and F.-Y. Wang, "Parallel control for optimal tracking via
  3 | % adaptive dynamic programming," IEEE/CAA Journal of Automatica Sinica,
  4 | % vol. 7, no. 6, pp. 1662-1674, Nov. 2020.
  5 | %
  6 | % The paper is available at:
  7 | % https://www.ieee-jas.net/en/article/doi/10.1109/JAS.2020.1003426
  8 | %
  9 | % The main contribution of this paper is to propose an online optimal
 10 | % tracking control method for nonaffine systems without reconstructing the
 11 | % nonaffine system into an affine form at a certain operating point
 12 | %
 13 | % P.S. This code is slightly different from the paper 
 14 | %
 15 | % By J. Lu
 16 | % Date: Apr. 8, 2023
 17 | 
 18 | %-------------------------------- start -----------------------------------
 19 | clear;close all;clc;
 20 | 
 21 | global Ga;
 22 | global Q; global R; global Qa; global Ra;
 23 | global w;
 24 | global a1; global a2;
 25 | global ra;
 26 | global ea;
 27 | global v;
 28 | 
 29 | % parameters
 30 | r = [0.5;0.125]; % reference signal
 31 | ur = 1.1947448; % reference control
 32 | % this code constructs the augmented system and reference signal first and derives the augmented tracking error system later 
 33 | % the paper obtains the tracking error system first and derives the augmented tracking error system later 
 34 | % both methods are acceptable
 35 | ra = [r;ur]; % augmented reference signal
 36 | 
 37 | Ga = [zeros(2,1); diag([1])];
 38 | 
 39 | Q = 1*eye(2); R = 1*eye(1);
 40 | Qa = diag([diag(Q)' diag(R)']); Ra = 1*diag([1]);
 41 | 
 42 | 
 43 | Fsamples = 20000;
 44 | T = 0.05; 
 45 | ss = T/10; 
 46 | 
 47 | x00 = [0;0];
 48 | u00 = 0;
 49 | w00 = zeros(12,1); % initial critic network weights
 50 | x = [x00;u00;0;0;w00];
 51 | w = x(6:end);
 52 | x0 = x;
 53 | xx = [x];
 54 | uu = [x(3)];
 55 | rr = [r];
 56 | vv = [];
 57 | 
 58 | ww = [w];
 59 | 
 60 | % learning rates are different from the paper due to the introduction of the probing noise
 61 | a1 = 20;
 62 | a2 = 0.1;
 63 | 
 64 | Jo = 0;
 65 | Ja = 0;
 66 | 
 67 | rpb_k = Fsamples - 200;
 68 | 
 69 | 
 70 | h = waitbar(0,'please wait');
 71 | for k = 1:Fsamples
 72 |       
 73 |     if k~= 1 && norm(ea(1:3),2) <= 0.5 && k <= rpb_k % probing noise
 74 |         x = [ra + 0.5*(rand(3,1)-0.5);x(4:end)'];
 75 |     end
 76 |     
 77 |     tspan = 0:ss:T;
 78 |     [t,x]= ode45(@pc_ot_adp_ode, tspan, x);
 79 |         
 80 |     Jo = Jo + x(length(t),4);
 81 |     Ja = Ja + x(length(t),5);
 82 |     
 83 |     x = [ x(length(t),1:3),0,0,x(length(t),6:end) ];
 84 |     w = x(6:end);
 85 |     
 86 |     rr = [rr r];
 87 |     xx = [xx x'];
 88 |     uu = [uu x(3)];
 89 |     vv = [vv v];
 90 |     ww = [ww w'];
 91 |     
 92 |     waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']);
 93 | end
 94 | close(h);
 95 | 
 96 | Jo
 97 | Ja
 98 | 
 99 | % result
100 | figure(1), % states 
101 | plot(T*((1:size(rr,2))-1),rr(1,:),'r','linewidth',1),hold on;
102 | plot(T*((1:size(xx,2))-1),xx(1,:),'b--','linewidth',1);
103 | xlabel('Time (s)');
104 | ylabel('$x_1$','Interpreter','latex');
105 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
106 | grid on;
107 | 
108 | figure(2), % states 
109 | plot(T*((1:size(rr,2))-1),rr(2,:),'r','linewidth',1),hold on;
110 | plot(T*((1:size(xx,2))-1),xx(2,:),'b--','linewidth',1);
111 | xlabel('Time (s)');
112 | ylabel('$x_2$','Interpreter','latex');
113 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
114 | grid on;
115 | 
116 | figure(3), % real control
117 | plot(T*((1:size(uu,2))-1),uu,'b','linewidth',1)
118 | xlabel('Time (s)');
119 | ylabel('$u$','Interpreter','latex');
120 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
121 | grid on;
122 | 
123 | figure(4), % virtual control
124 | plot(T*((1:size(vv,2))-1),vv,'b','linewidth',1)
125 | xlabel('Time (s)');
126 | ylabel('$v$','Interpreter','latex');
127 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
128 | grid on;
129 | 
130 | figure(5), % wc
131 | plot(T*((1:size(ww,2))-1),ww,'linewidth',1)
132 | xlabel('Time (s)');
133 | ylabel('Critic newtork weights');
134 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
135 | grid on;
136 | 
137 | 
138 | 
139 | 
140 | 


--------------------------------------------------------------------------------
/parallel_control_based_optimal_tracking/pc_ot_adp_nn_pf_dphi.m:
--------------------------------------------------------------------------------
 1 | function dphi = pc_ot_adp_nn_pf_dphi(x) 
 2 | dphi = [
 3 |          [  2*x(1),       0,         0   ];
 4 |          [    x(2),    x(1),         0   ];
 5 |          [    x(3),       0,      x(1)   ];
 6 |          [       0,  2*x(2),         0   ];
 7 |          [       0,    x(3),      x(2)   ];
 8 |          [       0,       0,    2*x(3)   ];
 9 |          [  3*x(1)^2,         0,         0   ];
10 |          [         0,  3*x(2)^2,         0   ];
11 |          [         0,         0,  3*x(3)^2   ];
12 |          [  4*x(1)^3,         0,         0   ];
13 |          [         0,  4*x(2)^3,         0   ];
14 |          [         0,         0,  4*x(3)^3   ];
15 |         ];
16 | end


--------------------------------------------------------------------------------
/parallel_control_based_optimal_tracking/pc_ot_adp_ode.m:
--------------------------------------------------------------------------------
 1 | function xdot = pc_ot_adp_ode(~,x)
 2 | 
 3 | global Ga;
 4 | global Q; global R; global Qa; global Ra;
 5 | global dw;
 6 | global w;
 7 | global a1; global a2;
 8 | global ra;
 9 | global ea;
10 | global v;
11 | 
12 | w = x(6:end);
13 | 
14 | u = x(3);
15 | 
16 | fo_x = [-x(1)^3 + x(2); x(1)^2 - x(1) - x(2) + 0.15*u^3 + sin(0.1*u) ]; % nonaffine system
17 | 
18 | fo_r = [-ra(1)^3 + ra(2); ra(1)^2 - ra(1) - ra(2) + 0.15*ra(3)^3 + sin(0.1*ra(3)) ]; % reference signal
19 | 
20 | % construct the augmented system and reference signal first and derive the augmented tracking error system later 
21 | % the paper obtains the tracking error system first and derives the augmented tracking error system later 
22 | % both methods are acceptable
23 | fa_e = [fo_x - fo_r; 0]; 
24 | 
25 | ea = x(1:3) - ra;
26 | 
27 | dphi = pc_ot_adp_nn_pf_dphi(ea(1:3));
28 | 
29 | v = -0.5*inv(Ra)*Ga'*dphi'*w;
30 | 
31 | dVe = ea'*(fa_e+Ga*v); % V = 0.5*(e1^2+e2^2+e3^2);
32 | 
33 | pVe = [ea(1);ea(2);ea(3)];
34 | 
35 | if dVe > 0
36 |     k = 0.5;
37 | end
38 | if dVe <= 0
39 |     k = 0;
40 | end
41 | 
42 | sigma = dphi*(fa_e+Ga*v);
43 | dw = -a1*sigma/((sigma'*sigma + 1)^2)*(sigma'*w + ea'*Qa*ea + v'*Ra*v)+k*a2*dphi*Ga*inv(Ra)*Ga'*pVe;
44 | 
45 | xdot = [ fo_x; v;...
46 |          [ea(1);ea(2)]'*Q*[ea(1);ea(2)]+ea(3)'*R*ea(3);...
47 |          [ea(1);ea(2);ea(3)]'*Qa*[ea(1);ea(2);ea(3)]+v'*Ra*v;...
48 |          dw];
49 | 
50 | end 


--------------------------------------------------------------------------------
/policy_iteration/pi_algorithm.m:
--------------------------------------------------------------------------------
  1 | function pi_algorithm
  2 | % This demo checks the feasibility of the policy iteration adaptive dynamic 
  3 | % programming  algorithm
  4 | 
  5 | % By J. Lu
  6 | % Date: Apr. 21, 2020 
  7 | 
  8 | %-------------------------------- start -----------------------------------
  9 | clear; close all; clc;
 10 | 
 11 | % information of system & cost function
 12 | global A; global B; global Q; global R;
 13 | 
 14 | load training_data/state_data.mat;
 15 | load training_data/actor_init.mat;
 16 | 
 17 | % action network
 18 | actor = actor_init;
 19 | 
 20 | actor_epoch = 20000;
 21 | actor_err_goal = 1e-9;
 22 | actor_lr = 0.01;
 23 | actor.trainParam.epochs = actor_epoch; 
 24 | actor.trainParam.goal = actor_err_goal; 
 25 | actor.trainParam.show = 10; 
 26 | actor.trainParam.lr = actor_lr; 
 27 | 
 28 | % critic network
 29 | critic_middle_num = 15;
 30 | critic_epoch = 10000;
 31 | critic_err_goal = 1e-9;
 32 | critic_lr = 0.01;
 33 | critic = newff(minmax(x_train), [critic_middle_num 1], {'tansig' 'purelin'},'trainlm');
 34 | critic.trainParam.epochs = critic_epoch;
 35 | critic.trainParam.goal = critic_err_goal; 
 36 | critic.trainParam.show = 10;  
 37 | critic.trainParam.lr = critic_lr; 
 38 | critic.biasConnect = [1;0];
 39 | 
 40 | epoch = 10;
 41 | eval_step = 400;
 42 | performance_index = ones(1,epoch + 1);
 43 | 
 44 | figure(1),hold on;
 45 | h = waitbar(0,'Please wait');
 46 | for i = 1:epoch
 47 |     % update critic
 48 |     % evaluate policy
 49 |     critic_target = evaluate_policy(actor, x_train, eval_step);
 50 |     critic = train(critic,x_train,critic_target); 
 51 |     
 52 |     performance_index(i) = critic(x0);
 53 |     figure(1),plot(i,performance_index(i),'*'),xlim([1 epoch]),hold on;
 54 |     
 55 |     waitbar(i/epoch,h,['Training controller...',num2str(i/epoch*100),'%']);
 56 |     if i == epoch
 57 |         break;
 58 |     end
 59 |     
 60 |     % update actor
 61 |     actor_target = zeros(control_dim,size(x_train,2));
 62 |     for j = 1:size(x_train,2)
 63 |         x = x_train(:,j);
 64 |         if x == zeros(state_dim,1)
 65 |             ud = zeros(control_dim,1);
 66 |         else
 67 |             objective = @(u) cost_function(x,u) + critic(controlled_system(x,u));
 68 |             u0 = actor(x);
 69 |             ud = fminunc(objective, u0);
 70 |         end
 71 |         actor_target(:,j) = ud;
 72 |     end
 73 |     
 74 |     actor = train(actor, x_train, actor_target);
 75 | end
 76 | close(h)
 77 | 
 78 | figure(1),
 79 | xlabel('Iterations');
 80 | ylabel('$V(x_0)$','Interpreter','latex');
 81 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
 82 | grid on;
 83 | hold off;
 84 | 
 85 | save training_results/actor_critic actor critic
 86 | end
 87 | 
 88 | %---------------------------- evaluate policy -----------------------------
 89 | function y = evaluate_policy(actor,x,eval_step)
 90 | critic_target = zeros(1,size(x,2));
 91 | for k = 1:eval_step
 92 |     uep = actor(x);
 93 |     critic_target = critic_target +  cost_function(x,uep);
 94 |     x = controlled_system(x,uep);
 95 | end
 96 | y = critic_target;
 97 | end
 98 | %--------------------------- outpout of system ----------------------------
 99 | function y = controlled_system(x,u)
100 | % system matrices
101 | global A; global B;
102 | y = A*x + B*u;  % dot product should be adopt in nolinear systems
103 | end
104 | %----------------------------- cost function ------------------------------
105 | function y = cost_function(x,u)
106 | global Q; global R;
107 | y = (diag(x'*Q*x) + diag(u'*R*u))';
108 | end
109 | 
110 | 


--------------------------------------------------------------------------------
/policy_iteration/pi_icp.m:
--------------------------------------------------------------------------------
 1 | %-------------------------- obtain the initial admissible control ----------------------------
 2 | clear; close all; clc;
 3 | 
 4 | load training_data/state_data.mat
 5 | 
 6 | [K, P] = dlqr(A,B,Q,100*R);
 7 | 
 8 | actor_target = -K*x_train;
 9 | 
10 | cover = 1;
11 | if isempty(dir('training_data/actor_init.mat')) == 1 || cover == 1
12 |     % action network
13 |     actor_init_middle_num = 15;
14 |     actor_init_epoch = 10000;
15 |     actor_init_err_goal = 1e-9;
16 |     actor_init_lr = 0.01;
17 |     actor_init = newff(minmax(x_train), [actor_init_middle_num control_dim], {'tansig' 'purelin'},'trainlm');
18 |     actor_init.trainParam.epochs = actor_init_epoch;
19 |     actor_init.trainParam.goal = actor_init_err_goal;
20 |     actor_init.trainParam.show = 10;
21 |     actor_init.trainParam.lr = actor_init_lr;
22 |     actor_init.biasConnect = [1;0];
23 |     
24 |     actor_init = train(actor_init, x_train, actor_target);
25 |     
26 |     save training_data/actor_init actor_init
27 | else
28 |     load training_data/actor_init
29 | end
30 | 
31 | 
32 | %-------------------------- test the initial control ----------------------------
33 | x = x0;
34 | x_net = x;
35 | 
36 | xx = x;
37 | xx_net = x_net;
38 | uu = [];
39 | uu_net = [];
40 | 
41 | Fsamples = 200;
42 | JK = 0;
43 | Jnet = 0;
44 | 
45 | h = waitbar(0,'Please wait');
46 | for k = 1:Fsamples
47 |     u = -K*x;
48 |     u_net = actor_init(x_net);
49 |     JK = JK + x'*Q*x + u'*R*u;
50 |     Jnet = Jnet + x_net'*Q*x_net + u_net'*R*u_net;
51 |     x = A*x + B*u;
52 |     xx = [xx x];
53 |     x_net = A*x_net + B*u_net;
54 |     xx_net = [xx_net x_net];
55 |     uu = [uu u];
56 |     uu_net = [uu_net u_net];
57 |     waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']);
58 | end
59 | close(h)
60 | 
61 | JK
62 | Jnet
63 | 
64 | figure,
65 | plot(0:Fsamples,xx,'b-',0:Fsamples,xx_net,'r--','linewidth',1)
66 | xlabel('Time steps');
67 | ylabel('States'); 
68 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
69 | grid on;
70 | 
71 | figure,
72 | plot(0:Fsamples-1,uu,'b-',0:Fsamples-1,uu_net,'r--','linewidth',1)
73 | xlabel('Time steps');
74 | ylabel('Control');
75 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
76 | grid on;
77 | 
78 | 
79 | 
80 | 


--------------------------------------------------------------------------------
/policy_iteration/pi_state_data.m:
--------------------------------------------------------------------------------
 1 | %------------------------- generate training data & system information ----------------------------
 2 | clear; close all; clc;
 3 | 
 4 | % system matrices
 5 | A = [  0,      0.1;...
 6 |        0.3,    -1   ];
 7 | B = [  0;...
 8 |        0.5  ];
 9 | 
10 | state_dim = size(A,1);
11 | control_dim = size(B,2);
12 | 
13 | % cost function parameters
14 | Q = 1*eye(state_dim);
15 | R = 1*eye(control_dim);
16 | 
17 | % training data
18 | x_train = zeros(state_dim,1);
19 | x0 = [1;-1];
20 | 
21 | for i = 1:50
22 |     x_train = [x_train, zeros(state_dim,1)];  
23 |     x_train = [x_train,2*(rand(state_dim,1)-0.5)]; 
24 |     x_train = [x_train,1*(rand(state_dim,1)-0.5)];
25 |     x_train = [x_train,0.5*(rand(state_dim,1)-0.5)];
26 | end
27 | 
28 | r = randperm(size(x_train,2));   % randomization according to column
29 | x_train = x_train(:, r);         % reorder
30 | 
31 | save training_data/state_data x_train state_dim control_dim A B Q R x0;
32 | 
33 | 
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/policy_iteration/pi_validation.m:
--------------------------------------------------------------------------------
 1 | clear; close all; clc;
 2 | 
 3 | load training_data/state_data.mat;
 4 | load training_results/actor_critic.mat
 5 | 
 6 | 
 7 | [Kopt, Popt] = dlqr(A,B,Q,R);
 8 | 
 9 | x = x0;
10 | x_net = x;
11 | xx = x;
12 | xx_net = x_net;
13 | uu_opt = [];
14 | uu_net = [];
15 | 
16 | Jreal = 0;
17 | 
18 | Fsamples = 50;
19 | h = waitbar(0,'Please wait');
20 | for k = 1:Fsamples
21 |     uopt = -Kopt*x;
22 |     x = A*x + B*(uopt);
23 |     xx = [xx x];
24 |     u_net = sim(actor,x_net);
25 |     Jreal = Jreal + x_net'*Q*x_net + u_net'*R*u_net;
26 |     x_net = A*x_net + B*u_net;
27 |     xx_net = [xx_net x_net];
28 |     uu_opt = [uu_opt uopt];
29 |     uu_net = [uu_net u_net];
30 |     waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']);
31 | end
32 | close(h)
33 | 
34 | Jopt = x0'*Popt*x0
35 | Jnet = critic(x0)
36 | Jreal
37 | 
38 | figure,
39 | plot(0:Fsamples,xx,'b-',0:Fsamples,xx_net,'r--','linewidth',1)
40 | xlabel('Time steps');
41 | ylabel('States'); 
42 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
43 | grid on;
44 | figure,
45 | plot(0:Fsamples-1,uu_opt,'b-',0:Fsamples-1,uu_net,'r--','linewidth',1)
46 | legend('Optimal ','NN','Interpreter','latex'); 
47 | xlabel('Time steps');
48 | ylabel('Control');
49 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
50 | grid on;
51 | 
52 | 
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/policy_iteration/training_data/actor_init.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lujingweihh/Adaptive-dynamic-programming-algorithms/100893bfc9df3e55dc346720939fba4ee5cc8b93/policy_iteration/training_data/actor_init.mat


--------------------------------------------------------------------------------
/policy_iteration/training_data/state_data.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lujingweihh/Adaptive-dynamic-programming-algorithms/100893bfc9df3e55dc346720939fba4ee5cc8b93/policy_iteration/training_data/state_data.mat


--------------------------------------------------------------------------------
/policy_iteration/training_results/actor_critic.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lujingweihh/Adaptive-dynamic-programming-algorithms/100893bfc9df3e55dc346720939fba4ee5cc8b93/policy_iteration/training_results/actor_critic.mat


--------------------------------------------------------------------------------
/policy_iteration_continuous_time/pi_algorithm.m:
--------------------------------------------------------------------------------
  1 | function pi_algorithm
  2 | % This demo checks the feasibility of the policy iteration adaptive dynamic 
  3 | % programming algorithm for continuous-time systems
  4 | 
  5 | % By J. Lu
  6 | % Date: Nov. 27, 2020 
  7 | 
  8 | %-------------------------------- start -----------------------------------
  9 | clear; close all; clc;
 10 | 
 11 | % system information
 12 | A = [-0.0665   11.5       0         0;
 13 |          0     -2.5      2.5        0;
 14 |        -9.5      0    -13.736  -13.736;
 15 |         0.6      0       0         0];
 16 | B = [0; 0; 13.736; 0];
 17 | 
 18 | n = 4;
 19 | m = 1;
 20 | 
 21 | Q = eye(n); R = eye(m);
 22 | 
 23 | [~,P_opt] = lqr(A,B,Q,R);
 24 | [~,P_init] = lqr(A,B,Q,0.001*R); % obtain initial weights
 25 | 
 26 | w_opt = [ P_opt(1,1); 2*P_opt(1,2); 2*P_opt(1,3); 2*P_opt(1,4);...
 27 |           P_opt(2,2); 2*P_opt(2,3); 2*P_opt(2,4);...
 28 |           P_opt(3,3); 2*P_opt(3,4);...
 29 |           P_opt(4,4) ];
 30 | 
 31 | w_init = [ P_init(1,1); 2*P_init(1,2); 2*P_init(1,3); 2*P_init(1,4);...
 32 |            P_init(2,2); 2*P_init(2,3); 2*P_init(2,4);...
 33 |            P_init(3,3); 2*P_init(3,4);...
 34 |            P_init(4,4) ];
 35 | 
 36 | w = w_init;
 37 | 
 38 | ww = [ w ];
 39 | 
 40 | % generate training data
 41 | x0 = [ 1; -1; 0; 0 ];
 42 | x_train = [ x0 ];
 43 | 
 44 | for i = 1:100 
 45 |     x_train = [ x_train, zeros(n,1) ];
 46 |     x_train = [ x_train,10*(rand(n,1)-0.5) ];
 47 |     x_train = [ x_train,5*(rand(n,1)-0.5) ];
 48 |     x_train = [ x_train,2*(rand(n,1)-0.5) ];
 49 |     x_train = [ x_train,1*(rand(n,1)-0.5) ];
 50 |     x_train = [ x_train,0.5*(rand(n,1)-0.5) ];
 51 |     x_train = [ x_train,0.1*(rand(n,1)-0.5) ];
 52 |     x_train = [ x_train,0.05*(rand(n,1)-0.5) ];
 53 |     x_train = [ x_train,0.01*(rand(n,1)-0.5) ];
 54 | end
 55 | 
 56 | L = length(x_train);
 57 | 
 58 | epoch = 50;
 59 | 
 60 | h = waitbar(0,'please wait');
 61 | for k = 1:epoch 
 62 |     UU = zeros(L,1);
 63 |     XX = zeros(L,length(w));
 64 |     for i = 1:L
 65 |         x = x_train(:,i);
 66 | 
 67 |         dphi = [
 68 |                  [  2*x(1),          0,          0,          0  ];
 69 |                  [    x(2),       x(1),          0,          0  ];
 70 |                  [    x(3),          0,       x(1),          0  ];
 71 |                  [    x(4),          0,          0,       x(1)  ];
 72 |                  [       0,     2*x(2),          0,          0  ];
 73 |                  [       0,       x(3),       x(2),          0  ];
 74 |                  [       0,       x(4),          0,       x(2)  ];
 75 |                  [       0,          0,      2*x(3),         0  ];
 76 |                  [       0,          0,        x(4),      x(3)  ];
 77 |                  [       0,          0,          0,     2*x(4)  ];
 78 |             ];
 79 | 
 80 |         u = -0.5*inv(R)*B'*dphi'*w;
 81 | 
 82 |         UU(i) = x'*Q*x + u'*R*u;
 83 |         XX(i,:) = (dphi*(A*x + B*u))';
 84 |     end
 85 | 
 86 |     w = -XX\UU;
 87 | 
 88 |     ww = [ ww w ];
 89 |     
 90 |     waitbar(k/epoch,h,['Running...',num2str(k/epoch*100),'%']);
 91 | end
 92 | close(h);
 93 | 
 94 | % comparison
 95 | w_init_display = w_init'
 96 | w_final_display = w'
 97 | w_opt_display = w_opt'
 98 | 
 99 | figure(1), 
100 | plot(((1:size(ww,2))-1),ww,'linewidth',1)
101 | xlabel('Epoch');
102 | ylabel('$w$','Interpreter','latex');
103 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
104 | grid on;
105 | 
106 | end
107 | 
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 


--------------------------------------------------------------------------------
/value_iteration/training_data/state_data.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lujingweihh/Adaptive-dynamic-programming-algorithms/100893bfc9df3e55dc346720939fba4ee5cc8b93/value_iteration/training_data/state_data.mat


--------------------------------------------------------------------------------
/value_iteration/training_results/actor_critic.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lujingweihh/Adaptive-dynamic-programming-algorithms/100893bfc9df3e55dc346720939fba4ee5cc8b93/value_iteration/training_results/actor_critic.mat


--------------------------------------------------------------------------------
/value_iteration/vi_algorithm.m:
--------------------------------------------------------------------------------
  1 | function vi_algorithm
  2 | % This demo checks the feasibility of value iteration adaptive dynamic programming 
  3 | 
  4 | % By J. Lu
  5 | % Date: May 6, 2020 
  6 | 
  7 | %-------------------------------- start -----------------------------------
  8 | clear; close all; clc;
  9 | 
 10 | % information of system & cost function
 11 | global A; global B; global Q; global R;
 12 | 
 13 | load training_data/state_data.mat
 14 | 
 15 | % action network
 16 | actor_middle_num = 8;
 17 | actor_epoch = 20000;
 18 | actor_err_goal = 1e-7;
 19 | actor_lr = 0.05;
 20 | actor = newff(minmax(x_train), [actor_middle_num control_dim], {'tansig' 'purelin'},'trainlm');
 21 | actor.trainParam.epochs = actor_epoch; % max epochs
 22 | actor.trainParam.goal = actor_err_goal; % tolerance error
 23 | actor.trainParam.show = 10;  % interval
 24 | actor.trainParam.lr = actor_lr; % learning rate - traingd,traingdm
 25 | actor.biasConnect = [1;0];% bias 
 26 | 
 27 | % critic network
 28 | critic_middle_num = 8;
 29 | critic_epoch = 10000;
 30 | critic_err_goal = 1e-7;
 31 | critic_lr = 0.01;
 32 | critic = newff(minmax(x_train), [critic_middle_num 1], {'tansig' 'purelin'},'trainlm');
 33 | critic.trainParam.epochs = critic_epoch;
 34 | critic.trainParam.goal = critic_err_goal; 
 35 | critic.trainParam.show = 10;  
 36 | critic.trainParam.lr = critic_lr; 
 37 | critic.biasConnect = [1;0];
 38 | critic_last = critic;
 39 | 
 40 | epoch = 50;
 41 | tol = 1e-9;
 42 | performance_index = zeros(1,epoch + 1);
 43 | 
 44 | critic_set = cell(1,epoch);
 45 | actor_set = cell(1,epoch);
 46 | 
 47 | figure(1),plot(0,0,'*')
 48 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
 49 | grid on;
 50 | hold on;
 51 | h = waitbar(0,'please wait');
 52 | for i = 1:epoch
 53 |     % update action network
 54 |     actor_target = zeros(1,size(x_train,2));
 55 |     if i ~= 1
 56 |         for j = 1:size(x_train,2)
 57 |             x = x_train(:,j);
 58 |             if x == zeros(state_dim,1)
 59 |                 ut = zeros(control_dim,1);
 60 |             else
 61 |                 objective = @(u) cost_function(x,u) + critic(controlled_system(x,u));
 62 |                 u0 = actor(x);
 63 |                 ut = fminunc(objective,u0);
 64 |                 actor_target(j) = ut;
 65 |             end
 66 |         end
 67 |     end
 68 | 
 69 |     actor = train(actor, x_train, actor_target);    
 70 | 
 71 |     % update critic network
 72 |     if i == 1
 73 |         critic_target = cost_function(x_train,0);
 74 |     else
 75 |         x_next = controlled_system(x_train,actor(x_train));
 76 |         critic_target = cost_function(x_train,(actor(x_train))) + critic_last(x_next);
 77 |     end
 78 |     for j = 1:size(zeros_index,2)
 79 |         critic_target(:,zeros_index(j)) = zeros(1,1);
 80 |     end
 81 |     critic = train(critic,[x_train,-x_train],[critic_target,critic_target]);
 82 |     
 83 |     if i ~= 1 && mse(critic(x_train),critic_last(x_train)) <= tol 
 84 |         critic_set{i} = critic;
 85 |         actor_set{i} = actor;
 86 |         break;
 87 |     end
 88 |     
 89 |     critic_last = critic;
 90 |     
 91 |     critic_set{i} = critic;
 92 |     actor_set{i} = actor;
 93 |     
 94 |     performance_index(i+1) = critic(x0);
 95 |     figure(1),plot(i,performance_index(i+1),'*')
 96 |     waitbar(i/epoch,h,['Training controller...',num2str(i/epoch*100),'%']);
 97 | end
 98 | close(h);
 99 | 
100 | figure(1),
101 | xlabel('Iterations');
102 | ylabel('Iterative $V(x_0)$','Interpreter','latex');
103 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
104 | grid on;
105 | hold off;
106 | 
107 | save training_results/actor_critic critic_set actor_set critic actor
108 | 
109 | end
110 | 
111 | %--------------------------- outpout of system ----------------------------
112 | function y = controlled_system(x,u)
113 | % system matrices
114 | global A; global B;
115 | y = A*x + B*u;  % dot product should be adopt in nolinear systems
116 | end
117 | %----------------------------- cost function ------------------------------
118 | function y = cost_function(x,u)
119 | global Q; global R;
120 | y = (diag(x'*Q*x) + diag(u'*R*u))';
121 | end
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 
128 | 


--------------------------------------------------------------------------------
/value_iteration/vi_state_data.m:
--------------------------------------------------------------------------------
 1 | %-------------------------- generate training data ------------------------
 2 | clear; close all; clc;
 3 | 
 4 | A = [0,0.1; 0.3, -1]; B = [0;0.5];
 5 | Q = 1*eye(2); R = 1;
 6 | control_dim = 1;
 7 | state_dim = 2;
 8 | x0 = [1;-1];
 9 | 
10 | x_train = zeros(state_dim,1);
11 | u_train = zeros(control_dim,1);
12 | 
13 | for i = 1:50
14 |     x_train = [x_train, zeros(state_dim,1)];
15 |     x_train = [x_train,4*(rand(state_dim,1)-0.5)];   % [-2 2]
16 |     x_train = [x_train,2*(rand(state_dim,1)-0.5)];   % [-1 1]
17 |     x_train = [x_train,1*(rand(state_dim,1)-0.5)];   % [-0.5 0.5]
18 |     x_train = [x_train,0.2*(rand(state_dim,1)-0.5)]; % [-0.1 0.1]
19 |     u_train = [u_train,zeros(control_dim,1)];
20 |     u_train = [u_train,4*(rand(control_dim,1)-0.5)];   % [-2 2]
21 |     u_train = [u_train,2*(rand(control_dim,1)-0.5)];   % [-1 1]
22 |     u_train = [u_train,1*(rand(control_dim,1)-0.5)];   % [-0.5 0.5]
23 |     u_train = [u_train,0.2*(rand(control_dim,1)-0.5)]; % [-0.1 0.1]
24 | end
25 | 
26 | r = randperm(size(x_train,2));   % randomization according column
27 | x_train = x_train(:,r);         % reorder
28 | [~,n] = find(x_train == zeros(state_dim,1));
29 | 
30 | zeros_index = [];
31 | for i = 1:size(n,1)/state_dim
32 |     zeros_index = [zeros_index, n(state_dim*i)];
33 | end
34 | 
35 | save training_data/state_data x_train zeros_index x0 control_dim state_dim A B Q R
36 | 
37 | 
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/value_iteration/vi_validation.m:
--------------------------------------------------------------------------------
 1 | function vi_validation
 2 | %-------------- validate training results --------------
 3 | clear; close all; clc;
 4 | 
 5 | % information of system & cost function
 6 | global A; global B; global Q; global R;
 7 | 
 8 | load training_data/state_data.mat
 9 | load training_results/actor_critic.mat
10 | 
11 | [Kopt, Popt] = dlqr(A,B,Q,R);
12 | 
13 | Fsamples = 50; 
14 | 
15 | x0 = [1;-1];
16 | x = x0;
17 | x_net = x0;
18 | xx = x.*ones(length(x0),Fsamples+1);
19 | xx_net = x_net.*ones(length(x0),Fsamples+1);
20 | uu_opt = zeros(control_dim,Fsamples);
21 | uu_net = zeros(control_dim,Fsamples);
22 | 
23 | Jreal = 0;
24 | 
25 | for k = 1:Fsamples
26 |     u_opt = -Kopt*x;
27 |     x = controlled_system(x,u_opt);
28 |     uu_opt(:,k) = u_opt;
29 |     xx(:,k+1) = x;
30 |     
31 |     u_net = actor(x_net);
32 |     Jreal = Jreal + x_net'*Q*x_net + u_net'*R*u_net;
33 |     x_net = controlled_system(x_net,u_net);
34 |     uu_net(:,k) = u_net;
35 |     xx_net(:,k+1) = x_net;
36 | end
37 | 
38 | Jopt = x0'*Popt*x0
39 | Jnet = critic(x0)
40 | Jreal
41 | 
42 | figure(1)
43 | subplot(2,1,1)
44 | plot(0:Fsamples,xx(1,:),'r',0:Fsamples,xx_net(1,:),'b--','linewidth',1)
45 | legend('Optimal','Action network');
46 | ylabel('$x_1$','Interpreter','latex');
47 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
48 | grid on;
49 | subplot(2,1,2)
50 | plot(0:Fsamples,xx(2,:),'r',0:Fsamples,xx_net(2,:),'b--','linewidth',1)
51 | legend('Optimal','Action network');
52 | xlabel('Time steps');
53 | ylabel('$x_2$','Interpreter','latex');
54 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
55 | grid on;
56 | 
57 | figure(2)
58 | plot(0:Fsamples-1,uu_opt,'r',0:Fsamples-1,uu_net,'b--','linewidth',1)
59 | legend('Optimal','Action network');
60 | xlabel('Time steps');
61 | ylabel('Controls');
62 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
63 | grid on;
64 | 
65 | end
66 | 
67 | %--------------------------- outpout of system ----------------------------
68 | function y = controlled_system(x,u)
69 | % system matrices
70 | global A; global B;
71 | y = A*x + B*u;  % dot product should be adopt in nolinear systems
72 | end
73 | 
74 |     
75 | 
76 | 
77 | 
78 | 


--------------------------------------------------------------------------------
/value_iteration_positive_semi_definite_initial_value_function/vi_controlled_system.m:
--------------------------------------------------------------------------------
1 | function x_next = vi_controlled_system(x,u,A,B)
2 | %--------------------------- controlled system ----------------------------
3 | x_next = A*x + B*u;
4 | end


--------------------------------------------------------------------------------
/value_iteration_positive_semi_definite_initial_value_function/vi_cost_function.m:
--------------------------------------------------------------------------------
1 | function y = vi_cost_function(x,u,Q,R)
2 | %----------------------------- cost function ------------------------------
3 | y = (diag(x'*Q*x) + diag(u'*R*u))';
4 | end


--------------------------------------------------------------------------------
/value_iteration_positive_semi_definite_initial_value_function/vi_pf_actor.m:
--------------------------------------------------------------------------------
1 | function y = vi_pf_actor(x)
2 | y = [   x(1,:);   x(2,:)   ];
3 | end


--------------------------------------------------------------------------------
/value_iteration_positive_semi_definite_initial_value_function/vi_pf_critic.m:
--------------------------------------------------------------------------------
1 | function y = vi_pf_critic(x)
2 | y = [   x(1,:).^2;   x(1,:).*x(2,:);   x(2,:).^2;   ];
3 | end


--------------------------------------------------------------------------------
/value_iteration_positive_semi_definite_initial_value_function/vi_pf_main.m:
--------------------------------------------------------------------------------
  1 | % This demo checks the feasibility value iteration adaptive dynamic 
  2 | % programming using an arbitrary positive semi-definite function as
  3 | % the initial value function
  4 | 
  5 | % By J. Lu
  6 | % Date: May 6, 2020 
  7 | 
  8 | %-------------------------------- start -----------------------------------
  9 | clear; close all; clc;
 10 | 
 11 | % utilize vi_state_data.m to generate training data and then run this file
 12 | load state_data
 13 | 
 14 | % the value function of linear systems can be approximated using quadratic
 15 | % form and eigenvalues of its corresponding matrix are 3.9645 and 11.0355
 16 | weights_critic_ini = 10*[1;-0.5;0.5]; 
 17 | 
 18 | % eigenvalues can be computed by
 19 | % P_ini = [ weights_critic_ini(1) weights_critic_ini(2)/2; weights_critic_ini(2)/2 weights_critic_ini(3) ]
 20 | % eig(P_ini)
 21 | 
 22 | weights_critic = weights_critic_ini; 
 23 | weights_actor = zeros(2,1); 
 24 | 
 25 | xV = vi_pf_critic(x_train);
 26 | xV_minus = vi_pf_critic(-x_train);
 27 | xu = vi_pf_actor(x_train);
 28 | 
 29 | epoch = 20;
 30 | performance_index = zeros(1,epoch + 1);
 31 | 
 32 | performance_index(1) = weights_critic'*vi_pf_critic(x0);
 33 | 
 34 | weights_critic_set = weights_critic;
 35 | weights_actor_set = [];
 36 | 
 37 | figure(1),plot(0,performance_index(1),'*'),hold on;
 38 | h = waitbar(0,'please wait');
 39 | for i = 1:epoch
 40 |     % update action network
 41 |     % by optimal tool
 42 |     if i ~= 1
 43 |         actor_target = zeros(control_dim,size(x_train,2));
 44 |         for j = 1:size(x_train,2)
 45 |             x = x_train(:,j);
 46 |             if x == zeros(state_dim,1)
 47 |                 ut = zeros(control_dim,1);
 48 |             else
 49 |                 objective = @(u) vi_cost_function(x,u,Q,R) + ...
 50 |                     weights_critic'*vi_pf_critic(vi_controlled_system(x,u,A,B));
 51 |                 u0 = weights_actor'*vi_pf_actor(x);
 52 |                 ut = fminunc(objective,u0);
 53 |             end
 54 |             actor_target(:,j) = ut;
 55 |         end
 56 |         weights_actor = xu'\actor_target';
 57 |     end
 58 |     
 59 |     % update critic network
 60 |     if i == 1
 61 |         critic_target = vi_cost_function(x_train,0,Q,R)';
 62 |     else
 63 |         x_next = vi_controlled_system(x_train,weights_actor'*xu,A,B);
 64 |         critic_target = vi_cost_function(x_train,weights_actor'*xu,Q,R)'+ ...
 65 |                         (weights_critic'*vi_pf_critic(x_next))';
 66 |     end
 67 |     weights_critic = [ xV xV_minus]'\[ critic_target; critic_target];
 68 |     
 69 |     weights_actor_set = [weights_actor_set weights_actor];
 70 |     weights_critic_set = [weights_critic_set weights_critic];
 71 |     performance_index(i+1) = weights_critic'*vi_pf_critic(x0);
 72 |     figure(1),plot(i,performance_index(i+1),'*'),hold on;
 73 |     waitbar(i/epoch,h,['Training controller...',num2str(i/epoch*100),'%']);
 74 | end
 75 | close(h);
 76 | 
 77 | % check results
 78 | P_ini = [ weights_critic_ini(1) weights_critic_ini(2)/2; weights_critic_ini(2)/2 weights_critic_ini(3) ]
 79 | 
 80 | K_final = -weights_actor'
 81 | P_final = [ weights_critic(1) weights_critic(2)/2; weights_critic(2)/2 weights_critic(3) ]
 82 | 
 83 | [Kopt, Popt] = dlqr(A,B,Q,R)
 84 | 
 85 | 
 86 | figure(1),
 87 | xlabel('Iterations');
 88 | ylabel('Iterative $V(x_0)$','Interpreter','latex');
 89 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
 90 | grid on;
 91 | hold off;
 92 | 
 93 | figure(2),
 94 | plot((1:length(weights_critic_set))-1,weights_critic_set,'linewidth',1)
 95 | xlabel('Iterations');
 96 | ylabel('Critic NN weights'); 
 97 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
 98 | grid on;
 99 | 
100 | figure(3),
101 | plot((1:length(weights_actor_set))-1,weights_actor_set,'linewidth',1)
102 | xlabel('Iterations');
103 | ylabel('Action NN weights'); 
104 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1);
105 | grid on;
106 | 
107 | 
108 | 


--------------------------------------------------------------------------------
/value_iteration_positive_semi_definite_initial_value_function/vi_state_data.m:
--------------------------------------------------------------------------------
 1 | %-------------------------- generate training data ------------------------
 2 | clear; close all; clc;
 3 | 
 4 | A = [0,0.1; 0.3, -1]; B = [0;0.5];
 5 | Q = 1*eye(2); R = 1;
 6 | control_dim = 1;
 7 | state_dim = 2;
 8 | x0 = [1;-1];
 9 | 
10 | Jcompression = 1;
11 | ucompression = 1;
12 | 
13 | x_train = zeros(state_dim,1);
14 | u_train = zeros(control_dim,1);
15 | 
16 | for i = 1:30
17 |     x_train = [x_train, zeros(state_dim,1)];
18 |     x_train = [x_train,4*(rand(state_dim,1)-0.5)];   % [-2 2]
19 |     x_train = [x_train,2*(rand(state_dim,1)-0.5)];   % [-1 1]
20 |     x_train = [x_train,1*(rand(state_dim,1)-0.5)];   % [-0.5 0.5]
21 |     x_train = [x_train,0.2*(rand(state_dim,1)-0.5)]; % [-0.1 0.1]
22 |     u_train = [u_train,zeros(control_dim,1)];
23 |     u_train = [u_train,4*(rand(control_dim,1)-0.5)];   % [-2 2]
24 |     u_train = [u_train,2*(rand(control_dim,1)-0.5)];   % [-1 1]
25 |     u_train = [u_train,1*(rand(control_dim,1)-0.5)];   % [-0.5 0.5]
26 |     u_train = [u_train,0.2*(rand(control_dim,1)-0.5)]; % [-0.1 0.1]
27 | end
28 | 
29 | r = randperm(size(x_train,2));   % randomization according column
30 | x_train = x_train(:,r);         % reorder
31 | [~,n] = find(x_train == zeros(state_dim,1));
32 | 
33 | zeros_index = [];
34 | for i = 1:size(n,1)/state_dim
35 |     zeros_index = [zeros_index, n(state_dim*i)];
36 | end
37 | 
38 | ru = randperm(size(u_train,2));   % randomization according column
39 | u_train = u_train(:,ru);         % reorder
40 | 
41 | xu_train = [x_train; u_train];
42 | x_next_train = A*x_train + B*u_train;
43 | 
44 | save state_data x_train u_train xu_train x_next_train zeros_index x0 control_dim state_dim A B Q R
45 | 
46 | 
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------