├── README.md ├── integral_reinforcement_learning ├── irl_main.m └── irl_ode.m ├── model_free_integral_reinforcement_learning ├── README.md ├── eg1 │ ├── eg1_c1_main.m │ ├── eg1_c1_ode.m │ ├── eg1_c1_parallel_ode.m │ ├── eg1_c2_main.m │ ├── eg1_c2_nn_pf.m │ ├── eg1_c2_nn_pf_dphi.m │ └── eg1_c2_ode.m └── eg2 │ ├── eg2_c1_main.m │ ├── eg2_c1_nn_pf.m │ ├── eg2_c1_nn_pf_dphi.m │ ├── eg2_c1_ode.m │ ├── eg2_c2_main.m │ └── eg2_c2_ode.m ├── model_free_nonzero_sum_games ├── README.md ├── eg_cacc │ └── cacc_model_free_learning.m └── eg_two_player_ndg │ └── tndg_model_free_learning.m ├── online_learning_policy_update └── puol_algorithm.m ├── online_learning_without_initial_admissible_control └── ol_algorithm.m ├── parallel_control_based_optimal_tracking ├── pc_ot_adp_main.m ├── pc_ot_adp_nn_pf_dphi.m └── pc_ot_adp_ode.m ├── policy_iteration ├── pi_algorithm.m ├── pi_icp.m ├── pi_state_data.m ├── pi_validation.m ├── training_data │ ├── actor_init.mat │ └── state_data.mat └── training_results │ └── actor_critic.mat ├── policy_iteration_continuous_time └── pi_algorithm.m ├── value_iteration ├── training_data │ └── state_data.mat ├── training_results │ └── actor_critic.mat ├── vi_algorithm.m ├── vi_state_data.m └── vi_validation.m └── value_iteration_positive_semi_definite_initial_value_function ├── vi_controlled_system.m ├── vi_cost_function.m ├── vi_pf_actor.m ├── vi_pf_critic.m ├── vi_pf_main.m └── vi_state_data.m /README.md: -------------------------------------------------------------------------------- 1 | # Adaptive Dynamic Programming and Reinforcement Learning (ADPRL) for nonlinear optimal control 2 | Adaptive dynamic programming (ADP), also known as approximate dynamic programming, neuro-dynamic programming, and reinforcement learning (RL), is a class of promising techniques to solve the problems of optimal control for discrete-time (DT) and continuous-time (CT) nonlinear systems. 3 | 4 | MATLAB codes of ADPRL, including iterative and online ADPRL, are provided. Currently added ADPRL algorithms include: 5 | - [Value iteraion for DT systems](https://ieeexplore.ieee.org/abstract/document/4554208) 6 | - [Value iteraion for DT systems (positive semi definite initial value function)](https://ieeexplore.ieee.org/abstract/document/7314890) 7 | - [Policy iteration for DT systems](https://ieeexplore.ieee.org/abstract/document/6609085) 8 | - [Policy iteration for CT systems](https://books.google.com/books?hl=zh-CN&lr=&id=U3Gtlot_hYEC&oi=fnd&pg=PR11&dq=optimal+control&ots=wd8jE5BAjv&sig=Gyy6RyPbILR_8hoybCd8etcX41o#v=onepage&q&f=false) 9 | - [Integral reinforcement learning for partially unknown CT systems](https://www.sciencedirect.com/science/article/abs/pii/S0893608009000446) 10 | - [Model-free integral reinforcement learning for completely unknown nonaffine CT systems](https://www.sciencedirect.com/science/article/pii/S0925231224001929) 11 | - [Model-free nonzero-sum games for completely unknown nonaffine CT systems](https://ieeexplore.ieee.org/document/10849990) 12 | - [Online learning policy update for DT systems](https://ieeexplore.ieee.org/abstract/document/6208889) 13 | - [Online learning without initial admissible control for CT systems](https://ieeexplore.ieee.org/document/5717676) 14 | - [Parallel control-based optimal tracking for nonaffine CT systems](https://www.ieee-jas.net/en/article/doi/10.1109/JAS.2020.1003426) 15 | 16 | ### Requirements 17 | ******** 18 | - MATLAB 19 | -------------------------------------------------------------------------------- /integral_reinforcement_learning/irl_main.m: -------------------------------------------------------------------------------- 1 | % This demo checks the feasibility of the integral reinforcement learning (IRL) algorithm 2 | 3 | % By J. Lu 4 | % Date: Nov. 23, 2020 5 | 6 | %-------------------------------- start ----------------------------------- 7 | clear; close all; clc; 8 | 9 | % parameters 10 | global K; global u; global A; global B; global Q; global R; 11 | P = zeros(4,4); 12 | uu = 0; % record control signal u 13 | 14 | % system matrices 15 | A = [-0.0665 11.5 0 0; 16 | 0 -2.5 2.5 0; 17 | -9.5 0 -13.736 -13.736; 18 | 0.6 0 0 0]; 19 | B = [0 0 13.736 0]'; 20 | Q = diag([1,1,1,1]); 21 | R = 1; 22 | 23 | % initial condition 24 | x0 = [0 0.1 0 0 0]; % x(5) is the integral of x'Qx + u'Ru 25 | 26 | % a stable K 27 | K = [0.8267 1.7003 0.7049 0.4142]; 28 | pole = eig(A - B*K) 29 | 30 | % simulation parameters 31 | Fsamples = 120; 32 | T = 0.05; % sample time 33 | SS = 0.001; % step size for simulation 34 | j = 0; 35 | nop = 10; 36 | tol = 1e-4; 37 | 38 | figure(1),hold on; 39 | xlabel('Time (s)'); 40 | ylabel('State trajectories'); 41 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 42 | grid on; 43 | for k=1:Fsamples 44 | j = j + 1; 45 | tspan = 0:SS:T; 46 | [t,x]= ode23('irl_ode',tspan,x0); 47 | uu = [uu u]; 48 | figure(1),plot(t + T*(k-1),x(:,1),'b',t + T*(k-1),x(:,2),'r',t + T*(k-1),x(:,3),'g',t + T*(k-1),x(:,4),'k','linewidth',1),hold on; 49 | x1 = x(length(t),1:4); 50 | X(j,:) = [ x0(1)^2 x0(1)*x0(2) x0(1)*x0(3) x0(1)*x0(4) ... 51 | x0(2)^2 x0(2)*x0(3) x0(2)*x0(4)... 52 | x0(3)^2 x0(3)*x0(4) x0(4)^2 ] - ... 53 | [ x1(1)^2 x1(1)*x1(2) x1(1)*x1(3) x1(1)*x1(4) ... 54 | x1(2)^2 x1(2)*x1(3) x1(2)*x1(4)... 55 | x1(3)^2 x1(3)*x1(4) x1(4)^2 ]; 56 | 57 | Y(j,:) = x(length(t),5) - x(1,5); 58 | x0 = [x(length(t),1:4),x(length(t),5)]; 59 | 60 | if mod(k,nop) == 0 && norm(X) > tol 61 | weights = pinv(X)*Y; 62 | P=[ weights(1) weights(2)/2 weights(3)/2 weights(4)/2;... 63 | weights(2)/2 weights(5) weights(6)/2 weights(7)/2;... 64 | weights(3)/2 weights(6)/2 weights(8) weights(9)/2;... 65 | weights(4)/2 weights(7)/2 weights(9)/2 weights(10) ]; 66 | K = inv(R)*B'*P; 67 | jj = 1; 68 | X = zeros(nop,10); 69 | Y = zeros(nop,1); 70 | j = 0; 71 | end 72 | end 73 | 74 | P 75 | [~,Popt] = lqr(A,B,Q,R) 76 | 77 | % plot result 78 | figure(1), hold off; 79 | 80 | figure(2),plot(T*[0:Fsamples],uu,'b','linewidth',1) 81 | xlabel('Time (s)'); 82 | ylabel('Control trajectory'); 83 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 84 | grid on; 85 | 86 | 87 | -------------------------------------------------------------------------------- /integral_reinforcement_learning/irl_ode.m: -------------------------------------------------------------------------------- 1 | function xdot = irl_ode(~,x) 2 | % parameters 3 | global K; global u; global A; global B; global Q; global R; 4 | 5 | x = [x(1) x(2) x(3) x(4)]'; 6 | 7 | % calculate the control signal 8 | u = -K*x; 9 | 10 | xdot = [A*x + B*u; x'*Q*x + u'*R*u]; -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/README.md: -------------------------------------------------------------------------------- 1 | # Model-free integral reinforcement learning (IRL) 2 | ******** 3 | Source code for paper: 4 | J. Lu, X. Wang, Q. Wei, F.-Y. Wang. Nearly optimal stabilization of unknown continuous-time nonlinear systems: A new parallel control approach. 5 | 6 | ### Description 7 | ******** 8 | A novel online optimal control method for unknown continuous-time nonaffine nonlinear systems *without recovering unknown systems*. 9 | 10 | ### Requirements 11 | ******** 12 | - MATLAB 13 | 14 | ### How to run 15 | ******** 16 | - Directly run eg1_c1_main.m, eg1_c2_main.m, eg2_c1_main.m, or eg2_c2_main.m. 17 | 18 | ### Citation 19 | ******** 20 | ```angular2html 21 | @article{lu2023nearly, 22 | title = {Nearly Optimal Stabilization of Unknown Continuous-Time Nonlinear Systems: A New Parallel Control Approach}, 23 | author = {Lu, Jingwei and Wang, Xingxia and Wei, Qinglai and Wang, Fei-Yue}, 24 | doi = {10.1016/j.neucom.2024.127421}, 25 | year = {2024} 26 | } 27 | ``` 28 | -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg1/eg1_c1_main.m: -------------------------------------------------------------------------------- 1 | % J. Lu, X. Wang, Q. Wei, F.-Y. Wang. Nearly optimal stabilization of unknown continuous-time nonlinear systems: A new parallel control approach. 2 | % Code for Case I in Example 1 3 | % By J. Lu 4 | 5 | %-------------------------------- start ----------------------------------- 6 | clear; close all; clc; 7 | 8 | global A; global B; 9 | global Aa; global Ba; 10 | global Q; global R; 11 | global Qa; global Ra; 12 | global Ta; 13 | global Kopt; global Kaopt; 14 | global u; 15 | global ua; 16 | 17 | 18 | A = [0 1;-9 -3]; B = [0; 1]; 19 | 20 | Ta = 1; 21 | Aa = [A B; zeros(1,2) -1/Ta]; Ba = [zeros(2,1); (1/Ta)*diag([1])]; 22 | 23 | Q = 1*eye(2); R = 1*eye(1); 24 | Qa = diag([diag(Q)' diag(R)']); Ra = 1e0*diag([1]); 25 | 26 | [Kopt, Popt] = lqr(A,B,Q,R); 27 | [Kaopt, Paopt] = lqr(Aa,Ba,Qa,Ra); 28 | 29 | Fsamples = 400; 30 | T = 0.05; 31 | ss = T/10; 32 | 33 | % origin system 34 | % state and control 35 | x0 = [1;1]; 36 | x = [x0;0]; 37 | xx = [x(1:2)]; 38 | uu = [-Kopt*x0]; 39 | 40 | % performance 41 | J = 0; 42 | 43 | h = waitbar(0,'please wait'); 44 | for k = 1:Fsamples 45 | tspan = 0:ss:T; 46 | [t,x]= ode45(@eg1_c1_ode, tspan, x); 47 | J = J + x(length(t),3); 48 | x = [x(length(t),1:2),0]; 49 | xx = [xx x(1:2)']; 50 | uu = [uu u]; 51 | 52 | waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']); 53 | end 54 | close(h); 55 | 56 | J 57 | 58 | % aas 59 | % state and control 60 | xa = [x0;uu(:,1);0]; 61 | xxa = [xa(1:2)]; 62 | uua = [uu(:,1)]; 63 | 64 | % performance 65 | Ja = 0; 66 | 67 | h = waitbar(0,'please wait'); 68 | for k = 1:Fsamples 69 | tspan = 0:ss:T; 70 | [t,xa]= ode45(@eg1_c1_parallel_ode, tspan, xa); 71 | Ja = Ja + xa(length(t),4); 72 | xa = [xa(length(t),1:3),0]; 73 | xxa = [xxa xa(1:2)']; 74 | uua = [uua xa(3)]; 75 | 76 | waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']); 77 | end 78 | close(h); 79 | 80 | Ja 81 | 82 | Jopt = x0'*Popt*x0 83 | 84 | fprintf('Ta: %d; Ra: %d; difference: %d.\n',Ta,Ra,Ja-Jopt) 85 | 86 | 87 | % result 88 | figure(1), % states 89 | subplot(2,1,1) 90 | plot(T*((1:size(xx,2))-1),xx(1,:),'r',T*((1:size(xxa,2))-1),xxa(1,:),'b--','linewidth',1) 91 | xlabel('Time (s)'); 92 | ylabel('$x_1$','Interpreter','latex'); 93 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 94 | grid on; 95 | subplot(2,1,2) 96 | plot(T*((1:size(xx,2))-1),xx(2,:),'r',T*((1:size(xxa,2))-1),xxa(2,:),'b--','linewidth',1) 97 | xlabel('Time (s)'); 98 | ylabel('$x_2$','Interpreter','latex'); 99 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 100 | grid on; 101 | 102 | figure(2), % control 103 | plot(T*((1:size(uu,2))-1),uu(1,:),'r',T*((1:size(uua,2))-1),uua(1,:),'b--','linewidth',1) 104 | xlabel('Time (s)'); 105 | ylabel('$u$','Interpreter','latex'); 106 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 107 | grid on; 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg1/eg1_c1_ode.m: -------------------------------------------------------------------------------- 1 | function xdot = eg1_c1_ode(~,x) 2 | 3 | global A; global B; 4 | global Q; global R; 5 | global Kopt; 6 | global u; 7 | 8 | u = -Kopt*[x(1);x(2)]; 9 | 10 | xdot = [A*[x(1);x(2)]+B*u; [x(1);x(2)]'*Q*[x(1);x(2)]+u'*R*u]; 11 | 12 | end -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg1/eg1_c1_parallel_ode.m: -------------------------------------------------------------------------------- 1 | function xdot = eg1_c1_parallel_ode(~,x) 2 | 3 | global A; global B; 4 | global Q; global R; 5 | global Ta; 6 | global Kaopt; 7 | global ua; 8 | 9 | ua = x(3); 10 | 11 | v = -Kaopt*[x(1);x(2);x(3)]; 12 | 13 | xdot = [A*[x(1);x(2)]+B*ua; (1/Ta)*(v - x(3)); [x(1);x(2)]'*Q*[x(1);x(2)]+ua'*R*ua]; 14 | 15 | end -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg1/eg1_c2_main.m: -------------------------------------------------------------------------------- 1 | % J. Lu, X. Wang, Q. Wei, F.-Y. Wang. Nearly optimal stabilization of unknown continuous-time nonlinear systems: A new parallel control approach. 2 | % Code for Case II in Example 1 3 | % By J. Lu 4 | 5 | %-------------------------------- start ----------------------------------- 6 | clear; close all; clc; 7 | 8 | global A; global B; 9 | global Ba; 10 | global Qa; global Ra; 11 | global Ta; 12 | global dw; 13 | global wa; 14 | 15 | A = [0 1;-9 -3]; B = [0; 1]; 16 | 17 | Ta = 1; 18 | Aa = [A B; zeros(1,2) -1/Ta]; 19 | Ba = [zeros(2,1); (1/Ta)*diag([1])]; 20 | 21 | Q = 1*eye(2); R = 1*eye(1); 22 | Qa = diag([diag(Q)' diag(R)']); Ra = 1*diag([1]); 23 | 24 | [Kaopt, Paopt] = lqr(Aa,Ba,Qa,Ra); 25 | 26 | Fsamples = 10000; 27 | T = 0.1; 28 | ss = T/10; 29 | 30 | % state, control, and critic and action NN weights 31 | x00 = [1;-1]; 32 | x = [x00;0;0;1;0;0;1;0;1]; 33 | wa = x(5:end); 34 | x0 = x; 35 | xx = [x]; 36 | uu = []; 37 | 38 | phi = eg1_c2_nn_pf(x(1:3)); 39 | pphi = []; 40 | pp = []; 41 | 42 | % NN learning law 43 | dw = zeros(length(phi),1); 44 | 45 | wwa = [wa]; 46 | wwc = [x(5:end)]; 47 | 48 | ac = 30; 49 | aa = 0.1; 50 | 51 | cw = 1e-2; 52 | 53 | rpb_k = Fsamples - 100; % remove the probing noise at the last 100 steps 54 | 55 | Gamma = eye(length(phi)); 56 | 57 | h = waitbar(0,'please wait'); 58 | for k = 1:Fsamples 59 | 60 | % probing noise 61 | if norm(x(1:2),2) <= 1 && k <= rpb_k 62 | x = [x(1);x(2);x(3)+10*(rand(1,1)-0.5);0;x(5:end)']; 63 | phi = eg1_c2_nn_pf(x(1:3)); 64 | end 65 | 66 | tspan = 0:ss:T; 67 | [t,x]= ode45(@eg1_c2_ode, tspan, x); 68 | 69 | phi_next = eg1_c2_nn_pf(x(length(t),1:3)); 70 | pphi = phi_next-phi; 71 | phi = phi_next; 72 | 73 | pp = x(length(t),4); 74 | 75 | % critic learning law 76 | dw = - ac*pphi/((1+pphi'*pphi)^2)*(pp+pphi'*x(length(t),5:end)'); 77 | 78 | % action learning law 79 | if norm(dw,2) <= cw 80 | wa = wa + (-aa*Gamma*(wa - x(length(t),5:end)')); 81 | phi = eg1_c2_nn_pf(x(length(t),1:3)); 82 | end 83 | 84 | x = [x(length(t),1:3),0,x(length(t),5:end)]; 85 | xx = [xx x']; 86 | uu = [uu x(3)]; 87 | wwa = [wwa wa]; 88 | wwc = [wwc x(5:end)']; 89 | 90 | waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']); 91 | end 92 | close(h); 93 | 94 | wc = xx(5:end,end)' 95 | wa = wa' 96 | wopt = [Paopt(1,1); 2*Paopt(1,2); 2*Paopt(1,3); Paopt(2,2); 2*Paopt(2,3); Paopt(3,3)]' 97 | difference_initial = x0(5:end)' - wopt 98 | difference_final = wc - wopt 99 | 100 | % result 101 | figure(1), % states 102 | subplot(2,1,1) 103 | plot(T*((1:size(xx,2))-1),xx(1,:),'linewidth',1) 104 | xlabel('Time (s)'); 105 | ylabel('$x_1$','Interpreter','latex'); 106 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 107 | grid on; 108 | subplot(2,1,2) 109 | plot(T*((1:size(xx,2))-1),xx(2,:),'linewidth',1) 110 | xlabel('Time (s)'); 111 | ylabel('$x_2$','Interpreter','latex'); 112 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 113 | grid on; 114 | 115 | figure(2), % control 116 | plot(T*((1:size(uu,2))-1),uu(1,:),'linewidth',1) 117 | xlabel('Time (s)'); 118 | ylabel('$u$','Interpreter','latex'); 119 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 120 | grid on; 121 | 122 | figure(3), % wc 123 | plot(T*((1:size(xx,2))-1),xx(5:end,:),'linewidth',1) 124 | xlabel('Time (s)'); 125 | ylabel('Critic newtork weights'); 126 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 127 | grid on; 128 | 129 | figure(4), % wa 130 | plot(T*((1:size(wwa,2))-1),wwa,'linewidth',1) 131 | xlabel('Time (s)'); 132 | ylabel('Action newtork weights'); 133 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 134 | grid on; 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg1/eg1_c2_nn_pf.m: -------------------------------------------------------------------------------- 1 | function y = eg1_c2_nn_pf(x) 2 | y = [x(1)^2; x(1)*x(2); x(1)*x(3); x(2)^2; x(2)*x(3); x(3)^2]; 3 | end -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg1/eg1_c2_nn_pf_dphi.m: -------------------------------------------------------------------------------- 1 | function dphi = eg1_c2_nn_pf_dphi(x) 2 | dphi = [ 3 | [ 2*x(1), 0, 0 ]; 4 | [ x(2), x(1), 0 ]; 5 | [ x(3), 0, x(1) ]; 6 | [ 0, 2*x(2), 0 ]; 7 | [ 0, x(3), x(2) ]; 8 | [ 0, 0, 2*x(3) ]; 9 | ]; 10 | end -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg1/eg1_c2_ode.m: -------------------------------------------------------------------------------- 1 | function xdot = eg1_c2_ode(~,x) 2 | 3 | global A; global B; 4 | global Ba; 5 | global Qa; global Ra; 6 | global Ta; 7 | global dw; 8 | global wa; 9 | 10 | u = x(3); 11 | 12 | dphi = eg1_c2_nn_pf_dphi(x(1:3)); 13 | 14 | v = -0.5*inv(Ra)*Ba'*dphi'*wa; 15 | 16 | xdot = [A*[x(1);x(2)]+B*u; (1/Ta)*(v - x(3)); [x(1);x(2);x(3)]'*Qa*[x(1);x(2);x(3)]+v'*Ra*v;dw]; 17 | 18 | end -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg2/eg2_c1_main.m: -------------------------------------------------------------------------------- 1 | % J. Lu, X. Wang, Q. Wei, F.-Y. Wang. Nearly optimal stabilization of unknown continuous-time nonlinear systems: A new parallel control approach. 2 | % Code for Case I in Example 2 3 | % By J. Lu 4 | 5 | %-------------------------------- start ----------------------------------- 6 | clear; close all; clc; 7 | 8 | global M; global g; global L; 9 | global k1; global k2; global k3; 10 | global Ga; 11 | global Q; global R; global Ra; 12 | global Ta; 13 | global dw; 14 | global wa; 15 | 16 | Ta = 1; 17 | Ga = [zeros(2,1); (1/Ta)*diag([1])]; 18 | 19 | M = 0.3; L = 0.6; g = 10; k1 = 0.8; k2 = 0.2; k3 = 1; 20 | 21 | Q = 5*eye(2);R = 0.1*eye(1); 22 | Qa = diag([diag(Q)' diag(R)']); Ra = 1*diag([1]); 23 | 24 | Fsamples = 100000; 25 | T = 0.1; 26 | ss = T/10; 27 | 28 | % state, control, and critic and action NN weights 29 | x00 = 1*[1;0]; 30 | wc = [1;0;0;1;0;1;zeros(6,1)]; 31 | 32 | x = [x00;-1;0;0;wc]; 33 | wa = x(6:end); 34 | x0 = x; 35 | xx = [x]; 36 | uu = []; 37 | 38 | phi = eg2_c1_nn_pf(x(1:3)); 39 | pphi = []; 40 | pp = []; 41 | 42 | % NN learning law 43 | dw = zeros(length(phi),1); 44 | 45 | 46 | wwa = [wa]; 47 | wwc = [x(6:end)]; 48 | 49 | ac = 100; 50 | aa = 0.08; 51 | 52 | cw = 1e-2; 53 | 54 | rpb_k = Fsamples - 200; % remove the probing noise at the last 200 steps 55 | 56 | Gamma = eye(length(phi)); 57 | 58 | h = waitbar(0,'please wait'); 59 | for k = 1:Fsamples 60 | 61 | % probing noise 62 | if norm(x(1:2),2) <= 0.2 && k <= rpb_k 63 | x = [x(1);x(2);x(3)+2*(rand(1,1)-0.5);0;0;x(6:end)']; 64 | phi = eg2_c1_nn_pf(x(1:3)); 65 | end 66 | 67 | tspan = 0:ss:T; 68 | [t,x]= ode45(@eg2_c1_ode, tspan, x); 69 | 70 | phi_next = eg2_c1_nn_pf(x(length(t),1:3)); 71 | pphi = phi_next-phi; 72 | phi = phi_next; 73 | 74 | pp = x(length(t),4); 75 | 76 | % critic learning law 77 | dw = - ac*pphi/((1+pphi'*pphi)^2)*(pp+pphi'*x(length(t),6:end)'); 78 | 79 | % action learning law 80 | if norm(dw,2) <= cw 81 | wa = wa + (-aa*Gamma*(wa - x(length(t),6:end)')); 82 | phi = eg2_c1_nn_pf(x(length(t),1:3)); 83 | end 84 | 85 | x = [x(length(t),1:3),0,0,x(length(t),6:end)]; 86 | xx = [xx x']; 87 | uu = [uu x(3)]; 88 | wwa = [wwa wa]; 89 | wwc = [wwc x(6:end)']; 90 | 91 | waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']); 92 | end 93 | close(h); 94 | 95 | wc = xx(6:end,end)' 96 | wa = wa' 97 | 98 | % result 99 | figure(1), % states 100 | subplot(2,1,1) 101 | plot(T*((1:size(xx,2))-1),xx(1,:),'linewidth',1) 102 | xlabel('Time (s)'); 103 | ylabel('$x_1$','Interpreter','latex'); 104 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 105 | grid on; 106 | subplot(2,1,2) 107 | plot(T*((1:size(xx,2))-1),xx(2,:),'linewidth',1) 108 | xlabel('Time (s)'); 109 | ylabel('$x_2$','Interpreter','latex'); 110 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 111 | grid on; 112 | 113 | figure(2), % control 114 | plot(T*((1:size(uu,2))-1),uu(1,:),'linewidth',1) 115 | xlabel('Time (s)'); 116 | ylabel('$u$','Interpreter','latex'); 117 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 118 | grid on; 119 | 120 | figure(3), % wc 121 | plot(T*((1:size(xx,2))-1),xx(5:end,:),'linewidth',1) 122 | xlabel('Time (s)'); 123 | ylabel('Critic newtork weights'); 124 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 125 | grid on; 126 | 127 | figure(4), % wa 128 | plot(T*((1:size(wwa,2))-1),wwa,'linewidth',1) 129 | xlabel('Time (s)'); 130 | ylabel('Action newtork weights'); 131 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 132 | grid on; 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg2/eg2_c1_nn_pf.m: -------------------------------------------------------------------------------- 1 | function y = eg2_c1_nn_pf(x) 2 | y = [ x(1)^2; x(1)*x(2); x(2)^2; x(1)*x(3); x(2)*x(3); x(3)^2;... 3 | x(1)^3; x(2)^3; x(3)^3; x(1)^4; x(2)^4; x(3)^4 ]; 4 | end -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg2/eg2_c1_nn_pf_dphi.m: -------------------------------------------------------------------------------- 1 | function dphi = eg2_c1_nn_pf_dphi(x) 2 | dphi = [ 3 | [ 2*x(1), 0, 0 ]; 4 | [ x(2), x(1), 0 ]; 5 | [ x(3), 0, x(1) ]; 6 | [ 0, 2*x(2), 0 ]; 7 | [ 0, x(3), x(2) ]; 8 | [ 0, 0, 2*x(3) ]; 9 | [ 3*x(1)^2, 0, 0 ]; 10 | [ 0, 3*x(2)^2, 0 ]; 11 | [ 0, 0, 3*x(3)^2 ]; 12 | [ 4*x(1)^3, 0, 0 ]; 13 | [ 0, 4*x(2)^3, 0 ]; 14 | [ 0, 0, 4*x(3)^3 ]; 15 | ]; 16 | end -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg2/eg2_c1_ode.m: -------------------------------------------------------------------------------- 1 | function xdot = eg2_c1_ode(~,x) 2 | 3 | global M; global g; global L; 4 | global k1; global k2; global k3; 5 | global Ga; 6 | global Q; global R; global Ra; 7 | global Ta; 8 | global dw; 9 | global wa; 10 | 11 | u = x(3); 12 | 13 | dphi = eg2_c1_nn_pf_dphi(x(1:3)); 14 | 15 | v = -0.5*inv(Ra)*Ga'*dphi'*wa; 16 | 17 | f = [x(2);-M*g*L/k1*sin(x(1))-k2/k1*x(2)+k3/k1*(tanh(u)+u)]; 18 | 19 | xdot = [ f; (1/Ta)*(v - x(3));... 20 | [x(1);x(2)]'*Q*[x(1);x(2)] + 1*log(1+[x(3)]'*R*[x(3)])+v'*Ra*v;... 21 | [x(1);x(2)]'*Q*[x(1);x(2)] + 1*log(1+[x(3)]'*R*[x(3)]); dw ]; 22 | 23 | 24 | 25 | end -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg2/eg2_c2_main.m: -------------------------------------------------------------------------------- 1 | % J. Lu, X. Wang, Q. Wei, F.-Y. Wang. Nearly optimal stabilization of unknown continuous-time nonlinear systems: A new parallel control approach. 2 | % Code for Case II in Example 2 3 | % By J. Lu 4 | 5 | %-------------------------------- start ----------------------------------- 6 | clear; close all; clc; 7 | 8 | global M; global g; global L; 9 | global k1; global k2; global k3; 10 | global Ga; 11 | global Ta; 12 | global wa; 13 | 14 | Ta = 1; 15 | Ga = [zeros(2,1); (1/Ta)*diag([1])]; 16 | 17 | M = 0.3; L = 0.6; g = 10; k1 = 0.8; k2 = 0.2; k3 = 1; 18 | 19 | Fsamples = 150; 20 | T = 0.05; 21 | ss = T/10; 22 | 23 | wa = [10.4347996132347;4.74187769922673;4.59458795955741;-0.140646935292767;5.46611496181774;2.82980614212775;0.00266940936499440;0.0224520628430915;0.00480024500336202;2.63761282635573;-0.179632857958409;-0.150728264946674]; 24 | 25 | x00 = [1;0]; 26 | u00 = 2; 27 | x = [x00;u00]; 28 | xx0 = [x]; 29 | uu0 = [u00]; 30 | 31 | for k = 1:Fsamples 32 | tspan = 0:ss:T; 33 | [t,x]= ode45(@eg2_c2_ode, tspan, x); 34 | 35 | x = x(length(t),:); 36 | xx0 = [xx0 x']; 37 | uu0 = [uu0 x(3)]; 38 | end 39 | 40 | 41 | u01 = -2; 42 | x = [x00;u01]; 43 | xx1 = [x]; 44 | uu1 = [u01]; 45 | 46 | for k = 1:Fsamples 47 | tspan = 0:ss:T; 48 | [t,x]= ode45(@eg2_c2_ode, tspan, x); 49 | 50 | x = x(length(t),:); 51 | xx1 = [xx1 x']; 52 | uu1 = [uu1 x(3)]; 53 | end 54 | 55 | 56 | figure(1), % 3d 57 | plot3(xx0(1,:),xx0(2,:),uu0,'r','linewidth',1),hold on; 58 | plot3(xx1(1,:),xx1(2,:),uu1,'b-.','linewidth',1),hold on; 59 | plot3(x00(1,:),x00(2,:),u00,'ro','MarkerFaceColor','r','linewidth',1,'MarkerSize',6),hold on; 60 | plot3(x00(1,:),x00(2,:),u01,'bo','MarkerFaceColor','b','linewidth',1,'MarkerSize',6),hold on; 61 | xlim([-0.5 1.1]) 62 | xlabel('$x_1$','Interpreter','latex'); 63 | ylabel('$x_2$','Interpreter','latex'); 64 | zlabel('$u$','Interpreter','latex'); 65 | h=legend('$u^1_0=2$','$u^2_0=-2$'); 66 | set(h,'Interpreter','latex','Position',[0.154401278625391 0.266349206349207 0.236911728463504 0.106714288938613]); 67 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 68 | grid on; 69 | view([426.299998823416 14.7740121081114]) 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /model_free_integral_reinforcement_learning/eg2/eg2_c2_ode.m: -------------------------------------------------------------------------------- 1 | function xdot = eg2_c2_ode(~,x) 2 | 3 | global M; global g; global L; 4 | global k1; global k2; global k3; 5 | global Ga; 6 | global Ra; 7 | global Ta; 8 | global wa; 9 | 10 | u = x(3); 11 | 12 | dphi = eg2_c1_nn_pf_dphi(x(1:3)); 13 | 14 | v = -0.5*inv(Ra)*Ga'*dphi'*wa; 15 | 16 | f = [x(2);-M*g*L/k1*sin(x(1))-k2/k1*x(2)+k3/k1*(tanh(u)+u)]; 17 | 18 | xdot = [f; (1/Ta)*(v - x(3)) ]; 19 | 20 | end -------------------------------------------------------------------------------- /model_free_nonzero_sum_games/README.md: -------------------------------------------------------------------------------- 1 | # Model-free nonzero-sum games (NZSGs) 2 | ******** 3 | Code for paper: 4 | [J. Lu, Q. Wei, F.-Y. Wang. Parallel control for nonzero-sum games with completely unknown nonlinear dynamics via reinforcement learning. *IEEE Transactions on Systems, Man, and Cybernetics: Systems*, 2025, doi: 10.1109/TSMC.2025.3526357.](https://ieeexplore.ieee.org/document/10849990) 5 | 6 | ### Description 7 | ******** 8 | A model-free optimal control method is developed for unknown continuous-time NZSGs *without reconstructing unknown dynamics or employing off-policy IRL*. 9 | 10 | ### Requirements 11 | ******** 12 | - MATLAB 13 | 14 | ### Citation 15 | ******** 16 | ```angular2html 17 | @article{lu2025parallel, 18 | title={Parallel control for nonzero-sum games with completely unknown nonlinear dynamics via reinforcement learning}, 19 | author={Lu, Jingwei and Wei, Qinglai and Wang, Fei-Yue}, 20 | journal={IEEE Transactions on Systems, Man, and Cybernetics: Systems}, 21 | doi = {10.1109/TSMC.2025.3526357}, 22 | year={2025} 23 | } 24 | ``` 25 | -------------------------------------------------------------------------------- /model_free_nonzero_sum_games/eg_cacc/cacc_model_free_learning.m: -------------------------------------------------------------------------------- 1 | function cacc_model_free_learning 2 | % ''Parallel Control for Nonzero-Sum Games With Completely Unknown 3 | % Nonlinear Dynamics via Reinforcement Learning'' 4 | % 5 | % Code for cooperative adaptive cruise control (CACC) 6 | % 7 | % By J. Lu 8 | % Date: Aug. 12, 2024 9 | 10 | %-------------------------------- start ----------------------------------- 11 | clear; close all; clc; 12 | 13 | global Ga1; global Ga2; 14 | global Q1; global R11; global R12; 15 | global Q2; global R21; global R22; 16 | global Qa1; global Ra11; global Ra12; 17 | global Qa2; global Ra21; global Ra22; 18 | global w1; global w2; 19 | global dw1; global dw2; 20 | 21 | % cacc information & performance indices 22 | n = 6; 23 | m1 = 1; 24 | m2 = 1; 25 | eta = n + m1 + m2; 26 | 27 | Ga1 = [ zeros(n,m1); eye(m1); zeros(m2,m1) ]; % $\mathcal{G}_1$ 28 | Ga2 = [ zeros(n,m2); zeros(m1,m2); eye(m2) ]; % $\mathcal{G}_2$ 29 | 30 | Q1 = 2*eye(n); R11 = 2*eye(m1); R12 = 2*eye(m2); 31 | Q2 = 1*eye(n); R21 = 1*eye(m1); R22 = 1*eye(m2); 32 | Qa1 = [ Q1,zeros(n,m1+m2);zeros(m1,n),R11,zeros(m1,m2);zeros(m2,n+m1),R12 ]; Ra11 = 1*eye(m1); Ra12 = 0*eye(m2); 33 | Qa2 = [ Q2,zeros(n,m1+m2);zeros(m1,n),R21,zeros(m1,m2);zeros(m2,n+m1),R22 ]; Ra21 = 0*eye(m1); Ra22 = 1*eye(m2); 34 | 35 | Fsamples = 100000; 36 | T = 0.5; 37 | ss = T/10; 38 | 39 | w1 = [ 1 0 0 0 0 0 -1 0 ... 40 | 1 0 0 0 0 -1 0 ... 41 | 1 0 0 0 0 0 ... 42 | 1 0 0 0 -1 ... 43 | 1 0 0 -1 ... 44 | 1 0 0 ... 45 | 1 0 ... 46 | 1 ]'; 47 | w2 = w1; 48 | 49 | x_ini = 5; 50 | u_ini = 1; 51 | 52 | x0 = x_ini*[ 1; 1; 0; 1; 1; 0 ]; 53 | u0 = u_ini*[ 0; 0 ]; 54 | x = [ x0; u0; 0; 0; 0; 0; w1; w2 ]; 55 | 56 | nn_length = length(w1); 57 | 58 | xx = [ x ]; 59 | uu1 = []; 60 | uu2 = []; 61 | 62 | ell = length(w1); % number of hidden neurons 63 | L = ell; % need L >= ell to satisfy rank($\mathcal{D}_i$) = $\ell_i$ 64 | 65 | phi = cacc_nn_pf(x(1:eta)); 66 | pphi = zeros(ell,L+1); % number of historical data (L) + real-time data (1) 67 | pp1 = zeros(L+1,1); 68 | pp2 = zeros(L+1,1); 69 | 70 | dw1 = zeros(nn_length,1); 71 | dw2 = zeros(nn_length,1); 72 | 73 | ww1 = [ w1 ]; 74 | ww2 = [ w2 ]; 75 | 76 | l1 = 1; % learning rates 77 | l2 = 1; 78 | 79 | decay_interval = 1000; 80 | 81 | l1_final = 1e-1; 82 | l2_final = 1e-1; 83 | l1_decay = (l1-l1_final)/(0.8*Fsamples/decay_interval); 84 | l2_decay = (l2-l2_final)/(0.8*Fsamples/decay_interval); 85 | 86 | i = 1; 87 | 88 | h = waitbar(0,'Please wait'); 89 | for k = 1:Fsamples 90 | if norm(x(1:n),2) > 1e3 % the vehicular platoon is unstable and need to restart the learning 91 | break % reducing the learning rates may provide a more stable learning process 92 | end 93 | 94 | if norm(x(1:n),2) <= 1 % reset controls 95 | u0 = 2*u_ini*[ (rand(1)-0.5); (rand(1)-0.5) ]; 96 | x = [ x(1:n)'; u0; 0; 0; 0; 0; x(eta+5:end)' ]; 97 | phi = cacc_nn_pf(x(1:eta)); 98 | end 99 | 100 | if mod(k,decay_interval) == 0 101 | l1 = l1 - l1_decay; 102 | l2 = l2 - l2_decay; 103 | end 104 | 105 | if l1 < l1_final 106 | l1 = l1_final; 107 | end 108 | 109 | if l2 < l2_final 110 | l2 = l2_final; 111 | end 112 | 113 | tspan = 0:ss:T; 114 | [t,x]= ode45(@cacc_ode, tspan, x); 115 | 116 | phi_next = cacc_nn_pf(x(length(t),1:eta)); 117 | 118 | if i <= L + 1 119 | pphi(:,i) = phi_next-phi; 120 | pp1(i) = x(length(t),eta+3); 121 | pp2(i) = x(length(t),eta+4); 122 | 123 | phi = phi_next; 124 | 125 | i = i + 1; 126 | else 127 | for j = 1:L 128 | pphi(:,j) = pphi(:,j+1); 129 | pp1(j) = pp1(j+1); 130 | pp2(j) = pp2(j+1); 131 | end 132 | pphi(:,L+1) = phi_next-phi; 133 | pp1(L+1) = x(length(t),eta+3); 134 | pp2(L+1) = x(length(t),eta+4); 135 | 136 | phi = phi_next; 137 | end 138 | 139 | w1 = x(length(t),eta+5:eta+4+nn_length)'; 140 | w2 = x(length(t),eta+5+nn_length:eta+4+2*nn_length)'; 141 | 142 | if i >= L + 1 143 | dw1 = zeros(nn_length,1); 144 | dw2 = zeros(nn_length,1); 145 | for j = 1:L+1 146 | dw1 = dw1 - l1*pphi(:,j)/((1+pphi(:,j)'*pphi(:,j))^2)*(pp1(j)+pphi(:,j)'*w1); 147 | dw2 = dw2 - l2*pphi(:,j)/((1+pphi(:,j)'*pphi(:,j))^2)*(pp2(j)+pphi(:,j)'*w2); 148 | end 149 | end 150 | 151 | x = [x(length(t),1:eta),0,0,0,0,x(length(t),eta+5:end)]; 152 | 153 | xx = [ xx x' ]; 154 | uu1 = [ uu1 x(7) ]; 155 | uu2 = [ uu2 x(8) ]; 156 | ww1 = [ ww1 w1 ]; 157 | ww2 = [ ww2 w2 ]; 158 | 159 | waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']); 160 | end 161 | close(h); 162 | 163 | w1_display = w1' 164 | w2_display = w2' 165 | 166 | % result 167 | figure(1), % vehicle 1 states 168 | subplot(3,1,1) 169 | plot(T*((1:size(xx,2))-1),xx(1,:),'linewidth',1) 170 | xlabel('Time (s)'); 171 | ylabel('$e_{p,1}$','Interpreter','latex'); 172 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 173 | grid on; 174 | subplot(3,1,2) 175 | plot(T*((1:size(xx,2))-1),xx(2,:),'linewidth',1) 176 | xlabel('Time (s)'); 177 | ylabel('$e_{v,1}$','Interpreter','latex'); 178 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 179 | grid on; 180 | subplot(3,1,3) 181 | plot(T*((1:size(xx,2))-1),xx(3,:),'linewidth',1) 182 | xlabel('Time (s)'); 183 | ylabel('$a_1$','Interpreter','latex'); 184 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 185 | grid on; 186 | 187 | figure(2), % vehicle 2 states 188 | subplot(3,1,1) 189 | plot(T*((1:size(xx,2))-1),xx(4,:),'linewidth',1) 190 | xlabel('Time (s)'); 191 | ylabel('$e_{p,2}$','Interpreter','latex'); 192 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 193 | grid on; 194 | subplot(3,1,2) 195 | plot(T*((1:size(xx,2))-1),xx(5,:),'linewidth',1) 196 | xlabel('Time (s)'); 197 | ylabel('$e_{v,2}$','Interpreter','latex'); 198 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 199 | grid on; 200 | subplot(3,1,3) 201 | plot(T*((1:size(xx,2))-1),xx(6,:),'linewidth',1) 202 | xlabel('Time (s)'); 203 | ylabel('$a_2$','Interpreter','latex'); 204 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 205 | grid on; 206 | 207 | figure(3), % vehicle 1 control 208 | plot(T*((1:size(uu1,2))-1),uu1,'linewidth',1) 209 | xlabel('Time (s)'); 210 | ylabel('$u_1$','Interpreter','latex'); 211 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 212 | grid on; 213 | 214 | figure(4), % vehicle 2 control 215 | plot(T*((1:size(uu2,2))-1),uu1,'linewidth',1) 216 | xlabel('Time (s)'); 217 | ylabel('$u_2$','Interpreter','latex'); 218 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 219 | grid on; 220 | 221 | figure(5), % w1 222 | plot(T*((1:size(ww1,2))-1),ww1,'linewidth',1) 223 | xlabel('Time (s)'); 224 | ylabel('$\mathcal{W}_{1}$','Interpreter','latex'); 225 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 226 | grid on; 227 | 228 | figure(6), % w2 229 | plot(T*((1:size(ww2,2))-1),ww2,'linewidth',1) 230 | xlabel('Time (s)'); 231 | ylabel('$\mathcal{W}_{2}$','Interpreter','latex'); 232 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 233 | grid on; 234 | 235 | end 236 | %--------------------------- cacc dynamics ------------------------------ 237 | function xdot = cacc_ode(~,x) 238 | global Ga1; global Ga2; 239 | global Q1; global R11; global R12; 240 | global Q2; global R21; global R22; 241 | global Qa1; global Ra11; global Ra12; 242 | global Qa2; global Ra21; global Ra22; 243 | global w1; global w2; 244 | global dw1; global dw2; 245 | 246 | td = 1; tau = 0.1; 247 | 248 | x_state = [ x(1); x(2); x(3); x(4); x(5); x(6) ]; 249 | 250 | u1 = x(7); 251 | u2 = x(8); 252 | 253 | u = [ u1; u2 ]; 254 | 255 | s = [ x_state; u ]; 256 | 257 | dphi = cacc_nn_pf_dphi(s); 258 | 259 | v1 = -0.5*inv(Ra11)*Ga1'*dphi'*w1; 260 | v2 = -0.5*inv(Ra22)*Ga2'*dphi'*w2; 261 | 262 | cacc_dynamics = [ x(2) - td*x(3);... 263 | -x(3);... 264 | (-x(3)+u(1))/tau;... 265 | x(5) - td*x(6);... 266 | x(3) - x(6);... 267 | (-x(6)+u(2))/tau ]; 268 | 269 | xdot = [ cacc_dynamics;... 270 | v1;... 271 | v2;... 272 | x_state'*Q1*x_state + u1'*R11*u1 + u2'*R12*u2;... 273 | x_state'*Q2*x_state + u1'*R21*u1 + u2'*R22*u2;... 274 | s'*Qa1*s + v1'*Ra11*v1 + v2'*Ra12*v2;... 275 | s'*Qa2*s + v1'*Ra21*v1 + v2'*Ra22*v2;... 276 | dw1;... 277 | dw2 ]; 278 | end 279 | %------------------- activation function vector of critic ------------------ 280 | function y = cacc_nn_pf(x) 281 | x1 = x(1); 282 | x2 = x(2); 283 | x3 = x(3); 284 | x4 = x(4); 285 | x5 = x(5); 286 | x6 = x(6); 287 | x7 = x(7); 288 | x8 = x(8); 289 | 290 | y = [ x1^2; x1*x2; x1*x3; x1*x4; x1*x5; x1*x6; x1*x7; x1*x8;... 291 | x2^2; x2*x3; x2*x4; x2*x5; x2*x6; x2*x7; x2*x8;... 292 | x3^2; x3*x4; x3*x5; x3*x6; x3*x7; x3*x8;... 293 | x4^2; x4*x5; x4*x6; x4*x7; x4*x8;... 294 | x5^2; x5*x6; x5*x7; x5*x8;... 295 | x6^2; x6*x7; x6*x8;... 296 | x7^2; x7*x8;... 297 | x8^2 ]; 298 | end 299 | %-------------- derivative of critic activation function vector ----------- 300 | function dphi = cacc_nn_pf_dphi(x) 301 | x1 = x(1); 302 | x2 = x(2); 303 | x3 = x(3); 304 | x4 = x(4); 305 | x5 = x(5); 306 | x6 = x(6); 307 | x7 = x(7); 308 | x8 = x(8); 309 | 310 | dphi = [ 311 | [ 2*x1, 0, 0, 0, 0, 0, 0, 0 ]; 312 | [ x2, x1, 0, 0, 0, 0, 0, 0 ]; 313 | [ x3, 0, x1, 0, 0, 0, 0, 0 ]; 314 | [ x4, 0, 0, x1, 0, 0, 0, 0 ]; 315 | [ x5, 0, 0, 0, x1, 0, 0, 0 ]; 316 | [ x6, 0, 0, 0, 0, x1, 0, 0 ]; 317 | [ x7, 0, 0, 0, 0, 0, x1, 0 ]; 318 | [ x8, 0, 0, 0, 0, 0, 0, x1 ]; 319 | [ 0, 2*x2, 0, 0, 0, 0, 0, 0 ]; 320 | [ 0, x3, x2, 0, 0, 0, 0, 0 ]; 321 | [ 0, x4, 0, x2, 0, 0, 0, 0 ]; 322 | [ 0, x5, 0, 0, x2, 0, 0, 0 ]; 323 | [ 0, x6, 0, 0, 0, x2, 0, 0 ]; 324 | [ 0, x7, 0, 0, 0, 0, x2, 0 ]; 325 | [ 0, x8, 0, 0, 0, 0, 0, x2 ]; 326 | [ 0, 0, 2*x3, 0, 0, 0, 0, 0 ]; 327 | [ 0, 0, x4, x3, 0, 0, 0, 0 ]; 328 | [ 0, 0, x5, 0, x3, 0, 0, 0 ]; 329 | [ 0, 0, x6, 0, 0, x3, 0, 0 ]; 330 | [ 0, 0, x7, 0, 0, 0, x3, 0 ]; 331 | [ 0, 0, x8, 0, 0, 0, 0, x3 ]; 332 | [ 0, 0, 0, 2*x4, 0, 0, 0, 0 ]; 333 | [ 0, 0, 0, x5, x4, 0, 0, 0 ]; 334 | [ 0, 0, 0, x6, 0, x4, 0, 0 ]; 335 | [ 0, 0, 0, x7, 0, 0, x4, 0 ]; 336 | [ 0, 0, 0, x8, 0, 0, 0, x4 ]; 337 | [ 0, 0, 0, 0, 2*x5, 0, 0, 0 ]; 338 | [ 0, 0, 0, 0, x6, x5, 0, 0 ]; 339 | [ 0, 0, 0, 0, x7, 0, x5, 0 ]; 340 | [ 0, 0, 0, 0, x8, 0, 0, x5 ]; 341 | [ 0, 0, 0, 0, 0, 2*x6, 0, 0 ]; 342 | [ 0, 0, 0, 0, 0, x7, x6, 0 ]; 343 | [ 0, 0, 0, 0, 0, x8, 0, x6 ]; 344 | [ 0, 0, 0, 0, 0, 0, 2*x7, 0 ]; 345 | [ 0, 0, 0, 0, 0, 0, x8, x7 ]; 346 | [ 0, 0, 0, 0, 0, 0, 0, 2*x8 ]; 347 | ]; 348 | end 349 | 350 | 351 | 352 | -------------------------------------------------------------------------------- /model_free_nonzero_sum_games/eg_two_player_ndg/tndg_model_free_learning.m: -------------------------------------------------------------------------------- 1 | function tndg_model_free_learning 2 | % ''Parallel Control for Nonzero-Sum Games With Completely Unknown 3 | % Nonlinear Dynamics via Reinforcement Learning'' 4 | % 5 | % Code for 2-player nonlinear differential game (TNDG) 6 | % 7 | % By J. Lu 8 | % Date: Aug. 12, 2024 9 | 10 | %-------------------------------- start ----------------------------------- 11 | clear; close all; clc; 12 | 13 | global Ga1; global Ga2; 14 | global Q1; global R11; global R12; 15 | global Q2; global R21; global R22; 16 | global Qa1; global Ra11; global Ra12; 17 | global Qa2; global Ra21; global Ra22; 18 | global w1; global w2; 19 | global dw1; global dw2; 20 | 21 | % system information & performance indices 22 | n = 2; 23 | m1 = 1; 24 | m2 = 1; 25 | eta = n + m1 + m2; 26 | 27 | Ga1 = [ zeros(n,m1); eye(m1); zeros(m2,m1) ]; % $\mathcal{G}_1$ 28 | Ga2 = [ zeros(n,m2); zeros(m1,m2); eye(m2) ]; % $\mathcal{G}_2$ 29 | 30 | Q1 = 2*eye(n); R11 = 2*eye(m1); R12 = 2*eye(m2); 31 | Q2 = 1*eye(n); R21 = 1*eye(m1); R22 = 1*eye(m2); 32 | Qa1 = [ Q1,zeros(n,m1+m2);zeros(m1,n),R11,zeros(m1,m2);zeros(m2,n+m1),R12 ]; Ra11 = 1*eye(m1); Ra12 = 0*eye(m2); 33 | Qa2 = [ Q2,zeros(n,m1+m2);zeros(m1,n),R21,zeros(m1,m2);zeros(m2,n+m1),R22 ]; Ra21 = 0*eye(m1); Ra22 = 1*eye(m2); 34 | 35 | Fsamples = 50000; 36 | T = 0.2; 37 | ss = T/10; 38 | 39 | w1 = [ 1 0 1 1 ... 40 | 1 1 1 ... 41 | 1 -1 ... 42 | 1 ]'; 43 | w2 = w1; 44 | 45 | x_ini = 1; 46 | u_ini = 1; 47 | 48 | x0 = x_ini*[ 1; -1 ]; 49 | u0 = u_ini*[ 0; 0 ]; 50 | x = [ x0; u0; 0; 0; 0; 0; w1; w2 ]; 51 | 52 | nn_length = length(w1); 53 | 54 | xx = [ x ]; 55 | uu1 = []; 56 | uu2 = []; 57 | 58 | ell = length(w1); % number of hidden neurons 59 | L = ell; % need L >= ell to satisfy rank($\mathcal{D}_i$) = $\ell_i$ 60 | 61 | phi = tndg_nn_pf(x(1:eta)); 62 | pphi = zeros(ell,L+1); % number of historical data (L) + real-time data (1) 63 | pp1 = zeros(L+1,1); 64 | pp2 = zeros(L+1,1); 65 | 66 | dw1 = zeros(nn_length,1); 67 | dw2 = zeros(nn_length,1); 68 | 69 | ww1 = [ w1 ]; 70 | ww2 = [ w2 ]; 71 | 72 | l1 = 0.5; % learning rates 73 | l2 = 0.5; 74 | 75 | l1_final = 1e-5; 76 | l2_final = 1e-5; 77 | 78 | l_decay_interval = 1000; 79 | 80 | l1_decay = (l1-l1_final)/(0.8*Fsamples/l_decay_interval); 81 | l2_decay = (l2-l2_final)/(0.8*Fsamples/l_decay_interval); 82 | 83 | i = 1; 84 | 85 | h = waitbar(0,'Please wait'); 86 | for k = 1:Fsamples 87 | if norm(x(1:n),2) > 1e3 % the system is unstable and need to restart the learning 88 | break 89 | end 90 | 91 | if mod(k,l_decay_interval) == 0 92 | l1 = l1 - l1_decay; 93 | l2 = l2 - l2_decay; 94 | end 95 | 96 | if l1 < l1_final 97 | l1 = l1_final; 98 | end 99 | 100 | if l2 < l2_final 101 | l2 = l2_final; 102 | end 103 | 104 | if (norm(x(1:n),2) <= 0.1) % reset controls 105 | u0 = 2*u_ini*[ (rand(1)-0.5); (rand(1)-0.5) ]; 106 | x = [ x(1:n)'; u0; 0; 0; 0; 0; x(eta+5:end)' ]; 107 | phi = tndg_nn_pf(x(1:eta)); 108 | end 109 | 110 | tspan = 0:ss:T; 111 | [t,x]= ode45(@tndg_ode, tspan, x); 112 | 113 | phi_next = tndg_nn_pf(x(length(t),1:eta)); 114 | 115 | if i <= L + 1 116 | pphi(:,i) = phi_next-phi; 117 | pp1(i) = x(length(t),eta+3); 118 | pp2(i) = x(length(t),eta+4); 119 | 120 | phi = phi_next; 121 | 122 | i = i + 1; 123 | else 124 | for j = 1:L 125 | pphi(:,j) = pphi(:,j+1); 126 | pp1(j) = pp1(j+1); 127 | pp2(j) = pp2(j+1); 128 | end 129 | pphi(:,L+1) = phi_next-phi; 130 | pp1(L+1) = x(length(t),eta+3); 131 | pp2(L+1) = x(length(t),eta+4); 132 | 133 | phi = phi_next; 134 | end 135 | 136 | w1 = x(length(t),eta+5:eta+4+nn_length)'; 137 | w2 = x(length(t),eta+5+nn_length:eta+4+2*nn_length)'; 138 | 139 | if i >= L + 1 140 | dw1 = zeros(nn_length,1); 141 | dw2 = zeros(nn_length,1); 142 | for j = 1:L+1 143 | dw1 = dw1 - l1*pphi(:,j)/((1+pphi(:,j)'*pphi(:,j))^2)*(pp1(j)+pphi(:,j)'*w1); 144 | dw2 = dw2 - l2*pphi(:,j)/((1+pphi(:,j)'*pphi(:,j))^2)*(pp2(j)+pphi(:,j)'*w2); 145 | end 146 | end 147 | 148 | x = [x(length(t),1:eta),0,0,0,0,x(length(t),eta+5:end)]; 149 | 150 | xx = [ xx x' ]; 151 | uu1 = [ uu1 x(n+1) ]; 152 | uu2 = [ uu2 x(n+2) ]; 153 | ww1 = [ ww1 w1 ]; 154 | ww2 = [ ww2 w2 ]; 155 | 156 | waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']); 157 | end 158 | close(h); 159 | 160 | w1_display = w1' 161 | w2_display = w2' 162 | 163 | % result 164 | figure(1), % states 165 | subplot(2,1,1) 166 | plot(T*((1:size(xx,2))-1),xx(1,:),'linewidth',1) 167 | xlabel('Time (s)'); 168 | ylabel('$x_{1}$','Interpreter','latex'); 169 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 170 | grid on; 171 | subplot(2,1,2) 172 | plot(T*((1:size(xx,2))-1),xx(2,:),'linewidth',1) 173 | xlabel('Time (s)'); 174 | ylabel('$x_{2}$','Interpreter','latex'); 175 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 176 | grid on; 177 | 178 | figure(2), % control 1 179 | plot(T*((1:size(uu1,2))-1),uu1,'linewidth',1) 180 | xlabel('Time (s)'); 181 | ylabel('$u_1$','Interpreter','latex'); 182 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 183 | grid on; 184 | 185 | figure(3), % control 2 186 | plot(T*((1:size(uu2,2))-1),uu2,'linewidth',1) 187 | xlabel('Time (s)'); 188 | ylabel('$u_2$','Interpreter','latex'); 189 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 190 | grid on; 191 | 192 | figure(4), % w1 193 | plot(T*((1:size(ww1,2))-1),ww1,'linewidth',1) 194 | xlabel('Time (s)'); 195 | ylabel('$\mathcal{W}_{1}$','Interpreter','latex'); 196 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 197 | grid on; 198 | 199 | figure(5), % w2 200 | plot(T*((1:size(ww2,2))-1),ww2,'linewidth',1) 201 | xlabel('Time (s)'); 202 | ylabel('$\mathcal{W}_{2}$','Interpreter','latex'); 203 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 204 | grid on; 205 | 206 | end 207 | %---------------------------- tndg dynamics ------------------------------- 208 | function xdot = tndg_ode(~,x) 209 | global Ga1; global Ga2; 210 | global Q1; global R11; global R12; 211 | global Q2; global R21; global R22; 212 | global Qa1; global Ra11; global Ra12; 213 | global Qa2; global Ra21; global Ra22; 214 | global w1; global w2; 215 | global dw1; global dw2; 216 | 217 | x_state = [ x(1); x(2); ]; 218 | 219 | u1 = x(3); 220 | u2 = x(4); 221 | 222 | u = [ u1; u2 ]; 223 | 224 | s = [ x_state; u ]; 225 | 226 | dphi = tndg_nn_pf_dphi(s); 227 | 228 | v1 = -0.5*inv(Ra11)*Ga1'*dphi'*w1; 229 | v2 = -0.5*inv(Ra22)*Ga2'*dphi'*w2; 230 | 231 | g1 = [ 0; cos(2*x(1))+2 ]; 232 | g2 = [ 0; sin(4*x(1)^2)+2 ]; 233 | 234 | tndg_dynamics = [ -2*x(1) + x(2);... 235 | -0.5*x(1) - x(2) + x(1)^2*x(2) + 0.05*x(2)*(cos(2*x(1))+2)^2 + 0.05*x(2)*(sin(4*x(1))+2)^2 ]... 236 | + g1*u1 + g2*u2; 237 | 238 | xdot = [ tndg_dynamics;... 239 | v1;... 240 | v2;... 241 | x_state'*Q1*x_state + u1'*R11*u1 + u2'*R12*u2;... 242 | x_state'*Q2*x_state + u1'*R21*u1 + u2'*R22*u2;... 243 | s'*Qa1*s + v1'*Ra11*v1 + v2'*Ra12*v2;... 244 | s'*Qa2*s + v1'*Ra21*v1 + v2'*Ra22*v2;... 245 | dw1;... 246 | dw2 ]; 247 | end 248 | %------------------- activation function vector of critic ------------------ 249 | function y = tndg_nn_pf(x) 250 | x1 = x(1); 251 | x2 = x(2); 252 | x3 = x(3); 253 | x4 = x(4); 254 | 255 | y = [ x1^2; x1*x2; x1*x3; x1*x4;... 256 | x2^2; x2*x3; x2*x4;... 257 | x3^2; x3*x4;... 258 | x4^2 ]; 259 | end 260 | %-------------- derivative of critic activation function vector ----------- 261 | function dphi = tndg_nn_pf_dphi(x) 262 | x1 = x(1); 263 | x2 = x(2); 264 | x3 = x(3); 265 | x4 = x(4); 266 | 267 | dphi = [ 268 | [ 2*x1, 0, 0, 0 ]; 269 | [ x2, x1, 0, 0 ]; 270 | [ x3, 0, x1, 0 ]; 271 | [ x4, 0, 0, x1 ]; 272 | [ 0, 2*x2, 0, 0 ]; 273 | [ 0, x3, x2, 0 ]; 274 | [ 0, x4, 0, x2 ]; 275 | [ 0, 0, 2*x3, 0 ]; 276 | [ 0, 0, x4, x3 ]; 277 | [ 0, 0, 0, 2*x4 ]; 278 | ]; 279 | end 280 | 281 | 282 | 283 | 284 | -------------------------------------------------------------------------------- /online_learning_policy_update/puol_algorithm.m: -------------------------------------------------------------------------------- 1 | function puol_algorithm 2 | % This demo checks the feasibility of policy update online learning 3 | 4 | % By J. Lu 5 | % Date: Apr. 16, 2021 6 | 7 | %-------------------------------- start ----------------------------------- 8 | clear; close all; clc; 9 | 10 | A = [0 -0.8;0.8 1.8]; B = [0; -1]; 11 | Q = eye(2); R = 1; 12 | 13 | ac = 10e-6; 14 | aa = 0.1; 15 | wc = zeros(8,1); 16 | wa = [0.5;1.4;0;0;0;0]; 17 | wa_ini = wa; 18 | Nc = 4; 19 | 20 | x = 5*[(rand(1,1)-0.5);(rand(1,1)-0.5)]; 21 | xx = [x]; 22 | xxc = []; 23 | rrc = []; 24 | uu = []; 25 | wwc = [wc]; 26 | wwa = [wa]; 27 | 28 | N = 10000; 29 | 30 | h = waitbar(0,'please wait'); 31 | for k = 1:N 32 | if length(xxc) < Nc 33 | u = actor(wa,actor_activation_function(x)); 34 | else 35 | if length(xxc) == Nc 36 | wa = actor_weights(x,actor(wa,actor_activation_function(x)),wa,aa,wc,A,B,Q,R); 37 | u = actor(wa,actor_activation_function(x)); 38 | end 39 | end 40 | 41 | r = x'*Q*x+u'*R*u; 42 | 43 | % update system states 44 | x = controlled_system(x,u,A,B); 45 | 46 | xx = [xx x]; 47 | uu = [uu u]; 48 | 49 | if length(xxc) < Nc 50 | xxc = [xxc x]; 51 | rrc = [rrc r]; 52 | else 53 | if length(xxc) == Nc 54 | xxc_temp = xxc(:,2:end); 55 | xxc_temp = [xxc_temp x]; 56 | xxc = xxc_temp; 57 | 58 | rrc_temp = rrc(:,2:end); 59 | rrc_temp = [rrc_temp r]; 60 | rrc = rrc_temp; 61 | end 62 | end 63 | 64 | if length(xxc) == Nc 65 | wc = critic_weights(xxc,rrc,wc,ac); 66 | end 67 | 68 | if norm(x,2)<10e-5 % states converge to 0 and reset the initial states 69 | % usually, it is necessary to change controls to change system states 70 | % since the paper does not explain how to excite the system 71 | % a simple reset of initial states is used 72 | x = 5*[(rand(1,1)-0.5);(rand(1,1)-0.5)]; 73 | xxc = []; 74 | rrc = []; 75 | end 76 | 77 | wwc = [wwc wc]; 78 | wwa = [wwa wa]; 79 | 80 | waitbar(k/N,h,['Running...',num2str(k/N*100),'%']); 81 | end 82 | close(h); 83 | 84 | % check results 85 | wa_ini = wa_ini' 86 | 87 | wc_final = wc' 88 | wa_final = wa' 89 | 90 | [Kopt, Popt] = dlqr(A,B,Q,R); 91 | 92 | wc_opt = [ Popt(1,1) 2*Popt(1,2) Popt(2,2) ] 93 | wa_opt = [-Kopt(1,1) -Kopt(1,2)] 94 | 95 | 96 | figure(1), % states 97 | plot(((1:size(xx,2))-1),xx,'linewidth',1) 98 | xlabel('Time steps'); 99 | ylabel('States'); 100 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 101 | grid on; 102 | 103 | figure(2), % control 104 | plot(((1:size(uu,2))-1),uu,'b','linewidth',1) 105 | xlabel('Time steps'); 106 | ylabel('Control'); 107 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 108 | grid on; 109 | 110 | figure(3), % critic weights 111 | plot(((1:size(wwc,2))-1),wwc,'linewidth',1) 112 | xlabel('Time (s)'); 113 | ylabel('Critic weights'); 114 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 115 | grid on; 116 | 117 | figure(4), % actors weights 118 | plot(((1:size(wwa,2))-1),wwa,'linewidth',1) 119 | xlabel('Time (s)'); 120 | ylabel('Action weights'); 121 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 122 | grid on; 123 | 124 | end 125 | 126 | %--------------------------- outpout of system ---------------------------- 127 | function y = controlled_system(x,u,A,B) 128 | y = A*x + B*u; 129 | end 130 | %-------------------------------- actor ----------------------------------- 131 | function y = actor(wa,oa) 132 | y = wa'*oa; 133 | end 134 | %----------------------- critic activation function ----------------------- 135 | function y = critic_activation_function(x) 136 | y = [ x(1)^2; x(1)*x(2); x(2)^2;... 137 | x(1)^4; x(1)^3*x(2); x(1)^2*x(2)^2;... 138 | x(1)*x(2)^3; x(2)^4 ]; 139 | end 140 | %----------------------- action activation function ----------------------- 141 | function y = actor_activation_function(x) 142 | y = [ x(1) x(2) (x(1)^2)*x(2) x(1)^3 x(1)*x(2)^2 x(2)^3]'; 143 | end 144 | %------------------------- update critic weights -------------------------- 145 | function wc = critic_weights(xx,rr,w,ac) 146 | X = []; 147 | Y = []; 148 | 149 | for k = length(xx):-1:2 150 | X = [ X critic_activation_function(xx(:,k)) -... 151 | critic_activation_function(xx(:,k-1)) ]; 152 | end 153 | 154 | for j = length(rr):-1:2 155 | Y = [ Y rr(j)]; 156 | end 157 | 158 | E = Y + w'*X; 159 | wc = X*pinv(X'*X)*(ac*E'-Y'); 160 | end 161 | %-------------------------- update actor weights -------------------------- 162 | function wa = actor_weights(x,u0,w,aa,wc,A,B,Q,R) 163 | rho = actor_activation_function(x); 164 | 165 | objective = @(u) x'*Q*x+u'*R*u + ... 166 | wc'*critic_activation_function(controlled_system(x,u,A,B)); 167 | uopt = fminunc(objective,u0); 168 | 169 | ue = w'*rho - uopt; 170 | 171 | wa = w - aa*rho*ue/(rho'*rho+1); 172 | end 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /online_learning_without_initial_admissible_control/ol_algorithm.m: -------------------------------------------------------------------------------- 1 | function ol_algorithm 2 | % This demo checks the feasibility of online leranbing without 3 | % initial admissible control 4 | 5 | % By J. Lu 6 | % Date: Mar. 16, 2021 7 | 8 | %-------------------------------- start ----------------------------------- 9 | clear; close all; clc; 10 | 11 | global u; 12 | global Q; global R; 13 | global probing_noise; 14 | 15 | Q = 1*eye(2); R = 1; 16 | 17 | wc_ini = [0.1;0;0.1]; 18 | wc = wc_ini; 19 | 20 | length_nn = length(wc_ini); 21 | 22 | x_ini = [1;-1]; 23 | x0 = [x_ini; wc]'; 24 | xx = [x0']; 25 | uu = []; 26 | wwc = [wc]; 27 | 28 | % simulation settings 29 | Fsamples = 5000; 30 | T = 0.1; 31 | ss = T/10; 32 | tspan = 0:ss:T; 33 | 34 | % generate the probing noise 35 | A = 10; 36 | noise = sqrt(A)*randn(1,Fsamples/T); 37 | 38 | N = length(noise); 39 | n = 1; 40 | 41 | h = waitbar(0,'please wait'); 42 | for k = 1:Fsamples 43 | probing_noise = noise(n); 44 | 45 | [t,x]= ode45(@spiol_ode, tspan, x0); 46 | 47 | x0 = x(length(t),:); 48 | xx = [xx x0']; 49 | uu = [uu u]; 50 | wwc = [wwc, x0(3:end)']; 51 | 52 | n = n + 1; 53 | 54 | if n > N 55 | n = 1; 56 | end 57 | 58 | waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']); 59 | end 60 | close(h) 61 | 62 | % check results 63 | wc_ini = wc_ini' 64 | wc_final = x0(3:end) 65 | 66 | wc_opt = [0.5 0 1] 67 | 68 | % results 69 | figure(1), % states 70 | plot(T*((1:size(xx,2))-1),xx(1:2,:),'linewidth',1) 71 | xlabel('Time (s)'); 72 | ylabel('$x$','Interpreter','latex'); 73 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 74 | grid on; 75 | 76 | figure(2), % control 77 | plot(T*((1:size(uu,2))-1),uu,'b','linewidth',1) 78 | xlabel('Time (s)'); 79 | ylabel('Control'); 80 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 81 | grid on; 82 | 83 | figure(3), % critic weights 84 | plot(T*((1:size(wwc,2))-1),wwc,'linewidth',1) 85 | xlabel('Time (s)'); 86 | ylabel('Critic weights'); 87 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 88 | grid on; 89 | 90 | end 91 | 92 | %-------------------------------- ode ------------------------------------- 93 | function sdot = spiol_ode(~,s) 94 | global u; 95 | global Q; global R; 96 | global probing_noise; 97 | 98 | % learning parameters 99 | a1 = 25; 100 | a2 = 0.01; 101 | 102 | x = s(1:2); 103 | wc = s(3:end); 104 | 105 | f = [ -x(1)+x(2);... 106 | -x(1)/2-x(2)*(1-(cos(2*x(1)+2)^2))/2 ]; 107 | 108 | g = [ 0;... 109 | cos(2*x(1)+2)]; 110 | 111 | pphi = pd_phi(x); 112 | 113 | ux = -0.5*inv(R)*g'*pphi'*wc; 114 | 115 | % candidate Lyapunov function V = 0.5*(x1^2+x2^2); 116 | pV = [ x(1); x(2) ]; 117 | 118 | dV = x'*(f+g*u); 119 | 120 | if dV > 0 121 | k = 0.5; 122 | else 123 | k = 0; 124 | end 125 | 126 | sigma = pphi*(f+g*ux); 127 | 128 | dwc = -a1*sigma/((sigma'*sigma+1)^2)*(sigma'*wc+x'*Q*x+ux'*R*ux)... 129 | + k*a2*pphi*g*inv(R)*g'*pV; 130 | 131 | u = ux + probing_noise; 132 | 133 | sdot = [ f+g*u; dwc ]; 134 | end 135 | %------------- partial derivative of activation function vector ----------- 136 | function pphi = pd_phi(x) 137 | pphi = [ 2*x(1) 0;... 138 | x(2) x(1);... 139 | 0 2*x(2)]; 140 | end 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /parallel_control_based_optimal_tracking/pc_ot_adp_main.m: -------------------------------------------------------------------------------- 1 | % Code for paper 2 | % J. Lu, Q. Wei, and F.-Y. Wang, "Parallel control for optimal tracking via 3 | % adaptive dynamic programming," IEEE/CAA Journal of Automatica Sinica, 4 | % vol. 7, no. 6, pp. 1662-1674, Nov. 2020. 5 | % 6 | % The paper is available at: 7 | % https://www.ieee-jas.net/en/article/doi/10.1109/JAS.2020.1003426 8 | % 9 | % The main contribution of this paper is to propose an online optimal 10 | % tracking control method for nonaffine systems without reconstructing the 11 | % nonaffine system into an affine form at a certain operating point 12 | % 13 | % P.S. This code is slightly different from the paper 14 | % 15 | % By J. Lu 16 | % Date: Apr. 8, 2023 17 | 18 | %-------------------------------- start ----------------------------------- 19 | clear;close all;clc; 20 | 21 | global Ga; 22 | global Q; global R; global Qa; global Ra; 23 | global w; 24 | global a1; global a2; 25 | global ra; 26 | global ea; 27 | global v; 28 | 29 | % parameters 30 | r = [0.5;0.125]; % reference signal 31 | ur = 1.1947448; % reference control 32 | % this code constructs the augmented system and reference signal first and derives the augmented tracking error system later 33 | % the paper obtains the tracking error system first and derives the augmented tracking error system later 34 | % both methods are acceptable 35 | ra = [r;ur]; % augmented reference signal 36 | 37 | Ga = [zeros(2,1); diag([1])]; 38 | 39 | Q = 1*eye(2); R = 1*eye(1); 40 | Qa = diag([diag(Q)' diag(R)']); Ra = 1*diag([1]); 41 | 42 | 43 | Fsamples = 20000; 44 | T = 0.05; 45 | ss = T/10; 46 | 47 | x00 = [0;0]; 48 | u00 = 0; 49 | w00 = zeros(12,1); % initial critic network weights 50 | x = [x00;u00;0;0;w00]; 51 | w = x(6:end); 52 | x0 = x; 53 | xx = [x]; 54 | uu = [x(3)]; 55 | rr = [r]; 56 | vv = []; 57 | 58 | ww = [w]; 59 | 60 | % learning rates are different from the paper due to the introduction of the probing noise 61 | a1 = 20; 62 | a2 = 0.1; 63 | 64 | Jo = 0; 65 | Ja = 0; 66 | 67 | rpb_k = Fsamples - 200; 68 | 69 | 70 | h = waitbar(0,'please wait'); 71 | for k = 1:Fsamples 72 | 73 | if k~= 1 && norm(ea(1:3),2) <= 0.5 && k <= rpb_k % probing noise 74 | x = [ra + 0.5*(rand(3,1)-0.5);x(4:end)']; 75 | end 76 | 77 | tspan = 0:ss:T; 78 | [t,x]= ode45(@pc_ot_adp_ode, tspan, x); 79 | 80 | Jo = Jo + x(length(t),4); 81 | Ja = Ja + x(length(t),5); 82 | 83 | x = [ x(length(t),1:3),0,0,x(length(t),6:end) ]; 84 | w = x(6:end); 85 | 86 | rr = [rr r]; 87 | xx = [xx x']; 88 | uu = [uu x(3)]; 89 | vv = [vv v]; 90 | ww = [ww w']; 91 | 92 | waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']); 93 | end 94 | close(h); 95 | 96 | Jo 97 | Ja 98 | 99 | % result 100 | figure(1), % states 101 | plot(T*((1:size(rr,2))-1),rr(1,:),'r','linewidth',1),hold on; 102 | plot(T*((1:size(xx,2))-1),xx(1,:),'b--','linewidth',1); 103 | xlabel('Time (s)'); 104 | ylabel('$x_1$','Interpreter','latex'); 105 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 106 | grid on; 107 | 108 | figure(2), % states 109 | plot(T*((1:size(rr,2))-1),rr(2,:),'r','linewidth',1),hold on; 110 | plot(T*((1:size(xx,2))-1),xx(2,:),'b--','linewidth',1); 111 | xlabel('Time (s)'); 112 | ylabel('$x_2$','Interpreter','latex'); 113 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 114 | grid on; 115 | 116 | figure(3), % real control 117 | plot(T*((1:size(uu,2))-1),uu,'b','linewidth',1) 118 | xlabel('Time (s)'); 119 | ylabel('$u$','Interpreter','latex'); 120 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 121 | grid on; 122 | 123 | figure(4), % virtual control 124 | plot(T*((1:size(vv,2))-1),vv,'b','linewidth',1) 125 | xlabel('Time (s)'); 126 | ylabel('$v$','Interpreter','latex'); 127 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 128 | grid on; 129 | 130 | figure(5), % wc 131 | plot(T*((1:size(ww,2))-1),ww,'linewidth',1) 132 | xlabel('Time (s)'); 133 | ylabel('Critic newtork weights'); 134 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 135 | grid on; 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /parallel_control_based_optimal_tracking/pc_ot_adp_nn_pf_dphi.m: -------------------------------------------------------------------------------- 1 | function dphi = pc_ot_adp_nn_pf_dphi(x) 2 | dphi = [ 3 | [ 2*x(1), 0, 0 ]; 4 | [ x(2), x(1), 0 ]; 5 | [ x(3), 0, x(1) ]; 6 | [ 0, 2*x(2), 0 ]; 7 | [ 0, x(3), x(2) ]; 8 | [ 0, 0, 2*x(3) ]; 9 | [ 3*x(1)^2, 0, 0 ]; 10 | [ 0, 3*x(2)^2, 0 ]; 11 | [ 0, 0, 3*x(3)^2 ]; 12 | [ 4*x(1)^3, 0, 0 ]; 13 | [ 0, 4*x(2)^3, 0 ]; 14 | [ 0, 0, 4*x(3)^3 ]; 15 | ]; 16 | end -------------------------------------------------------------------------------- /parallel_control_based_optimal_tracking/pc_ot_adp_ode.m: -------------------------------------------------------------------------------- 1 | function xdot = pc_ot_adp_ode(~,x) 2 | 3 | global Ga; 4 | global Q; global R; global Qa; global Ra; 5 | global dw; 6 | global w; 7 | global a1; global a2; 8 | global ra; 9 | global ea; 10 | global v; 11 | 12 | w = x(6:end); 13 | 14 | u = x(3); 15 | 16 | fo_x = [-x(1)^3 + x(2); x(1)^2 - x(1) - x(2) + 0.15*u^3 + sin(0.1*u) ]; % nonaffine system 17 | 18 | fo_r = [-ra(1)^3 + ra(2); ra(1)^2 - ra(1) - ra(2) + 0.15*ra(3)^3 + sin(0.1*ra(3)) ]; % reference signal 19 | 20 | % construct the augmented system and reference signal first and derive the augmented tracking error system later 21 | % the paper obtains the tracking error system first and derives the augmented tracking error system later 22 | % both methods are acceptable 23 | fa_e = [fo_x - fo_r; 0]; 24 | 25 | ea = x(1:3) - ra; 26 | 27 | dphi = pc_ot_adp_nn_pf_dphi(ea(1:3)); 28 | 29 | v = -0.5*inv(Ra)*Ga'*dphi'*w; 30 | 31 | dVe = ea'*(fa_e+Ga*v); % V = 0.5*(e1^2+e2^2+e3^2); 32 | 33 | pVe = [ea(1);ea(2);ea(3)]; 34 | 35 | if dVe > 0 36 | k = 0.5; 37 | end 38 | if dVe <= 0 39 | k = 0; 40 | end 41 | 42 | sigma = dphi*(fa_e+Ga*v); 43 | dw = -a1*sigma/((sigma'*sigma + 1)^2)*(sigma'*w + ea'*Qa*ea + v'*Ra*v)+k*a2*dphi*Ga*inv(Ra)*Ga'*pVe; 44 | 45 | xdot = [ fo_x; v;... 46 | [ea(1);ea(2)]'*Q*[ea(1);ea(2)]+ea(3)'*R*ea(3);... 47 | [ea(1);ea(2);ea(3)]'*Qa*[ea(1);ea(2);ea(3)]+v'*Ra*v;... 48 | dw]; 49 | 50 | end -------------------------------------------------------------------------------- /policy_iteration/pi_algorithm.m: -------------------------------------------------------------------------------- 1 | function pi_algorithm 2 | % This demo checks the feasibility of the policy iteration adaptive dynamic 3 | % programming algorithm 4 | 5 | % By J. Lu 6 | % Date: Apr. 21, 2020 7 | 8 | %-------------------------------- start ----------------------------------- 9 | clear; close all; clc; 10 | 11 | % information of system & cost function 12 | global A; global B; global Q; global R; 13 | 14 | load training_data/state_data.mat; 15 | load training_data/actor_init.mat; 16 | 17 | % action network 18 | actor = actor_init; 19 | 20 | actor_epoch = 20000; 21 | actor_err_goal = 1e-9; 22 | actor_lr = 0.01; 23 | actor.trainParam.epochs = actor_epoch; 24 | actor.trainParam.goal = actor_err_goal; 25 | actor.trainParam.show = 10; 26 | actor.trainParam.lr = actor_lr; 27 | 28 | % critic network 29 | critic_middle_num = 15; 30 | critic_epoch = 10000; 31 | critic_err_goal = 1e-9; 32 | critic_lr = 0.01; 33 | critic = newff(minmax(x_train), [critic_middle_num 1], {'tansig' 'purelin'},'trainlm'); 34 | critic.trainParam.epochs = critic_epoch; 35 | critic.trainParam.goal = critic_err_goal; 36 | critic.trainParam.show = 10; 37 | critic.trainParam.lr = critic_lr; 38 | critic.biasConnect = [1;0]; 39 | 40 | epoch = 10; 41 | eval_step = 400; 42 | performance_index = ones(1,epoch + 1); 43 | 44 | figure(1),hold on; 45 | h = waitbar(0,'Please wait'); 46 | for i = 1:epoch 47 | % update critic 48 | % evaluate policy 49 | critic_target = evaluate_policy(actor, x_train, eval_step); 50 | critic = train(critic,x_train,critic_target); 51 | 52 | performance_index(i) = critic(x0); 53 | figure(1),plot(i,performance_index(i),'*'),xlim([1 epoch]),hold on; 54 | 55 | waitbar(i/epoch,h,['Training controller...',num2str(i/epoch*100),'%']); 56 | if i == epoch 57 | break; 58 | end 59 | 60 | % update actor 61 | actor_target = zeros(control_dim,size(x_train,2)); 62 | for j = 1:size(x_train,2) 63 | x = x_train(:,j); 64 | if x == zeros(state_dim,1) 65 | ud = zeros(control_dim,1); 66 | else 67 | objective = @(u) cost_function(x,u) + critic(controlled_system(x,u)); 68 | u0 = actor(x); 69 | ud = fminunc(objective, u0); 70 | end 71 | actor_target(:,j) = ud; 72 | end 73 | 74 | actor = train(actor, x_train, actor_target); 75 | end 76 | close(h) 77 | 78 | figure(1), 79 | xlabel('Iterations'); 80 | ylabel('$V(x_0)$','Interpreter','latex'); 81 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 82 | grid on; 83 | hold off; 84 | 85 | save training_results/actor_critic actor critic 86 | end 87 | 88 | %---------------------------- evaluate policy ----------------------------- 89 | function y = evaluate_policy(actor,x,eval_step) 90 | critic_target = zeros(1,size(x,2)); 91 | for k = 1:eval_step 92 | uep = actor(x); 93 | critic_target = critic_target + cost_function(x,uep); 94 | x = controlled_system(x,uep); 95 | end 96 | y = critic_target; 97 | end 98 | %--------------------------- outpout of system ---------------------------- 99 | function y = controlled_system(x,u) 100 | % system matrices 101 | global A; global B; 102 | y = A*x + B*u; % dot product should be adopt in nolinear systems 103 | end 104 | %----------------------------- cost function ------------------------------ 105 | function y = cost_function(x,u) 106 | global Q; global R; 107 | y = (diag(x'*Q*x) + diag(u'*R*u))'; 108 | end 109 | 110 | -------------------------------------------------------------------------------- /policy_iteration/pi_icp.m: -------------------------------------------------------------------------------- 1 | %-------------------------- obtain the initial admissible control ---------------------------- 2 | clear; close all; clc; 3 | 4 | load training_data/state_data.mat 5 | 6 | [K, P] = dlqr(A,B,Q,100*R); 7 | 8 | actor_target = -K*x_train; 9 | 10 | cover = 1; 11 | if isempty(dir('training_data/actor_init.mat')) == 1 || cover == 1 12 | % action network 13 | actor_init_middle_num = 15; 14 | actor_init_epoch = 10000; 15 | actor_init_err_goal = 1e-9; 16 | actor_init_lr = 0.01; 17 | actor_init = newff(minmax(x_train), [actor_init_middle_num control_dim], {'tansig' 'purelin'},'trainlm'); 18 | actor_init.trainParam.epochs = actor_init_epoch; 19 | actor_init.trainParam.goal = actor_init_err_goal; 20 | actor_init.trainParam.show = 10; 21 | actor_init.trainParam.lr = actor_init_lr; 22 | actor_init.biasConnect = [1;0]; 23 | 24 | actor_init = train(actor_init, x_train, actor_target); 25 | 26 | save training_data/actor_init actor_init 27 | else 28 | load training_data/actor_init 29 | end 30 | 31 | 32 | %-------------------------- test the initial control ---------------------------- 33 | x = x0; 34 | x_net = x; 35 | 36 | xx = x; 37 | xx_net = x_net; 38 | uu = []; 39 | uu_net = []; 40 | 41 | Fsamples = 200; 42 | JK = 0; 43 | Jnet = 0; 44 | 45 | h = waitbar(0,'Please wait'); 46 | for k = 1:Fsamples 47 | u = -K*x; 48 | u_net = actor_init(x_net); 49 | JK = JK + x'*Q*x + u'*R*u; 50 | Jnet = Jnet + x_net'*Q*x_net + u_net'*R*u_net; 51 | x = A*x + B*u; 52 | xx = [xx x]; 53 | x_net = A*x_net + B*u_net; 54 | xx_net = [xx_net x_net]; 55 | uu = [uu u]; 56 | uu_net = [uu_net u_net]; 57 | waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']); 58 | end 59 | close(h) 60 | 61 | JK 62 | Jnet 63 | 64 | figure, 65 | plot(0:Fsamples,xx,'b-',0:Fsamples,xx_net,'r--','linewidth',1) 66 | xlabel('Time steps'); 67 | ylabel('States'); 68 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 69 | grid on; 70 | 71 | figure, 72 | plot(0:Fsamples-1,uu,'b-',0:Fsamples-1,uu_net,'r--','linewidth',1) 73 | xlabel('Time steps'); 74 | ylabel('Control'); 75 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 76 | grid on; 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /policy_iteration/pi_state_data.m: -------------------------------------------------------------------------------- 1 | %------------------------- generate training data & system information ---------------------------- 2 | clear; close all; clc; 3 | 4 | % system matrices 5 | A = [ 0, 0.1;... 6 | 0.3, -1 ]; 7 | B = [ 0;... 8 | 0.5 ]; 9 | 10 | state_dim = size(A,1); 11 | control_dim = size(B,2); 12 | 13 | % cost function parameters 14 | Q = 1*eye(state_dim); 15 | R = 1*eye(control_dim); 16 | 17 | % training data 18 | x_train = zeros(state_dim,1); 19 | x0 = [1;-1]; 20 | 21 | for i = 1:50 22 | x_train = [x_train, zeros(state_dim,1)]; 23 | x_train = [x_train,2*(rand(state_dim,1)-0.5)]; 24 | x_train = [x_train,1*(rand(state_dim,1)-0.5)]; 25 | x_train = [x_train,0.5*(rand(state_dim,1)-0.5)]; 26 | end 27 | 28 | r = randperm(size(x_train,2)); % randomization according to column 29 | x_train = x_train(:, r); % reorder 30 | 31 | save training_data/state_data x_train state_dim control_dim A B Q R x0; 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /policy_iteration/pi_validation.m: -------------------------------------------------------------------------------- 1 | clear; close all; clc; 2 | 3 | load training_data/state_data.mat; 4 | load training_results/actor_critic.mat 5 | 6 | 7 | [Kopt, Popt] = dlqr(A,B,Q,R); 8 | 9 | x = x0; 10 | x_net = x; 11 | xx = x; 12 | xx_net = x_net; 13 | uu_opt = []; 14 | uu_net = []; 15 | 16 | Jreal = 0; 17 | 18 | Fsamples = 50; 19 | h = waitbar(0,'Please wait'); 20 | for k = 1:Fsamples 21 | uopt = -Kopt*x; 22 | x = A*x + B*(uopt); 23 | xx = [xx x]; 24 | u_net = sim(actor,x_net); 25 | Jreal = Jreal + x_net'*Q*x_net + u_net'*R*u_net; 26 | x_net = A*x_net + B*u_net; 27 | xx_net = [xx_net x_net]; 28 | uu_opt = [uu_opt uopt]; 29 | uu_net = [uu_net u_net]; 30 | waitbar(k/Fsamples,h,['Running...',num2str(k/Fsamples*100),'%']); 31 | end 32 | close(h) 33 | 34 | Jopt = x0'*Popt*x0 35 | Jnet = critic(x0) 36 | Jreal 37 | 38 | figure, 39 | plot(0:Fsamples,xx,'b-',0:Fsamples,xx_net,'r--','linewidth',1) 40 | xlabel('Time steps'); 41 | ylabel('States'); 42 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 43 | grid on; 44 | figure, 45 | plot(0:Fsamples-1,uu_opt,'b-',0:Fsamples-1,uu_net,'r--','linewidth',1) 46 | legend('Optimal ','NN','Interpreter','latex'); 47 | xlabel('Time steps'); 48 | ylabel('Control'); 49 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 50 | grid on; 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /policy_iteration/training_data/actor_init.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lujingweihh/Adaptive-dynamic-programming-algorithms/100893bfc9df3e55dc346720939fba4ee5cc8b93/policy_iteration/training_data/actor_init.mat -------------------------------------------------------------------------------- /policy_iteration/training_data/state_data.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lujingweihh/Adaptive-dynamic-programming-algorithms/100893bfc9df3e55dc346720939fba4ee5cc8b93/policy_iteration/training_data/state_data.mat -------------------------------------------------------------------------------- /policy_iteration/training_results/actor_critic.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lujingweihh/Adaptive-dynamic-programming-algorithms/100893bfc9df3e55dc346720939fba4ee5cc8b93/policy_iteration/training_results/actor_critic.mat -------------------------------------------------------------------------------- /policy_iteration_continuous_time/pi_algorithm.m: -------------------------------------------------------------------------------- 1 | function pi_algorithm 2 | % This demo checks the feasibility of the policy iteration adaptive dynamic 3 | % programming algorithm for continuous-time systems 4 | 5 | % By J. Lu 6 | % Date: Nov. 27, 2020 7 | 8 | %-------------------------------- start ----------------------------------- 9 | clear; close all; clc; 10 | 11 | % system information 12 | A = [-0.0665 11.5 0 0; 13 | 0 -2.5 2.5 0; 14 | -9.5 0 -13.736 -13.736; 15 | 0.6 0 0 0]; 16 | B = [0; 0; 13.736; 0]; 17 | 18 | n = 4; 19 | m = 1; 20 | 21 | Q = eye(n); R = eye(m); 22 | 23 | [~,P_opt] = lqr(A,B,Q,R); 24 | [~,P_init] = lqr(A,B,Q,0.001*R); % obtain initial weights 25 | 26 | w_opt = [ P_opt(1,1); 2*P_opt(1,2); 2*P_opt(1,3); 2*P_opt(1,4);... 27 | P_opt(2,2); 2*P_opt(2,3); 2*P_opt(2,4);... 28 | P_opt(3,3); 2*P_opt(3,4);... 29 | P_opt(4,4) ]; 30 | 31 | w_init = [ P_init(1,1); 2*P_init(1,2); 2*P_init(1,3); 2*P_init(1,4);... 32 | P_init(2,2); 2*P_init(2,3); 2*P_init(2,4);... 33 | P_init(3,3); 2*P_init(3,4);... 34 | P_init(4,4) ]; 35 | 36 | w = w_init; 37 | 38 | ww = [ w ]; 39 | 40 | % generate training data 41 | x0 = [ 1; -1; 0; 0 ]; 42 | x_train = [ x0 ]; 43 | 44 | for i = 1:100 45 | x_train = [ x_train, zeros(n,1) ]; 46 | x_train = [ x_train,10*(rand(n,1)-0.5) ]; 47 | x_train = [ x_train,5*(rand(n,1)-0.5) ]; 48 | x_train = [ x_train,2*(rand(n,1)-0.5) ]; 49 | x_train = [ x_train,1*(rand(n,1)-0.5) ]; 50 | x_train = [ x_train,0.5*(rand(n,1)-0.5) ]; 51 | x_train = [ x_train,0.1*(rand(n,1)-0.5) ]; 52 | x_train = [ x_train,0.05*(rand(n,1)-0.5) ]; 53 | x_train = [ x_train,0.01*(rand(n,1)-0.5) ]; 54 | end 55 | 56 | L = length(x_train); 57 | 58 | epoch = 50; 59 | 60 | h = waitbar(0,'please wait'); 61 | for k = 1:epoch 62 | UU = zeros(L,1); 63 | XX = zeros(L,length(w)); 64 | for i = 1:L 65 | x = x_train(:,i); 66 | 67 | dphi = [ 68 | [ 2*x(1), 0, 0, 0 ]; 69 | [ x(2), x(1), 0, 0 ]; 70 | [ x(3), 0, x(1), 0 ]; 71 | [ x(4), 0, 0, x(1) ]; 72 | [ 0, 2*x(2), 0, 0 ]; 73 | [ 0, x(3), x(2), 0 ]; 74 | [ 0, x(4), 0, x(2) ]; 75 | [ 0, 0, 2*x(3), 0 ]; 76 | [ 0, 0, x(4), x(3) ]; 77 | [ 0, 0, 0, 2*x(4) ]; 78 | ]; 79 | 80 | u = -0.5*inv(R)*B'*dphi'*w; 81 | 82 | UU(i) = x'*Q*x + u'*R*u; 83 | XX(i,:) = (dphi*(A*x + B*u))'; 84 | end 85 | 86 | w = -XX\UU; 87 | 88 | ww = [ ww w ]; 89 | 90 | waitbar(k/epoch,h,['Running...',num2str(k/epoch*100),'%']); 91 | end 92 | close(h); 93 | 94 | % comparison 95 | w_init_display = w_init' 96 | w_final_display = w' 97 | w_opt_display = w_opt' 98 | 99 | figure(1), 100 | plot(((1:size(ww,2))-1),ww,'linewidth',1) 101 | xlabel('Epoch'); 102 | ylabel('$w$','Interpreter','latex'); 103 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 104 | grid on; 105 | 106 | end 107 | 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /value_iteration/training_data/state_data.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lujingweihh/Adaptive-dynamic-programming-algorithms/100893bfc9df3e55dc346720939fba4ee5cc8b93/value_iteration/training_data/state_data.mat -------------------------------------------------------------------------------- /value_iteration/training_results/actor_critic.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lujingweihh/Adaptive-dynamic-programming-algorithms/100893bfc9df3e55dc346720939fba4ee5cc8b93/value_iteration/training_results/actor_critic.mat -------------------------------------------------------------------------------- /value_iteration/vi_algorithm.m: -------------------------------------------------------------------------------- 1 | function vi_algorithm 2 | % This demo checks the feasibility of value iteration adaptive dynamic programming 3 | 4 | % By J. Lu 5 | % Date: May 6, 2020 6 | 7 | %-------------------------------- start ----------------------------------- 8 | clear; close all; clc; 9 | 10 | % information of system & cost function 11 | global A; global B; global Q; global R; 12 | 13 | load training_data/state_data.mat 14 | 15 | % action network 16 | actor_middle_num = 8; 17 | actor_epoch = 20000; 18 | actor_err_goal = 1e-7; 19 | actor_lr = 0.05; 20 | actor = newff(minmax(x_train), [actor_middle_num control_dim], {'tansig' 'purelin'},'trainlm'); 21 | actor.trainParam.epochs = actor_epoch; % max epochs 22 | actor.trainParam.goal = actor_err_goal; % tolerance error 23 | actor.trainParam.show = 10; % interval 24 | actor.trainParam.lr = actor_lr; % learning rate - traingd,traingdm 25 | actor.biasConnect = [1;0];% bias 26 | 27 | % critic network 28 | critic_middle_num = 8; 29 | critic_epoch = 10000; 30 | critic_err_goal = 1e-7; 31 | critic_lr = 0.01; 32 | critic = newff(minmax(x_train), [critic_middle_num 1], {'tansig' 'purelin'},'trainlm'); 33 | critic.trainParam.epochs = critic_epoch; 34 | critic.trainParam.goal = critic_err_goal; 35 | critic.trainParam.show = 10; 36 | critic.trainParam.lr = critic_lr; 37 | critic.biasConnect = [1;0]; 38 | critic_last = critic; 39 | 40 | epoch = 50; 41 | tol = 1e-9; 42 | performance_index = zeros(1,epoch + 1); 43 | 44 | critic_set = cell(1,epoch); 45 | actor_set = cell(1,epoch); 46 | 47 | figure(1),plot(0,0,'*') 48 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 49 | grid on; 50 | hold on; 51 | h = waitbar(0,'please wait'); 52 | for i = 1:epoch 53 | % update action network 54 | actor_target = zeros(1,size(x_train,2)); 55 | if i ~= 1 56 | for j = 1:size(x_train,2) 57 | x = x_train(:,j); 58 | if x == zeros(state_dim,1) 59 | ut = zeros(control_dim,1); 60 | else 61 | objective = @(u) cost_function(x,u) + critic(controlled_system(x,u)); 62 | u0 = actor(x); 63 | ut = fminunc(objective,u0); 64 | actor_target(j) = ut; 65 | end 66 | end 67 | end 68 | 69 | actor = train(actor, x_train, actor_target); 70 | 71 | % update critic network 72 | if i == 1 73 | critic_target = cost_function(x_train,0); 74 | else 75 | x_next = controlled_system(x_train,actor(x_train)); 76 | critic_target = cost_function(x_train,(actor(x_train))) + critic_last(x_next); 77 | end 78 | for j = 1:size(zeros_index,2) 79 | critic_target(:,zeros_index(j)) = zeros(1,1); 80 | end 81 | critic = train(critic,[x_train,-x_train],[critic_target,critic_target]); 82 | 83 | if i ~= 1 && mse(critic(x_train),critic_last(x_train)) <= tol 84 | critic_set{i} = critic; 85 | actor_set{i} = actor; 86 | break; 87 | end 88 | 89 | critic_last = critic; 90 | 91 | critic_set{i} = critic; 92 | actor_set{i} = actor; 93 | 94 | performance_index(i+1) = critic(x0); 95 | figure(1),plot(i,performance_index(i+1),'*') 96 | waitbar(i/epoch,h,['Training controller...',num2str(i/epoch*100),'%']); 97 | end 98 | close(h); 99 | 100 | figure(1), 101 | xlabel('Iterations'); 102 | ylabel('Iterative $V(x_0)$','Interpreter','latex'); 103 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 104 | grid on; 105 | hold off; 106 | 107 | save training_results/actor_critic critic_set actor_set critic actor 108 | 109 | end 110 | 111 | %--------------------------- outpout of system ---------------------------- 112 | function y = controlled_system(x,u) 113 | % system matrices 114 | global A; global B; 115 | y = A*x + B*u; % dot product should be adopt in nolinear systems 116 | end 117 | %----------------------------- cost function ------------------------------ 118 | function y = cost_function(x,u) 119 | global Q; global R; 120 | y = (diag(x'*Q*x) + diag(u'*R*u))'; 121 | end 122 | 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /value_iteration/vi_state_data.m: -------------------------------------------------------------------------------- 1 | %-------------------------- generate training data ------------------------ 2 | clear; close all; clc; 3 | 4 | A = [0,0.1; 0.3, -1]; B = [0;0.5]; 5 | Q = 1*eye(2); R = 1; 6 | control_dim = 1; 7 | state_dim = 2; 8 | x0 = [1;-1]; 9 | 10 | x_train = zeros(state_dim,1); 11 | u_train = zeros(control_dim,1); 12 | 13 | for i = 1:50 14 | x_train = [x_train, zeros(state_dim,1)]; 15 | x_train = [x_train,4*(rand(state_dim,1)-0.5)]; % [-2 2] 16 | x_train = [x_train,2*(rand(state_dim,1)-0.5)]; % [-1 1] 17 | x_train = [x_train,1*(rand(state_dim,1)-0.5)]; % [-0.5 0.5] 18 | x_train = [x_train,0.2*(rand(state_dim,1)-0.5)]; % [-0.1 0.1] 19 | u_train = [u_train,zeros(control_dim,1)]; 20 | u_train = [u_train,4*(rand(control_dim,1)-0.5)]; % [-2 2] 21 | u_train = [u_train,2*(rand(control_dim,1)-0.5)]; % [-1 1] 22 | u_train = [u_train,1*(rand(control_dim,1)-0.5)]; % [-0.5 0.5] 23 | u_train = [u_train,0.2*(rand(control_dim,1)-0.5)]; % [-0.1 0.1] 24 | end 25 | 26 | r = randperm(size(x_train,2)); % randomization according column 27 | x_train = x_train(:,r); % reorder 28 | [~,n] = find(x_train == zeros(state_dim,1)); 29 | 30 | zeros_index = []; 31 | for i = 1:size(n,1)/state_dim 32 | zeros_index = [zeros_index, n(state_dim*i)]; 33 | end 34 | 35 | save training_data/state_data x_train zeros_index x0 control_dim state_dim A B Q R 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /value_iteration/vi_validation.m: -------------------------------------------------------------------------------- 1 | function vi_validation 2 | %-------------- validate training results -------------- 3 | clear; close all; clc; 4 | 5 | % information of system & cost function 6 | global A; global B; global Q; global R; 7 | 8 | load training_data/state_data.mat 9 | load training_results/actor_critic.mat 10 | 11 | [Kopt, Popt] = dlqr(A,B,Q,R); 12 | 13 | Fsamples = 50; 14 | 15 | x0 = [1;-1]; 16 | x = x0; 17 | x_net = x0; 18 | xx = x.*ones(length(x0),Fsamples+1); 19 | xx_net = x_net.*ones(length(x0),Fsamples+1); 20 | uu_opt = zeros(control_dim,Fsamples); 21 | uu_net = zeros(control_dim,Fsamples); 22 | 23 | Jreal = 0; 24 | 25 | for k = 1:Fsamples 26 | u_opt = -Kopt*x; 27 | x = controlled_system(x,u_opt); 28 | uu_opt(:,k) = u_opt; 29 | xx(:,k+1) = x; 30 | 31 | u_net = actor(x_net); 32 | Jreal = Jreal + x_net'*Q*x_net + u_net'*R*u_net; 33 | x_net = controlled_system(x_net,u_net); 34 | uu_net(:,k) = u_net; 35 | xx_net(:,k+1) = x_net; 36 | end 37 | 38 | Jopt = x0'*Popt*x0 39 | Jnet = critic(x0) 40 | Jreal 41 | 42 | figure(1) 43 | subplot(2,1,1) 44 | plot(0:Fsamples,xx(1,:),'r',0:Fsamples,xx_net(1,:),'b--','linewidth',1) 45 | legend('Optimal','Action network'); 46 | ylabel('$x_1$','Interpreter','latex'); 47 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 48 | grid on; 49 | subplot(2,1,2) 50 | plot(0:Fsamples,xx(2,:),'r',0:Fsamples,xx_net(2,:),'b--','linewidth',1) 51 | legend('Optimal','Action network'); 52 | xlabel('Time steps'); 53 | ylabel('$x_2$','Interpreter','latex'); 54 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 55 | grid on; 56 | 57 | figure(2) 58 | plot(0:Fsamples-1,uu_opt,'r',0:Fsamples-1,uu_net,'b--','linewidth',1) 59 | legend('Optimal','Action network'); 60 | xlabel('Time steps'); 61 | ylabel('Controls'); 62 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 63 | grid on; 64 | 65 | end 66 | 67 | %--------------------------- outpout of system ---------------------------- 68 | function y = controlled_system(x,u) 69 | % system matrices 70 | global A; global B; 71 | y = A*x + B*u; % dot product should be adopt in nolinear systems 72 | end 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /value_iteration_positive_semi_definite_initial_value_function/vi_controlled_system.m: -------------------------------------------------------------------------------- 1 | function x_next = vi_controlled_system(x,u,A,B) 2 | %--------------------------- controlled system ---------------------------- 3 | x_next = A*x + B*u; 4 | end -------------------------------------------------------------------------------- /value_iteration_positive_semi_definite_initial_value_function/vi_cost_function.m: -------------------------------------------------------------------------------- 1 | function y = vi_cost_function(x,u,Q,R) 2 | %----------------------------- cost function ------------------------------ 3 | y = (diag(x'*Q*x) + diag(u'*R*u))'; 4 | end -------------------------------------------------------------------------------- /value_iteration_positive_semi_definite_initial_value_function/vi_pf_actor.m: -------------------------------------------------------------------------------- 1 | function y = vi_pf_actor(x) 2 | y = [ x(1,:); x(2,:) ]; 3 | end -------------------------------------------------------------------------------- /value_iteration_positive_semi_definite_initial_value_function/vi_pf_critic.m: -------------------------------------------------------------------------------- 1 | function y = vi_pf_critic(x) 2 | y = [ x(1,:).^2; x(1,:).*x(2,:); x(2,:).^2; ]; 3 | end -------------------------------------------------------------------------------- /value_iteration_positive_semi_definite_initial_value_function/vi_pf_main.m: -------------------------------------------------------------------------------- 1 | % This demo checks the feasibility value iteration adaptive dynamic 2 | % programming using an arbitrary positive semi-definite function as 3 | % the initial value function 4 | 5 | % By J. Lu 6 | % Date: May 6, 2020 7 | 8 | %-------------------------------- start ----------------------------------- 9 | clear; close all; clc; 10 | 11 | % utilize vi_state_data.m to generate training data and then run this file 12 | load state_data 13 | 14 | % the value function of linear systems can be approximated using quadratic 15 | % form and eigenvalues of its corresponding matrix are 3.9645 and 11.0355 16 | weights_critic_ini = 10*[1;-0.5;0.5]; 17 | 18 | % eigenvalues can be computed by 19 | % P_ini = [ weights_critic_ini(1) weights_critic_ini(2)/2; weights_critic_ini(2)/2 weights_critic_ini(3) ] 20 | % eig(P_ini) 21 | 22 | weights_critic = weights_critic_ini; 23 | weights_actor = zeros(2,1); 24 | 25 | xV = vi_pf_critic(x_train); 26 | xV_minus = vi_pf_critic(-x_train); 27 | xu = vi_pf_actor(x_train); 28 | 29 | epoch = 20; 30 | performance_index = zeros(1,epoch + 1); 31 | 32 | performance_index(1) = weights_critic'*vi_pf_critic(x0); 33 | 34 | weights_critic_set = weights_critic; 35 | weights_actor_set = []; 36 | 37 | figure(1),plot(0,performance_index(1),'*'),hold on; 38 | h = waitbar(0,'please wait'); 39 | for i = 1:epoch 40 | % update action network 41 | % by optimal tool 42 | if i ~= 1 43 | actor_target = zeros(control_dim,size(x_train,2)); 44 | for j = 1:size(x_train,2) 45 | x = x_train(:,j); 46 | if x == zeros(state_dim,1) 47 | ut = zeros(control_dim,1); 48 | else 49 | objective = @(u) vi_cost_function(x,u,Q,R) + ... 50 | weights_critic'*vi_pf_critic(vi_controlled_system(x,u,A,B)); 51 | u0 = weights_actor'*vi_pf_actor(x); 52 | ut = fminunc(objective,u0); 53 | end 54 | actor_target(:,j) = ut; 55 | end 56 | weights_actor = xu'\actor_target'; 57 | end 58 | 59 | % update critic network 60 | if i == 1 61 | critic_target = vi_cost_function(x_train,0,Q,R)'; 62 | else 63 | x_next = vi_controlled_system(x_train,weights_actor'*xu,A,B); 64 | critic_target = vi_cost_function(x_train,weights_actor'*xu,Q,R)'+ ... 65 | (weights_critic'*vi_pf_critic(x_next))'; 66 | end 67 | weights_critic = [ xV xV_minus]'\[ critic_target; critic_target]; 68 | 69 | weights_actor_set = [weights_actor_set weights_actor]; 70 | weights_critic_set = [weights_critic_set weights_critic]; 71 | performance_index(i+1) = weights_critic'*vi_pf_critic(x0); 72 | figure(1),plot(i,performance_index(i+1),'*'),hold on; 73 | waitbar(i/epoch,h,['Training controller...',num2str(i/epoch*100),'%']); 74 | end 75 | close(h); 76 | 77 | % check results 78 | P_ini = [ weights_critic_ini(1) weights_critic_ini(2)/2; weights_critic_ini(2)/2 weights_critic_ini(3) ] 79 | 80 | K_final = -weights_actor' 81 | P_final = [ weights_critic(1) weights_critic(2)/2; weights_critic(2)/2 weights_critic(3) ] 82 | 83 | [Kopt, Popt] = dlqr(A,B,Q,R) 84 | 85 | 86 | figure(1), 87 | xlabel('Iterations'); 88 | ylabel('Iterative $V(x_0)$','Interpreter','latex'); 89 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 90 | grid on; 91 | hold off; 92 | 93 | figure(2), 94 | plot((1:length(weights_critic_set))-1,weights_critic_set,'linewidth',1) 95 | xlabel('Iterations'); 96 | ylabel('Critic NN weights'); 97 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 98 | grid on; 99 | 100 | figure(3), 101 | plot((1:length(weights_actor_set))-1,weights_actor_set,'linewidth',1) 102 | xlabel('Iterations'); 103 | ylabel('Action NN weights'); 104 | set(gca,'FontName','Times New Roman','FontSize',14,'linewidth',1); 105 | grid on; 106 | 107 | 108 | -------------------------------------------------------------------------------- /value_iteration_positive_semi_definite_initial_value_function/vi_state_data.m: -------------------------------------------------------------------------------- 1 | %-------------------------- generate training data ------------------------ 2 | clear; close all; clc; 3 | 4 | A = [0,0.1; 0.3, -1]; B = [0;0.5]; 5 | Q = 1*eye(2); R = 1; 6 | control_dim = 1; 7 | state_dim = 2; 8 | x0 = [1;-1]; 9 | 10 | Jcompression = 1; 11 | ucompression = 1; 12 | 13 | x_train = zeros(state_dim,1); 14 | u_train = zeros(control_dim,1); 15 | 16 | for i = 1:30 17 | x_train = [x_train, zeros(state_dim,1)]; 18 | x_train = [x_train,4*(rand(state_dim,1)-0.5)]; % [-2 2] 19 | x_train = [x_train,2*(rand(state_dim,1)-0.5)]; % [-1 1] 20 | x_train = [x_train,1*(rand(state_dim,1)-0.5)]; % [-0.5 0.5] 21 | x_train = [x_train,0.2*(rand(state_dim,1)-0.5)]; % [-0.1 0.1] 22 | u_train = [u_train,zeros(control_dim,1)]; 23 | u_train = [u_train,4*(rand(control_dim,1)-0.5)]; % [-2 2] 24 | u_train = [u_train,2*(rand(control_dim,1)-0.5)]; % [-1 1] 25 | u_train = [u_train,1*(rand(control_dim,1)-0.5)]; % [-0.5 0.5] 26 | u_train = [u_train,0.2*(rand(control_dim,1)-0.5)]; % [-0.1 0.1] 27 | end 28 | 29 | r = randperm(size(x_train,2)); % randomization according column 30 | x_train = x_train(:,r); % reorder 31 | [~,n] = find(x_train == zeros(state_dim,1)); 32 | 33 | zeros_index = []; 34 | for i = 1:size(n,1)/state_dim 35 | zeros_index = [zeros_index, n(state_dim*i)]; 36 | end 37 | 38 | ru = randperm(size(u_train,2)); % randomization according column 39 | u_train = u_train(:,ru); % reorder 40 | 41 | xu_train = [x_train; u_train]; 42 | x_next_train = A*x_train + B*u_train; 43 | 44 | save state_data x_train u_train xu_train x_next_train zeros_index x0 control_dim state_dim A B Q R 45 | 46 | 47 | 48 | 49 | --------------------------------------------------------------------------------