├── .gitattributes
├── Chap1-DynamicProgramming
│   ├── createSimpleGW.m
│   ├── policy_evaluation_exercise.m
│   ├── policy_evaluation_solution.m
│   ├── policy_iteration_exercise.m
│   ├── policy_iteration_solution.m
│   ├── value_iteration_exercise.m
│   └── value_iteration_solution.m
├── LICENSE
└── README.md

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

--------------------------------------------------------------------------------
/Chap1-DynamicProgramming/createSimpleGW.m:
--------------------------------------------------------------------------------
function gw = createSimpleGW()
% gw = createSimpleGW() creates the gridworld in Example 4.1 of
% Reinforcement Learning: An Introduction - Sutton and Barto

% 4-by-4 grid world.
% Actions that would take the agent off the grid leave its location unchanged.
% The reward is -1 on all transitions until the terminal state is reached.
% 2 terminal states: "[1,1]" and "[4,4]"

gw = createGridWorld(4,4);
% set up initial state
gw.CurrentState = "[2,2]";
% set up terminal states
gw.TerminalStates = ["[1,1]" "[4,4]"];
% number of states
nS = numel(gw.States);
% number of possible actions
nA = numel(gw.Actions);

% cannot move from terminal states
gw.T(state2idx(gw,gw.TerminalStates),:,:) = 0;

% set reward transitions (-1 for every state, action, next state)
gw.R = repmat(-1*ones(nS),[1 1 nA]);
end
--------------------------------------------------------------------------------
/Chap1-DynamicProgramming/policy_evaluation_exercise.m:
--------------------------------------------------------------------------------
close all; clear; clc

%% Set up GridWorld environment Sutton-Barto 4.1
gw = createSimpleGW();
% number of states
nS = numel(gw.States);
% number of possible actions
nA = numel(gw.Actions);

%% Random policy to be evaluated
% from each state, equally likely to take any action
randomPolicy = (1/nA)*ones(nS,nA);

%% Policy Evaluation
discountFactor = 1;
tol = 1e-3;
nIteration = 200;
[vTable,fcnCount] = policy_evaluation(randomPolicy, gw, discountFactor, nIteration, tol);

%% Display State value function
vTable = reshape(vTable,gw.GridSize);
fprintf('Policy Evaluation converged after %d iterations;\n',fcnCount);
disp(vTable)

function [vTable,fcnCount] = policy_evaluation(policy, env, discountFactor, nIteration, tol)
% [vTable,fcnCount] = policy_evaluation(policy, env, gamma, nIteration, tol) evaluates a policy on an MDP environment
% nS, nA: number of states, actions
%
% Inputs:
% - policy: nS-by-nA matrix, each element stores the probability of taking
%   an action from a given state
% - env: grid world environment
% - discountFactor: discount factor (aka gamma)
% - nIteration: maximum number of iterations before returning
% - tol: stop evaluation once the value function change is less than tol for all
%   states
%
% Outputs:
% - vTable: state value function V(s), nS-by-1
% - fcnCount: iteration counter

%% Set up useful variables
nS = numel(env.States);
nA = numel(env.Actions);

%% Initialization V(s) = 0
vTable = zeros(nS,1);
fcnCount = 1;

%% Begin Iteration
error('ENTER YOUR CODE HERE (╯°□°)╯︵ ┻━┻')
end
--------------------------------------------------------------------------------
/Chap1-DynamicProgramming/policy_evaluation_solution.m:
--------------------------------------------------------------------------------
close all; clear; clc

%% Set up GridWorld environment Sutton-Barto 4.1
gw = createSimpleGW();
% number of states
nS = numel(gw.States);
% number of possible actions
nA = numel(gw.Actions);

%% Random policy to be evaluated
% from each state, equally likely to take any action
randomPolicy = (1/nA)*ones(nS,nA);

%% Policy Evaluation
discountFactor = 1;
tol = 1e-3;
nIteration = 200;
[vTable,fcnCount] = policy_evaluation(randomPolicy, gw, discountFactor, nIteration, tol);
% display State value function
vTable = reshape(vTable,gw.GridSize);
fprintf('Policy Evaluation converged after %d iterations;\n',fcnCount);
disp(vTable)

function [vTable,fcnCount] = policy_evaluation(policy, env, discountFactor, nIteration, tol)
% [vTable,fcnCount] = policy_evaluation(policy, env, gamma, nIteration, tol) evaluates a policy on an MDP environment
% nS, nA: number of states, actions
%
% Inputs:
% - policy: nS-by-nA matrix, each element stores the probability of taking
%   an action from a given state
% - env: grid world environment
% - discountFactor: discount factor (aka gamma)
% - nIteration: maximum number of iterations before returning
% - tol: stop evaluation once the value function change is less than tol for all
%   states
%
% Outputs:
% - vTable: state value function V(s), nS-by-1
% - fcnCount: iteration counter

%% Set up useful variables
nS = numel(env.States);
nA = numel(env.Actions);

%% Initialization V(s) = 0
vTable = zeros(nS,1);
terminate = false;
fcnCount = 1;

%% Begin Iteration
while (~terminate) && (fcnCount < nIteration)
    delta = 0;
    for stateIdx = 1:nS
        tempStateValue = 0;
        for actionIdx = 1:nA
            % pi(a|s)
            actionProbability = policy(stateIdx,actionIdx);

            % get transition probability p(s',r|s,a) and r from the state transition matrices
            transitionProb = env.T(stateIdx,:,actionIdx);
            reward = env.R(stateIdx,:,actionIdx)';

            % Bellman equation for state value function V(s) update
            tempStateValue = tempStateValue + ...
                actionProbability*(transitionProb*(reward + discountFactor*vTable));
        end

        % find how much change in V(s)
        delta = max(delta,(abs(tempStateValue - vTable(stateIdx))));

        % update V(s)
        vTable(stateIdx) = tempStateValue;
    end

    % terminate if change in V(s) is less than tolerance
    if delta < tol
        terminate = true;
    end

    % increment counter
    fcnCount = fcnCount + 1;
end
end
--------------------------------------------------------------------------------
/Chap1-DynamicProgramming/policy_iteration_exercise.m:
--------------------------------------------------------------------------------
close all; clear; clc

%% Set up GridWorld environment Sutton-Barto 4.1
gw = createSimpleGW();
% number of states
nS = numel(gw.States);
% number of possible actions
nA = numel(gw.Actions);

%% Policy Iteration
discountFactor = 1;
tol = 1e-3;
nIteration = 200;
[vTable, optimalDeterministicPolicy, fcnCount] = policy_iteration(gw, discountFactor, nIteration, tol);
% display State value function
vTable = reshape(vTable,gw.GridSize);
fprintf('Policy Iteration converged after %d iterations;\n',fcnCount);
disp(vTable)

%% Local Functions
function [vTable, optimalDeterministicPolicy, fcnCount] = policy_iteration(env, discountFactor, nIteration, tol)
% [vTable,optimalDeterministicPolicy,fcnCount] = policy_iteration(env, discountFactor, nIteration, tol)
% finds an optimal policy and state value function for an MDP environment with the Policy Iteration algorithm
% nS, nA: number of states, actions
%
% Inputs:
% - env: grid world environment
% - discountFactor: discount factor (aka gamma)
% - nIteration: maximum number of iterations before returning
% - tol: stop evaluation once the value function change is less than tol for
%   all states
%
% Outputs:
% - vTable: state value function V(s), nS-by-1
% - optimalDeterministicPolicy: nS-by-nA probability matrix, 1 for the best
%   action at a given state, 0 otherwise
% - fcnCount: iteration counter

%% Set up useful variables
nS = numel(env.States);
nA = numel(env.Actions);

%% Initialization V, policy
vTableTemp = zeros(nS,1);
optimalDeterministicPolicy = ones(nS,nA)/nA;
policyStable = false;
fcnCount = 1;

%% Begin iteration
error('ENTER YOUR CODE HERE (╯°□°)╯︵ ┻━┻')

end

function [vTable,fcnCount] = policy_evaluation(policy, env, discountFactor, nIteration, tol)
% [vTable,fcnCount] = policy_evaluation(policy, env, gamma, nIteration, tol) evaluates a policy on an MDP environment
% nS, nA: number of states, actions
%
% Inputs:
% - policy: nS-by-nA matrix, each element stores the probability of taking
%   an action from a given state
% - env: grid world environment
% - discountFactor: discount factor (aka gamma)
% - nIteration: maximum number of iterations before returning
% - tol: stop evaluation once the value function change is less than tol for all
%   states
%
% Outputs:
% - vTable: state value function V(s), nS-by-1
% - fcnCount: iteration counter

%% Set up useful variables
nS = numel(env.States);
nA = numel(env.Actions);

%% Initialization V(s) = 0
vTable = zeros(nS,1);
terminate = false;
fcnCount = 1;

%% Begin Iteration
while (~terminate) && (fcnCount < nIteration)
    delta = 0;
    for stateIdx = 1:nS
        tempStateValue = 0;
        for actionIdx = 1:nA
            % pi(a|s)
            actionProbability = policy(stateIdx,actionIdx);

            % get transition probability p(s',r|s,a) and r from the state transition matrices
            transitionProb = env.T(stateIdx,:,actionIdx);
            reward = env.R(stateIdx,:,actionIdx)';

            % Bellman equation for state value function V(s) update
            tempStateValue = tempStateValue + ...
                actionProbability*(transitionProb*(reward + discountFactor*vTable));
        end

        % find how much change in V(s)
        delta = max(delta,(abs(tempStateValue - vTable(stateIdx))));

        % update V(s)
        vTable(stateIdx) = tempStateValue;
    end

    % terminate if change in V(s) is less than tolerance
    if delta < tol
        terminate = true;
    end

    % increment counter
    fcnCount = fcnCount + 1;
end
end
--------------------------------------------------------------------------------
/Chap1-DynamicProgramming/policy_iteration_solution.m:
--------------------------------------------------------------------------------
close all; clear; clc

%% Set up GridWorld environment Sutton-Barto 4.1
gw = createSimpleGW();
% number of states
nS = numel(gw.States);
% number of possible actions
nA = numel(gw.Actions);

%% Policy Iteration
discountFactor = 1;
tol = 1e-3;
nIteration = 200;
[vTable, optimalDeterministicPolicy, fcnCount] = policy_iteration(gw, discountFactor, nIteration, tol);
% display State value function
vTable = reshape(vTable,gw.GridSize);
fprintf('Policy Iteration converged after %d iterations;\n',fcnCount);
disp(vTable)

%% Local Functions
function [vTable, optimalDeterministicPolicy, fcnCount] = policy_iteration(env, discountFactor, nIteration, tol)
% [vTable,optimalDeterministicPolicy,fcnCount] = policy_iteration(env, discountFactor, nIteration, tol)
% finds an optimal policy and state value function for an MDP environment with the Policy Iteration algorithm
% nS, nA: number of states, actions
%
% Inputs:
% - env: grid world environment
% - discountFactor: discount factor (aka gamma)
% - nIteration: maximum number of iterations before returning
% - tol: stop evaluation once the value function change is less than tol for
%   all states
%
% Outputs:
% - vTable: state value function V(s), nS-by-1
% - optimalDeterministicPolicy: nS-by-nA probability matrix, 1 for the best
%   action at a given state, 0 otherwise
% - fcnCount: iteration counter

%% Set up useful variables
nS = numel(env.States);
nA = numel(env.Actions);

%% Initialization V, policy
vTableTemp = zeros(nS,1);
optimalDeterministicPolicy = ones(nS,nA)/nA;
policyStable = false;
fcnCount = 1;

%% Begin iteration
while ~policyStable && (fcnCount < nIteration)

    %% Policy Evaluation
    vTable = policy_evaluation(optimalDeterministicPolicy, env, discountFactor, nIteration, tol);

    %% Policy Improvement
    newPolicy = zeros(nS,nA);

    % for each state s
    for stateIdx = 1:nS
        vTableTempPerState = zeros(nA,1);
        for actionIdx = 1:nA
            % get reward and transition probability from the state transition matrices
            transitionProb = env.T(stateIdx,:,actionIdx);
            reward = env.R(stateIdx,:,actionIdx)';

            % Bellman equation for state value function (individual action)
            vTableTempPerState(actionIdx) = transitionProb*(reward + discountFactor*vTable);
        end

        % Choose the optimal deterministic action
        [vTableTemp(stateIdx),bestAction] = max(vTableTempPerState);
        newPolicy(stateIdx,bestAction) = 1;
    end

    % if the policy does not change after an iteration, terminate
    if isequal(newPolicy, optimalDeterministicPolicy)
        policyStable = true;
        vTable = vTableTemp;
    end

    % update for next iteration
    optimalDeterministicPolicy = newPolicy;
    fcnCount = fcnCount + 1;
end

end

function [vTable,fcnCount] = policy_evaluation(policy, env, discountFactor, nIteration, tol)
% [vTable,fcnCount] = policy_evaluation(policy, env, gamma, nIteration, tol) evaluates a policy on an MDP environment
% nS, nA: number of states, actions
%
% Inputs:
% - policy: nS-by-nA matrix, each element stores the probability of taking
%   an action from a given state
% - env: grid world environment
% - discountFactor: discount factor (aka gamma)
% - nIteration: maximum number of iterations before returning
% - tol: stop evaluation once the value function change is less than tol for all
%   states
%
% Outputs:
% - vTable: state value function V(s), nS-by-1
% - fcnCount: iteration counter

%% Set up useful variables
nS = numel(env.States);
nA = numel(env.Actions);

%% Initialization V(s) = 0
vTable = zeros(nS,1);
terminate = false;
fcnCount = 1;

%% Begin Iteration
while (~terminate) && (fcnCount < nIteration)
    delta = 0;
    for stateIdx = 1:nS
        tempStateValue = 0;
        for actionIdx = 1:nA
            % pi(a|s)
            actionProbability = policy(stateIdx,actionIdx);

            % get transition probability p(s',r|s,a) and r from the state transition matrices
            transitionProb = env.T(stateIdx,:,actionIdx);
            reward = env.R(stateIdx,:,actionIdx)';

            % Bellman equation for state value function V(s) update
            tempStateValue = tempStateValue + ...
                actionProbability*(transitionProb*(reward + discountFactor*vTable));
        end

        % find how much change in V(s)
        delta = max(delta,(abs(tempStateValue - vTable(stateIdx))));

        % update V(s)
        vTable(stateIdx) = tempStateValue;
    end

    % terminate if change in V(s) is less than tolerance
    if delta < tol
        terminate = true;
    end

    % increment counter
    fcnCount = fcnCount + 1;
end
end
--------------------------------------------------------------------------------
/Chap1-DynamicProgramming/value_iteration_exercise.m:
--------------------------------------------------------------------------------
close all; clear; clc

%% Set up GridWorld environment Sutton-Barto 4.1
gw = createSimpleGW();
% number of states
nS = numel(gw.States);
% number of possible actions
nA = numel(gw.Actions);

%% Value Iteration
discount_factor = 1;
tol = 1e-3;
nIteration = 200;
[vTable, optimalDeterministicPolicy, fcnCount] = value_iteration(gw, discount_factor, nIteration, tol);

%% Display State value function
vTable = reshape(vTable,gw.GridSize);
fprintf('Value Iteration converged after %d iterations;\n',fcnCount);
disp(vTable)

function [vTable, optimalDeterministicPolicy, fcnCount] = value_iteration(env, discountFactor, nIteration, tol)
% [vTable,optimalDeterministicPolicy,fcnCount] = value_iteration(env, gamma, nIteration, tol)
% finds an optimal policy and state value function for an MDP environment with the Value Iteration algorithm
% nS, nA: number of states, actions
%
% Inputs:
% - env: grid world environment
% - discountFactor: discount factor (aka gamma)
% - nIteration: maximum number of iterations before returning
% - tol: stop evaluation once the value function change is less than tol for
%   all states
%
% Outputs:
% - vTable: state value function V(s), nS-by-1
% - optimalDeterministicPolicy: nS-by-nA probability matrix, 1 for the best
%   action at a given state, 0 otherwise
% - fcnCount: iteration counter

%% Set up useful variables
nS = numel(env.States);
nA = numel(env.Actions);

%% Initialization V
vTable = zeros(nS,1);
terminate = false;
fcnCount = 1;

%% Begin iteration
error('ENTER YOUR CODE HERE (╯°□°)╯︵ ┻━┻')

%% Do 1 more iteration to get the optimal deterministic policy
optimalDeterministicPolicy = zeros(nS,nA);
error('ENTER YOUR CODE HERE (╯°□°)╯︵ ┻━┻')

end
--------------------------------------------------------------------------------
/Chap1-DynamicProgramming/value_iteration_solution.m:
--------------------------------------------------------------------------------
close all; clear; clc

%% Set up GridWorld environment Sutton-Barto 4.1
gw = createSimpleGW();
% number of states
nS = numel(gw.States);
% number of possible actions
nA = numel(gw.Actions);

%% Value Iteration
discount_factor = 1;
tol = 1e-3;
nIteration = 200;
[vTable, optimalDeterministicPolicy, fcnCount] = value_iteration(gw, discount_factor, nIteration, tol);

%% Display State value function
vTable = reshape(vTable,gw.GridSize);
fprintf('Value Iteration converged after %d iterations;\n',fcnCount);
disp(vTable)

function [vTable, optimalDeterministicPolicy, fcnCount] = value_iteration(env, discountFactor, nIteration, tol)
% [vTable,optimalDeterministicPolicy,fcnCount] = value_iteration(env, gamma, nIteration, tol)
% finds an optimal policy and state value function for an MDP environment with the Value Iteration algorithm
% nS, nA: number of states, actions
%
% Inputs:
% - env: grid world environment
% - discountFactor: discount factor (aka gamma)
% - nIteration: maximum number of iterations before returning
% - tol: stop evaluation once the value function change is less than tol for
%   all states
%
% Outputs:
% - vTable: state value function V(s), nS-by-1
% - optimalDeterministicPolicy: nS-by-nA probability matrix, 1 for the best
%   action at a given state, 0 otherwise
% - fcnCount: iteration counter

%% Set up useful variables
nS = numel(env.States);
nA = numel(env.Actions);

%% Initialization V
vTable = zeros(nS,1);
terminate = false;
fcnCount = 1;

%% Begin iteration
while (~terminate) && (fcnCount < nIteration)
    vTableTemp = zeros(nS,1);
    % For each state s
    for stateIdx = 1:nS
        stateValueTemp = zeros(nA,1);
        for actionIdx = 1:nA
            % get reward and transition probability from the state transition matrices
            transitionProb = env.T(stateIdx,:,actionIdx);
            reward = env.R(stateIdx,:,actionIdx)';

            % Bellman equation for state value function (individual action)
            stateValueTemp(actionIdx) = transitionProb*(reward + discountFactor*vTable);
        end
        % Choose the best value
        vTableTemp(stateIdx) = max(stateValueTemp);
    end

    % terminate if change in V(s) is less than tolerance
    if sum(abs(vTableTemp(:)-vTable(:))) < tol
        terminate = true;
    end

    % update for next iteration
    vTable = vTableTemp;
    fcnCount = fcnCount + 1;
end

%% Do 1 more iteration to get the optimal deterministic policy
optimalDeterministicPolicy = zeros(nS,nA);
for stateIdx = 1:nS
    stateValueTemp = zeros(nA,1);
    for actionIdx = 1:nA
        % get reward and transition probability from the state transition matrices
        transitionProb = env.T(stateIdx,:,actionIdx);
        reward = env.R(stateIdx,:,actionIdx)';

        % Bellman equation for state value function (individual action)
        stateValueTemp(actionIdx) = transitionProb*(reward + discountFactor*vTable);
    end

    % set the best action probability to 1
    [~,bestActionIdx] = max(stateValueTemp);
    optimalDeterministicPolicy(stateIdx,bestActionIdx) = 1;
end

end
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 anhtran1995

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# RL Course MATLAB

This repository provides complementary coding exercises and solutions for the [RL Learning Roadmap][11]. The coding exercise format is based on the awesome [WildML Learning Reinforcement Learning course][3] by [Denny Britz][6]. Exercises focus on implementing algorithms (the meat of RL).

Since many RL courses in the community are in Python, I created these coding exercises for MATLAB users. If you don't use MATLAB but want to follow the [RL Learning Roadmap][11], rework these exercises in your favorite framework.

This is not an official MathWorks product. For online courses from MathWorks, see https://matlabacademy.mathworks.com/.

## Table of Contents
The coding exercises require the [Reinforcement Learning Toolbox](https://www.mathworks.com/products/reinforcement-learning.html), but you can always reimplement the environment from scratch (see the sketch after the chapter list).

• Chapter 1 - Dynamic Programming<br>
• Chapter 2 - Temporal-Difference (TD) Learning (WIP)<br>
• Chapter 3 - Function Approximation (WIP)<br>
• Chapter 4 - Policy Gradient (WIP)<br>
• Chapter 5 - Advanced Policy Gradient (WIP)<br>
• Chapter 6 - Partially Observable Environment (WIP)<br>
• Chapter 7 - Model-based (WIP)<br>
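
If you prefer to skip the toolbox, the sketch below is one way to build the Example 4.1 gridworld as plain arrays. It is a minimal sketch, not part of this repo: the function name `createSimpleGWFromScratch`, the plain-struct field layout, and the column-major state indexing are assumptions made here for illustration.

```matlab
function gw = createSimpleGWFromScratch()
% Minimal stand-in for createSimpleGW() without Reinforcement Learning Toolbox.
% The struct only carries the fields the Chapter 1 scripts actually read:
% States, Actions, T, R, GridSize.
nRow = 4; nCol = 4;
nS = nRow*nCol;                                   % linear state index (column-major)
actions = ["N";"S";"E";"W"];
nA = numel(actions);
terminalIdx = [sub2ind([nRow nCol],1,1), sub2ind([nRow nCol],4,4)];  % "[1,1]" and "[4,4]"

% deterministic transitions: T(s,s',a) = p(s'|s,a)
T = zeros(nS,nS,nA);
for s = 1:nS
    [r,c] = ind2sub([nRow nCol],s);
    candidates = [r-1 c; r+1 c; r c+1; r c-1];    % N, S, E, W
    for a = 1:nA
        rNext = min(max(candidates(a,1),1),nRow); % moving off the grid
        cNext = min(max(candidates(a,2),1),nCol); % leaves the state unchanged
        T(s,sub2ind([nRow nCol],rNext,cNext),a) = 1;
    end
end
T(terminalIdx,:,:) = 0;                           % cannot move from terminal states

% reward of -1 on every transition, as in Example 4.1
R = repmat(-1*ones(nS),[1 1 nA]);

gw.GridSize = [nRow nCol];
gw.States   = (1:nS)';
gw.Actions  = actions;
gw.T        = T;
gw.R        = R;
end
```

With such a stand-in, `gw = createSimpleGWFromScratch();` could replace the `createSimpleGW();` call at the top of each Chapter 1 script; the dynamic-programming exercises only read `gw.States`, `gw.Actions`, `gw.T`, `gw.R`, and `gw.GridSize`, so `CurrentState` and `state2idx` are not needed. Note that the toolbox may order states differently, so treat this only as a starting point.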

## References
Learning materials referenced:

• [Reinforcement Learning Toolbox][12], The MathWorks<br>
• [Reinforcement Learning: An Introduction][2] (textbook), Sutton and Barto<br>
• [Deep Reinforcement Learning][10] (course), UC Berkeley<br>
• [OpenAI Spinning Up][9] (textbook/blog)<br>
• [WildML Learning Reinforcement Learning][3] (Python course with exercises/solutions), Denny Britz<br>
• [MATLAB RL Tech Talks][1] (videos), The MathWorks<br>
• [David Silver’s RL course][4]<br>
• [Simple Reinforcement Learning][7] (blog), Arthur Juliani<br>
• [Deep Learning Specialization Coursera][8] (course), Andrew Ng (you can audit for free; courses 1 and 2 are highly recommended for Deep Learning foundations)<br>

[1]: https://www.mathworks.com/videos/series/reinforcement-learning.html
[2]: http://incompleteideas.net/book/RLbook2018.pdf
[3]: https://github.com/dennybritz/reinforcement-learning
[4]: https://www.davidsilver.uk/teaching/
[5]: https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf
[6]: https://twitter.com/dennybritz?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor
[7]: https://medium.com/@awjuliani
[8]: https://www.coursera.org/specializations/deep-learning
[9]: https://spinningup.openai.com/en/latest/spinningup/rl_intro.html
[10]: http://rail.eecs.berkeley.edu/deeprlcourse/
[11]: https://github.com/anhOfTheStars/RLStudyGuide
[12]: https://www.mathworks.com/products/reinforcement-learning.html

--------------------------------------------------------------------------------