├── .gitattributes
├── Chap1-DynamicProgramming
│   ├── createSimpleGW.m
│   ├── policy_evaluation_exercise.m
│   ├── policy_evaluation_solution.m
│   ├── policy_iteration_exercise.m
│   ├── policy_iteration_solution.m
│   ├── value_iteration_exercise.m
│   └── value_iteration_solution.m
├── LICENSE
└── README.md
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/Chap1-DynamicProgramming/createSimpleGW.m:
--------------------------------------------------------------------------------
1 | function gw = createSimpleGW()
2 | % gw = createSimpleGW() creates the gridworld of Example 4.1 in
3 | % Reinforcement Learning: An Introduction - Sutton and Barto
4 |
5 | % 4-by-4 grid world:
6 | % - actions that would take the agent off the grid leave its location unchanged
7 | % - reward is -1 on all transitions until a terminal state is reached
8 | % - 2 terminal states: "[1,1]" and "[4,4]"
9 |
10 | gw = createGridWorld(4,4);
11 | % set up initial state
12 | gw.CurrentState = "[2,2]";
13 | % set up terminal states
14 | gw.TerminalStates = ["[1,1]" "[4,4]"];
15 | % number of states
16 | nS = numel(gw.States);
17 | % number of possible actions
18 | nA = numel(gw.Actions);
19 |
20 | % cannot move from terminal states
21 | gw.T(state2idx(gw,gw.TerminalStates),:,:) = 0;
22 |
23 | % set reward transitions: -1 for every (state, action, next state) triple
24 | gw.R = repmat(-1*ones(nS),[1 1 nA]);
25 | end
--------------------------------------------------------------------------------
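A minimal usage sketch for createSimpleGW, assuming the Reinforcement Learning Toolbox is available (createGridWorld and state2idx above are toolbox functions); the action names in the comments are the toolbox's standard gridworld moves and are listed only for orientation:

    gw = createSimpleGW();
    disp(gw.States')                        % 16 state names, "[1,1]" ... "[4,4]"
    disp(gw.Actions')                       % standard moves (e.g. "N" "S" "E" "W")
    disp(size(gw.T))                        % 16x16x4 transition probabilities p(s'|s,a)
    disp(size(gw.R))                        % 16x16x4 rewards r(s,a,s'), all -1 here
    disp(state2idx(gw, gw.TerminalStates))  % linear indices of the two terminal states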
/Chap1-DynamicProgramming/policy_evaluation_exercise.m:
--------------------------------------------------------------------------------
1 | close all; clear; clc
2 |
3 | %% Set up GridWorld environment Sutton-Barto 4.1
4 | gw = createSimpleGW();
5 | % number of states
6 | nS = numel(gw.States);
7 | % number of possible actions
8 | nA = numel(gw.Actions);
9 |
10 | %% Random policy to be evaluated
11 | % from each state, equally likely to take any action
12 | randomPolicy = (1/nA)*ones(nS,nA);
13 |
14 | %% Policy Evaluation
15 | discountFactor = 1;
16 | tol = 1e-3;
17 | nIteration = 200;
18 | [vTable,fcnCount] = policy_evaluation(randomPolicy, gw, discountFactor, nIteration, tol);
19 |
20 | %% Display State value function
21 | vTable = reshape(vTable,gw.GridSize);
22 | fprintf('Policy Evaluation converged after %d iterations;\n',fcnCount);
23 | disp(vTable)
24 |
25 | function [vTable,fcnCount] = policy_evaluation(policy, env, discountFactor, nIteration, tol)
26 | % [vTable,fcnCount] = policy_evaluation(policy, env, discountFactor, nIteration, tol) evaluates a policy on an MDP environment
27 | % nS, nA: number of states, actions
28 | %
29 | % Inputs:
30 | % - policy: nS-by-nA matrix, each element stores the probability of taking
31 | % an action from a given state
32 | % - env: grid world environment
33 | % - discountFactor: discount factor (aka gamma)
34 | % - nIteration: maximum number of iterations before returning
35 | % - tol: stop evaluation once value function change is less than tol for all
36 | % states
37 | %
38 | % Outputs:
39 | % - vTable: state value function V(s), nS-by-1
40 | % - fcnCount: iteration counter
41 |
42 | %% Set up useful variables
43 | nS = numel(env.States);
44 | nA = numel(env.Actions);
45 |
46 | %% Initialization: V(s) = 0
47 | vTable = zeros(nS,1);
48 | fcnCount = 1;
49 |
50 | %% Begin Iteration
51 | error('ENTER YOUR CODE HERE (╯°□°)╯︵ ┻━┻')
52 | end
--------------------------------------------------------------------------------
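As a hint for the stub above, the update the iteration loop should implement is the standard iterative policy-evaluation backup, written here in the textbook's notation (\gamma is discountFactor, \pi(a|s) is policy(stateIdx,actionIdx), and p(s'|s,a) and r come from env.T and env.R):

    V_{k+1}(s) = \sum_{a} \pi(a \mid s) \sum_{s'} p(s' \mid s, a) \left[ r(s,a,s') + \gamma V_k(s') \right]

The sweep over all states is repeated until \max_s |V_{k+1}(s) - V_k(s)| < tol or fcnCount reaches nIteration.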
/Chap1-DynamicProgramming/policy_evaluation_solution.m:
--------------------------------------------------------------------------------
1 | close all; clear; clc
2 |
3 | %% Set up GridWorld environment Sutton-Barto 4.1
4 | gw = createSimpleGW();
5 | % number of states
6 | nS = numel(gw.States);
7 | % number of possible actions
8 | nA = numel(gw.Actions);
9 |
10 | %% Random policy to be evaluated
11 | % from each state, equally likely to take any action
12 | randomPolicy = (1/nA)*ones(nS,nA);
13 |
14 | %% Policy Evaluation
15 | discountFactor = 1;
16 | tol = 1e-3;
17 | nIteration = 200;
18 | [vTable,fcnCount] = policy_evaluation(randomPolicy, gw, discountFactor, nIteration, tol);
19 | % display State value function
20 | vTable = reshape(vTable,gw.GridSize);
21 | fprintf('Policy Evaluation converged after %d iterations;\n',fcnCount);
22 | disp(vTable)
23 |
24 | function [vTable,fcnCount] = policy_evaluation(policy, env, discountFactor, nIteration, tol)
25 | % [vTable,fcnCount] = policy_evaluation(policy, env, discountFactor, nIteration, tol) evaluates a policy on an MDP environment
26 | % nS, nA: number of states, actions
27 | %
28 | % Inputs:
29 | % - policy: nS-by-nA matrix, each element stores the probability of taking
30 | % an action from a given state
31 | % - env: grid world environment
32 | % - discountFactor: discount factor (aka gamma)
33 | % - nIteration: maximum number of iterations before returning
34 | % - tol: stop evaluation once value function change is less than tol for all
35 | % states
36 | %
37 | % Outputs:
38 | % - vTable: state value function V(s), nS-by-1
39 | % - fcnCount: iteration counter
40 |
41 | %% Set up useful variables
42 | nS = numel(env.States);
43 | nA = numel(env.Actions);
44 |
45 | %% Initialization: V(s) = 0
46 | vTable = zeros(nS,1);
47 | terminate = false;
48 | fcnCount = 1;
49 |
50 | %% Begin Iteration
51 | while (~terminate) && (fcnCount < nIteration)
52 | delta = 0;
53 | for stateIdx = 1:nS
54 | tempStateValue = 0;
55 | for actionIdx = 1:nA
56 | % pi(a|s)
57 | actionProbability = policy(stateIdx,actionIdx);
58 |
59 | % get transition probability p(s'|s,a) and reward r from the state transition matrices
60 | transitionProb = env.T(stateIdx,:,actionIdx);
61 | reward = env.R(stateIdx,:,actionIdx)';
62 |
63 | % Bellman equation for state value function V(s) update
64 | tempStateValue = tempStateValue + ...
65 | actionProbability*(transitionProb*(reward + discountFactor*vTable));
66 | end
67 |
68 | % find how much change in V(s)
69 | delta = max(delta,(abs(tempStateValue - vTable(stateIdx))));
70 |
71 | % update V(s)
72 | vTable(stateIdx) = tempStateValue;
73 | end
74 |
75 | % terminate if change in V(s) is less than tolerance
76 | if delta < tol
77 | terminate = true;
78 | end
79 |
80 | % increment counter
81 | fcnCount = fcnCount + 1;
82 | end
83 | end
--------------------------------------------------------------------------------
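An optional cross-check sketch, assuming the workspace of the script above (gw, randomPolicy, nS, nA, discountFactor) and MATLAB R2016b+ implicit expansion: because the policy is fixed, V^pi can also be obtained by solving the linear Bellman system directly, and the result should agree with the iterative vTable to within tol.

    Ppi = zeros(nS,nS);   % state-to-state transition matrix under the policy
    Rpi = zeros(nS,1);    % expected one-step reward under the policy
    for a = 1:nA
        Ppi = Ppi + randomPolicy(:,a).*gw.T(:,:,a);
        Rpi = Rpi + randomPolicy(:,a).*sum(gw.T(:,:,a).*gw.R(:,:,a), 2);
    end
    % (I - gamma*Ppi) is invertible here because the terminal states have no
    % outgoing transitions, so the chain is absorbing even with gamma = 1
    vExact = (eye(nS) - discountFactor*Ppi) \ Rpi;
    disp(reshape(vExact, gw.GridSize))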
/Chap1-DynamicProgramming/policy_iteration_exercise.m:
--------------------------------------------------------------------------------
1 | close all; clear; clc
2 |
3 | %% Set up GridWorld environment Sutton-Barto 4.1
4 | gw = createSimpleGW();
5 | % number of states
6 | nS = numel(gw.States);
7 | % number of possible actions
8 | nA = numel(gw.Actions);
9 |
10 | %% Policy Iteration
11 | discountFactor = 1;
12 | tol = 1e-3;
13 | nIteration = 200;
14 | [vTable, optimalDeterministicPolicy, fcnCount] = policy_iteration(gw, discountFactor, nIteration, tol);
15 | % display State value function
16 | vTable = reshape(vTable,gw.GridSize);
17 | fprintf('Policy Iteration converged after %d iterations;\n',fcnCount);
18 | disp(vTable)
19 |
20 | %% Local Functions
21 | function [vTable, optimalDeterministicPolicy, fcnCount] = policy_iteration(env, discountFactor, nIteration, tol)
22 | % [vTable,optimalDeterministicPolicy,fcnCount] = policy_iteration(env, discountFactor, nIteration, tol)
23 | % computes the optimal policy and state value function of an MDP environment with the Policy Iteration algorithm
24 | % nS, nA: number of states, actions
25 | %
26 | % Inputs:
27 | % - env: grid world environment
28 | % - discountFactor: discount factor (aka gamma)
29 | % - nIteration: maximum number of iterations before returning
30 | % - tol: stop evaluation once value function change is less than tol for
31 | % all states
32 | %
33 | % Outputs:
34 | % - vTable: state value function V(s), nS-by-1
35 | % - optimalDeterministicPolicy: nS-by-nA probability matrix; 1 for the best
36 | %   action at a given state, 0 otherwise
37 | % - fcnCount: iteration counter
38 |
39 | %% Set up useful variables
40 | nS = numel(env.States);
41 | nA = numel(env.Actions);
42 |
43 | %% Initialization V, policy
44 | vTableTemp = zeros(nS,1);
45 | optimalDeterministicPolicy = ones(nS,nA)/nA;
46 | policyStable = false;
47 | fcnCount = 1;
48 |
49 | %% Begin iteration
50 | error('ENTER YOUR CODE HERE (╯°□°)╯︵ ┻━┻')
51 |
52 | end
53 |
54 | function [vTable,fcnCount] = policy_evaluation(policy, env, discountFactor, nIteration, tol)
55 | % [vTable,fcnCount] = policy_evaluation(policy, env, discountFactor, nIteration, tol) evaluates a policy on an MDP environment
56 | % nS, nA: number of states, actions
57 | %
58 | % Inputs:
59 | % - policy: nS-by-nA matrix, each element stores the probability of taking
60 | % an action from a given state
61 | % - env: grid world environment
62 | % - discountFactor: discount factor (aka gamma)
63 | % - nIteration: maximum number of iterations before returning
64 | % - tol: stop evaluation once value function change is less than tol for all
65 | % states
66 | %
67 | % Outputs:
68 | % - vTable: state value function V(s), nS-by-1
69 | % - fcnCount: iteration counter
70 |
71 | %% Set up useful variables
72 | nS = numel(env.States);
73 | nA = numel(env.Actions);
74 |
75 | %% Initialization: V(s) = 0
76 | vTable = zeros(nS,1);
77 | terminate = false;
78 | fcnCount = 1;
79 |
80 | %% Begin Iteration
81 | while (~terminate) && (fcnCount < nIteration)
82 | delta = 0;
83 | for stateIdx = 1:nS
84 | tempStateValue = 0;
85 | for actionIdx = 1:nA
86 | % pi(a|s)
87 | actionProbability = policy(stateIdx,actionIdx);
88 |
89 | % get transition probability p(s'|s,a) and reward r from the state transition matrices
90 | transitionProb = env.T(stateIdx,:,actionIdx);
91 | reward = env.R(stateIdx,:,actionIdx)';
92 |
93 | % Bellman equation for state value function V(s) update
94 | tempStateValue = tempStateValue + ...
95 | actionProbability*(transitionProb*(reward + discountFactor*vTable));
96 | end
97 |
98 | % find how much change in V(s)
99 | delta = max(delta,(abs(tempStateValue - vTable(stateIdx))));
100 |
101 | % update V(s)
102 | vTable(stateIdx) = tempStateValue;
103 | end
104 |
105 | % terminate if change in V(s) is less than tolerance
106 | if delta < tol
107 | terminate = true;
108 | end
109 |
110 | % increment counter
111 | fcnCount = fcnCount + 1;
112 | end
113 | end
--------------------------------------------------------------------------------
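As a hint for the stub above: each outer iteration first calls the provided local policy_evaluation on the current policy to obtain V^pi, then improves the policy greedily, and terminates once the greedy policy stops changing. The improvement step, in the textbook's notation, is

    \pi_{\text{new}}(s) = \arg\max_{a} \sum_{s'} p(s' \mid s, a) \left[ r(s,a,s') + \gamma V^{\pi}(s') \right]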
/Chap1-DynamicProgramming/policy_iteration_solution.m:
--------------------------------------------------------------------------------
1 | close all; clear; clc
2 |
3 | %% Set up GridWorld environment Sutton-Barto 4.1
4 | gw = createSimpleGW();
5 | % number of states
6 | nS = numel(gw.States);
7 | % number of possible actions
8 | nA = numel(gw.Actions);
9 |
10 | %% Policy Iteration
11 | discountFactor = 1;
12 | tol = 1e-3;
13 | nIteration = 200;
14 | [vTable, optimalDeterministicPolicy, fcnCount] = policy_iteration(gw, discountFactor, nIteration, tol);
15 | % display State value function
16 | vTable = reshape(vTable,gw.GridSize);
17 | fprintf('Policy Iteration converged after %d iterations;\n',fcnCount);
18 | disp(vTable)
19 |
20 | %% Local Functions
21 | function [vTable, optimalDeterministicPolicy, fcnCount] = policy_iteration(env, discountFactor, nIteration, tol)
22 | % [vTable,optimalDeterministicPolicy,fcnCount] = policy_iteration(env, discountFactor, nIteration, tol)
23 | % computes the optimal policy and state value function of an MDP environment with the Policy Iteration algorithm
24 | % nS, nA: number of states, actions
25 | %
26 | % Inputs:
27 | % - env: grid world environment
28 | % - discountFactor: discount factor (aka gamma)
29 | % - nIteration: maximum number of iterations before returning
30 | % - tol: stop evaluation once value function change is less than tol for
31 | % all states
32 | %
33 | % Outputs:
34 | % - vTable: state value function V(s), nS-by-1
35 | % - optimalDeterministicPolicy: nS-by-nA probability matrix; 1 for the best
36 | %   action at a given state, 0 otherwise
37 | % - fcnCount: iteration counter
38 |
39 | %% Set up useful variables
40 | nS = numel(env.States);
41 | nA = numel(env.Actions);
42 |
43 | %% Initialization V, policy
44 | vTableTemp = zeros(nS,1);
45 | optimalDeterministicPolicy = ones(nS,nA)/nA;
46 | policyStable = false;
47 | fcnCount = 1;
48 |
49 | %% Begin iteration
50 | while ~policyStable && (fcnCount < nIteration)
51 |
52 | %% Policy Evaluation
53 | vTable = policy_evaluation(optimalDeterministicPolicy, env, discountFactor, nIteration, tol);
54 |
55 | %% Policy Improvement
56 | newPolicy = zeros(nS,nA);
57 |
58 | % for each state s
59 | for stateIdx = 1:nS
60 | vTableTempPerState = zeros(nA,1);
61 | for actionIdx = 1:nA
62 | % get transition probability and reward from the state transition matrices
63 | transitionProb = env.T(stateIdx,:,actionIdx);
64 | reward = env.R(stateIdx,:,actionIdx)';
65 |
66 | % Bellman equation for state value function (individual action)
67 | vTableTempPerState(actionIdx) = transitionProb*(reward + discountFactor*vTable);
68 | end
69 |
70 | % Choose optimal deterministic action
71 | [vTableTemp(stateIdx),bestAction] = max(vTableTempPerState);
72 | newPolicy(stateIdx,bestAction) = 1;
73 | end
74 |
75 | % if the policy does not change after an iteration, terminate
76 | if isequal(newPolicy, optimalDeterministicPolicy)
77 | policyStable = true;
78 | vTable = vTableTemp;
79 | end
80 |
81 | % update for next iteration
82 | optimalDeterministicPolicy = newPolicy;
83 | fcnCount = fcnCount + 1;
84 | end
85 |
86 | end
87 |
88 | function [vTable,fcnCount] = policy_evaluation(policy, env, discountFactor, nIteration, tol)
89 | % [vTable,fcnCount] = policy_evaluation(policy, env, discountFactor, nIteration, tol) evaluates a policy on an MDP environment
90 | % nS, nA: number of states, actions
91 | %
92 | % Inputs:
93 | % - policy: nS-by-nA matrix, each element stores the probability of taking
94 | % an action from a given state
95 | % - env: grid world environment
96 | % - discountFactor: discount factor (aka gamma)
97 | % - nIteration: maximum number of iterations before returning
98 | % - tol: stop evaluation once value function change is less than tol for all
99 | % states
100 | %
101 | % Outputs:
102 | % - vTable: state value function V(s), nS-by-1
103 | % - fcnCount: iteration counter
104 |
105 | %% Set up useful variables
106 | nS = numel(env.States);
107 | nA = numel(env.Actions);
108 |
109 | %% Initialization: V(s) = 0
110 | vTable = zeros(nS,1);
111 | terminate = false;
112 | fcnCount = 1;
113 |
114 | %% Begin Iteration
115 | while (~terminate) && (fcnCount < nIteration)
116 | delta = 0;
117 | for stateIdx = 1:nS
118 | tempStateValue = 0;
119 | for actionIdx = 1:nA
120 | % pi(a|s)
121 | actionProbability = policy(stateIdx,actionIdx);
122 |
123 | % get transition probability p(s'|s,a) and reward r from the state transition matrices
124 | transitionProb = env.T(stateIdx,:,actionIdx);
125 | reward = env.R(stateIdx,:,actionIdx)';
126 |
127 | % Bellman equation for state value function V(s) update
128 | tempStateValue = tempStateValue + ...
129 | actionProbability*(transitionProb*(reward + discountFactor*vTable));
130 | end
131 |
132 | % find how much change in V(s)
133 | delta = max(delta,(abs(tempStateValue - vTable(stateIdx))));
134 |
135 | % update V(s)
136 | vTable(stateIdx) = tempStateValue;
137 | end
138 |
139 | % terminate if change in V(s) is less than tolerance
140 | if delta < tol
141 | terminate = true;
142 | end
143 |
144 | % increment counter
145 | fcnCount = fcnCount + 1;
146 | end
147 | end
--------------------------------------------------------------------------------
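A small optional sketch for inspecting the result, assuming the workspace of the script above and that gw.Actions is the toolbox's string array of move names:

    [~, bestActionIdx] = max(optimalDeterministicPolicy, [], 2);   % greedy action per state
    disp(reshape(gw.Actions(bestActionIdx), gw.GridSize))          % action name per grid cell
    % note: the entries shown for the two terminal states are arbitrary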
/Chap1-DynamicProgramming/value_iteration_exercise.m:
--------------------------------------------------------------------------------
1 | close all; clear; clc
2 |
3 | %% Set up GridWorld environment Sutton-Barto 4.1
4 | gw = createSimpleGW();
5 | % number of states
6 | nS = numel(gw.States);
7 | % number of possible actions
8 | nA = numel(gw.Actions);
9 |
10 | %% Value Iteration
11 | discount_factor = 1;
12 | tol = 1e-3;
13 | nIteration = 200;
14 | [vTable, optimalDeterministicPolicy, fcnCount] = value_iteration(gw, discount_factor, nIteration, tol);
15 |
16 | %% Display State value function
17 | vTable = reshape(vTable,gw.GridSize);
18 | fprintf('Value Iteration converged after %d iterations;\n',fcnCount);
19 | disp(vTable)
20 |
21 | function [vTable, optimalDeterministicPolicy, fcnCount] = value_iteration(env, discountFactor, nIteration, tol)
22 | % [vTable,optimalDeterministicPolicy,fcnCount] = value_iteration(env, discountFactor, nIteration, tol)
23 | % computes the optimal policy and state value function of an MDP environment with the Value Iteration algorithm
24 | % nS, nA: number of states, actions
25 | %
26 | % Inputs:
27 | % - env: grid world environment
28 | % - discountFactor: discount factor (aka gamma)
29 | % - nIteration: maximum number of iterations before returning
30 | % - tol: stop evaluation once value function change is less than tol for
31 | % all states
32 | %
33 | % Outputs:
34 | % - vTable: state value function V(s), nS-by-1
35 | % - optimalDeterministicPolicy: nS-by-nA probability matrix; 1 for the best
36 | %   action at a given state, 0 otherwise
37 | % - fcnCount: iteration counter
38 |
39 | %% Set up useful variables
40 | nS = numel(env.States);
41 | nA = numel(env.Actions);
42 |
43 | %% Initialization V
44 | vTable = zeros(nS,1);
45 | terminate = false;
46 | fcnCount = 1;
47 |
48 | %% Begin iteration
49 | error('ENTER YOUR CODE HERE (╯°□°)╯︵ ┻━┻')
50 |
51 | %% Do 1 more iteration to get optimal deterministic policy
52 | optimalDeterministicPolicy = zeros(nS,nA);
53 | error('ENTER YOUR CODE HERE (╯°□°)╯︵ ┻━┻')
54 |
55 | end
--------------------------------------------------------------------------------
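As a hint for the two stubs above: the main loop applies the value-iteration backup until the value function stops changing, and the final pass extracts a greedy deterministic policy from the converged V. In the textbook's notation,

    V_{k+1}(s) = \max_{a} \sum_{s'} p(s' \mid s, a) \left[ r(s,a,s') + \gamma V_k(s') \right]

    \pi^{*}(s) = \arg\max_{a} \sum_{s'} p(s' \mid s, a) \left[ r(s,a,s') + \gamma V(s') \right]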
/Chap1-DynamicProgramming/value_iteration_solution.m:
--------------------------------------------------------------------------------
1 | close all; clear; clc
2 |
3 | %% Set up GridWorld environment Sutton-Barto 4.1
4 | gw = createSimpleGW();
5 | % number of states
6 | nS = numel(gw.States);
7 | % number of possible actions
8 | nA = numel(gw.Actions);
9 |
10 | %% Value Iteration
11 | discount_factor = 1;
12 | tol = 1e-3;
13 | nIteration = 200;
14 | [vTable, optimalDeterministicPolicy, fcnCount] = value_iteration(gw, discount_factor, nIteration, tol);
15 |
16 | %% Display State value function
17 | vTable = reshape(vTable,gw.GridSize);
18 | fprintf('Value Iteration converged after %d iterations;\n',fcnCount);
19 | disp(vTable)
20 |
21 | function [vTable, optimalDeterministicPolicy, fcnCount] = value_iteration(env, discountFactor, nIteration, tol)
22 | % [vTable,optimalDeterministicPolicy,fcnCount] = value_iteration(env, discountFactor, nIteration, tol)
23 | % computes the optimal policy and state value function of an MDP environment with the Value Iteration algorithm
24 | % nS, nA: number of states, actions
25 | %
26 | % Inputs:
27 | % - env: grid world environment
28 | % - discountFactor: discount factor (aka gamma)
29 | % - nIteration: maximum number of iterations before returning
30 | % - tol: stop evaluation once value function change is less than tol for
31 | % all states
32 | %
33 | % Outputs:
34 | % - vTable: state value function V(s), nS-by-1
35 | % - optimalDeterministicPolicy: nS-by-nA probability matrix; 1 for the best
36 | %   action at a given state, 0 otherwise
37 | % - fcnCount: iteration counter
38 |
39 | %% Set up useful variables
40 | nS = numel(env.States);
41 | nA = numel(env.Actions);
42 |
43 | %% Initialization V
44 | vTable = zeros(nS,1);
45 | terminate = false;
46 | fcnCount = 1;
47 |
48 | %% Begin iteration
49 | while(~terminate) && (fcnCount < nIteration)
50 | vTableTemp = zeros(nS,1);
51 | % For each state s
52 | for stateIdx = 1:nS
53 | stateValueTemp = zeros(nA,1);
54 | for actionIdx = 1:nA
55 | % get transition probability and reward from the state transition matrices
56 | transitionProb = env.T(stateIdx,:,actionIdx);
57 | reward = env.R(stateIdx,:,actionIdx)';
58 |
59 | % Bellman equation for state value function (individual action)
60 | stateValueTemp(actionIdx) = transitionProb*(reward + discountFactor*vTable);
61 | end
62 | % Choose best value
63 | vTableTemp(stateIdx) = max(stateValueTemp);
64 | end
65 |
66 | % terminate if change in V(s) is less than tolerance
67 | if sum(abs(vTableTemp(:)-vTable(:))) < tol
68 | terminate = true;
69 | end
70 |
71 | % update for next iteration
72 | vTable = vTableTemp;
73 | fcnCount = fcnCount + 1;
74 | end
75 |
76 | %% Do 1 more iteration to get optimal deterministic policy
77 | optimalDeterministicPolicy = zeros(nS,nA);
78 | for stateIdx = 1:nS
79 | stateValueTemp = zeros(nA,1);
80 | for actionIdx = 1:nA
81 | % get transition probability and reward from the state transition matrices
82 | transitionProb = env.T(stateIdx,:,actionIdx);
83 | reward = env.R(stateIdx,:,actionIdx)';
84 |
85 | % Bellman equation for state value function (individual action)
86 | stateValueTemp(actionIdx) = transitionProb*(reward + discountFactor*vTable);
87 | end
88 |
89 | % set the best action probability to 1
90 | [~,bestActionIdx] = max(stateValueTemp);
91 | optimalDeterministicPolicy(stateIdx,bestActionIdx) = 1;
92 | end
93 |
94 | end
--------------------------------------------------------------------------------
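An optional sanity-check sketch that could be appended to the script above: with a reward of -1 per move and gamma = 1, the optimal value of each cell is minus its step distance to the nearest terminal corner, so the converged table should be close to the matrix below (the matrix is symmetric, so the reshape orientation does not matter).

    expectedV = -[0 1 2 3;
                  1 2 3 2;
                  2 3 2 1;
                  3 2 1 0];
    assert(max(abs(vTable(:) - expectedV(:))) < 1e-2, 'value iteration sanity check failed')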
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 anhtran1995
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RL Course MATLAB
2 |
3 | This repository provides complementary coding exercises and solutions for the [RL Learning Roadmap][11]. The exercise format is based on the awesome [WildML Learning Reinforcement Learning course][3] by [Denny Britz][6]. The exercises focus on implementing algorithms (the meat of RL).
4 |
5 | Since many RL courses in the community are in Python, I created these coding exercises for MATLAB users. If you don't use MATLAB but want to follow the [RL Learning Roadmap][11], rework these exercises in your favorite framework.
6 |
7 | This is not an official MathWorks product. For online courses from MathWorks, see https://matlabacademy.mathworks.com/.
8 |
9 | ## Table of Contents
10 | The coding exercises require the [Reinforcement Learning Toolbox](https://www.mathworks.com/products/reinforcement-learning.html), but you can always reimplement the environments and algorithms from scratch.
11 |
12 | - Chapter 1 - Dynamic Programming
13 | - Chapter 2 - Temporal-Difference (TD) Learning (WIP)
14 | - Chapter 3 - Function Approximation (WIP)
15 | - Chapter 4 - Policy Gradient (WIP)
16 | - Chapter 5 - Advanced Policy Gradient (WIP)
17 | - Chapter 6 - Partially Observable Environment (WIP)
18 | - Chapter 7 - Model-based (WIP)
19 |
20 | ## References
21 | Learning materials referenced:
22 |
23 | - [Reinforcement Learning Toolbox][12], The MathWorks
24 | - [Reinforcement Learning: An Introduction][2] (textbook), Sutton and Barto
25 | - [Deep Reinforcement Learning][10] (course), UC Berkeley
26 | - [OpenAI Spinning Up][9] (textbook/blog)
27 | - [WildML Learning Reinforcement Learning][3] (Python course with exercises/solutions), Denny Britz
28 | - [MATLAB RL Tech Talks][1] (videos), The MathWorks
29 | - [David Silver's RL course][4]
30 | - [Simple Reinforcement Learning][7] (blog), Arthur Juliani
31 | - [Deep Learning Specialization][8] (Coursera course), Andrew Ng (you can audit it for free; courses 1 and 2 are highly recommended for Deep Learning foundations)
32 |
33 | [1]: https://www.mathworks.com/videos/series/reinforcement-learning.html
34 | [2]: http://incompleteideas.net/book/RLbook2018.pdf
35 | [3]: https://github.com/dennybritz/reinforcement-learning
36 | [4]: https://www.davidsilver.uk/teaching/
37 | [5]: https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf
38 | [6]: https://twitter.com/dennybritz?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor
39 | [7]: https://medium.com/@awjuliani
40 | [8]: https://www.coursera.org/specializations/deep-learning
41 | [9]: https://spinningup.openai.com/en/latest/spinningup/rl_intro.html
42 | [10]: http://rail.eecs.berkeley.edu/deeprlcourse/
43 | [11]: https://github.com/anhOfTheStars/RLStudyGuide
44 | [12]: https://www.mathworks.com/products/reinforcement-learning.html
45 |
--------------------------------------------------------------------------------