├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── bandit_problems
│   ├── README.md
│   ├── __init__.py
│   ├── agents.py
│   ├── bandits.py
│   ├── exercises
│   │   ├── decreasing_epsilon.py
│   │   ├── ex_2_2_a.py
│   │   ├── ex_2_2_b.py
│   │   └── showdown.py
│   ├── results
│   │   ├── decreasing_epsilon.png
│   │   ├── decreasing_epsilon_optimality.png
│   │   ├── exercise_2_2_a.png
│   │   ├── exercise_2_2_a_optimality.png
│   │   ├── exercise_2_2_b.png
│   │   ├── movingBandit.png
│   │   ├── showdown.png
│   │   ├── showdown_op.png
│   │   ├── softmax.png
│   │   ├── softmax_2.png
│   │   ├── softmax_temps.png
│   │   └── softmax_vs_greedy.png
│   └── test_bed.py
├── dynamic_programming
│   ├── README.md
│   ├── __init__.py
│   ├── car_rentals.py
│   ├── exercises
│   │   ├── car_rental_exercise.py
│   │   ├── ex_4_5.py
│   │   └── ex_4_9.py
│   ├── gamblers.py
│   └── results
│       ├── e45_optimal_policy.png
│       ├── e45_optimal_value.png
│       ├── e45_policy_0.png
│       ├── e45_policy_1.png
│       ├── e45_policy_2.png
│       ├── gambler_optimal_policy.png
│       ├── gamblers_value_iteration.png
│       ├── jack_optimal_policy.png
│       ├── jack_optimal_value.png
│       ├── jack_policy_0.png
│       ├── jack_policy_1.png
│       ├── jack_policy_2.png
│       ├── jack_policy_3.png
│       ├── jack_policy_4.png
│       ├── jack_policy_5.png
│       ├── policy_4_9_a.png
│       ├── policy_4_9_b.png
│       ├── value_4_9_a.png
│       └── value_4_9_b.png
├── environments
│   ├── __init__.py
│   ├── blackjack
│   │   ├── __init__.py
│   │   ├── blackjack.py
│   │   ├── blackjack_policies.py
│   │   └── interactive_blackjack.py
│   └── racing
│       ├── __init__.py
│       ├── interactive_racetrack.py
│       ├── racetracks
│       │   └── racetrack_a.csv
│       ├── racing.py
│       ├── run_trained_racetrack_bot.py
│       └── trained_policies
│           ├── mc_learning.npy
│           ├── q_learning.npy
│           ├── random.npy
│           └── sarsa.npy
├── lib
│   └── policy.py
├── monte_carlo
│   ├── README.md
│   ├── __init__.py
│   ├── exercises
│   │   ├── __init__.py
│   │   ├── blackjack_policy_improvement.py
│   │   ├── blackjack_soft_policy_improvement.py
│   │   ├── mc_blackjack.py
│   │   └── mc_racetrack.py
│   ├── mc.py
│   └── results
│       ├── ace_optimal.png
│       ├── ace_policy_evaluation_10000.png
│       ├── ace_policy_evaluation_500000.png
│       ├── no_ace_optimal.png
│       ├── no_ace_policy_evaluation_10000.png
│       ├── no_ace_policy_evaluation_500000.png
│       ├── trained_bot_racing.gif
│       └── untrained_bot_racing.gif
├── requirements.txt
├── rl_problem
│   ├── README.md
│   ├── __init__.py
│   ├── exercises
│   │   ├── ex_3_17.py
│   │   └── gridworld_uniform_policy.py
│   ├── gridworld.py
│   └── results
│       ├── gridworld.png
│       ├── optimal.png
│       └── uniform.png
├── runner.py
└── td_learning
    ├── README.md
    ├── __init__.py
    ├── exercises
    │   ├── q_learning_racing.py
    │   └── sarsa_racing.py
    ├── results
    │   ├── q_learning_trained_bot.gif
    │   └── sarsa_trained_bot.gif
    └── td.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | *.pyc
3 | .idea
4 | venv
5 | scratch
6 | *sublime*
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Nicholas Cellino
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning: An Introduction
2 |
3 | This repository contains (some of the) programming exercises from [Reinforcement Learning: An Introduction (Second Edition)](https://mitpress.mit.edu/books/reinforcement-learning)
4 | by Richard S. Sutton and Andrew G. Barto. Each subdirectory in this project contains an overview of a topic covered
5 | in the book, the results from the exercises, and Python code for the exercises. There are also reproductions of some
6 | of the figures from the book, along with the Python code that generates them.
7 |
8 | This is a work in progress.
9 |
10 | ## Topics
11 |
12 | 1. [Chapter 2 - Bandit Problems](./bandit_problems)
13 | 2. [Chapter 3 - The Reinforcement Learning Problem](./rl_problem)
14 | 3. [Chapter 4 - Dynamic Programming](./dynamic_programming)
15 | 4. [Chapter 5 - Monte Carlo Methods](./monte_carlo)
16 | 5. [Chapter 6 - Temporal-Difference Learning](./td_learning)
17 |
18 | ## Getting Started
19 | This project uses Python 3.6 and [venv](https://docs.python.org/3/library/venv.html)
20 | (Note: This is distinct from [virtualenv](https://virtualenv.pypa.io/en/stable/). There
21 | are some issues using matplotlib on OSX with virtualenv).
22 | Ensure that you have both of these installed on your system.
23 |
24 | Then, in the project directory, create your virtual environment:
25 | ```
26 | python3.6 -m venv venv
27 | ```
28 | This creates a folder called `venv` in which we can install Python libraries
29 | like [numpy](http://www.numpy.org/) and [matplotlib](http://matplotlib.org/).
30 |
31 | To tell your system to use this environment instead of the system-wide python environment, run:
32 | ```
33 | source venv/bin/activate
34 | ```
35 | You will need to do this anytime you want to run examples.
36 |
37 |
38 | Next, to install the required libraries into the virtual environment, run:
39 | ```
40 | pip install -r requirements.txt
41 | ```
42 |
43 | All set! Run exercises by calling `runner.py` followed by the path to the exercise. For example:
44 | ```
45 | python runner.py bandit_problems/exercises/ex_2_2_a.py
46 | ```
47 |
48 | For some of the exercises, you can pass arguments to control aspects of their execution (for example, the number of trials in the case
49 | of the n-armed-bandit problems). You can see the available parameters by passing `-h` like so:
50 | ```
51 | python runner.py bandit_problems/exercises/ex_2_2_a.py -h
52 | ```
53 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/__init__.py
--------------------------------------------------------------------------------
/bandit_problems/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 2 - Bandit Problems
2 |
3 | In this chapter, we learn about the N-Armed-Bandit problem. Consider this problem:
4 |
5 | There are 10 different slot machines. For each slot machine, you pull a lever and
6 | get a certain reward, maybe 0 tokens, maybe 10, maybe a million. You get 1000 pulls.
7 | Your job is to end up with as many tokens as you can by the end of the 1000 pulls.
8 | What is your strategy?
9 |
10 | If the slot machines are all exactly the same, then it doesn't really matter what you do.
11 | You could use all your pulls on 1 machine or choose randomly for each pull and, on average,
12 | you'll get the same result. But what if the machines are not all the same? What if
13 | some of the machines are better than others? For example, say you tried slot machine 1 for
14 | a few pulls and got the following results:
15 |
16 | 1. 3 tokens
17 | 2. 7 tokens
18 | 3. 6 tokens
19 | 4. 5 tokens
20 | 5. 7 tokens
21 | 6. 4 tokens
22 |
23 | Then you try machine 2 for a few pulls and get the following results:
24 |
25 | 1. 8 tokens
26 | 2. 6 tokens
27 | 3. 9 tokens
28 | 4. 8 tokens
29 | 5. 10 tokens
30 | 6. 7 tokens
31 |
32 | While the rewards are still random, machine 2 seems to be giving better results than machine 1
33 | on average. So we need to come up with a strategy that exploits that information in order to get
34 | the most possible tokens at the end.
35 |
36 | This is the essence of the N-Armed-Bandit problem. How do we come up with a strategy to maximize
37 | our reward?
38 |
39 | ### How we approach the problem
40 |
41 | So we need to figure out what the best slot machine is and choose that one as much as possible.
42 | In order to determine which slot machine is the best one, we need to try all the different
43 | slot machines and see which ones give the best rewards.
44 |
45 | So if we have 1000 pulls, we can try each slot machine 100 times, average the results,
46 | and then we'll have a pretty good estimate of how good each slot machine is, right?
47 | Well yeah, but then we've spent all of our pulls so we can't exploit that information.
48 | So how about we try each machine once, then spend the rest of our pulls on whichever one
49 | gave us the best reward? Well that doesn't really guarantee that we've found the best
50 | machine because we only tried each once.
51 |
52 | So we need to balance exploration (finding which machine is the best) with exploitation
53 | (exploiting our knowledge to get the most possible reward).
54 |
55 | ### Epsilon Greedy Method
56 |
57 | The epsilon greedy method is very simple. Basically, we use the reward from each pull
58 | to maintain an estimate for how good each slot machine is. For some percentage of
59 | our pulls, we pick the slot machine that we estimate to be the best. For the rest of our
60 | pulls, we pick a slot machine randomly.
61 |
62 | The percentage of pulls that we choose randomly is ε (epsilon). So for example,
63 | ε = 0.1 means we choose randomly 10% of the time and are greedy (choose our best estimate)
64 | 90% of the time.
65 |
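To make this concrete, here is a minimal sketch of ε-greedy action selection with incremental sample-average estimates. It is illustrative only (not the exact code in `agents.py`); `bandit.pull_arm` is the interface from `bandits.py`, and the array names are assumptions.

```python
import numpy as np

def epsilon_greedy_pull(value_estimates, pull_counts, bandit, epsilon=0.1):
    """Do one epsilon-greedy pull and update the sample-average estimate."""
    if np.random.random() < epsilon:
        arm = np.random.randint(len(value_estimates))   # explore: pick a random arm
    else:
        arm = int(np.argmax(value_estimates))           # exploit: pick the current best estimate
    reward = bandit.pull_arm(arm)
    pull_counts[arm] += 1
    # Incremental sample average: Q <- Q + (1/n) * (reward - Q)
    value_estimates[arm] += (reward - value_estimates[arm]) / pull_counts[arm]
    return reward
```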
66 | Here are some results showing the performance of the epsilon greedy methods.
67 |
68 | 
69 |
70 | The values at each pull are averages over 2000 trials.
71 |
72 | We can see that ε=0 does not perform too well. This is because it does not spend any
73 | time exploring. It picks some slot machine as the best and chooses it every time no
74 | matter what. With ε=0.1, we can see that we do a little better. We spend more time exploring
75 | so we are able to get better results, but we plateau because we only ever choose our best
76 | estimate on 90% of pulls. With ε=0.01, we do not learn as fast, but we eventually reach a
77 | higher average reward than ε=0.1 because once we figure out which slot machine is best,
78 | we choose it 99% of the time.
79 |
80 | 
81 |
82 | This graph shows the percent of the time that each method has chosen the optimal action
83 | at each pull number. We see that ε=0 rarely finds the optimal action
84 | because it doesn't spend any time exploring. For ε=0.1, it spends 10% of its
85 | time exploring so it learns very fast, but it also plateaus because it will
86 | only exploit its knowledge 90% of the time.
87 |
88 | Maybe we can improve this a little. We might want to do a little more exploring
89 | at the beginning of our session and become greedier as we get towards the end. We can
90 | do that!
91 |
92 | 
93 |
94 | The different lines here show methods where we decrease epsilon at different
95 | rates.
96 |
97 | 
98 |
99 | ### Softmax Method
100 |
101 | With the epsilon greedy method, we took an all-or-nothing approach
102 | to exploration and exploitation. Either we were exploring and chose
103 | an arm completely at random, or we were exploiting and were completely greedy.
104 | Softmax methods, on the other hand, explore all the time but use their estimates
105 | of each arm's value to weight how often they choose each arm. This means they
106 | choose the arm they estimate to be the best most often, the arm they estimate
107 | to be the worst least often, and every arm in between is weighted
108 | accordingly.
109 |
110 | 
111 |
112 | Softmax methods have a parameter called the "temperature", which essentially controls how
113 | much weight to put on our estimates. Higher temperatures place less importance on
114 | our estimates and choose actions more nearly equiprobably. Lower temperatures place more
115 | importance on our estimates and so choose the actions we estimate to be better
116 | more often. As the temperature approaches 0, we become greedy 100% of the
117 | time. Picking the temperature is tricky and seems to be mostly a matter of trial and
118 | error; I am not sure if there is a more scientific way to approach it.
119 |
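As a rough sketch (again illustrative, not the exact code in `agents.py`), softmax action selection samples an arm from the Gibbs/Boltzmann distribution over the current value estimates, with the temperature controlling how peaked that distribution is:

```python
import numpy as np

def softmax_choose_arm(value_estimates, temperature):
    """Sample an arm index from the Gibbs/Boltzmann distribution over the estimates."""
    preferences = np.asarray(value_estimates, dtype=float) / temperature
    preferences -= preferences.max()          # subtract the max for numerical stability
    weights = np.exp(preferences)
    probabilities = weights / weights.sum()
    return int(np.random.choice(len(probabilities), p=probabilities))
```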
120 | ### 10-Armed Bandit Showdown
121 |
122 | So which agent performed the best?
123 |
124 | 
125 |
126 | 
127 |
128 | The quantity we are trying to maximize is the total reward, which corresponds
129 | graphically to the area under the curve. In this experiment, the strategy
130 | in which we decrease epsilon over time performed the best. One interesting
131 | thing we can see here is that, although the softmax agent generally chooses the optimal
132 | action less often than the epsilon greedy agent, it performs about the same because it chooses
133 | "okay" actions much more often than the really bad actions.
134 |
135 | #### Sources:
136 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: An Introduction. 2nd ed., The MIT Press, 2012.
137 |
--------------------------------------------------------------------------------
/bandit_problems/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/__init__.py
--------------------------------------------------------------------------------
/bandit_problems/agents.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from numpy.random import random, randint, normal
4 | import numpy as np
5 |
6 | class Agent:
7 |
8 | def __init__(self, num_arms):
9 | self._num_arms = num_arms
10 | self._results = np.zeros((self._num_arms, 2))
11 | self._value_estimates = normal(0, 0.01, size=(self._num_arms))
12 |
13 | def reset(self):
14 | self._value_estimates = normal(size=(self._num_arms))
15 | self._results = np.zeros((self._num_arms, 2))
16 |
17 | def _update_value_estimate(self, reward, arm):
18 | self._results[arm, 0] += reward
19 | self._results[arm, 1] += 1
20 | self._value_estimates[arm] = self._results[arm, 0] / self._results[arm, 1]
21 |
22 | def do_pull(self, bandit):
23 | arm = self._choose_arm()
24 | reward = bandit.pull_arm(arm)
25 | self._update_value_estimate(reward, arm)
26 | return reward, bandit.was_optimal_choice(arm)
27 |
28 | class SoftmaxAgent(Agent):
29 |
30 | def __init__(self, temperature, num_arms):
31 | Agent.__init__(self, num_arms)
32 | self._temperature = temperature
33 |
34 | def _gibbs_distribution(self):
35 | dist = np.exp(self._value_estimates/self._temperature)
36 | return dist / np.sum(dist)
37 |
38 | def _get_sample(self, dist):
39 | cumulative_dist = np.cumsum(dist)
40 | r = random()
41 | for i in range(len(cumulative_dist)):
42 | if r < cumulative_dist[i]:
43 | return i
44 |
45 | def _choose_arm(self):
46 | dist = self._gibbs_distribution()
47 | return self._get_sample(dist)
48 |
49 | def __str__(self):
50 | return f'Softmax Agent (t={self._temperature})'
51 |
52 |
53 | class EpsilonGreedyAgent(Agent):
54 |
55 | def __init__(self, epsilon, num_arms):
56 | Agent.__init__(self, num_arms)
57 | self._starting_epsilon = epsilon
58 | self._epsilon = epsilon
59 |
60 | def reset(self):
61 | self._epsilon = self._starting_epsilon
62 | Agent.reset(self)
63 |
64 | def _choose_arm(self):
65 | if random() < self._epsilon:
66 | return randint(0, len(self._results))
67 | else:
68 | return np.argmax(self._value_estimates)
69 |
70 | def __str__(self):
71 | return f'Epsilon Greedy Agent (ε={self._epsilon})'
72 |
73 |
74 | class FixedAlphaEpsilonGreedyAgent(EpsilonGreedyAgent):
75 |
76 | def __init__(self, epsilon, num_arms, alpha=0.1):
77 | EpsilonGreedyAgent.__init__(self, epsilon, num_arms)
78 | self._alpha = alpha
79 |
80 | def _update_value_estimate(self, reward, arm):
81 | self._value_estimates[arm] += self._alpha * (reward - self._value_estimates[arm])
82 |
83 | def __str__(self):
84 | return f'Fixed Alpha Epsilon Greedy Agent (ε={self._epsilon}, α={self._alpha})'
85 |
86 |
87 | class AdjustableEpsilonGreedyAgent(EpsilonGreedyAgent):
88 |
89 | def __init__(self, num_arms, num_turns):
90 | EpsilonGreedyAgent.__init__(self, 1.0, num_arms)
91 | self._num_turns = num_turns
92 | self._num_pulls = 0
93 |
94 | def reset(self):
95 | self._num_pulls = 0
96 | EpsilonGreedyAgent.reset(self)
97 |
98 | def do_pull(self, bandit):
99 | self._adjust_epsilon()
100 | reward, was_optimal = Agent.do_pull(self, bandit)
101 | self._num_pulls += 1
102 | return reward, was_optimal
103 |
104 |
105 | class ExponentialDecreaseEpsilonGreedyAgent(AdjustableEpsilonGreedyAgent):
106 |
107 | def __init__(self, num_arms, num_turns, decline_rate=1.001):
108 | AdjustableEpsilonGreedyAgent.__init__(self, num_arms, num_turns)
109 | self._decline_rate = decline_rate
110 |
111 | # Calculates and sets the next epsilon value
112 | def _adjust_epsilon(self):
113 | self._epsilon = ((1 - (self._decline_rate**(-self._num_pulls))) /
114 | (self._decline_rate**(-self._num_turns) - 1)) + 1
115 |
116 | def __str__(self):
117 | return f'Exponentially Decreasing Epsilon Greedy Agent (decline_rate={self._decline_rate})'
118 |
119 |
120 | class LinearDecreaseEpsilonGreedyAgent(AdjustableEpsilonGreedyAgent):
121 |
122 | # Sets the next epsilon value
123 | def _adjust_epsilon(self):
124 | progress = float(self._num_pulls) / self._num_turns
125 | self._epsilon = 1 - progress
126 |
127 | def __str__(self):
128 | return f'Linearly Decreasing Epsilon Greedy Agent'
129 |
--------------------------------------------------------------------------------
/bandit_problems/bandits.py:
--------------------------------------------------------------------------------
1 | from numpy.random import normal, randn
2 | import numpy as np
3 |
4 | class NArmedBandit(object):
5 |
6 | def __init__(self, n):
7 | self._arms = randn(n)
8 |
9 | def pull_arm(self, arm):
10 | self.validate_arm(arm)
11 | return self._arms[arm] + normal()
12 |
13 | def num_arms(self):
14 | return len(self._arms)
15 |
16 | def validate_arm(self, arm):
17 | if arm < 0 or arm >= self.num_arms():
18 | raise ValueError("This arm does not exist.")
19 |
20 | def was_optimal_choice(self, arm):
21 | """
22 | Tells if the choice was optimal.
23 |
24 | Should be used for analysis purposes only
25 | (in other words, not for actually solving the problem)
26 | """
27 | self.validate_arm(arm)
28 | return np.argmax(self._arms) == arm
29 |
30 |
31 | class MovingNArmedBandit(NArmedBandit):
32 |
33 | def __init__(self, n, sigma=0.1):
34 | super(MovingNArmedBandit, self).__init__(n)
35 | self._sigma = sigma
36 |
37 | def pull_arm(self, arm):
38 | value = super(MovingNArmedBandit, self).pull_arm(arm)
39 | self._arms += self._sigma * randn(len(self._arms))
40 | return value
41 |
--------------------------------------------------------------------------------
/bandit_problems/exercises/decreasing_epsilon.py:
--------------------------------------------------------------------------------
1 | from bandit_problems.agents import *
2 | from bandit_problems.test_bed import TestBed
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser(description="Exercise 2.2")
6 | parser.add_argument('--arms',
7 | type=int,
8 | help='Number of arms for the bandit',
9 | default=10)
10 | parser.add_argument('--trials',
11 | type=int,
12 | help='Number of trials to average over',
13 | default=2000)
14 | parser.add_argument('--pulls',
15 | type=int,
16 | help='Number of pulls per trial',
17 | default=1000)
18 | args = parser.parse_args()
19 |
20 | # Parameters
21 | num_arms = args.arms
22 | num_trials = args.trials
23 | num_pulls = args.pulls
24 |
25 | agents = []
26 | agents.append(EpsilonGreedyAgent(0.1, num_arms))
27 | agents.append(ExponentialDecreaseEpsilonGreedyAgent(num_arms, num_pulls, decline_rate=1.01))
28 | agents.append(ExponentialDecreaseEpsilonGreedyAgent(num_arms, num_pulls, decline_rate=1.0075))
29 | agents.append(ExponentialDecreaseEpsilonGreedyAgent(num_arms, num_pulls, decline_rate=1.015))
30 |
31 | tb = TestBed(agents, num_arms, num_trials=num_trials, num_pulls=num_pulls)
32 | tb.run()
33 | tb.plot_results(title='Decreasing Epsilon Value')
34 |
--------------------------------------------------------------------------------
/bandit_problems/exercises/ex_2_2_a.py:
--------------------------------------------------------------------------------
1 | from bandit_problems.agents import *
2 | from bandit_problems.test_bed import TestBed
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser(description="Exercise 2.2")
6 | parser.add_argument('--arms',
7 | type=int,
8 | help='Number of arms for the bandit',
9 | default=10)
10 | parser.add_argument('--trials',
11 | type=int,
12 | help='Number of trials to average over',
13 | default=2000)
14 | parser.add_argument('--pulls',
15 | type=int,
16 | help='Number of pulls per trial',
17 | default=3000)
18 | args = parser.parse_args()
19 |
20 | # Parameters
21 | num_arms = args.arms
22 | num_trials = args.trials
23 | num_pulls = args.pulls
24 |
25 | agents = []
26 | agents.append(EpsilonGreedyAgent(0, num_arms))
27 | agents.append(EpsilonGreedyAgent(0.01, num_arms))
28 | agents.append(EpsilonGreedyAgent(0.1, num_arms))
29 |
30 | tb = TestBed(agents, num_arms, num_trials=num_trials, num_pulls=num_pulls)
31 | tb.run()
32 | tb.plot_results(title='Exercise 2.2')
33 |
--------------------------------------------------------------------------------
/bandit_problems/exercises/ex_2_2_b.py:
--------------------------------------------------------------------------------
1 | from bandit_problems.agents import *
2 | from bandit_problems.test_bed import TestBed
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser(description="Exercise 2.2")
6 | parser.add_argument('--arms',
7 | type=int,
8 | help='Number of arms for the bandit',
9 | default=10)
10 | parser.add_argument('--trials',
11 | type=int,
12 | help='Number of trials to average over',
13 | default=2000)
14 | parser.add_argument('--pulls',
15 | type=int,
16 | help='Number of pulls per trial',
17 | default=1000)
18 | args = parser.parse_args()
19 |
20 | # Parameters
21 | num_arms = args.arms
22 | num_trials = args.trials
23 | num_pulls = args.pulls
24 |
25 | agents = []
26 | agents.append(SoftmaxAgent(0.1, num_arms))
27 | agents.append(SoftmaxAgent(0.2, num_arms))
28 | agents.append(SoftmaxAgent(0.3, num_arms))
29 | agents.append(SoftmaxAgent(0.4, num_arms))
30 | agents.append(SoftmaxAgent(0.5, num_arms))
31 |
32 | tb = TestBed(agents, num_arms, num_trials=num_trials, num_pulls=num_pulls)
33 | tb.run()
34 | tb.plot_results(title='Exercise 2.2')
35 |
--------------------------------------------------------------------------------
/bandit_problems/exercises/showdown.py:
--------------------------------------------------------------------------------
1 | from bandit_problems.agents import *
2 | from bandit_problems.test_bed import TestBed
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser(description="Bandit Showdown")
6 | parser.add_argument('--arms',
7 | type=int,
8 | help='Number of arms for the bandit',
9 | default=10)
10 | parser.add_argument('--trials',
11 | type=int,
12 | help='Number of trials to average over',
13 | default=2000)
14 | parser.add_argument('--pulls',
15 | type=int,
16 | help='Number of pulls per trial',
17 | default=3000)
18 | args = parser.parse_args()
19 |
20 | # Parameters
21 | num_arms = args.arms
22 | num_trials = args.trials
23 | num_pulls = args.pulls
24 |
25 | agents = []
26 | agents.append(EpsilonGreedyAgent(0.1, num_arms))
27 | agents.append(ExponentialDecreaseEpsilonGreedyAgent(num_arms, num_pulls, decline_rate=1.015))
28 | agents.append(SoftmaxAgent(0.3, num_arms))
29 |
30 | tb = TestBed(agents, num_arms, num_trials=num_trials, num_pulls=num_pulls)
31 | tb.run()
32 | tb.plot_results(title='10-Armed Bandit Showdown')
--------------------------------------------------------------------------------
/bandit_problems/results/decreasing_epsilon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/decreasing_epsilon.png
--------------------------------------------------------------------------------
/bandit_problems/results/decreasing_epsilon_optimality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/decreasing_epsilon_optimality.png
--------------------------------------------------------------------------------
/bandit_problems/results/exercise_2_2_a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/exercise_2_2_a.png
--------------------------------------------------------------------------------
/bandit_problems/results/exercise_2_2_a_optimality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/exercise_2_2_a_optimality.png
--------------------------------------------------------------------------------
/bandit_problems/results/exercise_2_2_b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/exercise_2_2_b.png
--------------------------------------------------------------------------------
/bandit_problems/results/movingBandit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/movingBandit.png
--------------------------------------------------------------------------------
/bandit_problems/results/showdown.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/showdown.png
--------------------------------------------------------------------------------
/bandit_problems/results/showdown_op.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/showdown_op.png
--------------------------------------------------------------------------------
/bandit_problems/results/softmax.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/softmax.png
--------------------------------------------------------------------------------
/bandit_problems/results/softmax_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/softmax_2.png
--------------------------------------------------------------------------------
/bandit_problems/results/softmax_temps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/softmax_temps.png
--------------------------------------------------------------------------------
/bandit_problems/results/softmax_vs_greedy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/softmax_vs_greedy.png
--------------------------------------------------------------------------------
/bandit_problems/test_bed.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from tqdm import tqdm
4 |
5 | from bandit_problems.bandits import NArmedBandit, MovingNArmedBandit
6 |
7 |
8 | class TestBed:
9 |
10 | _plot_colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
11 |
12 | def __init__(self,
13 | agents,
14 | num_arms=10,
15 | num_trials=2000,
16 | num_pulls=1000):
17 | self._num_arms = num_arms
18 | self._num_trials = num_trials
19 | self._num_pulls = num_pulls
20 | self._agents = agents
21 | self._results = np.zeros((len(agents), num_pulls))
22 | self._optimal_choices = np.zeros((len(agents), num_pulls))
23 |
24 | def _reset_agents(self):
25 | for agent in self._agents:
26 | agent.reset()
27 |
28 | def run(self):
29 | for trial_num in tqdm(range(self._num_trials)):
30 | b = NArmedBandit(self._num_arms)
31 | self._reset_agents()
32 | for pull in range(self._num_pulls):
33 | for i in range(len(self._agents)):
34 | reward, was_optimal = self._agents[i].do_pull(b)
35 | self._results[i, pull] += reward
36 | if was_optimal:
37 | self._optimal_choices[i, pull] += 1
38 |
39 | def run_moving(self):
40 | for trial_num in tqdm(range(self._num_trials)):
41 | b = MovingNArmedBandit(self._num_arms, 0.1)
42 | self._reset_agents()
43 | for pull in range(self._num_pulls):
44 | for i in range(len(self._agents)):
45 | reward, was_optimal = self._agents[i].do_pull(b)
46 | self._results[i, pull] += reward
47 | if was_optimal:
48 | self._optimal_choices[i, pull] += 1
49 |
50 | def plot_results(self, title):
51 | plt.figure(1)
52 | avgs = self._results / self._num_trials
53 | for i in range(len(self._agents)):
54 | plt.plot(avgs[i], self._plot_colors[i%len(self._plot_colors)], label=str(self._agents[i]))
55 | plt.title(title)
56 | plt.xlabel('Pull Number')
57 | plt.ylabel('Average Reward')
58 | plt.legend(loc=4)
59 |
60 | plt.figure(2)
61 | optimal_choices_avgs = self._optimal_choices / self._num_trials
62 | for i in range(len(self._agents)):
63 | plt.plot(optimal_choices_avgs[i], self._plot_colors[i%len(self._plot_colors)], label=str(self._agents[i]))
64 | plt.title(title)
65 | plt.xlabel('Pull Number')
66 | plt.ylabel('Percent Optimal Action Choice')
67 | plt.legend(loc=4)
68 |
69 | plt.show()
70 |
--------------------------------------------------------------------------------
/dynamic_programming/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 4 - Dynamic Programming
2 |
3 | In this chapter, we learn about using dynamic programming techniques to solve
4 | finite MDPs. By "solve," in this context, we mean find the optimal way to behave
5 | in the MDP so as to maximize our return.
6 |
7 | ## Policy Evaluation
8 |
9 | The first important idea from this chapter is policy **evaluation**. This simply refers
10 | to the process of determining the value function for a given policy. One way
11 | to do this with dynamic programming is to take an iterative approach.
12 |
13 | We start with a given policy π and an arbitrary state-value function v(s); we can
14 | choose the state-value function that is 0 for all states. Then, we try to calculate v(s)
15 | for each state in the state space. To do so, we look ahead one step: from the state,
16 | we consider each action a, and for each action we consider each possible next state s'.
17 | For each action a and next state s', we calculate the expected immediate reward plus
18 | the discounted value of the next state, and we sum all of these together, each
19 | weighted by its probability of occurring. Since the value of the next state is not
20 | actually known (it is itself an estimate), the result is still only an estimate,
21 | but if we apply this procedure iteratively, we are guaranteed to converge to the
22 | true value function.
23 |
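For a small finite MDP, the whole procedure fits in a few lines. Here is a minimal sketch; the `transitions` and `rewards` arrays are illustrative assumptions, not part of this repository:

```python
import numpy as np

def evaluate_policy(policy, transitions, rewards, gamma=0.9, tolerance=1e-4):
    """
    policy[s]             action chosen in state s (deterministic policy)
    transitions[s, a, t]  probability of moving from state s to state t under action a
    rewards[s, a, t]      expected immediate reward for that transition
    """
    num_states = transitions.shape[0]
    value = np.zeros(num_states)
    while True:
        previous = value.copy()
        for s in range(num_states):
            a = policy[s]
            # One-step lookahead: sum over next states t of p(t|s,a) * (r + gamma * v(t))
            value[s] = np.sum(transitions[s, a] * (rewards[s, a] + gamma * previous))
        if np.max(np.abs(value - previous)) < tolerance:
            return value
```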
24 | ## Policy Improvement
25 |
26 | Okay, so with policy evaluation, we have a method to learn the value function
27 | for a given policy in an environment. But our goal is to find the optimal way
28 | to behave in this environment: the optimal policy.
29 |
30 | Once we have the value function, this is actually pretty easy. If we know the
31 | value function for a certain policy, we can look at each state and see whether the
32 | policy takes the optimal action from that state (remember that at this point we know
33 | the value of all possible next states, the expected reward from each action, and the
34 | probability of transitioning from state s to state s' given the action a). If it does not
35 | take the optimal action, then there is clearly an opportunity to **improve** this policy.
36 | We can improve the policy by selecting, in each state, the action that gives us
37 | the most return. Put another way, we should be **greedy** with respect to the policy's
38 | value function. Once we do this, we end up with another policy that is better than
39 | the one we started with. More formally, the state-value function of the new policy is greater
40 | than or equal to the state-value function of the previous policy for every state s.
41 | If the state-value function is higher for every state, that intuitively means the new policy can
42 | extract more return from this environment in the long run.
43 |
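In the same illustrative notation as the policy evaluation sketch above, policy improvement is a greedy one-step lookahead against the evaluated value function:

```python
import numpy as np

def improve_policy(value, transitions, rewards, gamma=0.9):
    """Return the deterministic policy that is greedy with respect to `value`."""
    num_states, num_actions = transitions.shape[0], transitions.shape[1]
    policy = np.zeros(num_states, dtype=int)
    for s in range(num_states):
        # Expected return of each action: sum over t of p(t|s,a) * (r + gamma * v(t))
        action_returns = [np.sum(transitions[s, a] * (rewards[s, a] + gamma * value))
                          for a in range(num_actions)]
        policy[s] = int(np.argmax(action_returns))
    return policy
```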
44 | ## Policy Iteration
45 |
46 | The policy iteration algorithm combines these two procedures to find the optimal policy. We start with
47 | an arbitrary policy and value function. We evaluate the policy, then improve it, then evaluate the
48 | improved policy, and so on, until the policy remains the same for two iterations in a row. At this point, the policy is greedy
49 | with respect to its own value function. This implies that the policy's value function satisfies the Bellman
50 | optimality equation and thus that this is an optimal policy.
51 |
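Using the `evaluate_policy` and `improve_policy` sketches from the sections above, policy iteration is just this loop:

```python
import numpy as np

def policy_iteration(transitions, rewards, gamma=0.9):
    """Alternate evaluation and greedy improvement until the policy is stable."""
    policy = np.zeros(transitions.shape[0], dtype=int)   # arbitrary starting policy
    while True:
        value = evaluate_policy(policy, transitions, rewards, gamma)
        improved = improve_policy(value, transitions, rewards, gamma)
        if np.array_equal(improved, policy):   # now greedy w.r.t. its own value function
            return policy, value
        policy = improved
```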
52 |
53 | ### Exercise: Jack's Car Rental
54 |
55 | Jack's Car Rental problem is described in Sutton and Barto **Example 4.2** and **Exercise 4.5**.
56 |
57 | The basic problem is this: Jack manages two dealerships for his car rental business. Let's call them A and B.
58 | Every day, some customers arrive at each location and request cars. If Jack has a car for them, he can rent it to them
59 | and get $10. If he does not have a car, he loses their business and makes no money. Jack can move cars between dealerships
60 | at night for a cost of $2/car to help make sure he has cars where they are needed, but he can only move a maximum of 5 cars
61 | per night. Every day, some number of people
62 | also return cars to each dealership, and those cars are available for rental the next day. The numbers of
63 | requests and returns at each dealership are Poisson random variables.
64 |
65 | For dealership A, the numbers of requests and returns have expected values of 3 and 3, respectively.
66 |
67 | For dealership B, the numbers of requests and returns have expected values of 4 and 2, respectively.
68 |
69 | Also, there can be no more than 20 cars at each location; any additional cars are returned to the nationwide company.
70 |
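The transition and reward model is built from these Poisson distributions. For reference, the probability of seeing exactly n requests (or returns) when the expected number is λ is λ^n e^(-λ) / n!, which is what the small `poisson` helper in `car_rentals.py` computes; a standalone version looks like this:

```python
import math

def poisson_probability(expected, n):
    """P(N = n) for a Poisson random variable with mean `expected`."""
    return (expected ** n) * math.exp(-expected) / math.factorial(n)

# e.g. the probability that dealership A sees exactly 3 rental requests in a day
print(poisson_probability(3, 3))  # ~0.224
```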
71 | We can use policy iteration to find the optimal policy for this environment. The states in this environment are how many
72 | cars are at each dealership. The actions are how many cars we move from A to B (a negative number means we move cars from
73 | B to A). So the actions are integers in the range \[-5, 5\]. The rewards are how much money Jack makes in each time step.
74 | The book says to use a discount factor of 0.9, so that's what we'll do.
75 |
76 | Here are my results for running policy iteration on this problem:
77 |
78 | 
79 |
80 | 
81 |
82 | 
83 |
84 | 
85 |
86 | 
87 |
88 | 
89 |
90 | 
91 |
92 | 
93 |
94 | As you can see, I started with the policy that moves 0 cars no matter what. At each iteration,
95 | the policy changes slightly until there is no difference between policy 5 and the optimal policy. I'm not sure
96 | why my results differ slightly from those shown in the book (Figure 4.4).
97 | Policy 1 is slightly different when dealership B has 20 cars, and my optimal value function appears
98 | to max out at a slightly higher value. This may be due to mistakes on my part or to different convergence
99 | criteria. The rest, however, seem to conform exactly to the figures in the book.
100 |
101 | ### Exercise: Jack's Car Rental With Help
102 |
103 | Now, we add a couple of things to this problem.
104 |
105 | One of Jack's employees takes the bus home from near dealership A to near dealership B every night.
106 | She is willing to drive a car from A to B for free.
107 |
108 | Also, Jack's parking lots just shrank. If he keeps more than 10 cars overnight at a dealership,
109 | he now has to rent an additional lot for that location at a cost of $4.
110 |
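A small sketch of how the cost of an action changes under these new rules (this mirrors `JacksCarRentalWithHelp.get_action_cost` in `car_rentals.py`; the parameter names are illustrative):

```python
def action_cost_with_help(cars_at_a, cars_at_b, action, move_cost=2, extra_lot_cost=4):
    """Cost of moving `action` cars from A to B overnight (negative means B to A)."""
    if action > 0:
        moving = move_cost * (action - 1)    # the employee moves one car from A to B for free
    else:
        moving = move_cost * abs(action)
    overnight_a = cars_at_a - action
    overnight_b = cars_at_b + action
    parking = extra_lot_cost * ((overnight_a > 10) + (overnight_b > 10))
    return moving + parking
```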
111 | Here are my results for running policy iteration on that problem:
112 |
113 | 
114 |
115 | 
116 |
117 | 
118 |
119 | 
120 |
121 | 
122 |
123 | While I am not positive that these results are correct, we can see by inspection that
124 | the optimal policy does make sense. For example, it usually makes sense to take advantage of that free car
125 | transport from A to B because B usually gets more requests than A, unless doing so would leave dealership
126 | B with more than 10 cars. We also see where this policy tries to avoid that $4 parking lot
127 | overhead.
128 |
129 | ## Value Iteration
130 |
131 | Value iteration functions in a similar way to policy iteration but takes a shortcut. It essentially cuts short
132 | the policy evaluation step and attempts, at each iteration, to maximize the value function by being greedy with respect
133 | to the previous value function.
134 |
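In the same illustrative notation as the policy iteration sketches above, each sweep of value iteration backs up the best one-step lookahead directly instead of fully evaluating a fixed policy:

```python
import numpy as np

def value_iteration(transitions, rewards, gamma=0.9, tolerance=1e-4):
    """Sweep over the states, backing up the maximum one-step lookahead each time."""
    num_states, num_actions = transitions.shape[0], transitions.shape[1]
    value = np.zeros(num_states)
    while True:
        previous = value.copy()
        for s in range(num_states):
            value[s] = max(np.sum(transitions[s, a] * (rewards[s, a] + gamma * previous))
                           for a in range(num_actions))
        if np.max(np.abs(value - previous)) < tolerance:
            return value
```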
135 | ### Exercise: Gambler's Problem
136 |
137 | A gambler repeatedly bets on the flip of a coin. If it lands on heads, he wins the amount he staked;
138 | if it lands on tails, he loses it. He starts off with $1 and can bet in dollar increments. His goal is to get to $100.
139 |
140 | So the states are how much money he has, and the actions are how much he bets. The reward is 0 for every
141 | transition except the one that reaches the $100 state, which gives a reward of 1.
142 |
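For the gambler specifically, a state is the current capital (1 to 99), the allowed stakes run from 1 to min(capital, 100 - capital), and each stake has exactly two outcomes. The one-state backup inside value iteration then looks roughly like this (a sketch under those assumptions, undiscounted; compare `gamblers.py`):

```python
def gambler_backup(capital, value, win_probability=0.4):
    """Best one-step lookahead value for a single capital level."""
    best = 0.0
    for stake in range(1, min(capital, 100 - capital) + 1):
        win_state = capital + stake
        lose_state = capital - stake
        reward = 1.0 if win_state == 100 else 0.0      # reward only for reaching $100
        expected = (win_probability * (reward + value[win_state])
                    + (1 - win_probability) * value[lose_state])
        best = max(best, expected)
    return best
```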
143 | Here are the results of running value iteration on this problem:
144 |
145 | 
146 |
147 | We can see how these value functions are tending towards a single function as we iterate further.
148 |
149 | 
150 |
151 | This is one optimal policy for this problem; there are several. This one
152 | was chosen to replicate the result in Sutton and Barto: it is generated by choosing the most conservative (lowest) bet
153 | out of all the optimal bets.
154 |
155 | ### Exercise: Gambler's Problem (ph=0.25 and ph=0.55)
156 |
157 | #### ph=0.25 Results
158 |
159 | 
160 |
161 | 
162 |
163 | #### ph=0.55 Results
164 |
165 | 
166 |
167 | 
168 |
169 |
170 | #### Sources:
171 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: An Introduction. 2nd ed., The MIT Press, 2012.
172 |
--------------------------------------------------------------------------------
/dynamic_programming/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/__init__.py
--------------------------------------------------------------------------------
/dynamic_programming/car_rentals.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from mpl_toolkits.mplot3d import Axes3D
4 | import math
5 | from tqdm import tqdm
6 |
7 | class JacksCarRental:
8 |
9 | EXPECTED_RETURNS_A = 3
10 | EXPECTED_REQUESTS_A = 3
11 | EXPECTED_RETURNS_B = 2
12 | EXPECTED_REQUESTS_B = 4
13 | MOVING_CAR_COST = 2
14 | RENTAL_SALE_PRICE = 10
15 |
16 | # Don't bother computing poisson for anything above this
17 | # It will be very close to 0
18 | POISSON_CUTOFF = 14
19 |
20 | def __init__(self, max_cars=21):
21 | """
22 | :param max_cars: Non-inclusive upper-bound for how many cars can be at a dealership
23 | """
24 | self.max_cars = max_cars
25 | self.action_space = np.arange(-5, 6)
26 | self.a_transitions = self.init_transition_probabilities('A')
27 | self.b_transitions = self.init_transition_probabilities('B')
28 | self.a_expected_revenue = self.init_expected_revenue('A')
29 | self.b_expected_revenue = self.init_expected_revenue('B')
30 |
31 | def init_expected_revenue(self, dealership):
32 | """
33 | Returns a len(self.action_space) x self.max_cars x self.max_cars array.
34 | Each cell holds the expected revenue for the specified dealership with
35 | the specified previous state, next state, and action.
36 | :param dealership: 'A' or 'B'
37 | """
38 | revenue = np.zeros((self.action_space.shape[0], self.max_cars, self.max_cars))
39 | for cars in range(self.max_cars):
40 | for cars_after in range(self.max_cars):
41 | for action in self.action_space:
42 | if (dealership == 'A' and cars - action < 0) or (dealership == 'B' and cars + action < 0):
43 | continue
44 | revenue[action, cars, cars_after] = self.get_expected_revenue(dealership, action, cars, cars_after)
45 | return revenue
46 |
47 | def get_expected_revenue(self, dealership, action, now, after):
48 | if dealership == 'A':
49 | after_move = now - action
50 | elif dealership == 'B':
51 | after_move = now + action
52 | else:
53 | raise ValueError('Dealership must be A or B')
54 |
55 | expected_revenue = 0.0
56 | for requests in range(self.POISSON_CUTOFF):
57 | probability = self.expected_requests_probability(dealership, requests)
58 | expected_revenue += probability * self.RENTAL_SALE_PRICE * min(after_move, requests)
59 |
60 | return expected_revenue
61 |
62 | def init_transition_probabilities(self, dealership):
63 | ret = np.zeros((self.max_cars, self.max_cars))
64 | for current in range(ret.shape[0]):
65 | for next in range(ret.shape[1]):
66 | probability = 0.0
67 | for requests in range(self.POISSON_CUTOFF):
68 | for returns in range(self.POISSON_CUTOFF):
69 | cars_after_requests = max(current - requests, 0)
70 | cars_after_returns = min(cars_after_requests + returns, self.max_cars - 1)
71 | if cars_after_returns == next:
72 | request_probability = self.expected_requests_probability(dealership, requests)
73 | return_probability = self.expected_returns_probability(dealership, returns)
74 | probability += request_probability * return_probability
75 | ret[current, next] = probability
76 | return ret
77 |
78 | def expected_returns_probability(self, dealership, returns):
79 | if dealership == 'A':
80 | return self.poisson(self.EXPECTED_RETURNS_A, returns)
81 | elif dealership == 'B':
82 | return self.poisson(self.EXPECTED_RETURNS_B, returns)
83 | else:
84 | raise ValueError('Dealership must be A or B')
85 |
86 | def expected_requests_probability(self, dealership, requests):
87 | if dealership == 'A':
88 | return self.poisson(self.EXPECTED_REQUESTS_A, requests)
89 | elif dealership == 'B':
90 | return self.poisson(self.EXPECTED_REQUESTS_B, requests)
91 | else:
92 | raise ValueError('Dealership must be A or B')
93 |
94 | def poisson(self, expected, num):
95 | ret = ((expected**num)/math.factorial(num))*math.exp(-expected)
96 | return ret
97 |
98 | def get_action_cost(self, current, action):
99 | return abs(action) * self.MOVING_CAR_COST
100 |
101 | def get_expected_reward(self, action, current, next):
102 | cost = self.get_action_cost(current, action)
103 |
104 | expected_sales_a = self.a_expected_revenue[action, current[0], next[0]]
105 | expected_sales_b = self.b_expected_revenue[action, current[1], next[1]]
106 |
107 | return expected_sales_a + expected_sales_b - cost
108 |
109 | def next_state_probability(self, current, next, action):
110 | immediate_a = current[0] - action
111 | immediate_b = current[1] + action
112 | if immediate_a < 0 or immediate_a > (self.max_cars - 1):
113 | return 0.0
114 | elif immediate_b < 0 or immediate_b > (self.max_cars - 1):
115 | return 0.0
116 | probability_a = self.a_transitions[immediate_a, next[0]]
117 | probability_b = self.b_transitions[immediate_b, next[1]]
118 | return probability_a * probability_b
119 |
120 | def expected_return(self, state, action, state_value, gamma):
121 | (a, b) = state
122 | next_state_gain_expectation = 0.0
123 | for a_prime in range(self.max_cars):
124 | for b_prime in range(self.max_cars):
125 | probability_next_state = self.next_state_probability((a, b), (a_prime, b_prime), action)
126 | immediate_reward = self.get_expected_reward(action, (a, b), (a_prime, b_prime))
127 | next_state_gain_expectation += probability_next_state * (immediate_reward + gamma * state_value[a_prime, b_prime])
128 | return next_state_gain_expectation
129 |
130 | def evaluate_policy(self, policy, gamma=0.9, convergence=1.0):
131 | """
132 | Generates a value function for a given deterministic policy.
133 | The policy should specify an action in [-5, +5] for each
134 | state; a state is the pair (number of cars at location A, number
135 | of cars at location B), where each ranges from 0 to max_cars - 1.
136 |
137 | :param policy: A self.max_cars x self.max_cars array
138 | :return: A self.max_cars x self.max_cars array
139 | """
140 | ret = np.zeros((self.max_cars, self.max_cars))
141 | diff = np.inf
142 | print(f'Evaluating policy until diff < {convergence}')
143 | while diff > convergence:
144 | temp = np.copy(ret)
145 | for a in range(policy.shape[0]):
146 | for b in range(policy.shape[1]):
147 | ret[a, b] = self.expected_return((a, b), policy[a, b], temp, gamma)
148 | diff = np.max(np.fabs(np.subtract(ret, temp)))
149 | print(f'Diff: {diff}')
150 | return ret
151 |
152 | def get_greedy_policy(self, value, gamma=0.9):
153 | """
154 | Generates a policy that is greedy with respect to the provided value function.
155 |
156 | :param value: A self.max_cars x self.max_cars array
157 | :return: A self.max_cars x self.max_cars array
158 | """
159 | policy = np.zeros((self.max_cars, self.max_cars))
160 | print('Improving Policy...')
161 | for a in tqdm(range(policy.shape[0])):
162 | for b in range(policy.shape[1]):
163 | best_action = [None, -np.inf]
164 | for action in np.arange(-5, 6):
165 | if a - action < 0 or b + action < 0:
166 | # This action is not allowed if it makes one dealership have less than 0 cars
167 | continue
168 | next_state_gain_expectation = self.expected_return((a, b), action, value, gamma)
169 | if next_state_gain_expectation > best_action[1]:
170 | best_action[0] = action
171 | best_action[1] = next_state_gain_expectation
172 | policy[a, b] = best_action[0]
173 | return policy.astype(int)
174 |
175 | def run_policy_improvement(self, gamma=0.9, convergence=5.0):
176 | initial_policy = np.zeros((self.max_cars, self.max_cars), dtype=int)
177 | policies = [initial_policy]
178 | value = None
179 | while len(policies) < 2 or not np.array_equal(policies[-1], policies[-2]):
180 | value = self.evaluate_policy(policies[-1], gamma, convergence)
181 | greedy = self.get_greedy_policy(value)
182 | policies.append(greedy)
183 | return policies, value
184 |
185 | def plot_results(self, policies, value_function):
186 | self.plot_value_function(value_function, figure=1)
187 | self.plot_policies(policies, starting_fig=2)
188 | plt.show()
189 |
190 | def plot_value_function(self, value_function, figure=1):
191 | fig = plt.figure(figure)
192 | ax = fig.add_subplot(111, projection='3d')
193 | x = np.arange(0, self.max_cars)
194 | y = np.arange(0, self.max_cars)
195 | X, Y = np.meshgrid(x, y)
196 | ax.plot_wireframe(X, Y, value_function)
197 | fig.suptitle('Optimal Value Function')
198 | plt.xlabel('# of Cars at Dealership B')
199 | plt.ylabel('# of Cars at Dealership A')
200 |
201 | def plot_policies(self, policies, starting_fig=1):
202 | figure = starting_fig
203 | for i in range(len(policies)):
204 | fig = plt.figure(figure)
205 | figure += 1
206 | policy = policies[i]
207 | plt.imshow(policy, cmap='jet')
208 | plt.ylabel('# of Cars at Dealership A')
209 | plt.xlabel('# of Cars at Dealership B')
210 | plt.xticks(np.arange(0, policy.shape[0], 1))
211 | plt.yticks(np.arange(0, policy.shape[1], 1))
212 | plt.gca().invert_yaxis()
213 | if i == (len(policies) - 1):
214 | fig.suptitle('Optimal Policy')
215 | else:
216 | fig.suptitle(f'Policy {i}')
217 |
218 | # Annotate states
219 | for i in range(policy.shape[0]):
220 | for j in range(policy.shape[1]):
221 | plt.text(j, i, '%d' % policy[i,j], horizontalalignment='center', verticalalignment='center')
222 |
223 | plt.colorbar()
224 |
225 |
226 | class JacksCarRentalWithHelp(JacksCarRental):
227 |
228 | SECOND_PARKING_LOT_COST = 4
229 |
230 | def get_action_cost(self, current, action):
231 | if action > 0:
232 | moving_cost = self.MOVING_CAR_COST * (action - 1)
233 | else:
234 | moving_cost = self.MOVING_CAR_COST * abs(action)
235 |
236 | overnight_cars_a = current[0] - action
237 | overnight_cars_b = current[1] + action
238 |
239 | parking_cost = 0
240 | if overnight_cars_a > 10:
241 | parking_cost += self.SECOND_PARKING_LOT_COST
242 | if overnight_cars_b > 10:
243 | parking_cost += self.SECOND_PARKING_LOT_COST
244 |
245 | return moving_cost + parking_cost
246 |
--------------------------------------------------------------------------------
/dynamic_programming/exercises/car_rental_exercise.py:
--------------------------------------------------------------------------------
1 | from dynamic_programming.car_rentals import JacksCarRental
2 | import argparse
3 |
4 | parser = argparse.ArgumentParser(description="Car Rental Exercise")
5 |
6 | parser.add_argument('--convergence',
7 | type=float,
8 | help='Convergence criteria for policy evaluation',
9 | default=1.0)
10 | args = parser.parse_args()
11 |
12 | jcr = JacksCarRental()
13 | policies, optimal_value = jcr.run_policy_improvement(gamma=0.9, convergence=args.convergence)
14 | jcr.plot_results(policies, optimal_value)
15 |
--------------------------------------------------------------------------------
/dynamic_programming/exercises/ex_4_5.py:
--------------------------------------------------------------------------------
1 | from dynamic_programming.car_rentals import JacksCarRentalWithHelp
2 | import argparse
3 |
4 | parser = argparse.ArgumentParser(description="Exercise 4.5")
5 |
6 | parser.add_argument('--convergence',
7 | type=float,
8 | help='Convergence criteria for policy evaluation',
9 | default=1.0)
10 | args = parser.parse_args()
11 |
12 | jcr = JacksCarRentalWithHelp()
13 | policies, optimal_value = jcr.run_policy_improvement(gamma=0.9, convergence=args.convergence)
14 | jcr.plot_results(policies, optimal_value)
15 |
--------------------------------------------------------------------------------
/dynamic_programming/exercises/ex_4_9.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from dynamic_programming.gamblers import GamblersProblem
3 |
4 | # Win probability 0.25
5 | gambler = GamblersProblem(win_probability=0.25)
6 | value_funcs = gambler.value_iteration()
7 | policy = gambler.get_greedy_policy(value_funcs[-1])
8 | next_figure = gambler.plot_results(value_funcs[0:5], policy)
9 |
10 | # Win probability 0.55
11 | gambler = GamblersProblem(win_probability=0.55)
12 | value_funcs = gambler.value_iteration()
13 | policy = gambler.get_greedy_policy(value_funcs[-1])
14 | gambler.plot_results(value_funcs[0:5], policy, figure=next_figure)
15 |
16 | plt.show()
17 |
--------------------------------------------------------------------------------
/dynamic_programming/gamblers.py:
--------------------------------------------------------------------------------
1 | import math
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 |
5 |
6 | class GamblersProblem():
7 |
8 | _plot_colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
9 |
10 | def __init__(self, win_probability=0.4):
11 | self._win_probability = win_probability
12 |
13 | def get_possible_next_states(self, state, action):
14 | ret = []
15 | # Either we win
16 | ret.append(state + action)
17 | # or we lose
18 | ret.append(state - action)
19 | return list(set(ret))
20 |
21 | def probability_next_state(self, state, action, next_state):
22 | # Special "sink" states
23 | if state == 0:
24 | if next_state == 0:
25 | return 1.0
26 | return 0.0
27 | if state == 100:
28 | if next_state == 100:
29 | return 1.0
30 | return 0.0
31 |
32 | # Loss
33 | if next_state == (state - action):
34 | return 1 - self._win_probability
35 | # Win
36 | elif next_state == (state + action):
37 | return self._win_probability
38 | else:
39 | # Should never actually make it here
40 | return 0.0
41 |
42 | def reward(self, state, action, next_state):
43 | if next_state == 100:
44 | return 1.0
45 | else:
46 | return 0.0
47 |
48 | def value_iteration(self, convergence=0.0001):
49 | diff = np.inf
50 | value = np.zeros(101)
51 | temp = np.copy(value)
52 | ret = []
53 | while diff > convergence:
54 | for state in range(1, value.shape[0] - 1):
55 | action_space = np.arange(0, min(state, 100 - state) + 1)
56 | best_value = None
57 | for action in action_space:
58 | possible_next_states = self.get_possible_next_states(state, action)
59 | gain = 0.0
60 | for next_state in possible_next_states:
61 | gain += self.probability_next_state(state, action, next_state) * (
62 | self.reward(state, action, next_state) + temp[next_state]
63 | )
64 | if best_value is None or gain > best_value:
65 | best_value = gain
66 | value[state] = best_value
67 | diff = np.max(np.fabs(np.subtract(temp, value)))
68 | temp = np.copy(value)
69 | ret.append(temp)
70 | return ret
71 |
72 | def get_greedy_policy(self, value):
73 | policy = np.zeros(101)
74 | for state in np.arange(1, 100):
75 | action_space = np.arange(0, min(state, 100 - state) + 1)
76 | best_action = [None, -np.inf]
77 | for action in action_space:
78 | possible_next_states = self.get_possible_next_states(state, action)
79 | gain = 0.0
80 | for next_state in possible_next_states:
81 | gain += self.probability_next_state(state, action, next_state) * (
82 | self.reward(state, action, next_state) + value[next_state]
83 | )
84 | if best_action[0] is None:
85 | best_action[0] = action
86 | best_action[1] = gain
87 | elif math.isclose(gain, best_action[1]):
88 | # Tie breaking strategy
89 | # Choose more conservative action
90 | if action < best_action[0]:
91 | best_action[0] = action
92 | elif gain > best_action[1]:
93 | best_action[0] = action
94 | best_action[1] = gain
95 | policy[state] = best_action[0]
96 | return policy
97 |
98 | def plot_value_functions(self, value_functions):
99 | for i in range(len(value_functions)):
100 | plt.plot(value_functions[i][0:-1], self._plot_colors[i%len(self._plot_colors)], label=f'Value Function {i}')
101 | plt.title(f"Gambler's Problem Value Iteration (Win Probability = {self._win_probability})")
102 | plt.xlabel('Capital')
103 | plt.ylabel('Value')
104 | plt.legend(loc=4)
105 |
106 | def plot_policy(self, policy):
107 | plt.plot(np.arange(0, 101), policy)
108 | plt.title(f'Optimal Policy for Gambler (Win Probability = {self._win_probability})')
109 | plt.xlabel('Capital')
110 | plt.ylabel('Stake')
111 |
112 | def plot_results(self, value_functions, policy, figure=1):
113 | plt.figure(figure)
114 | self.plot_value_functions(value_functions)
115 | plt.figure(figure + 1)
116 | self.plot_policy(policy)
117 | return figure + 2
118 |
119 | if __name__ == '__main__':
120 | gmb = GamblersProblem()
121 | values = gmb.value_iteration(convergence=0.001)
122 | policy = gmb.get_greedy_policy(values[-1])
123 | gmb.plot_value_functions(values)
124 | gmb.plot_policy(policy)
125 |
--------------------------------------------------------------------------------
/dynamic_programming/results/e45_optimal_policy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_optimal_policy.png
--------------------------------------------------------------------------------
/dynamic_programming/results/e45_optimal_value.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_optimal_value.png
--------------------------------------------------------------------------------
/dynamic_programming/results/e45_policy_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_policy_0.png
--------------------------------------------------------------------------------
/dynamic_programming/results/e45_policy_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_policy_1.png
--------------------------------------------------------------------------------
/dynamic_programming/results/e45_policy_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_policy_2.png
--------------------------------------------------------------------------------
/dynamic_programming/results/gambler_optimal_policy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/gambler_optimal_policy.png
--------------------------------------------------------------------------------
/dynamic_programming/results/gamblers_value_iteration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/gamblers_value_iteration.png
--------------------------------------------------------------------------------
/dynamic_programming/results/jack_optimal_policy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_optimal_policy.png
--------------------------------------------------------------------------------
/dynamic_programming/results/jack_optimal_value.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_optimal_value.png
--------------------------------------------------------------------------------
/dynamic_programming/results/jack_policy_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_0.png
--------------------------------------------------------------------------------
/dynamic_programming/results/jack_policy_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_1.png
--------------------------------------------------------------------------------
/dynamic_programming/results/jack_policy_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_2.png
--------------------------------------------------------------------------------
/dynamic_programming/results/jack_policy_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_3.png
--------------------------------------------------------------------------------
/dynamic_programming/results/jack_policy_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_4.png
--------------------------------------------------------------------------------
/dynamic_programming/results/jack_policy_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_5.png
--------------------------------------------------------------------------------
/dynamic_programming/results/policy_4_9_a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/policy_4_9_a.png
--------------------------------------------------------------------------------
/dynamic_programming/results/policy_4_9_b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/policy_4_9_b.png
--------------------------------------------------------------------------------
/dynamic_programming/results/value_4_9_a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/value_4_9_a.png
--------------------------------------------------------------------------------
/dynamic_programming/results/value_4_9_b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/value_4_9_b.png
--------------------------------------------------------------------------------
/environments/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/__init__.py
--------------------------------------------------------------------------------
/environments/blackjack/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/blackjack/__init__.py
--------------------------------------------------------------------------------
/environments/blackjack/blackjack.py:
--------------------------------------------------------------------------------
1 | from random import randint
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from mpl_toolkits.mplot3d import Axes3D
5 |
6 |
7 | class BlackjackPlotter:
8 |
9 | @staticmethod
10 | def plot_value_functions(value):
11 | reshaped_value = np.reshape(value, BlackjackStates.state_space_shape())
12 | BlackjackPlotter.plot_value_function(
13 | reshaped_value[:, :, 0],
14 | title='Value Function (Usable ace)',
15 | figure=1
16 | )
17 | BlackjackPlotter.plot_value_function(
18 | reshaped_value[:, :, 1],
19 | title='Value Function (No usable ace)',
20 | figure=2)
21 | plt.show()
22 |
23 | @staticmethod
24 | def plot_value_function(value_function, title='Value Function', figure=1):
25 | fig = plt.figure(figure)
26 | ax = fig.add_subplot(111, projection='3d')
27 | x = np.arange(12, 22)
28 | y = np.arange(1, 11)
29 | X, Y = np.meshgrid(x, y)
30 | ax.plot_wireframe(X, Y, value_function)
31 | fig.suptitle(title)
32 | plt.xlabel('Player sum')
33 | plt.ylabel('Dealer showing')
34 |
35 | @staticmethod
36 | def plot_policies(policies):
37 | reshaped_policy = policies.reshape(BlackjackStates.state_space_shape())
38 | ace_policy = reshaped_policy[:, :, 0]
39 | BlackjackPlotter.plot_policy(ace_policy, title='Ace policy', figure=1)
40 | no_ace_policy = reshaped_policy[:, :, 1]
41 | BlackjackPlotter.plot_policy(no_ace_policy, title='No ace policy', figure=2)
42 | plt.show()
43 |
44 | @staticmethod
45 | def plot_policy(policy, title='Blackjack Policy', figure=1):
46 | policy = np.transpose(policy)
47 | fig = plt.figure(figure)
48 | ax = fig.subplots()
49 | fig.suptitle(title)
50 | plt.imshow(policy, cmap='jet')
51 | plt.gca().invert_yaxis()
52 |
53 | plt.xlabel('Dealer showing')
54 | plt.xticks(np.arange(0, len(BlackjackStates.DEALER_CARDS), 1))
55 | ax.set_xticklabels(BlackjackStates.DEALER_CARDS)
56 |
57 | plt.ylabel('Agent sum')
58 | plt.yticks(np.arange(0, len(BlackjackStates.AGENT_SUMS), 1))
59 | ax.set_yticklabels(BlackjackStates.AGENT_SUMS)
60 |
61 | for i in range(policy.shape[0]):
62 | for j in range(policy.shape[1]):
63 | if policy[i, j] == Blackjack.HIT_ACTION:
64 | label = 'HIT'
65 | else:
66 | label = 'STAY'
67 | plt.text(j, i, f'{label}', horizontalalignment='center', verticalalignment='center')
68 |
69 |
70 | class BlackjackStates:
71 |
72 | DEALER_CARDS = ['A', 2, 3, 4, 5, 6, 7, 8, 9, 10]
73 | AGENT_SUMS = [12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
74 | USABLE_ACE = [True, False]
75 | STATES = []
76 | for dealer_card in DEALER_CARDS:
77 | for agent_sum in AGENT_SUMS:
78 | for _usable_ace in USABLE_ACE:
79 | STATES.append((dealer_card, agent_sum, _usable_ace))
80 |
81 | @staticmethod
82 | def state_space_shape():
83 | return (len(BlackjackStates.DEALER_CARDS),
84 | len(BlackjackStates.AGENT_SUMS),
85 | len(BlackjackStates.USABLE_ACE))
86 |
87 | @staticmethod
88 | def num_states():
89 | return (len(BlackjackStates.DEALER_CARDS) *
90 | len(BlackjackStates.AGENT_SUMS) *
91 | len(BlackjackStates.USABLE_ACE))
92 |
93 | @staticmethod
94 | def id_to_state(id):
95 | return BlackjackStates.STATES[id]
96 |
97 | @staticmethod
98 | def state_to_id(state):
99 | dealer_card_index = BlackjackStates.DEALER_CARDS.index(state[0])
100 | agent_sum_index = BlackjackStates.AGENT_SUMS.index(state[1])
101 | usable_ace_index = BlackjackStates.USABLE_ACE.index(state[2])
102 | return (
103 | dealer_card_index * len(BlackjackStates.AGENT_SUMS) * len(BlackjackStates.USABLE_ACE) +
104 | agent_sum_index * len(BlackjackStates.USABLE_ACE) +
105 | usable_ace_index
106 | )
107 |
108 | @staticmethod
109 | def print_state(state):
110 | if type(state) is int:
111 | state = BlackjackStates.id_to_state(state)
112 | dealer_card = state[0]
113 | agent_sum = state[1]
114 | usable_ace = state[2]
115 | print(f'Dealer: {dealer_card}, Agent sum: {agent_sum}, Ace: {usable_ace}')
116 |
117 |
118 | class Blackjack:
119 |
120 | GAME_OVER_STATE = -1
121 | HIT_ACTION = 0
122 | STAY_ACTION = 1
123 | HIT_CARDS = ['A', 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]
124 |
125 | def __init__(self, verbose=True):
126 | self._states = []
127 | self._verbose = verbose
128 |
129 | def _blackjack_sum(self, hand):
130 | """
131 | Sums a list of cards with blackjack rules.
132 | In other words, if a hand contains an ace, it counts it as
133 | a 1 or 11 depending on what is appropriate.
134 |
135 | If a hand has more than 1 ace, at most 1 can count as 11.
136 | """
137 | running_total = 0
138 | num_aces = 0
139 | for card in hand:
140 | if card == 'A':
141 | num_aces += 1
142 | else:
143 | running_total += card
144 |
145 | # Count all aces as 1s by default
146 | running_total += num_aces
147 |
148 | if num_aces > 0 and running_total + 10 <= 21:
149 | # Count 1 ace as 11
150 | running_total += 10
151 |
152 | return running_total
153 |
154 | def _draw_card(self):
155 | return self.HIT_CARDS[randint(0, len(self.HIT_CARDS) - 1)]
156 |
157 | def _player_draw_card(self):
158 | """
159 |         Returns a numeric card value in the range [1, 10]. An ace drawn here
160 |         always counts as 1, since the player's sum is already at least 12.
161 | """
162 | card = self._draw_card()
163 | if card == 'A':
164 | return 1
165 | else:
166 | return card
167 |
168 | def debug_print(self, message):
169 | if self._verbose:
170 | print(message)
171 |
172 | def num_states(self):
173 | return BlackjackStates.num_states()
174 |
175 | def num_actions(self):
176 | return 2
177 |
178 | def get_starting_state(self):
179 | return self.get_random_state()
180 |
181 | def get_random_state(self):
182 | return randint(0, self.num_states() - 1)
183 |
184 | def perform_action(self, state_id, action):
185 | state = BlackjackStates.id_to_state(state_id)
186 | dealer_card = state[0]
187 | player_sum = state[1]
188 | usable_ace = state[2]
189 | if action == self.HIT_ACTION:
190 | self.debug_print(f'You hit!')
191 | card = self._player_draw_card()
192 | self.debug_print(f'You drew {card}')
193 | player_sum += card
194 | if player_sum > 21:
195 | if usable_ace:
196 | # Ace becomes 1
197 | player_sum -= 10
198 | next_state = (dealer_card, player_sum, False)
199 | return (0, BlackjackStates.state_to_id(next_state), False)
200 | else:
201 | # Lose
202 | self.debug_print(f'You busted with {player_sum}.')
203 | return (-1, self.GAME_OVER_STATE, True)
204 | else:
205 | # Still <= 21
206 | next_state = (dealer_card, player_sum, usable_ace)
207 | return (0, BlackjackStates.state_to_id(next_state), False)
208 | elif action == self.STAY_ACTION:
209 | self.debug_print(f'You stayed!')
210 | # Dealer's turn
211 | dealer_cards = [dealer_card]
212 | dealer_sum = self._blackjack_sum(dealer_cards)
213 |
214 | blackjack = False
215 | if player_sum == 21 and usable_ace:
216 | self.debug_print(f'You have a blackjack!')
217 | blackjack = True
218 |
219 |             # Dealer must hit until reaching 17 or more
220 | while dealer_sum < 17:
221 | card = self._draw_card()
222 | self.debug_print(f'Dealer had {dealer_sum}, and drew {card}')
223 | dealer_cards.append(card)
224 | dealer_sum = self._blackjack_sum(dealer_cards)
225 | if dealer_sum != 21 and blackjack:
226 | # If dealer doesn't have 21 after first draw,
227 | # player immediately wins.
228 | self.debug_print(f'You win!')
229 | return (1, self.GAME_OVER_STATE, True)
230 |
231 | if dealer_sum > 21:
232 | # Dealer busted
233 | self.debug_print(f'Dealer busted.')
234 | return (1, self.GAME_OVER_STATE, True)
235 | else:
236 | if dealer_sum > player_sum:
237 | # Lose
238 | self.debug_print(f'Dealer won with {dealer_sum}.')
239 | return (-1, self.GAME_OVER_STATE, True)
240 | elif dealer_sum == player_sum:
241 | self.debug_print(f'Draw. Dealer and player both have {player_sum}.')
242 | return (0, self.GAME_OVER_STATE, True)
243 | else:
244 | # Win
245 | self.debug_print(f'You won! Dealer: {dealer_sum}. You: {player_sum}.')
246 | return (1, self.GAME_OVER_STATE, True)
247 | else:
248 | raise ValueError('This is not a valid action.')
249 |
250 | def is_terminal(self, state):
251 | return state == self.GAME_OVER_STATE
252 |
--------------------------------------------------------------------------------
/environments/blackjack/blackjack_policies.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from environments.blackjack.blackjack import Blackjack, BlackjackStates
4 |
5 |
6 | class BlackjackPolicy:
7 |
8 | def _get_action_by_state(self, state):
9 | raise NotImplementedError('This must be implemented.')
10 |
11 | def get_action(self, state_id):
12 | blackjack_state = BlackjackStates.id_to_state(state_id)
13 | return self._get_action_by_state(blackjack_state)
14 |
15 | @staticmethod
16 | def generate_policy(stay_on=[]):
17 | policy = np.zeros(BlackjackStates.num_states())
18 | for state_id in range(policy.shape[0]):
19 | state = BlackjackStates.id_to_state(state_id)
20 | dealer_card = state[0]
21 | agent_sum = state[1]
22 | ace = state[2]
23 | if agent_sum in stay_on:
24 | policy[state_id] = Blackjack.STAY_ACTION
25 | else:
26 | policy[state_id] = Blackjack.HIT_ACTION
27 | return policy
28 |
--------------------------------------------------------------------------------
/environments/blackjack/interactive_blackjack.py:
--------------------------------------------------------------------------------
1 | from environments.blackjack.blackjack import *
2 |
3 | blackjack = Blackjack()
4 | state = blackjack.get_random_state()
5 |
6 | while not blackjack.is_terminal(state):
7 | (dealer_card, player_sum, usable_ace) = BlackjackStates.id_to_state(state)
8 |
9 | if usable_ace:
10 | ace_string = 'with ace'
11 | else:
12 | ace_string = 'no ace'
13 | print(f'--- Dealer showing: {dealer_card} --- You: {player_sum} ({ace_string}) ---')
14 |
15 | action = None
16 | while action is None:
17 | action = input('Hit (0) or stay (1)?: ')
18 | if action in ['0', '1']:
19 | action = int(action)
20 | else:
21 | action = None
22 | print('Invalid action')
23 |
24 | print()
25 |     reward, state, done = blackjack.perform_action(state, action)
26 |
--------------------------------------------------------------------------------
/environments/racing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/__init__.py
--------------------------------------------------------------------------------
/environments/racing/interactive_racetrack.py:
--------------------------------------------------------------------------------
1 | from environments.racing.racing import RaceTrackGame
2 | import argparse
3 |
4 | parser = argparse.ArgumentParser(description='Interactive Race Track Game')
5 |
6 | parser.add_argument('racetrack',
7 | type=str,
8 | help='Path to racetrack csv file')
9 |
10 | args = parser.parse_args()
11 |
12 | RaceTrackGame.run(args.racetrack)
13 |
--------------------------------------------------------------------------------
/environments/racing/racetracks/racetrack_a.csv:
--------------------------------------------------------------------------------
1 | 0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,2
2 | 0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2
3 | 0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2
4 | 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2
5 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2
6 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2
7 | 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0
8 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
9 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
10 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
11 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
12 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
13 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
14 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
15 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
16 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
17 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
18 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
19 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
20 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
21 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
22 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
23 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0
24 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0
25 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0
26 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0
27 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0
28 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0
29 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0
30 | 0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0
31 | 0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0
32 | 0,0,0,3,3,3,3,3,3,0,0,0,0,0,0,0
33 |
--------------------------------------------------------------------------------
/environments/racing/racing.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | import random
4 | import sys
5 | import time
6 |
7 | import numpy as np
8 | import pygame
9 |
10 |
11 | class RacerBot:
12 |
13 | def __init__(self, policy):
14 | self.policy = policy
15 |
16 | def get_action(self, state_id):
17 | choices = np.arange(0, self.policy.shape[1])
18 | probabilities = self.policy[state_id]
19 | probabilities /= probabilities.sum()
20 | return np.random.choice(choices, p=probabilities)
21 |
22 |
23 | class RaceTrack:
24 |
25 | OOB = 0
26 | TRACK = 1
27 | FINISH = 2
28 | START = 3
29 | CAR = 4
30 |
31 | MAX_SPEED = 5
32 |
33 | def __init__(self, csv_path):
34 | self.track = []
35 | self.start_locations = []
36 | self.finish_locations = []
37 | with open(csv_path, 'r') as csvfile:
38 | track_layout = csv.reader(csvfile, delimiter=',')
39 | row_num = 0
40 | for row in track_layout:
41 | new_row = []
42 | col_num = 0
43 | for cell in row:
44 | new_cell = int(cell)
45 | if new_cell == RaceTrack.START:
46 | self.start_locations.append([col_num, row_num])
47 | if new_cell == RaceTrack.FINISH:
48 | self.finish_locations.append([col_num, row_num])
49 | new_row.append(new_cell)
50 | col_num += 1
51 | self.track.append(new_row)
52 | row_num += 1
53 |
54 | self.states = []
55 | for col in range(len(self.track[0])):
56 | for row in range(len(self.track)):
57 | for horizontal_speed in np.arange(0, self.MAX_SPEED):
58 | for vertical_speed in np.arange(0, self.MAX_SPEED):
59 | self.states.append((col, row, horizontal_speed, vertical_speed))
60 |
61 | self.actions = []
62 | for horizontal_accel in np.arange(-1, 2):
63 | for vertical_accel in np.arange(-1, 2):
64 | self.actions.append((horizontal_accel, vertical_accel))
65 |
66 | def num_states(self):
67 | return len(self.states)
68 |
69 | def num_actions(self):
70 | return len(self.actions)
71 |
72 | def action_to_id(self, action):
73 | return (
74 | (action[0] + 1) * 3 +
75 | (action[1] + 1)
76 | )
77 |
78 | def id_to_action(self, id):
79 | return self.actions[id]
80 |
81 | def state_to_id(self, state):
82 | col = state[0]
83 | row = state[1]
84 | horizontal_speed = state[2]
85 | vertical_speed = state[3]
86 | return (
87 | col * len(self.track) * self.MAX_SPEED * self.MAX_SPEED +
88 | row * self.MAX_SPEED * self.MAX_SPEED +
89 | horizontal_speed * self.MAX_SPEED +
90 | vertical_speed
91 | )
92 |
93 | def id_to_state(self, id):
94 | return self.states[id]
95 |
96 | def perform_action(self, state_id, action_id):
97 | """
98 | Returns reward, next state, and if we finished.
99 | """
100 | state = self.id_to_state(state_id)
101 | current_location = [state[0], state[1]]
102 | current_speed = [state[2], state[3]]
103 | action = self.id_to_action(action_id)
104 |
105 | current_speed[0] = max(min(current_speed[0] + action[0], self.MAX_SPEED - 1), 0)
106 | current_speed[1] = max(min(current_speed[1] + action[1], self.MAX_SPEED - 1), 0)
107 | if current_speed[0] == 0 and current_speed[1] == 0:
108 | current_speed[1] = 1
109 | if self.crosses_finish_line(current_location, current_speed):
110 | next_state = self.starting_line_state()
111 | return (0, self.state_to_id(next_state), True)
112 | else:
113 | next_location = self.get_next_location(current_location, current_speed)
114 | if self.out_of_bounds(next_location):
115 | next_state = self.starting_line_state()
116 | return (-5, self.state_to_id(next_state), False)
117 | next_state = (next_location[0], next_location[1], current_speed[0], current_speed[1])
118 | return (-1, self.state_to_id(next_state), False)
119 |
120 | def crosses_finish_line(self, position, speed):
121 | horizontal = speed[0]
122 | vertical = speed[1]
123 | intermediate_location = [0, 0]
124 | intermediate_location[0] = position[0]
125 | intermediate_location[1] = position[1]
126 | while (horizontal + vertical > 0):
127 | if horizontal >= vertical:
128 | intermediate_location[0] += 1
129 | horizontal -= 1
130 | else:
131 |                 intermediate_location[1] -= 1
132 | vertical -= 1
133 | for finish_location in self.finish_locations:
134 | if intermediate_location[0] == finish_location[0] and intermediate_location[1] == finish_location[1]:
135 | return True
136 | return False
137 |
138 | def out_of_bounds(self, location):
139 | return (location[0] < 0 or location[0] >= self.dimensions[0] or
140 | location[1] < 0 or location[1] >= self.dimensions[1] or
141 | self.track[location[1]][location[0]] == self.OOB)
142 |
143 | def get_next_location(self, location, speed):
144 | next_loc = [location[0] + speed[0], location[1] - speed[1]]
145 | return next_loc
146 |
147 | def get_starting_state(self):
148 | return self.state_to_id(self.starting_line_state())
149 |
150 | def starting_line_state(self):
151 | random_start = self.start_locations[random.randint(0, len(self.start_locations) - 1)]
152 | ret = (random_start[0], random_start[1], 0, 0)
153 | return ret
154 |
155 | @property
156 | def dimensions(self):
157 | return (len(self.track[0]), len(self.track))
158 |
159 |
160 | class RaceTrackGame:
161 |
162 | CAPTION = 'Racing Game'
163 | SCREEN_SIZE = (500, 800)
164 |
165 | OOB_COLOR = (240, 252, 22)
166 | TRACK_COLOR = (147, 150, 155)
167 | FINISH_COLOR = (1, 75, 234)
168 | START_COLOR = (2, 234, 72)
169 | CAR_COLOR = (0, 0, 0)
170 | BACKGROUND_COLOR = (0, 50, 50)
171 | CELL_BORDER = 2
172 | SPEED_RIGHT_MARGIN = SCREEN_SIZE[0]/2
173 |
174 | FONT_SIZE = 25
175 | FONT_HEIGHT = 30
176 | FONT_COLOR = (255, 255, 255)
177 |
178 | TOP_BOTTOM_MARGIN = 10
179 | TRACK_LOCATION = (20, 40)
180 | LEFT_RIGHT_MARGIN = 10
181 |
182 | TRACK_SIZE = (SCREEN_SIZE[0] - 2 * LEFT_RIGHT_MARGIN, SCREEN_SIZE[1] - FONT_HEIGHT - 2 * TOP_BOTTOM_MARGIN)
183 |
184 | def __init__(self, racetrack_csv):
185 | self.screen = pygame.display.get_surface()
186 | self.screen_rect = self.screen.get_rect()
187 | self.done = False
188 | self.keys = pygame.key.get_pressed()
189 | self.racetrack = RaceTrack(racetrack_csv)
190 | self.current_action = [0, 0]
191 | self.font = pygame.font.SysFont(pygame.font.get_default_font(), self.FONT_SIZE)
192 |
193 | self.cell_size = self.get_cell_size()
194 | self.track_top_left = self.get_track_drawing_info()
195 |
196 | self.current_state = self.racetrack.starting_line_state()
197 |
198 | self.current_score = 0
199 |
200 | def get_cell_size(self):
201 | track_dimensions = self.racetrack.dimensions
202 | return (int(self.TRACK_SIZE[0] / track_dimensions[0]), int(self.TRACK_SIZE[1] / track_dimensions[1]))
203 |
204 | def get_track_drawing_info(self):
205 | track_dimensions = self.racetrack.dimensions
206 |
207 | # Correct for rounding
208 | actual_track_size = (self.cell_size[0] * track_dimensions[0], self.cell_size[1] * track_dimensions[1])
209 | margins = (self.TRACK_SIZE[0] - actual_track_size[0], self.TRACK_SIZE[1] - actual_track_size[1])
210 |
211 | track_top_left = (self.LEFT_RIGHT_MARGIN + margins[0] / 2, self.FONT_HEIGHT + self.TOP_BOTTOM_MARGIN + margins[1] / 2)
212 |
213 | return track_top_left
214 |
215 | def update_current_action(self):
216 | # Forward
217 | if self.keys[pygame.K_i]:
218 | self.current_action[1] = min(self.current_action[1] + 1, 1)
219 | # Back
220 | if self.keys[pygame.K_k]:
221 | self.current_action[1] = max(self.current_action[1] - 1, -1)
222 | # Left
223 | if self.keys[pygame.K_j]:
224 | self.current_action[0] = max(self.current_action[0] - 1, -1)
225 | # Right
226 | if self.keys[pygame.K_l]:
227 | self.current_action[0] = min(self.current_action[0] + 1, 1)
228 |
229 | def draw(self, state, action):
230 | self.screen.fill(RaceTrackGame.BACKGROUND_COLOR)
231 | self.render_current_action(action)
232 | self.render_game_state(state)
233 |
234 | def render_game_state(self, state):
235 | self.render_track()
236 | self.render_current_speed((state[2], state[3]))
237 | self.render_car((state[0], state[1]))
238 |
239 | def render_current_action(self, action):
240 | current_action_string = f'[H: {action[0]}, V: {action[1]}]'
241 | text_surface = self.font.render(current_action_string, True, self.FONT_COLOR)
242 | self.screen.blit(text_surface, (10, 10))
243 |
244 | def render_current_speed(self, speed):
245 | current_speed_string = f'Current speed: H: {speed[0]}, V: {speed[1]}'
246 | text_surface = self.font.render(current_speed_string, True, self.FONT_COLOR)
247 | self.screen.blit(text_surface, (self.SCREEN_SIZE[0] - self.SPEED_RIGHT_MARGIN, 10))
248 |
249 | def render_track(self):
250 | for row in range(len(self.racetrack.track)):
251 | for col in range(len(self.racetrack.track[row])):
252 | cell = self.racetrack.track[row][col]
253 | self.draw_cell(cell, col, row)
254 |
255 | def render_car(self, location):
256 | self.draw_cell(RaceTrack.CAR, location[0], location[1])
257 |
258 | def get_track_pixel_pos(self, col, row):
259 | return (self.track_top_left[0] + col*self.cell_size[0], self.track_top_left[1] + row*self.cell_size[1])
260 |
261 | def draw_cell(self, cell, col, row):
262 | if cell == RaceTrack.OOB:
263 | color = RaceTrackGame.OOB_COLOR
264 | elif cell == RaceTrack.FINISH:
265 | color = RaceTrackGame.FINISH_COLOR
266 | elif cell == RaceTrack.TRACK:
267 | color = RaceTrackGame.TRACK_COLOR
268 | elif cell == RaceTrack.START:
269 | color = RaceTrackGame.START_COLOR
270 | elif cell == RaceTrack.CAR:
271 | color = RaceTrackGame.CAR_COLOR
272 | else:
273 | raise ValueError('Unknown cell type')
274 |
275 | draw_position = self.get_track_pixel_pos(col, row)
276 |
277 | pygame.draw.rect(self.screen, color, (draw_position[0], draw_position[1], self.cell_size[0] - self.CELL_BORDER, self.cell_size[1] - self.CELL_BORDER))
278 |
279 | def event_loop(self):
280 | for event in pygame.event.get():
281 | self.keys = pygame.key.get_pressed()
282 | if event.type == pygame.QUIT or self.keys[pygame.K_ESCAPE]:
283 | self.done = True
284 | self.update_current_action()
285 | if self.keys[pygame.K_RETURN]:
286 | a = self.racetrack.action_to_id(self.current_action)
287 | s = self.racetrack.state_to_id(self.current_state)
288 | (r, s, finished) = self.racetrack.perform_action(s, a)
289 | self.current_score += r
290 | self.current_state = self.racetrack.id_to_state(s)
291 | if finished:
292 | print('Finished!!')
293 | print(f'You scored: {self.current_score}')
294 | self.current_score = 0
295 |
296 | def bot_loop(self, bot, episodes, timestep):
297 | for episode in range(episodes):
298 | state = self.racetrack.starting_line_state()
299 | s = self.racetrack.state_to_id(state)
300 | done = False
301 | steps = 0
302 | while not done:
303 | steps += 1
304 | a = bot.get_action(s)
305 | self.draw(self.racetrack.id_to_state(s), self.racetrack.id_to_action(a))
306 | pygame.display.flip()
307 | (r, s, done) = self.racetrack.perform_action(s, a)
308 | time.sleep(timestep)
309 | print(f'Finished in {steps} steps!')
310 |
311 |
312 | def main_loop(self):
313 | while not self.done:
314 | self.event_loop()
315 | self.draw(self.current_state, self.current_action)
316 | pygame.display.flip()
317 |
318 | @staticmethod
319 | def init():
320 | os.environ['SDL_VIDEO_CENTERED'] = '1'
321 | pygame.init()
322 | pygame.display.set_caption(RaceTrackGame.CAPTION)
323 | pygame.display.set_mode(RaceTrackGame.SCREEN_SIZE)
324 |
325 | @staticmethod
326 | def quit():
327 | pygame.quit()
328 | sys.exit()
329 |
330 | @staticmethod
331 | def bot_run(racetrack_file, policy_file, episodes=10, timestep=1):
332 | RaceTrackGame.init()
333 | policy = np.load(policy_file)
334 | bot = RacerBot(policy)
335 | game = RaceTrackGame(racetrack_file)
336 | game.bot_loop(bot, episodes, timestep)
337 | RaceTrackGame.quit()
338 |
339 | @staticmethod
340 | def run(racetrack_file):
341 | RaceTrackGame.init()
342 | game = RaceTrackGame(racetrack_file)
343 | game.main_loop()
344 | RaceTrackGame.quit()
--------------------------------------------------------------------------------
/environments/racing/run_trained_racetrack_bot.py:
--------------------------------------------------------------------------------
1 | from environments.racing.racing import RaceTrackGame
2 | import argparse
3 |
4 |
5 | parser = argparse.ArgumentParser(description='Plays the racetrack game with the specified policy.')
6 |
7 | parser.add_argument('racetrack',
8 | type=str,
9 | help='Path to racetrack csv file')
10 | parser.add_argument('policy',
11 | type=str,
12 | help='Path to serialized policy file')
13 | parser.add_argument('--timestep',
14 | type=float,
15 | help='Length of timesteps (s)',
16 | default=0.1)
17 | parser.add_argument('--episodes',
18 | type=int,
19 |                     help='Number of episodes to run',
20 | default=10)
21 | parser.add_argument('--verbose',
22 |                     action='store_true',
23 |                     help='Print (a lot of) log messages',
24 |                     default=False)
25 | args = parser.parse_args()
26 |
27 | RaceTrackGame.bot_run(args.racetrack, args.policy, episodes=args.episodes, timestep=args.timestep)
28 |
--------------------------------------------------------------------------------
/environments/racing/trained_policies/mc_learning.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/trained_policies/mc_learning.npy
--------------------------------------------------------------------------------
/environments/racing/trained_policies/q_learning.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/trained_policies/q_learning.npy
--------------------------------------------------------------------------------
/environments/racing/trained_policies/random.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/trained_policies/random.npy
--------------------------------------------------------------------------------
/environments/racing/trained_policies/sarsa.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/trained_policies/sarsa.npy
--------------------------------------------------------------------------------
/lib/policy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def sample_action(policy, state):
5 | """
6 |     Samples an action from the policy given the current state.
7 | """
8 | choices = np.arange(0, policy.shape[1])
9 | probabilities = policy[state]
10 |
11 | return np.random.choice(choices, p=probabilities)
12 |
13 |
14 | def get_epsilon_greedy_policy(Q, epsilon):
15 | num_actions = Q.shape[1]
16 | policy = (epsilon/num_actions) * np.ones(Q.shape)
17 |
18 | greedy_action_indices = np.argmax(Q, axis=1)
19 | policy[np.arange(0, Q.shape[0]), greedy_action_indices] += (1 - epsilon)
20 |
21 | return policy
22 |
23 |
24 | def get_greedy_policy(Q):
25 | return np.argmax(Q, axis=1)
--------------------------------------------------------------------------------
/monte_carlo/README.md:
--------------------------------------------------------------------------------
1 | # Monte Carlo Methods
2 |
3 | In this chapter, we learn about Monte Carlo methods for learning the optimal behavior policy for
4 | finite MDPs. This is similar to what we did in the last chapter, except that here we do not assume any
5 | knowledge of the inner workings, or model, of the MDP. For dynamic programming methods, we needed
6 | to know the state-transition probabilities and the associated rewards in order to
7 | learn the optimal policy. Here, we learn the policy from experience alone.
8 |
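To make that concrete, here is a minimal sketch (not the repo's exact code) of the core Monte Carlo idea: estimate the value of a state by averaging the returns observed after visiting it, using nothing but sampled episodes. The `run_episode` helper here is hypothetical; `monte_carlo/mc.py` implements the first-visit version of this in `fv_policy_evaluation`.

```python
import numpy as np

def mc_evaluate(run_episode, num_states, episodes=10000):
    """Estimate state values by averaging sampled returns.

    `run_episode` is a hypothetical callable that plays one episode under the
    policy being evaluated and yields (state, return) pairs.
    """
    V = np.zeros(num_states)
    N = np.zeros(num_states)
    for _ in range(episodes):
        for state, g in run_episode():
            N[state] += 1
            V[state] += (g - V[state]) / N[state]  # incremental average of returns
    return V
```
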
9 | ## Blackjack: Policy Evaluation
10 |
11 | Here, we test out Monte Carlo policy evaluation on a Blackjack environment. We are evaluating the policy
12 | which stays only on 20 or 21 and hits on everything else. Below, you can see my results for running policy evaluation
13 | on this policy, which reproduces Figure 5.2 from the textbook.
14 |
15 | For 10,000 episodes:
16 |
17 | 
18 | 
19 |
20 | For 500,000 episodes:
21 |
22 | 
23 | 
24 |
25 | As you can see, using more episodes gives you a better, less noisy picture of the value function.
26 |
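The figures above were generated with `monte_carlo/exercises/mc_blackjack.py`, which boils down to roughly the following (episode count shown for the 500,000-episode run):

```python
from environments.blackjack.blackjack import Blackjack, BlackjackPlotter
from environments.blackjack.blackjack_policies import BlackjackPolicy
from monte_carlo import mc

blackjack = Blackjack(verbose=False)

# The policy under evaluation: stay only on 20 or 21, hit on everything else.
policy = BlackjackPolicy.generate_policy(stay_on=[20, 21])

# First-visit Monte Carlo policy evaluation, then plot the value function
# for the usable-ace and no-usable-ace cases.
value = mc.fv_policy_evaluation(blackjack, policy, episodes=500000)
BlackjackPlotter.plot_value_functions(value)
```
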
27 | ## Blackjack: Monte Carlo Control
28 |
29 | Here, we use a Monte Carlo method to learn the optimal policy. We use the pattern of generalized policy iteration
30 | to do so. Basically, this means we use Monte Carlo simulation to evaluate an arbitrary policy, improve that policy
31 | by being greedy with respect to our evaluation, evaluate that new policy, improve that policy by being greedy with
32 | respect to that evaluation, and so on until the policy stops changing (that means we have reached the optimal policy).
33 |
34 | 
35 |
36 | 
37 |
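The policies shown above come from `monte_carlo/exercises/blackjack_policy_improvement.py`, which is essentially the following (the iteration count is that script's default and takes a while to run):

```python
from environments.blackjack.blackjack import Blackjack, BlackjackPlotter
from monte_carlo import mc

blackjack = Blackjack(verbose=False)

# Generalized policy iteration: Monte Carlo evaluation of the current policy,
# followed by greedy improvement with respect to the action-value estimates.
optimal_policy, Q = mc.det_policy_improvement(blackjack, iterations=5000000)
BlackjackPlotter.plot_policies(optimal_policy)
```
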
38 | ### Exercise 5.4: Racetrack Problem
39 |
40 | For this problem, I used on-policy, first-visit, epsilon-soft Monte Carlo control to learn a policy for
41 | driving a car around a racetrack environment. The exact details of this problem are given in the text. Below,
42 | you can see how an agent behaves before and after training with this control method.
43 |
44 | Before:
45 |
46 | 
47 |
48 | As you can see, this bot crashes into the walls a lot and takes a long time to make it to the target (the blue line).
49 |
50 | After:
51 |
52 | 
53 |
54 | This bot has clearly learned some things about the environment. While it is still not behaving optimally, it is
55 | performing much better than the untrained bot.
56 |
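The trained policy above was produced with `monte_carlo/exercises/mc_racetrack.py`; stripped of its argument parsing, it looks roughly like this (the file paths here are just examples):

```python
import numpy as np

from environments.racing.racing import RaceTrack
from monte_carlo import mc

racetrack = RaceTrack('environments/racing/racetracks/racetrack_a.csv')

# On-policy, first-visit, epsilon-soft Monte Carlo control.
policy, Q = mc.on_policy_fv_mc_e_soft_control(
    racetrack,
    epsilon_func=lambda ep, eps: 1 - (ep / eps),  # anneal exploration towards greedy
    alpha_func=lambda n: 0.1,                     # constant step size
    episodes=1000,
)

# Save the learned policy so the pygame bot can replay it.
np.save('environments/racing/trained_policies/mc_learning.npy', policy)
```
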
57 | #### Sources:
58 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: an Introduction. 2nd ed., The MIT Press, 2012
--------------------------------------------------------------------------------
/monte_carlo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/__init__.py
--------------------------------------------------------------------------------
/monte_carlo/exercises/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/exercises/__init__.py
--------------------------------------------------------------------------------
/monte_carlo/exercises/blackjack_policy_improvement.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from environments.blackjack.blackjack import Blackjack, BlackjackStates, BlackjackPlotter
4 | from monte_carlo import mc
5 |
6 | parser = argparse.ArgumentParser(description='Blackjack Monte Carlo Policy Improvement')
7 |
8 | parser.add_argument('--iterations',
9 | type=int,
10 | help='Number of iterations to run',
11 | default=5000000)
12 | parser.add_argument('--verbose',
13 |                     action='store_true',
14 |                     help='Print (a lot of) log messages',
15 |                     default=False)
16 | args = parser.parse_args()
17 |
18 |
19 | blackjack = Blackjack(verbose=args.verbose)
20 | optimal_policy, Q = mc.det_policy_improvement(blackjack, iterations=args.iterations)
21 |
22 | if args.verbose:
23 | for state_id in range(optimal_policy.shape[0]):
24 | print('--------------------------------')
25 | BlackjackStates.print_state(state_id)
26 | if (optimal_policy[state_id] == Blackjack.HIT_ACTION):
27 | print('HIT')
28 | else:
29 | print('STAY')
30 |
31 | BlackjackPlotter.plot_policies(optimal_policy)
32 |
--------------------------------------------------------------------------------
/monte_carlo/exercises/blackjack_soft_policy_improvement.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import lib.policy
4 | from environments.blackjack.blackjack import Blackjack, BlackjackStates, BlackjackPlotter
5 | from monte_carlo import mc
6 |
7 | parser = argparse.ArgumentParser(description='Blackjack Monte Carlo Policy Soft Policy Improvement')
8 |
9 | parser.add_argument('--iterations',
10 | type=int,
11 | help='Number of iterations to run',
12 | default=1000000)
13 | parser.add_argument('--verbose',
14 |                     action='store_true',
15 |                     help='Print (a lot of) log messages',
16 |                     default=False)
17 | args = parser.parse_args()
18 |
19 |
20 | blackjack = Blackjack(verbose=args.verbose)
21 | soft_optimal_policy, Q = mc.on_policy_fv_mc_e_soft_control(
22 | blackjack,
23 | epsilon_func=lambda ep, eps: 0.0,
24 | alpha_func=lambda n: 1/n,
25 | episodes=args.iterations,
26 | random_start=True
27 | )
28 |
29 | optimal_policy = lib.policy.get_greedy_policy(Q)
30 |
31 | if args.verbose:
32 | for state_id in range(optimal_policy.shape[0]):
33 | print('--------------------------------')
34 | BlackjackStates.print_state(state_id)
35 | if (optimal_policy[state_id] == Blackjack.HIT_ACTION):
36 | print('HIT')
37 | else:
38 | print('STAY')
39 |
40 | BlackjackPlotter.plot_policies(optimal_policy)
--------------------------------------------------------------------------------
/monte_carlo/exercises/mc_blackjack.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from environments.blackjack.blackjack import Blackjack, BlackjackPlotter
4 | from environments.blackjack.blackjack_policies import BlackjackPolicy
5 | from monte_carlo import mc
6 |
7 | parser = argparse.ArgumentParser(description='Blackjack Monte Carlo Policy Evaluation')
8 |
9 | parser.add_argument('--episodes',
10 | type=int,
11 | help='Number of episodes to train over',
12 | default=10000)
13 | parser.add_argument('--verbose',
14 |                     action='store_true',
15 |                     help='Print (a lot of) log messages',
16 |                     default=False)
17 | args = parser.parse_args()
18 |
19 |
20 | blackjack = Blackjack(verbose=args.verbose)
21 | policy = BlackjackPolicy.generate_policy(stay_on=[20, 21])
22 |
23 | value = mc.fv_policy_evaluation(blackjack, policy, episodes=args.episodes)
24 | BlackjackPlotter.plot_value_functions(value)
25 |
--------------------------------------------------------------------------------
/monte_carlo/exercises/mc_racetrack.py:
--------------------------------------------------------------------------------
1 | from environments.racing.racing import RaceTrack
2 | from monte_carlo import mc
3 | import numpy as np
4 | import argparse
5 |
6 |
7 | parser = argparse.ArgumentParser(description='Monte Carlo Racetrack Policy Improvement')
8 |
9 | parser.add_argument('racetrack',
10 | type=str,
11 | help='Path to racetrack csv file')
12 | parser.add_argument('policy',
13 | type=str,
14 | help='Path at which to save policy file')
15 | parser.add_argument('--episodes',
16 | type=int,
17 | help='Number of episodes to train over',
18 | default=1000)
19 | parser.add_argument('--verbose',
20 |                     action='store_true',
21 |                     help='Print (a lot of) log messages',
22 |                     default=False)
23 | args = parser.parse_args()
24 |
25 |
26 | racetrack = RaceTrack(args.racetrack)
27 | policy, Q = mc.on_policy_fv_mc_e_soft_control(
28 | racetrack,
29 | epsilon_func=lambda ep, eps: 1 - (ep/eps),
30 | alpha_func=lambda n: 0.1,
31 | episodes=args.episodes
32 | )
33 |
34 | np.save(args.policy, policy)
35 |
--------------------------------------------------------------------------------
/monte_carlo/mc.py:
--------------------------------------------------------------------------------
1 | """
2 | Monte Carlo methods
3 |
4 | An environment is assumed to support the following operations:
5 | environment.num_states(): Returns the number of states in the environment
6 | environment.num_actions(): Returns the number of actions in the environment
7 | environment.get_random_state(): Returns a random state
8 |     environment.perform_action(s, a): Returns the reward, the next state, and whether the episode ended (r, s', done)
9 | environment.is_terminal(s): Returns whether a state is terminal or not
10 |
11 | A deterministic policy is an environment.num_states x 1 array
12 | A non-deterministic policy is an environment.num_states x environment.num_actions array
13 | """
14 | import numpy as np
15 | from tqdm import tqdm
16 |
17 | from lib.policy import sample_action, get_greedy_policy
18 |
19 |
20 | def det_policy_improvement(environment, iterations=100000):
21 | policy = np.zeros(environment.num_states(), dtype=int)
22 | Q = np.zeros((environment.num_states(), environment.num_actions()))
23 | N = np.zeros((environment.num_states(), environment.num_actions()))
24 |
25 | for i in tqdm(range(iterations)):
26 |
27 | states_seen = one_episode_state_action_values(environment, lambda s: policy[s], random_start=True)
28 |
29 | for state, actions_performed in states_seen.items():
30 | for action, gain in actions_performed.items():
31 | N[state, action] = N[state, action] + 1
32 | Q[state, action] = Q[state, action] + (1.0/(N[state, action]))*(gain - Q[state, action])
33 |
34 | policy = get_greedy_policy(Q)
35 |
36 | return policy, Q
37 |
38 |
39 | def one_episode_state_action_values(environment, policy, random_start=True):
40 | s = environment.get_starting_state()
41 | states_seen = {}
42 | first_action = True
43 | episode_over = False
44 | steps_taken = 0
45 | while not episode_over:
46 | # If this is the first time we've seen this state
47 | if states_seen.get(s, None) is None:
48 | states_seen[s] = {}
49 |
50 | if first_action and random_start:
51 | a = np.random.randint(0, environment.num_actions())
52 | first_action = False
53 | else:
54 | # Perform our action
55 | a = policy(s)
56 |
57 | # If this is the first time we've performed this action
58 | # in this state
59 | if states_seen[s].get(a, None) is None:
60 | states_seen[s][a] = 0
61 |
62 | (r, s_prime, episode_over) = environment.perform_action(s, a)
63 |
64 | # Update our gain counters
65 | states_seen = \
66 | {
67 | state: {action: gain + r for action, gain in actions_performed.items()}
68 | for state, actions_performed
69 | in states_seen.items()
70 | }
71 |
72 | steps_taken += 1
73 |
74 | # Update current state
75 | s = s_prime
76 |
77 | print(f'{steps_taken}')
78 |
79 | return states_seen
80 |
81 |
82 | def on_policy_fv_mc_e_soft_control(
83 | environment,
84 | epsilon_func=lambda ep, eps: 0.1,
85 | alpha_func=lambda n: 0.1,
86 | episodes=10000,
87 | random_start=False
88 | ):
89 | # Initialize with uniform random policy
90 |
91 | policy = (1/environment.num_actions()) * np.ones((environment.num_states(), environment.num_actions()))
92 |
93 | Q = np.zeros((environment.num_states(), environment.num_actions()))
94 | N = np.zeros((environment.num_states(), environment.num_actions()))
95 |
96 | for episode in range(episodes):
97 | states_seen = one_episode_state_action_values(environment, lambda s: sample_action(policy, s), random_start=random_start)
98 | for state, actions_performed in states_seen.items():
99 | for action, gain in actions_performed.items():
100 | N[state, action] = N[state, action] + 1
101 | Q[state, action] = Q[state, action] + alpha_func(N[state, action])*(gain - Q[state, action])
102 | epsilon = epsilon_func(episode, episodes)
103 | num_actions = Q.shape[1]
104 | policy[state] = (epsilon/num_actions)
105 | policy[state, np.argmax(Q[state])] += 1 - epsilon
106 |
107 | return policy, Q
108 |
109 |
110 | def det_fv_policy_q_evaluation(environment, policy, episodes=10000):
111 | """
112 | First visit MC action-value deterministic policy evaluation with exploring starts.
113 |
114 | Returns the action-value function.
115 | """
116 | Q = np.zeros((environment.num_states(), environment.num_actions()))
117 | N = np.zeros((environment.num_states(), environment.num_actions()))
118 |
119 | for episode in tqdm(range(episodes)):
120 | states_seen = one_episode_state_action_values(environment, lambda s: policy[s], random_start=True)
121 | for state, actions_performed in states_seen.items():
122 | for action, gain in actions_performed.items():
123 | N[state, action] = N[state, action] + 1
124 | Q[state, action] = Q[state, action] + (1.0/(N[state, action]))*(gain - Q[state, action])
125 |
126 | return Q
127 |
128 |
129 | def fv_policy_evaluation(environment, policy, episodes=10000):
130 | """
131 | First visit MC policy evaluation.
132 |
133 | Returns the state-value function.
134 | """
135 | V = np.zeros(environment.num_states())
136 | N = np.zeros(environment.num_states())
137 |
138 | for episode in tqdm(range(episodes)):
139 | s = environment.get_random_state()
140 | states_seen = {}
141 | episode_over = False
142 | while not episode_over:
143 | # If this is the first time we've seen this state
144 | if states_seen.get(s, None) is None:
145 | states_seen[s] = 0
146 |
147 | # Perform our action
148 | a = policy[s]
149 | (r, s_prime, episode_over) = environment.perform_action(s, a)
150 |
151 | # Update our gain counters
152 | states_seen = {state: gain + r for state, gain in states_seen.items()}
153 |
154 | # Update current state
155 | s = s_prime
156 | for state, gain in states_seen.items():
157 | N[state] = N[state] + 1
158 | V[state] = V[state] + (1.0/(N[state]))*(gain - V[state])
159 |
160 | return V
--------------------------------------------------------------------------------
/monte_carlo/results/ace_optimal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/ace_optimal.png
--------------------------------------------------------------------------------
/monte_carlo/results/ace_policy_evaluation_10000.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/ace_policy_evaluation_10000.png
--------------------------------------------------------------------------------
/monte_carlo/results/ace_policy_evaluation_500000.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/ace_policy_evaluation_500000.png
--------------------------------------------------------------------------------
/monte_carlo/results/no_ace_optimal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/no_ace_optimal.png
--------------------------------------------------------------------------------
/monte_carlo/results/no_ace_policy_evaluation_10000.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/no_ace_policy_evaluation_10000.png
--------------------------------------------------------------------------------
/monte_carlo/results/no_ace_policy_evaluation_500000.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/no_ace_policy_evaluation_500000.png
--------------------------------------------------------------------------------
/monte_carlo/results/trained_bot_racing.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/trained_bot_racing.gif
--------------------------------------------------------------------------------
/monte_carlo/results/untrained_bot_racing.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/untrained_bot_racing.gif
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cycler==0.10.0
2 | matplotlib==2.1.1
3 | numpy==1.13.3
4 | pygame==1.9.3
5 | pyparsing==2.2.0
6 | python-dateutil==2.6.1
7 | pytz==2017.3
8 | six==1.11.0
9 | tqdm==4.19.5
10 |
--------------------------------------------------------------------------------
/rl_problem/README.md:
--------------------------------------------------------------------------------
1 | # The Reinforcement Learning Problem
2 |
3 | In this chapter, we learn about the full reinforcement learning problem.
4 | The problem consists of an environment and an agent. We have control over
5 | the agent and are responsible for choosing which **actions** the agent takes.
6 | The environment is outside of the agent and thus, we have no control over it
7 | in general. The agent and environment interact in a simple way. At every time
8 | step, the agent performs some action, and the environment responds with
9 | the next **state** and an immediate **reward**.
10 |
11 |
12 | #### Exercise 3.1
13 | The first exercise is to come up with three example tasks that we can fit into
14 | the reinforcement learning framework. Here are mine:
15 |
16 | 1. A program that plays blackjack. The state is made up of the cards
17 | that it can see on the table. The possible actions are hit or stay. The rewards
18 | would simply be +1 if the hand is won, -1 if the hand is lost, and 0 for any
19 | action that does not cause the hand to end.
20 | 2. A traffic light controller. The reward is the number of cars it is
21 | allowing to pass through, so that it is encouraged to promote effective traffic flow. The state
22 | is the readings from distance sensors on each side of the intersection, which
23 | tell the controller how far away the nearest car is on each side. The controller can make each of its
24 | four sides one of three colors, so there are 3^4 possible actions.
25 | 3. A piano-playing program. The action in this case is very simple: which keys
26 | do we press and which do we release? The state is the keys that have already been played or are
27 | currently pressed. The reward could be supplied by human listeners as a numerical
28 | representation of how much they are currently enjoying the music.
29 |
30 | ### Gridworld
31 |
32 | A very simple example of the reinforcement learning problem is gridworld.
33 |
34 | 
35 |
36 | The states are all of the cells on the grid. The possible actions that
37 | we can take are up, down, left, and right. The rules of the environment are (see the sketch after this list):
38 | - If we try to make a move that would take us off the grid, we get a reward
39 | of -1
40 | - If we are on the A square, however, every move takes us to A' and results in
41 | a reward of 10
42 | - If we are on the B square, every move takes us to B' and results in a reward of
43 | 5
44 | - Every other move results in a reward of 0 and takes you to the square you would
45 | expect
46 |
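Here is a small sketch of these dynamics, using the A/A' and B/B' coordinates from `rl_problem/gridworld.py` and assuming (as in the textbook example) that a move off the grid leaves the agent where it is:

```python
# One gridworld step on the 5x5 grid. States are (row, col) tuples and
# moves are (d_row, d_col) offsets; returns (next_state, reward).
def step(state, move, size=5):
    A, A_prime = (0, 1), (4, 1)
    B, B_prime = (0, 3), (2, 3)
    if state == A:
        return A_prime, 10   # every move from A jumps to A'
    if state == B:
        return B_prime, 5    # every move from B jumps to B'
    row, col = state[0] + move[0], state[1] + move[1]
    if not (0 <= row < size and 0 <= col < size):
        return state, -1     # bumped into the edge: stay put, reward -1
    return (row, col), 0

print(step((0, 1), (1, 0)))   # ((4, 1), 10)
print(step((0, 0), (-1, 0)))  # ((0, 0), -1)
```
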
47 | What is the optimal way to act in this environment? In other words, how
48 | do we act so that our "long-term reward" is maximized?
49 |
50 | #### Discounting
51 |
52 | One tricky thing about maximizing "long-term reward" is that this little game could
53 | potentially go on forever. To keep the problem mathematically tractable,
54 | we use a strategy called discounting. This basically just means we weight
55 | future rewards in an exponentially decreasing way.
56 |
57 | γ⁰R₀ + γ¹R₁ + γ²R₂ + ...
58 | (where 0 ≤ γ < 1)
59 |
60 | We call the discounted sum of future rewards the "return", which is essentially a
61 | representation of expected long-term reward.
62 |
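As a quick worked example, the discounted return of a finite reward sequence can be computed like this (γ = 0.9 is just an illustrative value):

```python
def discounted_return(rewards, gamma=0.9):
    """Sum of gamma**t * R_t over a reward sequence."""
    return sum((gamma ** t) * r for t, r in enumerate(rewards))

# Three +1 rewards are worth less than 3 once discounted:
print(discounted_return([1, 1, 1], gamma=0.9))  # 1 + 0.9 + 0.81 = 2.71
```
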
63 | #### Policies
64 |
65 | So we need to decide how to act in this environment. A policy specifies
66 | how an agent acts. For example, we could have a random policy, where the agent
67 | moves in a random direction every time step, or we could have an "always down" policy
68 | where the agent always moves down. In general, a policy is just a probability
69 | distribution which tells us the probability of taking each action depending on
70 | which state we are in.
71 |
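Concretely, this repo's `GridWorld.get_uniform_policy()` represents a policy as a `(4, 5, 5)` array of action probabilities, one entry per (action, row, column); the uniform random policy puts 0.25 on each of the four actions in every cell:

```python
import numpy as np

uniform_policy = np.full((4, 5, 5), 0.25)

# Each cell's action probabilities sum to 1.
assert np.allclose(uniform_policy.sum(axis=0), 1.0)
```
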
72 | #### Value Functions
73 |
74 | A value function describes the "value" of a particular state or action.
75 | The "value" of a state is basically the expected future reward from the state,
76 | which we call the "return" as was mentioned above. This value function depends
77 | upon a policy because in order to know how much reward we can expect from a
78 | particular state, we need to know how we are going to act.
79 |
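Because gridworld is small and fully known, the value function for a fixed policy can be computed exactly: the values satisfy V = R + γPV (R being the expected immediate rewards under the policy, P the resulting state-to-state transition probabilities), which is just a linear system. A minimal sketch, mirroring what `GridWorld.get_value_function` in `rl_problem/gridworld.py` does:

```python
import numpy as np

def solve_value_function(R, P, gamma=0.9):
    """Solve V = R + gamma * P * V for V, given expected rewards R (n,)
    and transition probabilities P (n, n) under a fixed policy."""
    n = len(R)
    return np.linalg.solve(np.identity(n) - gamma * P, R)
```
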
80 | #### Uniform Policy Value Function
81 |
82 | Here is the value function for the uniform random policy (where we choose
83 | an action randomly from every state).
84 |
85 | 
86 |
87 | #### Optimal Value Function
88 |
89 | What we are really interested in, though, is the optimal policy: the policy
90 | that gives us the most possible return from any given state. The optimal
91 | value function gives us the most possible return from any given state, so from
92 | that, we can derive the optimal policy (by acting greedily with respect to the values). Here is the optimal value function
93 | for gridworld, solved using value iteration.
94 |
95 | 
96 |
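Value iteration repeatedly backs up, for every state, the best expected one-step return until the values stop changing. A bare-bones sketch for a generic tabular problem (the repo's version is `GridWorld.get_optimal_value_function` in `rl_problem/gridworld.py`; the array shapes below are assumptions for the illustration):

```python
import numpy as np

def value_iteration(rewards, transitions, gamma=0.9, tol=0.01):
    """rewards[a, s]: expected immediate reward for action a in state s.
    transitions[a, s, s2]: probability of landing in s2 after a in s."""
    num_actions, num_states = rewards.shape
    V = np.zeros(num_states)
    while True:
        # Back up the best action's expected return for every state at once
        new_V = np.max(rewards + gamma * (transitions @ V), axis=0)
        if np.sum(np.abs(new_V - V)) < tol:
            return new_V
        V = new_V
```
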
97 | #### Sources:
98 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: an Introduction. 2nd ed., The MIT Press, 2012.
--------------------------------------------------------------------------------
/rl_problem/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/rl_problem/__init__.py
--------------------------------------------------------------------------------
/rl_problem/exercises/ex_3_17.py:
--------------------------------------------------------------------------------
1 | from rl_problem.gridworld import GridWorld
2 |
3 | g = GridWorld()
4 |
5 | optimal_value_function = g.get_optimal_value_function()
6 | print(optimal_value_function.reshape(5, 5))
7 |
--------------------------------------------------------------------------------
/rl_problem/exercises/gridworld_uniform_policy.py:
--------------------------------------------------------------------------------
1 | from rl_problem.gridworld import GridWorld
2 |
3 | # Using uniform policy
4 | g = GridWorld()
5 | uniform_policy = g.get_uniform_policy()
6 | value_func = g.get_value_function(uniform_policy).reshape(5, 5)
7 | print(value_func)
8 |
9 |
--------------------------------------------------------------------------------
/rl_problem/gridworld.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from itertools import product
3 |
4 |
5 | class GridWorld:
6 |
7 | UP = 0
8 | RIGHT = 1
9 | DOWN = 2
10 | LEFT = 3
11 |
12 | def __init__(self, size=5):
13 | self.size = size
14 | self.action_space = [self.UP, self.RIGHT, self.DOWN, self.LEFT]
15 | self.A = (0, 1)
16 | self.A_prime = (4, 1)
17 | self.B = (0, 3)
18 | self.B_prime = (2, 3)
19 | self._rewards = self._init_rewards()
20 | self._transitions = self._init_state_transitions()
21 |
22 | def get_uniform_policy(self):
23 | policy = np.zeros((4, self.size, self.size))
24 | policy[:, :, :] = 0.25
25 | return policy
26 |
27 | def get_expected_rewards(self, policy):
28 | expected_rewards = policy * self._rewards
29 | return expected_rewards.sum(axis=0)
30 |
31 | def _init_rewards(self):
32 | rewards = np.zeros((4, self.size, self.size))
33 | rewards[[self.UP, self.DOWN], [0, self.size - 1], :] = -1
34 | rewards[[self.LEFT, self.RIGHT], :, [0, self.size - 1]] = -1
35 | # Special A location
36 | rewards[:, self.A[0], self.A[1]] = 10
37 | # Special B location
38 | rewards[:, self.B[0], self.B[1]] = 5
39 | return rewards
40 |
41 | def _init_state_transitions(self):
42 | state_transitions = np.zeros((4, self.size, self.size, self.size, self.size))
43 | # Normal cases
44 | for row in range(self.size):
45 | for col in range(self.size):
46 | if row != 0:
47 | state_transitions[self.UP, row, col, row-1, col] = 1
48 | if row != self.size - 1:
49 | state_transitions[self.DOWN, row, col, row+1, col] = 1
50 | if col != 0:
51 | state_transitions[self.LEFT, row, col, row, col-1] = 1
52 | if col != self.size - 1:
53 | state_transitions[self.RIGHT, row, col, row, col+1] = 1
54 |
55 | # Handle edges
56 | for col in range(self.size):
57 |             # Moving up or down in top or bottom row leaves you in same state
58 | state_transitions[[self.UP, self.DOWN], [0, self.size - 1], col, [0, self.size - 1], col] = 1
59 | for row in range(self.size):
60 | # Moving left or right in leftmost or rightmost column leaves you in same state
61 | state_transitions[[self.LEFT, self.RIGHT], row, [0, self.size - 1], row, [0, self.size - 1]] = 1
62 |
63 | # Handle A and B
64 | state_transitions[:, [self.A[0], self.B[0]], [self.A[1], self.B[1]], :, :] = 0
65 | state_transitions[:, self.A[0], self.A[1], self.A_prime[0], self.A_prime[1]] = 1
66 | state_transitions[:, self.B[0], self.B[1], self.B_prime[0], self.B_prime[1]] = 1
67 |
68 | return state_transitions
69 |
70 | def get_value_function(self, policy, gamma=0.9):
71 | # Solve V = R + gamma*P(s,s')*V
72 | transition_probabilities = self.get_transition_probabilities(policy)
73 | expected_rewards = self.get_expected_rewards(policy).reshape(self.size**2)
74 | right_side_inverse = np.linalg.inv(np.identity(self.size**2) - gamma*transition_probabilities)
75 | return np.matmul(right_side_inverse, expected_rewards)
76 |
77 | def get_transition_probabilities(self, policy):
78 | ret = np.zeros((self.size**2, self.size**2))
79 | for action in self.action_space:
80 | # p(a|s)
81 | action_policy = np.tile(policy[action, :, :].reshape(self.size**2), (self.size**2, 1))
82 | # p(s'|s, a)
83 | state_transitions = self._transitions[action, :, :, :, :].reshape(self.size**2, self.size**2)
84 | ret = np.add(ret, np.multiply(action_policy, state_transitions))
85 | return ret
86 |
87 | def get_optimal_value_function(self, gamma=0.9, convergence=0.01):
88 | ret = np.zeros(self.size**2)
89 | copy = np.copy(ret)
90 | diff = None
91 | while diff is None or diff > convergence:
92 | for row, col in product(range(self.size), range(self.size)):
93 | new_reward = None
94 | for action in self.action_space:
95 | next_state_distribution = self._transitions[action, row, col].reshape(self.size**2)
96 | expected_rewards = np.matmul(next_state_distribution, ret)
97 | test = self._rewards[action, row, col] + gamma*expected_rewards
98 | if new_reward is None or test > new_reward:
99 | new_reward = test
100 | copy[row*self.size + col] = new_reward
101 | diff = np.sum(np.fabs(np.subtract(ret, copy)))
102 | ret = copy
103 | copy = np.copy(ret)
104 | return ret
105 |
--------------------------------------------------------------------------------
/rl_problem/results/gridworld.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/rl_problem/results/gridworld.png
--------------------------------------------------------------------------------
/rl_problem/results/optimal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/rl_problem/results/optimal.png
--------------------------------------------------------------------------------
/rl_problem/results/uniform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/rl_problem/results/uniform.png
--------------------------------------------------------------------------------
/runner.py:
--------------------------------------------------------------------------------
1 | import sys, runpy
2 | import os.path
3 |
4 | sys.path.append(os.path.dirname(__file__))
5 |
6 | executable = sys.argv[1]
7 | sys.argv = sys.argv[1:]
8 |
9 | runpy.run_path(executable)
10 |
--------------------------------------------------------------------------------
/td_learning/README.md:
--------------------------------------------------------------------------------
1 | # Temporal-Difference Learning
2 |
3 | In this chapter, we learn about temporal difference (TD) learning. Like Monte Carlo methods, temporal difference
4 | learning methods allow us to learn an optimal policy without a model of the environment. This means that we learn the
5 | optimal policy purely through experience.
6 |
7 | The difference between TD and Monte Carlo is that TD uses a technique called bootstrapping. Basically, Monte Carlo methods
8 | determine the value of a state based on a sample of all of the rewards that follow from it for the rest of the episode.
9 | So if I am in state 1, then go to states 2, 3, 4, etc. up to state 10, and then the episode terminates and I get a reward
10 | of +1 at the end, I update my value estimate for each of those states with that final reward.
11 |
12 | TD learning methods instead use the knowledge that has already been accumulated to update the value estimates of visited
13 | states. So if I am in state 1, then go to state 2, I update my value estimate for state 1 with the immediate reward I
14 | obtained plus my current value estimate for state 2. So I am updating my value estimates based on other estimates: this
15 | is bootstrapping. This is the same idea that Dynamic Programming methods use.
16 |
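For state values, that one-step (TD(0)) update can be written like this (a sketch for illustration; the code in `td_learning/td.py` works with action values instead):

```python
def td0_update(V, s, r, s_next, alpha=0.1, gamma=1.0):
    """Move V[s] toward the bootstrapped target r + gamma * V[s_next]."""
    V[s] += alpha * (r + gamma * V[s_next] - V[s])
```
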
17 | ## SARSA: On-Policy TD Control for the Racetrack Problem
18 |
19 | SARSA is one algorithm for doing control using temporal difference learning. So we have an initial, arbitrary policy.
20 | We start in a certain state **S**, take an action **A**, observe reward **R**, arrive in a new state **S'**, and then
21 | take a new action **A'**. All of these (S, A, R, S', A') are used to update our action-value estimates (the estimates
22 | of the value of each action-state pair). So we were in state S and took action A, which gave us reward R, and caused us
23 | to end up in state S', about to take action A'. So you could say the observed value of (S, A) is the reward we just
24 | received plus the value of (S', A'). So this is the value that we move our value estimate of (S, A) towards. And that's
25 | it for updating our Q (action-value estimates). As for the policy that we follow, we simply behave in an epsilon-greedy
26 | way with respect to our current Q. As we follow this and train, our Q approaches the optimal state-action value function
27 | and our policy approaches the optimal policy.
28 |
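Written out as code, the update looks roughly like this (a sketch; `td.sarsa` in `td_learning/td.py` is the actual implementation, which uses an undiscounted target):

```python
def sarsa_update(Q, s, a, r, s_next, a_next, alpha=0.1, gamma=1.0):
    """Move Q[s, a] toward the observed one-step target r + gamma * Q[s', a']."""
    Q[s, a] += alpha * (r + gamma * Q[s_next, a_next] - Q[s, a])
```
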
29 | Here is my result for applying SARSA control to the racetrack problem:
30 |
31 | 
32 |
33 | ## Q-Learning: Off-Policy TD Control for the Racetrack Problem
34 |
35 | Q-learning uses TD learning techniques but is a bit more clever than SARSA. Q-learning is an "off-policy" learning
36 | technique. This means that the policy that is being learned is not the same as the one that is being followed while
37 | learning. This is different from SARSA which follows a certain policy, improves that same policy, and eventually
38 | returns that policy. The advantage of "off-policy" methods is that they allow your learning agent to explore and
39 | take riskier actions while the policy being learned can be greedy and only choose the actions that it already
40 | knows are good.
41 |
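The only change from SARSA is in the target: Q-learning bootstraps from the best next action rather than the action the epsilon-greedy behavior policy actually takes. A sketch (the actual implementation is `td.q_learning` in `td_learning/td.py`):

```python
import numpy as np

def q_learning_update(Q, s, a, r, s_next, alpha=0.1, gamma=1.0):
    """Move Q[s, a] toward r + gamma * max over a' of Q[s', a'], regardless of
    which action the behavior policy takes next; this is what makes it off-policy."""
    Q[s, a] += alpha * (r + gamma * np.max(Q[s_next]) - Q[s, a])
```
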
42 | Here is my result for applying Q-learning to the racetrack problem:
43 |
44 | 
45 |
46 | #### Sources:
47 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: an Introduction. 2nd ed., The MIT Press, 2012.
--------------------------------------------------------------------------------
/td_learning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/td_learning/__init__.py
--------------------------------------------------------------------------------
/td_learning/exercises/q_learning_racing.py:
--------------------------------------------------------------------------------
1 | from environments.racing.racing import RaceTrack
2 | from td_learning import td
3 | import numpy as np
4 | import argparse
5 |
6 |
7 | parser = argparse.ArgumentParser(description='Q Learning Racetrack')
8 |
9 | parser.add_argument('racetrack',
10 | type=str,
11 | help='Path to racetrack csv file')
12 | parser.add_argument('policy',
13 | type=str,
14 | help='Path at which to save policy file')
15 | parser.add_argument('--convergence',
16 | type=float,
17 | help='Convergence criteria for Q',
18 | default=10000)
19 | parser.add_argument('--verbose',
20 |                     action='store_true',
21 |                     default=False,
22 |                     help='Print (a lot of) log messages')
23 | args = parser.parse_args()
24 |
25 | racetrack = RaceTrack(args.racetrack)
26 | policy, Q = td.q_learning(
27 | racetrack,
28 | alpha_func=lambda n: 1/n,
29 | epsilon=0.2,
30 | convergence=args.convergence
31 | )
32 |
33 | np.save(args.policy, policy)
34 |
--------------------------------------------------------------------------------
/td_learning/exercises/sarsa_racing.py:
--------------------------------------------------------------------------------
1 | from environments.racing.racing import RaceTrack
2 | from td_learning import td
3 | import numpy as np
4 | import argparse
5 |
6 |
7 | parser = argparse.ArgumentParser(description='Sarsa Racetrack Policy Improvement')
8 |
9 | parser.add_argument('racetrack',
10 | type=str,
11 | help='Path to racetrack csv file')
12 | parser.add_argument('policy',
13 | type=str,
14 | help='Path at which to save policy file')
15 | parser.add_argument('--episodes',
16 | type=int,
17 | help='Number of episodes to train over',
18 | default=1000)
19 | parser.add_argument('--verbose',
20 |                     action='store_true',
21 |                     default=False,
22 |                     help='Print (a lot of) log messages')
23 | args = parser.parse_args()
24 |
25 | racetrack = RaceTrack(args.racetrack)
26 | policy, Q = td.sarsa(
27 | racetrack,
28 | alpha_func=lambda n: 1/n,
29 | epsilon_func=lambda ep, eps: 1 - (ep/eps),
30 | episodes=args.episodes
31 | )
32 |
33 | np.save(args.policy, policy)
34 |
--------------------------------------------------------------------------------
/td_learning/results/q_learning_trained_bot.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/td_learning/results/q_learning_trained_bot.gif
--------------------------------------------------------------------------------
/td_learning/results/sarsa_trained_bot.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/td_learning/results/sarsa_trained_bot.gif
--------------------------------------------------------------------------------
/td_learning/td.py:
--------------------------------------------------------------------------------
1 | """
2 | Temporal Difference learning methods
3 |
4 | An environment is assumed to support the following operations:
5 | environment.num_states(): Returns the number of states in the environment
6 | environment.num_actions(): Returns the number of actions in the environment
7 |     environment.get_starting_state(): Returns a starting state for an episode
8 |     environment.perform_action(s, a): Returns a reward, the next state (r, s'), and whether
9 | the episode is over
10 |
11 | A deterministic policy is an environment.num_states x 1 array
12 | A non-deterministic policy is an environment.num_states x environment.num_actions array
13 | """
14 | import numpy as np
15 | from tqdm import tqdm
16 |
17 | from lib.policy import sample_action, get_epsilon_greedy_policy
18 |
19 | def sarsa(
20 | environment,
21 | epsilon_func=lambda ep, eps: 0.1,
22 | alpha_func=lambda n: 0.1,
23 | episodes=10000
24 | ):
25 | Q = np.zeros((environment.num_states(), environment.num_actions()))
26 | N = np.zeros((environment.num_states(), environment.num_actions()))
27 | policy = get_epsilon_greedy_policy(Q, (1.0/environment.num_actions()))
28 | for ep in tqdm(range(episodes)):
29 | episode_over = False
30 | s = environment.get_starting_state()
31 | a = sample_action(policy, s)
32 | while not episode_over:
33 | (r, s_prime, episode_over) = environment.perform_action(s, a)
34 |
35 | N[s, a] = N[s, a] + 1
36 |
37 | policy = get_epsilon_greedy_policy(Q, epsilon_func(ep, episodes))
38 |             a_prime = sample_action(policy, s_prime)
39 |
40 | Q[s, a] = Q[s, a] + alpha_func(N[s, a]) * (r + Q[s_prime, a_prime] - Q[s, a])
41 |
42 | s = s_prime
43 | a = a_prime
44 | return policy, Q
45 |
46 | def q_learning(
47 | environment,
48 | epsilon=0.3,
49 | alpha_func=lambda n: 0.2,
50 | convergence=0.1
51 | ):
52 | Q = np.zeros((environment.num_states(), environment.num_actions()))
53 | N = np.zeros((environment.num_states(), environment.num_actions()))
54 | diff = np.inf
55 | while diff > convergence:
56 | temp = np.copy(Q)
57 | # Perform 10,000 episodes, then check how much q has changed
58 | for ep in tqdm(range(10000)):
59 | episode_over = False
60 | s = environment.get_starting_state()
61 | while not episode_over:
62 | policy = get_epsilon_greedy_policy(Q, epsilon)
63 | a = sample_action(policy, s)
64 |
65 | (r, s_prime, episode_over) = environment.perform_action(s, a)
66 |
67 | N[s, a] = N[s, a] + 1
68 | Q[s, a] = Q[s, a] + alpha_func(N[s, a]) * (r + np.amax(Q[s_prime]) - Q[s, a])
69 |
70 | s = s_prime
71 | diff = np.sum(np.fabs(np.subtract(Q, temp)))
72 | print(f'Diff: {diff}')
73 |
74 | return get_epsilon_greedy_policy(Q, 0.0), Q
75 |
--------------------------------------------------------------------------------