├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── bandit_problems ├── README.md ├── __init__.py ├── agents.py ├── bandits.py ├── exercises │ ├── decreasing_epsilon.py │ ├── ex_2_2_a.py │ ├── ex_2_2_b.py │ └── showdown.py ├── results │ ├── decreasing_epsilon.png │ ├── decreasing_epsilon_optimality.png │ ├── exercise_2_2_a.png │ ├── exercise_2_2_a_optimality.png │ ├── exercise_2_2_b.png │ ├── movingBandit.png │ ├── showdown.png │ ├── showdown_op.png │ ├── softmax.png │ ├── softmax_2.png │ ├── softmax_temps.png │ └── softmax_vs_greedy.png └── test_bed.py ├── dynamic_programming ├── README.md ├── __init__.py ├── car_rentals.py ├── exercises │ ├── car_rental_exercise.py │ ├── ex_4_5.py │ └── ex_4_9.py ├── gamblers.py └── results │ ├── e45_optimal_policy.png │ ├── e45_optimal_value.png │ ├── e45_policy_0.png │ ├── e45_policy_1.png │ ├── e45_policy_2.png │ ├── gambler_optimal_policy.png │ ├── gamblers_value_iteration.png │ ├── jack_optimal_policy.png │ ├── jack_optimal_value.png │ ├── jack_policy_0.png │ ├── jack_policy_1.png │ ├── jack_policy_2.png │ ├── jack_policy_3.png │ ├── jack_policy_4.png │ ├── jack_policy_5.png │ ├── policy_4_9_a.png │ ├── policy_4_9_b.png │ ├── value_4_9_a.png │ └── value_4_9_b.png ├── environments ├── __init__.py ├── blackjack │ ├── __init__.py │ ├── blackjack.py │ ├── blackjack_policies.py │ └── interactive_blackjack.py └── racing │ ├── __init__.py │ ├── interactive_racetrack.py │ ├── racetracks │ └── racetrack_a.csv │ ├── racing.py │ ├── run_trained_racetrack_bot.py │ └── trained_policies │ ├── mc_learning.npy │ ├── q_learning.npy │ ├── random.npy │ └── sarsa.npy ├── lib └── policy.py ├── monte_carlo ├── README.md ├── __init__.py ├── exercises │ ├── __init__.py │ ├── blackjack_policy_improvement.py │ ├── blackjack_soft_policy_improvement.py │ ├── mc_blackjack.py │ └── mc_racetrack.py ├── mc.py └── results │ ├── ace_optimal.png │ ├── ace_policy_evaluation_10000.png │ ├── ace_policy_evaluation_500000.png │ ├── no_ace_optimal.png │ ├── no_ace_policy_evaluation_10000.png │ ├── no_ace_policy_evaluation_500000.png │ ├── trained_bot_racing.gif │ └── untrained_bot_racing.gif ├── requirements.txt ├── rl_problem ├── README.md ├── __init__.py ├── exercises │ ├── ex_3_17.py │ └── gridworld_uniform_policy.py ├── gridworld.py └── results │ ├── gridworld.png │ ├── optimal.png │ └── uniform.png ├── runner.py └── td_learning ├── README.md ├── __init__.py ├── exercises ├── q_learning_racing.py └── sarsa_racing.py ├── results ├── q_learning_trained_bot.gif └── sarsa_trained_bot.gif └── td.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | *.pyc 3 | .idea 4 | venv 5 | scratch 6 | *sublime* 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Nicholas Cellino 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning: An Introduction 2 | 3 | This repository contains (some of the) programming exercises from [Reinforcement Learning: An Introduction (Second Edition)](https://mitpress.mit.edu/books/reinforcement-learning) 4 | by Richard S. Sutton and Andrew G. Barto. Each subdirectory in this project contains an overview of a topic covered 5 | in the book, the results from the exercises, and Python code for the exercises. There are also reproductions of some 6 | of the figures from the book and Python code to go along with them as well. 7 | 8 | This is a work in progress. 9 | 10 | ## Topics 11 | 12 | 1. [Chapter 2 - Bandit Problems](./bandit_problems) 13 | 2. [Chapter 3 - The Reinforcement Learning Problem](./rl_problem) 14 | 3. [Chapter 4 - Dynamic Programming](./dynamic_programming) 15 | 4. [Chapter 5 - Monte Carlo Methods](./monte_carlo) 16 | 5. [Chapter 6 - Temporal-Difference Learning](./td_learning) 17 | 18 | ## Getting Started 19 | This project uses Python 3.6 and [venv](https://docs.python.org/3/library/venv.html) 20 | (Note: This is distinct from [virtualenv](https://virtualenv.pypa.io/en/stable/). There 21 | are some issues using matplotlib on OSX with virtualenv). 22 | Ensure that you have both of these installed on your system. 23 | 24 | Then, in the project directory, create your virtual environment: 25 | ``` 26 | python3.6 -m venv venv 27 | ``` 28 | This creates a folder called `venv` in which we can install Python libraries 29 | like [numpy](http://www.numpy.org/) and [matplotlib](http://matplotlib.org/). 30 | 31 | To tell your system to use this environment instead of the system-wide python environment, run: 32 | ``` 33 | source venv/bin/activate 34 | ``` 35 | You will need to do this anytime you want to run examples. 36 | 37 | 38 | Next, to install the required libraries into the virtual environment, run: 39 | ``` 40 | pip install -r requirements.txt 41 | ``` 42 | 43 | All set! Run exercises by calling runner.py followed by the path to the exercise. For example: 44 | ``` 45 | python runner.py bandit_problems/exercises/ex_2_2_a.py 46 | ``` 47 | 48 | For some of the exercises, you can pass arguments to specify certain things about their execution (for example, number of trials in the case 49 | of the n-armed-bandit problems). 
You can see what these parameters are by passing `-h` like so: 50 | ``` 51 | python runner.py bandit_problems/exercises/ex_2_2_a.py -h 52 | ``` 53 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/__init__.py -------------------------------------------------------------------------------- /bandit_problems/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 2 - Bandit Problems 2 | 3 | In this chapter, we learn about the N-Armed-Bandit problem. Consider this problem: 4 | 5 | There are 10 different slot machines. For each slot machine, you pull a lever and 6 | get a certain reward, maybe 0 tokens, maybe 10, maybe a million. You get 1000 pulls. 7 | Your job is to end up with as many tokens as you can by the end of the 1000 pulls. 8 | What is your strategy? 9 | 10 | If the slot machines are all exactly the same, then it doesn't really matter what you do. 11 | You could use all your pulls on 1 machine or choose randomly for each pull and, on average, 12 | you'll get the same result. But what if the machines are not all the same? What if 13 | some of the machines are better than others? For example, say you tried slot machine 1 for 14 | a few pulls and got the following results: 15 | 16 | 1. 3 tokens 17 | 2. 7 tokens 18 | 3. 6 tokens 19 | 4. 5 tokens 20 | 5. 7 tokens 21 | 6. 4 tokens 22 | 23 | Then you try machine 2 for a few pulls and get the following results: 24 | 25 | 1. 8 tokens 26 | 2. 6 tokens 27 | 3. 9 tokens 28 | 4. 8 tokens 29 | 5. 10 tokens 30 | 6. 7 tokens 31 | 32 | While the rewards are still random, machine 2 seems to be giving better results than machine 1 33 | on average. So we need to come up with a strategy that exploits that information in order to get 34 | the most possible tokens at the end. 35 | 36 | This is the essence of the N-Armed-Bandit problem. How do we come up with a strategy to maximize 37 | our reward? 38 | 39 | ### How we approach the problem 40 | 41 | So we need to figure out what the best slot machine is and choose that one as much as possible. 42 | In order to determine which slot machine is the best one, we need to try all the different 43 | slot machines and see which ones give the best rewards. 44 | 45 | So if we have 1000 pulls, we can try each slot machine 100 times, average the results, 46 | and then we'll have a pretty good estimate of how good each slot machine is, right? 47 | Well yeah, but then we've spent all of our pulls so we can't exploit that information. 48 | So how about we try each machine once, then spend the rest of our pulls on whichever one 49 | gave us the best reward? Well that doesn't really guarantee that we've found the best 50 | machine because we only tried each once. 51 | 52 | So we need to balance exploration (finding which machine is the best) with exploitation 53 | (exploiting our knowledge to get the most possible reward). 54 | 55 | ### Epsilon Greedy Method 56 | 57 | The epsilon greedy method is very simple. Basically, we use the reward from each pull 58 | to maintain an estimate for how good each slot machine is. For some percentage of 59 | our pulls, we pick the slot machine that we estimate to be the best. For the rest of our 60 | pulls, we pick a slot machine randomly. 
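Concretely, a single pull under this strategy can be sketched in a few lines of Python. This is only an illustration of the idea (the agents actually used in these exercises live in `agents.py`), and `pull_arm` here stands in for whatever function returns the reward for a chosen machine:

```python
import numpy as np

def epsilon_greedy_pull(value_estimates, pull_counts, epsilon, pull_arm):
    """One epsilon-greedy step: pick an arm, observe the reward, update our estimate."""
    if np.random.random() < epsilon:
        arm = np.random.randint(len(value_estimates))  # explore: pick a machine at random
    else:
        arm = int(np.argmax(value_estimates))          # exploit: pick our current best estimate
    reward = pull_arm(arm)
    pull_counts[arm] += 1
    # incremental sample-average update of this arm's estimated value
    value_estimates[arm] += (reward - value_estimates[arm]) / pull_counts[arm]
    return arm, reward
```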
61 | 62 | The percentage of pulls that we choose randomly is ε (epsilon). So for example, 63 | ε = 0.1 means we choose randomly 10% of the time and are greedy (choose our best estimate) 64 | 90% of the time. 65 | 66 | Here are some results showing the performance of the epsilon greedy methods. 67 | 68 | ![Epsilon Greedy Methods](./results/exercise_2_2_a.png) 69 | 70 | The values at each pull are averages over 2000 trials. 71 | 72 | We can see that ε=0 does not perform too well. This is because it does not spend any 73 | time exploring. It picks some slot machine as the best and chooses it every time no 74 | matter what. With ε=0.1, we can see that we do a little better. We spend more time exploring 75 | so we are able to get better results, but we plateau because we only ever choose our best 76 | estimate for 90% of pulls. With ε=0.01, we do not learn as fast, but we eventually reach a 77 | higher average reward than ε=0.1 because once we figure out which slot machine is best, 78 | we choose it 99% of the time. 79 | 80 | ![Epsilon Greedy Methods Optimal Choice %](./results/exercise_2_2_a_optimality.png) 81 | 82 | This graph shows the percent of the time that each method has chosen the optimal action 83 | at each pull number. We see that for ε=0, it rarely finds the optimal action, 84 | and it doesn't spend any time exploring. For ε=0.1, it spends 10% of its 85 | time exploring so it learns very fast, but it also plateaus because it will 86 | only exploit its knowledge 90% of the time. 87 | 88 | Maybe we can improve this a little. Maybe we'd want to do a little more exploring 89 | at the beginning of our session and as we get towards the end, be more greedy. We can 90 | do that! 91 | 92 | ![Decreasing Epsilon Methods](./results/decreasing_epsilon.png) 93 | 94 | The different lines here show methods where we decrease epsilon at different 95 | rates. 96 | 97 | ![Decreasing Epsilon Methods Optimal Choice %](./results/decreasing_epsilon_optimality.png) 98 | 99 | ### Softmax Method 100 | 101 | With the epsilon greedy method, we kind of took an all or nothing approach 102 | to exploration and exploitation. Either we were exploring, and we'd choose 103 | our arm totally randomly or we were exploiting and being totally greedy. 104 | Softmax methods, on the other hand, explore all the time but use their estimates 105 | of each arm's value to weight how often they choose that arm. This means that 106 | they will choose the arm they estimate to be the best most often and the arm 107 | they estimate to be the worst least often and every arm in between is weighted 108 | accordingly as well. 109 | 110 | ![Softmax Methods](./results/exercise_2_2_b.png) 111 | 112 | They have a parameter called the "temperature" which essentially says how 113 | much to weigh our estimates. Higher temperatures place less importance on 114 | our estimates and choose actions equi-probably. Lower temperatures place more 115 | importance on our estimates and so choose the actions we estimate to be better 116 | more often. As the temperature approaches 0, we start to be greedy 100% of the 117 | time. Picking the temperature is tricky and seems to be mostly a trial and error 118 | type thing. I am not sure if there is a more scientific way to approach that. 119 | 120 | ### 10-Armed Bandit Showdown 121 | 122 | So which bandit performed the best? 
123 | 124 | ![Softmax vs Epsilon Greedy](./results/showdown.png) 125 | 126 | ![Softmax vs Epsilon Greedy Optimal Choice %](./results/showdown_op.png) 127 | 128 | The quantity we are trying to maximize is total rewards which is represented 129 | graphically as the area under the curve. In this experiment, the strategy 130 | in which we decrease epsilon over time performed the best. One interesting 131 | thing that we can see here is that, although the softmax agent generally chooses the optimal 132 | action less than the epsilon greedy agent, it performs about the same because it chooses 133 | "okay" actions much more than it chooses the really bad actions. 134 | 135 | #### Sources: 136 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: an Introduction. 2nd ed., The MIT Press, 2012. 137 | -------------------------------------------------------------------------------- /bandit_problems/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/__init__.py -------------------------------------------------------------------------------- /bandit_problems/agents.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from numpy.random import random, random_integers, normal 4 | import numpy as np 5 | 6 | class Agent: 7 | 8 | def __init__(self, num_arms): 9 | self._num_arms = num_arms 10 | self._results = np.zeros((self._num_arms, 2)) 11 | self._value_estimates = normal(0, 0.01, size=(self._num_arms)) 12 | 13 | def reset(self): 14 | self._value_estimates = normal(size=(self._num_arms)) 15 | self._results = np.zeros((self._num_arms, 2)) 16 | 17 | def _update_value_estimate(self, reward, arm): 18 | self._results[arm, 0] += reward 19 | self._results[arm, 1] += 1 20 | self._value_estimates[arm] = self._results[arm, 0] / self._results[arm, 1] 21 | 22 | def do_pull(self, bandit): 23 | arm = self._choose_arm() 24 | reward = bandit.pull_arm(arm) 25 | self._update_value_estimate(reward, arm) 26 | return reward, bandit.was_optimal_choice(arm) 27 | 28 | class SoftmaxAgent(Agent): 29 | 30 | def __init__(self, temperature, num_arms): 31 | Agent.__init__(self, num_arms) 32 | self._temperature = temperature 33 | 34 | def _gibbs_distribution(self): 35 | dist = np.exp(self._value_estimates/self._temperature) 36 | return dist / np.sum(dist) 37 | 38 | def _get_sample(self, dist): 39 | cumulative_dist = np.cumsum(dist) 40 | r = random() 41 | for i in range(len(cumulative_dist)): 42 | if r < cumulative_dist[i]: 43 | return i 44 | 45 | def _choose_arm(self): 46 | dist = self._gibbs_distribution() 47 | return self._get_sample(dist) 48 | 49 | def __str__(self): 50 | return f'Softmax Agent (t={self._temperature})' 51 | 52 | 53 | class EpsilonGreedyAgent(Agent): 54 | 55 | def __init__(self, epsilon, num_arms): 56 | Agent.__init__(self, num_arms) 57 | self._starting_epsilon = epsilon 58 | self._epsilon = epsilon 59 | 60 | def reset(self): 61 | self._epsilon = self._starting_epsilon 62 | Agent.reset(self) 63 | 64 | def _choose_arm(self): 65 | if random() < self._epsilon: 66 | return random_integers(0, len(self._results) - 1) 67 | else: 68 | return np.argmax(self._value_estimates) 69 | 70 | def __str__(self): 71 | return f'Epsilon Greedy Agent (ε={self._epsilon})' 72 | 73 | 74 | class FixedAlphaEpsilonGreedyAgent(EpsilonGreedyAgent): 75 | 76 | def __init__(self, epsilon, 
num_arms, alpha=0.1): 77 | EpsilonGreedyAgent.__init__(self, epsilon, num_arms) 78 | self._alpha = alpha 79 | 80 | def _update_value_estimate(self, reward, arm): 81 | self._value_estimates[arm] += self._alpha * (reward - self._value_estimates[arm]) 82 | 83 | def __str__(self): 84 | return f'Fixed Alpha Epsilon Greedy Agent (ε={self._epsilon}, α={self._alpha})' 85 | 86 | 87 | class AdjustableEpsilonGreedyAgent(EpsilonGreedyAgent): 88 | 89 | def __init__(self, num_arms, num_turns): 90 | EpsilonGreedyAgent.__init__(self, 1.0, num_arms) 91 | self._num_turns = num_turns 92 | self._num_pulls = 0 93 | 94 | def reset(self): 95 | self._num_pulls = 0 96 | EpsilonGreedyAgent.reset(self) 97 | 98 | def do_pull(self, bandit): 99 | self._adjust_epsilon() 100 | reward, was_optimal = Agent.do_pull(self, bandit) 101 | self._num_pulls += 1 102 | return reward, was_optimal 103 | 104 | 105 | class ExponentialDecreaseEpsilonGreedyAgent(AdjustableEpsilonGreedyAgent): 106 | 107 | def __init__(self, num_arms, num_turns, decline_rate=1.001): 108 | AdjustableEpsilonGreedyAgent.__init__(self, num_arms, num_turns) 109 | self._decline_rate = decline_rate 110 | 111 | # Calculates and sets the next epsilon value 112 | def _adjust_epsilon(self): 113 | self._epsilon = ((1 - (self._decline_rate**(-self._num_pulls))) / 114 | (self._decline_rate**(-self._num_turns) - 1)) + 1 115 | 116 | def __str__(self): 117 | return f'Exponentially Decreasing Epsilon Greedy Agent (decline_rate={self._decline_rate})' 118 | 119 | 120 | class LinearDecreaseEpsilonGreedyAgent(AdjustableEpsilonGreedyAgent): 121 | 122 | # Sets the next epsilon value 123 | def _adjust_epsilon(self): 124 | progress = float(self._num_pulls) / self._num_turns 125 | self._epsilon = 1 - progress 126 | 127 | def __str__(self): 128 | return f'Linearly Decreasing Epsilon Greedy Agent' 129 | -------------------------------------------------------------------------------- /bandit_problems/bandits.py: -------------------------------------------------------------------------------- 1 | from numpy.random import normal, randn 2 | import numpy as np 3 | 4 | class NArmedBandit(object): 5 | 6 | def __init__(self, n): 7 | self._arms = randn(n) 8 | 9 | def pull_arm(self, arm): 10 | self.validate_arm(arm) 11 | return self._arms[arm] + normal() 12 | 13 | def num_arms(self): 14 | return len(self._arms) 15 | 16 | def validate_arm(self, arm): 17 | if arm < 0 or arm >= self.num_arms(): 18 | raise ValueError("This arm does not exist.") 19 | 20 | def was_optimal_choice(self, arm): 21 | """ 22 | Tells if the choice was optimal. 
23 | 24 | Should be used for analysis purposes only 25 | (in other words, not for actually solving the problem) 26 | """ 27 | self.validate_arm(arm) 28 | return np.argmax(self._arms) == arm 29 | 30 | 31 | class MovingNArmedBandit(NArmedBandit): 32 | 33 | def __init__(self, n, sigma=0.1): 34 | super(MovingNArmedBandit, self).__init__(n) 35 | self._sigma = sigma 36 | 37 | def pull_arm(self, arm): 38 | value = super(MovingNArmedBandit, self).pull_arm(arm) 39 | self._arms += self._sigma * randn(len(self._arms)) 40 | return value 41 | -------------------------------------------------------------------------------- /bandit_problems/exercises/decreasing_epsilon.py: -------------------------------------------------------------------------------- 1 | from bandit_problems.agents import * 2 | from bandit_problems.test_bed import TestBed 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="Exercise 2.2") 6 | parser.add_argument('--arms', 7 | type=int, 8 | help='Number of arms for the bandit', 9 | default=10) 10 | parser.add_argument('--trials', 11 | type=int, 12 | help='Number of trials to average over', 13 | default=2000) 14 | parser.add_argument('--pulls', 15 | type=int, 16 | help='Number of pulls per trial', 17 | default=1000) 18 | args = parser.parse_args() 19 | 20 | # Parameters 21 | num_arms = args.arms 22 | num_trials = args.trials 23 | num_pulls = args.pulls 24 | 25 | agents = [] 26 | agents.append(EpsilonGreedyAgent(0.1, num_arms)) 27 | agents.append(ExponentialDecreaseEpsilonGreedyAgent(num_arms, num_pulls, decline_rate=1.01)) 28 | agents.append(ExponentialDecreaseEpsilonGreedyAgent(num_arms, num_pulls, decline_rate=1.0075)) 29 | agents.append(ExponentialDecreaseEpsilonGreedyAgent(num_arms, num_pulls, decline_rate=1.015)) 30 | 31 | tb = TestBed(agents, num_arms, num_trials=num_trials, num_pulls=num_pulls) 32 | tb.run() 33 | tb.plot_results(title='Decreasing Epsilon Value') 34 | -------------------------------------------------------------------------------- /bandit_problems/exercises/ex_2_2_a.py: -------------------------------------------------------------------------------- 1 | from bandit_problems.agents import * 2 | from bandit_problems.test_bed import TestBed 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="Exercise 2.2") 6 | parser.add_argument('--arms', 7 | type=int, 8 | help='Number of arms for the bandit', 9 | default=10) 10 | parser.add_argument('--trials', 11 | type=int, 12 | help='Number of trials to average over', 13 | default=2000) 14 | parser.add_argument('--pulls', 15 | type=int, 16 | help='Number of pulls per trial', 17 | default=3000) 18 | args = parser.parse_args() 19 | 20 | # Parameters 21 | num_arms = args.arms 22 | num_trials = args.trials 23 | num_pulls = args.pulls 24 | 25 | agents = [] 26 | agents.append(EpsilonGreedyAgent(0, num_arms)) 27 | agents.append(EpsilonGreedyAgent(0.01, num_arms)) 28 | agents.append(EpsilonGreedyAgent(0.1, num_arms)) 29 | 30 | tb = TestBed(agents, num_arms, num_trials=num_trials, num_pulls=num_pulls) 31 | tb.run() 32 | tb.plot_results(title='Exercise 2.2') 33 | -------------------------------------------------------------------------------- /bandit_problems/exercises/ex_2_2_b.py: -------------------------------------------------------------------------------- 1 | from bandit_problems.agents import * 2 | from bandit_problems.test_bed import TestBed 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="Exercise 2.2") 6 | parser.add_argument('--arms', 7 | type=int, 8 | 
help='Number of arms for the bandit', 9 | default=10) 10 | parser.add_argument('--trials', 11 | type=int, 12 | help='Number of trials to average over', 13 | default=2000) 14 | parser.add_argument('--pulls', 15 | type=int, 16 | help='Number of pulls per trial', 17 | default=1000) 18 | args = parser.parse_args() 19 | 20 | # Parameters 21 | num_arms = args.arms 22 | num_trials = args.trials 23 | num_pulls = args.pulls 24 | 25 | agents = [] 26 | agents.append(SoftmaxAgent(0.1, num_arms)) 27 | agents.append(SoftmaxAgent(0.2, num_arms)) 28 | agents.append(SoftmaxAgent(0.3, num_arms)) 29 | agents.append(SoftmaxAgent(0.4, num_arms)) 30 | agents.append(SoftmaxAgent(0.5, num_arms)) 31 | 32 | tb = TestBed(agents, num_arms, num_trials=num_trials, num_pulls=num_pulls) 33 | tb.run() 34 | tb.plot_results(title='Exercise 2.2') 35 | -------------------------------------------------------------------------------- /bandit_problems/exercises/showdown.py: -------------------------------------------------------------------------------- 1 | from bandit_problems.agents import * 2 | from bandit_problems.test_bed import TestBed 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="Bandit Showdown") 6 | parser.add_argument('--arms', 7 | type=int, 8 | help='Number of arms for the bandit', 9 | default=10) 10 | parser.add_argument('--trials', 11 | type=int, 12 | help='Number of trials to average over', 13 | default=2000) 14 | parser.add_argument('--pulls', 15 | type=int, 16 | help='Number of pulls per trial', 17 | default=3000) 18 | args = parser.parse_args() 19 | 20 | # Parameters 21 | num_arms = args.arms 22 | num_trials = args.trials 23 | num_pulls = args.pulls 24 | 25 | agents = [] 26 | agents.append(EpsilonGreedyAgent(0.1, num_arms)) 27 | agents.append(ExponentialDecreaseEpsilonGreedyAgent(num_arms, num_pulls, decline_rate=1.015)) 28 | agents.append(SoftmaxAgent(0.3, num_arms)) 29 | 30 | tb = TestBed(agents, num_arms, num_trials=num_trials, num_pulls=num_pulls) 31 | tb.run() 32 | tb.plot_results(title='Decreasing Epsilon Value') -------------------------------------------------------------------------------- /bandit_problems/results/decreasing_epsilon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/decreasing_epsilon.png -------------------------------------------------------------------------------- /bandit_problems/results/decreasing_epsilon_optimality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/decreasing_epsilon_optimality.png -------------------------------------------------------------------------------- /bandit_problems/results/exercise_2_2_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/exercise_2_2_a.png -------------------------------------------------------------------------------- /bandit_problems/results/exercise_2_2_a_optimality.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/exercise_2_2_a_optimality.png -------------------------------------------------------------------------------- /bandit_problems/results/exercise_2_2_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/exercise_2_2_b.png -------------------------------------------------------------------------------- /bandit_problems/results/movingBandit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/movingBandit.png -------------------------------------------------------------------------------- /bandit_problems/results/showdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/showdown.png -------------------------------------------------------------------------------- /bandit_problems/results/showdown_op.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/showdown_op.png -------------------------------------------------------------------------------- /bandit_problems/results/softmax.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/softmax.png -------------------------------------------------------------------------------- /bandit_problems/results/softmax_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/softmax_2.png -------------------------------------------------------------------------------- /bandit_problems/results/softmax_temps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/softmax_temps.png -------------------------------------------------------------------------------- /bandit_problems/results/softmax_vs_greedy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/softmax_vs_greedy.png -------------------------------------------------------------------------------- /bandit_problems/test_bed.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | from bandit_problems.bandits import NArmedBandit, MovingNArmedBandit 6 | 7 | 8 | class TestBed: 9 | 10 | _plot_colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w'] 11 | 12 | def 
__init__(self, 13 | agents, 14 | num_arms=10, 15 | num_trials=2000, 16 | num_pulls=1000): 17 | self._num_arms = num_arms 18 | self._num_trials = num_trials 19 | self._num_pulls = num_pulls 20 | self._agents = agents 21 | self._results = np.zeros((len(agents), num_pulls)) 22 | self._optimal_choices = np.zeros((len(agents), num_pulls)) 23 | 24 | def _reset_agents(self): 25 | for agent in self._agents: 26 | agent.reset() 27 | 28 | def run(self): 29 | for trial_num in tqdm(range(self._num_trials)): 30 | b = NArmedBandit(self._num_arms) 31 | self._reset_agents() 32 | for pull in range(self._num_pulls): 33 | for i in range(len(self._agents)): 34 | reward, was_optimal = self._agents[i].do_pull(b) 35 | self._results[i, pull] += reward 36 | if was_optimal: 37 | self._optimal_choices[i, pull] += 1 38 | 39 | def run_moving(self): 40 | for trial_num in tqdm(range(self._num_trials)): 41 | b = MovingNArmedBandit(self._num_arms, 0.1) 42 | self._reset_agents() 43 | for pull in range(self._num_pulls): 44 | for i in range(len(self._agents)): 45 | reward, was_optimal = self._agents[i].do_pull(b) 46 | self._results[i, pull] += reward 47 | if was_optimal: 48 | self._optimal_choices[i, pull] += 1 49 | 50 | def plot_results(self, title): 51 | plt.figure(1) 52 | avgs = self._results / self._num_trials 53 | for i in range(len(self._agents)): 54 | plt.plot(avgs[i], self._plot_colors[i%len(self._plot_colors)], label=str(self._agents[i])) 55 | plt.title(title) 56 | plt.xlabel('Pull Number') 57 | plt.ylabel('Average Reward') 58 | plt.legend(loc=4) 59 | 60 | plt.figure(2) 61 | optimal_choices_avgs = self._optimal_choices / self._num_trials 62 | for i in range(len(self._agents)): 63 | plt.plot(optimal_choices_avgs[i], self._plot_colors[i%len(self._plot_colors)], label=str(self._agents[i])) 64 | plt.title(title) 65 | plt.xlabel('Pull Number') 66 | plt.ylabel('Percent Optimal Action Choice') 67 | plt.legend(loc=4) 68 | 69 | plt.show() 70 | -------------------------------------------------------------------------------- /dynamic_programming/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 4 - Dynamic Programming 2 | 3 | In this chapter, we learn about using dynamic programming techniques to solve 4 | finite MDPs. By "solve," in this context, we mean find the optimal way to behave 5 | in the MDP so as to maximize our return. 6 | 7 | ## Policy Evaluation 8 | 9 | The first important idea from this chapter is policy **evaluation**. This simply refers 10 | to the process of determining the value functions for a certain policy. One way 11 | to do this using dynamic programming is by taking an iterative approach. 12 | 13 | We start with a given policy π and an arbitrary state-value function v(s)- we can 14 | choose the state-value function that is 0 for all states. Then, we try to calculate v(s) 15 | for each state in the state space. To do so, we look ahead one action, 16 | and for each action, we look ahead at 17 | the possible next states. For each of these actions a and next states s', we calculate 18 | the return, which is the sum of the expected immediate reward and the discounted sum of 19 | the return of the next state. We sum all these together, with each weighted 20 | by their probability of occurring. Since the return of the next state is not actually 21 | known, this is still only an estimate, but if we apply this procedure iteratively, 22 | we are guaranteed to converge to the true value function. 
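For a finite MDP whose dynamics are known, that iterative sweep can be written compactly. The sketch below is illustrative rather than the code used in this chapter: it assumes a deterministic policy and that the transition probabilities `p[s][a][s2]` and expected rewards `r[s][a][s2]` are given as lookup tables (the car-rental code in this directory does the same thing with problem-specific tables):

```python
import numpy as np

def evaluate_policy(policy, num_states, p, r, gamma=0.9, tol=1e-4):
    """Iterative policy evaluation: sweep the states until the value function stops changing."""
    v = np.zeros(num_states)
    while True:
        delta = 0.0
        for s in range(num_states):
            a = policy[s]
            # one-step lookahead: expected immediate reward plus discounted value of the next state
            new_v = sum(p[s][a][s2] * (r[s][a][s2] + gamma * v[s2])
                        for s2 in range(num_states))
            delta = max(delta, abs(new_v - v[s]))
            v[s] = new_v
        if delta < tol:
            return v
```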
23 | 24 | ## Policy Improvement 25 | 26 | Okay, so with policy evaluation, we have a method to learn the value function 27 | for a given policy in an environment. But our goal is to find the optimal way 28 | to behave in this environment- the optimal policy. 29 | 30 | Once we have the value function, this is actually pretty easy. If we know the 31 | value function for a certain policy, we can look at each state and see if the 32 | policy takes the optimal action from that state- remember that we know at this point 33 | the value of all possible next states, the expected rewards from each action, and the 34 | probability of transitioning from state s to state s' given the action a. If it does not 35 | take the optimal action, then there is clearly an opportunity to **improve** this policy. 36 | We can improve the policy by, from each state, selecting the action that gives us 37 | the most return. Put another way, we should be **greedy** with respect to the policy's 38 | value function. Once we do this, we end up with another policy which is better than 39 | the one we started with. More formally, the state-value function of this policy is greater 40 | than or equal to the state-value function of the previous policy for every state s. 41 | If the state-value function is higher for every state, that intuitively means this policy can 42 | extract more return from this environment in the long run. 43 | 44 | ## Policy Iteration 45 | 46 | The policy iteration algorithm combines these two algorithms in order to find the optimal policy. We start with 47 | an arbitrary policy and value function. Then, we evaluate this policy. Then, we improve that policy. Then, we evaluate 48 | this policy. And so on, until the policy remains the same for two steps in a row. At this point, the policy is greedy 49 | with respect to its own value function. This implies that this policy's value function satisfies the Bellman 50 | optimality equation and thus, this is an optimal policy. 51 | 52 | 53 | ### Exercise: Jack's Car Rental 54 | 55 | Jack's Car Rental problem is described in Sutton and Barto **Example 4.2** and **Exercise 4.5**. 56 | 57 | The basic problem is this: Jack manages two dealerships for his car rental business. Let's call them A and B. 58 | Every day, some customers arrive at each location and request cars. If Jack has a car for them, he can rent it to them 59 | and get $10. If he does not have a car, he loses their business and makes no money. Jack can move cars between dealerships 60 | at night for a cost of $2/car to help make sure he has cars where they are needed, but he can only move a maximum of 5 cars 61 | per night. Every day, some number of people 62 | also return cars to each dealership, and those are available for rental the next day. The number of people who 63 | request and return cars to each dealership are Poisson random variables. 64 | 65 | For dealership A, the request and return probabilities have expected values 3 and 3, respectively. 66 | 67 | For dealership B, the request and return probabilities have expected values 4 and 2, respectively. 68 | 69 | Also, there can be no more than 20 cars at each location- any additional cars get returned to the nationwide company. 70 | 71 | We can use policy iteration to find the optimal policy for this environment. The states in this environment are how many 72 | cars are at each dealership. The actions are how many cars we move from A to B (a negative number means we move cars from 73 | B to A). 
So the actions are integers in the range \[-5, 5\]. The rewards are how much money Jack makes in each time step. 74 | The book says to use a discount factor of 0.9, so that's what we'll do. 75 | 76 | Here are my results for running policy iteration on this problem: 77 | 78 | ![Policy 0](./results/jack_policy_0.png) 79 | 80 | ![Policy 1](./results/jack_policy_1.png) 81 | 82 | ![Policy 2](./results/jack_policy_2.png) 83 | 84 | ![Policy 3](./results/jack_policy_3.png) 85 | 86 | ![Policy 4](./results/jack_policy_4.png) 87 | 88 | ![Policy 5](./results/jack_policy_5.png) 89 | 90 | ![Optimal Policy](./results/jack_optimal_policy.png) 91 | 92 | ![Optimal Value](./results/jack_optimal_value.png) 93 | 94 | As you can see, I started with the policy that moves 0 cars no matter what. At each iteration, 95 | the policy changes slightly until there is no difference between policy 5 and the optimal policy. I'm not sure 96 | why my results differ slightly from those shown in the book (Figure 4.4). 97 | Policy 1 is slightly different when dealership B has 20 cars and my optimal value function looks 98 | to max out at a slightly higher value. This may be due to mistakes on my part or different convergence 99 | criteria. The rest, however, seem to conform exactly to the figures in the book. 100 | 101 | ### Exercise: Jack's Car Rental With Help 102 | 103 | Now, we add a couple of things to this problem. 104 | 105 | One of Jack's employees takes the bus home from near dealership A to near dealership B every night. 106 | She is willing to drive a car from A to B for free. 107 | 108 | Also, Jack's parking lot just shrank. If he has more than 10 cars at a certain dealership, 109 | he will now have to rent an additional lot for a cost of $4 for that location. 110 | 111 | Here are my results for running policy iteration on that problem: 112 | 113 | ![Policy 0](./results/e45_policy_0.png) 114 | 115 | ![Policy 1](./results/e45_policy_1.png) 116 | 117 | ![Policy 2](./results/e45_policy_2.png) 118 | 119 | ![Optimal Policy](./results/e45_optimal_policy.png) 120 | 121 | ![Optimal Value](./results/e45_optimal_value.png) 122 | 123 | While I am not positive that these results are correct, we can see by inspection that 124 | the optimal policy does make sense. For example, it usually makes sense to take advantage of that free car 125 | transport from A to B because B usually gets more requests than A, unless it means that it will make dealership 126 | B have more than 10 cars. We also see where this policy tries to avoid that $4 parking lot 127 | overhead. 128 | 129 | ## Value Iteration 130 | 131 | Value iteration functions in a similar way to policy iteration but takes a shortcut. It essentially cuts short 132 | the policy evaluation step and attempts, at each iteration, to maximize the value function by being greedy with respect 133 | to the previous value function. 134 | 135 | ### Exercise: Gambler's Problem 136 | 137 | A gambler flips a coin. If it lands on heads, he wins. If it lands on tails, he loses. He starts off with 138 | $1 and can bet in dollar increments. His goal is to get to $100. 139 | 140 | So the states are how much money he has, and the actions are how much he bets. The rewards are 0 for everything 141 | except reaching the $100 state, in which case he gets a reward of 1.
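The value-iteration update for this problem is short enough to sketch directly. This is a condensed illustration; `gamblers.py` in this directory contains the full version used for these exercises (including the intermediate sweeps shown below and the conservative tie-breaking when extracting the policy):

```python
import numpy as np

def gamblers_value_iteration(p_heads=0.4, goal=100, tol=1e-4):
    """Value iteration for the gambler's problem (undiscounted; reward 1 only on reaching the goal)."""
    v = np.zeros(goal + 1)  # v[0] and v[goal] stay 0: both are terminal states
    while True:
        delta = 0.0
        for s in range(1, goal):
            stakes = range(1, min(s, goal - s) + 1)
            # expected return of each stake: win with probability p_heads, lose otherwise
            returns = [p_heads * ((1.0 if s + a == goal else 0.0) + v[s + a])
                       + (1 - p_heads) * v[s - a]
                       for a in stakes]
            best = max(returns)
            delta = max(delta, abs(best - v[s]))
            v[s] = best
        if delta < tol:
            return v
```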
142 | 143 | Here are the results of running value iteration on this problem: 144 | 145 | ![Gambler's Value Iteration](./results/gamblers_value_iteration.png) 146 | 147 | We can see how these value functions are tending towards a single function as we iterate further. 148 | 149 | ![Gambler's Optimal Policy](./results/gambler_optimal_policy.png) 150 | 151 | This is one optimal policy for this problem. There are different optimal policies for this problem. This one 152 | was chosen to replicate the result in Sutton and Barto: it is generated by choosing the most conservative/lowest bet 153 | out of all the optimal bets. 154 | 155 | ### Exercise: Gambler's Problem (ph=0.25 and ph=0.55) 156 | 157 | #### ph=0.25 Results 158 | 159 | ![Value 0.25](./results/value_4_9_a.png) 160 | 161 | ![Policy 0.25](./results/policy_4_9_a.png) 162 | 163 | #### ph=0.55 Results 164 | 165 | ![Value 0.55](./results/value_4_9_b.png) 166 | 167 | ![Policy 0.55](./results/policy_4_9_b.png) 168 | 169 | 170 | #### Sources: 171 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: an Introduction. 2nd ed., The MIT Press, 2012. 172 | -------------------------------------------------------------------------------- /dynamic_programming/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/__init__.py -------------------------------------------------------------------------------- /dynamic_programming/car_rentals.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from mpl_toolkits.mplot3d import Axes3D 4 | import math 5 | from tqdm import tqdm 6 | 7 | class JacksCarRental: 8 | 9 | EXPECTED_RETURNS_A = 3 10 | EXPECTED_REQUESTS_A = 3 11 | EXPECTED_RETURNS_B = 2 12 | EXPECTED_REQUESTS_B = 4 13 | MOVING_CAR_COST = 2 14 | RENTAL_SALE_PRICE = 10 15 | 16 | # Don't bother computing poisson for anything above this 17 | # It will be very close to 0 18 | POISSON_CUTOFF = 14 19 | 20 | def __init__(self, max_cars=21): 21 | """ 22 | :param max_cars: Non-inclusive upper-bound for how many cars can be at a dealership 23 | """ 24 | self.max_cars = max_cars 25 | self.action_space = np.arange(-5, 6) 26 | self.a_transitions = self.init_transition_probabilities('A') 27 | self.b_transitions = self.init_transition_probabilities('B') 28 | self.a_expected_revenue = self.init_expected_revenue('A') 29 | self.b_expected_revenue = self.init_expected_revenue('B') 30 | 31 | def init_expected_revenue(self, dealership): 32 | """ 33 | Returns a self.max_cars x self.max_cars x len(self.action_space) array. 34 | Each cell holds the expected revenue for the specified dealership with 35 | the specified previous state, next state, and action. 
36 | :param dealership: 'A' or 'B' 37 | """ 38 | revenue = np.zeros((self.action_space.shape[0], self.max_cars, self.max_cars)) 39 | for cars in range(self.max_cars): 40 | for cars_after in range(self.max_cars): 41 | for action in self.action_space: 42 | if (dealership is 'A' and cars - action < 0) or (dealership is 'B' and cars + action < 0): 43 | continue 44 | revenue[action, cars, cars_after] = self.get_expected_revenue(dealership, action, cars, cars_after) 45 | return revenue 46 | 47 | def get_expected_revenue(self, dealership, action, now, after): 48 | if dealership is 'A': 49 | after_move = now - action 50 | elif dealership is 'B': 51 | after_move = now + action 52 | else: 53 | raise ValueError('Dealership must be A or B') 54 | 55 | expected_revenue = 0.0 56 | for requests in range(self.POISSON_CUTOFF): 57 | probability = self.expected_requests_probability(dealership, requests) 58 | expected_revenue += probability * self.RENTAL_SALE_PRICE * min(after_move, requests) 59 | 60 | return expected_revenue 61 | 62 | def init_transition_probabilities(self, dealership): 63 | ret = np.zeros((self.max_cars, self.max_cars)) 64 | for current in range(ret.shape[0]): 65 | for next in range(ret.shape[1]): 66 | probability = 0.0 67 | for requests in range(self.POISSON_CUTOFF): 68 | for returns in range(self.POISSON_CUTOFF): 69 | cars_after_requests = max(current - requests, 0) 70 | cars_after_returns = min(cars_after_requests + returns, self.max_cars - 1) 71 | if cars_after_returns == next: 72 | request_probability = self.expected_requests_probability(dealership, requests) 73 | return_probability = self.expected_returns_probability(dealership, returns) 74 | probability += request_probability * return_probability 75 | ret[current, next] = probability 76 | return ret 77 | 78 | def expected_returns_probability(self, dealership, returns): 79 | if dealership is 'A': 80 | return self.poisson(self.EXPECTED_RETURNS_A, returns) 81 | elif dealership is 'B': 82 | return self.poisson(self.EXPECTED_RETURNS_B, returns) 83 | else: 84 | raise ValueError('Dealership must be A or B') 85 | 86 | def expected_requests_probability(self, dealership, requests): 87 | if dealership is 'A': 88 | return self.poisson(self.EXPECTED_REQUESTS_A, requests) 89 | elif dealership is 'B': 90 | return self.poisson(self.EXPECTED_REQUESTS_B, requests) 91 | else: 92 | raise ValueError('Dealership must be A or B') 93 | 94 | def poisson(self, expected, num): 95 | ret = ((expected**num)/math.factorial(num))*math.exp(-expected) 96 | return ret 97 | 98 | def get_action_cost(self, current, action): 99 | return abs(action) * self.MOVING_CAR_COST 100 | 101 | def get_expected_reward(self, action, current, next): 102 | cost = self.get_action_cost(current, action) 103 | 104 | expected_sales_a = self.a_expected_revenue[action, current[0], next[0]] 105 | expected_sales_b = self.b_expected_revenue[action, current[1], next[1]] 106 | 107 | return expected_sales_a + expected_sales_b - cost 108 | 109 | def next_state_probability(self, current, next, action): 110 | immediate_a = current[0] - action 111 | immediate_b = current[1] + action 112 | if immediate_a < 0 or immediate_a > (self.max_cars - 1): 113 | return 0.0 114 | elif immediate_b < 0 or immediate_b > (self.max_cars - 1): 115 | return 0.0 116 | probability_a = self.a_transitions[immediate_a, next[0]] 117 | probability_b = self.b_transitions[immediate_b, next[1]] 118 | return probability_a * probability_b 119 | 120 | def expected_return(self, state, action, state_value, gamma): 121 | (a, b) = state 122 
| next_state_gain_expectation = 0.0 123 | for a_prime in range(self.max_cars): 124 | for b_prime in range(self.max_cars): 125 | probability_next_state = self.next_state_probability((a, b), (a_prime, b_prime), action) 126 | immediate_reward = self.get_expected_reward(action, (a, b), (a_prime, b_prime)) 127 | next_state_gain_expectation += probability_next_state * (immediate_reward + gamma * state_value[a_prime, b_prime]) 128 | return next_state_gain_expectation 129 | 130 | def evaluate_policy(self, policy, gamma=0.9, convergence=1.0): 131 | """ 132 | Generates a value function for a given deterministic policy. 133 | The policy should specify the action [-5, +5] for each 134 | state, which is the number of cars at location A and the number 135 | of cars at location B, where each ranges from 0 to 20. 136 | 137 | :param policy: A self.max_cars x self.max_cars array 138 | :return: A self.max_cars x self.max_cars array 139 | """ 140 | ret = np.zeros((self.max_cars, self.max_cars)) 141 | diff = np.inf 142 | print(f'Evaluating policy until diff < {convergence}') 143 | while diff > convergence: 144 | temp = np.copy(ret) 145 | for a in range(policy.shape[0]): 146 | for b in range(policy.shape[1]): 147 | ret[a, b] = self.expected_return((a, b), policy[a, b], temp, gamma) 148 | diff = np.max(np.fabs(np.subtract(ret, temp))) 149 | print(f'Diff: {diff}') 150 | return ret 151 | 152 | def get_greedy_policy(self, value, gamma=0.9): 153 | """ 154 | Generates a policy that is greedy with respect to the provided value function. 155 | 156 | :param value: A self.max_cars x self.max_cars array 157 | :return: A self.max_cars x self.max_cars array 158 | """ 159 | policy = np.zeros((self.max_cars, self.max_cars)) 160 | print('Improving Policy...') 161 | for a in tqdm(range(policy.shape[0])): 162 | for b in range(policy.shape[1]): 163 | best_action = [None, -np.inf] 164 | for action in np.arange(-5, 6): 165 | if a - action < 0 or b + action < 0: 166 | # This action is not allowed if it makes one dealership have less than 0 cars 167 | continue 168 | next_state_gain_expectation = self.expected_return((a, b), action, value, gamma) 169 | if next_state_gain_expectation > best_action[1]: 170 | best_action[0] = action 171 | best_action[1] = next_state_gain_expectation 172 | policy[a, b] = best_action[0] 173 | return policy.astype(int) 174 | 175 | def run_policy_improvement(self, gamma=0.9, convergence=5.0): 176 | initial_policy = np.zeros((self.max_cars, self.max_cars), dtype=int) 177 | policies = [initial_policy] 178 | value = None 179 | while len(policies) < 2 or not np.array_equal(policies[-1], policies[-2]): 180 | value = self.evaluate_policy(policies[-1], gamma, convergence) 181 | greedy = self.get_greedy_policy(value) 182 | policies.append(greedy) 183 | return policies, value 184 | 185 | def plot_results(self, policies, value_function): 186 | self.plot_value_function(value_function, figure=1) 187 | self.plot_policies(policies, starting_fig=2) 188 | plt.show() 189 | 190 | def plot_value_function(self, value_function, figure=1): 191 | fig = plt.figure(figure) 192 | ax = fig.add_subplot(111, projection='3d') 193 | x = np.arange(0, self.max_cars) 194 | y = np.arange(0, self.max_cars) 195 | X, Y = np.meshgrid(x, y) 196 | ax.plot_wireframe(X, Y, value_function) 197 | fig.suptitle('Optimal Value Function') 198 | plt.xlabel('# of Cars at Dealership B') 199 | plt.ylabel('# of Cars at Dealership A') 200 | 201 | def plot_policies(self, policies, starting_fig=1): 202 | figure = starting_fig 203 | for i in range(len(policies)): 
204 | fig = plt.figure(figure) 205 | figure += 1 206 | policy = policies[i] 207 | plt.imshow(policy, cmap='jet') 208 | plt.ylabel('# of Cars at Dealership A') 209 | plt.xlabel('# of Cars at Dealership B') 210 | plt.xticks(np.arange(0, policy.shape[0], 1)) 211 | plt.yticks(np.arange(0, policy.shape[1], 1)) 212 | plt.gca().invert_yaxis() 213 | if i == (len(policies) - 1): 214 | fig.suptitle('Optimal Policy') 215 | else: 216 | fig.suptitle(f'Policy {i}') 217 | 218 | # Annotate states 219 | for i in range(policy.shape[0]): 220 | for j in range(policy.shape[1]): 221 | plt.text(j, i, '%d' % policy[i,j], horizontalalignment='center', verticalalignment='center') 222 | 223 | plt.colorbar() 224 | 225 | 226 | class JacksCarRentalWithHelp(JacksCarRental): 227 | 228 | SECOND_PARKING_LOT_COST = 4 229 | 230 | def get_action_cost(self, current, action): 231 | if action > 0: 232 | moving_cost = self.MOVING_CAR_COST * (action - 1) 233 | else: 234 | moving_cost = self.MOVING_CAR_COST * abs(action) 235 | 236 | overnight_cars_a = current[0] - action 237 | overnight_cars_b = current[1] + action 238 | 239 | parking_cost = 0 240 | if overnight_cars_a > 10: 241 | parking_cost += self.SECOND_PARKING_LOT_COST 242 | if overnight_cars_b > 10: 243 | parking_cost += self.SECOND_PARKING_LOT_COST 244 | 245 | return moving_cost + parking_cost 246 | -------------------------------------------------------------------------------- /dynamic_programming/exercises/car_rental_exercise.py: -------------------------------------------------------------------------------- 1 | from dynamic_programming.car_rentals import JacksCarRental 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description="Car Rental Exercise") 5 | 6 | parser.add_argument('--convergence', 7 | type=float, 8 | help='Convergence criteria for policy evaluation', 9 | default=1.0) 10 | args = parser.parse_args() 11 | 12 | jcr = JacksCarRental() 13 | policies, optimal_value = jcr.run_policy_improvement(gamma=0.9, convergence=args.convergence) 14 | jcr.plot_results(policies, optimal_value) 15 | -------------------------------------------------------------------------------- /dynamic_programming/exercises/ex_4_5.py: -------------------------------------------------------------------------------- 1 | from dynamic_programming.car_rentals import JacksCarRentalWithHelp 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description="Exercise 4.5") 5 | 6 | parser.add_argument('--convergence', 7 | type=float, 8 | help='Convergence criteria for policy evaluation', 9 | default=1.0) 10 | args = parser.parse_args() 11 | 12 | jcr = JacksCarRentalWithHelp() 13 | policies, optimal_value = jcr.run_policy_improvement(gamma=0.9, convergence=args.convergence) 14 | jcr.plot_results(policies, optimal_value) 15 | -------------------------------------------------------------------------------- /dynamic_programming/exercises/ex_4_9.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from dynamic_programming.gamblers import GamblersProblem 3 | 4 | # Win probability 0.25 5 | gambler = GamblersProblem(win_probability=0.25) 6 | value_funcs = gambler.value_iteration() 7 | policy = gambler.get_greedy_policy(value_funcs[-1]) 8 | next_figure = gambler.plot_results(value_funcs[0:5], policy) 9 | 10 | # Win probability 0.55 11 | gambler = GamblersProblem(win_probability=0.55) 12 | value_funcs = gambler.value_iteration() 13 | policy = gambler.get_greedy_policy(value_funcs[-1]) 14 | 
gambler.plot_results(value_funcs[0:5], policy, figure=next_figure) 15 | 16 | plt.show() 17 | -------------------------------------------------------------------------------- /dynamic_programming/gamblers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | 6 | class GamblersProblem(): 7 | 8 | _plot_colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w'] 9 | 10 | def __init__(self, win_probability=0.4): 11 | self._win_probability = win_probability 12 | 13 | def get_possible_next_states(self, state, action): 14 | ret = [] 15 | # Either we win 16 | ret.append(state + action) 17 | # or we lose 18 | ret.append(state - action) 19 | return list(set(ret)) 20 | 21 | def probability_next_state(self, state, action, next_state): 22 | # Special "sink" states 23 | if state == 0: 24 | if next_state == 0: 25 | return 1.0 26 | return 0.0 27 | if state == 100: 28 | if next_state == 100: 29 | return 1.0 30 | return 0.0 31 | 32 | # Loss 33 | if next_state == (state - action): 34 | return 1 - self._win_probability 35 | # Win 36 | elif next_state == (state + action): 37 | return self._win_probability 38 | else: 39 | # Should never actually make it here 40 | return 0.0 41 | 42 | def reward(self, state, action, next_state): 43 | if next_state == 100: 44 | return 1.0 45 | else: 46 | return 0.0 47 | 48 | def value_iteration(self, convergence=0.0001): 49 | diff = np.inf 50 | value = np.zeros(101) 51 | temp = np.copy(value) 52 | ret = [] 53 | while diff > convergence: 54 | for state in range(1, value.shape[0] - 1): 55 | action_space = np.arange(0, min(state, 100 - state) + 1) 56 | best_value = None 57 | for action in action_space: 58 | possible_next_states = self.get_possible_next_states(state, action) 59 | gain = 0.0 60 | for next_state in possible_next_states: 61 | gain += self.probability_next_state(state, action, next_state) * ( 62 | self.reward(state, action, next_state) + temp[next_state] 63 | ) 64 | if best_value is None or gain > best_value: 65 | best_value = gain 66 | value[state] = best_value 67 | diff = np.max(np.fabs(np.subtract(temp, value))) 68 | temp = np.copy(value) 69 | ret.append(temp) 70 | return ret 71 | 72 | def get_greedy_policy(self, value): 73 | policy = np.zeros(101) 74 | for state in np.arange(1, 100): 75 | action_space = np.arange(0, min(state, 100 - state) + 1) 76 | best_action = [None, -np.inf] 77 | for action in action_space: 78 | possible_next_states = self.get_possible_next_states(state, action) 79 | gain = 0.0 80 | for next_state in possible_next_states: 81 | gain += self.probability_next_state(state, action, next_state) * ( 82 | self.reward(state, action, next_state) + value[next_state] 83 | ) 84 | if best_action[0] is None: 85 | best_action[0] = action 86 | best_action[1] = gain 87 | elif math.isclose(gain, best_action[1]): 88 | # Tie breaking strategy 89 | # Choose more conservative action 90 | if action < best_action[0]: 91 | best_action[0] = action 92 | elif gain > best_action[1]: 93 | best_action[0] = action 94 | best_action[1] = gain 95 | policy[state] = best_action[0] 96 | return policy 97 | 98 | def plot_value_functions(self, value_functions): 99 | for i in range(len(value_functions)): 100 | plt.plot(value_functions[i][0:-1], self._plot_colors[i%len(self._plot_colors)], label=f'Value Function {i}') 101 | plt.title(f"Gambler's Problem Value Iteration (Win Probability = {self._win_probability})") 102 | plt.xlabel('Capital') 103 | plt.ylabel('Value') 104 | 
plt.legend(loc=4) 105 | 106 | def plot_policy(self, policy): 107 | plt.plot(np.arange(0, 101), policy) 108 | plt.title(f'Optimal Policy for Gambler (Win Probability = {self._win_probability})') 109 | plt.xlabel('Capital') 110 | plt.ylabel('Stake') 111 | 112 | def plot_results(self, value_functions, policy, figure=1): 113 | plt.figure(figure) 114 | self.plot_value_functions(value_functions) 115 | plt.figure(figure + 1) 116 | self.plot_policy(policy) 117 | return figure + 2 118 | 119 | if __name__ == '__main__': 120 | gmb = GamblersProblem() 121 | values = gmb.value_iteration(convergence=0.001) 122 | policy = gmb.get_greedy_policy(values[-1]) 123 | gmb.plot_value_functions(values) 124 | gmb.plot_policy(policy) 125 | -------------------------------------------------------------------------------- /dynamic_programming/results/e45_optimal_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_optimal_policy.png -------------------------------------------------------------------------------- /dynamic_programming/results/e45_optimal_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_optimal_value.png -------------------------------------------------------------------------------- /dynamic_programming/results/e45_policy_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_policy_0.png -------------------------------------------------------------------------------- /dynamic_programming/results/e45_policy_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_policy_1.png -------------------------------------------------------------------------------- /dynamic_programming/results/e45_policy_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_policy_2.png -------------------------------------------------------------------------------- /dynamic_programming/results/gambler_optimal_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/gambler_optimal_policy.png -------------------------------------------------------------------------------- /dynamic_programming/results/gamblers_value_iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/gamblers_value_iteration.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_optimal_policy.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_optimal_policy.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_optimal_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_optimal_value.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_policy_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_0.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_policy_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_1.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_policy_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_2.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_policy_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_3.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_policy_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_4.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_policy_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_5.png -------------------------------------------------------------------------------- /dynamic_programming/results/policy_4_9_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/policy_4_9_a.png -------------------------------------------------------------------------------- /dynamic_programming/results/policy_4_9_b.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/policy_4_9_b.png -------------------------------------------------------------------------------- /dynamic_programming/results/value_4_9_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/value_4_9_a.png -------------------------------------------------------------------------------- /dynamic_programming/results/value_4_9_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/value_4_9_b.png -------------------------------------------------------------------------------- /environments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/__init__.py -------------------------------------------------------------------------------- /environments/blackjack/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/blackjack/__init__.py -------------------------------------------------------------------------------- /environments/blackjack/blackjack.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from mpl_toolkits.mplot3d import Axes3D 5 | 6 | 7 | class BlackjackPlotter: 8 | 9 | @staticmethod 10 | def plot_value_functions(value): 11 | reshaped_value = np.reshape(value, BlackjackStates.state_space_shape()) 12 | BlackjackPlotter.plot_value_function( 13 | reshaped_value[:, :, 0], 14 | title='Value Function (Usable ace)', 15 | figure=1 16 | ) 17 | BlackjackPlotter.plot_value_function( 18 | reshaped_value[:, :, 1], 19 | title='Value Function (No usable ace)', 20 | figure=2) 21 | plt.show() 22 | 23 | @staticmethod 24 | def plot_value_function(value_function, title='Value Function', figure=1): 25 | fig = plt.figure(figure) 26 | ax = fig.add_subplot(111, projection='3d') 27 | x = np.arange(12, 22) 28 | y = np.arange(1, 11) 29 | X, Y = np.meshgrid(x, y) 30 | ax.plot_wireframe(X, Y, value_function) 31 | fig.suptitle(title) 32 | plt.xlabel('Player sum') 33 | plt.ylabel('Dealer showing') 34 | 35 | @staticmethod 36 | def plot_policies(policies): 37 | reshaped_policy = policies.reshape(BlackjackStates.state_space_shape()) 38 | ace_policy = reshaped_policy[:, :, 0] 39 | BlackjackPlotter.plot_policy(ace_policy, title='Ace policy', figure=1) 40 | no_ace_policy = reshaped_policy[:, :, 1] 41 | BlackjackPlotter.plot_policy(no_ace_policy, title='No ace policy', figure=2) 42 | plt.show() 43 | 44 | @staticmethod 45 | def plot_policy(policy, title='Blackjack Policy', figure=1): 46 | policy = np.transpose(policy) 47 | fig = plt.figure(figure) 48 | ax = fig.subplots() 49 | fig.suptitle(title) 50 | plt.imshow(policy, cmap='jet') 51 | plt.gca().invert_yaxis() 52 | 53 | plt.xlabel('Dealer showing') 54 | plt.xticks(np.arange(0, 
len(BlackjackStates.DEALER_CARDS), 1)) 55 | ax.set_xticklabels(BlackjackStates.DEALER_CARDS) 56 | 57 | plt.ylabel('Agent sum') 58 | plt.yticks(np.arange(0, len(BlackjackStates.AGENT_SUMS), 1)) 59 | ax.set_yticklabels(BlackjackStates.AGENT_SUMS) 60 | 61 | for i in range(policy.shape[0]): 62 | for j in range(policy.shape[1]): 63 | if policy[i, j] == Blackjack.HIT_ACTION: 64 | label = 'HIT' 65 | else: 66 | label = 'STAY' 67 | plt.text(j, i, f'{label}', horizontalalignment='center', verticalalignment='center') 68 | 69 | 70 | class BlackjackStates: 71 | 72 | DEALER_CARDS = ['A', 2, 3, 4, 5, 6, 7, 8, 9, 10] 73 | AGENT_SUMS = [12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 74 | USABLE_ACE = [True, False] 75 | STATES = [] 76 | for dealer_card in DEALER_CARDS: 77 | for agent_sum in AGENT_SUMS: 78 | for _usable_ace in USABLE_ACE: 79 | STATES.append((dealer_card, agent_sum, _usable_ace)) 80 | 81 | @staticmethod 82 | def state_space_shape(): 83 | return (len(BlackjackStates.DEALER_CARDS), 84 | len(BlackjackStates.AGENT_SUMS), 85 | len(BlackjackStates.USABLE_ACE)) 86 | 87 | @staticmethod 88 | def num_states(): 89 | return (len(BlackjackStates.DEALER_CARDS) * 90 | len(BlackjackStates.AGENT_SUMS) * 91 | len(BlackjackStates.USABLE_ACE)) 92 | 93 | @staticmethod 94 | def id_to_state(id): 95 | return BlackjackStates.STATES[id] 96 | 97 | @staticmethod 98 | def state_to_id(state): 99 | dealer_card_index = BlackjackStates.DEALER_CARDS.index(state[0]) 100 | agent_sum_index = BlackjackStates.AGENT_SUMS.index(state[1]) 101 | usable_ace_index = BlackjackStates.USABLE_ACE.index(state[2]) 102 | return ( 103 | dealer_card_index * len(BlackjackStates.AGENT_SUMS) * len(BlackjackStates.USABLE_ACE) + 104 | agent_sum_index * len(BlackjackStates.USABLE_ACE) + 105 | usable_ace_index 106 | ) 107 | 108 | @staticmethod 109 | def print_state(state): 110 | if type(state) is int: 111 | state = BlackjackStates.id_to_state(state) 112 | dealer_card = state[0] 113 | agent_sum = state[1] 114 | usable_ace = state[2] 115 | print(f'Dealer: {dealer_card}, Agent sum: {agent_sum}, Ace: {usable_ace}') 116 | 117 | 118 | class Blackjack: 119 | 120 | GAME_OVER_STATE = -1 121 | HIT_ACTION = 0 122 | STAY_ACTION = 1 123 | HIT_CARDS = ['A', 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] 124 | 125 | def __init__(self, verbose=True): 126 | self._states = [] 127 | self._verbose = verbose 128 | 129 | def _blackjack_sum(self, hand): 130 | """ 131 | Sums a list of cards with blackjack rules. 132 | In other words, if a hand contains an ace, it counts it as 133 | a 1 or 11 depending on what is appropriate. 134 | 135 | If a hand has more than 1 ace, at most 1 can count as 11. 136 | """ 137 | running_total = 0 138 | num_aces = 0 139 | for card in hand: 140 | if card == 'A': 141 | num_aces += 1 142 | else: 143 | running_total += card 144 | 145 | # Count all aces as 1s by default 146 | running_total += num_aces 147 | 148 | if num_aces > 0 and running_total + 10 <= 21: 149 | # Count 1 ace as 11 150 | running_total += 10 151 | 152 | return running_total 153 | 154 | def _draw_card(self): 155 | return self.HIT_CARDS[randint(0, len(self.HIT_CARDS) - 1)] 156 | 157 | def _player_draw_card(self): 158 | """ 159 | Returns a card value in the range [1, 10] because a player can't draw 160 | another usable ace. 
161 | """ 162 | card = self._draw_card() 163 | if card == 'A': 164 | return 1 165 | else: 166 | return card 167 | 168 | def debug_print(self, message): 169 | if self._verbose: 170 | print(message) 171 | 172 | def num_states(self): 173 | return BlackjackStates.num_states() 174 | 175 | def num_actions(self): 176 | return 2 177 | 178 | def get_starting_state(self): 179 | return self.get_random_state() 180 | 181 | def get_random_state(self): 182 | return randint(0, self.num_states() - 1) 183 | 184 | def perform_action(self, state_id, action): 185 | state = BlackjackStates.id_to_state(state_id) 186 | dealer_card = state[0] 187 | player_sum = state[1] 188 | usable_ace = state[2] 189 | if action == self.HIT_ACTION: 190 | self.debug_print(f'You hit!') 191 | card = self._player_draw_card() 192 | self.debug_print(f'You drew {card}') 193 | player_sum += card 194 | if player_sum > 21: 195 | if usable_ace: 196 | # Ace becomes 1 197 | player_sum -= 10 198 | next_state = (dealer_card, player_sum, False) 199 | return (0, BlackjackStates.state_to_id(next_state), False) 200 | else: 201 | # Lose 202 | self.debug_print(f'You busted with {player_sum}.') 203 | return (-1, self.GAME_OVER_STATE, True) 204 | else: 205 | # Still <= 21 206 | next_state = (dealer_card, player_sum, usable_ace) 207 | return (0, BlackjackStates.state_to_id(next_state), False) 208 | elif action == self.STAY_ACTION: 209 | self.debug_print(f'You stayed!') 210 | # Dealer's turn 211 | dealer_cards = [dealer_card] 212 | dealer_sum = self._blackjack_sum(dealer_cards) 213 | 214 | blackjack = False 215 | if player_sum == 21 and usable_ace: 216 | self.debug_print(f'You have a blackjack!') 217 | blackjack = True 218 | 219 | # Dealer must hit until he has over 17 220 | while dealer_sum < 17: 221 | card = self._draw_card() 222 | self.debug_print(f'Dealer had {dealer_sum}, and drew {card}') 223 | dealer_cards.append(card) 224 | dealer_sum = self._blackjack_sum(dealer_cards) 225 | if dealer_sum != 21 and blackjack: 226 | # If dealer doesn't have 21 after first draw, 227 | # player immediately wins. 228 | self.debug_print(f'You win!') 229 | return (1, self.GAME_OVER_STATE, True) 230 | 231 | if dealer_sum > 21: 232 | # Dealer busted 233 | self.debug_print(f'Dealer busted.') 234 | return (1, self.GAME_OVER_STATE, True) 235 | else: 236 | if dealer_sum > player_sum: 237 | # Lose 238 | self.debug_print(f'Dealer won with {dealer_sum}.') 239 | return (-1, self.GAME_OVER_STATE, True) 240 | elif dealer_sum == player_sum: 241 | self.debug_print(f'Draw. Dealer and player both have {player_sum}.') 242 | return (0, self.GAME_OVER_STATE, True) 243 | else: 244 | # Win 245 | self.debug_print(f'You won! Dealer: {dealer_sum}. 
You: {player_sum}.') 246 | return (1, self.GAME_OVER_STATE, True) 247 | else: 248 | raise ValueError('This is not a valid action.') 249 | 250 | def is_terminal(self, state): 251 | return state == self.GAME_OVER_STATE 252 | -------------------------------------------------------------------------------- /environments/blackjack/blackjack_policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from environments.blackjack.blackjack import Blackjack, BlackjackStates 4 | 5 | 6 | class BlackjackPolicy: 7 | 8 | def _get_action_by_state(self, state): 9 | raise NotImplementedError('This must be implemented.') 10 | 11 | def get_action(self, state_id): 12 | blackjack_state = BlackjackStates.id_to_state(state_id) 13 | return self._get_action_by_state(blackjack_state) 14 | 15 | @staticmethod 16 | def generate_policy(stay_on=[]): 17 | policy = np.zeros(BlackjackStates.num_states()) 18 | for state_id in range(policy.shape[0]): 19 | state = BlackjackStates.id_to_state(state_id) 20 | dealer_card = state[0] 21 | agent_sum = state[1] 22 | ace = state[2] 23 | if agent_sum in stay_on: 24 | policy[state_id] = Blackjack.STAY_ACTION 25 | else: 26 | policy[state_id] = Blackjack.HIT_ACTION 27 | return policy 28 | -------------------------------------------------------------------------------- /environments/blackjack/interactive_blackjack.py: -------------------------------------------------------------------------------- 1 | from environments.blackjack.blackjack import * 2 | 3 | blackjack = Blackjack() 4 | state = blackjack.get_random_state() 5 | 6 | while not blackjack.is_terminal(state): 7 | (dealer_card, player_sum, usable_ace) = BlackjackStates.id_to_state(state) 8 | 9 | if usable_ace: 10 | ace_string = 'with ace' 11 | else: 12 | ace_string = 'no ace' 13 | print(f'--- Dealer showing: {dealer_card} --- You: {player_sum} ({ace_string}) ---') 14 | 15 | action = None 16 | while action is None: 17 | action = input('Hit (0) or stay (1)?: ') 18 | if action in ['0', '1']: 19 | action = int(action) 20 | else: 21 | action = None 22 | print('Invalid action') 23 | 24 | print() 25 | reward, state, done = blackjack.perform_action(state, action)  # perform_action returns (reward, next_state, done) 26 | -------------------------------------------------------------------------------- /environments/racing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/__init__.py -------------------------------------------------------------------------------- /environments/racing/interactive_racetrack.py: -------------------------------------------------------------------------------- 1 | from environments.racing.racing import RaceTrackGame 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description='Interactive Race Track Game') 5 | 6 | parser.add_argument('racetrack', 7 | type=str, 8 | help='Path to racetrack csv file') 9 | 10 | args = parser.parse_args() 11 | 12 | RaceTrackGame.run(args.racetrack) 13 | -------------------------------------------------------------------------------- /environments/racing/racetracks/racetrack_a.csv: -------------------------------------------------------------------------------- 1 | 0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,2 2 | 0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2 3 | 0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2 4 | 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2 5 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2 6 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2 7 |
1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0 8 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 9 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 10 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 11 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 12 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 13 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 14 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 15 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 16 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 17 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 18 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 19 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 20 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 21 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 22 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 23 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 24 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 25 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 26 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 27 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 28 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 29 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 30 | 0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0 31 | 0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0 32 | 0,0,0,3,3,3,3,3,3,0,0,0,0,0,0,0 33 | -------------------------------------------------------------------------------- /environments/racing/racing.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import random 4 | import sys 5 | import time 6 | 7 | import numpy as np 8 | import pygame 9 | 10 | 11 | class RacerBot: 12 | 13 | def __init__(self, policy): 14 | self.policy = policy 15 | 16 | def get_action(self, state_id): 17 | choices = np.arange(0, self.policy.shape[1]) 18 | probabilities = self.policy[state_id] 19 | probabilities /= probabilities.sum() 20 | return np.random.choice(choices, p=probabilities) 21 | 22 | 23 | class RaceTrack: 24 | 25 | OOB = 0 26 | TRACK = 1 27 | FINISH = 2 28 | START = 3 29 | CAR = 4 30 | 31 | MAX_SPEED = 5 32 | 33 | def __init__(self, csv_path): 34 | self.track = [] 35 | self.start_locations = [] 36 | self.finish_locations = [] 37 | with open(csv_path, 'r') as csvfile: 38 | track_layout = csv.reader(csvfile, delimiter=',') 39 | row_num = 0 40 | for row in track_layout: 41 | new_row = [] 42 | col_num = 0 43 | for cell in row: 44 | new_cell = int(cell) 45 | if new_cell == RaceTrack.START: 46 | self.start_locations.append([col_num, row_num]) 47 | if new_cell == RaceTrack.FINISH: 48 | self.finish_locations.append([col_num, row_num]) 49 | new_row.append(new_cell) 50 | col_num += 1 51 | self.track.append(new_row) 52 | row_num += 1 53 | 54 | self.states = [] 55 | for col in range(len(self.track[0])): 56 | for row in range(len(self.track)): 57 | for horizontal_speed in np.arange(0, self.MAX_SPEED): 58 | for vertical_speed in np.arange(0, self.MAX_SPEED): 59 | self.states.append((col, row, horizontal_speed, vertical_speed)) 60 | 61 | self.actions = [] 62 | for horizontal_accel in np.arange(-1, 2): 63 | for vertical_accel in np.arange(-1, 2): 64 | self.actions.append((horizontal_accel, vertical_accel)) 65 | 66 | def num_states(self): 67 | return len(self.states) 68 | 69 | def num_actions(self): 70 | return len(self.actions) 71 | 72 | def action_to_id(self, action): 73 | return ( 74 | (action[0] + 1) * 3 + 75 | (action[1] + 1) 76 | ) 77 | 78 | def id_to_action(self, id): 79 | return self.actions[id] 80 | 81 | def state_to_id(self, state): 82 | col = state[0] 83 | row = state[1] 84 | horizontal_speed = state[2] 85 | vertical_speed = state[3] 86 | return ( 87 | col * len(self.track) * self.MAX_SPEED * self.MAX_SPEED + 88 | row * self.MAX_SPEED * self.MAX_SPEED + 89 | horizontal_speed * self.MAX_SPEED + 90 | vertical_speed 91 | ) 92 | 93 | def id_to_state(self, id): 94 | return self.states[id] 95 | 96 | 
def perform_action(self, state_id, action_id): 97 | """ 98 | Returns reward, next state, and if we finished. 99 | """ 100 | state = self.id_to_state(state_id) 101 | current_location = [state[0], state[1]] 102 | current_speed = [state[2], state[3]] 103 | action = self.id_to_action(action_id) 104 | 105 | current_speed[0] = max(min(current_speed[0] + action[0], self.MAX_SPEED - 1), 0) 106 | current_speed[1] = max(min(current_speed[1] + action[1], self.MAX_SPEED - 1), 0) 107 | if current_speed[0] == 0 and current_speed[1] == 0: 108 | current_speed[1] = 1 109 | if self.crosses_finish_line(current_location, current_speed): 110 | next_state = self.starting_line_state() 111 | return (0, self.state_to_id(next_state), True) 112 | else: 113 | next_location = self.get_next_location(current_location, current_speed) 114 | if self.out_of_bounds(next_location): 115 | next_state = self.starting_line_state() 116 | return (-5, self.state_to_id(next_state), False) 117 | next_state = (next_location[0], next_location[1], current_speed[0], current_speed[1]) 118 | return (-1, self.state_to_id(next_state), False) 119 | 120 | def crosses_finish_line(self, position, speed): 121 | horizontal = speed[0] 122 | vertical = speed[1] 123 | intermediate_location = [0, 0] 124 | intermediate_location[0] = position[0] 125 | intermediate_location[1] = position[1] 126 | while (horizontal + vertical > 0): 127 | if horizontal >= vertical: 128 | intermediate_location[0] += 1 129 | horizontal -= 1 130 | else: 131 | intermediate_location[1] -=1 132 | vertical -= 1 133 | for finish_location in self.finish_locations: 134 | if intermediate_location[0] == finish_location[0] and intermediate_location[1] == finish_location[1]: 135 | return True 136 | return False 137 | 138 | def out_of_bounds(self, location): 139 | return (location[0] < 0 or location[0] >= self.dimensions[0] or 140 | location[1] < 0 or location[1] >= self.dimensions[1] or 141 | self.track[location[1]][location[0]] == self.OOB) 142 | 143 | def get_next_location(self, location, speed): 144 | next_loc = [location[0] + speed[0], location[1] - speed[1]] 145 | return next_loc 146 | 147 | def get_starting_state(self): 148 | return self.state_to_id(self.starting_line_state()) 149 | 150 | def starting_line_state(self): 151 | random_start = self.start_locations[random.randint(0, len(self.start_locations) - 1)] 152 | ret = (random_start[0], random_start[1], 0, 0) 153 | return ret 154 | 155 | @property 156 | def dimensions(self): 157 | return (len(self.track[0]), len(self.track)) 158 | 159 | 160 | class RaceTrackGame: 161 | 162 | CAPTION = 'Racing Game' 163 | SCREEN_SIZE = (500, 800) 164 | 165 | OOB_COLOR = (240, 252, 22) 166 | TRACK_COLOR = (147, 150, 155) 167 | FINISH_COLOR = (1, 75, 234) 168 | START_COLOR = (2, 234, 72) 169 | CAR_COLOR = (0, 0, 0) 170 | BACKGROUND_COLOR = (0, 50, 50) 171 | CELL_BORDER = 2 172 | SPEED_RIGHT_MARGIN = SCREEN_SIZE[0]/2 173 | 174 | FONT_SIZE = 25 175 | FONT_HEIGHT = 30 176 | FONT_COLOR = (255, 255, 255) 177 | 178 | TOP_BOTTOM_MARGIN = 10 179 | TRACK_LOCATION = (20, 40) 180 | LEFT_RIGHT_MARGIN = 10 181 | 182 | TRACK_SIZE = (SCREEN_SIZE[0] - 2 * LEFT_RIGHT_MARGIN, SCREEN_SIZE[1] - FONT_HEIGHT - 2 * TOP_BOTTOM_MARGIN) 183 | 184 | def __init__(self, racetrack_csv): 185 | self.screen = pygame.display.get_surface() 186 | self.screen_rect = self.screen.get_rect() 187 | self.done = False 188 | self.keys = pygame.key.get_pressed() 189 | self.racetrack = RaceTrack(racetrack_csv) 190 | self.current_action = [0, 0] 191 | self.font = 
pygame.font.SysFont(pygame.font.get_default_font(), self.FONT_SIZE) 192 | 193 | self.cell_size = self.get_cell_size() 194 | self.track_top_left = self.get_track_drawing_info() 195 | 196 | self.current_state = self.racetrack.starting_line_state() 197 | 198 | self.current_score = 0 199 | 200 | def get_cell_size(self): 201 | track_dimensions = self.racetrack.dimensions 202 | return (int(self.TRACK_SIZE[0] / track_dimensions[0]), int(self.TRACK_SIZE[1] / track_dimensions[1])) 203 | 204 | def get_track_drawing_info(self): 205 | track_dimensions = self.racetrack.dimensions 206 | 207 | # Correct for rounding 208 | actual_track_size = (self.cell_size[0] * track_dimensions[0], self.cell_size[1] * track_dimensions[1]) 209 | margins = (self.TRACK_SIZE[0] - actual_track_size[0], self.TRACK_SIZE[1] - actual_track_size[1]) 210 | 211 | track_top_left = (self.LEFT_RIGHT_MARGIN + margins[0] / 2, self.FONT_HEIGHT + self.TOP_BOTTOM_MARGIN + margins[1] / 2) 212 | 213 | return track_top_left 214 | 215 | def update_current_action(self): 216 | # Forward 217 | if self.keys[pygame.K_i]: 218 | self.current_action[1] = min(self.current_action[1] + 1, 1) 219 | # Back 220 | if self.keys[pygame.K_k]: 221 | self.current_action[1] = max(self.current_action[1] - 1, -1) 222 | # Left 223 | if self.keys[pygame.K_j]: 224 | self.current_action[0] = max(self.current_action[0] - 1, -1) 225 | # Right 226 | if self.keys[pygame.K_l]: 227 | self.current_action[0] = min(self.current_action[0] + 1, 1) 228 | 229 | def draw(self, state, action): 230 | self.screen.fill(RaceTrackGame.BACKGROUND_COLOR) 231 | self.render_current_action(action) 232 | self.render_game_state(state) 233 | 234 | def render_game_state(self, state): 235 | self.render_track() 236 | self.render_current_speed((state[2], state[3])) 237 | self.render_car((state[0], state[1])) 238 | 239 | def render_current_action(self, action): 240 | current_action_string = f'[H: {action[0]}, V: {action[1]}]' 241 | text_surface = self.font.render(current_action_string, True, self.FONT_COLOR) 242 | self.screen.blit(text_surface, (10, 10)) 243 | 244 | def render_current_speed(self, speed): 245 | current_speed_string = f'Current speed: H: {speed[0]}, V: {speed[1]}' 246 | text_surface = self.font.render(current_speed_string, True, self.FONT_COLOR) 247 | self.screen.blit(text_surface, (self.SCREEN_SIZE[0] - self.SPEED_RIGHT_MARGIN, 10)) 248 | 249 | def render_track(self): 250 | for row in range(len(self.racetrack.track)): 251 | for col in range(len(self.racetrack.track[row])): 252 | cell = self.racetrack.track[row][col] 253 | self.draw_cell(cell, col, row) 254 | 255 | def render_car(self, location): 256 | self.draw_cell(RaceTrack.CAR, location[0], location[1]) 257 | 258 | def get_track_pixel_pos(self, col, row): 259 | return (self.track_top_left[0] + col*self.cell_size[0], self.track_top_left[1] + row*self.cell_size[1]) 260 | 261 | def draw_cell(self, cell, col, row): 262 | if cell == RaceTrack.OOB: 263 | color = RaceTrackGame.OOB_COLOR 264 | elif cell == RaceTrack.FINISH: 265 | color = RaceTrackGame.FINISH_COLOR 266 | elif cell == RaceTrack.TRACK: 267 | color = RaceTrackGame.TRACK_COLOR 268 | elif cell == RaceTrack.START: 269 | color = RaceTrackGame.START_COLOR 270 | elif cell == RaceTrack.CAR: 271 | color = RaceTrackGame.CAR_COLOR 272 | else: 273 | raise ValueError('Unknown cell type') 274 | 275 | draw_position = self.get_track_pixel_pos(col, row) 276 | 277 | pygame.draw.rect(self.screen, color, (draw_position[0], draw_position[1], self.cell_size[0] - self.CELL_BORDER, self.cell_size[1] - 
self.CELL_BORDER)) 278 | 279 | def event_loop(self): 280 | for event in pygame.event.get(): 281 | self.keys = pygame.key.get_pressed() 282 | if event.type == pygame.QUIT or self.keys[pygame.K_ESCAPE]: 283 | self.done = True 284 | self.update_current_action() 285 | if self.keys[pygame.K_RETURN]: 286 | a = self.racetrack.action_to_id(self.current_action) 287 | s = self.racetrack.state_to_id(self.current_state) 288 | (r, s, finished) = self.racetrack.perform_action(s, a) 289 | self.current_score += r 290 | self.current_state = self.racetrack.id_to_state(s) 291 | if finished: 292 | print('Finished!!') 293 | print(f'You scored: {self.current_score}') 294 | self.current_score = 0 295 | 296 | def bot_loop(self, bot, episodes, timestep): 297 | for episode in range(episodes): 298 | state = self.racetrack.starting_line_state() 299 | s = self.racetrack.state_to_id(state) 300 | done = False 301 | steps = 0 302 | while not done: 303 | steps += 1 304 | a = bot.get_action(s) 305 | self.draw(self.racetrack.id_to_state(s), self.racetrack.id_to_action(a)) 306 | pygame.display.flip() 307 | (r, s, done) = self.racetrack.perform_action(s, a) 308 | time.sleep(timestep) 309 | print(f'Finished in {steps} steps!') 310 | 311 | 312 | def main_loop(self): 313 | while not self.done: 314 | self.event_loop() 315 | self.draw(self.current_state, self.current_action) 316 | pygame.display.flip() 317 | 318 | @staticmethod 319 | def init(): 320 | os.environ['SDL_VIDEO_CENTERED'] = '1' 321 | pygame.init() 322 | pygame.display.set_caption(RaceTrackGame.CAPTION) 323 | pygame.display.set_mode(RaceTrackGame.SCREEN_SIZE) 324 | 325 | @staticmethod 326 | def quit(): 327 | pygame.quit() 328 | sys.exit() 329 | 330 | @staticmethod 331 | def bot_run(racetrack_file, policy_file, episodes=10, timestep=1): 332 | RaceTrackGame.init() 333 | policy = np.load(policy_file) 334 | bot = RacerBot(policy) 335 | game = RaceTrackGame(racetrack_file) 336 | game.bot_loop(bot, episodes, timestep) 337 | RaceTrackGame.quit() 338 | 339 | @staticmethod 340 | def run(racetrack_file): 341 | RaceTrackGame.init() 342 | game = RaceTrackGame(racetrack_file) 343 | game.main_loop() 344 | RaceTrackGame.quit() -------------------------------------------------------------------------------- /environments/racing/run_trained_racetrack_bot.py: -------------------------------------------------------------------------------- 1 | from environments.racing.racing import RaceTrackGame 2 | import argparse 3 | 4 | 5 | parser = argparse.ArgumentParser(description='Plays the racetrack game with the specified policy.') 6 | 7 | parser.add_argument('racetrack', 8 | type=str, 9 | help='Path to racetrack csv file') 10 | parser.add_argument('policy', 11 | type=str, 12 | help='Path to serialized policy file') 13 | parser.add_argument('--timestep', 14 | type=float, 15 | help='Length of timesteps (s)', 16 | default=0.1) 17 | parser.add_argument('--episodes', 18 | type=int, 19 | help='Number of episodes to train over', 20 | default=10) 21 | parser.add_argument('--verbose', 22 | type=bool, 23 | help='Print (a lot of) log messages', 24 | default=False) 25 | args = parser.parse_args() 26 | 27 | RaceTrackGame.bot_run(args.racetrack, args.policy, episodes=args.episodes, timestep=args.timestep) 28 | -------------------------------------------------------------------------------- /environments/racing/trained_policies/mc_learning.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/trained_policies/mc_learning.npy -------------------------------------------------------------------------------- /environments/racing/trained_policies/q_learning.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/trained_policies/q_learning.npy -------------------------------------------------------------------------------- /environments/racing/trained_policies/random.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/trained_policies/random.npy -------------------------------------------------------------------------------- /environments/racing/trained_policies/sarsa.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/trained_policies/sarsa.npy -------------------------------------------------------------------------------- /lib/policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def sample_action(policy, state): 5 | """ 6 | Samples a policy for an action given the current state. 7 | """ 8 | choices = np.arange(0, policy.shape[1]) 9 | probabilities = policy[state] 10 | 11 | return np.random.choice(choices, p=probabilities) 12 | 13 | 14 | def get_epsilon_greedy_policy(Q, epsilon): 15 | num_actions = Q.shape[1] 16 | policy = (epsilon/num_actions) * np.ones(Q.shape) 17 | 18 | greedy_action_indices = np.argmax(Q, axis=1) 19 | policy[np.arange(0, Q.shape[0]), greedy_action_indices] += (1 - epsilon) 20 | 21 | return policy 22 | 23 | 24 | def get_greedy_policy(Q): 25 | return np.argmax(Q, axis=1) -------------------------------------------------------------------------------- /monte_carlo/README.md: -------------------------------------------------------------------------------- 1 | # Monte Carlo Methods 2 | 3 | In this chapter, we learn about Monte Carlo methods for learning the optimal behavior policy for 4 | finite MDPs. This is just like what we did in the last chapter except here, we do not assume any 5 | knowledge about the inner workings, or the model, of the MDP. For dynamic programming methods, we needed 6 | to know the transition probabilities for state transitions and the rewards associated with them in order to 7 | learn the optimal policy. Here, we learn the policy from experience alone. 8 | 9 | ## Blackjack: Policy Evaluation 10 | 11 | Here, we test out Monte Carlo policy evaluation on a Blackjack environment. We are evaluating the policy 12 | which stays only on 20 or 21 and hits on everything else. Below, you can see my results for running policy evaluation 13 | on this policy, which reproduces Figure 5.2 from the textbook. 
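
A minimal sketch of how this evaluation is run (it mirrors `monte_carlo/exercises/mc_blackjack.py` in this repo; the episode count is whatever you pass on the command line there):

```python
from environments.blackjack.blackjack import Blackjack, BlackjackPlotter
from environments.blackjack.blackjack_policies import BlackjackPolicy
from monte_carlo import mc

# Build the fixed policy that stays only on 20 or 21 and hits on everything else
blackjack = Blackjack(verbose=False)
policy = BlackjackPolicy.generate_policy(stay_on=[20, 21])

# First-visit Monte Carlo policy evaluation from sampled episodes alone
value = mc.fv_policy_evaluation(blackjack, policy, episodes=10000)
BlackjackPlotter.plot_value_functions(value)
```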
14 | 15 | For 10,000 episodes: 16 | 17 | ![BlackjackEvalAce10000](./results/ace_policy_evaluation_10000.png) 18 | ![BlackjackEvalNoAce10000](./results/no_ace_policy_evaluation_10000.png) 19 | 20 | For 500,000 episodes: 21 | 22 | ![BlackjackEvalAce500000](./results/ace_policy_evaluation_500000.png) 23 | ![BlackjackEvalNoAce500000](./results/no_ace_policy_evaluation_500000.png) 24 | 25 | As you can see, using more episodes gives you a better, less noisy picture of the value function. 26 | 27 | ## Blackjack: Monte Carlo Control 28 | 29 | Here, we use a Monte Carlo method to learn the optimal policy. We use the pattern of generalized policy iteration 30 | to do so. Basically, this means we use Monte Carlo simulation to evaluate an arbitrary policy, improve that policy 31 | by being greedy with respect to our evaluation, evaluate that new policy, improve that policy by being greedy with 32 | respect to that evaluation, and so on until the policy stops changing (that means we have reached the optimal policy). 33 | 34 | ![BlackjackUsableAce](./results/ace_optimal.png) 35 | 36 | ![BlackjackNonUsableAce](./results/no_ace_optimal.png) 37 | 38 | ### Exercise 5.4: Racetrack Problem 39 | 40 | For this problem, I used on policy, first visit, epsilon soft Monte Carlo control to learn a policy for how 41 | to drive a car around a racetrack environment. The exact details of this problem are given in the text. Below, 42 | you can see how an agent behaves before and after training with this control method. 43 | 44 | Before: 45 | 46 | ![BotBeforeTraining](./results/untrained_bot_racing.gif) 47 | 48 | As you can see, this bot crashes into the walls a lot and takes a long time to make it to the target (the blue line). 49 | 50 | After: 51 | 52 | ![BotAfterTraining](./results/trained_bot_racing.gif) 53 | 54 | This bot clearly has learned some things about this environment. While it is still not behaving optimally, it is 55 | performing much better than the untrained bot on this environment. 56 | 57 | #### Sources: 58 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: an Introduction. 
2nd ed., The MIT Press, 2012 -------------------------------------------------------------------------------- /monte_carlo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/__init__.py -------------------------------------------------------------------------------- /monte_carlo/exercises/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/exercises/__init__.py -------------------------------------------------------------------------------- /monte_carlo/exercises/blackjack_policy_improvement.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from environments.blackjack.blackjack import Blackjack, BlackjackStates, BlackjackPlotter 4 | from monte_carlo import mc 5 | 6 | parser = argparse.ArgumentParser(description='Blackjack Monte Carlo Policy Improvement') 7 | 8 | parser.add_argument('--iterations', 9 | type=int, 10 | help='Number of iterations to run', 11 | default=5000000) 12 | parser.add_argument('--verbose', 13 | type=bool, 14 | help='Print (a lot of) log messages', 15 | default=False) 16 | args = parser.parse_args() 17 | 18 | 19 | blackjack = Blackjack(verbose=args.verbose) 20 | optimal_policy, Q = mc.det_policy_improvement(blackjack, iterations=args.iterations) 21 | 22 | if args.verbose: 23 | for state_id in range(optimal_policy.shape[0]): 24 | print('--------------------------------') 25 | BlackjackStates.print_state(state_id) 26 | if (optimal_policy[state_id] == Blackjack.HIT_ACTION): 27 | print('HIT') 28 | else: 29 | print('STAY') 30 | 31 | BlackjackPlotter.plot_policies(optimal_policy) 32 | -------------------------------------------------------------------------------- /monte_carlo/exercises/blackjack_soft_policy_improvement.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import lib.policy 4 | from environments.blackjack.blackjack import Blackjack, BlackjackStates, BlackjackPlotter 5 | from monte_carlo import mc 6 | 7 | parser = argparse.ArgumentParser(description='Blackjack Monte Carlo Policy Soft Policy Improvement') 8 | 9 | parser.add_argument('--iterations', 10 | type=int, 11 | help='Number of iterations to run', 12 | default=1000000) 13 | parser.add_argument('--verbose', 14 | type=bool, 15 | help='Print (a lot of) log messages', 16 | default=False) 17 | args = parser.parse_args() 18 | 19 | 20 | blackjack = Blackjack(verbose=args.verbose) 21 | soft_optimal_policy, Q = mc.on_policy_fv_mc_e_soft_control( 22 | blackjack, 23 | epsilon_func=lambda ep, eps: 0.0, 24 | alpha_func=lambda n: 1/n, 25 | episodes=args.iterations, 26 | random_start=True 27 | ) 28 | 29 | optimal_policy = lib.policy.get_greedy_policy(Q) 30 | 31 | if args.verbose: 32 | for state_id in range(optimal_policy.shape[0]): 33 | print('--------------------------------') 34 | BlackjackStates.print_state(state_id) 35 | if (optimal_policy[state_id] == Blackjack.HIT_ACTION): 36 | print('HIT') 37 | else: 38 | print('STAY') 39 | 40 | BlackjackPlotter.plot_policies(optimal_policy) -------------------------------------------------------------------------------- /monte_carlo/exercises/mc_blackjack.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from environments.blackjack.blackjack import Blackjack, BlackjackPlotter 4 | from environments.blackjack.blackjack_policies import BlackjackPolicy 5 | from monte_carlo import mc 6 | 7 | parser = argparse.ArgumentParser(description='Blackjack Monte Carlo Policy Evaluation') 8 | 9 | parser.add_argument('--episodes', 10 | type=int, 11 | help='Number of episodes to train over', 12 | default=10000) 13 | parser.add_argument('--verbose', 14 | type=bool, 15 | help='Print (a lot of) log messages', 16 | default=False) 17 | args = parser.parse_args() 18 | 19 | 20 | blackjack = Blackjack(verbose=args.verbose) 21 | policy = BlackjackPolicy.generate_policy(stay_on=[20, 21]) 22 | 23 | value = mc.fv_policy_evaluation(blackjack, policy, episodes=args.episodes) 24 | BlackjackPlotter.plot_value_functions(value) 25 | -------------------------------------------------------------------------------- /monte_carlo/exercises/mc_racetrack.py: -------------------------------------------------------------------------------- 1 | from environments.racing.racing import RaceTrack 2 | from monte_carlo import mc 3 | import numpy as np 4 | import argparse 5 | 6 | 7 | parser = argparse.ArgumentParser(description='Monte Carlo Racetrack Policy Improvement') 8 | 9 | parser.add_argument('racetrack', 10 | type=str, 11 | help='Path to racetrack csv file') 12 | parser.add_argument('policy', 13 | type=str, 14 | help='Path at which to save policy file') 15 | parser.add_argument('--episodes', 16 | type=int, 17 | help='Number of episodes to train over', 18 | default=1000) 19 | parser.add_argument('--verbose', 20 | type=bool, 21 | help='Print (a lot of) log messages', 22 | default=False) 23 | args = parser.parse_args() 24 | 25 | 26 | racetrack = RaceTrack(args.racetrack) 27 | policy, Q = mc.on_policy_fv_mc_e_soft_control( 28 | racetrack, 29 | epsilon_func=lambda ep, eps: 1 - (ep/eps), 30 | alpha_func=lambda n: 0.1, 31 | episodes=args.episodes 32 | ) 33 | 34 | np.save(args.policy, policy) 35 | -------------------------------------------------------------------------------- /monte_carlo/mc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Monte Carlo methods 3 | 4 | An environment is assumed to support the following operations: 5 | environment.num_states(): Returns the number of states in the environment 6 | environment.num_actions(): Returns the number of actions in the environment 7 | environment.get_random_state(): Returns a random state 8 | environment.perform_action(a): Returns a reward and the next state (r, s') 9 | environment.is_terminal(s): Returns whether a state is terminal or not 10 | 11 | A deterministic policy is a environment.num_states x 1 array 12 | A non-deterministic policy is a environment.num_states x environment.num_actions array 13 | """ 14 | import numpy as np 15 | from tqdm import tqdm 16 | 17 | from lib.policy import sample_action, get_greedy_policy 18 | 19 | 20 | def det_policy_improvement(environment, iterations=100000): 21 | policy = np.zeros(environment.num_states(), dtype=int) 22 | Q = np.zeros((environment.num_states(), environment.num_actions())) 23 | N = np.zeros((environment.num_states(), environment.num_actions())) 24 | 25 | for i in tqdm(range(iterations)): 26 | 27 | states_seen = one_episode_state_action_values(environment, lambda s: policy[s], random_start=True) 28 | 29 | for state, actions_performed in states_seen.items(): 30 | for action, gain in 
actions_performed.items(): 31 | N[state, action] = N[state, action] + 1 32 | Q[state, action] = Q[state, action] + (1.0/(N[state, action]))*(gain - Q[state, action]) 33 | 34 | policy = get_greedy_policy(Q) 35 | 36 | return policy, Q 37 | 38 | 39 | def one_episode_state_action_values(environment, policy, random_start=True): 40 | s = environment.get_starting_state() 41 | states_seen = {} 42 | first_action = True 43 | episode_over = False 44 | steps_taken = 0 45 | while not episode_over: 46 | # If this is the first time we've seen this state 47 | if states_seen.get(s, None) is None: 48 | states_seen[s] = {} 49 | 50 | if first_action and random_start: 51 | a = np.random.randint(0, environment.num_actions()) 52 | first_action = False 53 | else: 54 | # Perform our action 55 | a = policy(s) 56 | 57 | # If this is the first time we've performed this action 58 | # in this state 59 | if states_seen[s].get(a, None) is None: 60 | states_seen[s][a] = 0 61 | 62 | (r, s_prime, episode_over) = environment.perform_action(s, a) 63 | 64 | # Update our gain counters 65 | states_seen = \ 66 | { 67 | state: {action: gain + r for action, gain in actions_performed.items()} 68 | for state, actions_performed 69 | in states_seen.items() 70 | } 71 | 72 | steps_taken += 1 73 | 74 | # Update current state 75 | s = s_prime 76 | 77 | print(f'{steps_taken}') 78 | 79 | return states_seen 80 | 81 | 82 | def on_policy_fv_mc_e_soft_control( 83 | environment, 84 | epsilon_func=lambda ep, eps: 0.1, 85 | alpha_func=lambda n: 0.1, 86 | episodes=10000, 87 | random_start=False 88 | ): 89 | # Initialize with uniform random policy 90 | 91 | policy = (1/environment.num_actions()) * np.ones((environment.num_states(), environment.num_actions())) 92 | 93 | Q = np.zeros((environment.num_states(), environment.num_actions())) 94 | N = np.zeros((environment.num_states(), environment.num_actions())) 95 | 96 | for episode in range(episodes): 97 | states_seen = one_episode_state_action_values(environment, lambda s: sample_action(policy, s), random_start=random_start) 98 | for state, actions_performed in states_seen.items(): 99 | for action, gain in actions_performed.items(): 100 | N[state, action] = N[state, action] + 1 101 | Q[state, action] = Q[state, action] + alpha_func(N[state, action])*(gain - Q[state, action]) 102 | epsilon = epsilon_func(episode, episodes) 103 | num_actions = Q.shape[1] 104 | policy[state] = (epsilon/num_actions) 105 | policy[state, np.argmax(Q[state])] += 1 - epsilon 106 | 107 | return policy, Q 108 | 109 | 110 | def det_fv_policy_q_evaluation(environment, policy, episodes=10000): 111 | """ 112 | First visit MC action-value deterministic policy evaluation with exploring starts. 113 | 114 | Returns the action-value function. 115 | """ 116 | Q = np.zeros((environment.num_states(), environment.num_actions())) 117 | N = np.zeros((environment.num_states(), environment.num_actions())) 118 | 119 | for episode in tqdm(range(episodes)): 120 | states_seen = one_episode_state_action_values(environment, lambda s: policy[s], random_start=True) 121 | for state, actions_performed in states_seen.items(): 122 | for action, gain in actions_performed.items(): 123 | N[state, action] = N[state, action] + 1 124 | Q[state, action] = Q[state, action] + (1.0/(N[state, action]))*(gain - Q[state, action]) 125 | 126 | return Q 127 | 128 | 129 | def fv_policy_evaluation(environment, policy, episodes=10000): 130 | """ 131 | First visit MC policy evaluation. 132 | 133 | Returns the state-value function. 
134 | """ 135 | V = np.zeros(environment.num_states()) 136 | N = np.zeros(environment.num_states()) 137 | 138 | for episode in tqdm(range(episodes)): 139 | s = environment.get_random_state() 140 | states_seen = {} 141 | episode_over = False 142 | while not episode_over: 143 | # If this is the first time we've seen this state 144 | if states_seen.get(s, None) is None: 145 | states_seen[s] = 0 146 | 147 | # Perform our action 148 | a = policy[s] 149 | (r, s_prime, episode_over) = environment.perform_action(s, a) 150 | 151 | # Update our gain counters 152 | states_seen = {state: gain + r for state, gain in states_seen.items()} 153 | 154 | # Update current state 155 | s = s_prime 156 | for state, gain in states_seen.items(): 157 | N[state] = N[state] + 1 158 | V[state] = V[state] + (1.0/(N[state]))*(gain - V[state]) 159 | 160 | return V -------------------------------------------------------------------------------- /monte_carlo/results/ace_optimal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/ace_optimal.png -------------------------------------------------------------------------------- /monte_carlo/results/ace_policy_evaluation_10000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/ace_policy_evaluation_10000.png -------------------------------------------------------------------------------- /monte_carlo/results/ace_policy_evaluation_500000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/ace_policy_evaluation_500000.png -------------------------------------------------------------------------------- /monte_carlo/results/no_ace_optimal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/no_ace_optimal.png -------------------------------------------------------------------------------- /monte_carlo/results/no_ace_policy_evaluation_10000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/no_ace_policy_evaluation_10000.png -------------------------------------------------------------------------------- /monte_carlo/results/no_ace_policy_evaluation_500000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/no_ace_policy_evaluation_500000.png -------------------------------------------------------------------------------- /monte_carlo/results/trained_bot_racing.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/trained_bot_racing.gif 
-------------------------------------------------------------------------------- /monte_carlo/results/untrained_bot_racing.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/untrained_bot_racing.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | matplotlib==2.1.1 3 | numpy==1.13.3 4 | pygame==1.9.3 5 | pyparsing==2.2.0 6 | python-dateutil==2.6.1 7 | pytz==2017.3 8 | six==1.11.0 9 | tqdm==4.19.5 10 | -------------------------------------------------------------------------------- /rl_problem/README.md: -------------------------------------------------------------------------------- 1 | # The Reinforcement Learning Problem 2 | 3 | In this chapter, we learn about the full reinforcement learning problem. 4 | The problem consists of an environment and an agent. We have control over 5 | the agent and are responsible for choosing which **actions** the agent takes. 6 | The environment is outside of the agent and thus, we have no control over it 7 | in general. The agent and environment interact in a simple way. At every time 8 | step, the agent performs some action, and the environment responds with 9 | the next **state** and an immediate **reward**. 10 | 11 | 12 | #### Exercise 3.1 13 | The first exercise is to come up with three example tasks that we can fit into 14 | the reinforcement learning framework. Here are mine: 15 | 16 | 1. A program that plays blackjack. The state is made up of the cards 17 | that it can see on the table. The possible actions are hit or stay. The rewards 18 | would simply be +1 if the hand is won, -1 if the hand is lost, and 0 for any 19 | action that does not cause the hand to end. 20 | 2. A traffic light controller. The reward is the number of cars it is 21 | allowing to pass through so that it promotes effective traffic flow. The state 22 | is readings from distant sensors that the controller has on each side which 23 | tell it how far a car is from each side. The controller can make each of its 24 | four sides one of three colors so there are 3^4 possible actions. 25 | 3. A piano playing program. The action in this case is very simple- which keys 26 | do we press and lift? The state is the keys that have already been played or are already 27 | currently pressed. The reward could be supplied by human listeners and could be a numerical 28 | representation of how much they are currently enjoying the music. 29 | 30 | ### Gridworld 31 | 32 | A very simple example of the reinforcement learning problem is gridworld. 33 | 34 | ![Gridworld](./results/gridworld.png) 35 | 36 | The states are all of the cells on the grid. The possible actions that 37 | we can take are up, down, left, and right. The rules of the environment are: 38 | - If we try to make a move that would take us off the grid, we get a reward 39 | of -1 40 | - If we are on the A square, however, every move takes us to A' and results in 41 | a reward of 10 42 | - If we are on the B square, every move takes us to B' and results in a reward of 43 | 5 44 | - Every other move results in a reward of 0 and takes you to the square you would 45 | expect 46 | 47 | What is the optimal way to act in this environment? In other words, how 48 | do we act so that our "long-term reward" is maximized? 
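
Before getting to that, the rules above are concrete enough to write down as a one-step transition function. The sketch below is a hypothetical, simplified `step` helper, not the repo's implementation (the `GridWorld` class in `rl_problem/gridworld.py` encodes the same rules as reward and transition arrays); the special-square coordinates are taken from that class:

```python
GRID_SIZE = 5
A, A_PRIME = (0, 1), (4, 1)  # special squares as (row, col)
B, B_PRIME = (0, 3), (2, 3)
MOVES = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

def step(state, action):
    """One gridworld transition: returns (next_state, reward)."""
    if state == A:   # every move from A jumps to A' with reward 10
        return A_PRIME, 10
    if state == B:   # every move from B jumps to B' with reward 5
        return B_PRIME, 5
    row = state[0] + MOVES[action][0]
    col = state[1] + MOVES[action][1]
    if not (0 <= row < GRID_SIZE and 0 <= col < GRID_SIZE):
        return state, -1  # moving off the grid: stay put, reward -1
    return (row, col), 0  # every other move: reward 0

# e.g. step((0, 1), 'down') -> ((4, 1), 10)
```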
49 | 50 | #### Discounting 51 | 52 | One tricky thing about maximizing "long-term reward" is that this little game could 53 | potentially go on forever. To make the problem mathematically simpler, 54 | we use a strategy called discounting. This just means we weight 55 | future rewards in an exponentially decreasing way: 56 | 57 | γ^0 R_0 + γ^1 R_1 + γ^2 R_2 + ... 58 | (where 0 <= γ < 1) 59 | 60 | We call this discounted sum of future rewards the "return", which is our 61 | measure of expected long-term reward. 62 | 63 | #### Policies 64 | 65 | So we need to decide how to act in this environment. A policy specifies 66 | how an agent acts. For example, we could have a random policy, where the agent 67 | moves in a random direction at every time step, or we could have an "always down" policy, 68 | where the agent always moves down. In general, a policy is just a probability 69 | distribution that tells us the probability of taking each action in 70 | each state. 71 | 72 | #### Value Functions 73 | 74 | A value function describes the "value" of a particular state or action. 75 | The "value" of a state is the expected return from that state, 76 | as defined above. A value function depends 77 | upon a policy because in order to know how much reward we can expect from a 78 | particular state, we need to know how we are going to act. 79 | 80 | #### Uniform Policy Value Function 81 | 82 | Here is the value function for the uniform random policy (where we choose 83 | an action uniformly at random in every state). 84 | 85 | ![Uniform Random Policy Value Function](./results/uniform.png) 86 | 87 | #### Optimal Value Function 88 | 89 | What we are really interested in, though, is the optimal policy: the policy 90 | that gives us the most possible return from any given state. The optimal 91 | value function gives us the most possible return from any given state, so from 92 | it we can derive the optimal policy. Here is the optimal value function 93 | for gridworld, solved using value iteration. 94 | 95 | ![Optimal Value Function](./results/optimal.png) 96 | 97 | #### Sources: 98 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: An Introduction. 2nd ed., The MIT Press, 2012.
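As a footnote to the sections on discounting and value iteration above, here is a tiny, generic value-iteration sketch on a made-up two-state MDP. It is not the gridworld solver (that lives in `rl_problem/gridworld.py`); the transition probabilities and rewards are invented purely for illustration.

```python
import numpy as np

# Toy 2-state, 2-action MDP (numbers invented for illustration).
# P[a, s, s'] = transition probability, R[a, s] = expected immediate reward.
P = np.array([[[0.9, 0.1], [0.2, 0.8]],    # action 0
              [[0.5, 0.5], [0.0, 1.0]]])   # action 1
R = np.array([[1.0, 0.0],                  # action 0
              [2.0, -1.0]])                # action 1
gamma = 0.9

V = np.zeros(2)
for _ in range(1000):
    # Bellman optimality backup: V(s) <- max_a [ R(a, s) + gamma * sum_s' P(a, s, s') V(s') ]
    V_new = np.max(R + gamma * (P @ V), axis=0)
    if np.sum(np.abs(V_new - V)) < 1e-6:
        break
    V = V_new

print(V)                                        # approximately optimal state values
print(np.argmax(R + gamma * (P @ V), axis=0))   # greedy (optimal) action in each state
```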
-------------------------------------------------------------------------------- /rl_problem/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/rl_problem/__init__.py -------------------------------------------------------------------------------- /rl_problem/exercises/ex_3_17.py: -------------------------------------------------------------------------------- 1 | from rl_problem.gridworld import GridWorld 2 | 3 | g = GridWorld() 4 | 5 | optimal_value_function = g.get_optimal_value_function() 6 | print(optimal_value_function.reshape(5, 5)) 7 | -------------------------------------------------------------------------------- /rl_problem/exercises/gridworld_uniform_policy.py: -------------------------------------------------------------------------------- 1 | from rl_problem.gridworld import GridWorld 2 | 3 | # Using uniform policy 4 | g = GridWorld() 5 | uniform_policy = g.get_uniform_policy() 6 | value_func = g.get_value_function(uniform_policy).reshape(5, 5) 7 | print(value_func) 8 | 9 | -------------------------------------------------------------------------------- /rl_problem/gridworld.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from itertools import product 3 | 4 | 5 | class GridWorld: 6 | 7 | UP = 0 8 | RIGHT = 1 9 | DOWN = 2 10 | LEFT = 3 11 | 12 | def __init__(self, size=5): 13 | self.size = size 14 | self.action_space = [self.UP, self.RIGHT, self.DOWN, self.LEFT] 15 | self.A = (0, 1) 16 | self.A_prime = (4, 1) 17 | self.B = (0, 3) 18 | self.B_prime = (2, 3) 19 | self._rewards = self._init_rewards() 20 | self._transitions = self._init_state_transitions() 21 | 22 | def get_uniform_policy(self): 23 | policy = np.zeros((4, self.size, self.size)) 24 | policy[:, :, :] = 0.25 25 | return policy 26 | 27 | def get_expected_rewards(self, policy): 28 | expected_rewards = policy * self._rewards 29 | return expected_rewards.sum(axis=0) 30 | 31 | def _init_rewards(self): 32 | rewards = np.zeros((4, self.size, self.size)) 33 | rewards[[self.UP, self.DOWN], [0, self.size - 1], :] = -1 34 | rewards[[self.LEFT, self.RIGHT], :, [0, self.size - 1]] = -1 35 | # Special A location 36 | rewards[:, self.A[0], self.A[1]] = 10 37 | # Special B location 38 | rewards[:, self.B[0], self.B[1]] = 5 39 | return rewards 40 | 41 | def _init_state_transitions(self): 42 | state_transitions = np.zeros((4, self.size, self.size, self.size, self.size)) 43 | # Normal cases 44 | for row in range(self.size): 45 | for col in range(self.size): 46 | if row != 0: 47 | state_transitions[self.UP, row, col, row-1, col] = 1 48 | if row != self.size - 1: 49 | state_transitions[self.DOWN, row, col, row+1, col] = 1 50 | if col != 0: 51 | state_transitions[self.LEFT, row, col, row, col-1] = 1 52 | if col != self.size - 1: 53 | state_transitions[self.RIGHT, row, col, row, col+1] = 1 54 | 55 | # Handle edges 56 | for col in range(self.size): 57 | # Moving up or down in top or bottom row leaves you in same state 58 | state_transitions[[self.UP, self.DOWN], [0, self.size - 1], col, [0, self.size - 1], col] = 1 59 | for row in range(self.size): 60 | # Moving left or right in leftmost or rightmost column leaves you in same state 61 | state_transitions[[self.LEFT, self.RIGHT], row, [0, self.size - 1], row, [0, self.size - 1]] = 1 62 | 63 | # Handle A and B 64 | state_transitions[:, [self.A[0], self.B[0]],
[self.A[1], self.B[1]], :, :] = 0 65 | state_transitions[:, self.A[0], self.A[1], self.A_prime[0], self.A_prime[1]] = 1 66 | state_transitions[:, self.B[0], self.B[1], self.B_prime[0], self.B_prime[1]] = 1 67 | 68 | return state_transitions 69 | 70 | def get_value_function(self, policy, gamma=0.9): 71 | # Solve V = R + gamma*P(s,s')*V 72 | transition_probabilities = self.get_transition_probabilities(policy) 73 | expected_rewards = self.get_expected_rewards(policy).reshape(self.size**2) 74 | right_side_inverse = np.linalg.inv(np.identity(self.size**2) - gamma*transition_probabilities) 75 | return np.matmul(right_side_inverse, expected_rewards) 76 | 77 | def get_transition_probabilities(self, policy): 78 | ret = np.zeros((self.size**2, self.size**2)) 79 | for action in self.action_space: 80 | # p(a|s) 81 | action_policy = np.tile(policy[action, :, :].reshape(self.size**2), (self.size**2, 1)) 82 | # p(s'|s, a) 83 | state_transitions = self._transitions[action, :, :, :, :].reshape(self.size**2, self.size**2) 84 | ret = np.add(ret, np.multiply(action_policy, state_transitions)) 85 | return ret 86 | 87 | def get_optimal_value_function(self, gamma=0.9, convergence=0.01): 88 | ret = np.zeros(self.size**2) 89 | copy = np.copy(ret) 90 | diff = None 91 | while diff is None or diff > convergence: 92 | for row, col in product(range(self.size), range(self.size)): 93 | new_reward = None 94 | for action in self.action_space: 95 | next_state_distribution = self._transitions[action, row, col].reshape(self.size**2) 96 | expected_rewards = np.matmul(next_state_distribution, ret) 97 | test = self._rewards[action, row, col] + gamma*expected_rewards 98 | if new_reward is None or test > new_reward: 99 | new_reward = test 100 | copy[row*self.size + col] = new_reward 101 | diff = np.sum(np.fabs(np.subtract(ret, copy))) 102 | ret = copy 103 | copy = np.copy(ret) 104 | return ret 105 | -------------------------------------------------------------------------------- /rl_problem/results/gridworld.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/rl_problem/results/gridworld.png -------------------------------------------------------------------------------- /rl_problem/results/optimal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/rl_problem/results/optimal.png -------------------------------------------------------------------------------- /rl_problem/results/uniform.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/rl_problem/results/uniform.png -------------------------------------------------------------------------------- /runner.py: -------------------------------------------------------------------------------- 1 | import sys, runpy 2 | import os.path 3 | 4 | sys.path.append(os.path.dirname(__file__)) 5 | 6 | executable = sys.argv[1] 7 | sys.argv = sys.argv[1:] 8 | 9 | runpy.run_path(executable) 10 | -------------------------------------------------------------------------------- /td_learning/README.md: -------------------------------------------------------------------------------- 1 | # Temporal-Difference Learning 2 | 3 | In this chapter, we learn 
about temporal difference (TD) learning. Like Monte Carlo methods, temporal difference 4 | learning methods allow us to learn an optimal policy without a model of the environment. This means that we learn the 5 | optimal policy purely through experience. 6 | 7 | The difference between TD and Monte Carlo is that TD uses a technique called bootstrapping. Monte Carlo methods 8 | determine the value of a state based on a sample of all of the rewards that follow from it for the rest of the episode. 9 | So if I am in state 1, then go to states 2, 3, 4, and so on up to state 10, where the episode terminates with a reward 10 | of +1, I update my value estimate for each of those states with that final reward. 11 | 12 | TD learning methods instead use the knowledge that has already been accumulated to update value estimates, rather than waiting for the complete 13 | returns from visited states. So if I am in state 1, then go to state 2, I update my value estimate for state 1 with the immediate reward I 14 | obtained plus my current value estimate for state 2. So I am updating my value estimate based on other estimates: this 15 | is bootstrapping. This is the same idea that Dynamic Programming methods use. 16 | 17 | ## SARSA: On-Policy TD Control for the Racetrack Problem 18 | 19 | SARSA is one algorithm for doing control using temporal difference learning. We begin with an initial, arbitrary policy. 20 | We start in a certain state **S**, take an action **A**, observe reward **R**, arrive in a new state **S'**, and then 21 | take a new action **A'**. All of these (S, A, R, S', A') are used to update our action-value estimates (the estimates 22 | of the value of each state-action pair). So we were in state S and took action A, which gave us reward R and caused us 23 | to end up in state S', about to take action A'. You could say the observed value of (S, A) is the reward we just 24 | received plus the value of (S', A'), so this is the value that we move our value estimate of (S, A) towards. And that's 25 | it for updating our Q (action-value estimates). As for the policy that we follow, we simply behave in an epsilon-greedy 26 | way with respect to our current Q. As we follow this and train, our Q approaches the optimal state-action value function 27 | and our policy approaches the optimal policy. 28 | 29 | Here is my result for applying SARSA control to the racetrack problem: 30 | 31 | ![SarsaRacing](./results/sarsa_trained_bot.gif) 32 | 33 | ## Q-Learning: Off-Policy TD Control for the Racetrack Problem 34 | 35 | Q-learning also uses TD learning techniques but is a bit more clever than SARSA. Q-learning is an "off-policy" learning 36 | technique. This means that the policy that is being learned is not the same as the one that is being followed while 37 | learning. This is different from SARSA, which follows a certain policy, improves that same policy, and eventually 38 | returns that policy. The advantage of "off-policy" methods is that they allow your learning agent to explore and 39 | take riskier actions while the policy being learned can be greedy and only choose the actions that it already 40 | knows are good. 41 | 42 | Here is my result for applying Q-learning to the racetrack problem: 43 | 44 | ![QLearningRacing](./results/q_learning_trained_bot.gif) 45 | 46 | #### Sources: 47 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: An Introduction. 2nd ed., The MIT Press, 2012.
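To summarize the two update rules described above in code, here is a minimal sketch. It is illustrative only: the repository's actual implementations are `sarsa` and `q_learning` in `td_learning/td.py` (which use an implicit discount factor of 1 and a visit-count-based step size); the function names and the `alpha`/`gamma` parameters below are placeholders.

```python
import numpy as np

def sarsa_update(Q, s, a, r, s_next, a_next, alpha=0.1, gamma=1.0):
    # SARSA (on-policy): bootstrap from the action A' actually chosen in S'
    td_target = r + gamma * Q[s_next, a_next]
    Q[s, a] += alpha * (td_target - Q[s, a])

def q_learning_update(Q, s, a, r, s_next, alpha=0.1, gamma=1.0):
    # Q-learning (off-policy): bootstrap from the greedy action in S',
    # regardless of which action the behaviour policy actually takes next
    td_target = r + gamma * np.max(Q[s_next])
    Q[s, a] += alpha * (td_target - Q[s, a])
```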
-------------------------------------------------------------------------------- /td_learning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/td_learning/__init__.py -------------------------------------------------------------------------------- /td_learning/exercises/q_learning_racing.py: -------------------------------------------------------------------------------- 1 | from environments.racing.racing import RaceTrack 2 | from td_learning import td 3 | import numpy as np 4 | import argparse 5 | 6 | 7 | parser = argparse.ArgumentParser(description='Q Learning Racetrack') 8 | 9 | parser.add_argument('racetrack', 10 | type=str, 11 | help='Path to racetrack csv file') 12 | parser.add_argument('policy', 13 | type=str, 14 | help='Path at which to save policy file') 15 | parser.add_argument('--convergence', 16 | type=float, 17 | help='Convergence criteria for Q', 18 | default=10000) 19 | parser.add_argument('--verbose', 20 | type=bool, 21 | help='Print (a lot of) log messages', 22 | default=False) 23 | args = parser.parse_args() 24 | 25 | racetrack = RaceTrack(args.racetrack) 26 | policy, Q = td.q_learning( 27 | racetrack, 28 | alpha_func=lambda n: 1/n, 29 | epsilon=0.2, 30 | convergence=args.convergence 31 | ) 32 | 33 | np.save(args.policy, policy) 34 | -------------------------------------------------------------------------------- /td_learning/exercises/sarsa_racing.py: -------------------------------------------------------------------------------- 1 | from environments.racing.racing import RaceTrack 2 | from td_learning import td 3 | import numpy as np 4 | import argparse 5 | 6 | 7 | parser = argparse.ArgumentParser(description='Sarsa Racetrack Policy Improvement') 8 | 9 | parser.add_argument('racetrack', 10 | type=str, 11 | help='Path to racetrack csv file') 12 | parser.add_argument('policy', 13 | type=str, 14 | help='Path at which to save policy file') 15 | parser.add_argument('--episodes', 16 | type=int, 17 | help='Number of episodes to train over', 18 | default=1000) 19 | parser.add_argument('--verbose', 20 | type=bool, 21 | help='Print (a lot of) log messages', 22 | default=False) 23 | args = parser.parse_args() 24 | 25 | racetrack = RaceTrack(args.racetrack) 26 | policy, Q = td.sarsa( 27 | racetrack, 28 | alpha_func=lambda n: 1/n, 29 | epsilon_func=lambda ep, eps: 1 - (ep/eps), 30 | episodes=args.episodes 31 | ) 32 | 33 | np.save(args.policy, policy) 34 | -------------------------------------------------------------------------------- /td_learning/results/q_learning_trained_bot.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/td_learning/results/q_learning_trained_bot.gif -------------------------------------------------------------------------------- /td_learning/results/sarsa_trained_bot.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/td_learning/results/sarsa_trained_bot.gif -------------------------------------------------------------------------------- /td_learning/td.py: -------------------------------------------------------------------------------- 1 | """ 2 | Temporal Difference learning methods 3 | 4 
| An environment is assumed to support the following operations: 5 | environment.num_states(): Returns the number of states in the environment 6 | environment.num_actions(): Returns the number of actions in the environment 7 | environment.get_starting_state(): Returns a starting state for an episode 8 | environment.perform_action(s, a): Returns a tuple (r, s', episode_over): the reward, the next state, and whether 9 | the episode is over 10 | 11 | A deterministic policy is an environment.num_states x 1 array 12 | A non-deterministic policy is an environment.num_states x environment.num_actions array 13 | """ 14 | import numpy as np 15 | from tqdm import tqdm 16 | 17 | from lib.policy import sample_action, get_epsilon_greedy_policy 18 | 19 | def sarsa( 20 | environment, 21 | epsilon_func=lambda ep, eps: 0.1, 22 | alpha_func=lambda n: 0.1, 23 | episodes=10000 24 | ): 25 | Q = np.zeros((environment.num_states(), environment.num_actions())) 26 | N = np.zeros((environment.num_states(), environment.num_actions())) 27 | policy = get_epsilon_greedy_policy(Q, (1.0/environment.num_actions())) 28 | for ep in tqdm(range(episodes)): 29 | episode_over = False 30 | s = environment.get_starting_state() 31 | a = sample_action(policy, s) 32 | while not episode_over: 33 | (r, s_prime, episode_over) = environment.perform_action(s, a) 34 | 35 | N[s, a] = N[s, a] + 1 36 | 37 | policy = get_epsilon_greedy_policy(Q, epsilon_func(ep, episodes)) 38 | a_prime = sample_action(policy, s_prime)  # choose the next action A' from the next state S' 39 | 40 | Q[s, a] = Q[s, a] + alpha_func(N[s, a]) * (r + Q[s_prime, a_prime] - Q[s, a])  # SARSA update (gamma = 1) 41 | 42 | s = s_prime 43 | a = a_prime 44 | return policy, Q 45 | 46 | def q_learning( 47 | environment, 48 | epsilon=0.3, 49 | alpha_func=lambda n: 0.2, 50 | convergence=0.1 51 | ): 52 | Q = np.zeros((environment.num_states(), environment.num_actions())) 53 | N = np.zeros((environment.num_states(), environment.num_actions())) 54 | diff = np.inf 55 | while diff > convergence: 56 | temp = np.copy(Q) 57 | # Perform 10,000 episodes, then check how much Q has changed 58 | for ep in tqdm(range(10000)): 59 | episode_over = False 60 | s = environment.get_starting_state() 61 | while not episode_over: 62 | policy = get_epsilon_greedy_policy(Q, epsilon) 63 | a = sample_action(policy, s) 64 | 65 | (r, s_prime, episode_over) = environment.perform_action(s, a) 66 | 67 | N[s, a] = N[s, a] + 1 68 | Q[s, a] = Q[s, a] + alpha_func(N[s, a]) * (r + np.amax(Q[s_prime]) - Q[s, a])  # Q-learning update (gamma = 1) 69 | 70 | s = s_prime 71 | diff = np.sum(np.fabs(np.subtract(Q, temp))) 72 | print(f'Diff: {diff}') 73 | 74 | return get_epsilon_greedy_policy(Q, 0.0), Q 75 | --------------------------------------------------------------------------------
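A hypothetical usage sketch for `td.sarsa` (not part of the repository): a tiny random-walk environment that satisfies the interface documented at the top of `td.py`. It assumes the helpers in `lib/policy.py` behave as used above, and it would be run through `runner.py` so that the project root is on the import path.

```python
# Hypothetical example environment; states and actions are integer indices.
from td_learning import td


class RandomWalk:
    """States 0..4; episodes start in state 2 and end at either edge.
    Reaching state 4 gives reward +1, everything else gives 0."""

    def num_states(self):
        return 5

    def num_actions(self):
        return 2  # 0 = move left, 1 = move right

    def get_starting_state(self):
        return 2

    def perform_action(self, s, a):
        s_prime = s - 1 if a == 0 else s + 1
        episode_over = s_prime in (0, 4)
        reward = 1 if s_prime == 4 else 0
        return reward, s_prime, episode_over


policy, Q = td.sarsa(RandomWalk(), episodes=1000)
print(Q)  # the action values should favour moving right in the interior states
```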