├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── bandit_problems ├── README.md ├── __init__.py ├── agents.py ├── bandits.py ├── exercises │ ├── decreasing_epsilon.py │ ├── ex_2_2_a.py │ ├── ex_2_2_b.py │ └── showdown.py ├── results │ ├── decreasing_epsilon.png │ ├── decreasing_epsilon_optimality.png │ ├── exercise_2_2_a.png │ ├── exercise_2_2_a_optimality.png │ ├── exercise_2_2_b.png │ ├── movingBandit.png │ ├── showdown.png │ ├── showdown_op.png │ ├── softmax.png │ ├── softmax_2.png │ ├── softmax_temps.png │ └── softmax_vs_greedy.png └── test_bed.py ├── dynamic_programming ├── README.md ├── __init__.py ├── car_rentals.py ├── exercises │ ├── car_rental_exercise.py │ ├── ex_4_5.py │ └── ex_4_9.py ├── gamblers.py └── results │ ├── e45_optimal_policy.png │ ├── e45_optimal_value.png │ ├── e45_policy_0.png │ ├── e45_policy_1.png │ ├── e45_policy_2.png │ ├── gambler_optimal_policy.png │ ├── gamblers_value_iteration.png │ ├── jack_optimal_policy.png │ ├── jack_optimal_value.png │ ├── jack_policy_0.png │ ├── jack_policy_1.png │ ├── jack_policy_2.png │ ├── jack_policy_3.png │ ├── jack_policy_4.png │ ├── jack_policy_5.png │ ├── policy_4_9_a.png │ ├── policy_4_9_b.png │ ├── value_4_9_a.png │ └── value_4_9_b.png ├── environments ├── __init__.py ├── blackjack │ ├── __init__.py │ ├── blackjack.py │ ├── blackjack_policies.py │ └── interactive_blackjack.py └── racing │ ├── __init__.py │ ├── interactive_racetrack.py │ ├── racetracks │ └── racetrack_a.csv │ ├── racing.py │ ├── run_trained_racetrack_bot.py │ └── trained_policies │ ├── mc_learning.npy │ ├── q_learning.npy │ ├── random.npy │ └── sarsa.npy ├── lib └── policy.py ├── monte_carlo ├── README.md ├── __init__.py ├── exercises │ ├── __init__.py │ ├── blackjack_policy_improvement.py │ ├── blackjack_soft_policy_improvement.py │ ├── mc_blackjack.py │ └── mc_racetrack.py ├── mc.py └── results │ ├── ace_optimal.png │ ├── ace_policy_evaluation_10000.png │ ├── ace_policy_evaluation_500000.png │ ├── no_ace_optimal.png │ ├── no_ace_policy_evaluation_10000.png │ ├── no_ace_policy_evaluation_500000.png │ ├── trained_bot_racing.gif │ └── untrained_bot_racing.gif ├── requirements.txt ├── rl_problem ├── README.md ├── __init__.py ├── exercises │ ├── ex_3_17.py │ └── gridworld_uniform_policy.py ├── gridworld.py └── results │ ├── gridworld.png │ ├── optimal.png │ └── uniform.png ├── runner.py └── td_learning ├── README.md ├── __init__.py ├── exercises ├── q_learning_racing.py └── sarsa_racing.py ├── results ├── q_learning_trained_bot.gif └── sarsa_trained_bot.gif └── td.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | *.pyc 3 | .idea 4 | venv 5 | scratch 6 | *sublime* 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Nicholas Cellino 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning: An Introduction 2 | 3 | This repository contains (some of the) programming exercises from [Reinforcement Learning: An Introduction (Second Edition)](https://mitpress.mit.edu/books/reinforcement-learning) 4 | by Richard S. Sutton and Andrew G. Barto. Each subdirectory in this project contains an overview of a topic covered 5 | in the book, the results from the exercises, and Python code for the exercises. There are also reproductions of some 6 | of the figures from the book and Python code to go along with them as well. 7 | 8 | This is a work in progress. 9 | 10 | ## Topics 11 | 12 | 1. [Chapter 2 - Bandit Problems](./bandit_problems) 13 | 2. [Chapter 3 - The Reinforcement Learning Problem](./rl_problem) 14 | 3. [Chapter 4 - Dynamic Programming](./dynamic_programming) 15 | 4. [Chapter 5 - Monte Carlo Methods](./monte_carlo) 16 | 5. [Chapter 6 - Temporal-Difference Learning](./td_learning) 17 | 18 | ## Getting Started 19 | This project uses Python 3.6 and [venv](https://docs.python.org/3/library/venv.html) 20 | (Note: This is distinct from [virtualenv](https://virtualenv.pypa.io/en/stable/). There 21 | are some issues using matplotlib on OSX with virtualenv). 22 | Ensure that you have both of these installed on your system. 23 | 24 | Then, in the project directory, create your virtual environment: 25 | ``` 26 | python3.6 -m venv venv 27 | ``` 28 | This creates a folder called `venv` in which we can install Python libraries 29 | like [numpy](http://www.numpy.org/) and [matplotlib](http://matplotlib.org/). 30 | 31 | To tell your system to use this environment instead of the system-wide python environment, run: 32 | ``` 33 | source venv/bin/activate 34 | ``` 35 | You will need to do this anytime you want to run examples. 36 | 37 | 38 | Next, to install the required libraries into the virtual environment, run: 39 | ``` 40 | pip install -r requirements.txt 41 | ``` 42 | 43 | All set! Run exercises by calling runner.py followed by the path to the exercise. For example: 44 | ``` 45 | python runner.py bandit_problems/exercises/ex_2_2_a.py 46 | ``` 47 | 48 | For some of the exercises, you can pass arguments to specify certain things about their execution (for example, number of trials in the case 49 | of the n-armed-bandit problems). 
You can see what these parameters are by passing `-h` like so: 50 | ``` 51 | python runner.py bandit_problems/exercises/ex_2_2_a.py -h 52 | ``` 53 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/__init__.py -------------------------------------------------------------------------------- /bandit_problems/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 2 - Bandit Problems 2 | 3 | In this chapter, we learn about the N-Armed-Bandit problem. Consider this problem: 4 | 5 | There are 10 different slot machines. For each slot machine, you pull a lever and 6 | get a certain reward, maybe 0 tokens, maybe 10, maybe a million. You get 1000 pulls. 7 | Your job is to end up with as many tokens as you can by the end of the 1000 pulls. 8 | What is your strategy? 9 | 10 | If the slot machines are all exactly the same, then it doesn't really matter what you do. 11 | You could use all your pulls on 1 machine or choose randomly for each pull and, on average, 12 | you'll get the same result. But what if the machines are not all the same? What if 13 | some of the machines are better than others? For example, say you tried slot machine 1 for 14 | a few pulls and got the following results: 15 | 16 | 1. 3 tokens 17 | 2. 7 tokens 18 | 3. 6 tokens 19 | 4. 5 tokens 20 | 5. 7 tokens 21 | 6. 4 tokens 22 | 23 | Then you try machine 2 for a few pulls and get the following results: 24 | 25 | 1. 8 tokens 26 | 2. 6 tokens 27 | 3. 9 tokens 28 | 4. 8 tokens 29 | 5. 10 tokens 30 | 6. 7 tokens 31 | 32 | While the rewards are still random, machine 2 seems to be giving better results than machine 1 33 | on average. So we need to come up with a strategy that exploits that information in order to get 34 | the most possible tokens at the end. 35 | 36 | This is the essence of the N-Armed-Bandit problem. How do we come up with a strategy to maximize 37 | our reward? 38 | 39 | ### How we approach the problem 40 | 41 | So we need to figure out what the best slot machine is and choose that one as much as possible. 42 | In order to determine which slot machine is the best one, we need to try all the different 43 | slot machines and see which ones give the best rewards. 44 | 45 | So if we have 1000 pulls, we can try each slot machine 100 times, average the results, 46 | and then we'll have a pretty good estimate of how good each slot machine is, right? 47 | Well yeah, but then we've spent all of our pulls so we can't exploit that information. 48 | So how about we try each machine once, then spend the rest of our pulls on whichever one 49 | gave us the best reward? Well that doesn't really guarantee that we've found the best 50 | machine because we only tried each once. 51 | 52 | So we need to balance exploration (finding which machine is the best) with exploitation 53 | (exploiting our knowledge to get the most possible reward). 54 | 55 | ### Epsilon Greedy Method 56 | 57 | The epsilon greedy method is very simple. Basically, we use the reward from each pull 58 | to maintain an estimate for how good each slot machine is. For some percentage of 59 | our pulls, we pick the slot machine that we estimate to be the best. For the rest of our 60 | pulls, we pick a slot machine randomly. 
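Concretely, a single pull under this strategy can be sketched in a few lines of Python. This is only an illustration of the idea (the agents actually used in these exercises live in `agents.py`), and `pull_arm` here stands in for whatever function returns the reward for a chosen machine:

```python
import numpy as np

def epsilon_greedy_pull(value_estimates, pull_counts, epsilon, pull_arm):
    """One epsilon-greedy step: pick an arm, observe the reward, update our estimate."""
    if np.random.random() < epsilon:
        arm = np.random.randint(len(value_estimates))  # explore: pick a machine at random
    else:
        arm = int(np.argmax(value_estimates))          # exploit: pick our current best estimate
    reward = pull_arm(arm)
    pull_counts[arm] += 1
    # incremental sample-average update of this arm's estimated value
    value_estimates[arm] += (reward - value_estimates[arm]) / pull_counts[arm]
    return arm, reward
```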
61 | 62 | The percentage of pulls that we choose randomly is ε (epsilon). So for example, 63 | ε = 0.1 means we choose randomly 10% of the time and are greedy (choose our best estimate) 64 | 90% of the time. 65 | 66 | Here are some results showing the performance of the epsilon greedy methods. 67 | 68 | ![Epsilon Greedy Methods](./results/exercise_2_2_a.png) 69 | 70 | The values at each pull are averages over 2000 trials. 71 | 72 | We can see that ε=0 does not perform too well. This is because it does not spend any 73 | time exploring. It picks some slot machine as the best and chooses it every time no 74 | matter what. With ε=0.1, we can see that we do a little better. We spend more time exploring 75 | so we are able to get better results, but we plateau because we only ever choose our best 76 | estimate for 90% of pulls. With ε=0.01, we do not learn as fast, but we eventually reach a 77 | higher average reward than ε=0.1 because once we figure out which slot machine is best, 78 | we choose it 99% of the time. 79 | 80 | ![Epsilon Greedy Methods Optimal Choice %](./results/exercise_2_2_a_optimality.png) 81 | 82 | This graph shows the percent of the time that each method has chosen the optimal action 83 | at each pull number. We see that for ε=0, it rarely finds the optimal action, 84 | and it doesn't spend any time exploring. For ε=0.1, it spends 10% of its 85 | time exploring so it learns very fast, but it also plateaus because it will 86 | only exploit its knowledge 90% of the time. 87 | 88 | Maybe we can improve this a little. Maybe we'd want to do a little more exploring 89 | at the beginning of our session and as we get towards the end, be more greedy. We can 90 | do that! 91 | 92 | ![Decreasing Epsilon Methods](./results/decreasing_epsilon.png) 93 | 94 | The different lines here show methods where we decrease epsilon at different 95 | rates. 96 | 97 | ![Decreasing Epsilon Methods Optimal Choice %](./results/decreasing_epsilon_optimality.png) 98 | 99 | ### Softmax Method 100 | 101 | With the epsilon greedy method, we kind of took an all or nothing approach 102 | to exploration and exploitation. Either we were exploring, and we'd choose 103 | our arm totally randomly or we were exploiting and being totally greedy. 104 | Softmax methods, on the other hand, explore all the time but use their estimates 105 | of each arm's value to weight how often they choose that arm. This means that 106 | they will choose the arm they estimate to be the best most often and the arm 107 | they estimate to be the worst least often and every arm in between is weighted 108 | accordingly as well. 109 | 110 | ![Softmax Methods](./results/exercise_2_2_b.png) 111 | 112 | They have a parameter called the "temperature" which essentially says how 113 | much to weigh our estimates. Higher temperatures place less importance on 114 | our estimates and choose actions equi-probably. Lower temperatures place more 115 | importance on our estimates and so choose the actions we estimate to be better 116 | more often. As the temperature approaches 0, we start to be greedy 100% of the 117 | time. Picking the temperature is tricky and seems to be mostly a trial and error 118 | type thing. I am not sure if there is a more scientific way to approach that. 119 | 120 | ### 10-Armed Bandit Showdown 121 | 122 | So which bandit performed the best? 
123 | 124 | ![Softmax vs Epsilon Greedy](./results/showdown.png) 125 | 126 | ![Softmax vs Epsilon Greedy Optimal Choice %](./results/showdown_op.png) 127 | 128 | The quantity we are trying to maximize is total rewards which is represented 129 | graphically as the area under the curve. In this experiment, the strategy 130 | in which we decrease epsilon over time performed the best. One interesting 131 | thing that we can see here is that, although the softmax agent generally chooses the optimal 132 | action less than the epsilon greedy agent, it performs about the same because it chooses 133 | "okay" actions much more than it chooses the really bad actions. 134 | 135 | #### Sources: 136 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: an Introduction. 2nd ed., The MIT Press, 2012. 137 | -------------------------------------------------------------------------------- /bandit_problems/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/__init__.py -------------------------------------------------------------------------------- /bandit_problems/agents.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from numpy.random import random, random_integers, normal 4 | import numpy as np 5 | 6 | class Agent: 7 | 8 | def __init__(self, num_arms): 9 | self._num_arms = num_arms 10 | self._results = np.zeros((self._num_arms, 2)) 11 | self._value_estimates = normal(0, 0.01, size=(self._num_arms)) 12 | 13 | def reset(self): 14 | self._value_estimates = normal(size=(self._num_arms)) 15 | self._results = np.zeros((self._num_arms, 2)) 16 | 17 | def _update_value_estimate(self, reward, arm): 18 | self._results[arm, 0] += reward 19 | self._results[arm, 1] += 1 20 | self._value_estimates[arm] = self._results[arm, 0] / self._results[arm, 1] 21 | 22 | def do_pull(self, bandit): 23 | arm = self._choose_arm() 24 | reward = bandit.pull_arm(arm) 25 | self._update_value_estimate(reward, arm) 26 | return reward, bandit.was_optimal_choice(arm) 27 | 28 | class SoftmaxAgent(Agent): 29 | 30 | def __init__(self, temperature, num_arms): 31 | Agent.__init__(self, num_arms) 32 | self._temperature = temperature 33 | 34 | def _gibbs_distribution(self): 35 | dist = np.exp(self._value_estimates/self._temperature) 36 | return dist / np.sum(dist) 37 | 38 | def _get_sample(self, dist): 39 | cumulative_dist = np.cumsum(dist) 40 | r = random() 41 | for i in range(len(cumulative_dist)): 42 | if r < cumulative_dist[i]: 43 | return i 44 | 45 | def _choose_arm(self): 46 | dist = self._gibbs_distribution() 47 | return self._get_sample(dist) 48 | 49 | def __str__(self): 50 | return f'Softmax Agent (t={self._temperature})' 51 | 52 | 53 | class EpsilonGreedyAgent(Agent): 54 | 55 | def __init__(self, epsilon, num_arms): 56 | Agent.__init__(self, num_arms) 57 | self._starting_epsilon = epsilon 58 | self._epsilon = epsilon 59 | 60 | def reset(self): 61 | self._epsilon = self._starting_epsilon 62 | Agent.reset(self) 63 | 64 | def _choose_arm(self): 65 | if random() < self._epsilon: 66 | return random_integers(0, len(self._results) - 1) 67 | else: 68 | return np.argmax(self._value_estimates) 69 | 70 | def __str__(self): 71 | return f'Epsilon Greedy Agent (ε={self._epsilon})' 72 | 73 | 74 | class FixedAlphaEpsilonGreedyAgent(EpsilonGreedyAgent): 75 | 76 | def __init__(self, epsilon, 
num_arms, alpha=0.1): 77 | EpsilonGreedyAgent.__init__(self, epsilon, num_arms) 78 | self._alpha = alpha 79 | 80 | def _update_value_estimate(self, reward, arm): 81 | self._value_estimates[arm] += self._alpha * (reward - self._value_estimates[arm]) 82 | 83 | def __str__(self): 84 | return f'Fixed Alpha Epsilon Greedy Agent (ε={self._epsilon}, α={self._alpha})' 85 | 86 | 87 | class AdjustableEpsilonGreedyAgent(EpsilonGreedyAgent): 88 | 89 | def __init__(self, num_arms, num_turns): 90 | EpsilonGreedyAgent.__init__(self, 1.0, num_arms) 91 | self._num_turns = num_turns 92 | self._num_pulls = 0 93 | 94 | def reset(self): 95 | self._num_pulls = 0 96 | EpsilonGreedyAgent.reset(self) 97 | 98 | def do_pull(self, bandit): 99 | self._adjust_epsilon() 100 | reward, was_optimal = Agent.do_pull(self, bandit) 101 | self._num_pulls += 1 102 | return reward, was_optimal 103 | 104 | 105 | class ExponentialDecreaseEpsilonGreedyAgent(AdjustableEpsilonGreedyAgent): 106 | 107 | def __init__(self, num_arms, num_turns, decline_rate=1.001): 108 | AdjustableEpsilonGreedyAgent.__init__(self, num_arms, num_turns) 109 | self._decline_rate = decline_rate 110 | 111 | # Calculates and sets the next epsilon value 112 | def _adjust_epsilon(self): 113 | self._epsilon = ((1 - (self._decline_rate**(-self._num_pulls))) / 114 | (self._decline_rate**(-self._num_turns) - 1)) + 1 115 | 116 | def __str__(self): 117 | return f'Exponentially Decreasing Epsilon Greedy Agent (decline_rate={self._decline_rate})' 118 | 119 | 120 | class LinearDecreaseEpsilonGreedyAgent(AdjustableEpsilonGreedyAgent): 121 | 122 | # Sets the next epsilon value 123 | def _adjust_epsilon(self): 124 | progress = float(self._num_pulls) / self._num_turns 125 | self._epsilon = 1 - progress 126 | 127 | def __str__(self): 128 | return f'Linearly Decreasing Epsilon Greedy Agent' 129 | -------------------------------------------------------------------------------- /bandit_problems/bandits.py: -------------------------------------------------------------------------------- 1 | from numpy.random import normal, randn 2 | import numpy as np 3 | 4 | class NArmedBandit(object): 5 | 6 | def __init__(self, n): 7 | self._arms = randn(n) 8 | 9 | def pull_arm(self, arm): 10 | self.validate_arm(arm) 11 | return self._arms[arm] + normal() 12 | 13 | def num_arms(self): 14 | return len(self._arms) 15 | 16 | def validate_arm(self, arm): 17 | if arm < 0 or arm >= self.num_arms(): 18 | raise ValueError("This arm does not exist.") 19 | 20 | def was_optimal_choice(self, arm): 21 | """ 22 | Tells if the choice was optimal. 
23 | 24 | Should be used for analysis purposes only 25 | (in other words, not for actually solving the problem) 26 | """ 27 | self.validate_arm(arm) 28 | return np.argmax(self._arms) == arm 29 | 30 | 31 | class MovingNArmedBandit(NArmedBandit): 32 | 33 | def __init__(self, n, sigma=0.1): 34 | super(MovingNArmedBandit, self).__init__(n) 35 | self._sigma = sigma 36 | 37 | def pull_arm(self, arm): 38 | value = super(MovingNArmedBandit, self).pull_arm(arm) 39 | self._arms += self._sigma * randn(len(self._arms)) 40 | return value 41 | -------------------------------------------------------------------------------- /bandit_problems/exercises/decreasing_epsilon.py: -------------------------------------------------------------------------------- 1 | from bandit_problems.agents import * 2 | from bandit_problems.test_bed import TestBed 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="Exercise 2.2") 6 | parser.add_argument('--arms', 7 | type=int, 8 | help='Number of arms for the bandit', 9 | default=10) 10 | parser.add_argument('--trials', 11 | type=int, 12 | help='Number of trials to average over', 13 | default=2000) 14 | parser.add_argument('--pulls', 15 | type=int, 16 | help='Number of pulls per trial', 17 | default=1000) 18 | args = parser.parse_args() 19 | 20 | # Parameters 21 | num_arms = args.arms 22 | num_trials = args.trials 23 | num_pulls = args.pulls 24 | 25 | agents = [] 26 | agents.append(EpsilonGreedyAgent(0.1, num_arms)) 27 | agents.append(ExponentialDecreaseEpsilonGreedyAgent(num_arms, num_pulls, decline_rate=1.01)) 28 | agents.append(ExponentialDecreaseEpsilonGreedyAgent(num_arms, num_pulls, decline_rate=1.0075)) 29 | agents.append(ExponentialDecreaseEpsilonGreedyAgent(num_arms, num_pulls, decline_rate=1.015)) 30 | 31 | tb = TestBed(agents, num_arms, num_trials=num_trials, num_pulls=num_pulls) 32 | tb.run() 33 | tb.plot_results(title='Decreasing Epsilon Value') 34 | -------------------------------------------------------------------------------- /bandit_problems/exercises/ex_2_2_a.py: -------------------------------------------------------------------------------- 1 | from bandit_problems.agents import * 2 | from bandit_problems.test_bed import TestBed 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="Exercise 2.2") 6 | parser.add_argument('--arms', 7 | type=int, 8 | help='Number of arms for the bandit', 9 | default=10) 10 | parser.add_argument('--trials', 11 | type=int, 12 | help='Number of trials to average over', 13 | default=2000) 14 | parser.add_argument('--pulls', 15 | type=int, 16 | help='Number of pulls per trial', 17 | default=3000) 18 | args = parser.parse_args() 19 | 20 | # Parameters 21 | num_arms = args.arms 22 | num_trials = args.trials 23 | num_pulls = args.pulls 24 | 25 | agents = [] 26 | agents.append(EpsilonGreedyAgent(0, num_arms)) 27 | agents.append(EpsilonGreedyAgent(0.01, num_arms)) 28 | agents.append(EpsilonGreedyAgent(0.1, num_arms)) 29 | 30 | tb = TestBed(agents, num_arms, num_trials=num_trials, num_pulls=num_pulls) 31 | tb.run() 32 | tb.plot_results(title='Exercise 2.2') 33 | -------------------------------------------------------------------------------- /bandit_problems/exercises/ex_2_2_b.py: -------------------------------------------------------------------------------- 1 | from bandit_problems.agents import * 2 | from bandit_problems.test_bed import TestBed 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="Exercise 2.2") 6 | parser.add_argument('--arms', 7 | type=int, 8 | 
help='Number of arms for the bandit', 9 | default=10) 10 | parser.add_argument('--trials', 11 | type=int, 12 | help='Number of trials to average over', 13 | default=2000) 14 | parser.add_argument('--pulls', 15 | type=int, 16 | help='Number of pulls per trial', 17 | default=1000) 18 | args = parser.parse_args() 19 | 20 | # Parameters 21 | num_arms = args.arms 22 | num_trials = args.trials 23 | num_pulls = args.pulls 24 | 25 | agents = [] 26 | agents.append(SoftmaxAgent(0.1, num_arms)) 27 | agents.append(SoftmaxAgent(0.2, num_arms)) 28 | agents.append(SoftmaxAgent(0.3, num_arms)) 29 | agents.append(SoftmaxAgent(0.4, num_arms)) 30 | agents.append(SoftmaxAgent(0.5, num_arms)) 31 | 32 | tb = TestBed(agents, num_arms, num_trials=num_trials, num_pulls=num_pulls) 33 | tb.run() 34 | tb.plot_results(title='Exercise 2.2') 35 | -------------------------------------------------------------------------------- /bandit_problems/exercises/showdown.py: -------------------------------------------------------------------------------- 1 | from bandit_problems.agents import * 2 | from bandit_problems.test_bed import TestBed 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(description="Bandit Showdown") 6 | parser.add_argument('--arms', 7 | type=int, 8 | help='Number of arms for the bandit', 9 | default=10) 10 | parser.add_argument('--trials', 11 | type=int, 12 | help='Number of trials to average over', 13 | default=2000) 14 | parser.add_argument('--pulls', 15 | type=int, 16 | help='Number of pulls per trial', 17 | default=3000) 18 | args = parser.parse_args() 19 | 20 | # Parameters 21 | num_arms = args.arms 22 | num_trials = args.trials 23 | num_pulls = args.pulls 24 | 25 | agents = [] 26 | agents.append(EpsilonGreedyAgent(0.1, num_arms)) 27 | agents.append(ExponentialDecreaseEpsilonGreedyAgent(num_arms, num_pulls, decline_rate=1.015)) 28 | agents.append(SoftmaxAgent(0.3, num_arms)) 29 | 30 | tb = TestBed(agents, num_arms, num_trials=num_trials, num_pulls=num_pulls) 31 | tb.run() 32 | tb.plot_results(title='Decreasing Epsilon Value') -------------------------------------------------------------------------------- /bandit_problems/results/decreasing_epsilon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/decreasing_epsilon.png -------------------------------------------------------------------------------- /bandit_problems/results/decreasing_epsilon_optimality.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/decreasing_epsilon_optimality.png -------------------------------------------------------------------------------- /bandit_problems/results/exercise_2_2_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/exercise_2_2_a.png -------------------------------------------------------------------------------- /bandit_problems/results/exercise_2_2_a_optimality.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/exercise_2_2_a_optimality.png -------------------------------------------------------------------------------- /bandit_problems/results/exercise_2_2_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/exercise_2_2_b.png -------------------------------------------------------------------------------- /bandit_problems/results/movingBandit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/movingBandit.png -------------------------------------------------------------------------------- /bandit_problems/results/showdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/showdown.png -------------------------------------------------------------------------------- /bandit_problems/results/showdown_op.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/showdown_op.png -------------------------------------------------------------------------------- /bandit_problems/results/softmax.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/softmax.png -------------------------------------------------------------------------------- /bandit_problems/results/softmax_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/softmax_2.png -------------------------------------------------------------------------------- /bandit_problems/results/softmax_temps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/softmax_temps.png -------------------------------------------------------------------------------- /bandit_problems/results/softmax_vs_greedy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/bandit_problems/results/softmax_vs_greedy.png -------------------------------------------------------------------------------- /bandit_problems/test_bed.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | from bandit_problems.bandits import NArmedBandit, MovingNArmedBandit 6 | 7 | 8 | class TestBed: 9 | 10 | _plot_colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w'] 11 | 12 | def 
__init__(self, 13 | agents, 14 | num_arms=10, 15 | num_trials=2000, 16 | num_pulls=1000): 17 | self._num_arms = num_arms 18 | self._num_trials = num_trials 19 | self._num_pulls = num_pulls 20 | self._agents = agents 21 | self._results = np.zeros((len(agents), num_pulls)) 22 | self._optimal_choices = np.zeros((len(agents), num_pulls)) 23 | 24 | def _reset_agents(self): 25 | for agent in self._agents: 26 | agent.reset() 27 | 28 | def run(self): 29 | for trial_num in tqdm(range(self._num_trials)): 30 | b = NArmedBandit(self._num_arms) 31 | self._reset_agents() 32 | for pull in range(self._num_pulls): 33 | for i in range(len(self._agents)): 34 | reward, was_optimal = self._agents[i].do_pull(b) 35 | self._results[i, pull] += reward 36 | if was_optimal: 37 | self._optimal_choices[i, pull] += 1 38 | 39 | def run_moving(self): 40 | for trial_num in tqdm(range(self._num_trials)): 41 | b = MovingNArmedBandit(self._num_arms, 0.1) 42 | self._reset_agents() 43 | for pull in range(self._num_pulls): 44 | for i in range(len(self._agents)): 45 | reward, was_optimal = self._agents[i].do_pull(b) 46 | self._results[i, pull] += reward 47 | if was_optimal: 48 | self._optimal_choices[i, pull] += 1 49 | 50 | def plot_results(self, title): 51 | plt.figure(1) 52 | avgs = self._results / self._num_trials 53 | for i in range(len(self._agents)): 54 | plt.plot(avgs[i], self._plot_colors[i%len(self._plot_colors)], label=str(self._agents[i])) 55 | plt.title(title) 56 | plt.xlabel('Pull Number') 57 | plt.ylabel('Average Reward') 58 | plt.legend(loc=4) 59 | 60 | plt.figure(2) 61 | optimal_choices_avgs = self._optimal_choices / self._num_trials 62 | for i in range(len(self._agents)): 63 | plt.plot(optimal_choices_avgs[i], self._plot_colors[i%len(self._plot_colors)], label=str(self._agents[i])) 64 | plt.title(title) 65 | plt.xlabel('Pull Number') 66 | plt.ylabel('Percent Optimal Action Choice') 67 | plt.legend(loc=4) 68 | 69 | plt.show() 70 | -------------------------------------------------------------------------------- /dynamic_programming/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 4 - Dynamic Programming 2 | 3 | In this chapter, we learn about using dynamic programming techniques to solve 4 | finite MDPs. By "solve," in this context, we mean find the optimal way to behave 5 | in the MDP so as to maximize our return. 6 | 7 | ## Policy Evaluation 8 | 9 | The first important idea from this chapter is policy **evaluation**. This simply refers 10 | to the process of determining the value functions for a certain policy. One way 11 | to do this using dynamic programming is by taking an iterative approach. 12 | 13 | We start with a given policy π and an arbitrary state-value function v(s)- we can 14 | choose the state-value function that is 0 for all states. Then, we try to calculate v(s) 15 | for each state in the state space. To do so, we look ahead one action, 16 | and for each action, we look ahead at 17 | the possible next states. For each of these actions a and next states s', we calculate 18 | the return, which is the sum of the expected immediate reward and the discounted sum of 19 | the return of the next state. We sum all these together, with each weighted 20 | by their probability of occurring. Since the return of the next state is not actually 21 | known, this is still only an estimate, but if we apply this procedure iteratively, 22 | we are guaranteed to converge to the true value function. 
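For a finite MDP whose dynamics are known, that iterative sweep can be written compactly. The sketch below is illustrative rather than the code used in this chapter: it assumes a deterministic policy and that the transition probabilities `p[s][a][s2]` and expected rewards `r[s][a][s2]` are given as lookup tables (the car-rental code in this directory does the same thing with problem-specific tables):

```python
import numpy as np

def evaluate_policy(policy, num_states, p, r, gamma=0.9, tol=1e-4):
    """Iterative policy evaluation: sweep the states until the value function stops changing."""
    v = np.zeros(num_states)
    while True:
        delta = 0.0
        for s in range(num_states):
            a = policy[s]
            # one-step lookahead: expected immediate reward plus discounted value of the next state
            new_v = sum(p[s][a][s2] * (r[s][a][s2] + gamma * v[s2])
                        for s2 in range(num_states))
            delta = max(delta, abs(new_v - v[s]))
            v[s] = new_v
        if delta < tol:
            return v
```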
23 | 24 | ## Policy Improvement 25 | 26 | Okay, so with policy evaluation, we have a method to learn the value function 27 | for a given policy in an environment. But our goal is to find the optimal way 28 | to behave in this environment- the optimal policy. 29 | 30 | Once we have the value function, this is actually pretty easy. If we know the 31 | value function for a certain policy, we can look at each state and see if the 32 | policy takes the optimal action from that state- remember that we know at this point 33 | the value of all possible next states, the expected rewards from each action, and the 34 | probability of transitioning from state s to state s' given the action a. If it does not 35 | take the optimal action, then there is clearly an opportunity to **improve** this policy. 36 | We can improve the policy by, from each state, selecting the action that gives us 37 | the most return. Put another way, we should be **greedy** with respect to the policy's 38 | value function. Once we do this, we end up with another policy which is better than 39 | the one we started with. More formally, the state-value function of this policy is greater 40 | than or equal to the state-value function of the previous policy for every state s. 41 | If the state-value function is higher for every state, that intuitively means this policy can 42 | extract more return from this environment in the long run. 43 | 44 | ## Policy Iteration 45 | 46 | The policy iteration algorithm combines these two algorithms in order to find the optimal policy. We start with 47 | an arbitrary policy and value function. Then, we evaluate this policy. Then, we improve that policy. Then, we evaluate 48 | this policy. And so on, until the policy remains the same for two steps in a row. At this point, the policy is greedy 49 | with respect to its own value function. This implies that this policy's value function satisfies the Bellman 50 | optimality equation and thus, this is an optimal policy. 51 | 52 | 53 | ### Exercise: Jack's Car Rental 54 | 55 | Jack's Car Rental problem is described in Sutton and Barto **Example 4.2** and **Exercise 4.5**. 56 | 57 | The basic problem is this: Jack manages two dealerships for his car rental business. Let's call them A and B. 58 | Every day, some customers arrive at each location and request cars. If Jack has a car for them, he can rent it to them 59 | and get $10. If he does not have a car, he loses their business and makes no money. Jack can move cars between dealerships 60 | at night for a cost of $2/car to help make sure he has cars where they are needed, but he can only move a maximum of 5 cars 61 | per night. Every day, some number of people 62 | also return cars to each dealership, and those are available for rental the next day. The number of people who 63 | request and return cars to each dealership are Poisson random variables. 64 | 65 | For dealership A, the request and return probabilities have expected values 3 and 3, respectively. 66 | 67 | For dealership B, the request and return probabilities have expected values 4 and 2, respectively. 68 | 69 | Also, there can be no more than 20 cars at each location- any additional cars get returned to the nationwide company. 70 | 71 | We can use policy iteration to find the optimal policy for this environment. The states in this environment are how many 72 | cars are at each dealership. The actions are how many cars we move from A to B (a negative number means we move cars from 73 | B to A). 
So the actions are integers in the range \[-5, 5\]. The rewards are how much money Jack makes in each time step. 74 | The book says to use a discount factor of 0.9, so that's what we'll do. 75 | 76 | Here are my results for running policy iteration on this problem: 77 | 78 | ![Policy 0](./results/jack_policy_0.png) 79 | 80 | ![Policy 1](./results/jack_policy_1.png) 81 | 82 | ![Policy 2](./results/jack_policy_2.png) 83 | 84 | ![Policy 3](./results/jack_policy_3.png) 85 | 86 | ![Policy 4](./results/jack_policy_4.png) 87 | 88 | ![Policy 5](./results/jack_policy_5.png) 89 | 90 | ![Optimal Policy](./results/jack_optimal_policy.png) 91 | 92 | ![Optimal Value](./results/jack_optimal_value.png) 93 | 94 | As you can see, I started with the policy that moves 0 cars no matter what. At each iteration, 95 | the policy changes slightly until there is no difference between policy 5 and the optimal policy. I'm not sure 96 | why my results differ slightly from those shown in the book (Figure 4.4). 97 | Policy 1 is slightly different when dealership B has 20 cars and my optimal value function looks 98 | to max out at a slightly higher value. This may be due to mistakes on my part or different convergence 99 | criteria. The rest, however, seem to conform exactly to the figures in the book. 100 | 101 | ### Exercise: Jack's Car Rental With Help 102 | 103 | Now, we add a couple of things to this problem. 104 | 105 | One of Jack's employees takes the bus home from near dealership A to near dealership B every night. 106 | She is willing to drive a car from A to B for free. 107 | 108 | Also, Jack's parking lot just shrank. If he has more than 10 cars at a certain dealership, 109 | he will now have to rent an additional lot for a cost of $4 for that location. 110 | 111 | Here are my results for running policy iteration on that problem: 112 | 113 | ![Policy 0](./results/e45_policy_0.png) 114 | 115 | ![Policy 1](./results/e45_policy_1.png) 116 | 117 | ![Policy 2](./results/e45_policy_2.png) 118 | 119 | ![Optimal Policy](./results/e45_optimal_policy.png) 120 | 121 | ![Optimal Value](./results/e45_optimal_value.png) 122 | 123 | While I am not positive that these results are correct, we can see by inspection that 124 | the optimal policy does make sense. For example, it usually makes sense to take advantage of that free car 125 | transport from A to B because B usually gets more requests than A, unless it means that it will make dealership 126 | B have more than 10 cars. We also see where this policy tries to avoid that $4 parking lot 127 | overhead. 128 | 129 | ## Value Iteration 130 | 131 | Value iteration functions in a similar way to policy iteration but takes a shortcut. It essentially cuts short 132 | the policy evaluation step and attempts, at each iteration, to maximize the value function by being greedy with respect 133 | to the previous value function. 134 | 135 | ### Exercise: Gambler's Problem 136 | 137 | A gambler flips a coin. If it lands on heads, he wins. If it lands on tails, he loses. He starts off with 138 | $1 and can bet in dollar increments. His goal is to get to $100. 139 | 140 | So the states are how much money he has, and the actions are how much he bets. The rewards are 0 for everything 141 | except reaching the $100 state, in which case he gets a reward of 1.
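The value-iteration update for this problem is short enough to sketch directly. This is a condensed illustration; `gamblers.py` in this directory contains the full version used for these exercises (including the intermediate sweeps shown below and the conservative tie-breaking when extracting the policy):

```python
import numpy as np

def gamblers_value_iteration(p_heads=0.4, goal=100, tol=1e-4):
    """Value iteration for the gambler's problem (undiscounted; reward 1 only on reaching the goal)."""
    v = np.zeros(goal + 1)  # v[0] and v[goal] stay 0: both are terminal states
    while True:
        delta = 0.0
        for s in range(1, goal):
            stakes = range(1, min(s, goal - s) + 1)
            # expected return of each stake: win with probability p_heads, lose otherwise
            returns = [p_heads * ((1.0 if s + a == goal else 0.0) + v[s + a])
                       + (1 - p_heads) * v[s - a]
                       for a in stakes]
            best = max(returns)
            delta = max(delta, abs(best - v[s]))
            v[s] = best
        if delta < tol:
            return v
```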
142 | 143 | Here are the results of running value iteration on this problem: 144 | 145 | ![Gambler's Value Iteration](./results/gamblers_value_iteration.png) 146 | 147 | We can see how these value functions are tending towards a single function as we iterate further. 148 | 149 | ![Gambler's Optimal Policy](./results/gambler_optimal_policy.png) 150 | 151 | This is one optimal policy for this problem. There are different optimal policies for this problem. This one 152 | was chosen to replicate the result in Sutton and Barto: it is generated by choosing the most conservative/lowest bet 153 | out of all the optimal bets. 154 | 155 | ### Exercise: Gambler's Problem (ph=0.25 and ph=0.55) 156 | 157 | #### ph=0.25 Results 158 | 159 | ![Value 0.25](./results/value_4_9_a.png) 160 | 161 | ![Policy 0.25](./results/policy_4_9_a.png) 162 | 163 | #### ph=0.55 Results 164 | 165 | ![Value 0.55](./results/value_4_9_b.png) 166 | 167 | ![Policy 0.55](./results/policy_4_9_b.png) 168 | 169 | 170 | #### Sources: 171 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: an Introduction. 2nd ed., The MIT Press, 2012. 172 | -------------------------------------------------------------------------------- /dynamic_programming/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/__init__.py -------------------------------------------------------------------------------- /dynamic_programming/car_rentals.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from mpl_toolkits.mplot3d import Axes3D 4 | import math 5 | from tqdm import tqdm 6 | 7 | class JacksCarRental: 8 | 9 | EXPECTED_RETURNS_A = 3 10 | EXPECTED_REQUESTS_A = 3 11 | EXPECTED_RETURNS_B = 2 12 | EXPECTED_REQUESTS_B = 4 13 | MOVING_CAR_COST = 2 14 | RENTAL_SALE_PRICE = 10 15 | 16 | # Don't bother computing poisson for anything above this 17 | # It will be very close to 0 18 | POISSON_CUTOFF = 14 19 | 20 | def __init__(self, max_cars=21): 21 | """ 22 | :param max_cars: Non-inclusive upper-bound for how many cars can be at a dealership 23 | """ 24 | self.max_cars = max_cars 25 | self.action_space = np.arange(-5, 6) 26 | self.a_transitions = self.init_transition_probabilities('A') 27 | self.b_transitions = self.init_transition_probabilities('B') 28 | self.a_expected_revenue = self.init_expected_revenue('A') 29 | self.b_expected_revenue = self.init_expected_revenue('B') 30 | 31 | def init_expected_revenue(self, dealership): 32 | """ 33 | Returns a self.max_cars x self.max_cars x len(self.action_space) array. 34 | Each cell holds the expected revenue for the specified dealership with 35 | the specified previous state, next state, and action. 
36 | :param dealership: 'A' or 'B' 37 | """ 38 | revenue = np.zeros((self.action_space.shape[0], self.max_cars, self.max_cars)) 39 | for cars in range(self.max_cars): 40 | for cars_after in range(self.max_cars): 41 | for action in self.action_space: 42 | if (dealership is 'A' and cars - action < 0) or (dealership is 'B' and cars + action < 0): 43 | continue 44 | revenue[action, cars, cars_after] = self.get_expected_revenue(dealership, action, cars, cars_after) 45 | return revenue 46 | 47 | def get_expected_revenue(self, dealership, action, now, after): 48 | if dealership is 'A': 49 | after_move = now - action 50 | elif dealership is 'B': 51 | after_move = now + action 52 | else: 53 | raise ValueError('Dealership must be A or B') 54 | 55 | expected_revenue = 0.0 56 | for requests in range(self.POISSON_CUTOFF): 57 | probability = self.expected_requests_probability(dealership, requests) 58 | expected_revenue += probability * self.RENTAL_SALE_PRICE * min(after_move, requests) 59 | 60 | return expected_revenue 61 | 62 | def init_transition_probabilities(self, dealership): 63 | ret = np.zeros((self.max_cars, self.max_cars)) 64 | for current in range(ret.shape[0]): 65 | for next in range(ret.shape[1]): 66 | probability = 0.0 67 | for requests in range(self.POISSON_CUTOFF): 68 | for returns in range(self.POISSON_CUTOFF): 69 | cars_after_requests = max(current - requests, 0) 70 | cars_after_returns = min(cars_after_requests + returns, self.max_cars - 1) 71 | if cars_after_returns == next: 72 | request_probability = self.expected_requests_probability(dealership, requests) 73 | return_probability = self.expected_returns_probability(dealership, returns) 74 | probability += request_probability * return_probability 75 | ret[current, next] = probability 76 | return ret 77 | 78 | def expected_returns_probability(self, dealership, returns): 79 | if dealership is 'A': 80 | return self.poisson(self.EXPECTED_RETURNS_A, returns) 81 | elif dealership is 'B': 82 | return self.poisson(self.EXPECTED_RETURNS_B, returns) 83 | else: 84 | raise ValueError('Dealership must be A or B') 85 | 86 | def expected_requests_probability(self, dealership, requests): 87 | if dealership is 'A': 88 | return self.poisson(self.EXPECTED_REQUESTS_A, requests) 89 | elif dealership is 'B': 90 | return self.poisson(self.EXPECTED_REQUESTS_B, requests) 91 | else: 92 | raise ValueError('Dealership must be A or B') 93 | 94 | def poisson(self, expected, num): 95 | ret = ((expected**num)/math.factorial(num))*math.exp(-expected) 96 | return ret 97 | 98 | def get_action_cost(self, current, action): 99 | return abs(action) * self.MOVING_CAR_COST 100 | 101 | def get_expected_reward(self, action, current, next): 102 | cost = self.get_action_cost(current, action) 103 | 104 | expected_sales_a = self.a_expected_revenue[action, current[0], next[0]] 105 | expected_sales_b = self.b_expected_revenue[action, current[1], next[1]] 106 | 107 | return expected_sales_a + expected_sales_b - cost 108 | 109 | def next_state_probability(self, current, next, action): 110 | immediate_a = current[0] - action 111 | immediate_b = current[1] + action 112 | if immediate_a < 0 or immediate_a > (self.max_cars - 1): 113 | return 0.0 114 | elif immediate_b < 0 or immediate_b > (self.max_cars - 1): 115 | return 0.0 116 | probability_a = self.a_transitions[immediate_a, next[0]] 117 | probability_b = self.b_transitions[immediate_b, next[1]] 118 | return probability_a * probability_b 119 | 120 | def expected_return(self, state, action, state_value, gamma): 121 | (a, b) = state 122 
| next_state_gain_expectation = 0.0 123 | for a_prime in range(self.max_cars): 124 | for b_prime in range(self.max_cars): 125 | probability_next_state = self.next_state_probability((a, b), (a_prime, b_prime), action) 126 | immediate_reward = self.get_expected_reward(action, (a, b), (a_prime, b_prime)) 127 | next_state_gain_expectation += probability_next_state * (immediate_reward + gamma * state_value[a_prime, b_prime]) 128 | return next_state_gain_expectation 129 | 130 | def evaluate_policy(self, policy, gamma=0.9, convergence=1.0): 131 | """ 132 | Generates a value function for a given deterministic policy. 133 | The policy should specify the action [-5, +5] for each 134 | state, which is the number of cars at location A and the number 135 | of cars at location B, where each ranges from 0 to 20. 136 | 137 | :param policy: A self.max_cars x self.max_cars array 138 | :return: A self.max_cars x self.max_cars array 139 | """ 140 | ret = np.zeros((self.max_cars, self.max_cars)) 141 | diff = np.inf 142 | print(f'Evaluating policy until diff < {convergence}') 143 | while diff > convergence: 144 | temp = np.copy(ret) 145 | for a in range(policy.shape[0]): 146 | for b in range(policy.shape[1]): 147 | ret[a, b] = self.expected_return((a, b), policy[a, b], temp, gamma) 148 | diff = np.max(np.fabs(np.subtract(ret, temp))) 149 | print(f'Diff: {diff}') 150 | return ret 151 | 152 | def get_greedy_policy(self, value, gamma=0.9): 153 | """ 154 | Generates a policy that is greedy with respect to the provided value function. 155 | 156 | :param value: A self.max_cars x self.max_cars array 157 | :return: A self.max_cars x self.max_cars array 158 | """ 159 | policy = np.zeros((self.max_cars, self.max_cars)) 160 | print('Improving Policy...') 161 | for a in tqdm(range(policy.shape[0])): 162 | for b in range(policy.shape[1]): 163 | best_action = [None, -np.inf] 164 | for action in np.arange(-5, 6): 165 | if a - action < 0 or b + action < 0: 166 | # This action is not allowed if it makes one dealership have less than 0 cars 167 | continue 168 | next_state_gain_expectation = self.expected_return((a, b), action, value, gamma) 169 | if next_state_gain_expectation > best_action[1]: 170 | best_action[0] = action 171 | best_action[1] = next_state_gain_expectation 172 | policy[a, b] = best_action[0] 173 | return policy.astype(int) 174 | 175 | def run_policy_improvement(self, gamma=0.9, convergence=5.0): 176 | initial_policy = np.zeros((self.max_cars, self.max_cars), dtype=int) 177 | policies = [initial_policy] 178 | value = None 179 | while len(policies) < 2 or not np.array_equal(policies[-1], policies[-2]): 180 | value = self.evaluate_policy(policies[-1], gamma, convergence) 181 | greedy = self.get_greedy_policy(value) 182 | policies.append(greedy) 183 | return policies, value 184 | 185 | def plot_results(self, policies, value_function): 186 | self.plot_value_function(value_function, figure=1) 187 | self.plot_policies(policies, starting_fig=2) 188 | plt.show() 189 | 190 | def plot_value_function(self, value_function, figure=1): 191 | fig = plt.figure(figure) 192 | ax = fig.add_subplot(111, projection='3d') 193 | x = np.arange(0, self.max_cars) 194 | y = np.arange(0, self.max_cars) 195 | X, Y = np.meshgrid(x, y) 196 | ax.plot_wireframe(X, Y, value_function) 197 | fig.suptitle('Optimal Value Function') 198 | plt.xlabel('# of Cars at Dealership B') 199 | plt.ylabel('# of Cars at Dealership A') 200 | 201 | def plot_policies(self, policies, starting_fig=1): 202 | figure = starting_fig 203 | for i in range(len(policies)): 
204 | fig = plt.figure(figure) 205 | figure += 1 206 | policy = policies[i] 207 | plt.imshow(policy, cmap='jet') 208 | plt.ylabel('# of Cars at Dealership A') 209 | plt.xlabel('# of Cars at Dealership B') 210 | plt.xticks(np.arange(0, policy.shape[0], 1)) 211 | plt.yticks(np.arange(0, policy.shape[1], 1)) 212 | plt.gca().invert_yaxis() 213 | if i == (len(policies) - 1): 214 | fig.suptitle('Optimal Policy') 215 | else: 216 | fig.suptitle(f'Policy {i}') 217 | 218 | # Annotate states 219 | for i in range(policy.shape[0]): 220 | for j in range(policy.shape[1]): 221 | plt.text(j, i, '%d' % policy[i,j], horizontalalignment='center', verticalalignment='center') 222 | 223 | plt.colorbar() 224 | 225 | 226 | class JacksCarRentalWithHelp(JacksCarRental): 227 | 228 | SECOND_PARKING_LOT_COST = 4 229 | 230 | def get_action_cost(self, current, action): 231 | if action > 0: 232 | moving_cost = self.MOVING_CAR_COST * (action - 1) 233 | else: 234 | moving_cost = self.MOVING_CAR_COST * abs(action) 235 | 236 | overnight_cars_a = current[0] - action 237 | overnight_cars_b = current[1] + action 238 | 239 | parking_cost = 0 240 | if overnight_cars_a > 10: 241 | parking_cost += self.SECOND_PARKING_LOT_COST 242 | if overnight_cars_b > 10: 243 | parking_cost += self.SECOND_PARKING_LOT_COST 244 | 245 | return moving_cost + parking_cost 246 | -------------------------------------------------------------------------------- /dynamic_programming/exercises/car_rental_exercise.py: -------------------------------------------------------------------------------- 1 | from dynamic_programming.car_rentals import JacksCarRental 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description="Car Rental Exercise") 5 | 6 | parser.add_argument('--convergence', 7 | type=float, 8 | help='Convergence criteria for policy evaluation', 9 | default=1.0) 10 | args = parser.parse_args() 11 | 12 | jcr = JacksCarRental() 13 | policies, optimal_value = jcr.run_policy_improvement(gamma=0.9, convergence=args.convergence) 14 | jcr.plot_results(policies, optimal_value) 15 | -------------------------------------------------------------------------------- /dynamic_programming/exercises/ex_4_5.py: -------------------------------------------------------------------------------- 1 | from dynamic_programming.car_rentals import JacksCarRentalWithHelp 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description="Exercise 4.5") 5 | 6 | parser.add_argument('--convergence', 7 | type=float, 8 | help='Convergence criteria for policy evaluation', 9 | default=1.0) 10 | args = parser.parse_args() 11 | 12 | jcr = JacksCarRentalWithHelp() 13 | policies, optimal_value = jcr.run_policy_improvement(gamma=0.9, convergence=args.convergence) 14 | jcr.plot_results(policies, optimal_value) 15 | -------------------------------------------------------------------------------- /dynamic_programming/exercises/ex_4_9.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from dynamic_programming.gamblers import GamblersProblem 3 | 4 | # Win probability 0.25 5 | gambler = GamblersProblem(win_probability=0.25) 6 | value_funcs = gambler.value_iteration() 7 | policy = gambler.get_greedy_policy(value_funcs[-1]) 8 | next_figure = gambler.plot_results(value_funcs[0:5], policy) 9 | 10 | # Win probability 0.55 11 | gambler = GamblersProblem(win_probability=0.55) 12 | value_funcs = gambler.value_iteration() 13 | policy = gambler.get_greedy_policy(value_funcs[-1]) 14 | 
gambler.plot_results(value_funcs[0:5], policy, figure=next_figure) 15 | 16 | plt.show() 17 | -------------------------------------------------------------------------------- /dynamic_programming/gamblers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | 6 | class GamblersProblem(): 7 | 8 | _plot_colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w'] 9 | 10 | def __init__(self, win_probability=0.4): 11 | self._win_probability = win_probability 12 | 13 | def get_possible_next_states(self, state, action): 14 | ret = [] 15 | # Either we win 16 | ret.append(state + action) 17 | # or we lose 18 | ret.append(state - action) 19 | return list(set(ret)) 20 | 21 | def probability_next_state(self, state, action, next_state): 22 | # Special "sink" states 23 | if state == 0: 24 | if next_state == 0: 25 | return 1.0 26 | return 0.0 27 | if state == 100: 28 | if next_state == 100: 29 | return 1.0 30 | return 0.0 31 | 32 | # Loss 33 | if next_state == (state - action): 34 | return 1 - self._win_probability 35 | # Win 36 | elif next_state == (state + action): 37 | return self._win_probability 38 | else: 39 | # Should never actually make it here 40 | return 0.0 41 | 42 | def reward(self, state, action, next_state): 43 | if next_state == 100: 44 | return 1.0 45 | else: 46 | return 0.0 47 | 48 | def value_iteration(self, convergence=0.0001): 49 | diff = np.inf 50 | value = np.zeros(101) 51 | temp = np.copy(value) 52 | ret = [] 53 | while diff > convergence: 54 | for state in range(1, value.shape[0] - 1): 55 | action_space = np.arange(0, min(state, 100 - state) + 1) 56 | best_value = None 57 | for action in action_space: 58 | possible_next_states = self.get_possible_next_states(state, action) 59 | gain = 0.0 60 | for next_state in possible_next_states: 61 | gain += self.probability_next_state(state, action, next_state) * ( 62 | self.reward(state, action, next_state) + temp[next_state] 63 | ) 64 | if best_value is None or gain > best_value: 65 | best_value = gain 66 | value[state] = best_value 67 | diff = np.max(np.fabs(np.subtract(temp, value))) 68 | temp = np.copy(value) 69 | ret.append(temp) 70 | return ret 71 | 72 | def get_greedy_policy(self, value): 73 | policy = np.zeros(101) 74 | for state in np.arange(1, 100): 75 | action_space = np.arange(0, min(state, 100 - state) + 1) 76 | best_action = [None, -np.inf] 77 | for action in action_space: 78 | possible_next_states = self.get_possible_next_states(state, action) 79 | gain = 0.0 80 | for next_state in possible_next_states: 81 | gain += self.probability_next_state(state, action, next_state) * ( 82 | self.reward(state, action, next_state) + value[next_state] 83 | ) 84 | if best_action[0] is None: 85 | best_action[0] = action 86 | best_action[1] = gain 87 | elif math.isclose(gain, best_action[1]): 88 | # Tie breaking strategy 89 | # Choose more conservative action 90 | if action < best_action[0]: 91 | best_action[0] = action 92 | elif gain > best_action[1]: 93 | best_action[0] = action 94 | best_action[1] = gain 95 | policy[state] = best_action[0] 96 | return policy 97 | 98 | def plot_value_functions(self, value_functions): 99 | for i in range(len(value_functions)): 100 | plt.plot(value_functions[i][0:-1], self._plot_colors[i%len(self._plot_colors)], label=f'Value Function {i}') 101 | plt.title(f"Gambler's Problem Value Iteration (Win Probability = {self._win_probability})") 102 | plt.xlabel('Capital') 103 | plt.ylabel('Value') 104 | 
plt.legend(loc=4) 105 | 106 | def plot_policy(self, policy): 107 | plt.plot(np.arange(0, 101), policy) 108 | plt.title(f'Optimal Policy for Gambler (Win Probability = {self._win_probability})') 109 | plt.xlabel('Capital') 110 | plt.ylabel('Stake') 111 | 112 | def plot_results(self, value_functions, policy, figure=1): 113 | plt.figure(figure) 114 | self.plot_value_functions(value_functions) 115 | plt.figure(figure + 1) 116 | self.plot_policy(policy) 117 | return figure + 2 118 | 119 | if __name__ == '__main__': 120 | gmb = GamblersProblem() 121 | values = gmb.value_iteration(convergence=0.001) 122 | policy = gmb.get_greedy_policy(values[-1]) 123 | gmb.plot_value_functions(values) 124 | gmb.plot_policy(policy) 125 | -------------------------------------------------------------------------------- /dynamic_programming/results/e45_optimal_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_optimal_policy.png -------------------------------------------------------------------------------- /dynamic_programming/results/e45_optimal_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_optimal_value.png -------------------------------------------------------------------------------- /dynamic_programming/results/e45_policy_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_policy_0.png -------------------------------------------------------------------------------- /dynamic_programming/results/e45_policy_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_policy_1.png -------------------------------------------------------------------------------- /dynamic_programming/results/e45_policy_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/e45_policy_2.png -------------------------------------------------------------------------------- /dynamic_programming/results/gambler_optimal_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/gambler_optimal_policy.png -------------------------------------------------------------------------------- /dynamic_programming/results/gamblers_value_iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/gamblers_value_iteration.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_optimal_policy.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_optimal_policy.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_optimal_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_optimal_value.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_policy_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_0.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_policy_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_1.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_policy_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_2.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_policy_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_3.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_policy_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_4.png -------------------------------------------------------------------------------- /dynamic_programming/results/jack_policy_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/jack_policy_5.png -------------------------------------------------------------------------------- /dynamic_programming/results/policy_4_9_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/policy_4_9_a.png -------------------------------------------------------------------------------- /dynamic_programming/results/policy_4_9_b.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/policy_4_9_b.png -------------------------------------------------------------------------------- /dynamic_programming/results/value_4_9_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/value_4_9_a.png -------------------------------------------------------------------------------- /dynamic_programming/results/value_4_9_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/dynamic_programming/results/value_4_9_b.png -------------------------------------------------------------------------------- /environments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/__init__.py -------------------------------------------------------------------------------- /environments/blackjack/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/blackjack/__init__.py -------------------------------------------------------------------------------- /environments/blackjack/blackjack.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from mpl_toolkits.mplot3d import Axes3D 5 | 6 | 7 | class BlackjackPlotter: 8 | 9 | @staticmethod 10 | def plot_value_functions(value): 11 | reshaped_value = np.reshape(value, BlackjackStates.state_space_shape()) 12 | BlackjackPlotter.plot_value_function( 13 | reshaped_value[:, :, 0], 14 | title='Value Function (Usable ace)', 15 | figure=1 16 | ) 17 | BlackjackPlotter.plot_value_function( 18 | reshaped_value[:, :, 1], 19 | title='Value Function (No usable ace)', 20 | figure=2) 21 | plt.show() 22 | 23 | @staticmethod 24 | def plot_value_function(value_function, title='Value Function', figure=1): 25 | fig = plt.figure(figure) 26 | ax = fig.add_subplot(111, projection='3d') 27 | x = np.arange(12, 22) 28 | y = np.arange(1, 11) 29 | X, Y = np.meshgrid(x, y) 30 | ax.plot_wireframe(X, Y, value_function) 31 | fig.suptitle(title) 32 | plt.xlabel('Player sum') 33 | plt.ylabel('Dealer showing') 34 | 35 | @staticmethod 36 | def plot_policies(policies): 37 | reshaped_policy = policies.reshape(BlackjackStates.state_space_shape()) 38 | ace_policy = reshaped_policy[:, :, 0] 39 | BlackjackPlotter.plot_policy(ace_policy, title='Ace policy', figure=1) 40 | no_ace_policy = reshaped_policy[:, :, 1] 41 | BlackjackPlotter.plot_policy(no_ace_policy, title='No ace policy', figure=2) 42 | plt.show() 43 | 44 | @staticmethod 45 | def plot_policy(policy, title='Blackjack Policy', figure=1): 46 | policy = np.transpose(policy) 47 | fig = plt.figure(figure) 48 | ax = fig.subplots() 49 | fig.suptitle(title) 50 | plt.imshow(policy, cmap='jet') 51 | plt.gca().invert_yaxis() 52 | 53 | plt.xlabel('Dealer showing') 54 | plt.xticks(np.arange(0, 
len(BlackjackStates.DEALER_CARDS), 1)) 55 | ax.set_xticklabels(BlackjackStates.DEALER_CARDS) 56 | 57 | plt.ylabel('Agent sum') 58 | plt.yticks(np.arange(0, len(BlackjackStates.AGENT_SUMS), 1)) 59 | ax.set_yticklabels(BlackjackStates.AGENT_SUMS) 60 | 61 | for i in range(policy.shape[0]): 62 | for j in range(policy.shape[1]): 63 | if policy[i, j] == Blackjack.HIT_ACTION: 64 | label = 'HIT' 65 | else: 66 | label = 'STAY' 67 | plt.text(j, i, f'{label}', horizontalalignment='center', verticalalignment='center') 68 | 69 | 70 | class BlackjackStates: 71 | 72 | DEALER_CARDS = ['A', 2, 3, 4, 5, 6, 7, 8, 9, 10] 73 | AGENT_SUMS = [12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 74 | USABLE_ACE = [True, False] 75 | STATES = [] 76 | for dealer_card in DEALER_CARDS: 77 | for agent_sum in AGENT_SUMS: 78 | for _usable_ace in USABLE_ACE: 79 | STATES.append((dealer_card, agent_sum, _usable_ace)) 80 | 81 | @staticmethod 82 | def state_space_shape(): 83 | return (len(BlackjackStates.DEALER_CARDS), 84 | len(BlackjackStates.AGENT_SUMS), 85 | len(BlackjackStates.USABLE_ACE)) 86 | 87 | @staticmethod 88 | def num_states(): 89 | return (len(BlackjackStates.DEALER_CARDS) * 90 | len(BlackjackStates.AGENT_SUMS) * 91 | len(BlackjackStates.USABLE_ACE)) 92 | 93 | @staticmethod 94 | def id_to_state(id): 95 | return BlackjackStates.STATES[id] 96 | 97 | @staticmethod 98 | def state_to_id(state): 99 | dealer_card_index = BlackjackStates.DEALER_CARDS.index(state[0]) 100 | agent_sum_index = BlackjackStates.AGENT_SUMS.index(state[1]) 101 | usable_ace_index = BlackjackStates.USABLE_ACE.index(state[2]) 102 | return ( 103 | dealer_card_index * len(BlackjackStates.AGENT_SUMS) * len(BlackjackStates.USABLE_ACE) + 104 | agent_sum_index * len(BlackjackStates.USABLE_ACE) + 105 | usable_ace_index 106 | ) 107 | 108 | @staticmethod 109 | def print_state(state): 110 | if type(state) is int: 111 | state = BlackjackStates.id_to_state(state) 112 | dealer_card = state[0] 113 | agent_sum = state[1] 114 | usable_ace = state[2] 115 | print(f'Dealer: {dealer_card}, Agent sum: {agent_sum}, Ace: {usable_ace}') 116 | 117 | 118 | class Blackjack: 119 | 120 | GAME_OVER_STATE = -1 121 | HIT_ACTION = 0 122 | STAY_ACTION = 1 123 | HIT_CARDS = ['A', 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] 124 | 125 | def __init__(self, verbose=True): 126 | self._states = [] 127 | self._verbose = verbose 128 | 129 | def _blackjack_sum(self, hand): 130 | """ 131 | Sums a list of cards with blackjack rules. 132 | In other words, if a hand contains an ace, it counts it as 133 | a 1 or 11 depending on what is appropriate. 134 | 135 | If a hand has more than 1 ace, at most 1 can count as 11. 136 | """ 137 | running_total = 0 138 | num_aces = 0 139 | for card in hand: 140 | if card == 'A': 141 | num_aces += 1 142 | else: 143 | running_total += card 144 | 145 | # Count all aces as 1s by default 146 | running_total += num_aces 147 | 148 | if num_aces > 0 and running_total + 10 <= 21: 149 | # Count 1 ace as 11 150 | running_total += 10 151 | 152 | return running_total 153 | 154 | def _draw_card(self): 155 | return self.HIT_CARDS[randint(0, len(self.HIT_CARDS) - 1)] 156 | 157 | def _player_draw_card(self): 158 | """ 159 | Returns a card value in the range [1, 10] because a player can't draw 160 | another usable ace. 
161 | """ 162 | card = self._draw_card() 163 | if card == 'A': 164 | return 1 165 | else: 166 | return card 167 | 168 | def debug_print(self, message): 169 | if self._verbose: 170 | print(message) 171 | 172 | def num_states(self): 173 | return BlackjackStates.num_states() 174 | 175 | def num_actions(self): 176 | return 2 177 | 178 | def get_starting_state(self): 179 | return self.get_random_state() 180 | 181 | def get_random_state(self): 182 | return randint(0, self.num_states() - 1) 183 | 184 | def perform_action(self, state_id, action): 185 | state = BlackjackStates.id_to_state(state_id) 186 | dealer_card = state[0] 187 | player_sum = state[1] 188 | usable_ace = state[2] 189 | if action == self.HIT_ACTION: 190 | self.debug_print(f'You hit!') 191 | card = self._player_draw_card() 192 | self.debug_print(f'You drew {card}') 193 | player_sum += card 194 | if player_sum > 21: 195 | if usable_ace: 196 | # Ace becomes 1 197 | player_sum -= 10 198 | next_state = (dealer_card, player_sum, False) 199 | return (0, BlackjackStates.state_to_id(next_state), False) 200 | else: 201 | # Lose 202 | self.debug_print(f'You busted with {player_sum}.') 203 | return (-1, self.GAME_OVER_STATE, True) 204 | else: 205 | # Still <= 21 206 | next_state = (dealer_card, player_sum, usable_ace) 207 | return (0, BlackjackStates.state_to_id(next_state), False) 208 | elif action == self.STAY_ACTION: 209 | self.debug_print(f'You stayed!') 210 | # Dealer's turn 211 | dealer_cards = [dealer_card] 212 | dealer_sum = self._blackjack_sum(dealer_cards) 213 | 214 | blackjack = False 215 | if player_sum == 21 and usable_ace: 216 | self.debug_print(f'You have a blackjack!') 217 | blackjack = True 218 | 219 | # Dealer must hit until he has over 17 220 | while dealer_sum < 17: 221 | card = self._draw_card() 222 | self.debug_print(f'Dealer had {dealer_sum}, and drew {card}') 223 | dealer_cards.append(card) 224 | dealer_sum = self._blackjack_sum(dealer_cards) 225 | if dealer_sum != 21 and blackjack: 226 | # If dealer doesn't have 21 after first draw, 227 | # player immediately wins. 228 | self.debug_print(f'You win!') 229 | return (1, self.GAME_OVER_STATE, True) 230 | 231 | if dealer_sum > 21: 232 | # Dealer busted 233 | self.debug_print(f'Dealer busted.') 234 | return (1, self.GAME_OVER_STATE, True) 235 | else: 236 | if dealer_sum > player_sum: 237 | # Lose 238 | self.debug_print(f'Dealer won with {dealer_sum}.') 239 | return (-1, self.GAME_OVER_STATE, True) 240 | elif dealer_sum == player_sum: 241 | self.debug_print(f'Draw. Dealer and player both have {player_sum}.') 242 | return (0, self.GAME_OVER_STATE, True) 243 | else: 244 | # Win 245 | self.debug_print(f'You won! Dealer: {dealer_sum}. 
You: {player_sum}.') 246 | return (1, self.GAME_OVER_STATE, True) 247 | else: 248 | raise ValueError('This is not a valid action.') 249 | 250 | def is_terminal(self, state): 251 | return state == self.GAME_OVER_STATE 252 | -------------------------------------------------------------------------------- /environments/blackjack/blackjack_policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from environments.blackjack.blackjack import Blackjack, BlackjackStates 4 | 5 | 6 | class BlackjackPolicy: 7 | 8 | def _get_action_by_state(self, state): 9 | raise NotImplementedError('This must be implemented.') 10 | 11 | def get_action(self, state_id): 12 | blackjack_state = BlackjackStates.id_to_state(state_id) 13 | return self._get_action_by_state(blackjack_state) 14 | 15 | @staticmethod 16 | def generate_policy(stay_on=[]): 17 | policy = np.zeros(BlackjackStates.num_states()) 18 | for state_id in range(policy.shape[0]): 19 | state = BlackjackStates.id_to_state(state_id) 20 | dealer_card = state[0] 21 | agent_sum = state[1] 22 | ace = state[2] 23 | if agent_sum in stay_on: 24 | policy[state_id] = Blackjack.STAY_ACTION 25 | else: 26 | policy[state_id] = Blackjack.HIT_ACTION 27 | return policy 28 | -------------------------------------------------------------------------------- /environments/blackjack/interactive_blackjack.py: -------------------------------------------------------------------------------- 1 | from environments.blackjack.blackjack import * 2 | 3 | blackjack = Blackjack() 4 | state = blackjack.get_random_state() 5 | 6 | while not blackjack.is_terminal(state): 7 | (dealer_card, player_sum, usable_ace) = BlackjackStates.id_to_state(state) 8 | 9 | if usable_ace: 10 | ace_string = 'with ace' 11 | else: 12 | ace_string = 'no ace' 13 | print(f'--- Dealer showing: {dealer_card} --- You: {player_sum} ({ace_string}) ---') 14 | 15 | action = None 16 | while action is None: 17 | action = input('Hit (0) or stay (1)?: ') 18 | if action in ['0', '1']: 19 | action = int(action) 20 | else: 21 | action = None 22 | print('Invalid action') 23 | 24 | print() 25 | reward, state, done = blackjack.perform_action(state, action)  # perform_action returns (reward, next_state, done) 26 | -------------------------------------------------------------------------------- /environments/racing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/__init__.py -------------------------------------------------------------------------------- /environments/racing/interactive_racetrack.py: -------------------------------------------------------------------------------- 1 | from environments.racing.racing import RaceTrackGame 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description='Interactive Race Track Game') 5 | 6 | parser.add_argument('racetrack', 7 | type=str, 8 | help='Path to racetrack csv file') 9 | 10 | args = parser.parse_args() 11 | 12 | RaceTrackGame.run(args.racetrack) 13 | -------------------------------------------------------------------------------- /environments/racing/racetracks/racetrack_a.csv: -------------------------------------------------------------------------------- 1 | 0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,2 2 | 0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2 3 | 0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2 4 | 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2 5 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2 6 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2 7 |
1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0 8 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 9 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 10 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 11 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 12 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 13 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 14 | 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 15 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 16 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 17 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 18 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 19 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 20 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 21 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 22 | 0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0 23 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 24 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 25 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 26 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 27 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 28 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 29 | 0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0 30 | 0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0 31 | 0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0 32 | 0,0,0,3,3,3,3,3,3,0,0,0,0,0,0,0 33 | -------------------------------------------------------------------------------- /environments/racing/racing.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import random 4 | import sys 5 | import time 6 | 7 | import numpy as np 8 | import pygame 9 | 10 | 11 | class RacerBot: 12 | 13 | def __init__(self, policy): 14 | self.policy = policy 15 | 16 | def get_action(self, state_id): 17 | choices = np.arange(0, self.policy.shape[1]) 18 | probabilities = self.policy[state_id] 19 | probabilities /= probabilities.sum() 20 | return np.random.choice(choices, p=probabilities) 21 | 22 | 23 | class RaceTrack: 24 | 25 | OOB = 0 26 | TRACK = 1 27 | FINISH = 2 28 | START = 3 29 | CAR = 4 30 | 31 | MAX_SPEED = 5 32 | 33 | def __init__(self, csv_path): 34 | self.track = [] 35 | self.start_locations = [] 36 | self.finish_locations = [] 37 | with open(csv_path, 'r') as csvfile: 38 | track_layout = csv.reader(csvfile, delimiter=',') 39 | row_num = 0 40 | for row in track_layout: 41 | new_row = [] 42 | col_num = 0 43 | for cell in row: 44 | new_cell = int(cell) 45 | if new_cell == RaceTrack.START: 46 | self.start_locations.append([col_num, row_num]) 47 | if new_cell == RaceTrack.FINISH: 48 | self.finish_locations.append([col_num, row_num]) 49 | new_row.append(new_cell) 50 | col_num += 1 51 | self.track.append(new_row) 52 | row_num += 1 53 | 54 | self.states = [] 55 | for col in range(len(self.track[0])): 56 | for row in range(len(self.track)): 57 | for horizontal_speed in np.arange(0, self.MAX_SPEED): 58 | for vertical_speed in np.arange(0, self.MAX_SPEED): 59 | self.states.append((col, row, horizontal_speed, vertical_speed)) 60 | 61 | self.actions = [] 62 | for horizontal_accel in np.arange(-1, 2): 63 | for vertical_accel in np.arange(-1, 2): 64 | self.actions.append((horizontal_accel, vertical_accel)) 65 | 66 | def num_states(self): 67 | return len(self.states) 68 | 69 | def num_actions(self): 70 | return len(self.actions) 71 | 72 | def action_to_id(self, action): 73 | return ( 74 | (action[0] + 1) * 3 + 75 | (action[1] + 1) 76 | ) 77 | 78 | def id_to_action(self, id): 79 | return self.actions[id] 80 | 81 | def state_to_id(self, state): 82 | col = state[0] 83 | row = state[1] 84 | horizontal_speed = state[2] 85 | vertical_speed = state[3] 86 | return ( 87 | col * len(self.track) * self.MAX_SPEED * self.MAX_SPEED + 88 | row * self.MAX_SPEED * self.MAX_SPEED + 89 | horizontal_speed * self.MAX_SPEED + 90 | vertical_speed 91 | ) 92 | 93 | def id_to_state(self, id): 94 | return self.states[id] 95 | 96 | 
def perform_action(self, state_id, action_id): 97 | """ 98 | Returns reward, next state, and if we finished. 99 | """ 100 | state = self.id_to_state(state_id) 101 | current_location = [state[0], state[1]] 102 | current_speed = [state[2], state[3]] 103 | action = self.id_to_action(action_id) 104 | 105 | current_speed[0] = max(min(current_speed[0] + action[0], self.MAX_SPEED - 1), 0) 106 | current_speed[1] = max(min(current_speed[1] + action[1], self.MAX_SPEED - 1), 0) 107 | if current_speed[0] == 0 and current_speed[1] == 0: 108 | current_speed[1] = 1 109 | if self.crosses_finish_line(current_location, current_speed): 110 | next_state = self.starting_line_state() 111 | return (0, self.state_to_id(next_state), True) 112 | else: 113 | next_location = self.get_next_location(current_location, current_speed) 114 | if self.out_of_bounds(next_location): 115 | next_state = self.starting_line_state() 116 | return (-5, self.state_to_id(next_state), False) 117 | next_state = (next_location[0], next_location[1], current_speed[0], current_speed[1]) 118 | return (-1, self.state_to_id(next_state), False) 119 | 120 | def crosses_finish_line(self, position, speed): 121 | horizontal = speed[0] 122 | vertical = speed[1] 123 | intermediate_location = [0, 0] 124 | intermediate_location[0] = position[0] 125 | intermediate_location[1] = position[1] 126 | while (horizontal + vertical > 0): 127 | if horizontal >= vertical: 128 | intermediate_location[0] += 1 129 | horizontal -= 1 130 | else: 131 | intermediate_location[1] -=1 132 | vertical -= 1 133 | for finish_location in self.finish_locations: 134 | if intermediate_location[0] == finish_location[0] and intermediate_location[1] == finish_location[1]: 135 | return True 136 | return False 137 | 138 | def out_of_bounds(self, location): 139 | return (location[0] < 0 or location[0] >= self.dimensions[0] or 140 | location[1] < 0 or location[1] >= self.dimensions[1] or 141 | self.track[location[1]][location[0]] == self.OOB) 142 | 143 | def get_next_location(self, location, speed): 144 | next_loc = [location[0] + speed[0], location[1] - speed[1]] 145 | return next_loc 146 | 147 | def get_starting_state(self): 148 | return self.state_to_id(self.starting_line_state()) 149 | 150 | def starting_line_state(self): 151 | random_start = self.start_locations[random.randint(0, len(self.start_locations) - 1)] 152 | ret = (random_start[0], random_start[1], 0, 0) 153 | return ret 154 | 155 | @property 156 | def dimensions(self): 157 | return (len(self.track[0]), len(self.track)) 158 | 159 | 160 | class RaceTrackGame: 161 | 162 | CAPTION = 'Racing Game' 163 | SCREEN_SIZE = (500, 800) 164 | 165 | OOB_COLOR = (240, 252, 22) 166 | TRACK_COLOR = (147, 150, 155) 167 | FINISH_COLOR = (1, 75, 234) 168 | START_COLOR = (2, 234, 72) 169 | CAR_COLOR = (0, 0, 0) 170 | BACKGROUND_COLOR = (0, 50, 50) 171 | CELL_BORDER = 2 172 | SPEED_RIGHT_MARGIN = SCREEN_SIZE[0]/2 173 | 174 | FONT_SIZE = 25 175 | FONT_HEIGHT = 30 176 | FONT_COLOR = (255, 255, 255) 177 | 178 | TOP_BOTTOM_MARGIN = 10 179 | TRACK_LOCATION = (20, 40) 180 | LEFT_RIGHT_MARGIN = 10 181 | 182 | TRACK_SIZE = (SCREEN_SIZE[0] - 2 * LEFT_RIGHT_MARGIN, SCREEN_SIZE[1] - FONT_HEIGHT - 2 * TOP_BOTTOM_MARGIN) 183 | 184 | def __init__(self, racetrack_csv): 185 | self.screen = pygame.display.get_surface() 186 | self.screen_rect = self.screen.get_rect() 187 | self.done = False 188 | self.keys = pygame.key.get_pressed() 189 | self.racetrack = RaceTrack(racetrack_csv) 190 | self.current_action = [0, 0] 191 | self.font = 
pygame.font.SysFont(pygame.font.get_default_font(), self.FONT_SIZE) 192 | 193 | self.cell_size = self.get_cell_size() 194 | self.track_top_left = self.get_track_drawing_info() 195 | 196 | self.current_state = self.racetrack.starting_line_state() 197 | 198 | self.current_score = 0 199 | 200 | def get_cell_size(self): 201 | track_dimensions = self.racetrack.dimensions 202 | return (int(self.TRACK_SIZE[0] / track_dimensions[0]), int(self.TRACK_SIZE[1] / track_dimensions[1])) 203 | 204 | def get_track_drawing_info(self): 205 | track_dimensions = self.racetrack.dimensions 206 | 207 | # Correct for rounding 208 | actual_track_size = (self.cell_size[0] * track_dimensions[0], self.cell_size[1] * track_dimensions[1]) 209 | margins = (self.TRACK_SIZE[0] - actual_track_size[0], self.TRACK_SIZE[1] - actual_track_size[1]) 210 | 211 | track_top_left = (self.LEFT_RIGHT_MARGIN + margins[0] / 2, self.FONT_HEIGHT + self.TOP_BOTTOM_MARGIN + margins[1] / 2) 212 | 213 | return track_top_left 214 | 215 | def update_current_action(self): 216 | # Forward 217 | if self.keys[pygame.K_i]: 218 | self.current_action[1] = min(self.current_action[1] + 1, 1) 219 | # Back 220 | if self.keys[pygame.K_k]: 221 | self.current_action[1] = max(self.current_action[1] - 1, -1) 222 | # Left 223 | if self.keys[pygame.K_j]: 224 | self.current_action[0] = max(self.current_action[0] - 1, -1) 225 | # Right 226 | if self.keys[pygame.K_l]: 227 | self.current_action[0] = min(self.current_action[0] + 1, 1) 228 | 229 | def draw(self, state, action): 230 | self.screen.fill(RaceTrackGame.BACKGROUND_COLOR) 231 | self.render_current_action(action) 232 | self.render_game_state(state) 233 | 234 | def render_game_state(self, state): 235 | self.render_track() 236 | self.render_current_speed((state[2], state[3])) 237 | self.render_car((state[0], state[1])) 238 | 239 | def render_current_action(self, action): 240 | current_action_string = f'[H: {action[0]}, V: {action[1]}]' 241 | text_surface = self.font.render(current_action_string, True, self.FONT_COLOR) 242 | self.screen.blit(text_surface, (10, 10)) 243 | 244 | def render_current_speed(self, speed): 245 | current_speed_string = f'Current speed: H: {speed[0]}, V: {speed[1]}' 246 | text_surface = self.font.render(current_speed_string, True, self.FONT_COLOR) 247 | self.screen.blit(text_surface, (self.SCREEN_SIZE[0] - self.SPEED_RIGHT_MARGIN, 10)) 248 | 249 | def render_track(self): 250 | for row in range(len(self.racetrack.track)): 251 | for col in range(len(self.racetrack.track[row])): 252 | cell = self.racetrack.track[row][col] 253 | self.draw_cell(cell, col, row) 254 | 255 | def render_car(self, location): 256 | self.draw_cell(RaceTrack.CAR, location[0], location[1]) 257 | 258 | def get_track_pixel_pos(self, col, row): 259 | return (self.track_top_left[0] + col*self.cell_size[0], self.track_top_left[1] + row*self.cell_size[1]) 260 | 261 | def draw_cell(self, cell, col, row): 262 | if cell == RaceTrack.OOB: 263 | color = RaceTrackGame.OOB_COLOR 264 | elif cell == RaceTrack.FINISH: 265 | color = RaceTrackGame.FINISH_COLOR 266 | elif cell == RaceTrack.TRACK: 267 | color = RaceTrackGame.TRACK_COLOR 268 | elif cell == RaceTrack.START: 269 | color = RaceTrackGame.START_COLOR 270 | elif cell == RaceTrack.CAR: 271 | color = RaceTrackGame.CAR_COLOR 272 | else: 273 | raise ValueError('Unknown cell type') 274 | 275 | draw_position = self.get_track_pixel_pos(col, row) 276 | 277 | pygame.draw.rect(self.screen, color, (draw_position[0], draw_position[1], self.cell_size[0] - self.CELL_BORDER, self.cell_size[1] - 
self.CELL_BORDER)) 278 | 279 | def event_loop(self): 280 | for event in pygame.event.get(): 281 | self.keys = pygame.key.get_pressed() 282 | if event.type == pygame.QUIT or self.keys[pygame.K_ESCAPE]: 283 | self.done = True 284 | self.update_current_action() 285 | if self.keys[pygame.K_RETURN]: 286 | a = self.racetrack.action_to_id(self.current_action) 287 | s = self.racetrack.state_to_id(self.current_state) 288 | (r, s, finished) = self.racetrack.perform_action(s, a) 289 | self.current_score += r 290 | self.current_state = self.racetrack.id_to_state(s) 291 | if finished: 292 | print('Finished!!') 293 | print(f'You scored: {self.current_score}') 294 | self.current_score = 0 295 | 296 | def bot_loop(self, bot, episodes, timestep): 297 | for episode in range(episodes): 298 | state = self.racetrack.starting_line_state() 299 | s = self.racetrack.state_to_id(state) 300 | done = False 301 | steps = 0 302 | while not done: 303 | steps += 1 304 | a = bot.get_action(s) 305 | self.draw(self.racetrack.id_to_state(s), self.racetrack.id_to_action(a)) 306 | pygame.display.flip() 307 | (r, s, done) = self.racetrack.perform_action(s, a) 308 | time.sleep(timestep) 309 | print(f'Finished in {steps} steps!') 310 | 311 | 312 | def main_loop(self): 313 | while not self.done: 314 | self.event_loop() 315 | self.draw(self.current_state, self.current_action) 316 | pygame.display.flip() 317 | 318 | @staticmethod 319 | def init(): 320 | os.environ['SDL_VIDEO_CENTERED'] = '1' 321 | pygame.init() 322 | pygame.display.set_caption(RaceTrackGame.CAPTION) 323 | pygame.display.set_mode(RaceTrackGame.SCREEN_SIZE) 324 | 325 | @staticmethod 326 | def quit(): 327 | pygame.quit() 328 | sys.exit() 329 | 330 | @staticmethod 331 | def bot_run(racetrack_file, policy_file, episodes=10, timestep=1): 332 | RaceTrackGame.init() 333 | policy = np.load(policy_file) 334 | bot = RacerBot(policy) 335 | game = RaceTrackGame(racetrack_file) 336 | game.bot_loop(bot, episodes, timestep) 337 | RaceTrackGame.quit() 338 | 339 | @staticmethod 340 | def run(racetrack_file): 341 | RaceTrackGame.init() 342 | game = RaceTrackGame(racetrack_file) 343 | game.main_loop() 344 | RaceTrackGame.quit() -------------------------------------------------------------------------------- /environments/racing/run_trained_racetrack_bot.py: -------------------------------------------------------------------------------- 1 | from environments.racing.racing import RaceTrackGame 2 | import argparse 3 | 4 | 5 | parser = argparse.ArgumentParser(description='Plays the racetrack game with the specified policy.') 6 | 7 | parser.add_argument('racetrack', 8 | type=str, 9 | help='Path to racetrack csv file') 10 | parser.add_argument('policy', 11 | type=str, 12 | help='Path to serialized policy file') 13 | parser.add_argument('--timestep', 14 | type=float, 15 | help='Length of timesteps (s)', 16 | default=0.1) 17 | parser.add_argument('--episodes', 18 | type=int, 19 | help='Number of episodes to train over', 20 | default=10) 21 | parser.add_argument('--verbose', 22 | type=bool, 23 | help='Print (a lot of) log messages', 24 | default=False) 25 | args = parser.parse_args() 26 | 27 | RaceTrackGame.bot_run(args.racetrack, args.policy, episodes=args.episodes, timestep=args.timestep) 28 | -------------------------------------------------------------------------------- /environments/racing/trained_policies/mc_learning.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/trained_policies/mc_learning.npy -------------------------------------------------------------------------------- /environments/racing/trained_policies/q_learning.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/trained_policies/q_learning.npy -------------------------------------------------------------------------------- /environments/racing/trained_policies/random.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/trained_policies/random.npy -------------------------------------------------------------------------------- /environments/racing/trained_policies/sarsa.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/environments/racing/trained_policies/sarsa.npy -------------------------------------------------------------------------------- /lib/policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def sample_action(policy, state): 5 | """ 6 | Samples a policy for an action given the current state. 7 | """ 8 | choices = np.arange(0, policy.shape[1]) 9 | probabilities = policy[state] 10 | 11 | return np.random.choice(choices, p=probabilities) 12 | 13 | 14 | def get_epsilon_greedy_policy(Q, epsilon): 15 | num_actions = Q.shape[1] 16 | policy = (epsilon/num_actions) * np.ones(Q.shape) 17 | 18 | greedy_action_indices = np.argmax(Q, axis=1) 19 | policy[np.arange(0, Q.shape[0]), greedy_action_indices] += (1 - epsilon) 20 | 21 | return policy 22 | 23 | 24 | def get_greedy_policy(Q): 25 | return np.argmax(Q, axis=1) -------------------------------------------------------------------------------- /monte_carlo/README.md: -------------------------------------------------------------------------------- 1 | # Monte Carlo Methods 2 | 3 | In this chapter, we learn about Monte Carlo methods for learning the optimal behavior policy for 4 | finite MDPs. This is just like what we did in the last chapter except here, we do not assume any 5 | knowledge about the inner workings, or the model, of the MDP. For dynamic programming methods, we needed 6 | to know the transition probabilities for state transitions and the rewards associated with them in order to 7 | learn the optimal policy. Here, we learn the policy from experience alone. 8 | 9 | ## Blackjack: Policy Evaluation 10 | 11 | Here, we test out Monte Carlo policy evaluation on a Blackjack environment. We are evaluating the policy 12 | which stays only on 20 or 21 and hits on everything else. Below, you can see my results for running policy evaluation 13 | on this policy, which reproduces Figure 5.2 from the textbook. 
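
A minimal sketch of how this evaluation is run (it mirrors `monte_carlo/exercises/mc_blackjack.py` in this repo; the episode count is whatever you pass on the command line there):

```python
from environments.blackjack.blackjack import Blackjack, BlackjackPlotter
from environments.blackjack.blackjack_policies import BlackjackPolicy
from monte_carlo import mc

# Build the fixed policy that stays only on 20 or 21 and hits on everything else
blackjack = Blackjack(verbose=False)
policy = BlackjackPolicy.generate_policy(stay_on=[20, 21])

# First-visit Monte Carlo policy evaluation from sampled episodes alone
value = mc.fv_policy_evaluation(blackjack, policy, episodes=10000)
BlackjackPlotter.plot_value_functions(value)
```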
14 | 15 | For 10,000 episodes: 16 | 17 | ![BlackjackEvalAce10000](./results/ace_policy_evaluation_10000.png) 18 | ![BlackjackEvalNoAce10000](./results/no_ace_policy_evaluation_10000.png) 19 | 20 | For 500,000 episodes: 21 | 22 | ![BlackjackEvalAce500000](./results/ace_policy_evaluation_500000.png) 23 | ![BlackjackEvalNoAce500000](./results/no_ace_policy_evaluation_500000.png) 24 | 25 | As you can see, using more episodes gives you a better, less noisy picture of the value function. 26 | 27 | ## Blackjack: Monte Carlo Control 28 | 29 | Here, we use a Monte Carlo method to learn the optimal policy. We use the pattern of generalized policy iteration 30 | to do so. Basically, this means we use Monte Carlo simulation to evaluate an arbitrary policy, improve that policy 31 | by being greedy with respect to our evaluation, evaluate that new policy, improve that policy by being greedy with 32 | respect to that evaluation, and so on until the policy stops changing (that means we have reached the optimal policy). 33 | 34 | ![BlackjackUsableAce](./results/ace_optimal.png) 35 | 36 | ![BlackjackNonUsableAce](./results/no_ace_optimal.png) 37 | 38 | ### Exercise 5.4: Racetrack Problem 39 | 40 | For this problem, I used on policy, first visit, epsilon soft Monte Carlo control to learn a policy for how 41 | to drive a car around a racetrack environment. The exact details of this problem are given in the text. Below, 42 | you can see how an agent behaves before and after training with this control method. 43 | 44 | Before: 45 | 46 | ![BotBeforeTraining](./results/untrained_bot_racing.gif) 47 | 48 | As you can see, this bot crashes into the walls a lot and takes a long time to make it to the target (the blue line). 49 | 50 | After: 51 | 52 | ![BotAfterTraining](./results/trained_bot_racing.gif) 53 | 54 | This bot clearly has learned some things about this environment. While it is still not behaving optimally, it is 55 | performing much better than the untrained bot on this environment. 56 | 57 | #### Sources: 58 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: an Introduction. 
2nd ed., The MIT Press, 2012 -------------------------------------------------------------------------------- /monte_carlo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/__init__.py -------------------------------------------------------------------------------- /monte_carlo/exercises/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/exercises/__init__.py -------------------------------------------------------------------------------- /monte_carlo/exercises/blackjack_policy_improvement.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from environments.blackjack.blackjack import Blackjack, BlackjackStates, BlackjackPlotter 4 | from monte_carlo import mc 5 | 6 | parser = argparse.ArgumentParser(description='Blackjack Monte Carlo Policy Improvement') 7 | 8 | parser.add_argument('--iterations', 9 | type=int, 10 | help='Number of iterations to run', 11 | default=5000000) 12 | parser.add_argument('--verbose', 13 | type=bool, 14 | help='Print (a lot of) log messages', 15 | default=False) 16 | args = parser.parse_args() 17 | 18 | 19 | blackjack = Blackjack(verbose=args.verbose) 20 | optimal_policy, Q = mc.det_policy_improvement(blackjack, iterations=args.iterations) 21 | 22 | if args.verbose: 23 | for state_id in range(optimal_policy.shape[0]): 24 | print('--------------------------------') 25 | BlackjackStates.print_state(state_id) 26 | if (optimal_policy[state_id] == Blackjack.HIT_ACTION): 27 | print('HIT') 28 | else: 29 | print('STAY') 30 | 31 | BlackjackPlotter.plot_policies(optimal_policy) 32 | -------------------------------------------------------------------------------- /monte_carlo/exercises/blackjack_soft_policy_improvement.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import lib.policy 4 | from environments.blackjack.blackjack import Blackjack, BlackjackStates, BlackjackPlotter 5 | from monte_carlo import mc 6 | 7 | parser = argparse.ArgumentParser(description='Blackjack Monte Carlo Policy Soft Policy Improvement') 8 | 9 | parser.add_argument('--iterations', 10 | type=int, 11 | help='Number of iterations to run', 12 | default=1000000) 13 | parser.add_argument('--verbose', 14 | type=bool, 15 | help='Print (a lot of) log messages', 16 | default=False) 17 | args = parser.parse_args() 18 | 19 | 20 | blackjack = Blackjack(verbose=args.verbose) 21 | soft_optimal_policy, Q = mc.on_policy_fv_mc_e_soft_control( 22 | blackjack, 23 | epsilon_func=lambda ep, eps: 0.0, 24 | alpha_func=lambda n: 1/n, 25 | episodes=args.iterations, 26 | random_start=True 27 | ) 28 | 29 | optimal_policy = lib.policy.get_greedy_policy(Q) 30 | 31 | if args.verbose: 32 | for state_id in range(optimal_policy.shape[0]): 33 | print('--------------------------------') 34 | BlackjackStates.print_state(state_id) 35 | if (optimal_policy[state_id] == Blackjack.HIT_ACTION): 36 | print('HIT') 37 | else: 38 | print('STAY') 39 | 40 | BlackjackPlotter.plot_policies(optimal_policy) -------------------------------------------------------------------------------- /monte_carlo/exercises/mc_blackjack.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from environments.blackjack.blackjack import Blackjack, BlackjackPlotter 4 | from environments.blackjack.blackjack_policies import BlackjackPolicy 5 | from monte_carlo import mc 6 | 7 | parser = argparse.ArgumentParser(description='Blackjack Monte Carlo Policy Evaluation') 8 | 9 | parser.add_argument('--episodes', 10 | type=int, 11 | help='Number of episodes to train over', 12 | default=10000) 13 | parser.add_argument('--verbose', 14 | type=bool, 15 | help='Print (a lot of) log messages', 16 | default=False) 17 | args = parser.parse_args() 18 | 19 | 20 | blackjack = Blackjack(verbose=args.verbose) 21 | policy = BlackjackPolicy.generate_policy(stay_on=[20, 21]) 22 | 23 | value = mc.fv_policy_evaluation(blackjack, policy, episodes=args.episodes) 24 | BlackjackPlotter.plot_value_functions(value) 25 | -------------------------------------------------------------------------------- /monte_carlo/exercises/mc_racetrack.py: -------------------------------------------------------------------------------- 1 | from environments.racing.racing import RaceTrack 2 | from monte_carlo import mc 3 | import numpy as np 4 | import argparse 5 | 6 | 7 | parser = argparse.ArgumentParser(description='Monte Carlo Racetrack Policy Improvement') 8 | 9 | parser.add_argument('racetrack', 10 | type=str, 11 | help='Path to racetrack csv file') 12 | parser.add_argument('policy', 13 | type=str, 14 | help='Path at which to save policy file') 15 | parser.add_argument('--episodes', 16 | type=int, 17 | help='Number of episodes to train over', 18 | default=1000) 19 | parser.add_argument('--verbose', 20 | type=bool, 21 | help='Print (a lot of) log messages', 22 | default=False) 23 | args = parser.parse_args() 24 | 25 | 26 | racetrack = RaceTrack(args.racetrack) 27 | policy, Q = mc.on_policy_fv_mc_e_soft_control( 28 | racetrack, 29 | epsilon_func=lambda ep, eps: 1 - (ep/eps), 30 | alpha_func=lambda n: 0.1, 31 | episodes=args.episodes 32 | ) 33 | 34 | np.save(args.policy, policy) 35 | -------------------------------------------------------------------------------- /monte_carlo/mc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Monte Carlo methods 3 | 4 | An environment is assumed to support the following operations: 5 | environment.num_states(): Returns the number of states in the environment 6 | environment.num_actions(): Returns the number of actions in the environment 7 | environment.get_random_state(): Returns a random state 8 | environment.perform_action(a): Returns a reward and the next state (r, s') 9 | environment.is_terminal(s): Returns whether a state is terminal or not 10 | 11 | A deterministic policy is a environment.num_states x 1 array 12 | A non-deterministic policy is a environment.num_states x environment.num_actions array 13 | """ 14 | import numpy as np 15 | from tqdm import tqdm 16 | 17 | from lib.policy import sample_action, get_greedy_policy 18 | 19 | 20 | def det_policy_improvement(environment, iterations=100000): 21 | policy = np.zeros(environment.num_states(), dtype=int) 22 | Q = np.zeros((environment.num_states(), environment.num_actions())) 23 | N = np.zeros((environment.num_states(), environment.num_actions())) 24 | 25 | for i in tqdm(range(iterations)): 26 | 27 | states_seen = one_episode_state_action_values(environment, lambda s: policy[s], random_start=True) 28 | 29 | for state, actions_performed in states_seen.items(): 30 | for action, gain in 
actions_performed.items(): 31 | N[state, action] = N[state, action] + 1 32 | Q[state, action] = Q[state, action] + (1.0/(N[state, action]))*(gain - Q[state, action]) 33 | 34 | policy = get_greedy_policy(Q) 35 | 36 | return policy, Q 37 | 38 | 39 | def one_episode_state_action_values(environment, policy, random_start=True): 40 | s = environment.get_starting_state() 41 | states_seen = {} 42 | first_action = True 43 | episode_over = False 44 | steps_taken = 0 45 | while not episode_over: 46 | # If this is the first time we've seen this state 47 | if states_seen.get(s, None) is None: 48 | states_seen[s] = {} 49 | 50 | if first_action and random_start: 51 | a = np.random.randint(0, environment.num_actions()) 52 | first_action = False 53 | else: 54 | # Perform our action 55 | a = policy(s) 56 | 57 | # If this is the first time we've performed this action 58 | # in this state 59 | if states_seen[s].get(a, None) is None: 60 | states_seen[s][a] = 0 61 | 62 | (r, s_prime, episode_over) = environment.perform_action(s, a) 63 | 64 | # Update our gain counters 65 | states_seen = \ 66 | { 67 | state: {action: gain + r for action, gain in actions_performed.items()} 68 | for state, actions_performed 69 | in states_seen.items() 70 | } 71 | 72 | steps_taken += 1 73 | 74 | # Update current state 75 | s = s_prime 76 | 77 | print(f'{steps_taken}') 78 | 79 | return states_seen 80 | 81 | 82 | def on_policy_fv_mc_e_soft_control( 83 | environment, 84 | epsilon_func=lambda ep, eps: 0.1, 85 | alpha_func=lambda n: 0.1, 86 | episodes=10000, 87 | random_start=False 88 | ): 89 | # Initialize with uniform random policy 90 | 91 | policy = (1/environment.num_actions()) * np.ones((environment.num_states(), environment.num_actions())) 92 | 93 | Q = np.zeros((environment.num_states(), environment.num_actions())) 94 | N = np.zeros((environment.num_states(), environment.num_actions())) 95 | 96 | for episode in range(episodes): 97 | states_seen = one_episode_state_action_values(environment, lambda s: sample_action(policy, s), random_start=random_start) 98 | for state, actions_performed in states_seen.items(): 99 | for action, gain in actions_performed.items(): 100 | N[state, action] = N[state, action] + 1 101 | Q[state, action] = Q[state, action] + alpha_func(N[state, action])*(gain - Q[state, action]) 102 | epsilon = epsilon_func(episode, episodes) 103 | num_actions = Q.shape[1] 104 | policy[state] = (epsilon/num_actions) 105 | policy[state, np.argmax(Q[state])] += 1 - epsilon 106 | 107 | return policy, Q 108 | 109 | 110 | def det_fv_policy_q_evaluation(environment, policy, episodes=10000): 111 | """ 112 | First visit MC action-value deterministic policy evaluation with exploring starts. 113 | 114 | Returns the action-value function. 115 | """ 116 | Q = np.zeros((environment.num_states(), environment.num_actions())) 117 | N = np.zeros((environment.num_states(), environment.num_actions())) 118 | 119 | for episode in tqdm(range(episodes)): 120 | states_seen = one_episode_state_action_values(environment, lambda s: policy[s], random_start=True) 121 | for state, actions_performed in states_seen.items(): 122 | for action, gain in actions_performed.items(): 123 | N[state, action] = N[state, action] + 1 124 | Q[state, action] = Q[state, action] + (1.0/(N[state, action]))*(gain - Q[state, action]) 125 | 126 | return Q 127 | 128 | 129 | def fv_policy_evaluation(environment, policy, episodes=10000): 130 | """ 131 | First visit MC policy evaluation. 132 | 133 | Returns the state-value function. 
134 | """ 135 | V = np.zeros(environment.num_states()) 136 | N = np.zeros(environment.num_states()) 137 | 138 | for episode in tqdm(range(episodes)): 139 | s = environment.get_random_state() 140 | states_seen = {} 141 | episode_over = False 142 | while not episode_over: 143 | # If this is the first time we've seen this state 144 | if states_seen.get(s, None) is None: 145 | states_seen[s] = 0 146 | 147 | # Perform our action 148 | a = policy[s] 149 | (r, s_prime, episode_over) = environment.perform_action(s, a) 150 | 151 | # Update our gain counters 152 | states_seen = {state: gain + r for state, gain in states_seen.items()} 153 | 154 | # Update current state 155 | s = s_prime 156 | for state, gain in states_seen.items(): 157 | N[state] = N[state] + 1 158 | V[state] = V[state] + (1.0/(N[state]))*(gain - V[state]) 159 | 160 | return V -------------------------------------------------------------------------------- /monte_carlo/results/ace_optimal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/ace_optimal.png -------------------------------------------------------------------------------- /monte_carlo/results/ace_policy_evaluation_10000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/ace_policy_evaluation_10000.png -------------------------------------------------------------------------------- /monte_carlo/results/ace_policy_evaluation_500000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/ace_policy_evaluation_500000.png -------------------------------------------------------------------------------- /monte_carlo/results/no_ace_optimal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/no_ace_optimal.png -------------------------------------------------------------------------------- /monte_carlo/results/no_ace_policy_evaluation_10000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/no_ace_policy_evaluation_10000.png -------------------------------------------------------------------------------- /monte_carlo/results/no_ace_policy_evaluation_500000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/no_ace_policy_evaluation_500000.png -------------------------------------------------------------------------------- /monte_carlo/results/trained_bot_racing.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/trained_bot_racing.gif 
-------------------------------------------------------------------------------- /monte_carlo/results/untrained_bot_racing.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/monte_carlo/results/untrained_bot_racing.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | matplotlib==2.1.1 3 | numpy==1.13.3 4 | pygame==1.9.3 5 | pyparsing==2.2.0 6 | python-dateutil==2.6.1 7 | pytz==2017.3 8 | six==1.11.0 9 | tqdm==4.19.5 10 | -------------------------------------------------------------------------------- /rl_problem/README.md: -------------------------------------------------------------------------------- 1 | # The Reinforcement Learning Problem 2 | 3 | In this chapter, we learn about the full reinforcement learning problem. 4 | The problem consists of an environment and an agent. We have control over 5 | the agent and are responsible for choosing which **actions** the agent takes. 6 | The environment is outside of the agent and thus, we have no control over it 7 | in general. The agent and environment interact in a simple way. At every time 8 | step, the agent performs some action, and the environment responds with 9 | the next **state** and an immediate **reward**. 10 | 11 | 12 | #### Exercise 3.1 13 | The first exercise is to come up with three example tasks that we can fit into 14 | the reinforcement learning framework. Here are mine: 15 | 16 | 1. A program that plays blackjack. The state is made up of the cards 17 | that it can see on the table. The possible actions are hit or stay. The rewards 18 | would simply be +1 if the hand is won, -1 if the hand is lost, and 0 for any 19 | action that does not cause the hand to end. 20 | 2. A traffic light controller. The reward is the number of cars it is 21 | allowing to pass through so that it promotes effective traffic flow. The state 22 | is readings from distant sensors that the controller has on each side which 23 | tell it how far a car is from each side. The controller can make each of its 24 | four sides one of three colors so there are 3^4 possible actions. 25 | 3. A piano playing program. The action in this case is very simple- which keys 26 | do we press and lift? The state is the keys that have already been played or are already 27 | currently pressed. The reward could be supplied by human listeners and could be a numerical 28 | representation of how much they are currently enjoying the music. 29 | 30 | ### Gridworld 31 | 32 | A very simple example of the reinforcement learning problem is gridworld. 33 | 34 | ![Gridworld](./results/gridworld.png) 35 | 36 | The states are all of the cells on the grid. The possible actions that 37 | we can take are up, down, left, and right. The rules of the environment are: 38 | - If we try to make a move that would take us off the grid, we get a reward 39 | of -1 40 | - If we are on the A square, however, every move takes us to A' and results in 41 | a reward of 10 42 | - If we are on the B square, every move takes us to B' and results in a reward of 43 | 5 44 | - Every other move results in a reward of 0 and takes you to the square you would 45 | expect 46 | 47 | What is the optimal way to act in this environment? In other words, how 48 | do we act so that our "long-term reward" is maximized? 
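
Before getting to that, the rules above are concrete enough to write down as a one-step transition function. The sketch below is a hypothetical, simplified `step` helper, not the repo's implementation (the `GridWorld` class in `rl_problem/gridworld.py` encodes the same rules as reward and transition arrays); the special-square coordinates are taken from that class:

```python
GRID_SIZE = 5
A, A_PRIME = (0, 1), (4, 1)  # special squares as (row, col)
B, B_PRIME = (0, 3), (2, 3)
MOVES = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

def step(state, action):
    """One gridworld transition: returns (next_state, reward)."""
    if state == A:   # every move from A jumps to A' with reward 10
        return A_PRIME, 10
    if state == B:   # every move from B jumps to B' with reward 5
        return B_PRIME, 5
    row = state[0] + MOVES[action][0]
    col = state[1] + MOVES[action][1]
    if not (0 <= row < GRID_SIZE and 0 <= col < GRID_SIZE):
        return state, -1  # moving off the grid: stay put, reward -1
    return (row, col), 0  # every other move: reward 0

# e.g. step((0, 1), 'down') -> ((4, 1), 10)
```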
49 | 50 | #### Discounting 51 | 52 | One tricky thing about maximizing "long-term reward" is that this little game could 53 | potentially go on forever. To make the problem mathematically simpler, 54 | we use a strategy called discounting. This just means we weight 55 | future rewards in an exponentially decreasing way: 56 | 57 | γ^0 R_0 + γ^1 R_1 + γ^2 R_2 + ... 58 | (where 0 <= γ < 1) 59 | 60 | We call this discounted sum of future rewards the "return", which is our 61 | measure of expected long-term reward. 62 | 63 | #### Policies 64 | 65 | So we need to decide how to act in this environment. A policy specifies 66 | how an agent acts. For example, we could have a random policy, where the agent 67 | moves in a random direction at every time step, or we could have an "always down" policy, 68 | where the agent always moves down. In general, a policy is just a probability 69 | distribution that tells us the probability of taking each action in 70 | each state. 71 | 72 | #### Value Functions 73 | 74 | A value function describes the "value" of a particular state or action. 75 | The "value" of a state is the expected return from that state, 76 | as defined above. A value function depends 77 | upon a policy because in order to know how much reward we can expect from a 78 | particular state, we need to know how we are going to act. 79 | 80 | #### Uniform Policy Value Function 81 | 82 | Here is the value function for the uniform random policy (where we choose 83 | an action uniformly at random in every state). 84 | 85 | ![Uniform Random Policy Value Function](./results/uniform.png) 86 | 87 | #### Optimal Value Function 88 | 89 | What we are really interested in, though, is the optimal policy: the policy 90 | that gives us the most possible return from any given state. The optimal 91 | value function gives us the most possible return from any given state, so from 92 | it we can derive the optimal policy. Here is the optimal value function 93 | for gridworld, solved using value iteration. 94 | 95 | ![Optimal Value Function](./results/optimal.png) 96 | 97 | #### Sources: 98 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: An Introduction. 2nd ed., The MIT Press, 2012.
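As a footnote to the sections on discounting and value iteration above, here is a tiny, generic value-iteration sketch on a made-up two-state MDP. It is not the gridworld solver (that lives in `rl_problem/gridworld.py`); the transition probabilities and rewards are invented purely for illustration.

```python
import numpy as np

# Toy 2-state, 2-action MDP (numbers invented for illustration).
# P[a, s, s'] = transition probability, R[a, s] = expected immediate reward.
P = np.array([[[0.9, 0.1], [0.2, 0.8]],    # action 0
              [[0.5, 0.5], [0.0, 1.0]]])   # action 1
R = np.array([[1.0, 0.0],                  # action 0
              [2.0, -1.0]])                # action 1
gamma = 0.9

V = np.zeros(2)
for _ in range(1000):
    # Bellman optimality backup: V(s) <- max_a [ R(a, s) + gamma * sum_s' P(a, s, s') V(s') ]
    V_new = np.max(R + gamma * (P @ V), axis=0)
    if np.sum(np.abs(V_new - V)) < 1e-6:
        break
    V = V_new

print(V)                                        # approximately optimal state values
print(np.argmax(R + gamma * (P @ V), axis=0))   # greedy (optimal) action in each state
```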
-------------------------------------------------------------------------------- /rl_problem/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/rl_problem/__init__.py -------------------------------------------------------------------------------- /rl_problem/exercises/ex_3_17.py: -------------------------------------------------------------------------------- 1 | from rl_problem.gridworld import GridWorld 2 | 3 | g = GridWorld() 4 | 5 | optimal_value_function = g.get_optimal_value_function() 6 | print(optimal_value_function.reshape(5, 5)) 7 | -------------------------------------------------------------------------------- /rl_problem/exercises/gridworld_uniform_policy.py: -------------------------------------------------------------------------------- 1 | from rl_problem.gridworld import GridWorld 2 | 3 | # Using uniform policy 4 | g = GridWorld() 5 | uniform_policy = g.get_uniform_policy() 6 | value_func = g.get_value_function(uniform_policy).reshape(5, 5) 7 | print(value_func) 8 | 9 | -------------------------------------------------------------------------------- /rl_problem/gridworld.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from itertools import product 3 | 4 | 5 | class GridWorld: 6 | 7 | UP = 0 8 | RIGHT = 1 9 | DOWN = 2 10 | LEFT = 3 11 | 12 | def __init__(self, size=5): 13 | self.size = size 14 | self.action_space = [self.UP, self.RIGHT, self.DOWN, self.LEFT] 15 | self.A = (0, 1) 16 | self.A_prime = (4, 1) 17 | self.B = (0, 3) 18 | self.B_prime = (2, 3) 19 | self._rewards = self._init_rewards() 20 | self._transitions = self._init_state_transitions() 21 | 22 | def get_uniform_policy(self): 23 | policy = np.zeros((4, self.size, self.size)) 24 | policy[:, :, :] = 0.25 25 | return policy 26 | 27 | def get_expected_rewards(self, policy): 28 | expected_rewards = policy * self._rewards 29 | return expected_rewards.sum(axis=0) 30 | 31 | def _init_rewards(self): 32 | rewards = np.zeros((4, self.size, self.size)) 33 | rewards[[self.UP, self.DOWN], [0, self.size - 1], :] = -1 34 | rewards[[self.LEFT, self.RIGHT], :, [0, self.size - 1]] = -1 35 | # Special A location 36 | rewards[:, self.A[0], self.A[1]] = 10 37 | # Special B location 38 | rewards[:, self.B[0], self.B[1]] = 5 39 | return rewards 40 | 41 | def _init_state_transitions(self): 42 | state_transitions = np.zeros((4, self.size, self.size, self.size, self.size)) 43 | # Normal cases 44 | for row in range(self.size): 45 | for col in range(self.size): 46 | if row != 0: 47 | state_transitions[self.UP, row, col, row-1, col] = 1 48 | if row != self.size - 1: 49 | state_transitions[self.DOWN, row, col, row+1, col] = 1 50 | if col != 0: 51 | state_transitions[self.LEFT, row, col, row, col-1] = 1 52 | if col != self.size - 1: 53 | state_transitions[self.RIGHT, row, col, row, col+1] = 1 54 | 55 | # Handle edges 56 | for col in range(self.size): 57 | # Moving up or down in top or bottom row leaves you in same state 58 | state_transitions[[self.UP, self.DOWN], [0, self.size - 1], col, [0, self.size - 1], col] = 1 59 | for row in range(self.size): 60 | # Moving left or right in leftmost or rightmost column leaves you in same state 61 | state_transitions[[self.LEFT, self.RIGHT], row, [0, self.size - 1], row, [0, self.size - 1]] = 1 62 | 63 | # Handle A and B 64 | state_transitions[:, [self.A[0], self.B[0]],
[self.A[1], self.B[1]], :, :] = 0 65 | state_transitions[:, self.A[0], self.A[1], self.A_prime[0], self.A_prime[1]] = 1 66 | state_transitions[:, self.B[0], self.B[1], self.B_prime[0], self.B_prime[1]] = 1 67 | 68 | return state_transitions 69 | 70 | def get_value_function(self, policy, gamma=0.9): 71 | # Solve V = R + gamma*P(s,s')*V 72 | transition_probabilities = self.get_transition_probabilities(policy) 73 | expected_rewards = self.get_expected_rewards(policy).reshape(self.size**2) 74 | right_side_inverse = np.linalg.inv(np.identity(self.size**2) - gamma*transition_probabilities) 75 | return np.matmul(right_side_inverse, expected_rewards) 76 | 77 | def get_transition_probabilities(self, policy): 78 | ret = np.zeros((self.size**2, self.size**2)) 79 | for action in self.action_space: 80 | # p(a|s) 81 | action_policy = np.tile(policy[action, :, :].reshape(self.size**2), (self.size**2, 1)) 82 | # p(s'|s, a) 83 | state_transitions = self._transitions[action, :, :, :, :].reshape(self.size**2, self.size**2) 84 | ret = np.add(ret, np.multiply(action_policy, state_transitions)) 85 | return ret 86 | 87 | def get_optimal_value_function(self, gamma=0.9, convergence=0.01): 88 | ret = np.zeros(self.size**2) 89 | copy = np.copy(ret) 90 | diff = None 91 | while diff is None or diff > convergence: 92 | for row, col in product(range(self.size), range(self.size)): 93 | new_reward = None 94 | for action in self.action_space: 95 | next_state_distribution = self._transitions[action, row, col].reshape(self.size**2) 96 | expected_rewards = np.matmul(next_state_distribution, ret) 97 | test = self._rewards[action, row, col] + gamma*expected_rewards 98 | if new_reward is None or test > new_reward: 99 | new_reward = test 100 | copy[row*self.size + col] = new_reward 101 | diff = np.sum(np.fabs(np.subtract(ret, copy))) 102 | ret = copy 103 | copy = np.copy(ret) 104 | return ret 105 | -------------------------------------------------------------------------------- /rl_problem/results/gridworld.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/rl_problem/results/gridworld.png -------------------------------------------------------------------------------- /rl_problem/results/optimal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/rl_problem/results/optimal.png -------------------------------------------------------------------------------- /rl_problem/results/uniform.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/rl_problem/results/uniform.png -------------------------------------------------------------------------------- /runner.py: -------------------------------------------------------------------------------- 1 | import sys, runpy 2 | import os.path 3 | 4 | sys.path.append(os.path.dirname(__file__)) 5 | 6 | executable = sys.argv[1] 7 | sys.argv = sys.argv[1:] 8 | 9 | runpy.run_path(executable) 10 | -------------------------------------------------------------------------------- /td_learning/README.md: -------------------------------------------------------------------------------- 1 | # Temporal-Difference Learning 2 | 3 | In this chapter, we learn 
about temporal difference (TD) learning. Like Monte Carlo methods, temporal difference 4 | learning methods allow us to learn an optimal policy without a model of the environment. This means that we learn the 5 | optimal policy purely through experience. 6 | 7 | The difference between TD and Monte Carlo is that TD uses a technique called bootstrapping. Monte Carlo methods 8 | determine the value of a state based on a sample of all of the rewards that follow from it for the rest of the episode. 9 | So if I am in state 1, then go to states 2, 3, 4, and so on up to state 10, where the episode terminates with a reward 10 | of +1, I update my value estimate for each of those states with that final reward. 11 | 12 | TD learning methods instead use the knowledge that has already been accumulated to update value estimates, rather than waiting for the complete 13 | returns from visited states. So if I am in state 1, then go to state 2, I update my value estimate for state 1 with the immediate reward I 14 | obtained plus my current value estimate for state 2. So I am updating my value estimate based on other estimates: this 15 | is bootstrapping. This is the same idea that Dynamic Programming methods use. 16 | 17 | ## SARSA: On-Policy TD Control for the Racetrack Problem 18 | 19 | SARSA is one algorithm for doing control using temporal difference learning. We begin with an initial, arbitrary policy. 20 | We start in a certain state **S**, take an action **A**, observe reward **R**, arrive in a new state **S'**, and then 21 | take a new action **A'**. All of these (S, A, R, S', A') are used to update our action-value estimates (the estimates 22 | of the value of each state-action pair). So we were in state S and took action A, which gave us reward R and caused us 23 | to end up in state S', about to take action A'. You could say the observed value of (S, A) is the reward we just 24 | received plus the value of (S', A'), so this is the value that we move our value estimate of (S, A) towards. And that's 25 | it for updating our Q (action-value estimates). As for the policy that we follow, we simply behave in an epsilon-greedy 26 | way with respect to our current Q. As we follow this and train, our Q approaches the optimal state-action value function 27 | and our policy approaches the optimal policy. 28 | 29 | Here is my result for applying SARSA control to the racetrack problem: 30 | 31 | ![SarsaRacing](./results/sarsa_trained_bot.gif) 32 | 33 | ## Q-Learning: Off-Policy TD Control for the Racetrack Problem 34 | 35 | Q-learning also uses TD learning techniques but is a bit more clever than SARSA. Q-learning is an "off-policy" learning 36 | technique. This means that the policy that is being learned is not the same as the one that is being followed while 37 | learning. This is different from SARSA, which follows a certain policy, improves that same policy, and eventually 38 | returns that policy. The advantage of "off-policy" methods is that they allow your learning agent to explore and 39 | take riskier actions while the policy being learned can be greedy and only choose the actions that it already 40 | knows are good. 41 | 42 | Here is my result for applying Q-learning to the racetrack problem: 43 | 44 | ![QLearningRacing](./results/q_learning_trained_bot.gif) 45 | 46 | #### Sources: 47 | 1. Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: An Introduction. 2nd ed., The MIT Press, 2012.
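To summarize the two update rules described above in code, here is a minimal sketch. It is illustrative only: the repository's actual implementations are `sarsa` and `q_learning` in `td_learning/td.py` (which use an implicit discount factor of 1 and a visit-count-based step size); the function names and the `alpha`/`gamma` parameters below are placeholders.

```python
import numpy as np

def sarsa_update(Q, s, a, r, s_next, a_next, alpha=0.1, gamma=1.0):
    # SARSA (on-policy): bootstrap from the action A' actually chosen in S'
    td_target = r + gamma * Q[s_next, a_next]
    Q[s, a] += alpha * (td_target - Q[s, a])

def q_learning_update(Q, s, a, r, s_next, alpha=0.1, gamma=1.0):
    # Q-learning (off-policy): bootstrap from the greedy action in S',
    # regardless of which action the behaviour policy actually takes next
    td_target = r + gamma * np.max(Q[s_next])
    Q[s, a] += alpha * (td_target - Q[s, a])
```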
-------------------------------------------------------------------------------- /td_learning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/td_learning/__init__.py -------------------------------------------------------------------------------- /td_learning/exercises/q_learning_racing.py: -------------------------------------------------------------------------------- 1 | from environments.racing.racing import RaceTrack 2 | from td_learning import td 3 | import numpy as np 4 | import argparse 5 | 6 | 7 | parser = argparse.ArgumentParser(description='Q Learning Racetrack') 8 | 9 | parser.add_argument('racetrack', 10 | type=str, 11 | help='Path to racetrack csv file') 12 | parser.add_argument('policy', 13 | type=str, 14 | help='Path at which to save policy file') 15 | parser.add_argument('--convergence', 16 | type=float, 17 | help='Convergence criteria for Q', 18 | default=10000) 19 | parser.add_argument('--verbose', 20 | type=bool, 21 | help='Print (a lot of) log messages', 22 | default=False) 23 | args = parser.parse_args() 24 | 25 | racetrack = RaceTrack(args.racetrack) 26 | policy, Q = td.q_learning( 27 | racetrack, 28 | alpha_func=lambda n: 1/n, 29 | epsilon=0.2, 30 | convergence=args.convergence 31 | ) 32 | 33 | np.save(args.policy, policy) 34 | -------------------------------------------------------------------------------- /td_learning/exercises/sarsa_racing.py: -------------------------------------------------------------------------------- 1 | from environments.racing.racing import RaceTrack 2 | from td_learning import td 3 | import numpy as np 4 | import argparse 5 | 6 | 7 | parser = argparse.ArgumentParser(description='Sarsa Racetrack Policy Improvement') 8 | 9 | parser.add_argument('racetrack', 10 | type=str, 11 | help='Path to racetrack csv file') 12 | parser.add_argument('policy', 13 | type=str, 14 | help='Path at which to save policy file') 15 | parser.add_argument('--episodes', 16 | type=int, 17 | help='Number of episodes to train over', 18 | default=1000) 19 | parser.add_argument('--verbose', 20 | type=bool, 21 | help='Print (a lot of) log messages', 22 | default=False) 23 | args = parser.parse_args() 24 | 25 | racetrack = RaceTrack(args.racetrack) 26 | policy, Q = td.sarsa( 27 | racetrack, 28 | alpha_func=lambda n: 1/n, 29 | epsilon_func=lambda ep, eps: 1 - (ep/eps), 30 | episodes=args.episodes 31 | ) 32 | 33 | np.save(args.policy, policy) 34 | -------------------------------------------------------------------------------- /td_learning/results/q_learning_trained_bot.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/td_learning/results/q_learning_trained_bot.gif -------------------------------------------------------------------------------- /td_learning/results/sarsa_trained_bot.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NickCellino/reinforcement-learning-exercises/c83bd0c5bdbab2db7e995b92f1b93f3bad57ffb7/td_learning/results/sarsa_trained_bot.gif -------------------------------------------------------------------------------- /td_learning/td.py: -------------------------------------------------------------------------------- 1 | """ 2 | Temporal Difference learning methods 3 | 4 
| An environment is assumed to support the following operations: 5 | environment.num_states(): Returns the number of states in the environment 6 | environment.num_actions(): Returns the number of actions in the environment 7 | environment.get_starting_state(): Returns a starting state for an episode 8 | environment.perform_action(s, a): Returns a tuple (r, s', episode_over): the reward, the next state, and whether 9 | the episode is over 10 | 11 | A deterministic policy is an environment.num_states x 1 array 12 | A non-deterministic policy is an environment.num_states x environment.num_actions array 13 | """ 14 | import numpy as np 15 | from tqdm import tqdm 16 | 17 | from lib.policy import sample_action, get_epsilon_greedy_policy 18 | 19 | def sarsa( 20 | environment, 21 | epsilon_func=lambda ep, eps: 0.1, 22 | alpha_func=lambda n: 0.1, 23 | episodes=10000 24 | ): 25 | Q = np.zeros((environment.num_states(), environment.num_actions())) 26 | N = np.zeros((environment.num_states(), environment.num_actions())) 27 | policy = get_epsilon_greedy_policy(Q, (1.0/environment.num_actions())) 28 | for ep in tqdm(range(episodes)): 29 | episode_over = False 30 | s = environment.get_starting_state() 31 | a = sample_action(policy, s) 32 | while not episode_over: 33 | (r, s_prime, episode_over) = environment.perform_action(s, a) 34 | 35 | N[s, a] = N[s, a] + 1 36 | 37 | policy = get_epsilon_greedy_policy(Q, epsilon_func(ep, episodes)) 38 | a_prime = sample_action(policy, s_prime)  # choose the next action A' from the next state S' 39 | 40 | Q[s, a] = Q[s, a] + alpha_func(N[s, a]) * (r + Q[s_prime, a_prime] - Q[s, a])  # SARSA update (gamma = 1) 41 | 42 | s = s_prime 43 | a = a_prime 44 | return policy, Q 45 | 46 | def q_learning( 47 | environment, 48 | epsilon=0.3, 49 | alpha_func=lambda n: 0.2, 50 | convergence=0.1 51 | ): 52 | Q = np.zeros((environment.num_states(), environment.num_actions())) 53 | N = np.zeros((environment.num_states(), environment.num_actions())) 54 | diff = np.inf 55 | while diff > convergence: 56 | temp = np.copy(Q) 57 | # Perform 10,000 episodes, then check how much Q has changed 58 | for ep in tqdm(range(10000)): 59 | episode_over = False 60 | s = environment.get_starting_state() 61 | while not episode_over: 62 | policy = get_epsilon_greedy_policy(Q, epsilon) 63 | a = sample_action(policy, s) 64 | 65 | (r, s_prime, episode_over) = environment.perform_action(s, a) 66 | 67 | N[s, a] = N[s, a] + 1 68 | Q[s, a] = Q[s, a] + alpha_func(N[s, a]) * (r + np.amax(Q[s_prime]) - Q[s, a])  # Q-learning update (gamma = 1) 69 | 70 | s = s_prime 71 | diff = np.sum(np.fabs(np.subtract(Q, temp))) 72 | print(f'Diff: {diff}') 73 | 74 | return get_epsilon_greedy_policy(Q, 0.0), Q 75 | --------------------------------------------------------------------------------
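A hypothetical usage sketch for `td.sarsa` (not part of the repository): a tiny random-walk environment that satisfies the interface documented at the top of `td.py`. It assumes the helpers in `lib/policy.py` behave as used above, and it would be run through `runner.py` so that the project root is on the import path.

```python
# Hypothetical example environment; states and actions are integer indices.
from td_learning import td


class RandomWalk:
    """States 0..4; episodes start in state 2 and end at either edge.
    Reaching state 4 gives reward +1, everything else gives 0."""

    def num_states(self):
        return 5

    def num_actions(self):
        return 2  # 0 = move left, 1 = move right

    def get_starting_state(self):
        return 2

    def perform_action(self, s, a):
        s_prime = s - 1 if a == 0 else s + 1
        episode_over = s_prime in (0, 4)
        reward = 1 if s_prime == 4 else 0
        return reward, s_prime, episode_over


policy, Q = td.sarsa(RandomWalk(), episodes=1000)
print(Q)  # the action values should favour moving right in the interior states
```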