├── .gitignore ├── requirements.txt ├── readme.md ├── example_13_1.py ├── figure_13_1.py └── figure_13_2.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.sublime-project 3 | *.sublime-workspace 4 | *.idea 5 | *.npy 6 | ve 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | joblib==0.13.2 3 | kiwisolver==1.1.0 4 | matplotlib==3.0.3 5 | numpy==1.16.3 6 | pyparsing==2.4.0 7 | python-dateutil==2.8.0 8 | scipy==1.3.1 9 | six==1.12.0 10 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | Installation instructions: 2 | 1. Install Python 3.7.0 if necessary. 3 | 2. Create a new Python virtual environment named 've' in the 'chapter_13_examples' directory: 4 | ```$ python3 -m venv ve``` 5 | 3. Activate the virtual environment: 6 | ```$ source ve/bin/activate``` 7 | 4. Upgrade pip: 8 | ```(ve)$ pip install --upgrade pip``` 9 | 5. Install required packages: 10 | ```(ve)$ pip install -r requirements.txt``` 11 | 12 | Instructions for running scripts: 13 | 1. Activate the virtual environment: 14 | ```$ source ve/bin/activate``` 15 | 2. Run the desired scripts: 16 | ```(ve)$ python example_13_1.py``` 17 | ```(ve)$ python figure_13_1.py --confidence_intervals``` 18 | ```(ve)$ python figure_13_2.py --confidence_intervals``` 19 | 3. Deactivate the virtual environment when finished: 20 | ```(ve)$ deactivate``` -------------------------------------------------------------------------------- /example_13_1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script requires Python 3.7.0 and the following packages: 3 | numpy==1.16.3 4 | matplotlib==3.0.3 (for plotting results) 5 | ''' 6 | 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | # Calculate the value of the start state for a given 'right' probability: 13 | def v_s(pi): 14 | return (4 - 2 * pi)/(pi * pi - pi) 15 | 16 | 17 | # For each right probability, plot the value of the policy: 18 | right_probabilities = np.linspace(0.01, 0.99, 99) 19 | values = np.array([v_s(pi) for pi in right_probabilities]) 20 | plt.plot(right_probabilities, values, color='black') 21 | 22 | # Plot the value of e-greedy left policy: 23 | pi_e_greedy_left = .05 24 | v_e_greedy_left = v_s(pi_e_greedy_left) 25 | plt.plot(pi_e_greedy_left, v_e_greedy_left, color='black', marker='o') 26 | plt.annotate('$\\epsilon$-greedy \'left\'', (pi_e_greedy_left, v_e_greedy_left), xycoords='data', xytext=(10,-3), textcoords='offset points') 27 | 28 | # Plot the value of e-greedy right policy: 29 | pi_e_greedy_right = .95 30 | v_e_greedy_right = v_s(pi_e_greedy_right) 31 | plt.plot(pi_e_greedy_right, v_e_greedy_right, color='black', marker='o') 32 | plt.annotate('$\\epsilon$-greedy \'right\'', (pi_e_greedy_right, v_e_greedy_right), xycoords='data', xytext=(-85,-3), textcoords='offset points') 33 | 34 | # Plot the value of the optimal stochastic policy: 35 | pi_opt = 2 - np.sqrt(2) 36 | v_opt = v_s(pi_opt) 37 | plt.plot(pi_opt, v_opt, color='black', marker='o') 38 | plt.annotate('optimal stochastic policy', (pi_opt, v_opt), xycoords='data', xytext=(0,10), textcoords='offset points') 39 | 40 | # Configure the figure: 41 | plt.xlabel('Probability of action \'right\'') 42 | 
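# A quick sketch of where the constants used above come from, based on the closed-form v_s(p) = (4 - 2*p) / (p*p - p)
# defined at the top of this script: setting dv_s/dp = 0 gives p**2 - 4*p + 2 = 0, whose root in (0, 1) is
# p = 2 - sqrt(2) ~= 0.586, with v_s(2 - sqrt(2)) ~= -11.66. This is why pi_opt is set to 2 - np.sqrt(2) above
# and why an extra y-tick is added at -11 below.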
plt.ylabel('$J(\\mathbf{\\theta}) = v_{\\pi_{\\mathbf{\\theta}}}(S)$') 43 | plt.title('Short corridor with switched actions') 44 | plt.ylim([-100,0]) 45 | plt.yticks(list(plt.yticks()[0]) + [-11]) 46 | plt.savefig('example_13_1.png') -------------------------------------------------------------------------------- /figure_13_1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script requires Python 3.7.0 and the following packages: 3 | numpy==1.16.3 4 | matplotlib==3.0.3 (for plotting results) 5 | joblib==0.13.2 (for running experiments in parallel) 6 | scipy==1.3.1 (for computing error bars) 7 | ''' 8 | 9 | 10 | import argparse 11 | import os.path 12 | import numpy as np 13 | import scipy.stats as st 14 | import matplotlib.pyplot as plt 15 | from joblib import Parallel, delayed 16 | 17 | 18 | class ShortCorridor: 19 | start_state = 0 20 | goal_state = 3 21 | num_states = 4 22 | num_actions = 2 23 | left = 0 24 | right = 1 25 | 26 | @staticmethod 27 | def init(): 28 | return ShortCorridor.start_state 29 | 30 | @staticmethod 31 | def reset(): 32 | return ShortCorridor.start_state 33 | 34 | @staticmethod 35 | def step(state, action): 36 | assert ShortCorridor.start_state <= state < ShortCorridor.goal_state 37 | assert action == ShortCorridor.left or action == ShortCorridor.right 38 | 39 | if action == ShortCorridor.left: 40 | if state == 1: 41 | state += 1 42 | elif ShortCorridor.start_state < state: 43 | state -= 1 44 | elif action == ShortCorridor.right: 45 | if state == 1: 46 | state -= 1 47 | elif state < ShortCorridor.goal_state: 48 | state += 1 49 | else: 50 | raise ValueError('Invalid Action!') 51 | 52 | if state == ShortCorridor.goal_state: 53 | return -1, None 54 | else: 55 | return -1, state 56 | 57 | 58 | class ReinforceAgent: 59 | """ 60 | A REINFORCE agent with a discrete policy parameterization and linear function approximation. 61 | """ 62 | 63 | def __init__(self, num_actions, alpha): 64 | self.num_actions = num_actions 65 | self.alpha = alpha 66 | # Initialize the policy parameters: 67 | self.theta = np.log([[19], [1]]) # 5% chance of taking action 'right' 68 | 69 | def pi(self, x_s): 70 | """ 71 | Compute action probabilities from action preferences: 72 | :param x_s: state feature vector 73 | :return: an array of action probabilities 74 | """ 75 | # Compute action preferences for the given feature vector: 76 | preferences = self.theta.dot(x_s) 77 | # Convert overflows to underflows: 78 | preferences = preferences - preferences.max() 79 | # Convert the preferences into probabilities: 80 | exp_prefs = np.exp(preferences) 81 | return exp_prefs / np.sum(exp_prefs) 82 | 83 | def select_action(self, x_s): 84 | return np.random.choice(2, p=self.pi(x_s).squeeze()) 85 | 86 | def eligibility_vector(self, a, s): 87 | return self.x(s, a) - self.pi(self.x(s)) * (self.x(s, ShortCorridor.left) + self.x(s, ShortCorridor.right)) 88 | 89 | def x(self, s, a=None): 90 | """ 91 | Function approximator that computes state or state-action features. 
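:param s: the current state (unused: every state is mapped to the same single feature)
:param a: an optional action; if given, state-action features are returned instead
:return: the state feature vector [[1]] when a is None, otherwise a one-hot column vector over the two actions, as consumed by pi() and eligibility_vector() above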
92 | """ 93 | if a is None: 94 | return np.array([[1]]) 95 | elif a == ShortCorridor.right: 96 | return np.array([[0], [1]]) 97 | elif a == ShortCorridor.left: 98 | return np.array([[1], [0]]) 99 | else: 100 | raise ValueError('Invalid Action!') 101 | 102 | def learn(self, s_t, a_t, g_t): 103 | # Get state features: 104 | x_s = self.x(s_t) 105 | 106 | # Update policy weights: 107 | self.theta += self.alpha * g_t * self.eligibility_vector(a_t, s_t) 108 | 109 | 110 | def experiment(returns, alpha_index, alpha, run_num, random_seed, num_episodes, max_timesteps): 111 | np.random.seed(random_seed) 112 | agent = ReinforceAgent(num_actions=ShortCorridor.num_actions, alpha=alpha) 113 | 114 | for episode_num in range(num_episodes): 115 | episode = [] 116 | g = 0.0 117 | t = 0 118 | 119 | # Start an episode: 120 | s = ShortCorridor.init() 121 | x_s = agent.x(s) 122 | 123 | # Play out the episode: 124 | while (s is not None) and (t < max_timesteps): 125 | # Select action to take: 126 | a = agent.select_action(x_s) 127 | 128 | # Take action a, observe reward r' and next state s': 129 | r_prime, s_prime = ShortCorridor.step(s, a) 130 | 131 | # Save sequence for later: 132 | episode.append((s, a, r_prime)) 133 | 134 | # Update counters: 135 | s = s_prime 136 | g = g + r_prime 137 | t = t + 1 138 | 139 | # Store returns: 140 | returns[alpha_index, run_num, episode_num] = g 141 | 142 | # Episode finished, so update the agent: 143 | gt = g 144 | for t in range(len(episode)): 145 | # Unpack timestep: 146 | s, a, r_prime = episode[t] 147 | 148 | agent.learn(s, a, gt) 149 | 150 | # Compute return from t until end of episode for next timestep: 151 | gt = gt - r_prime 152 | 153 | 154 | if __name__ == '__main__': 155 | parser = argparse.ArgumentParser(description='A script to generate figure 13.1 from Sutton and Barto (2nd Ed.)') 156 | parser.add_argument('--alphas', type=float, nargs='*', default=[2**-12, 2**-13, 2**-14], help='Policy step sizes') 157 | parser.add_argument('--num_runs', type=int, default=100, help='The number of runs to average over') 158 | parser.add_argument('--num_episodes', type=int, default=1000, help='The number of episodes per run') 159 | parser.add_argument('--max_timesteps', type=int, default=1000, help='The maximum number of timesteps allowed per episode') 160 | parser.add_argument('--random_seed', type=int, default=2565, help='The random seed to use') 161 | parser.add_argument('--num_cpus', type=int, default=-1, help='The number of cpus to use') 162 | parser.add_argument('--confidence_intervals', action='store_true', help='Plot confidence intervals') 163 | args = parser.parse_args() 164 | 165 | # Set the random seed: 166 | np.random.seed(args.random_seed) 167 | # Generate a random seed for each run: 168 | random_seeds = [np.random.randint(low=0, high=2**32) for run in range(args.num_runs)] 169 | 170 | # If the data file already exists, use it instead of re-generating the data: 171 | if os.path.exists('returns_13_1.npy'): 172 | # Create memmapped arrays to be populated in parallel: 173 | returns = np.memmap('returns_13_1.npy', shape=(len(args.alphas), args.num_runs, args.num_episodes), dtype=np.int16, mode='r') 174 | else: 175 | # Create memmapped arrays to be populated in parallel: 176 | returns = np.memmap('returns_13_1.npy', shape=(len(args.alphas), args.num_runs, args.num_episodes), dtype=np.int16, mode='w+') 177 | 178 | # Run experiments in parallel: 179 | Parallel(n_jobs=args.num_cpus, verbose=10)(delayed(experiment)(returns, alpha_index, alpha, run_num, random_seed, 
args.num_episodes, args.max_timesteps) for run_num, random_seed in enumerate(random_seeds) for alpha_index, alpha in enumerate(args.alphas)) 180 | 181 | 182 | # Plot the results: 183 | fig = plt.figure() 184 | ax = fig.add_subplot(111) 185 | for alpha_index, alpha in enumerate(args.alphas): 186 | # Average over runs: 187 | means = np.mean(returns[alpha_index], axis=0) 188 | p = plt.plot(np.arange(args.num_episodes), means, label='2^{}'.format(int(np.log2(alpha)))) # keep reference for colour-matching with errorbars. 189 | 190 | if args.confidence_intervals: 191 | # Plot 95% confidence intervals: 192 | sems = st.sem(returns[alpha_index], axis=0) 193 | confs = sems * st.t.ppf((1.0 + 0.95) / 2, args.num_runs - 1) 194 | ax.errorbar(np.arange(args.num_episodes), means, yerr=[confs, confs], color=p[0].get_color(), alpha=.15) 195 | 196 | ax.legend(title='Step size $\\alpha$:') 197 | ax.set_xlabel('Episode') 198 | ax.set_ylabel('Total reward on episode') 199 | ax.set_ylim(-90,-10) 200 | ax.set_title('Performance of REINFORCE (averaged over {} runs)'.format(args.num_runs)) 201 | plt.savefig('figure_13_1.png') -------------------------------------------------------------------------------- /figure_13_2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script requires Python 3.7.0 and the following packages: 3 | numpy==1.16.3 4 | matplotlib==3.0.3 (for plotting results) 5 | joblib==0.13.2 (for running experiments in parallel) 6 | scipy==1.3.1 (for computing error bars) 7 | ''' 8 | 9 | import random 10 | import argparse 11 | import os.path 12 | import numpy as np 13 | import scipy.stats as st 14 | import matplotlib.pyplot as plt 15 | from joblib import Parallel, delayed 16 | 17 | 18 | class ShortCorridor: 19 | start_state = 0 20 | goal_state = 3 21 | num_states = 4 22 | num_actions = 2 23 | left = 0 24 | right = 1 25 | 26 | @staticmethod 27 | def init(): 28 | return ShortCorridor.start_state 29 | 30 | @staticmethod 31 | def reset(): 32 | return ShortCorridor.start_state 33 | 34 | @staticmethod 35 | def step(state, action): 36 | assert ShortCorridor.start_state <= state < ShortCorridor.goal_state 37 | assert action == ShortCorridor.left or action == ShortCorridor.right 38 | 39 | if action == ShortCorridor.left: 40 | if state == 1: 41 | state += 1 42 | elif ShortCorridor.start_state < state: 43 | state -= 1 44 | elif action == ShortCorridor.right: 45 | if state == 1: 46 | state -= 1 47 | elif state < ShortCorridor.goal_state: 48 | state += 1 49 | else: 50 | raise ValueError('Invalid Action!') 51 | 52 | if state == ShortCorridor.goal_state: 53 | return -1, None 54 | else: 55 | return -1, state 56 | 57 | 58 | class ReinforceWithBaseline: 59 | """ 60 | A REINFORCE agent with a discrete policy parameterization, linear function approximation, and an optional baseline. 
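After each episode, learn() is called once per visited timestep t with the undiscounted return G_t (gamma = 1), applying:
delta = G_t - w.T x(s_t)
w <- w + beta * delta * x(s_t)
theta <- theta + alpha * delta * (x(s_t, a_t) - sum_b pi(b|s_t) x(s_t, b))
where the last factor is the gradient of ln pi(a_t|s_t) for this softmax parameterization. With beta = 0 the value weights stay at zero and the agent reduces to plain REINFORCE without a baseline.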
61 | """ 62 | 63 | def __init__(self, num_actions, alpha, beta=0.0): 64 | self.num_actions = num_actions 65 | self.alpha = alpha 66 | self.beta = beta 67 | 68 | # Initialize the policy parameters: 69 | self.theta = np.log([[19], [1]]) # 5% chance of taking action 'right' 70 | 71 | # Initialize the value parameters: 72 | self.w = np.zeros((1, 1)) 73 | 74 | def pi(self, x_s): 75 | """ 76 | Compute action probabilities from action preferences: 77 | :param x_s: state feature vector 78 | :return: an array of action probabilities 79 | """ 80 | # Compute action preferences for the given feature vector: 81 | preferences = self.theta.dot(x_s) 82 | # Convert overflows to underflows: 83 | preferences = preferences - preferences.max() 84 | # Convert the preferences into probabilities: 85 | exp_prefs = np.exp(preferences) 86 | return exp_prefs / np.sum(exp_prefs) 87 | 88 | def select_action(self, x_s): 89 | return np.random.choice(2, p=self.pi(x_s).squeeze()) 90 | 91 | def eligibility_vector(self, a, s): 92 | return self.x(s, a) - self.pi(self.x(s)) * (self.x(s, ShortCorridor.left) + self.x(s, ShortCorridor.right)) 93 | 94 | def x(self, s, a=None): 95 | """ 96 | Function approximator that computes state or state-action features. 97 | """ 98 | if a is None: 99 | return np.array([[1]]) 100 | elif a == ShortCorridor.right: 101 | return np.array([[0], [1]]) 102 | elif a == ShortCorridor.left: 103 | return np.array([[1], [0]]) 104 | else: 105 | raise ValueError('Invalid Action!') 106 | 107 | def learn(self, s_t, a_t, g_t): 108 | # Get state features: 109 | x_s = self.x(s_t) 110 | 111 | # Compare return with baseline (state value estimate): 112 | delta = g_t - self.w.dot(x_s) 113 | 114 | # Update baseline weights: 115 | self.w += self.beta * delta * x_s 116 | 117 | # Update policy weights: 118 | self.theta += self.alpha * delta * self.eligibility_vector(a_t, s_t) 119 | 120 | 121 | def experiment(returns, alpha_index, alpha, beta_index, beta, run_num, random_seed, num_episodes, max_timesteps): 122 | np.random.seed(random_seed) 123 | agent = ReinforceWithBaseline(num_actions=ShortCorridor.num_actions, alpha=alpha, beta=beta) 124 | 125 | for episode_num in range(num_episodes): 126 | episode = [] 127 | g = 0.0 128 | t = 0 129 | 130 | # Start an episode: 131 | s = ShortCorridor.init() 132 | x_s = agent.x(s) 133 | 134 | # Play out the episode: 135 | while (s is not None) and (t < max_timesteps): 136 | # Select action to take: 137 | a = agent.select_action(x_s) 138 | 139 | # Take action a, observe reward r' and next state s': 140 | r_prime, s_prime = ShortCorridor.step(s, a) 141 | 142 | # Save sequence for later: 143 | episode.append((s, a, r_prime)) 144 | 145 | # Update counters: 146 | s = s_prime 147 | g = g + r_prime 148 | t = t + 1 149 | 150 | # Store returns: 151 | returns[alpha_index, beta_index, run_num, episode_num] = g 152 | 153 | # Episode finished, so update the agent: 154 | gt = g 155 | for t in range(len(episode)): 156 | # Unpack timestep: 157 | s, a, r_prime = episode[t] 158 | 159 | agent.learn(s, a, gt) 160 | 161 | # Compute return from t until end of episode for next timestep: 162 | gt = gt - r_prime 163 | 164 | 165 | if __name__ == '__main__': 166 | parser = argparse.ArgumentParser(description='A script to generate figure 13.1 from Sutton and Barto (2nd Ed.)') 167 | parser.add_argument('--alphas', type=float, nargs='*', default=[2**-9, 2**-13], help='Policy step sizes') 168 | parser.add_argument('--betas', type=float, nargs='*', default=[2**-6, 0.], help='Baseline step sizes') 169 | 
169 | parser.add_argument('--num_runs', type=int, default=100, help='The number of runs to average over') 170 | parser.add_argument('--num_episodes', type=int, default=1000, help='The number of episodes per run') 171 | parser.add_argument('--max_timesteps', type=int, default=1000, help='The maximum number of timesteps allowed per episode') 172 | parser.add_argument('--random_seed', type=int, default=2565, help='The random seed to use') 173 | parser.add_argument('--num_cpus', type=int, default=-1, help='The number of cpus to use') 174 | parser.add_argument('--confidence_intervals', action='store_true', help='Plot confidence intervals') 175 | args = parser.parse_args() 176 | 177 | # Set the random seed: 178 | random.seed(args.random_seed) 179 | # Generate a random seed for each run without replacement: 180 | random_seeds = random.sample(range(2**32), args.num_runs) 181 | 182 | # If the data file already exists, load it instead of re-generating the data: 183 | if os.path.exists('returns_13_2.npy'): 184 | # Load the previously-generated returns (read-only): 185 | returns = np.memmap('returns_13_2.npy', shape=(len(args.alphas), len(args.betas), args.num_runs, args.num_episodes), dtype=np.int16, mode='r') 186 | else: 187 | # Create a memmapped array to be populated in parallel: 188 | returns = np.memmap('returns_13_2.npy', shape=(len(args.alphas), len(args.betas), args.num_runs, args.num_episodes), dtype=np.int16, mode='w+') 189 | 190 | # Run experiments in parallel, skipped if the returns were loaded from disk (the read-only memmap cannot be written to). Alphas and betas are paired by index, so only the diagonal of the returns array is populated: 191 | if returns.mode != 'r': Parallel(n_jobs=args.num_cpus, verbose=10)(delayed(experiment)(returns, index, alpha, index, args.betas[index], run_num, random_seed, args.num_episodes, args.max_timesteps) for run_num, random_seed in enumerate(random_seeds) for index, alpha in enumerate(args.alphas)) 192 | 193 | 194 | # Plot the results: 195 | fig = plt.figure() 196 | ax = fig.add_subplot(111) 197 | # Plot each pair of alpha, beta: 198 | for index, alpha in enumerate(args.alphas): 199 | 200 | # Average over runs: 201 | means = np.mean(returns[index, index], axis=0) 202 | if args.betas[index] == 0.: 203 | label = '2^{}'.format(int(np.log2(alpha))) 204 | else: 205 | label = '2^{}, 2^{}'.format(int(np.log2(alpha)), int(np.log2(args.betas[index]))) 206 | 207 | p = plt.plot(np.arange(args.num_episodes), means, label=label) # keep reference for colour-matching with errorbars. 208 | 209 | if args.confidence_intervals: 210 | # Plot 95% confidence intervals: 211 | sems = st.sem(returns[index, index], axis=0) 212 | confs = sems * st.t.ppf((1.0 + 0.95) / 2, args.num_runs - 1) 213 | ax.errorbar(np.arange(args.num_episodes), means, yerr=[confs, confs], color=p[0].get_color(), alpha=.15) 214 | 215 | ax.legend(title='Step sizes $\\alpha^{\\theta}, \\alpha^{w}$:') 216 | ax.set_xlabel('Episode') 217 | ax.set_ylabel('Total reward on episode') 218 | ax.set_ylim(-90,-10) 219 | ax.set_title('Performance of REINFORCE with and without baseline\n(averaged over {} runs)'.format(args.num_runs)) 220 | plt.savefig('figure_13_2.png') 221 | --------------------------------------------------------------------------------