├── .gitignore ├── requirements.txt ├── readme.md ├── example_13_1.py ├── figure_13_1.py └── figure_13_2.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.sublime-project 3 | *.sublime-workspace 4 | *.idea 5 | *.npy 6 | ve 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | joblib==0.13.2 3 | kiwisolver==1.1.0 4 | matplotlib==3.0.3 5 | numpy==1.16.3 6 | pyparsing==2.4.0 7 | python-dateutil==2.8.0 8 | scipy==1.3.1 9 | six==1.12.0 10 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | Installation instructions: 2 | 1. Install Python 3.7.0 if necessary. 3 | 2. Create a new Python virtual environment named 've' in the 'chapter_13_examples' directory: 4 | ```$ python3 -m venv ve``` 5 | 3. Activate the virtual environment: 6 | ```$ source ve/bin/activate``` 7 | 4. Upgrade pip: 8 | ```(ve)$ pip install --upgrade pip``` 9 | 5. Install required packages: 10 | ```(ve)$ pip install -r requirements.txt``` 11 | 12 | Instructions for running scripts: 13 | 1. Activate the virtual environment: 14 | ```$ source ve/bin/activate``` 15 | 2. Run the desired scripts: 16 | ```(ve)$ python example_13_1.py``` 17 | ```(ve)$ python figure_13_1.py --confidence_intervals``` 18 | ```(ve)$ python figure_13_2.py --confidence_intervals``` 19 | 3. Deactivate the virtual environment when finished: 20 | ```(ve)$ deactivate``` -------------------------------------------------------------------------------- /example_13_1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script requires Python 3.7.0 and the following packages: 3 | numpy==1.16.3 4 | matplotlib==3.0.3 (for plotting results) 5 | ''' 6 | 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | # Calculate the value of the start state for a given 'right' probability: 13 | def v_s(pi): 14 | return (4 - 2 * pi)/(pi * pi - pi) 15 | 16 | 17 | # For each right probability, plot the value of the policy: 18 | right_probabilities = np.linspace(0.01, 0.99, 99) 19 | values = np.array([v_s(pi) for pi in right_probabilities]) 20 | plt.plot(right_probabilities, values, color='black') 21 | 22 | # Plot the value of e-greedy left policy: 23 | pi_e_greedy_left = .05 24 | v_e_greedy_left = v_s(pi_e_greedy_left) 25 | plt.plot(pi_e_greedy_left, v_e_greedy_left, color='black', marker='o') 26 | plt.annotate('$\\epsilon$-greedy \'left\'', (pi_e_greedy_left, v_e_greedy_left), xycoords='data', xytext=(10,-3), textcoords='offset points') 27 | 28 | # Plot the value of e-greedy right policy: 29 | pi_e_greedy_right = .95 30 | v_e_greedy_right = v_s(pi_e_greedy_right) 31 | plt.plot(pi_e_greedy_right, v_e_greedy_right, color='black', marker='o') 32 | plt.annotate('$\\epsilon$-greedy \'right\'', (pi_e_greedy_right, v_e_greedy_right), xycoords='data', xytext=(-85,-3), textcoords='offset points') 33 | 34 | # Plot the value of the optimal stochastic policy: 35 | pi_opt = 2 - np.sqrt(2) 36 | v_opt = v_s(pi_opt) 37 | plt.plot(pi_opt, v_opt, color='black', marker='o') 38 | plt.annotate('optimal stochastic policy', (pi_opt, v_opt), xycoords='data', xytext=(0,10), textcoords='offset points') 39 | 40 | # Configure the figure: 41 | plt.xlabel('Probability of action \'right\'') 42 | 
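# A quick sketch of where the constants used above come from, based on the closed-form v_s(p) = (4 - 2*p) / (p*p - p)
# defined at the top of this script: setting dv_s/dp = 0 gives p**2 - 4*p + 2 = 0, whose root in (0, 1) is
# p = 2 - sqrt(2) ~= 0.586, with v_s(2 - sqrt(2)) ~= -11.66. This is why pi_opt is set to 2 - np.sqrt(2) above
# and why an extra y-tick is added at -11 below.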
plt.ylabel('$J(\\mathbf{\\theta}) = v_{\\pi_{\\mathbf{\\theta}}}(S)$') 43 | plt.title('Short corridor with switched actions') 44 | plt.ylim([-100,0]) 45 | plt.yticks(list(plt.yticks()[0]) + [-11]) 46 | plt.savefig('example_13_1.png') -------------------------------------------------------------------------------- /figure_13_1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script requires Python 3.7.0 and the following packages: 3 | numpy==1.16.3 4 | matplotlib==3.0.3 (for plotting results) 5 | joblib==0.13.2 (for running experiments in parallel) 6 | scipy==1.3.1 (for computing error bars) 7 | ''' 8 | 9 | 10 | import argparse 11 | import os.path 12 | import numpy as np 13 | import scipy.stats as st 14 | import matplotlib.pyplot as plt 15 | from joblib import Parallel, delayed 16 | 17 | 18 | class ShortCorridor: 19 | start_state = 0 20 | goal_state = 3 21 | num_states = 4 22 | num_actions = 2 23 | left = 0 24 | right = 1 25 | 26 | @staticmethod 27 | def init(): 28 | return ShortCorridor.start_state 29 | 30 | @staticmethod 31 | def reset(): 32 | return ShortCorridor.start_state 33 | 34 | @staticmethod 35 | def step(state, action): 36 | assert ShortCorridor.start_state <= state < ShortCorridor.goal_state 37 | assert action == ShortCorridor.left or action == ShortCorridor.right 38 | 39 | if action == ShortCorridor.left: 40 | if state == 1: 41 | state += 1 42 | elif ShortCorridor.start_state < state: 43 | state -= 1 44 | elif action == ShortCorridor.right: 45 | if state == 1: 46 | state -= 1 47 | elif state < ShortCorridor.goal_state: 48 | state += 1 49 | else: 50 | raise ValueError('Invalid Action!') 51 | 52 | if state == ShortCorridor.goal_state: 53 | return -1, None 54 | else: 55 | return -1, state 56 | 57 | 58 | class ReinforceAgent: 59 | """ 60 | A REINFORCE agent with a discrete policy parameterization and linear function approximation. 61 | """ 62 | 63 | def __init__(self, num_actions, alpha): 64 | self.num_actions = num_actions 65 | self.alpha = alpha 66 | # Initialize the policy parameters: 67 | self.theta = np.log([[19], [1]]) # 5% chance of taking action 'right' 68 | 69 | def pi(self, x_s): 70 | """ 71 | Compute action probabilities from action preferences: 72 | :param x_s: state feature vector 73 | :return: an array of action probabilities 74 | """ 75 | # Compute action preferences for the given feature vector: 76 | preferences = self.theta.dot(x_s) 77 | # Convert overflows to underflows: 78 | preferences = preferences - preferences.max() 79 | # Convert the preferences into probabilities: 80 | exp_prefs = np.exp(preferences) 81 | return exp_prefs / np.sum(exp_prefs) 82 | 83 | def select_action(self, x_s): 84 | return np.random.choice(2, p=self.pi(x_s).squeeze()) 85 | 86 | def eligibility_vector(self, a, s): 87 | return self.x(s, a) - self.pi(self.x(s)) * (self.x(s, ShortCorridor.left) + self.x(s, ShortCorridor.right)) 88 | 89 | def x(self, s, a=None): 90 | """ 91 | Function approximator that computes state or state-action features. 
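:param s: the current state (unused: every state is mapped to the same single feature)
:param a: an optional action; if given, state-action features are returned instead
:return: the state feature vector [[1]] when a is None, otherwise a one-hot column vector over the two actions, as consumed by pi() and eligibility_vector() above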
92 | """ 93 | if a is None: 94 | return np.array([[1]]) 95 | elif a == ShortCorridor.right: 96 | return np.array([[0], [1]]) 97 | elif a == ShortCorridor.left: 98 | return np.array([[1], [0]]) 99 | else: 100 | raise ValueError('Invalid Action!') 101 | 102 | def learn(self, s_t, a_t, g_t): 103 | # Get state features: 104 | x_s = self.x(s_t) 105 | 106 | # Update policy weights: 107 | self.theta += self.alpha * g_t * self.eligibility_vector(a_t, s_t) 108 | 109 | 110 | def experiment(returns, alpha_index, alpha, run_num, random_seed, num_episodes, max_timesteps): 111 | np.random.seed(random_seed) 112 | agent = ReinforceAgent(num_actions=ShortCorridor.num_actions, alpha=alpha) 113 | 114 | for episode_num in range(num_episodes): 115 | episode = [] 116 | g = 0.0 117 | t = 0 118 | 119 | # Start an episode: 120 | s = ShortCorridor.init() 121 | x_s = agent.x(s) 122 | 123 | # Play out the episode: 124 | while (s is not None) and (t < max_timesteps): 125 | # Select action to take: 126 | a = agent.select_action(x_s) 127 | 128 | # Take action a, observe reward r' and next state s': 129 | r_prime, s_prime = ShortCorridor.step(s, a) 130 | 131 | # Save sequence for later: 132 | episode.append((s, a, r_prime)) 133 | 134 | # Update counters: 135 | s = s_prime 136 | g = g + r_prime 137 | t = t + 1 138 | 139 | # Store returns: 140 | returns[alpha_index, run_num, episode_num] = g 141 | 142 | # Episode finished, so update the agent: 143 | gt = g 144 | for t in range(len(episode)): 145 | # Unpack timestep: 146 | s, a, r_prime = episode[t] 147 | 148 | agent.learn(s, a, gt) 149 | 150 | # Compute return from t until end of episode for next timestep: 151 | gt = gt - r_prime 152 | 153 | 154 | if __name__ == '__main__': 155 | parser = argparse.ArgumentParser(description='A script to generate figure 13.1 from Sutton and Barto (2nd Ed.)') 156 | parser.add_argument('--alphas', type=float, nargs='*', default=[2**-12, 2**-13, 2**-14], help='Policy step sizes') 157 | parser.add_argument('--num_runs', type=int, default=100, help='The number of runs to average over') 158 | parser.add_argument('--num_episodes', type=int, default=1000, help='The number of episodes per run') 159 | parser.add_argument('--max_timesteps', type=int, default=1000, help='The maximum number of timesteps allowed per episode') 160 | parser.add_argument('--random_seed', type=int, default=2565, help='The random seed to use') 161 | parser.add_argument('--num_cpus', type=int, default=-1, help='The number of cpus to use') 162 | parser.add_argument('--confidence_intervals', action='store_true', help='Plot confidence intervals') 163 | args = parser.parse_args() 164 | 165 | # Set the random seed: 166 | np.random.seed(args.random_seed) 167 | # Generate a random seed for each run: 168 | random_seeds = [np.random.randint(low=0, high=2**32) for run in range(args.num_runs)] 169 | 170 | # If the data file already exists, use it instead of re-generating the data: 171 | if os.path.exists('returns_13_1.npy'): 172 | # Create memmapped arrays to be populated in parallel: 173 | returns = np.memmap('returns_13_1.npy', shape=(len(args.alphas), args.num_runs, args.num_episodes), dtype=np.int16, mode='r') 174 | else: 175 | # Create memmapped arrays to be populated in parallel: 176 | returns = np.memmap('returns_13_1.npy', shape=(len(args.alphas), args.num_runs, args.num_episodes), dtype=np.int16, mode='w+') 177 | 178 | # Run experiments in parallel: 179 | Parallel(n_jobs=args.num_cpus, verbose=10)(delayed(experiment)(returns, alpha_index, alpha, run_num, random_seed, 
args.num_episodes, args.max_timesteps) for run_num, random_seed in enumerate(random_seeds) for alpha_index, alpha in enumerate(args.alphas)) 180 | 181 | 182 | # Plot the results: 183 | fig = plt.figure() 184 | ax = fig.add_subplot(111) 185 | for alpha_index, alpha in enumerate(args.alphas): 186 | # Average over runs: 187 | means = np.mean(returns[alpha_index], axis=0) 188 | p = plt.plot(np.arange(args.num_episodes), means, label='2^{}'.format(int(np.log2(alpha)))) # keep reference for colour-matching with errorbars. 189 | 190 | if args.confidence_intervals: 191 | # Plot 95% confidence intervals: 192 | sems = st.sem(returns[alpha_index], axis=0) 193 | confs = sems * st.t.ppf((1.0 + 0.95) / 2, args.num_runs - 1) 194 | ax.errorbar(np.arange(args.num_episodes), means, yerr=[confs, confs], color=p[0].get_color(), alpha=.15) 195 | 196 | ax.legend(title='Step size $\\alpha$:') 197 | ax.set_xlabel('Episode') 198 | ax.set_ylabel('Total reward on episode') 199 | ax.set_ylim(-90,-10) 200 | ax.set_title('Performance of REINFORCE (averaged over {} runs)'.format(args.num_runs)) 201 | plt.savefig('figure_13_1.png') -------------------------------------------------------------------------------- /figure_13_2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script requires Python 3.7.0 and the following packages: 3 | numpy==1.16.3 4 | matplotlib==3.0.3 (for plotting results) 5 | joblib==0.13.2 (for running experiments in parallel) 6 | scipy==1.3.1 (for computing error bars) 7 | ''' 8 | 9 | import random 10 | import argparse 11 | import os.path 12 | import numpy as np 13 | import scipy.stats as st 14 | import matplotlib.pyplot as plt 15 | from joblib import Parallel, delayed 16 | 17 | 18 | class ShortCorridor: 19 | start_state = 0 20 | goal_state = 3 21 | num_states = 4 22 | num_actions = 2 23 | left = 0 24 | right = 1 25 | 26 | @staticmethod 27 | def init(): 28 | return ShortCorridor.start_state 29 | 30 | @staticmethod 31 | def reset(): 32 | return ShortCorridor.start_state 33 | 34 | @staticmethod 35 | def step(state, action): 36 | assert ShortCorridor.start_state <= state < ShortCorridor.goal_state 37 | assert action == ShortCorridor.left or action == ShortCorridor.right 38 | 39 | if action == ShortCorridor.left: 40 | if state == 1: 41 | state += 1 42 | elif ShortCorridor.start_state < state: 43 | state -= 1 44 | elif action == ShortCorridor.right: 45 | if state == 1: 46 | state -= 1 47 | elif state < ShortCorridor.goal_state: 48 | state += 1 49 | else: 50 | raise ValueError('Invalid Action!') 51 | 52 | if state == ShortCorridor.goal_state: 53 | return -1, None 54 | else: 55 | return -1, state 56 | 57 | 58 | class ReinforceWithBaseline: 59 | """ 60 | A REINFORCE agent with a discrete policy parameterization, linear function approximation, and an optional baseline. 
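After each episode, learn() is called once per visited timestep t with the undiscounted return G_t (gamma = 1), applying:
delta = G_t - w.T x(s_t)
w <- w + beta * delta * x(s_t)
theta <- theta + alpha * delta * (x(s_t, a_t) - sum_b pi(b|s_t) x(s_t, b))
where the last factor is the gradient of ln pi(a_t|s_t) for this softmax parameterization. With beta = 0 the value weights stay at zero and the agent reduces to plain REINFORCE without a baseline.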
61 | """ 62 | 63 | def __init__(self, num_actions, alpha, beta=0.0): 64 | self.num_actions = num_actions 65 | self.alpha = alpha 66 | self.beta = beta 67 | 68 | # Initialize the policy parameters: 69 | self.theta = np.log([[19], [1]]) # 5% chance of taking action 'right' 70 | 71 | # Initialize the value parameters: 72 | self.w = np.zeros((1, 1)) 73 | 74 | def pi(self, x_s): 75 | """ 76 | Compute action probabilities from action preferences: 77 | :param x_s: state feature vector 78 | :return: an array of action probabilities 79 | """ 80 | # Compute action preferences for the given feature vector: 81 | preferences = self.theta.dot(x_s) 82 | # Convert overflows to underflows: 83 | preferences = preferences - preferences.max() 84 | # Convert the preferences into probabilities: 85 | exp_prefs = np.exp(preferences) 86 | return exp_prefs / np.sum(exp_prefs) 87 | 88 | def select_action(self, x_s): 89 | return np.random.choice(2, p=self.pi(x_s).squeeze()) 90 | 91 | def eligibility_vector(self, a, s): 92 | return self.x(s, a) - self.pi(self.x(s)) * (self.x(s, ShortCorridor.left) + self.x(s, ShortCorridor.right)) 93 | 94 | def x(self, s, a=None): 95 | """ 96 | Function approximator that computes state or state-action features. 97 | """ 98 | if a is None: 99 | return np.array([[1]]) 100 | elif a == ShortCorridor.right: 101 | return np.array([[0], [1]]) 102 | elif a == ShortCorridor.left: 103 | return np.array([[1], [0]]) 104 | else: 105 | raise ValueError('Invalid Action!') 106 | 107 | def learn(self, s_t, a_t, g_t): 108 | # Get state features: 109 | x_s = self.x(s_t) 110 | 111 | # Compare return with baseline (state value estimate): 112 | delta = g_t - self.w.dot(x_s) 113 | 114 | # Update baseline weights: 115 | self.w += self.beta * delta * x_s 116 | 117 | # Update policy weights: 118 | self.theta += self.alpha * delta * self.eligibility_vector(a_t, s_t) 119 | 120 | 121 | def experiment(returns, alpha_index, alpha, beta_index, beta, run_num, random_seed, num_episodes, max_timesteps): 122 | np.random.seed(random_seed) 123 | agent = ReinforceWithBaseline(num_actions=ShortCorridor.num_actions, alpha=alpha, beta=beta) 124 | 125 | for episode_num in range(num_episodes): 126 | episode = [] 127 | g = 0.0 128 | t = 0 129 | 130 | # Start an episode: 131 | s = ShortCorridor.init() 132 | x_s = agent.x(s) 133 | 134 | # Play out the episode: 135 | while (s is not None) and (t < max_timesteps): 136 | # Select action to take: 137 | a = agent.select_action(x_s) 138 | 139 | # Take action a, observe reward r' and next state s': 140 | r_prime, s_prime = ShortCorridor.step(s, a) 141 | 142 | # Save sequence for later: 143 | episode.append((s, a, r_prime)) 144 | 145 | # Update counters: 146 | s = s_prime 147 | g = g + r_prime 148 | t = t + 1 149 | 150 | # Store returns: 151 | returns[alpha_index, beta_index, run_num, episode_num] = g 152 | 153 | # Episode finished, so update the agent: 154 | gt = g 155 | for t in range(len(episode)): 156 | # Unpack timestep: 157 | s, a, r_prime = episode[t] 158 | 159 | agent.learn(s, a, gt) 160 | 161 | # Compute return from t until end of episode for next timestep: 162 | gt = gt - r_prime 163 | 164 | 165 | if __name__ == '__main__': 166 | parser = argparse.ArgumentParser(description='A script to generate figure 13.1 from Sutton and Barto (2nd Ed.)') 167 | parser.add_argument('--alphas', type=float, nargs='*', default=[2**-9, 2**-13], help='Policy step sizes') 168 | parser.add_argument('--betas', type=float, nargs='*', default=[2**-6, 0.], help='Baseline step sizes') 169 | 
169 | parser.add_argument('--num_runs', type=int, default=100, help='The number of runs to average over') 170 | parser.add_argument('--num_episodes', type=int, default=1000, help='The number of episodes per run') 171 | parser.add_argument('--max_timesteps', type=int, default=1000, help='The maximum number of timesteps allowed per episode') 172 | parser.add_argument('--random_seed', type=int, default=2565, help='The random seed to use') 173 | parser.add_argument('--num_cpus', type=int, default=-1, help='The number of cpus to use') 174 | parser.add_argument('--confidence_intervals', action='store_true', help='Plot confidence intervals') 175 | args = parser.parse_args() 176 | 177 | # Set the random seed: 178 | random.seed(args.random_seed) 179 | # Generate a random seed for each run without replacement: 180 | random_seeds = random.sample(range(2**32), args.num_runs) 181 | 182 | # If the data file already exists, load it instead of re-generating the data: 183 | if os.path.exists('returns_13_2.npy'): 184 | # Load the previously-generated returns (read-only): 185 | returns = np.memmap('returns_13_2.npy', shape=(len(args.alphas), len(args.betas), args.num_runs, args.num_episodes), dtype=np.int16, mode='r') 186 | else: 187 | # Create a memmapped array to be populated in parallel: 188 | returns = np.memmap('returns_13_2.npy', shape=(len(args.alphas), len(args.betas), args.num_runs, args.num_episodes), dtype=np.int16, mode='w+') 189 | 190 | # Run experiments in parallel, skipped if the returns were loaded from disk (the read-only memmap cannot be written to). Alphas and betas are paired by index, so only the diagonal of the returns array is populated: 191 | if returns.mode != 'r': Parallel(n_jobs=args.num_cpus, verbose=10)(delayed(experiment)(returns, index, alpha, index, args.betas[index], run_num, random_seed, args.num_episodes, args.max_timesteps) for run_num, random_seed in enumerate(random_seeds) for index, alpha in enumerate(args.alphas)) 192 | 193 | 194 | # Plot the results: 195 | fig = plt.figure() 196 | ax = fig.add_subplot(111) 197 | # Plot each pair of alpha, beta: 198 | for index, alpha in enumerate(args.alphas): 199 | 200 | # Average over runs: 201 | means = np.mean(returns[index, index], axis=0) 202 | if args.betas[index] == 0.: 203 | label = '2^{}'.format(int(np.log2(alpha))) 204 | else: 205 | label = '2^{}, 2^{}'.format(int(np.log2(alpha)), int(np.log2(args.betas[index]))) 206 | 207 | p = plt.plot(np.arange(args.num_episodes), means, label=label) # keep reference for colour-matching with errorbars. 208 | 209 | if args.confidence_intervals: 210 | # Plot 95% confidence intervals: 211 | sems = st.sem(returns[index, index], axis=0) 212 | confs = sems * st.t.ppf((1.0 + 0.95) / 2, args.num_runs - 1) 213 | ax.errorbar(np.arange(args.num_episodes), means, yerr=[confs, confs], color=p[0].get_color(), alpha=.15) 214 | 215 | ax.legend(title='Step sizes $\\alpha^{\\theta}, \\alpha^{w}$:') 216 | ax.set_xlabel('Episode') 217 | ax.set_ylabel('Total reward on episode') 218 | ax.set_ylim(-90,-10) 219 | ax.set_title('Performance of REINFORCE with and without baseline\n(averaged over {} runs)'.format(args.num_runs)) 220 | plt.savefig('figure_13_2.png') 221 | --------------------------------------------------------------------------------