├── .gitignore ├── ch6-td ├── standard.png ├── grid.txt ├── wind.txt ├── kings_stochastic_wind.png ├── kings_deterministic_wind.png ├── kings_stochastic_wind_can_stay.png ├── standard_moves_stochastic_wind.png └── windy.py ├── ch8-dyna-q ├── grid.txt ├── sample_iter_01.png └── dyna-maze.py ├── ch12-eligibility_traces ├── grid.txt ├── wind.txt ├── figures │ ├── true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_0.00e+00.png │ ├── true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_1.00e+00.png │ ├── true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_5.00e-01.png │ ├── true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_9.00e-01.png │ └── true_online_sarsa_lambda_returns--alpha_1.25e-03--lambda_5.00e-01.png └── windy.py ├── ch13-policy_gradient ├── grid.txt ├── wind.txt ├── value_imgs │ ├── value_000000.png │ ├── value_001000.png │ ├── value_002000.png │ ├── value_003000.png │ ├── value_004000.png │ ├── value_005000.png │ ├── value_006000.png │ ├── value_007000.png │ ├── value_008000.png │ ├── value_009000.png │ ├── counts_000000.png │ ├── counts_001000.png │ ├── counts_002000.png │ ├── counts_003000.png │ ├── counts_004000.png │ ├── counts_005000.png │ ├── counts_006000.png │ ├── counts_007000.png │ ├── counts_008000.png │ └── counts_009000.png ├── figures │ ├── REINFORCE_returns--alpha_theta_1.00e-03--alpha_w_1.00e-03--_indicator.png │ ├── REINFORCE_returns--alpha_theta_1.00e-04--alpha_w_1.00e-05--_polynomial.png │ ├── REINFORCE_returns--alpha_theta_2.00e-03--alpha_w_1.00e-03--_indicator.png │ ├── REINFORCE_returns--alpha_theta_5.00e-04--alpha_w_1.00e-03--_indicator.png │ ├── REINFORCE_returns--alpha_theta_5.00e-04--alpha_w_1.00e-04--_indicator.png │ ├── REINFORCE_returns--alpha_theta_5.00e-05--alpha_w_1.00e-05--_polynomial.png │ ├── actor_critic_returns--alpha_theta_1.00e-02--alpha_w_1.00e-03--_indicator.png │ ├── actor_critic_returns--alpha_theta_1.00e-03--alpha_w_1.00e-04--_indicator.png │ ├── actor_critic_returns--alpha_theta_5.00e-03--alpha_w_1.00e-03--_indicator.png │ ├── actor_critic_returns--alpha_theta_5.00e-04--alpha_w_1.00e-04--_polynomial.png │ └── actor_critic_returns--alpha_theta_5.00e-04--alpha_w_1.00e-05--_polynomial.png ├── reinforce.py ├── reinforce_with_baseline.py └── windy.py ├── ch5-monte_carlo ├── result_iter_000100000.png ├── sample_iter_000100000.png ├── racetrack_tiny.txt ├── racetrack_small.txt ├── racetrack.txt └── racetrack.py ├── README.md ├── ch11-function_approximation └── baird_counterexample.py ├── ch4-value_iteration └── gambler.py └── ch4-policy_iteration └── jacksrental_v1.py /.gitignore: -------------------------------------------------------------------------------- 1 | sandbox/ 2 | *.DS_Store 3 | *~ -------------------------------------------------------------------------------- /ch6-td/standard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch6-td/standard.png -------------------------------------------------------------------------------- /ch8-dyna-q/grid.txt: -------------------------------------------------------------------------------- 1 | 0000000023 2 | 0020000020 3 | 1020000020 4 | 0020000000 5 | 0000002000 6 | 0000000000 7 | -------------------------------------------------------------------------------- /ch6-td/grid.txt: -------------------------------------------------------------------------------- 1 | 0000000000 2 | 0000000000 3 | 0000000000 4 | 1000000200 5 | 0000000000 6 | 0000000000 7 | 0000000000 8 | 
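The grid.txt and wind.txt files above and below are plain digit matrices, one character per cell; the gridworld scripts in this repo (windy.py, dyna-maze.py) read them with np.genfromtxt using a fixed field width of 1. A minimal loading sketch (the cell codes follow the START/GOAL/BLOCK constants defined in each script, e.g. 1 = start and 2 = goal in ch6-td and ch12, but 1 = start, 2 = wall, 3 = goal in ch8-dyna-q):

import numpy as np

grid = np.genfromtxt('grid.txt', delimiter=1).astype(int)  # one digit per cell
wind = np.genfromtxt('wind.txt', delimiter=1).astype(int)  # upward wind strength per column
start_row, start_col = np.where(grid == 1)                 # 1 marks the start cell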
-------------------------------------------------------------------------------- /ch6-td/wind.txt: -------------------------------------------------------------------------------- 1 | 0001112210 2 | 0001112210 3 | 0001112210 4 | 0001112210 5 | 0001112210 6 | 0001112210 7 | 0001112210 8 | -------------------------------------------------------------------------------- /ch8-dyna-q/sample_iter_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch8-dyna-q/sample_iter_01.png -------------------------------------------------------------------------------- /ch6-td/kings_stochastic_wind.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch6-td/kings_stochastic_wind.png -------------------------------------------------------------------------------- /ch6-td/kings_deterministic_wind.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch6-td/kings_deterministic_wind.png -------------------------------------------------------------------------------- /ch12-eligibility_traces/grid.txt: -------------------------------------------------------------------------------- 1 | 0000000000 2 | 0000000000 3 | 0000000000 4 | 1000000200 5 | 0000000000 6 | 0000000000 7 | 0000000000 8 | -------------------------------------------------------------------------------- /ch12-eligibility_traces/wind.txt: -------------------------------------------------------------------------------- 1 | 0001112210 2 | 0001112210 3 | 0001112210 4 | 0001112210 5 | 0001112210 6 | 0001112210 7 | 0001112210 8 | -------------------------------------------------------------------------------- /ch13-policy_gradient/grid.txt: -------------------------------------------------------------------------------- 1 | 0000000000 2 | 0000000000 3 | 0000000000 4 | 0000100200 5 | 0000000000 6 | 0000000000 7 | 0000000000 8 | -------------------------------------------------------------------------------- /ch13-policy_gradient/wind.txt: -------------------------------------------------------------------------------- 1 | 0001112210 2 | 0001112210 3 | 0001112210 4 | 0001112210 5 | 0001112210 6 | 0001112210 7 | 0001112210 8 | -------------------------------------------------------------------------------- /ch5-monte_carlo/result_iter_000100000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch5-monte_carlo/result_iter_000100000.png -------------------------------------------------------------------------------- /ch5-monte_carlo/sample_iter_000100000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch5-monte_carlo/sample_iter_000100000.png -------------------------------------------------------------------------------- /ch6-td/kings_stochastic_wind_can_stay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch6-td/kings_stochastic_wind_can_stay.png -------------------------------------------------------------------------------- /ch6-td/standard_moves_stochastic_wind.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch6-td/standard_moves_stochastic_wind.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_000000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_000000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_001000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_001000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_002000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_002000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_003000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_003000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_004000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_004000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_005000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_005000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_006000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_006000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_007000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_007000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_008000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_008000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_009000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_009000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_000000.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_000000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_001000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_001000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_002000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_002000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_003000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_003000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_004000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_004000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_005000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_005000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_006000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_006000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_007000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_007000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_008000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_008000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_009000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_009000.png -------------------------------------------------------------------------------- /ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_0.00e+00.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_0.00e+00.png -------------------------------------------------------------------------------- /ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_1.00e+00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_1.00e+00.png -------------------------------------------------------------------------------- /ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_5.00e-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_5.00e-01.png -------------------------------------------------------------------------------- /ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_9.00e-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_9.00e-01.png -------------------------------------------------------------------------------- /ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-03--lambda_5.00e-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-03--lambda_5.00e-01.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_1.00e-03--alpha_w_1.00e-03--_indicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_1.00e-03--alpha_w_1.00e-03--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_1.00e-04--alpha_w_1.00e-05--_polynomial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_1.00e-04--alpha_w_1.00e-05--_polynomial.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_2.00e-03--alpha_w_1.00e-03--_indicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_2.00e-03--alpha_w_1.00e-03--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_5.00e-04--alpha_w_1.00e-03--_indicator.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_5.00e-04--alpha_w_1.00e-03--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_5.00e-04--alpha_w_1.00e-04--_indicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_5.00e-04--alpha_w_1.00e-04--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_5.00e-05--alpha_w_1.00e-05--_polynomial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_5.00e-05--alpha_w_1.00e-05--_polynomial.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_1.00e-02--alpha_w_1.00e-03--_indicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_1.00e-02--alpha_w_1.00e-03--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_1.00e-03--alpha_w_1.00e-04--_indicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_1.00e-03--alpha_w_1.00e-04--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_5.00e-03--alpha_w_1.00e-03--_indicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_5.00e-03--alpha_w_1.00e-03--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_5.00e-04--alpha_w_1.00e-04--_polynomial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_5.00e-04--alpha_w_1.00e-04--_polynomial.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_5.00e-04--alpha_w_1.00e-05--_polynomial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_5.00e-04--alpha_w_1.00e-05--_polynomial.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repo contains my solutions to programming exercises in the book 2 | Reinforcement Learning: An Introduction, by Sutton and 
Barto, 2nd Edition 3 | (2018). 4 | 5 | It also contains implementations of some RL algorithms presented in the book that are not required as exercises. 6 | 7 | These scripts should only be considered as a reference. Use at your own risk. 8 | 9 | -------------------------------------------------------------------------------- /ch5-monte_carlo/racetrack_tiny.txt: -------------------------------------------------------------------------------- 1 | 11113 2 | 11100 3 | 11100 4 | 22000 -------------------------------------------------------------------------------- /ch5-monte_carlo/racetrack_small.txt: -------------------------------------------------------------------------------- 1 | 11111113 2 | 11111113 3 | 11111113 4 | 11111113 5 | 11111000 6 | 11110000 7 | 11110000 8 | 11110000 9 | 11110000 10 | 22220000 -------------------------------------------------------------------------------- /ch11-function_approximation/baird_counterexample.py: -------------------------------------------------------------------------------- 1 | """ Sutton and Barto 2nd edition, Exercise 11.3 2 | Divergence of semi-gradient Q-learning in Baird's counterexample 3 | 4 | jlezama@fing.edu.uy 5 | """ 6 | 7 | import os 8 | import numpy as np 9 | from scipy.stats.distributions import poisson 10 | 11 | 12 | import matplotlib 13 | # Force matplotlib to not use any Xwindows backend.
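# The one_step() function below applies the off-policy semi-gradient update of
# Exercise 11.3 (Baird's counterexample):
#     delta_t = R_{t+1} + gamma * (w_t . x_{t+1}) - (w_t . x_t)
#     w_{t+1} = w_t + alpha * rho_t * delta_t * x_t
# where rho_t is the importance-sampling ratio of the target policy over the
# behavior policy, and the rows of the matrix X defined further down are the
# usual Baird features (2 on the state's own component and 1 on the shared
# eighth component for the first six states, reversed for the seventh).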
14 | matplotlib.use('Agg') 15 | import matplotlib.pyplot as plt 16 | 17 | 18 | 19 | ################################################################################ 20 | 21 | 22 | def one_step(w_t, x_t, x_tp1, rho_t, gamma=0.99, alpha=0.01, R_tp1=0): 23 | 24 | delta_t = R_tp1 + gamma * np.dot(w_t,x_tp1) - np.dot(w_t, x_t) 25 | w_tp1 = w_t + alpha * rho_t * delta_t * x_t 26 | return w_tp1 27 | 28 | 29 | ################################################################################ 30 | Ns = 7 # number of states 31 | 32 | # states matrix 33 | X = np.asarray([[2,0,0,0,0,0,0,1],[0,2,0,0,0,0,0,1],[0,0,2,0,0,0,0,1],[0,0,0,2,0,0,0,1],[0,0,0,0,2,0,0,1],[0,0,0,0,0,2,0,1],[0,0,0,0,0,0,1,2]]) 34 | 35 | 36 | 37 | # initial state is state 7 38 | i_t = Ns 39 | x_t = X[i_t-1] 40 | 41 | # initital weight vector 42 | w_t = np.ones(8) 43 | 44 | 45 | 46 | for it in range(100000): 47 | if i_t == Ns: 48 | i_tp1 = np.random.randint(Ns+1) 49 | else: 50 | i_tp1 = Ns 51 | 52 | # importance sampling: target policy over behavior 53 | if i_t ==Ns and i_tp1 THETA: 18 | print 'entering while' 19 | Delta = 0 20 | for s in range(100): 21 | v = V[s] 22 | 23 | argmax_a = -np.inf 24 | max_return = -np.inf 25 | for a in range(min(s,100-s)+1): 26 | expected_return = 0 27 | for s_prime in [s-a, s+a]: 28 | P, R = PR(s,a,s_prime) 29 | expected_return += P * (R + gamma * V[s_prime]) 30 | 31 | 32 | if expected_return> max_return: 33 | max_return = expected_return 34 | argmax_a = a 35 | # if expected_return == max_return: 36 | # argmax_a = np.random.choice([a, argmax_a]) 37 | 38 | V[s] = max_return 39 | pi[s] = argmax_a 40 | Delta = max(Delta, np.abs(v-V[s])) 41 | 42 | it+=1 43 | plot(V, pi, it) 44 | 45 | return V, pi 46 | 47 | def PR(s, a, s_prime): 48 | global p_h 49 | 50 | 51 | # with probabilty p_h you get s+a, with probability 1-p_h you get s-a 52 | if s_prime == s+a: 53 | return p_h, int(s_prime==100) 54 | elif s_prime == s-a: 55 | return 1-p_h, 0 56 | 57 | else: 58 | return 0, 0 59 | 60 | 61 | def plot(V, pi, it): 62 | global fig, axes 63 | os.system("mkdir -p gambler_figures") 64 | 65 | fig, axes = plt.subplots(1, 2) 66 | 67 | ax = axes[0] 68 | im = ax.plot(V) 69 | ax.set_title('V') 70 | 71 | ax = axes[1] 72 | im = ax.bar(range(101),pi) 73 | ax.set_title('pi') 74 | 75 | plt.savefig('gambler_figures/result_iter_%02i.png' % it) 76 | 77 | plt.clf() 78 | 79 | ################################################################################ 80 | # MAIN LOOP 81 | 82 | THETA = 1e-16 83 | p_h = 0.4 84 | 85 | gamma = 1; 86 | 87 | V = np.zeros(101) 88 | pi = np.zeros(101) 89 | 90 | 91 | V, pi = value_iteration(V, pi, PR, gamma) 92 | 93 | 94 | -------------------------------------------------------------------------------- /ch13-policy_gradient/reinforce.py: -------------------------------------------------------------------------------- 1 | """ Sutton and Barto 2nd edition, Chapter 13. Policy Gradient Methods 2 | Implementation of REINFORCE algorithm for the short corridor example 3 | 4 | jlezama@fing.edu.uy 5 | """ 6 | 7 | import os 8 | import numpy as np 9 | from scipy.stats.distributions import poisson 10 | 11 | 12 | import matplotlib 13 | # Force matplotlib to not use any Xwindows backend. 14 | matplotlib.use('Agg') 15 | import matplotlib.pyplot as plt 16 | 17 | 18 | 19 | ################################################################################ 20 | def one_step(s,a): 21 | """receives current state and action s,a, returns reward and next state, r, 22 | s_prime. 
a is either 0 (left) or 1 (right) 23 | """ 24 | 25 | R = -1 26 | if s == 0: 27 | s_prime = a # left (0) goes to state 0, right (1) goes to state 1 28 | elif s == 1: 29 | s_prime = 2 if a ==0 else 0 # reversed motion 30 | elif s == 2: 31 | s_prime = 3 if a == 1 else 1 32 | 33 | return R, s_prime 34 | 35 | 36 | ################################################################################ 37 | def x(s,a): 38 | xs = np.asarray([[0,1],[1,0]]) 39 | x = xs[a] 40 | return x 41 | 42 | 43 | ################################################################################ 44 | def compute_pi(theta, s): 45 | # compute soft-max for linear feature theta^T.x 46 | h = np.zeros(2) 47 | 48 | for a in range(2): 49 | h[a] = np.dot(x(s,a), theta) 50 | 51 | h -= np.max(h) 52 | 53 | pi = np.exp(h)/np.sum(np.exp(h)) 54 | 55 | return pi 56 | 57 | 58 | ################################################################################ 59 | def compute_grad(theta,s,a): 60 | # compute soft-max for linear feature theta^T.x 61 | pi = compute_pi(theta,s) 62 | 63 | 64 | not_a = np.abs(a-1) 65 | 66 | grad = x(s,a) - pi[not_a] * x(s,not_a) 67 | 68 | 69 | return grad 70 | 71 | def v(w,s): 72 | """ See Sutton and Barto 2nd edition 13.4, page 273 """ 73 | return w 74 | 75 | 76 | ################################################################################ 77 | def REINFORCE(theta, gamma = 1.0, alpha=2**-13): 78 | EPISODES= 200 79 | 80 | GOAL = 3 81 | 82 | 83 | G_0s = [] 84 | for ep in range(EPISODES): 85 | G = 0 86 | s = 0 87 | a = np.argmax(compute_pi(theta,s)) 88 | 89 | ep_s = [] 90 | ep_a = [] 91 | ep_R = [] 92 | 93 | ep_s.append(s) 94 | ep_a.append(a) 95 | 96 | R, s = one_step(s,a) 97 | 98 | ep_R.append(R) 99 | 100 | 101 | while s != GOAL: 102 | 103 | pi = compute_pi(theta, s) 104 | a = np.random.choice(2, p=pi) 105 | 106 | # print s,pi,a 107 | # if np.random.rand()<0.1: 108 | # a = np.random.choice([0,1]) 109 | 110 | 111 | R, s = one_step(s,a) 112 | 113 | ep_s.append(s) 114 | ep_a.append(a) 115 | 116 | ep_R.append(R) 117 | 118 | # print 'GOAL!' 119 | ep_R = np.asarray(ep_R) 120 | 121 | for t in range(len(ep_s)): 122 | G_t = np.sum(ep_R[t:]) 123 | theta += alpha * (gamma**t) * G_t * compute_grad(theta, ep_s[t],ep_a[t]) 124 | 125 | G_0 = np.sum(ep_R) 126 | 127 | #print 'ep %i, G_0 %f' % (ep, G_0), theta 128 | G_0s.append(G_0) 129 | 130 | return theta, np.asarray(G_0s) 131 | 132 | 133 | ################################################################################ 134 | # MAIN LOOP 135 | 136 | RUNS = 100 137 | 138 | theta = np.random.randn(2) 139 | 140 | theta, G_0s = REINFORCE(theta) 141 | 142 | G_0s = G_0s.reshape(1,-1) 143 | 144 | for i in range(RUNS): 145 | theta = np.random.randn(2) 146 | theta, G_0s_t = REINFORCE(theta) 147 | G_0s = np.concatenate((G_0s, G_0s_t.reshape(1,-1)), axis=0) 148 | 149 | print 'RUN %i/%i' % (i,RUNS) 150 | 151 | print G_0s.shape 152 | print np.mean(G_0s,axis=0) 153 | 154 | 155 | savefname = 'G.png' 156 | plt.plot(np.mean(G_0s, axis=0)) 157 | plt.savefig(savefname) 158 | 159 | 160 | os.system('open %s' % savefname) 161 | -------------------------------------------------------------------------------- /ch13-policy_gradient/reinforce_with_baseline.py: -------------------------------------------------------------------------------- 1 | """ Sutton and Barto 2nd edition, Chapter 13. 
Policy Gradient Methods 2 | Implementation of REINFORCE algorithm for the short corridor example 3 | 4 | jlezama@fing.edu.uy 5 | """ 6 | 7 | import os 8 | import numpy as np 9 | from scipy.stats.distributions import poisson 10 | 11 | 12 | import matplotlib 13 | # Force matplotlib to not use any Xwindows backend. 14 | matplotlib.use('Agg') 15 | import matplotlib.pyplot as plt 16 | 17 | 18 | 19 | ################################################################################ 20 | def one_step(s,a): 21 | """receives current state and action s,a, returns reward and next state, r, 22 | s_prime. a is either 0 (left) or 1 (right) 23 | """ 24 | 25 | R = -1 26 | if s == 0: 27 | s_prime = a # left (0) goes to state 0, right (1) goes to state 1 28 | elif s == 1: 29 | s_prime = 2 if a ==0 else 0 # reversed motion 30 | elif s == 2: 31 | s_prime = 3 if a == 1 else 1 32 | 33 | return R, s_prime 34 | 35 | 36 | ################################################################################ 37 | def x(s,a): 38 | xs = np.asarray([[0,1],[1,0]]) 39 | x = xs[a] 40 | return x 41 | 42 | 43 | ################################################################################ 44 | def compute_pi(theta, s): 45 | # compute soft-max for linear feature theta^T.x 46 | h = np.zeros(2) 47 | 48 | for a in range(2): 49 | h[a] = np.dot(x(s,a), theta) 50 | 51 | h -= np.max(h) 52 | 53 | pi = np.exp(h)/np.sum(np.exp(h)) 54 | 55 | return pi 56 | 57 | 58 | 59 | ################################################################################ 60 | def compute_grad(theta,s,a): 61 | # compute soft-max for linear feature theta^T.x 62 | pi = compute_pi(theta,s) 63 | 64 | 65 | not_a = np.abs(a-1) 66 | 67 | grad = x(s,a) - np.dot(pi[not_a], x(s,not_a)) 68 | 69 | 70 | return grad 71 | 72 | def v(w,s): 73 | """ See Sutton and Barto 2nd edition 13.4, page 273 """ 74 | return w 75 | 76 | 77 | ################################################################################ 78 | def REINFORCE(theta, w, gamma = 1.0, alpha_theta=2**-9, alpha_w=2**-6): 79 | EPISODES= 200 80 | 81 | GOAL = 3 82 | 83 | 84 | G_0s = [] 85 | for ep in range(EPISODES): 86 | G = 0 87 | s = 0 88 | a = np.argmax(compute_pi(theta,s)) 89 | 90 | ep_s = [] 91 | ep_a = [] 92 | ep_R = [] 93 | 94 | ep_s.append(s) 95 | ep_a.append(a) 96 | 97 | R, s = one_step(s,a) 98 | 99 | ep_R.append(R) 100 | 101 | 102 | while s != GOAL: 103 | 104 | pi = compute_pi(theta, s) 105 | a = np.random.choice(2, p=pi) 106 | 107 | # print s,pi,a 108 | # if np.random.rand()<0.1: 109 | # a = np.random.choice([0,1]) 110 | 111 | 112 | R, s = one_step(s,a) 113 | 114 | ep_s.append(s) 115 | ep_a.append(a) 116 | 117 | ep_R.append(R) 118 | 119 | # print 'GOAL!' 
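# REINFORCE with baseline (Sutton and Barto 2nd ed., Section 13.4): the loop below
# walks the finished episode and, for each step t, forms the return G_t, the error
# delta_t = G_t - v(w, .), and then applies
#     w     += alpha_w     * gamma^t * delta_t              (baseline update; v is a
#                                                            single scalar here, so
#                                                            its gradient w.r.t. w is 1)
#     theta += alpha_theta * gamma^t * delta_t * compute_grad(theta, S_t, A_t)
# Note that v(w, s) ignores its state argument: the baseline is state-independent.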
120 | ep_R = np.asarray(ep_R) 121 | 122 | for t in range(len(ep_s)): 123 | G_t = np.sum(ep_R[t:]) 124 | delta_t = G_t - v(w,s) 125 | 126 | w += alpha_w * (gamma**t) * delta_t 127 | 128 | theta += alpha_theta * (gamma**t) * delta_t * compute_grad(theta, ep_s[t],ep_a[t]) 129 | 130 | G_0 = np.sum(ep_R) 131 | 132 | #print 'ep %i, G_0 %f' % (ep, G_0), theta 133 | G_0s.append(G_0) 134 | 135 | return theta, np.asarray(G_0s) 136 | 137 | 138 | ################################################################################ 139 | # MAIN LOOP 140 | 141 | RUNS = 100 142 | 143 | theta = np.random.randn(2) 144 | 145 | w = np.random.randn(1) 146 | 147 | theta, G_0s = REINFORCE(theta, w) 148 | 149 | G_0s = G_0s.reshape(1,-1) 150 | 151 | for i in range(RUNS): 152 | theta = np.random.randn(2) 153 | w = np.random.rand(1) 154 | 155 | theta, G_0s_t = REINFORCE(theta,w) 156 | G_0s = np.concatenate((G_0s, G_0s_t.reshape(1,-1)), axis=0) 157 | 158 | print 'RUN %i/%i' % (i,RUNS) 159 | 160 | print G_0s.shape 161 | 162 | 163 | 164 | savefname = 'G_baseline.png' 165 | plt.plot(np.mean(G_0s, axis=0)) 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | plt.savefig(savefname) 175 | 176 | 177 | os.system('open %s' % savefname) 178 | -------------------------------------------------------------------------------- /ch6-td/windy.py: -------------------------------------------------------------------------------- 1 | """ Windy Gridworld Problem. Exercises 6.9 and 6.10, Sutton and Barto 2nd edition. 2 | jlezama@fing.edu.uy 3 | """ 4 | 5 | import os 6 | import numpy as np 7 | from scipy.stats.distributions import poisson 8 | 9 | 10 | import matplotlib 11 | # Force matplotlib to not use any Xwindows backend. 12 | matplotlib.use('Agg') 13 | import matplotlib.pyplot as plt 14 | 15 | ################################################################################ 16 | # MAIN FUNCTIONS 17 | def sarsa(Q, EPISODES, alpha, gamma, eps, stochastic_wind): 18 | global wind, grid, START, GOAL 19 | 20 | H, W = wind.shape 21 | 22 | 23 | for episode in range(EPISODES): 24 | 25 | Is, Js = np.where(grid==START) 26 | # S = (Is, Js) 27 | 28 | A, vx, vy = epsilon_greedy(Q, Is, Js, eps) 29 | 30 | steps = 0 31 | 32 | while grid[Is, Js] != GOAL: 33 | # take action A 34 | Is_prime = Is + vy - wind[Is,Js] + stochastic_wind * ( wind[Is,Js]>0) * np.random.choice([-1,0,1]) 35 | Js_prime = Js + vx 36 | 37 | Is_prime = min(H-1, max(0,Is_prime)) 38 | Js_prime = min(W-1, max(0,Js_prime)) 39 | 40 | # choose A_prime from S_prime 41 | A_prime, vx_prime, vy_prime = epsilon_greedy(Q, Is_prime, Js_prime, eps) 42 | 43 | Q[Is, Js, A] += alpha * (-1 + gamma*Q[Is_prime, Js_prime, A_prime] - Q[Is,Js,A]) 44 | 45 | Is = Is_prime 46 | Js = Js_prime 47 | 48 | vx = vx_prime 49 | vy = vy_prime 50 | A = A_prime 51 | 52 | steps+=1 53 | 54 | if steps %1000 ==0: 55 | print 'step %i' % steps, Is, Js, Is_prime, Js_prime, vy, vx 56 | 57 | print 'reached goal in %i steps, episode %i'% (steps, episode) 58 | 59 | return Q 60 | 61 | ################################################################################ 62 | # AUX FUNCTIONS 63 | 64 | def epsilon_greedy(Q, Is, Js, eps): 65 | global actions 66 | 67 | 68 | best_actions_ix = np.where(Q[Is, Js, actions]==np.max(Q[Is, Js, actions]))[0] 69 | 70 | 71 | if len(best_actions_ix)>1: 72 | best_action_ix = np.random.choice(best_actions_ix) 73 | else: 74 | best_action_ix = best_actions_ix[0] 75 | 76 | best_action = actions[best_action_ix] if np.random.rand()>eps else np.random.choice(actions) 77 | 78 | vx, vy = action_to_pair(best_action) 79 
| 80 | return best_action, vx, vy 81 | 82 | def action_to_pair(a): 83 | assert(a>=0 and a<9) 84 | vx = int(np.floor(a/3)-1) 85 | vy = int(np.mod(a,3)-1) 86 | return vx, vy 87 | 88 | def pair_to_action(vx, vy): 89 | assert np.abs(vx)<=1 and np.abs(vy)<=1 90 | 91 | return int((vx+1)*3 + vy + 1) 92 | 93 | 94 | # 0 (-1, -1) 95 | # 1 (-1, 0) 96 | # 2 (-1, 1) 97 | # 3 (0, -1) 98 | # 4 (0, 0) 99 | # 5 (0, 1) 100 | # 6 (1, -1) 101 | # 7 (1, 0) 102 | # 8 (1, 1) 103 | 104 | # non-king moves: 1, 3, 5, 7 105 | # king moves: 0, 1, 2, 3, 5, 6, 7, 8 106 | 107 | def plot_sample(Q, it, stochastic_wind, eps=0): 108 | global wind 109 | 110 | plt.clf() 111 | x_t = [] 112 | y_t = [] 113 | 114 | H, W = wind.shape 115 | 116 | Is, Js = np.where(grid==START) 117 | 118 | A, vx, vy = epsilon_greedy(Q, Is, Js, eps) 119 | 120 | steps = 0 121 | 122 | y_t.append(Is) 123 | x_t.append(Js) 124 | 125 | while grid[Is, Js] != GOAL and steps <5000: 126 | # take action A 127 | Is_prime = Is + vy - wind[Is,Js] + stochastic_wind * ( wind[Is,Js]>0) * np.random.choice([-1,0,1]) 128 | Js_prime = Js + vx 129 | 130 | Is_prime = min(H-1, max(0,Is_prime)) 131 | Js_prime = min(W-1, max(0,Js_prime)) 132 | 133 | # choose A_prime from S_prime 134 | A_prime, vx_prime, vy_prime = epsilon_greedy(Q, Is_prime, Js_prime, eps) 135 | 136 | 137 | Is = Is_prime 138 | Js = Js_prime 139 | 140 | vx = vx_prime 141 | vy = vy_prime 142 | A = A_prime 143 | 144 | y_t.append(Is) 145 | x_t.append(Js) 146 | 147 | steps+=1 148 | 149 | if steps<5000: 150 | print 'reached goal in %i steps, episode %i'% (steps, -1) 151 | else: 152 | print 'couldnt reach goal in 5000 steps with greedy' 153 | 154 | plt.imshow(wind, interpolation='none') 155 | plt.plot(np.asarray(x_t), np.asarray(y_t),'o-') 156 | plt.scatter(x_t[-1], y_t[-1], color='red', s=50) 157 | 158 | plt.savefig('sample_iter_%02i.png' % it) 159 | 160 | ################################################################################ 161 | # MAIN LOOP 162 | 163 | START = 1 164 | GOAL = 2 165 | 166 | GAMMA = 1 167 | EPSILON = 0.1 168 | ALPHA = 0.5 169 | 170 | 171 | EPISODES = 1500 172 | 173 | 174 | KINGS = False 175 | STAY = False # wether not moving is an option 176 | STOCHASTIC = False # stochastic wind 177 | 178 | wind = np.genfromtxt('wind.txt', delimiter=1).astype(int) 179 | grid = np.genfromtxt('grid.txt', delimiter=1) 180 | 181 | 182 | if KINGS: 183 | actions = [0, 1, 2, 3, 5, 6, 7, 8] 184 | else: 185 | actions = [1, 3, 5, 7] 186 | 187 | if STAY: 188 | actions.append(4) 189 | 190 | actions = np.asarray(actions).astype(int) 191 | 192 | H, W = wind.shape 193 | 194 | Q = np.zeros((H,W,9)) 195 | 196 | Q = sarsa(Q, EPISODES, ALPHA, GAMMA, EPSILON, STOCHASTIC) 197 | 198 | 199 | plot_sample(Q,1, STOCHASTIC) 200 | -------------------------------------------------------------------------------- /ch4-policy_iteration/jacksrental_v1.py: -------------------------------------------------------------------------------- 1 | """ Jack's Car Rental Problem. Exercise 4.5, Sutton and Barto 2nd edition. 
2 | jlezama@fing.edu.uy 3 | """ 4 | import os 5 | import numpy as np 6 | from scipy.stats.distributions import poisson 7 | import matplotlib 8 | import matplotlib.pyplot as plt 9 | 10 | def policy_evaluation(V, pi, PR, gamma): 11 | """ 12 | V should be a dict containing values for all states 13 | pi should be a dict containing the chosen action for each state 14 | PR(s, a, s_prime) returns the transition probability P(s'|s,a) 15 | and the expected reward R(s,a,s') 16 | gamma is the discount factor 17 | """ 18 | 19 | global MAX_CARS, THETA 20 | 21 | 22 | 23 | Delta = np.inf 24 | 25 | while Delta > THETA: 26 | print 'entering while' 27 | Delta = 0 28 | 29 | for s1 in range(MAX_CARS): 30 | for s2 in range(MAX_CARS): 31 | 32 | print 'value evaluation for state %i,%i' % (s1, s2) 33 | v = V[s1,s2] 34 | a = pi[s1,s2] 35 | V[s1,s2] = 0 36 | for s1_prime in range(MAX_CARS): 37 | for s2_prime in range(MAX_CARS): 38 | # V[s] += P[s,a,s_prime]*(R[s,a,s_prime] + gamma*V[s_prime]) # dictionary version 39 | P_sas_prime, R_sas_prime = PR((s1,s2), a, (s1_prime, s2_prime)) # function version 40 | V[s1, s2] += P_sas_prime*(R_sas_prime + gamma*V[s1_prime, s2_prime]) # function version 41 | Delta = max(Delta, abs(v - V[s1, s2])) 42 | print Delta 43 | 44 | return V 45 | 46 | def policy_improvement(V, pi, actions, PR, gamma): 47 | policy_stable = True 48 | 49 | 50 | for s1 in range(MAX_CARS): 51 | for s2 in range(MAX_CARS): 52 | 53 | print 'policy improvement for state %i/%i' % (s1, s2) 54 | old_action = pi[s1, s2] 55 | 56 | max_return = -np.inf 57 | argmax_a = -np.inf 58 | 59 | for a in actions: 60 | expected_return = 0 61 | for s1_prime in range(MAX_CARS): 62 | for s2_prime in range(MAX_CARS): 63 | 64 | 65 | P_sas_prime, R_sas_prime = PR((s1,s2), a, (s1_prime, s2_prime)) # function version 66 | expected_return += P_sas_prime*(R_sas_prime + gamma*V[s1_prime, s2_prime]) # function version 67 | 68 | if expected_return > max_return: 69 | max_return = expected_return 70 | argmax_a = a 71 | 72 | pi[s1, s2] = argmax_a 73 | 74 | if old_action != pi[s1, s2]: 75 | policy_stable = False 76 | 77 | return pi, policy_stable 78 | 79 | 80 | 81 | ################################################################################ 82 | # REWARD AND TRANSITION PROBABILITIES 83 | ################################################################################ 84 | def PR(s, a, s_prime): 85 | global lambda_ret1, lambda_req1, lambda_ret2, lambda_req2, MAX_TRIPS 86 | 87 | if np.abs(a)>MAX_TRIPS: 88 | # cannot move more than MAX_TRIPS cars overnight 89 | return 0, 0 90 | morning_loc1 = s[0] + a 91 | morning_loc2 = s[1] - a 92 | 93 | night_loc1 = s_prime[0] 94 | night_loc2 = s_prime[1] 95 | 96 | P1, R1 = prob_ret_req(morning_loc1, night_loc1, lambda_ret1, lambda_req1) 97 | P2, R2 = prob_ret_req(morning_loc2, night_loc2, lambda_ret2, lambda_req2) 98 | 99 | P = P1 * P2 100 | R = R1 + R2 - np.abs(a)*2 101 | 102 | return P, R 103 | 104 | def prob_ret_req(n_morning, n_night, lambda_ret, lambda_req): 105 | """ 106 | Probability for one agency of having n_morning cars in the morning and 107 | n_night cars in the night. Depends on the probabilities of returns and 108 | requests, as well as the max car availability.
109 | """ 110 | prob = 0 111 | difference = n_night - n_morning 112 | R = 0 113 | 114 | for ret in range(int(10*lambda_ret)): 115 | for req in range(int(10*lambda_req)): 116 | if ret-req != difference: 117 | continue 118 | p_ret = poisson.pmf(ret, lambda_ret) 119 | p_req = poisson.pmf(req, lambda_req) 120 | 121 | 122 | 123 | prob += p_ret*p_req 124 | 125 | R += p_ret * p_req * req * 10 # expected reward 126 | 127 | return prob, R 128 | 129 | def plot(V, pi, it): 130 | os.system("mkdir -p figures") 131 | 132 | fig, axes = plt.subplots(1, 2) 133 | 134 | ax = axes[0] 135 | im = ax.imshow(V, interpolation='none') 136 | ax.set_title('V') 137 | ax.set_xlabel('Location 1') 138 | ax.set_ylabel('Location 2') 139 | 140 | plt.colorbar(im, ax=ax) 141 | 142 | ax = axes[1] 143 | im = ax.imshow(pi, interpolation='none') 144 | ax.set_title('pi') 145 | ax.set_xlabel('Location 1') 146 | ax.set_ylabel('Location 2') 147 | 148 | plt.colorbar(im, ax=ax) 149 | plt.savefig('figures/result_iter_%02i.png' % it) 150 | 151 | ################################################################################ 152 | # MAIN LOOP 153 | ################################################################################ 154 | 155 | acc_factor = 2.0 156 | 157 | THETA = 5.0 158 | MAX_CARS = int(20/acc_factor) 159 | MAX_TRIPS = int(5/acc_factor) 160 | 161 | # DEFINE PARAMETERS 162 | 163 | actions = range(-1*MAX_TRIPS, MAX_TRIPS+1) 164 | 165 | V = np.zeros((MAX_CARS, MAX_CARS)) 166 | pi = np.zeros((MAX_CARS, MAX_CARS)) 167 | 168 | 169 | gamma = 0.9 170 | 171 | lambda_ret1 = 3/acc_factor 172 | lambda_ret2 = 2/acc_factor 173 | lambda_req1 = 3/acc_factor 174 | lambda_req2 = 4/acc_factor 175 | 176 | 177 | # RUN ITERATIONS 178 | policy_stable = False 179 | 180 | it = 0 181 | 182 | plot(V, pi, it) 183 | 184 | while not policy_stable: 185 | V = policy_evaluation(V, pi, PR, gamma) 186 | pi, policy_stable = policy_improvement(V, pi, actions, PR, gamma) 187 | it += 1 188 | plot(V, pi, it) 189 | -------------------------------------------------------------------------------- /ch8-dyna-q/dyna-maze.py: -------------------------------------------------------------------------------- 1 | """ Dyna-Q Maze Exercise 8.4, based on Example 8.1 Sutton and Barto 2nd edition. 2 | jlezama@fing.edu.uy 3 | 4 | TODO: implement exploration bonus 5 | 6 | """ 7 | 8 | import os 9 | import numpy as np 10 | from scipy.stats.distributions import poisson 11 | 12 | 13 | import matplotlib 14 | # Force matplotlib to not use any Xwindows backend. 
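# Tabular Dyna-Q (Sutton and Barto 2nd ed., Section 8.2). The step labels (d), (e),
# (f) in dynaq() below follow the boxed algorithm in the book:
#   (d) one-step Q-learning update from the real transition (S, A, R, S'),
#   (e) deterministic model update  Model(S, A) <- R, S',
#   (f) `planning_steps` extra Q-learning updates on previously visited
#       state-action pairs, using transitions replayed from the model.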
15 | matplotlib.use('Agg') 16 | import matplotlib.pyplot as plt 17 | 18 | ################################################################################ 19 | # MAIN FUNCTIONS 20 | def dynaq(Q, Model, alpha, gamma, eps, steps, planning_steps): 21 | global grid, START, GOAL, BLOCK 22 | 23 | H, W = grid.shape 24 | 25 | Is, Js = np.where(grid==START) 26 | 27 | goals = 0 28 | 29 | for step in range(steps): 30 | 31 | A = epsilon_greedy(Q, Is, Js, eps) 32 | 33 | vx, vy = action_to_pair(A) 34 | 35 | 36 | Is_prime = Is + vy 37 | Js_prime = Js + vx 38 | 39 | Is_prime = min(H-1, max(0,Is_prime)) 40 | Js_prime = min(W-1, max(0,Js_prime)) 41 | 42 | if grid[Is_prime, Js_prime] == BLOCK: 43 | Is_prime = Is 44 | Js_prime = Js 45 | 46 | 47 | reached_goal = False 48 | if grid[Is_prime, Js_prime] == GOAL: 49 | R = 1 50 | reached_goal = True 51 | goals += 1 52 | else: 53 | R = 0 54 | 55 | # step (d) 56 | Q[Is, Js, A] = Q[Is, Js, A] + alpha * (R + gamma * np.max(Q[Is_prime, Js_prime,:]) -Q[Is, Js, A]) 57 | 58 | # step (e) 59 | Model[Is, Js, A, 0] = R # reward 60 | Model[Is, Js, A, 1] = Is_prime # s' 61 | Model[Is, Js, A, 2] = Js_prime 62 | 63 | 64 | 65 | # step (f) 66 | # get visited states 67 | Is_visited, Js_visited, A_visited = np.where(Model[:,:,:,1] >= 0) # -1 is default unvisited state 68 | for n in range(planning_steps): 69 | ix = np.random.randint(Is_visited.shape[0]) 70 | Is_n = Is_visited[ix] 71 | Js_n = Js_visited[ix] 72 | A_n = A_visited[ix] 73 | 74 | R_n = Model[Is_n, Js_n, A_n, 0] 75 | Is_prime_n = Model[Is_n, Js_n, A_n, 1] 76 | Js_prime_n = Model[Is_n, Js_n, A_n, 2] 77 | 78 | Q[Is_n, Js_n, A_n] = Q[Is_n, Js_n, A_n] + alpha * (R_n + gamma * np.max(Q[Is_prime_n, Js_prime_n, :]) - Q[Is_n, Js_n, A_n]) 79 | 80 | 81 | 82 | if reached_goal: 83 | Is, Js = np.where(grid==START) 84 | else: 85 | Is = Is_prime 86 | Js = Js_prime 87 | 88 | if step %1000 ==0: 89 | print 'step', step, Is, Js, Is_prime, Js_prime, vy, vx, 'reached goal %i times' % goals 90 | 91 | 92 | 93 | return Q 94 | 95 | ################################################################################ 96 | # AUX FUNCTIONS 97 | 98 | def epsilon_greedy(Q, Is, Js, eps): 99 | global actions 100 | 101 | 102 | best_actions_ix = np.where(Q[Is, Js, actions]==np.max(Q[Is, Js, actions]))[0] 103 | 104 | 105 | if len(best_actions_ix)>1: 106 | best_action_ix = np.random.choice(best_actions_ix) 107 | else: 108 | best_action_ix = best_actions_ix[0] 109 | 110 | best_action = actions[best_action_ix] if np.random.rand()>eps else np.random.choice(actions) 111 | 112 | return best_action 113 | 114 | def action_to_pair(a): 115 | # very lazy way to implement this 116 | 117 | if a == 0: 118 | vy = -1 119 | vx = 0 120 | elif a == 1: 121 | vy = 1 122 | vx = 0 123 | elif a == 2: 124 | vy = 0 125 | vx = -1 126 | elif a == 3: 127 | vy = 0 128 | vx = 1 129 | else: 130 | raise ValueError('Invalid action') 131 | 132 | return vx, vy 133 | 134 | def pair_to_action(vx, vy): 135 | if (-1, 0) == (vy, vx): 136 | return 0 137 | elif (1, 0) == (vy, vx): 138 | return 1 139 | elif (0, -1) == (vy, vx): 140 | return 2 141 | elif (0, 1) == (vy, vx): 142 | return 3 143 | else: 144 | raise ValueError('Invalid pair') 145 | 146 | 147 | 148 | def plot_sample(Q, it, eps=0): 149 | global grid 150 | 151 | plt.clf() 152 | x_t = [] 153 | y_t = [] 154 | 155 | H, W = grid.shape 156 | 157 | Is, Js = np.where(grid==START) 158 | 159 | A = epsilon_greedy(Q, Is, Js, eps) 160 | vx, vy = action_to_pair(A) 161 | 162 | steps = 0 163 | 164 | y_t.append(Is) 165 | x_t.append(Js) 166 | 167 | while grid[Is, Js] != GOAL
and steps <5000: 168 | # take action A 169 | Is_prime = Is + vy 170 | Js_prime = Js + vx 171 | 172 | Is_prime = min(H-1, max(0,Is_prime)) 173 | Js_prime = min(W-1, max(0,Js_prime)) 174 | 175 | if grid[Is_prime, Js_prime] == BLOCK: 176 | Is_prime = Is 177 | Js_prime = Js 178 | 179 | 180 | # choose A_prime from S_prime 181 | A_prime = epsilon_greedy(Q, Is_prime, Js_prime, eps) 182 | 183 | vx_prime, vy_prime = action_to_pair(A_prime) 184 | 185 | Is = Is_prime 186 | Js = Js_prime 187 | 188 | vx = vx_prime 189 | vy = vy_prime 190 | A = A_prime 191 | 192 | y_t.append(Is) 193 | x_t.append(Js) 194 | 195 | steps+=1 196 | 197 | if steps<5000: 198 | print 'reached goal in %i steps, episode %i'% (steps, -1) 199 | else: 200 | print 'could not reach goal in 5000 steps with greedy' 201 | 202 | plt.imshow(grid, interpolation='none') 203 | plt.plot(np.asarray(x_t), np.asarray(y_t),'o-') 204 | plt.scatter(x_t[-1], y_t[-1], color='red', s=50) 205 | 206 | plt.savefig('sample_iter_%02i.png' % it) 207 | 208 | ################################################################################ 209 | # MAIN LOOP 210 | 211 | START = 1 212 | GOAL = 3 213 | BLOCK = 2 214 | 215 | GAMMA = 0.95 216 | EPSILON = 0.1 217 | ALPHA = 0.1 218 | 219 | 220 | STEPS = 10000 221 | PLANNING_STEPS = 10 222 | 223 | 224 | STAY = False # whether not moving is an option 225 | 226 | grid = np.genfromtxt('grid.txt', delimiter=1).astype(int) 227 | 228 | 229 | actions = [0, 1, 2, 3] # up, down, left, right 230 | 231 | 232 | actions = np.asarray(actions).astype(int) 233 | 234 | H, W = grid.shape 235 | 236 | Q = np.zeros((H,W,4)) 237 | 238 | Model = np.ones((H,W,4,3)).astype(int)*-1 # R (1) and S' (2) for every S, A 239 | 240 | Q = dynaq(Q, Model, ALPHA, GAMMA, EPSILON, STEPS, PLANNING_STEPS) 241 | 242 | 243 | plot_sample(Q,1) 244 | -------------------------------------------------------------------------------- /ch12-eligibility_traces/windy.py: -------------------------------------------------------------------------------- 1 | """ Windy Gridworld Problem Using Eligibility Traces (Sutton and Barto 2nd edition, Chapter 12) 2 | 3 | Implements True Online Sarsa(lambda) with linear function approximation (Section 12.7). 4 | State-action features are one-hot (indicator) vectors. 5 | 6 | 7 | 8 | April 18, 2018 9 | 10 | jlezama@fing.edu.uy 11 | """ 12 | 13 | import os 14 | import numpy as np 15 | from scipy.stats.distributions import poisson 16 | 17 | 18 | import matplotlib 19 | # Force matplotlib to not use any Xwindows backend.
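# True Online Sarsa(lambda) with binary features (Sutton and Barto 2nd ed.,
# Section 12.7, p. 252). Per step, with x = x(S,A), x' = x(S',A'), Q = w.x and
# Q' = w.x', the updates applied in true_online_sarsa_lambda() below are
#   delta = R + gamma * Q' - Q
#   z     = gamma*lambda*z + (1 - alpha*gamma*lambda*(z.x)) * x
#   w    += alpha*(delta + Q - Q_old)*z - alpha*(Q - Q_old)*x
#   Q_old = Q'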
20 | matplotlib.use('Agg') 21 | import matplotlib.pyplot as plt 22 | 23 | ################################################################################ 24 | # MAIN FUNCTIONS 25 | 26 | def true_online_sarsa_lambda(w, alpha=0.125, lambda_=0.9, gamma=1.0): 27 | """ Sutton and Barto 2nd Edition, Section 12.7, Page 252 """ 28 | 29 | global grid, GOAL, EPISODES, MAX_STEPS 30 | 31 | 32 | counts = [] 33 | for ep in range(EPISODES): 34 | Is, Js = np.where(grid==START) 35 | 36 | Is = Is[0] 37 | Js = Js[0] 38 | 39 | 40 | epsilon = 0.1#max( 0.2*(1-ep/float(EPISODES))**2, 0.1) 41 | 42 | 43 | a = epsilon_greedy(Is, Js, epsilon=epsilon) 44 | 45 | 46 | xx = x(Is, Js, a) 47 | 48 | z = np.zeros_like(xx) 49 | 50 | Q_old = 0 51 | 52 | count = 0 53 | 54 | ep_s = [] 55 | ep_R = [] 56 | 57 | 58 | 59 | while grid[Is, Js] != GOAL and (count < MAX_STEPS): 60 | count+=1 61 | 62 | R, Is_prime, Js_prime = step(Is, Js, a) 63 | a_prime = epsilon_greedy(Is_prime, Js_prime, epsilon=epsilon) 64 | 65 | 66 | xx_prime = x(Is_prime, Js_prime, a_prime) 67 | 68 | Q = np.dot(w,xx) 69 | Q_prime = np.dot(w,xx_prime) 70 | 71 | 72 | delta = R + gamma * Q_prime - Q 73 | 74 | 75 | 76 | z = gamma * lambda_ * z + (1 - alpha * gamma * lambda_ * np.dot(z,xx)) * xx 77 | 78 | w_update = alpha * (delta + Q - Q_old) * z - alpha * (Q-Q_old) * xx 79 | 80 | # w_update = alpha* delta * xx 81 | 82 | #print np.max(np.abs(w_update))#, delta, 'Q', Q, [Is, Js],a , 'Q_prime', Q_prime, [Is_prime, Js_prime], a_prime 83 | 84 | w += w_update 85 | 86 | Q_old = Q_prime 87 | 88 | Is = Is_prime 89 | Js = Js_prime 90 | a = a_prime 91 | 92 | xx = x(Is, Js, a) 93 | 94 | ep_s.append([Is, Js]) 95 | 96 | 97 | counts.append(count) 98 | if ep % 100 == 0: 99 | print 'ep %i, count: %i' % (ep, count) #, ep_s 100 | 101 | # finished, plot steps per episode 102 | plot_curve(counts, 'true_online_sarsa_lambda_returns--alpha_%2.2e--lambda_%2.2e' % (alpha, lambda_)) 103 | ################################################################################ 104 | ## AUX FUNCTIONS 105 | ################################################################################ 106 | def epsilon_greedy(I,J, epsilon=0.1): 107 | if np.random.rand() < epsilon: 108 | # return random action 109 | return np.random.choice(9) 110 | 111 | max_q = -np.inf 112 | max_a = -1 113 | 114 | 115 | for a in range(9): 116 | if q(w,I,J,a) > max_q: 117 | max_q = q(w,I,J,a) 118 | max_a = a 119 | return max_a 120 | 121 | 122 | def step(Is, Js, a, stochastic_wind=0): 123 | """ do one step in windy gridworld """ 124 | global grid, GOAL 125 | 126 | vx, vy = action_to_pair(a) 127 | 128 | Is_prime = Is + vy - wind[Is,Js] + stochastic_wind * ( wind[Is,Js]>0) * np.random.choice([-1,0,1]) 129 | Js_prime = Js + vx 130 | 131 | Is_prime = min(H-1, max(0,Is_prime)) 132 | Js_prime = min(W-1, max(0,Js_prime)) 133 | 134 | 135 | if grid[Is_prime, Js_prime] == GOAL: 136 | R = 0 137 | else: 138 | R = -1 139 | 140 | return R, Is_prime, Js_prime 141 | 142 | ################################################################################ 143 | def x(I,J,a): 144 | """ one-hot vector for SxA... 
super inefficient """ 145 | 146 | global grid, dA, GOAL 147 | 148 | 149 | H, W = grid.shape 150 | 151 | xx = np.zeros(H*W*dA) 152 | 153 | if grid[I,J] == GOAL: 154 | return xx 155 | else: 156 | xx[I*W + J + H*W*a] = 1 157 | return xx 158 | 159 | 160 | 161 | 162 | def v(w,I,J): 163 | """ See Sutton and Barto 2nd edition 13.4, page 273 """ 164 | global grid, GOAL 165 | 166 | if grid[I,J] == GOAL: 167 | return 0 168 | 169 | return np.dot(w,x(I,J)) 170 | 171 | def q(w,I,J,a): 172 | return np.dot(w,x(I,J,a)) 173 | 174 | 175 | # non-king moves: 1, 3, 5, 7 176 | # king moves: 0, 1, 2, 3, 5, 6, 7, 8 177 | ################################################################################ 178 | def action_to_pair(a): 179 | assert(a>=0 and a<9) 180 | vx = int(np.floor(a/3)-1) 181 | vy = int(np.mod(a,3)-1) 182 | return vx, vy 183 | 184 | def pair_to_action(vx, vy): 185 | assert np.abs(vx)<=1 and np.abs(vy)<=1 186 | 187 | return int((vx+1)*3 + vy + 1) 188 | 189 | 190 | # ################################################################################ 191 | # def plot_sample(Q, it, stochastic_wind, eps=0): 192 | # global wind 193 | 194 | # plt.clf() 195 | # x_t = [] 196 | # y_t = [] 197 | 198 | # H, W = wind.shape 199 | 200 | # Is, Js = np.where(grid==START) 201 | 202 | # A, vx, vy = epsilon_greedy(Q, Is, Js, eps) 203 | 204 | # steps = 0 205 | 206 | # y_t.append(Is) 207 | # x_t.append(Js) 208 | 209 | # while grid[Is, Js] != GOAL and steps <5000: 210 | # # take action A 211 | # Is_prime = Is + vy - wind[Is,Js] + stochastic_wind * ( wind[Is,Js]>0) * np.random.choice([-1,0,1]) 212 | # Js_prime = Js + vx 213 | 214 | # Is_prime = min(H-1, max(0,Is_prime)) 215 | # Js_prime = min(W-1, max(0,Js_prime)) 216 | 217 | # # choose A_prime from S_prime 218 | # A_prime, vx_prime, vy_prime = epsilon_greedy(Q, Is_prime, Js_prime, eps) 219 | 220 | 221 | # Is = Is_prime 222 | # Js = Js_prime 223 | 224 | # vx = vx_prime 225 | # vy = vy_prime 226 | # A = A_prime 227 | 228 | # y_t.append(Is) 229 | # x_t.append(Js) 230 | 231 | # steps+=1 232 | 233 | # if steps<5000: 234 | # print 'reached goal in %i steps, episode %i'% (steps, -1) 235 | # else: 236 | # print 'couldnt reach goal in 5000 steps with greedy' 237 | 238 | # plt.imshow(wind, interpolation='none') 239 | # plt.plot(np.asarray(x_t), np.asarray(y_t),'o-') 240 | # plt.scatter(x_t[-1], y_t[-1], color='red', s=50) 241 | 242 | # plt.savefig('sample_iter_%02i.png' % it) 243 | 244 | 245 | def print_value_img(w,counts, ep): 246 | global grid 247 | H, W = grid.shape 248 | 249 | value = np.zeros((H,W)) 250 | for i in range(H): 251 | for j in range(W): 252 | value[i,j] = v(w, i, j) 253 | 254 | plt.clf() 255 | plt.imshow(value, interpolation='none') 256 | plt.colorbar() 257 | plt.savefig('value_imgs/value_%06i.png' % ep) 258 | 259 | plt.clf() 260 | plt.imshow(counts, interpolation='none') 261 | plt.colorbar() 262 | plt.savefig('value_imgs/counts_%06i.png' % ep) 263 | 264 | 265 | def plot_curve(a, title): 266 | a = np.asarray(a) 267 | plt.clf() 268 | plt.plot(a) 269 | plt.title(title) 270 | os.system('mkdir -p figures') 271 | plt.savefig('figures/%s.png' % title.replace(' ', '_')) 272 | 273 | 274 | 275 | 276 | ################################################################################ 277 | # MAIN LOOP 278 | 279 | START = 1 280 | GOAL = 2 281 | 282 | 283 | 284 | EPISODES = 10000 285 | MAX_STEPS = 100 286 | 287 | 288 | KINGS = True 289 | STAY = False # wether not moving is an option 290 | STOCHASTIC = False # stochastic wind 291 | 292 | wind = np.genfromtxt('wind.txt', 
delimiter=1).astype(int) 293 | grid = np.genfromtxt('grid.txt', delimiter=1) 294 | 295 | 296 | if KINGS: 297 | actions = [0, 1, 2, 3, 4, 5, 6, 7, 8] 298 | else: 299 | actions = [1, 3, 5, 7] 300 | 301 | if STAY: 302 | actions.append(4) 303 | 304 | actions = np.asarray(actions).astype(int) 305 | 306 | H, W = wind.shape 307 | 308 | 309 | ## 310 | ## Initialize parameters 311 | 312 | # type of representation: 3rd degree polynomial or one-hot vector (indicator) 313 | #REPR = 'polynomial' 314 | REPR = 'indicator' 315 | 316 | 317 | dA = 9 # dimension of possible actions 318 | 319 | w = np.zeros_like(x(0,0,0)) 320 | 321 | wind *= 0 322 | 323 | 324 | #print grid 325 | 326 | if __name__ == '__main__': 327 | 328 | true_online_sarsa_lambda(w) 329 | 330 | 331 | -------------------------------------------------------------------------------- /ch5-monte_carlo/racetrack.py: -------------------------------------------------------------------------------- 1 | """ Racetrack Problem. Exercise 5.8, Sutton and Barto 2nd edition. 2 | jlezama@fing.edu.uy 3 | """ 4 | 5 | import os 6 | import numpy as np 7 | from scipy.stats.distributions import poisson 8 | import matplotlib 9 | # Force matplotlib to not use any Xwindows backend. 10 | matplotlib.use('Agg') 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | ################################################################################ 15 | # MAIN FUNCTIONS 16 | def on_policy_mc_control(Q,C,pi): 17 | """ R contains returns """ 18 | global gamma, EPISODES, EPSILON 19 | 20 | for episode in range(EPISODES): 21 | 22 | if (episode % 10000)==0: 23 | print 'episode %i of %i' % (episode, EPISODES) 24 | plot_sample(pi, episode) 25 | plot(Q, pi, episode) 26 | EPSILON = max(0.1, EPSILON*.95) 27 | print 'new EPSILON', EPSILON 28 | 29 | 30 | 31 | S, A, R = generate_episode(pi) 32 | T = len(S) 33 | 34 | appeared = dict() 35 | 36 | for t in range(T-1): 37 | Rt = R[t] 38 | St = S[t] 39 | At = A[t] 40 | 41 | if not (St,At) in appeared.keys(): 42 | appeared[(St,At)] = True 43 | #print 'first (s, a) occurrence' 44 | else: 45 | #print '(s, a) already occurred' 46 | continue 47 | 48 | 49 | 50 | x_t = St[0] 51 | y_t = St[1] 52 | vx_t = St[2] 53 | vy_t = St[3] 54 | 55 | # compute return (should be only for first appearance of S,A, TODO) 56 | Gt = 0 57 | for i in range(t,T-1): 58 | Gt += (gamma**(i-t))*R[i] 59 | 60 | 61 | # print y_t, x_t, vx_t, vy_t,At 62 | C[y_t, x_t, vx_t, vy_t,At] += 1 63 | Q[y_t, x_t, vx_t, vy_t,At] += 1/C[y_t, x_t, vx_t, vy_t,At]*(Gt - Q[y_t, x_t, vx_t, vy_t,At]) 64 | 65 | 66 | 67 | for t in range(T): 68 | St = S[t] 69 | x_t = St[0] 70 | y_t = St[1] 71 | vx_t = St[2] 72 | vy_t = St[3] 73 | 74 | best_actions = np.where(Q[y_t,x_t,vx_t,vy_t,:]==np.max(Q[y_t,x_t,vx_t,vy_t,:]))[0] 75 | if len(best_actions)>1: 76 | best_action = np.random.choice(best_actions) 77 | else: 78 | best_action = best_actions[0] 79 | 80 | pi[y_t, x_t, vx_t, vy_t] = best_action 81 | 82 | 83 | 84 | return Q, pi 85 | 86 | def off_policy_mc_control(Q, C, pi): 87 | global gamma, EPISODES 88 | 89 | 90 | for iter in range(EPISODES): 91 | S, A, R = generate_episode() 92 | G = 0 93 | W = 1 94 | 95 | T = len(S) 96 | 97 | for t in reversed(range(T)): 98 | Rt = R[t-1] 99 | St = S[t-1] 100 | At = A[t-1] 101 | 102 | #print 'Rt', Rt, t, T 103 | 104 | 105 | x_t = St[0] 106 | y_t = St[1] 107 | vx_t = St[2] 108 | vy_t = St[3] 109 | 110 | G = gamma*G + Rt 111 | C[y_t,x_t,vx_t,vy_t,At] += W 112 | 113 | update = Q[y_t,x_t,vx_t,vy_t,At] + W/C[y_t,x_t,vx_t,vy_t,At] * ( G - Q[y_t,x_t,vx_t,vy_t,At] ) 114 | #print 
'update', update 115 | 116 | #print 'before', Q[y_t,x_t,vx_t,vy_t,:] 117 | Q[y_t,x_t,vx_t,vy_t,At] = update 118 | 119 | #print 'after', Q[y_t,x_t,vx_t,vy_t,:], At 120 | 121 | max_action = np.max(Q[y_t,x_t,vx_t,vy_t,:]) 122 | 123 | best_actions = np.where(Q[y_t,x_t,vx_t,vy_t,:]==max_action)[0] 124 | if len(best_actions)>1: 125 | best_action = np.random.choice(best_actions) 126 | else: 127 | best_action = best_actions[0] 128 | 129 | #print 'max_action', max_action, Q[y_t,x_t,vx_t,vy_t,:], 'At', At, 'best_actions', best_actions, 'best_action', best_action 130 | 131 | # if np.abs(Q[y_t,x_t,vx_t,vy_t,At] - Q[y_t,x_t,vx_t,vy_t,best_action])<1e0: 132 | # best_action = At 133 | #print 'best_action', best_action, Q[y_t,x_t,vx_t,vy_t,:], 'At', At 134 | 135 | pi[y_t,x_t,vx_t,vy_t] = best_action 136 | 137 | if best_action != At: 138 | print 'breaking!', T-t 139 | break 140 | 141 | dx, dy = action_to_pair(At) 142 | 143 | if dx ==0: 144 | pdx = 3/6. 145 | elif dx ==1: 146 | pdx = 2/6. 147 | else: 148 | pdx = 1/6. 149 | if dy == 1: 150 | pdy = 3/5. 151 | elif dy == 0: 152 | pdy = 1/5. 153 | elif dy ==-1: 154 | pdy = 1/5. 155 | 156 | 157 | W *= 1/(pdx*pdy) # TODO b is not random 158 | 159 | return Q, pi 160 | 161 | def generate_episode(pi=None, eps=None, noise=True): 162 | global track, actions, MAX_T, EPSILON 163 | 164 | H, W = track.shape 165 | 166 | if eps is None: 167 | eps = EPSILON 168 | 169 | 170 | 171 | # first state: random start location, 0 velocity 172 | x_0, y_0 = random_start() 173 | 174 | 175 | 176 | S = [(x_0, y_0, 0, 0)] 177 | A = [] 178 | R = [] 179 | 180 | for t in range(MAX_T): 181 | St = S[-1] 182 | x_t = St[0] 183 | y_t = St[1] 184 | vx_t = St[2] 185 | vy_t = St[3] 186 | 187 | 188 | 189 | # Noise with probability 0.1 at each time step the velocity increments are both zero 190 | if noise and np.random.rand()<0.1: 191 | delta_vx = 0 192 | delta_vy = 0 193 | At1 = pair_to_action(delta_vx, delta_vy) 194 | 195 | 196 | elif pi is None: 197 | # \epsilon-soft policy b 198 | delta_vx = np.random.choice([-1, 1, 1, 0, 0, 0]) # choose mostly no horiz accel 199 | delta_vy = np.random.choice([-1, 0, 1, 1, 1]) # choose mostly vert accel 200 | At1 = pair_to_action(delta_vx, delta_vy) 201 | 202 | else: 203 | At1 = pi[y_t,x_t,vx_t,vy_t] if np.random.rand()>eps else np.random.randint(9) 204 | delta_vx, delta_vy = action_to_pair(At1) 205 | 206 | 207 | vx_t1 = max(0,min(MAX_SPEED, vx_t + delta_vx)) 208 | vy_t1 = max(0,min(MAX_SPEED, vy_t + delta_vy)) 209 | 210 | if vx_t1==0 and vy_t1==0: 211 | #print 'both zero!',t, delta_vx, delta_vy, At1 212 | if np.random.rand()>0.5: 213 | vx_t1 = 1 214 | else: 215 | vy_t1 = 1 216 | At1 = pair_to_action(vx_t1, vy_t1) # should be 0,1 or 1,0 217 | assert (vx_t1+vy_t1)==1 218 | 219 | x_t1 = x_t + vx_t1 220 | y_t1 = y_t - vy_t1 # vertical is negative to go up in matrix 221 | 222 | 223 | # check if it went over boundary 224 | touched_boundary = False 225 | 226 | 227 | if x_t1 >= W or x_t1 < 0 or y_t1 >= H or y_t1 <0 or track[y_t1, x_t1] == BOUNDARY: 228 | touched_boundary = True 229 | 230 | 231 | for vxx in range(vx_t1): 232 | if touched_boundary: 233 | break 234 | for vyy in range(vy_t1): 235 | if track[y_t - vyy,x_t+vxx] == BOUNDARY: 236 | touched_boundary = True 237 | break 238 | 239 | 240 | if touched_boundary: 241 | x_t1, y_t1 = random_start() 242 | vx_t1 = 0 243 | vy_t1 = 0 244 | 245 | 246 | 247 | St1 = (x_t1, y_t1, vx_t1, vy_t1) 248 | 249 | 250 | Rt1 = -1 251 | 252 | 253 | S.append(St1) 254 | A.append(At1) 255 | R.append(Rt1) 256 | 257 | terminate = False 258 | if 
track[y_t1, x_t1] == FINISH: 259 | print 'FINISHED in %i steps!' % t 260 | terminate = True 261 | break 262 | # print St1, At1, Rt1 263 | if not terminate: 264 | print 'didnt make it to the end ----------', eps 265 | return S, A, R 266 | 267 | 268 | ################################################################################ 269 | # AUX FUNCTIONS 270 | 271 | def action_to_pair(a): 272 | assert(a>=0 and a<9) 273 | 274 | vx = int(np.floor(a/3)-1) 275 | vy = int(np.mod(a,3)-1) 276 | 277 | 278 | return vx, vy 279 | 280 | def pair_to_action(vx, vy): 281 | assert np.abs(vx)<=1 and np.abs(vy)<=1 282 | 283 | return int((vx+1)*3 + vy + 1) 284 | 285 | def random_start(): 286 | 287 | global track 288 | 289 | # possible start positions 290 | Is, Js = np.where(track==START) 291 | ix_start = np.random.randint(len(Is)) 292 | return Js[ix_start], Is[ix_start] # horizontal coord first 293 | 294 | 295 | def plot(Q, pi, it): 296 | os.system("mkdir -p figures") 297 | 298 | fig, axes = plt.subplots(MAX_SPEED+1, MAX_SPEED+2) 299 | 300 | ax = axes[0,0] 301 | 302 | im = ax.imshow(np.mean(Q, axis=(2,3,4)), interpolation='none') 303 | ax.set_title('Q') 304 | 305 | 306 | plt.colorbar(im, ax=ax) 307 | 308 | count = MAX_SPEED+3 309 | for vx in range(MAX_SPEED+1): 310 | for vy in range(MAX_SPEED+1): 311 | 312 | ax = axes[vx, 1+vy] 313 | im = ax.imshow(pi[:,:,vx,vy], interpolation='none') 314 | ax.set_title('pi (vx: %i, vy: %i)' % (vx, vy)) 315 | count += 1 316 | 317 | plt.colorbar(im, ax=ax) 318 | plt.savefig('figures/result_iter_%09i.png' % it) 319 | 320 | def plot_old(Q, pi, it): 321 | os.system("mkdir -p figures") 322 | 323 | fig, axes = plt.subplots(1, 2) 324 | 325 | ax = axes[0] 326 | im = ax.imshow(np.mean(Q, axis=(2,3,4)), interpolation='none') 327 | ax.set_title('Q') 328 | ax.set_xlabel('Location 1') 329 | ax.set_ylabel('Location 2') 330 | 331 | plt.colorbar(im, ax=ax) 332 | 333 | ax = axes[1] 334 | im = ax.imshow(np.mean(pi, axis=(2,3)), interpolation='none') 335 | ax.set_title('pi') 336 | ax.set_xlabel('Location 1') 337 | ax.set_ylabel('Location 2') 338 | 339 | plt.colorbar(im, ax=ax) 340 | plt.savefig('figures/result_iter_%09i.png' % it) 341 | 342 | def plot_sample(pi, it): 343 | os.system('mkdir -p samples') 344 | global track 345 | 346 | H, W = track.shape 347 | 348 | S, A, R = generate_episode(pi, eps=0, noise=False) 349 | 350 | f = open('samples/sample_iter_%09i.txt' % it, 'w') 351 | f.write(str(S)) 352 | f.close() 353 | 354 | plt.clf() 355 | x_t = [] 356 | y_t = [] 357 | 358 | for St in S: 359 | x_t.append(St[0]) 360 | y_t.append(St[1]) 361 | 362 | plt.imshow(track, interpolation='none') 363 | plt.plot(np.asarray(x_t), np.asarray(y_t),'o-') 364 | plt.scatter(x_t[-1], y_t[-1], color='red', s=50) 365 | 366 | plt.savefig('samples/sample_iter_%09i.png' % it) 367 | 368 | 369 | ################################################################################ 370 | # MAIN LOOP 371 | 372 | 373 | START = 2 374 | FINISH = 3 375 | TRACK = 1 376 | BOUNDARY = 0 377 | 378 | gamma = 0.9 379 | accelerate_factor = 2.0 380 | 381 | MAX_SPEED = 4 382 | MAX_T = 500 # Max episode length 383 | EPISODES = int(1e5) 384 | EPSILON = 0.5 385 | 386 | track = np.genfromtxt('racetrack.txt', delimiter=1) 387 | 388 | actions = range(9) # cartesian product of (-1, 0, 1) 389 | 390 | H, W = track.shape 391 | 392 | Q = np.ones((H,W,MAX_SPEED+1,MAX_SPEED+1,9))*(-1*MAX_T) # pessimism in the face of uncertainty :-) 393 | C = np.zeros((H,W,MAX_SPEED+1,MAX_SPEED+1,9)) 394 | 395 | 396 | pi = np.random.randint(low=0, high=9, 
size=(H,W,MAX_SPEED+1,MAX_SPEED+1)) 397 | 398 | Q, pi = on_policy_mc_control(Q,C,pi) 399 | 400 | 401 | plot(Q, pi, 0) 402 | 403 | # plot a few more samples episodes 404 | for i in range(10): 405 | plot_sample(pi, EPISODES+i) 406 | 407 | -------------------------------------------------------------------------------- /ch13-policy_gradient/windy.py: -------------------------------------------------------------------------------- 1 | """ Windy Gridworld Problem Using Policy Gradient (Sutton and Barto 2nd edition, Chapter 13) 2 | 3 | Includes REINFORCE (with and without baseline) and Actor-Critic 4 | State representation can be a 3rd order polynomial on position or a one-hot vector. 5 | 6 | 7 | 8 | April 11, 2018 9 | 10 | jlezama@fing.edu.uy 11 | """ 12 | 13 | import os 14 | import numpy as np 15 | from scipy.stats.distributions import poisson 16 | 17 | 18 | import matplotlib 19 | # Force matplotlib to not use any Xwindows backend. 20 | matplotlib.use('Agg') 21 | import matplotlib.pyplot as plt 22 | 23 | ################################################################################ 24 | # MAIN FUNCTIONS 25 | def actor_critic(w, theta, alpha_w=1e-3, alpha_theta=1e-2, gamma=0.99): 26 | global GOAL, EPISODES, MAX_STEPS 27 | 28 | Gs = [] 29 | 30 | count = 0 31 | for episode in range(EPISODES): 32 | 33 | 34 | 35 | if episode % 100 ==0: 36 | print 'NEW EPISODE %i/%i! (%i)' % (episode, EPISODES, count), compute_pi(theta, 3, 6), action_to_pair(np.random.choice(9, p=compute_pi(theta, 3, 6))) 37 | 38 | print 'values', v(w,3,0), v(w,3,1), v(w,3,2), v(w,3,3), v(w,3,4), v(w,3,5), v(w,3,6), v(w,3,7), v(w,3,8), v(w,3,9) 39 | 40 | 41 | count = 0 42 | 43 | counts = np.zeros_like(grid) 44 | 45 | Is, Js = np.where(grid==START) 46 | II = 1 47 | 48 | 49 | G = 0 50 | 51 | 52 | while grid[Is, Js] != GOAL and count < MAX_STEPS: 53 | 54 | pi = compute_pi(theta, Is, Js) 55 | 56 | #print 'pi', pi, [Is, Js] 57 | 58 | a = np.random.choice(9, p=pi) 59 | 60 | #a = 7 61 | 62 | # if 1:#np.random.rand()<1.0: 63 | # a = np.random.choice(9) 64 | 65 | R, Is_prime, Js_prime = step(Is, Js, a) 66 | 67 | G += R 68 | 69 | vx, vy = action_to_pair(a) 70 | 71 | delta = R + gamma * v(w, Is_prime, Js_prime) -v(w, Is, Js) 72 | 73 | w_update = alpha_w * II * delta * x(Is,Js) 74 | 75 | w += w_update 76 | 77 | theta_update = alpha_theta * II * delta * compute_grad(theta, Is, Js, a) 78 | theta += theta_update 79 | 80 | 81 | 82 | 83 | # print 'moving from [%i %i] to [%i %i] (%i: %i, %i)' % (Is, Js, Is_prime, Js_prime,a, vx, vy) 84 | # print 'pi', pi 85 | # print 'grad', compute_grad(theta, Is, Js, a)[3:] 86 | # # print 'delta', delta 87 | #print 'theta_update', theta_update 88 | # print '-----' 89 | 90 | #raise 91 | 92 | counts[Is, Js] += 1 93 | 94 | II *= gamma 95 | Is = Is_prime 96 | Js = Js_prime 97 | 98 | count +=1 99 | 100 | if count % 1000==0: 101 | print 'still computing', count 102 | print 'moving from [%i %i] to [%i %i] (%i: %i, %i)' % (Is, Js, Is_prime, Js_prime,a, vx, vy) 103 | print 'pi', pi 104 | print 'grad', compute_grad(theta, Is, Js, a) 105 | # print 'delta', delta 106 | print 'theta_update', theta_update 107 | print 'w_update', w_update 108 | print '-----' 109 | 110 | Gs.append(G) 111 | 112 | if episode % 1000 == 0: 113 | print_value_img(w, counts, episode) 114 | 115 | plot_curve(Gs, 'actor_critic_returns--alpha_theta_%2.2e--alpha_w_%2.2e--_%s' % (alpha_theta, alpha_w, REPR)) 116 | 117 | 118 | ################################################################################ 119 | 
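# For reference: the per-step update that actor_critic() above performs, written as a
# minimal standalone sketch. The helper names below (softmax, actor_critic_step) are
# illustrative only and are not defined anywhere in this repository.

import numpy as np

def softmax(scores):
    # numerically stable softmax over the action preferences
    scores = scores - np.max(scores)
    e = np.exp(scores)
    return e / np.sum(e)

def actor_critic_step(w, theta, x_s, x_s_next, a, R, I, alpha_w, alpha_theta, gamma=0.99):
    # one-step actor-critic with a linear critic v(s) = w . x(s) and a linear
    # softmax actor with per-action preferences theta[:, b] . x(s)
    delta = R + gamma * np.dot(w, x_s_next) - np.dot(w, x_s)   # TD error
    w = w + alpha_w * I * delta * x_s                          # critic update
    pi = softmax(np.dot(x_s, theta))                           # pi(.|s)
    # gradient of log pi(a|s): x(s)*(1 - pi(a|s)) for the taken action a,
    # -pi(b|s)*x(s) for every other action b (this is what compute_grad() returns)
    grad_log_pi = -np.outer(x_s, pi)
    grad_log_pi[:, a] += x_s
    theta = theta + alpha_theta * I * delta * grad_log_pi      # actor update
    return w, theta

# With REPR = 'indicator', x_s is the H*W one-hot vector from x_indicator(), so each
# step only touches the weights of the visited cell; at the goal state the code above
# uses v = 0 in place of w . x(s').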
################################################################################ 120 | def REINFORCE(theta, w, gamma=1.0, alpha_theta=1e-5, alpha_w=1e-7): 121 | global grid, GOAL, EPISODES, MAX_STEPS, REPR 122 | 123 | H,W = grid.shape 124 | 125 | counts = np.zeros((H,W)) 126 | 127 | G_0s = [] 128 | for ep in range(EPISODES): 129 | 130 | 131 | Is, Js = np.where(grid==START) 132 | 133 | 134 | Is = Is[0] 135 | Js =Js[0] 136 | 137 | 138 | pi = compute_pi(theta,Is, Js) 139 | a = np.random.choice(9, p=pi) 140 | 141 | 142 | counts *= 0 143 | 144 | # a = 7 145 | 146 | 147 | ep_s = [] 148 | ep_a = [] 149 | ep_R = [] 150 | 151 | ep_s.append([Is, Js]) 152 | ep_a.append(a) 153 | 154 | R, Is, Js = step(Is, Js, a) 155 | 156 | 157 | #print Is, Js, a, action_to_pair(a) 158 | 159 | 160 | ep_R.append(R) 161 | 162 | 163 | while grid[Is, Js] != GOAL and len(ep_s) < MAX_STEPS: [...] 248 | Is_prime = Is + vy - wind[Is,Js] + stochastic_wind * ( wind[Is,Js]>0) * np.random.choice([-1,0,1]) 249 | Js_prime = Js + vx 250 | 251 | Is_prime = min(H-1, max(0,Is_prime)) 252 | Js_prime = min(W-1, max(0,Js_prime)) 253 | 254 | 255 | if grid[Is_prime, Js_prime] == GOAL: 256 | R = 0 257 | else: 258 | R = -1 259 | 260 | return R, Is_prime, Js_prime 261 | 262 | ################################################################################ 263 | def x(I,J): 264 | global REPR 265 | if REPR == 'polynomial': 266 | return x_polynomial(I,J) 267 | elif REPR == 'indicator': 268 | return x_indicator(I,J) 269 | else: 270 | raise ValueError('unknown representation type') 271 | 272 | def x_indicator(I,J): 273 | global grid 274 | H, W = grid.shape 275 | 276 | xx = np.zeros(H*W) 277 | xx[I*W+J] = 1 278 | 279 | return xx 280 | 281 | def x_polynomial(I, J): 282 | # returns a vector representation of x 283 | global grid 284 | H,W = grid.shape 285 | 286 | 287 | xx = np.zeros(10) 288 | 289 | 290 | xx[0] = (I-H/2.)/float(H/2.) 291 | xx[1] = (J-W/2.)/float(W/2.)
292 | xx[2] = (xx[0])**2 293 | xx[3] = (xx[1])**2 294 | xx[4] = xx[0]*xx[1] 295 | 296 | xx[5] = xx[0]**3 297 | xx[6] = xx[1]**3 298 | xx[7] = xx[2]*xx[1] 299 | xx[8] = xx[3]*xx[0] 300 | 301 | 302 | xx[9] = 1 # bias term 303 | 304 | 305 | return xx 306 | 307 | 308 | ################################################################################ 309 | def compute_pi(theta, I, J): 310 | # compute soft-max for linear feature theta^T.x 311 | 312 | xx = x(I,J) 313 | 314 | scores = np.dot(xx.T, theta) 315 | 316 | scores -= np.max(scores) 317 | 318 | pi = np.exp(scores)/np.sum(np.exp(scores)) 319 | 320 | assert np.abs(np.sum(pi)-1)<1e-9, np.sum(pi) 321 | 322 | return pi 323 | 324 | ################################################################################ 325 | def compute_grad(theta,I,J,a): 326 | # compute soft-max for linear feature theta^T.x 327 | 328 | global actions 329 | pi = compute_pi(theta,I, J) 330 | 331 | 332 | grad = np.zeros_like(theta) 333 | 334 | 335 | for b in actions: 336 | if b==a: 337 | grad[:,b] = x(I,J)*(1-pi[b]) 338 | else: 339 | grad[:,b] = -1*pi[b]*x(I,J) 340 | 341 | #print '----' 342 | #print pi, grad[:,a],grad[:,a+1], x(I,J), a 343 | # raise 344 | 345 | return grad 346 | 347 | 348 | def v(w,I,J): 349 | """ See Sutton and Barto 2nd edition 13.4, page 273 """ 350 | global grid, GOAL 351 | 352 | if grid[I,J] == GOAL: 353 | return 0 354 | 355 | return np.dot(w,x(I,J)) 356 | 357 | 358 | 359 | # non-king moves: 1, 3, 5, 7 360 | # king moves: 0, 1, 2, 3, 5, 6, 7, 8 361 | ################################################################################ 362 | def action_to_pair(a): 363 | assert(a>=0 and a<9) 364 | vx = int(np.floor(a/3)-1) 365 | vy = int(np.mod(a,3)-1) 366 | return vx, vy 367 | 368 | def pair_to_action(vx, vy): 369 | assert np.abs(vx)<=1 and np.abs(vy)<=1 370 | 371 | return int((vx+1)*3 + vy + 1) 372 | 373 | 374 | # ################################################################################ 375 | # def plot_sample(Q, it, stochastic_wind, eps=0): 376 | # global wind 377 | 378 | # plt.clf() 379 | # x_t = [] 380 | # y_t = [] 381 | 382 | # H, W = wind.shape 383 | 384 | # Is, Js = np.where(grid==START) 385 | 386 | # A, vx, vy = epsilon_greedy(Q, Is, Js, eps) 387 | 388 | # steps = 0 389 | 390 | # y_t.append(Is) 391 | # x_t.append(Js) 392 | 393 | # while grid[Is, Js] != GOAL and steps <5000: 394 | # # take action A 395 | # Is_prime = Is + vy - wind[Is,Js] + stochastic_wind * ( wind[Is,Js]>0) * np.random.choice([-1,0,1]) 396 | # Js_prime = Js + vx 397 | 398 | # Is_prime = min(H-1, max(0,Is_prime)) 399 | # Js_prime = min(W-1, max(0,Js_prime)) 400 | 401 | # # choose A_prime from S_prime 402 | # A_prime, vx_prime, vy_prime = epsilon_greedy(Q, Is_prime, Js_prime, eps) 403 | 404 | 405 | # Is = Is_prime 406 | # Js = Js_prime 407 | 408 | # vx = vx_prime 409 | # vy = vy_prime 410 | # A = A_prime 411 | 412 | # y_t.append(Is) 413 | # x_t.append(Js) 414 | 415 | # steps+=1 416 | 417 | # if steps<5000: 418 | # print 'reached goal in %i steps, episode %i'% (steps, -1) 419 | # else: 420 | # print 'couldnt reach goal in 5000 steps with greedy' 421 | 422 | # plt.imshow(wind, interpolation='none') 423 | # plt.plot(np.asarray(x_t), np.asarray(y_t),'o-') 424 | # plt.scatter(x_t[-1], y_t[-1], color='red', s=50) 425 | 426 | # plt.savefig('sample_iter_%02i.png' % it) 427 | 428 | 429 | def print_value_img(w,counts, ep): 430 | global grid 431 | H, W = grid.shape 432 | 433 | value = np.zeros((H,W)) 434 | for i in range(H): 435 | for j in range(W): 436 | value[i,j] = v(w, i, j) 437 | 438 
| plt.clf() 439 | plt.imshow(value, interpolation='none') 440 | plt.colorbar() 441 | plt.savefig('value_imgs/value_%06i.png' % ep) 442 | 443 | plt.clf() 444 | plt.imshow(counts, interpolation='none') 445 | plt.colorbar() 446 | plt.savefig('value_imgs/counts_%06i.png' % ep) 447 | 448 | 449 | def plot_curve(a, title): 450 | a = np.asarray(a) 451 | plt.clf() 452 | plt.plot(a) 453 | plt.title(title) 454 | plt.savefig('figures/%s.png' % title.replace(' ', '_')) 455 | 456 | 457 | 458 | 459 | ################################################################################ 460 | # MAIN LOOP 461 | 462 | START = 1 463 | GOAL = 2 464 | 465 | 466 | 467 | EPISODES = 10000 468 | MAX_STEPS = 100 469 | 470 | 471 | 472 | KINGS = True 473 | STAY = False # whether not moving is an option 474 | STOCHASTIC = False # stochastic wind 475 | 476 | wind = np.genfromtxt('wind.txt', delimiter=1).astype(int) 477 | grid = np.genfromtxt('grid.txt', delimiter=1) 478 | 479 | 480 | if KINGS: 481 | actions = [0, 1, 2, 3, 4, 5, 6, 7, 8] 482 | else: 483 | actions = [1, 3, 5, 7] 484 | 485 | if STAY: 486 | actions.append(4) 487 | 488 | actions = np.asarray(actions).astype(int) 489 | 490 | H, W = wind.shape 491 | 492 | 493 | ## 494 | ## Initialize parameters 495 | 496 | # type of representation: 3rd degree polynomial or one-hot vector (indicator) 497 | #REPR = 'polynomial' 498 | REPR = 'indicator' 499 | 500 | 501 | dA = 9 # dimension of possible actions 502 | dX = int(x(0,0).shape[0]) #H*W # dimension of x 503 | 504 | theta = np.zeros((dX,dA)) 505 | w = np.zeros(dX) 506 | 507 | wind *= 0 # note: this zeroes the wind field, so the runs below use a wind-free grid 508 | 509 | 510 | #print grid 511 | 512 | if __name__ == '__main__': 513 | 514 | actor_critic(w, theta) 515 | 516 | #REINFORCE(theta,w) 517 | 518 | --------------------------------------------------------------------------------
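For the eligibility-trace exercise in ch12-eligibility_traces/windy.py, the inner-loop update that true_online_sarsa_lambda() applies at every step (true online Sarsa(lambda) with binary features, Sutton and Barto 2nd edition, Section 12.7) can be summarized by the following minimal sketch. The function and argument names here are illustrative only and do not exist in the repository; linear action values q(s,a) = w . x(s,a) are assumed.

import numpy as np

def true_online_sarsa_step(w, z, x_cur, x_next, R, Q_old, alpha, gamma, lam):
    # linear action values under the current weights
    Q = np.dot(w, x_cur)
    Q_next = np.dot(w, x_next)
    delta = R + gamma * Q_next - Q
    # dutch eligibility trace: decay, then add the current features with a correction
    z = gamma * lam * z + (1.0 - alpha * gamma * lam * np.dot(z, x_cur)) * x_cur
    # weight update; the extra (Q - Q_old) terms make the update exactly
    # equivalent to the online lambda-return algorithm
    w = w + alpha * (delta + Q - Q_old) * z - alpha * (Q - Q_old) * x_cur
    # the caller stores Q_next as Q_old for the next step
    return w, z, Q_next

Setting lam = 0 gives z = x_cur and the update collapses to w += alpha * delta * x_cur, i.e. one-step semi-gradient Sarsa (the commented-out w_update in the file), which is a convenient sanity check against the curves in ch12-eligibility_traces/figures/.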