├── .gitignore ├── ch6-td ├── standard.png ├── grid.txt ├── wind.txt ├── kings_stochastic_wind.png ├── kings_deterministic_wind.png ├── kings_stochastic_wind_can_stay.png ├── standard_moves_stochastic_wind.png └── windy.py ├── ch8-dyna-q ├── grid.txt ├── sample_iter_01.png └── dyna-maze.py ├── ch12-eligibility_traces ├── grid.txt ├── wind.txt ├── figures │ ├── true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_0.00e+00.png │ ├── true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_1.00e+00.png │ ├── true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_5.00e-01.png │ ├── true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_9.00e-01.png │ └── true_online_sarsa_lambda_returns--alpha_1.25e-03--lambda_5.00e-01.png └── windy.py ├── ch13-policy_gradient ├── grid.txt ├── wind.txt ├── value_imgs │ ├── value_000000.png │ ├── value_001000.png │ ├── value_002000.png │ ├── value_003000.png │ ├── value_004000.png │ ├── value_005000.png │ ├── value_006000.png │ ├── value_007000.png │ ├── value_008000.png │ ├── value_009000.png │ ├── counts_000000.png │ ├── counts_001000.png │ ├── counts_002000.png │ ├── counts_003000.png │ ├── counts_004000.png │ ├── counts_005000.png │ ├── counts_006000.png │ ├── counts_007000.png │ ├── counts_008000.png │ └── counts_009000.png ├── figures │ ├── REINFORCE_returns--alpha_theta_1.00e-03--alpha_w_1.00e-03--_indicator.png │ ├── REINFORCE_returns--alpha_theta_1.00e-04--alpha_w_1.00e-05--_polynomial.png │ ├── REINFORCE_returns--alpha_theta_2.00e-03--alpha_w_1.00e-03--_indicator.png │ ├── REINFORCE_returns--alpha_theta_5.00e-04--alpha_w_1.00e-03--_indicator.png │ ├── REINFORCE_returns--alpha_theta_5.00e-04--alpha_w_1.00e-04--_indicator.png │ ├── REINFORCE_returns--alpha_theta_5.00e-05--alpha_w_1.00e-05--_polynomial.png │ ├── actor_critic_returns--alpha_theta_1.00e-02--alpha_w_1.00e-03--_indicator.png │ ├── actor_critic_returns--alpha_theta_1.00e-03--alpha_w_1.00e-04--_indicator.png │ ├── actor_critic_returns--alpha_theta_5.00e-03--alpha_w_1.00e-03--_indicator.png │ ├── actor_critic_returns--alpha_theta_5.00e-04--alpha_w_1.00e-04--_polynomial.png │ └── actor_critic_returns--alpha_theta_5.00e-04--alpha_w_1.00e-05--_polynomial.png ├── reinforce.py ├── reinforce_with_baseline.py └── windy.py ├── ch5-monte_carlo ├── result_iter_000100000.png ├── sample_iter_000100000.png ├── racetrack_tiny.txt ├── racetrack_small.txt ├── racetrack.txt └── racetrack.py ├── README.md ├── ch11-function_approximation └── baird_counterexample.py ├── ch4-value_iteration └── gambler.py └── ch4-policy_iteration └── jacksrental_v1.py /.gitignore: -------------------------------------------------------------------------------- 1 | sandbox/ 2 | *.DS_Store 3 | *~ -------------------------------------------------------------------------------- /ch6-td/standard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch6-td/standard.png -------------------------------------------------------------------------------- /ch8-dyna-q/grid.txt: -------------------------------------------------------------------------------- 1 | 0000000023 2 | 0020000020 3 | 1020000020 4 | 0020000000 5 | 0000002000 6 | 0000000000 7 | -------------------------------------------------------------------------------- /ch6-td/grid.txt: -------------------------------------------------------------------------------- 1 | 0000000000 2 | 0000000000 3 | 0000000000 4 | 1000000200 5 | 0000000000 6 | 0000000000 7 | 0000000000 8 | 
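The grid.txt and wind.txt files above and below are plain digit matrices, one character per cell; the gridworld scripts in this repo (windy.py, dyna-maze.py) read them with np.genfromtxt using a fixed field width of 1. A minimal loading sketch (the cell codes follow the START/GOAL/BLOCK constants defined in each script, e.g. 1 = start and 2 = goal in ch6-td and ch12, but 1 = start, 2 = wall, 3 = goal in ch8-dyna-q):

import numpy as np

grid = np.genfromtxt('grid.txt', delimiter=1).astype(int)  # one digit per cell
wind = np.genfromtxt('wind.txt', delimiter=1).astype(int)  # upward wind strength per column
start_row, start_col = np.where(grid == 1)                 # 1 marks the start cell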
-------------------------------------------------------------------------------- /ch6-td/wind.txt: -------------------------------------------------------------------------------- 1 | 0001112210 2 | 0001112210 3 | 0001112210 4 | 0001112210 5 | 0001112210 6 | 0001112210 7 | 0001112210 8 | -------------------------------------------------------------------------------- /ch8-dyna-q/sample_iter_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch8-dyna-q/sample_iter_01.png -------------------------------------------------------------------------------- /ch6-td/kings_stochastic_wind.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch6-td/kings_stochastic_wind.png -------------------------------------------------------------------------------- /ch6-td/kings_deterministic_wind.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch6-td/kings_deterministic_wind.png -------------------------------------------------------------------------------- /ch12-eligibility_traces/grid.txt: -------------------------------------------------------------------------------- 1 | 0000000000 2 | 0000000000 3 | 0000000000 4 | 1000000200 5 | 0000000000 6 | 0000000000 7 | 0000000000 8 | -------------------------------------------------------------------------------- /ch12-eligibility_traces/wind.txt: -------------------------------------------------------------------------------- 1 | 0001112210 2 | 0001112210 3 | 0001112210 4 | 0001112210 5 | 0001112210 6 | 0001112210 7 | 0001112210 8 | -------------------------------------------------------------------------------- /ch13-policy_gradient/grid.txt: -------------------------------------------------------------------------------- 1 | 0000000000 2 | 0000000000 3 | 0000000000 4 | 0000100200 5 | 0000000000 6 | 0000000000 7 | 0000000000 8 | -------------------------------------------------------------------------------- /ch13-policy_gradient/wind.txt: -------------------------------------------------------------------------------- 1 | 0001112210 2 | 0001112210 3 | 0001112210 4 | 0001112210 5 | 0001112210 6 | 0001112210 7 | 0001112210 8 | -------------------------------------------------------------------------------- /ch5-monte_carlo/result_iter_000100000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch5-monte_carlo/result_iter_000100000.png -------------------------------------------------------------------------------- /ch5-monte_carlo/sample_iter_000100000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch5-monte_carlo/sample_iter_000100000.png -------------------------------------------------------------------------------- /ch6-td/kings_stochastic_wind_can_stay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch6-td/kings_stochastic_wind_can_stay.png -------------------------------------------------------------------------------- /ch6-td/standard_moves_stochastic_wind.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch6-td/standard_moves_stochastic_wind.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_000000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_000000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_001000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_001000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_002000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_002000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_003000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_003000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_004000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_004000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_005000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_005000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_006000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_006000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_007000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_007000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_008000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_008000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/value_009000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/value_009000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_000000.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_000000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_001000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_001000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_002000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_002000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_003000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_003000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_004000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_004000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_005000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_005000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_006000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_006000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_007000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_007000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_008000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_008000.png -------------------------------------------------------------------------------- /ch13-policy_gradient/value_imgs/counts_009000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/value_imgs/counts_009000.png -------------------------------------------------------------------------------- /ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_0.00e+00.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_0.00e+00.png -------------------------------------------------------------------------------- /ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_1.00e+00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_1.00e+00.png -------------------------------------------------------------------------------- /ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_5.00e-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_5.00e-01.png -------------------------------------------------------------------------------- /ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_9.00e-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-01--lambda_9.00e-01.png -------------------------------------------------------------------------------- /ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-03--lambda_5.00e-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch12-eligibility_traces/figures/true_online_sarsa_lambda_returns--alpha_1.25e-03--lambda_5.00e-01.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_1.00e-03--alpha_w_1.00e-03--_indicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_1.00e-03--alpha_w_1.00e-03--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_1.00e-04--alpha_w_1.00e-05--_polynomial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_1.00e-04--alpha_w_1.00e-05--_polynomial.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_2.00e-03--alpha_w_1.00e-03--_indicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_2.00e-03--alpha_w_1.00e-03--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_5.00e-04--alpha_w_1.00e-03--_indicator.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_5.00e-04--alpha_w_1.00e-03--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_5.00e-04--alpha_w_1.00e-04--_indicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_5.00e-04--alpha_w_1.00e-04--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_5.00e-05--alpha_w_1.00e-05--_polynomial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/REINFORCE_returns--alpha_theta_5.00e-05--alpha_w_1.00e-05--_polynomial.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_1.00e-02--alpha_w_1.00e-03--_indicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_1.00e-02--alpha_w_1.00e-03--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_1.00e-03--alpha_w_1.00e-04--_indicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_1.00e-03--alpha_w_1.00e-04--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_5.00e-03--alpha_w_1.00e-03--_indicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_5.00e-03--alpha_w_1.00e-03--_indicator.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_5.00e-04--alpha_w_1.00e-04--_polynomial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_5.00e-04--alpha_w_1.00e-04--_polynomial.png -------------------------------------------------------------------------------- /ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_5.00e-04--alpha_w_1.00e-05--_polynomial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlezama/rl-book-exercises/HEAD/ch13-policy_gradient/figures/actor_critic_returns--alpha_theta_5.00e-04--alpha_w_1.00e-05--_polynomial.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repo contains my solutions to programming exercises in the book 2 | Reinforcement Learning: An Introduction, by Sutton and 
Barto, 2nd Edition 3 | (2018). 4 | 5 | It also contains implementations of some RL algorithms presented in the book that are not required as exercises. 6 | 7 | These scripts should only be considered as a reference. Use at your own risk. 8 | 9 | -------------------------------------------------------------------------------- /ch5-monte_carlo/racetrack_tiny.txt: -------------------------------------------------------------------------------- 1 | 11113 2 | 11100 3 | 11100 4 | 22000 -------------------------------------------------------------------------------- /ch5-monte_carlo/racetrack_small.txt: -------------------------------------------------------------------------------- 1 | 11111113 2 | 11111113 3 | 11111113 4 | 11111113 5 | 11111000 6 | 11110000 7 | 11110000 8 | 11110000 9 | 11110000 10 | 22220000 -------------------------------------------------------------------------------- /ch11-function_approximation/baird_counterexample.py: -------------------------------------------------------------------------------- 1 | """ Sutton and Barto 2nd edition, Exercise 11.3 2 | Divergence of semi-gradient Q-learning in Baird's counterexample 3 | 4 | jlezama@fing.edu.uy 5 | """ 6 | 7 | import os 8 | import numpy as np 9 | from scipy.stats.distributions import poisson 10 | 11 | 12 | import matplotlib 13 | # Force matplotlib to not use any Xwindows backend.
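# The one_step() function below applies the off-policy semi-gradient update of
# Exercise 11.3 (Baird's counterexample):
#     delta_t = R_{t+1} + gamma * (w_t . x_{t+1}) - (w_t . x_t)
#     w_{t+1} = w_t + alpha * rho_t * delta_t * x_t
# where rho_t is the importance-sampling ratio of the target policy over the
# behavior policy, and the rows of the matrix X defined further down are the
# usual Baird features (2 on the state's own component and 1 on the shared
# eighth component for the first six states, reversed for the seventh).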
14 | matplotlib.use('Agg') 15 | import matplotlib.pyplot as plt 16 | 17 | 18 | 19 | ################################################################################ 20 | 21 | 22 | def one_step(w_t, x_t, x_tp1, rho_t, gamma=0.99, alpha=0.01, R_tp1=0): 23 | 24 | delta_t = R_tp1 + gamma * np.dot(w_t,x_tp1) - np.dot(w_t, x_t) 25 | w_tp1 = w_t + alpha * rho_t * delta_t * x_t 26 | return w_tp1 27 | 28 | 29 | ################################################################################ 30 | Ns = 7 # number of states 31 | 32 | # states matrix 33 | X = np.asarray([[2,0,0,0,0,0,0,1],[0,2,0,0,0,0,0,1],[0,0,2,0,0,0,0,1],[0,0,0,2,0,0,0,1],[0,0,0,0,2,0,0,1],[0,0,0,0,0,2,0,1],[0,0,0,0,0,0,1,2]]) 34 | 35 | 36 | 37 | # initial state is state 7 38 | i_t = Ns 39 | x_t = X[i_t-1] 40 | 41 | # initital weight vector 42 | w_t = np.ones(8) 43 | 44 | 45 | 46 | for it in range(100000): 47 | if i_t == Ns: 48 | i_tp1 = np.random.randint(Ns+1) 49 | else: 50 | i_tp1 = Ns 51 | 52 | # importance sampling: target policy over behavior 53 | if i_t ==Ns and i_tp1 THETA: 18 | print 'entering while' 19 | Delta = 0 20 | for s in range(100): 21 | v = V[s] 22 | 23 | argmax_a = -np.inf 24 | max_return = -np.inf 25 | for a in range(min(s,100-s)+1): 26 | expected_return = 0 27 | for s_prime in [s-a, s+a]: 28 | P, R = PR(s,a,s_prime) 29 | expected_return += P * (R + gamma * V[s_prime]) 30 | 31 | 32 | if expected_return> max_return: 33 | max_return = expected_return 34 | argmax_a = a 35 | # if expected_return == max_return: 36 | # argmax_a = np.random.choice([a, argmax_a]) 37 | 38 | V[s] = max_return 39 | pi[s] = argmax_a 40 | Delta = max(Delta, np.abs(v-V[s])) 41 | 42 | it+=1 43 | plot(V, pi, it) 44 | 45 | return V, pi 46 | 47 | def PR(s, a, s_prime): 48 | global p_h 49 | 50 | 51 | # with probabilty p_h you get s+a, with probability 1-p_h you get s-a 52 | if s_prime == s+a: 53 | return p_h, int(s_prime==100) 54 | elif s_prime == s-a: 55 | return 1-p_h, 0 56 | 57 | else: 58 | return 0, 0 59 | 60 | 61 | def plot(V, pi, it): 62 | global fig, axes 63 | os.system("mkdir -p gambler_figures") 64 | 65 | fig, axes = plt.subplots(1, 2) 66 | 67 | ax = axes[0] 68 | im = ax.plot(V) 69 | ax.set_title('V') 70 | 71 | ax = axes[1] 72 | im = ax.bar(range(101),pi) 73 | ax.set_title('pi') 74 | 75 | plt.savefig('gambler_figures/result_iter_%02i.png' % it) 76 | 77 | plt.clf() 78 | 79 | ################################################################################ 80 | # MAIN LOOP 81 | 82 | THETA = 1e-16 83 | p_h = 0.4 84 | 85 | gamma = 1; 86 | 87 | V = np.zeros(101) 88 | pi = np.zeros(101) 89 | 90 | 91 | V, pi = value_iteration(V, pi, PR, gamma) 92 | 93 | 94 | -------------------------------------------------------------------------------- /ch13-policy_gradient/reinforce.py: -------------------------------------------------------------------------------- 1 | """ Sutton and Barto 2nd edition, Chapter 13. Policy Gradient Methods 2 | Implementation of REINFORCE algorithm for the short corridor example 3 | 4 | jlezama@fing.edu.uy 5 | """ 6 | 7 | import os 8 | import numpy as np 9 | from scipy.stats.distributions import poisson 10 | 11 | 12 | import matplotlib 13 | # Force matplotlib to not use any Xwindows backend. 14 | matplotlib.use('Agg') 15 | import matplotlib.pyplot as plt 16 | 17 | 18 | 19 | ################################################################################ 20 | def one_step(s,a): 21 | """receives current state and action s,a, returns reward and next state, r, 22 | s_prime. 
a is either 0 (left) or 1 (right) 23 | """ 24 | 25 | R = -1 26 | if s == 0: 27 | s_prime = a # left (0) goes to state 0, right (1) goes to state 1 28 | elif s == 1: 29 | s_prime = 2 if a ==0 else 0 # reversed motion 30 | elif s == 2: 31 | s_prime = 3 if a == 1 else 1 32 | 33 | return R, s_prime 34 | 35 | 36 | ################################################################################ 37 | def x(s,a): 38 | xs = np.asarray([[0,1],[1,0]]) 39 | x = xs[a] 40 | return x 41 | 42 | 43 | ################################################################################ 44 | def compute_pi(theta, s): 45 | # compute soft-max for linear feature theta^T.x 46 | h = np.zeros(2) 47 | 48 | for a in range(2): 49 | h[a] = np.dot(x(s,a), theta) 50 | 51 | h -= np.max(h) 52 | 53 | pi = np.exp(h)/np.sum(np.exp(h)) 54 | 55 | return pi 56 | 57 | 58 | ################################################################################ 59 | def compute_grad(theta,s,a): 60 | # compute soft-max for linear feature theta^T.x 61 | pi = compute_pi(theta,s) 62 | 63 | 64 | not_a = np.abs(a-1) 65 | 66 | grad = x(s,a) - pi[not_a] * x(s,not_a) 67 | 68 | 69 | return grad 70 | 71 | def v(w,s): 72 | """ See Sutton and Barto 2nd edition 13.4, page 273 """ 73 | return w 74 | 75 | 76 | ################################################################################ 77 | def REINFORCE(theta, gamma = 1.0, alpha=2**-13): 78 | EPISODES= 200 79 | 80 | GOAL = 3 81 | 82 | 83 | G_0s = [] 84 | for ep in range(EPISODES): 85 | G = 0 86 | s = 0 87 | a = np.argmax(compute_pi(theta,s)) 88 | 89 | ep_s = [] 90 | ep_a = [] 91 | ep_R = [] 92 | 93 | ep_s.append(s) 94 | ep_a.append(a) 95 | 96 | R, s = one_step(s,a) 97 | 98 | ep_R.append(R) 99 | 100 | 101 | while s != GOAL: 102 | 103 | pi = compute_pi(theta, s) 104 | a = np.random.choice(2, p=pi) 105 | 106 | # print s,pi,a 107 | # if np.random.rand()<0.1: 108 | # a = np.random.choice([0,1]) 109 | 110 | 111 | R, s = one_step(s,a) 112 | 113 | ep_s.append(s) 114 | ep_a.append(a) 115 | 116 | ep_R.append(R) 117 | 118 | # print 'GOAL!' 119 | ep_R = np.asarray(ep_R) 120 | 121 | for t in range(len(ep_s)): 122 | G_t = np.sum(ep_R[t:]) 123 | theta += alpha * (gamma**t) * G_t * compute_grad(theta, ep_s[t],ep_a[t]) 124 | 125 | G_0 = np.sum(ep_R) 126 | 127 | #print 'ep %i, G_0 %f' % (ep, G_0), theta 128 | G_0s.append(G_0) 129 | 130 | return theta, np.asarray(G_0s) 131 | 132 | 133 | ################################################################################ 134 | # MAIN LOOP 135 | 136 | RUNS = 100 137 | 138 | theta = np.random.randn(2) 139 | 140 | theta, G_0s = REINFORCE(theta) 141 | 142 | G_0s = G_0s.reshape(1,-1) 143 | 144 | for i in range(RUNS): 145 | theta = np.random.randn(2) 146 | theta, G_0s_t = REINFORCE(theta) 147 | G_0s = np.concatenate((G_0s, G_0s_t.reshape(1,-1)), axis=0) 148 | 149 | print 'RUN %i/%i' % (i,RUNS) 150 | 151 | print G_0s.shape 152 | print np.mean(G_0s,axis=0) 153 | 154 | 155 | savefname = 'G.png' 156 | plt.plot(np.mean(G_0s, axis=0)) 157 | plt.savefig(savefname) 158 | 159 | 160 | os.system('open %s' % savefname) 161 | -------------------------------------------------------------------------------- /ch13-policy_gradient/reinforce_with_baseline.py: -------------------------------------------------------------------------------- 1 | """ Sutton and Barto 2nd edition, Chapter 13. 
Policy Gradient Methods 2 | Implementation of REINFORCE algorithm for the short corridor example 3 | 4 | jlezama@fing.edu.uy 5 | """ 6 | 7 | import os 8 | import numpy as np 9 | from scipy.stats.distributions import poisson 10 | 11 | 12 | import matplotlib 13 | # Force matplotlib to not use any Xwindows backend. 14 | matplotlib.use('Agg') 15 | import matplotlib.pyplot as plt 16 | 17 | 18 | 19 | ################################################################################ 20 | def one_step(s,a): 21 | """receives current state and action s,a, returns reward and next state, r, 22 | s_prime. a is either 0 (left) or 1 (right) 23 | """ 24 | 25 | R = -1 26 | if s == 0: 27 | s_prime = a # left (0) goes to state 0, right (1) goes to state 1 28 | elif s == 1: 29 | s_prime = 2 if a ==0 else 0 # reversed motion 30 | elif s == 2: 31 | s_prime = 3 if a == 1 else 1 32 | 33 | return R, s_prime 34 | 35 | 36 | ################################################################################ 37 | def x(s,a): 38 | xs = np.asarray([[0,1],[1,0]]) 39 | x = xs[a] 40 | return x 41 | 42 | 43 | ################################################################################ 44 | def compute_pi(theta, s): 45 | # compute soft-max for linear feature theta^T.x 46 | h = np.zeros(2) 47 | 48 | for a in range(2): 49 | h[a] = np.dot(x(s,a), theta) 50 | 51 | h -= np.max(h) 52 | 53 | pi = np.exp(h)/np.sum(np.exp(h)) 54 | 55 | return pi 56 | 57 | 58 | 59 | ################################################################################ 60 | def compute_grad(theta,s,a): 61 | # compute soft-max for linear feature theta^T.x 62 | pi = compute_pi(theta,s) 63 | 64 | 65 | not_a = np.abs(a-1) 66 | 67 | grad = x(s,a) - np.dot(pi[not_a], x(s,not_a)) 68 | 69 | 70 | return grad 71 | 72 | def v(w,s): 73 | """ See Sutton and Barto 2nd edition 13.4, page 273 """ 74 | return w 75 | 76 | 77 | ################################################################################ 78 | def REINFORCE(theta, w, gamma = 1.0, alpha_theta=2**-9, alpha_w=2**-6): 79 | EPISODES= 200 80 | 81 | GOAL = 3 82 | 83 | 84 | G_0s = [] 85 | for ep in range(EPISODES): 86 | G = 0 87 | s = 0 88 | a = np.argmax(compute_pi(theta,s)) 89 | 90 | ep_s = [] 91 | ep_a = [] 92 | ep_R = [] 93 | 94 | ep_s.append(s) 95 | ep_a.append(a) 96 | 97 | R, s = one_step(s,a) 98 | 99 | ep_R.append(R) 100 | 101 | 102 | while s != GOAL: 103 | 104 | pi = compute_pi(theta, s) 105 | a = np.random.choice(2, p=pi) 106 | 107 | # print s,pi,a 108 | # if np.random.rand()<0.1: 109 | # a = np.random.choice([0,1]) 110 | 111 | 112 | R, s = one_step(s,a) 113 | 114 | ep_s.append(s) 115 | ep_a.append(a) 116 | 117 | ep_R.append(R) 118 | 119 | # print 'GOAL!' 
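# REINFORCE with baseline (Sutton and Barto 2nd ed., Section 13.4): the loop below
# walks the finished episode and, for each step t, forms the return G_t, the error
# delta_t = G_t - v(w, .), and then applies
#     w     += alpha_w     * gamma^t * delta_t              (baseline update; v is a
#                                                            single scalar here, so
#                                                            its gradient w.r.t. w is 1)
#     theta += alpha_theta * gamma^t * delta_t * compute_grad(theta, S_t, A_t)
# Note that v(w, s) ignores its state argument: the baseline is state-independent.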
120 | ep_R = np.asarray(ep_R) 121 | 122 | for t in range(len(ep_s)): 123 | G_t = np.sum(ep_R[t:]) 124 | delta_t = G_t - v(w,s) 125 | 126 | w += alpha_w * (gamma**t) * delta_t 127 | 128 | theta += alpha_theta * (gamma**t) * delta_t * compute_grad(theta, ep_s[t],ep_a[t]) 129 | 130 | G_0 = np.sum(ep_R) 131 | 132 | #print 'ep %i, G_0 %f' % (ep, G_0), theta 133 | G_0s.append(G_0) 134 | 135 | return theta, np.asarray(G_0s) 136 | 137 | 138 | ################################################################################ 139 | # MAIN LOOP 140 | 141 | RUNS = 100 142 | 143 | theta = np.random.randn(2) 144 | 145 | w = np.random.randn(1) 146 | 147 | theta, G_0s = REINFORCE(theta, w) 148 | 149 | G_0s = G_0s.reshape(1,-1) 150 | 151 | for i in range(RUNS): 152 | theta = np.random.randn(2) 153 | w = np.random.rand(1) 154 | 155 | theta, G_0s_t = REINFORCE(theta,w) 156 | G_0s = np.concatenate((G_0s, G_0s_t.reshape(1,-1)), axis=0) 157 | 158 | print 'RUN %i/%i' % (i,RUNS) 159 | 160 | print G_0s.shape 161 | 162 | 163 | 164 | savefname = 'G_baseline.png' 165 | plt.plot(np.mean(G_0s, axis=0)) 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | plt.savefig(savefname) 175 | 176 | 177 | os.system('open %s' % savefname) 178 | -------------------------------------------------------------------------------- /ch6-td/windy.py: -------------------------------------------------------------------------------- 1 | """ Windy Gridworld Problem. Exercises 6.9 and 6.10, Sutton and Barto 2nd edition. 2 | jlezama@fing.edu.uy 3 | """ 4 | 5 | import os 6 | import numpy as np 7 | from scipy.stats.distributions import poisson 8 | 9 | 10 | import matplotlib 11 | # Force matplotlib to not use any Xwindows backend. 12 | matplotlib.use('Agg') 13 | import matplotlib.pyplot as plt 14 | 15 | ################################################################################ 16 | # MAIN FUNCTIONS 17 | def sarsa(Q, EPISODES, alpha, gamma, eps, stochastic_wind): 18 | global wind, grid, START, GOAL 19 | 20 | H, W = wind.shape 21 | 22 | 23 | for episode in range(EPISODES): 24 | 25 | Is, Js = np.where(grid==START) 26 | # S = (Is, Js) 27 | 28 | A, vx, vy = epsilon_greedy(Q, Is, Js, eps) 29 | 30 | steps = 0 31 | 32 | while grid[Is, Js] != GOAL: 33 | # take action A 34 | Is_prime = Is + vy - wind[Is,Js] + stochastic_wind * ( wind[Is,Js]>0) * np.random.choice([-1,0,1]) 35 | Js_prime = Js + vx 36 | 37 | Is_prime = min(H-1, max(0,Is_prime)) 38 | Js_prime = min(W-1, max(0,Js_prime)) 39 | 40 | # choose A_prime from S_prime 41 | A_prime, vx_prime, vy_prime = epsilon_greedy(Q, Is_prime, Js_prime, eps) 42 | 43 | Q[Is, Js, A] += alpha * (-1 + gamma*Q[Is_prime, Js_prime, A_prime] - Q[Is,Js,A]) 44 | 45 | Is = Is_prime 46 | Js = Js_prime 47 | 48 | vx = vx_prime 49 | vy = vy_prime 50 | A = A_prime 51 | 52 | steps+=1 53 | 54 | if steps %1000 ==0: 55 | print 'step %i' % steps, Is, Js, Is_prime, Js_prime, vy, vx 56 | 57 | print 'reached goal in %i steps, episode %i'% (steps, episode) 58 | 59 | return Q 60 | 61 | ################################################################################ 62 | # AUX FUNCTIONS 63 | 64 | def epsilon_greedy(Q, Is, Js, eps): 65 | global actions 66 | 67 | 68 | best_actions_ix = np.where(Q[Is, Js, actions]==np.max(Q[Is, Js, actions]))[0] 69 | 70 | 71 | if len(best_actions_ix)>1: 72 | best_action_ix = np.random.choice(best_actions_ix) 73 | else: 74 | best_action_ix = best_actions_ix[0] 75 | 76 | best_action = actions[best_action_ix] if np.random.rand()>eps else np.random.choice(actions) 77 | 78 | vx, vy = action_to_pair(best_action) 79 
| 80 | return best_action, vx, vy 81 | 82 | def action_to_pair(a): 83 | assert(a>=0 and a<9) 84 | vx = int(np.floor(a/3)-1) 85 | vy = int(np.mod(a,3)-1) 86 | return vx, vy 87 | 88 | def pair_to_action(vx, vy): 89 | assert np.abs(vx)<=1 and np.abs(vy)<=1 90 | 91 | return int((vx+1)*3 + vy + 1) 92 | 93 | 94 | # 0 (-1, -1) 95 | # 1 (-1, 0) 96 | # 2 (-1, 1) 97 | # 3 (0, -1) 98 | # 4 (0, 0) 99 | # 5 (0, 1) 100 | # 6 (1, -1) 101 | # 7 (1, 0) 102 | # 8 (1, 1) 103 | 104 | # non-king moves: 1, 3, 5, 7 105 | # king moves: 0, 1, 2, 3, 5, 6, 7, 8 106 | 107 | def plot_sample(Q, it, stochastic_wind, eps=0): 108 | global wind 109 | 110 | plt.clf() 111 | x_t = [] 112 | y_t = [] 113 | 114 | H, W = wind.shape 115 | 116 | Is, Js = np.where(grid==START) 117 | 118 | A, vx, vy = epsilon_greedy(Q, Is, Js, eps) 119 | 120 | steps = 0 121 | 122 | y_t.append(Is) 123 | x_t.append(Js) 124 | 125 | while grid[Is, Js] != GOAL and steps <5000: 126 | # take action A 127 | Is_prime = Is + vy - wind[Is,Js] + stochastic_wind * ( wind[Is,Js]>0) * np.random.choice([-1,0,1]) 128 | Js_prime = Js + vx 129 | 130 | Is_prime = min(H-1, max(0,Is_prime)) 131 | Js_prime = min(W-1, max(0,Js_prime)) 132 | 133 | # choose A_prime from S_prime 134 | A_prime, vx_prime, vy_prime = epsilon_greedy(Q, Is_prime, Js_prime, eps) 135 | 136 | 137 | Is = Is_prime 138 | Js = Js_prime 139 | 140 | vx = vx_prime 141 | vy = vy_prime 142 | A = A_prime 143 | 144 | y_t.append(Is) 145 | x_t.append(Js) 146 | 147 | steps+=1 148 | 149 | if steps<5000: 150 | print 'reached goal in %i steps, episode %i'% (steps, -1) 151 | else: 152 | print 'couldnt reach goal in 5000 steps with greedy' 153 | 154 | plt.imshow(wind, interpolation='none') 155 | plt.plot(np.asarray(x_t), np.asarray(y_t),'o-') 156 | plt.scatter(x_t[-1], y_t[-1], color='red', s=50) 157 | 158 | plt.savefig('sample_iter_%02i.png' % it) 159 | 160 | ################################################################################ 161 | # MAIN LOOP 162 | 163 | START = 1 164 | GOAL = 2 165 | 166 | GAMMA = 1 167 | EPSILON = 0.1 168 | ALPHA = 0.5 169 | 170 | 171 | EPISODES = 1500 172 | 173 | 174 | KINGS = False 175 | STAY = False # wether not moving is an option 176 | STOCHASTIC = False # stochastic wind 177 | 178 | wind = np.genfromtxt('wind.txt', delimiter=1).astype(int) 179 | grid = np.genfromtxt('grid.txt', delimiter=1) 180 | 181 | 182 | if KINGS: 183 | actions = [0, 1, 2, 3, 5, 6, 7, 8] 184 | else: 185 | actions = [1, 3, 5, 7] 186 | 187 | if STAY: 188 | actions.append(4) 189 | 190 | actions = np.asarray(actions).astype(int) 191 | 192 | H, W = wind.shape 193 | 194 | Q = np.zeros((H,W,9)) 195 | 196 | Q = sarsa(Q, EPISODES, ALPHA, GAMMA, EPSILON, STOCHASTIC) 197 | 198 | 199 | plot_sample(Q,1, STOCHASTIC) 200 | -------------------------------------------------------------------------------- /ch4-policy_iteration/jacksrental_v1.py: -------------------------------------------------------------------------------- 1 | """ Jack's Car Rental Problem. Exercise 4.5, Sutton and Barto 2nd edition. 
2 | jlezama@fing.edu.uy 3 | """ 4 | import os 5 | import numpy as np 6 | from scipy.stats.distributions import poisson 7 | import matplotlib 8 | import matplotlib.pyplot as plt 9 | 10 | def policy_evaluation(V, pi, PR, gamma): 11 | """ 12 | V should be a dict containing values for all states 13 | pi should be a dict containing the chosen action for each state 14 | PR(s, a, s_prime) returns the transition probability P(s'|s,a) 15 | and the expected reward R(s,a,s') 16 | gamma is the discount factor 17 | """ 18 | 19 | global MAX_CARS, THETA 20 | 21 | 22 | 23 | Delta = np.inf 24 | 25 | while Delta > THETA: 26 | print 'entering while' 27 | Delta = 0 28 | 29 | for s1 in range(MAX_CARS): 30 | for s2 in range(MAX_CARS): 31 | 32 | print 'value evaluation for state %i,%i' % (s1, s2) 33 | v = V[s1,s2] 34 | a = pi[s1,s2] 35 | V[s1,s2] = 0 36 | for s1_prime in range(MAX_CARS): 37 | for s2_prime in range(MAX_CARS): 38 | # V[s] += P[s,a,s_prime]*(R[s,a,s_prime] + gamma*V[s_prime]) # dictionary version 39 | P_sas_prime, R_sas_prime = PR((s1,s2), a, (s1_prime, s2_prime)) # function version 40 | V[s1, s2] += P_sas_prime*(R_sas_prime + gamma*V[s1_prime, s2_prime]) # function version 41 | Delta = max(Delta, abs(v - V[s1, s2])) 42 | print Delta 43 | 44 | return V 45 | 46 | def policy_improvement(V, pi, actions, PR, gamma): 47 | policy_stable = True 48 | 49 | 50 | for s1 in range(MAX_CARS): 51 | for s2 in range(MAX_CARS): 52 | 53 | print 'policy improvement for state %i/%i' % (s1, s2) 54 | old_action = pi[s1, s2] 55 | 56 | max_return = -np.inf 57 | argmax_a = -np.inf 58 | 59 | for a in actions: 60 | expected_return = 0 61 | for s1_prime in range(MAX_CARS): 62 | for s2_prime in range(MAX_CARS): 63 | 64 | 65 | P_sas_prime, R_sas_prime = PR((s1,s2), a, (s1_prime, s2_prime)) # function version 66 | expected_return += P_sas_prime*(R_sas_prime + gamma*V[s1_prime, s2_prime]) # function version 67 | 68 | if expected_return > max_return: 69 | max_return = expected_return 70 | argmax_a = a 71 | 72 | pi[s1, s2] = argmax_a 73 | 74 | if old_action != pi[s1, s2]: 75 | policy_stable = False 76 | 77 | return pi, policy_stable 78 | 79 | 80 | 81 | ################################################################################ 82 | # REWARD AND TRANSITION PROBABILITIES 83 | ################################################################################ 84 | def PR(s, a, s_prime): 85 | global lambda_ret1, lambda_req1, lambda_ret2, lambda_req2, MAX_TRIPS 86 | 87 | if np.abs(a)>MAX_TRIPS: 88 | # cannot move more than MAX_TRIPS cars overnight 89 | return 0, 0 90 | morning_loc1 = s[0] + a 91 | morning_loc2 = s[1] - a 92 | 93 | night_loc1 = s_prime[0] 94 | night_loc2 = s_prime[1] 95 | 96 | P1, R1 = prob_ret_req(morning_loc1, night_loc1, lambda_ret1, lambda_req1) 97 | P2, R2 = prob_ret_req(morning_loc2, night_loc2, lambda_ret2, lambda_req2) 98 | 99 | P = P1 * P2 100 | R = R1 + R2 - np.abs(a)*2 101 | 102 | return P, R 103 | 104 | def prob_ret_req(n_morning, n_night, lambda_ret, lambda_req): 105 | """ 106 | Probability for one agency of having n_morning cars in the morning and 107 | n_night cars in the night. Depends on the probabilities of returns and 108 | requests, as well as the max car availability.
109 | """ 110 | prob = 0 111 | difference = n_night - n_morning 112 | R = 0 113 | 114 | for ret in range(int(10*lambda_ret)): 115 | for req in range(int(10*lambda_req)): 116 | if ret-req != difference: 117 | continue 118 | p_ret = poisson.pmf(ret, lambda_ret) 119 | p_req = poisson.pmf(req, lambda_req) 120 | 121 | 122 | 123 | prob += p_ret*p_req 124 | 125 | R += p_ret * p_req * req * 10 # expected reward 126 | 127 | return prob, R 128 | 129 | def plot(V, pi, it): 130 | os.system("mkdir -p figures") 131 | 132 | fig, axes = plt.subplots(1, 2) 133 | 134 | ax = axes[0] 135 | im = ax.imshow(V, interpolation='none') 136 | ax.set_title('V') 137 | ax.set_xlabel('Location 1') 138 | ax.set_ylabel('Location 2') 139 | 140 | plt.colorbar(im, ax=ax) 141 | 142 | ax = axes[1] 143 | im = ax.imshow(pi, interpolation='none') 144 | ax.set_title('pi') 145 | ax.set_xlabel('Location 1') 146 | ax.set_ylabel('Location 2') 147 | 148 | plt.colorbar(im, ax=ax) 149 | plt.savefig('figures/result_iter_%02i.png' % it) 150 | 151 | ################################################################################ 152 | # MAIN LOOP 153 | ################################################################################ 154 | 155 | acc_factor = 2.0 156 | 157 | THETA = 5.0 158 | MAX_CARS = int(20/acc_factor) 159 | MAX_TRIPS = int(5/acc_factor) 160 | 161 | # DEFINE PARAMETERS 162 | 163 | actions = range(-1*MAX_TRIPS, MAX_TRIPS+1) 164 | 165 | V = np.zeros((MAX_CARS, MAX_CARS)) 166 | pi = np.zeros((MAX_CARS, MAX_CARS)) 167 | 168 | 169 | gamma = 0.9 170 | 171 | lambda_ret1 = 3/acc_factor 172 | lambda_ret2 = 2/acc_factor 173 | lambda_req1 = 3/acc_factor 174 | lambda_req2 = 4/acc_factor 175 | 176 | 177 | # RUN ITERATIONS 178 | policy_stable = False 179 | 180 | it = 0 181 | 182 | plot(V, pi, it) 183 | 184 | while not policy_stable: 185 | V = policy_evaluation(V, pi, PR, gamma) 186 | pi, policy_stable = policy_improvement(V, pi, actions, PR, gamma) 187 | it += 1 188 | plot(V, pi, it) 189 | -------------------------------------------------------------------------------- /ch8-dyna-q/dyna-maze.py: -------------------------------------------------------------------------------- 1 | """ Dyna-Q Maze Exercise 8.4, based on Example 8.1 Sutton and Barto 2nd edition. 2 | jlezama@fing.edu.uy 3 | 4 | TODO: implement exploration bonus 5 | 6 | """ 7 | 8 | import os 9 | import numpy as np 10 | from scipy.stats.distributions import poisson 11 | 12 | 13 | import matplotlib 14 | # Force matplotlib to not use any Xwindows backend. 
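# Tabular Dyna-Q (Sutton and Barto 2nd ed., Section 8.2). The step labels (d), (e),
# (f) in dynaq() below follow the boxed algorithm in the book:
#   (d) one-step Q-learning update from the real transition (S, A, R, S'),
#   (e) deterministic model update  Model(S, A) <- R, S',
#   (f) `planning_steps` extra Q-learning updates on previously visited
#       state-action pairs, using transitions replayed from the model.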
15 | matplotlib.use('Agg') 16 | import matplotlib.pyplot as plt 17 | 18 | ################################################################################ 19 | # MAIN FUNCTIONS 20 | def dynaq(Q, Model, alpha, gamma, eps, steps, planning_steps): 21 | global grid, START, GOAL, BLOCK 22 | 23 | H, W = grid.shape 24 | 25 | Is, Js = np.where(grid==START) 26 | 27 | goals = 0 28 | 29 | for step in range(steps): 30 | 31 | A = epsilon_greedy(Q, Is, Js, eps) 32 | 33 | vx, vy = action_to_pair(A) 34 | 35 | 36 | Is_prime = Is + vy 37 | Js_prime = Js + vx 38 | 39 | Is_prime = min(H-1, max(0,Is_prime)) 40 | Js_prime = min(W-1, max(0,Js_prime)) 41 | 42 | if grid[Is_prime, Js_prime] == BLOCK: 43 | Is_prime = Is 44 | Js_prime = Js 45 | 46 | 47 | reached_goal = False 48 | if grid[Is_prime, Js_prime] == GOAL: 49 | R = 1 50 | reached_goal = True 51 | goals += 1 52 | else: 53 | R = 0 54 | 55 | # step (d) 56 | Q[Is, Js, A] = Q[Is, Js, A] + alpha * (R + gamma * np.max(Q[Is_prime, Js_prime,:]) -Q[Is, Js, A]) 57 | 58 | # step (e) 59 | Model[Is, Js, A, 0] = R # reward 60 | Model[Is, Js, A, 1] = Is_prime # s' 61 | Model[Is, Js, A, 2] = Js_prime 62 | 63 | 64 | 65 | # step (f) 66 | # get visited states 67 | Is_visited, Js_visited, A_visited = np.where(Model[:,:,:,1] >= 0) # -1 is default unvisited state 68 | for n in range(planning_steps): 69 | ix = np.random.randint(Is_visited.shape[0]) 70 | Is_n = Is_visited[ix] 71 | Js_n = Js_visited[ix] 72 | A_n = A_visited[ix] 73 | 74 | R_n = Model[Is_n, Js_n, A_n, 0] 75 | Is_prime_n = Model[Is_n, Js_n, A_n, 1] 76 | Js_prime_n = Model[Is_n, Js_n, A_n, 2] 77 | 78 | Q[Is_n, Js_n, A_n] = Q[Is_n, Js_n, A_n] + alpha * (R_n + gamma * np.max(Q[Is_prime_n, Js_prime_n, :]) - Q[Is_n, Js_n, A_n]) 79 | 80 | 81 | 82 | if reached_goal: 83 | Is, Js = np.where(grid==START) 84 | else: 85 | Is = Is_prime 86 | Js = Js_prime 87 | 88 | if step %1000 ==0: 89 | print 'step', step, Is, Js, Is_prime, Js_prime, vy, vx, 'reached goal %i times' % goals 90 | 91 | 92 | 93 | return Q 94 | 95 | ################################################################################ 96 | # AUX FUNCTIONS 97 | 98 | def epsilon_greedy(Q, Is, Js, eps): 99 | global actions 100 | 101 | 102 | best_actions_ix = np.where(Q[Is, Js, actions]==np.max(Q[Is, Js, actions]))[0] 103 | 104 | 105 | if len(best_actions_ix)>1: 106 | best_action_ix = np.random.choice(best_actions_ix) 107 | else: 108 | best_action_ix = best_actions_ix[0] 109 | 110 | best_action = actions[best_action_ix] if np.random.rand()>eps else np.random.choice(actions) 111 | 112 | return best_action 113 | 114 | def action_to_pair(a): 115 | # very lazy way to implement this 116 | 117 | if a == 0: 118 | vy = -1 119 | vx = 0 120 | elif a == 1: 121 | vy = 1 122 | vx = 0 123 | elif a == 2: 124 | vy = 0 125 | vx = -1 126 | elif a == 3: 127 | vy = 0 128 | vx = 1 129 | else: 130 | raise ValueError('Invalid action') 131 | 132 | return vx, vy 133 | 134 | def pair_to_action(vx, vy): 135 | if (-1, 0) == (vy, vx): 136 | return 0 137 | elif (1, 0) == (vy, vx): 138 | return 1 139 | elif (0, -1) == (vy, vx): 140 | return 2 141 | elif (0, 1) == (vy, vx): 142 | return 3 143 | else: 144 | raise ValueError('Invalid pair') 145 | 146 | 147 | 148 | def plot_sample(Q, it, eps=0): 149 | global grid 150 | 151 | plt.clf() 152 | x_t = [] 153 | y_t = [] 154 | 155 | H, W = grid.shape 156 | 157 | Is, Js = np.where(grid==START) 158 | 159 | A = epsilon_greedy(Q, Is, Js, eps) 160 | vx, vy = action_to_pair(A) 161 | 162 | steps = 0 163 | 164 | y_t.append(Is) 165 | x_t.append(Js) 166 | 167 | while grid[Is, Js] != GOAL
and steps <5000: 168 | # take action A 169 | Is_prime = Is + vy 170 | Js_prime = Js + vx 171 | 172 | Is_prime = min(H-1, max(0,Is_prime)) 173 | Js_prime = min(W-1, max(0,Js_prime)) 174 | 175 | if grid[Is_prime, Js_prime] == BLOCK: 176 | Is_prime = Is 177 | Js_prime = Js 178 | 179 | 180 | # choose A_prime from S_prime 181 | A_prime = epsilon_greedy(Q, Is_prime, Js_prime, eps) 182 | 183 | vx_prime, vy_prime = action_to_pair(A_prime) 184 | 185 | Is = Is_prime 186 | Js = Js_prime 187 | 188 | vx = vx_prime 189 | vy = vy_prime 190 | A = A_prime 191 | 192 | y_t.append(Is) 193 | x_t.append(Js) 194 | 195 | steps+=1 196 | 197 | if steps<5000: 198 | print 'reached goal in %i steps, episode %i'% (steps, -1) 199 | else: 200 | print 'could not reach goal in 5000 steps with greedy' 201 | 202 | plt.imshow(grid, interpolation='none') 203 | plt.plot(np.asarray(x_t), np.asarray(y_t),'o-') 204 | plt.scatter(x_t[-1], y_t[-1], color='red', s=50) 205 | 206 | plt.savefig('sample_iter_%02i.png' % it) 207 | 208 | ################################################################################ 209 | # MAIN LOOP 210 | 211 | START = 1 212 | GOAL = 3 213 | BLOCK = 2 214 | 215 | GAMMA = 0.95 216 | EPSILON = 0.1 217 | ALPHA = 0.1 218 | 219 | 220 | STEPS = 10000 221 | PLANNING_STEPS = 10 222 | 223 | 224 | STAY = False # whether not moving is an option 225 | 226 | grid = np.genfromtxt('grid.txt', delimiter=1).astype(int) 227 | 228 | 229 | actions = [0, 1, 2, 3] # up, down, left, right 230 | 231 | 232 | actions = np.asarray(actions).astype(int) 233 | 234 | H, W = grid.shape 235 | 236 | Q = np.zeros((H,W,4)) 237 | 238 | Model = np.ones((H,W,4,3)).astype(int)*-1 # R (1) and S' (2) for every S, A 239 | 240 | Q = dynaq(Q, Model, ALPHA, GAMMA, EPSILON, STEPS, PLANNING_STEPS) 241 | 242 | 243 | plot_sample(Q,1) 244 | -------------------------------------------------------------------------------- /ch12-eligibility_traces/windy.py: -------------------------------------------------------------------------------- 1 | """ Windy Gridworld Problem Using Eligibility Traces (Sutton and Barto 2nd edition, Chapter 12) 2 | 3 | Implements True Online Sarsa(lambda) with linear function approximation (Section 12.7). 4 | State-action features are one-hot (indicator) vectors. 5 | 6 | 7 | 8 | April 18, 2018 9 | 10 | jlezama@fing.edu.uy 11 | """ 12 | 13 | import os 14 | import numpy as np 15 | from scipy.stats.distributions import poisson 16 | 17 | 18 | import matplotlib 19 | # Force matplotlib to not use any Xwindows backend.
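# True Online Sarsa(lambda) with binary features (Sutton and Barto 2nd ed.,
# Section 12.7, p. 252). Per step, with x = x(S,A), x' = x(S',A'), Q = w.x and
# Q' = w.x', the updates applied in true_online_sarsa_lambda() below are
#   delta = R + gamma * Q' - Q
#   z     = gamma*lambda*z + (1 - alpha*gamma*lambda*(z.x)) * x
#   w    += alpha*(delta + Q - Q_old)*z - alpha*(Q - Q_old)*x
#   Q_old = Q'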
20 | matplotlib.use('Agg') 21 | import matplotlib.pyplot as plt 22 | 23 | ################################################################################ 24 | # MAIN FUNCTIONS 25 | 26 | def true_online_sarsa_lambda(w, alpha=0.125, lambda_=0.9, gamma=1.0): 27 | """ Sutton and Barto 2nd Edition, Section 12.7, Page 252 """ 28 | 29 | global grid, GOAL, EPISODES, MAX_STEPS 30 | 31 | 32 | counts = [] 33 | for ep in range(EPISODES): 34 | Is, Js = np.where(grid==START) 35 | 36 | Is = Is[0] 37 | Js = Js[0] 38 | 39 | 40 | epsilon = 0.1#max( 0.2*(1-ep/float(EPISODES))**2, 0.1) 41 | 42 | 43 | a = epsilon_greedy(Is, Js, epsilon=epsilon) 44 | 45 | 46 | xx = x(Is, Js, a) 47 | 48 | z = np.zeros_like(xx) 49 | 50 | Q_old = 0 51 | 52 | count = 0 53 | 54 | ep_s = [] 55 | ep_R = [] 56 | 57 | 58 | 59 | while grid[Is, Js] != GOAL and (count < MAX_STEPS): 60 | count+=1 61 | 62 | R, Is_prime, Js_prime = step(Is, Js, a) 63 | a_prime = epsilon_greedy(Is_prime, Js_prime, epsilon=epsilon) 64 | 65 | 66 | xx_prime = x(Is_prime, Js_prime, a_prime) 67 | 68 | Q = np.dot(w,xx) 69 | Q_prime = np.dot(w,xx_prime) 70 | 71 | 72 | delta = R + gamma * Q_prime - Q 73 | 74 | 75 | 76 | z = gamma * lambda_ * z + (1 - alpha * gamma * lambda_ * np.dot(z,xx)) * xx 77 | 78 | w_update = alpha * (delta + Q - Q_old) * z - alpha * (Q-Q_old) * xx 79 | 80 | # w_update = alpha* delta * xx 81 | 82 | #print np.max(np.abs(w_update))#, delta, 'Q', Q, [Is, Js],a , 'Q_prime', Q_prime, [Is_prime, Js_prime], a_prime 83 | 84 | w += w_update 85 | 86 | Q_old = Q_prime 87 | 88 | Is = Is_prime 89 | Js = Js_prime 90 | a = a_prime 91 | 92 | xx = x(Is, Js, a) 93 | 94 | ep_s.append([Is, Js]) 95 | 96 | 97 | counts.append(count) 98 | if ep % 100 == 0: 99 | print 'ep %i, count: %i' % (ep, count) #, ep_s 100 | 101 | # finished, plot steps per episode 102 | plot_curve(counts, 'true_online_sarsa_lambda_returns--alpha_%2.2e--lambda_%2.2e' % (alpha, lambda_)) 103 | ################################################################################ 104 | ## AUX FUNCTIONS 105 | ################################################################################ 106 | def epsilon_greedy(I,J, epsilon=0.1): 107 | if np.random.rand() < epsilon: 108 | # return random action 109 | return np.random.choice(9) 110 | 111 | max_q = -np.inf 112 | max_a = -1 113 | 114 | 115 | for a in range(9): 116 | if q(w,I,J,a) > max_q: 117 | max_q = q(w,I,J,a) 118 | max_a = a 119 | return max_a 120 | 121 | 122 | def step(Is, Js, a, stochastic_wind=0): 123 | """ do one step in windy gridworld """ 124 | global grid, GOAL 125 | 126 | vx, vy = action_to_pair(a) 127 | 128 | Is_prime = Is + vy - wind[Is,Js] + stochastic_wind * ( wind[Is,Js]>0) * np.random.choice([-1,0,1]) 129 | Js_prime = Js + vx 130 | 131 | Is_prime = min(H-1, max(0,Is_prime)) 132 | Js_prime = min(W-1, max(0,Js_prime)) 133 | 134 | 135 | if grid[Is_prime, Js_prime] == GOAL: 136 | R = 0 137 | else: 138 | R = -1 139 | 140 | return R, Is_prime, Js_prime 141 | 142 | ################################################################################ 143 | def x(I,J,a): 144 | """ one-hot vector for SxA... 
super inefficient """ 145 | 146 | global grid, dA, GOAL 147 | 148 | 149 | H, W = grid.shape 150 | 151 | xx = np.zeros(H*W*dA) 152 | 153 | if grid[I,J] == GOAL: 154 | return xx 155 | else: 156 | xx[I*W + J + H*W*a] = 1 157 | return xx 158 | 159 | 160 | 161 | 162 | def v(w,I,J): 163 | """ See Sutton and Barto 2nd edition 13.4, page 273 """ 164 | global grid, GOAL 165 | 166 | if grid[I,J] == GOAL: 167 | return 0 168 | 169 | return np.dot(w,x(I,J)) 170 | 171 | def q(w,I,J,a): 172 | return np.dot(w,x(I,J,a)) 173 | 174 | 175 | # non-king moves: 1, 3, 5, 7 176 | # king moves: 0, 1, 2, 3, 5, 6, 7, 8 177 | ################################################################################ 178 | def action_to_pair(a): 179 | assert(a>=0 and a<9) 180 | vx = int(np.floor(a/3)-1) 181 | vy = int(np.mod(a,3)-1) 182 | return vx, vy 183 | 184 | def pair_to_action(vx, vy): 185 | assert np.abs(vx)<=1 and np.abs(vy)<=1 186 | 187 | return int((vx+1)*3 + vy + 1) 188 | 189 | 190 | # ################################################################################ 191 | # def plot_sample(Q, it, stochastic_wind, eps=0): 192 | # global wind 193 | 194 | # plt.clf() 195 | # x_t = [] 196 | # y_t = [] 197 | 198 | # H, W = wind.shape 199 | 200 | # Is, Js = np.where(grid==START) 201 | 202 | # A, vx, vy = epsilon_greedy(Q, Is, Js, eps) 203 | 204 | # steps = 0 205 | 206 | # y_t.append(Is) 207 | # x_t.append(Js) 208 | 209 | # while grid[Is, Js] != GOAL and steps <5000: 210 | # # take action A 211 | # Is_prime = Is + vy - wind[Is,Js] + stochastic_wind * ( wind[Is,Js]>0) * np.random.choice([-1,0,1]) 212 | # Js_prime = Js + vx 213 | 214 | # Is_prime = min(H-1, max(0,Is_prime)) 215 | # Js_prime = min(W-1, max(0,Js_prime)) 216 | 217 | # # choose A_prime from S_prime 218 | # A_prime, vx_prime, vy_prime = epsilon_greedy(Q, Is_prime, Js_prime, eps) 219 | 220 | 221 | # Is = Is_prime 222 | # Js = Js_prime 223 | 224 | # vx = vx_prime 225 | # vy = vy_prime 226 | # A = A_prime 227 | 228 | # y_t.append(Is) 229 | # x_t.append(Js) 230 | 231 | # steps+=1 232 | 233 | # if steps<5000: 234 | # print 'reached goal in %i steps, episode %i'% (steps, -1) 235 | # else: 236 | # print 'couldnt reach goal in 5000 steps with greedy' 237 | 238 | # plt.imshow(wind, interpolation='none') 239 | # plt.plot(np.asarray(x_t), np.asarray(y_t),'o-') 240 | # plt.scatter(x_t[-1], y_t[-1], color='red', s=50) 241 | 242 | # plt.savefig('sample_iter_%02i.png' % it) 243 | 244 | 245 | def print_value_img(w,counts, ep): 246 | global grid 247 | H, W = grid.shape 248 | 249 | value = np.zeros((H,W)) 250 | for i in range(H): 251 | for j in range(W): 252 | value[i,j] = v(w, i, j) 253 | 254 | plt.clf() 255 | plt.imshow(value, interpolation='none') 256 | plt.colorbar() 257 | plt.savefig('value_imgs/value_%06i.png' % ep) 258 | 259 | plt.clf() 260 | plt.imshow(counts, interpolation='none') 261 | plt.colorbar() 262 | plt.savefig('value_imgs/counts_%06i.png' % ep) 263 | 264 | 265 | def plot_curve(a, title): 266 | a = np.asarray(a) 267 | plt.clf() 268 | plt.plot(a) 269 | plt.title(title) 270 | os.system('mkdir -p figures') 271 | plt.savefig('figures/%s.png' % title.replace(' ', '_')) 272 | 273 | 274 | 275 | 276 | ################################################################################ 277 | # MAIN LOOP 278 | 279 | START = 1 280 | GOAL = 2 281 | 282 | 283 | 284 | EPISODES = 10000 285 | MAX_STEPS = 100 286 | 287 | 288 | KINGS = True 289 | STAY = False # wether not moving is an option 290 | STOCHASTIC = False # stochastic wind 291 | 292 | wind = np.genfromtxt('wind.txt', 
delimiter=1).astype(int) 293 | grid = np.genfromtxt('grid.txt', delimiter=1) 294 | 295 | 296 | if KINGS: 297 | actions = [0, 1, 2, 3, 4, 5, 6, 7, 8] 298 | else: 299 | actions = [1, 3, 5, 7] 300 | 301 | if STAY: 302 | actions.append(4) 303 | 304 | actions = np.asarray(actions).astype(int) 305 | 306 | H, W = wind.shape 307 | 308 | 309 | ## 310 | ## Initialize parameters 311 | 312 | # type of representation: 3rd degree polynomial or one-hot vector (indicator) 313 | #REPR = 'polynomial' 314 | REPR = 'indicator' 315 | 316 | 317 | dA = 9 # dimension of possible actions 318 | 319 | w = np.zeros_like(x(0,0,0)) 320 | 321 | wind *= 0 322 | 323 | 324 | #print grid 325 | 326 | if __name__ == '__main__': 327 | 328 | true_online_sarsa_lambda(w) 329 | 330 | 331 | -------------------------------------------------------------------------------- /ch5-monte_carlo/racetrack.py: -------------------------------------------------------------------------------- 1 | """ Racetrack Problem. Exercise 5.8, Sutton and Barto 2nd edition. 2 | jlezama@fing.edu.uy 3 | """ 4 | 5 | import os 6 | import numpy as np 7 | from scipy.stats.distributions import poisson 8 | import matplotlib 9 | # Force matplotlib to not use any Xwindows backend. 10 | matplotlib.use('Agg') 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | ################################################################################ 15 | # MAIN FUNCTIONS 16 | def on_policy_mc_control(Q,C,pi): 17 | """ R contains returns """ 18 | global gamma, EPISODES, EPSILON 19 | 20 | for episode in range(EPISODES): 21 | 22 | if (episode % 10000)==0: 23 | print 'episode %i of %i' % (episode, EPISODES) 24 | plot_sample(pi, episode) 25 | plot(Q, pi, episode) 26 | EPSILON = max(0.1, EPSILON*.95) 27 | print 'new EPSILON', EPSILON 28 | 29 | 30 | 31 | S, A, R = generate_episode(pi) 32 | T = len(S) 33 | 34 | appeared = dict() 35 | 36 | for t in range(T-1): 37 | Rt = R[t] 38 | St = S[t] 39 | At = A[t] 40 | 41 | if not (St,At) in appeared.keys(): 42 | appeared[(St,At)] = True 43 | #print 'first (s, a) occurrence' 44 | else: 45 | #print '(s, a) already occurred' 46 | continue 47 | 48 | 49 | 50 | x_t = St[0] 51 | y_t = St[1] 52 | vx_t = St[2] 53 | vy_t = St[3] 54 | 55 | # compute return (should be only for first appearance of S,A, TODO) 56 | Gt = 0 57 | for i in range(t,T-1): 58 | Gt += (gamma**(i-t))*R[i] 59 | 60 | 61 | # print y_t, x_t, vx_t, vy_t,At 62 | C[y_t, x_t, vx_t, vy_t,At] += 1 63 | Q[y_t, x_t, vx_t, vy_t,At] += 1/C[y_t, x_t, vx_t, vy_t,At]*(Gt - Q[y_t, x_t, vx_t, vy_t,At]) 64 | 65 | 66 | 67 | for t in range(T): 68 | St = S[t] 69 | x_t = St[0] 70 | y_t = St[1] 71 | vx_t = St[2] 72 | vy_t = St[3] 73 | 74 | best_actions = np.where(Q[y_t,x_t,vx_t,vy_t,:]==np.max(Q[y_t,x_t,vx_t,vy_t,:]))[0] 75 | if len(best_actions)>1: 76 | best_action = np.random.choice(best_actions) 77 | else: 78 | best_action = best_actions[0] 79 | 80 | pi[y_t, x_t, vx_t, vy_t] = best_action 81 | 82 | 83 | 84 | return Q, pi 85 | 86 | def off_policy_mc_control(Q, C, pi): 87 | global gamma, EPISODES 88 | 89 | 90 | for iter in range(EPISODES): 91 | S, A, R = generate_episode() 92 | G = 0 93 | W = 1 94 | 95 | T = len(S) 96 | 97 | for t in reversed(range(T)): 98 | Rt = R[t-1] 99 | St = S[t-1] 100 | At = A[t-1] 101 | 102 | #print 'Rt', Rt, t, T 103 | 104 | 105 | x_t = St[0] 106 | y_t = St[1] 107 | vx_t = St[2] 108 | vy_t = St[3] 109 | 110 | G = gamma*G + Rt 111 | C[y_t,x_t,vx_t,vy_t,At] += W 112 | 113 | update = Q[y_t,x_t,vx_t,vy_t,At] + W/C[y_t,x_t,vx_t,vy_t,At] * ( G - Q[y_t,x_t,vx_t,vy_t,At] ) 114 | #print 
'update', update 115 | 116 | #print 'before', Q[y_t,x_t,vx_t,vy_t,:] 117 | Q[y_t,x_t,vx_t,vy_t,At] = update 118 | 119 | #print 'after', Q[y_t,x_t,vx_t,vy_t,:], At 120 | 121 | max_action = np.max(Q[y_t,x_t,vx_t,vy_t,:]) 122 | 123 | best_actions = np.where(Q[y_t,x_t,vx_t,vy_t,:]==max_action)[0] 124 | if len(best_actions)>1: 125 | best_action = np.random.choice(best_actions) 126 | else: 127 | best_action = best_actions[0] 128 | 129 | #print 'max_action', max_action, Q[y_t,x_t,vx_t,vy_t,:], 'At', At, 'best_actions', best_actions, 'best_action', best_action 130 | 131 | # if np.abs(Q[y_t,x_t,vx_t,vy_t,At] - Q[y_t,x_t,vx_t,vy_t,best_action])<1e0: 132 | # best_action = At 133 | #print 'best_action', best_action, Q[y_t,x_t,vx_t,vy_t,:], 'At', At 134 | 135 | pi[y_t,x_t,vx_t,vy_t] = best_action 136 | 137 | if best_action != At: 138 | print 'breaking!', T-t 139 | break 140 | 141 | dx, dy = action_to_pair(At) 142 | 143 | if dx ==0: 144 | pdx = 3/6. 145 | elif dx ==1: 146 | pdx = 2/6. 147 | else: 148 | pdx = 1/6. 149 | if dy == 1: 150 | pdy = 3/5. 151 | elif dy == 0: 152 | pdy = 1/5. 153 | elif dy ==-1: 154 | pdy = 1/5. 155 | 156 | 157 | W *= 1/(pdx*pdy) # TODO b is not random 158 | 159 | return Q, pi 160 | 161 | def generate_episode(pi=None, eps=None, noise=True): 162 | global track, actions, MAX_T, EPSILON 163 | 164 | H, W = track.shape 165 | 166 | if eps is None: 167 | eps = EPSILON 168 | 169 | 170 | 171 | # first state: random start location, 0 velocity 172 | x_0, y_0 = random_start() 173 | 174 | 175 | 176 | S = [(x_0, y_0, 0, 0)] 177 | A = [] 178 | R = [] 179 | 180 | for t in range(MAX_T): 181 | St = S[-1] 182 | x_t = St[0] 183 | y_t = St[1] 184 | vx_t = St[2] 185 | vy_t = St[3] 186 | 187 | 188 | 189 | # Noise with probability 0.1 at each time step the velocity increments are both zero 190 | if noise and np.random.rand()<0.1: 191 | delta_vx = 0 192 | delta_vy = 0 193 | At1 = pair_to_action(delta_vx, delta_vy) 194 | 195 | 196 | elif pi is None: 197 | # \epsilon-soft policy b 198 | delta_vx = np.random.choice([-1, 1, 1, 0, 0, 0]) # choose mostly no horiz accel 199 | delta_vy = np.random.choice([-1, 0, 1, 1, 1]) # choose mostly vert accel 200 | At1 = pair_to_action(delta_vx, delta_vy) 201 | 202 | else: 203 | At1 = pi[y_t,x_t,vx_t,vy_t] if np.random.rand()>eps else np.random.randint(9) 204 | delta_vx, delta_vy = action_to_pair(At1) 205 | 206 | 207 | vx_t1 = max(0,min(MAX_SPEED, vx_t + delta_vx)) 208 | vy_t1 = max(0,min(MAX_SPEED, vy_t + delta_vy)) 209 | 210 | if vx_t1==0 and vy_t1==0: 211 | #print 'both zero!',t, delta_vx, delta_vy, At1 212 | if np.random.rand()>0.5: 213 | vx_t1 = 1 214 | else: 215 | vy_t1 = 1 216 | At1 = pair_to_action(vx_t1, vy_t1) # should be 0,1 or 1,0 217 | assert (vx_t1+vy_t1)==1 218 | 219 | x_t1 = x_t + vx_t1 220 | y_t1 = y_t - vy_t1 # vertical is negative to go up in matrix 221 | 222 | 223 | # check if it went over boundary 224 | touched_boundary = False 225 | 226 | 227 | if x_t1 >= W or x_t1 < 0 or y_t1 >= H or y_t1 <0 or track[y_t1, x_t1] == BOUNDARY: 228 | touched_boundary = True 229 | 230 | 231 | for vxx in range(vx_t1): 232 | if touched_boundary: 233 | break 234 | for vyy in range(vy_t1): 235 | if track[y_t - vyy,x_t+vxx] == BOUNDARY: 236 | touched_boundary = True 237 | break 238 | 239 | 240 | if touched_boundary: 241 | x_t1, y_t1 = random_start() 242 | vx_t1 = 0 243 | vy_t1 = 0 244 | 245 | 246 | 247 | St1 = (x_t1, y_t1, vx_t1, vy_t1) 248 | 249 | 250 | Rt1 = -1 251 | 252 | 253 | S.append(St1) 254 | A.append(At1) 255 | R.append(Rt1) 256 | 257 | terminate = False 258 | if 
track[y_t1, x_t1] == FINISH: 259 | print 'FINISHED in %i steps!' % t 260 | terminate = True 261 | break 262 | # print St1, At1, Rt1 263 | if not terminate: 264 | print 'didnt make it to the end ----------', eps 265 | return S, A, R 266 | 267 | 268 | ################################################################################ 269 | # AUX FUNCTIONS 270 | 271 | def action_to_pair(a): 272 | assert(a>=0 and a<9) 273 | 274 | vx = int(np.floor(a/3)-1) 275 | vy = int(np.mod(a,3)-1) 276 | 277 | 278 | return vx, vy 279 | 280 | def pair_to_action(vx, vy): 281 | assert np.abs(vx)<=1 and np.abs(vy)<=1 282 | 283 | return int((vx+1)*3 + vy + 1) 284 | 285 | def random_start(): 286 | 287 | global track 288 | 289 | # possible start positions 290 | Is, Js = np.where(track==START) 291 | ix_start = np.random.randint(len(Is)) 292 | return Js[ix_start], Is[ix_start] # horizontal coord first 293 | 294 | 295 | def plot(Q, pi, it): 296 | os.system("mkdir -p figures") 297 | 298 | fig, axes = plt.subplots(MAX_SPEED+1, MAX_SPEED+2) 299 | 300 | ax = axes[0,0] 301 | 302 | im = ax.imshow(np.mean(Q, axis=(2,3,4)), interpolation='none') 303 | ax.set_title('Q') 304 | 305 | 306 | plt.colorbar(im, ax=ax) 307 | 308 | count = MAX_SPEED+3 309 | for vx in range(MAX_SPEED+1): 310 | for vy in range(MAX_SPEED+1): 311 | 312 | ax = axes[vx, 1+vy] 313 | im = ax.imshow(pi[:,:,vx,vy], interpolation='none') 314 | ax.set_title('pi (vx: %i, vy: %i)' % (vx, vy)) 315 | count += 1 316 | 317 | plt.colorbar(im, ax=ax) 318 | plt.savefig('figures/result_iter_%09i.png' % it) 319 | 320 | def plot_old(Q, pi, it): 321 | os.system("mkdir -p figures") 322 | 323 | fig, axes = plt.subplots(1, 2) 324 | 325 | ax = axes[0] 326 | im = ax.imshow(np.mean(Q, axis=(2,3,4)), interpolation='none') 327 | ax.set_title('Q') 328 | ax.set_xlabel('Location 1') 329 | ax.set_ylabel('Location 2') 330 | 331 | plt.colorbar(im, ax=ax) 332 | 333 | ax = axes[1] 334 | im = ax.imshow(np.mean(pi, axis=(2,3)), interpolation='none') 335 | ax.set_title('pi') 336 | ax.set_xlabel('Location 1') 337 | ax.set_ylabel('Location 2') 338 | 339 | plt.colorbar(im, ax=ax) 340 | plt.savefig('figures/result_iter_%09i.png' % it) 341 | 342 | def plot_sample(pi, it): 343 | os.system('mkdir -p samples') 344 | global track 345 | 346 | H, W = track.shape 347 | 348 | S, A, R = generate_episode(pi, eps=0, noise=False) 349 | 350 | f = open('samples/sample_iter_%09i.txt' % it, 'w') 351 | f.write(str(S)) 352 | f.close() 353 | 354 | plt.clf() 355 | x_t = [] 356 | y_t = [] 357 | 358 | for St in S: 359 | x_t.append(St[0]) 360 | y_t.append(St[1]) 361 | 362 | plt.imshow(track, interpolation='none') 363 | plt.plot(np.asarray(x_t), np.asarray(y_t),'o-') 364 | plt.scatter(x_t[-1], y_t[-1], color='red', s=50) 365 | 366 | plt.savefig('samples/sample_iter_%09i.png' % it) 367 | 368 | 369 | ################################################################################ 370 | # MAIN LOOP 371 | 372 | 373 | START = 2 374 | FINISH = 3 375 | TRACK = 1 376 | BOUNDARY = 0 377 | 378 | gamma = 0.9 379 | accelerate_factor = 2.0 380 | 381 | MAX_SPEED = 4 382 | MAX_T = 500 # Max episode length 383 | EPISODES = int(1e5) 384 | EPSILON = 0.5 385 | 386 | track = np.genfromtxt('racetrack.txt', delimiter=1) 387 | 388 | actions = range(9) # cartesian product of (-1, 0, 1) 389 | 390 | H, W = track.shape 391 | 392 | Q = np.ones((H,W,MAX_SPEED+1,MAX_SPEED+1,9))*(-1*MAX_T) # pessimism in the face of uncertainty :-) 393 | C = np.zeros((H,W,MAX_SPEED+1,MAX_SPEED+1,9)) 394 | 395 | 396 | pi = np.random.randint(low=0, high=9, 
size=(H,W,MAX_SPEED+1,MAX_SPEED+1)) 397 | 398 | Q, pi = on_policy_mc_control(Q,C,pi) 399 | 400 | 401 | plot(Q, pi, 0) 402 | 403 | # plot a few more samples episodes 404 | for i in range(10): 405 | plot_sample(pi, EPISODES+i) 406 | 407 | -------------------------------------------------------------------------------- /ch13-policy_gradient/windy.py: -------------------------------------------------------------------------------- 1 | """ Windy Gridworld Problem Using Policy Gradient (Sutton and Barto 2nd edition, Chapter 13) 2 | 3 | Includes REINFORCE (with and without baseline) and Actor-Critic 4 | State representation can be a 3rd order polynomial on position or a one-hot vector. 5 | 6 | 7 | 8 | April 11, 2018 9 | 10 | jlezama@fing.edu.uy 11 | """ 12 | 13 | import os 14 | import numpy as np 15 | from scipy.stats.distributions import poisson 16 | 17 | 18 | import matplotlib 19 | # Force matplotlib to not use any Xwindows backend. 20 | matplotlib.use('Agg') 21 | import matplotlib.pyplot as plt 22 | 23 | ################################################################################ 24 | # MAIN FUNCTIONS 25 | def actor_critic(w, theta, alpha_w=1e-3, alpha_theta=1e-2, gamma=0.99): 26 | global GOAL, EPISODES, MAX_STEPS 27 | 28 | Gs = [] 29 | 30 | count = 0 31 | for episode in range(EPISODES): 32 | 33 | 34 | 35 | if episode % 100 ==0: 36 | print 'NEW EPISODE %i/%i! (%i)' % (episode, EPISODES, count), compute_pi(theta, 3, 6), action_to_pair(np.random.choice(9, p=compute_pi(theta, 3, 6))) 37 | 38 | print 'values', v(w,3,0), v(w,3,1), v(w,3,2), v(w,3,3), v(w,3,4), v(w,3,5), v(w,3,6), v(w,3,7), v(w,3,8), v(w,3,9) 39 | 40 | 41 | count = 0 42 | 43 | counts = np.zeros_like(grid) 44 | 45 | Is, Js = np.where(grid==START) 46 | II = 1 47 | 48 | 49 | G = 0 50 | 51 | 52 | while grid[Is, Js] != GOAL and count < MAX_STEPS: 53 | 54 | pi = compute_pi(theta, Is, Js) 55 | 56 | #print 'pi', pi, [Is, Js] 57 | 58 | a = np.random.choice(9, p=pi) 59 | 60 | #a = 7 61 | 62 | # if 1:#np.random.rand()<1.0: 63 | # a = np.random.choice(9) 64 | 65 | R, Is_prime, Js_prime = step(Is, Js, a) 66 | 67 | G += R 68 | 69 | vx, vy = action_to_pair(a) 70 | 71 | delta = R + gamma * v(w, Is_prime, Js_prime) -v(w, Is, Js) 72 | 73 | w_update = alpha_w * II * delta * x(Is,Js) 74 | 75 | w += w_update 76 | 77 | theta_update = alpha_theta * II * delta * compute_grad(theta, Is, Js, a) 78 | theta += theta_update 79 | 80 | 81 | 82 | 83 | # print 'moving from [%i %i] to [%i %i] (%i: %i, %i)' % (Is, Js, Is_prime, Js_prime,a, vx, vy) 84 | # print 'pi', pi 85 | # print 'grad', compute_grad(theta, Is, Js, a)[3:] 86 | # # print 'delta', delta 87 | #print 'theta_update', theta_update 88 | # print '-----' 89 | 90 | #raise 91 | 92 | counts[Is, Js] += 1 93 | 94 | II *= gamma 95 | Is = Is_prime 96 | Js = Js_prime 97 | 98 | count +=1 99 | 100 | if count % 1000==0: 101 | print 'still computing', count 102 | print 'moving from [%i %i] to [%i %i] (%i: %i, %i)' % (Is, Js, Is_prime, Js_prime,a, vx, vy) 103 | print 'pi', pi 104 | print 'grad', compute_grad(theta, Is, Js, a) 105 | # print 'delta', delta 106 | print 'theta_update', theta_update 107 | print 'w_update', w_update 108 | print '-----' 109 | 110 | Gs.append(G) 111 | 112 | if episode % 1000 == 0: 113 | print_value_img(w, counts, episode) 114 | 115 | plot_curve(Gs, 'actor_critic_returns--alpha_theta_%2.2e--alpha_w_%2.2e--_%s' % (alpha_theta, alpha_w, REPR)) 116 | 117 | 118 | ################################################################################ 119 | 
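# For reference: the per-step update that actor_critic() above performs, written as a
# minimal standalone sketch. The helper names below (softmax, actor_critic_step) are
# illustrative only and are not defined anywhere in this repository.

import numpy as np

def softmax(scores):
    # numerically stable softmax over the action preferences
    scores = scores - np.max(scores)
    e = np.exp(scores)
    return e / np.sum(e)

def actor_critic_step(w, theta, x_s, x_s_next, a, R, I, alpha_w, alpha_theta, gamma=0.99):
    # one-step actor-critic with a linear critic v(s) = w . x(s) and a linear
    # softmax actor with per-action preferences theta[:, b] . x(s)
    delta = R + gamma * np.dot(w, x_s_next) - np.dot(w, x_s)   # TD error
    w = w + alpha_w * I * delta * x_s                          # critic update
    pi = softmax(np.dot(x_s, theta))                           # pi(.|s)
    # gradient of log pi(a|s): x(s)*(1 - pi(a|s)) for the taken action a,
    # -pi(b|s)*x(s) for every other action b (this is what compute_grad() returns)
    grad_log_pi = -np.outer(x_s, pi)
    grad_log_pi[:, a] += x_s
    theta = theta + alpha_theta * I * delta * grad_log_pi      # actor update
    return w, theta

# With REPR = 'indicator', x_s is the H*W one-hot vector from x_indicator(), so each
# step only touches the weights of the visited cell; at the goal state the code above
# uses v = 0 in place of w . x(s').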
################################################################################ 120 | def REINFORCE(theta, w, gamma=1.0, alpha_theta=1e-5, alpha_w=1e-7): 121 | global grid, GOAL, EPISODES, MAX_STEPS, REPR 122 | 123 | H,W = grid.shape 124 | 125 | counts = np.zeros((H,W)) 126 | 127 | G_0s = [] 128 | for ep in range(EPISODES): 129 | 130 | 131 | Is, Js = np.where(grid==START) 132 | 133 | 134 | Is = Is[0] 135 | Js =Js[0] 136 | 137 | 138 | pi = compute_pi(theta,Is, Js) 139 | a = np.random.choice(9, p=pi) 140 | 141 | 142 | counts *= 0 143 | 144 | # a = 7 145 | 146 | 147 | ep_s = [] 148 | ep_a = [] 149 | ep_R = [] 150 | 151 | ep_s.append([Is, Js]) 152 | ep_a.append(a) 153 | 154 | R, Is, Js = step(Is, Js, a) 155 | 156 | 157 | #print Is, Js, a, action_to_pair(a) 158 | 159 | 160 | ep_R.append(R) 161 | 162 | 163 | while grid[Is, Js] != GOAL and len(ep_s) < MAX_STEPS: [...] 248 | Is_prime = Is + vy - wind[Is,Js] + stochastic_wind * ( wind[Is,Js]>0) * np.random.choice([-1,0,1]) 249 | Js_prime = Js + vx 250 | 251 | Is_prime = min(H-1, max(0,Is_prime)) 252 | Js_prime = min(W-1, max(0,Js_prime)) 253 | 254 | 255 | if grid[Is_prime, Js_prime] == GOAL: 256 | R = 0 257 | else: 258 | R = -1 259 | 260 | return R, Is_prime, Js_prime 261 | 262 | ################################################################################ 263 | def x(I,J): 264 | global REPR 265 | if REPR == 'polynomial': 266 | return x_polynomial(I,J) 267 | elif REPR == 'indicator': 268 | return x_indicator(I,J) 269 | else: 270 | raise ValueError('unknown representation type') 271 | 272 | def x_indicator(I,J): 273 | global grid 274 | H, W = grid.shape 275 | 276 | xx = np.zeros(H*W) 277 | xx[I*W+J] = 1 278 | 279 | return xx 280 | 281 | def x_polynomial(I, J): 282 | # returns a vector representation of x 283 | global grid 284 | H,W = grid.shape 285 | 286 | 287 | xx = np.zeros(10) 288 | 289 | 290 | xx[0] = (I-H/2.)/float(H/2.) 291 | xx[1] = (J-W/2.)/float(W/2.)
292 | xx[2] = (xx[0])**2 293 | xx[3] = (xx[1])**2 294 | xx[4] = xx[0]*xx[1] 295 | 296 | xx[5] = xx[0]**3 297 | xx[6] = xx[1]**3 298 | xx[7] = xx[2]*xx[1] 299 | xx[8] = xx[3]*xx[0] 300 | 301 | 302 | xx[9] = 1 # bias term 303 | 304 | 305 | return xx 306 | 307 | 308 | ################################################################################ 309 | def compute_pi(theta, I, J): 310 | # compute soft-max for linear feature theta^T.x 311 | 312 | xx = x(I,J) 313 | 314 | scores = np.dot(xx.T, theta) 315 | 316 | scores -= np.max(scores) 317 | 318 | pi = np.exp(scores)/np.sum(np.exp(scores)) 319 | 320 | assert np.abs(np.sum(pi)-1)<1e-9, np.sum(pi) 321 | 322 | return pi 323 | 324 | ################################################################################ 325 | def compute_grad(theta,I,J,a): 326 | # compute soft-max for linear feature theta^T.x 327 | 328 | global actions 329 | pi = compute_pi(theta,I, J) 330 | 331 | 332 | grad = np.zeros_like(theta) 333 | 334 | 335 | for b in actions: 336 | if b==a: 337 | grad[:,b] = x(I,J)*(1-pi[b]) 338 | else: 339 | grad[:,b] = -1*pi[b]*x(I,J) 340 | 341 | #print '----' 342 | #print pi, grad[:,a],grad[:,a+1], x(I,J), a 343 | # raise 344 | 345 | return grad 346 | 347 | 348 | def v(w,I,J): 349 | """ See Sutton and Barto 2nd edition 13.4, page 273 """ 350 | global grid, GOAL 351 | 352 | if grid[I,J] == GOAL: 353 | return 0 354 | 355 | return np.dot(w,x(I,J)) 356 | 357 | 358 | 359 | # non-king moves: 1, 3, 5, 7 360 | # king moves: 0, 1, 2, 3, 5, 6, 7, 8 361 | ################################################################################ 362 | def action_to_pair(a): 363 | assert(a>=0 and a<9) 364 | vx = int(np.floor(a/3)-1) 365 | vy = int(np.mod(a,3)-1) 366 | return vx, vy 367 | 368 | def pair_to_action(vx, vy): 369 | assert np.abs(vx)<=1 and np.abs(vy)<=1 370 | 371 | return int((vx+1)*3 + vy + 1) 372 | 373 | 374 | # ################################################################################ 375 | # def plot_sample(Q, it, stochastic_wind, eps=0): 376 | # global wind 377 | 378 | # plt.clf() 379 | # x_t = [] 380 | # y_t = [] 381 | 382 | # H, W = wind.shape 383 | 384 | # Is, Js = np.where(grid==START) 385 | 386 | # A, vx, vy = epsilon_greedy(Q, Is, Js, eps) 387 | 388 | # steps = 0 389 | 390 | # y_t.append(Is) 391 | # x_t.append(Js) 392 | 393 | # while grid[Is, Js] != GOAL and steps <5000: 394 | # # take action A 395 | # Is_prime = Is + vy - wind[Is,Js] + stochastic_wind * ( wind[Is,Js]>0) * np.random.choice([-1,0,1]) 396 | # Js_prime = Js + vx 397 | 398 | # Is_prime = min(H-1, max(0,Is_prime)) 399 | # Js_prime = min(W-1, max(0,Js_prime)) 400 | 401 | # # choose A_prime from S_prime 402 | # A_prime, vx_prime, vy_prime = epsilon_greedy(Q, Is_prime, Js_prime, eps) 403 | 404 | 405 | # Is = Is_prime 406 | # Js = Js_prime 407 | 408 | # vx = vx_prime 409 | # vy = vy_prime 410 | # A = A_prime 411 | 412 | # y_t.append(Is) 413 | # x_t.append(Js) 414 | 415 | # steps+=1 416 | 417 | # if steps<5000: 418 | # print 'reached goal in %i steps, episode %i'% (steps, -1) 419 | # else: 420 | # print 'couldnt reach goal in 5000 steps with greedy' 421 | 422 | # plt.imshow(wind, interpolation='none') 423 | # plt.plot(np.asarray(x_t), np.asarray(y_t),'o-') 424 | # plt.scatter(x_t[-1], y_t[-1], color='red', s=50) 425 | 426 | # plt.savefig('sample_iter_%02i.png' % it) 427 | 428 | 429 | def print_value_img(w,counts, ep): 430 | global grid 431 | H, W = grid.shape 432 | 433 | value = np.zeros((H,W)) 434 | for i in range(H): 435 | for j in range(W): 436 | value[i,j] = v(w, i, j) 437 | 438 
| plt.clf() 439 | plt.imshow(value, interpolation='none') 440 | plt.colorbar() 441 | plt.savefig('value_imgs/value_%06i.png' % ep) 442 | 443 | plt.clf() 444 | plt.imshow(counts, interpolation='none') 445 | plt.colorbar() 446 | plt.savefig('value_imgs/counts_%06i.png' % ep) 447 | 448 | 449 | def plot_curve(a, title): 450 | a = np.asarray(a) 451 | plt.clf() 452 | plt.plot(a) 453 | plt.title(title) 454 | plt.savefig('figures/%s.png' % title.replace(' ', '_')) 455 | 456 | 457 | 458 | 459 | ################################################################################ 460 | # MAIN LOOP 461 | 462 | START = 1 463 | GOAL = 2 464 | 465 | 466 | 467 | EPISODES = 10000 468 | MAX_STEPS = 100 469 | 470 | 471 | 472 | KINGS = True 473 | STAY = False # whether not moving is an option 474 | STOCHASTIC = False # stochastic wind 475 | 476 | wind = np.genfromtxt('wind.txt', delimiter=1).astype(int) 477 | grid = np.genfromtxt('grid.txt', delimiter=1) 478 | 479 | 480 | if KINGS: 481 | actions = [0, 1, 2, 3, 4, 5, 6, 7, 8] 482 | else: 483 | actions = [1, 3, 5, 7] 484 | 485 | if STAY: 486 | actions.append(4) 487 | 488 | actions = np.asarray(actions).astype(int) 489 | 490 | H, W = wind.shape 491 | 492 | 493 | ## 494 | ## Initialize parameters 495 | 496 | # type of representation: 3rd degree polynomial or one-hot vector (indicator) 497 | #REPR = 'polynomial' 498 | REPR = 'indicator' 499 | 500 | 501 | dA = 9 # dimension of possible actions 502 | dX = int(x(0,0).shape[0]) #H*W # dimension of x 503 | 504 | theta = np.zeros((dX,dA)) 505 | w = np.zeros(dX) 506 | 507 | wind *= 0 # note: this zeroes the wind field, so the runs below use a wind-free grid 508 | 509 | 510 | #print grid 511 | 512 | if __name__ == '__main__': 513 | 514 | actor_critic(w, theta) 515 | 516 | #REINFORCE(theta,w) 517 | 518 | --------------------------------------------------------------------------------
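For the eligibility-trace exercise in ch12-eligibility_traces/windy.py, the inner-loop update that true_online_sarsa_lambda() applies at every step (true online Sarsa(lambda) with binary features, Sutton and Barto 2nd edition, Section 12.7) can be summarized by the following minimal sketch. The function and argument names here are illustrative only and do not exist in the repository; linear action values q(s,a) = w . x(s,a) are assumed.

import numpy as np

def true_online_sarsa_step(w, z, x_cur, x_next, R, Q_old, alpha, gamma, lam):
    # linear action values under the current weights
    Q = np.dot(w, x_cur)
    Q_next = np.dot(w, x_next)
    delta = R + gamma * Q_next - Q
    # dutch eligibility trace: decay, then add the current features with a correction
    z = gamma * lam * z + (1.0 - alpha * gamma * lam * np.dot(z, x_cur)) * x_cur
    # weight update; the extra (Q - Q_old) terms make the update exactly
    # equivalent to the online lambda-return algorithm
    w = w + alpha * (delta + Q - Q_old) * z - alpha * (Q - Q_old) * x_cur
    # the caller stores Q_next as Q_old for the next step
    return w, z, Q_next

Setting lam = 0 gives z = x_cur and the update collapses to w += alpha * delta * x_cur, i.e. one-step semi-gradient Sarsa (the commented-out w_update in the file), which is a convenient sanity check against the curves in ch12-eligibility_traces/figures/.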