├── .idea
│   ├── A-introduction-to-reinforcement-learning.iml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── RLF
│   ├── DDPG.py
│   ├── DQN.py
│   ├── DQN.pyc
│   ├── Dyna_Q.py
│   ├── MC_ON-POLICY_RACETRACK.png
│   ├── Q-LEARNING_RACETRACK.png
│   ├── Q_A.md
│   ├── SARSA_RACETRACK.png
│   ├── SARSA_lambda_RACETRACK.png
│   ├── TEST.py
│   ├── dp.py
│   ├── env.py
│   ├── env.pyc
│   ├── main.py
│   ├── monte_carlo.py
│   ├── monte_carlo.pyc
│   ├── naive_Q_lambda_RACETRACK.png
│   ├── result_analysis.py
│   ├── result_analysis.pyc
│   ├── td.py
│   ├── td.pyc
│   ├── td_lambda.py
│   └── td_lambda.pyc
├── chapter4
│   ├── Jack’s_Car_Rental.py
│   └── The_Gambler.py
├── chapter5
│   ├── Q_A.md
│   ├── Racetrack.py
│   ├── Racetrack_result_1.png
│   ├── Racetrack_result_2.png
│   └── tmp_data.txt
├── chapter6
│   ├── Q-LEARNING_RACETRACK.png
│   ├── SARSA_RACETRACK.png
│   └── td.py
├── chapter7
│   ├── SARSA_lambda_RACETRACK.png
│   ├── naive_Q_lambda_RACETRACK.png
│   └── td_lambda.py
├── chapter8
│   └── Dyna_Q.py
└── papers
    ├── A3C.pdf
    ├── DDPG.pdf
    ├── DPG.pdf
    ├── DQN.pdf
    ├── DRL_simulated_Auto_Vehicle.pdf
    ├── TRPO.pdf
    ├── bookdraft2017june19.pdf
    └── crossing.pdf
/README.md:
--------------------------------------------------------------------------------
1 | # A-introduction-to-reinforcement-learning
2 |
3 | ## Instructions
4 |
5 | ### - Why do I maintain this repository?
6 | It is hard to find a complete, working set of 'Reinforcement Learning' algorithm implementations, and such a set is of real help to newcomers who want to get into this field.
7 |
8 | Now that I have taken detailed reading notes on the excellent book "A-introduction-to-reinforcement-learning" and implemented most of its algorithms, why not share them with the public?
9 |
10 | I believe the quote: 'Talk is cheap, show me the code.'
11 |
12 | ### - What is in this repository?
13 | This repository contains the algorithms from the first 9 chapters of the book: [Reinforcement Learning: An Introduction -- 2012 version](https://files.cnblogs.com/files/lvlvlvlvlv/SuttonBook.pdf)
14 |
15 | 
16 |
17 | Algorithms from the chapters after chapter 9 will be added over time, based on a later version of the book.
18 |
19 | - **chapter4** : Dynamic Programming
20 |
21 | Includes two exercises:
22 | 1. [The Gambler](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/chapter4/The_Gambler.py)
23 | 2. [Jack's Car Rental](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/chapter4/Jack%E2%80%99s_Car_Rental.py)
24 |
25 | - **chapter5** : Monte Carlo Methods
26 |
27 | Includes an exercise called "racetrack" and experimental results on **racetrack**.
28 | 1. [Racetrack.py](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/chapter5/Racetrack.py)
29 |
30 | **Note**: the Monte Carlo algorithms from chapter 5 are all implemented in the single file [Racetrack.py](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/chapter5/Racetrack.py), specifically in the function `def update_policy(episode):`.
31 |
32 | - **chapter6** : Temporal-Difference Learning
33 |
34 | Includes TD-related algorithms and experimental results on **racetrack**.
35 | 1. [td.py](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/chapter6/td.py)
36 |
37 | **Note**: Starting from this chapter, I stopped implementing a separate environment for every exercise, since almost every exercise uses a different one. If each algorithm ran on a different environment, it would be hard to compare the algorithms, so I show the performance of all algorithms on one common environment: **racetrack**.
38 |
39 | 
40 |
41 | - **chapter7** : Eligibility Traces
42 |
43 | Includes TD(lambda)-related algorithms and experimental results on **racetrack**.
44 | 1. [td_lambda.py](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/chapter7/td_lambda.py)
45 |
46 | - **chapter8** : Planning and Learning with Tabular Methods
47 |
48 | Includes the Dyna-Q algorithm.
49 | 1. [Dyna_Q](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/chapter8/Dyna_Q.py)
50 |
51 | - **RLF** : A reinforcement learning library that pulls together all the algorithms mentioned above plus *some deep reinforcement learning algorithms such as DQN and DDPG*, packaged for convenient external use.
52 | **Note**: A notable feature of this library is the separation of *'environment'* and *'agent algorithm'* via Python classes. This way, you can add or modify your own environment without touching the agent code, and plug any agent algorithm (td, monte-carlo, dp, ...) into your environment; see the usage sketch under 'How to utilize them' below.
53 | + environment code:
54 | - [env.py](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/RLF/env.py)
55 | + agent algorithms:
56 | - [monte_carlo.py](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/RLF/monte_carlo.py)
57 | - [dp.py](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/RLF/dp.py)
58 | - [td.py](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/RLF/td.py)
59 | - [td_lambda.py](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/RLF/td_lambda.py)
60 | - [Dyna_Q.py](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/RLF/Dyna_Q.py)
61 | - [DQN.py](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/RLF/DQN.py)
62 | + main function:
63 | - [main.py](https://github.com/lvlvlvlvlv/A-introduction-to-reinforcement-learning/blob/master/RLF/main.py)
64 |
65 | ### - How to utilize them for your projects?
66 |
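A minimal usage sketch, following the pattern in `RLF/main.py` (the parameter values below are only illustrative):

```python
from env import RaceCar
from td import TemporalDifference

# build the racetrack environment and enumerate its state/action spaces
car = RaceCar()
states = car.get_states()
actions = car.get_actions()

# attach a SARSA agent to the environment and train it
td = TemporalDifference(states, actions)
td.set_policy('sarsa')
avg_returns = td.sarsa_learning(car, 3000, 0.1, 0.5, 1, 200, 50)
```

Any other agent class (`MonteCarlo`, `Temporal_Difference_lambda`, `Dyna_Q`, `DQN`) can be plugged in the same way without changing the environment code.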
--------------------------------------------------------------------------------
/RLF/DDPG.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import random
4 |
5 | from collections import deque
6 | from keras.models import Sequential
7 | from keras.layers import Dense, Activation,Flatten, Input, merge, Lambda
8 |
9 |
10 | from keras.initializers import normal, identity
11 | from keras import optimizers
12 |
13 | class DDPG:
14 |
15 | def __init__(self,env):
16 |
17 | self.env = env
18 |
19 | self.TAU = 0.001
20 | self.actor_lr = 0.0001
21 | self.critic_lr = 0.001
22 | self.batch_size = 32
23 | self.gamma = 0.99
24 |
25 |
26 | def create_actor_network(self,state_shape,action_shape,h0_num,h1_num):
27 |
28 | if len(state_shape) < 2:
29 | model = Sequential()
30 | model.add(Dense(h0_num,activation='relu'))
31 | model.add(Dense(h1_num,activation='relu'))
32 | model.add(Dense(action_shape[0],activation='tanh'))
33 | model.compile(loss='mse',optimizer = optimizers.Adam(lr=self.actor_lr))
34 | else:
35 | print("Please check your state shape!")
36 |
37 |
38 |
39 | def create_critic_network(self):
40 | pass
41 |
42 |
--------------------------------------------------------------------------------
/RLF/DQN.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import random
4 | from collections import deque
5 | from keras.models import Sequential
6 | from keras.layers import Dense, Activation
7 | from keras import optimizers
8 |
9 | class DQN:
10 |
11 | def __init__(self,env):
12 |
13 | self.env = env
14 | self.memory = list()
15 |
16 | self.gamma = 0.9
17 | self.learning_rate = 0.0001
18 | self.epsilon = 1.0
19 | self.epsilon_decay = 0.995
20 | self.epsilon_min = 0.1
21 | self.create_model()
22 |
23 |
24 |
25 |
26 | def create_model(self):
27 |
28 | model = Sequential()
29 | model.add(Dense(64,input_dim=4,activation='tanh'))
30 | model.add(Dense(128,activation='tanh'))
31 | model.add(Dense(128, activation='tanh'))
32 | model.add(Dense(2, activation='linear'))
33 | model.compile(loss='mse',optimizer = optimizers.RMSprop(lr=self.learning_rate))
34 | self.model = model
35 |
36 | def remember(self,state,action,reward,next_state,done):
37 | self.memory.append((state,action,reward,next_state,done))
38 |
39 |
40 | def act(self, state):
41 | if np.random.rand() <= self.epsilon:
42 | return self.env.action_space.sample()
43 | act_values = self.model.predict(state)
44 | return np.argmax(act_values[0]) # returns action
45 |
46 | def Deep_Q_Learning(self,env,episode_num,max_timestep,mini_batch_size,eval_interval):
47 |
48 | c_env = env
49 | ep_idx = 0
50 |
51 | for ep_idx in range(episode_num):
52 |
53 | c_state = c_env.reset()
54 | c_state = np.reshape(c_state,[1,4])
55 |
56 | n = 0
57 |
58 | for n in range(max_timestep):
59 |
60 | c_action_idx = self.act(c_state)
61 | next_state,c_reward,done,_= env.step(c_action_idx)
62 | next_state = np.reshape(next_state,[1,4])
63 |
64 |
65 | if done:
66 | c_reward = -100
67 | else:
68 | c_reward = c_reward
69 |
70 | self.remember(c_state, c_action_idx, c_reward, next_state, done)
71 |
72 | c_state = next_state
73 |
74 | if done:
75 |
76 | print("episode: {}/{}, score: {}".format(ep_idx, episode_num, n))
77 | break
78 |
79 | #--------------------------- start replay training -------------------------#
80 |
81 | batch_size = min(mini_batch_size,len(self.memory))
82 | batches_idx = np.random.choice(len(self.memory),batch_size)
83 |
84 | for i in batches_idx:
85 | replay_c_state,replay_c_action_idx,replay_c_reward,replay_next_state,replay_done = self.memory[i]
86 |
87 | if replay_done:
88 | replay_c_target = replay_c_reward
89 | else:
90 | replay_c_target = replay_c_reward + self.gamma * np.amax(self.model.predict(replay_next_state)[0])
91 |
92 | replay_c_expectValue = self.model.predict(replay_c_state)
93 | replay_c_expectValue[0][replay_c_action_idx] = replay_c_target
94 |
95 | self.model.fit(replay_c_state,replay_c_expectValue,nb_epoch=1, verbose=0)
96 |
97 | if self.epsilon > self.epsilon_min:
98 | self.epsilon *= self.epsilon_decay
99 |
100 |
--------------------------------------------------------------------------------
/RLF/DQN.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/RLF/DQN.pyc
--------------------------------------------------------------------------------
/RLF/Dyna_Q.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import numpy as np
3 |
4 | class Dyna_Q:
5 |
6 | def __init__(self,state_list,action_list):
7 |
8 | self.states = state_list
9 | self.actions = action_list
10 |
11 | self.state_num = len(self.states)
12 | self.action_num = len(self.actions)
13 |
14 | self.Q = dict()
15 | for s in self.states:
16 | self.Q[s] = np.zeros(self.action_num)
17 |
18 | self.Model = dict()
19 |
20 | for s in self.states:
21 | self.Model[s] = list()
22 | for i in range(self.action_num):
23 | rand_state = self.states[np.random.randint(0, self.state_num)]  # random next-state placeholder for the model
24 | self.Model[s].append([0,rand_state])
25 |
26 | def set_policy(self,learning_type):
27 |
28 | self.pi = dict()
29 |
30 | if learning_type == 'Dyna-Q':
31 | for s in self.states:
32 | self.pi[s] = np.random.random(self.action_num)
33 | self.pi[s] = self.pi[s] / np.sum(self.pi[s])
34 |
35 | def Dyna_Q_learning(self,agent,episode_num,epsilon,alpha,gamma,max_timestep,planning_num,eval_interval):
36 |
37 | ep_idx = 0
38 |
39 | avg_ep_return_list = []
40 |
41 | observed_sa = dict()
42 |
43 | while ep_idx < episode_num:
44 |
45 | ep_idx += 1
46 |
47 | agent.c_state = agent.getInitState()
48 | agent.next_state = agent.c_state
49 |
50 | n = 0
51 |
52 | c_action_idx = np.random.choice(self.action_num, 1, p=self.pi[agent.c_state])[0]
53 | agent.c_action = self.actions[c_action_idx]
54 |
55 | while not (agent.isTerminated() or n >= max_timestep) :
56 |
57 | agent.c_state = agent.next_state
58 | agent.c_action = self.actions[c_action_idx]
59 |
60 | if agent.c_state in observed_sa.keys():
61 | observed_sa[agent.c_state].append(c_action_idx)
62 | else:
63 | observed_sa[agent.c_state] = [c_action_idx]
64 |
65 | agent.c_state, agent.c_action, agent.c_reward, agent.next_state = agent.oneStep_generator()
66 |
67 | next_action_idx = np.random.choice(self.action_num, 1, p=self.pi[agent.next_state])[0]
68 |
69 | self.Q[agent.c_state][c_action_idx] += alpha * (agent.c_reward + gamma * self.Q[agent.next_state][next_action_idx] - self.Q[agent.c_state][c_action_idx])
70 |
71 | self.Model[agent.c_state][c_action_idx] = [agent.c_reward,agent.next_state]
72 |
73 | for plan_idx in range(planning_num):
74 | pass
75 |
76 |
77 |
--------------------------------------------------------------------------------
/RLF/MC_ON-POLICY_RACETRACK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/RLF/MC_ON-POLICY_RACETRACK.png
--------------------------------------------------------------------------------
/RLF/Q-LEARNING_RACETRACK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/RLF/Q-LEARNING_RACETRACK.png
--------------------------------------------------------------------------------
/RLF/Q_A.md:
--------------------------------------------------------------------------------
1 | -- Notes, 2017.7.20 --:
2 | Question: Why does the off-policy MC update use a "latest time" step, while the off-policy TD procedure does not?
3 | Answer:
4 | -- Notes, 2017.7.30 --:
5 | Question: I kept modifying the on-policy MC code but never saw it working; what are the possible reasons?
6 | Answer: Several reasons: 1. The return during training is sometimes noisier than during evaluation, because training has to select actions at random while evaluation can always take the greedy action. Still, if training is correct, the return should improve clearly in both training and evaluation.
7 | 2. Implementation details. The bug finally found this time: np.where() was used to obtain the action index, but in the wrong way. The key line is: sa_pair = (c_ep[i],np.where((np.array(self.actions)==c_ep[i+1]).all(1))[0][0])
8 | 3. When selecting the maximum from a list, keep in mind that there may be several maxima and one of them should be chosen at random; otherwise the first occurrence is always picked. The key line is: best_action = self.actions[np.random.choice(np.where(self.Q[s] == np.amax(self.Q[s]))[0])]
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
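Tiny self-contained illustrations of points 2 and 3 above (the arrays here are made up for the example):

```python
import numpy as np

# point 2: recover the index of an action tuple inside the action list
actions = [(-1, -1), (-1, 0), (0, 1), (1, 1)]           # made-up action list
a = (0, 1)
idx = np.where((np.array(actions) == a).all(1))[0][0]   # -> 2

# point 3: break ties among several maximal Q-values at random;
# np.argmax alone always returns the first maximum
Q_s = np.array([0.0, 2.5, 2.5, 1.0])
best_idx = np.random.choice(np.where(Q_s == np.amax(Q_s))[0])
print(idx, best_idx)
```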
--------------------------------------------------------------------------------
/RLF/SARSA_RACETRACK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/RLF/SARSA_RACETRACK.png
--------------------------------------------------------------------------------
/RLF/SARSA_lambda_RACETRACK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/RLF/SARSA_lambda_RACETRACK.png
--------------------------------------------------------------------------------
/RLF/TEST.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | from collections import deque
4 | from keras.models import Sequential
5 | from keras.layers import Dense, Activation
6 | from keras import optimizers
7 |
8 | model = Sequential()
9 | #model.add(Dense(10,input_dim=9,activation='tanh'))
10 | model.add(Dense(10,input_shape=[9,],activation='tanh'))
11 | print model.predict(np.ones((3,3)).reshape(1,9))
12 |
13 |
14 |
--------------------------------------------------------------------------------
/RLF/dp.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/RLF/dp.py
--------------------------------------------------------------------------------
/RLF/env.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import numpy as np
3 | import gym
4 |
5 |
6 | class RaceCar:
7 |
8 | global race_map
9 | race_map = np.array([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
10 | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
11 | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
12 | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0],
13 | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0],
14 | [0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,3,0,0,0],
15 | [0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,3,0,0,0],
16 | [0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,3,0,0,0],
17 | [0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0],
18 | [0,0,0,1,1,1,1,1,0,0,1,1,1,1,1,0,0,0,0,0],
19 | [0,0,1,1,1,1,1,0,0,0,0,0,1,1,1,0,0,0,0,0],
20 | [0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
21 | [0,0,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
22 | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]])
23 |
24 | global start_line
25 | start_line = [(12,5),(12,4),(12,3),(12,2)]
26 | global end_line
27 | end_line = [(5,16),(6,16),(7,16)]
28 | global start_velocity
29 | start_velocity = (0, 0)
30 |
31 | def __init__(self):
32 | self.race_map = race_map
33 | self.start_line = start_line
34 | self.end_line = end_line
35 |
36 | self.start_pos = (0,0)
37 | self.start_velocity = start_velocity
38 |
39 | self.c_state = (0,0,0,0)
40 | self.c_action = (0,0)
41 | self.c_reward = 0
42 | self.next_state = (0,0,0,0)
43 |
44 | def get_states(self):
45 |
46 | states = []
47 | rmx_n = self.race_map.shape[0]
48 | rmy_n = self.race_map.shape[1]
49 | vx_n = np.arange(0,5).shape[0]
50 | vy_n = np.arange(0,5).shape[0]
51 | states_num = rmx_n * rmy_n * vx_n * vy_n
52 | for i in range(rmx_n):
53 | for j in range(rmy_n):
54 | for m in range(0,5):
55 | for k in range(0,5):
56 | states.append((i,j,m,k))
57 | return states
58 |
59 | def get_actions(self):
60 |
61 | actions = []
62 | ax = np.arange(-1,2).shape[0]
63 | ay = np.arange(-1,2).shape[0]
64 | for i in range(-1,2):
65 | for j in range(-1,2):
66 | actions.append((i,j))
67 | return actions
68 |
69 | def pass_endLine(self,pre_pos,lat_pos):
70 |
71 | endLine_lx = end_line[0][0]
72 | endLine_hx = end_line[-1][0]
73 | endLine_y = end_line[0][1]
74 |
75 | if pre_pos[1] < endLine_y and lat_pos[1] >= endLine_y and (pre_pos[0] + lat_pos[0])/2 \
76 | <= endLine_hx and (pre_pos[0] + lat_pos[0])/2 >= endLine_lx:
77 | return True
78 |
79 | else:
80 | return False
81 |
82 |
83 | def avg_return_per_episode(self,ep):
84 |
85 | ep_length = len(ep)
86 | ep_return = 0.0
87 |
88 | for i in range(2,ep_length,3):
89 | ep_return += ep[i]
90 | return ep_return * 1.0
91 |
92 | def episode_generator(self,policy,max_num,is_greedy):
93 |
94 |
95 | self.start_pos = self.start_line[np.random.randint(0,len(start_line)-1)]
96 |
97 | start_state = (self.start_pos[0],self.start_pos[1],self.start_velocity[0],self.start_velocity[1])
98 | self.c_state = start_state
99 | self.next_state = start_state
100 |
101 | episode = []
102 | episode.append(self.c_state)
103 |
104 | #print("start_state:",c_state)
105 | n = 0
106 |
107 | while not self.isTerminated() and n < max_num:
108 |
109 |
110 | self.c_state = self.next_state
111 |
112 | action_list = self.get_actions()
113 |
114 | action_prob = policy[self.c_state]
115 |
116 | #print("action_list: ",action_list)
117 | print("action_prob: ",action_prob)
118 |
119 | if not is_greedy:
120 | self.c_action = action_list[np.random.choice(len(action_list),1,p=action_prob)[0]]
121 | else:
122 | self.c_action = action_list[np.argmax(action_prob)]
123 |
124 | self.c_state,self.c_action,self.c_reward,self.next_state = self.oneStep_generator()
125 |
126 | print "c_state:",self.c_state
127 | print "c_action:", self.c_action
128 | print "c_reward:", self.c_reward
129 | print "next_state:",self.next_state
130 |
131 | episode.append(self.c_action)
132 | episode.append(self.c_reward)
133 | episode.append(self.next_state)
134 | n += 1
135 | print "n:",n
136 |
137 |
138 |
139 |
140 | print("episode generated!")
141 | return episode
142 |
143 |
144 | def oneStep_generator(self):
145 |
146 | # return [s,a,r,s']
147 |
148 | self.next_state = (0, 0, 0, 0)
149 | self.c_reward = 0
150 |
151 | # guarantee that each velocity component stays within [0, 4]
152 | c_velocity = (max(min(self.c_state[2] + self.c_action[0], 4), 0), max(min(self.c_state[3] + self.c_action[1], 4), 0))
153 |
154 | x_state = (self.c_state[0] - c_velocity[1], self.c_state[1] + c_velocity[0], c_velocity[0], c_velocity[1])
155 |
156 | if x_state[0] < 0 or x_state[0] > 13 or x_state[1] < 0 or x_state[1] > 19 or self.race_map[
157 | x_state[0], x_state[1]] == 0:
158 |
159 | tmp_pos = self.start_line[np.random.randint(0, len(self.start_line) - 1)]
160 | self.next_state = (tmp_pos[0], tmp_pos[1], 0, 0)
161 | self.c_reward = -5
162 |
163 | elif c_velocity[0] == 0 and c_velocity[1] == 0:
164 |
165 | if np.random.choice(2, 1, p=[0.5, 0.5])[0] == 0:
166 | self.next_state = (x_state[0], x_state[1], 1, x_state[3])
167 |
168 | else:
169 | self.next_state = (x_state[0], x_state[1], x_state[2], 1)
170 |
171 | self.c_reward = -5
172 |
173 | else:
174 | self.next_state = x_state
175 | self.c_reward = -1
176 |
177 | return [self.c_state,self.c_action,self.c_reward,self.next_state]
178 |
179 |
180 |
181 | def getInitState(self):
182 |
183 | start_pos = self.start_line[np.random.randint(0, len(start_line) - 1)]
184 | return (start_pos[0],start_pos[1],self.start_velocity[0],self.start_velocity[1])
185 |
186 | '''
187 | def isTerminated(self,n,max_timestep):
188 |
189 |
190 |
191 | pre_pos = (self.c_state[0],self.c_state[1])
192 | lat_pos = (self.next_state[0],self.next_state[1])
193 |
194 | return self.pass_endLine(pre_pos,lat_pos) or n > max_timestep
195 | '''
196 |
197 | def isTerminated(self):
198 |
199 | if (self.next_state[0],self.next_state[1]) in self.end_line:
200 | return True
201 | else:
202 | return False
203 |
204 | class CartPole:
205 | def __init__(self):
206 | self.env = gym.make('CartPole-v0')
207 | def get_env(self):
208 | return self.env
--------------------------------------------------------------------------------
/RLF/env.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/RLF/env.pyc
--------------------------------------------------------------------------------
/RLF/main.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from env import *
5 | from result_analysis import *
6 | from monte_carlo import *
7 | from td import *
8 | from td_lambda import *
9 | from DQN import *
10 |
11 |
12 | if __name__ == '__main__':
13 |
14 |
15 | Lamborghini = RaceCar()
16 | Lam_states = Lamborghini.get_states()
17 | Lam_actions = Lamborghini.get_actions()
18 |
19 | '''
20 | MC = MonteCarlo(Lam_states,Lam_actions)
21 | learning_type = 'on-policy'
22 | MC.set_policy(learning_type)
23 |
24 | avg_ep_return_list = MC.on_policy_learning(Lamborghini,4000,0.2,200,50)
25 | '''
26 | '''
27 | TD = TemporalDifference(Lam_states,Lam_actions)
28 | TD.set_policy('sarsa')
29 | avg_ep_return_list = TD.sarsa_learning(Lamborghini,3000,0.1,0.5,1,200,50)
30 | xdata = range(0,3000,50)
31 | ydata = avg_ep_return_list
32 | '''
33 | '''
34 | TD = TemporalDifference(Lam_states, Lam_actions)
35 | TD.set_policy('q-learning')
36 | avg_ep_return_list = TD.Q_learning(Lamborghini, 10000, 0.1, 0.5, 1, 200, 100)
37 | xdata = range(0, 10000, 100)
38 | ydata = avg_ep_return_list
39 | '''
40 | '''
41 | td_lambda = Temporal_Difference_lambda(Lam_states,Lam_actions)
42 | td_lambda.set_policy('sarsa_lambda')
43 | avg_ep_return_list = td_lambda.sarsa_lambda(Lamborghini,2000,0.1,0.5,1,0.9,200,50)
44 |
45 | xdata = range(0,2000,50)
46 | ydata = avg_ep_return_list
47 | '''
48 | '''
49 | td_lambda = Temporal_Difference_lambda(Lam_states,Lam_actions)
50 | td_lambda.set_policy('naive_Q_lambda')
51 | avg_ep_return_list = td_lambda.sarsa_lambda(Lamborghini, 2000, 0.1, 0.5, 1, 0.9, 200, 50)
52 |
53 | xdata = range(0, 2000, 50)
54 | ydata = avg_ep_return_list
55 |
56 | fig = Line_Chart(xdata,ydata,"index","avg return","assessment of racetrack "+'with naive Q-lambda')
57 | fig.Draw_LineChart("")
58 | '''
59 |
60 | CartPole = CartPole()
61 | env = CartPole.get_env()
62 |
63 | DQN = DQN(env)
64 | DQN.Deep_Q_Learning(env,500,50000,32,0)
65 |
66 |
--------------------------------------------------------------------------------
/RLF/monte_carlo.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from env import *
5 | import time
6 |
7 | class MonteCarlo:
8 |
9 | def __init__(self,state_list,action_list):
10 | self.states = state_list
11 | self.actions = action_list
12 |
13 | self.state_num = len(self.states)
14 | self.action_num = len(self.actions)
15 |
16 | self.returns = dict()
17 | self.Q = dict()
18 | self.N = dict()
19 | self.D = dict()
20 |
21 | for s in self.states:
22 | #self.Q[s] = np.random.random(self.action_num)
23 | self.Q[s] = np.zeros(self.action_num)
24 | self.N[s] = np.zeros(self.action_num)
25 | self.D[s] = np.zeros(self.action_num)
26 |
27 | self.returns[s] = [[] for _ in xrange(self.action_num)]
28 |
29 |
30 | def set_policy(self,learning_type):
31 | self.pi = dict()
32 | self.mu = dict()
33 |
34 | if learning_type == 'off-policy':
35 | for s in self.states:
36 | idx = np.random.randint(0,self.action_num,size=1)[0]
37 | self.pi[s] = np.zeros(self.action_num)
38 | self.pi[s][idx] = 1.0
39 |
40 | self.mu[s] = np.random.random(self.action_num)
41 | self.mu[s] = self.mu[s] / np.sum(self.mu[s])
42 | elif learning_type == 'on-policy':
43 | for s in self.states:
44 | self.pi[s] = np.random.random(self.action_num)
45 | self.pi[s] = self.pi[s] / np.sum(self.pi[s])
46 | else:
47 | pass
48 |
49 | def get_policy(self,learning_type):
50 | if learning_type == 'off-policy':
51 | return {'target policy':self.pi,'behavior policy':self.mu}
52 |
53 | elif learning_type == 'on-policy':
54 | return self.pi
55 |
56 | def off_policy_learning(self,env,episode_num,epsilon,max_timestep,eval_interval):
57 |
58 | ep_idx = 0
59 | avg_ep_return_list = []
60 | while ep_idx < episode_num:
61 |
62 |
63 | if ep_idx % eval_interval == 0:
64 | eval_ep = env.episode_generator(self.pi,max_timestep,True)
65 | print("eval episode length:%d" %(len(eval_ep)/3))
66 | c_avg_return = env.avg_return_per_episode(eval_ep)
67 | avg_ep_return_list.append(c_avg_return)
68 | print("assessing return:%f" %c_avg_return)
69 |
70 | c_ep = env.episode_generator(self.mu,max_timestep,False)
71 | ep_length = len(c_ep)
72 |
73 | print("processing the %dth episode:" %ep_idx)
74 | print("episode length:%d\n" %(ep_length/3))
75 | latest_time = ep_length - 3
76 | checked_sa = set()
77 |
78 | for i in range(ep_length-3,-1,-3):
79 | tmp_s = c_ep[i-1]
80 | if np.where(self.actions == c_ep[i]) == np.where(self.pi[tmp_s] == 1.0):
81 | continue
82 | else:
83 | latest_time = i
84 | print("latest_time:%d" %(i/3))
85 | break
86 | for j in range(latest_time+2,ep_length-3,3):
87 | if c_ep[j] not in checked_sa:
88 | checked_sa.add(c_ep[j])
89 | W = 1.0
90 | G = 0
91 | G += c_ep[j+2]
92 | sa_idx = np.where(self.actions == c_ep[j+1])
93 | for m in range(j+3,ep_length,3):
94 | W *= 1.0 / self.mu[c_ep[m]][np.where(self.actions == c_ep[m+1])]
95 | G += c_ep[m+2]
96 | self.N[c_ep[j]][sa_idx] += W * G
97 | self.D[c_ep[j]][sa_idx] += W
98 | self.Q[c_ep[j]][sa_idx] = self.N[c_ep[j]][sa_idx] / self.D[c_ep[j]][sa_idx]
99 |
100 | for s in self.states:
101 | best_action_idx = np.argmax(self.Q[s])
102 | #print("best-action:",self.actions[best_action_idx])
103 | self.pi[s] = [0.0] * self.action_num
104 | self.pi[s][best_action_idx] = 1.0
105 |
106 | ep_idx += 1
107 | return avg_ep_return_list
108 |
109 | def on_policy_learning(self,env,episode_num,epsilon,max_timestep,eval_interval):
110 |
111 | ep_idx = 0
112 | avg_ep_return_list = []
113 |
114 | while ep_idx < episode_num:
115 |
116 | if ep_idx % eval_interval == 0:
117 | eval_ep = env.episode_generator(self.pi,max_timestep,True)
118 | print("eval episode length:%d" %(len(eval_ep)/3))
119 | c_avg_return = env.avg_return_per_episode(eval_ep)
120 | avg_ep_return_list.append(c_avg_return)
121 | print("assessing return:%f" %c_avg_return)
122 | print "avg return list length:",len(avg_ep_return_list)
123 |
124 | start_time = time.time()
125 | c_ep = env.episode_generator(self.pi,max_timestep,False)
126 |
127 |
128 |
129 | time1 = time.time()
130 | ep_length = len(c_ep)
131 |
132 | if not ep_length:
133 | print("episode is empty!")
134 | else:
135 | print("processing the %dth episode:" %ep_idx)
136 | print("episode length:%d\n" %(ep_length/3))
137 |
138 | checked_pair = set()
139 | for i in range(0,ep_length-1,3):
140 | sa_pair = (c_ep[i],np.where((np.array(self.actions)==c_ep[i+1]).all(1))[0][0])
141 | if sa_pair not in checked_pair:
142 | r = 0
143 | r = r + c_ep[i+2]
144 | for j in range(i+2+3,ep_length,3):
145 | r += c_ep[j]
146 | self.returns[sa_pair[0]][sa_pair[1]].append(r)
147 | checked_pair.add(sa_pair)
148 | self.Q[sa_pair[0]][sa_pair[1]] = sum(self.returns[sa_pair[0]][sa_pair[1]]) * 1.0 / len(self.returns[sa_pair[0]][sa_pair[1]])
149 | print("Q have been calculated!")
150 |
151 | time2 = time.time()
152 | checked_states = set()
153 | for i in range(0,ep_length,3):
154 | s = c_ep[i]
155 | tmpList_sa = []
156 | if s not in checked_states:
157 | checked_states.add(s)
158 |
159 | best_action = self.actions[np.random.choice(np.where(self.Q[s] == np.amax(self.Q[s]))[0])]
160 | print ("best_action: ",best_action)
161 |
162 | for aix in range(self.action_num):
163 | if self.actions[aix] == best_action:
164 | self.pi[s][aix] = 1 - epsilon + epsilon / self.pi[s].shape[0]
165 | else:
166 | self.pi[s][aix] = epsilon / self.pi[s].shape[0]
167 | print("policy updated done!")
168 | time3 = time.time()
169 | ep_idx += 1
170 |
171 | print("ep generator time:{:.2f}s".format(time1 - start_time))
172 | print("Q cal time:{:.2f}s".format(time2 - time1))
173 | print("policy update time:{:.2f}s".format(time3 - time2))
174 | return avg_ep_return_list
175 |
176 |
177 |
178 | '''
179 | Lamborghini = RaceCar()
180 | Lam_states = Lamborghini.get_states()
181 | Lam_actions = Lamborghini.get_actions()
182 |
183 | MC = MonteCarlo(Lam_states,Lam_actions)
184 | MC.set_policy('off-policy')
185 |
186 | ep = Lamborghini.episode_generator(MC.mu,200)
187 | print ep
188 | '''
189 |
190 |
191 |
192 |
--------------------------------------------------------------------------------
/RLF/monte_carlo.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/RLF/monte_carlo.pyc
--------------------------------------------------------------------------------
/RLF/naive_Q_lambda_RACETRACK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/RLF/naive_Q_lambda_RACETRACK.png
--------------------------------------------------------------------------------
/RLF/result_analysis.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 |
6 | class Line_Chart:
7 | def __init__(self,xdata,ydata,xlabel,ylabel,title):
8 | self.xdata = xdata
9 | self.ydata = ydata
10 | self.xlabel = xlabel
11 | self.ylabel = ylabel
12 | self.title = title
13 | def Draw_LineChart(self,label):
14 |
15 | plt.title(self.title)
16 | plt.xlabel(self.xlabel)
17 | plt.ylabel(self.ylabel)
18 |
19 | plt.plot(self.xdata,self.ydata,'y',label=label)
20 | plt.grid()
21 |
22 | plt.show()
23 |
--------------------------------------------------------------------------------
/RLF/result_analysis.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/RLF/result_analysis.pyc
--------------------------------------------------------------------------------
/RLF/td.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import numpy as np
3 |
4 |
5 | class TemporalDifference:
6 |
7 | def __init__(self,state_list,action_list):
8 | self.states = state_list
9 | self.actions = action_list
10 |
11 |
12 | self.state_num = len(self.states)
13 | self.action_num = len(self.actions)
14 |
15 | self.Q = dict()
16 | for s in self.states:
17 | #self.Q[s] = np.random.random(self.action_num)
18 | self.Q[s] = np.zeros(self.action_num)
19 |
20 |
21 | def set_policy(self,learning_type):
22 |
23 | self.pi = dict()
24 | self.mu = dict()
25 |
26 | if learning_type == 'sarsa':
27 | for s in self.states:
28 | self.pi[s] = np.random.random(self.action_num)
29 | self.pi[s] = self.pi[s] / np.sum(self.pi[s])
30 |
31 | elif learning_type == 'q-learning':
32 | for s in self.states:
33 | idx = np.random.randint(0,self.action_num,size=1)[0]
34 | self.pi[s] = np.zeros(self.action_num)
35 | self.pi[s][idx] = 1.0
36 |
37 | self.mu[s] = np.random.random(self.action_num)
38 | self.mu[s] = self.mu[s] / np.sum(self.mu[s])
39 |
40 |
41 | def sarsa_learning(self,env,episode_num,epsilon,alpha,gamma,max_timestep,eval_interval):
42 |
43 | ep_idx = 0
44 | avg_ep_return_list = []
45 |
46 | while ep_idx < episode_num:
47 |
48 | if ep_idx % eval_interval == 0:
49 | eval_ep = env.episode_generator(self.pi,max_timestep,True)
50 | print("eval episode length:%d" %(len(eval_ep)/3))
51 | c_avg_return = env.avg_return_per_episode(eval_ep)
52 | avg_ep_return_list.append(c_avg_return)
53 | print("assessing return:%f" %c_avg_return)
54 | print "avg return list length:",len(avg_ep_return_list)
55 |
56 | ep_idx += 1
57 |
58 | env.c_state = env.getInitState()
59 | env.next_state = env.c_state
60 |
61 | n = 0
62 |
63 | c_action_idx = np.random.choice(self.action_num, 1, p=self.pi[env.c_state])[0]
64 | env.c_action = self.actions[c_action_idx]
65 |
66 | #print "episode index:",ep_idx
67 | #print "env termination:",env.isTerminated()
68 |
69 | while not (env.isTerminated() or n >= max_timestep) :
70 |
71 | env.c_state = env.next_state
72 | env.c_action = self.actions[c_action_idx]
73 | #print "policy:",self.pi
74 |
75 | env.c_state,env.c_action,env.c_reward,env.next_state = env.oneStep_generator()
76 |
77 | next_action_idx = np.random.choice(self.action_num,1,p=self.pi[env.next_state])[0]
78 |
79 | self.Q[env.c_state][c_action_idx] += alpha * (env.c_reward + gamma * self.Q[env.next_state][next_action_idx] - self.Q[env.c_state][c_action_idx])
80 |
81 | # --------policy update at same time---------#
82 | c_best_action_idx = np.argmax(self.Q[env.c_state])
83 |
84 | for action_idx in range(self.action_num):
85 | if action_idx == c_best_action_idx:
86 | self.pi[env.c_state][action_idx] = 1 - epsilon + epsilon / self.action_num
87 | else:
88 | self.pi[env.c_state][action_idx] = epsilon / self.action_num
89 |
90 | c_action_idx = next_action_idx
91 |
92 | n += 1
93 | #print "n:",n
94 |
95 | return avg_ep_return_list
96 |
97 | def Q_learning(self,env,episode_num,epsilon,alpha,gamma,max_timestep,eval_interval):
98 |
99 | ep_idx = 0
100 | avg_ep_return_list = []
101 | while ep_idx < episode_num:
102 |
103 | if ep_idx % eval_interval == 0:
104 | eval_ep = env.episode_generator(self.pi,max_timestep,True)
105 | print("eval episode length:%d" %(len(eval_ep)/3))
106 | c_avg_return = env.avg_return_per_episode(eval_ep)
107 | avg_ep_return_list.append(c_avg_return)
108 | print("assessing return:%f" %c_avg_return)
109 | print "avg return list length:",len(avg_ep_return_list)
110 |
111 | ep_idx += 1
112 |
113 | env.c_state = env.getInitState()
114 | env.next_state = env.c_state
115 |
116 | n = 0
117 |
118 | while n < max_timestep and not env.isTerminated():
119 |
120 | env.c_state = env.next_state
121 |
122 | c_action_idx = np.random.choice(self.action_num,1,p=self.mu[env.c_state])[0]
123 | env.c_action = self.actions[c_action_idx]
124 |
125 |
126 | env.c_state, env.c_action, env.c_reward, env.next_state = env.oneStep_generator()
127 |
128 | #print "c_state:",env.c_state
129 | #print "c_action:",env.c_action
130 | #print "c_reward:",env.c_reward
131 | #print "next_state:",env.next_state
132 | #print "c_state mu:",self.mu[env.c_state]
133 |
134 |
135 |
136 | self.Q[env.c_state][c_action_idx] += alpha * (
137 | env.c_reward + gamma * np.amax(self.Q[env.next_state]) - self.Q[env.c_state][c_action_idx])
138 |
139 |
140 |
141 | c_best_action_idx = np.argmax(self.Q[env.c_state])
142 |
143 | #print "c_state Q:",self.Q[env.c_state]
144 | #print "c_best_action_idx:",c_best_action_idx
145 |
146 | for action_idx in range(self.action_num):
147 | if action_idx == c_best_action_idx:
148 | self.mu[env.c_state][action_idx] = 1 - epsilon + epsilon/self.action_num
149 | else:
150 | self.mu[env.c_state][action_idx] = epsilon/self.action_num
151 |
152 |
153 | # --------policy update at same time---------#
154 | for action_idx in range(self.action_num):
155 | if action_idx == c_best_action_idx:
156 | self.pi[env.c_state][action_idx] = 1.0
157 | else:
158 | self.pi[env.c_state][action_idx] = 0.0
159 |
160 | n += 1
161 |
162 | return avg_ep_return_list
--------------------------------------------------------------------------------
/RLF/td.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/RLF/td.pyc
--------------------------------------------------------------------------------
/RLF/td_lambda.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import numpy as np
3 |
4 | class Temporal_Difference_lambda:
5 |
6 | def __init__(self,state_list,action_list):
7 |
8 | self.states = state_list
9 | self.actions = action_list
10 |
11 | self.state_num = len(self.states)
12 | self.action_num = len(self.actions)
13 |
14 | self.Q = dict()
15 | for s in self.states:
16 | self.Q[s] = np.random.random(self.action_num)
17 | #self.Q[s] = np.zeros(self.action_num)
18 |
19 | self.Z = dict()
20 | for s in self.states:
21 | self.Z[s] = np.zeros(self.action_num)
22 |
23 | def reset_Z(self):
24 |
25 | for s in self.states:
26 | self.Z[s] = np.zeros(self.action_num)
27 |
28 | def set_policy(self,learning_type):
29 |
30 | self.pi = dict()
31 | self.mu = dict()
32 |
33 | if learning_type == 'sarsa_lambda':
34 | for s in self.states:
35 | self.pi[s] = np.random.random(self.action_num)
36 | self.pi[s] = self.pi[s] / np.sum(self.pi[s])
37 | elif learning_type == 'naive_Q_lambda':
38 | for s in self.states:
39 | idx = np.random.randint(0, self.action_num, size=1)[0]
40 | self.pi[s] = np.zeros(self.action_num)
41 | self.pi[s][idx] = 1.0
42 |
43 | self.mu[s] = np.random.random(self.action_num)
44 | self.mu[s] = self.mu[s] / np.sum(self.mu[s])
45 |
46 |
47 |
48 | def sarsa_lambda(self,env,episode_num,epsilon,alpha,gamma,Lambda,max_timestep,eval_interval):
49 |
50 | ep_idx = 0
51 | avg_ep_return_list = []
52 |
53 | while ep_idx < episode_num:
54 |
55 | if ep_idx % eval_interval == 0:
56 | eval_ep = env.episode_generator(self.pi, max_timestep, True)
57 | print("eval episode length:%d" % (len(eval_ep) / 3))
58 | c_avg_return = env.avg_return_per_episode(eval_ep)
59 | avg_ep_return_list.append(c_avg_return)
60 | print("assessing return:%f" % c_avg_return)
61 | print "avg return list length:", len(avg_ep_return_list)
62 |
63 | ep_idx += 1
64 | print "ep_idx:",ep_idx
65 |
66 | self.reset_Z()
67 |
68 | env.c_state = env.getInitState()
69 | env.next_state = env.c_state
70 |
71 | c_action_idx = np.random.choice(self.action_num, 1, p=self.pi[env.c_state])[0]
72 | #env.c_action = self.actions[c_action_idx]
73 |
74 |
75 | n = 0
76 |
77 | while not (env.isTerminated() or n >= max_timestep):
78 |
79 | env.c_state = env.next_state
80 | env.c_action = self.actions[c_action_idx]
81 |
82 | env.c_state, env.c_action, env.c_reward, env.next_state = env.oneStep_generator()
83 |
84 | next_action_idx = np.random.choice(self.action_num, 1, p=self.pi[env.next_state])[0]
85 |
86 | delta = env.c_reward + gamma * self.Q[env.next_state][next_action_idx] - self.Q[env.c_state][c_action_idx]
87 |
88 | self.Z[env.c_state][c_action_idx] += 1
89 |
90 | for s in self.states:
91 | for i in range(self.action_num):
92 | self.Q[s][i] += alpha * delta * self.Z[s][i]
93 | self.Z[s][i] *= gamma * Lambda
94 |
95 | # --------policy update at same time---------#
96 | c_best_action_idx = np.argmax(self.Q[env.c_state])
97 |
98 | for action_idx in range(self.action_num):
99 | if action_idx == c_best_action_idx:
100 | self.pi[env.c_state][action_idx] = 1 - epsilon + epsilon / self.action_num
101 | else:
102 | self.pi[env.c_state][action_idx] = epsilon / self.action_num
103 |
104 |
105 | c_action_idx = next_action_idx
106 | n += 1
107 | print "n:",n
108 |
109 | return avg_ep_return_list
110 |
111 | # To improve on the performance of Watkins's Q(lambda) while avoiding the complexity of Peng's Q(lambda), we use naive Q(lambda)
112 | def naive_Q_lambda(self,env,episode_num,epsilon,alpha,gamma,Lambda,max_timestep,eval_interval):
113 |
114 | ep_idx = 0
115 | avg_ep_return_list = []
116 |
117 | while ep_idx < episode_num:
118 |
119 | if ep_idx % eval_interval == 0:
120 | eval_ep = env.episode_generator(self.pi, max_timestep, True)
121 | print("eval episode length:%d" % (len(eval_ep) / 3))
122 | c_avg_return = env.avg_return_per_episode(eval_ep)
123 | avg_ep_return_list.append(c_avg_return)
124 | print("assessing return:%f" % c_avg_return)
125 | print "avg return list length:", len(avg_ep_return_list)
126 |
127 | ep_idx += 1
128 | print "ep_idx:", ep_idx
129 |
130 | self.reset_Z()
131 |
132 | env.c_state = env.getInitState()
133 | env.next_state = env.c_state
134 |
135 | c_action_idx = np.random.choice(self.action_num, 1, p=self.mu[env.c_state])[0]
136 | # env.c_action = self.actions[c_action_idx]
137 |
138 |
139 | n = 0
140 |
141 | while not (env.isTerminated() or n >= max_timestep):
142 | env.c_state = env.next_state
143 | env.c_action = self.actions[c_action_idx]
144 |
145 | env.c_state, env.c_action, env.c_reward, env.next_state = env.oneStep_generator()
146 |
147 | next_action_idx = np.random.choice(self.action_num, 1, p=self.mu[env.next_state])[0]
148 | next_best_action_idx = np.argmax(self.Q[env.next_state])
149 |
150 | delta = env.c_reward + gamma * self.Q[env.next_state][next_best_action_idx] - self.Q[env.c_state][c_action_idx]
151 | self.Z[env.c_state][c_action_idx] += 1
152 |
153 |
154 | if next_action_idx == next_best_action_idx:
155 | for s in self.states:
156 | for i in range(self.action_num):
157 | self.Q[s][i] += alpha * delta * self.Z[s][i]
158 | self.Z[s][i] *= Lambda * gamma
159 | else:
160 | for s in self.states:
161 | for i in range(self.action_num):
162 | self.Q[s][i] += alpha * delta * self.Z[s][i]
163 | self.Z[s][i] = 0.5
164 |
165 | c_best_action_idx = np.argmax(self.Q[env.c_state])
166 |
167 |
168 |
169 |
170 |
171 | # ------- update behavior policy --------- #
172 | for action_idx in range(self.action_num):
173 | if action_idx == c_best_action_idx:
174 | self.mu[env.c_state][action_idx] = 1 - epsilon + epsilon / self.action_num
175 | else:
176 | self.mu[env.c_state][action_idx] = epsilon / self.action_num
177 |
178 | # --------target policy update at same time---------#
179 | for action_idx in range(self.action_num):
180 | if action_idx == c_best_action_idx:
181 | self.pi[env.c_state][action_idx] = 1.0
182 | else:
183 | self.pi[env.c_state][action_idx] = 0.0
184 |
185 | c_action_idx = next_action_idx
186 | n += 1
187 |
188 | return avg_ep_return_list
--------------------------------------------------------------------------------
/RLF/td_lambda.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/RLF/td_lambda.pyc
--------------------------------------------------------------------------------
/chapter4/Jack’s_Car_Rental.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | # This is the code for programming exercise 4.5, "Jack’s Car Rental", in the book "reinforcement-learning-an-introduction" #
4 | # To better understand the code, please check Figure 4.3 in chapter 4 for the pseudocode #
5 | # The code draws on the code published at https://github.com/ShangtongZhang/reinforcement-learning-an-introduction #
6 | # Author: Xubo Lv (lv_xubo@163.com) 2016.05 #
7 | ##################################################################################################################################
8 | import sys,os
9 | import numpy as np
10 | import random
11 | import copy
12 | from math import *
13 |
14 | # [1] Initialization
15 |
16 | # states num in location 1
17 | num_first = 21
18 | # states num in location 2
19 | num_second = 21
20 | # rent reward per car
21 | rent_reward = 10
22 | # move reward per car
23 | move_reward = -2
24 | #park reward for superflous cars
25 | park_reward = -4
26 |
27 | #discount parameter
28 | DISCOUNT = 0.9
29 |
30 | #poisson distribution expected value
31 | Poisson_request_first = 3
32 | Poisson_return_first = 3
33 | Poisson_request_second = 4
34 | Poisson_return_second = 2
35 |
36 |
37 | # Vital!! for n >= 11, the Poisson probability is nearly zero given the expected values above,
38 | # so we ignore what happens when n >= 11
39 | Poisson_upbound = 11
40 |
41 | # max cars limit
42 | Max_cars = 20
43 | # max moves limit
44 | Max_moves = 5
45 |
46 | # the number of states
47 | num_states = num_first * num_second
48 | # value matrix for every state
49 | value = np.zeros((num_first,num_second))
50 | # policy matrix for every state
51 | policy = np.zeros((num_first,num_second))
52 |
53 | states = []
54 | actions = []
55 |
56 | # initialization of states
57 | for i in range(0,num_first):
58 | for j in range(0,num_second):
59 | states.append([i,j])
60 | # all possible actions
61 | for i in range(-Max_moves,Max_moves+1,1):
62 | actions.append(i)
63 | # arbitrary initial policy
64 | for i in range(0,num_first):
65 | for j in range(0,num_second):
66 | #policy[i][j] = random.randint(-5,5)
67 | policy[i][j] = 0
68 |
69 |
70 | # store the probability of every possible rental or return count, to avoid duplicate calculation
71 | poisson_table = dict()
72 | def poisson(n,lam):
73 |
74 | global poisson_table
75 | # to make no duplicate keys
76 | key = n * 10 + lam
77 | if key not in poisson_table.keys():
78 | poisson_table[key] = exp(-lam) * pow(lam,n) / factorial(n)
79 |
80 | return poisson_table[key]
81 |
82 |
83 | # core part: calculation of the expected return for every state under every possible action
84 | # the returns are used to update the value matrix
85 | def expect_return(single_state,single_action,value):
86 |
87 | returns = 0.0
88 | if (single_action >= 1):
89 | returns += move_reward * (single_action - 1)
90 | else:
91 | returns += move_reward * abs(single_action)
92 |
93 | _NumOfCarsFirst = int(min(single_state[0]-single_action,Max_cars))
94 | _NumOfCarsSecond = int(min(single_state[1]+single_action,Max_cars))
95 |
96 | if _NumOfCarsFirst > 10:
97 | returns += park_reward
98 | if _NumOfCarsSecond > 10:
99 | returns += park_reward
100 |
101 |
102 | for rental_request_first in range(0,Poisson_upbound):
103 | for rental_request_second in range(0,Poisson_upbound):
104 |
105 | #NumOfCarsFirst = int(min(single_state[0]-single_action,Max_cars))
106 | #NumOfCarsSecond = int(min(single_state[1]+single_action,Max_cars))
107 |
108 | NumOfCarsFirst = _NumOfCarsFirst
109 | NumOfCarsSecond = _NumOfCarsSecond
110 |
111 | realRentalFirst = min(NumOfCarsFirst,rental_request_first)
112 | realRentalSecond = min(NumOfCarsSecond,rental_request_second)
113 |
114 | NumOfCarsFirst -= realRentalFirst
115 | NumOfCarsSecond -= realRentalSecond
116 |
117 | reward = (realRentalFirst + realRentalSecond) * rent_reward
118 | prob = poisson(rental_request_first,Poisson_request_first) * \
119 | poisson(rental_request_second,Poisson_request_second)
120 |
121 | constant_return = True
122 | if constant_return:
123 | rental_return_first = Poisson_return_first
124 | rental_return_second = Poisson_return_second
125 |
126 | NumOfCarsFirst = min(NumOfCarsFirst + rental_return_first,Max_cars)
127 | NumOfCarsSecond = min(NumOfCarsSecond + rental_return_second,Max_cars)
128 |
129 | returns += prob * (reward + DISCOUNT * value[NumOfCarsFirst,NumOfCarsSecond])
130 |
131 | else:
132 | # vital!! temporary storage so that NumOfCarsFirst/NumOfCarsSecond are not modified by the following for loop
133 | NumOfCarsFirst_ = NumOfCarsFirst
134 | NumOfCarsSecond_ = NumOfCarsSecond
135 | prob_ = prob
136 |
137 | for rental_return_first in range(0,Poisson_upbound):
138 | for rental_return_second in range(0,Poisson_upbound):
139 |
140 | NumOfCarsFirst = NumOfCarsFirst_
141 | NumOfCarsSecond = NumOfCarsSecond_
142 | prob = prob_
143 |
144 | NumOfCarsFirst = min(NumOfCarsFirst + rental_return_first,Max_cars)
145 | NumOfCarsSecond = min(NumOfCarsSecond + rental_return_second,Max_cars)
146 |
147 | prob = poisson(rental_return_first,Poisson_return_first) *\
148 | poisson(rental_return_second,Poisson_return_second) * prob
149 |
150 | returns += prob * (reward + DISCOUNT * value[NumOfCarsFirst,NumOfCarsSecond])
151 |
152 | return returns
153 |
154 |
155 |
156 |
157 |
158 | if __name__ == "__main__":
159 |
160 |
161 | theta = 1e-4
162 | newStateValue = np.zeros((Max_cars + 1, Max_cars + 1))
163 | while True:
164 | # [2] Policy Evaluation
165 |
166 | # the commented-out part is the in-place version of policy iteration
167 | '''
168 | error = 0
169 | print "policy evaluation ..."
170 | for state in states:
171 | tmp = value[state[0],state[1]]
172 | value[state[0],state[1]] = expect_return(state,policy[state[0],state[1]],value)
173 | error = max(error,abs(tmp-value[state[0],state[1]]))
174 |
175 | print ("error:",error)
176 | if error > theta:
177 | continue
178 | '''
179 |
180 | for i, j in states:
181 | newStateValue[i,j] = expect_return([i, j], policy[i, j], value)
182 | error = np.sum(np.abs(newStateValue - value))
183 | print ("error:",error)
184 | if error >= 1e-4:
185 | value[:] = newStateValue
186 | continue
187 | value[:] = newStateValue
188 |
189 | # [3] policy improvement
190 |
191 | # the commented-out part is the in-place version of policy iteration
192 | '''
193 | print "evaluation done"
194 | policy_stable = True
195 |
196 | print "policy improvement ..."
197 | for state in states:
198 | tmp = policy[state[0],state[1]]
199 | action_seq = []
200 | for a in actions:
201 | if (a >= 0 and a <= state[0]) or (a < 0 and abs(a) <= state[1]):
202 | action_seq.append(expect_return(state,a,value))
203 | else:
204 | action_seq.append(-float('inf'))
205 |
206 | best_action_th = np.argmax(action_seq)
207 | policy[state[0],state[1]] = actions[best_action_th]
208 |
209 | if tmp != policy[state[0],state[1]]:
210 | policy_stable = False
211 |
212 |
213 | if policy_stable:
214 | break
215 | else:
216 | print "need evaluation again ..."
217 | '''
218 |
219 | print "evaluation done"
220 | print "policy improvement ..."
221 | newPolicy = np.zeros((Max_cars + 1, Max_cars + 1))
222 | for state in states:
223 | action_seq = []
224 | for a in actions:
225 |                 # here is where the former problem lay: the condition a >= 0
226 |                 # if you are not very careful about these conditions, the final result will contain a slight bias
227 | if (a >= 0 and a <= state[0]) or (a < 0 and abs(a) <= state[1]):
228 | action_seq.append(expect_return(state,a,value))
229 | else:
230 | action_seq.append(-float('inf'))
231 |
232 | best_action_th = np.argmax(action_seq)
233 | newPolicy[state[0], state[1]] = actions[best_action_th]
234 |
235 | policyChanges = np.sum(newPolicy != policy)
236 | print('Policy for', policyChanges, 'states changed')
237 | if policyChanges == 0:
238 | policy = newPolicy
239 | break
240 | policy = newPolicy
241 | print ("need another evaluation...")
242 |
243 |
244 | print "optimal value function:",value
245 | print "optimal policy:",policy
246 |
--------------------------------------------------------------------------------
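
Jack's_Car_Rental.py above relies on a poisson(n, lam) helper for the rental-request and return probabilities. A minimal sketch of such a helper, assuming it is meant to return the Poisson probability mass P(X = n) for a variable with mean lam:

import math

def poisson(n, lam):
    # probability of exactly n events under a Poisson distribution with mean lam
    return math.exp(-lam) * lam ** n / math.factorial(n)

In practice these values are often cached (e.g. in a dict keyed by (n, lam)), since the same probabilities are recomputed for every state-action pair during each policy-evaluation sweep.
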
/chapter4/The_Gambler.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import numpy as np
3 | import math
4 | import matplotlib.pyplot as plt
5 |
6 | # current problems
7 | # <1> why does the value exceed 2 if goal_reward is added in the function expect_return?
8 | # <2> why can't the result from the book be reproduced?
9 |
10 |
11 |
12 | goal_reward = 1
13 | other_reward = 0
14 |
15 | state_num = 99
16 |
17 | states = []
18 | for i in range(state_num):
19 | states.append(i+1)
20 |
21 |
22 | values = np.zeros(state_num)
23 | policies = np.zeros(state_num)
24 |
25 | head_prob = 0.4
26 | back_prob = 0.6
27 |
28 | DISCOUNT = 1.0
29 |
30 | def get_actions(state):
31 | return_actions = []
32 | action_limit = min(state,100-state)
33 | for j in range(action_limit+1):
34 | return_actions.append(j)
35 | return return_actions
36 |
37 | def expect_return(single_state,single_action,values):
38 |
39 | returns = 0.0
40 | next_win_state = single_state + single_action
41 | next_lose_state = single_state - single_action
42 |
43 | returns = head_prob * (DISCOUNT * (1.0 if next_win_state==100 else values[next_win_state-1])) \
44 | + back_prob * (other_reward + DISCOUNT * (0 if next_lose_state==0 else values[next_lose_state-1]))
45 |
46 | #returns = head_prob * ((goal_reward if next_win_state==100 else other_reward) + DISCOUNT * (1.0 if next_win_state==100 else values[next_win_state-1])) \
47 | # + back_prob * (other_reward + DISCOUNT * (0 if next_lose_state==0 else values[next_lose_state-1]))
48 |
49 | return returns
50 |
51 |
52 | if __name__ == "__main__":
53 |
54 | while(True):
55 |
56 | Delta = 0.0
57 | theta = 1e-9
58 |
59 | for s in states:
60 | tmp_value = values[s-1]
61 | actions = get_actions(s)
62 | value_candidates = []
63 | for a in actions:
64 | value_candidates.append(expect_return(s,a,values))
65 |
66 | values[s-1] = np.max(value_candidates)
67 |
68 | print values[s-1]
69 | #Delta = np.max(Delta,np.abs(tmp_value-values[s-1]))
70 | Delta += np.abs(tmp_value-values[s-1])
71 | if Delta < theta:
72 | break
73 |
74 | for s in states:
75 | actions = get_actions(s)
76 | value_candidates = []
77 | for a in actions:
78 | value_candidates.append(expect_return(s,a,values))
79 |
80 | policies[s-1] = actions[np.argmax(value_candidates)]
81 |
82 |
83 | plt.figure(1)
84 | plt.xlabel('Capital')
85 | plt.ylabel('Value estimate')
86 | plt.plot(values)
87 |
88 | plt.figure(2)
89 | plt.scatter(states,policies)
90 | plt.xlabel('Capital')
91 | plt.ylabel('Final policy (stake)')
92 | plt.show()
93 |
94 |
95 |
--------------------------------------------------------------------------------
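
On question <1> in The_Gambler.py: in the usual formulation the goal is rewarded exactly once, either as an immediate reward of 1 on the winning transition or by bootstrapping a fixed terminal value of 1.0, but not both; doing both double-counts the goal and inflates the values. A small sketch under that assumption (expect_return_sketch is a hypothetical name; terminal states 0 and 100 are given value 0, and values is 1-indexed by capital as in the file):

def expect_return_sketch(state, stake, values, head_prob=0.4, goal_reward=1.0, discount=1.0):
    win, lose = state + stake, state - stake
    # terminal states have value 0; the goal pays goal_reward exactly once
    win_value = 0.0 if win == 100 else values[win - 1]
    lose_value = 0.0 if lose == 0 else values[lose - 1]
    win_reward = goal_reward if win == 100 else 0.0
    return head_prob * (win_reward + discount * win_value) \
           + (1 - head_prob) * discount * lose_value

With this convention every state value stays in [0, 1], which matches the value function shown in the book.
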
/chapter5/Q_A.md:
--------------------------------------------------------------------------------
1 | ----new----
2 |
3 | (1) count passing the finish line, not just landing exactly on the finish position
4 | (2) limit timesteps to 200
5 |
6 | ----solved----
7 |
8 | (1) new function -> passFinishLine()
9 | (2) set variable n <= 200
10 |
11 | ----new----
12 |
13 | (1) complete the assessment part to measure the performance improvement
14 |
15 | ----solved----
16 |
17 | (1) best_action used to be the same in every episode; the problem was that tmp_sa_list was not cleared in time
18 | (2) after 1000 iterations, evaluated at an interval of 50, performance looks better than before
19 |
20 | ----new---- 2017.7.11
21 |
22 | (1) add off-policy method and test
23 | (2) improve the passLine function; it still seems questionable
24 |
25 | ----solved----
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/chapter5/Racetrack.py:
--------------------------------------------------------------------------------
1 | import sys,os
2 | import numpy as np
3 | import random
4 | import matplotlib.pyplot as plt
5 |
6 | import time
7 | import matplotlib.animation as animation
8 | #%matplotlib inline
9 |
10 | race_map = np.array([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
11 | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
12 | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
13 | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0],
14 | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0],
15 | [0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,3,0,0,0],
16 | [0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,3,0,0,0],
17 | [0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,3,0,0,0],
18 | [0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0],
19 | [0,0,0,1,1,1,1,1,0,0,1,1,1,1,1,0,0,0,0,0],
20 | [0,0,1,1,1,1,1,0,0,0,0,0,1,1,1,0,0,0,0,0],
21 | [0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
22 | [0,0,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
23 | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]])
24 | '''
25 | race_map = np.array([
26 | [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
27 | [0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
28 | [0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3],
29 | [0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3],
30 | [0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3],
31 | [0,0,0,1,1,1,1,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
32 | [0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
33 | [0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
34 | [0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
35 | [0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
36 | [0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
37 | [0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
38 | [0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
39 | [0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
40 | [0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
41 | [0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
42 | [0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
43 | [0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
44 | [0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
45 | [0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
46 | [0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
47 | [0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
48 | [0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
49 | [0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
50 | [0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
51 | [0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],
52 | [0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
53 | [0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
54 | [0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
55 | [0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
56 | [0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
57 | [0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
58 | [0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
59 | [0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
60 | [0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
61 | [0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
62 | [0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
63 | [0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
64 | [0,0,0,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
65 | ])
66 | '''
67 |
68 |
69 | start_line = [(12,5),(12,4),(12,3),(12,2)]
70 | finish_line = [(5,16),(6,16),(7,16)]
71 |
72 | start_velocity = (2,2)
73 |
74 | # states are the position and velocity of the car
75 | states = []
76 | rmx_n = race_map.shape[0]
77 | rmy_n = race_map.shape[1]
78 | vx_n = np.arange(0,5).shape[0]
79 | vy_n = np.arange(0,5).shape[0]
80 | states_num = rmx_n * rmy_n * vx_n * vy_n
81 | for i in range(rmx_n):
82 | for j in range(rmy_n):
83 | for m in range(0,5):
84 | for k in range(0,5):
85 | states.append((i,j,m,k))
86 |
87 | actions = []
88 | ax = np.arange(-1,2).shape[0]
89 | ay = np.arange(-1,2).shape[0]
90 | actions_num = ax * ay
91 | for i in range(-1,2):
92 | for j in range(-1,2):
93 | actions.append((i,j))
94 |
95 | policies = dict()
96 | for i,j,m,k in states:
97 | policies[(i,j,m,k)] = np.random.random(len(actions))
98 | policies[(i,j,m,k)] = policies[(i,j,m,k)] / np.sum(policies[(i,j,m,k)])
99 | #print policies
100 |
101 | Q = dict()
102 | '''
103 | for i,j,m,k in states:
104 | for ax,ay in actions:
105 | Q[((i,j,m,k),(ax,ay))] = 0
106 | '''
107 | for i,j,m,k in states:
108 | Q[(i,j,m,k)] = np.zeros(actions_num)
109 |
110 | Returns = dict()
111 | '''
112 | for i,j,m,k in states:
113 | for ax,ay in actions:
114 | Returns[((i,j,m,k),(ax,ay))] = list()
115 | '''
116 | for i,j,m,k in states:
117 | Returns[(i,j,m,k)] = [[] for _ in xrange(actions_num)]
118 |
119 | epsilon = 0.2
120 | '''
121 | def random_pick(some_list, probabilities):
122 | x = random.uniform(0, 1)
123 | cumulative_probability = 0.0
124 | for item, item_probability in zip(some_list, probabilities):
125 | cumulative_probability += item_probability
126 | if x < cumulative_probability: break
127 | return item
128 | '''
129 | #some_list = [(1,1),(2,2)]
130 | #prob = [0.6,0.4]
131 | #print random_pick(some_list,prob)
132 |
133 | def pass_finishLine(pre_pos,lat_pos):
134 | finishLine_lx = finish_line[0][0]
135 | finishLine_hx = finish_line[-1][0]
136 | finishLine_y = finish_line[0][1]
137 |
138 | if pre_pos[1] < finishLine_y and lat_pos[1] >= finishLine_y and (pre_pos[0] + lat_pos[0])/2 \
139 | <= finishLine_hx and (pre_pos[0] + lat_pos[0])/2 >= finishLine_lx:
140 | return True
141 | else:
142 | return False
143 |
144 | def avg_return_per_episode(ep):
145 | ep_length = len(ep)
146 | ep_return = 0.0
147 | for i in range(2,ep_length,3):
148 | ep_return += ep[i]
149 | return ep_return*1.0
150 |
151 | def episode_generator(is_greedy):
152 |
153 | start_pos = start_line[random.randint(0,len(start_line)-1)]
154 | end_pos = start_pos
155 | last_pos = start_pos
156 |
157 | start_state = (start_pos[0],start_pos[1],start_velocity[0],start_velocity[1])
158 | c_state = start_state
159 |
160 | episode = []
161 | episode.append(c_state)
162 |
163 | #print("start_state:",c_state)
164 | n = 0
165 |
166 | while not pass_finishLine(last_pos,end_pos) and n < 200:
167 |
168 | last_pos = end_pos
169 |
170 | action_list = actions
171 | print "all actions:",actions
172 |
173 | action_prob = policies[c_state]
174 | print "action_prob:",action_prob
175 |
176 | #print "action list:", action_list
177 | #print "action_prob:", action_prob
178 |
179 | if not is_greedy:
180 | c_action = actions[np.random.choice(len(action_list),1,p=action_prob)[0]]
181 | print "c_action_idx:",np.where((np.array(actions)==c_action).all(1))[0]
182 |
183 |
184 | else:
185 | c_action = actions[np.argmax(action_prob)]
186 |
187 |             # guarantee that each velocity component is less than 5 and greater than or equal to 0
188 | c_velocity = (max(min(c_state[2]+c_action[0],4),0),max(min(c_state[3]+c_action[1],4),0))
189 |
190 | #if c_velocity[0] == 0 and c_velocity[1] == 0:
191 | # continue
192 |
193 | print "c_action:",c_action
194 | # print "c_velocity:",c_velocity
195 |
196 |             # uncertain state transition, still to be verified
197 | x_state = (c_state[0]-c_velocity[1],c_state[1]+c_velocity[0],c_velocity[0],c_velocity[1])
198 |
199 |
200 |             # if the car crashes into the wall, send it back to a random start position
201 | if x_state[0] < 0 or x_state[0] > 13 or x_state[1] < 0 or x_state[1] > 19 or race_map[x_state[0],x_state[1]]==0:
202 | #print "stucking..."
203 | #print "stuck action:",c_action
204 | #print "stuck state:",x_state
205 | tmp_pos = start_line[random.randint(0,len(start_line)-1)]
206 | c_state = (tmp_pos[0],tmp_pos[1],2,2)
207 | c_reward = -5
208 |
209 | elif c_velocity[0] == 0 and c_velocity[1] == 0:
210 |
211 | if np.random.choice(2,1,p=[0.5,0.5])[0] == 0:
212 | c_state = (x_state[0],x_state[1],1,x_state[3])
213 |
214 | else:
215 | c_state = (x_state[0],x_state[1],x_state[2],1)
216 |
217 | c_reward = -5
218 |
219 | else:
220 | c_state = x_state
221 | c_reward = -1
222 |
223 | episode.append(c_action)
224 | episode.append(c_reward)
225 | episode.append(c_state)
226 | n += 1
227 | #print("action:",c_action)
228 | #print("next_state:",c_state)
229 | end_pos = (c_state[0],c_state[1])
230 | #print("end position:",end_pos)
231 |
232 | print("episode generated!")
233 | return episode
234 |
235 | # p --> position of the reward within the episode list ; n --> episode length
236 | def calReturnOfOnePair(p,n,episode):
237 | r = 0
238 | r = r + episode[p]
239 |
240 | for i in range(p+3,n,3):
241 | #print episode[i]
242 | r += episode[i]
243 |
244 | return r
245 |
246 | def cal_Q(episode):
247 | checked_pair = set()
248 | e_length = len(episode)
249 | if not e_length:
250 | print ("episode is empty!")
251 | else:
252 |         # e_length-1 omits the terminal state and avoids going out of bounds at episode[i+1]
253 | for i in range(0,e_length-1,3):
254 | #sa_pair = (episode[i],episode[i+1])
255 | sa_pair = (episode[i],np.where((np.array(actions)==episode[i+1]).all(1))[0][0])
256 | print "sa_pair:",sa_pair
257 |
258 | if sa_pair not in checked_pair:
259 | #Returns[sa_pair].append(calReturnOfOnePair(i+2,e_length,episode))
260 | Returns[sa_pair[0]][sa_pair[1]].append(calReturnOfOnePair(i+2,e_length,episode))
261 | checked_pair.add(sa_pair)
262 | #Q[sa_pair] = sum(Returns[sa_pair]) * 1.0 / len(Returns[sa_pair])
263 | Q[sa_pair[0]][sa_pair[1]] = sum(Returns[sa_pair[0]][sa_pair[1]]) * 1.0 / len(Returns[sa_pair[0]][sa_pair[1]])
264 | print("calculate Q done!")
265 |
266 | def update_policy(episode):
267 | #tmpList_sa = []
268 | checked_state = set()
269 | e_length = len(episode)
270 | for i in range(0,e_length,3):
271 | s = episode[i]
272 | #tmpList_sa = []
273 | if s not in checked_state:
274 | checked_state.add(s)
275 |
276 | '''
277 | for key in Q.keys():
278 | if key[0] == s:
279 | tmpList_sa.append((key[0],key[1],Q[key]))
280 | best_action = tmpList_sa[np.argmax([it[2] for it in tmpList_sa])][1]
281 | #print ("best_action: ",best_action)
282 | '''
283 | print "state:",s
284 | print "Q[s]:",Q[s]
285 |
286 | best_action = actions[np.random.choice(np.where(Q[s] == np.amax(Q[s]))[0])]
287 | print "best action:",best_action
288 |
289 | for aix in range(len(actions)):
290 | if actions[aix] == best_action:
291 | policies[s][aix] = 1 - epsilon + epsilon / policies[s].shape[0]
292 | else:
293 | policies[s][aix] = epsilon / policies[s].shape[0]
294 |
295 |
296 | print("update done!")
297 |
298 | class env:
299 |
300 | cur_state = (9,5,0,0)
301 | traces = race_map
302 |
303 |
304 | def __init__(self,policy):
305 | self.policy = policy
306 | def forward(self):
307 | cur_action_opt = self.policy[self.cur_state]
308 | best_action_th = np.argmax([it[2] for it in cur_action_opt])
309 | best_action = (cur_action_opt[best_action_th][0],cur_action_opt[best_action_th][1])
310 | cur_v = (self.cur_state[2] + best_action[0],self.cur_state[3] + best_action[1])
311 | self.cur_state = (self.cur_state[0]-cur_v[1],self.cur_state[1]+cur_v[0],cur_v[0],cur_v[1])
312 | self.traces[self.cur_state[0],self.cur_state[1]] = 2
313 | def stop(self):
314 | if (self.cur_state[0],self.cur_state[1]) in finish_line or race_map[self.cur_state[0],self.cur_state[1]] == 0:
315 | self.traces[self.cur_state[0],self.cur_state[1]] = 2
316 | return True
317 | else:
318 | return False
319 |
320 |
321 |
322 | '''
323 | fig = plt.figure()
324 | ax = fig.add_subplot(111,autoscale_on=False,xlim=(0,race_map.shape[1]-1),ylim=(0,race_map.shape[0]-1))
325 | ax.grid()
326 | im = ax.imshow(np.flipud(race_map),origin='upper', interpolation='none')
327 |
328 | anno_text = "Episode:%d,Timestep:%d,X_velocity:%d,Y_velocity:%d"
329 | annotation = ax.annotate(anno_text %(0,0,0,0),xy=(5,11),bbox=
330 | dict(boxstyle="round4,pad=0.3", fc="white", ec="b", lw=2))
331 | #annotation.set_animated(True)
332 |
333 | #plt.show()
334 | '''
335 | '''
336 | def param_update():
337 | for eth,ep in enumerate(ep_list):
338 | for sth in range(0,len(ep),3):
339 | yield(ep[sth][0],ep[sth][1],sth/3,eth,ep[sth][2],ep[sth][3])
340 | #print ep[th]
341 |
342 | def frame_update(step_info):
343 | x,y,sth,eth,x_v,y_v = step_info
344 | race_map_copy = np.copy(race_map)
345 | race_map_copy[x,y] = 4
346 | im.set_array(np.flipud(race_map_copy))
347 | annotation.set_text(anno_text % (eth,sth,x_v,y_v))
348 | return im,annotation
349 | '''
350 |
351 | monte_carlo_num = 3500
352 | ep_list = []
353 | avg_ep_return_list = []
354 | #f = open("tmp_data.txt",'w')
355 | for i in range(monte_carlo_num):
356 | start_time = time.time()
357 |
358 | ep = episode_generator(False)
359 |
360 | time1 = time.time()
361 | print("episode length:%d" %(len(ep)/3))
362 | print("processing %d episode:" %i)
363 | cal_Q(ep)
364 |
365 | time2 = time.time()
366 |
367 | update_policy(ep)
368 |
369 | time3 = time.time()
370 |
371 | #ep = episode_generator(True)
372 | arpe = avg_return_per_episode(ep)
373 | avg_ep_return_list.append(arpe)
374 |
375 |
376 | #print("ep generator time:{:.2f}s".format(time1-start_time))
377 | #print("Q cal time:{:.2f}s".format(time2-time1))
378 | #print("policy update time:{:.2f}s".format(time3-time2))
379 | #ep_list.append(ep)
380 | #arpe = avg_return_per_episode(ep)
381 | #avg_ep_return_list.append(arpe)
382 | #f.write(('episode%d'%i) + 'return:' + str(arpe))
383 | #f.write('\n')
384 | #f.close()
385 |
386 |
387 |
388 | plt.title("assessment of racetrack problem")
389 | plt.xlabel("episode index")
390 | plt.ylabel("episode average return")
391 | plt.plot(range(0,monte_carlo_num,50),[avg_ep_return_list[i] for i in range(0,monte_carlo_num,50)],'r',label='avg return with on-policy')
392 | plt.grid()
393 |
394 |
395 | #anim = animation.FuncAnimation(fig, frame_update, frames=param_update, blit=False,save_count=9000)
396 |
397 | plt.show()
398 |
399 |
400 |
401 | '''
402 | for i in range(monte_carlo_num):
403 | print("processing %d episode" %i)
404 | ep = episode_generator()
405 | cal_Q(ep)
406 | update_policy(ep)
407 |
408 | ag1 = env(policies)
409 | stop_flag = ag1.stop()
410 | while not stop_flag:
411 | ag1.forward()
412 | print ag1.traces
413 | #print policies
414 | '''
415 |
--------------------------------------------------------------------------------
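
update_policy in Racetrack.py assigns epsilon-soft probabilities: the greedy action gets 1 - epsilon + epsilon/|A| and every other action gets epsilon/|A|, so the probabilities sum to 1 while every action keeps non-zero mass and the policy keeps exploring. A compact sketch of that assignment (epsilon_soft_probs is a hypothetical helper, not part of the file):

import numpy as np

def epsilon_soft_probs(num_actions, greedy_idx, epsilon=0.2):
    # every action keeps probability epsilon / |A| ...
    probs = np.full(num_actions, epsilon / num_actions)
    # ... and the greedy action receives the remaining probability mass
    probs[greedy_idx] = 1.0 - epsilon + epsilon / num_actions
    return probs

For the 9 racetrack actions, epsilon_soft_probs(9, greedy_idx) reproduces the update applied to policies[s] in update_policy.
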
/chapter5/Racetrack_result_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/chapter5/Racetrack_result_1.png
--------------------------------------------------------------------------------
/chapter5/Racetrack_result_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/chapter5/Racetrack_result_2.png
--------------------------------------------------------------------------------
/chapter5/tmp_data.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/chapter5/tmp_data.txt
--------------------------------------------------------------------------------
/chapter6/Q-LEARNING_RACETRACK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/chapter6/Q-LEARNING_RACETRACK.png
--------------------------------------------------------------------------------
/chapter6/SARSA_RACETRACK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/chapter6/SARSA_RACETRACK.png
--------------------------------------------------------------------------------
/chapter6/td.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import numpy as np
3 |
4 |
5 | class TemporalDifference:
6 |
7 | def __init__(self,state_list,action_list):
8 | self.states = state_list
9 | self.actions = action_list
10 |
11 |
12 | self.state_num = len(self.states)
13 | self.action_num = len(self.actions)
14 |
15 | self.Q = dict()
16 | for s in self.states:
17 | #self.Q[s] = np.random.random(self.action_num)
18 | self.Q[s] = np.zeros(self.action_num)
19 |
20 |
21 | def set_policy(self,learning_type):
22 |
23 | self.pi = dict()
24 | self.mu = dict()
25 |
26 | if learning_type == 'sarsa':
27 | for s in self.states:
28 | self.pi[s] = np.random.random(self.action_num)
29 | self.pi[s] = self.pi[s] / np.sum(self.pi[s])
30 |
31 | elif learning_type == 'q-learning':
32 | for s in self.states:
33 | idx = np.random.randint(0,self.action_num,size=1)[0]
34 | self.pi[s] = np.zeros(self.action_num)
35 | self.pi[s][idx] = 1.0
36 |
37 | self.mu[s] = np.random.random(self.action_num)
38 | self.mu[s] = self.mu[s] / np.sum(self.mu[s])
39 |
40 |
41 | def sarsa_learning(self,env,episode_num,epsilon,alpha,gamma,max_timestep,eval_interval):
42 |
43 | ep_idx = 0
44 | avg_ep_return_list = []
45 |
46 | while ep_idx < episode_num:
47 |
48 | if ep_idx % eval_interval == 0:
49 | eval_ep = env.episode_generator(self.pi,max_timestep,True)
50 | print("eval episode length:%d" %(len(eval_ep)/3))
51 | c_avg_return = env.avg_return_per_episode(eval_ep)
52 | avg_ep_return_list.append(c_avg_return)
53 | print("assessing return:%f" %c_avg_return)
54 | print "avg return list length:",len(avg_ep_return_list)
55 |
56 | ep_idx += 1
57 |
58 | env.c_state = env.getInitState()
59 | env.next_state = env.c_state
60 |
61 | n = 0
62 |
63 | c_action_idx = np.random.choice(self.action_num, 1, p=self.pi[env.c_state])[0]
64 | env.c_action = self.actions[c_action_idx]
65 |
66 | #print "episode index:",ep_idx
67 | #print "env termination:",env.isTerminated()
68 |
69 | while not (env.isTerminated() or n >= max_timestep) :
70 |
71 | env.c_state = env.next_state
72 | env.c_action = self.actions[c_action_idx]
73 | #print "policy:",self.pi
74 |
75 | env.c_state,env.c_action,env.c_reward,env.next_state = env.oneStep_generator()
76 |
77 | next_action_idx = np.random.choice(self.action_num,1,p=self.pi[env.next_state])[0]
78 |
79 | self.Q[env.c_state][c_action_idx] += alpha * (env.c_reward + gamma * self.Q[env.next_state][next_action_idx] - self.Q[env.c_state][c_action_idx])
80 |
81 | # --------policy update at same time---------#
82 | c_best_action_idx = np.argmax(self.Q[env.c_state])
83 |
84 | for action_idx in range(self.action_num):
85 | if action_idx == c_best_action_idx:
86 | self.pi[env.c_state][action_idx] = 1 - epsilon + epsilon / self.action_num
87 | else:
88 | self.pi[env.c_state][action_idx] = epsilon / self.action_num
89 |
90 | c_action_idx = next_action_idx
91 |
92 | n += 1
93 | #print "n:",n
94 |
95 | return avg_ep_return_list
96 |
97 | def Q_learning(self,env,episode_num,epsilon,alpha,gamma,max_timestep,eval_interval):
98 |
99 | ep_idx = 0
100 | avg_ep_return_list = []
101 | while ep_idx < episode_num:
102 |
103 | if ep_idx % eval_interval == 0:
104 | eval_ep = env.episode_generator(self.pi,max_timestep,True)
105 | print("eval episode length:%d" %(len(eval_ep)/3))
106 | c_avg_return = env.avg_return_per_episode(eval_ep)
107 | avg_ep_return_list.append(c_avg_return)
108 | print("assessing return:%f" %c_avg_return)
109 | print "avg return list length:",len(avg_ep_return_list)
110 |
111 | ep_idx += 1
112 |
113 | env.c_state = env.getInitState()
114 | env.next_state = env.c_state
115 |
116 | n = 0
117 |
118 | while n < max_timestep and not env.isTerminated():
119 |
120 | env.c_state = env.next_state
121 |
122 | c_action_idx = np.random.choice(self.action_num,1,p=self.mu[env.c_state])[0]
123 | env.c_action = self.actions[c_action_idx]
124 |
125 |
126 | env.c_state, env.c_action, env.c_reward, env.next_state = env.oneStep_generator()
127 |
128 | #print "c_state:",env.c_state
129 | #print "c_action:",env.c_action
130 | #print "c_reward:",env.c_reward
131 | #print "next_state:",env.next_state
132 | #print "c_state mu:",self.mu[env.c_state]
133 |
134 |
135 |
136 | self.Q[env.c_state][c_action_idx] += alpha * (
137 | env.c_reward + gamma * np.amax(self.Q[env.next_state]) - self.Q[env.c_state][c_action_idx])
138 |
139 |
140 |
141 | c_best_action_idx = np.argmax(self.Q[env.c_state])
142 |
143 | #print "c_state Q:",self.Q[env.c_state]
144 | #print "c_best_action_idx:",c_best_action_idx
145 |
146 | for action_idx in range(self.action_num):
147 | if action_idx == c_best_action_idx:
148 | self.mu[env.c_state][action_idx] = 1 - epsilon + epsilon/self.action_num
149 | else:
150 | self.mu[env.c_state][action_idx] = epsilon/self.action_num
151 |
152 |
153 | # --------policy update at same time---------#
154 | for action_idx in range(self.action_num):
155 | if action_idx == c_best_action_idx:
156 | self.pi[env.c_state][action_idx] = 1.0
157 | else:
158 | self.pi[env.c_state][action_idx] = 0.0
159 |
160 | n += 1
161 |
162 | return avg_ep_return_list
--------------------------------------------------------------------------------
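
The two learners in chapter6/td.py differ only in their TD target: sarsa_learning bootstraps from the next action actually sampled from the policy, while Q_learning bootstraps from the greedy action in the next state. A small sketch of the two targets (hypothetical helper names; r, gamma and q_next correspond to env.c_reward, gamma and self.Q[env.next_state] in the class):

def sarsa_target(r, gamma, q_next, next_action_idx):
    # on-policy target: bootstrap from the sampled next action
    return r + gamma * q_next[next_action_idx]

def q_learning_target(r, gamma, q_next):
    # off-policy target: bootstrap from the greedy next action
    return r + gamma * max(q_next)

Both are then applied the same way: Q[s][a] += alpha * (target - Q[s][a]).
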
/chapter7/SARSA_lambda_RACETRACK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/chapter7/SARSA_lambda_RACETRACK.png
--------------------------------------------------------------------------------
/chapter7/naive_Q_lambda_RACETRACK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/chapter7/naive_Q_lambda_RACETRACK.png
--------------------------------------------------------------------------------
/chapter7/td_lambda.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import numpy as np
3 |
4 | class Temporal_Difference_lambda:
5 |
6 | def __init__(self,state_list,action_list):
7 |
8 | self.states = state_list
9 | self.actions = action_list
10 |
11 | self.state_num = len(self.states)
12 | self.action_num = len(self.actions)
13 |
14 | self.Q = dict()
15 | for s in self.states:
16 | self.Q[s] = np.random.random(self.action_num)
17 | #self.Q[s] = np.zeros(self.action_num)
18 |
19 | self.Z = dict()
20 | for s in self.states:
21 | self.Z[s] = np.zeros(self.action_num)
22 |
23 | def reset_Z(self):
24 |
25 | for s in self.states:
26 | self.Z[s] = np.zeros(self.action_num)
27 |
28 | def set_policy(self,learning_type):
29 |
30 | self.pi = dict()
31 | self.mu = dict()
32 |
33 | if learning_type == 'sarsa_lambda':
34 | for s in self.states:
35 | self.pi[s] = np.random.random(self.action_num)
36 | self.pi[s] = self.pi[s] / np.sum(self.pi[s])
37 | elif learning_type == 'naive_Q_lambda':
38 | for s in self.states:
39 | idx = np.random.randint(0, self.action_num, size=1)[0]
40 | self.pi[s] = np.zeros(self.action_num)
41 | self.pi[s][idx] = 1.0
42 |
43 | self.mu[s] = np.random.random(self.action_num)
44 | self.mu[s] = self.mu[s] / np.sum(self.mu[s])
45 |
46 |
47 |
48 | def sarsa_lambda(self,env,episode_num,epsilon,alpha,gamma,Lambda,max_timestep,eval_interval):
49 |
50 | ep_idx = 0
51 | avg_ep_return_list = []
52 |
53 | while ep_idx < episode_num:
54 |
55 | if ep_idx % eval_interval == 0:
56 | eval_ep = env.episode_generator(self.pi, max_timestep, True)
57 | print("eval episode length:%d" % (len(eval_ep) / 3))
58 | c_avg_return = env.avg_return_per_episode(eval_ep)
59 | avg_ep_return_list.append(c_avg_return)
60 | print("assessing return:%f" % c_avg_return)
61 | print "avg return list length:", len(avg_ep_return_list)
62 |
63 | ep_idx += 1
64 | print "ep_idx:",ep_idx
65 |
66 | self.reset_Z()
67 |
68 | env.c_state = env.getInitState()
69 | env.next_state = env.c_state
70 |
71 | c_action_idx = np.random.choice(self.action_num, 1, p=self.pi[env.c_state])[0]
72 | #env.c_action = self.actions[c_action_idx]
73 |
74 |
75 | n = 0
76 |
77 | while not (env.isTerminated() or n >= max_timestep):
78 |
79 | env.c_state = env.next_state
80 | env.c_action = self.actions[c_action_idx]
81 |
82 | env.c_state, env.c_action, env.c_reward, env.next_state = env.oneStep_generator()
83 |
84 | next_action_idx = np.random.choice(self.action_num, 1, p=self.pi[env.next_state])[0]
85 |
86 | delta = env.c_reward + gamma * self.Q[env.next_state][next_action_idx] - self.Q[env.c_state][c_action_idx]
87 |
88 | self.Z[env.c_state][c_action_idx] += 1
89 |
90 | for s in self.states:
91 | for i in range(self.action_num):
92 | self.Q[s][i] += alpha * delta * self.Z[s][i]
93 | self.Z[s][i] *= gamma * Lambda
94 |
95 | # --------policy update at same time---------#
96 | c_best_action_idx = np.argmax(self.Q[env.c_state])
97 |
98 | for action_idx in range(self.action_num):
99 | if action_idx == c_best_action_idx:
100 | self.pi[env.c_state][action_idx] = 1 - epsilon + epsilon / self.action_num
101 | else:
102 | self.pi[env.c_state][action_idx] = epsilon / self.action_num
103 |
104 |
105 | c_action_idx = next_action_idx
106 | n += 1
107 | print "n:",n
108 |
109 | return avg_ep_return_list
110 |
111 |     # to improve on Watkins's Q(lambda) and avoid the complexity of Peng's Q(lambda), we use naive Q(lambda)
112 | def naive_Q_lambda(self,env,episode_num,epsilon,alpha,gamma,Lambda,max_timestep,eval_interval):
113 |
114 | ep_idx = 0
115 | avg_ep_return_list = []
116 |
117 | while ep_idx < episode_num:
118 |
119 | if ep_idx % eval_interval == 0:
120 | eval_ep = env.episode_generator(self.pi, max_timestep, True)
121 | print("eval episode length:%d" % (len(eval_ep) / 3))
122 | c_avg_return = env.avg_return_per_episode(eval_ep)
123 | avg_ep_return_list.append(c_avg_return)
124 | print("assessing return:%f" % c_avg_return)
125 | print "avg return list length:", len(avg_ep_return_list)
126 |
127 | ep_idx += 1
128 | print "ep_idx:", ep_idx
129 |
130 | self.reset_Z()
131 |
132 | env.c_state = env.getInitState()
133 | env.next_state = env.c_state
134 |
135 | c_action_idx = np.random.choice(self.action_num, 1, p=self.mu[env.c_state])[0]
136 | # env.c_action = self.actions[c_action_idx]
137 |
138 |
139 | n = 0
140 |
141 | while not (env.isTerminated() or n >= max_timestep):
142 | env.c_state = env.next_state
143 | env.c_action = self.actions[c_action_idx]
144 |
145 | env.c_state, env.c_action, env.c_reward, env.next_state = env.oneStep_generator()
146 |
147 | next_action_idx = np.random.choice(self.action_num, 1, p=self.mu[env.next_state])[0]
148 | next_best_action_idx = np.argmax(self.Q[env.next_state])
149 |
150 |                 delta = env.c_reward + gamma * self.Q[env.next_state][next_best_action_idx] - self.Q[env.c_state][c_action_idx]
151 |                 self.Z[env.c_state][c_action_idx] += 1
152 |
153 |
154 | if next_action_idx == next_best_action_idx:
155 | for s in self.states:
156 | for i in range(self.action_num):
157 | self.Q[s][i] += alpha * delta * self.Z[s][i]
158 | self.Z[s][i] *= Lambda * gamma
159 | else:
160 | for s in self.states:
161 | for i in range(self.action_num):
162 | self.Q[s][i] += alpha * delta * self.Z[s][i]
163 | self.Z[s][i] = 0.5
164 |
165 | c_best_action_idx = np.argmax(self.Q[env.c_state])
166 |
167 |
168 |
169 |
170 |
171 | # ------- update behavior policy --------- #
172 | for action_idx in range(self.action_num):
173 | if action_idx == c_best_action_idx:
174 | self.mu[env.c_state][action_idx] = 1 - epsilon + epsilon / self.action_num
175 | else:
176 | self.mu[env.c_state][action_idx] = epsilon / self.action_num
177 |
178 | # --------target policy update at same time---------#
179 | for action_idx in range(self.action_num):
180 | if action_idx == c_best_action_idx:
181 | self.pi[env.c_state][action_idx] = 1.0
182 | else:
183 | self.pi[env.c_state][action_idx] = 0.0
184 |
185 | c_action_idx = next_action_idx
186 | n += 1
187 |
188 | return avg_ep_return_list
--------------------------------------------------------------------------------
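
Both methods in chapter7/td_lambda.py use accumulating eligibility traces: the visited pair's trace is incremented by 1, every Q entry is moved by alpha * delta * Z, and every trace is then decayed by gamma * Lambda. The inner double loop over states and actions can be written as a vectorised sweep; a sketch under the assumption that Q and Z are dicts of numpy arrays, as in the class above (trace_sweep is a hypothetical name):

def trace_sweep(Q, Z, delta, alpha, gamma, lam):
    # move every state-action value towards the TD error in proportion to its
    # eligibility, then decay all traces
    for s in Q:
        Q[s] += alpha * delta * Z[s]
        Z[s] *= gamma * lam

This is behaviourally the same as the per-element loops in sarsa_lambda, and noticeably faster when the state set is large.
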
/chapter8/Dyna_Q.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import numpy as np
3 |
4 | class Dyna_Q:
5 |
6 | def __init__(self,state_list,action_list):
7 |
8 | self.states = state_list
9 | self.actions = action_list
10 |
11 | self.state_num = len(self.states)
12 | self.action_num = len(self.actions)
13 |
14 | self.Q = dict()
15 | for s in self.states:
16 | self.Q[s] = np.zeros(self.action_num)
17 |
18 | self.Model = dict()
19 |
20 | for s in self.states:
21 | self.Model[s] = list()
22 | for i in range(self.action_num):
23 |                 rand_state = self.states[np.random.randint(0,self.state_num)]
24 | self.Model[s].append([0,rand_state])
25 |
26 | def set_policy(self,learning_type):
27 |
28 | self.pi = dict()
29 |
30 | if learning_type == 'Dyna-Q':
31 | for s in self.states:
32 | self.pi[s] = np.random.random(self.action_num)
33 | self.pi[s] = self.pi[s] / np.sum(self.pi[s])
34 |
35 | def Dyna_Q_learning(self,agent,episode_num,epsilon,alpha,gamma,max_timestep,planning_num,eval_interval):
36 |
37 | ep_idx = 0
38 |
39 | avg_ep_return_list = []
40 |
41 | observed_sa = dict()
42 |
43 | while ep_idx < episode_num:
44 |
45 | ep_idx += 1
46 |
47 | agent.c_state = agent.getInitState()
48 | agent.next_state = agent.c_state
49 |
50 | n = 0
51 |
52 | c_action_idx = np.random.choice(self.action_num, 1, p=self.pi[agent.c_state])[0]
53 | agent.c_action = self.actions[c_action_idx]
54 |
55 | while not (agent.isTerminated() or n >= max_timestep) :
56 |
57 | agent.c_state = agent.next_state
58 | agent.c_action = self.actions[c_action_idx]
59 |
60 | if agent.c_state in observed_sa.keys():
61 | observed_sa[agent.c_state].append(c_action_idx)
62 | else:
63 | observed_sa[agent.c_state] = [c_action_idx]
64 |
65 | agent.c_state, agent.c_action, agent.c_reward, agent.next_state = agent.oneStep_generator()
66 |
67 | next_action_idx = np.random.choice(self.action_num, 1, p=self.pi[agent.next_state])[0]
68 |
69 | self.Q[agent.c_state][c_action_idx] += alpha * (agent.c_reward + gamma * self.Q[agent.next_state][next_action_idx] - self.Q[agent.c_state][c_action_idx])
70 |
71 | self.Model[agent.c_state][c_action_idx] = [agent.c_reward,agent.next_state]
72 |
73 | for plan_idx in range(planning_num):
74 | pass
75 |
76 |
77 |
--------------------------------------------------------------------------------
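
The planning loop in chapter8/Dyna_Q.py is still a placeholder (the loop over planning_num ends in pass). A minimal sketch of the standard Dyna-Q planning step that would fit there, assuming observed_sa maps each visited state to the list of action indices taken in it and Model[s][a] stores [reward, next_state] as above (planning_step is a hypothetical name):

import numpy as np

def planning_step(Q, Model, observed_sa, alpha, gamma, planning_num):
    # replay simulated transitions drawn uniformly from previously observed (s, a)
    seen_states = list(observed_sa.keys())
    for _ in range(planning_num):
        s = seen_states[np.random.randint(len(seen_states))]
        a_idx = observed_sa[s][np.random.randint(len(observed_sa[s]))]
        r, s_next = Model[s][a_idx]
        Q[s][a_idx] += alpha * (r + gamma * np.max(Q[s_next]) - Q[s][a_idx])

Each simulated update uses the greedy target max_a Q(s', a), as in standard Dyna-Q.
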
/papers/A3C.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/papers/A3C.pdf
--------------------------------------------------------------------------------
/papers/DDPG.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/papers/DDPG.pdf
--------------------------------------------------------------------------------
/papers/DPG.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/papers/DPG.pdf
--------------------------------------------------------------------------------
/papers/DQN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/papers/DQN.pdf
--------------------------------------------------------------------------------
/papers/DRL_simulated_Auto_Vehicle.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/papers/DRL_simulated_Auto_Vehicle.pdf
--------------------------------------------------------------------------------
/papers/TRPO.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/papers/TRPO.pdf
--------------------------------------------------------------------------------
/papers/bookdraft2017june19.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/papers/bookdraft2017june19.pdf
--------------------------------------------------------------------------------
/papers/crossing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xubo92/an-introduction-to-reinforcement-learning/ba235fff96b1adac8875ff610c59b25528122494/papers/crossing.pdf
--------------------------------------------------------------------------------