├── .gitignore ├── BlackJackMonteCarlo ├── black_jack_sampler.py ├── cards.py ├── cards_set │ └── img │ │ ├── 10C.png │ │ ├── 10D.png │ │ ├── 10H.png │ │ ├── 10S.png │ │ ├── 2C.png │ │ ├── 2D.png │ │ ├── 2H.png │ │ ├── 2S.png │ │ ├── 3C.png │ │ ├── 3D.png │ │ ├── 3H.png │ │ ├── 3S.png │ │ ├── 4C.png │ │ ├── 4D.png │ │ ├── 4H.png │ │ ├── 4S.png │ │ ├── 5C.png │ │ ├── 5D.png │ │ ├── 5H.png │ │ ├── 5S.png │ │ ├── 6C.png │ │ ├── 6D.png │ │ ├── 6H.png │ │ ├── 6S.png │ │ ├── 7C.png │ │ ├── 7D.png │ │ ├── 7H.png │ │ ├── 7S.png │ │ ├── 8C.png │ │ ├── 8D.png │ │ ├── 8H.png │ │ ├── 8S.png │ │ ├── 9C.png │ │ ├── 9D.png │ │ ├── 9H.png │ │ ├── 9S.png │ │ ├── AC.png │ │ ├── AD.png │ │ ├── AH.png │ │ ├── AS.png │ │ ├── JC.png │ │ ├── JD.png │ │ ├── JH.png │ │ ├── JS.png │ │ ├── KC.png │ │ ├── KD.png │ │ ├── KH.png │ │ ├── KS.png │ │ ├── QC.png │ │ ├── QD.png │ │ ├── QH.png │ │ ├── QS.png │ │ └── jb_card.png ├── config.py ├── controllers │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ └── manual_controller.cpython-35.pyc │ └── manual_controller.py ├── environment.py ├── first_visit_mc.py ├── game.py └── player.py ├── DoubleQLearning ├── double_qlearning.py ├── environment.py ├── main.py └── qlearning.py ├── README.md ├── SimplePolicyIteration ├── .vscode │ └── settings.json ├── agent.py ├── config.py ├── config.pyc ├── controller.py ├── controller.pyc ├── environment.py ├── environment.pyc ├── game.py ├── game_widgets.py ├── game_widgets.pyc ├── images │ ├── baby_r.png │ ├── beat_r.png │ ├── grass_r.png │ └── kfc_r.png └── policy_iteration.py ├── mountain_car ├── README.md ├── __init__.py ├── main.py ├── semi_gradient_sarsa.py ├── tile_coding.py ├── tilings.pkl ├── train.py └── weights.pkl └── self_driving_agent ├── DQN_Control ├── __init__.py ├── model.py ├── process_img.py └── replay_buffer.py ├── README.md ├── __init__.py ├── config.py ├── controllers.py ├── environment.py ├── initial_setup.py ├── main.py ├── synch_mode.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | */__pycache__ 2 | */myenv 3 | */weights/* 4 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/black_jack_sampler.py: -------------------------------------------------------------------------------- 1 | from cards import cards 2 | from player import Player 3 | from environment import Environment 4 | 5 | # We assume cards are drawn from an inifinite set with replacement 6 | 7 | class BlackJackSampler(object): 8 | def __init__(self): 9 | pass 10 | 11 | def generate_episode(self, policy_func): 12 | dealer = Player(cards) 13 | player = Player(cards) 14 | env = Environment(player, dealer) 15 | 16 | state = env.state() 17 | episode_trace = [state] 18 | while True: 19 | action = policy_func(state) 20 | episode_trace.append(action) 21 | state, reward, done = env.step(action) 22 | episode_trace.append(reward) 23 | 24 | if done: 25 | break 26 | episode_trace.append(state) 27 | return episode_trace 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards.py: -------------------------------------------------------------------------------- 1 | cards = { str(i): i for i in range(2, 11) } 2 | 3 | cards['A'] = 1 4 | cards['J'] = 10 5 | cards['Q'] = 10 6 | cards['K'] = 10 7 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/10C.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/10C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/10D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/10D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/10H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/10H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/10S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/10S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/2C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/2C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/2D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/2D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/2H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/2H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/2S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/2S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/3C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/3C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/3D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/3D.png -------------------------------------------------------------------------------- 
/BlackJackMonteCarlo/cards_set/img/3H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/3H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/3S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/3S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/4C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/4C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/4D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/4D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/4H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/4H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/4S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/4S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/5C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/5C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/5D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/5D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/5H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/5H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/5S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/5S.png 
-------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/6C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/6C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/6D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/6D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/6H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/6H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/6S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/6S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/7C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/7C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/7D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/7D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/7H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/7H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/7S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/7S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/8C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/8C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/8D.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/8D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/8H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/8H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/8S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/8S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/9C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/9C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/9D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/9D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/9H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/9H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/9S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/9S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/AC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/AC.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/AD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/AD.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/AH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/AH.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/AS.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/AS.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/JC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/JC.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/JD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/JD.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/JH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/JH.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/JS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/JS.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/KC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/KC.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/KD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/KD.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/KH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/KH.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/KS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/KS.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/QC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/QC.png -------------------------------------------------------------------------------- 
/BlackJackMonteCarlo/cards_set/img/QD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/QD.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/QH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/QH.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/QS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/QS.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/jb_card.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/jb_card.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/config.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | screen_height = 800 3 | screen_width = 1200 4 | card_height = 230 5 | card_width = 150 6 | delay = 100 7 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/controllers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/controllers/__init__.py -------------------------------------------------------------------------------- /BlackJackMonteCarlo/controllers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/controllers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /BlackJackMonteCarlo/controllers/__pycache__/manual_controller.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/controllers/__pycache__/manual_controller.cpython-35.pyc -------------------------------------------------------------------------------- /BlackJackMonteCarlo/controllers/manual_controller.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | 3 | class ManualController: 4 | keys = None 5 | 6 | def set_key_map(key_map): 7 | ManualController.keys = key_map 8 | 9 | def stick(): 10 | return ManualController.keys[pygame.K_LEFT] 11 | 12 | def hit(): 13 | return ManualController.keys[pygame.K_RIGHT] 14 | 15 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/environment.py: 
-------------------------------------------------------------------------------- 1 | 2 | class Environment(object): 3 | def __init__(self, player, dealer): 4 | self.player = player 5 | self.dealer = dealer # The dealer is actually part of the env 6 | self.dealer_thresh = 17 7 | self.bust_thresh = 21 8 | 9 | def state(self): 10 | return self.player.total(), self.dealer.first_card_total() 11 | 12 | def selected_cards_state(self): 13 | # This is just to help with the visualization in pygame 14 | if not self.dealer.stick_: 15 | return self.player.chosen_cards, [self.dealer.chosen_cards[1]] 16 | return self.player.chosen_cards, self.dealer.chosen_cards 17 | 18 | def step(self, action): 19 | # Map 0 - hit, 1 - stick 20 | if action == 0: 21 | self.player.hit() 22 | elif action == 1: 23 | self.player.stick() 24 | 25 | state = self.player.total(), self.dealer.first_card_total() 26 | # player is done when bust or after he sticks 27 | player_done = (self.player.total() > self.bust_thresh) or (action == 1) 28 | reward = self._reward(state, player_done) 29 | return state, reward, player_done 30 | 31 | def _reward(self, state, player_done): 32 | if not player_done: 33 | return 0 34 | while self.dealer.total() < self.dealer_thresh: 35 | self.dealer.hit() 36 | self.dealer.stick() 37 | 38 | player_bust = self.player.total() > self.bust_thresh 39 | dealer_bust = self.dealer.total() > self.bust_thresh 40 | player_scored_higher = self.player.total() > self.dealer.total() 41 | dealer_scored_higher = self.player.total() < self.dealer.total() 42 | 43 | # scoring when any of the players bust 44 | if player_bust and dealer_bust: 45 | return 0 46 | elif player_bust and (not dealer_bust): 47 | return -1 48 | elif (not player_bust) and dealer_bust: 49 | return 1 50 | 51 | # scoring based on scores 52 | if player_scored_higher: 53 | return 1 54 | elif dealer_scored_higher: 55 | return -1 56 | else: 57 | return 0 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/first_visit_mc.py: -------------------------------------------------------------------------------- 1 | # This import registers the 3D projection, but is otherwise unused. 
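# How the sampled traces are laid out (see BlackJackSampler.generate_episode):
#   [s0, a0, r0, s1, a1, r1, ..., s_{T-1}, a_{T-1}, r_{T-1}]
# Each state is (player_total, dealer_showing_total), actions are 0 = hit and
# 1 = stick, and the reward stays 0 until the terminal step, where
# Environment._reward returns +1, -1 or 0. An illustrative trace under the
# stick-at-20 policy defined below would be [(13, 10), 0, 0, (20, 10), 1, 1].
# The backward loop in run() therefore walks the list three items at a time,
# reading reward, action, state; with no discounting, G is a plain running sum
# of rewards, and the membership test against episode[:i-2] keeps only the
# return from the first visit to each state in the trace.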
2 | from mpl_toolkits.mplot3d import Axes3D # noqa: F401 unused import 3 | 4 | import matplotlib.pyplot as plt 5 | from matplotlib import cm 6 | from matplotlib.ticker import LinearLocator, FormatStrFormatter 7 | import numpy as np 8 | from black_jack_sampler import BlackJackSampler 9 | 10 | # For solving the prediction problem 11 | class FirstVisitMC(object): 12 | def __init__(self): 13 | self.values = {} 14 | self.episodes = 100 15 | 16 | def policy(self, state): 17 | if state[0] >= 20: 18 | return 1 # stick 19 | else: 20 | return 0 21 | 22 | def run(self): 23 | 24 | black_jack = BlackJackSampler() 25 | 26 | for i in range(100000): 27 | episode = black_jack.generate_episode(self.policy) 28 | G = 0 29 | for i in range(len(episode)-1, 0, -3): 30 | reward = episode[i] 31 | action = episode[i-1] 32 | state = episode[i-2] 33 | 34 | G = G+reward 35 | 36 | if state in episode[:i-2]: 37 | # it is not our first visit to this state 38 | continue 39 | 40 | if state in self.values: 41 | self.values[state].append(G) 42 | else: 43 | self.values[state] = [G] 44 | 45 | self.values = { key: sum(self.values[key])/len(self.values[key]) for key in self.values} 46 | return self.values 47 | 48 | def plot_value_function(self): 49 | x = np.arange(1, 22) 50 | y = np.arange(1, 10) 51 | 52 | x, y = np.meshgrid(x, y) 53 | 54 | z = [] 55 | 56 | for (row_ind, i) in enumerate(x): 57 | temp = [] 58 | for (col_ind, j) in enumerate(x[row_ind]): 59 | x_val = j 60 | y_val = y[row_ind, col_ind] 61 | if (x_val, y_val) in self.values: 62 | temp.append(self.values[(x_val, y_val)]) 63 | else: 64 | temp.append(0) 65 | z.append(temp) 66 | z = np.array(z) 67 | 68 | fig = plt.figure() 69 | ax = fig.add_subplot(projection='3d') 70 | 71 | # Plot the surface. 72 | surf = ax.plot_surface(y, x, z, cmap=cm.coolwarm, 73 | linewidth=0, antialiased=False) 74 | 75 | ax.set_xlabel("Dealer's showing card") 76 | ax.set_ylabel("Player Sum") 77 | ax.set_zlabel("Value") 78 | 79 | # Customize the z axis. 80 | ax.set_zlim(-1.01, 1.01) 81 | ax.zaxis.set_major_locator(LinearLocator(10)) 82 | ax.zaxis.set_major_formatter(FormatStrFormatter('%.01f')) 83 | 84 | # Add a color bar which maps values to colors.
85 | fig.colorbar(surf, shrink=0.5, aspect=5) 86 | 87 | plt.show() 88 | 89 | 90 | 91 | fv_mc = FirstVisitMC() 92 | fv_mc.run() 93 | fv_mc.plot_value_function() 94 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/game.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pygame 3 | from config import Config 4 | from cards import cards 5 | from player import Player 6 | from environment import Environment 7 | from controllers.manual_controller import ManualController 8 | 9 | class BlackJack(object): 10 | def __init__(self, config, env, controller): 11 | pygame.init() 12 | self.win_ = pygame.display.set_mode((config.screen_width, 13 | config.screen_height)) 14 | pygame.display.set_caption("Jabrah") 15 | 16 | self.config_ = config 17 | self.env_ = env 18 | self.controller_ = controller 19 | 20 | self.dealer_wins = -2 21 | 22 | 23 | def draw_env(self): 24 | self.win_.fill((0, 128, 0)) 25 | init_offset = 10 26 | text_font_size = 50 27 | root_dir = "cards_set/img/" 28 | img_ext = '.png' 29 | 30 | font1 = pygame.font.SysFont(None, text_font_size) 31 | font2 = pygame.font.SysFont(None, text_font_size) 32 | dealer_text = font1.render("Dealer's Cards", True, (255, 255, 255)) 33 | player_text = font1.render("Player's Cards", True, (255, 255, 255)) 34 | self.win_.blit(dealer_text, (init_offset, init_offset)) 35 | self.win_.blit(player_text, ( 36 | init_offset, self.config_.screen_height - text_font_size - init_offset)) 37 | 38 | 39 | player_cards, dealer_cards = self.env_.selected_cards_state() 40 | 41 | player_cards = [c+s+img_ext for c, s in player_cards] 42 | dealer_cards = [c+s+img_ext for c, s in dealer_cards] 43 | 44 | 45 | if len(dealer_cards) <= 1: 46 | dealer_cards = ['jb_card.png'] + dealer_cards 47 | 48 | for (i, card_image) in enumerate(dealer_cards): 49 | card_obj = pygame.image.load(root_dir+card_image) 50 | x_coord = (i+1) * init_offset + (i * self.config_.card_width) 51 | y_coord = 2*init_offset + text_font_size 52 | self.win_.blit(card_obj, (x_coord, y_coord)) 53 | 54 | for (i, card_image) in enumerate(player_cards): 55 | card_obj = pygame.image.load(root_dir+card_image) 56 | x_coord = (i+1) * init_offset + (i * self.config_.card_width ) 57 | y_coord = self.config_.screen_height - self.config_.card_height - text_font_size - 3*init_offset 58 | self.win_.blit(card_obj, (x_coord, y_coord)) 59 | 60 | # update scores 61 | score_text = "Score: " 62 | 63 | dealer_score = str(self.env_.dealer.first_card_total()) if not self.env_.dealer.stick_\ 64 | else str(self.env_.dealer.total()) 65 | player_score = str(self.env_.player.total()) 66 | 67 | dealer_score_text = font2.render(score_text + dealer_score, True, (255, 255, 0)) 68 | player_score_text = font2.render(score_text + player_score, True, (255, 255, 0)) 69 | self.win_.blit(dealer_score_text, (self.config_.screen_width - 250, init_offset)) 70 | self.win_.blit( 71 | player_score_text, ( 72 | self.config_.screen_width - 250, 73 | self.config_.screen_height - text_font_size - init_offset)) 74 | 75 | if self.dealer_wins == -1: 76 | win_text = font2.render("DRAW", True, (255, 0, 0)) 77 | self.win_.blit(win_text, (self.config_.screen_width//2, self.config_.screen_height//2)) 78 | elif self.dealer_wins == 0: 79 | win_text = font2.render("WIN", True, (0, 0, 128)) 80 | self.win_.blit(win_text, (self.config_.screen_width//2, self.config_.screen_height//2)) 81 | elif self.dealer_wins == 1: 82 | win_text = font2.render("LOST", True, (128, 0, 
0)) 83 | self.win_.blit(win_text, (self.config_.screen_width//2, self.config_.screen_height//2)) 84 | 85 | def start(self): 86 | run = True 87 | state = None 88 | reward = 0 89 | done = False 90 | 91 | while (run): 92 | pygame.time.delay(self.config_.delay) 93 | for event in pygame.event.get(): 94 | if event.type == pygame.QUIT: 95 | run = False 96 | 97 | # Game Logic 98 | keys = pygame.key.get_pressed() 99 | self.controller_.set_key_map(keys) 100 | 101 | if self.controller_.hit(): 102 | state, reward, done = self.env_.step(0) 103 | elif self.controller_.stick(): 104 | state, reward, done = self.env_.step(1) 105 | 106 | if done: 107 | if reward == 1: 108 | self.dealer_wins = 0 109 | elif reward == -1: 110 | self.dealer_wins = 1 111 | else: 112 | self.dealer_wins = -1 113 | 114 | self.draw_env() 115 | pygame.display.update() 116 | pygame.quit() 117 | 118 | player = Player(cards) 119 | dealer = Player(cards) 120 | env = Environment(player, dealer) 121 | 122 | game = BlackJack(Config, env, ManualController) 123 | game.draw_env() 124 | pygame.display.update() 125 | game.start() -------------------------------------------------------------------------------- /BlackJackMonteCarlo/player.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | class Player(object): 4 | def __init__(self, cards): 5 | self.cards = cards 6 | self.chosen_cards = [self._pick_card(), self._pick_card()] 7 | self.stick_ = False 8 | 9 | def _pick_card(self): 10 | card_suits = ['C', 'D', 'H', 'S'] 11 | 12 | return random.choice(list(self.cards.keys())), random.choice(card_suits) 13 | 14 | def hit(self): 15 | # Request for additional cards 16 | card_choice = self._pick_card() 17 | 18 | card_value = self.cards[card_choice[0]] 19 | 20 | self.chosen_cards.append(card_choice) 21 | 22 | def stick(self): 23 | self.stick_ = True 24 | 25 | def total(self): 26 | # Stop requesting for additional cards 27 | total = 0 28 | for (chosen_card, _) in self.chosen_cards: 29 | total += self.cards[chosen_card] 30 | return total 31 | 32 | def first_card_total(self): 33 | # If the player is a dealer 34 | return self.cards[self.chosen_cards[1][0]] 35 | 36 | -------------------------------------------------------------------------------- /DoubleQLearning/double_qlearning.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | class DoubleQLearningAgent(object): 5 | def __init__(self, env) -> None: 6 | super().__init__() 7 | self.e = 0.1 8 | self.alpha = 0.1 9 | self.gamma = 1 10 | self.num_episodes = 300 11 | 12 | self.env = env 13 | 14 | self.init_state = env.start_state 15 | self.q1_sa = {x: 0 for x in env.state_transitions} 16 | self.q2_sa = {x: 0 for x in env.state_transitions} 17 | 18 | def reset_policy(self): 19 | self.q1_sa = {x: 0 for x in self.env.state_transitions} 20 | self.q2_sa = {x: 0 for x in self.env.state_transitions} 21 | 22 | def soft_policy(self, state): 23 | possible_actions = self.env.possible_actions[state] 24 | if len(possible_actions) > 1: 25 | prob_explorative_action = self.e / len(possible_actions) 26 | prob_greedy_action = 1 - self.e + prob_explorative_action 27 | 28 | q1_values = [] 29 | q2_values = [] 30 | corresponding_actions = [] 31 | for a in possible_actions: 32 | if (state, a) in self.q1_sa: 33 | q1_values.append(self.q1_sa[(state, a)]) 34 | q2_values.append(self.q2_sa[(state, a)]) 35 | corresponding_actions.append(a) 36 | 37 | # check if all q_values are the same, if so then take 
random action 38 | q_values = np.array(q1_values) + np.array(q2_values) 39 | if all(x == q_values[0] for x in q_values): 40 | return random.choice(possible_actions) 41 | greedy_action = corresponding_actions[np.argmax(q_values)] 42 | non_greedy_actions = list(set(possible_actions) - set([greedy_action])) 43 | 44 | action = np.random.choice([greedy_action]+non_greedy_actions, 45 | p=[prob_greedy_action]+[prob_explorative_action for i in range(len(non_greedy_actions))]) 46 | return action 47 | return possible_actions[0] 48 | 49 | 50 | def generate_episode(self): 51 | episode = [] 52 | state = self.init_state 53 | while True: 54 | action = self.soft_policy(state) 55 | reward, next_state, done = self.env.step(state, action) 56 | episode.append((state, action, reward, next_state)) 57 | state = next_state 58 | if done: 59 | break 60 | return episode 61 | 62 | 63 | def update_policy(self): 64 | n_iters = 1000 65 | final_left_array = [] 66 | final_right_array = [] 67 | for _ in range(n_iters): 68 | self.reset_policy() 69 | left_actions_count = [] 70 | right_actions_count = [] 71 | 72 | for i in range(self.num_episodes): 73 | left = 0 74 | right = 0 75 | 76 | # generate episode using policy defined above 77 | episode = self.generate_episode() 78 | for s, a, r, s_p in episode: 79 | # use a soft policy as behavior policy e.g epsilon-greedy 80 | action = a 81 | if s == self.init_state and action == 'left': 82 | left += 1 83 | if s == self.init_state and action == 'right': 84 | right += 1 85 | possible_future_actions = self.env.possible_actions[s_p] 86 | 87 | if np.random.rand() > 0.5: 88 | q2_values = [(self.q2_sa[(s_p, a_)], a_) for a_ in possible_future_actions if (s_p, a_) in self.q2_sa] 89 | max_q_value = self.q1_sa[(s_p, max(q2_values)[1])] if len(q2_values) > 0 else 0 90 | # update q function using different target policy 91 | self.q2_sa[(s, action)] = self.q2_sa[(s, action)] + (self.alpha * ((r + self.gamma * max_q_value) - self.q2_sa[(s, action)])) 92 | else: 93 | q1_values = [(self.q1_sa[(s_p, a_)], a_) for a_ in possible_future_actions if (s_p, a_) in self.q1_sa] 94 | max_q_value = self.q2_sa[(s_p, max(q1_values)[1])] if len(q1_values) > 0 else 0 95 | # update q function using different target policy 96 | self.q1_sa[(s, action)] = self.q1_sa[(s, action)] + (self.alpha * ((r + self.gamma * max_q_value) - self.q1_sa[(s, action)])) 97 | left_actions_count.append(left) 98 | right_actions_count.append(right) 99 | left_actions_count = np.array(left_actions_count) 100 | right_actions_count = np.array(right_actions_count) 101 | 102 | final_left_array.append(left_actions_count) 103 | final_right_array.append(right_actions_count) 104 | 105 | final_left_array = np.array(final_left_array) 106 | final_right_array = np.array(final_right_array) 107 | out = 100 * final_left_array.sum(axis=0) / (final_left_array.sum(axis=0) + final_right_array.sum(axis=0)) 108 | return out 109 | 110 | 111 | -------------------------------------------------------------------------------- /DoubleQLearning/environment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Environment(object): 4 | def __init__(self) -> None: 5 | 6 | super().__init__() 7 | self.start_state = 'A' 8 | 9 | self.terminal_state = 'T' 10 | 11 | self.states = ['A', 'B', 'T'] 12 | 13 | self.b_actions = list(range(1, 10)) 14 | 15 | self.possible_actions = { 16 | 'A': ['left', 'right'], 17 | 'B': self.b_actions, 18 | 'T': [] 19 | } 20 | 21 | self.state_transitions = {('B', i): 'T' for i in 
self.b_actions} 22 | self.state_transitions[('A', 'left')] = 'B' 23 | self.state_transitions[('A', 'right')] = 'T' 24 | 25 | 26 | def reward(self, state, action): 27 | if (state == 'B'): 28 | mu, sigma = -0.1, 1 29 | return np.random.normal(mu, sigma, 1)[0] 30 | return 0 31 | 32 | def step(self, state, action): 33 | state = state 34 | reward = self.reward(state, action) 35 | next_state = self.state_transitions[(state, action)] 36 | done = (next_state == self.terminal_state) 37 | return reward, next_state, done 38 | -------------------------------------------------------------------------------- /DoubleQLearning/main.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | 4 | from environment import Environment 5 | from qlearning import QLearningAgent 6 | from double_qlearning import DoubleQLearningAgent 7 | 8 | sns.set() 9 | 10 | 11 | env = Environment() 12 | agent = QLearningAgent(env) 13 | agent2 = DoubleQLearningAgent(env) 14 | 15 | left_actions_ratio_a1 = agent.update_policy() 16 | left_actions_ratio_a2 = agent2.update_policy() 17 | 18 | 19 | fig, ax = plt.subplots() 20 | ax.plot(range(len(left_actions_ratio_a1)), left_actions_ratio_a1, color="red", label="Q-Learning") 21 | ax.plot(range(len(left_actions_ratio_a2)), left_actions_ratio_a2, color="green", label="Double Q-Learning") 22 | ax.plot(range(len(left_actions_ratio_a1)), [5]*len(left_actions_ratio_a1), '--', color='black', label='optimal') 23 | ax.set_xlabel("Number of episodes") 24 | ax.set_ylabel("% of left actions from A") 25 | ax.legend(loc='best') 26 | plt.show() 27 | -------------------------------------------------------------------------------- /DoubleQLearning/qlearning.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | class QLearningAgent(object): 5 | def __init__(self, env) -> None: 6 | super().__init__() 7 | self.e = 0.1 8 | self.alpha = 0.1 9 | self.gamma = 1 10 | self.num_episodes = 300 11 | 12 | self.env = env 13 | 14 | self.init_state = env.start_state 15 | self.q_sa = {x: 0 for x in env.state_transitions} 16 | 17 | def reset_policy(self): 18 | self.q_sa = {x: 0 for x in self.env.state_transitions} 19 | 20 | def soft_policy(self, state): 21 | possible_actions = self.env.possible_actions[state] 22 | if len(possible_actions) > 1: 23 | prob_explorative_action = self.e / len(possible_actions) 24 | prob_greedy_action = 1 - self.e + prob_explorative_action 25 | 26 | q_values = [] 27 | corresponding_actions = [] 28 | for a in possible_actions: 29 | if (state, a) in self.q_sa: 30 | q_values.append(self.q_sa[(state, a)]) 31 | corresponding_actions.append(a) 32 | 33 | # check if all q_values are the same, if so then take random action 34 | if all(x == q_values[0] for x in q_values): 35 | return random.choice(possible_actions) 36 | greedy_action = corresponding_actions[np.argmax(q_values)] 37 | non_greedy_actions = list(set(possible_actions) - set([greedy_action])) 38 | 39 | action = np.random.choice([greedy_action]+non_greedy_actions, 40 | p=[prob_greedy_action]+[prob_explorative_action for i in range(len(non_greedy_actions))]) 41 | return action 42 | return possible_actions[0] 43 | 44 | 45 | def generate_episode(self): 46 | episode = [] 47 | state = self.init_state 48 | while True: 49 | action = self.soft_policy(state) 50 | reward, next_state, done = self.env.step(state, action) 51 | episode.append((state, action, reward, next_state)) 52 | state = 
next_state 53 | if done: 54 | break 55 | return episode 56 | 57 | 58 | def update_policy(self): 59 | n_iters = 1000 60 | final_left_array = [] 61 | final_right_array = [] 62 | for _ in range(n_iters): 63 | self.reset_policy() 64 | left_actions_count = [] 65 | right_actions_count = [] 66 | 67 | for i in range(self.num_episodes): 68 | left = 0 69 | right = 0 70 | 71 | # generate episode using policy defined above 72 | episode = self.generate_episode() 73 | for s, a, r, s_p in episode: 74 | # use a soft policy as behavior policy e.g epsilon-greedy 75 | action = a 76 | if s == self.init_state and action == 'left': 77 | left += 1 78 | if s == self.init_state and action == 'right': 79 | right += 1 80 | possible_future_actions = self.env.possible_actions[s_p] 81 | 82 | q_values = [self.q_sa[(s_p, a_)] for a_ in possible_future_actions if (s_p, a_) in self.q_sa] 83 | 84 | max_q_value = max(q_values) if len(q_values) > 0 else 0 85 | 86 | # update q function using different target policy 87 | self.q_sa[(s, action)] = self.q_sa[(s, action)] + (self.alpha * ((r + self.gamma * max_q_value) - self.q_sa[(s, action)])) 88 | left_actions_count.append(left) 89 | right_actions_count.append(right) 90 | left_actions_count = np.array(left_actions_count) 91 | right_actions_count = np.array(right_actions_count) 92 | 93 | final_left_array.append(left_actions_count) 94 | final_right_array.append(right_actions_count) 95 | 96 | final_left_array = np.array(final_left_array) 97 | final_right_array = np.array(final_right_array) 98 | out = 100 * final_left_array.sum(axis=0) / (final_left_array.sum(axis=0) + final_right_array.sum(axis=0)) 99 | return out 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ReinforcementLearning 2 | This is the repository with the programming tutorials for the reinforcement learning module 3 | -------------------------------------------------------------------------------- /SimplePolicyIteration/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/jareth/anaconda3/envs/rllab3/bin/python", 3 | "python.linting.pylintArgs": ["----extension-pkg-whitelist=1xml"] 4 | } -------------------------------------------------------------------------------- /SimplePolicyIteration/agent.py: -------------------------------------------------------------------------------- 1 | class Agent: 2 | def __init__(self, name, pos): 3 | self.__name = name 4 | self.__pos = pos 5 | 6 | def get_name(self): 7 | return self.__name 8 | 9 | def get_pos(self): 10 | return self.__pos 11 | 12 | def set_pos(self, new_pos): 13 | self.__pos = new_pos 14 | -------------------------------------------------------------------------------- /SimplePolicyIteration/config.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | screen_size = 500 3 | cell_height = 100 4 | cell_width = 100 5 | velocity = 5 6 | delay = 1000 7 | -------------------------------------------------------------------------------- /SimplePolicyIteration/config.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/config.pyc -------------------------------------------------------------------------------- 
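Stepping back to the DoubleQLearning module above: QLearningAgent and DoubleQLearningAgent differ only in how the TD target is built inside update_policy. Distilled to a single transition, the two updates look like the sketch below (dict-keyed action-value tables as in the agents; the function names, and the assumption that both tables are initialised over the same keys, are mine rather than the repository's):

import random

def q_learning_update(q, s, a, r, s_next, next_actions, alpha=0.1, gamma=1.0):
    # the target bootstraps from a max over the very table being learned,
    # which is the source of maximization bias on noisy rewards
    max_next = max((q[(s_next, a_)] for a_ in next_actions if (s_next, a_) in q), default=0)
    q[(s, a)] += alpha * (r + gamma * max_next - q[(s, a)])

def double_q_learning_update(q1, q2, s, a, r, s_next, next_actions, alpha=0.1, gamma=1.0):
    if random.random() < 0.5:
        q1, q2 = q2, q1  # update the other table half of the time
    # choose the greedy next action with the table being updated,
    # but evaluate that action with the other table
    candidates = [(q1[(s_next, a_)], a_) for a_ in next_actions if (s_next, a_) in q1]
    max_next = q2[(s_next, max(candidates)[1])] if candidates else 0
    q1[(s, a)] += alpha * (r + gamma * max_next - q1[(s, a)])

On this two-state task (start state A, rewards from B drawn from N(-0.1, 1)), the max in plain Q-learning overestimates the value of B, so the agent initially chooses 'left' from A far more often than the 5% that main.py marks as optimal; decoupling action selection from action evaluation is what pulls the double Q-learning curve back down. The setup mirrors the maximization-bias example in Sutton and Barto (Example 6.7).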
/SimplePolicyIteration/controller.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | 3 | class Controller: 4 | keys = None 5 | 6 | def set_key_map(key_map): 7 | Controller.keys = key_map 8 | 9 | def left(): 10 | return Controller.keys[pygame.K_LEFT] 11 | 12 | def right(): 13 | return Controller.keys[pygame.K_RIGHT] 14 | 15 | def up(): 16 | return Controller.keys[pygame.K_UP] 17 | 18 | def down(): 19 | return Controller.keys[pygame.K_DOWN] 20 | -------------------------------------------------------------------------------- /SimplePolicyIteration/controller.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/controller.pyc -------------------------------------------------------------------------------- /SimplePolicyIteration/environment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Environment: 4 | # creating a grid with 5 x 5 cells 5 | def __init__(self, size=5): 6 | self.grid = np.empty(shape=(size,size), dtype=object) 7 | self.size = size 8 | 9 | def place_cell(self, x, y, cell): 10 | self.grid[x][y] = cell 11 | 12 | -------------------------------------------------------------------------------- /SimplePolicyIteration/environment.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/environment.pyc -------------------------------------------------------------------------------- /SimplePolicyIteration/game.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | from agent import Agent 3 | from config import Config 4 | from controller import Controller 5 | from game_widgets import CellType, CellState 6 | from environment import Environment 7 | from policy_iteration import PolicyIteration 8 | 9 | class Game: 10 | def __init__(self, config, controller, env, agent, policy=None): 11 | # initialize pygame module 12 | pygame.init() 13 | 14 | # setup config parameters for game window 15 | self.controller_ = controller 16 | self.config_ = config 17 | self.win_ = pygame.display.set_mode((config.screen_size, 18 | config.screen_size)) 19 | 20 | # set game env 21 | self.env_ = env 22 | 23 | # set agent 24 | self.agent = agent 25 | 26 | # set policy 27 | self.policy = policy 28 | 29 | pygame.display.set_caption("Jabrah") 30 | 31 | def draw_env(self): 32 | self.win_.fill((0,0,0)) 33 | # load images 34 | kfc_image = pygame.image.load("images/kfc_r.png") 35 | grass_image = pygame.image.load("images/grass_r.png") 36 | whoop_image = pygame.image.load("images/beat_r.png") 37 | 38 | # draw environments 39 | for i in range(self.env_.size): 40 | for j in range(self.env_.size): 41 | cell = self.env_.grid[i][j] 42 | 43 | (x, y) = cell.pos() 44 | width = self.config_.cell_width 45 | height = self.config_.cell_height 46 | 47 | x = x * width 48 | y = y * height 49 | 50 | if (i,j) == agent.get_pos(): 51 | agent_image = pygame.image.load("images/baby_r.png") 52 | self.win_.blit(agent_image, (x, y)) 53 | 54 | elif cell.cell_type() == CellType.KFC: 55 | self.win_.blit(kfc_image, (x, y)) 56 | 57 | elif cell.cell_type() == CellType.WHOOPING: 58 | self.win_.blit(whoop_image, (x, y)) 59 | 60 | else: 61 | 
self.win_.blit(grass_image, (x, y)) 62 | 63 | 64 | def start(self): 65 | run = True 66 | while (run): 67 | pygame.time.delay(self.config_.delay) 68 | for event in pygame.event.get(): 69 | if event.type == pygame.QUIT: 70 | run = False 71 | keys = pygame.key.get_pressed() 72 | 73 | self.controller_.set_key_map(keys) 74 | curr_pos = self.agent.get_pos() 75 | new_pos = curr_pos 76 | 77 | if self.policy is None: 78 | # if no policy, use controller 79 | if self.controller_.right(): 80 | new_pos = (min(curr_pos[0]+1, env.size-1), curr_pos[1]) 81 | 82 | elif self.controller_.left(): 83 | new_pos = (max(curr_pos[0]-1, 0), curr_pos[1]) 84 | 85 | elif self.controller_.down(): 86 | new_pos = (curr_pos[0], min(curr_pos[1]+1, env.size-1)) 87 | 88 | elif self.controller_.up(): 89 | new_pos = (curr_pos[0], max(curr_pos[1]-1, 0)) 90 | else: 91 | if self.policy[curr_pos] == 1: 92 | new_pos = (min(curr_pos[0]+1, env.size-1), curr_pos[1]) 93 | 94 | elif self.policy[curr_pos] == 3: 95 | new_pos = (max(curr_pos[0]-1, 0), curr_pos[1]) 96 | 97 | elif self.policy[curr_pos] == 2: 98 | new_pos = (curr_pos[0], min(curr_pos[1]+1, env.size-1)) 99 | 100 | elif self.policy[curr_pos] == 0: 101 | new_pos = (curr_pos[0], max(curr_pos[1]-1, 0)) 102 | 103 | self.agent.set_pos(new_pos) 104 | self.draw_env() 105 | pygame.display.update() 106 | 107 | pygame.quit() 108 | 109 | def create_game_env(): 110 | env = Environment(size=5) 111 | states = [] 112 | policy = {} 113 | 114 | # the elements in the matrix represent the cell type 115 | cell_matrix = [[1, 1, 1, 1, 1], 116 | [1, 1, 1, 1, 2], 117 | [2, 1, 1, 1, 1], 118 | [2, 2, 1, 1, 1], 119 | [1, 2, 1, 1, 3]] 120 | 121 | size = len(cell_matrix) 122 | env = Environment(size=size) 123 | 124 | reward = 0 125 | is_terminal = False 126 | for i in range(size): 127 | for j in range(size): 128 | cell_type = cell_matrix[j][i] 129 | 130 | if cell_type == CellType.WHOOPING: 131 | is_terminal = True 132 | reward = -10 133 | elif cell_type == CellType.KFC: 134 | is_terminal = True 135 | reward = 10 136 | else: 137 | is_terminal = False 138 | reward = -1 139 | cell = CellState((i,j), reward, cell_type, is_terminal) 140 | env.place_cell(i, j, cell) 141 | states.append(cell) 142 | return env, states 143 | 144 | 145 | env, states = create_game_env() 146 | agent = Agent("policy_eval", (0, 0)) 147 | 148 | policy_iter_algo = PolicyIteration(states) 149 | policy = policy_iter_algo.run() 150 | 151 | game = Game(Config, Controller, env, agent, policy) 152 | # initiate env 153 | game.draw_env() 154 | pygame.display.update() 155 | game.start() 156 | -------------------------------------------------------------------------------- /SimplePolicyIteration/game_widgets.py: -------------------------------------------------------------------------------- 1 | class CellType: 2 | BLANK = 1 # no reward 3 | WHOOPING = 2 # reward = -10 4 | KFC = 3 # reward = 10 5 | 6 | class CellState: 7 | # initialize a cell for grid 8 | def __init__(self, pos, reward, cell_type, is_terminal): 9 | self.__pos = pos 10 | self.__reward = reward 11 | self.__cell_type = cell_type 12 | self.__is_terminal = is_terminal 13 | self.children = [] 14 | 15 | def append_child(self, cell): 16 | self.children.append(cell) 17 | 18 | def pos(self): 19 | return self.__pos 20 | 21 | def reward(self): 22 | return self.__reward 23 | 24 | def cell_type(self): 25 | return self.__cell_type 26 | 27 | def is_terminal(self): 28 | return self.__is_terminal -------------------------------------------------------------------------------- 
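Both game.py above and policy_iteration.py below move the agent over the same 5 x 5 grid with four deterministic actions that are clipped at the border: 0 moves up, 1 right, 2 down and 3 left, exactly as in the policy branch of Game.start and in get_future_state. create_game_env gives every non-terminal cell a step reward of -1 and marks WHOOPING (-10) and KFC (+10) cells as terminal. A compact restatement of that transition rule (the helper name is illustrative, not from the repository):

def next_pos(pos, action, size=5):
    # 0 = up, 1 = right, 2 = down, 3 = left; y grows downwards because the
    # grid is drawn in pygame screen coordinates
    x, y = pos
    if action == 1:
        x = min(x + 1, size - 1)
    elif action == 3:
        x = max(x - 1, 0)
    elif action == 2:
        y = min(y + 1, size - 1)
    elif action == 0:
        y = max(y - 1, 0)
    return (x, y)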
/SimplePolicyIteration/game_widgets.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/game_widgets.pyc -------------------------------------------------------------------------------- /SimplePolicyIteration/images/baby_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/images/baby_r.png -------------------------------------------------------------------------------- /SimplePolicyIteration/images/beat_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/images/beat_r.png -------------------------------------------------------------------------------- /SimplePolicyIteration/images/grass_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/images/grass_r.png -------------------------------------------------------------------------------- /SimplePolicyIteration/images/kfc_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/images/kfc_r.png -------------------------------------------------------------------------------- /SimplePolicyIteration/policy_iteration.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | class PolicyIteration(object): 4 | def __init__(self, cell_states): 5 | # 0 - up, 1 - right, 2 - down, 3 - up 6 | self.epsilon = 3 7 | self.actions = [0, 1, 2, 3] 8 | self.state_dict, self.policy_dict , self.value_dict = \ 9 | self.create_state_policy_dict(cell_states) 10 | 11 | def create_state_policy_dict(self, cell_states): 12 | state_dict = {} 13 | policy_dict = {} 14 | value_dict = {} 15 | for cell_state in cell_states: 16 | state_dict[cell_state.pos()] = cell_state 17 | policy_dict[cell_state.pos()] = random.choice(self.actions) 18 | if cell_state.is_terminal(): 19 | value_dict[cell_state.pos()] = cell_state.reward() 20 | else: 21 | value_dict[cell_state.pos()] =0 22 | return state_dict, policy_dict, value_dict 23 | 24 | def get_future_state(self, curr_pos, action): 25 | new_pos = curr_pos 26 | env_size = 5 27 | 28 | if action == 1: 29 | new_pos = (min(curr_pos[0]+1, env_size-1), curr_pos[1]) 30 | elif action == 3: 31 | new_pos = (max(curr_pos[0]-1, 0), curr_pos[1]) 32 | elif action == 2: 33 | new_pos = (curr_pos[0], min(curr_pos[1]+1, env_size-1)) 34 | elif action == 0: 35 | new_pos = (curr_pos[0], max(curr_pos[1]-1, 0)) 36 | return new_pos 37 | 38 | def q_value(self, state, action): 39 | state_ = self.get_future_state(state, action) 40 | q = self.state_dict[state_].reward() + \ 41 | self.value_dict[state_] 42 | return q 43 | 44 | def policy_evaluation(self): 45 | print("policy evaluation...") 46 | while True: 47 | delta = 0 48 | for state in self.state_dict: 49 | v = self.value_dict[state] 50 | if self.state_dict[state].is_terminal(): 51 | continue 52 | # next state s' computation 53 | 
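# Deterministic, undiscounted backup for the current policy:
#   V(s) <- R(s') + V(s')   with   s' = get_future_state(s, pi(s))
# q_value() returns exactly that sum, terminal cells keep the fixed value they
# were given in create_state_policy_dict, and the sweep repeats until the
# largest per-state change (delta) drops below self.epsilon.
# policy_improvement() then greedifies against these values, and run()
# alternates the two steps until the policy stops changing.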
self.value_dict[state] = self.q_value(state, self.policy_dict[state]) 54 | delta = max(delta, abs(v - self.value_dict[state])) 55 | if delta < self.epsilon: 56 | break 57 | 58 | def policy_improvement(self): 59 | print("policy improvement...") 60 | policy_stable = True 61 | for state in self.state_dict: 62 | old_action = self.policy_dict[state] 63 | old_action_value = self.q_value(state, old_action) 64 | 65 | best_action = old_action 66 | best_action_value = old_action_value 67 | 68 | for action in self.actions: 69 | action_value = self.q_value(state, action) 70 | if action_value > old_action_value: 71 | best_action = action 72 | best_action_value = action_value 73 | policy_stable = False 74 | self.policy_dict[state] = best_action 75 | return policy_stable 76 | 77 | def run(self): 78 | while True: 79 | self.policy_evaluation() 80 | policy_stable = self.policy_improvement() 81 | if policy_stable: 82 | return self.policy_dict 83 | -------------------------------------------------------------------------------- /mountain_car/README.md: -------------------------------------------------------------------------------- 1 | # Mountain Car 2 | 3 | Solved with Semi Gradient Sarsa & Tile Coding 4 | -------------------------------------------------------------------------------- /mountain_car/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/mountain_car/__init__.py -------------------------------------------------------------------------------- /mountain_car/main.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import numpy as np 4 | import gymnasium as gym 5 | 6 | random.seed(10) 7 | 8 | from semi_gradient_sarsa import SemiGradientSarsa 9 | 10 | env_ = gym.make("MountainCar-v0", render_mode=None) 11 | 12 | sarsa = SemiGradientSarsa(env_) 13 | sarsa.load_params() 14 | 15 | env = gym.make("MountainCar-v0", render_mode='human') 16 | state, info = env.reset() 17 | 18 | for i in range(200): 19 | action = sarsa.select_action(state, eps_greedy=False) 20 | next_state, reward, terminated, truncated, info = env.step(action) 21 | 22 | # Render the env 23 | env.render() 24 | 25 | # Wait a bit before the next frame unless you want to see a crazy fast video 26 | time.sleep(0.001) 27 | 28 | state = next_state 29 | 30 | env.close() 31 | 32 | -------------------------------------------------------------------------------- /mountain_car/semi_gradient_sarsa.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import random 3 | import numpy as np 4 | import gymnasium as gym 5 | 6 | random.seed(18) 7 | 8 | from tile_coding import * 9 | 10 | 11 | class SemiGradientSarsa(object): 12 | def __init__( 13 | self, 14 | env, 15 | alpha=0.01, 16 | eps=0.1, 17 | gamma=1, 18 | n_tilings = 7, 19 | num_eps=100) -> None: 20 | self.alpha = alpha 21 | self.eps = eps 22 | self.gamma = gamma 23 | self.num_eps = num_eps 24 | self.n_tilings = n_tilings 25 | self.env = env 26 | 27 | # define weight vector 28 | self.w = np.random.uniform(low=-0.05, high=0.05, size=(n_tilings**4,)) 29 | 30 | # hash for tile coding 31 | self.tile_coding = IHT(n_tilings**4) 32 | 33 | def q_func(self, feature_vector): 34 | return np.dot(self.w, feature_vector) 35 | 36 | def update_weight(self, reward, current_q, future_q, feature_vector, terminal): 37 | if terminal: 38 | w_update = 
self.alpha * (reward - current_q) 39 | else: 40 | w_update = self.alpha * (reward + self.gamma *future_q - current_q) 41 | self.w += np.multiply(w_update, feature_vector) 42 | 43 | 44 | def train(self): 45 | state, info = self.env.reset() 46 | action, q = self.select_action(state) 47 | episodes = 0 48 | steps = 0 49 | 50 | total_reward = 0 51 | 52 | while episodes < self.num_eps: 53 | steps += 1 54 | 55 | feature_vec = self.hash_feature_vector(state, action) 56 | next_state, reward, terminated, truncated, info = self.env.step(action) 57 | total_reward += reward 58 | 59 | if terminated: 60 | 61 | if episodes % 50 == 0: 62 | print("episode:", episodes, 'completed', 'reward:', total_reward) 63 | 64 | self.update_weight(reward, q, None, feature_vec, True) 65 | state, info = self.env.reset() 66 | action, q = self.select_action(state) 67 | total_reward = 0 68 | steps = 0 69 | episodes += 1 70 | 71 | continue 72 | 73 | next_action, next_q = self.select_action(next_state) 74 | self.update_weight(reward, q, next_q, feature_vec, False) 75 | state = next_state 76 | action = next_action 77 | q = next_q 78 | 79 | self.save_params() 80 | 81 | def save_params(self): 82 | print(self.w) 83 | pickle.dump(self.w, open('weights.pkl', 'wb')) 84 | pickle.dump(self.tile_coding, open('tilings.pkl', 'wb')) 85 | 86 | def load_params(self): 87 | self.w = pickle.load(open('weights.pkl', 'rb')) 88 | self.tile_coding = pickle.load(open('tilings.pkl', 'rb')) 89 | 90 | def one_hot_encode(self, indices): 91 | size = len(self.w) 92 | one_hot_vec = np.zeros(size) 93 | for i in indices: 94 | one_hot_vec[i] = 1 95 | return one_hot_vec 96 | 97 | def hash_feature_vector(self, state, action): 98 | # speed you up 99 | feature_ind = np.array(tiles(self.tile_coding, self.n_tilings, state.tolist(), [action])) 100 | feature_vec = self.one_hot_encode(feature_ind) 101 | return feature_vec 102 | 103 | def select_action(self, state, eps_greedy = True): 104 | num_actions = self.env.action_space.n 105 | actions = range(num_actions) 106 | action_val_dict = {} 107 | for action in actions: 108 | feature_vector = self.hash_feature_vector(state, action) 109 | q_val = self.q_func(np.array(feature_vector)) 110 | 111 | action_val_dict[action] = q_val 112 | 113 | greedy_action = max(action_val_dict, key=action_val_dict.get) 114 | 115 | if not eps_greedy: 116 | return greedy_action 117 | 118 | non_greedy_actions = list(set(range(num_actions)) - {greedy_action}) 119 | 120 | prob_explorative_action = self.eps / num_actions 121 | prob_greedy_action = 1 - self.eps + prob_explorative_action 122 | 123 | action = np.random.choice([greedy_action] + non_greedy_actions, 124 | p=[prob_greedy_action]+[prob_explorative_action for _ in range(len(non_greedy_actions))]) 125 | return action, action_val_dict[action] 126 | 127 | -------------------------------------------------------------------------------- /mountain_car/tile_coding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tile Coding Software version 3.0beta 3 | by Rich Sutton 4 | based on a program created by Steph Schaeffer and others 5 | External documentation and recommendations on the use of this code is available in the 6 | reinforcement learning textbook by Sutton and Barto, and on the web. 7 | These need to be understood before this code is. 8 | 9 | This software is for Python 3 or more. 
10 | 11 | This is an implementation of grid-style tile codings, based originally on 12 | the UNH CMAC code (see http://www.ece.unh.edu/robots/cmac.htm), but by now highly changed. 13 | Here we provide a function, "tiles", that maps floating and integer 14 | variables to a list of tiles, and a second function "tiles-wrap" that does the same while 15 | wrapping some floats to provided widths (the lower wrap value is always 0). 16 | 17 | The float variables will be gridded at unit intervals, so generalization 18 | will be by approximately 1 in each direction, and any scaling will have 19 | to be done externally before calling tiles. 20 | 21 | Num-tilings should be a power of 2, e.g., 16. To make the offsetting work properly, it should 22 | also be greater than or equal to four times the number of floats. 23 | 24 | The first argument is either an index hash table of a given size (created by (make-iht size)), 25 | an integer "size" (range of the indices from 0), or nil (for testing, indicating that the tile 26 | coordinates are to be returned without being converted to indices). 27 | 28 | This code can be found here http://incompleteideas.net/tiles/tiles3.py-remove 29 | """ 30 | 31 | basehash = hash 32 | 33 | class IHT: 34 | "Structure to handle collisions" 35 | def __init__(self, sizeval): 36 | self.size = sizeval 37 | self.overfullCount = 0 38 | self.dictionary = {} 39 | 40 | def __str__(self): 41 | "Prepares a string for printing whenever this object is printed" 42 | return "Collision table:" + \ 43 | " size:" + str(self.size) + \ 44 | " overfullCount:" + str(self.overfullCount) + \ 45 | " dictionary:" + str(len(self.dictionary)) + " items" 46 | 47 | def count (self): 48 | return len(self.dictionary) 49 | 50 | def fullp (self): 51 | return len(self.dictionary) >= self.size 52 | 53 | def getindex (self, obj, readonly=False): 54 | d = self.dictionary 55 | if obj in d: return d[obj] 56 | elif readonly: return None 57 | size = self.size 58 | count = self.count() 59 | if count >= size: 60 | if self.overfullCount==0: print('IHT full, starting to allow collisions') 61 | self.overfullCount += 1 62 | return basehash(obj) % self.size 63 | else: 64 | d[obj] = count 65 | return count 66 | 67 | def hashcoords(coordinates, m, readonly=False): 68 | if type(m)==IHT: return m.getindex(tuple(coordinates), readonly) 69 | if type(m)==int: return basehash(tuple(coordinates)) % m 70 | if m==None: return coordinates 71 | 72 | from math import floor, log 73 | from itertools import zip_longest 74 | 75 | def tiles (ihtORsize, numtilings, floats, ints=[], readonly=False): 76 | """returns num-tilings tile indices corresponding to the floats and ints""" 77 | qfloats = [floor(f*numtilings) for f in floats] 78 | Tiles = [] 79 | for tiling in range(numtilings): 80 | tilingX2 = tiling*2 81 | coords = [tiling] 82 | b = tiling 83 | for q in qfloats: 84 | coords.append( (q + b) // numtilings ) 85 | b += tilingX2 86 | coords.extend(ints) 87 | Tiles.append(hashcoords(coords, ihtORsize, readonly)) 88 | return Tiles 89 | 90 | def tileswrap (ihtORsize, numtilings, floats, wrapwidths, ints=[], readonly=False): 91 | """returns num-tilings tile indices corresponding to the floats and ints, wrapping some floats""" 92 | qfloats = [floor(f*numtilings) for f in floats] 93 | Tiles = [] 94 | for tiling in range(numtilings): 95 | tilingX2 = tiling*2 96 | coords = [tiling] 97 | b = tiling 98 | for q, width in zip_longest(qfloats, wrapwidths): 99 | c = (q + b%numtilings) // numtilings 100 | coords.append(c%width if width else c) 101 | b += 
tilingX2 102 | coords.extend(ints) 103 | Tiles.append(hashcoords(coords, ihtORsize, readonly)) 104 | return Tiles 105 | -------------------------------------------------------------------------------- /mountain_car/tilings.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/mountain_car/tilings.pkl -------------------------------------------------------------------------------- /mountain_car/train.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | from semi_gradient_sarsa import SemiGradientSarsa 3 | 4 | # add `.env` at the end to ignore internal truncation 5 | env = gym.make("MountainCar-v0", render_mode=None) 6 | 7 | sarsa = SemiGradientSarsa(env, num_eps=500, alpha=0.01) 8 | sarsa.train() -------------------------------------------------------------------------------- /mountain_car/weights.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/mountain_car/weights.pkl -------------------------------------------------------------------------------- /self_driving_agent/DQN_Control/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/self_driving_agent/DQN_Control/__init__.py -------------------------------------------------------------------------------- /self_driving_agent/DQN_Control/model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | class ConvNet(nn.Module): 8 | def __init__(self, dim, in_channels, num_actions) -> None: 9 | super(ConvNet, self).__init__() 10 | # conv2d(in_channels, out_channels, kernel_size, stride) 11 | 12 | self.conv1 = nn.Conv2d(in_channels, 32, 8, 4) 13 | self.conv1_bn = nn.BatchNorm2d(32) 14 | self.conv2 = nn.Conv2d(32, 64, 4, 3) 15 | self.conv2_bn = nn.BatchNorm2d(64) 16 | self.conv3 = nn.Conv2d(64, 64, 3, 1) 17 | self.conv3_bn = nn.BatchNorm2d(64) 18 | 19 | self.fc1 = nn.Linear(64*8*8, 256) 20 | self.fc1_bn = nn.BatchNorm1d(256) 21 | self.fc2 = nn.Linear(256, 32) 22 | self.fc2_bn = nn.BatchNorm1d(32) 23 | self.fc3 = nn.Linear(32, num_actions) 24 | 25 | def forward(self, x): 26 | x = F.relu(self.conv1_bn(self.conv1(x))) 27 | x = F.relu(self.conv2_bn(self.conv2(x))) 28 | x = F.relu(self.conv3_bn(self.conv3(x))) 29 | x = F.relu(self.fc1_bn(self.fc1(x.reshape(-1, 64*8*8)))) 30 | x = F.relu(self.fc2_bn(self.fc2(x))) 31 | x = self.fc3(x) 32 | return x 33 | 34 | class DQN(object): 35 | def __init__( 36 | self, 37 | num_actions, 38 | state_dim, #? 
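# shape of the processed input frame; train.py and main.py in this repo pass state_dim=(128, 128)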
39 | in_channels, 40 | device, 41 | discount=0.9, 42 | optimizer="Adam", 43 | optimizer_parameters={'lr':0.01}, 44 | target_update_frequency=1e4, 45 | initial_eps = 1, 46 | end_eps = 0.05, 47 | eps_decay_period = 25e4, 48 | eval_eps=0.001 49 | ) -> None: 50 | self.device = device 51 | 52 | self.Q = ConvNet(state_dim, in_channels, num_actions).to(self.device) 53 | self.Q_target = copy.deepcopy(self.Q) # copy target network 54 | self.Q_optimizer = getattr(torch.optim, optimizer)(self.Q.parameters(), 55 | **optimizer_parameters) 56 | 57 | self.discount = discount 58 | 59 | self.target_update_frequency = target_update_frequency 60 | 61 | # epsilon decay 62 | self.initial_eps = initial_eps 63 | self.end_eps = end_eps 64 | self.slope = (self.end_eps - self.initial_eps) / eps_decay_period 65 | 66 | self.state_shape = (-1,) + state_dim 67 | self.eval_eps = eval_eps 68 | self.num_actions = num_actions 69 | 70 | self.iterations = 0 71 | 72 | def select_action(self, state, eval=False): 73 | eps = self.eval_eps if eval \ 74 | else max(self.slope * self.iterations + self.initial_eps, self.end_eps) 75 | self.current_eps = eps 76 | 77 | # Select action according to policy with probability (1-eps) 78 | # otherwise, select random action 79 | if np.random.uniform(0,1) > eps: 80 | self.Q.eval() 81 | with torch.no_grad(): 82 | # without batch norm, remove the unsqueeze 83 | state = torch.FloatTensor(state).reshape(self.state_shape).unsqueeze(0).to(self.device) 84 | return int(self.Q(state).argmax(1)) 85 | else: 86 | return np.random.randint(self.num_actions) 87 | 88 | def train(self, replay_buffer): 89 | self.Q.train() 90 | # Sample mininbatch from replay buffer 91 | state, action, next_state, reward, done = replay_buffer.sample() 92 | 93 | # Compute the target Q value 94 | with torch.no_grad(): 95 | target_Q = reward + (1-done) * self.discount * self.Q_target(next_state).max(1, keepdim=True)[0] 96 | 97 | # Get current Q estimate 98 | # torch gather just selects action values from Q(state) using the action tensor as an index 99 | current_Q = self.Q(state).gather(1, action) 100 | 101 | # Compute Q loss 102 | Q_loss = F.smooth_l1_loss(current_Q, target_Q) 103 | 104 | # Optimize the Q 105 | self.Q_optimizer.zero_grad() 106 | Q_loss.backward() 107 | self.Q_optimizer.step() 108 | 109 | # Update target network by full copy every X iterations. 
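# (X = target_update_frequency; copy_target_update() below syncs Q_target with Q once the iteration counter hits a multiple of it)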
110 | self.iterations += 1 111 | self.copy_target_update() 112 | 113 | def copy_target_update(self): 114 | if self.iterations % self.target_update_frequency == 0: 115 | print('target network updated') 116 | print('current epsilon', self.current_eps) 117 | self.Q_target.load_state_dict(self.Q.state_dict()) 118 | 119 | 120 | def save(self, filename): 121 | torch.save(self.Q.state_dict(), filename + "_Q") 122 | torch.save(self.Q_optimizer.state_dict(), filename + "_optimizer") 123 | 124 | 125 | def load(self, filename): 126 | self.Q.load_state_dict(torch.load(filename + "_Q")) 127 | self.Q_target = copy.deepcopy(self.Q) 128 | self.Q_optimizer.load_state_dict(torch.load(filename + "_optimizer")) 129 | -------------------------------------------------------------------------------- /self_driving_agent/DQN_Control/process_img.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | img = cv2.imread('../output/000970.png') 4 | print(img.shape) 5 | 6 | scale_percent = 25 7 | width = int(img.shape[1] * scale_percent/100) 8 | height = int(img.shape[0] * scale_percent/100) 9 | 10 | dim = (128, 128) 11 | 12 | resized_img = cv2.resize(img, dim, interpolation=cv2.INTER_AREA) 13 | img_gray = cv2.cvtColor(resized_img, cv2.COLOR_BGR2GRAY) 14 | print(img_gray.shape) 15 | cv2.imshow('', img_gray) 16 | cv2.waitKey(5000) 17 | cv2.destroyAllWindows() 18 | -------------------------------------------------------------------------------- /self_driving_agent/DQN_Control/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | import numpy as np 4 | from torchvision import transforms 5 | 6 | class ReplayBuffer(object): 7 | def __init__(self, state_dim, batch_size, buffer_size, device) -> None: 8 | self.batch_size = batch_size 9 | self.max_size = int(buffer_size) 10 | self.device = device 11 | 12 | self.ptr = 0 13 | self.crt_size = 0 14 | 15 | self.state = np.zeros((self.max_size,) + state_dim) 16 | self.action = np.zeros((self.max_size, 1)) 17 | self.next_state = np.array(self.state) 18 | self.reward = np.zeros((self.max_size, 1)) 19 | self.done = np.zeros((self.max_size, 1)) 20 | 21 | def add(self, state, action, next_state, reward, done): 22 | self.state[self.ptr] = state 23 | self.action[self.ptr] = action 24 | self.next_state[self.ptr] = next_state 25 | self.reward[self.ptr] = reward 26 | self.done[self.ptr] = done 27 | 28 | self.ptr = (self.ptr + 1) % self.max_size 29 | self.crt_size = min(self.crt_size + 1, self.max_size) 30 | 31 | def sample(self): 32 | ind = np.random.randint(0, self.crt_size, size=self.batch_size) 33 | return ( 34 | torch.FloatTensor(self.state[ind]).unsqueeze(1).to(self.device), 35 | torch.LongTensor(self.action[ind]).to(self.device), 36 | torch.FloatTensor(self.next_state[ind]).unsqueeze(1).to(self.device), 37 | torch.FloatTensor(self.reward[ind]).to(self.device), 38 | torch.FloatTensor(self.done[ind]).to(self.device) 39 | ) 40 | 41 | def test_buffer(): 42 | img0 = np.zeros((5, 5)) 43 | img1 = img0 + 1 44 | img2 = img0 + 2 45 | img3 = img0 + 3 46 | 47 | action = 1 48 | reward = 10 49 | done = 0 50 | 51 | device = "cpu" 52 | 53 | buffer = ReplayBuffer((5, 5), 2, 10, device) 54 | buffer.add(img0, action, img1, reward, done) 55 | buffer.add(img1, action, img2, reward, done) 56 | buffer.add(img2, action, img3, reward, done + 1) 57 | 58 | sample = buffer.sample()[0] 59 | print(sample.shape) 60 | 61 | norm = transforms.Normalize((0.5, 0.5), (0.5, 0.5)) 62 | 
print(norm(sample).shape) 63 | 64 | 65 | 66 | # test_buffer() 67 | -------------------------------------------------------------------------------- /self_driving_agent/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | This is the code accompanying the lecture on self-driving with CARLA, which can be found here: https://www.youtube.com/watch?v=MNiqlHC6Kn4&t 3 | You should only run this after installing CARLA and getting a grasp of how to run the simulator and call the Python API, which is documented very well on their site. 4 | 5 | # How to Run 6 | 7 | ## Setup 8 | I did not create a dependency or yml file (will do so at a later time), but you need carla, pygame, pytorch, opencv and numpy to run this project. 9 | 10 | You should ensure that you have a `weights` folder when you run the project. If you do not have one, just run `initial_setup.py` and it will create it for you. If you just cloned the repository, I recommend you run this file first. 11 | 12 | ## main.py 13 | Run this file if you want to evaluate the performance of your agent. 14 | ``` 15 | env = SimEnv(visuals=False) 16 | ``` 17 | 18 | The call above initializes our simulation environment. You should set visuals to `False` if you do not want to open this with pygame, or to `True` if you want a pygame window to open along with the simulator. 19 | 20 | ``` 21 | model.load('weights/model_ep_4400') 22 | ``` 23 | 24 | This loads a trained/pre-trained model. The program will not run unless it can load this model. 25 | The 4400 indicates that this model was trained for 4400 episodes. 26 | For example, if you train your own model for 200 episodes you will see the following files in the weights folder: 27 | 28 | `model_ep_200_optimizer` and `model_ep_200_Q` 29 | 30 | You can then load the model as `model.load('weights/model_ep_200')`. Please note, however, that this is likely to be a very bad model; it will only learn to drive effectively after many more episodes of training. 31 | 32 | ## train.py 33 | This is for training the model. The model only starts learning after a certain number of episodes, and it can take 8-10 hours (at least on my setup) before we see signs of learning. The following variables configure the training process; you can modify them yourself in `config.py`. 
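For reference, these are the values currently set in `config.py` (each variable is described below):
```
env_params = {
    'target_speed': 30,
    'max_iter': 4000,
    'start_buffer': 10,
    'train_freq': 1,
    'save_freq': 200,
    'start_ep': 0,
    'max_dist_from_waypoint': 20
}
```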
34 | 35 | `target_speed` --> Speed you want the car to move at, in km/h 36 | 37 | `max_iter` --> Maximum number of steps before starting a new episode 38 | 39 | `start_buffer` --> Number of episodes to run before starting training 40 | 41 | `train_freq` --> How often to train (set to 1 to train every step, 2 to train every 2 steps, etc.) 42 | 43 | `save_freq` --> How often (in episodes) to save the model 44 | 45 | `start_ep` --> Which episode to start on (just a counter you can update if, for example, the program crashes while training) 46 | 47 | `max_dist_from_waypoint` --> Maximum distance from the waypoint/road before we decide to terminate the episode 48 | -------------------------------------------------------------------------------- /self_driving_agent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/self_driving_agent/__init__.py -------------------------------------------------------------------------------- /self_driving_agent/config.py: -------------------------------------------------------------------------------- 1 | # dqn action values 2 | action_values = [-0.75, -0.5, -0.25, -0.15, -0.1, -0.05, 0, 3 | 0.05, 0.1, 0.15, 0.25, 0.5, 0.75] 4 | action_map = {i:x for i, x in enumerate(action_values)} 5 | 6 | env_params = { 7 | 'target_speed': 30, 8 | 'max_iter': 4000, 9 | 'start_buffer': 10, 10 | 'train_freq': 1, 11 | 'save_freq': 200, 12 | 'start_ep': 0, 13 | 'max_dist_from_waypoint': 20 14 | } 15 | -------------------------------------------------------------------------------- /self_driving_agent/controllers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | 5 | try: 6 | sys.path.append(glob.glob('../carla/dist/carla-*%d.%d-%s.egg' % ( 7 | sys.version_info.major, 8 | sys.version_info.minor, 9 | 'win-amd64' if os.name == 'nt' else 'linux-x86_64'))[0]) 10 | except IndexError: 11 | pass 12 | 13 | import carla 14 | import numpy as np 15 | from collections import deque 16 | from utils import get_speed 17 | 18 | class PIDLongitudinalController(): 19 | """ 20 | PIDLongitudinalController implements longitudinal control using a PID. 21 | """ 22 | 23 | def __init__(self, vehicle, max_throttle=0.75, max_brake=0.3, K_P=1.0, K_I=0.0, K_D=0.0, dt=0.03): 24 | """ 25 | Constructor method. 26 | :param vehicle: actor to apply the local planner logic onto 27 | :param K_P: Proportional term 28 | :param K_D: Differential term 29 | :param K_I: Integral term 30 | :param dt: time differential in seconds 31 | """ 32 | self._vehicle = vehicle 33 | self.max_throttle = max_throttle 34 | self.max_brake = max_brake 35 | self._k_p = K_P 36 | self._k_i = K_I 37 | self._k_d = K_D 38 | self._dt = dt 39 | self._error_buffer = deque(maxlen=10) 40 | 41 | def run_step(self, target_speed, debug=False): 42 | """ 43 | Execute one step of longitudinal control to reach a given target speed. 
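The PID output lies in [-1, 1]; positive values are applied as throttle and negative values as brake.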
44 | :param target_speed: target speed in Km/h 45 | :param debug: boolean for debugging 46 | :return: throttle control 47 | """ 48 | current_speed = get_speed(self._vehicle) 49 | 50 | if debug: 51 | print('Current speed = {}'.format(current_speed)) 52 | 53 | acceleration = self._pid_control(target_speed, current_speed) 54 | control = carla.VehicleControl() 55 | if acceleration >= 0.0: 56 | control.throttle = min(acceleration, self.max_throttle) 57 | control.brake = 0.0 58 | else: 59 | control.throttle = 0.0 60 | control.brake = min(abs(acceleration), self.max_brake) 61 | return control 62 | 63 | def _pid_control(self, target_speed, current_speed): 64 | """ 65 | Estimate the throttle/brake of the vehicle based on the PID equations 66 | :param target_speed: target speed in Km/h 67 | :param current_speed: current speed of the vehicle in Km/h 68 | :return: throttle/brake control 69 | """ 70 | 71 | error = target_speed - current_speed 72 | self._error_buffer.append(error) 73 | 74 | if len(self._error_buffer) >= 2: 75 | _de = (self._error_buffer[-1] - self._error_buffer[-2]) / self._dt 76 | _ie = sum(self._error_buffer) * self._dt 77 | else: 78 | _de = 0.0 79 | _ie = 0.0 80 | 81 | return np.clip((self._k_p * error) + (self._k_d * _de) + (self._k_i * _ie), -1.0, 1.0) 82 | 83 | def change_parameters(self, K_P, K_I, K_D, dt): 84 | """Changes the PID parameters""" 85 | self._k_p = K_P 86 | self._k_i = K_I 87 | self._k_d = K_D 88 | self._dt = dt 89 | 90 | 91 | -------------------------------------------------------------------------------- /self_driving_agent/environment.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import sys 4 | import numpy as np 5 | 6 | try: 7 | sys.path.append(glob.glob('../carla/dist/carla-*%d.%d-%s.egg' % ( 8 | sys.version_info.major, 9 | sys.version_info.minor, 10 | 'win-amd64' if os.name == 'nt' else 'linux-x86_64'))[0]) 11 | except IndexError: 12 | pass 13 | 14 | import carla 15 | import random 16 | import pickle 17 | 18 | from synch_mode import CarlaSyncMode 19 | from controllers import PIDLongitudinalController 20 | from utils import * 21 | 22 | random.seed(78) 23 | 24 | class SimEnv(object): 25 | def __init__(self, 26 | visuals=True, 27 | target_speed = 30, 28 | max_iter = 4000, 29 | start_buffer = 10, 30 | train_freq = 1, 31 | save_freq = 200, 32 | start_ep = 0, 33 | max_dist_from_waypoint = 20 34 | ) -> None: 35 | self.visuals = visuals 36 | if self.visuals: 37 | self._initiate_visuals() 38 | 39 | self.client = carla.Client('localhost', 2000) 40 | self.client.set_timeout(10.0) 41 | 42 | self.world = self.client.load_world('Town02_Opt') 43 | self.world.unload_map_layer(carla.MapLayer.Decals) 44 | self.world.unload_map_layer(carla.MapLayer.Foliage) 45 | self.world.unload_map_layer(carla.MapLayer.ParkedVehicles) 46 | self.world.unload_map_layer(carla.MapLayer.Particles) 47 | self.world.unload_map_layer(carla.MapLayer.Props) 48 | self.world.unload_map_layer(carla.MapLayer.StreetLights) 49 | 50 | 51 | self.spawn_points = self.world.get_map().get_spawn_points() 52 | 53 | self.blueprint_library = self.world.get_blueprint_library() 54 | self.vehicle_blueprint = self.blueprint_library.find('vehicle.nissan.patrol') 55 | 56 | # input these later on as arguments 57 | self.global_t = 0 # global timestep 58 | self.target_speed = target_speed # km/h 59 | self.max_iter = max_iter 60 | self.start_buffer = start_buffer 61 | self.train_freq = train_freq 62 | self.save_freq = save_freq 63 | self.start_ep = start_ep 
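# training in generate_episode() only starts once ep > start_ep + start_buffer (stored as start_train below)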
64 | 65 | self.max_dist_from_waypoint = max_dist_from_waypoint 66 | self.start_train = self.start_ep + self.start_buffer 67 | 68 | self.total_rewards = 0 69 | self.average_rewards_list = [] 70 | 71 | def _initiate_visuals(self): 72 | pygame.init() 73 | 74 | self.display = pygame.display.set_mode( 75 | (800, 600), 76 | pygame.HWSURFACE | pygame.DOUBLEBUF) 77 | self.font = get_font() 78 | self.clock = pygame.time.Clock() 79 | 80 | def create_actors(self): 81 | self.actor_list = [] 82 | # spawn vehicle at random location 83 | self.vehicle = self.world.spawn_actor(self.vehicle_blueprint, random.choice(self.spawn_points)) 84 | # vehicle.set_autopilot(True) 85 | self.actor_list.append(self.vehicle) 86 | 87 | self.camera_rgb = self.world.spawn_actor( 88 | self.blueprint_library.find('sensor.camera.rgb'), 89 | carla.Transform(carla.Location(x=1.5, z=2.4), carla.Rotation(pitch=-15)), 90 | attach_to=self.vehicle) 91 | self.actor_list.append(self.camera_rgb) 92 | 93 | self.camera_rgb_vis = self.world.spawn_actor( 94 | self.blueprint_library.find('sensor.camera.rgb'), 95 | carla.Transform(carla.Location(x=-5.5, z=2.8), carla.Rotation(pitch=-15)), 96 | attach_to=self.vehicle) 97 | self.actor_list.append(self.camera_rgb_vis) 98 | 99 | self.collision_sensor = self.world.spawn_actor( 100 | self.blueprint_library.find('sensor.other.collision'), 101 | carla.Transform(), 102 | attach_to=self.vehicle 103 | ) 104 | self.actor_list.append(self.collision_sensor) 105 | 106 | self.speed_controller = PIDLongitudinalController(self.vehicle) 107 | 108 | def reset(self): 109 | for actor in self.actor_list: 110 | actor.destroy() 111 | 112 | def generate_episode(self, model, replay_buffer, ep, action_map=None, eval=True): 113 | with CarlaSyncMode(self.world, self.camera_rgb, self.camera_rgb_vis, self.collision_sensor, fps=30) as sync_mode: 114 | counter = 0 115 | 116 | snapshot, image_rgb, image_rgb_vis, collision = sync_mode.tick(timeout=2.0) 117 | 118 | # destroy if there is no data 119 | if snapshot is None or image_rgb is None: 120 | print("No data, skipping episode") 121 | self.reset() 122 | return None 123 | 124 | image = process_img(image_rgb) 125 | next_state = image 126 | 127 | while True: 128 | if self.visuals: 129 | if should_quit(): 130 | return 131 | self.clock.tick_busy_loop(30) 132 | 133 | vehicle_location = self.vehicle.get_location() 134 | 135 | waypoint = self.world.get_map().get_waypoint(vehicle_location, project_to_road=True, 136 | lane_type=carla.LaneType.Driving) 137 | 138 | speed = get_speed(self.vehicle) 139 | 140 | # Advance the simulation and wait for the data. 
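# (the sync_mode.tick() call further below is what actually advances the simulation; here the frame fetched on the previous iteration becomes the current state)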
141 | state = next_state 142 | 143 | counter += 1 144 | self.global_t += 1 145 | 146 | 147 | action = model.select_action(state, eval=eval) 148 | steer = action 149 | if action_map is not None: 150 | steer = action_map[action] 151 | 152 | control = self.speed_controller.run_step(self.target_speed) 153 | control.steer = steer 154 | self.vehicle.apply_control(control) 155 | 156 | fps = round(1.0 / snapshot.timestamp.delta_seconds) 157 | 158 | snapshot, image_rgb, image_rgb_vis, collision = sync_mode.tick(timeout=2.0) 159 | 160 | cos_yaw_diff, dist, collision = get_reward_comp(self.vehicle, waypoint, collision) 161 | reward = reward_value(cos_yaw_diff, dist, collision) 162 | 163 | if snapshot is None or image_rgb is None: 164 | print("Process ended here") 165 | break 166 | 167 | image = process_img(image_rgb) 168 | 169 | done = 1 if collision else 0 170 | 171 | self.total_rewards += reward 172 | 173 | next_state = image 174 | 175 | replay_buffer.add(state, action, next_state, reward, done) 176 | 177 | if not eval: 178 | if ep > self.start_train and (self.global_t % self.train_freq) == 0: 179 | model.train(replay_buffer) 180 | 181 | # Draw the display. 182 | if self.visuals: 183 | draw_image(self.display, image_rgb_vis) 184 | self.display.blit( 185 | self.font.render('% 5d FPS (real)' % self.clock.get_fps(), True, (255, 255, 255)), 186 | (8, 10)) 187 | self.display.blit( 188 | self.font.render('% 5d FPS (simulated)' % fps, True, (255, 255, 255)), 189 | (8, 28)) 190 | pygame.display.flip() 191 | 192 | if collision == 1 or counter >= self.max_iter or dist > self.max_dist_from_waypoint: 193 | print("Episode {} processed".format(ep), counter) 194 | break 195 | 196 | if ep % self.save_freq == 0 and ep > 0: 197 | self.save(model, ep) 198 | 199 | def save(self, model, ep): 200 | if ep % self.save_freq == 0 and ep > self.start_ep: 201 | avg_reward = self.total_rewards/self.save_freq 202 | self.average_rewards_list.append(avg_reward) 203 | self.total_rewards = 0 204 | 205 | model.save('weights/model_ep_{}'.format(ep)) 206 | 207 | print("Saved model with average reward =", avg_reward) 208 | 209 | def quit(self): 210 | pygame.quit() 211 | 212 | def get_reward_comp(vehicle, waypoint, collision): 213 | vehicle_location = vehicle.get_location() 214 | x_wp = waypoint.transform.location.x 215 | y_wp = waypoint.transform.location.y 216 | 217 | x_vh = vehicle_location.x 218 | y_vh = vehicle_location.y 219 | 220 | wp_array = np.array([x_wp, y_wp]) 221 | vh_array = np.array([x_vh, y_vh]) 222 | 223 | dist = np.linalg.norm(wp_array - vh_array) 224 | 225 | vh_yaw = correct_yaw(vehicle.get_transform().rotation.yaw) 226 | wp_yaw = correct_yaw(waypoint.transform.rotation.yaw) 227 | cos_yaw_diff = np.cos((vh_yaw - wp_yaw)*np.pi/180.) 
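# cos_yaw_diff is 1 when the vehicle heading matches the waypoint heading and decreases as the two headings diverge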
228 | 229 | collision = 0 if collision is None else 1 230 | 231 | return cos_yaw_diff, dist, collision 232 | 233 | def reward_value(cos_yaw_diff, dist, collision, lambda_1=1, lambda_2=1, lambda_3=5): 234 | reward = (lambda_1 * cos_yaw_diff) - (lambda_2 * dist) - (lambda_3 * collision) 235 | return reward 236 | -------------------------------------------------------------------------------- /self_driving_agent/initial_setup.py: -------------------------------------------------------------------------------- 1 | from utils import create_folders 2 | 3 | # automatically creates folders that may not exist, or ignores if they do 4 | folders = ['weights'] 5 | create_folders(folders) 6 | -------------------------------------------------------------------------------- /self_driving_agent/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from DQN_Control.replay_buffer import ReplayBuffer 4 | from DQN_Control.model import DQN 5 | 6 | from config import action_map, env_params 7 | from utils import * 8 | from environment import SimEnv 9 | 10 | def run(): 11 | try: 12 | buffer_size = 1e4 13 | batch_size = 32 14 | state_dim = (128, 128) 15 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 16 | device = "cpu" 17 | num_actions = len(action_map) 18 | in_channels = 1 19 | episodes = 10000 20 | 21 | replay_buffer = ReplayBuffer(state_dim, batch_size, buffer_size, device) 22 | model = DQN(num_actions, state_dim, in_channels, device) 23 | 24 | # this only works if you have a model in your weights folder. Replace this by that file 25 | model.load('weights/model_ep_4400') 26 | 27 | # set to True if you want to run with pygame 28 | env = SimEnv(visuals=True, **env_params) 29 | 30 | for ep in range(episodes): 31 | env.create_actors() 32 | env.generate_episode(model, replay_buffer, ep, action_map, eval=True) 33 | env.reset() 34 | finally: 35 | env.reset() 36 | env.quit() 37 | 38 | if __name__ == "__main__": 39 | run() 40 | -------------------------------------------------------------------------------- /self_driving_agent/synch_mode.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import sys 4 | 5 | try: 6 | sys.path.append(glob.glob('../carla/dist/carla-*%d.%d-%s.egg' % ( 7 | sys.version_info.major, 8 | sys.version_info.minor, 9 | 'win-amd64' if os.name == 'nt' else 'linux-x86_64'))[0]) 10 | except IndexError: 11 | pass 12 | 13 | import carla 14 | import queue 15 | 16 | class CarlaSyncMode(object): 17 | """ 18 | Context manager to synchronize output from different sensors. 
Synchronous 19 | mode is enabled as long as we are inside this context 20 | 21 | with CarlaSyncMode(world, sensors) as sync_mode: 22 | while True: 23 | data = sync_mode.tick(timeout=1.0) 24 | 25 | """ 26 | 27 | def __init__(self, world, *sensors, **kwargs): 28 | self.world = world 29 | self.sensors = sensors 30 | self.frame = None 31 | self.delta_seconds = 1.0 / kwargs.get('fps', 20) 32 | self._queues = [] 33 | self._settings = None 34 | self.collisions = [] 35 | 36 | def __enter__(self): 37 | self._settings = self.world.get_settings() 38 | self.frame = self.world.apply_settings(carla.WorldSettings( 39 | no_rendering_mode=False, 40 | synchronous_mode=True, 41 | fixed_delta_seconds=self.delta_seconds)) 42 | 43 | def make_queue(register_event): 44 | q = queue.Queue() 45 | register_event(q.put) 46 | self._queues.append(q) 47 | 48 | make_queue(self.world.on_tick) 49 | for sensor in self.sensors: 50 | make_queue(sensor.listen) 51 | return self 52 | 53 | def tick(self, timeout): 54 | try: 55 | self.frame = self.world.tick() 56 | data = [self._retrieve_data(q, timeout) for q in self._queues[:-1]] 57 | # collision sensor is the last element in the queue 58 | collision = self._detect_collision(self._queues[-1]) 59 | 60 | assert all(x.frame == self.frame for x in data) 61 | 62 | return data + [collision] 63 | except queue.Empty: 64 | print("empty queue") 65 | return None, None, None 66 | 67 | def __exit__(self, *args, **kwargs): 68 | self.world.apply_settings(self._settings) 69 | 70 | def _retrieve_data(self, sensor_queue, timeout): 71 | while True: 72 | data = sensor_queue.get(timeout=timeout) 73 | if data.frame == self.frame: 74 | return data 75 | 76 | def _detect_collision(self, sensor): 77 | # This collision is not fully aligned with other sensors, fix later 78 | try: 79 | data = sensor.get(block=False) 80 | return data 81 | except queue.Empty: 82 | return None -------------------------------------------------------------------------------- /self_driving_agent/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | from DQN_Control.replay_buffer import ReplayBuffer 5 | from DQN_Control.model import DQN 6 | 7 | from config import action_map, env_params 8 | from utils import * 9 | from environment import SimEnv 10 | 11 | def run(): 12 | try: 13 | buffer_size = 1e4 14 | batch_size = 32 15 | state_dim = (128, 128) 16 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 17 | num_actions = len(action_map) 18 | in_channels = 1 19 | episodes = 10000 20 | 21 | replay_buffer = ReplayBuffer(state_dim, batch_size, buffer_size, device) 22 | model = DQN(num_actions, state_dim, in_channels, device) 23 | 24 | env = SimEnv(visuals=False, **env_params) 25 | 26 | for ep in range(episodes): 27 | env.create_actors() 28 | env.generate_episode(model, replay_buffer, ep, action_map, eval=False) 29 | env.reset() 30 | finally: 31 | env.quit() 32 | 33 | if __name__ == "__main__": 34 | run() 35 | -------------------------------------------------------------------------------- /self_driving_agent/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import pygame 4 | import math 5 | import numpy as np 6 | 7 | def process_img(image, dim_x=128, dim_y=128): 8 | array = np.frombuffer(image.raw_data, dtype=np.dtype("uint8")) 9 | array = np.reshape(array, (image.height, image.width, 4)) 10 | array = array[:, :, :3] 11 | array = array[:, :, ::-1] 12 | 13 | # scale_percent = 25 14 | 
# width = int(array.shape[1] * scale_percent/100) 15 | # height = int(array.shape[0] * scale_percent/100) 16 | 17 | # dim = (width, height) 18 | dim = (dim_x, dim_y) # set same dim for now 19 | resized_img = cv2.resize(array, dim, interpolation=cv2.INTER_AREA) 20 | img_gray = cv2.cvtColor(resized_img, cv2.COLOR_BGR2GRAY) 21 | scaledImg = img_gray/255. 22 | 23 | # normalize 24 | mean, std = 0.5, 0.5 25 | normalizedImg = (scaledImg - mean) / std 26 | 27 | return normalizedImg 28 | 29 | def draw_image(surface, image, blend=False): 30 | array = np.frombuffer(image.raw_data, dtype=np.dtype("uint8")) 31 | array = np.reshape(array, (image.height, image.width, 4)) 32 | array = array[:, :, :3] 33 | array = array[:, :, ::-1] 34 | image_surface = pygame.surfarray.make_surface(array.swapaxes(0, 1)) 35 | if blend: 36 | image_surface.set_alpha(100) 37 | surface.blit(image_surface, (0, 0)) 38 | 39 | def get_font(): 40 | fonts = [x for x in pygame.font.get_fonts()] 41 | default_font = 'ubuntumono' 42 | font = default_font if default_font in fonts else fonts[0] 43 | font = pygame.font.match_font(font) 44 | return pygame.font.Font(font, 14) 45 | 46 | def should_quit(): 47 | for event in pygame.event.get(): 48 | if event.type == pygame.QUIT: 49 | return True 50 | elif event.type == pygame.KEYUP: 51 | if event.key == pygame.K_ESCAPE: 52 | return True 53 | return False 54 | 55 | def get_speed(vehicle): 56 | """ 57 | Compute speed of a vehicle in Km/h. 58 | :param vehicle: the vehicle for which speed is calculated 59 | :return: speed as a float in Km/h 60 | """ 61 | vel = vehicle.get_velocity() 62 | 63 | return 3.6 * math.sqrt(vel.x ** 2 + vel.y ** 2 + vel.z ** 2) 64 | 65 | def correct_yaw(x): 66 | return(((x%360) + 360) % 360) 67 | 68 | def create_folders(folder_names): 69 | for directory in folder_names: 70 | if not os.path.exists(directory): 71 | # If it doesn't exist, create it 72 | os.makedirs(directory) --------------------------------------------------------------------------------