├── .gitignore ├── BlackJackMonteCarlo ├── black_jack_sampler.py ├── cards.py ├── cards_set │ └── img │ │ ├── 10C.png │ │ ├── 10D.png │ │ ├── 10H.png │ │ ├── 10S.png │ │ ├── 2C.png │ │ ├── 2D.png │ │ ├── 2H.png │ │ ├── 2S.png │ │ ├── 3C.png │ │ ├── 3D.png │ │ ├── 3H.png │ │ ├── 3S.png │ │ ├── 4C.png │ │ ├── 4D.png │ │ ├── 4H.png │ │ ├── 4S.png │ │ ├── 5C.png │ │ ├── 5D.png │ │ ├── 5H.png │ │ ├── 5S.png │ │ ├── 6C.png │ │ ├── 6D.png │ │ ├── 6H.png │ │ ├── 6S.png │ │ ├── 7C.png │ │ ├── 7D.png │ │ ├── 7H.png │ │ ├── 7S.png │ │ ├── 8C.png │ │ ├── 8D.png │ │ ├── 8H.png │ │ ├── 8S.png │ │ ├── 9C.png │ │ ├── 9D.png │ │ ├── 9H.png │ │ ├── 9S.png │ │ ├── AC.png │ │ ├── AD.png │ │ ├── AH.png │ │ ├── AS.png │ │ ├── JC.png │ │ ├── JD.png │ │ ├── JH.png │ │ ├── JS.png │ │ ├── KC.png │ │ ├── KD.png │ │ ├── KH.png │ │ ├── KS.png │ │ ├── QC.png │ │ ├── QD.png │ │ ├── QH.png │ │ ├── QS.png │ │ └── jb_card.png ├── config.py ├── controllers │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ └── manual_controller.cpython-35.pyc │ └── manual_controller.py ├── environment.py ├── first_visit_mc.py ├── game.py └── player.py ├── DoubleQLearning ├── double_qlearning.py ├── environment.py ├── main.py └── qlearning.py ├── README.md ├── SimplePolicyIteration ├── .vscode │ └── settings.json ├── agent.py ├── config.py ├── config.pyc ├── controller.py ├── controller.pyc ├── environment.py ├── environment.pyc ├── game.py ├── game_widgets.py ├── game_widgets.pyc ├── images │ ├── baby_r.png │ ├── beat_r.png │ ├── grass_r.png │ └── kfc_r.png └── policy_iteration.py ├── mountain_car ├── README.md ├── __init__.py ├── main.py ├── semi_gradient_sarsa.py ├── tile_coding.py ├── tilings.pkl ├── train.py └── weights.pkl └── self_driving_agent ├── DQN_Control ├── __init__.py ├── model.py ├── process_img.py └── replay_buffer.py ├── README.md ├── __init__.py ├── config.py ├── controllers.py ├── environment.py ├── initial_setup.py ├── main.py ├── synch_mode.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | */__pycache__ 2 | */myenv 3 | */weights/* 4 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/black_jack_sampler.py: -------------------------------------------------------------------------------- 1 | from cards import cards 2 | from player import Player 3 | from environment import Environment 4 | 5 | # We assume cards are drawn from an inifinite set with replacement 6 | 7 | class BlackJackSampler(object): 8 | def __init__(self): 9 | pass 10 | 11 | def generate_episode(self, policy_func): 12 | dealer = Player(cards) 13 | player = Player(cards) 14 | env = Environment(player, dealer) 15 | 16 | state = env.state() 17 | episode_trace = [state] 18 | while True: 19 | action = policy_func(state) 20 | episode_trace.append(action) 21 | state, reward, done = env.step(action) 22 | episode_trace.append(reward) 23 | 24 | if done: 25 | break 26 | episode_trace.append(state) 27 | return episode_trace 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards.py: -------------------------------------------------------------------------------- 1 | cards = { str(i): i for i in range(2, 11) } 2 | 3 | cards['A'] = 1 4 | cards['J'] = 10 5 | cards['Q'] = 10 6 | cards['K'] = 10 7 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/10C.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/10C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/10D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/10D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/10H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/10H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/10S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/10S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/2C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/2C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/2D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/2D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/2H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/2H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/2S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/2S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/3C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/3C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/3D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/3D.png -------------------------------------------------------------------------------- 
/BlackJackMonteCarlo/cards_set/img/3H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/3H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/3S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/3S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/4C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/4C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/4D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/4D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/4H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/4H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/4S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/4S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/5C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/5C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/5D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/5D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/5H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/5H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/5S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/5S.png 
-------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/6C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/6C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/6D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/6D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/6H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/6H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/6S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/6S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/7C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/7C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/7D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/7D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/7H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/7H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/7S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/7S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/8C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/8C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/8D.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/8D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/8H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/8H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/8S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/8S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/9C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/9C.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/9D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/9D.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/9H.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/9H.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/9S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/9S.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/AC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/AC.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/AD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/AD.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/AH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/AH.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/AS.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/AS.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/JC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/JC.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/JD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/JD.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/JH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/JH.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/JS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/JS.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/KC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/KC.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/KD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/KD.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/KH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/KH.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/KS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/KS.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/QC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/QC.png -------------------------------------------------------------------------------- 
/BlackJackMonteCarlo/cards_set/img/QD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/QD.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/QH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/QH.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/QS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/QS.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/cards_set/img/jb_card.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/cards_set/img/jb_card.png -------------------------------------------------------------------------------- /BlackJackMonteCarlo/config.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | screen_height = 800 3 | screen_width = 1200 4 | card_height = 230 5 | card_width = 150 6 | delay = 100 7 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/controllers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/controllers/__init__.py -------------------------------------------------------------------------------- /BlackJackMonteCarlo/controllers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/controllers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /BlackJackMonteCarlo/controllers/__pycache__/manual_controller.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/BlackJackMonteCarlo/controllers/__pycache__/manual_controller.cpython-35.pyc -------------------------------------------------------------------------------- /BlackJackMonteCarlo/controllers/manual_controller.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | 3 | class ManualController: 4 | keys = None 5 | 6 | def set_key_map(key_map): 7 | ManualController.keys = key_map 8 | 9 | def stick(): 10 | return ManualController.keys[pygame.K_LEFT] 11 | 12 | def hit(): 13 | return ManualController.keys[pygame.K_RIGHT] 14 | 15 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/environment.py: 
-------------------------------------------------------------------------------- 1 | 2 | class Environment(object): 3 | def __init__(self, player, dealer): 4 | self.player = player 5 | self.dealer = dealer # The dealer is actually part of the env 6 | self.dealer_thresh = 17 7 | self.bust_thresh = 21 8 | 9 | def state(self): 10 | return self.player.total(), self.dealer.first_card_total() 11 | 12 | def selected_cards_state(self): 13 | # This is just to help with the visualization in pygame 14 | if not self.dealer.stick_: 15 | return self.player.chosen_cards, [self.dealer.chosen_cards[1]] 16 | return self.player.chosen_cards, self.dealer.chosen_cards 17 | 18 | def step(self, action): 19 | # Map 0 - hit, 1 - stick 20 | if action == 0: 21 | self.player.hit() 22 | elif action == 1: 23 | self.player.stick() 24 | 25 | state = self.player.total(), self.dealer.first_card_total() 26 | # player is done when bust or after he sticks 27 | player_done = (self.player.total() > self.bust_thresh) or (action == 1) 28 | reward = self._reward(state, player_done) 29 | return state, reward, player_done 30 | 31 | def _reward(self, state, player_done): 32 | if not player_done: 33 | return 0 34 | while self.dealer.total() < self.dealer_thresh: 35 | self.dealer.hit() 36 | self.dealer.stick() 37 | 38 | player_bust = self.player.total() > self.bust_thresh 39 | dealer_bust = self.dealer.total() > self.bust_thresh 40 | player_scored_higher = self.player.total() > self.dealer.total() 41 | dealer_scored_higher = self.player.total() < self.dealer.total() 42 | 43 | # scoring when any of the players bust 44 | if player_bust and dealer_bust: 45 | return 0 46 | elif player_bust and (not dealer_bust): 47 | return -1 48 | elif (not player_bust) and dealer_bust: 49 | return 1 50 | 51 | # scoring based on scores 52 | if player_scored_higher: 53 | return 1 54 | elif dealer_scored_higher: 55 | return -1 56 | else: 57 | return 0 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/first_visit_mc.py: -------------------------------------------------------------------------------- 1 | # This import registers the 3D projection, but is otherwise unused. 
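# How the sampled traces are laid out (see BlackJackSampler.generate_episode):
#   [s0, a0, r0, s1, a1, r1, ..., s_{T-1}, a_{T-1}, r_{T-1}]
# Each state is (player_total, dealer_showing_total), actions are 0 = hit and
# 1 = stick, and the reward stays 0 until the terminal step, where
# Environment._reward returns +1, -1 or 0. An illustrative trace under the
# stick-at-20 policy defined below would be [(13, 10), 0, 0, (20, 10), 1, 1].
# The backward loop in run() therefore walks the list three items at a time,
# reading reward, action, state; with no discounting, G is a plain running sum
# of rewards, and the membership test against episode[:i-2] keeps only the
# return from the first visit to each state in the trace.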
2 | from mpl_toolkits.mplot3d import Axes3D # noqa: F401 unused import 3 | 4 | import matplotlib.pyplot as plt 5 | from matplotlib import cm 6 | from matplotlib.ticker import LinearLocator, FormatStrFormatter 7 | import numpy as np 8 | from black_jack_sampler import BlackJackSampler 9 | 10 | # For solving the prediction problem 11 | class FirstVisitMC(object): 12 | def __init__(self): 13 | self.values = {} 14 | self.episodes = 100 15 | 16 | def policy(self, state): 17 | if state[0] >= 20: 18 | return 1 # stick 19 | else: 20 | return 0 21 | 22 | def run(self): 23 | 24 | black_jack = BlackJackSampler() 25 | 26 | for i in range(100000): 27 | episode = black_jack.generate_episode(self.policy) 28 | G = 0 29 | for i in range(len(episode)-1, 0, -3): 30 | reward = episode[i] 31 | action = episode[i-1] 32 | state = episode[i-2] 33 | 34 | G = G+reward 35 | 36 | if state in episode[:i-2]: 37 | # it is not our first visit to this state 38 | continue 39 | 40 | if state in self.values: 41 | self.values[state].append(G) 42 | else: 43 | self.values[state] = [G] 44 | 45 | self.values = { key: sum(self.values[key])/len(self.values[key]) for key in self.values} 46 | return self.values 47 | 48 | def plot_value_function(self): 49 | x = np.arange(1, 22) 50 | y = np.arange(1, 10) 51 | 52 | x, y = np.meshgrid(x, y) 53 | 54 | z = [] 55 | 56 | for (row_ind, i) in enumerate(x): 57 | temp = [] 58 | for (col_ind, j) in enumerate(x[row_ind]): 59 | x_val = j 60 | y_val = y[row_ind, col_ind] 61 | if (x_val, y_val) in self.values: 62 | temp.append(self.values[(x_val, y_val)]) 63 | else: 64 | temp.append(0) 65 | z.append(temp) 66 | z = np.array(z) 67 | 68 | fig = plt.figure() 69 | ax = fig.add_subplot(projection='3d') 70 | 71 | # Plot the surface. 72 | surf = ax.plot_surface(y, x, z, cmap=cm.coolwarm, 73 | linewidth=0, antialiased=False) 74 | 75 | ax.set_xlabel("Dealer's showing card") 76 | ax.set_ylabel("Player Sum") 77 | ax.set_zlabel("Value") 78 | 79 | # Customize the z axis. 80 | ax.set_zlim(-1.01, 1.01) 81 | ax.zaxis.set_major_locator(LinearLocator(10)) 82 | ax.zaxis.set_major_formatter(FormatStrFormatter('%.01f')) 83 | 84 | # Add a color bar which maps values to colors.
85 | fig.colorbar(surf, shrink=0.5, aspect=5) 86 | 87 | plt.show() 88 | 89 | 90 | 91 | fv_mc = FirstVisitMC() 92 | fv_mc.run() 93 | fv_mc.plot_value_function() 94 | -------------------------------------------------------------------------------- /BlackJackMonteCarlo/game.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pygame 3 | from config import Config 4 | from cards import cards 5 | from player import Player 6 | from environment import Environment 7 | from controllers.manual_controller import ManualController 8 | 9 | class BlackJack(object): 10 | def __init__(self, config, env, controller): 11 | pygame.init() 12 | self.win_ = pygame.display.set_mode((config.screen_width, 13 | config.screen_height)) 14 | pygame.display.set_caption("Jabrah") 15 | 16 | self.config_ = config 17 | self.env_ = env 18 | self.controller_ = controller 19 | 20 | self.dealer_wins = -2 21 | 22 | 23 | def draw_env(self): 24 | self.win_.fill((0, 128, 0)) 25 | init_offset = 10 26 | text_font_size = 50 27 | root_dir = "cards_set/img/" 28 | img_ext = '.png' 29 | 30 | font1 = pygame.font.SysFont(None, text_font_size) 31 | font2 = pygame.font.SysFont(None, text_font_size) 32 | dealer_text = font1.render("Dealer's Cards", True, (255, 255, 255)) 33 | player_text = font1.render("Player's Cards", True, (255, 255, 255)) 34 | self.win_.blit(dealer_text, (init_offset, init_offset)) 35 | self.win_.blit(player_text, ( 36 | init_offset, self.config_.screen_height - text_font_size - init_offset)) 37 | 38 | 39 | player_cards, dealer_cards = self.env_.selected_cards_state() 40 | 41 | player_cards = [c+s+img_ext for c, s in player_cards] 42 | dealer_cards = [c+s+img_ext for c, s in dealer_cards] 43 | 44 | 45 | if len(dealer_cards) <= 1: 46 | dealer_cards = ['jb_card.png'] + dealer_cards 47 | 48 | for (i, card_image) in enumerate(dealer_cards): 49 | card_obj = pygame.image.load(root_dir+card_image) 50 | x_coord = (i+1) * init_offset + (i * self.config_.card_width) 51 | y_coord = 2*init_offset + text_font_size 52 | self.win_.blit(card_obj, (x_coord, y_coord)) 53 | 54 | for (i, card_image) in enumerate(player_cards): 55 | card_obj = pygame.image.load(root_dir+card_image) 56 | x_coord = (i+1) * init_offset + (i * self.config_.card_width ) 57 | y_coord = self.config_.screen_height - self.config_.card_height - text_font_size - 3*init_offset 58 | self.win_.blit(card_obj, (x_coord, y_coord)) 59 | 60 | # update scores 61 | score_text = "Score: " 62 | 63 | dealer_score = str(self.env_.dealer.first_card_total()) if not self.env_.dealer.stick_\ 64 | else str(self.env_.dealer.total()) 65 | player_score = str(self.env_.player.total()) 66 | 67 | dealer_score_text = font2.render(score_text + dealer_score, True, (255, 255, 0)) 68 | player_score_text = font2.render(score_text + player_score, True, (255, 255, 0)) 69 | self.win_.blit(dealer_score_text, (self.config_.screen_width - 250, init_offset)) 70 | self.win_.blit( 71 | player_score_text, ( 72 | self.config_.screen_width - 250, 73 | self.config_.screen_height - text_font_size - init_offset)) 74 | 75 | if self.dealer_wins == -1: 76 | win_text = font2.render("DRAW", True, (255, 0, 0)) 77 | self.win_.blit(win_text, (self.config_.screen_width//2, self.config_.screen_height//2)) 78 | elif self.dealer_wins == 0: 79 | win_text = font2.render("WIN", True, (0, 0, 128)) 80 | self.win_.blit(win_text, (self.config_.screen_width//2, self.config_.screen_height//2)) 81 | elif self.dealer_wins == 1: 82 | win_text = font2.render("LOST", True, (128, 0, 
0)) 83 | self.win_.blit(win_text, (self.config_.screen_width//2, self.config_.screen_height//2)) 84 | 85 | def start(self): 86 | run = True 87 | state = None 88 | reward = 0 89 | done = False 90 | 91 | while (run): 92 | pygame.time.delay(self.config_.delay) 93 | for event in pygame.event.get(): 94 | if event.type == pygame.QUIT: 95 | run = False 96 | 97 | # Game Logic 98 | keys = pygame.key.get_pressed() 99 | self.controller_.set_key_map(keys) 100 | 101 | if self.controller_.hit(): 102 | state, reward, done = self.env_.step(0) 103 | elif self.controller_.stick(): 104 | state, reward, done = self.env_.step(1) 105 | 106 | if done: 107 | if reward == 1: 108 | self.dealer_wins = 0 109 | elif reward == -1: 110 | self.dealer_wins = 1 111 | else: 112 | self.dealer_wins = -1 113 | 114 | self.draw_env() 115 | pygame.display.update() 116 | pygame.quit() 117 | 118 | player = Player(cards) 119 | dealer = Player(cards) 120 | env = Environment(player, dealer) 121 | 122 | game = BlackJack(Config, env, ManualController) 123 | game.draw_env() 124 | pygame.display.update() 125 | game.start() -------------------------------------------------------------------------------- /BlackJackMonteCarlo/player.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | class Player(object): 4 | def __init__(self, cards): 5 | self.cards = cards 6 | self.chosen_cards = [self._pick_card(), self._pick_card()] 7 | self.stick_ = False 8 | 9 | def _pick_card(self): 10 | card_suits = ['C', 'D', 'H', 'S'] 11 | 12 | return random.choice(list(self.cards.keys())), random.choice(card_suits) 13 | 14 | def hit(self): 15 | # Request for additional cards 16 | card_choice = self._pick_card() 17 | 18 | card_value = self.cards[card_choice[0]] 19 | 20 | self.chosen_cards.append(card_choice) 21 | 22 | def stick(self): 23 | self.stick_ = True 24 | 25 | def total(self): 26 | # Stop requesting for additional cards 27 | total = 0 28 | for (chosen_card, _) in self.chosen_cards: 29 | total += self.cards[chosen_card] 30 | return total 31 | 32 | def first_card_total(self): 33 | # If the player is a dealer 34 | return self.cards[self.chosen_cards[1][0]] 35 | 36 | -------------------------------------------------------------------------------- /DoubleQLearning/double_qlearning.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | class DoubleQLearningAgent(object): 5 | def __init__(self, env) -> None: 6 | super().__init__() 7 | self.e = 0.1 8 | self.alpha = 0.1 9 | self.gamma = 1 10 | self.num_episodes = 300 11 | 12 | self.env = env 13 | 14 | self.init_state = env.start_state 15 | self.q1_sa = {x: 0 for x in env.state_transitions} 16 | self.q2_sa = {x: 0 for x in env.state_transitions} 17 | 18 | def reset_policy(self): 19 | self.q1_sa = {x: 0 for x in self.env.state_transitions} 20 | self.q2_sa = {x: 0 for x in self.env.state_transitions} 21 | 22 | def soft_policy(self, state): 23 | possible_actions = self.env.possible_actions[state] 24 | if len(possible_actions) > 1: 25 | prob_explorative_action = self.e / len(possible_actions) 26 | prob_greedy_action = 1 - self.e + prob_explorative_action 27 | 28 | q1_values = [] 29 | q2_values = [] 30 | corresponding_actions = [] 31 | for a in possible_actions: 32 | if (state, a) in self.q1_sa: 33 | q1_values.append(self.q1_sa[(state, a)]) 34 | q2_values.append(self.q2_sa[(state, a)]) 35 | corresponding_actions.append(a) 36 | 37 | # check if all q_values are the same, if so then take 
random action 38 | q_values = np.array(q1_values) + np.array(q2_values) 39 | if all(x == q_values[0] for x in q_values): 40 | return random.choice(possible_actions) 41 | greedy_action = corresponding_actions[np.argmax(q_values)] 42 | non_greedy_actions = list(set(possible_actions) - set([greedy_action])) 43 | 44 | action = np.random.choice([greedy_action]+non_greedy_actions, 45 | p=[prob_greedy_action]+[prob_explorative_action for i in range(len(non_greedy_actions))]) 46 | return action 47 | return possible_actions[0] 48 | 49 | 50 | def generate_episode(self): 51 | episode = [] 52 | state = self.init_state 53 | while True: 54 | action = self.soft_policy(state) 55 | reward, next_state, done = self.env.step(state, action) 56 | episode.append((state, action, reward, next_state)) 57 | state = next_state 58 | if done: 59 | break 60 | return episode 61 | 62 | 63 | def update_policy(self): 64 | n_iters = 1000 65 | final_left_array = [] 66 | final_right_array = [] 67 | for _ in range(n_iters): 68 | self.reset_policy() 69 | left_actions_count = [] 70 | right_actions_count = [] 71 | 72 | for i in range(self.num_episodes): 73 | left = 0 74 | right = 0 75 | 76 | # generate episode using policy defined above 77 | episode = self.generate_episode() 78 | for s, a, r, s_p in episode: 79 | # use a soft policy as behavior policy e.g epsilon-greedy 80 | action = a 81 | if s == self.init_state and action == 'left': 82 | left += 1 83 | if s == self.init_state and action == 'right': 84 | right += 1 85 | possible_future_actions = self.env.possible_actions[s_p] 86 | 87 | if np.random.rand() > 0.5: 88 | q2_values = [(self.q2_sa[(s_p, a_)], a_) for a_ in possible_future_actions if (s_p, a_) in self.q2_sa] 89 | max_q_value = self.q1_sa[(s_p, max(q2_values)[1])] if len(q2_values) > 0 else 0 90 | # update q function using different target policy 91 | self.q2_sa[(s, action)] = self.q2_sa[(s, action)] + (self.alpha * ((r + self.gamma * max_q_value) - self.q2_sa[(s, action)])) 92 | else: 93 | q1_values = [(self.q1_sa[(s_p, a_)], a_) for a_ in possible_future_actions if (s_p, a_) in self.q1_sa] 94 | max_q_value = self.q2_sa[(s_p, max(q1_values)[1])] if len(q1_values) > 0 else 0 95 | # update q function using different target policy 96 | self.q1_sa[(s, action)] = self.q1_sa[(s, action)] + (self.alpha * ((r + self.gamma * max_q_value) - self.q1_sa[(s, action)])) 97 | left_actions_count.append(left) 98 | right_actions_count.append(right) 99 | left_actions_count = np.array(left_actions_count) 100 | right_actions_count = np.array(right_actions_count) 101 | 102 | final_left_array.append(left_actions_count) 103 | final_right_array.append(right_actions_count) 104 | 105 | final_left_array = np.array(final_left_array) 106 | final_right_array = np.array(final_right_array) 107 | out = 100 * final_left_array.sum(axis=0) / (final_left_array.sum(axis=0) + final_right_array.sum(axis=0)) 108 | return out 109 | 110 | 111 | -------------------------------------------------------------------------------- /DoubleQLearning/environment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Environment(object): 4 | def __init__(self) -> None: 5 | 6 | super().__init__() 7 | self.start_state = 'A' 8 | 9 | self.terminal_state = 'T' 10 | 11 | self.states = ['A', 'B', 'T'] 12 | 13 | self.b_actions = list(range(1, 10)) 14 | 15 | self.possible_actions = { 16 | 'A': ['left', 'right'], 17 | 'B': self.b_actions, 18 | 'T': [] 19 | } 20 | 21 | self.state_transitions = {('B', i): 'T' for i in 
self.b_actions} 22 | self.state_transitions[('A', 'left')] = 'B' 23 | self.state_transitions[('A', 'right')] = 'T' 24 | 25 | 26 | def reward(self, state, action): 27 | if (state == 'B'): 28 | mu, sigma = -0.1, 1 29 | return np.random.normal(mu, sigma, 1)[0] 30 | return 0 31 | 32 | def step(self, state, action): 33 | state = state 34 | reward = self.reward(state, action) 35 | next_state = self.state_transitions[(state, action)] 36 | done = (next_state == self.terminal_state) 37 | return reward, next_state, done 38 | -------------------------------------------------------------------------------- /DoubleQLearning/main.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | 4 | from environment import Environment 5 | from qlearning import QLearningAgent 6 | from double_qlearning import DoubleQLearningAgent 7 | 8 | sns.set() 9 | 10 | 11 | env = Environment() 12 | agent = QLearningAgent(env) 13 | agent2 = DoubleQLearningAgent(env) 14 | 15 | left_actions_ratio_a1 = agent.update_policy() 16 | left_actions_ratio_a2 = agent2.update_policy() 17 | 18 | 19 | fig, ax = plt.subplots() 20 | ax.plot(range(len(left_actions_ratio_a1)), left_actions_ratio_a1, color="red", label="Q-Learning") 21 | ax.plot(range(len(left_actions_ratio_a2)), left_actions_ratio_a2, color="green", label="Double Q-Learning") 22 | ax.plot(range(len(left_actions_ratio_a1)), [5]*len(left_actions_ratio_a1), '--', color='black', label='optimal') 23 | ax.set_xlabel("Number of episodes") 24 | ax.set_ylabel("% of left actions from A") 25 | ax.legend(loc='best') 26 | plt.show() 27 | -------------------------------------------------------------------------------- /DoubleQLearning/qlearning.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | class QLearningAgent(object): 5 | def __init__(self, env) -> None: 6 | super().__init__() 7 | self.e = 0.1 8 | self.alpha = 0.1 9 | self.gamma = 1 10 | self.num_episodes = 300 11 | 12 | self.env = env 13 | 14 | self.init_state = env.start_state 15 | self.q_sa = {x: 0 for x in env.state_transitions} 16 | 17 | def reset_policy(self): 18 | self.q_sa = {x: 0 for x in self.env.state_transitions} 19 | 20 | def soft_policy(self, state): 21 | possible_actions = self.env.possible_actions[state] 22 | if len(possible_actions) > 1: 23 | prob_explorative_action = self.e / len(possible_actions) 24 | prob_greedy_action = 1 - self.e + prob_explorative_action 25 | 26 | q_values = [] 27 | corresponding_actions = [] 28 | for a in possible_actions: 29 | if (state, a) in self.q_sa: 30 | q_values.append(self.q_sa[(state, a)]) 31 | corresponding_actions.append(a) 32 | 33 | # check if all q_values are the same, if so then take random action 34 | if all(x == q_values[0] for x in q_values): 35 | return random.choice(possible_actions) 36 | greedy_action = corresponding_actions[np.argmax(q_values)] 37 | non_greedy_actions = list(set(possible_actions) - set([greedy_action])) 38 | 39 | action = np.random.choice([greedy_action]+non_greedy_actions, 40 | p=[prob_greedy_action]+[prob_explorative_action for i in range(len(non_greedy_actions))]) 41 | return action 42 | return possible_actions[0] 43 | 44 | 45 | def generate_episode(self): 46 | episode = [] 47 | state = self.init_state 48 | while True: 49 | action = self.soft_policy(state) 50 | reward, next_state, done = self.env.step(state, action) 51 | episode.append((state, action, reward, next_state)) 52 | state = 
next_state 53 | if done: 54 | break 55 | return episode 56 | 57 | 58 | def update_policy(self): 59 | n_iters = 1000 60 | final_left_array = [] 61 | final_right_array = [] 62 | for _ in range(n_iters): 63 | self.reset_policy() 64 | left_actions_count = [] 65 | right_actions_count = [] 66 | 67 | for i in range(self.num_episodes): 68 | left = 0 69 | right = 0 70 | 71 | # generate episode using policy defined above 72 | episode = self.generate_episode() 73 | for s, a, r, s_p in episode: 74 | # use a soft policy as behavior policy e.g epsilon-greedy 75 | action = a 76 | if s == self.init_state and action == 'left': 77 | left += 1 78 | if s == self.init_state and action == 'right': 79 | right += 1 80 | possible_future_actions = self.env.possible_actions[s_p] 81 | 82 | q_values = [self.q_sa[(s_p, a_)] for a_ in possible_future_actions if (s_p, a_) in self.q_sa] 83 | 84 | max_q_value = max(q_values) if len(q_values) > 0 else 0 85 | 86 | # update q function using different target policy 87 | self.q_sa[(s, action)] = self.q_sa[(s, action)] + (self.alpha * ((r + self.gamma * max_q_value) - self.q_sa[(s, action)])) 88 | left_actions_count.append(left) 89 | right_actions_count.append(right) 90 | left_actions_count = np.array(left_actions_count) 91 | right_actions_count = np.array(right_actions_count) 92 | 93 | final_left_array.append(left_actions_count) 94 | final_right_array.append(right_actions_count) 95 | 96 | final_left_array = np.array(final_left_array) 97 | final_right_array = np.array(final_right_array) 98 | out = 100 * final_left_array.sum(axis=0) / (final_left_array.sum(axis=0) + final_right_array.sum(axis=0)) 99 | return out 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ReinforcementLearning 2 | This is the repository with the programming tutorials for the reinforcement learning module 3 | -------------------------------------------------------------------------------- /SimplePolicyIteration/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/jareth/anaconda3/envs/rllab3/bin/python", 3 | "python.linting.pylintArgs": ["----extension-pkg-whitelist=1xml"] 4 | } -------------------------------------------------------------------------------- /SimplePolicyIteration/agent.py: -------------------------------------------------------------------------------- 1 | class Agent: 2 | def __init__(self, name, pos): 3 | self.__name = name 4 | self.__pos = pos 5 | 6 | def get_name(self): 7 | return self.__name 8 | 9 | def get_pos(self): 10 | return self.__pos 11 | 12 | def set_pos(self, new_pos): 13 | self.__pos = new_pos 14 | -------------------------------------------------------------------------------- /SimplePolicyIteration/config.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | screen_size = 500 3 | cell_height = 100 4 | cell_width = 100 5 | velocity = 5 6 | delay = 1000 7 | -------------------------------------------------------------------------------- /SimplePolicyIteration/config.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/config.pyc -------------------------------------------------------------------------------- 
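Stepping back to the DoubleQLearning module above: QLearningAgent and DoubleQLearningAgent differ only in how the TD target is built inside update_policy. Distilled to a single transition, the two updates look like the sketch below (dict-keyed action-value tables as in the agents; the function names, and the assumption that both tables are initialised over the same keys, are mine rather than the repository's):

import random

def q_learning_update(q, s, a, r, s_next, next_actions, alpha=0.1, gamma=1.0):
    # the target bootstraps from a max over the very table being learned,
    # which is the source of maximization bias on noisy rewards
    max_next = max((q[(s_next, a_)] for a_ in next_actions if (s_next, a_) in q), default=0)
    q[(s, a)] += alpha * (r + gamma * max_next - q[(s, a)])

def double_q_learning_update(q1, q2, s, a, r, s_next, next_actions, alpha=0.1, gamma=1.0):
    if random.random() < 0.5:
        q1, q2 = q2, q1  # update the other table half of the time
    # choose the greedy next action with the table being updated,
    # but evaluate that action with the other table
    candidates = [(q1[(s_next, a_)], a_) for a_ in next_actions if (s_next, a_) in q1]
    max_next = q2[(s_next, max(candidates)[1])] if candidates else 0
    q1[(s, a)] += alpha * (r + gamma * max_next - q1[(s, a)])

On this two-state task (start state A, rewards from B drawn from N(-0.1, 1)), the max in plain Q-learning overestimates the value of B, so the agent initially chooses 'left' from A far more often than the 5% that main.py marks as optimal; decoupling action selection from action evaluation is what pulls the double Q-learning curve back down. The setup mirrors the maximization-bias example in Sutton and Barto (Example 6.7).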
/SimplePolicyIteration/controller.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | 3 | class Controller: 4 | keys = None 5 | 6 | def set_key_map(key_map): 7 | Controller.keys = key_map 8 | 9 | def left(): 10 | return Controller.keys[pygame.K_LEFT] 11 | 12 | def right(): 13 | return Controller.keys[pygame.K_RIGHT] 14 | 15 | def up(): 16 | return Controller.keys[pygame.K_UP] 17 | 18 | def down(): 19 | return Controller.keys[pygame.K_DOWN] 20 | -------------------------------------------------------------------------------- /SimplePolicyIteration/controller.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/controller.pyc -------------------------------------------------------------------------------- /SimplePolicyIteration/environment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Environment: 4 | # creating a grid with 5 x 5 cells 5 | def __init__(self, size=5): 6 | self.grid = np.empty(shape=(size,size), dtype=object) 7 | self.size = size 8 | 9 | def place_cell(self, x, y, cell): 10 | self.grid[x][y] = cell 11 | 12 | -------------------------------------------------------------------------------- /SimplePolicyIteration/environment.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/environment.pyc -------------------------------------------------------------------------------- /SimplePolicyIteration/game.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | from agent import Agent 3 | from config import Config 4 | from controller import Controller 5 | from game_widgets import CellType, CellState 6 | from environment import Environment 7 | from policy_iteration import PolicyIteration 8 | 9 | class Game: 10 | def __init__(self, config, controller, env, agent, policy=None): 11 | # initialize pygame module 12 | pygame.init() 13 | 14 | # setup config parameters for game window 15 | self.controller_ = controller 16 | self.config_ = config 17 | self.win_ = pygame.display.set_mode((config.screen_size, 18 | config.screen_size)) 19 | 20 | # set game env 21 | self.env_ = env 22 | 23 | # set agent 24 | self.agent = agent 25 | 26 | # set policy 27 | self.policy = policy 28 | 29 | pygame.display.set_caption("Jabrah") 30 | 31 | def draw_env(self): 32 | self.win_.fill((0,0,0)) 33 | # load images 34 | kfc_image = pygame.image.load("images/kfc_r.png") 35 | grass_image = pygame.image.load("images/grass_r.png") 36 | whoop_image = pygame.image.load("images/beat_r.png") 37 | 38 | # draw environments 39 | for i in range(self.env_.size): 40 | for j in range(self.env_.size): 41 | cell = self.env_.grid[i][j] 42 | 43 | (x, y) = cell.pos() 44 | width = self.config_.cell_width 45 | height = self.config_.cell_height 46 | 47 | x = x * width 48 | y = y * height 49 | 50 | if (i,j) == agent.get_pos(): 51 | agent_image = pygame.image.load("images/baby_r.png") 52 | self.win_.blit(agent_image, (x, y)) 53 | 54 | elif cell.cell_type() == CellType.KFC: 55 | self.win_.blit(kfc_image, (x, y)) 56 | 57 | elif cell.cell_type() == CellType.WHOOPING: 58 | self.win_.blit(whoop_image, (x, y)) 59 | 60 | else: 61 | 
self.win_.blit(grass_image, (x, y)) 62 | 63 | 64 | def start(self): 65 | run = True 66 | while (run): 67 | pygame.time.delay(self.config_.delay) 68 | for event in pygame.event.get(): 69 | if event.type == pygame.QUIT: 70 | run = False 71 | keys = pygame.key.get_pressed() 72 | 73 | self.controller_.set_key_map(keys) 74 | curr_pos = self.agent.get_pos() 75 | new_pos = curr_pos 76 | 77 | if self.policy is None: 78 | # if no policy, use controller 79 | if self.controller_.right(): 80 | new_pos = (min(curr_pos[0]+1, env.size-1), curr_pos[1]) 81 | 82 | elif self.controller_.left(): 83 | new_pos = (max(curr_pos[0]-1, 0), curr_pos[1]) 84 | 85 | elif self.controller_.down(): 86 | new_pos = (curr_pos[0], min(curr_pos[1]+1, env.size-1)) 87 | 88 | elif self.controller_.up(): 89 | new_pos = (curr_pos[0], max(curr_pos[1]-1, 0)) 90 | else: 91 | if self.policy[curr_pos] == 1: 92 | new_pos = (min(curr_pos[0]+1, env.size-1), curr_pos[1]) 93 | 94 | elif self.policy[curr_pos] == 3: 95 | new_pos = (max(curr_pos[0]-1, 0), curr_pos[1]) 96 | 97 | elif self.policy[curr_pos] == 2: 98 | new_pos = (curr_pos[0], min(curr_pos[1]+1, env.size-1)) 99 | 100 | elif self.policy[curr_pos] == 0: 101 | new_pos = (curr_pos[0], max(curr_pos[1]-1, 0)) 102 | 103 | self.agent.set_pos(new_pos) 104 | self.draw_env() 105 | pygame.display.update() 106 | 107 | pygame.quit() 108 | 109 | def create_game_env(): 110 | env = Environment(size=5) 111 | states = [] 112 | policy = {} 113 | 114 | # the elements in the matrix represent the cell type 115 | cell_matrix = [[1, 1, 1, 1, 1], 116 | [1, 1, 1, 1, 2], 117 | [2, 1, 1, 1, 1], 118 | [2, 2, 1, 1, 1], 119 | [1, 2, 1, 1, 3]] 120 | 121 | size = len(cell_matrix) 122 | env = Environment(size=size) 123 | 124 | reward = 0 125 | is_terminal = False 126 | for i in range(size): 127 | for j in range(size): 128 | cell_type = cell_matrix[j][i] 129 | 130 | if cell_type == CellType.WHOOPING: 131 | is_terminal = True 132 | reward = -10 133 | elif cell_type == CellType.KFC: 134 | is_terminal = True 135 | reward = 10 136 | else: 137 | is_terminal = False 138 | reward = -1 139 | cell = CellState((i,j), reward, cell_type, is_terminal) 140 | env.place_cell(i, j, cell) 141 | states.append(cell) 142 | return env, states 143 | 144 | 145 | env, states = create_game_env() 146 | agent = Agent("policy_eval", (0, 0)) 147 | 148 | policy_iter_algo = PolicyIteration(states) 149 | policy = policy_iter_algo.run() 150 | 151 | game = Game(Config, Controller, env, agent, policy) 152 | # initiate env 153 | game.draw_env() 154 | pygame.display.update() 155 | game.start() 156 | -------------------------------------------------------------------------------- /SimplePolicyIteration/game_widgets.py: -------------------------------------------------------------------------------- 1 | class CellType: 2 | BLANK = 1 # no reward 3 | WHOOPING = 2 # reward = -10 4 | KFC = 3 # reward = 10 5 | 6 | class CellState: 7 | # initialize a cell for grid 8 | def __init__(self, pos, reward, cell_type, is_terminal): 9 | self.__pos = pos 10 | self.__reward = reward 11 | self.__cell_type = cell_type 12 | self.__is_terminal = is_terminal 13 | self.children = [] 14 | 15 | def append_child(self, cell): 16 | self.children.append(cell) 17 | 18 | def pos(self): 19 | return self.__pos 20 | 21 | def reward(self): 22 | return self.__reward 23 | 24 | def cell_type(self): 25 | return self.__cell_type 26 | 27 | def is_terminal(self): 28 | return self.__is_terminal -------------------------------------------------------------------------------- 
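Both game.py above and policy_iteration.py below move the agent over the same 5 x 5 grid with four deterministic actions that are clipped at the border: 0 moves up, 1 right, 2 down and 3 left, exactly as in the policy branch of Game.start and in get_future_state. create_game_env gives every non-terminal cell a step reward of -1 and marks WHOOPING (-10) and KFC (+10) cells as terminal. A compact restatement of that transition rule (the helper name is illustrative, not from the repository):

def next_pos(pos, action, size=5):
    # 0 = up, 1 = right, 2 = down, 3 = left; y grows downwards because the
    # grid is drawn in pygame screen coordinates
    x, y = pos
    if action == 1:
        x = min(x + 1, size - 1)
    elif action == 3:
        x = max(x - 1, 0)
    elif action == 2:
        y = min(y + 1, size - 1)
    elif action == 0:
        y = max(y - 1, 0)
    return (x, y)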
/SimplePolicyIteration/game_widgets.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/game_widgets.pyc -------------------------------------------------------------------------------- /SimplePolicyIteration/images/baby_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/images/baby_r.png -------------------------------------------------------------------------------- /SimplePolicyIteration/images/beat_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/images/beat_r.png -------------------------------------------------------------------------------- /SimplePolicyIteration/images/grass_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/images/grass_r.png -------------------------------------------------------------------------------- /SimplePolicyIteration/images/kfc_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/SimplePolicyIteration/images/kfc_r.png -------------------------------------------------------------------------------- /SimplePolicyIteration/policy_iteration.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | class PolicyIteration(object): 4 | def __init__(self, cell_states): 5 | # 0 - up, 1 - right, 2 - down, 3 - up 6 | self.epsilon = 3 7 | self.actions = [0, 1, 2, 3] 8 | self.state_dict, self.policy_dict , self.value_dict = \ 9 | self.create_state_policy_dict(cell_states) 10 | 11 | def create_state_policy_dict(self, cell_states): 12 | state_dict = {} 13 | policy_dict = {} 14 | value_dict = {} 15 | for cell_state in cell_states: 16 | state_dict[cell_state.pos()] = cell_state 17 | policy_dict[cell_state.pos()] = random.choice(self.actions) 18 | if cell_state.is_terminal(): 19 | value_dict[cell_state.pos()] = cell_state.reward() 20 | else: 21 | value_dict[cell_state.pos()] =0 22 | return state_dict, policy_dict, value_dict 23 | 24 | def get_future_state(self, curr_pos, action): 25 | new_pos = curr_pos 26 | env_size = 5 27 | 28 | if action == 1: 29 | new_pos = (min(curr_pos[0]+1, env_size-1), curr_pos[1]) 30 | elif action == 3: 31 | new_pos = (max(curr_pos[0]-1, 0), curr_pos[1]) 32 | elif action == 2: 33 | new_pos = (curr_pos[0], min(curr_pos[1]+1, env_size-1)) 34 | elif action == 0: 35 | new_pos = (curr_pos[0], max(curr_pos[1]-1, 0)) 36 | return new_pos 37 | 38 | def q_value(self, state, action): 39 | state_ = self.get_future_state(state, action) 40 | q = self.state_dict[state_].reward() + \ 41 | self.value_dict[state_] 42 | return q 43 | 44 | def policy_evaluation(self): 45 | print("policy evaluation...") 46 | while True: 47 | delta = 0 48 | for state in self.state_dict: 49 | v = self.value_dict[state] 50 | if self.state_dict[state].is_terminal(): 51 | continue 52 | # next state s' computation 53 | 
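# Deterministic, undiscounted backup for the current policy:
#   V(s) <- R(s') + V(s')   with   s' = get_future_state(s, pi(s))
# q_value() returns exactly that sum, terminal cells keep the fixed value they
# were given in create_state_policy_dict, and the sweep repeats until the
# largest per-state change (delta) drops below self.epsilon.
# policy_improvement() then greedifies against these values, and run()
# alternates the two steps until the policy stops changing.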
self.value_dict[state] = self.q_value(state, self.policy_dict[state]) 54 | delta = max(delta, abs(v - self.value_dict[state])) 55 | if delta < self.epsilon: 56 | break 57 | 58 | def policy_improvement(self): 59 | print("policy improvement...") 60 | policy_stable = True 61 | for state in self.state_dict: 62 | old_action = self.policy_dict[state] 63 | old_action_value = self.q_value(state, old_action) 64 | 65 | best_action = old_action 66 | best_action_value = old_action_value 67 | 68 | for action in self.actions: 69 | action_value = self.q_value(state, action) 70 | if action_value > old_action_value: 71 | best_action = action 72 | best_action_value = action_value 73 | policy_stable = False 74 | self.policy_dict[state] = best_action 75 | return policy_stable 76 | 77 | def run(self): 78 | while True: 79 | self.policy_evaluation() 80 | policy_stable = self.policy_improvement() 81 | if policy_stable: 82 | return self.policy_dict 83 | -------------------------------------------------------------------------------- /mountain_car/README.md: -------------------------------------------------------------------------------- 1 | # Mountain Car 2 | 3 | Solved with Semi Gradient Sarsa & Tile Coding 4 | -------------------------------------------------------------------------------- /mountain_car/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/mountain_car/__init__.py -------------------------------------------------------------------------------- /mountain_car/main.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import numpy as np 4 | import gymnasium as gym 5 | 6 | random.seed(10) 7 | 8 | from semi_gradient_sarsa import SemiGradientSarsa 9 | 10 | env_ = gym.make("MountainCar-v0", render_mode=None) 11 | 12 | sarsa = SemiGradientSarsa(env_) 13 | sarsa.load_params() 14 | 15 | env = gym.make("MountainCar-v0", render_mode='human') 16 | state, info = env.reset() 17 | 18 | for i in range(200): 19 | action = sarsa.select_action(state, eps_greedy=False) 20 | next_state, reward, terminated, truncated, info = env.step(action) 21 | 22 | # Render the env 23 | env.render() 24 | 25 | # Wait a bit before the next frame unless you want to see a crazy fast video 26 | time.sleep(0.001) 27 | 28 | state = next_state 29 | 30 | env.close() 31 | 32 | -------------------------------------------------------------------------------- /mountain_car/semi_gradient_sarsa.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import random 3 | import numpy as np 4 | import gymnasium as gym 5 | 6 | random.seed(18) 7 | 8 | from tile_coding import * 9 | 10 | 11 | class SemiGradientSarsa(object): 12 | def __init__( 13 | self, 14 | env, 15 | alpha=0.01, 16 | eps=0.1, 17 | gamma=1, 18 | n_tilings = 7, 19 | num_eps=100) -> None: 20 | self.alpha = alpha 21 | self.eps = eps 22 | self.gamma = gamma 23 | self.num_eps = num_eps 24 | self.n_tilings = n_tilings 25 | self.env = env 26 | 27 | # define weight vector 28 | self.w = np.random.uniform(low=-0.05, high=0.05, size=(n_tilings**4,)) 29 | 30 | # hash for tile coding 31 | self.tile_coding = IHT(n_tilings**4) 32 | 33 | def q_func(self, feature_vector): 34 | return np.dot(self.w, feature_vector) 35 | 36 | def update_weight(self, reward, current_q, future_q, feature_vector, terminal): 37 | if terminal: 38 | w_update = 
self.alpha * (reward - current_q) 39 | else: 40 | w_update = self.alpha * (reward + self.gamma *future_q - current_q) 41 | self.w += np.multiply(w_update, feature_vector) 42 | 43 | 44 | def train(self): 45 | state, info = self.env.reset() 46 | action, q = self.select_action(state) 47 | episodes = 0 48 | steps = 0 49 | 50 | total_reward = 0 51 | 52 | while episodes < self.num_eps: 53 | steps += 1 54 | 55 | feature_vec = self.hash_feature_vector(state, action) 56 | next_state, reward, terminated, truncated, info = self.env.step(action) 57 | total_reward += reward 58 | 59 | if terminated: 60 | 61 | if episodes % 50 == 0: 62 | print("episode:", episodes, 'completed', 'reward:', total_reward) 63 | 64 | self.update_weight(reward, q, None, feature_vec, True) 65 | state, info = self.env.reset() 66 | action, q = self.select_action(state) 67 | total_reward = 0 68 | steps = 0 69 | episodes += 1 70 | 71 | continue 72 | 73 | next_action, next_q = self.select_action(next_state) 74 | self.update_weight(reward, q, next_q, feature_vec, False) 75 | state = next_state 76 | action = next_action 77 | q = next_q 78 | 79 | self.save_params() 80 | 81 | def save_params(self): 82 | print(self.w) 83 | pickle.dump(self.w, open('weights.pkl', 'wb')) 84 | pickle.dump(self.tile_coding, open('tilings.pkl', 'wb')) 85 | 86 | def load_params(self): 87 | self.w = pickle.load(open('weights.pkl', 'rb')) 88 | self.tile_coding = pickle.load(open('tilings.pkl', 'rb')) 89 | 90 | def one_hot_encode(self, indices): 91 | size = len(self.w) 92 | one_hot_vec = np.zeros(size) 93 | for i in indices: 94 | one_hot_vec[i] = 1 95 | return one_hot_vec 96 | 97 | def hash_feature_vector(self, state, action): 98 | # speed you up 99 | feature_ind = np.array(tiles(self.tile_coding, self.n_tilings, state.tolist(), [action])) 100 | feature_vec = self.one_hot_encode(feature_ind) 101 | return feature_vec 102 | 103 | def select_action(self, state, eps_greedy = True): 104 | num_actions = self.env.action_space.n 105 | actions = range(num_actions) 106 | action_val_dict = {} 107 | for action in actions: 108 | feature_vector = self.hash_feature_vector(state, action) 109 | q_val = self.q_func(np.array(feature_vector)) 110 | 111 | action_val_dict[action] = q_val 112 | 113 | greedy_action = max(action_val_dict, key=action_val_dict.get) 114 | 115 | if not eps_greedy: 116 | return greedy_action 117 | 118 | non_greedy_actions = list(set(range(num_actions)) - {greedy_action}) 119 | 120 | prob_explorative_action = self.eps / num_actions 121 | prob_greedy_action = 1 - self.eps + prob_explorative_action 122 | 123 | action = np.random.choice([greedy_action] + non_greedy_actions, 124 | p=[prob_greedy_action]+[prob_explorative_action for _ in range(len(non_greedy_actions))]) 125 | return action, action_val_dict[action] 126 | 127 | -------------------------------------------------------------------------------- /mountain_car/tile_coding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tile Coding Software version 3.0beta 3 | by Rich Sutton 4 | based on a program created by Steph Schaeffer and others 5 | External documentation and recommendations on the use of this code is available in the 6 | reinforcement learning textbook by Sutton and Barto, and on the web. 7 | These need to be understood before this code is. 8 | 9 | This software is for Python 3 or more. 
10 | 11 | This is an implementation of grid-style tile codings, based originally on 12 | the UNH CMAC code (see http://www.ece.unh.edu/robots/cmac.htm), but by now highly changed. 13 | Here we provide a function, "tiles", that maps floating and integer 14 | variables to a list of tiles, and a second function "tiles-wrap" that does the same while 15 | wrapping some floats to provided widths (the lower wrap value is always 0). 16 | 17 | The float variables will be gridded at unit intervals, so generalization 18 | will be by approximately 1 in each direction, and any scaling will have 19 | to be done externally before calling tiles. 20 | 21 | Num-tilings should be a power of 2, e.g., 16. To make the offsetting work properly, it should 22 | also be greater than or equal to four times the number of floats. 23 | 24 | The first argument is either an index hash table of a given size (created by (make-iht size)), 25 | an integer "size" (range of the indices from 0), or nil (for testing, indicating that the tile 26 | coordinates are to be returned without being converted to indices). 27 | 28 | This code can be found here http://incompleteideas.net/tiles/tiles3.py-remove 29 | """ 30 | 31 | basehash = hash 32 | 33 | class IHT: 34 | "Structure to handle collisions" 35 | def __init__(self, sizeval): 36 | self.size = sizeval 37 | self.overfullCount = 0 38 | self.dictionary = {} 39 | 40 | def __str__(self): 41 | "Prepares a string for printing whenever this object is printed" 42 | return "Collision table:" + \ 43 | " size:" + str(self.size) + \ 44 | " overfullCount:" + str(self.overfullCount) + \ 45 | " dictionary:" + str(len(self.dictionary)) + " items" 46 | 47 | def count (self): 48 | return len(self.dictionary) 49 | 50 | def fullp (self): 51 | return len(self.dictionary) >= self.size 52 | 53 | def getindex (self, obj, readonly=False): 54 | d = self.dictionary 55 | if obj in d: return d[obj] 56 | elif readonly: return None 57 | size = self.size 58 | count = self.count() 59 | if count >= size: 60 | if self.overfullCount==0: print('IHT full, starting to allow collisions') 61 | self.overfullCount += 1 62 | return basehash(obj) % self.size 63 | else: 64 | d[obj] = count 65 | return count 66 | 67 | def hashcoords(coordinates, m, readonly=False): 68 | if type(m)==IHT: return m.getindex(tuple(coordinates), readonly) 69 | if type(m)==int: return basehash(tuple(coordinates)) % m 70 | if m==None: return coordinates 71 | 72 | from math import floor, log 73 | from itertools import zip_longest 74 | 75 | def tiles (ihtORsize, numtilings, floats, ints=[], readonly=False): 76 | """returns num-tilings tile indices corresponding to the floats and ints""" 77 | qfloats = [floor(f*numtilings) for f in floats] 78 | Tiles = [] 79 | for tiling in range(numtilings): 80 | tilingX2 = tiling*2 81 | coords = [tiling] 82 | b = tiling 83 | for q in qfloats: 84 | coords.append( (q + b) // numtilings ) 85 | b += tilingX2 86 | coords.extend(ints) 87 | Tiles.append(hashcoords(coords, ihtORsize, readonly)) 88 | return Tiles 89 | 90 | def tileswrap (ihtORsize, numtilings, floats, wrapwidths, ints=[], readonly=False): 91 | """returns num-tilings tile indices corresponding to the floats and ints, wrapping some floats""" 92 | qfloats = [floor(f*numtilings) for f in floats] 93 | Tiles = [] 94 | for tiling in range(numtilings): 95 | tilingX2 = tiling*2 96 | coords = [tiling] 97 | b = tiling 98 | for q, width in zip_longest(qfloats, wrapwidths): 99 | c = (q + b%numtilings) // numtilings 100 | coords.append(c%width if width else c) 101 | b += 
tilingX2 102 | coords.extend(ints) 103 | Tiles.append(hashcoords(coords, ihtORsize, readonly)) 104 | return Tiles 105 | -------------------------------------------------------------------------------- /mountain_car/tilings.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/mountain_car/tilings.pkl -------------------------------------------------------------------------------- /mountain_car/train.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | from semi_gradient_sarsa import SemiGradientSarsa 3 | 4 | # add `.env` at the end to ignore internal truncation 5 | env = gym.make("MountainCar-v0", render_mode=None) 6 | 7 | sarsa = SemiGradientSarsa(env, num_eps=500, alpha=0.01) 8 | sarsa.train() -------------------------------------------------------------------------------- /mountain_car/weights.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/mountain_car/weights.pkl -------------------------------------------------------------------------------- /self_driving_agent/DQN_Control/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/self_driving_agent/DQN_Control/__init__.py -------------------------------------------------------------------------------- /self_driving_agent/DQN_Control/model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | class ConvNet(nn.Module): 8 | def __init__(self, dim, in_channels, num_actions) -> None: 9 | super(ConvNet, self).__init__() 10 | # conv2d(in_channels, out_channels, kernel_size, stride) 11 | 12 | self.conv1 = nn.Conv2d(in_channels, 32, 8, 4) 13 | self.conv1_bn = nn.BatchNorm2d(32) 14 | self.conv2 = nn.Conv2d(32, 64, 4, 3) 15 | self.conv2_bn = nn.BatchNorm2d(64) 16 | self.conv3 = nn.Conv2d(64, 64, 3, 1) 17 | self.conv3_bn = nn.BatchNorm2d(64) 18 | 19 | self.fc1 = nn.Linear(64*8*8, 256) 20 | self.fc1_bn = nn.BatchNorm1d(256) 21 | self.fc2 = nn.Linear(256, 32) 22 | self.fc2_bn = nn.BatchNorm1d(32) 23 | self.fc3 = nn.Linear(32, num_actions) 24 | 25 | def forward(self, x): 26 | x = F.relu(self.conv1_bn(self.conv1(x))) 27 | x = F.relu(self.conv2_bn(self.conv2(x))) 28 | x = F.relu(self.conv3_bn(self.conv3(x))) 29 | x = F.relu(self.fc1_bn(self.fc1(x.reshape(-1, 64*8*8)))) 30 | x = F.relu(self.fc2_bn(self.fc2(x))) 31 | x = self.fc3(x) 32 | return x 33 | 34 | class DQN(object): 35 | def __init__( 36 | self, 37 | num_actions, 38 | state_dim, #? 
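# shape of the processed input frame; train.py and main.py in this repo pass state_dim=(128, 128)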
39 | in_channels, 40 | device, 41 | discount=0.9, 42 | optimizer="Adam", 43 | optimizer_parameters={'lr':0.01}, 44 | target_update_frequency=1e4, 45 | initial_eps = 1, 46 | end_eps = 0.05, 47 | eps_decay_period = 25e4, 48 | eval_eps=0.001 49 | ) -> None: 50 | self.device = device 51 | 52 | self.Q = ConvNet(state_dim, in_channels, num_actions).to(self.device) 53 | self.Q_target = copy.deepcopy(self.Q) # copy target network 54 | self.Q_optimizer = getattr(torch.optim, optimizer)(self.Q.parameters(), 55 | **optimizer_parameters) 56 | 57 | self.discount = discount 58 | 59 | self.target_update_frequency = target_update_frequency 60 | 61 | # epsilon decay 62 | self.initial_eps = initial_eps 63 | self.end_eps = end_eps 64 | self.slope = (self.end_eps - self.initial_eps) / eps_decay_period 65 | 66 | self.state_shape = (-1,) + state_dim 67 | self.eval_eps = eval_eps 68 | self.num_actions = num_actions 69 | 70 | self.iterations = 0 71 | 72 | def select_action(self, state, eval=False): 73 | eps = self.eval_eps if eval \ 74 | else max(self.slope * self.iterations + self.initial_eps, self.end_eps) 75 | self.current_eps = eps 76 | 77 | # Select action according to policy with probability (1-eps) 78 | # otherwise, select random action 79 | if np.random.uniform(0,1) > eps: 80 | self.Q.eval() 81 | with torch.no_grad(): 82 | # without batch norm, remove the unsqueeze 83 | state = torch.FloatTensor(state).reshape(self.state_shape).unsqueeze(0).to(self.device) 84 | return int(self.Q(state).argmax(1)) 85 | else: 86 | return np.random.randint(self.num_actions) 87 | 88 | def train(self, replay_buffer): 89 | self.Q.train() 90 | # Sample mininbatch from replay buffer 91 | state, action, next_state, reward, done = replay_buffer.sample() 92 | 93 | # Compute the target Q value 94 | with torch.no_grad(): 95 | target_Q = reward + (1-done) * self.discount * self.Q_target(next_state).max(1, keepdim=True)[0] 96 | 97 | # Get current Q estimate 98 | # torch gather just selects action values from Q(state) using the action tensor as an index 99 | current_Q = self.Q(state).gather(1, action) 100 | 101 | # Compute Q loss 102 | Q_loss = F.smooth_l1_loss(current_Q, target_Q) 103 | 104 | # Optimize the Q 105 | self.Q_optimizer.zero_grad() 106 | Q_loss.backward() 107 | self.Q_optimizer.step() 108 | 109 | # Update target network by full copy every X iterations. 
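# (X = target_update_frequency; copy_target_update() below syncs Q_target with Q once the iteration counter hits a multiple of it)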
110 | self.iterations += 1 111 | self.copy_target_update() 112 | 113 | def copy_target_update(self): 114 | if self.iterations % self.target_update_frequency == 0: 115 | print('target network updated') 116 | print('current epsilon', self.current_eps) 117 | self.Q_target.load_state_dict(self.Q.state_dict()) 118 | 119 | 120 | def save(self, filename): 121 | torch.save(self.Q.state_dict(), filename + "_Q") 122 | torch.save(self.Q_optimizer.state_dict(), filename + "_optimizer") 123 | 124 | 125 | def load(self, filename): 126 | self.Q.load_state_dict(torch.load(filename + "_Q")) 127 | self.Q_target = copy.deepcopy(self.Q) 128 | self.Q_optimizer.load_state_dict(torch.load(filename + "_optimizer")) 129 | -------------------------------------------------------------------------------- /self_driving_agent/DQN_Control/process_img.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | img = cv2.imread('../output/000970.png') 4 | print(img.shape) 5 | 6 | scale_percent = 25 7 | width = int(img.shape[1] * scale_percent/100) 8 | height = int(img.shape[0] * scale_percent/100) 9 | 10 | dim = (128, 128) 11 | 12 | resized_img = cv2.resize(img, dim, interpolation=cv2.INTER_AREA) 13 | img_gray = cv2.cvtColor(resized_img, cv2.COLOR_BGR2GRAY) 14 | print(img_gray.shape) 15 | cv2.imshow('', img_gray) 16 | cv2.waitKey(5000) 17 | cv2.destroyAllWindows() 18 | -------------------------------------------------------------------------------- /self_driving_agent/DQN_Control/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | import numpy as np 4 | from torchvision import transforms 5 | 6 | class ReplayBuffer(object): 7 | def __init__(self, state_dim, batch_size, buffer_size, device) -> None: 8 | self.batch_size = batch_size 9 | self.max_size = int(buffer_size) 10 | self.device = device 11 | 12 | self.ptr = 0 13 | self.crt_size = 0 14 | 15 | self.state = np.zeros((self.max_size,) + state_dim) 16 | self.action = np.zeros((self.max_size, 1)) 17 | self.next_state = np.array(self.state) 18 | self.reward = np.zeros((self.max_size, 1)) 19 | self.done = np.zeros((self.max_size, 1)) 20 | 21 | def add(self, state, action, next_state, reward, done): 22 | self.state[self.ptr] = state 23 | self.action[self.ptr] = action 24 | self.next_state[self.ptr] = next_state 25 | self.reward[self.ptr] = reward 26 | self.done[self.ptr] = done 27 | 28 | self.ptr = (self.ptr + 1) % self.max_size 29 | self.crt_size = min(self.crt_size + 1, self.max_size) 30 | 31 | def sample(self): 32 | ind = np.random.randint(0, self.crt_size, size=self.batch_size) 33 | return ( 34 | torch.FloatTensor(self.state[ind]).unsqueeze(1).to(self.device), 35 | torch.LongTensor(self.action[ind]).to(self.device), 36 | torch.FloatTensor(self.next_state[ind]).unsqueeze(1).to(self.device), 37 | torch.FloatTensor(self.reward[ind]).to(self.device), 38 | torch.FloatTensor(self.done[ind]).to(self.device) 39 | ) 40 | 41 | def test_buffer(): 42 | img0 = np.zeros((5, 5)) 43 | img1 = img0 + 1 44 | img2 = img0 + 2 45 | img3 = img0 + 3 46 | 47 | action = 1 48 | reward = 10 49 | done = 0 50 | 51 | device = "cpu" 52 | 53 | buffer = ReplayBuffer((5, 5), 2, 10, device) 54 | buffer.add(img0, action, img1, reward, done) 55 | buffer.add(img1, action, img2, reward, done) 56 | buffer.add(img2, action, img3, reward, done + 1) 57 | 58 | sample = buffer.sample()[0] 59 | print(sample.shape) 60 | 61 | norm = transforms.Normalize((0.5, 0.5), (0.5, 0.5)) 62 | 
print(norm(sample).shape) 63 | 64 | 65 | 66 | # test_buffer() 67 | -------------------------------------------------------------------------------- /self_driving_agent/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | This is the code accompanying the lecture on self-driving with CARLA, which can be found here: https://www.youtube.com/watch?v=MNiqlHC6Kn4&t 3 | You should only run this after installing CARLA and getting a grasp of how to run the simulator and call the Python API, which is documented very well on their site. 4 | 5 | # How to Run 6 | 7 | ## Setup 8 | I did not create a dependency or yml file (will do so at a later time), but you need carla, pygame, pytorch, opencv and numpy to run this project. 9 | 10 | You should ensure that you have a `weights` folder when you run the project. If you do not have one, just run `initial_setup.py` and it will create it for you. If you just cloned the repository, I recommend you run this file first. 11 | 12 | ## main.py 13 | Run this file if you want to evaluate the performance of your agent. 14 | ``` 15 | env = SimEnv(visuals=False) 16 | ``` 17 | 18 | The call above initializes our simulation environment. You should set visuals to `False` if you do not want to open this with pygame, or to `True` if you want a pygame window to open along with the simulator. 19 | 20 | ``` 21 | model.load('weights/model_ep_4400') 22 | ``` 23 | 24 | This loads a trained/pre-trained model. The program will not run unless it can load this model. 25 | The 4400 indicates that this model was trained for 4400 episodes. 26 | For example, if you train your own model for 200 episodes you will see the following files in the weights folder: 27 | 28 | `model_ep_200_optimizer` and `model_ep_200_Q` 29 | 30 | You can then load the model as `model.load('weights/model_ep_200')`. Please note, however, that this is likely to be a very bad model; it will only learn to drive effectively after many more episodes of training. 31 | 32 | ## train.py 33 | This is for training the model. The model only starts learning after a certain number of episodes, and it can take 8-10 hours (at least on my setup) before we see signs of learning. The following variables configure the training process; you can modify them yourself in `config.py`. 
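For reference, these are the values currently set in `config.py` (each variable is described below):
```
env_params = {
    'target_speed': 30,
    'max_iter': 4000,
    'start_buffer': 10,
    'train_freq': 1,
    'save_freq': 200,
    'start_ep': 0,
    'max_dist_from_waypoint': 20
}
```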
34 | 35 | `target_speed` --> Speed you want the car to move at, in km/h 36 | 37 | `max_iter` --> Maximum number of steps before starting a new episode 38 | 39 | `start_buffer` --> Number of episodes to run before starting training 40 | 41 | `train_freq` --> How often to train (set to 1 to train every step, 2 to train every 2 steps, etc.) 42 | 43 | `save_freq` --> How often (in episodes) to save the model 44 | 45 | `start_ep` --> Which episode to start on (just a counter you can update if, for example, the program crashes while training) 46 | 47 | `max_dist_from_waypoint` --> Maximum distance from the waypoint/road before we decide to terminate the episode 48 | -------------------------------------------------------------------------------- /self_driving_agent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JabrahTutorials/ReinforcementLearning/601d1bbf5e3118a8524a3ea8c9a77b3cc86dea18/self_driving_agent/__init__.py -------------------------------------------------------------------------------- /self_driving_agent/config.py: -------------------------------------------------------------------------------- 1 | # dqn action values 2 | action_values = [-0.75, -0.5, -0.25, -0.15, -0.1, -0.05, 0, 3 | 0.05, 0.1, 0.15, 0.25, 0.5, 0.75] 4 | action_map = {i:x for i, x in enumerate(action_values)} 5 | 6 | env_params = { 7 | 'target_speed': 30, 8 | 'max_iter': 4000, 9 | 'start_buffer': 10, 10 | 'train_freq': 1, 11 | 'save_freq': 200, 12 | 'start_ep': 0, 13 | 'max_dist_from_waypoint': 20 14 | } 15 | -------------------------------------------------------------------------------- /self_driving_agent/controllers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | 5 | try: 6 | sys.path.append(glob.glob('../carla/dist/carla-*%d.%d-%s.egg' % ( 7 | sys.version_info.major, 8 | sys.version_info.minor, 9 | 'win-amd64' if os.name == 'nt' else 'linux-x86_64'))[0]) 10 | except IndexError: 11 | pass 12 | 13 | import carla 14 | import numpy as np 15 | from collections import deque 16 | from utils import get_speed 17 | 18 | class PIDLongitudinalController(): 19 | """ 20 | PIDLongitudinalController implements longitudinal control using a PID. 21 | """ 22 | 23 | def __init__(self, vehicle, max_throttle=0.75, max_brake=0.3, K_P=1.0, K_I=0.0, K_D=0.0, dt=0.03): 24 | """ 25 | Constructor method. 26 | :param vehicle: actor to apply the local planner logic onto 27 | :param K_P: Proportional term 28 | :param K_D: Differential term 29 | :param K_I: Integral term 30 | :param dt: time differential in seconds 31 | """ 32 | self._vehicle = vehicle 33 | self.max_throttle = max_throttle 34 | self.max_brake = max_brake 35 | self._k_p = K_P 36 | self._k_i = K_I 37 | self._k_d = K_D 38 | self._dt = dt 39 | self._error_buffer = deque(maxlen=10) 40 | 41 | def run_step(self, target_speed, debug=False): 42 | """ 43 | Execute one step of longitudinal control to reach a given target speed. 
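The PID output lies in [-1, 1]; positive values are applied as throttle and negative values as brake.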
44 | :param target_speed: target speed in Km/h 45 | :param debug: boolean for debugging 46 | :return: throttle control 47 | """ 48 | current_speed = get_speed(self._vehicle) 49 | 50 | if debug: 51 | print('Current speed = {}'.format(current_speed)) 52 | 53 | acceleration = self._pid_control(target_speed, current_speed) 54 | control = carla.VehicleControl() 55 | if acceleration >= 0.0: 56 | control.throttle = min(acceleration, self.max_throttle) 57 | control.brake = 0.0 58 | else: 59 | control.throttle = 0.0 60 | control.brake = min(abs(acceleration), self.max_brake) 61 | return control 62 | 63 | def _pid_control(self, target_speed, current_speed): 64 | """ 65 | Estimate the throttle/brake of the vehicle based on the PID equations 66 | :param target_speed: target speed in Km/h 67 | :param current_speed: current speed of the vehicle in Km/h 68 | :return: throttle/brake control 69 | """ 70 | 71 | error = target_speed - current_speed 72 | self._error_buffer.append(error) 73 | 74 | if len(self._error_buffer) >= 2: 75 | _de = (self._error_buffer[-1] - self._error_buffer[-2]) / self._dt 76 | _ie = sum(self._error_buffer) * self._dt 77 | else: 78 | _de = 0.0 79 | _ie = 0.0 80 | 81 | return np.clip((self._k_p * error) + (self._k_d * _de) + (self._k_i * _ie), -1.0, 1.0) 82 | 83 | def change_parameters(self, K_P, K_I, K_D, dt): 84 | """Changes the PID parameters""" 85 | self._k_p = K_P 86 | self._k_i = K_I 87 | self._k_d = K_D 88 | self._dt = dt 89 | 90 | 91 | -------------------------------------------------------------------------------- /self_driving_agent/environment.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import sys 4 | import numpy as np 5 | 6 | try: 7 | sys.path.append(glob.glob('../carla/dist/carla-*%d.%d-%s.egg' % ( 8 | sys.version_info.major, 9 | sys.version_info.minor, 10 | 'win-amd64' if os.name == 'nt' else 'linux-x86_64'))[0]) 11 | except IndexError: 12 | pass 13 | 14 | import carla 15 | import random 16 | import pickle 17 | 18 | from synch_mode import CarlaSyncMode 19 | from controllers import PIDLongitudinalController 20 | from utils import * 21 | 22 | random.seed(78) 23 | 24 | class SimEnv(object): 25 | def __init__(self, 26 | visuals=True, 27 | target_speed = 30, 28 | max_iter = 4000, 29 | start_buffer = 10, 30 | train_freq = 1, 31 | save_freq = 200, 32 | start_ep = 0, 33 | max_dist_from_waypoint = 20 34 | ) -> None: 35 | self.visuals = visuals 36 | if self.visuals: 37 | self._initiate_visuals() 38 | 39 | self.client = carla.Client('localhost', 2000) 40 | self.client.set_timeout(10.0) 41 | 42 | self.world = self.client.load_world('Town02_Opt') 43 | self.world.unload_map_layer(carla.MapLayer.Decals) 44 | self.world.unload_map_layer(carla.MapLayer.Foliage) 45 | self.world.unload_map_layer(carla.MapLayer.ParkedVehicles) 46 | self.world.unload_map_layer(carla.MapLayer.Particles) 47 | self.world.unload_map_layer(carla.MapLayer.Props) 48 | self.world.unload_map_layer(carla.MapLayer.StreetLights) 49 | 50 | 51 | self.spawn_points = self.world.get_map().get_spawn_points() 52 | 53 | self.blueprint_library = self.world.get_blueprint_library() 54 | self.vehicle_blueprint = self.blueprint_library.find('vehicle.nissan.patrol') 55 | 56 | # input these later on as arguments 57 | self.global_t = 0 # global timestep 58 | self.target_speed = target_speed # km/h 59 | self.max_iter = max_iter 60 | self.start_buffer = start_buffer 61 | self.train_freq = train_freq 62 | self.save_freq = save_freq 63 | self.start_ep = start_ep 
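# training in generate_episode() only starts once ep > start_ep + start_buffer (stored as start_train below)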
64 | 65 | self.max_dist_from_waypoint = max_dist_from_waypoint 66 | self.start_train = self.start_ep + self.start_buffer 67 | 68 | self.total_rewards = 0 69 | self.average_rewards_list = [] 70 | 71 | def _initiate_visuals(self): 72 | pygame.init() 73 | 74 | self.display = pygame.display.set_mode( 75 | (800, 600), 76 | pygame.HWSURFACE | pygame.DOUBLEBUF) 77 | self.font = get_font() 78 | self.clock = pygame.time.Clock() 79 | 80 | def create_actors(self): 81 | self.actor_list = [] 82 | # spawn vehicle at random location 83 | self.vehicle = self.world.spawn_actor(self.vehicle_blueprint, random.choice(self.spawn_points)) 84 | # vehicle.set_autopilot(True) 85 | self.actor_list.append(self.vehicle) 86 | 87 | self.camera_rgb = self.world.spawn_actor( 88 | self.blueprint_library.find('sensor.camera.rgb'), 89 | carla.Transform(carla.Location(x=1.5, z=2.4), carla.Rotation(pitch=-15)), 90 | attach_to=self.vehicle) 91 | self.actor_list.append(self.camera_rgb) 92 | 93 | self.camera_rgb_vis = self.world.spawn_actor( 94 | self.blueprint_library.find('sensor.camera.rgb'), 95 | carla.Transform(carla.Location(x=-5.5, z=2.8), carla.Rotation(pitch=-15)), 96 | attach_to=self.vehicle) 97 | self.actor_list.append(self.camera_rgb_vis) 98 | 99 | self.collision_sensor = self.world.spawn_actor( 100 | self.blueprint_library.find('sensor.other.collision'), 101 | carla.Transform(), 102 | attach_to=self.vehicle 103 | ) 104 | self.actor_list.append(self.collision_sensor) 105 | 106 | self.speed_controller = PIDLongitudinalController(self.vehicle) 107 | 108 | def reset(self): 109 | for actor in self.actor_list: 110 | actor.destroy() 111 | 112 | def generate_episode(self, model, replay_buffer, ep, action_map=None, eval=True): 113 | with CarlaSyncMode(self.world, self.camera_rgb, self.camera_rgb_vis, self.collision_sensor, fps=30) as sync_mode: 114 | counter = 0 115 | 116 | snapshot, image_rgb, image_rgb_vis, collision = sync_mode.tick(timeout=2.0) 117 | 118 | # destroy if there is no data 119 | if snapshot is None or image_rgb is None: 120 | print("No data, skipping episode") 121 | self.reset() 122 | return None 123 | 124 | image = process_img(image_rgb) 125 | next_state = image 126 | 127 | while True: 128 | if self.visuals: 129 | if should_quit(): 130 | return 131 | self.clock.tick_busy_loop(30) 132 | 133 | vehicle_location = self.vehicle.get_location() 134 | 135 | waypoint = self.world.get_map().get_waypoint(vehicle_location, project_to_road=True, 136 | lane_type=carla.LaneType.Driving) 137 | 138 | speed = get_speed(self.vehicle) 139 | 140 | # Advance the simulation and wait for the data. 
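# (the sync_mode.tick() call further below is what actually advances the simulation; here the frame fetched on the previous iteration becomes the current state)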
141 | state = next_state 142 | 143 | counter += 1 144 | self.global_t += 1 145 | 146 | 147 | action = model.select_action(state, eval=eval) 148 | steer = action 149 | if action_map is not None: 150 | steer = action_map[action] 151 | 152 | control = self.speed_controller.run_step(self.target_speed) 153 | control.steer = steer 154 | self.vehicle.apply_control(control) 155 | 156 | fps = round(1.0 / snapshot.timestamp.delta_seconds) 157 | 158 | snapshot, image_rgb, image_rgb_vis, collision = sync_mode.tick(timeout=2.0) 159 | 160 | cos_yaw_diff, dist, collision = get_reward_comp(self.vehicle, waypoint, collision) 161 | reward = reward_value(cos_yaw_diff, dist, collision) 162 | 163 | if snapshot is None or image_rgb is None: 164 | print("Process ended here") 165 | break 166 | 167 | image = process_img(image_rgb) 168 | 169 | done = 1 if collision else 0 170 | 171 | self.total_rewards += reward 172 | 173 | next_state = image 174 | 175 | replay_buffer.add(state, action, next_state, reward, done) 176 | 177 | if not eval: 178 | if ep > self.start_train and (self.global_t % self.train_freq) == 0: 179 | model.train(replay_buffer) 180 | 181 | # Draw the display. 182 | if self.visuals: 183 | draw_image(self.display, image_rgb_vis) 184 | self.display.blit( 185 | self.font.render('% 5d FPS (real)' % self.clock.get_fps(), True, (255, 255, 255)), 186 | (8, 10)) 187 | self.display.blit( 188 | self.font.render('% 5d FPS (simulated)' % fps, True, (255, 255, 255)), 189 | (8, 28)) 190 | pygame.display.flip() 191 | 192 | if collision == 1 or counter >= self.max_iter or dist > self.max_dist_from_waypoint: 193 | print("Episode {} processed".format(ep), counter) 194 | break 195 | 196 | if ep % self.save_freq == 0 and ep > 0: 197 | self.save(model, ep) 198 | 199 | def save(self, model, ep): 200 | if ep % self.save_freq == 0 and ep > self.start_ep: 201 | avg_reward = self.total_rewards/self.save_freq 202 | self.average_rewards_list.append(avg_reward) 203 | self.total_rewards = 0 204 | 205 | model.save('weights/model_ep_{}'.format(ep)) 206 | 207 | print("Saved model with average reward =", avg_reward) 208 | 209 | def quit(self): 210 | pygame.quit() 211 | 212 | def get_reward_comp(vehicle, waypoint, collision): 213 | vehicle_location = vehicle.get_location() 214 | x_wp = waypoint.transform.location.x 215 | y_wp = waypoint.transform.location.y 216 | 217 | x_vh = vehicle_location.x 218 | y_vh = vehicle_location.y 219 | 220 | wp_array = np.array([x_wp, y_wp]) 221 | vh_array = np.array([x_vh, y_vh]) 222 | 223 | dist = np.linalg.norm(wp_array - vh_array) 224 | 225 | vh_yaw = correct_yaw(vehicle.get_transform().rotation.yaw) 226 | wp_yaw = correct_yaw(waypoint.transform.rotation.yaw) 227 | cos_yaw_diff = np.cos((vh_yaw - wp_yaw)*np.pi/180.) 
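# cos_yaw_diff is 1 when the vehicle heading matches the waypoint heading and decreases as the two headings diverge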
228 | 229 | collision = 0 if collision is None else 1 230 | 231 | return cos_yaw_diff, dist, collision 232 | 233 | def reward_value(cos_yaw_diff, dist, collision, lambda_1=1, lambda_2=1, lambda_3=5): 234 | reward = (lambda_1 * cos_yaw_diff) - (lambda_2 * dist) - (lambda_3 * collision) 235 | return reward 236 | -------------------------------------------------------------------------------- /self_driving_agent/initial_setup.py: -------------------------------------------------------------------------------- 1 | from utils import create_folders 2 | 3 | # automatically creates folders that may not exist, or ignores if they do 4 | folders = ['weights'] 5 | create_folders(folders) 6 | -------------------------------------------------------------------------------- /self_driving_agent/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from DQN_Control.replay_buffer import ReplayBuffer 4 | from DQN_Control.model import DQN 5 | 6 | from config import action_map, env_params 7 | from utils import * 8 | from environment import SimEnv 9 | 10 | def run(): 11 | try: 12 | buffer_size = 1e4 13 | batch_size = 32 14 | state_dim = (128, 128) 15 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 16 | device = "cpu" 17 | num_actions = len(action_map) 18 | in_channels = 1 19 | episodes = 10000 20 | 21 | replay_buffer = ReplayBuffer(state_dim, batch_size, buffer_size, device) 22 | model = DQN(num_actions, state_dim, in_channels, device) 23 | 24 | # this only works if you have a model in your weights folder. Replace this by that file 25 | model.load('weights/model_ep_4400') 26 | 27 | # set to True if you want to run with pygame 28 | env = SimEnv(visuals=True, **env_params) 29 | 30 | for ep in range(episodes): 31 | env.create_actors() 32 | env.generate_episode(model, replay_buffer, ep, action_map, eval=True) 33 | env.reset() 34 | finally: 35 | env.reset() 36 | env.quit() 37 | 38 | if __name__ == "__main__": 39 | run() 40 | -------------------------------------------------------------------------------- /self_driving_agent/synch_mode.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import sys 4 | 5 | try: 6 | sys.path.append(glob.glob('../carla/dist/carla-*%d.%d-%s.egg' % ( 7 | sys.version_info.major, 8 | sys.version_info.minor, 9 | 'win-amd64' if os.name == 'nt' else 'linux-x86_64'))[0]) 10 | except IndexError: 11 | pass 12 | 13 | import carla 14 | import queue 15 | 16 | class CarlaSyncMode(object): 17 | """ 18 | Context manager to synchronize output from different sensors. 
Synchronous 19 | mode is enabled as long as we are inside this context 20 | 21 | with CarlaSyncMode(world, sensors) as sync_mode: 22 | while True: 23 | data = sync_mode.tick(timeout=1.0) 24 | 25 | """ 26 | 27 | def __init__(self, world, *sensors, **kwargs): 28 | self.world = world 29 | self.sensors = sensors 30 | self.frame = None 31 | self.delta_seconds = 1.0 / kwargs.get('fps', 20) 32 | self._queues = [] 33 | self._settings = None 34 | self.collisions = [] 35 | 36 | def __enter__(self): 37 | self._settings = self.world.get_settings() 38 | self.frame = self.world.apply_settings(carla.WorldSettings( 39 | no_rendering_mode=False, 40 | synchronous_mode=True, 41 | fixed_delta_seconds=self.delta_seconds)) 42 | 43 | def make_queue(register_event): 44 | q = queue.Queue() 45 | register_event(q.put) 46 | self._queues.append(q) 47 | 48 | make_queue(self.world.on_tick) 49 | for sensor in self.sensors: 50 | make_queue(sensor.listen) 51 | return self 52 | 53 | def tick(self, timeout): 54 | try: 55 | self.frame = self.world.tick() 56 | data = [self._retrieve_data(q, timeout) for q in self._queues[:-1]] 57 | # collision sensor is the last element in the queue 58 | collision = self._detect_collision(self._queues[-1]) 59 | 60 | assert all(x.frame == self.frame for x in data) 61 | 62 | return data + [collision] 63 | except queue.Empty: 64 | print("empty queue") 65 | return None, None, None 66 | 67 | def __exit__(self, *args, **kwargs): 68 | self.world.apply_settings(self._settings) 69 | 70 | def _retrieve_data(self, sensor_queue, timeout): 71 | while True: 72 | data = sensor_queue.get(timeout=timeout) 73 | if data.frame == self.frame: 74 | return data 75 | 76 | def _detect_collision(self, sensor): 77 | # This collision is not fully aligned with other sensors, fix later 78 | try: 79 | data = sensor.get(block=False) 80 | return data 81 | except queue.Empty: 82 | return None -------------------------------------------------------------------------------- /self_driving_agent/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | from DQN_Control.replay_buffer import ReplayBuffer 5 | from DQN_Control.model import DQN 6 | 7 | from config import action_map, env_params 8 | from utils import * 9 | from environment import SimEnv 10 | 11 | def run(): 12 | try: 13 | buffer_size = 1e4 14 | batch_size = 32 15 | state_dim = (128, 128) 16 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 17 | num_actions = len(action_map) 18 | in_channels = 1 19 | episodes = 10000 20 | 21 | replay_buffer = ReplayBuffer(state_dim, batch_size, buffer_size, device) 22 | model = DQN(num_actions, state_dim, in_channels, device) 23 | 24 | env = SimEnv(visuals=False, **env_params) 25 | 26 | for ep in range(episodes): 27 | env.create_actors() 28 | env.generate_episode(model, replay_buffer, ep, action_map, eval=False) 29 | env.reset() 30 | finally: 31 | env.quit() 32 | 33 | if __name__ == "__main__": 34 | run() 35 | -------------------------------------------------------------------------------- /self_driving_agent/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import pygame 4 | import math 5 | import numpy as np 6 | 7 | def process_img(image, dim_x=128, dim_y=128): 8 | array = np.frombuffer(image.raw_data, dtype=np.dtype("uint8")) 9 | array = np.reshape(array, (image.height, image.width, 4)) 10 | array = array[:, :, :3] 11 | array = array[:, :, ::-1] 12 | 13 | # scale_percent = 25 14 | 
# width = int(array.shape[1] * scale_percent/100) 15 | # height = int(array.shape[0] * scale_percent/100) 16 | 17 | # dim = (width, height) 18 | dim = (dim_x, dim_y) # set same dim for now 19 | resized_img = cv2.resize(array, dim, interpolation=cv2.INTER_AREA) 20 | img_gray = cv2.cvtColor(resized_img, cv2.COLOR_BGR2GRAY) 21 | scaledImg = img_gray/255. 22 | 23 | # normalize 24 | mean, std = 0.5, 0.5 25 | normalizedImg = (scaledImg - mean) / std 26 | 27 | return normalizedImg 28 | 29 | def draw_image(surface, image, blend=False): 30 | array = np.frombuffer(image.raw_data, dtype=np.dtype("uint8")) 31 | array = np.reshape(array, (image.height, image.width, 4)) 32 | array = array[:, :, :3] 33 | array = array[:, :, ::-1] 34 | image_surface = pygame.surfarray.make_surface(array.swapaxes(0, 1)) 35 | if blend: 36 | image_surface.set_alpha(100) 37 | surface.blit(image_surface, (0, 0)) 38 | 39 | def get_font(): 40 | fonts = [x for x in pygame.font.get_fonts()] 41 | default_font = 'ubuntumono' 42 | font = default_font if default_font in fonts else fonts[0] 43 | font = pygame.font.match_font(font) 44 | return pygame.font.Font(font, 14) 45 | 46 | def should_quit(): 47 | for event in pygame.event.get(): 48 | if event.type == pygame.QUIT: 49 | return True 50 | elif event.type == pygame.KEYUP: 51 | if event.key == pygame.K_ESCAPE: 52 | return True 53 | return False 54 | 55 | def get_speed(vehicle): 56 | """ 57 | Compute speed of a vehicle in Km/h. 58 | :param vehicle: the vehicle for which speed is calculated 59 | :return: speed as a float in Km/h 60 | """ 61 | vel = vehicle.get_velocity() 62 | 63 | return 3.6 * math.sqrt(vel.x ** 2 + vel.y ** 2 + vel.z ** 2) 64 | 65 | def correct_yaw(x): 66 | return(((x%360) + 360) % 360) 67 | 68 | def create_folders(folder_names): 69 | for directory in folder_names: 70 | if not os.path.exists(directory): 71 | # If it doesn't exist, create it 72 | os.makedirs(directory) --------------------------------------------------------------------------------