├── Deep-RL-Mountain-Car ├── Policy.py ├── Agent.py ├── MCtest.py ├── MountainCar.py ├── QLearning.py ├── README.md ├── TrainModel.py ├── main.py ├── expected_sarsa.py ├── sarsa.py └── double_q.py ├── multi_armed_bandits ├── bandit.py └── policy.py ├── .gitignore ├── README.md ├── finite_markov_decision_processes └── cartpole.py ├── function_approximation ├── mountain_car_tile_coding.py └── tiles3.py ├── temporal_difference ├── blackjack.py ├── blackjack_figures.py └── CartPole.py ├── lib.py └── introduction_to_rl └── tic_tac_toe.py /Deep-RL-Mountain-Car/Policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class EpsilonGreedyPolicy: 5 | def __init__(self, number_of_action, epsilon=.1, ): 6 | self.epsilon = epsilon 7 | self.number_of_action = number_of_action 8 | 9 | def get_action(self, q_values): 10 | pi = self.get_pi(q=q_values) 11 | 12 | return np.random.choice(np.arange(self.number_of_action), p=pi) 13 | 14 | def get_pi(self, q): 15 | pi = np.ones(self.number_of_action, dtype=float) # init values 16 | pi *= self.epsilon / self.number_of_action # probability of random action 17 | 18 | greedy_action = np.argmax(q) 19 | pi[greedy_action] += (1.0 - self.epsilon) # probability of greedy action 20 | 21 | return pi 22 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/Agent.py: -------------------------------------------------------------------------------- 1 | class Agent(object): 2 | def __init__(self, max_memory=100, discount=.99): 3 | self.max_memory = max_memory 4 | self.memory = list() 5 | self.discount = discount 6 | 7 | def remember(self, states, game_over): 8 | # memory[i] = [[state_t, action_t, reward_t, state_t+1], game_over?] 9 | self.memory.append([states, game_over]) 10 | if len(self.memory) > self.max_memory: 11 | del self.memory[0] 12 | 13 | def __create_model(self, load_model): 14 | pass 15 | 16 | def episode(self, env, batch_size=10, n_step=1, epoch=0): 17 | pass 18 | 19 | def save_model(self): 20 | pass 21 | 22 | def train(self, current_step, batch_size=10, n_step=1): 23 | pass 24 | 25 | def get_batch(self, batch_size=10, n_step=1): 26 | pass 27 | 28 | def get_action(self, state): 29 | pass 30 | 31 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/MCtest.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | from keras.models import model_from_json 4 | from MountainCar import MountainCar 5 | 6 | 7 | if __name__ == "__main__": 8 | # Initialize parameters 9 | 10 | # working with any parameters below, not nessecarily the parameters set during the training, it seems like neural network is able to generalize to othe initializations as well 11 | Xrange = [-1.5, 0.55] 12 | Vrange = [-0.7, 0.7] 13 | start = [-0.5, -0.1] 14 | goal = [0.45] 15 | 16 | with open("model.json", "r") as jfile: 17 | model = model_from_json(json.load(jfile)) 18 | model.load_weights("model.h5") 19 | model.compile("sgd", "mse") 20 | 21 | # Define environment, game 22 | env = MountainCar(start, goal, Xrange, Vrange) 23 | 24 | for e in range(10): 25 | c = 0 26 | loss = 0. 
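# Each test episode below is a purely greedy rollout: the environment is reset, the saved
# Keras model is queried for Q-values at the current (position, velocity) observation, and
# the argmax action is applied until the environment reports game_over, with `c` counting the steps.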
27 | env.reset() 28 | game_over = False 29 | # get initial input 30 | input_t = env.observe() 31 | 32 | c += 1 33 | while not game_over: 34 | input_tm1 = input_t 35 | 36 | # get next action 37 | q = model.predict(input_tm1) 38 | action = np.argmax(q[0]) 39 | 40 | # apply action, get rewards and new state 41 | input_t, reward, game_over = env.act(action) 42 | 43 | c += 1 44 | print("Episode %d, Steps %d" %(e, c)) -------------------------------------------------------------------------------- /multi_armed_bandits/bandit.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | 5 | class bandit: 6 | def __init__(self, kArm=10, epsilon=0, initial=0., ucb=0., variance=1, min=-2, max=2): 7 | self.epsilon = epsilon 8 | self.ucb = ucb 9 | self.kArm = kArm 10 | self.variance = variance 11 | 12 | self.selectActions = np.zeros(self.kArm) 13 | self.totalTS = 0 14 | self.steps = np.zeros(self.kArm) 15 | self.qTable = np.full(self.kArm, initial) 16 | self.armMeans = np.random.uniform(min, max, self.kArm) 17 | 18 | def takeAction(self): 19 | if self.epsilon > 0 and np.random.binomial(1, self.epsilon) == 1: 20 | idx = np.random.choice(self.kArm) 21 | elif self.ucb > 0: 22 | actions = [] 23 | for idx in np.arange(self.kArm): 24 | if self.selectActions[idx] == 0: 25 | actions.append(1000) 26 | break 27 | else: 28 | actions.append(self.qTable[idx] + self.ucb * math.sqrt(math.log(self.totalTS) / self.selectActions[idx])) 29 | 30 | idx = np.argmax(np.asarray(actions)) 31 | self.selectActions[idx] += 1 32 | else: 33 | idx = np.argmax(self.qTable) 34 | 35 | reward = np.random.normal(self.armMeans[idx], self.variance) 36 | 37 | self.totalTS += 1 38 | self.steps[idx] += 1 39 | self.qTable[idx] += (1 / self.steps[idx]) * (reward - self.qTable[idx]) 40 | 41 | return reward 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning Türkiye - Reinforcement Learning Project 2 | 3 | This repository consists projects from Deep Learning Türkiye - Reinforcement Learning Group. Enter folders to see each project's details. 4 | 5 | ## 1. Introduction To RL 6 | Simple tic tac toe example. Learns via Value Function at the moment. Policy Search *TODO*. 7 | Benefited from [tansey](https://github.com/tansey/rl-tictactoe/blob/master/tictactoe.py). 8 | 9 | ## 2. Multi-Armed Bandits 10 | Provides the underlying testbed for bandit problem. 11 | 12 | ## 3. Finite Markov Decision Processes 13 | Uses the OpenAI Gym. Learns via Q-Learning. 14 | 15 | ## 4. Temporal Difference 16 | Multiple approaches to CartPole problem. 17 | Benefited from [dennybritz](https://github.com/dennybritz/reinforcement-learning). 18 | 19 | ## Library usage 20 | You can find example usage below. 
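Note that the agents keep tabular Q-values in dictionaries keyed by state, so they assume a discrete (hashable) observation space such as FrozenLake's.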
21 | 22 | ``` 23 | import gym 24 | from lib import q_learning_agent, double_q_learning_agent, sarsa_learning_agent 25 | 26 | env = gym.make("FrozenLake-v0") 27 | env.reset() 28 | 29 | def train(agent): 30 | for i_episode in range(1000): 31 | state = env.reset() 32 | while True: 33 | action = agent.select_action(state) 34 | next_state, reward, done, _ = env.step(action) 35 | agent.learn(action, reward, state, next_state) 36 | if done: 37 | break 38 | state = next_state 39 | 40 | qla = q_learning_agent(epsilon=0.3, discount_factor=0.9, alpha=0.5, action_space=env.action_space.n) 41 | sla = sarsa_learning_agent(epsilon=0.3, discount_factor=0.9, alpha=0.5, action_space=env.action_space.n) 42 | dqla = double_q_learning_agent(epsilon=0.3, discount_factor=0.9, alpha=0.5, action_space=env.action_space.n) 43 | 44 | train(qla) 45 | train(sla) 46 | train(dqla) 47 | ``` -------------------------------------------------------------------------------- /multi_armed_bandits/policy.py: -------------------------------------------------------------------------------- 1 | from bandit import bandit 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | run = 2000 7 | ts = 1000 8 | 9 | variance = 1 10 | limMin = 0 11 | limMax = 3 12 | 13 | 14 | def greedyPolicy(): 15 | averageRewards = np.zeros(ts) 16 | 17 | for num in range(run): 18 | bnd = bandit(variance=variance, min=limMin, max=limMax) 19 | 20 | for t in range(ts): 21 | averageRewards[t] += bnd.takeAction() 22 | 23 | return averageRewards / run 24 | 25 | 26 | def epsilonGreedyPolicy(epsilon): 27 | averageRewards = np.zeros(ts) 28 | 29 | for num in range(run): 30 | bnd = bandit(variance=variance, min=limMin, max=limMax, epsilon=epsilon) 31 | 32 | for t in range(ts): 33 | averageRewards[t] += bnd.takeAction() 34 | 35 | return averageRewards / run 36 | 37 | 38 | def optimisticInitialValues(initial): 39 | averageRewards = np.zeros(ts) 40 | 41 | for num in range(run): 42 | bnd = bandit(variance=variance, min=limMin, max=limMax, initial=initial) 43 | 44 | for t in range(ts): 45 | averageRewards[t] += bnd.takeAction() 46 | 47 | return averageRewards / run 48 | 49 | 50 | def upperConfidenceBound(ucb): 51 | averageRewards = np.zeros(ts) 52 | 53 | for num in range(run): 54 | bnd = bandit(variance=variance, min=limMin, max=limMax, ucb=ucb) 55 | 56 | for t in range(ts): 57 | averageRewards[t] += bnd.takeAction() 58 | 59 | return averageRewards / run 60 | 61 | 62 | # plt.plot(greedyPolicy(), color='r') 63 | plt.plot(epsilonGreedyPolicy(0.1), color='b') 64 | # plt.plot(epsilonGreedyPolicy(0.01), color='g') 65 | # plt.plot(optimisticInitialValues(5), color='g') 66 | plt.plot(upperConfidenceBound(2), color='r') 67 | 68 | 69 | plt.show() 70 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/MountainCar.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | 4 | 5 | class MountainCar(object): 6 | def __init__(self, start, goal, Xrange, Vrange): 7 | self.start = start 8 | self.goal = goal 9 | self.Xrange = Xrange 10 | self.Vrange = Vrange 11 | self.num_actions = 3 12 | 13 | def _DoAction(self, action): 14 | # MountainCarDoAction: executes the action (a) into the mountain car 15 | # a: is the force to be applied to the car 16 | # x: is the vector containning the position and speed of the car 17 | # xp: is the vector containing the new position and velocity of the car 18 | 19 | position = self.state[0] 20 | speed = self.state[1] 21 | 22 
| # bounds for position 23 | bpleft = self.Xrange[0] 24 | 25 | # bounds for speed 26 | bsleft = self.Vrange[0] 27 | bsright = self.Vrange[1] 28 | speedt1 = speed + (0.001 * (action - 1)) + (-0.0025 * math.cos(3.0 * position)) 29 | speedt1 = speedt1 * 0.999 # thermodynamic law, for a more real system with friction. 30 | 31 | if speedt1 < bsleft: 32 | speedt1 = bsleft 33 | elif speedt1 > bsright: 34 | speedt1 = bsright 35 | 36 | post1 = position + speedt1 37 | 38 | if post1 <= bpleft: 39 | post1 = bpleft 40 | speedt1 = 0.0 41 | 42 | xp = np.array([post1, speedt1]) 43 | self.state = xp 44 | 45 | def _GetReward(self): 46 | # MountainCarGetReward returns the reward at the current state 47 | # x: a vector of position and velocity of the car 48 | # r: the returned reward. 49 | # f: true if the car reached the goal, otherwise f is false 50 | 51 | position = self.state[0] 52 | # bound for position; the goal is to reach position = 0.45 53 | bpright = self.goal 54 | 55 | r = -1 56 | f = False 57 | 58 | if position >= bpright: 59 | r = 100 60 | f = True 61 | 62 | return r, f 63 | 64 | def act(self, action): 65 | self._DoAction(action) 66 | reward, game_over = self._GetReward() 67 | return self.observe(), reward, game_over 68 | 69 | def observe(self): 70 | return self.state.reshape((1, -1)) 71 | 72 | def reset(self): 73 | self.state = np.array([self.start[0], self.start[1]]) 74 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/QLearning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.models import Sequential 3 | from keras.layers.core import Dense 4 | from keras.optimizers import sgd 5 | from Agent import Agent 6 | 7 | 8 | class QLearning(Agent): 9 | 10 | def __init__(self, num_actions, max_memory=100, discount=.99, e_greedy=.1): 11 | super().__init__(max_memory, discount) 12 | self.num_actions = num_actions 13 | self.epsilon = e_greedy 14 | 15 | self.__create_model() 16 | 17 | def __create_model(self, load_model=True): 18 | hidden_size = 100 19 | 20 | self.model = Sequential() 21 | self.model.add(Dense(hidden_size, input_shape=(2,), activation='relu')) 22 | self.model.add(Dense(hidden_size, activation='relu')) 23 | self.model.add(Dense(self.num_actions)) 24 | self.model.compile(sgd(lr=0.01), "mse") 25 | 26 | def get_batch(self, batch_size=10, n_step=1): 27 | len_memory = len(self.memory)-n_step # we don't want to update 'n' last states, because their returns not calculated yet 28 | num_actions = self.model.output_shape[-1] 29 | 30 | env_dim = self.memory[0][0][0].shape[1] 31 | inputs = np.zeros((min(len_memory, batch_size), env_dim)) 32 | targets = np.zeros((inputs.shape[0], num_actions)) 33 | 34 | for i, idx in enumerate(np.random.randint(0, len_memory, size=inputs.shape[0])): 35 | state, action, reward, next_state = self.memory[idx][0] 36 | game_over = self.memory[idx][1] 37 | 38 | inputs[i:i + 1] = state 39 | # There should be no target values for actions not taken. 
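# In the loop below, each sampled transition first copies the network's current predictions
# into its target row and then overwrites only the entry of the action actually taken with the
# one-step Q-learning target reward + discount * max_a' Q(s', a') (or just the reward at the
# end of an episode), so the MSE loss leaves the other actions' values unchanged.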
40 | # Thou shalt not correct actions not taken #deep 41 | targets[i] = self.model.predict(state)[0] 42 | Q_sa = np.max(self.model.predict(next_state)[0]) 43 | 44 | if game_over: # if game_over is True 45 | targets[i, action] = reward 46 | else: 47 | # reward + gamma * max_a' Q(s', a') 48 | targets[i, action] = reward + self.discount * Q_sa 49 | 50 | return inputs, targets 51 | 52 | def get_action(self, state): 53 | if np.random.rand() <= self.epsilon: 54 | return np.random.randint(0, self.num_actions, size=1)[0] 55 | else: 56 | return np.argmax(self.model.predict(state)[0]) 57 | 58 | def get_model(self): 59 | return self.model 60 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/README.md: -------------------------------------------------------------------------------- 1 | # Deep RL - Mountain Car Domain 2 | This is my implementation of the Mountain Car domain in reinforcement learning, using neural network function approximation with the Keras deep learning library. 3 | *To the best of my knowledge, this is the first open-source code for solving the Mountain Car RL problem using DQN.* 4 | I was motivated by this simple example of [Keras playing catch](https://edersantana.github.io/articles/keras_rl/). 5 | My code adapts that example to the Mountain Car domain. 6 | 7 | ### DQN implementation 8 | The DQN implementation is based on the paper: 9 | Mnih, V., Kavukcuoglu, K., Silver, D., Graves, A., Antonoglou, I., Wierstra, D., & Riedmiller, M. (2013). Playing Atari with deep reinforcement learning. arXiv preprint arXiv:1312.5602. 10 | 11 | ### Mountain Car Domain 12 | Mountain Car is a standard testbed for RL algorithms in which an underpowered car tries to reach a goal position uphill by driving back and forth across the valley. The state space of the car is continuous and consists of its position and velocity. At every state, the car can choose one of 3 possible actions -- move forward, move backward, or stay. Refer to this [Wikipedia article](https://en.wikipedia.org/wiki/Mountain_Car) for more information. 13 | 14 | ![alt tag](Mcar.png) 15 | 16 | The figure above (from Wikipedia) shows the problem: the car is in its starting position and the star marks the goal position. 17 | 18 | ### Files 19 | 1. MountainCar.py -- Defines the Mountain Car environment class: transitioning from one state to another given an action and returning the reward. 20 | 2. MCqlearn.py -- DQN implementation for Q-learning. 21 | 3. MCtest.py -- Testing the learned policy. 22 | 23 | ### Training 24 | The DQN is trained for 1000 successful episodes of the problem. The specific parameters of the algorithm are given in the MCqlearn.py file. To train the DQN network, simply run the training file: 25 | ``` 26 | python MCqlearn.py 27 | ``` 28 | After training, the network parameters are stored in .json and .h5 files. 29 | 30 | ### Testing 31 | Once the network is trained and its parameters are saved in the .json and .h5 files, testing can be done. To test the network, run the file: 32 | ``` 33 | python MCtest.py 34 | ``` 35 | The initial state and other parameters of the Mountain Car domain can be set up in this file. 36 | **It is interesting to note that although the network is trained for only one initial state and one range of the Mountain Car domain, it is able to generalize and succeed during testing for arbitrary initial states and ranges of the domain.** 37 | 38 | ### Dependencies 39 | 1. Python3 40 | 2. Keras 41 | 3. 
Numpy 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /finite_markov_decision_processes/cartpole.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | import numpy as np 3 | import gym 4 | 5 | 6 | # cart_position_bins = np.linspace(-2.4, 2.4, 10) 7 | # cart_velocity_bins = np.linspace(-1, 1, 10) 8 | 9 | class cartPole: 10 | def __init__(self, epsilon, gamma): 11 | self.qTable = {} 12 | self.epsilon = epsilon 13 | self.gamma = gamma 14 | 15 | self.env = gym.make('CartPole-v0') 16 | self.actions = [0, 1] 17 | 18 | self.pole_angle_bins = np.linspace(-0.42, 0.42, 10) 19 | self.pole_velocity_bins = np.linspace(-1, 1, 10) 20 | 21 | self.timesteps_over_time = [] 22 | 23 | def observationToState(self, observation): 24 | pole_angle = np.digitize(x=[observation[2]], bins=self.pole_angle_bins)[0] 25 | pole_velocity = np.digitize(x=[observation[3]], bins=self.pole_velocity_bins)[0] 26 | 27 | return (pole_angle, pole_velocity) 28 | 29 | def updateQTable(self, state, action, reward): 30 | currentValue = self.qTable.get((state, action), None) 31 | 32 | if currentValue is None: 33 | self.qTable[(state, action)] = reward 34 | else: 35 | self.qTable[(state, action)] = currentValue + self.gamma * (reward - currentValue) 36 | 37 | def chooseAction(self, state): 38 | if np.random.random() < self.epsilon: 39 | action = self.env.action_space.sample() 40 | else: 41 | q = [self.qTable.get((state, action), 0.0) for action in self.actions] 42 | action = self.actions[np.argmax(q)] 43 | 44 | return action 45 | 46 | def run(self): 47 | for i_episode in range(1000): 48 | observation = self.env.reset() 49 | state = self.observationToState(observation) 50 | 51 | done = False 52 | ts = 1 53 | 54 | episodeStates = [] 55 | episodeActions = [] 56 | 57 | while not done: 58 | self.env.render() 59 | 60 | action = self.chooseAction(state) 61 | 62 | observation, reward, done, info = self.env.step(action) 63 | 64 | episodeStates.append(state) 65 | episodeActions.append(action) 66 | 67 | state = self.observationToState(observation) 68 | 69 | if done: 70 | print("Episode {} finished after {} timesteps".format(i_episode, ts)) 71 | 72 | ts += 1 73 | 74 | for i in range(len(episodeStates)): 75 | state = episodeStates[i] 76 | action = episodeActions[i] 77 | 78 | self.updateQTable(state, action, ts) 79 | 80 | self.timesteps_over_time.append(ts) 81 | 82 | 83 | cp = cartPole(0.1, 0.5) 84 | cp.run() 85 | 86 | pp = pprint.PrettyPrinter(depth=6) 87 | pp.pprint(cp.qTable) 88 | 89 | pp.pprint(cp.timesteps_over_time) 90 | -------------------------------------------------------------------------------- /function_approximation/mountain_car_tile_coding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | import sys 5 | 6 | from tiles3 import * 7 | 8 | env = gym.make('MountainCar-v0') 9 | 10 | GAMMA = 1.0 11 | LAMBDA = 0.9 12 | 13 | IHT_SIZE = 4096 14 | num_tilings = 8 15 | weights = np.zeros((IHT_SIZE, 1)) 16 | z = np.zeros((IHT_SIZE, 1)) # Eligibility trace vector. 
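# With num_tilings = 8 tilings hashed into a 4096-entry index hash table, each
# (position, velocity, action) triple activates 8 binary features, so the weight vector and the
# eligibility trace z both live in R^4096 and the approximate action value is simply the sum of
# the 8 active weights (the dot product computed in get_value below).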
17 | iht = IHT(IHT_SIZE) 18 | 19 | POSITION_MIN, VELOCITY_MIN = env.env.low 20 | POSITION_MAX, VELOCITY_MAX = env.env.high 21 | 22 | 23 | def get_active_tiles(state, action): 24 | pos, vel = state 25 | active_tiles = tiles(iht, num_tilings, [pos * num_tilings / (POSITION_MAX - POSITION_MIN), 26 | vel * num_tilings / (VELOCITY_MAX - VELOCITY_MIN)], 27 | [action]) 28 | return active_tiles 29 | 30 | 31 | def s_a_feature_vector(state, action): 32 | active_tiles = get_active_tiles(state, action) 33 | feature_vector = np.zeros((IHT_SIZE, 1)) 34 | feature_vector[active_tiles] = 1 35 | return feature_vector 36 | 37 | 38 | def get_value(state, action): 39 | # If the state is terminal. 40 | if state[0] >= POSITION_MAX: 41 | return 0 42 | 43 | return np.dot(weights.T, s_a_feature_vector(state, action)) 44 | 45 | 46 | def get_action(state): 47 | values = [get_value(state, action) for action in range(env.action_space.n)] 48 | return np.argmax(values) 49 | 50 | 51 | alpha = 0.5 52 | step_size = alpha / num_tilings 53 | n_episodes = 100 54 | 55 | # PAGE 305 : Sarsa(λ) with binary features and linear function approximation 56 | for episode in range(n_episodes): 57 | if episode % 10 == 0: 58 | print('\rEpisode {}/{}'.format(episode + 1, n_episodes), end='') 59 | sys.stdout.flush() 60 | 61 | state = env.reset() 62 | while True: 63 | action = get_action(state) 64 | next_state, reward, done, _ = env.step(action) 65 | 66 | delta = reward 67 | 68 | active_tiles = get_active_tiles(state, action) 69 | delta -= get_value(state, action) 70 | z[active_tiles] = 1 71 | 72 | # If the next state is terminal state. 73 | if next_state[0] >= POSITION_MAX: 74 | weights += step_size * delta * z 75 | 76 | next_action = get_action(next_state) 77 | active_tiles = get_active_tiles(next_state, next_action) 78 | delta += GAMMA * get_value(next_state, next_action) 79 | 80 | weights += step_size * delta * z 81 | z = GAMMA * LAMBDA * z 82 | 83 | if done: 84 | break 85 | 86 | state = next_state 87 | 88 | print('Training is done') 89 | 90 | # Test the algorithm for 1 episode. 91 | for i in range(1): 92 | state = env.reset() 93 | while True: 94 | env.render() 95 | action = get_action(state) 96 | # action = get_action(state) 97 | next_state, reward, done, _ = env.step(action) 98 | if done: 99 | break 100 | 101 | state = next_state 102 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/TrainModel.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from MountainCar import MountainCar 6 | from QLearning import QLearning 7 | from sarsa import Sarsa 8 | from double_q import DQLearning 9 | from expected_sarsa import ExpectedSarsa 10 | import random 11 | 12 | if __name__ == "__main__": 13 | # parameters 14 | epsilon = .1 # exploration 15 | num_actions = 3 # [move_left, stay, move_right] 16 | epoch = 5 17 | max_memory = 50000 18 | batch_size = 32 19 | input_size = 2 20 | 21 | Xrange = [-1.5, 0.55] 22 | Vrange = [-2.0, 2.0] 23 | start = [np.random.randint(7) * 0.1 - 0.5, 0.0] 24 | goal = [0.45] 25 | 26 | # n_step = 1 27 | 28 | GAMMA = 0.99 # decay rate of past observations 29 | OBSERVATION = 3200. # timesteps to observe before training 30 | EXPLORE = 10000. 
# frames over which to anneal epsilon 31 | FINAL_EPSILON = 0.01 # final value of epsilon 32 | INITIAL_EPSILON = 0.2 # starting value of epsilon 33 | LEARNING_RATE = 1e-4 34 | FRAME_PER_ACTION = 1 35 | 36 | # all possible steps 37 | nSteps = np.arange(2, 5, 1) 38 | 39 | # all possible alphas 40 | #alphas = np.arange(0.01, 0.2, 0.1) 41 | 42 | alphas = [0.01] 43 | # If you want to continue training from a previous model, just uncomment the line bellow 44 | # model.load_weights("model.h5") 45 | 46 | # Define environment/game 47 | env = MountainCar(start, goal, Xrange, Vrange) 48 | 49 | # Initialize experience replay object 50 | # learning_model = QLearning(num_actions=num_actions, max_memory=max_memory) 51 | #learning_model = DQLearning(num_actions=num_actions, max_memory=max_memory, e_greedy=INITIAL_EPSILON) 52 | learning_model = ExpectedSarsa(num_actions=num_actions, max_memory=max_memory, e_greedy=INITIAL_EPSILON) 53 | 54 | for e in range(epoch): 55 | # loss = 0. 56 | env = MountainCar(start, goal, Xrange, Vrange) 57 | env.reset() 58 | game_over = False 59 | 60 | # get initial input 61 | s = env.observe() 62 | 63 | t = 0 64 | OBSERVE = OBSERVATION 65 | epsilon = INITIAL_EPSILON 66 | 67 | while not game_over: 68 | loss = 0 69 | t += 1 70 | 71 | action = learning_model.get_action(s) 72 | 73 | # We reduced the epsilon gradually 74 | if epsilon > FINAL_EPSILON and t > OBSERVE: 75 | epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE 76 | learning_model.change_epsilon(epsilon) 77 | 78 | next_state, reward, game_over = env.act(action) 79 | 80 | # store experience 81 | learning_model.remember((s, action, reward, next_state), game_over) 82 | 83 | # only train if done observing 84 | if t > OBSERVE: 85 | loss += learning_model.train(current_step=t, batch_size=batch_size) 86 | 87 | s = next_state 88 | 89 | # save progress every 10000 iterations 90 | if t % 1000 == 0: 91 | learning_model.save_model() 92 | 93 | # print info 94 | state = "" 95 | if t <= OBSERVE: 96 | state = "observe" 97 | elif OBSERVE < t <= OBSERVE + EXPLORE: 98 | state = "explore" 99 | else: 100 | state = "train" 101 | 102 | print("Epoch", e, "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/", "REWARD", reward, 103 | "/ ACTION ", action, "/ POS", next_state[0, 0], "/ Loss ", loss) 104 | 105 | if t > 20000: # stop sampling, continue with new episode 106 | break 107 | 108 | print("Episode finished!") 109 | print("************************") 110 | learning_model.save_model() 111 | training = False 112 | 113 | -------------------------------------------------------------------------------- /temporal_difference/blackjack.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import random 4 | 5 | 6 | env = gym.make('Blackjack-v0') 7 | env.reset() 8 | 9 | 10 | def make_epsilon_greedy_policy(Q_1, Q_2, epsilon, nA): 11 | """ 12 | Creates an epsilon-greedy policy based on a given Q-function and epsilon. 13 | Args: 14 | Q: A dictionary that maps from state -> action-values. 15 | Each value is a numpy array of length nA (see below) 16 | epsilon: The probability to select a random action . float between 0 and 1. 17 | nA: Number of actions in the environment. 18 | Returns: 19 | A function that takes the observation as an argument and returns 20 | the probabilities for each action in the form of a numpy array of length nA. 
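    Here Q_1 and Q_2 are the two tables of double Q-learning, and the greedy action is
    chosen with respect to their sum Q_1 + Q_2.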
21 | """ 22 | def policy_fn(observation): 23 | A_probs = np.ones(nA, dtype=float) * epsilon / nA 24 | 25 | Q = Q_1 + Q_2 26 | 27 | best_action = np.argmax(Q[observation[0]][observation[1]][int(observation[2])]) 28 | A_probs[best_action] += (1.0 - epsilon) 29 | 30 | return A_probs # .reshape(1,-1) 31 | 32 | return policy_fn 33 | 34 | 35 | # Q Learning 36 | def double_Q_learning(env, train_episodes, test_episodes, discount_factor=0.2, alpha=0.5, epsilon=0.1): 37 | # Q table initialization 38 | Q_1 = np.zeros((32, 11, 2, env.action_space.n)) 39 | Q_2 = np.zeros((32, 11, 2, env.action_space.n)) 40 | 41 | # The policy we're following 42 | policy = make_epsilon_greedy_policy(Q_1, Q_2, epsilon, env.action_space.n) 43 | 44 | for i_episode in range(train_episodes): 45 | # Reset the environment and pick the first action 46 | observation = env.reset() 47 | state = observation 48 | 49 | while True: 50 | # Take a step 51 | action_probs = policy(state) 52 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 53 | 54 | next_observation, reward, done, _ = env.step(action) 55 | next_state = next_observation 56 | 57 | if(random.randint(1, 2) == 1): 58 | # Q Learning Update 59 | best_next_action = np.argmax(Q_1[next_state[0]][next_state[1]][int(next_state[2])]) 60 | td_target = reward + discount_factor * Q_2[next_state[0]][next_state[1]][int(next_state[2])][best_next_action] 61 | td_delta = td_target - Q_1[state[0]][state[1]][int(state[2])][action] 62 | Q_1[state[0]][state[1]][int(state[2])][action] += alpha * td_delta 63 | 64 | else: 65 | # Q Learning Update 66 | best_next_action = np.argmax(Q_2[next_state[0]][next_state[1]][int(next_state[2])]) 67 | td_target = reward + discount_factor * Q_1[next_state[0]][next_state[1]][int(next_state[2])][best_next_action] 68 | td_delta = td_target - Q_2[state[0]][state[1]][int(state[2])][action] 69 | Q_2[state[0]][state[1]][int(state[2])][action] += alpha * td_delta 70 | 71 | state = next_state 72 | 73 | if done: 74 | break 75 | 76 | policy = make_epsilon_greedy_policy(Q_1, Q_2, 0, env.action_space.n) 77 | 78 | win_count = 0 79 | reward_sum = 0 80 | 81 | for i_episode in range(test_episodes): 82 | # Reset the environment and pick the first action 83 | observation = env.reset() 84 | state = observation 85 | 86 | while True: 87 | # Take a step 88 | action_probs = policy(observation) 89 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 90 | 91 | observation, reward, done, _ = env.step(action) 92 | 93 | reward_sum += reward 94 | 95 | if reward > 0: 96 | win_count += 1 97 | 98 | if done: 99 | break 100 | 101 | print("Total Episodes: {}".format(test_episodes)) 102 | print("Win Count: {}".format(win_count)) 103 | print("Reward Sum: {}".format(reward_sum)) 104 | 105 | 106 | double_Q_learning(env, 10000, 1000) 107 | -------------------------------------------------------------------------------- /temporal_difference/blackjack_figures.py: -------------------------------------------------------------------------------- 1 | 2 | # REFERENCE : https://github.com/dennybritz/reinforcement-learning 3 | 4 | import numpy as np 5 | import gym 6 | import matplotlib.pyplot as plt 7 | import matplotlib.patches as mpatches 8 | 9 | import sys 10 | from collections import defaultdict 11 | 12 | env = gym.make('Blackjack-v0') 13 | 14 | 15 | def make_epsilon_greedy_policy(Q, epsilon, nA): 16 | def policy_fn(observation): 17 | A_probs = np.ones(nA, dtype=float) * epsilon / nA 18 | best_action = np.argmax(Q[observation]) 19 | A_probs[best_action] += (1 - 
epsilon) 20 | return A_probs 21 | 22 | return policy_fn 23 | 24 | 25 | def sarsa(env, n_episodes=500, discount_factor=1.0, alpha=0.5, epsilon=0.1): 26 | 27 | Q = defaultdict(lambda: np.zeros(env.action_space.n)) 28 | policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n) 29 | 30 | for i_episode in range(n_episodes): 31 | if (i_episode + 1) % 100 == 0: 32 | print("\rEpisode {}/{}.".format(i_episode + 1, n_episodes), end="") 33 | sys.stdout.flush() 34 | state = env.reset() 35 | while True: 36 | A_probs = policy(state) 37 | action = np.random.choice(np.arange(len(A_probs)), p=A_probs) 38 | next_state, reward, done, _ = env.step(action) 39 | 40 | next_action_probs = policy(next_state) 41 | next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs) 42 | td_target = reward + discount_factor * Q[next_state][next_action] 43 | td_error = td_target - Q[state][action] 44 | Q[state][action] += alpha * td_error 45 | 46 | if done: 47 | break 48 | 49 | state = next_state 50 | 51 | return Q 52 | 53 | 54 | def Q_learning(env, n_episodes=500, discount_factor=1.0, alpha=0.5, epsilon=0.1): 55 | 56 | Q = defaultdict(lambda: np.zeros(env.action_space.n)) 57 | policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n) 58 | 59 | for i_episode in range(n_episodes): 60 | if (i_episode + 1) % 100 == 0: 61 | print("\rEpisode {}/{}.".format(i_episode + 1, n_episodes), end="") 62 | sys.stdout.flush() 63 | state = env.reset() 64 | while True: 65 | probs = policy(state) 66 | action = np.random.choice(np.arange(len(probs)), p=probs) 67 | next_state, reward, done, _ = env.step(action) 68 | next_action = np.argmax(Q[next_state]) 69 | td_target = reward + discount_factor * Q[next_state][next_action] 70 | td_error = td_target - Q[state][action] 71 | Q[state][action] += alpha * td_error 72 | 73 | if done: 74 | break 75 | 76 | state = next_state 77 | 78 | return Q 79 | 80 | 81 | def plot_figure(ax, usable_ace): 82 | def get_action(player_hand, dealer_showing, usable_ace): 83 | return policy[player_hand, dealer_showing, usable_ace] if (player_hand, dealer_showing, usable_ace) in policy else 1 84 | 85 | policy_mat = np.array([[get_action(player_hand, dealer_showing, usable_ace) for dealer_showing in range(1, 11)] 86 | for player_hand in range(21, 10, -1)]) 87 | 88 | ax.imshow(policy_mat, cmap=plt.cm.Accent, extent=[0.5, 10.5, 10.5, 21.5]) 89 | plt.ylim(11, 21) 90 | plt.xlim(1, 10) 91 | plt.xlabel('Dealer Hand') 92 | plt.ylabel('Player Hand') 93 | hit_patch = mpatches.Patch(color=plt.cm.Accent(.1), label='Stick') 94 | stick_patch = mpatches.Patch(color=plt.cm.Accent(.9), label='Hit') 95 | plt.legend(handles=[hit_patch, stick_patch]) 96 | 97 | 98 | # Q = sarsa(env, 1000) 99 | Q = Q_learning(env, 1000) 100 | policy = dict((k, np.argmax(v)) for k, v in Q.items()) 101 | 102 | fig = plt.figure(figsize=(15, 15)) 103 | ax = fig.add_subplot(121) 104 | ax.set_title('Blackjack MC Policy - No Usable Ace') 105 | plot_figure(ax, True) 106 | ax = fig.add_subplot(122) 107 | ax.set_title('Blackjack MC Policy - Usable Ace') 108 | plot_figure(ax, False) 109 | plt.show() 110 | -------------------------------------------------------------------------------- /function_approximation/tiles3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tile Coding Software version 3.0beta 3 | by Rich Sutton 4 | based on a program created by Steph Schaeffer and others 5 | External documentation and recommendations on the use of this code is available in the 6 | 
reinforcement learning textbook by Sutton and Barto, and on the web. 7 | These need to be understood before this code is. 8 | 9 | This software is for Python 3 or more. 10 | 11 | This is an implementation of grid-style tile codings, based originally on 12 | the UNH CMAC code (see http://www.ece.unh.edu/robots/cmac.htm), but by now highly changed. 13 | Here we provide a function, "tiles", that maps floating and integer 14 | variables to a list of tiles, and a second function "tiles-wrap" that does the same while 15 | wrapping some floats to provided widths (the lower wrap value is always 0). 16 | 17 | The float variables will be gridded at unit intervals, so generalization 18 | will be by approximately 1 in each direction, and any scaling will have 19 | to be done externally before calling tiles. 20 | 21 | Num-tilings should be a power of 2, e.g., 16. To make the offsetting work properly, it should 22 | also be greater than or equal to four times the number of floats. 23 | 24 | The first argument is either an index hash table of a given size (created by (make-iht size)), 25 | an integer "size" (range of the indices from 0), or nil (for testing, indicating that the tile 26 | coordinates are to be returned without being converted to indices). 27 | """ 28 | 29 | basehash = hash 30 | 31 | class IHT: 32 | "Structure to handle collisions" 33 | def __init__(self, sizeval): 34 | self.size = sizeval 35 | self.overfullCount = 0 36 | self.dictionary = {} 37 | 38 | def __str__(self): 39 | "Prepares a string for printing whenever this object is printed" 40 | return "Collision table:" + \ 41 | " size:" + str(self.size) + \ 42 | " overfullCount:" + str(self.overfullCount) + \ 43 | " dictionary:" + str(len(self.dictionary)) + " items" 44 | 45 | def count (self): 46 | return len(self.dictionary) 47 | 48 | def fullp (self): 49 | return len(self.dictionary) >= self.size 50 | 51 | def getindex (self, obj, readonly=False): 52 | d = self.dictionary 53 | if obj in d: return d[obj] 54 | elif readonly: return None 55 | size = self.size 56 | count = self.count() 57 | if count >= size: 58 | if self.overfullCount==0: print('IHT full, starting to allow collisions') 59 | self.overfullCount += 1 60 | return basehash(obj) % self.size 61 | else: 62 | d[obj] = count 63 | return count 64 | 65 | def hashcoords(coordinates, m, readonly=False): 66 | if type(m)==IHT: return m.getindex(tuple(coordinates), readonly) 67 | if type(m)==int: return basehash(tuple(coordinates)) % m 68 | if m==None: return coordinates 69 | 70 | from math import floor, log 71 | from itertools import zip_longest 72 | 73 | def tiles (ihtORsize, numtilings, floats, ints=[], readonly=False): 74 | """returns num-tilings tile indices corresponding to the floats and ints""" 75 | qfloats = [floor(f*numtilings) for f in floats] 76 | Tiles = [] 77 | for tiling in range(numtilings): 78 | tilingX2 = tiling*2 79 | coords = [tiling] 80 | b = tiling 81 | for q in qfloats: 82 | coords.append( (q + b) // numtilings ) 83 | b += tilingX2 84 | coords.extend(ints) 85 | Tiles.append(hashcoords(coords, ihtORsize, readonly)) 86 | return Tiles 87 | 88 | def tileswrap (ihtORsize, numtilings, floats, wrawidths, ints=[], readonly=False): 89 | """returns num-tilings tile indices corresponding to the floats and ints, wrapping some floats""" 90 | qfloats = [floor(f*numtilings) for f in floats] 91 | Tiles = [] 92 | for tiling in range(numtilings): 93 | tilingX2 = tiling*2 94 | coords = [tiling] 95 | b = tiling 96 | for q, width in zip_longest(qfloats, wrapwidths): 97 | c = (q + 
b%numtilings) // numtilings 98 | coords.append(c%width if width else c) 99 | b += tilingX2 100 | coords.extend(ints) 101 | Tiles.append(hashcoords(coords, ihtORsize, readonly)) 102 | return Tiles 103 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/main.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from MountainCar import MountainCar 6 | from sarsa import Sarsa 7 | from double_q import DQLearning 8 | from expected_sarsa import ExpectedSarsa 9 | 10 | 11 | def test_double_q(): 12 | runs = 1 13 | episode = 2 14 | 15 | alphas = [] 16 | steps = np.zeros((len(alphas), episode)) 17 | 18 | for run in range(0, runs): 19 | for i, n_step in zip(range(len(alphas)), alphas): 20 | learning_model = DQLearning(num_actions=num_actions, max_memory=max_memory, e_greedy=epsilon, load_model=False) 21 | 22 | for e in range(episode): 23 | print('run:', run, 'alpha:', alphas[i], 'episode:', e) 24 | 25 | start = [random.uniform(-0.6, -0.4), 0.0] 26 | env = MountainCar(start, goal, Xrange, Vrange) 27 | 28 | step = learning_model.episode(env=env, batch_size=batch_size, n_step=n_step, epoch=e) 29 | steps[i, e] += step 30 | 31 | steps /= runs 32 | 33 | for i in range(0, len(alphas)): 34 | plt.plot(steps[i], label='n = ' + str(alphas[i])) 35 | plt.xlabel('Alpha') 36 | plt.ylabel('Steps per episode') 37 | plt.yscale('log') 38 | plt.legend() 39 | 40 | plt.show() 41 | 42 | 43 | def one_vs_multi_step(): 44 | runs = 1 45 | episode = 10 46 | n_steps = np.arange(1, 9, 1) 47 | 48 | steps = np.zeros((len(n_steps), episode)) 49 | 50 | for run in range(0, runs): 51 | for i, n_step in zip(range(len(n_steps)), n_steps): 52 | learning_model = ExpectedSarsa(num_actions=num_actions, max_memory=max_memory, e_greedy=epsilon, load_model=False) 53 | #learning_model = Sarsa(num_actions=num_actions, max_memory=max_memory, e_greedy=epsilon, load_model=False) 54 | 55 | for e in range(episode): 56 | print('run:', run, 'steps:', n_steps[i], 'episode:', e) 57 | 58 | start = [random.uniform(-0.6, -0.4), 0.0] 59 | env = MountainCar(start, goal, Xrange, Vrange) 60 | 61 | step = learning_model.episode(env=env, batch_size=batch_size, n_step=n_step, epoch=e) 62 | steps[i, e] += step 63 | 64 | steps /= runs 65 | 66 | for i in range(0, len(n_steps)): 67 | plt.plot(steps[i], label='n = '+str(n_steps[i])) 68 | plt.xlabel('Episode') 69 | plt.ylabel('Steps per episode') 70 | plt.yscale('log') 71 | plt.legend() 72 | 73 | plt.show() 74 | 75 | 76 | def effect_of_alpha_and_n(): 77 | # all possible alphas 78 | alphas = [0.0002, 0.0004, 0.0008, 0.0012] 79 | 80 | # all possible steps 81 | n_steps = np.arange(1, 9, 1) 82 | 83 | epoch = 20 84 | runs = 1 85 | 86 | steps = np.zeros((len(n_steps), len(alphas))) 87 | 88 | for run in range(0, runs): 89 | for nStepIndex, n_step in zip(range(0, len(n_steps)), n_steps): 90 | for alphaIndex, alpha in zip(range(0, len(alphas)), alphas): 91 | learning_model = ExpectedSarsa(num_actions=num_actions, max_memory=max_memory, e_greedy=epsilon, load_model=False) 92 | #learning_model = Sarsa(num_actions=num_actions, max_memory=max_memory, e_greedy=epsilon, 93 | # load_model=False) 94 | 95 | for e in range(0, epoch): 96 | print('run:', run, 'steps:', n_step, 'alpha:', alpha, 'episode:', e) 97 | 98 | start = [random.uniform(-0.6, -0.4), 0.0] 99 | env = MountainCar(start, goal, Xrange, Vrange) 100 | 101 | step = learning_model.episode(env=env, 
batch_size=batch_size, n_step=n_step, epoch=e) 102 | steps[nStepIndex, alphaIndex] += step 103 | 104 | # average over independent runs and episodes 105 | steps /= runs * epoch 106 | 107 | for i in range(0, len(n_steps)): 108 | plt.plot(alphas, steps[i, :], label='n = '+str(n_steps[i])) 109 | plt.xlabel('Alpha') 110 | plt.ylabel('Steps per episode') 111 | plt.legend() 112 | 113 | plt.show() 114 | 115 | 116 | if __name__ == "__main__": 117 | # parameters 118 | epsilon = .1 # exploration 119 | num_actions = 3 # [move_left, stay, move_right] 120 | max_memory = 20000 121 | batch_size = 100 122 | input_size = 2 123 | 124 | Xrange = [-1.2, 0.6] 125 | Vrange = [-0.07, 0.07] 126 | goal = [0.5] 127 | 128 | one_vs_multi_step() 129 | effect_of_alpha_and_n() 130 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/expected_sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import math 4 | import os.path 5 | 6 | from keras.models import Sequential 7 | from keras.layers.core import Dense, Dropout 8 | from keras.optimizers import SGD 9 | from keras.regularizers import l1 10 | 11 | from Agent import Agent 12 | from Policy import EpsilonGreedyPolicy 13 | 14 | 15 | class ExpectedSarsa(Agent): 16 | def __init__(self, num_actions, max_memory=100, discount=.99, e_greedy=.1, load_model=True): 17 | super().__init__(max_memory, discount) 18 | self.num_actions = num_actions 19 | self.policy = EpsilonGreedyPolicy(number_of_action=num_actions, epsilon=e_greedy) 20 | self.next_action = None 21 | 22 | self.__create_model(load_model) 23 | 24 | def __create_model(self, load_model): 25 | hidden_size = 100 26 | 27 | self.model = Sequential() 28 | self.model.add(Dense(hidden_size, input_shape=(2,), activation='relu', kernel_regularizer=l1(0.01))) 29 | self.model.add(Dense(hidden_size, activation='relu', kernel_regularizer=l1(0.01))) 30 | self.model.add(Dropout(rate=.1)) 31 | self.model.add(Dense(self.num_actions)) 32 | 33 | sgd = SGD(lr=0.0001, momentum=0.99) 34 | self.model.compile(optimizer=sgd, loss="mse") 35 | 36 | if load_model and os.path.exists("model.esarsa"): 37 | self.model.load_weights("model.esarsa") 38 | 39 | def episode(self, env, batch_size=10, n_step=1, epoch=0): 40 | loss = 0. 
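# One training episode: act epsilon-greedily, store each (state, action, reward, next_state)
# transition, and after every step fit the network on minibatch targets built from n-step
# returns bootstrapped with the expected value sum_a pi(a | s_{t+n}) * Q(s_{t+n}, a)
# (see get_batch below).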
41 | env.reset() 42 | game_over = False 43 | # get initial input 44 | state = env.observe() 45 | 46 | time = 0 47 | while not game_over: 48 | # go to next time n_step 49 | time += 1 50 | 51 | action = self.get_action(state) 52 | 53 | # apply action, get rewards and new state 54 | next_state, reward, game_over = env.act(action) 55 | 56 | # store experience 57 | self.remember((state, action, reward, next_state), game_over) 58 | 59 | loss += self.train(current_step=time, batch_size=batch_size, n_step=n_step) 60 | 61 | print('Step {}| epoch {} | n_step {} | Loss {:.4f} |Pos {:.3f} | Act {}'.format( 62 | time, epoch, n_step, loss, next_state[0, 0], action - 1)) 63 | 64 | if math.isnan(loss) or time > 1500: 65 | break 66 | 67 | state = next_state 68 | 69 | print("Episode finished!") 70 | print("************************") 71 | 72 | return time 73 | 74 | def change_epsilon(self, epsilon): 75 | self.policy.epsilon = epsilon 76 | 77 | def set_learning_rate(self, lr): 78 | self.model.optimizer.lr = lr 79 | 80 | def save_model(self): 81 | # Save trained model weights and architecture, this will be used by the visualization code 82 | self.model.save_weights("model.esarsa", overwrite=True) 83 | with open("model.json", "w") as outfile: 84 | json.dump(self.model.to_json(), outfile) 85 | 86 | def train(self, current_step, batch_size=10, n_step=1): 87 | loss = 0. 88 | 89 | model = self.get_model() 90 | 91 | if current_step - n_step >= 0: 92 | inputs, targets = self.get_batch(batch_size=batch_size, n_step=n_step) 93 | 94 | loss += model.train_on_batch(inputs, targets) 95 | 96 | return loss 97 | 98 | def get_batch(self, batch_size=10, n_step=1): 99 | len_memory = len( 100 | self.memory) - n_step + 1 # we don't want to update 'n' last states,because their returns have not seen yet 101 | num_actions = self.model.output_shape[-1] 102 | env_dim = self.memory[0][0][0].shape[1] 103 | 104 | inputs = np.zeros((min(len_memory, batch_size), env_dim)) 105 | targets = np.zeros((inputs.shape[0], num_actions)) 106 | 107 | for i, idx in enumerate(np.random.randint(0, len_memory, size=inputs.shape[0])): 108 | state, action, reward, next_state = self.memory[idx][0] 109 | game_over = self.memory[idx][1] 110 | 111 | inputs[i:i + 1] = state 112 | targets[i] = self.model.predict(state)[0] 113 | 114 | if game_over: # if game_over is True 115 | targets[i, action] = reward 116 | else: 117 | returns = 0.0 118 | t_n_state = next_state 119 | 120 | for t in range(idx, idx + n_step): 121 | _, _, reward, t_n_state = self.memory[t][0] 122 | returns += pow(self.discount, t - idx) * reward 123 | 124 | if reward != 100: 125 | q = self.model.predict(t_n_state)[0] 126 | pi = self.policy.get_pi(q) 127 | 128 | q_sa = np.dot(q, pi.T) 129 | 130 | returns += pow(self.discount, n_step) * q_sa 131 | 132 | targets[i, action] = returns 133 | 134 | return inputs, targets 135 | 136 | def get_action(self, state): 137 | return self.policy.get_action(q_values=self.model.predict(state)[0]) 138 | 139 | def get_model(self): 140 | return self.model 141 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import os.path 4 | from keras.models import Sequential 5 | from keras.layers.core import Dense, Dropout 6 | from keras.optimizers import SGD 7 | from keras.regularizers import L1L2, l1 8 | from Agent import Agent 9 | from Policy import EpsilonGreedyPolicy 10 | 11 | 12 | class 
Sarsa(Agent): 13 | def __init__(self, num_actions, max_memory=100, discount=.99, e_greedy=.1, load_model=True): 14 | super().__init__(max_memory, discount) 15 | self.num_actions = num_actions 16 | self.policy = EpsilonGreedyPolicy(number_of_action=num_actions, epsilon=e_greedy) 17 | self.next_action = None 18 | 19 | self.__create_model(load_model) 20 | 21 | def change_epsilon(self, epsilon): 22 | self.policy.epsilon = epsilon 23 | 24 | def __create_model(self, load_model): 25 | hidden_size = 100 26 | 27 | self.model = Sequential() 28 | self.model.add(Dense(hidden_size, input_shape=(2,), activation='relu', kernel_regularizer=l1(0.01))) 29 | self.model.add(Dense(hidden_size, activation='relu', kernel_regularizer=l1(0.01))) 30 | self.model.add(Dropout(rate=.1)) 31 | self.model.add(Dense(self.num_actions)) 32 | 33 | sgd = SGD(lr=0.0001, momentum=0.99) 34 | self.model.compile(optimizer=sgd, loss="mse") 35 | 36 | if load_model and os.path.exists("model.sarsa"): 37 | self.model.load_weights("model.sarsa") 38 | 39 | def set_learning_rate(self, lr): 40 | self.model.optimizer.lr = lr 41 | 42 | def save_model(self): 43 | # Save trained model weights and architecture, this will be used by the visualization code 44 | self.model.save_weights("model.sarsa", overwrite=True) 45 | with open("model.json", "w") as outfile: 46 | json.dump(self.model.to_json(), outfile) 47 | 48 | def episode(self, env, batch_size=10, n_step=1, epoch=0): 49 | loss = 0. 50 | env.reset() 51 | game_over = False 52 | # get initial input 53 | state = env.observe() 54 | action = self.get_action(state) 55 | 56 | step = 0 57 | while not game_over: 58 | # go to next time n_step 59 | step += 1 60 | # apply action, get rewards and new state 61 | next_state, reward, game_over = env.act(action) 62 | 63 | if reward == 100: 64 | next_action = None 65 | else: 66 | next_action = self.get_action(next_state) 67 | 68 | # store experience 69 | self.remember((state, action, reward, next_state, next_action), game_over) 70 | 71 | loss += self.train(current_step=step, batch_size=batch_size, n_step=n_step) 72 | print('Step {}| epoch {} | n_step {} | Loss {:.4f} |Pos {:.3f} | Act {}'.format( 73 | step, epoch, n_step, loss, next_state[0, 0], action - 1)) 74 | 75 | if np.math.isnan(loss) or step > 1500: 76 | break 77 | 78 | state = next_state 79 | action = next_action 80 | 81 | print("Episode finished!") 82 | print("************************") 83 | 84 | return step 85 | 86 | def train(self, current_step, batch_size=10, n_step=1): 87 | loss = 0. 
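# Training only starts once at least n_step transitions have been stored; get_batch then
# builds n-step SARSA targets G = r_t + gamma * r_{t+1} + ... + gamma^(n-1) * r_{t+n-1}
# + gamma^n * Q(s_{t+n}, a_{t+n}), bootstrapping on the action the policy actually chose.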
88 | 89 | model = self.get_model() 90 | 91 | if current_step - n_step >= 0: 92 | inputs, targets = self.get_batch(batch_size=batch_size, n_step=n_step) 93 | 94 | loss += model.train_on_batch(inputs, targets) 95 | 96 | return loss 97 | 98 | def get_batch(self, batch_size=10, n_step=1): 99 | len_memory = len( 100 | self.memory) - n_step + 1 # we don't want to update 'n' last states,because their returns have not seen yet 101 | num_actions = self.model.output_shape[-1] 102 | env_dim = self.memory[0][0][0].shape[1] 103 | 104 | inputs = np.zeros((min(len_memory, batch_size), env_dim)) 105 | targets = np.zeros((inputs.shape[0], num_actions)) 106 | 107 | for i, idx in enumerate(np.random.randint(0, len_memory, size=inputs.shape[0])): 108 | state, action, reward, next_state, next_action = self.memory[idx][0] 109 | game_over = self.memory[idx][1] 110 | 111 | inputs[i:i + 1] = state 112 | targets[i] = self.model.predict(state)[0] 113 | 114 | if game_over: # if it is last state 115 | targets[i, action] = reward 116 | else: 117 | returns = 0.0 118 | t_n_state = next_state 119 | t_n_action = next_action 120 | 121 | for t in range(idx, idx + n_step): 122 | _, _, reward, t_n_state, t_n_action = self.memory[t][0] 123 | returns += pow(self.discount, t - idx) * reward 124 | 125 | if reward != 100: # not self.memory[update_step + n_step - 1][1]: 126 | q = self.model.predict(t_n_state)[0] 127 | q_sa = q[t_n_action] 128 | 129 | returns += pow(self.discount, n_step) * q_sa 130 | 131 | targets[i, action] = returns 132 | 133 | return inputs, targets 134 | 135 | def get_action(self, state): 136 | return self.policy.get_action(q_values=self.model.predict(state)[0]) 137 | 138 | def get_model(self): 139 | return self.model 140 | -------------------------------------------------------------------------------- /lib.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reinforcement learning library 3 | 4 | Included methods: 5 | Temporal Difference 6 | Q Learning 7 | SARSA 8 | Double Q Learning 9 | """ 10 | import numpy as np 11 | import random 12 | from collections import defaultdict 13 | 14 | 15 | class policy(): 16 | def __init__(self, epsilon, action_space): 17 | 18 | self.epsilon=epsilon 19 | self.nA=action_space 20 | 21 | def probs(self,q_table,observation): 22 | A_probs = np.ones(self.nA, dtype=float) * self.epsilon / self.nA 23 | best_action = np.argmax(q_table[observation]) 24 | A_probs[best_action] += (1 - self.epsilon) 25 | 26 | return A_probs 27 | 28 | 29 | class q_learning_agent(): 30 | def __init__(self, epsilon, discount_factor, alpha, action_space): 31 | self.q_table = defaultdict(lambda: np.zeros(action_space)) 32 | self.epsilon = epsilon 33 | self.discount_factor = discount_factor 34 | self.action_space = action_space 35 | self.alpha = alpha 36 | self.policy = policy(self.epsilon, self.action_space) 37 | 38 | def learn(self, action, reward, state, next_state): 39 | next_action = np.argmax(self.q_table[next_state]) 40 | td_target = reward + self.discount_factor * self.q_table[next_state][next_action] 41 | td_error = td_target - self.q_table[state][action] 42 | self.q_table[state][action] += self.alpha * td_error 43 | 44 | def select_action(self,state): 45 | A_probs = self.policy.probs(self.q_table,state) 46 | 47 | return np.random.choice(np.arange(len(A_probs)), p=A_probs) 48 | 49 | def get_q_table(self): 50 | return self.q_table 51 | 52 | def set_q_table(self, q_table): 53 | self.q_table = q_table 54 | 55 | 56 | class sarsa_learning_agent(): 57 | def 
__init__(self, epsilon, discount_factor, alpha, action_space): 58 | self.q_table = defaultdict(lambda: np.zeros(action_space)) 59 | self.epsilon = epsilon 60 | self.discount_factor = discount_factor 61 | self.action_space = action_space 62 | self.alpha = alpha 63 | self.policy = policy(self.epsilon, self.action_space) 64 | 65 | def learn(self, action, reward, state, next_state): 66 | next_action = self.select_action(next_state) 67 | td_target = reward + self.discount_factor * self.q_table[next_state][next_action] 68 | td_error = td_target - self.q_table[state][action] 69 | self.q_table[state][action] += self.alpha * td_error 70 | 71 | def select_action(self,state): 72 | A_probs = self.policy.probs(self.q_table,state) 73 | 74 | return np.random.choice(np.arange(len(A_probs)), p=A_probs) 75 | 76 | def get_q_table(self): 77 | return self.q_table 78 | 79 | def set_q_table(self, q_table): 80 | self.q_table = q_table 81 | 82 | class double_q_learning_agent(): 83 | def __init__(self, epsilon, discount_factor, alpha, action_space): 84 | self.q_table_1 = defaultdict(lambda: np.zeros(action_space)) 85 | self.q_table_2 = defaultdict(lambda: np.zeros(action_space)) 86 | self.epsilon = epsilon 87 | self.discount_factor = discount_factor 88 | self.action_space = action_space 89 | self.alpha = alpha 90 | self.policy = policy(self.epsilon, self.action_space) 91 | 92 | def learn(self, action, reward, state, next_state): 93 | if random.randint(1, 2) == 1: 94 | next_action = np.argmax(self.q_table_1[next_state]) 95 | td_target = reward + self.discount_factor * self.q_table_2[next_state][next_action] 96 | td_error = td_target - self.q_table_1[state][action] 97 | self.q_table_1[state][action] += self.alpha * td_error 98 | else: 99 | next_action = np.argmax(self.q_table_2[next_state]) 100 | td_target = reward + self.discount_factor * self.q_table_1[next_state][next_action] 101 | td_error = td_target - self.q_table_2[state][action] 102 | self.q_table_2[state][action] += self.alpha * td_error 103 | 104 | def select_action(self, state): 105 | A_probs_1 = self.policy.probs(self.q_table_2, state) 106 | A_probs_2 = self.policy.probs(self.q_table_1, state) 107 | 108 | return np.random.choice(np.arange(len(A_probs_1)), p=(A_probs_1+A_probs_2)/2) 109 | 110 | def get_q_tables(self): 111 | return [self.q_table_1, self.q_table_2] 112 | 113 | def set_q_table(self, q_table_1, q_table_2): 114 | self.q_table_1 = q_table_1 115 | self.q_table_2 = q_table_2 116 | 117 | 118 | class Expected_Sarsa_learning_agent(): 119 | def init(self, epsilon, discount_factor,alpha, action_space): 120 | self.q_table = defaultdict(lambda: np.zeros(action_space)) 121 | self.epsilon = epsilon 122 | self.discount_factor = discount_factor 123 | self.action_space = action_space 124 | self.alpha = alpha 125 | self.policy = policy(self.epsilon, self.action_space) 126 | 127 | def learn(self, action, reward, state, next_state): 128 | A_probs = self.policy.probs(self.q_table,next_state) 129 | expected_value = np.dot(A_probs,self.q_table[next_state]) 130 | td_target = reward + self.discount_factor * expected_value 131 | td_error = td_target - self.q_table[state][action] 132 | self.q_table[state][action] += self.alpha * td_error 133 | 134 | def select_action(self,state): 135 | A_probs = self.policy.probs(self.q_table, state) 136 | return np.random.choice(np.arange(len(A_probs)), p=A_probs) 137 | 138 | def get_Q_table(self): 139 | return self.q_table 140 | 141 | def set_Q_table(self, q_table): 142 | self.q_table = q_table 143 | 144 | 145 | 
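# A minimal, self-contained sketch (with made-up numbers) of the Expected SARSA update used by
# Expected_Sarsa_learning_agent.learn above: the bootstrap term is the epsilon-greedy expectation
# over the next state's action values rather than their maximum.
import numpy as np

epsilon, discount_factor, alpha = 0.3, 0.9, 0.5
q_next = np.array([1.0, 2.0, 0.5, 0.0])    # hypothetical Q[next_state]
q_sa = 0.2                                 # hypothetical Q[state][action]
reward = 1.0

probs = np.ones(4) * epsilon / 4           # epsilon spread uniformly over the 4 actions...
probs[np.argmax(q_next)] += 1 - epsilon    # ...plus the extra mass on the greedy action
expected_value = np.dot(probs, q_next)     # E_pi[Q(s', .)] = 1.6625 here
td_target = reward + discount_factor * expected_value
q_sa += alpha * (td_target - q_sa)         # same TD update rule as learn()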
--------------------------------------------------------------------------------
/Deep-RL-Mountain-Car/double_q.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os.path
3 | import json
4 | 
5 | from keras.models import Sequential
6 | from keras.layers.core import Dense
7 | from keras.optimizers import sgd
8 | from keras.regularizers import l1
9 | 
10 | from Agent import Agent
11 | from Policy import EpsilonGreedyPolicy
12 | 
13 | 
14 | class DQLearning(Agent):
15 |     def __init__(self, num_actions, max_memory=100, discount=.99, e_greedy=.1, load_model=True):
16 |         super().__init__(max_memory, discount)
17 |         self.num_actions = num_actions
18 |         self.epsilon = e_greedy
19 |         self.policy = EpsilonGreedyPolicy(number_of_action=num_actions, epsilon=e_greedy)
20 | 
21 |         self.__create_model(load_model)
22 |         self.current_model = self.model
23 | 
24 |     def change_epsilon(self, epsilon):
25 |         self.policy.epsilon = epsilon
26 | 
27 |     def episode(self, env, batch_size=10, n_step=1, epoch=0):
28 |         loss = 0.
29 |         env.reset()
30 |         game_over = False
31 |         # get initial input
32 |         state = env.observe()
33 | 
34 |         step = 0
35 |         while not game_over:
36 |             # advance to the next time step
37 |             step += 1
38 |             action = self.get_action(state)
39 |             # apply action, get rewards and new state
40 |             next_state, reward, game_over = env.act(action)
41 | 
42 |             # store experience
43 |             self.remember((state, action, reward, next_state), game_over)
44 | 
45 |             loss += self.train(current_step=step, batch_size=batch_size, n_step=n_step)
46 |             print('Step {}| epoch {} | n_step {} | Loss {:.4f} |Pos {:.3f} | Act {}'.format(
47 |                 step, epoch, n_step, loss, next_state[0, 0], action - 1))
48 | 
49 |             if np.math.isnan(loss) or step > 1500:
50 |                 break
51 | 
52 |             state = next_state
53 | 
54 |         print("Episode finished!")
55 |         print("************************")
56 | 
57 |         return step
58 | 
59 |     def train(self, current_step, batch_size=10, n_step=1):
60 |         loss = 0.
61 | 
62 |         model = self.get_model()
63 | 
64 |         inputs, targets = self.get_batch(batch_size=batch_size, n_step=n_step)
65 | 
66 |         loss += model.train_on_batch(inputs, targets)
67 | 
68 |         return loss
69 | 
70 |     def get_batch(self, batch_size=10, n_step=1):
71 |         len_memory = len(self.memory)
72 |         num_actions = self.model.output_shape[-1]
73 | 
74 |         env_dim = self.memory[0][0][0].shape[1]
75 |         inputs = np.zeros((min(len_memory, batch_size), env_dim))
76 |         targets = np.zeros((inputs.shape[0], num_actions))
77 | 
78 |         for i, idx in enumerate(np.random.randint(0, len_memory, size=inputs.shape[0])):
79 |             state, action, reward, next_state = self.memory[idx][0]
80 |             game_over = self.memory[idx][1]
81 | 
82 |             inputs[i:i + 1] = state
83 |             targets[i] = self.current_model.predict(state)[0]  # baseline targets come from the network being updated
84 | 
85 |             if game_over:  # terminal transition: the target is just the reward
86 |                 targets[i, action] = reward
87 |             else:
88 |                 q1 = self.model.predict(next_state)[0]
89 |                 q2 = self.model2.predict(next_state)[0]
90 | 
91 |                 if self.current_model == self.model:
92 |                     best_action = np.argmax(q1)
93 |                     q_sa = q2[best_action]
94 |                 else:
95 |                     best_action = np.argmax(q2)
96 |                     q_sa = q1[best_action]
97 | 
98 |                 targets[i, action] = reward + self.discount * q_sa
99 | 
100 |         return inputs, targets
101 | 
102 |     def get_action(self, state):
103 |         q1 = self.model.predict(state)[0]
104 |         q2 = self.model2.predict(state)[0]
105 | 
106 |         return self.policy.get_action(q_values=q1+q2)
107 | 
108 |     def get_model(self):
109 |         coin = np.random.randint(2)  # randomly pick which network is updated on this step
110 |         if coin == 0:
111 |             self.current_model = self.model
112 |         else:
113 |             self.current_model = self.model2
114 | 
115 |         return self.current_model
116 | 
117 |     def __create_model(self, load_model=True):
118 |         hidden_size = 100
119 | 
120 |         self.model = Sequential()
121 |         self.model.add(Dense(hidden_size, input_shape=(2,), activation='relu', kernel_regularizer=l1(0.01)))
122 |         self.model.add(Dense(hidden_size, activation='relu', kernel_regularizer=l1(0.01)))
123 |         # self.model.add(Dense(hidden_size, activation='relu'))
124 |         self.model.add(Dense(self.num_actions))
125 |         self.model.compile(sgd(lr=0.0001), "mse")
126 | 
127 |         if load_model and os.path.exists("model.dqlearning1"):
128 |             self.model.load_weights("model.dqlearning1")
129 | 
130 |         self.model2 = Sequential()
131 |         self.model2.add(Dense(hidden_size, input_shape=(2,), activation='relu', kernel_regularizer=l1(0.01)))
132 |         self.model2.add(Dense(hidden_size, activation='relu', kernel_regularizer=l1(0.01)))
133 |         # self.model2.add(Dense(hidden_size, activation='relu'))
134 |         self.model2.add(Dense(self.num_actions))
135 |         self.model2.compile(sgd(lr=0.0001), "mse")
136 | 
137 |         if load_model and os.path.exists("model.dqlearning2"):
138 |             self.model2.load_weights("model.dqlearning2")
139 | 
140 |     def save_model(self):
141 |         # Save trained model weights and architecture, this will be used by the visualization code
142 |         self.model.save_weights("model.dqlearning1", overwrite=True)
143 |         self.model2.save_weights("model.dqlearning2", overwrite=True)
144 | 
145 |         with open("model3.json", "w") as outfile:
146 |             json.dump(self.model.to_json(), outfile)
147 | 
148 |         with open("model4.json", "w") as outfile:
149 |             json.dump(self.model2.to_json(), outfile)
--------------------------------------------------------------------------------
/introduction_to_rl/tic_tac_toe.py:
--------------------------------------------------------------------------------
1 | # Q-Learning for TicTacToe
2 | 
3 | import numpy as np
4 | import random
5 | 
6 | 
7 | class Env():
8 |     def __init__(self, random_play):
9 |         self.random_play = random_play
10 |         self.state = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
11 |         self.reward = 0.5
12 |         self.done = 0
13 |         self.random_agent_value = "O"
14 |         self.winner = "Random Agent"
15 | 
16 |     def step(self, action, agent_value):
17 |         # action is an array like [0, 1]: the (row, column) cell to play
18 |         if(agent_value == "X" and self.control_win(self.state) == self.random_agent_value):
19 |             self.reward = 0
20 |             self.done = 1
21 | 
22 |         if(self.state[action[0]][action[1]] != 0):
23 |             print("There is something wrong: this cell is not available for playing.")
24 |         else:
25 |             self.state[action[0]][action[1]] = agent_value
26 | 
27 |         if(self.control_win(self.state) == agent_value):
28 |             self.reward = 1
29 |             self.done = 1
30 |             self.winner = "agent X"
31 |         if(self.control_win(self.state) == "DRAW"):
32 |             self.reward = 0
33 |             self.done = 1
34 |             self.winner = "DRAW"
35 |         if(self.done != 1):
36 |             if(self.random_play == 1):
37 |                 self.random_agent_act()
38 | 
39 |         return(self.state, self.reward, self.done, self.winner)
40 | 
41 |     def control_win(self, state):
42 |         for i in range(3):
43 |             if state[i][0] != 0 and state[i][0] == state[i][1] and state[i][0] == state[i][2]:
44 |                 return state[i][0]
45 |             if state[0][i] != 0 and state[0][i] == state[1][i] and state[0][i] == state[2][i]:
46 |                 return state[0][i]
47 |             if state[0][0] != 0 and state[0][0] == state[1][1] and state[0][0] == state[2][2]:
48 |                 return state[0][0]
49 |             if state[0][2] != 0 and state[0][2] == state[1][1] and state[0][2] == state[2][0]:
50 |                 return state[0][2]
51 | 
52 |         empty_spaces = []
53 |         values_of_empty_spaces = []
54 | 
55 |         for i in range(3):
56 |             for j in range(3):
57 |                 if state[i][j] == 0:
58 |                     empty_spaces.append([i, j])
59 |                     values_of_empty_spaces.append(0)
60 |         if (len(values_of_empty_spaces) == 0):
61 |             return "DRAW"
62 |         else:
63 |             return 0
64 | 
65 |     def random_agent_act(self):
66 |         empty_spaces = []
67 |         values_of_empty_spaces = []
68 | 
69 |         for i in range(3):
70 |             for j in range(3):
71 |                 if self.state[i][j] == 0:
72 |                     empty_spaces.append([i, j])
73 |                     values_of_empty_spaces.append(0)
74 | 
75 |         a = empty_spaces[random.choice(list(enumerate(values_of_empty_spaces)))[0]]
76 | 
77 |         self.state[a[0]][a[1]] = self.random_agent_value
78 | 
79 | 
80 | def learn_from_random_agent():
81 |     """
82 | 
83 |     The purpose of this function is to create an empty value function and, by playing many games,
84 |     update its contents at every played step with the rule V(s) ← V(s) + α[V(s') − V(s)].
85 |     Learning is carried out against an opponent that makes random moves.
86 | 
87 |     alpha ====> learning rate
88 |     The learning rate is the coefficient of the [V(s') − V(s)] term in each value-function update,
89 |     i.e. how much weight every new experience leaves on what has been learned so far.
90 | 
91 |     epsilon ====> random-exploration rate
92 |     The epsilon-greedy policy we use keeps us from getting stuck on the first maximum we find and lets us
93 |     explore the rewards of the environment with probability epsilon.
94 | 
95 |     value_table ====> a matrix holding one scalar value per state of the environment, describing how good it is
96 |     to be in that state. If the environment has 9 states then ===> len(value_table) = 9
97 | 
98 |     episodes =====> how many games are played during the learning phase.
99 | 
100 |     This function learns the value function by playing against the random agent.
101 |     The random agent simply plays random moves on the valid (empty) cells.
102 |     The value-function estimate is updated with ======> V(s) ← V(s) + α[V(s') − V(s)]
103 | 
104 |     action -----> since an action is written as [i, j], the action (i.e. the resulting state) is used in the value-function update.
105 |     """
106 | 
107 |     episodes = 100
108 |     value_table = [0, 0, 0, 0, 0, 0, 0, 0, 0]
109 | 
110 |     for i in range(episodes):
111 |         eps = 0.1
112 |         env = Env(random_play=1)
113 |         total_reward = 0
114 | 
115 |         done = 0
116 |         agent_value = "X"  # it must be X for random play
117 |         state = env.state
118 |         old_a = [0, 0]
119 |         alfa = 0.99
120 | 
121 |         while not done:
122 |             empty_spaces = []
123 |             values_of_empty_spaces = []
124 | 
125 |             for i in range(3):
126 |                 for j in range(3):
127 |                     if state[i][j] == 0:
128 |                         empty_spaces.append([i, j])
129 |                         values_of_empty_spaces.append(value_table[i * 3 + j])
130 | 
131 |             if np.random.random() < eps or np.sum(value_table) == 0:
132 |                 a = empty_spaces[random.choice(list(enumerate(values_of_empty_spaces)))[0]]
133 |             else:
134 |                 # select the action leading to the cell with the largest value
135 |                 a = empty_spaces[np.argmax(values_of_empty_spaces)]
136 | 
137 |             new_s, reward, done, winner = env.step(a, agent_value)
138 |             total_reward = reward + total_reward
139 | 
140 |             if (state != [[0, 0, 0], [0, 0, 0], [0, 0, 0]]):
141 |                 value_table[old_a[0] * 3 + old_a[1]] = value_table[old_a[0] * 3 + old_a[1]] + alfa*(reward - value_table[old_a[0] * 3 + old_a[1]])
142 |             else:
143 |                 value_table[a[0] * 3 + a[1]] = alfa * reward
144 | 
145 |             old_a = a
146 |             state = new_s
147 | 
148 |             if(done == 1):
149 |                 print("Winner is: " + str(winner))
150 |                 print(value_table)
151 | 
152 | 
153 | learn_from_random_agent()
154 | 
--------------------------------------------------------------------------------
/temporal_difference/CartPole.py:
--------------------------------------------------------------------------------
1 | 
2 | # Solving CartPole problem from OpenAI with Temporal Difference methods
3 | # Implementations are based on Sutton's book
4 | 
5 | import gym
6 | import numpy as np
7 | 
8 | 
9 | env = gym.make('CartPole-v0')
10 | 
11 | 
12 | # General function definitions
13 | 
14 | 
15 | def discretization(observation):
16 |     discrete = np.zeros((1, 2))
17 |     theta_bins = np.linspace(-0.42, 0.42, 20)
18 |     thetadot_bins = np.linspace(-1, 1, 10)
19 |     discrete[0][0] = np.digitize(observation[2], theta_bins)
20 |     discrete[0][1] = np.digitize(observation[3], thetadot_bins)
21 | 
22 |     return discrete.astype(np.int64)
23 | 
24 | 
25 | def make_epsilon_greedy_policy(Q, epsilon, nA):
26 |     """
27 |     Creates an epsilon-greedy policy based on a given Q-function and epsilon.
28 | 
29 |     Args:
30 |         Q: A numpy array indexed by the discretized state;
31 |             Q[theta_bin][thetadot_bin] is an array of action-values of length nA.
32 |         epsilon: The probability of selecting a random action. Float between 0 and 1.
33 |         nA: Number of actions in the environment.
34 | 
35 |     Returns:
36 |         A function that takes the discretized observation as an argument and returns
37 |         the probabilities for each action in the form of a numpy array of length nA.
38 | 
39 |     """
40 |     def policy_fn(observation):
41 |         A_probs = np.ones(nA, dtype=float) * epsilon / nA
42 |         best_action = np.argmax(Q[observation[0][0]][observation[0][1]])
43 |         A_probs[best_action] += (1.0 - epsilon)
44 |         return A_probs  # .reshape(1,-1)
45 | 
46 |     return policy_fn
47 | 
48 | 
49 | def render_games(Q, num_episodes):
50 |     scores = []
51 |     observations = []
52 |     for i_episode in range(num_episodes):
53 |         score = 0
54 |         observation = env.reset()
55 |         for t in range(200):
56 |             policy = make_epsilon_greedy_policy(Q, 0, env.action_space.n)
57 |             env.render()
58 |             state = discretization(observation)
59 |             action_probs = policy(state)
60 |             action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
61 |             observation, reward, done, info = env.step(action)
62 |             observations.append(observation)
63 |             score += reward
64 |             if done:
65 |                 break
66 |         scores.append(score)
67 |     print('Average Score:', sum(scores)/len(scores))
68 | 
69 | 
70 | # Define models
71 | 
72 | # SARSA
73 | def Sarsa(env, num_episodes, discount_factor=0.9, alpha=0.5, epsilon=0.1):
74 |     # Q table initialization
75 |     Q_sarsa = np.zeros((21, 11, env.action_space.n))
76 | 
77 |     policy = make_epsilon_greedy_policy(Q_sarsa, epsilon, env.action_space.n)
78 | 
79 |     for i_episode in range(num_episodes):
80 |         observation = env.reset()
81 |         state = discretization(observation)
82 | 
83 |         while True:
84 |             # Take a step
85 |             action_probs = policy(state)
86 |             action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
87 |             next_observation, reward, done, _ = env.step(action)
88 |             next_state = discretization(next_observation)
89 | 
90 |             # Sarsa Update
91 |             next_action_probs = policy(next_state)
92 |             next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
93 |             td_target = reward + discount_factor * Q_sarsa[next_state[0][0]][next_state[0][1]][next_action]
94 |             td_delta = td_target - Q_sarsa[state[0][0]][state[0][1]][action]
95 |             Q_sarsa[state[0][0]][state[0][1]][action] += alpha * td_delta
96 | 
97 |             if done:
98 |                 break
99 | 
100 |             state = next_state
101 | 
102 |     return Q_sarsa
103 | 
104 | 
105 | # Q Learning
106 | def Q_learning(env, num_episodes, discount_factor=0.9, alpha=0.5, epsilon=0.1):
107 |     # Q table initialization
108 |     Q = np.zeros((21, 11, env.action_space.n))
109 | 
110 |     # The policy we're following
111 |     policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
112 | 
113 |     for i_episode in range(num_episodes):
114 |         # Reset the environment and pick the first action
115 |         observation = env.reset()
116 |         state = discretization(observation)
117 | 
118 |         while True:
119 |             # Take a step
120 |             action_probs = policy(state)
121 |             action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
122 |             next_observation, reward, done, _ = env.step(action)
123 |             next_state = discretization(next_observation)
124 | 
125 |             # Q Learning Update
126 |             best_next_action = np.argmax(Q[next_state[0][0]][next_state[0][1]])
127 |             td_target = reward + discount_factor * Q[next_state[0][0]][next_state[0][1]][best_next_action]
128 |             td_delta = td_target - Q[state[0][0]][state[0][1]][action]
129 |             Q[state[0][0]][state[0][1]][action] += alpha * td_delta
130 | 
131 |             if done:
132 |                 break
133 | 
134 |             state = next_state
135 | 
136 |     return Q
137 | 
138 | 
139 | # Expected SARSA
140 | def Expected_Sarsa(env, num_episodes, discount_factor=1, alpha=0.5, epsilon=0.1):
141 |     # Q Table Initialization
142 |     Q_expsarsa = np.zeros((21, 11, env.action_space.n))
143 | 
144 |     policy = make_epsilon_greedy_policy(Q_expsarsa, epsilon, env.action_space.n)
145 | 
146 |     for i_episode in range(num_episodes):
147 |         observation = env.reset()
148 |         state = discretization(observation)
149 | 
150 |         while True:
151 |             # Take a step
152 |             action_probs = policy(state)
153 |             action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
154 |             next_observation, reward, done, _ = env.step(action)
155 |             next_state = discretization(next_observation)
156 | 
157 |             # Expected Sarsa Update
158 |             next_action_prob = policy(next_state)
159 |             expected_value = np.sum(np.multiply(Q_expsarsa[next_state[0][0]][next_state[0][1]][:], next_action_prob))
160 |             td_target = reward + discount_factor * expected_value
161 |             td_delta = td_target - Q_expsarsa[state[0][0]][state[0][1]][action]
162 |             Q_expsarsa[state[0][0]][state[0][1]][action] += alpha * td_delta
163 | 
164 |             if done:
165 |                 break
166 | 
167 |             state = next_state
168 | 
169 |     return Q_expsarsa
170 | 
171 | 
172 | print('Starting training.')
173 | 
174 | # Train
175 | Q_s = Sarsa(env, 1200)
176 | Q_q = Q_learning(env, 600)
177 | Q_es = Expected_Sarsa(env, 400)
178 | 
179 | print('Training done.')
180 | print('Starting testing.')
181 | 
182 | # Test
183 | render_games(Q_s, 5)
184 | render_games(Q_q, 5)
185 | render_games(Q_es, 5)
186 | 
187 | print('Testing done.')
188 | 
--------------------------------------------------------------------------------
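As a self-contained illustration of the Expected SARSA target used in Expected_Sarsa above (the bootstrap term is the epsilon-greedy expectation of Q(s', ·) rather than a sampled or maximal action value), the sketch below recomputes it for made-up numbers; the helper name and the values are illustrative and not part of the repository.

# Illustrative helper (not part of the repo): the Expected SARSA target
# target = r + gamma * sum_a pi(a | s') * Q(s', a), with pi epsilon-greedy in Q(s', .)
import numpy as np

def expected_sarsa_target(reward, q_next, epsilon, discount_factor):
    n_actions = len(q_next)
    probs = np.ones(n_actions) * epsilon / n_actions   # exploration mass spread over all actions
    probs[np.argmax(q_next)] += 1.0 - epsilon          # remaining mass on the greedy action
    return reward + discount_factor * np.dot(probs, q_next)

# With q_next = [1.0, 3.0], epsilon = 0.1 and discount_factor = 0.9:
# pi = [0.05, 0.95], E[Q(s', .)] = 0.05*1.0 + 0.95*3.0 = 2.9, target = 1.0 + 0.9*2.9 = 3.61
print(expected_sarsa_target(1.0, np.array([1.0, 3.0]), 0.1, 0.9))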