├── Deep-RL-Mountain-Car ├── Policy.py ├── Agent.py ├── MCtest.py ├── MountainCar.py ├── QLearning.py ├── README.md ├── TrainModel.py ├── main.py ├── expected_sarsa.py ├── sarsa.py └── double_q.py ├── multi_armed_bandits ├── bandit.py └── policy.py ├── .gitignore ├── README.md ├── finite_markov_decision_processes └── cartpole.py ├── function_approximation ├── mountain_car_tile_coding.py └── tiles3.py ├── temporal_difference ├── blackjack.py ├── blackjack_figures.py └── CartPole.py ├── lib.py └── introduction_to_rl └── tic_tac_toe.py /Deep-RL-Mountain-Car/Policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class EpsilonGreedyPolicy: 5 | def __init__(self, number_of_action, epsilon=.1, ): 6 | self.epsilon = epsilon 7 | self.number_of_action = number_of_action 8 | 9 | def get_action(self, q_values): 10 | pi = self.get_pi(q=q_values) 11 | 12 | return np.random.choice(np.arange(self.number_of_action), p=pi) 13 | 14 | def get_pi(self, q): 15 | pi = np.ones(self.number_of_action, dtype=float) # init values 16 | pi *= self.epsilon / self.number_of_action # probability of random action 17 | 18 | greedy_action = np.argmax(q) 19 | pi[greedy_action] += (1.0 - self.epsilon) # probability of greedy action 20 | 21 | return pi 22 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/Agent.py: -------------------------------------------------------------------------------- 1 | class Agent(object): 2 | def __init__(self, max_memory=100, discount=.99): 3 | self.max_memory = max_memory 4 | self.memory = list() 5 | self.discount = discount 6 | 7 | def remember(self, states, game_over): 8 | # memory[i] = [[state_t, action_t, reward_t, state_t+1], game_over?] 9 | self.memory.append([states, game_over]) 10 | if len(self.memory) > self.max_memory: 11 | del self.memory[0] 12 | 13 | def __create_model(self, load_model): 14 | pass 15 | 16 | def episode(self, env, batch_size=10, n_step=1, epoch=0): 17 | pass 18 | 19 | def save_model(self): 20 | pass 21 | 22 | def train(self, current_step, batch_size=10, n_step=1): 23 | pass 24 | 25 | def get_batch(self, batch_size=10, n_step=1): 26 | pass 27 | 28 | def get_action(self, state): 29 | pass 30 | 31 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/MCtest.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | from keras.models import model_from_json 4 | from MountainCar import MountainCar 5 | 6 | 7 | if __name__ == "__main__": 8 | # Initialize parameters 9 | 10 | # working with any parameters below, not nessecarily the parameters set during the training, it seems like neural network is able to generalize to othe initializations as well 11 | Xrange = [-1.5, 0.55] 12 | Vrange = [-0.7, 0.7] 13 | start = [-0.5, -0.1] 14 | goal = [0.45] 15 | 16 | with open("model.json", "r") as jfile: 17 | model = model_from_json(json.load(jfile)) 18 | model.load_weights("model.h5") 19 | model.compile("sgd", "mse") 20 | 21 | # Define environment, game 22 | env = MountainCar(start, goal, Xrange, Vrange) 23 | 24 | for e in range(10): 25 | c = 0 26 | loss = 0. 
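# Each test episode below is a purely greedy rollout: the environment is reset, the saved
# Keras model is queried for Q-values at the current (position, velocity) observation, and
# the argmax action is applied until the environment reports game_over, with `c` counting the steps.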
27 | env.reset() 28 | game_over = False 29 | # get initial input 30 | input_t = env.observe() 31 | 32 | c += 1 33 | while not game_over: 34 | input_tm1 = input_t 35 | 36 | # get next action 37 | q = model.predict(input_tm1) 38 | action = np.argmax(q[0]) 39 | 40 | # apply action, get rewards and new state 41 | input_t, reward, game_over = env.act(action) 42 | 43 | c += 1 44 | print("Episode %d, Steps %d" %(e, c)) -------------------------------------------------------------------------------- /multi_armed_bandits/bandit.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | 5 | class bandit: 6 | def __init__(self, kArm=10, epsilon=0, initial=0., ucb=0., variance=1, min=-2, max=2): 7 | self.epsilon = epsilon 8 | self.ucb = ucb 9 | self.kArm = kArm 10 | self.variance = variance 11 | 12 | self.selectActions = np.zeros(self.kArm) 13 | self.totalTS = 0 14 | self.steps = np.zeros(self.kArm) 15 | self.qTable = np.full(self.kArm, initial) 16 | self.armMeans = np.random.uniform(min, max, self.kArm) 17 | 18 | def takeAction(self): 19 | if self.epsilon > 0 and np.random.binomial(1, self.epsilon) == 1: 20 | idx = np.random.choice(self.kArm) 21 | elif self.ucb > 0: 22 | actions = [] 23 | for idx in np.arange(self.kArm): 24 | if self.selectActions[idx] == 0: 25 | actions.append(1000) 26 | break 27 | else: 28 | actions.append(self.qTable[idx] + self.ucb * math.sqrt(math.log(self.totalTS) / self.selectActions[idx])) 29 | 30 | idx = np.argmax(np.asarray(actions)) 31 | self.selectActions[idx] += 1 32 | else: 33 | idx = np.argmax(self.qTable) 34 | 35 | reward = np.random.normal(self.armMeans[idx], self.variance) 36 | 37 | self.totalTS += 1 38 | self.steps[idx] += 1 39 | self.qTable[idx] += (1 / self.steps[idx]) * (reward - self.qTable[idx]) 40 | 41 | return reward 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning Türkiye - Reinforcement Learning Project 2 | 3 | This repository consists projects from Deep Learning Türkiye - Reinforcement Learning Group. Enter folders to see each project's details. 4 | 5 | ## 1. Introduction To RL 6 | Simple tic tac toe example. Learns via Value Function at the moment. Policy Search *TODO*. 7 | Benefited from [tansey](https://github.com/tansey/rl-tictactoe/blob/master/tictactoe.py). 8 | 9 | ## 2. Multi-Armed Bandits 10 | Provides the underlying testbed for bandit problem. 11 | 12 | ## 3. Finite Markov Decision Processes 13 | Uses the OpenAI Gym. Learns via Q-Learning. 14 | 15 | ## 4. Temporal Difference 16 | Multiple approaches to CartPole problem. 17 | Benefited from [dennybritz](https://github.com/dennybritz/reinforcement-learning). 18 | 19 | ## Library usage 20 | You can find example usage below. 
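Note that the agents keep tabular Q-values in dictionaries keyed by state, so they assume a discrete (hashable) observation space such as FrozenLake's.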
21 | 22 | ``` 23 | import gym 24 | from lib import q_learning_agent, double_q_learning_agent, sarsa_learning_agent 25 | 26 | env = gym.make("FrozenLake-v0") 27 | env.reset() 28 | 29 | def train(agent): 30 | for i_episode in range(1000): 31 | state = env.reset() 32 | while True: 33 | action = agent.select_action(state) 34 | next_state, reward, done, _ = env.step(action) 35 | agent.learn(action, reward, state, next_state) 36 | if done: 37 | break 38 | state = next_state 39 | 40 | qla = q_learning_agent(epsilon=0.3, discount_factor=0.9, alpha=0.5, action_space=env.action_space.n) 41 | sla = sarsa_learning_agent(epsilon=0.3, discount_factor=0.9, alpha=0.5, action_space=env.action_space.n) 42 | dqla = double_q_learning_agent(epsilon=0.3, discount_factor=0.9, alpha=0.5, action_space=env.action_space.n) 43 | 44 | train(qla) 45 | train(sla) 46 | train(dqla) 47 | ``` -------------------------------------------------------------------------------- /multi_armed_bandits/policy.py: -------------------------------------------------------------------------------- 1 | from bandit import bandit 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | run = 2000 7 | ts = 1000 8 | 9 | variance = 1 10 | limMin = 0 11 | limMax = 3 12 | 13 | 14 | def greedyPolicy(): 15 | averageRewards = np.zeros(ts) 16 | 17 | for num in range(run): 18 | bnd = bandit(variance=variance, min=limMin, max=limMax) 19 | 20 | for t in range(ts): 21 | averageRewards[t] += bnd.takeAction() 22 | 23 | return averageRewards / run 24 | 25 | 26 | def epsilonGreedyPolicy(epsilon): 27 | averageRewards = np.zeros(ts) 28 | 29 | for num in range(run): 30 | bnd = bandit(variance=variance, min=limMin, max=limMax, epsilon=epsilon) 31 | 32 | for t in range(ts): 33 | averageRewards[t] += bnd.takeAction() 34 | 35 | return averageRewards / run 36 | 37 | 38 | def optimisticInitialValues(initial): 39 | averageRewards = np.zeros(ts) 40 | 41 | for num in range(run): 42 | bnd = bandit(variance=variance, min=limMin, max=limMax, initial=initial) 43 | 44 | for t in range(ts): 45 | averageRewards[t] += bnd.takeAction() 46 | 47 | return averageRewards / run 48 | 49 | 50 | def upperConfidenceBound(ucb): 51 | averageRewards = np.zeros(ts) 52 | 53 | for num in range(run): 54 | bnd = bandit(variance=variance, min=limMin, max=limMax, ucb=ucb) 55 | 56 | for t in range(ts): 57 | averageRewards[t] += bnd.takeAction() 58 | 59 | return averageRewards / run 60 | 61 | 62 | # plt.plot(greedyPolicy(), color='r') 63 | plt.plot(epsilonGreedyPolicy(0.1), color='b') 64 | # plt.plot(epsilonGreedyPolicy(0.01), color='g') 65 | # plt.plot(optimisticInitialValues(5), color='g') 66 | plt.plot(upperConfidenceBound(2), color='r') 67 | 68 | 69 | plt.show() 70 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/MountainCar.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | 4 | 5 | class MountainCar(object): 6 | def __init__(self, start, goal, Xrange, Vrange): 7 | self.start = start 8 | self.goal = goal 9 | self.Xrange = Xrange 10 | self.Vrange = Vrange 11 | self.num_actions = 3 12 | 13 | def _DoAction(self, action): 14 | # MountainCarDoAction: executes the action (a) into the mountain car 15 | # a: is the force to be applied to the car 16 | # x: is the vector containning the position and speed of the car 17 | # xp: is the vector containing the new position and velocity of the car 18 | 19 | position = self.state[0] 20 | speed = self.state[1] 21 | 22 
| # bounds for position 23 | bpleft = self.Xrange[0] 24 | 25 | # bounds for speed 26 | bsleft = self.Vrange[0] 27 | bsright = self.Vrange[1] 28 | speedt1 = speed + (0.001 * (action - 1)) + (-0.0025 * math.cos(3.0 * position)) 29 | speedt1 = speedt1 * 0.999 # thermodynamic law, for a more real system with friction. 30 | 31 | if speedt1 < bsleft: 32 | speedt1 = bsleft 33 | elif speedt1 > bsright: 34 | speedt1 = bsright 35 | 36 | post1 = position + speedt1 37 | 38 | if post1 <= bpleft: 39 | post1 = bpleft 40 | speedt1 = 0.0 41 | 42 | xp = np.array([post1, speedt1]) 43 | self.state = xp 44 | 45 | def _GetReward(self): 46 | # MountainCarGetReward returns the reward at the current state 47 | # x: a vector of position and velocity of the car 48 | # r: the returned reward. 49 | # f: true if the car reached the goal, otherwise f is false 50 | 51 | position = self.state[0] 52 | # bound for position; the goal is to reach position = 0.45 53 | bpright = self.goal 54 | 55 | r = -1 56 | f = False 57 | 58 | if position >= bpright: 59 | r = 100 60 | f = True 61 | 62 | return r, f 63 | 64 | def act(self, action): 65 | self._DoAction(action) 66 | reward, game_over = self._GetReward() 67 | return self.observe(), reward, game_over 68 | 69 | def observe(self): 70 | return self.state.reshape((1, -1)) 71 | 72 | def reset(self): 73 | self.state = np.array([self.start[0], self.start[1]]) 74 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/QLearning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.models import Sequential 3 | from keras.layers.core import Dense 4 | from keras.optimizers import sgd 5 | from Agent import Agent 6 | 7 | 8 | class QLearning(Agent): 9 | 10 | def __init__(self, num_actions, max_memory=100, discount=.99, e_greedy=.1): 11 | super().__init__(max_memory, discount) 12 | self.num_actions = num_actions 13 | self.epsilon = e_greedy 14 | 15 | self.__create_model() 16 | 17 | def __create_model(self, load_model=True): 18 | hidden_size = 100 19 | 20 | self.model = Sequential() 21 | self.model.add(Dense(hidden_size, input_shape=(2,), activation='relu')) 22 | self.model.add(Dense(hidden_size, activation='relu')) 23 | self.model.add(Dense(self.num_actions)) 24 | self.model.compile(sgd(lr=0.01), "mse") 25 | 26 | def get_batch(self, batch_size=10, n_step=1): 27 | len_memory = len(self.memory)-n_step # we don't want to update 'n' last states, because their returns not calculated yet 28 | num_actions = self.model.output_shape[-1] 29 | 30 | env_dim = self.memory[0][0][0].shape[1] 31 | inputs = np.zeros((min(len_memory, batch_size), env_dim)) 32 | targets = np.zeros((inputs.shape[0], num_actions)) 33 | 34 | for i, idx in enumerate(np.random.randint(0, len_memory, size=inputs.shape[0])): 35 | state, action, reward, next_state = self.memory[idx][0] 36 | game_over = self.memory[idx][1] 37 | 38 | inputs[i:i + 1] = state 39 | # There should be no target values for actions not taken. 
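# In the loop below, each sampled transition first copies the network's current predictions
# into its target row and then overwrites only the entry of the action actually taken with the
# one-step Q-learning target reward + discount * max_a' Q(s', a') (or just the reward at the
# end of an episode), so the MSE loss leaves the other actions' values unchanged.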
40 | # Thou shalt not correct actions not taken #deep 41 | targets[i] = self.model.predict(state)[0] 42 | Q_sa = np.max(self.model.predict(next_state)[0]) 43 | 44 | if game_over: # if game_over is True 45 | targets[i, action] = reward 46 | else: 47 | # reward + gamma * max_a' Q(s', a') 48 | targets[i, action] = reward + self.discount * Q_sa 49 | 50 | return inputs, targets 51 | 52 | def get_action(self, state): 53 | if np.random.rand() <= self.epsilon: 54 | return np.random.randint(0, self.num_actions, size=1)[0] 55 | else: 56 | return np.argmax(self.model.predict(state)[0]) 57 | 58 | def get_model(self): 59 | return self.model 60 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/README.md: -------------------------------------------------------------------------------- 1 | # Deep RL - Mountain Car Domain 2 | This is my implementation of the Mountain Car domain in reinforcement learning, using neural network function approximation with the Keras deep learning library. 3 | *To the best of my knowledge, this is the first open-source code for solving the Mountain Car RL problem using DQN.* 4 | I was motivated by this simple example of [Keras playing catch](https://edersantana.github.io/articles/keras_rl/). 5 | My code adapts that example to the Mountain Car domain. 6 | 7 | ### DQN implementation 8 | The DQN implementation is based on the paper: 9 | Mnih, V., Kavukcuoglu, K., Silver, D., Graves, A., Antonoglou, I., Wierstra, D., & Riedmiller, M. (2013). Playing Atari with deep reinforcement learning. arXiv preprint arXiv:1312.5602. 10 | 11 | ### Mountain Car Domain 12 | Mountain Car is a standard testbed for RL algorithms in which an underpowered car tries to reach a goal position uphill by driving back and forth across the valley. The state space of the car is continuous and consists of its position and velocity. At every state, the car can choose one of 3 possible actions -- move forward, move backward, or stay. Refer to this [Wikipedia article](https://en.wikipedia.org/wiki/Mountain_Car) for more information. 13 | 14 | ![alt tag](Mcar.png) 15 | 16 | The figure above (from Wikipedia) shows the problem: the car is in its starting position and the star marks the goal position. 17 | 18 | ### Files 19 | 1. MountainCar.py -- Defines the Mountain Car environment class: transitioning from one state to another given an action and returning the reward. 20 | 2. MCqlearn.py -- DQN implementation for Q-learning. 21 | 3. MCtest.py -- Testing the learned policy. 22 | 23 | ### Training 24 | The DQN is trained for 1000 successful episodes of the problem. The specific parameters of the algorithm are given in the MCqlearn.py file. To train the DQN network, simply run the training file: 25 | ``` 26 | python MCqlearn.py 27 | ``` 28 | After training, the network parameters are stored in .json and .h5 files. 29 | 30 | ### Testing 31 | Once the network is trained and its parameters are saved in the .json and .h5 files, testing can be done. To test the network, run the file: 32 | ``` 33 | python MCtest.py 34 | ``` 35 | The initial state and other parameters of the Mountain Car domain can be set up in this file. 36 | **It is interesting to note that although the network is trained for only one initial state and one range of the Mountain Car domain, it is able to generalize and succeed during testing for arbitrary initial states and ranges of the domain.** 37 | 38 | ### Dependencies 39 | 1. Python3 40 | 2. Keras 41 | 3. 
Numpy 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /finite_markov_decision_processes/cartpole.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | import numpy as np 3 | import gym 4 | 5 | 6 | # cart_position_bins = np.linspace(-2.4, 2.4, 10) 7 | # cart_velocity_bins = np.linspace(-1, 1, 10) 8 | 9 | class cartPole: 10 | def __init__(self, epsilon, gamma): 11 | self.qTable = {} 12 | self.epsilon = epsilon 13 | self.gamma = gamma 14 | 15 | self.env = gym.make('CartPole-v0') 16 | self.actions = [0, 1] 17 | 18 | self.pole_angle_bins = np.linspace(-0.42, 0.42, 10) 19 | self.pole_velocity_bins = np.linspace(-1, 1, 10) 20 | 21 | self.timesteps_over_time = [] 22 | 23 | def observationToState(self, observation): 24 | pole_angle = np.digitize(x=[observation[2]], bins=self.pole_angle_bins)[0] 25 | pole_velocity = np.digitize(x=[observation[3]], bins=self.pole_velocity_bins)[0] 26 | 27 | return (pole_angle, pole_velocity) 28 | 29 | def updateQTable(self, state, action, reward): 30 | currentValue = self.qTable.get((state, action), None) 31 | 32 | if currentValue is None: 33 | self.qTable[(state, action)] = reward 34 | else: 35 | self.qTable[(state, action)] = currentValue + self.gamma * (reward - currentValue) 36 | 37 | def chooseAction(self, state): 38 | if np.random.random() < self.epsilon: 39 | action = self.env.action_space.sample() 40 | else: 41 | q = [self.qTable.get((state, action), 0.0) for action in self.actions] 42 | action = self.actions[np.argmax(q)] 43 | 44 | return action 45 | 46 | def run(self): 47 | for i_episode in range(1000): 48 | observation = self.env.reset() 49 | state = self.observationToState(observation) 50 | 51 | done = False 52 | ts = 1 53 | 54 | episodeStates = [] 55 | episodeActions = [] 56 | 57 | while not done: 58 | self.env.render() 59 | 60 | action = self.chooseAction(state) 61 | 62 | observation, reward, done, info = self.env.step(action) 63 | 64 | episodeStates.append(state) 65 | episodeActions.append(action) 66 | 67 | state = self.observationToState(observation) 68 | 69 | if done: 70 | print("Episode {} finished after {} timesteps".format(i_episode, ts)) 71 | 72 | ts += 1 73 | 74 | for i in range(len(episodeStates)): 75 | state = episodeStates[i] 76 | action = episodeActions[i] 77 | 78 | self.updateQTable(state, action, ts) 79 | 80 | self.timesteps_over_time.append(ts) 81 | 82 | 83 | cp = cartPole(0.1, 0.5) 84 | cp.run() 85 | 86 | pp = pprint.PrettyPrinter(depth=6) 87 | pp.pprint(cp.qTable) 88 | 89 | pp.pprint(cp.timesteps_over_time) 90 | -------------------------------------------------------------------------------- /function_approximation/mountain_car_tile_coding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | import sys 5 | 6 | from tiles3 import * 7 | 8 | env = gym.make('MountainCar-v0') 9 | 10 | GAMMA = 1.0 11 | LAMBDA = 0.9 12 | 13 | IHT_SIZE = 4096 14 | num_tilings = 8 15 | weights = np.zeros((IHT_SIZE, 1)) 16 | z = np.zeros((IHT_SIZE, 1)) # Eligibility trace vector. 
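# With num_tilings = 8 tilings hashed into a 4096-entry index hash table, each
# (position, velocity, action) triple activates 8 binary features, so the weight vector and the
# eligibility trace z both live in R^4096 and the approximate action value is simply the sum of
# the 8 active weights (the dot product computed in get_value below).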
17 | iht = IHT(IHT_SIZE) 18 | 19 | POSITION_MIN, VELOCITY_MIN = env.env.low 20 | POSITION_MAX, VELOCITY_MAX = env.env.high 21 | 22 | 23 | def get_active_tiles(state, action): 24 | pos, vel = state 25 | active_tiles = tiles(iht, num_tilings, [pos * num_tilings / (POSITION_MAX - POSITION_MIN), 26 | vel * num_tilings / (VELOCITY_MAX - VELOCITY_MIN)], 27 | [action]) 28 | return active_tiles 29 | 30 | 31 | def s_a_feature_vector(state, action): 32 | active_tiles = get_active_tiles(state, action) 33 | feature_vector = np.zeros((IHT_SIZE, 1)) 34 | feature_vector[active_tiles] = 1 35 | return feature_vector 36 | 37 | 38 | def get_value(state, action): 39 | # If the state is terminal. 40 | if state[0] >= POSITION_MAX: 41 | return 0 42 | 43 | return np.dot(weights.T, s_a_feature_vector(state, action)) 44 | 45 | 46 | def get_action(state): 47 | values = [get_value(state, action) for action in range(env.action_space.n)] 48 | return np.argmax(values) 49 | 50 | 51 | alpha = 0.5 52 | step_size = alpha / num_tilings 53 | n_episodes = 100 54 | 55 | # PAGE 305 : Sarsa(λ) with binary features and linear function approximation 56 | for episode in range(n_episodes): 57 | if episode % 10 == 0: 58 | print('\rEpisode {}/{}'.format(episode + 1, n_episodes), end='') 59 | sys.stdout.flush() 60 | 61 | state = env.reset() 62 | while True: 63 | action = get_action(state) 64 | next_state, reward, done, _ = env.step(action) 65 | 66 | delta = reward 67 | 68 | active_tiles = get_active_tiles(state, action) 69 | delta -= get_value(state, action) 70 | z[active_tiles] = 1 71 | 72 | # If the next state is terminal state. 73 | if next_state[0] >= POSITION_MAX: 74 | weights += step_size * delta * z 75 | 76 | next_action = get_action(next_state) 77 | active_tiles = get_active_tiles(next_state, next_action) 78 | delta += GAMMA * get_value(next_state, next_action) 79 | 80 | weights += step_size * delta * z 81 | z = GAMMA * LAMBDA * z 82 | 83 | if done: 84 | break 85 | 86 | state = next_state 87 | 88 | print('Training is done') 89 | 90 | # Test the algorithm for 1 episode. 91 | for i in range(1): 92 | state = env.reset() 93 | while True: 94 | env.render() 95 | action = get_action(state) 96 | # action = get_action(state) 97 | next_state, reward, done, _ = env.step(action) 98 | if done: 99 | break 100 | 101 | state = next_state 102 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/TrainModel.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from MountainCar import MountainCar 6 | from QLearning import QLearning 7 | from sarsa import Sarsa 8 | from double_q import DQLearning 9 | from expected_sarsa import ExpectedSarsa 10 | import random 11 | 12 | if __name__ == "__main__": 13 | # parameters 14 | epsilon = .1 # exploration 15 | num_actions = 3 # [move_left, stay, move_right] 16 | epoch = 5 17 | max_memory = 50000 18 | batch_size = 32 19 | input_size = 2 20 | 21 | Xrange = [-1.5, 0.55] 22 | Vrange = [-2.0, 2.0] 23 | start = [np.random.randint(7) * 0.1 - 0.5, 0.0] 24 | goal = [0.45] 25 | 26 | # n_step = 1 27 | 28 | GAMMA = 0.99 # decay rate of past observations 29 | OBSERVATION = 3200. # timesteps to observe before training 30 | EXPLORE = 10000. 
# frames over which to anneal epsilon 31 | FINAL_EPSILON = 0.01 # final value of epsilon 32 | INITIAL_EPSILON = 0.2 # starting value of epsilon 33 | LEARNING_RATE = 1e-4 34 | FRAME_PER_ACTION = 1 35 | 36 | # all possible steps 37 | nSteps = np.arange(2, 5, 1) 38 | 39 | # all possible alphas 40 | #alphas = np.arange(0.01, 0.2, 0.1) 41 | 42 | alphas = [0.01] 43 | # If you want to continue training from a previous model, just uncomment the line bellow 44 | # model.load_weights("model.h5") 45 | 46 | # Define environment/game 47 | env = MountainCar(start, goal, Xrange, Vrange) 48 | 49 | # Initialize experience replay object 50 | # learning_model = QLearning(num_actions=num_actions, max_memory=max_memory) 51 | #learning_model = DQLearning(num_actions=num_actions, max_memory=max_memory, e_greedy=INITIAL_EPSILON) 52 | learning_model = ExpectedSarsa(num_actions=num_actions, max_memory=max_memory, e_greedy=INITIAL_EPSILON) 53 | 54 | for e in range(epoch): 55 | # loss = 0. 56 | env = MountainCar(start, goal, Xrange, Vrange) 57 | env.reset() 58 | game_over = False 59 | 60 | # get initial input 61 | s = env.observe() 62 | 63 | t = 0 64 | OBSERVE = OBSERVATION 65 | epsilon = INITIAL_EPSILON 66 | 67 | while not game_over: 68 | loss = 0 69 | t += 1 70 | 71 | action = learning_model.get_action(s) 72 | 73 | # We reduced the epsilon gradually 74 | if epsilon > FINAL_EPSILON and t > OBSERVE: 75 | epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE 76 | learning_model.change_epsilon(epsilon) 77 | 78 | next_state, reward, game_over = env.act(action) 79 | 80 | # store experience 81 | learning_model.remember((s, action, reward, next_state), game_over) 82 | 83 | # only train if done observing 84 | if t > OBSERVE: 85 | loss += learning_model.train(current_step=t, batch_size=batch_size) 86 | 87 | s = next_state 88 | 89 | # save progress every 10000 iterations 90 | if t % 1000 == 0: 91 | learning_model.save_model() 92 | 93 | # print info 94 | state = "" 95 | if t <= OBSERVE: 96 | state = "observe" 97 | elif OBSERVE < t <= OBSERVE + EXPLORE: 98 | state = "explore" 99 | else: 100 | state = "train" 101 | 102 | print("Epoch", e, "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/", "REWARD", reward, 103 | "/ ACTION ", action, "/ POS", next_state[0, 0], "/ Loss ", loss) 104 | 105 | if t > 20000: # stop sampling, continue with new episode 106 | break 107 | 108 | print("Episode finished!") 109 | print("************************") 110 | learning_model.save_model() 111 | training = False 112 | 113 | -------------------------------------------------------------------------------- /temporal_difference/blackjack.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import random 4 | 5 | 6 | env = gym.make('Blackjack-v0') 7 | env.reset() 8 | 9 | 10 | def make_epsilon_greedy_policy(Q_1, Q_2, epsilon, nA): 11 | """ 12 | Creates an epsilon-greedy policy based on a given Q-function and epsilon. 13 | Args: 14 | Q: A dictionary that maps from state -> action-values. 15 | Each value is a numpy array of length nA (see below) 16 | epsilon: The probability to select a random action . float between 0 and 1. 17 | nA: Number of actions in the environment. 18 | Returns: 19 | A function that takes the observation as an argument and returns 20 | the probabilities for each action in the form of a numpy array of length nA. 
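    Here Q_1 and Q_2 are the two tables of double Q-learning, and the greedy action is
    chosen with respect to their sum Q_1 + Q_2.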
21 | """ 22 | def policy_fn(observation): 23 | A_probs = np.ones(nA, dtype=float) * epsilon / nA 24 | 25 | Q = Q_1 + Q_2 26 | 27 | best_action = np.argmax(Q[observation[0]][observation[1]][int(observation[2])]) 28 | A_probs[best_action] += (1.0 - epsilon) 29 | 30 | return A_probs # .reshape(1,-1) 31 | 32 | return policy_fn 33 | 34 | 35 | # Q Learning 36 | def double_Q_learning(env, train_episodes, test_episodes, discount_factor=0.2, alpha=0.5, epsilon=0.1): 37 | # Q table initialization 38 | Q_1 = np.zeros((32, 11, 2, env.action_space.n)) 39 | Q_2 = np.zeros((32, 11, 2, env.action_space.n)) 40 | 41 | # The policy we're following 42 | policy = make_epsilon_greedy_policy(Q_1, Q_2, epsilon, env.action_space.n) 43 | 44 | for i_episode in range(train_episodes): 45 | # Reset the environment and pick the first action 46 | observation = env.reset() 47 | state = observation 48 | 49 | while True: 50 | # Take a step 51 | action_probs = policy(state) 52 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 53 | 54 | next_observation, reward, done, _ = env.step(action) 55 | next_state = next_observation 56 | 57 | if(random.randint(1, 2) == 1): 58 | # Q Learning Update 59 | best_next_action = np.argmax(Q_1[next_state[0]][next_state[1]][int(next_state[2])]) 60 | td_target = reward + discount_factor * Q_2[next_state[0]][next_state[1]][int(next_state[2])][best_next_action] 61 | td_delta = td_target - Q_1[state[0]][state[1]][int(state[2])][action] 62 | Q_1[state[0]][state[1]][int(state[2])][action] += alpha * td_delta 63 | 64 | else: 65 | # Q Learning Update 66 | best_next_action = np.argmax(Q_2[next_state[0]][next_state[1]][int(next_state[2])]) 67 | td_target = reward + discount_factor * Q_1[next_state[0]][next_state[1]][int(next_state[2])][best_next_action] 68 | td_delta = td_target - Q_2[state[0]][state[1]][int(state[2])][action] 69 | Q_2[state[0]][state[1]][int(state[2])][action] += alpha * td_delta 70 | 71 | state = next_state 72 | 73 | if done: 74 | break 75 | 76 | policy = make_epsilon_greedy_policy(Q_1, Q_2, 0, env.action_space.n) 77 | 78 | win_count = 0 79 | reward_sum = 0 80 | 81 | for i_episode in range(test_episodes): 82 | # Reset the environment and pick the first action 83 | observation = env.reset() 84 | state = observation 85 | 86 | while True: 87 | # Take a step 88 | action_probs = policy(observation) 89 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 90 | 91 | observation, reward, done, _ = env.step(action) 92 | 93 | reward_sum += reward 94 | 95 | if reward > 0: 96 | win_count += 1 97 | 98 | if done: 99 | break 100 | 101 | print("Total Episodes: {}".format(test_episodes)) 102 | print("Win Count: {}".format(win_count)) 103 | print("Reward Sum: {}".format(reward_sum)) 104 | 105 | 106 | double_Q_learning(env, 10000, 1000) 107 | -------------------------------------------------------------------------------- /temporal_difference/blackjack_figures.py: -------------------------------------------------------------------------------- 1 | 2 | # REFERENCE : https://github.com/dennybritz/reinforcement-learning 3 | 4 | import numpy as np 5 | import gym 6 | import matplotlib.pyplot as plt 7 | import matplotlib.patches as mpatches 8 | 9 | import sys 10 | from collections import defaultdict 11 | 12 | env = gym.make('Blackjack-v0') 13 | 14 | 15 | def make_epsilon_greedy_policy(Q, epsilon, nA): 16 | def policy_fn(observation): 17 | A_probs = np.ones(nA, dtype=float) * epsilon / nA 18 | best_action = np.argmax(Q[observation]) 19 | A_probs[best_action] += (1 - 
epsilon) 20 | return A_probs 21 | 22 | return policy_fn 23 | 24 | 25 | def sarsa(env, n_episodes=500, discount_factor=1.0, alpha=0.5, epsilon=0.1): 26 | 27 | Q = defaultdict(lambda: np.zeros(env.action_space.n)) 28 | policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n) 29 | 30 | for i_episode in range(n_episodes): 31 | if (i_episode + 1) % 100 == 0: 32 | print("\rEpisode {}/{}.".format(i_episode + 1, n_episodes), end="") 33 | sys.stdout.flush() 34 | state = env.reset() 35 | while True: 36 | A_probs = policy(state) 37 | action = np.random.choice(np.arange(len(A_probs)), p=A_probs) 38 | next_state, reward, done, _ = env.step(action) 39 | 40 | next_action_probs = policy(next_state) 41 | next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs) 42 | td_target = reward + discount_factor * Q[next_state][next_action] 43 | td_error = td_target - Q[state][action] 44 | Q[state][action] += alpha * td_error 45 | 46 | if done: 47 | break 48 | 49 | state = next_state 50 | 51 | return Q 52 | 53 | 54 | def Q_learning(env, n_episodes=500, discount_factor=1.0, alpha=0.5, epsilon=0.1): 55 | 56 | Q = defaultdict(lambda: np.zeros(env.action_space.n)) 57 | policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n) 58 | 59 | for i_episode in range(n_episodes): 60 | if (i_episode + 1) % 100 == 0: 61 | print("\rEpisode {}/{}.".format(i_episode + 1, n_episodes), end="") 62 | sys.stdout.flush() 63 | state = env.reset() 64 | while True: 65 | probs = policy(state) 66 | action = np.random.choice(np.arange(len(probs)), p=probs) 67 | next_state, reward, done, _ = env.step(action) 68 | next_action = np.argmax(Q[next_state]) 69 | td_target = reward + discount_factor * Q[next_state][next_action] 70 | td_error = td_target - Q[state][action] 71 | Q[state][action] += alpha * td_error 72 | 73 | if done: 74 | break 75 | 76 | state = next_state 77 | 78 | return Q 79 | 80 | 81 | def plot_figure(ax, usable_ace): 82 | def get_action(player_hand, dealer_showing, usable_ace): 83 | return policy[player_hand, dealer_showing, usable_ace] if (player_hand, dealer_showing, usable_ace) in policy else 1 84 | 85 | policy_mat = np.array([[get_action(player_hand, dealer_showing, usable_ace) for dealer_showing in range(1, 11)] 86 | for player_hand in range(21, 10, -1)]) 87 | 88 | ax.imshow(policy_mat, cmap=plt.cm.Accent, extent=[0.5, 10.5, 10.5, 21.5]) 89 | plt.ylim(11, 21) 90 | plt.xlim(1, 10) 91 | plt.xlabel('Dealer Hand') 92 | plt.ylabel('Player Hand') 93 | hit_patch = mpatches.Patch(color=plt.cm.Accent(.1), label='Stick') 94 | stick_patch = mpatches.Patch(color=plt.cm.Accent(.9), label='Hit') 95 | plt.legend(handles=[hit_patch, stick_patch]) 96 | 97 | 98 | # Q = sarsa(env, 1000) 99 | Q = Q_learning(env, 1000) 100 | policy = dict((k, np.argmax(v)) for k, v in Q.items()) 101 | 102 | fig = plt.figure(figsize=(15, 15)) 103 | ax = fig.add_subplot(121) 104 | ax.set_title('Blackjack MC Policy - No Usable Ace') 105 | plot_figure(ax, True) 106 | ax = fig.add_subplot(122) 107 | ax.set_title('Blackjack MC Policy - Usable Ace') 108 | plot_figure(ax, False) 109 | plt.show() 110 | -------------------------------------------------------------------------------- /function_approximation/tiles3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tile Coding Software version 3.0beta 3 | by Rich Sutton 4 | based on a program created by Steph Schaeffer and others 5 | External documentation and recommendations on the use of this code is available in the 6 | 
reinforcement learning textbook by Sutton and Barto, and on the web. 7 | These need to be understood before this code is. 8 | 9 | This software is for Python 3 or more. 10 | 11 | This is an implementation of grid-style tile codings, based originally on 12 | the UNH CMAC code (see http://www.ece.unh.edu/robots/cmac.htm), but by now highly changed. 13 | Here we provide a function, "tiles", that maps floating and integer 14 | variables to a list of tiles, and a second function "tiles-wrap" that does the same while 15 | wrapping some floats to provided widths (the lower wrap value is always 0). 16 | 17 | The float variables will be gridded at unit intervals, so generalization 18 | will be by approximately 1 in each direction, and any scaling will have 19 | to be done externally before calling tiles. 20 | 21 | Num-tilings should be a power of 2, e.g., 16. To make the offsetting work properly, it should 22 | also be greater than or equal to four times the number of floats. 23 | 24 | The first argument is either an index hash table of a given size (created by (make-iht size)), 25 | an integer "size" (range of the indices from 0), or nil (for testing, indicating that the tile 26 | coordinates are to be returned without being converted to indices). 27 | """ 28 | 29 | basehash = hash 30 | 31 | class IHT: 32 | "Structure to handle collisions" 33 | def __init__(self, sizeval): 34 | self.size = sizeval 35 | self.overfullCount = 0 36 | self.dictionary = {} 37 | 38 | def __str__(self): 39 | "Prepares a string for printing whenever this object is printed" 40 | return "Collision table:" + \ 41 | " size:" + str(self.size) + \ 42 | " overfullCount:" + str(self.overfullCount) + \ 43 | " dictionary:" + str(len(self.dictionary)) + " items" 44 | 45 | def count (self): 46 | return len(self.dictionary) 47 | 48 | def fullp (self): 49 | return len(self.dictionary) >= self.size 50 | 51 | def getindex (self, obj, readonly=False): 52 | d = self.dictionary 53 | if obj in d: return d[obj] 54 | elif readonly: return None 55 | size = self.size 56 | count = self.count() 57 | if count >= size: 58 | if self.overfullCount==0: print('IHT full, starting to allow collisions') 59 | self.overfullCount += 1 60 | return basehash(obj) % self.size 61 | else: 62 | d[obj] = count 63 | return count 64 | 65 | def hashcoords(coordinates, m, readonly=False): 66 | if type(m)==IHT: return m.getindex(tuple(coordinates), readonly) 67 | if type(m)==int: return basehash(tuple(coordinates)) % m 68 | if m==None: return coordinates 69 | 70 | from math import floor, log 71 | from itertools import zip_longest 72 | 73 | def tiles (ihtORsize, numtilings, floats, ints=[], readonly=False): 74 | """returns num-tilings tile indices corresponding to the floats and ints""" 75 | qfloats = [floor(f*numtilings) for f in floats] 76 | Tiles = [] 77 | for tiling in range(numtilings): 78 | tilingX2 = tiling*2 79 | coords = [tiling] 80 | b = tiling 81 | for q in qfloats: 82 | coords.append( (q + b) // numtilings ) 83 | b += tilingX2 84 | coords.extend(ints) 85 | Tiles.append(hashcoords(coords, ihtORsize, readonly)) 86 | return Tiles 87 | 88 | def tileswrap (ihtORsize, numtilings, floats, wrawidths, ints=[], readonly=False): 89 | """returns num-tilings tile indices corresponding to the floats and ints, wrapping some floats""" 90 | qfloats = [floor(f*numtilings) for f in floats] 91 | Tiles = [] 92 | for tiling in range(numtilings): 93 | tilingX2 = tiling*2 94 | coords = [tiling] 95 | b = tiling 96 | for q, width in zip_longest(qfloats, wrapwidths): 97 | c = (q + 
b%numtilings) // numtilings 98 | coords.append(c%width if width else c) 99 | b += tilingX2 100 | coords.extend(ints) 101 | Tiles.append(hashcoords(coords, ihtORsize, readonly)) 102 | return Tiles 103 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/main.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from MountainCar import MountainCar 6 | from sarsa import Sarsa 7 | from double_q import DQLearning 8 | from expected_sarsa import ExpectedSarsa 9 | 10 | 11 | def test_double_q(): 12 | runs = 1 13 | episode = 2 14 | 15 | alphas = [] 16 | steps = np.zeros((len(alphas), episode)) 17 | 18 | for run in range(0, runs): 19 | for i, n_step in zip(range(len(alphas)), alphas): 20 | learning_model = DQLearning(num_actions=num_actions, max_memory=max_memory, e_greedy=epsilon, load_model=False) 21 | 22 | for e in range(episode): 23 | print('run:', run, 'alpha:', alphas[i], 'episode:', e) 24 | 25 | start = [random.uniform(-0.6, -0.4), 0.0] 26 | env = MountainCar(start, goal, Xrange, Vrange) 27 | 28 | step = learning_model.episode(env=env, batch_size=batch_size, n_step=n_step, epoch=e) 29 | steps[i, e] += step 30 | 31 | steps /= runs 32 | 33 | for i in range(0, len(alphas)): 34 | plt.plot(steps[i], label='n = ' + str(alphas[i])) 35 | plt.xlabel('Alpha') 36 | plt.ylabel('Steps per episode') 37 | plt.yscale('log') 38 | plt.legend() 39 | 40 | plt.show() 41 | 42 | 43 | def one_vs_multi_step(): 44 | runs = 1 45 | episode = 10 46 | n_steps = np.arange(1, 9, 1) 47 | 48 | steps = np.zeros((len(n_steps), episode)) 49 | 50 | for run in range(0, runs): 51 | for i, n_step in zip(range(len(n_steps)), n_steps): 52 | learning_model = ExpectedSarsa(num_actions=num_actions, max_memory=max_memory, e_greedy=epsilon, load_model=False) 53 | #learning_model = Sarsa(num_actions=num_actions, max_memory=max_memory, e_greedy=epsilon, load_model=False) 54 | 55 | for e in range(episode): 56 | print('run:', run, 'steps:', n_steps[i], 'episode:', e) 57 | 58 | start = [random.uniform(-0.6, -0.4), 0.0] 59 | env = MountainCar(start, goal, Xrange, Vrange) 60 | 61 | step = learning_model.episode(env=env, batch_size=batch_size, n_step=n_step, epoch=e) 62 | steps[i, e] += step 63 | 64 | steps /= runs 65 | 66 | for i in range(0, len(n_steps)): 67 | plt.plot(steps[i], label='n = '+str(n_steps[i])) 68 | plt.xlabel('Episode') 69 | plt.ylabel('Steps per episode') 70 | plt.yscale('log') 71 | plt.legend() 72 | 73 | plt.show() 74 | 75 | 76 | def effect_of_alpha_and_n(): 77 | # all possible alphas 78 | alphas = [0.0002, 0.0004, 0.0008, 0.0012] 79 | 80 | # all possible steps 81 | n_steps = np.arange(1, 9, 1) 82 | 83 | epoch = 20 84 | runs = 1 85 | 86 | steps = np.zeros((len(n_steps), len(alphas))) 87 | 88 | for run in range(0, runs): 89 | for nStepIndex, n_step in zip(range(0, len(n_steps)), n_steps): 90 | for alphaIndex, alpha in zip(range(0, len(alphas)), alphas): 91 | learning_model = ExpectedSarsa(num_actions=num_actions, max_memory=max_memory, e_greedy=epsilon, load_model=False) 92 | #learning_model = Sarsa(num_actions=num_actions, max_memory=max_memory, e_greedy=epsilon, 93 | # load_model=False) 94 | 95 | for e in range(0, epoch): 96 | print('run:', run, 'steps:', n_step, 'alpha:', alpha, 'episode:', e) 97 | 98 | start = [random.uniform(-0.6, -0.4), 0.0] 99 | env = MountainCar(start, goal, Xrange, Vrange) 100 | 101 | step = learning_model.episode(env=env, 
batch_size=batch_size, n_step=n_step, epoch=e) 102 | steps[nStepIndex, alphaIndex] += step 103 | 104 | # average over independent runs and episodes 105 | steps /= runs * epoch 106 | 107 | for i in range(0, len(n_steps)): 108 | plt.plot(alphas, steps[i, :], label='n = '+str(n_steps[i])) 109 | plt.xlabel('Alpha') 110 | plt.ylabel('Steps per episode') 111 | plt.legend() 112 | 113 | plt.show() 114 | 115 | 116 | if __name__ == "__main__": 117 | # parameters 118 | epsilon = .1 # exploration 119 | num_actions = 3 # [move_left, stay, move_right] 120 | max_memory = 20000 121 | batch_size = 100 122 | input_size = 2 123 | 124 | Xrange = [-1.2, 0.6] 125 | Vrange = [-0.07, 0.07] 126 | goal = [0.5] 127 | 128 | one_vs_multi_step() 129 | effect_of_alpha_and_n() 130 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/expected_sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import math 4 | import os.path 5 | 6 | from keras.models import Sequential 7 | from keras.layers.core import Dense, Dropout 8 | from keras.optimizers import SGD 9 | from keras.regularizers import l1 10 | 11 | from Agent import Agent 12 | from Policy import EpsilonGreedyPolicy 13 | 14 | 15 | class ExpectedSarsa(Agent): 16 | def __init__(self, num_actions, max_memory=100, discount=.99, e_greedy=.1, load_model=True): 17 | super().__init__(max_memory, discount) 18 | self.num_actions = num_actions 19 | self.policy = EpsilonGreedyPolicy(number_of_action=num_actions, epsilon=e_greedy) 20 | self.next_action = None 21 | 22 | self.__create_model(load_model) 23 | 24 | def __create_model(self, load_model): 25 | hidden_size = 100 26 | 27 | self.model = Sequential() 28 | self.model.add(Dense(hidden_size, input_shape=(2,), activation='relu', kernel_regularizer=l1(0.01))) 29 | self.model.add(Dense(hidden_size, activation='relu', kernel_regularizer=l1(0.01))) 30 | self.model.add(Dropout(rate=.1)) 31 | self.model.add(Dense(self.num_actions)) 32 | 33 | sgd = SGD(lr=0.0001, momentum=0.99) 34 | self.model.compile(optimizer=sgd, loss="mse") 35 | 36 | if load_model and os.path.exists("model.esarsa"): 37 | self.model.load_weights("model.esarsa") 38 | 39 | def episode(self, env, batch_size=10, n_step=1, epoch=0): 40 | loss = 0. 
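# One training episode: act epsilon-greedily, store each (state, action, reward, next_state)
# transition, and after every step fit the network on minibatch targets built from n-step
# returns bootstrapped with the expected value sum_a pi(a | s_{t+n}) * Q(s_{t+n}, a)
# (see get_batch below).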
41 | env.reset() 42 | game_over = False 43 | # get initial input 44 | state = env.observe() 45 | 46 | time = 0 47 | while not game_over: 48 | # go to next time n_step 49 | time += 1 50 | 51 | action = self.get_action(state) 52 | 53 | # apply action, get rewards and new state 54 | next_state, reward, game_over = env.act(action) 55 | 56 | # store experience 57 | self.remember((state, action, reward, next_state), game_over) 58 | 59 | loss += self.train(current_step=time, batch_size=batch_size, n_step=n_step) 60 | 61 | print('Step {}| epoch {} | n_step {} | Loss {:.4f} |Pos {:.3f} | Act {}'.format( 62 | time, epoch, n_step, loss, next_state[0, 0], action - 1)) 63 | 64 | if math.isnan(loss) or time > 1500: 65 | break 66 | 67 | state = next_state 68 | 69 | print("Episode finished!") 70 | print("************************") 71 | 72 | return time 73 | 74 | def change_epsilon(self, epsilon): 75 | self.policy.epsilon = epsilon 76 | 77 | def set_learning_rate(self, lr): 78 | self.model.optimizer.lr = lr 79 | 80 | def save_model(self): 81 | # Save trained model weights and architecture, this will be used by the visualization code 82 | self.model.save_weights("model.esarsa", overwrite=True) 83 | with open("model.json", "w") as outfile: 84 | json.dump(self.model.to_json(), outfile) 85 | 86 | def train(self, current_step, batch_size=10, n_step=1): 87 | loss = 0. 88 | 89 | model = self.get_model() 90 | 91 | if current_step - n_step >= 0: 92 | inputs, targets = self.get_batch(batch_size=batch_size, n_step=n_step) 93 | 94 | loss += model.train_on_batch(inputs, targets) 95 | 96 | return loss 97 | 98 | def get_batch(self, batch_size=10, n_step=1): 99 | len_memory = len( 100 | self.memory) - n_step + 1 # we don't want to update 'n' last states,because their returns have not seen yet 101 | num_actions = self.model.output_shape[-1] 102 | env_dim = self.memory[0][0][0].shape[1] 103 | 104 | inputs = np.zeros((min(len_memory, batch_size), env_dim)) 105 | targets = np.zeros((inputs.shape[0], num_actions)) 106 | 107 | for i, idx in enumerate(np.random.randint(0, len_memory, size=inputs.shape[0])): 108 | state, action, reward, next_state = self.memory[idx][0] 109 | game_over = self.memory[idx][1] 110 | 111 | inputs[i:i + 1] = state 112 | targets[i] = self.model.predict(state)[0] 113 | 114 | if game_over: # if game_over is True 115 | targets[i, action] = reward 116 | else: 117 | returns = 0.0 118 | t_n_state = next_state 119 | 120 | for t in range(idx, idx + n_step): 121 | _, _, reward, t_n_state = self.memory[t][0] 122 | returns += pow(self.discount, t - idx) * reward 123 | 124 | if reward != 100: 125 | q = self.model.predict(t_n_state)[0] 126 | pi = self.policy.get_pi(q) 127 | 128 | q_sa = np.dot(q, pi.T) 129 | 130 | returns += pow(self.discount, n_step) * q_sa 131 | 132 | targets[i, action] = returns 133 | 134 | return inputs, targets 135 | 136 | def get_action(self, state): 137 | return self.policy.get_action(q_values=self.model.predict(state)[0]) 138 | 139 | def get_model(self): 140 | return self.model 141 | -------------------------------------------------------------------------------- /Deep-RL-Mountain-Car/sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import os.path 4 | from keras.models import Sequential 5 | from keras.layers.core import Dense, Dropout 6 | from keras.optimizers import SGD 7 | from keras.regularizers import L1L2, l1 8 | from Agent import Agent 9 | from Policy import EpsilonGreedyPolicy 10 | 11 | 12 | class 
Sarsa(Agent): 13 | def __init__(self, num_actions, max_memory=100, discount=.99, e_greedy=.1, load_model=True): 14 | super().__init__(max_memory, discount) 15 | self.num_actions = num_actions 16 | self.policy = EpsilonGreedyPolicy(number_of_action=num_actions, epsilon=e_greedy) 17 | self.next_action = None 18 | 19 | self.__create_model(load_model) 20 | 21 | def change_epsilon(self, epsilon): 22 | self.policy.epsilon = epsilon 23 | 24 | def __create_model(self, load_model): 25 | hidden_size = 100 26 | 27 | self.model = Sequential() 28 | self.model.add(Dense(hidden_size, input_shape=(2,), activation='relu', kernel_regularizer=l1(0.01))) 29 | self.model.add(Dense(hidden_size, activation='relu', kernel_regularizer=l1(0.01))) 30 | self.model.add(Dropout(rate=.1)) 31 | self.model.add(Dense(self.num_actions)) 32 | 33 | sgd = SGD(lr=0.0001, momentum=0.99) 34 | self.model.compile(optimizer=sgd, loss="mse") 35 | 36 | if load_model and os.path.exists("model.sarsa"): 37 | self.model.load_weights("model.sarsa") 38 | 39 | def set_learning_rate(self, lr): 40 | self.model.optimizer.lr = lr 41 | 42 | def save_model(self): 43 | # Save trained model weights and architecture, this will be used by the visualization code 44 | self.model.save_weights("model.sarsa", overwrite=True) 45 | with open("model.json", "w") as outfile: 46 | json.dump(self.model.to_json(), outfile) 47 | 48 | def episode(self, env, batch_size=10, n_step=1, epoch=0): 49 | loss = 0. 50 | env.reset() 51 | game_over = False 52 | # get initial input 53 | state = env.observe() 54 | action = self.get_action(state) 55 | 56 | step = 0 57 | while not game_over: 58 | # go to next time n_step 59 | step += 1 60 | # apply action, get rewards and new state 61 | next_state, reward, game_over = env.act(action) 62 | 63 | if reward == 100: 64 | next_action = None 65 | else: 66 | next_action = self.get_action(next_state) 67 | 68 | # store experience 69 | self.remember((state, action, reward, next_state, next_action), game_over) 70 | 71 | loss += self.train(current_step=step, batch_size=batch_size, n_step=n_step) 72 | print('Step {}| epoch {} | n_step {} | Loss {:.4f} |Pos {:.3f} | Act {}'.format( 73 | step, epoch, n_step, loss, next_state[0, 0], action - 1)) 74 | 75 | if np.math.isnan(loss) or step > 1500: 76 | break 77 | 78 | state = next_state 79 | action = next_action 80 | 81 | print("Episode finished!") 82 | print("************************") 83 | 84 | return step 85 | 86 | def train(self, current_step, batch_size=10, n_step=1): 87 | loss = 0. 
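# Training only starts once at least n_step transitions have been stored; get_batch then
# builds n-step SARSA targets G = r_t + gamma * r_{t+1} + ... + gamma^(n-1) * r_{t+n-1}
# + gamma^n * Q(s_{t+n}, a_{t+n}), bootstrapping on the action the policy actually chose.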
88 | 89 | model = self.get_model() 90 | 91 | if current_step - n_step >= 0: 92 | inputs, targets = self.get_batch(batch_size=batch_size, n_step=n_step) 93 | 94 | loss += model.train_on_batch(inputs, targets) 95 | 96 | return loss 97 | 98 | def get_batch(self, batch_size=10, n_step=1): 99 | len_memory = len( 100 | self.memory) - n_step + 1 # we don't want to update 'n' last states,because their returns have not seen yet 101 | num_actions = self.model.output_shape[-1] 102 | env_dim = self.memory[0][0][0].shape[1] 103 | 104 | inputs = np.zeros((min(len_memory, batch_size), env_dim)) 105 | targets = np.zeros((inputs.shape[0], num_actions)) 106 | 107 | for i, idx in enumerate(np.random.randint(0, len_memory, size=inputs.shape[0])): 108 | state, action, reward, next_state, next_action = self.memory[idx][0] 109 | game_over = self.memory[idx][1] 110 | 111 | inputs[i:i + 1] = state 112 | targets[i] = self.model.predict(state)[0] 113 | 114 | if game_over: # if it is last state 115 | targets[i, action] = reward 116 | else: 117 | returns = 0.0 118 | t_n_state = next_state 119 | t_n_action = next_action 120 | 121 | for t in range(idx, idx + n_step): 122 | _, _, reward, t_n_state, t_n_action = self.memory[t][0] 123 | returns += pow(self.discount, t - idx) * reward 124 | 125 | if reward != 100: # not self.memory[update_step + n_step - 1][1]: 126 | q = self.model.predict(t_n_state)[0] 127 | q_sa = q[t_n_action] 128 | 129 | returns += pow(self.discount, n_step) * q_sa 130 | 131 | targets[i, action] = returns 132 | 133 | return inputs, targets 134 | 135 | def get_action(self, state): 136 | return self.policy.get_action(q_values=self.model.predict(state)[0]) 137 | 138 | def get_model(self): 139 | return self.model 140 | -------------------------------------------------------------------------------- /lib.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reinforcement learning library 3 | 4 | Included methods: 5 | Temporal Difference 6 | Q Learning 7 | SARSA 8 | Double Q Learning 9 | """ 10 | import numpy as np 11 | import random 12 | from collections import defaultdict 13 | 14 | 15 | class policy(): 16 | def __init__(self, epsilon, action_space): 17 | 18 | self.epsilon=epsilon 19 | self.nA=action_space 20 | 21 | def probs(self,q_table,observation): 22 | A_probs = np.ones(self.nA, dtype=float) * self.epsilon / self.nA 23 | best_action = np.argmax(q_table[observation]) 24 | A_probs[best_action] += (1 - self.epsilon) 25 | 26 | return A_probs 27 | 28 | 29 | class q_learning_agent(): 30 | def __init__(self, epsilon, discount_factor, alpha, action_space): 31 | self.q_table = defaultdict(lambda: np.zeros(action_space)) 32 | self.epsilon = epsilon 33 | self.discount_factor = discount_factor 34 | self.action_space = action_space 35 | self.alpha = alpha 36 | self.policy = policy(self.epsilon, self.action_space) 37 | 38 | def learn(self, action, reward, state, next_state): 39 | next_action = np.argmax(self.q_table[next_state]) 40 | td_target = reward + self.discount_factor * self.q_table[next_state][next_action] 41 | td_error = td_target - self.q_table[state][action] 42 | self.q_table[state][action] += self.alpha * td_error 43 | 44 | def select_action(self,state): 45 | A_probs = self.policy.probs(self.q_table,state) 46 | 47 | return np.random.choice(np.arange(len(A_probs)), p=A_probs) 48 | 49 | def get_q_table(self): 50 | return self.q_table 51 | 52 | def set_q_table(self, q_table): 53 | self.q_table = q_table 54 | 55 | 56 | class sarsa_learning_agent(): 57 | def 
__init__(self, epsilon, discount_factor, alpha, action_space): 58 | self.q_table = defaultdict(lambda: np.zeros(action_space)) 59 | self.epsilon = epsilon 60 | self.discount_factor = discount_factor 61 | self.action_space = action_space 62 | self.alpha = alpha 63 | self.policy = policy(self.epsilon, self.action_space) 64 | 65 | def learn(self, action, reward, state, next_state): 66 | next_action = self.select_action(next_state) 67 | td_target = reward + self.discount_factor * self.q_table[next_state][next_action] 68 | td_error = td_target - self.q_table[state][action] 69 | self.q_table[state][action] += self.alpha * td_error 70 | 71 | def select_action(self,state): 72 | A_probs = self.policy.probs(self.q_table,state) 73 | 74 | return np.random.choice(np.arange(len(A_probs)), p=A_probs) 75 | 76 | def get_q_table(self): 77 | return self.q_table 78 | 79 | def set_q_table(self, q_table): 80 | self.q_table = q_table 81 | 82 | class double_q_learning_agent(): 83 | def __init__(self, epsilon, discount_factor, alpha, action_space): 84 | self.q_table_1 = defaultdict(lambda: np.zeros(action_space)) 85 | self.q_table_2 = defaultdict(lambda: np.zeros(action_space)) 86 | self.epsilon = epsilon 87 | self.discount_factor = discount_factor 88 | self.action_space = action_space 89 | self.alpha = alpha 90 | self.policy = policy(self.epsilon, self.action_space) 91 | 92 | def learn(self, action, reward, state, next_state): 93 | if random.randint(1, 2) == 1: 94 | next_action = np.argmax(self.q_table_1[next_state]) 95 | td_target = reward + self.discount_factor * self.q_table_2[next_state][next_action] 96 | td_error = td_target - self.q_table_1[state][action] 97 | self.q_table_1[state][action] += self.alpha * td_error 98 | else: 99 | next_action = np.argmax(self.q_table_2[next_state]) 100 | td_target = reward + self.discount_factor * self.q_table_1[next_state][next_action] 101 | td_error = td_target - self.q_table_2[state][action] 102 | self.q_table_2[state][action] += self.alpha * td_error 103 | 104 | def select_action(self, state): 105 | A_probs_1 = self.policy.probs(self.q_table_2, state) 106 | A_probs_2 = self.policy.probs(self.q_table_1, state) 107 | 108 | return np.random.choice(np.arange(len(A_probs_1)), p=(A_probs_1+A_probs_2)/2) 109 | 110 | def get_q_tables(self): 111 | return [self.q_table_1, self.q_table_2] 112 | 113 | def set_q_table(self, q_table_1, q_table_2): 114 | self.q_table_1 = q_table_1 115 | self.q_table_2 = q_table_2 116 | 117 | 118 | class Expected_Sarsa_learning_agent(): 119 | def init(self, epsilon, discount_factor,alpha, action_space): 120 | self.q_table = defaultdict(lambda: np.zeros(action_space)) 121 | self.epsilon = epsilon 122 | self.discount_factor = discount_factor 123 | self.action_space = action_space 124 | self.alpha = alpha 125 | self.policy = policy(self.epsilon, self.action_space) 126 | 127 | def learn(self, action, reward, state, next_state): 128 | A_probs = self.policy.probs(self.q_table,next_state) 129 | expected_value = np.dot(A_probs,self.q_table[next_state]) 130 | td_target = reward + self.discount_factor * expected_value 131 | td_error = td_target - self.q_table[state][action] 132 | self.q_table[state][action] += self.alpha * td_error 133 | 134 | def select_action(self,state): 135 | A_probs = self.policy.probs(self.q_table, state) 136 | return np.random.choice(np.arange(len(A_probs)), p=A_probs) 137 | 138 | def get_Q_table(self): 139 | return self.q_table 140 | 141 | def set_Q_table(self, q_table): 142 | self.q_table = q_table 143 | 144 | 145 | 
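# A minimal, self-contained sketch (with made-up numbers) of the Expected SARSA update used by
# Expected_Sarsa_learning_agent.learn above: the bootstrap term is the epsilon-greedy expectation
# over the next state's action values rather than their maximum.
import numpy as np

epsilon, discount_factor, alpha = 0.3, 0.9, 0.5
q_next = np.array([1.0, 2.0, 0.5, 0.0])    # hypothetical Q[next_state]
q_sa = 0.2                                 # hypothetical Q[state][action]
reward = 1.0

probs = np.ones(4) * epsilon / 4           # epsilon spread uniformly over the 4 actions...
probs[np.argmax(q_next)] += 1 - epsilon    # ...plus the extra mass on the greedy action
expected_value = np.dot(probs, q_next)     # E_pi[Q(s', .)] = 1.6625 here
td_target = reward + discount_factor * expected_value
q_sa += alpha * (td_target - q_sa)         # same TD update rule as learn()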
--------------------------------------------------------------------------------
/Deep-RL-Mountain-Car/double_q.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os.path
3 | import json
4 | 
5 | from keras.models import Sequential
6 | from keras.layers.core import Dense
7 | from keras.optimizers import sgd
8 | from keras.regularizers import l1
9 | 
10 | from Agent import Agent
11 | from Policy import EpsilonGreedyPolicy
12 | 
13 | 
14 | class DQLearning(Agent):
15 |     def __init__(self, num_actions, max_memory=100, discount=.99, e_greedy=.1, load_model=True):
16 |         super().__init__(max_memory, discount)
17 |         self.num_actions = num_actions
18 |         self.epsilon = e_greedy
19 |         self.policy = EpsilonGreedyPolicy(number_of_action=num_actions, epsilon=e_greedy)
20 | 
21 |         self.__create_model(load_model)
22 |         self.current_model = self.model
23 | 
24 |     def change_epsilon(self, epsilon):
25 |         self.policy.epsilon = epsilon
26 | 
27 |     def episode(self, env, batch_size=10, n_step=1, epoch=0):
28 |         loss = 0.
29 |         env.reset()
30 |         game_over = False
31 |         # get initial input
32 |         state = env.observe()
33 | 
34 |         step = 0
35 |         while not game_over:
36 |             # advance to the next time step
37 |             step += 1
38 |             action = self.get_action(state)
39 |             # apply action, get rewards and new state
40 |             next_state, reward, game_over = env.act(action)
41 | 
42 |             # store experience
43 |             self.remember((state, action, reward, next_state), game_over)
44 | 
45 |             loss += self.train(current_step=step, batch_size=batch_size, n_step=n_step)
46 |             print('Step {}| epoch {} | n_step {} | Loss {:.4f} |Pos {:.3f} | Act {}'.format(
47 |                 step, epoch, n_step, loss, next_state[0, 0], action - 1))
48 | 
49 |             if np.math.isnan(loss) or step > 1500:
50 |                 break
51 | 
52 |             state = next_state
53 | 
54 |         print("Episode finished!")
55 |         print("************************")
56 | 
57 |         return step
58 | 
59 |     def train(self, current_step, batch_size=10, n_step=1):
60 |         loss = 0.
61 | 
62 |         model = self.get_model()
63 | 
64 |         inputs, targets = self.get_batch(batch_size=batch_size, n_step=n_step)
65 | 
66 |         loss += model.train_on_batch(inputs, targets)
67 | 
68 |         return loss
69 | 
70 |     def get_batch(self, batch_size=10, n_step=1):
71 |         len_memory = len(self.memory)
72 |         num_actions = self.model.output_shape[-1]
73 | 
74 |         env_dim = self.memory[0][0][0].shape[1]
75 |         inputs = np.zeros((min(len_memory, batch_size), env_dim))
76 |         targets = np.zeros((inputs.shape[0], num_actions))
77 | 
78 |         for i, idx in enumerate(np.random.randint(0, len_memory, size=inputs.shape[0])):
79 |             state, action, reward, next_state = self.memory[idx][0]
80 |             game_over = self.memory[idx][1]
81 | 
82 |             inputs[i:i + 1] = state
83 |             targets[i] = self.current_model.predict(state)[0]  # baseline targets come from the network being updated
84 | 
85 |             if game_over:  # terminal transition: the target is just the reward
86 |                 targets[i, action] = reward
87 |             else:
88 |                 q1 = self.model.predict(next_state)[0]
89 |                 q2 = self.model2.predict(next_state)[0]
90 | 
91 |                 if self.current_model == self.model:
92 |                     best_action = np.argmax(q1)
93 |                     q_sa = q2[best_action]
94 |                 else:
95 |                     best_action = np.argmax(q2)
96 |                     q_sa = q1[best_action]
97 | 
98 |                 targets[i, action] = reward + self.discount * q_sa
99 | 
100 |         return inputs, targets
101 | 
102 |     def get_action(self, state):
103 |         q1 = self.model.predict(state)[0]
104 |         q2 = self.model2.predict(state)[0]
105 | 
106 |         return self.policy.get_action(q_values=q1+q2)
107 | 
108 |     def get_model(self):
109 |         coin = np.random.randint(2)  # randomly pick which network is updated on this step
110 |         if coin == 0:
111 |             self.current_model = self.model
112 |         else:
113 |             self.current_model = self.model2
114 | 
115 |         return self.current_model
116 | 
117 |     def __create_model(self, load_model=True):
118 |         hidden_size = 100
119 | 
120 |         self.model = Sequential()
121 |         self.model.add(Dense(hidden_size, input_shape=(2,), activation='relu', kernel_regularizer=l1(0.01)))
122 |         self.model.add(Dense(hidden_size, activation='relu', kernel_regularizer=l1(0.01)))
123 |         # self.model.add(Dense(hidden_size, activation='relu'))
124 |         self.model.add(Dense(self.num_actions))
125 |         self.model.compile(sgd(lr=0.0001), "mse")
126 | 
127 |         if load_model and os.path.exists("model.dqlearning1"):
128 |             self.model.load_weights("model.dqlearning1")
129 | 
130 |         self.model2 = Sequential()
131 |         self.model2.add(Dense(hidden_size, input_shape=(2,), activation='relu', kernel_regularizer=l1(0.01)))
132 |         self.model2.add(Dense(hidden_size, activation='relu', kernel_regularizer=l1(0.01)))
133 |         # self.model2.add(Dense(hidden_size, activation='relu'))
134 |         self.model2.add(Dense(self.num_actions))
135 |         self.model2.compile(sgd(lr=0.0001), "mse")
136 | 
137 |         if load_model and os.path.exists("model.dqlearning2"):
138 |             self.model2.load_weights("model.dqlearning2")
139 | 
140 |     def save_model(self):
141 |         # Save trained model weights and architecture, this will be used by the visualization code
142 |         self.model.save_weights("model.dqlearning1", overwrite=True)
143 |         self.model2.save_weights("model.dqlearning2", overwrite=True)
144 | 
145 |         with open("model3.json", "w") as outfile:
146 |             json.dump(self.model.to_json(), outfile)
147 | 
148 |         with open("model4.json", "w") as outfile:
149 |             json.dump(self.model2.to_json(), outfile)
--------------------------------------------------------------------------------
/introduction_to_rl/tic_tac_toe.py:
--------------------------------------------------------------------------------
1 | # Q-Learning for TicTacToe
2 | 
3 | import numpy as np
4 | import random
5 | 
6 | 
7 | class Env():
8 |     def __init__(self, random_play):
9 |         self.random_play = random_play
10 |         self.state = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
11 |         self.reward = 0.5
12 |         self.done = 0
13 |         self.random_agent_value = "O"
14 |         self.winner = "Random Agent"
15 | 
16 |     def step(self, action, agent_value):
17 |         # action is an array like [0, 1]: the (row, column) cell to play
18 |         if(agent_value == "X" and self.control_win(self.state) == self.random_agent_value):
19 |             self.reward = 0
20 |             self.done = 1
21 | 
22 |         if(self.state[action[0]][action[1]] != 0):
23 |             print("There is something wrong: this cell is not available for playing.")
24 |         else:
25 |             self.state[action[0]][action[1]] = agent_value
26 | 
27 |         if(self.control_win(self.state) == agent_value):
28 |             self.reward = 1
29 |             self.done = 1
30 |             self.winner = "agent X"
31 |         if(self.control_win(self.state) == "DRAW"):
32 |             self.reward = 0
33 |             self.done = 1
34 |             self.winner = "DRAW"
35 |         if(self.done != 1):
36 |             if(self.random_play == 1):
37 |                 self.random_agent_act()
38 | 
39 |         return(self.state, self.reward, self.done, self.winner)
40 | 
41 |     def control_win(self, state):
42 |         for i in range(3):
43 |             if state[i][0] != 0 and state[i][0] == state[i][1] and state[i][0] == state[i][2]:
44 |                 return state[i][0]
45 |             if state[0][i] != 0 and state[0][i] == state[1][i] and state[0][i] == state[2][i]:
46 |                 return state[0][i]
47 |             if state[0][0] != 0 and state[0][0] == state[1][1] and state[0][0] == state[2][2]:
48 |                 return state[0][0]
49 |             if state[0][2] != 0 and state[0][2] == state[1][1] and state[0][2] == state[2][0]:
50 |                 return state[0][2]
51 | 
52 |         empty_spaces = []
53 |         values_of_empty_spaces = []
54 | 
55 |         for i in range(3):
56 |             for j in range(3):
57 |                 if state[i][j] == 0:
58 |                     empty_spaces.append([i, j])
59 |                     values_of_empty_spaces.append(0)
60 |         if (len(values_of_empty_spaces) == 0):
61 |             return "DRAW"
62 |         else:
63 |             return 0
64 | 
65 |     def random_agent_act(self):
66 |         empty_spaces = []
67 |         values_of_empty_spaces = []
68 | 
69 |         for i in range(3):
70 |             for j in range(3):
71 |                 if self.state[i][j] == 0:
72 |                     empty_spaces.append([i, j])
73 |                     values_of_empty_spaces.append(0)
74 | 
75 |         a = empty_spaces[random.choice(list(enumerate(values_of_empty_spaces)))[0]]
76 | 
77 |         self.state[a[0]][a[1]] = self.random_agent_value
78 | 
79 | 
80 | def learn_from_random_agent():
81 |     """
82 | 
83 |     The purpose of this function is to create an empty value function and, by playing many games,
84 |     update its contents at every played step with the rule V(s) ← V(s) + α[V(s') − V(s)].
85 |     Learning is carried out against an opponent that makes random moves.
86 | 
87 |     alpha ====> learning rate
88 |     The learning rate is the coefficient of the [V(s') − V(s)] term in each value-function update,
89 |     i.e. how much weight every new experience leaves on what has been learned so far.
90 | 
91 |     epsilon ====> random-exploration rate
92 |     The epsilon-greedy policy we use keeps us from getting stuck on the first maximum we find and lets us
93 |     explore the rewards of the environment with probability epsilon.
94 | 
95 |     value_table ====> a matrix holding one scalar value per state of the environment, describing how good it is
96 |     to be in that state. If the environment has 9 states then ===> len(value_table) = 9
97 | 
98 |     episodes =====> how many games are played during the learning phase.
99 | 
100 |     This function learns the value function by playing against the random agent.
101 |     The random agent simply plays random moves on the valid (empty) cells.
102 |     The value-function estimate is updated with ======> V(s) ← V(s) + α[V(s') − V(s)]
103 | 
104 |     action -----> since an action is written as [i, j], the action (i.e. the resulting state) is used in the value-function update.
105 |     """
106 | 
107 |     episodes = 100
108 |     value_table = [0, 0, 0, 0, 0, 0, 0, 0, 0]
109 | 
110 |     for i in range(episodes):
111 |         eps = 0.1
112 |         env = Env(random_play=1)
113 |         total_reward = 0
114 | 
115 |         done = 0
116 |         agent_value = "X"  # it must be X for random play
117 |         state = env.state
118 |         old_a = [0, 0]
119 |         alfa = 0.99
120 | 
121 |         while not done:
122 |             empty_spaces = []
123 |             values_of_empty_spaces = []
124 | 
125 |             for i in range(3):
126 |                 for j in range(3):
127 |                     if state[i][j] == 0:
128 |                         empty_spaces.append([i, j])
129 |                         values_of_empty_spaces.append(value_table[i * 3 + j])
130 | 
131 |             if np.random.random() < eps or np.sum(value_table) == 0:
132 |                 a = empty_spaces[random.choice(list(enumerate(values_of_empty_spaces)))[0]]
133 |             else:
134 |                 # select the action leading to the cell with the largest value
135 |                 a = empty_spaces[np.argmax(values_of_empty_spaces)]
136 | 
137 |             new_s, reward, done, winner = env.step(a, agent_value)
138 |             total_reward = reward + total_reward
139 | 
140 |             if (state != [[0, 0, 0], [0, 0, 0], [0, 0, 0]]):
141 |                 value_table[old_a[0] * 3 + old_a[1]] = value_table[old_a[0] * 3 + old_a[1]] + alfa*(reward - value_table[old_a[0] * 3 + old_a[1]])
142 |             else:
143 |                 value_table[a[0] * 3 + a[1]] = alfa * reward
144 | 
145 |             old_a = a
146 |             state = new_s
147 | 
148 |             if(done == 1):
149 |                 print("Winner is: " + str(winner))
150 |                 print(value_table)
151 | 
152 | 
153 | learn_from_random_agent()
154 | 
--------------------------------------------------------------------------------
/temporal_difference/CartPole.py:
--------------------------------------------------------------------------------
1 | 
2 | # Solving CartPole problem from OpenAI with Temporal Difference methods
3 | # Implementations are based on Sutton's book
4 | 
5 | import gym
6 | import numpy as np
7 | 
8 | 
9 | env = gym.make('CartPole-v0')
10 | 
11 | 
12 | # General function definitions
13 | 
14 | 
15 | def discretization(observation):
16 |     discrete = np.zeros((1, 2))
17 |     theta_bins = np.linspace(-0.42, 0.42, 20)
18 |     thetadot_bins = np.linspace(-1, 1, 10)
19 |     discrete[0][0] = np.digitize(observation[2], theta_bins)
20 |     discrete[0][1] = np.digitize(observation[3], thetadot_bins)
21 | 
22 |     return discrete.astype(np.int64)
23 | 
24 | 
25 | def make_epsilon_greedy_policy(Q, epsilon, nA):
26 |     """
27 |     Creates an epsilon-greedy policy based on a given Q-function and epsilon.
28 | 
29 |     Args:
30 |         Q: A numpy array indexed by the discretized state;
31 |             Q[theta_bin][thetadot_bin] is an array of action-values of length nA.
32 |         epsilon: The probability of selecting a random action. Float between 0 and 1.
33 |         nA: Number of actions in the environment.
34 | 
35 |     Returns:
36 |         A function that takes the discretized observation as an argument and returns
37 |         the probabilities for each action in the form of a numpy array of length nA.
38 | 
39 |     """
40 |     def policy_fn(observation):
41 |         A_probs = np.ones(nA, dtype=float) * epsilon / nA
42 |         best_action = np.argmax(Q[observation[0][0]][observation[0][1]])
43 |         A_probs[best_action] += (1.0 - epsilon)
44 |         return A_probs  # .reshape(1,-1)
45 | 
46 |     return policy_fn
47 | 
48 | 
49 | def render_games(Q, num_episodes):
50 |     scores = []
51 |     observations = []
52 |     for i_episode in range(num_episodes):
53 |         score = 0
54 |         observation = env.reset()
55 |         for t in range(200):
56 |             policy = make_epsilon_greedy_policy(Q, 0, env.action_space.n)
57 |             env.render()
58 |             state = discretization(observation)
59 |             action_probs = policy(state)
60 |             action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
61 |             observation, reward, done, info = env.step(action)
62 |             observations.append(observation)
63 |             score += reward
64 |             if done:
65 |                 break
66 |         scores.append(score)
67 |     print('Average Score:', sum(scores)/len(scores))
68 | 
69 | 
70 | # Define models
71 | 
72 | # SARSA
73 | def Sarsa(env, num_episodes, discount_factor=0.9, alpha=0.5, epsilon=0.1):
74 |     # Q table initialization
75 |     Q_sarsa = np.zeros((21, 11, env.action_space.n))
76 | 
77 |     policy = make_epsilon_greedy_policy(Q_sarsa, epsilon, env.action_space.n)
78 | 
79 |     for i_episode in range(num_episodes):
80 |         observation = env.reset()
81 |         state = discretization(observation)
82 | 
83 |         while True:
84 |             # Take a step
85 |             action_probs = policy(state)
86 |             action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
87 |             next_observation, reward, done, _ = env.step(action)
88 |             next_state = discretization(next_observation)
89 | 
90 |             # Sarsa Update
91 |             next_action_probs = policy(next_state)
92 |             next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
93 |             td_target = reward + discount_factor * Q_sarsa[next_state[0][0]][next_state[0][1]][next_action]
94 |             td_delta = td_target - Q_sarsa[state[0][0]][state[0][1]][action]
95 |             Q_sarsa[state[0][0]][state[0][1]][action] += alpha * td_delta
96 | 
97 |             if done:
98 |                 break
99 | 
100 |             state = next_state
101 | 
102 |     return Q_sarsa
103 | 
104 | 
105 | # Q Learning
106 | def Q_learning(env, num_episodes, discount_factor=0.9, alpha=0.5, epsilon=0.1):
107 |     # Q table initialization
108 |     Q = np.zeros((21, 11, env.action_space.n))
109 | 
110 |     # The policy we're following
111 |     policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
112 | 
113 |     for i_episode in range(num_episodes):
114 |         # Reset the environment and pick the first action
115 |         observation = env.reset()
116 |         state = discretization(observation)
117 | 
118 |         while True:
119 |             # Take a step
120 |             action_probs = policy(state)
121 |             action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
122 |             next_observation, reward, done, _ = env.step(action)
123 |             next_state = discretization(next_observation)
124 | 
125 |             # Q Learning Update
126 |             best_next_action = np.argmax(Q[next_state[0][0]][next_state[0][1]])
127 |             td_target = reward + discount_factor * Q[next_state[0][0]][next_state[0][1]][best_next_action]
128 |             td_delta = td_target - Q[state[0][0]][state[0][1]][action]
129 |             Q[state[0][0]][state[0][1]][action] += alpha * td_delta
130 | 
131 |             if done:
132 |                 break
133 | 
134 |             state = next_state
135 | 
136 |     return Q
137 | 
138 | 
139 | # Expected SARSA
140 | def Expected_Sarsa(env, num_episodes, discount_factor=1, alpha=0.5, epsilon=0.1):
141 |     # Q Table Initialization
142 |     Q_expsarsa = np.zeros((21, 11, env.action_space.n))
143 | 
144 |     policy = make_epsilon_greedy_policy(Q_expsarsa, epsilon, env.action_space.n)
145 | 
146 |     for i_episode in range(num_episodes):
147 |         observation = env.reset()
148 |         state = discretization(observation)
149 | 
150 |         while True:
151 |             # Take a step
152 |             action_probs = policy(state)
153 |             action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
154 |             next_observation, reward, done, _ = env.step(action)
155 |             next_state = discretization(next_observation)
156 | 
157 |             # Expected Sarsa Update
158 |             next_action_prob = policy(next_state)
159 |             expected_value = np.sum(np.multiply(Q_expsarsa[next_state[0][0]][next_state[0][1]][:], next_action_prob))
160 |             td_target = reward + discount_factor * expected_value
161 |             td_delta = td_target - Q_expsarsa[state[0][0]][state[0][1]][action]
162 |             Q_expsarsa[state[0][0]][state[0][1]][action] += alpha * td_delta
163 | 
164 |             if done:
165 |                 break
166 | 
167 |             state = next_state
168 | 
169 |     return Q_expsarsa
170 | 
171 | 
172 | print('Starting training.')
173 | 
174 | # Train
175 | Q_s = Sarsa(env, 1200)
176 | Q_q = Q_learning(env, 600)
177 | Q_es = Expected_Sarsa(env, 400)
178 | 
179 | print('Training done.')
180 | print('Starting testing.')
181 | 
182 | # Test
183 | render_games(Q_s, 5)
184 | render_games(Q_q, 5)
185 | render_games(Q_es, 5)
186 | 
187 | print('Testing done.')
188 | 
--------------------------------------------------------------------------------
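As a self-contained illustration of the Expected SARSA target used in Expected_Sarsa above (the bootstrap term is the epsilon-greedy expectation of Q(s', ·) rather than a sampled or maximal action value), the sketch below recomputes it for made-up numbers; the helper name and the values are illustrative and not part of the repository.

# Illustrative helper (not part of the repo): the Expected SARSA target
# target = r + gamma * sum_a pi(a | s') * Q(s', a), with pi epsilon-greedy in Q(s', .)
import numpy as np

def expected_sarsa_target(reward, q_next, epsilon, discount_factor):
    n_actions = len(q_next)
    probs = np.ones(n_actions) * epsilon / n_actions   # exploration mass spread over all actions
    probs[np.argmax(q_next)] += 1.0 - epsilon          # remaining mass on the greedy action
    return reward + discount_factor * np.dot(probs, q_next)

# With q_next = [1.0, 3.0], epsilon = 0.1 and discount_factor = 0.9:
# pi = [0.05, 0.95], E[Q(s', .)] = 0.05*1.0 + 0.95*3.0 = 2.9, target = 1.0 + 0.9*2.9 = 3.61
print(expected_sarsa_target(1.0, np.array([1.0, 3.0]), 0.1, 0.9))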