├── README.md
├── agents.py
├── cem.py
└── test.py

/README.md:
--------------------------------------------------------------------------------
# ReinforcementLearning

This repo contains basic algorithms/agents used for reinforcement learning. More specifically, you can find here:

- MC control
- Q-learning
- SARSA
- Cross Entropy Method

## Tests

I tested the agents on the OpenAI Gym CartPole-v0 environment, measuring how long it takes to solve the environment (an average reward of at least 195 over 100 consecutive episodes). The maximum number of episodes was 1000, and each learning procedure was run 100 times.

| Algorithm | No. trials without solving | Mean no. episodes to solve | Median no. episodes to solve | Minimum no. episodes to solve |
| ------ | ------: | ------: | ------: | ------: |
| MC | 28 | 414 | 160 | 91 |
| Q-learning | 0 | 394 | 389 | 42 |
| SARSA | 1 | 403 | 348 | 47 |
| CEM | 25 | 271 | 11 | 1 |
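
The solve check itself is easy to reproduce; here is a minimal sketch (the helper name `solved` and the assumption that per-episode total rewards are collected in a list are mine, not part of the scripts in this repo):

```python
import numpy as np

def solved(episode_rewards, target=195.0, window=100):
    """Return True once the mean total reward over the last `window` episodes reaches `target`."""
    if len(episode_rewards) < window:
        return False
    return float(np.mean(episode_rewards[-window:])) >= target
```

A run that never passes this check within the 1000-episode budget counts toward the "No. trials without solving" column above.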
--------------------------------------------------------------------------------
/agents.py:
--------------------------------------------------------------------------------
import numpy as np
import itertools
import scipy.signal


def discount(x, gamma):
    # Discounted cumulative sum of x, computed right-to-left with an IIR filter.
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1])[::-1]


class Agent:
    '''
    Parent class for all agents, since they share storing experience and policy 'evaluation'.
    Since all agents assign Q values to precise states, I cannot use continuous states and
    need to discretize them. The functions create_bins() and discretize() take care of that.
    '''

    def create_bins(self, low, high, count):
        # Clamp unbounded dimensions so the bins cover a reasonable range.
        if low < -10.0:
            low = -10.0
        if high > 10.0:
            high = 10.0
        bins = np.arange(low, high, (high - low) / count)
        return bins

    def discretize(self, values, bins):
        # Map each continuous value to the left edge of the bin it falls into.
        values = values.flatten()
        discretized = []
        for i, v in enumerate(values):
            index = np.digitize(v, bins[i])
            discretized.append(bins[i][np.maximum(index - 1, 0)])
        return np.array(discretized)

    def init(self, env, init_value=0):
        '''
        init_value controls the exploration behavior of the agent - if the initial
        estimates of the Q values are overly positive, the agent will prefer actions
        it has never tried.
        '''
        self.env = env
        self.observation_bins = []
        for low, high in zip(env.observation_space.low, env.observation_space.high):
            self.observation_bins.append(self.create_bins(low, high, 8))

        self.Q = {}
        for state in itertools.product(*self.observation_bins):
            for a in range(env.action_space.n):
                self.Q[tuple(state), a] = init_value

        self.sars = []

    def experience(self, observation, action, reward, next_observation):
        state = self.discretize(observation, self.observation_bins)
        next_state = self.discretize(next_observation, self.observation_bins)
        self.sars.append([state, action, reward, next_state])

    def get_action(self, observation):
        # Greedy action selection with random tie-breaking among maximal Q values.
        state = self.discretize(observation, self.observation_bins)
        q_vals = [self.Q[tuple(state), a] for a in range(self.env.action_space.n)]
        max_actions = np.argwhere(q_vals == np.amax(q_vals))
        max_actions = np.squeeze(max_actions, 1)
        return max_actions[np.random.randint(max_actions.shape[0])]


class QAgent(Agent):
    '''
    Agent implementing the Q-learning algorithm.
    '''
    def __init__(self, env, init_value=0):
        self.init(env, init_value)

    def learn(self, l_rate):
        for [s, a, r, ns] in self.sars:
            maxQ = np.array([self.Q[tuple(ns), na] for na in range(self.env.action_space.n)]).max()
            self.Q[tuple(s), a] = self.Q[tuple(s), a] + l_rate * (r + maxQ - self.Q[tuple(s), a])
        self.sars = []


class SarsaAgent(Agent):
    '''
    Agent implementing the SARSA algorithm.
    '''
    def __init__(self, env, init_value=0):
        self.init(env, init_value)

    def learn(self, l_rate):
        for i, [s, a, r, ns] in enumerate(self.sars):
            # Use the action actually taken in the next step; for the final
            # transition fall back to the greedy action in the next state.
            na = self.sars[i + 1][1] if i < len(self.sars) - 1 else self.get_action(ns)
            self.Q[tuple(s), a] = self.Q[tuple(s), a] + l_rate * (r + self.Q[tuple(ns), na] - self.Q[tuple(s), a])
        self.sars = []


class MCAgent(Agent):
    '''
    Agent implementing Monte Carlo Q value estimation.
    '''
    def __init__(self, env, init_value=0):
        self.init(env, init_value)

    def learn(self, l_rate):
        # Replace per-step rewards with (undiscounted) returns, then move each
        # Q value towards the observed return.
        rewards = np.array([sar[2] for sar in self.sars])
        cumulative_rewards = discount(rewards, 1.0)
        self.sars = [[s[0], s[1], c, s[3]] for s, c in zip(self.sars, cumulative_rewards)]
        for [s, a, r, _] in self.sars:
            self.Q[tuple(s), a] = self.Q[tuple(s), a] + l_rate * (r - self.Q[tuple(s), a])
        self.sars = []
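

# Illustrative sketch (not part of the original agents.py): what the discretization
# helpers above produce for a single CartPole observation, and how the result is
# used as a key into the tabular Q dictionary. The printed values depend on the
# random initial state of the environment.
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v0')
    agent = QAgent(env)
    observation = env.reset()
    state = agent.discretize(observation, agent.observation_bins)
    # 'state' holds one bin edge per observation dimension; together with an
    # action index it forms a key of agent.Q.
    print('Raw observation:   {}'.format(observation))
    print('Discretized state: {}'.format(state))
    print('Q[state, 0] = {}'.format(agent.Q[tuple(state), 0]))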
--------------------------------------------------------------------------------
/cem.py:
--------------------------------------------------------------------------------
import numpy as np
import gym

env = gym.make('CartPole-v0')


# return parameters drawn from a Gaussian distribution with the specified means and
# standard deviations; the parameters specify an affine transformation and are
# flattened into a single vector
def get_random_theta(means, stddev):
    theta = [np.random.normal(m, s, 1) for m, s in zip(means, stddev)]
    return np.reshape(np.array(theta), (-1))


# compute the best action given parameters and state
def get_action(theta, state, action_count):
    W = np.reshape(theta[action_count:], (action_count, state.size))
    b = theta[0:action_count]
    a = np.dot(W, state) + b
    return np.argmax(a)


def episode(theta, render=True):
    observations = env.reset()
    total_reward = 0
    for _ in range(1000):
        if render:
            env.render()
        a = get_action(theta, observations, env.action_space.n)
        observations, reward, done, _ = env.step(a)
        total_reward += reward
        if done:
            break
    return total_reward


# initial mean and standard deviation for the parameters
theta_size = (env.observation_space.shape[0] + 1) * env.action_space.n
theta_mean = np.zeros(theta_size)
theta_stddev = np.ones(theta_size)

for step in range(100):
    # sample a new population of parameter vectors
    thetas = [get_random_theta(theta_mean, theta_stddev) for _ in range(10)]
    # evaluate each member of the population
    rewards = [episode(theta, render=False) for theta in thetas]

    # compute the weights used to derive the next mean and stddev
    total_reward = np.array(rewards).sum()
    weights = np.array(rewards) / total_reward

    # the new mean is a weighted average of the population parameters
    theta_mean = np.average(np.array(thetas), 0, weights=weights)

    # the new stddev is a "weighted" standard deviation around the new mean
    theta_variance = np.sum([(thetas[i] - theta_mean)**2 * weights[i] for i in range(10)], 0)
    theta_stddev = np.sqrt(theta_variance)

    # evaluate the current mean parameters
    reward = episode(theta_mean, render=False)
    print('Reward at step {}: {}'.format(step, reward))
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import gym
from agents import SarsaAgent, QAgent, MCAgent

env = gym.make('CartPole-v0')

initial_value = 300.0
# solver = SarsaAgent(env, initial_value)
# solver = QAgent(env, initial_value)
solver = MCAgent(env, initial_value)

for i in range(1000):
    done = False
    total_reward = 0
    observation = env.reset()
    while not done:
        action = solver.get_action(observation)
        next_observation, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            # penalize the terminal transition so the agents learn to avoid failing early
            reward = -initial_value

        solver.experience(observation, action, reward, next_observation)
        observation = next_observation

        if done:
            solver.learn(0.1)
            print('Iteration: {}; Reward: {}'.format(i, total_reward))
            break
--------------------------------------------------------------------------------