├── README.md
├── agents.py
├── cem.py
└── test.py

/README.md:
--------------------------------------------------------------------------------
# ReinforcementLearning

This repo contains basic algorithms/agents used for reinforcement learning. More specifically, you can find here:

- MC control
- Q-learning
- SARSA
- Cross Entropy Method

## Tests

I tested the agents on the OpenAI Gym CartPole-v0 environment, measuring how long it takes to solve the environment (an average reward of at least 195 over 100 consecutive episodes). The maximum number of episodes was 1000, and each learning procedure was run 100 times.

| Algorithm | No. trials without solving | Mean no. episodes to solve | Median no. episodes to solve | Minimum no. episodes to solve |
| ------ | ------: | ------: | ------: | ------: |
| MC | 28 | 414 | 160 | 91 |
| Q-learning | 0 | 394 | 389 | 42 |
| SARSA | 1 | 403 | 348 | 47 |
| CEM | 25 | 271 | 11 | 1 |
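
The solve check itself is easy to reproduce; here is a minimal sketch (the helper name `solved` and the assumption that per-episode total rewards are collected in a list are mine, not part of the scripts in this repo):

```python
import numpy as np

def solved(episode_rewards, target=195.0, window=100):
    """Return True once the mean total reward over the last `window` episodes reaches `target`."""
    if len(episode_rewards) < window:
        return False
    return float(np.mean(episode_rewards[-window:])) >= target
```

A run that never passes this check within the 1000-episode budget counts toward the "No. trials without solving" column above.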
--------------------------------------------------------------------------------
/agents.py:
--------------------------------------------------------------------------------
import numpy as np
import itertools
import scipy.signal


def discount(x, gamma):
    # Discounted cumulative sum of x, computed right-to-left with an IIR filter.
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1])[::-1]


class Agent:
    '''
    Parent class for all agents, since they share storing experience and policy 'evaluation'.
    Since all agents assign Q values to precise states, I cannot use continuous states and
    need to discretize them. The functions create_bins() and discretize() take care of that.
    '''

    def create_bins(self, low, high, count):
        # Clamp unbounded dimensions so the bins cover a reasonable range.
        if low < -10.0:
            low = -10.0
        if high > 10.0:
            high = 10.0
        bins = np.arange(low, high, (high - low) / count)
        return bins

    def discretize(self, values, bins):
        # Map each continuous value to the left edge of the bin it falls into.
        values = values.flatten()
        discretized = []
        for i, v in enumerate(values):
            index = np.digitize(v, bins[i])
            discretized.append(bins[i][np.maximum(index - 1, 0)])
        return np.array(discretized)

    def init(self, env, init_value=0):
        '''
        init_value controls the exploration behavior of the agent - if the initial
        estimates of the Q values are overly positive, the agent will prefer actions
        it has never tried.
        '''
        self.env = env
        self.observation_bins = []
        for low, high in zip(env.observation_space.low, env.observation_space.high):
            self.observation_bins.append(self.create_bins(low, high, 8))

        self.Q = {}
        for state in itertools.product(*self.observation_bins):
            for a in range(env.action_space.n):
                self.Q[tuple(state), a] = init_value

        self.sars = []

    def experience(self, observation, action, reward, next_observation):
        state = self.discretize(observation, self.observation_bins)
        next_state = self.discretize(next_observation, self.observation_bins)
        self.sars.append([state, action, reward, next_state])

    def get_action(self, observation):
        # Greedy action selection with random tie-breaking among maximal Q values.
        state = self.discretize(observation, self.observation_bins)
        q_vals = [self.Q[tuple(state), a] for a in range(self.env.action_space.n)]
        max_actions = np.argwhere(q_vals == np.amax(q_vals))
        max_actions = np.squeeze(max_actions, 1)
        return max_actions[np.random.randint(max_actions.shape[0])]


class QAgent(Agent):
    '''
    Agent implementing the Q-learning algorithm.
    '''
    def __init__(self, env, init_value=0):
        self.init(env, init_value)

    def learn(self, l_rate):
        for [s, a, r, ns] in self.sars:
            maxQ = np.array([self.Q[tuple(ns), na] for na in range(self.env.action_space.n)]).max()
            self.Q[tuple(s), a] = self.Q[tuple(s), a] + l_rate * (r + maxQ - self.Q[tuple(s), a])
        self.sars = []


class SarsaAgent(Agent):
    '''
    Agent implementing the SARSA algorithm.
    '''
    def __init__(self, env, init_value=0):
        self.init(env, init_value)

    def learn(self, l_rate):
        for i, [s, a, r, ns] in enumerate(self.sars):
            # Use the action actually taken in the next step; for the final
            # transition fall back to the greedy action in the next state.
            na = self.sars[i + 1][1] if i < len(self.sars) - 1 else self.get_action(ns)
            self.Q[tuple(s), a] = self.Q[tuple(s), a] + l_rate * (r + self.Q[tuple(ns), na] - self.Q[tuple(s), a])
        self.sars = []


class MCAgent(Agent):
    '''
    Agent implementing Monte Carlo Q value estimation.
    '''
    def __init__(self, env, init_value=0):
        self.init(env, init_value)

    def learn(self, l_rate):
        # Replace per-step rewards with (undiscounted) returns, then move each
        # Q value towards the observed return.
        rewards = np.array([sar[2] for sar in self.sars])
        cumulative_rewards = discount(rewards, 1.0)
        self.sars = [[s[0], s[1], c, s[3]] for s, c in zip(self.sars, cumulative_rewards)]
        for [s, a, r, _] in self.sars:
            self.Q[tuple(s), a] = self.Q[tuple(s), a] + l_rate * (r - self.Q[tuple(s), a])
        self.sars = []
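

# Illustrative sketch (not part of the original agents.py): what the discretization
# helpers above produce for a single CartPole observation, and how the result is
# used as a key into the tabular Q dictionary. The printed values depend on the
# random initial state of the environment.
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v0')
    agent = QAgent(env)
    observation = env.reset()
    state = agent.discretize(observation, agent.observation_bins)
    # 'state' holds one bin edge per observation dimension; together with an
    # action index it forms a key of agent.Q.
    print('Raw observation:   {}'.format(observation))
    print('Discretized state: {}'.format(state))
    print('Q[state, 0] = {}'.format(agent.Q[tuple(state), 0]))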
--------------------------------------------------------------------------------
/cem.py:
--------------------------------------------------------------------------------
import numpy as np
import gym

env = gym.make('CartPole-v0')


# return parameters drawn from a Gaussian distribution with the specified means and
# standard deviations; the parameters specify an affine transformation and are
# flattened into a single vector
def get_random_theta(means, stddev):
    theta = [np.random.normal(m, s, 1) for m, s in zip(means, stddev)]
    return np.reshape(np.array(theta), (-1))


# compute the best action given parameters and state
def get_action(theta, state, action_count):
    W = np.reshape(theta[action_count:], (action_count, state.size))
    b = theta[0:action_count]
    a = np.dot(W, state) + b
    return np.argmax(a)


def episode(theta, render=True):
    observations = env.reset()
    total_reward = 0
    for _ in range(1000):
        if render:
            env.render()
        a = get_action(theta, observations, env.action_space.n)
        observations, reward, done, _ = env.step(a)
        total_reward += reward
        if done:
            break
    return total_reward


# initial mean and standard deviation for the parameters
theta_size = (env.observation_space.shape[0] + 1) * env.action_space.n
theta_mean = np.zeros(theta_size)
theta_stddev = np.ones(theta_size)

for step in range(100):
    # sample a new population of parameter vectors
    thetas = [get_random_theta(theta_mean, theta_stddev) for _ in range(10)]
    # evaluate each member of the population
    rewards = [episode(theta, render=False) for theta in thetas]

    # compute the weights used to derive the next mean and stddev
    total_reward = np.array(rewards).sum()
    weights = np.array(rewards) / total_reward

    # the new mean is a weighted average of the population parameters
    theta_mean = np.average(np.array(thetas), 0, weights=weights)

    # the new stddev is a "weighted" standard deviation around the new mean
    theta_variance = np.sum([(thetas[i] - theta_mean)**2 * weights[i] for i in range(10)], 0)
    theta_stddev = np.sqrt(theta_variance)

    # evaluate the current mean parameters
    reward = episode(theta_mean, render=False)
    print('Reward at step {}: {}'.format(step, reward))
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import gym
from agents import SarsaAgent, QAgent, MCAgent

env = gym.make('CartPole-v0')

initial_value = 300.0
# solver = SarsaAgent(env, initial_value)
# solver = QAgent(env, initial_value)
solver = MCAgent(env, initial_value)

for i in range(1000):
    done = False
    total_reward = 0
    observation = env.reset()
    while not done:
        action = solver.get_action(observation)
        next_observation, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            # penalize the terminal transition so the agents learn to avoid failing early
            reward = -initial_value

        solver.experience(observation, action, reward, next_observation)
        observation = next_observation

        if done:
            solver.learn(0.1)
            print('Iteration: {}; Reward: {}'.format(i, total_reward))
            break
--------------------------------------------------------------------------------