├── gym_cartpolemod
│   ├── envs
│   │   ├── __init__.py
│   │   └── cartpolemod_env.py
│   └── __init__.py
├── .gitignore
├── setup.py
├── example_files
│   ├── simple_test.py
│   ├── test2.py
│   ├── dqn.py
│   ├── ddqn.py
│   ├── test.py
│   └── deepQNetwork.py
└── README.md
/gym_cartpolemod/envs/__init__.py:
--------------------------------------------------------------------------------
1 | from gym_cartpolemod.envs.cartpolemod_env import CartPoleModEnv
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | .gitupload
3 | gym_cartpolemod.egg-info/
4 | gym_cartpolemod/__pycache__/
5 | gym_cartpolemod/envs/__pycache__/
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup(name='gym_cartpolemod',
4 |       version='0.2.0',
5 |       install_requires=['gym', 'keras', 'numpy']
6 | )
7 | 
--------------------------------------------------------------------------------
/example_files/simple_test.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import gym_cartpolemod
3 | env = gym.make('CartPoleMod-v0')
4 | for i_episode in range(20):
5 |     observation = env.reset()
6 |     for t in range(100):
7 |         env.render()
8 |         print(observation)
9 |         action = env.action_space.sample()
10 |         observation, reward, done, info = env.step(action)
11 |         if done:
12 |             print("Episode finished after {} timesteps".format(t+1))
13 |             break
14 | 
--------------------------------------------------------------------------------
/gym_cartpolemod/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from gym.envs.registration import register
3 | 
4 | logger = logging.getLogger(__name__)
5 | 
6 | register(
7 |     id='CartPoleMod-v0',
8 |     entry_point='gym_cartpolemod.envs:CartPoleModEnv',
9 |     kwargs={'case':1},
10 | )
11 | register(
12 |     id='CartPoleMod-v1',
13 |     entry_point='gym_cartpolemod.envs:CartPoleModEnv',
14 |     kwargs={'case':2},
15 | )
16 | register(
17 |     id='CartPoleMod-v2',
18 |     entry_point='gym_cartpolemod.envs:CartPoleModEnv',
19 |     kwargs={'case':3},
20 | )
21 | register(
22 |     id='CartPoleMod-v3',
23 |     entry_point='gym_cartpolemod.envs:CartPoleModEnv',
24 |     kwargs={'case':4},
25 | )
26 | register(
27 |     id='CartPoleMod-v4',
28 |     entry_point='gym_cartpolemod.envs:CartPoleModEnv',
29 |     kwargs={'case':5},
30 | )
31 | register(
32 |     id='CartPoleMod-v5',
33 |     entry_point='gym_cartpolemod.envs:CartPoleModEnv',
34 |     kwargs={'case':6},
35 | )
36 | register(
37 |     id='CartPoleMod-v6',
38 |     entry_point='gym_cartpolemod.envs:CartPoleModEnv',
39 |     kwargs={'case':7},
40 | )
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This repository contains a pip package with a modified version of the OpenAI Gym
2 | CartPole-v0 environment that adds cart and pole friction as well as random sensor and actuator noise.
3 | 
4 | 
5 | ## Installation
6 | 
7 | Install [OpenAI gym](https://gym.openai.com/docs/).
8 | 
9 | Then install this package via
10 | 
11 | ```
12 | git clone http://github.com/AadityaPatanjali/gym-cartpolemod.git
13 | cd gym-cartpolemod
14 | sudo pip install -e .
15 | ```
16 | 
17 | ## Usage
18 | Python usage:
19 | ```
20 | import gym
21 | import gym_cartpolemod
22 | 
23 | env = gym.make('CartPoleMod-v0')
24 | ```
25 | Examples:
26 | Versions go from v0 through v6 and correspond to the noise scenarios listed below.
27 | ```
28 | cd example_files
29 | python deepQNetwork.py v1
30 | ```
31 | 
32 | ## The Environment
33 | 
34 | Some parameters for the cart-pole system:
35 | - mass of the cart = 1.0
36 | - mass of the pole = 0.1
37 | - half-length of the pole = 0.5
38 | - magnitude of the force = 10.0
39 | - friction at the cart = 5e-4
40 | - friction at the pole = 2e-6
41 | 
42 | Noise cases (v0 to v6):
43 | - v0: Noise free
44 | - v1: 5% uniform actuator noise
45 | - v2: 10% uniform actuator noise
46 | - v3: 5% uniform sensor noise
47 | - v4: 10% uniform sensor noise
48 | - v5: Gaussian sensor noise with variance 0.1
49 | - v6: Gaussian sensor noise with variance 0.2
50 | 
51 | Note: Sensor noise is added only to the pole angle theta; actuator noise is added to the applied force.
52 | 
53 | Some neural network parameters (see the sketch below):
54 | - Discount rate : gamma = 0.95
55 | - Exploration rate : epsilon = 1.0
56 | - Exploration decay : 90%
57 | - Learning rate : 0.01
58 | - NN input layer : 24 neurons, tanh activation function
59 | - NN hidden layer : 48 neurons, tanh activation function
60 | - NN output layer : 2 neurons, linear activation function
61 | - MSE loss function with the Adam optimizer
62 | - Seed : None (in both deepQNetwork.py and cartpolemod_env.py)
63 | 
64 | Note: To set a seed, uncomment line 17 of deepQNetwork.py and change the default seed on line 76 of cartpolemod_env.py to the desired value.
65 | 
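For reference, a condensed sketch of how this Q-network is built in example_files/deepQNetwork.py, with the 4-dimensional state and the 2 discrete actions written out as literals:

```
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

model = Sequential()
model.add(Dense(24, input_dim=4, activation='tanh'))  # input layer: 4 state variables -> 24 tanh units
model.add(Dense(48, activation='tanh'))               # hidden layer: 48 tanh units
model.add(Dense(2, activation='linear'))              # output layer: one Q-value per action
model.compile(loss='mse', optimizer=Adam(lr=0.01))    # MSE loss, Adam optimizer, learning rate 0.01
```

The linear output layer has one unit per action, so the network predicts a Q-value for each of the two force directions.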
66 | Notations:
67 | - 1 run : 1000 trials
68 | - 1 trial : 60,000 time steps
69 | - 1 time step : 0.02 s
70 | - Success rate : number of successful runs out of 100 runs
71 | - Successful trial : the system lasts 60,000 time steps without failing
72 | - Successful run : at least one successful trial within 1000 trials
73 | - Average trials : mean number of trials until the first successful trial, averaged over 100 runs
74 | - Execution time : approximate average execution time over 100 runs
75 | 
76 | ## Results
77 | |CartPoleMod version|Noise type|Success rate|Average trials|Execution time|
78 | |:---:|:---:|:---:|:---:|:---:|
79 | |v0|Noise free|100%|119.68|~ 4.5 hours|
80 | |v1|5% uniform actuator noise|100%|118.88|~ 4.5 hours|
81 | |v2|10% uniform actuator noise|100%|130.47|~ 4.5 hours|
82 | |v3|5% uniform sensor noise|100%|123.93|~ 5.5 hours|
83 | |v4|10% uniform sensor noise|100%|133.67|~ 5.5 hours|
84 | |v5|σ^2 = 0.1 Gaussian sensor noise|100%|179.36|~ 6.5 hours|
85 | |v6|σ^2 = 0.2 Gaussian sensor noise|98%|209.59|~ 8 hours|
86 | 
87 | ## The team
88 | - Aaditya Ravindran
89 | - Apoorva Sonavani
90 | - Rohith Krishna Gopi
91 | 
92 | This was created as part of Prof.
Jennie Si's class on Artificial Neural Computation (Fall 2017) at Arizona State University 93 | 94 | Special thanks to [MartinThoma](https://github.com/MartinThoma/banana-gym), [Kevin Frans](https://github.com/kvfrans/openai-cartpole), and [keon](https://keon.io/deep-q-learning/) 95 | -------------------------------------------------------------------------------- /example_files/test2.py: -------------------------------------------------------------------------------- 1 | # Inspired by https://keon.io/deep-q-learning/ 2 | 3 | import random 4 | import gym 5 | import math 6 | import numpy as np 7 | from collections import deque 8 | from keras.models import Sequential 9 | from keras.layers import Dense 10 | from keras.optimizers import Adam 11 | import gym_cartpolemod 12 | 13 | class DQNCartPoleSolver(): 14 | def __init__(self, n_episodes=1000, n_win_ticks=195, max_env_steps=None, gamma=1.0, epsilon=1.0, epsilon_min=0.01, epsilon_log_decay=0.995, alpha=0.01, alpha_decay=0.01, batch_size=64, monitor=False, quiet=False): 15 | self.memory = deque(maxlen=100000) 16 | self.env = gym.make('CartPoleMod-v0') 17 | if monitor: self.env = gym.wrappers.Monitor(self.env, '../data/cartpole-1', force=True) 18 | self.gamma = gamma 19 | self.epsilon = epsilon 20 | self.epsilon_min = epsilon_min 21 | self.epsilon_decay = epsilon_log_decay 22 | self.alpha = alpha 23 | self.alpha_decay = alpha_decay 24 | self.n_episodes = n_episodes 25 | self.n_win_ticks = n_win_ticks 26 | self.batch_size = batch_size 27 | self.quiet = quiet 28 | if max_env_steps is not None: self.env._max_episode_steps = max_env_steps 29 | 30 | # Init model 31 | self.model = Sequential() 32 | self.model.add(Dense(24, input_dim=4, activation='tanh')) 33 | self.model.add(Dense(48, activation='tanh')) 34 | self.model.add(Dense(2, activation='linear')) 35 | self.model.compile(loss='mse', optimizer=Adam(lr=self.alpha, decay=self.alpha_decay)) 36 | 37 | def remember(self, state, action, reward, next_state, done): 38 | self.memory.append((state, action, reward, next_state, done)) 39 | 40 | def choose_action(self, state, epsilon): 41 | return self.env.action_space.sample() if (np.random.random() <= epsilon) else np.argmax(self.model.predict(state)) 42 | 43 | def get_epsilon(self, t): 44 | return max(self.epsilon_min, min(self.epsilon, 1.0 - math.log10((t + 1) * self.epsilon_decay))) 45 | 46 | def preprocess_state(self, state): 47 | return np.reshape(state, [1, 4]) 48 | 49 | def replay(self, batch_size): 50 | x_batch, y_batch = [], [] 51 | minibatch = random.sample( 52 | self.memory, min(len(self.memory), batch_size)) 53 | for state, action, reward, next_state, done in minibatch: 54 | y_target = self.model.predict(state) 55 | y_target[0][action] = reward if done else reward + self.gamma * np.max(self.model.predict(next_state)[0]) 56 | x_batch.append(state[0]) 57 | y_batch.append(y_target[0]) 58 | 59 | self.model.fit(np.array(x_batch), np.array(y_batch), batch_size=len(x_batch), verbose=0) 60 | if self.epsilon > self.epsilon_min: 61 | self.epsilon *= self.epsilon_decay 62 | 63 | def run(self): 64 | scores = deque(maxlen=100) 65 | 66 | for e in range(self.n_episodes): 67 | state = self.preprocess_state(self.env.reset()) 68 | done = False 69 | i = 0 70 | while not done: 71 | action = self.choose_action(state, self.get_epsilon(e)) 72 | next_state, reward, done, _ = self.env.step(action) 73 | next_state = self.preprocess_state(next_state) 74 | self.remember(state, action, reward, next_state, done) 75 | state = next_state 76 | i += 1 77 | 78 | 
scores.append(i) 79 | mean_score = np.mean(scores) 80 | if mean_score >= self.n_win_ticks and e >= 100: 81 | if not self.quiet: print('Ran {} episodes. Solved after {} trials ✔'.format(e, e - 100)) 82 | return e - 100 83 | if e % 100 == 0 and not self.quiet: 84 | print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks.'.format(e, mean_score)) 85 | 86 | self.replay(self.batch_size) 87 | 88 | if not self.quiet: print('Did not solve after {} episodes 😞'.format(e)) 89 | return e 90 | 91 | if __name__ == '__main__': 92 | agent = DQNCartPoleSolver() 93 | agent.run() -------------------------------------------------------------------------------- /example_files/dqn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import random 4 | import gym 5 | import numpy as np 6 | from collections import deque 7 | from keras.models import Sequential 8 | from keras.layers import Dense 9 | from keras.optimizers import Adam 10 | import gym_cartpolemod 11 | 12 | EPISODES = 1000 13 | 14 | 15 | class DQNAgent: 16 | def __init__(self, state_size, action_size,envName): 17 | self.state_size = state_size 18 | self.action_size = action_size 19 | self.memory = deque(maxlen=2000) 20 | self.gamma = 0.95 # discount rate 21 | self.epsilon = 1.0 # exploration rate 22 | self.epsilon_min = 0.01 23 | self.epsilon_decay = 0.995 24 | self.learning_rate = 0.001 25 | self.model = self._build_model() 26 | self.envName = envName 27 | 28 | def _build_model(self): 29 | # Neural Net for Deep-Q learning Model 30 | model = Sequential() 31 | model.add(Dense(24, input_dim=self.state_size, activation='relu')) 32 | model.add(Dense(24, activation='relu')) 33 | model.add(Dense(self.action_size, activation='linear')) 34 | print('Compiling Neural Network...') 35 | model.compile(loss='mse', 36 | optimizer=Adam(lr=self.learning_rate)) 37 | return model 38 | 39 | def set_epsilon(self,epsilon): 40 | # Set exploration rate 41 | self.epsilon = epsilon 42 | 43 | def remember(self, state, action, reward, next_state, done): 44 | self.memory.append((state, action, reward, next_state, done)) 45 | 46 | def act(self, state): 47 | if np.random.rand() <= self.epsilon: 48 | return random.randrange(self.action_size) 49 | act_values = self.model.predict(state) 50 | return np.argmax(act_values[0]) # returns action 51 | 52 | def replay(self, batch_size): 53 | minibatch = random.sample(self.memory, batch_size) 54 | for state, action, reward, next_state, done in minibatch: 55 | target = reward 56 | if not done: 57 | target = (reward + self.gamma * 58 | np.amax(self.model.predict(next_state)[0])) 59 | target_f = self.model.predict(state) 60 | target_f[0][action] = target 61 | self.model.fit(state, target_f, epochs=1, verbose=0) 62 | if self.epsilon > self.epsilon_min: 63 | self.epsilon *= self.epsilon_decay 64 | 65 | def load(self, name): 66 | self.model.load_weights(name) 67 | 68 | def save(self, name): 69 | self.model.save_weights(name) 70 | 71 | def main(self,useMem=False,explore=True): 72 | fileName = "./save/"+self.envName+".h5" 73 | try: 74 | if useMem: 75 | self.load(fileName) 76 | print('\n\n\nLoaded Cartpole weights') 77 | if not explore: 78 | self.set_epsilon(0.01) 79 | except: 80 | pass 81 | done = False 82 | batch_size = 32 83 | scores = deque(maxlen=100) 84 | 85 | for e in range(EPISODES): 86 | state = env.reset() 87 | state = np.reshape(state, [1, state_size]) 88 | for time in range(500): 89 | # env.render() 90 | action = self.act(state) 91 | next_state, 
reward, done, _ = env.step(action) 92 | reward = reward if not done else -10 93 | next_state = np.reshape(next_state, [1, state_size]) 94 | self.remember(state, action, reward, next_state, done) 95 | state = next_state 96 | if done: 97 | scores.append(time) 98 | mean_score = np.mean(scores) 99 | # print("episode: {}/{}, score: {}, mean_score: {}, e: {:.2}".format(e, EPISODES, time, mean_score, agent.epsilon)) 100 | if mean_score >= 200 and e >= 100: 101 | print('\t\t\tRan {} episodes. Solved after {} trials ✔'.format(e, e - 100)) 102 | return e-100 103 | 104 | if e % 100 == 0: 105 | print('\t\t\t[Episode {}] - Mean survival time over last 100 episodes was {} ticks. e: {}'.format(e, mean_score,self.epsilon)) 106 | break 107 | 108 | if len(self.memory) > batch_size: 109 | self.replay(batch_size) 110 | if e % 10 == 0: 111 | if useMem: self.save(fileName) 112 | 113 | if __name__ == "__main__": 114 | print('Making CartpoleMod environment') 115 | envName = 'CartPoleMod-'+sys.argv[1] 116 | env = gym.make(envName) 117 | state_size = env.observation_space.shape[0] 118 | action_size = env.action_space.n 119 | agent = DQNAgent(state_size, action_size, envName) 120 | agent.main(useMem=True) -------------------------------------------------------------------------------- /example_files/ddqn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | import gym 4 | import numpy as np 5 | from collections import deque 6 | from keras.models import Sequential 7 | from keras.layers import Dense 8 | from keras.optimizers import Adam 9 | from keras import backend as K 10 | import gym_cartpolemod 11 | 12 | EPISODES = 5000 13 | 14 | 15 | class DQNAgent: 16 | def __init__(self, state_size, action_size): 17 | self.state_size = state_size 18 | self.action_size = action_size 19 | self.memory = deque(maxlen=2000) 20 | self.gamma = 0.95 # discount rate 21 | self.epsilon = 1.0 # exploration rate 22 | self.epsilon_min = 0.01 23 | self.epsilon_decay = 0.99 24 | self.learning_rate = 0.001 25 | self.model = self._build_model() 26 | self.target_model = self._build_model() 27 | self.update_target_model() 28 | 29 | def _huber_loss(self, target, prediction): 30 | # sqrt(1+error^2)-1 31 | error = prediction - target 32 | return K.mean(K.sqrt(1+K.square(error))-1, axis=-1) 33 | 34 | def _build_model(self): 35 | # Neural Net for Deep-Q learning Model 36 | model = Sequential() 37 | model.add(Dense(24, input_dim=self.state_size, activation='relu')) 38 | model.add(Dense(24, activation='relu')) 39 | model.add(Dense(self.action_size, activation='linear')) 40 | model.compile(loss=self._huber_loss, 41 | optimizer=Adam(lr=self.learning_rate)) 42 | return model 43 | 44 | def update_target_model(self): 45 | # copy weights from model to target_model 46 | self.target_model.set_weights(self.model.get_weights()) 47 | 48 | def remember(self, state, action, reward, next_state, done): 49 | self.memory.append((state, action, reward, next_state, done)) 50 | 51 | def act(self, state): 52 | if np.random.rand() <= self.epsilon: 53 | return random.randrange(self.action_size) 54 | act_values = self.model.predict(state) 55 | return np.argmax(act_values[0]) # returns action 56 | 57 | def replay(self, batch_size): 58 | minibatch = random.sample(self.memory, batch_size) 59 | for state, action, reward, next_state, done in minibatch: 60 | target = self.model.predict(state) 61 | if done: 62 | target[0][action] = reward 63 | else: 64 | a = self.model.predict(next_state)[0] 65 | t = 
self.target_model.predict(next_state)[0] 66 | target[0][action] = reward + self.gamma * t[np.argmax(a)] 67 | self.model.fit(state, target, epochs=1, verbose=0) 68 | if self.epsilon > self.epsilon_min: 69 | self.epsilon *= self.epsilon_decay 70 | 71 | def load(self, name): 72 | self.model.load_weights(name) 73 | 74 | def save(self, name): 75 | self.model.save_weights(name) 76 | 77 | 78 | if __name__ == "__main__": 79 | env = gym.make('CartPoleMod-v0') 80 | state_size = env.observation_space.shape[0] 81 | action_size = env.action_space.n 82 | agent = DQNAgent(state_size, action_size) 83 | # agent.load("./save/cartpole-ddqn.h5") 84 | done = False 85 | batch_size = 32 86 | 87 | for e in range(EPISODES): 88 | state = env.reset() 89 | state = np.reshape(state, [1, state_size]) 90 | for time in range(500): 91 | # env.render() 92 | action = agent.act(state) 93 | next_state, reward, done, _ = env.step(action) 94 | reward = reward if not done else -10 95 | next_state = np.reshape(next_state, [1, state_size]) 96 | agent.remember(state, action, reward, next_state, done) 97 | state = next_state 98 | if done: 99 | agent.update_target_model() 100 | print("episode: {}/{}, score: {}, e: {:.2}" 101 | .format(e, EPISODES, time, agent.epsilon)) 102 | break 103 | if len(agent.memory) > batch_size: 104 | agent.replay(batch_size) 105 | # if e % 10 == 0: 106 | # agent.save("./save/cartpole-ddqn.h5") -------------------------------------------------------------------------------- /example_files/test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import random 4 | import gym 5 | import math 6 | import matplotlib.pyplot as plt 7 | from collections import deque 8 | import gym_cartpolemod 9 | 10 | TIME_STEPS = 600000 11 | TRIALS = 1000 12 | RUNS = 100 13 | success_score = 6000 14 | 15 | def softmax(x): 16 | e_x = np.exp(x - np.max(x)) 17 | out = e_x / e_x.sum() 18 | return out 19 | 20 | 21 | def policy_gradient(): 22 | with tf.variable_scope("policy"): 23 | params = tf.get_variable("policy_parameters",[4,2]) 24 | state = tf.placeholder("float",[None,4]) 25 | actions = tf.placeholder("float",[None,2]) 26 | advantages = tf.placeholder("float",[None,1]) 27 | linear = tf.matmul(state,params) 28 | probabilities = tf.nn.softmax(linear) 29 | good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions),reduction_indices=[1]) 30 | eligibility = tf.log(good_probabilities) * advantages 31 | loss = -tf.reduce_sum(eligibility) 32 | optimizer = tf.train.AdamOptimizer(0.01).minimize(loss) 33 | return probabilities, state, actions, advantages, optimizer 34 | 35 | def value_gradient(): 36 | with tf.variable_scope("value"): 37 | state = tf.placeholder("float",[None,4]) 38 | newvals = tf.placeholder("float",[None,1]) 39 | w1 = tf.get_variable("w1",[4,10]) 40 | b1 = tf.get_variable("b1",[10]) 41 | h1 = tf.nn.relu(tf.matmul(state,w1) + b1) 42 | w2 = tf.get_variable("w2",[10,1]) 43 | b2 = tf.get_variable("b2",[1]) 44 | calculated = tf.matmul(h1,w2) + b2 45 | diffs = calculated - newvals 46 | loss = tf.nn.l2_loss(diffs) 47 | optimizer = tf.train.AdamOptimizer(0.1).minimize(loss) 48 | return calculated, state, newvals, optimizer, loss 49 | 50 | def run_episode(env, policy_grad, value_grad, sess): 51 | pl_calculated, pl_state, pl_actions, pl_advantages, pl_optimizer = policy_grad 52 | vl_calculated, vl_state, vl_newvals, vl_optimizer, vl_loss = value_grad 53 | observation = env.reset() 54 | totalreward = 0 55 | states = [] 56 | actions = [] 
57 | advantages = [] 58 | transitions = [] 59 | update_vals = [] 60 | 61 | 62 | for _ in range(TIME_STEPS): 63 | # env.render() 64 | # calculate policy 65 | obs_vector = np.expand_dims(observation, axis=0) 66 | probs = sess.run(pl_calculated,feed_dict={pl_state: obs_vector}) 67 | action = 0 if random.uniform(0,1) < probs[0][0] else 1 68 | # record the transition 69 | states.append(observation) 70 | actionblank = np.zeros(2) 71 | actionblank[action] = 1 72 | actions.append(actionblank) 73 | # take the action in the environment 74 | old_observation = observation 75 | observation, reward, done, info = env.step(action) 76 | transitions.append((old_observation, action, reward)) 77 | totalreward += reward 78 | 79 | if done: 80 | break 81 | for index, trans in enumerate(transitions): 82 | obs, action, reward = trans 83 | 84 | # calculate discounted monte-carlo return 85 | future_reward = 0 86 | future_transitions = len(transitions) - index 87 | decrease = 1 88 | for index2 in range(future_transitions): 89 | future_reward += transitions[(index2) + index][2] * decrease 90 | decrease = decrease * 0.97 91 | obs_vector = np.expand_dims(obs, axis=0) 92 | currentval = sess.run(vl_calculated,feed_dict={vl_state: obs_vector})[0][0] 93 | 94 | # advantage: how much better was this action than normal 95 | advantages.append(future_reward - currentval) 96 | 97 | # update the value function towards new return 98 | update_vals.append(future_reward) 99 | 100 | # update value function 101 | update_vals_vector = np.expand_dims(update_vals, axis=1) 102 | sess.run(vl_optimizer, feed_dict={vl_state: states, vl_newvals: update_vals_vector}) 103 | # real_vl_loss = sess.run(vl_loss, feed_dict={vl_state: states, vl_newvals: update_vals_vector}) 104 | 105 | advantages_vector = np.expand_dims(advantages, axis=1) 106 | sess.run(pl_optimizer, feed_dict={pl_state: states, pl_advantages: advantages_vector, pl_actions: actions}) 107 | 108 | return totalreward 109 | 110 | 111 | env = gym.make('CartPoleMod-v0') 112 | env._max_episode_steps = TIME_STEPS 113 | policy_grad = policy_gradient() 114 | value_grad = value_gradient() 115 | sess = tf.InteractiveSession() 116 | sess.run(tf.initialize_all_variables()) 117 | 118 | trial_score = deque(maxlen = RUNS) 119 | total_success = 0 120 | for run in range(1,RUNS+1): 121 | scores = deque(maxlen = 20) 122 | success = 0 123 | for trial in range(1,TRIALS+1): 124 | total_reward = 0 125 | reward = run_episode(env, policy_grad, value_grad, sess) 126 | scores.append(reward) 127 | if trial%100==0: print('Trial: {} reward: {}'.format(trial,reward)) 128 | try: 129 | mean_score = np.mean(scores) 130 | except: 131 | mean_score = 0 132 | if mean_score > success_score: 133 | trial_score.append(trial) 134 | success = 1 135 | print('Successful trial. 
Run#:{} Score: {}'.format(run,mean_score)) 136 | 137 | mean_trial = np.mean(trial_score) 138 | total_success += success 139 | if success==0: 140 | print('Failed Run#:{}'.format(run)) 141 | else: 142 | print('Successful run#: {} Average trial#: {}'.format(run,mean_trial)) 143 | print('\n\n\n\n\n\nSuccess Rate:{}% #Trials: {}'.format(total_success,mean_trial)) 144 | -------------------------------------------------------------------------------- /example_files/deepQNetwork.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | start_time = datetime.now() # Start timing the program 4 | import sys 5 | import random 6 | import gym 7 | import numpy as np 8 | from collections import deque 9 | from keras.models import Sequential 10 | from keras.layers import Dense 11 | from keras.optimizers import Adam 12 | import gym_cartpolemod # Import modded version of cartpole 13 | 14 | TIME_STEPS = 60000 15 | TRIALS = 1000 16 | RUNS = 100 17 | # np.random.seed(10) # Set appropriate seed value 18 | 19 | 20 | class deepQNetwork: 21 | def __init__(self, state_size, action_size,envName): 22 | 23 | self.state_size = state_size 24 | self.action_size = action_size 25 | self.memory = deque(maxlen=2000) 26 | self.gamma = 0.95 # discount rate 27 | self.epsilon = 1.0 # exploration rate 28 | self.epsilon_min = 0.01 29 | self.epsilon_decay = 0.9 30 | self.learning_rate = 0.01 31 | self.model = self._build_model() 32 | self.envName = envName 33 | 34 | def _build_model(self): 35 | # Neural Net for Deep-Q learning Model 36 | model = Sequential() 37 | model.add(Dense(24, input_dim=self.state_size, activation='tanh')) 38 | model.add(Dense(48, activation='tanh')) 39 | model.add(Dense(self.action_size, activation='linear')) 40 | print('Compiling Neural Network...') 41 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) 42 | return model 43 | 44 | def set_epsilon(self,epsilon): 45 | # Set exploration rate 46 | self.epsilon = epsilon 47 | 48 | def reset_memory(self): 49 | # Reset memory and rebuild the neural network 50 | self.memory = deque(maxlen=2000) 51 | self.set_epsilon(1.0) 52 | self.model = self._build_model() 53 | 54 | def remember(self, state, action, reward, next_state, done): 55 | # Save the states for the Q function(target) 56 | self.memory.append((state, action, reward, next_state, done)) 57 | 58 | def act(self, state): 59 | # Take random action if epsilon is high, else, take action based on the predicted states 60 | if self.epsilon > 0.01: 61 | if np.random.rand() <= self.epsilon: 62 | return random.randrange(self.action_size) 63 | act_values = self.model.predict(state) 64 | return np.argmax(act_values[0]) # returns action 65 | 66 | def replay(self, batch_size): 67 | # Calculate the Q function (Target) for the saved states 68 | minibatch = random.sample(self.memory, batch_size) 69 | for state, action, reward, next_state, done in minibatch: 70 | target = reward 71 | if not done: 72 | target = (reward + self.gamma * np.amax(self.model.predict(next_state)[0])) 73 | target_f = self.model.predict(state) 74 | target_f[0][action] = target 75 | self.model.fit(state, target_f, epochs=1, verbose=0) # Train the model for each state 76 | # Update the exploration rate for every trial/episode 77 | if self.epsilon > self.epsilon_min: 78 | self.epsilon *= self.epsilon_decay 79 | 80 | def main(self,explore=True): 81 | # Do not explore if explore=False 82 | if not explore: 83 | self.set_epsilon(0.01) 84 | # Initialize 
parameters 85 | done = False 86 | batch_size = 64 87 | trial_score = deque(maxlen = RUNS) 88 | run_success = deque(maxlen = RUNS) 89 | total_success = 0 90 | for run in range(1,RUNS+1): 91 | # Reset memory for every run 92 | self.reset_memory() 93 | scores = deque(maxlen = 100) 94 | success = 0 95 | for trial in range(1,TRIALS+1): 96 | # Reset states for each trial 97 | state = env.reset() 98 | state = np.reshape(state, [1, state_size]) 99 | total_reward = 0 100 | for time in range(TIME_STEPS): 101 | # Render environment for visualization. Uncomment if needed. 102 | # env.render() 103 | action = self.act(state) 104 | next_state, reward, done, _ = env.step(action) 105 | reward = reward if not done else -10 # Add a negative reward if failed 106 | next_state = np.reshape(next_state, [1, state_size]) 107 | self.remember(state, action, reward, next_state, done) # Save states for each time step 108 | state = next_state 109 | if done: 110 | scores.append(time) # Calculate a score based on the time-steps reached 111 | if trial%100 ==0: 112 | print('Run:{} Trial:{}, Mean score over 100 trials: {}'.format(run,trial,np.mean(scores))) 113 | break 114 | elif time== TIME_STEPS/10: 115 | print('It\'s gonna be a great trial! Ran {} times already! Hope Trial#{} goes on!'.format(TIME_STEPS/10,trial)) 116 | elif time>=(TIME_STEPS-1): 117 | scores.append(time) 118 | print('Woah!!!!') 119 | print('Run:{} Trial:{}, Time: {}'.format(run,trial,time+1)) 120 | success = 1 121 | break 122 | try: 123 | mean_score = np.mean(scores) 124 | except: 125 | mean_score = 0 126 | if success==1: 127 | trial_score.append(trial) 128 | print('Successful trial. Run#:{}'.format(run)) 129 | break 130 | if len(self.memory) > batch_size: 131 | self.replay(batch_size) 132 | run_success.append(success) 133 | try: 134 | mean_trial = np.mean(trial_score) 135 | except: 136 | mean_trial = 0 137 | total_success += success 138 | if success==0: 139 | print('\t\t\t\tFailed Run#:{}'.format(run)) 140 | else: 141 | print('Successful run#: {} Average trial#: {} Successes till now:{}'.format(run,mean_trial,total_success)) 142 | print('\n\n\n\n\n\nSuccess Rate:{}% #Trials: {}'.format(total_success/RUNS*100,mean_trial)) 143 | 144 | 145 | if __name__ == "__main__": 146 | print('Making CartpoleMod environment') 147 | envName = 'CartPoleMod-'+sys.argv[1] 148 | env = gym.make(envName) 149 | env._max_episode_steps = TIME_STEPS 150 | state_size = env.observation_space.shape[0] 151 | action_size = env.action_space.n 152 | agent = deepQNetwork(state_size, action_size, envName) 153 | agent.main() 154 | print('---------Execution time: {} for Case {}------------'.format(datetime.now()-start_time,sys.argv[1])) -------------------------------------------------------------------------------- /gym_cartpolemod/envs/cartpolemod_env.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Classic cart-pole system implemented by Rich Sutton et al. 
4 | Copied from http://incompleteideas.net/sutton/book/code/pole.c 5 | permalink: https://perma.cc/C9ZM-652R 6 | Modified by Aaditya Ravindran to include friction and random sensor & actuator noise 7 | """ 8 | 9 | import logging 10 | import math 11 | import random 12 | import gym 13 | from gym import spaces 14 | from gym.utils import seeding 15 | import numpy as np 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | class CartPoleModEnv(gym.Env): 20 | metadata = { 21 | 'render.modes': ['human', 'rgb_array'], 22 | 'video.frames_per_second' : 50 23 | } 24 | 25 | def __init__(self,case=1): 26 | self.__version__ = "0.2.0" 27 | print("CartPoleModEnv - Version {}, Noise case: {}".format(self.__version__,case)) 28 | self.gravity = 9.8 29 | self.masscart = 1.0 30 | self.masspole = 0.1 31 | self.total_mass = (self.masspole + self.masscart) 32 | self.length = 0.5 # actually half the pole's length 33 | self.polemass_length = (self.masspole * self.length) 34 | self._seed() 35 | if case<4: 36 | self.force_mag = 10.0*(1+self.addnoise(case)) 37 | self.case = 1 38 | else: 39 | self.force_mag = 10.0 40 | self.case = case 41 | 42 | self.tau = 0.02 # seconds between state updates 43 | self.frictioncart = 5e-4 # AA Added cart friction 44 | self.frictionpole = 2e-6 # AA Added cart friction 45 | 46 | # Angle at which to fail the episode 47 | self.theta_threshold_radians = 12 * 2 * math.pi / 360 48 | self.x_threshold = 2.4 49 | 50 | # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds 51 | high = np.array([ 52 | self.x_threshold * 2, 53 | np.finfo(np.float32).max, 54 | self.theta_threshold_radians * 2, 55 | np.finfo(np.float32).max]) 56 | 57 | self.action_space = spaces.Discrete(2) # AA Set discrete states back to 2 58 | self.observation_space = spaces.Box(-high, high) 59 | 60 | self.viewer = None 61 | self.state = None 62 | 63 | self.steps_beyond_done = None 64 | 65 | def addnoise(self,x): 66 | return { 67 | 1 : 0, 68 | 2 : self.np_random.uniform(low=-0.05, high=0.05, size=(1,)), # 5% actuator noise 69 | 3 : self.np_random.uniform(low=-0.10, high=0.10, size=(1,)), # 10% actuator noise 70 | 4 : self.np_random.uniform(low=-0.05, high=0.05, size=(1,)), # 5% sensor noise 71 | 5 : self.np_random.uniform(low=-0.10, high=0.10, size=(1,)), # 10% sensor noise 72 | 6 : self.np_random.normal(loc=0, scale=np.sqrt(0.10), size=(1,)), # 0.1 var sensor noise 73 | 7 : self.np_random.normal(loc=0, scale=np.sqrt(0.20), size=(1,)), # 0.2 var sensor noise 74 | }.get(x,1) 75 | 76 | def _seed(self, seed=None): # Set appropriate seed value 77 | self.np_random, seed = seeding.np_random(seed) 78 | return [seed] 79 | 80 | def _step(self, action): 81 | assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action)) 82 | state = self.state 83 | x, x_dot, theta, theta_dot = state 84 | force = self.force_mag if action==1 else -self.force_mag 85 | costheta = math.cos(theta) 86 | sintheta = math.sin(theta) 87 | temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta - self.frictioncart*np.sign(x_dot)) / self.total_mass # AA Added cart friction 88 | thetaacc = (self.gravity * sintheta - costheta* temp - self.frictionpole*theta_dot/self.polemass_length) / (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass)) # AA Added pole friction 89 | xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass 90 | noise = self.addnoise(self.case) 91 | x = (x + self.tau * x_dot) 92 | x_dot = (x_dot + self.tau * xacc) 93 | theta = 
(theta + self.tau * theta_dot)*(1 + noise) 94 | theta_dot = (theta_dot + self.tau * thetaacc) 95 | self.state = (x,x_dot,theta,theta_dot) 96 | done = x < -self.x_threshold \ 97 | or x > self.x_threshold \ 98 | or theta < -self.theta_threshold_radians \ 99 | or theta > self.theta_threshold_radians 100 | done = bool(done) 101 | 102 | if not done: 103 | reward = 1.0 104 | elif self.steps_beyond_done is None: 105 | # Pole just fell! 106 | self.steps_beyond_done = 0 107 | reward = 1.0 108 | else: 109 | if self.steps_beyond_done == 0: 110 | logger.warning("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.") 111 | self.steps_beyond_done += 1 112 | reward = 0.0 113 | 114 | return np.array(self.state), reward, done, {} 115 | 116 | def _reset(self): 117 | self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,)) 118 | self.steps_beyond_done = None 119 | return np.array(self.state) 120 | 121 | def _render(self, mode='human', close=False): 122 | if close: 123 | if self.viewer is not None: 124 | self.viewer.close() 125 | self.viewer = None 126 | return 127 | 128 | screen_width = 600 129 | screen_height = 400 130 | 131 | world_width = self.x_threshold*2 132 | scale = screen_width/world_width 133 | carty = 100 # TOP OF CART 134 | polewidth = 10.0 135 | polelen = scale * 1.0 136 | cartwidth = 50.0 137 | cartheight = 30.0 138 | 139 | if self.viewer is None: 140 | from gym.envs.classic_control import rendering 141 | self.viewer = rendering.Viewer(screen_width, screen_height) 142 | l,r,t,b = -cartwidth/2, cartwidth/2, cartheight/2, -cartheight/2 143 | axleoffset =cartheight/4.0 144 | cart = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)]) 145 | self.carttrans = rendering.Transform() 146 | cart.add_attr(self.carttrans) 147 | self.viewer.add_geom(cart) 148 | l,r,t,b = -polewidth/2,polewidth/2,polelen-polewidth/2,-polewidth/2 149 | pole = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)]) 150 | pole.set_color(.8,.6,.4) 151 | self.poletrans = rendering.Transform(translation=(0, axleoffset)) 152 | pole.add_attr(self.poletrans) 153 | pole.add_attr(self.carttrans) 154 | self.viewer.add_geom(pole) 155 | self.axle = rendering.make_circle(polewidth/2) 156 | self.axle.add_attr(self.poletrans) 157 | self.axle.add_attr(self.carttrans) 158 | self.axle.set_color(.5,.5,.8) 159 | self.viewer.add_geom(self.axle) 160 | self.track = rendering.Line((0,carty), (screen_width,carty)) 161 | self.track.set_color(0,0,0) 162 | self.viewer.add_geom(self.track) 163 | 164 | if self.state is None: return None 165 | 166 | x = self.state 167 | cartx = x[0]*scale+screen_width/2.0 # MIDDLE OF CART 168 | self.carttrans.set_translation(cartx, carty) 169 | self.poletrans.set_rotation(-x[2]) 170 | return self.viewer.render(return_rgb_array = mode=='rgb_array') --------------------------------------------------------------------------------