├── README.md ├── breakout ├── README.md └── breakout-dqn.py ├── cartpole ├── README.md ├── ac-cartpole.py ├── cartpole-dqn1.py ├── cartpole-dqn2.py ├── cartpole-policygradient.py ├── cartpole-policygradient2.py ├── ppo_tf │ ├── README.md │ ├── algo │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── ppo.cpython-35.pyc │ │ │ └── ppo.cpython-36.pyc │ │ └── ppo.py │ ├── images │ │ ├── ppo_train_test.png │ │ ├── ppo_train_test_legend.png │ │ └── training_ppo.gif │ ├── network_models │ │ └── policy_net.py │ ├── run_ppo.py │ ├── test_policy.py │ └── trajectory │ │ ├── actions.csv │ │ └── observations.csv └── test.csv ├── cartpole_gazebo ├── README.md ├── cartpole_controller │ ├── CMakeLists.txt │ ├── config │ │ └── joint_position_control.yaml │ ├── package.xml │ └── src │ │ ├── pg.py │ │ └── pg2.py └── cartpole_gazebo │ ├── CMakeLists.txt │ ├── launch │ └── cartpole_gazebo.launch │ ├── meshes │ ├── cart.STL │ ├── cart.urdf │ ├── pole.STL │ ├── pole.urdf │ ├── stand.STL │ └── stand.urdf │ ├── package.xml │ └── robots │ └── cartpole_v1.urdf ├── images ├── breakout-v0.gif ├── cartpole-pg-gazebo.gif ├── cartpole.gif ├── cartpole_pg_rewards.png ├── example.gif ├── mountain-car-v0.gif ├── mountaincar_pg_rewards.png └── pg2.gif ├── lunarlander ├── LunarLander.gif └── lunarlander_dqn.py └── mountaincar ├── README.md ├── mountain-car-v0-dqn1.py ├── mountain-car-v0-dqn2.py └── mountaincar-policygradient.py /README.md: -------------------------------------------------------------------------------- 1 | # **Reinforcement Learning for OpenAI Gym environments** 2 | 3 | - This repository contains my solutions to several of OpenAI Gym's Reinforcement Learning problems. 4 | 5 | | CartPole | LunarLander | 6 | | ------------------------------- |--------------------------------------- | 7 | | ![CartPole](/images/cartpole.gif) | ![LunarLander](/lunarlander/LunarLander.gif) | 8 | 9 | | Breakout | Mountain Car | CartPole-gazebo | 10 | | ------------------------------- | ------------------------------------- | --------------------------------------- | 11 | | ![Breakout](/images/breakout-v0.gif) | ![MountainCar](/images/mountain-car-v0.gif) | ![CartPole-gazebo](/images/pg2.gif) | 12 | 13 | ## References: 14 | - Human-level control through deep reinforcement 15 | learning 16 | - Playing Atari with Deep Reinforcement Learning 17 | - Policy Gradient Methods for Reinforcement Learning with Function Approximation 18 | -------------------------------------------------------------------------------- /breakout/README.md: -------------------------------------------------------------------------------- 1 | # Breakout 2 | 3 | - Maximize the score in the Atari 2600 game Breakout using a Deep Q-Network (DQN). 4 | - In this environment, the observation is an RGB image of the screen, which is an array of shape (210, 160, 3). 5 | - Each action is repeatedly performed for a duration of k frames, where k is uniformly sampled from {2,3,4}. 
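Below is a rough, self-contained sketch (not part of this repository) of the preprocessing that `breakout-dqn.py` applies to that observation: grayscale, crop the 160×160 playing area, downsample, and keep a rolling stack of the last 4 frames. It uses plain NumPy with a simple 2× downsample to 80×80 for brevity, whereas the actual script resizes to 84×84 with `tf.image` ops; the function names `preprocess` and `push_frame` are illustrative only.

```python
import numpy as np

def preprocess(frame):
    """uint8 RGB frame of shape (210, 160, 3) -> (80, 80) grayscale in [0, 1]."""
    gray = frame.astype(np.float32).mean(axis=2)   # rough grayscale (the script uses tf.image.rgb_to_grayscale)
    cropped = gray[34:194, :]                      # 160 x 160 playing area, same crop as breakout-dqn.py
    return cropped[::2, ::2] / 255.0               # simple 2x downsample; the script resizes to 84 x 84

def push_frame(stack, frame):
    """Drop the oldest of the 4 stacked frames and append the newest."""
    return np.concatenate([stack[:, :, 1:], frame[:, :, None]], axis=2)

first = preprocess(np.zeros((210, 160, 3), dtype=np.uint8))  # stand-in for an env.reset() observation
stack = np.stack([first] * 4, axis=2)                        # network input: (80, 80, 4)
stack = push_frame(stack, first)
print(stack.shape)                                           # (80, 80, 4)
```

Stacking four consecutive frames gives the Q-network enough temporal context to infer the ball's direction and speed, which a single still frame cannot provide.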
6 | 7 | ![breakout-v0](../images/breakout-v0.gif) 8 | -------------------------------------------------------------------------------- /breakout/breakout-dqn.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Breakout-v0 using Full Deep Q Learning 3 | observation dimensions (210, 160, 3) 4 | actions ['NOOP', 'FIRE','RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE'] 5 | 6 | ''' 7 | import tensorflow as tf 8 | import gym 9 | import numpy as np 10 | import math 11 | import random 12 | from matplotlib import pyplot as plt 13 | import itertools 14 | import sys 15 | from collections import deque, namedtuple 16 | 17 | env = gym.make("Breakout-v0") 18 | 19 | #observation = env.reset() 20 | 21 | #print env.get_action_meanings() 22 | 23 | #plt.figure() 24 | #plt.imshow(env.render(mode='rgb_array')) 25 | 26 | #[env.step(4) for x in range(1)] 27 | #plt.figure() 28 | #plt.imshow(env.render(mode='rgb_array')) 29 | #plt.imshow(observation[34:-16,:,:]) 30 | #plt.imshow(observation) 31 | #env.render(close=True) 32 | 33 | #plt.show() 34 | 35 | VALID_ACTIONS = [0, 1, 2, 3] 36 | 37 | tf.reset_default_graph() 38 | 39 | # input_preprocessor graph 40 | input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8) 41 | output = tf.image.rgb_to_grayscale(input_state) 42 | # image, offset_height, offset_width, target_height, target_width 43 | output = tf.image.crop_to_bounding_box(output, 34, 0, 160, 160) 44 | output = tf.image.resize_images(output, [84, 84]) 45 | output = tf.squeeze(output) 46 | 47 | # build estimator model 48 | # input is 4 grayscale frames of 84, 84 each 49 | X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name='X') 50 | # target value 51 | y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name='y') 52 | # which action was chosen 53 | actions_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="actions") 54 | 55 | X = tf.to_float(X_pl) / 255.0 56 | batch_size = tf.shape(X_pl)[0] 57 | 58 | with tf.variable_scope("estimator"): 59 | # three convolutional layers -------------------------------------------------- 60 | conv1 = tf.contrib.layers.conv2d(X, 32, 8, 4, activation_fn=tf.nn.relu) 61 | conv2 = tf.contrib.layers.conv2d(conv1, 64, 4, 2, activation_fn=tf.nn.relu) 62 | conv3 = tf.contrib.layers.conv2d(conv2, 64, 3, 1, activation_fn=tf.nn.relu) 63 | 64 | # fully connected layers 65 | flattened = tf.contrib.layers.flatten(conv3) 66 | fc1 = tf.contrib.layers.fully_connected(flattened, 512) 67 | predictions = tf.contrib.layers.fully_connected(fc1, len(VALID_ACTIONS)) 68 | 69 | # get predictions for chosen actions only 70 | gather_indices = tf.range(batch_size)*tf.shape(predictions[1] + actions_pl) 71 | action_predictions = tf.gather(tf.reshape(predictions, [-1]), gather_indices) 72 | 73 | 74 | # build target model ----------------------------------------------------------- 75 | with tf.variable_scope("target"): 76 | # three convolutional layers 77 | t_conv1 = tf.contrib.layers.conv2d(X, 32, 8, 4, activation_fn=tf.nn.relu) 78 | t_conv2 = tf.contrib.layers.conv2d(t_conv1, 64, 4, 2, activation_fn=tf.nn.relu) 79 | t_conv3 = tf.contrib.layers.conv2d(t_conv2, 64, 3, 1, activation_fn=tf.nn.relu) 80 | 81 | # fully connected layers 82 | t_flattened = tf.contrib.layers.flatten(t_conv3) 83 | t_fc1 = tf.contrib.layers.fully_connected(t_flattened, 512) 84 | t_predictions = tf.contrib.layers.fully_connected(t_fc1, len(VALID_ACTIONS)) 85 | 86 | # get predictions for chosen actions only 87 | t_gather_indices = 
tf.range(batch_size)*tf.shape(t_predictions[1] + actions_pl) 88 | t_action_predictions = tf.gather(tf.reshape(t_predictions, [-1]), t_gather_indices) 89 | 90 | # calculate loss ---------------------------------------------------------------- 91 | losses = tf.squared_difference(y_pl, action_predictions) 92 | loss = tf.reduce_mean(losses) 93 | 94 | # optimizer parameters 95 | optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6) 96 | train_op = optimizer.minimize(loss, global_step=tf.contrib.framework.get_global_step()) 97 | 98 | def input_preprocessor(sess, state): 99 | return sess.run(output, feed_dict={input_state: state}) 100 | 101 | def predict(sess, s, network): 102 | ''' 103 | s: shape [batch_size, 4, 160, 160, 3] 104 | returns: shape[batch_size, NUM_VALID_ACTIONS] 105 | ''' 106 | if network == "estimator": 107 | return sess.run(predictions, feed_dict={X_pl: s}) 108 | else: 109 | return sess.run(t_predictions, feed_dict={X_pl: s}) 110 | 111 | def update(sess, s, a, y): 112 | ''' 113 | s: shape [batch_size, 4, 160, 160, 3] 114 | a: chosen actions of shape [batch_size] 115 | y: targets of shape [batch_size] 116 | returns: calculated loss on the batch 117 | ''' 118 | _, _loss = sess.run([train_op, loss], feed_dict={X_pl: s, y_pl: y, actions_pl: a}) 119 | return _loss 120 | 121 | def copy_model_parameters(sess): 122 | e_params = [t for t in tf.trainable_variables() if t.name.startswith("estimator")] 123 | e_params = sorted(e_params, key=lambda v: v.name) 124 | t_params = [t for t in tf.trainable_variables() if t.name.startswith("target")] 125 | t_params = sorted(t_params, key=lambda v: v.name) 126 | 127 | update_ops = [] 128 | for e_v, t_v in zip(e_params, t_params): 129 | op = t_v.assign(e_v) 130 | update_ops.append(op) 131 | sess.run(update_ops) 132 | 133 | def epsilon_greedy_policy(nA, sess, observation, epsilon): 134 | A = np.ones(nA, dtype=float)*epsilon/nA 135 | q_values = predict(sess,np.expand_dims(observation, 0), "estimator")[0] 136 | best_action = np.argmax(q_values) 137 | A[best_action] += (1.0 - epsilon) 138 | 139 | return A 140 | 141 | def deep_q_learning(sess, 142 | env, 143 | num_episodes, 144 | replay_memory_size=500000, 145 | replay_memory_init_size=50000, 146 | update_target_every=10000, 147 | discount_factor=0.99, 148 | epsilon_start=1.0, 149 | epsilon_end=0.1, 150 | epsilon_decay_steps=500000, 151 | batch_size=32): 152 | 153 | Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"]) 154 | 155 | # the replay memory 156 | replay_memory = [] 157 | 158 | # get the current time step 159 | total_t = sess.run(tf.contrib.framework.get_global_step()) 160 | # the epsilon decay schedule 161 | epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) 162 | 163 | # populating the replay memory with initial experience 164 | state = env.reset() 165 | state = input_preprocessor(sess, state) 166 | state = np.stack([state]*4, axis=2) 167 | for i in range(replay_memory_init_size): 168 | action_probs = epsilon_greedy_policy(len(VALID_ACTIONS), sess, state, epsilons[min(total_t, epsilon_decay_steps-1)]) 169 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 170 | next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) 171 | next_state = input_preprocessor(sess, next_state) 172 | next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) 173 | replay_memory.append(Transition(state, action, reward, next_state, done)) 174 | if done: 175 | state = env.reset() 176 | state = 
input_preprocessor(sess, state) 177 | state = np.stack([state]*4, axis=2) 178 | print "populating replay memory ... current episode: ", i 179 | 180 | for i_episode in range(num_episodes): 181 | 182 | # reset the environment 183 | state = env.reset() 184 | state = input_preprocessor(sess, state) 185 | state = np.stack([state]*4, axis=2) 186 | loss = None 187 | 188 | # one step in the environment 189 | for t in itertools.count(): 190 | 191 | # epsilon for this time step 192 | epsilon = epsilons[min(total_t, epsilon_decay_steps-1)] 193 | 194 | # update target after regular intervals 195 | if total_t % update_target_every == 0: 196 | copy_model_parameters(sess) 197 | 198 | # print out which step are we on 199 | print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format( 200 | t, total_t, i_episode + 1, num_episodes, loss)) 201 | sys.stdout.flush() 202 | 203 | # take a step 204 | action_probs = epsilon_greedy_policy(len(VALID_ACTIONS), sess, state, epsilon) 205 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 206 | next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) 207 | next_state = input_preprocessor(sess, next_state) 208 | next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) 209 | 210 | # if replay memory is full, pop the first element 211 | if len(replay_memory) == replay_memory_size: 212 | replay_memory.pop(0) 213 | 214 | # save the transition in replay memory 215 | replay_memory.append(Transition(state, action, reward, next_state, done)) 216 | 217 | # sample a minibatch from the replay memory 218 | samples = random.sample(replay_memory, batch_size) 219 | states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples)) 220 | 221 | # calculate the q values and targets 222 | q_values_next = predict(sess, next_states_batch, "target") 223 | targets_batch = reward_batch + np.invert(done_batch).astype(np.float32)*discount_factor*np.amax(q_values_next, axis=1) 224 | 225 | # perform gradient descent update 226 | states_batch = np.array(states_batch) 227 | loss = update(sess, states_batch, action_batch, targets_batch) 228 | 229 | if done: 230 | break 231 | 232 | state = next_state 233 | total_t += 1 234 | 235 | # create a glboal step variable 236 | global_step = tf.Variable(0, name='global_step', trainable=False) 237 | 238 | with tf.Session() as sess: 239 | sess.run(tf.initialize_all_variables()) 240 | deep_q_learning(sess, env, 10000) 241 | 242 | 243 | -------------------------------------------------------------------------------- /cartpole/README.md: -------------------------------------------------------------------------------- 1 | ## Cartpole balancing using Reinforcement Learning 2 | 3 | ![Cartpole](../images/cartpole.gif) 4 | 5 | ### Policy Gradient solves the problem in 1600 episodes. 
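The policy-gradient scripts in this folder (`cartpole-policygradient.py`, `cartpole-policygradient2.py`) weight each action's log-likelihood by the discounted return of its episode. The NumPy sketch below is not part of the repo; `gamma=0.99` matches `cartpole-policygradient.py` (the second script uses 0.95), and the normalization mirrors what both scripts do before feeding the returns to the loss.

```python
import numpy as np

def discount_rewards(rewards, gamma=0.99):
    """Per-step rewards -> discounted returns G_t = r_t + gamma * G_{t+1}, then normalized."""
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    # normalizing reduces the variance of the gradient estimate
    return (returns - returns.mean()) / (returns.std() + 1e-8)

print(discount_rewards([1.0] * 5))  # earlier steps accumulate more future reward than later ones
```

After mean-subtraction, better-than-average timesteps get a positive weight and worse-than-average ones a negative weight, which is what pushes the policy toward actions that led to longer episodes.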
6 | ![Cartpole_Rewards](../images/cartpole_pg_rewards.png) 7 | -------------------------------------------------------------------------------- /cartpole/ac-cartpole.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | import csv 5 | import copy 6 | import random 7 | 8 | writer_file = open('rewards_ac_cartpole_2.csv', 'wt') 9 | writer = csv.writer(writer_file) 10 | writer.writerow(['total_rewards']) 11 | 12 | EPISODES = 10000 13 | env = gym.make('CartPole-v0') 14 | # env.seed(1) 15 | # env = env.unwrapped 16 | 17 | # create network graph 18 | D = env.observation_space.shape[0] 19 | A = env.action_space.n 20 | H = 10 21 | actor_learning_rate = 0.001 22 | critc_learning_rate = 0.01 23 | gamma = 0.95 24 | render = False 25 | memory_size = 200 26 | batch_size = 64 27 | 28 | tf.reset_default_graph() 29 | input_x = tf.placeholder(tf.float32, [None, D], name="input_x") 30 | true_q = tf.placeholder(tf.float32, name = "true_q") 31 | 32 | ################################## Critic Network ################################### 33 | W1 = tf.get_variable("W1", shape=[D, H], 34 | initializer=tf.contrib.layers.xavier_initializer()) 35 | layer1 = tf.nn.relu(tf.matmul(input_x,W1)) 36 | 37 | W2 = tf.get_variable("W2", shape=[H, 1], 38 | initializer=tf.contrib.layers.xavier_initializer()) 39 | 40 | critic_fc3 = tf.matmul(layer1, W2) 41 | ## ---------------------------------------------------------------------------------- 42 | t_W1 = tf.get_variable("t_W1", shape=[D, H], 43 | initializer=tf.contrib.layers.xavier_initializer()) 44 | t_layer1 = tf.nn.relu(tf.matmul(input_x, t_W1)) 45 | 46 | t_W2 = tf.get_variable("t_W2", shape=[H, 1], 47 | initializer=tf.contrib.layers.xavier_initializer()) 48 | 49 | critic_t_fc3 = tf.matmul(t_layer1, t_W2) 50 | 51 | diffs = critic_fc3 - true_q 52 | critic_loss = -tf.reduce_mean(tf.square(diffs)) 53 | 54 | critic_optimizer = tf.train.AdamOptimizer(learning_rate=critc_learning_rate).minimize(critic_loss) 55 | 56 | 57 | ##################################### Actor Network ################################# 58 | actor_fc1 = tf.contrib.layers.fully_connected(inputs = input_x,\ 59 | num_outputs = H,\ 60 | activation_fn= tf.nn.relu,\ 61 | weights_initializer=tf.contrib.layers.xavier_initializer()) 62 | actor_fc2 = tf.contrib.layers.fully_connected(inputs = actor_fc1,\ 63 | num_outputs = A,\ 64 | activation_fn= tf.nn.relu,\ 65 | weights_initializer=tf.contrib.layers.xavier_initializer()) 66 | actor_fc3 = tf.contrib.layers.fully_connected(inputs = actor_fc2,\ 67 | num_outputs = A,\ 68 | activation_fn= None,\ 69 | weights_initializer=tf.contrib.layers.xavier_initializer()) 70 | 71 | output = tf.nn.softmax(actor_fc3) 72 | 73 | input_y = tf.placeholder(tf.float32, [None, 2], name="input_y") 74 | discounted_rewards = tf.placeholder(tf.float32, name="discounted_rewards") 75 | neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(logits=actor_fc3, labels=input_y) 76 | product = neg_log_likelihood * discounted_rewards 77 | actor_loss = tf.reduce_mean(product) # no need for -ve sign if using tf.nn.softmax_cross_entr..... 
78 | # as it gives neg_log_likelihood 79 | 80 | actor_optimizer = tf.train.AdamOptimizer(learning_rate=actor_learning_rate).minimize(actor_loss) 81 | 82 | 83 | init = tf.initialize_all_variables() 84 | 85 | def discounted_rewards_(r): 86 | discounted_r = np.zeros_like(r) 87 | running_add = 0 88 | for t in reversed(xrange(len(r))): 89 | running_add = running_add * gamma + r[t] 90 | discounted_r[t] = running_add 91 | 92 | return discounted_r 93 | 94 | def choose_action(out): 95 | action = np.random.choice(range(out.shape[1]), p=out.ravel()) 96 | return action 97 | 98 | 99 | with tf.Session() as sess: 100 | sess.run(init) 101 | 102 | xs, drs, ys = [], [], [] 103 | 104 | episode_number = 0 105 | reward_sum = 0 106 | reward_sum_buffer = [] 107 | current_state = env.reset() 108 | memory_target = [] 109 | memory_states = [] 110 | done = False 111 | goal_reached = False 112 | 113 | while not goal_reached: 114 | x = np.reshape(current_state, [1, D]) 115 | out = sess.run(output, feed_dict={input_x: x}) 116 | action = choose_action(out) 117 | xs.append(x) 118 | temp_y = np.zeros(2) 119 | temp_y[action] = 1 120 | ys.append(temp_y) 121 | 122 | next_state, reward, done, _ = env.step(action) 123 | # if render: 124 | # env.render() 125 | drs.append(reward) 126 | reward_sum += reward 127 | 128 | if not done: 129 | q_pred = sess.run(critic_t_fc3, feed_dict={input_x: np.reshape(next_state,[1, D])}) 130 | update = reward + gamma*q_pred 131 | else: 132 | update = reward 133 | 134 | memory_target.append(update) 135 | memory_states.append(x) 136 | 137 | # if episode ends, find discounted rewards and 138 | # find gradients for the episode 139 | 140 | if done: 141 | 142 | episode_number += 1 143 | epx = np.vstack(np.array(xs)) 144 | epy = np.vstack(np.array(ys)) 145 | epr = np.vstack(np.array(drs)) 146 | 147 | discounted_rs = discounted_rewards_(drs) 148 | 149 | xs, ys, drs = [], [], [] 150 | 151 | 152 | memory_states_temp = copy.copy(memory_states) 153 | memory_targets_temp = copy.copy(memory_target) 154 | 155 | memory_states_temp = np.vstack(memory_states_temp) 156 | memory_targets_temp = np.vstack(memory_targets_temp) 157 | 158 | temp_list = zip(memory_states_temp, memory_targets_temp) 159 | random.shuffle(temp_list) 160 | ep_states, ep_targets = zip(*temp_list[:batch_size]) 161 | 162 | 163 | q_pred_ = sess.run(critic_t_fc3, feed_dict={input_x: epx}) 164 | sess.run(actor_optimizer, feed_dict={discounted_rewards: (q_pred_- np.mean(discounted_rs)), input_x: epx, input_y: epy}) 165 | sess.run(critic_optimizer, feed_dict={true_q: ep_targets, input_x: ep_states}) 166 | reward_sum_buffer.append(reward_sum) 167 | 168 | if episode_number % 100 == 0: 169 | average_per_100_eps = sum(reward_sum_buffer)/100 170 | if average_per_100_eps == 200.00: # acieved the goal. 
171 | goal_reached = True 172 | t_W1 = tf.identity(W1) 173 | t_W2 = tf.identity(W2) 174 | print "Average reward for ", episode_number," episodes is :", average_per_100_eps 175 | reward_sum_buffer = [] 176 | writer.writerow([average_per_100_eps]) 177 | if episode_number % memory_size == 0: 178 | memory_states = [] 179 | memory_target = [] 180 | 181 | if reward_sum == 200.0: 182 | render = True 183 | reward_sum = 0 184 | current_state = env.reset() 185 | 186 | current_state = next_state 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /cartpole/cartpole-dqn1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cartpole balancing using a simple DQN with experience-replay. 3 | ''' 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | import gym 8 | import random 9 | import copy 10 | import math 11 | 12 | env = gym.make("CartPole-v0") 13 | render = True # set to True for rendering 14 | 15 | num_episodes = 10000 16 | batch_size = 64 17 | memory_size = 200 18 | H1 = 64 19 | D = 4 20 | learning_rate = 1e-2 21 | gamma = 0.99 22 | epsilon_max = 1.0 23 | epsilon_min = 0.01 24 | 25 | tf.reset_default_graph() 26 | observations = tf.placeholder(tf.float32, [None,D], name="input_x") 27 | W1 = tf.get_variable("W1", shape=[D, H1], 28 | initializer=tf.contrib.layers.xavier_initializer()) 29 | layer1 = tf.nn.relu(tf.matmul(observations,W1)) 30 | 31 | W2 = tf.get_variable("W2", shape=[H1, 2], 32 | initializer=tf.contrib.layers.xavier_initializer()) 33 | 34 | linear = tf.matmul(layer1, W2) 35 | #Qout = tf.nn.sigmoid(linear) 36 | Qout = linear 37 | Qtarget = tf.placeholder(tf.float32, [None, 2], name="Qtarget") 38 | #loglik = tf.log(Qtarget*(Qtarget - Qout) + (1 - Qtarget)*(Qtarget + Qout)) 39 | diffs = Qtarget - Qout 40 | loss = -tf.reduce_mean(tf.square(diffs)) 41 | #loss = -tf.reduce_mean(loglik) 42 | adam = tf.train.AdamOptimizer(learning_rate).minimize(loss) 43 | 44 | init = tf.initialize_all_variables() 45 | 46 | with tf.Session() as sess: 47 | sess.run(init) 48 | memory_states = [] 49 | memory_targets = [] 50 | for _ in xrange(num_episodes): 51 | observation = env.reset() 52 | done = False 53 | ep_states = [] 54 | ep_targets = [] 55 | memory_states_temp = [] 56 | memory_targets_temp = [] 57 | i = 0 58 | total_reward = 0 59 | while done == False: 60 | i += 1 61 | #print i 62 | state = np.reshape(observation, [1, D]) 63 | #print state 64 | #ep_states.append(state) 65 | memory_states.append(state) 66 | #print memory_states 67 | Qvals = sess.run(Qout, feed_dict={observations: state}) 68 | epsilon = epsilon_min + (epsilon_max - epsilon_min)*(math.exp(-0.01*_)) 69 | if random.random() < epsilon: 70 | action = env.action_space.sample() 71 | #print "RANDOM" 72 | else: 73 | action = np.argmax(Qvals) 74 | #print "GREEDY" 75 | 76 | #take an e-greedy action 77 | new_state, reward, done, info = env.step(action) 78 | if render == True: 79 | env.render() 80 | 81 | total_reward += reward 82 | nextQvals = sess.run(Qout, feed_dict={observations: np.reshape(new_state,[1, D])}) 83 | old_state = state 84 | observation = new_state 85 | maxQvals = np.max(nextQvals) 86 | if done == False: 87 | update = reward + (gamma*maxQvals) 88 | #print total_reward 89 | else: 90 | update = reward 91 | targetQvals = Qvals 92 | targetQvals[0, action] = update 93 | #ep_targets.append(targetQvals) 94 | memory_targets.append(targetQvals) 95 | 96 | memory_states_temp = 
copy.copy(memory_states) 97 | memory_targets_temp = copy.copy(memory_targets) 98 | 99 | memory_states_temp = np.vstack(memory_states_temp) 100 | memory_targets_temp = np.vstack(memory_targets_temp) 101 | 102 | temp_list = zip(memory_states_temp, memory_targets_temp) 103 | random.shuffle(temp_list) 104 | ep_states, ep_targets = zip(*temp_list[:batch_size]) 105 | sess.run(adam, feed_dict={observations: ep_states, Qtarget: ep_targets}) 106 | if _ % memory_size == 0: 107 | memory_states = [] 108 | memory_targets = [] 109 | print "reward this episode :",total_reward 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /cartpole/cartpole-dqn2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cartpole balancing using a Full DQN with experience-replay 3 | and a separate Target network. 4 | ''' 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | import gym 9 | import random 10 | import copy 11 | import math 12 | import csv 13 | 14 | env = gym.make("CartPole-v0") 15 | render = True # set to True for rendering 16 | 17 | num_episodes = 10000 18 | batch_size = 64 19 | memory_size = 200 20 | H1 = 64 21 | D = 4 22 | learning_rate = 1e-2 23 | gamma = 0.99 24 | epsilon_max = 1.0 25 | epsilon_min = 0.01 26 | 27 | file = open('test.csv', 'wb') 28 | csv_writer = csv.writer(file, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL) 29 | csv_writer.writerow(('total_rewards')) 30 | 31 | tf.reset_default_graph() 32 | 33 | # normal network 34 | observations = tf.placeholder(tf.float32, [None,D], name="input_x") 35 | W1 = tf.get_variable("W1", shape=[D, H1], 36 | initializer=tf.contrib.layers.xavier_initializer()) 37 | layer1 = tf.nn.relu(tf.matmul(observations,W1)) 38 | 39 | W2 = tf.get_variable("W2", shape=[H1, 2], 40 | initializer=tf.contrib.layers.xavier_initializer()) 41 | 42 | linear = tf.matmul(layer1, W2) 43 | #Qout = tf.nn.sigmoid(linear) 44 | Qout = linear 45 | 46 | Qtarget = tf.placeholder(tf.float32, [None, 2], name="Qtarget") 47 | 48 | # separate target network 49 | t_W1 = tf.get_variable("t_W1", shape=[D, H1], 50 | initializer=tf.contrib.layers.xavier_initializer()) 51 | t_layer1 = tf.nn.relu(tf.matmul(observations,t_W1)) 52 | 53 | t_W2 = tf.get_variable("t_W2", shape=[H1, 2], 54 | initializer=tf.contrib.layers.xavier_initializer()) 55 | t_linear = tf.matmul(t_layer1, t_W2) 56 | t_Qout = t_linear 57 | 58 | # error 59 | diffs = Qtarget - Qout 60 | loss = -tf.reduce_mean(tf.square(diffs)) 61 | adam = tf.train.AdamOptimizer(learning_rate).minimize(loss) 62 | 63 | init = tf.initialize_all_variables() 64 | 65 | with tf.Session() as sess: 66 | sess.run(init) 67 | memory_states = [] 68 | memory_targets = [] 69 | for _ in xrange(num_episodes): 70 | observation = env.reset() 71 | done = False 72 | ep_states = [] 73 | ep_targets = [] 74 | memory_states_temp = [] 75 | memory_targets_temp = [] 76 | i = 0 77 | total_reward = 0 78 | while done == False: 79 | i += 1 80 | #print i 81 | state = np.reshape(observation, [1, D]) 82 | print state 83 | #print state 84 | #ep_states.append(state) 85 | memory_states.append(state) 86 | #print memory_states 87 | Qvals = sess.run(Qout, feed_dict={observations: state}) 88 | epsilon = epsilon_min + (epsilon_max - epsilon_min)*(math.exp(-0.01*_)) 89 | if random.random() < epsilon: 90 | action = env.action_space.sample() 91 | #print "RANDOM" 92 | else: 93 | action = np.argmax(Qvals) 94 | #print "GREEDY" 95 | 96 | #take an e-greedy action 97 | new_state, 
reward, done, info = env.step(action) 98 | if render == True: 99 | env.render() 100 | 101 | total_reward += reward 102 | nextQvals = sess.run(t_Qout, feed_dict={observations: np.reshape(new_state,[1, D])}) 103 | old_state = state 104 | observation = new_state 105 | maxQvals = np.max(nextQvals) 106 | if done == False: 107 | update = reward + (gamma*maxQvals) 108 | #print total_reward 109 | else: 110 | update = reward 111 | targetQvals = Qvals 112 | targetQvals[0, action] = update 113 | #ep_targets.append(targetQvals) 114 | memory_targets.append(targetQvals) 115 | 116 | memory_states_temp = copy.copy(memory_states) 117 | memory_targets_temp = copy.copy(memory_targets) 118 | 119 | memory_states_temp = np.vstack(memory_states_temp) 120 | memory_targets_temp = np.vstack(memory_targets_temp) 121 | 122 | temp_list = zip(memory_states_temp, memory_targets_temp) 123 | random.shuffle(temp_list) 124 | ep_states, ep_targets = zip(*temp_list[:batch_size]) 125 | sess.run(adam, feed_dict={observations: ep_states, Qtarget: ep_targets}) 126 | if _ % memory_size == 0: 127 | memory_states = [] 128 | memory_targets = [] 129 | 130 | # update target network regularly but slowly 131 | # copy the weights from the normal network in current episode 132 | # to the target network 133 | if _ % 100 == 0: 134 | # update target network 135 | t_W1 = tf.identity(W1) 136 | t_W2 = tf.identity(W2) 137 | 138 | csv_writer.writerow((str(total_reward))) 139 | print "reward in episode ",_," is: ",total_reward 140 | 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /cartpole/cartpole-policygradient.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code uses Policy gradient algorithm called REINFORCE to solve 3 | OpenAI Gym's CartPole balancing problem. 
4 | 5 | ''' 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import gym 10 | import math 11 | import csv 12 | 13 | env = gym.make("CartPole-v0") 14 | env.reset() 15 | 16 | writer_file = open('rewards_pg1.csv', 'wt') 17 | writer = csv.writer(writer_file) 18 | writer.writerow(['total_rewards_0']) 19 | 20 | H = 10 21 | batch_size = 50 22 | learning_rate = 1e-2 23 | gamma = 0.99 24 | D = 4 25 | 26 | tf.reset_default_graph() 27 | 28 | observations = tf.placeholder(tf.float32, [None, D], name="input_x") 29 | w1 = tf.get_variable("w1", shape=[D,H], initializer=tf.contrib.layers.xavier_initializer()) 30 | layer1 = tf.nn.relu(tf.matmul(observations, w1)) 31 | w2 = tf.get_variable("w2", shape=[H,1], initializer=tf.contrib.layers.xavier_initializer()) 32 | score = tf.matmul(layer1, w2) 33 | probability = tf.nn.sigmoid(score) 34 | 35 | tvars = tf.trainable_variables() 36 | input_y = tf.placeholder(tf.float32, [None,1], name="input_y") 37 | advantages = tf.placeholder(tf.float32, name="reward_signal") 38 | 39 | loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability)) 40 | loss = -tf.reduce_mean(loglik*advantages) 41 | newGrads = tf.gradients(loss, tvars) 42 | 43 | adam = tf.train.AdamOptimizer(learning_rate=learning_rate) 44 | w1grad = tf.placeholder(tf.float32,name="batch_grad1") 45 | w2grad = tf.placeholder(tf.float32, name="batch_grad2") 46 | batchgrads = [w1grad, w2grad] 47 | updategrads = adam.apply_gradients(zip(batchgrads, tvars)) 48 | 49 | def discount_rewards(r): 50 | discounted_r = np.zeros_like(r) 51 | running_add = 0 52 | for t in reversed(xrange(0, r.size)): 53 | running_add = running_add * gamma + r[t] 54 | discounted_r[t] = running_add 55 | return discounted_r 56 | 57 | xs,hs,dlogps,drs,ys,tfps = [],[],[],[],[],[] 58 | running_reward = None 59 | reward_sum = 0 60 | episode_number = 1 61 | total_episodes = 10000 62 | init = tf.initialize_all_variables() 63 | #env.monitor.start('cartpole-policygradient-monitor/', force=True) 64 | 65 | # Launch the graph 66 | with tf.Session() as sess: 67 | rendering = False 68 | sess.run(init) 69 | observation = env.reset() # Obtain an initial observation of the environment 70 | 71 | # Reset the gradient placeholder. We will collect gradients in 72 | # gradBuffer until we are ready to update our policy network. 73 | gradBuffer = sess.run(tvars) 74 | for ix,grad in enumerate(gradBuffer): 75 | gradBuffer[ix] = grad * 0 76 | 77 | while episode_number <= total_episodes: 78 | 79 | # if reward_sum/batch_size > 100 or rendering == True : 80 | # env.render() 81 | # rendering = True 82 | 83 | # Make sure the observation is in a shape the network can handle. 84 | x = np.reshape(observation,[1,D]) 85 | 86 | # Run the policy network and get an action to take. 
87 | tfprob = sess.run(probability,feed_dict={observations: x}) 88 | 89 | action = 1 if np.random.uniform() < tfprob else 0 90 | #print ("ACTION :",action) 91 | xs.append(x) # observation 92 | y = 1 if action == 0 else 0 93 | ys.append(y) 94 | 95 | # step the environment and get new measurements 96 | observation, reward, done, info = env.step(action) 97 | 98 | reward_sum += reward 99 | 100 | drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action) 101 | 102 | if done: 103 | episode_number += 1 104 | # stack together all inputs, hidden states, action gradients, and rewards for this episode 105 | epx = np.vstack(xs) 106 | epy = np.vstack(ys) 107 | epr = np.vstack(drs) 108 | tfp = tfps 109 | xs,hs,dlogps,drs,ys,tfps = [],[],[],[],[],[] # reset array memory 110 | 111 | # compute the discounted reward backwards through time 112 | discounted_epr = discount_rewards(epr) 113 | # size the rewards to be unit normal (helps control the gradient estimator variance) 114 | discounted_epr -= np.mean(discounted_epr) 115 | discounted_epr /= np.std(discounted_epr) 116 | 117 | # Get the gradient for this episode, and save it in the gradBuffer 118 | tGrad = sess.run(newGrads,feed_dict={observations: epx, input_y: epy, advantages: discounted_epr}) 119 | for ix,grad in enumerate(tGrad): 120 | gradBuffer[ix] += grad 121 | 122 | # If we have completed enough episodes, then update the policy network with our gradients. 123 | if episode_number % batch_size == 0: 124 | sess.run(updategrads,feed_dict={w1grad: gradBuffer[0],w2grad:gradBuffer[1]}) 125 | for ix,grad in enumerate(gradBuffer): 126 | gradBuffer[ix] = grad * 0 127 | 128 | # Give a summary of how well our network is doing for each batch of episodes. 129 | running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01 130 | print ('Average reward for episode %f is :%f. Total average reward %f.' 
% (episode_number, reward_sum/batch_size, running_reward/batch_size)) 131 | writer.writerow([reward_sum/batch_size]) 132 | 133 | if reward_sum/batch_size > 200: 134 | print ("Task solved in",episode_number,'episodes!') 135 | break 136 | 137 | reward_sum = 0 138 | 139 | observation = env.reset() 140 | 141 | print (episode_number,'Episodes completed.') 142 | #env.monitor.close() 143 | 144 | -------------------------------------------------------------------------------- /cartpole/cartpole-policygradient2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | import csv 5 | 6 | # writer_file = open('rewards_pg_cartpole.csv', 'wt') 7 | # writer = csv.writer(writer_file) 8 | # writer.writerow(['total_rewards']) 9 | 10 | EPISODES = 10000 11 | env = gym.make('CartPole-v0') 12 | 13 | # create network graph 14 | D = env.observation_space.shape[0] 15 | A = env.action_space.n 16 | H = 10 17 | learning_rate = 0.01 18 | gamma = 0.95 19 | render = False 20 | 21 | tf.reset_default_graph() 22 | input_x = tf.placeholder(tf.float32, [None, D], name="input_x") 23 | fc1 = tf.contrib.layers.fully_connected(inputs = input_x,\ 24 | num_outputs = H,\ 25 | activation_fn= tf.nn.relu,\ 26 | weights_initializer=tf.contrib.layers.xavier_initializer()) 27 | fc2 = tf.contrib.layers.fully_connected(inputs = fc1,\ 28 | num_outputs = A,\ 29 | activation_fn= tf.nn.relu,\ 30 | weights_initializer=tf.contrib.layers.xavier_initializer()) 31 | fc3 = tf.contrib.layers.fully_connected(inputs = fc2,\ 32 | num_outputs = A,\ 33 | activation_fn= None,\ 34 | weights_initializer=tf.contrib.layers.xavier_initializer()) 35 | 36 | output = tf.nn.softmax(fc3) 37 | 38 | input_y = tf.placeholder(tf.float32, [None, 2], name="input_y") 39 | discounted_rewards = tf.placeholder(tf.float32, name="discounted_rewards") 40 | neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(logits=fc3, labels=input_y) 41 | product = neg_log_likelihood * discounted_rewards 42 | loss = tf.reduce_mean(product) # no need for -ve sign if using tf.nn.softmax_cross_entr..... 
43 | # as it gives neg_log_likelihood 44 | 45 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss) 46 | 47 | init = tf.initialize_all_variables() 48 | 49 | def discounted_rewards_(r): 50 | discounted_r = np.zeros_like(r) 51 | running_add = 0 52 | for t in reversed(xrange(len(r))): 53 | running_add = running_add * gamma + r[t] 54 | discounted_r[t] = running_add 55 | 56 | return discounted_r 57 | 58 | def choose_action(out): 59 | action = np.random.choice(range(out.shape[1]), p=out.ravel()) 60 | return action 61 | # env = gym.wrappers.Monitor(env, 'cartpole-policygradient-monitor/', force=True) 62 | 63 | with tf.Session() as sess: 64 | sess.run(init) 65 | 66 | xs, drs, ys = [], [], [] 67 | 68 | episode_number = 0 69 | reward_sum = 0 70 | reward_sum_buffer = [] 71 | current_state = env.reset() 72 | 73 | done = False 74 | goal_reached = False 75 | while not goal_reached: 76 | x = np.reshape(current_state, [1, D]) 77 | out = sess.run(output, feed_dict={input_x: x}) 78 | action = choose_action(out) 79 | xs.append(x) 80 | temp_y = np.zeros(2) 81 | temp_y[action] = 1 82 | ys.append(temp_y) 83 | 84 | next_state, reward, done, _ = env.step(action) 85 | # if render: 86 | # env.render() 87 | drs.append(reward) 88 | reward_sum += reward 89 | 90 | # if episode ends, find discounted rewards and 91 | # find gradients for the episode 92 | if done: 93 | episode_number += 1 94 | epx = np.vstack(np.array(xs)) 95 | epy = np.vstack(np.array(ys)) 96 | epr = np.vstack(np.array(drs)) 97 | 98 | discounted_rs = discounted_rewards_(drs) 99 | discounted_rs -= np.mean(discounted_rs) 100 | discounted_rs /= np.std(discounted_rs) 101 | 102 | xs, ys, drs = [], [], [] 103 | 104 | sess.run(optimizer, feed_dict={discounted_rewards: discounted_rs, input_x: epx, input_y: epy}) 105 | 106 | reward_sum_buffer.append(reward_sum) 107 | if episode_number % 100 == 0: 108 | average_per_100_eps = sum(reward_sum_buffer)/100 109 | if average_per_100_eps == 200.00: # achieved the goal. 110 | goal_reached = True 111 | 112 | print "Average reward for ", episode_number," episodes is :", average_per_100_eps 113 | reward_sum_buffer = [] 114 | # writer.writerow([average_per_100_eps]) 115 | 116 | if reward_sum == 200.0: 117 | render = True 118 | reward_sum = 0 119 | current_state = env.reset() 120 | 121 | current_state = next_state 122 | 123 | # env.monitor.close() 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /cartpole/ppo_tf/README.md: -------------------------------------------------------------------------------- 1 | # Proximal Policy Optimization (PPO) 2 | Implementation of Proximal Policy Optimization (PPO) for the classic CartPole environment using TensorFlow. 3 | 4 | ## Dependencies 5 | - Python: 3.5 6 | - TensorFlow: 1.4 7 | - Gym: 0.9.3 8 | 9 | ## Gym environment 10 | 11 | Environment: CartPole-v0 12 | State: Continuous 13 | Action: Discrete 14 | 15 | ## Implementation: 16 | 17 | **Train the PPO policy** 18 | ``` 19 | python3 run_ppo.py 20 | ``` 21 | 
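`run_ppo.py` trains the policy by maximizing the PPO clipped surrogate objective built in `algo/ppo.py`. The NumPy sketch below is not part of the repo (`clipped_surrogate` is an illustrative name); it shows only the clipped term with the same `clip_value = 0.2`, while the actual graph computes the ratio from clipped log-probabilities for numerical stability and also adds a value-function loss and an entropy bonus.

```python
import numpy as np

def clipped_surrogate(new_probs, old_probs, advantages, eps=0.2):
    """L_clip = mean(min(r * A, clip(r, 1 - eps, 1 + eps) * A)), with r = pi_new / pi_old."""
    ratios = new_probs / (old_probs + 1e-10)          # probability ratio of the taken actions
    clipped = np.clip(ratios, 1.0 - eps, 1.0 + eps)   # keep the update near the old policy
    return np.minimum(ratios * advantages, clipped * advantages).mean()

print(clipped_surrogate(np.array([0.6, 0.3]),
                        np.array([0.5, 0.4]),
                        np.array([1.0, -1.0])))
```

Clipping the ratio keeps each update close to the policy that collected the data, which is what lets `run_ppo.py` safely reuse every batch for several gradient epochs (it runs 6 per batch).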

22 | ![training_ppo](./images/training_ppo.gif) 

24 | 25 | **Test trained policy** 26 | ``` 27 | python3 test_policy.py --alg=ppo 28 | ``` 29 | **Plot(s)** 30 | 31 | | ![](./images/ppo_train_test.png) | ![](./images/ppo_train_test_legend.png) | 32 | | :---: | :---: | 33 | | Training and Testing results for PPO | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /cartpole/ppo_tf/algo/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/cartpole/ppo_tf/algo/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /cartpole/ppo_tf/algo/__pycache__/ppo.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/cartpole/ppo_tf/algo/__pycache__/ppo.cpython-35.pyc -------------------------------------------------------------------------------- /cartpole/ppo_tf/algo/__pycache__/ppo.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/cartpole/ppo_tf/algo/__pycache__/ppo.cpython-36.pyc -------------------------------------------------------------------------------- /cartpole/ppo_tf/algo/ppo.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import copy 3 | 4 | 5 | class PPOTrain: 6 | def __init__(self, Policy, Old_Policy, gamma=0.95, clip_value=0.2, c_1=1, c_2=0.01): 7 | """ 8 | :param Policy: 9 | :param Old_Policy: 10 | :param gamma: 11 | :param clip_value: 12 | :param c_1: parameter for value difference 13 | :param c_2: parameter for entropy bonus 14 | """ 15 | 16 | self.Policy = Policy 17 | self.Old_Policy = Old_Policy 18 | self.gamma = gamma 19 | 20 | pi_trainable = self.Policy.get_trainable_variables() 21 | old_pi_trainable = self.Old_Policy.get_trainable_variables() 22 | 23 | # assign_operations for policy parameter values to old policy parameters 24 | with tf.variable_scope('assign_op'): 25 | self.assign_ops = [] 26 | for v_old, v in zip(old_pi_trainable, pi_trainable): 27 | self.assign_ops.append(tf.assign(v_old, v)) 28 | 29 | # inputs for train_op 30 | with tf.variable_scope('train_inp'): 31 | self.actions = tf.placeholder(dtype=tf.int32, shape=[None], name='actions') 32 | self.rewards = tf.placeholder(dtype=tf.float32, shape=[None], name='rewards') 33 | self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next') 34 | self.gaes = tf.placeholder(dtype=tf.float32, shape=[None], name='gaes') 35 | 36 | act_probs = self.Policy.act_probs 37 | act_probs_old = self.Old_Policy.act_probs 38 | 39 | # probabilities of actions which agent took with policy 40 | act_probs = act_probs * tf.one_hot(indices=self.actions, depth=act_probs.shape[1]) 41 | act_probs = tf.reduce_sum(act_probs, axis=1) 42 | 43 | # probabilities of actions which agent took with old policy 44 | act_probs_old = act_probs_old * tf.one_hot(indices=self.actions, depth=act_probs_old.shape[1]) 45 | act_probs_old = tf.reduce_sum(act_probs_old, axis=1) 46 | 47 | with tf.variable_scope('loss'): 48 | # construct computation graph for loss_clip 49 | # ratios = tf.divide(act_probs, act_probs_old) 50 | ratios = tf.exp(tf.log(tf.clip_by_value(act_probs, 1e-10, 1.0)) 51 | - 
tf.log(tf.clip_by_value(act_probs_old, 1e-10, 1.0))) 52 | clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - clip_value, clip_value_max=1 + clip_value) 53 | loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios)) 54 | loss_clip = tf.reduce_mean(loss_clip) 55 | tf.summary.scalar('loss_clip', loss_clip) 56 | 57 | # construct computation graph for loss of entropy bonus 58 | entropy = -tf.reduce_sum(self.Policy.act_probs * 59 | tf.log(tf.clip_by_value(self.Policy.act_probs, 1e-10, 1.0)), axis=1) 60 | entropy = tf.reduce_mean(entropy, axis=0) # mean of entropy of pi(obs) 61 | tf.summary.scalar('entropy', entropy) 62 | 63 | # construct computation graph for loss of value function 64 | v_preds = self.Policy.v_preds 65 | loss_vf = tf.squared_difference(self.rewards + self.gamma * self.v_preds_next, v_preds) 66 | loss_vf = tf.reduce_mean(loss_vf) 67 | tf.summary.scalar('value_difference', loss_vf) 68 | 69 | # construct computation graph for loss 70 | loss = loss_clip - c_1 * loss_vf + c_2 * entropy 71 | 72 | # minimize -loss == maximize loss 73 | loss = -loss 74 | tf.summary.scalar('total', loss) 75 | 76 | self.merged = tf.summary.merge_all() 77 | optimizer = tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-5) 78 | self.gradients = optimizer.compute_gradients(loss, var_list=pi_trainable) 79 | self.train_op = optimizer.minimize(loss, var_list=pi_trainable) 80 | 81 | def train(self, obs, actions, gaes, rewards, v_preds_next): 82 | tf.get_default_session().run(self.train_op, feed_dict={self.Policy.obs: obs, 83 | self.Old_Policy.obs: obs, 84 | self.actions: actions, 85 | self.rewards: rewards, 86 | self.v_preds_next: v_preds_next, 87 | self.gaes: gaes}) 88 | 89 | def get_summary(self, obs, actions, gaes, rewards, v_preds_next): 90 | return tf.get_default_session().run(self.merged, feed_dict={self.Policy.obs: obs, 91 | self.Old_Policy.obs: obs, 92 | self.actions: actions, 93 | self.rewards: rewards, 94 | self.v_preds_next: v_preds_next, 95 | self.gaes: gaes}) 96 | 97 | def assign_policy_parameters(self): 98 | # assign policy parameter values to old policy parameters 99 | return tf.get_default_session().run(self.assign_ops) 100 | 101 | def get_gaes(self, rewards, v_preds, v_preds_next): 102 | deltas = [r_t + self.gamma * v_next - v for r_t, v_next, v in zip(rewards, v_preds_next, v_preds)] 103 | # calculate generative advantage estimator(lambda = 1), see ppo paper eq(11) 104 | gaes = copy.deepcopy(deltas) 105 | for t in reversed(range(len(gaes) - 1)): # is T-1, where T is time step which run policy 106 | gaes[t] = gaes[t] + self.gamma * gaes[t + 1] 107 | return gaes 108 | 109 | def get_grad(self, obs, actions, gaes, rewards, v_preds_next): 110 | return tf.get_default_session().run(self.gradients, feed_dict={self.Policy.obs: obs, 111 | self.Old_Policy.obs: obs, 112 | self.actions: actions, 113 | self.rewards: rewards, 114 | self.v_preds_next: v_preds_next, 115 | self.gaes: gaes}) 116 | -------------------------------------------------------------------------------- /cartpole/ppo_tf/images/ppo_train_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/cartpole/ppo_tf/images/ppo_train_test.png -------------------------------------------------------------------------------- /cartpole/ppo_tf/images/ppo_train_test_legend.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/cartpole/ppo_tf/images/ppo_train_test_legend.png -------------------------------------------------------------------------------- /cartpole/ppo_tf/images/training_ppo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/cartpole/ppo_tf/images/training_ppo.gif -------------------------------------------------------------------------------- /cartpole/ppo_tf/network_models/policy_net.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class Policy_net: 5 | def __init__(self, name: str, env): 6 | """ 7 | :param name: string 8 | :param env: gym env 9 | """ 10 | 11 | ob_space = env.observation_space 12 | act_space = env.action_space 13 | 14 | with tf.variable_scope(name): 15 | self.obs = tf.placeholder(dtype=tf.float32, shape=[None] + list(ob_space.shape), name='obs') 16 | 17 | with tf.variable_scope('policy_net'): 18 | layer_1 = tf.layers.dense(inputs=self.obs, units=20, activation=tf.tanh) 19 | layer_2 = tf.layers.dense(inputs=layer_1, units=20, activation=tf.tanh) 20 | layer_3 = tf.layers.dense(inputs=layer_2, units=act_space.n, activation=tf.tanh) 21 | self.act_probs = tf.layers.dense(inputs=layer_3, units=act_space.n, activation=tf.nn.softmax) 22 | 23 | with tf.variable_scope('value_net'): 24 | layer_1 = tf.layers.dense(inputs=self.obs, units=20, activation=tf.tanh) 25 | layer_2 = tf.layers.dense(inputs=layer_1, units=20, activation=tf.tanh) 26 | self.v_preds = tf.layers.dense(inputs=layer_2, units=1, activation=None) 27 | 28 | self.act_stochastic = tf.multinomial(tf.log(self.act_probs), num_samples=1) 29 | self.act_stochastic = tf.reshape(self.act_stochastic, shape=[-1]) 30 | 31 | self.act_deterministic = tf.argmax(self.act_probs, axis=1) 32 | 33 | self.scope = tf.get_variable_scope().name 34 | 35 | def act(self, obs, stochastic=True): 36 | if stochastic: 37 | return tf.get_default_session().run([self.act_stochastic, self.v_preds], feed_dict={self.obs: obs}) 38 | else: 39 | return tf.get_default_session().run([self.act_deterministic, self.v_preds], feed_dict={self.obs: obs}) 40 | 41 | def get_action_prob(self, obs): 42 | return tf.get_default_session().run(self.act_probs, feed_dict={self.obs: obs}) 43 | 44 | def get_variables(self): 45 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 46 | 47 | def get_trainable_variables(self): 48 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 49 | 50 | -------------------------------------------------------------------------------- /cartpole/ppo_tf/run_ppo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import argparse 3 | import gym 4 | import numpy as np 5 | import tensorflow as tf 6 | from network_models.policy_net import Policy_net 7 | from algo.ppo import PPOTrain 8 | 9 | 10 | def argparser(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--logdir', help='log directory', default='log/train/ppo') 13 | parser.add_argument('--savedir', help='save directory', default='trained_models/ppo') 14 | parser.add_argument('--gamma', default=0.95, type=float) 15 | parser.add_argument('--iteration', default=int(1e4), type=int) 16 | return parser.parse_args() 17 | 18 | 19 | def main(args): 20 | env = gym.make('CartPole-v0') 21 | env.seed(0) 22 | ob_space = 
env.observation_space 23 | Policy = Policy_net('policy', env) 24 | Old_Policy = Policy_net('old_policy', env) 25 | PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma) 26 | saver = tf.train.Saver() 27 | 28 | with tf.Session() as sess: 29 | writer = tf.summary.FileWriter(args.logdir, sess.graph) 30 | sess.run(tf.global_variables_initializer()) 31 | obs = env.reset() 32 | reward = 0 33 | success_num = 0 34 | 35 | for iteration in range(args.iteration): 36 | observations = [] 37 | actions = [] 38 | v_preds = [] 39 | rewards = [] 40 | episode_length = 0 41 | while True: # run policy RUN_POLICY_STEPS which is much less than episode length 42 | episode_length += 1 43 | obs = np.stack([obs]).astype(dtype=np.float32) # prepare to feed placeholder Policy.obs 44 | act, v_pred = Policy.act(obs=obs, stochastic=True) 45 | 46 | act = np.asscalar(act) 47 | v_pred = np.asscalar(v_pred) 48 | 49 | observations.append(obs) 50 | actions.append(act) 51 | v_preds.append(v_pred) 52 | rewards.append(reward) 53 | 54 | next_obs, reward, done, info = env.step(act) 55 | 56 | if done: 57 | v_preds_next = v_preds[1:] + [0] # next state of terminate state has 0 state value 58 | obs = env.reset() 59 | reward = -1 60 | break 61 | else: 62 | obs = next_obs 63 | 64 | writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=episode_length)]) 65 | , iteration) 66 | writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]) 67 | , iteration) 68 | 69 | if sum(rewards) >= 195: 70 | success_num += 1 71 | if success_num >= 100: 72 | saver.save(sess, args.savedir+'/model.ckpt') 73 | print('Clear!! Model saved.') 74 | break 75 | else: 76 | success_num = 0 77 | 78 | gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next) 79 | 80 | # convert list to numpy array for feeding tf.placeholder 81 | observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape)) 82 | actions = np.array(actions).astype(dtype=np.int32) 83 | gaes = np.array(gaes).astype(dtype=np.float32) 84 | gaes = (gaes - gaes.mean()) / gaes.std() 85 | rewards = np.array(rewards).astype(dtype=np.float32) 86 | v_preds_next = np.array(v_preds_next).astype(dtype=np.float32) 87 | 88 | PPO.assign_policy_parameters() 89 | 90 | inp = [observations, actions, gaes, rewards, v_preds_next] 91 | 92 | # train 93 | for epoch in range(6): 94 | # sample indices from [low, high) 95 | sample_indices = np.random.randint(low=0, high=observations.shape[0], size=32) 96 | sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp] # sample training data 97 | PPO.train(obs=sampled_inp[0], 98 | actions=sampled_inp[1], 99 | gaes=sampled_inp[2], 100 | rewards=sampled_inp[3], 101 | v_preds_next=sampled_inp[4]) 102 | 103 | summary = PPO.get_summary(obs=inp[0], 104 | actions=inp[1], 105 | gaes=inp[2], 106 | rewards=inp[3], 107 | v_preds_next=inp[4]) 108 | 109 | writer.add_summary(summary, iteration) 110 | writer.close() 111 | 112 | 113 | if __name__ == '__main__': 114 | args = argparser() 115 | main(args) 116 | -------------------------------------------------------------------------------- /cartpole/ppo_tf/test_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import argparse 5 | from network_models.policy_net import Policy_net 6 | 7 | 8 | def argparser(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--modeldir', help='directory of model', 
default='trained_models') 11 | parser.add_argument('--alg', help='chose algorithm one of gail, ppo, bc', default='gail') 12 | parser.add_argument('--model', help='number of model to test. model.ckpt-number', default='') 13 | parser.add_argument('--logdir', help='log directory', default='log/test') 14 | parser.add_argument('--iteration', default=int(1e3)) 15 | parser.add_argument('--stochastic', action='store_false') 16 | return parser.parse_args() 17 | 18 | 19 | def main(args): 20 | env = gym.make('CartPole-v0') 21 | env.seed(0) 22 | Policy = Policy_net('policy', env) 23 | saver = tf.train.Saver() 24 | 25 | with tf.Session() as sess: 26 | writer = tf.summary.FileWriter(args.logdir+'/'+args.alg, sess.graph) 27 | sess.run(tf.global_variables_initializer()) 28 | if args.model == '': 29 | saver.restore(sess, args.modeldir+'/'+args.alg+'/'+'model.ckpt') 30 | else: 31 | saver.restore(sess, args.modeldir+'/'+args.alg+'/'+'model.ckpt-'+args.model) 32 | obs = env.reset() 33 | reward = 0 34 | success_num = 0 35 | 36 | for iteration in range(args.iteration): 37 | rewards = [] 38 | run_policy_steps = 0 39 | while True: # run policy RUN_POLICY_STEPS which is much less than episode length 40 | run_policy_steps += 1 41 | obs = np.stack([obs]).astype(dtype=np.float32) # prepare to feed placeholder Policy.obs 42 | act, _ = Policy.act(obs=obs, stochastic=args.stochastic) 43 | 44 | act = np.asscalar(act) 45 | 46 | rewards.append(reward) 47 | 48 | next_obs, reward, done, info = env.step(act) 49 | 50 | if done: 51 | obs = env.reset() 52 | reward = -1 53 | break 54 | else: 55 | obs = next_obs 56 | 57 | writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]) 58 | , iteration) 59 | writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]) 60 | , iteration) 61 | 62 | # end condition of test 63 | if sum(rewards) >= 195: 64 | success_num += 1 65 | if success_num >= 100: 66 | print('Iteration: ', iteration) 67 | print('Clear!!') 68 | break 69 | else: 70 | success_num = 0 71 | 72 | writer.close() 73 | 74 | 75 | if __name__ == '__main__': 76 | args = argparser() 77 | main(args) 78 | -------------------------------------------------------------------------------- /cartpole/ppo_tf/trajectory/actions.csv: -------------------------------------------------------------------------------- 1 | 1 2 | 0 3 | 1 4 | 0 5 | 1 6 | 0 7 | 1 8 | 1 9 | 0 10 | 0 11 | 1 12 | 1 13 | 0 14 | 0 15 | 1 16 | 0 17 | 1 18 | 0 19 | 0 20 | 0 21 | 1 22 | 1 23 | 0 24 | 0 25 | 1 26 | 0 27 | 0 28 | 1 29 | 1 30 | 0 31 | 0 32 | 0 33 | 1 34 | 1 35 | 1 36 | 1 37 | 0 38 | 0 39 | 0 40 | 1 41 | 1 42 | 0 43 | 1 44 | 0 45 | 0 46 | 1 47 | 0 48 | 1 49 | 1 50 | 0 51 | 1 52 | 0 53 | 1 54 | 0 55 | 1 56 | 1 57 | 0 58 | 0 59 | 0 60 | 1 61 | 1 62 | 0 63 | 0 64 | 1 65 | 1 66 | 0 67 | 0 68 | 1 69 | 0 70 | 1 71 | 0 72 | 0 73 | 1 74 | 1 75 | 0 76 | 1 77 | 0 78 | 0 79 | 1 80 | 0 81 | 0 82 | 1 83 | 1 84 | 1 85 | 0 86 | 1 87 | 1 88 | 0 89 | 0 90 | 1 91 | 0 92 | 1 93 | 0 94 | 1 95 | 1 96 | 0 97 | 1 98 | 0 99 | 1 100 | 0 101 | 0 102 | 1 103 | 1 104 | 0 105 | 0 106 | 1 107 | 0 108 | 1 109 | 1 110 | 0 111 | 0 112 | 1 113 | 1 114 | 0 115 | 1 116 | 0 117 | 0 118 | 1 119 | 0 120 | 1 121 | 1 122 | 0 123 | 0 124 | 1 125 | 1 126 | 0 127 | 0 128 | 1 129 | 1 130 | 1 131 | 0 132 | 0 133 | 0 134 | 1 135 | 1 136 | 0 137 | 0 138 | 1 139 | 1 140 | 0 141 | 1 142 | 0 143 | 0 144 | 1 145 | 0 146 | 0 147 | 1 148 | 1 149 | 1 150 | 0 151 | 1 152 | 0 153 | 0 154 | 1 155 | 0 156 | 0 157 | 1 158 | 1 159 | 
[... remaining rows of this file omitted: a single 0/1 value per line, continuing through line 3997 ...] 3998 |
-------------------------------------------------------------------------------- /cartpole/test.csv: -------------------------------------------------------------------------------- 1 | total_rewards 2 | 16.0 3 | 19.0 4 | 18.0 5 | 37.0 6 | 20.0 7 | 12.0 8 | 10.0 9 | 48.0 10 | 15.0 11 | 20.0 12 | 14.0 13 | 20.0 14 | 62.0 15 | 35.0 16 | 51.0 17 | 17.0 18 | 14.0 19 | 18.0 20 | 15.0 21 | 25.0 22 | 29.0 23 |
-------------------------------------------------------------------------------- /cartpole_gazebo/README.md: -------------------------------------------------------------------------------- 1 | ## 3D CartPole environment in Gazebo. 2 | 3 | ### Dependencies: 4 | - Ubuntu 16.04 5 | - ROS Kinetic 6 | - Gazebo 7 7 | - TensorFlow: 1.1.0 [with GPU support] 8 | - gym: 0.9.3 9 | - Python 3.6 10 | 11 | ### File setup: 12 | - ***cartpole_gazebo*** contains the robot model (both the **.stl** files and the **.urdf** file) as well as the ***gazebo launch file***. 13 | 14 | - ***cartpole_controller*** contains the reinforcement learning implementation of the ***Policy Gradient algorithm*** for the custom cartpole environment. 15 | 16 | #### Policy Gradient for the custom-designed cartpole model in the Gazebo environment. 17 |
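The controller in `src/pg.py` trains a REINFORCE-style policy gradient: a linear-softmax policy over the 4-dimensional state `[cart_x, cart_x_dot, pole_theta, pole_theta_dot]`, pushed to increase the log-probability of each sampled action in proportion to its advantage (discounted Monte-Carlo return minus a learned value baseline). The sketch below is a condensed, illustrative version of that update in the same TensorFlow 1.x style as the scripts in this package (the placeholder shapes are simplified here); see `pg.py` and `pg2.py` for the full training loops.

```python
import tensorflow as tf

# Batch of transitions collected from the Gazebo cartpole environment.
state      = tf.placeholder(tf.float32, [None, 4])  # [cart_x, cart_x_dot, pole_theta, pole_theta_dot]
actions    = tf.placeholder(tf.float32, [None, 2])  # one-hot encoding of the sampled action
advantages = tf.placeholder(tf.float32, [None])     # discounted return minus value-network baseline

# Linear-softmax policy, as in policy_gradient() in pg.py.
params = tf.get_variable("policy_parameters", [4, 2])
probabilities = tf.nn.softmax(tf.matmul(state, params))

# Log-probability of the action actually taken, weighted by its advantage.
good_prob = tf.reduce_sum(probabilities * actions, axis=1)
loss = -tf.reduce_sum(tf.log(good_prob) * advantages)
train_op = tf.train.AdamOptimizer(0.01).minimize(loss)
```

`pg2.py` implements the same idea with a single hidden layer (10 ReLU units) and a sigmoid policy head, accumulating gradients over batches of 50 episodes before applying them.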

18 | 19 |
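Both scripts convert the per-step reward of 1 (granted while the pole stays within roughly ±0.21 rad and the cart within ±0.4 m) into a discounted return before computing advantages. A minimal illustration of that backward pass (cf. `discount_rewards()` in `pg2.py`, which uses gamma = 0.99; `pg.py` uses 0.97); the function name here is just for illustration:

```python
def discounted_returns(rewards, gamma=0.99):
    """Backward pass: G[t] = r[t] + gamma * G[t+1]."""
    returns = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# e.g. a 4-step episode with reward 1 at every step:
print(discounted_returns([1, 1, 1, 1]))  # [3.940399, 2.9701, 1.99, 1.0]
```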

20 | 21 | 22 | -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_controller/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.3) 2 | project(cartpole_controller) 3 | 4 | ## Compile as C++11, supported in ROS Kinetic and newer 5 | # add_compile_options(-std=c++11) 6 | 7 | ## Find catkin macros and libraries 8 | ## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz) 9 | ## is used, also find other catkin packages 10 | find_package(catkin REQUIRED COMPONENTS 11 | 12 | ) 13 | 14 | ## System dependencies are found with CMake's conventions 15 | # find_package(Boost REQUIRED COMPONENTS system) 16 | 17 | 18 | ## Uncomment this if the package has a setup.py. This macro ensures 19 | ## modules and global scripts declared therein get installed 20 | ## See http://ros.org/doc/api/catkin/html/user_guide/setup_dot_py.html 21 | # catkin_python_setup() 22 | 23 | ################################################ 24 | ## Declare ROS messages, services and actions ## 25 | ################################################ 26 | 27 | ## To declare and build messages, services or actions from within this 28 | ## package, follow these steps: 29 | ## * Let MSG_DEP_SET be the set of packages whose message types you use in 30 | ## your messages/services/actions (e.g. std_msgs, actionlib_msgs, ...). 31 | ## * In the file package.xml: 32 | ## * add a build_depend tag for "message_generation" 33 | ## * add a build_depend and a run_depend tag for each package in MSG_DEP_SET 34 | ## * If MSG_DEP_SET isn't empty the following dependency has been pulled in 35 | ## but can be declared for certainty nonetheless: 36 | ## * add a run_depend tag for "message_runtime" 37 | ## * In this file (CMakeLists.txt): 38 | ## * add "message_generation" and every package in MSG_DEP_SET to 39 | ## find_package(catkin REQUIRED COMPONENTS ...) 40 | ## * add "message_runtime" and every package in MSG_DEP_SET to 41 | ## catkin_package(CATKIN_DEPENDS ...) 42 | ## * uncomment the add_*_files sections below as needed 43 | ## and list every .msg/.srv/.action file to be processed 44 | ## * uncomment the generate_messages entry below 45 | ## * add every package in MSG_DEP_SET to generate_messages(DEPENDENCIES ...) 46 | 47 | ## Generate messages in the 'msg' folder 48 | # add_message_files( 49 | # FILES 50 | # Message1.msg 51 | # Message2.msg 52 | # ) 53 | 54 | ## Generate services in the 'srv' folder 55 | # add_service_files( 56 | # FILES 57 | # Service1.srv 58 | # Service2.srv 59 | # ) 60 | 61 | ## Generate actions in the 'action' folder 62 | # add_action_files( 63 | # FILES 64 | # Action1.action 65 | # Action2.action 66 | # ) 67 | 68 | ## Generate added messages and services with any dependencies listed here 69 | # generate_messages( 70 | # DEPENDENCIES 71 | # std_msgs 72 | # ) 73 | 74 | ################################################ 75 | ## Declare ROS dynamic reconfigure parameters ## 76 | ################################################ 77 | 78 | ## To declare and build dynamic reconfigure parameters within this 79 | ## package, follow these steps: 80 | ## * In the file package.xml: 81 | ## * add a build_depend and a run_depend tag for "dynamic_reconfigure" 82 | ## * In this file (CMakeLists.txt): 83 | ## * add "dynamic_reconfigure" to 84 | ## find_package(catkin REQUIRED COMPONENTS ...) 
85 | ## * uncomment the "generate_dynamic_reconfigure_options" section below 86 | ## and list every .cfg file to be processed 87 | 88 | ## Generate dynamic reconfigure parameters in the 'cfg' folder 89 | # generate_dynamic_reconfigure_options( 90 | # cfg/DynReconf1.cfg 91 | # cfg/DynReconf2.cfg 92 | # ) 93 | 94 | ################################### 95 | ## catkin specific configuration ## 96 | ################################### 97 | ## The catkin_package macro generates cmake config files for your package 98 | ## Declare things to be passed to dependent projects 99 | ## INCLUDE_DIRS: uncomment this if your package contains header files 100 | ## LIBRARIES: libraries you create in this project that dependent projects also need 101 | ## CATKIN_DEPENDS: catkin_packages dependent projects also need 102 | ## DEPENDS: system dependencies of this project that dependent projects also need 103 | catkin_package( 104 | # INCLUDE_DIRS include 105 | # LIBRARIES cartpole_controller 106 | # CATKIN_DEPENDS ros_control ros_controllers rospy std_msgs 107 | # DEPENDS system_lib 108 | ) 109 | 110 | ########### 111 | ## Build ## 112 | ########### 113 | 114 | ## Specify additional locations of header files 115 | ## Your package locations should be listed before other locations 116 | include_directories( 117 | # include 118 | ${catkin_INCLUDE_DIRS} 119 | ) 120 | 121 | ## Declare a C++ library 122 | # add_library(${PROJECT_NAME} 123 | # src/${PROJECT_NAME}/cartpole_controller.cpp 124 | # ) 125 | 126 | ## Add cmake target dependencies of the library 127 | ## as an example, code may need to be generated before libraries 128 | ## either from message generation or dynamic reconfigure 129 | # add_dependencies(${PROJECT_NAME} ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS}) 130 | 131 | ## Declare a C++ executable 132 | ## With catkin_make all packages are built within a single CMake context 133 | ## The recommended prefix ensures that target names across packages don't collide 134 | # add_executable(${PROJECT_NAME}_node src/cartpole_controller_node.cpp) 135 | 136 | ## Rename C++ executable without prefix 137 | ## The above recommended prefix causes long target names, the following renames the 138 | ## target back to the shorter version for ease of user use 139 | ## e.g. "rosrun someones_pkg node" instead of "rosrun someones_pkg someones_pkg_node" 140 | # set_target_properties(${PROJECT_NAME}_node PROPERTIES OUTPUT_NAME node PREFIX "") 141 | 142 | ## Add cmake target dependencies of the executable 143 | ## same as for the library above 144 | # add_dependencies(${PROJECT_NAME}_node ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS}) 145 | 146 | ## Specify libraries to link a library or executable target against 147 | # target_link_libraries(${PROJECT_NAME}_node 148 | # ${catkin_LIBRARIES} 149 | # ) 150 | 151 | ############# 152 | ## Install ## 153 | ############# 154 | 155 | # all install targets should use catkin DESTINATION variables 156 | # See http://ros.org/doc/api/catkin/html/adv_user_guide/variables.html 157 | 158 | ## Mark executable scripts (Python etc.) 
for installation 159 | ## in contrast to setup.py, you can choose the destination 160 | # install(PROGRAMS 161 | # scripts/my_python_script 162 | # DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} 163 | # ) 164 | 165 | ## Mark executables and/or libraries for installation 166 | # install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_node 167 | # ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} 168 | # LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} 169 | # RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} 170 | # ) 171 | 172 | ## Mark cpp header files for installation 173 | # install(DIRECTORY include/${PROJECT_NAME}/ 174 | # DESTINATION ${CATKIN_PACKAGE_INCLUDE_DESTINATION} 175 | # FILES_MATCHING PATTERN "*.h" 176 | # PATTERN ".svn" EXCLUDE 177 | # ) 178 | 179 | ## Mark other files for installation (e.g. launch and bag files, etc.) 180 | # install(FILES 181 | # # myfile1 182 | # # myfile2 183 | # DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION} 184 | # ) 185 | 186 | ############# 187 | ## Testing ## 188 | ############# 189 | 190 | ## Add gtest based cpp test target and link libraries 191 | # catkin_add_gtest(${PROJECT_NAME}-test test/test_cartpole_controller.cpp) 192 | # if(TARGET ${PROJECT_NAME}-test) 193 | # target_link_libraries(${PROJECT_NAME}-test ${PROJECT_NAME}) 194 | # endif() 195 | 196 | ## Add folders to be run by python nosetests 197 | # catkin_add_nosetests(test) 198 | -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_controller/config/joint_position_control.yaml: -------------------------------------------------------------------------------- 1 | joint_state_controller: 2 | type: joint_state_controller/JointStateController 3 | publish_rate: 120 4 | 5 | # Position Controllers --------------------------------------- 6 | stand_cart_position_controller: 7 | type: effort_controllers/JointPositionController 8 | joint: stand_cart 9 | pid: {p: 100, i: 0.01, d: 10} 10 | 11 | cart_pole_position_controller: 12 | type: effort_controllers/JointPositionController 13 | joint: cart_pole 14 | pid: {p: 100, i: 0.01, d: 10} 15 | 16 | 17 | # waist_pendulum_position_controller: 18 | # type: effort_controllers/JointPositionController 19 | # joint: waist_pendulum 20 | # pid: {p: 100.0, i: 0.01, d: 10.0} 21 | 22 | # top_pendulum_position_controller: 23 | # type: effort_controllers/JointPositionController 24 | # joint: top_pendulum 25 | # pid: {p: 100.0, i: 0.01, d: 10.0} 26 | 27 | # shankL_footL_position_controller: 28 | # type: effort_controllers/JointPositionController 29 | # joint: shankL_footL 30 | # pid: {p: 100.0, i: 0.01, d: 10.0} 31 | 32 | # waist_boom_position_controller: 33 | # type: effort_controllers/JointPositionController 34 | # joint: waist_boom 35 | # pid: {p: 100.0, i: 0.01, d: 10.0} 36 | 37 | 38 | # pivot_boom_position_controller: 39 | # type: effort_controllers/JointPositionController 40 | # joint: boom_pivot 41 | # pid: {p: 100.0, i: 0.01, d: 10.0} 42 | -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_controller/package.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | cartpole_controller 4 | 0.0.0 5 | The cartpole_controller package 6 | 7 | 8 | 9 | 10 | imitaion 11 | 12 | 13 | 14 | 15 | 16 | TODO 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | catkin 52 | ros_control 53 | ros_controllers 
54 | rospy 55 | std_msgs 56 | ros_control 57 | ros_controllers 58 | rospy 59 | std_msgs 60 | ros_control 61 | ros_controllers 62 | rospy 63 | std_msgs 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_controller/src/pg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | import roslib 6 | import rospy 7 | import random 8 | import time 9 | import math 10 | import csv 11 | from std_srvs.srv import Empty 12 | from gazebo_msgs.srv import SetModelConfiguration 13 | 14 | from control_msgs.msg import JointControllerState 15 | from sensor_msgs.msg import JointState 16 | from gazebo_msgs.msg import LinkStates 17 | from gazebo_msgs.srv import SetLinkState 18 | from gazebo_msgs.msg import LinkState 19 | from std_msgs.msg import Float64 20 | from std_msgs.msg import String 21 | from sensor_msgs.msg import Joy 22 | 23 | 24 | import threading 25 | from scipy.interpolate import interp1d 26 | 27 | ENV_NAME = 'Cartpole_v0' 28 | EPISODES = 100000 29 | TEST = 10 30 | 31 | 32 | pubCartPosition = rospy.Publisher('/stand_cart_position_controller/command', Float64, queue_size=1) 33 | pubJointStates = rospy.Publisher('/joint_states', JointState, queue_size=1) 34 | 35 | reset_world = rospy.ServiceProxy('/gazebo/reset_world', Empty) 36 | reset_joints = rospy.ServiceProxy('/gazebo/set_model_configuration', SetModelConfiguration) 37 | unpause = rospy.ServiceProxy('/gazebo/unpause_physics', Empty) 38 | pause = rospy.ServiceProxy('/gazebo/pause_physics', Empty) 39 | set_link = rospy.ServiceProxy('/gazebo/set_link_state', SetLinkState) 40 | 41 | fall = 0 42 | 43 | 44 | rospy.init_node('cartpole_control_script') 45 | rate = rospy.Rate(120) 46 | 47 | 48 | 49 | class RobotState(object): 50 | def __init__(self): 51 | self.cart_x = 0.0 52 | self.cart_x_dot = 0.0 53 | self.pole_theta = 0.0 54 | self.pole_theta_dot = 0.0 55 | self.robot_state = [self.cart_x, self.cart_x_dot, self.pole_theta, self.pole_theta_dot] 56 | 57 | self.data = None 58 | self.latest_reward = 0.0 59 | self.fall = 0 60 | 61 | self.theta_threshold = 0.20943951023 62 | self.x_threshold = 0.4 63 | 64 | self.current_vel = 0.0 65 | self.done = False 66 | 67 | 68 | robot_state = RobotState() 69 | 70 | 71 | def reset(): 72 | rospy.wait_for_service('/gazebo/reset_world') 73 | 74 | try: 75 | reset_world() 76 | except (rospy.ServiceException) as e: 77 | print "reset_world failed!" 78 | 79 | 80 | # rospy.wait_for_service('/gazebo/reset_world') 81 | rospy.wait_for_service('/gazebo/set_model_configuration') 82 | 83 | try: 84 | #reset_proxy.call() 85 | # reset_world() 86 | reset_joints("cartpole", "robot_description", ["stand_cart", "cart_pole"], [0.0, 0.0]) 87 | 88 | 89 | except (rospy.ServiceException) as e: 90 | print "/gazebo/reset_joints service call failed" 91 | 92 | rospy.wait_for_service('/gazebo/pause_physics') 93 | try: 94 | pause() 95 | except (rospy.ServiceException) as e: 96 | print "rospause failed!" 
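    # The reset sequence above restores the Gazebo world, zeroes the stand_cart and
    # cart_pole joints via /gazebo/set_model_configuration, and then pauses physics;
    # the simulation stays paused until take_action() calls unpause_physics again
    # before publishing the next command to the cart's position controller.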
97 | 98 | # rospy.wait_for_service('/gazebo/unpause_physics') 99 | 100 | # try: 101 | # unpause() 102 | # except (rospy.ServiceException) as e: 103 | # print "/gazebo/pause_physics service call failed" 104 | 105 | set_robot_state() 106 | robot_state.current_vel = 0 107 | print "called reset()" 108 | 109 | 110 | 111 | 112 | 113 | def set_robot_state(): 114 | robot_state.robot_state = [robot_state.cart_x, robot_state.cart_x_dot, robot_state.pole_theta, robot_state.pole_theta_dot] 115 | 116 | def take_action(action): 117 | rospy.wait_for_service('/gazebo/unpause_physics') 118 | 119 | try: 120 | unpause() 121 | except (rospy.ServiceException) as e: 122 | print "/gazebo/pause_physics service call failed" 123 | 124 | 125 | if action == 1: 126 | robot_state.current_vel = robot_state.current_vel + 0.05 127 | else: 128 | robot_state.current_vel = robot_state.current_vel - 0.05 129 | 130 | 131 | # print "publish : ", robot_state.current_vel 132 | pubCartPosition.publish(robot_state.current_vel) 133 | 134 | reward = 1 135 | 136 | # ['cart_pole', 'stand_cart'] 137 | if robot_state.data==None: 138 | while robot_state.data is None: 139 | try: 140 | robot_state.data = rospy.wait_for_message('/joint_states', JointState, timeout=5) 141 | except: 142 | print "Error getting /joint_states data." 143 | # print "DATA : ",robot_state.data 144 | # print "latest_reward: ", robot_state.latest_reward 145 | 146 | # if len(robot_state.data.velocity) > 0: 147 | # robot_state.cart_x_dot = robot_state.data.velocity[1] 148 | # robot_state.pole_theta_dot = robot_state.data.velocity[0] 149 | # else: 150 | # robot_state.cart_x_dot = 0.0 151 | # robot_state.pole_theta_dot = 0.0 152 | 153 | # robot_state.cart_x = robot_state.data.position[1] 154 | # robot_state.pole_theta = robot_state.data.position[0] 155 | 156 | 157 | set_robot_state() 158 | 159 | if robot_state.cart_x < -robot_state.x_threshold or robot_state.cart_x > robot_state.x_threshold or robot_state.pole_theta > robot_state.theta_threshold \ 160 | or robot_state.pole_theta < -robot_state.theta_threshold: 161 | 162 | robot_state.done = True 163 | reward = 1 164 | 165 | else: 166 | reward = 1 167 | 168 | # rate.sleep() 169 | 170 | return reward, robot_state.done 171 | 172 | 173 | def callbackJointStates(data): 174 | if len(data.velocity) > 0: 175 | robot_state.cart_x_dot = data.velocity[1] 176 | robot_state.pole_theta_dot = data.velocity[0] 177 | else: 178 | robot_state.cart_x_dot = 0.0 179 | robot_state.pole_theta_dot = 0.0 180 | robot_state.cart_x = data.position[1] 181 | robot_state.pole_theta = data.position[0] 182 | 183 | set_robot_state() 184 | 185 | print "DATA :", data 186 | 187 | 188 | def listener(): 189 | print "listener" 190 | rospy.Subscriber("/joint_states", JointState, callbackJointStates) 191 | 192 | 193 | 194 | def softmax(x): 195 | e_x = np.exp(x - np.max(x)) 196 | out = e_x / e_x.sum() 197 | return out 198 | 199 | 200 | def policy_gradient(): 201 | with tf.variable_scope("policy"): 202 | params = tf.get_variable("policy_parameters",[4,2]) 203 | state = tf.placeholder("float",[None,4]) 204 | actions = tf.placeholder("float",[None,2]) 205 | advantages = tf.placeholder("float",[None,1]) 206 | linear = tf.matmul(state,params) 207 | probabilities = tf.nn.softmax(linear) 208 | good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions),reduction_indices=[1]) 209 | eligibility = tf.log(good_probabilities) * advantages 210 | loss = -tf.reduce_sum(eligibility) 211 | optimizer = tf.train.AdamOptimizer(0.01).minimize(loss) 212 | return probabilities, 
state, actions, advantages, optimizer 213 | 214 | def value_gradient(): 215 | with tf.variable_scope("value"): 216 | state = tf.placeholder("float",[None,4]) 217 | newvals = tf.placeholder("float",[None,1]) 218 | w1 = tf.get_variable("w1",[4,10]) 219 | b1 = tf.get_variable("b1",[10]) 220 | h1 = tf.nn.relu(tf.matmul(state,w1) + b1) 221 | w2 = tf.get_variable("w2",[10,1]) 222 | b2 = tf.get_variable("b2",[1]) 223 | calculated = tf.matmul(h1,w2) + b2 224 | diffs = calculated - newvals 225 | loss = tf.nn.l2_loss(diffs) 226 | optimizer = tf.train.AdamOptimizer(0.1).minimize(loss) 227 | return calculated, state, newvals, optimizer, loss 228 | 229 | 230 | def run_episode(policy_grad, value_grad, sess): 231 | pl_calculated, pl_state, pl_actions, pl_advantages, pl_optimizer = policy_grad 232 | vl_calculated, vl_state, vl_newvals, vl_optimizer, vl_loss = value_grad 233 | reset() 234 | observation = robot_state.robot_state 235 | totalreward = 0 236 | states = [] 237 | actions = [] 238 | advantages = [] 239 | transitions = [] 240 | update_vals = [] 241 | 242 | 243 | for _ in range(20000): 244 | 245 | # calculate policy 246 | obs_vector = np.expand_dims(observation, axis=0) 247 | probs = sess.run(pl_calculated,feed_dict={pl_state: obs_vector}) 248 | action = 0 if random.uniform(0,1) < probs[0][0] else 1 249 | # record the transition 250 | states.append(observation) 251 | # print("angle: ", observation[2]*180/3.14) 252 | actionblank = np.zeros(2) 253 | actionblank[action] = 1 254 | actions.append(actionblank) 255 | # take the action in the environment 256 | old_observation = observation 257 | reward, done = take_action(action) 258 | observation = robot_state.robot_state 259 | transitions.append((old_observation, action, reward)) 260 | totalreward += reward 261 | 262 | if done: 263 | robot_state.done = False 264 | break 265 | for index, trans in enumerate(transitions): 266 | obs, action, reward = trans 267 | 268 | # calculate discounted monte-carlo return 269 | future_reward = 0 270 | future_transitions = len(transitions) - index 271 | decrease = 1 272 | for index2 in range(future_transitions): 273 | future_reward += transitions[(index2) + index][2] * decrease 274 | decrease = decrease * 0.97 275 | obs_vector = np.expand_dims(obs, axis=0) 276 | currentval = sess.run(vl_calculated,feed_dict={vl_state: obs_vector})[0][0] 277 | 278 | # advantage: how much better was this action than normal 279 | advantages.append(future_reward - currentval) 280 | 281 | # update the value function towards new return 282 | update_vals.append(future_reward) 283 | 284 | # update value function 285 | update_vals_vector = np.expand_dims(update_vals, axis=1) 286 | sess.run(vl_optimizer, feed_dict={vl_state: states, vl_newvals: update_vals_vector}) 287 | # real_vl_loss = sess.run(vl_loss, feed_dict={vl_state: states, vl_newvals: update_vals_vector}) 288 | 289 | advantages_vector = np.expand_dims(advantages, axis=1) 290 | sess.run(pl_optimizer, feed_dict={pl_state: states, pl_advantages: advantages_vector, pl_actions: actions}) 291 | 292 | return totalreward 293 | 294 | 295 | def main(): 296 | 297 | listener() 298 | # outdir = '/home/imitaion/catkin_ws/src/cartpole_controller/src' 299 | # plotter = LivePlot(outdir) 300 | 301 | policy_grad = policy_gradient() 302 | value_grad = value_gradient() 303 | sess = tf.InteractiveSession() 304 | sess.run(tf.global_variables_initializer()) 305 | list_rewards = [] 306 | 307 | for i in range(200000): 308 | print("Episode ", i) 309 | reward = run_episode(policy_grad, value_grad, sess) 310 | print 
"reward", reward 311 | list_rewards.append(reward) 312 | if(i%20==0): 313 | # plotter.plot(list_rewards) 314 | print "20 episodes done." 315 | 316 | time.sleep(0.05) 317 | t = 0 318 | for _ in range(1000): 319 | reward = run_episode(policy_grad, value_grad, sess) 320 | t += reward 321 | print (t / 1000) 322 | #env.monitor.close() 323 | print ("END!") 324 | 325 | 326 | if __name__ == '__main__': 327 | main() 328 | -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_controller/src/pg2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | import roslib 6 | import rospy 7 | import random 8 | import time 9 | import math 10 | import csv 11 | from std_srvs.srv import Empty 12 | from gazebo_msgs.srv import SetModelConfiguration 13 | 14 | from control_msgs.msg import JointControllerState 15 | from sensor_msgs.msg import JointState 16 | from gazebo_msgs.msg import LinkStates 17 | from gazebo_msgs.srv import SetLinkState 18 | from gazebo_msgs.msg import LinkState 19 | from std_msgs.msg import Float64 20 | from std_msgs.msg import String 21 | from sensor_msgs.msg import Joy 22 | 23 | 24 | import threading 25 | from scipy.interpolate import interp1d 26 | 27 | 28 | H = 10 29 | batch_size = 50 30 | learning_rate = 1e-2 31 | gamma = 0.99 32 | D = 4 33 | 34 | pubCartPosition = rospy.Publisher('/stand_cart_position_controller/command', Float64, queue_size=1) 35 | pubJointStates = rospy.Publisher('/joint_states', JointState, queue_size=1) 36 | 37 | reset_world = rospy.ServiceProxy('/gazebo/reset_world', Empty) 38 | reset_joints = rospy.ServiceProxy('/gazebo/set_model_configuration', SetModelConfiguration) 39 | unpause = rospy.ServiceProxy('/gazebo/unpause_physics', Empty) 40 | pause = rospy.ServiceProxy('/gazebo/pause_physics', Empty) 41 | set_link = rospy.ServiceProxy('/gazebo/set_link_state', SetLinkState) 42 | 43 | fall = 0 44 | 45 | 46 | rospy.init_node('cartpole_control_script') 47 | rate = rospy.Rate(120) 48 | 49 | 50 | 51 | class RobotState(object): 52 | def __init__(self): 53 | self.cart_x = 0.0 54 | self.cart_x_dot = 0.0 55 | self.pole_theta = 0.0 56 | self.pole_theta_dot = 0.0 57 | self.robot_state = [self.cart_x, self.cart_x_dot, self.pole_theta, self.pole_theta_dot] 58 | 59 | self.data = None 60 | self.latest_reward = 0.0 61 | self.fall = 0 62 | 63 | self.theta_threshold = 0.20943951023 64 | self.x_threshold = 0.4 65 | 66 | self.current_vel = 0.0 67 | self.done = False 68 | 69 | 70 | robot_state = RobotState() 71 | 72 | 73 | def reset(): 74 | rospy.wait_for_service('/gazebo/reset_world') 75 | 76 | try: 77 | reset_world() 78 | except (rospy.ServiceException) as e: 79 | print "reset_world failed!" 80 | 81 | 82 | # rospy.wait_for_service('/gazebo/reset_world') 83 | rospy.wait_for_service('/gazebo/set_model_configuration') 84 | 85 | try: 86 | #reset_proxy.call() 87 | # reset_world() 88 | reset_joints("cartpole", "robot_description", ["stand_cart", "cart_pole"], [0.0, 0.0]) 89 | 90 | 91 | except (rospy.ServiceException) as e: 92 | print "/gazebo/reset_joints service call failed" 93 | 94 | rospy.wait_for_service('/gazebo/pause_physics') 95 | try: 96 | pause() 97 | except (rospy.ServiceException) as e: 98 | print "rospause failed!" 
99 | 100 | # rospy.wait_for_service('/gazebo/unpause_physics') 101 | 102 | # try: 103 | # unpause() 104 | # except (rospy.ServiceException) as e: 105 | # print "/gazebo/pause_physics service call failed" 106 | 107 | set_robot_state() 108 | robot_state.current_vel = 0 109 | print "called reset()" 110 | 111 | 112 | 113 | 114 | 115 | def set_robot_state(): 116 | robot_state.robot_state = [robot_state.cart_x, robot_state.cart_x_dot, robot_state.pole_theta, robot_state.pole_theta_dot] 117 | 118 | def take_action(action): 119 | rospy.wait_for_service('/gazebo/unpause_physics') 120 | 121 | try: 122 | unpause() 123 | except (rospy.ServiceException) as e: 124 | print "/gazebo/pause_physics service call failed" 125 | 126 | 127 | if action == 1: 128 | robot_state.current_vel = robot_state.current_vel + 0.05 129 | else: 130 | robot_state.current_vel = robot_state.current_vel - 0.05 131 | 132 | 133 | # print "publish : ", robot_state.current_vel 134 | pubCartPosition.publish(robot_state.current_vel) 135 | 136 | reward = 1 137 | 138 | # ['cart_pole', 'stand_cart'] 139 | if robot_state.data==None: 140 | while robot_state.data is None: 141 | try: 142 | robot_state.data = rospy.wait_for_message('/joint_states', JointState, timeout=5) 143 | except: 144 | print "Error getting /joint_states data." 145 | # print "DATA : ",robot_state.data 146 | # print "latest_reward: ", robot_state.latest_reward 147 | 148 | # if len(robot_state.data.velocity) > 0: 149 | # robot_state.cart_x_dot = robot_state.data.velocity[1] 150 | # robot_state.pole_theta_dot = robot_state.data.velocity[0] 151 | # else: 152 | # robot_state.cart_x_dot = 0.0 153 | # robot_state.pole_theta_dot = 0.0 154 | 155 | # robot_state.cart_x = robot_state.data.position[1] 156 | # robot_state.pole_theta = robot_state.data.position[0] 157 | 158 | 159 | set_robot_state() 160 | 161 | if robot_state.cart_x < -robot_state.x_threshold or robot_state.cart_x > robot_state.x_threshold or robot_state.pole_theta > robot_state.theta_threshold \ 162 | or robot_state.pole_theta < -robot_state.theta_threshold: 163 | 164 | robot_state.done = True 165 | reward = 1 166 | 167 | else: 168 | reward = 1 169 | 170 | # rate.sleep() 171 | 172 | return reward, robot_state.done 173 | 174 | 175 | def callbackJointStates(data): 176 | if len(data.velocity) > 0: 177 | robot_state.cart_x_dot = data.velocity[1] 178 | robot_state.pole_theta_dot = data.velocity[0] 179 | else: 180 | robot_state.cart_x_dot = 0.0 181 | robot_state.pole_theta_dot = 0.0 182 | robot_state.cart_x = data.position[1] 183 | robot_state.pole_theta = data.position[0] 184 | 185 | set_robot_state() 186 | 187 | print "DATA :", data 188 | 189 | 190 | def listener(): 191 | print "listener" 192 | rospy.Subscriber("/joint_states", JointState, callbackJointStates) 193 | 194 | 195 | def discount_rewards(r): 196 | discounted_r = np.zeros_like(r) 197 | running_add = 0 198 | for t in reversed(xrange(0, r.size)): 199 | running_add = running_add * gamma + r[t] 200 | discounted_r[t] = running_add 201 | return discounted_r 202 | 203 | 204 | def main(): 205 | 206 | listener() 207 | reset() 208 | 209 | tf.reset_default_graph() 210 | 211 | observations = tf.placeholder(tf.float32, [None, D], name="input_x") 212 | w1 = tf.get_variable("w1", shape=[D,H], initializer=tf.contrib.layers.xavier_initializer()) 213 | layer1 = tf.nn.relu(tf.matmul(observations, w1)) 214 | w2 = tf.get_variable("w2", shape=[H,1], initializer=tf.contrib.layers.xavier_initializer()) 215 | score = tf.matmul(layer1, w2) 216 | probability = tf.nn.sigmoid(score) 217 | 
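    # Policy network: D=4 state inputs -> H=10 ReLU hidden units -> a single sigmoid
    # output, used as the probability of choosing action 1 (which take_action() maps
    # to increasing the cart command by 0.05). The surrogate loss defined next weights
    # the log-likelihood of each sampled action by its normalized discounted return;
    # gradients are accumulated in gradBuffer and applied once every batch_size (50)
    # episodes.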
218 | tvars = tf.trainable_variables() 219 | input_y = tf.placeholder(tf.float32, [None,1], name="input_y") 220 | advantages = tf.placeholder(tf.float32, name="reward_signal") 221 | 222 | loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability)) 223 | loss = -tf.reduce_mean(loglik*advantages) 224 | newGrads = tf.gradients(loss, tvars) 225 | 226 | adam = tf.train.AdamOptimizer(learning_rate=learning_rate) 227 | w1grad = tf.placeholder(tf.float32,name="batch_grad1") 228 | w2grad = tf.placeholder(tf.float32, name="batch_grad2") 229 | batchgrads = [w1grad, w2grad] 230 | updategrads = adam.apply_gradients(zip(batchgrads, tvars)) 231 | 232 | xs,hs,dlogps,drs,ys,tfps = [],[],[],[],[],[] 233 | running_reward = None 234 | reward_sum = 0 235 | episode_number = 1 236 | total_episodes = 10000 237 | init = tf.initialize_all_variables() 238 | 239 | 240 | # Launch the graph 241 | with tf.Session() as sess: 242 | rendering = False 243 | sess.run(init) 244 | reset() # Obtain an initial observation of the environment 245 | observation = robot_state.robot_state 246 | 247 | # Reset the gradient placeholder. We will collect gradients in 248 | # gradBuffer until we are ready to update our policy network. 249 | gradBuffer = sess.run(tvars) 250 | for ix,grad in enumerate(gradBuffer): 251 | gradBuffer[ix] = grad * 0 252 | 253 | while episode_number <= total_episodes: 254 | # Make sure the observation is in a shape the network can handle. 255 | x = np.reshape(observation,[1,D]) 256 | 257 | # Run the policy network and get an action to take. 258 | tfprob = sess.run(probability,feed_dict={observations: x}) 259 | 260 | action = 1 if np.random.uniform() < tfprob else 0 261 | #print ("ACTION :",action) 262 | xs.append(x) # observation 263 | y = 1 if action == 0 else 0 264 | ys.append(y) 265 | 266 | # step the environment and get new measurements 267 | reward, done = take_action(action) 268 | observation = robot_state.robot_state 269 | reward_sum += reward 270 | 271 | drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action) 272 | 273 | if done: 274 | episode_number += 1 275 | # stack together all inputs, hidden states, action gradients, and rewards for this episode 276 | epx = np.vstack(xs) 277 | epy = np.vstack(ys) 278 | epr = np.vstack(drs) 279 | tfp = tfps 280 | xs,hs,dlogps,drs,ys,tfps = [],[],[],[],[],[] # reset array memory 281 | 282 | # compute the discounted reward backwards through time 283 | discounted_epr = discount_rewards(epr) 284 | # size the rewards to be unit normal (helps control the gradient estimator variance) 285 | discounted_epr = discounted_epr - np.mean(discounted_epr) 286 | discounted_epr = discounted_epr / np.std(discounted_epr) 287 | 288 | # Get the gradient for this episode, and save it in the gradBuffer 289 | tGrad = sess.run(newGrads,feed_dict={observations: epx, input_y: epy, advantages: discounted_epr}) 290 | for ix,grad in enumerate(tGrad): 291 | gradBuffer[ix] += grad 292 | 293 | # If we have completed enough episodes, then update the policy network with our gradients. 294 | if episode_number % batch_size == 0: 295 | sess.run(updategrads,feed_dict={w1grad: gradBuffer[0],w2grad:gradBuffer[1]}) 296 | for ix,grad in enumerate(gradBuffer): 297 | gradBuffer[ix] = grad * 0 298 | 299 | # Give a summary of how well our network is doing for each batch of episodes. 300 | running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01 301 | print ('Average reward for episode %f. 
Total average reward %f.' % (reward_sum/batch_size, running_reward/batch_size)) 302 | 303 | if reward_sum/batch_size > 200: 304 | print ("Task solved in",episode_number,'episodes!') 305 | break 306 | 307 | reward_sum = 0 308 | 309 | reset() 310 | observation = robot_state.robot_state 311 | 312 | print (episode_number,'Episodes completed.') 313 | 314 | 315 | if __name__ == '__main__': 316 | main() 317 | -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_gazebo/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.3) 2 | project(cartpole_gazebo) 3 | 4 | ## Compile as C++11, supported in ROS Kinetic and newer 5 | # add_compile_options(-std=c++11) 6 | 7 | ## Find catkin macros and libraries 8 | ## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz) 9 | ## is used, also find other catkin packages 10 | find_package(catkin REQUIRED COMPONENTS 11 | rospy 12 | std_msgs 13 | ) 14 | 15 | ## System dependencies are found with CMake's conventions 16 | # find_package(Boost REQUIRED COMPONENTS system) 17 | 18 | 19 | ## Uncomment this if the package has a setup.py. This macro ensures 20 | ## modules and global scripts declared therein get installed 21 | ## See http://ros.org/doc/api/catkin/html/user_guide/setup_dot_py.html 22 | # catkin_python_setup() 23 | 24 | ################################################ 25 | ## Declare ROS messages, services and actions ## 26 | ################################################ 27 | 28 | ## To declare and build messages, services or actions from within this 29 | ## package, follow these steps: 30 | ## * Let MSG_DEP_SET be the set of packages whose message types you use in 31 | ## your messages/services/actions (e.g. std_msgs, actionlib_msgs, ...). 32 | ## * In the file package.xml: 33 | ## * add a build_depend tag for "message_generation" 34 | ## * add a build_depend and a run_depend tag for each package in MSG_DEP_SET 35 | ## * If MSG_DEP_SET isn't empty the following dependency has been pulled in 36 | ## but can be declared for certainty nonetheless: 37 | ## * add a run_depend tag for "message_runtime" 38 | ## * In this file (CMakeLists.txt): 39 | ## * add "message_generation" and every package in MSG_DEP_SET to 40 | ## find_package(catkin REQUIRED COMPONENTS ...) 41 | ## * add "message_runtime" and every package in MSG_DEP_SET to 42 | ## catkin_package(CATKIN_DEPENDS ...) 43 | ## * uncomment the add_*_files sections below as needed 44 | ## and list every .msg/.srv/.action file to be processed 45 | ## * uncomment the generate_messages entry below 46 | ## * add every package in MSG_DEP_SET to generate_messages(DEPENDENCIES ...) 
47 | 48 | ## Generate messages in the 'msg' folder 49 | # add_message_files( 50 | # FILES 51 | # Message1.msg 52 | # Message2.msg 53 | # ) 54 | 55 | ## Generate services in the 'srv' folder 56 | # add_service_files( 57 | # FILES 58 | # Service1.srv 59 | # Service2.srv 60 | # ) 61 | 62 | ## Generate actions in the 'action' folder 63 | # add_action_files( 64 | # FILES 65 | # Action1.action 66 | # Action2.action 67 | # ) 68 | 69 | ## Generate added messages and services with any dependencies listed here 70 | # generate_messages( 71 | # DEPENDENCIES 72 | # std_msgs 73 | # ) 74 | 75 | ################################################ 76 | ## Declare ROS dynamic reconfigure parameters ## 77 | ################################################ 78 | 79 | ## To declare and build dynamic reconfigure parameters within this 80 | ## package, follow these steps: 81 | ## * In the file package.xml: 82 | ## * add a build_depend and a run_depend tag for "dynamic_reconfigure" 83 | ## * In this file (CMakeLists.txt): 84 | ## * add "dynamic_reconfigure" to 85 | ## find_package(catkin REQUIRED COMPONENTS ...) 86 | ## * uncomment the "generate_dynamic_reconfigure_options" section below 87 | ## and list every .cfg file to be processed 88 | 89 | ## Generate dynamic reconfigure parameters in the 'cfg' folder 90 | # generate_dynamic_reconfigure_options( 91 | # cfg/DynReconf1.cfg 92 | # cfg/DynReconf2.cfg 93 | # ) 94 | 95 | ################################### 96 | ## catkin specific configuration ## 97 | ################################### 98 | ## The catkin_package macro generates cmake config files for your package 99 | ## Declare things to be passed to dependent projects 100 | ## INCLUDE_DIRS: uncomment this if your package contains header files 101 | ## LIBRARIES: libraries you create in this project that dependent projects also need 102 | ## CATKIN_DEPENDS: catkin_packages dependent projects also need 103 | ## DEPENDS: system dependencies of this project that dependent projects also need 104 | catkin_package( 105 | # INCLUDE_DIRS include 106 | # LIBRARIES cartpole_gazebo 107 | # CATKIN_DEPENDS rospy std_msgs 108 | # DEPENDS system_lib 109 | ) 110 | 111 | ########### 112 | ## Build ## 113 | ########### 114 | 115 | ## Specify additional locations of header files 116 | ## Your package locations should be listed before other locations 117 | include_directories( 118 | # include 119 | ${catkin_INCLUDE_DIRS} 120 | ) 121 | 122 | ## Declare a C++ library 123 | # add_library(${PROJECT_NAME} 124 | # src/${PROJECT_NAME}/cartpole_gazebo.cpp 125 | # ) 126 | 127 | ## Add cmake target dependencies of the library 128 | ## as an example, code may need to be generated before libraries 129 | ## either from message generation or dynamic reconfigure 130 | # add_dependencies(${PROJECT_NAME} ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS}) 131 | 132 | ## Declare a C++ executable 133 | ## With catkin_make all packages are built within a single CMake context 134 | ## The recommended prefix ensures that target names across packages don't collide 135 | # add_executable(${PROJECT_NAME}_node src/cartpole_gazebo_node.cpp) 136 | 137 | ## Rename C++ executable without prefix 138 | ## The above recommended prefix causes long target names, the following renames the 139 | ## target back to the shorter version for ease of user use 140 | ## e.g. 
"rosrun someones_pkg node" instead of "rosrun someones_pkg someones_pkg_node" 141 | # set_target_properties(${PROJECT_NAME}_node PROPERTIES OUTPUT_NAME node PREFIX "") 142 | 143 | ## Add cmake target dependencies of the executable 144 | ## same as for the library above 145 | # add_dependencies(${PROJECT_NAME}_node ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS}) 146 | 147 | ## Specify libraries to link a library or executable target against 148 | # target_link_libraries(${PROJECT_NAME}_node 149 | # ${catkin_LIBRARIES} 150 | # ) 151 | 152 | ############# 153 | ## Install ## 154 | ############# 155 | 156 | # all install targets should use catkin DESTINATION variables 157 | # See http://ros.org/doc/api/catkin/html/adv_user_guide/variables.html 158 | 159 | ## Mark executable scripts (Python etc.) for installation 160 | ## in contrast to setup.py, you can choose the destination 161 | # install(PROGRAMS 162 | # scripts/my_python_script 163 | # DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} 164 | # ) 165 | 166 | ## Mark executables and/or libraries for installation 167 | # install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_node 168 | # ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} 169 | # LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} 170 | # RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} 171 | # ) 172 | 173 | ## Mark cpp header files for installation 174 | # install(DIRECTORY include/${PROJECT_NAME}/ 175 | # DESTINATION ${CATKIN_PACKAGE_INCLUDE_DESTINATION} 176 | # FILES_MATCHING PATTERN "*.h" 177 | # PATTERN ".svn" EXCLUDE 178 | # ) 179 | 180 | ## Mark other files for installation (e.g. launch and bag files, etc.) 181 | # install(FILES 182 | # # myfile1 183 | # # myfile2 184 | # DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION} 185 | # ) 186 | 187 | ############# 188 | ## Testing ## 189 | ############# 190 | 191 | ## Add gtest based cpp test target and link libraries 192 | # catkin_add_gtest(${PROJECT_NAME}-test test/test_cartpole_gazebo.cpp) 193 | # if(TARGET ${PROJECT_NAME}-test) 194 | # target_link_libraries(${PROJECT_NAME}-test ${PROJECT_NAME}) 195 | # endif() 196 | 197 | ## Add folders to be run by python nosetests 198 | # catkin_add_nosetests(test) 199 | -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_gazebo/launch/cartpole_gazebo.launch: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_gazebo/meshes/cart.STL: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/cartpole_gazebo/cartpole_gazebo/meshes/cart.STL -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_gazebo/meshes/cart.urdf: -------------------------------------------------------------------------------- 1 | 3 | 5 | 6 | 9 | 11 | 18 | 19 | 20 | 23 | 24 | 26 | 27 | 29 | 31 | 32 | 33 | 34 | 37 | 38 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_gazebo/meshes/pole.STL: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/cartpole_gazebo/cartpole_gazebo/meshes/pole.STL -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_gazebo/meshes/pole.urdf: -------------------------------------------------------------------------------- 1 | 3 | 5 | 6 | 9 | 11 | 18 | 19 | 20 | 23 | 24 | 26 | 27 | 29 | 31 | 32 | 33 | 34 | 37 | 38 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_gazebo/meshes/stand.STL: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/cartpole_gazebo/cartpole_gazebo/meshes/stand.STL -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_gazebo/meshes/stand.urdf: -------------------------------------------------------------------------------- 1 | 3 | 5 | 6 | 9 | 11 | 18 | 19 | 20 | 23 | 24 | 26 | 27 | 29 | 31 | 32 | 33 | 34 | 37 | 38 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_gazebo/package.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | cartpole_gazebo 4 | 0.0.0 5 | The cartpole_gazebo package 6 | 7 | 8 | 9 | 10 | imitaion 11 | 12 | 13 | 14 | 15 | 16 | TODO 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | catkin 52 | gazebo_msgs 53 | sensor_msgs 54 | geometry_msgs 55 | message_generation 56 | roscpp 57 | roscpp 58 | roscpp 59 | roscpp 60 | rospy 61 | std_msgs 62 | roscpp 63 | rospy 64 | std_msgs 65 | roscpp 66 | rospy 67 | std_msgs 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /cartpole_gazebo/cartpole_gazebo/robots/cartpole_v1.urdf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 10 | 12 | 19 | 20 | 21 | 24 | 25 | 27 | 28 | 30 | 32 | 33 | 34 | 35 | 38 | 39 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 56 | 57 | 60 | 62 | 69 | 70 | 71 | 74 | 75 | 77 | 78 | 80 | 82 | 83 | 84 | 85 | 88 | 89 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 106 | 107 | 110 | 112 | 119 | 120 | 121 | 124 | 125 | 127 | 128 | 130 | 132 | 133 | 134 | 135 | 138 | 139 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | transmission_interface/SimpleTransmission 174 | 175 | EffortJointInterface 176 | 177 | 178 | EffortJointInterface 179 | 1 180 | 181 | 182 | 183 | 184 | transmission_interface/SimpleTransmission 185 | 186 | EffortJointInterface 187 | 188 | 189 | EffortJointInterface 190 | 1 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /images/breakout-v0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/images/breakout-v0.gif -------------------------------------------------------------------------------- 
/images/cartpole-pg-gazebo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/images/cartpole-pg-gazebo.gif -------------------------------------------------------------------------------- /images/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/images/cartpole.gif -------------------------------------------------------------------------------- /images/cartpole_pg_rewards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/images/cartpole_pg_rewards.png -------------------------------------------------------------------------------- /images/example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/images/example.gif -------------------------------------------------------------------------------- /images/mountain-car-v0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/images/mountain-car-v0.gif -------------------------------------------------------------------------------- /images/mountaincar_pg_rewards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/images/mountaincar_pg_rewards.png -------------------------------------------------------------------------------- /images/pg2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/images/pg2.gif -------------------------------------------------------------------------------- /lunarlander/LunarLander.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/navuboy/rl_gym/4cd370b369a236437730d456bec43353c23c6525/lunarlander/LunarLander.gif -------------------------------------------------------------------------------- /lunarlander/lunarlander_dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import wrappers 4 | import tensorflow as tf 5 | import json, sys, os 6 | from os import path 7 | import random 8 | from collections import deque 9 | 10 | # Deep Q-Networks (DQN) 11 | # An off-policy action-value function based approach (Q-learning) that uses epsilon-greedy exploration 12 | # to generate experiences (s, a, r, s'). It uses minibatches of these experiences from replay memory 13 | # to update the Q-network's parameters. 14 | # Neural networks are used for function approximation. 15 | # A slowly-changing "target" Q network, as well as gradient norm clipping, are used to improve 16 | # stability and encourage convergence. 17 | # Parameter updates are made via Adam. 
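# Concretely, the Q-learning target computed further down is the double-DQN form:
#     y = r                                          if s' is terminal
#     y = r + gamma * Q_slow(s', argmax_a Q(s', a))  otherwise
# where the online Q network picks the action and the slow target network scores it.
# (No explicit gradient-clipping op is defined in this script; stability relies on
# the slow target network and the L2 penalty added to the loss.)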
18 | 19 | env_to_use = 'LunarLander-v2' 20 | 21 | # hyperparameters 22 | gamma = 0.99 # reward discount factor 23 | h1 = 512 # hidden layer 1 size 24 | h2 = 512 # hidden layer 2 size 25 | h3 = 512 # hidden layer 3 size 26 | lr = 5e-5 # learning rate 27 | lr_decay = 1 # learning rate decay (per episode) 28 | l2_reg = 1e-6 # L2 regularization factor 29 | dropout = 0 # dropout rate (0 = no dropout) 30 | num_episodes = 5000 # number of episodes 31 | max_steps_ep = 10000 # default max number of steps per episode (unless env has a lower hardcoded limit) 32 | slow_target_burnin = 1000 # number of steps where slow target weights are tied to current network weights 33 | update_slow_target_every = 100 # number of steps to use slow target as target before updating it to latest weights 34 | train_every = 1 # number of steps to run the policy (and collect experience) before updating network weights 35 | replay_memory_capacity = int(1e6) # capacity of experience replay memory 36 | minibatch_size = 1024 # size of minibatch from experience replay memory for updates 37 | epsilon_start = 1.0 # probability of random action at start 38 | epsilon_end = 0.05 # minimum probability of random action after linear decay period 39 | epsilon_decay_length = 1e5 # number of steps over which to linearly decay epsilon 40 | epsilon_decay_exp = 0.97 # exponential decay rate after reaching epsilon_end (per episode) 41 | 42 | # game parameters 43 | env = gym.make(env_to_use) 44 | state_dim = np.prod(np.array(env.observation_space.shape)) # Get total number of dimensions in state 45 | n_actions = env.action_space.n # Assuming discrete action space 46 | 47 | # set seeds to 0 48 | env.seed(0) 49 | np.random.seed(0) 50 | 51 | # prepare monitorings 52 | outdir = '/tmp/dqn-agent-results' 53 | env = wrappers.Monitor(env, outdir, force=True) 54 | def writefile(fname, s): 55 | with open(path.join(outdir, fname), 'w') as fh: fh.write(s) 56 | info = {} 57 | info['env_id'] = env.spec.id 58 | info['params'] = dict( 59 | gamma = gamma, 60 | h1 = h1, 61 | h2 = h2, 62 | h3 = h3, 63 | lr = lr, 64 | lr_decay = lr_decay, 65 | l2_reg = l2_reg, 66 | dropout = dropout, 67 | num_episodes = num_episodes, 68 | max_steps_ep = max_steps_ep, 69 | slow_target_burnin = slow_target_burnin, 70 | update_slow_target_every = update_slow_target_every, 71 | train_every = train_every, 72 | replay_memory_capacity = replay_memory_capacity, 73 | minibatch_size = minibatch_size, 74 | epsilon_start = epsilon_start, 75 | epsilon_end = epsilon_end, 76 | epsilon_decay_length = epsilon_decay_length, 77 | epsilon_decay_exp = epsilon_decay_exp 78 | ) 79 | 80 | tf.reset_default_graph() 81 | 82 | # placeholders 83 | state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim]) # input to Q network 84 | next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim]) # input to slow target network 85 | action_ph = tf.placeholder(dtype=tf.int32, shape=[None]) # action indices (indices of Q network output) 86 | reward_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # rewards (go into target computation) 87 | is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # indicators (go into target computation) 88 | is_training_ph = tf.placeholder(dtype=tf.bool, shape=()) # for dropout 89 | 90 | # episode counter 91 | episodes = tf.Variable(0.0, trainable=False, name='episodes') 92 | episode_inc_op = episodes.assign_add(1) 93 | 94 | # will use this to initialize both Q network and slowly-changing target network with same structure 95 | def 
generate_network(s, trainable, reuse): 96 | hidden = tf.layers.dense(s, h1, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse) 97 | hidden_drop = tf.layers.dropout(hidden, rate = dropout, training = trainable & is_training_ph) 98 | hidden_2 = tf.layers.dense(hidden_drop, h2, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse) 99 | hidden_drop_2 = tf.layers.dropout(hidden_2, rate = dropout, training = trainable & is_training_ph) 100 | hidden_3 = tf.layers.dense(hidden_drop_2, h3, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse) 101 | hidden_drop_3 = tf.layers.dropout(hidden_3, rate = dropout, training = trainable & is_training_ph) 102 | action_values = tf.squeeze(tf.layers.dense(hidden_drop_3, n_actions, trainable = trainable, name = 'dense_3', reuse = reuse)) 103 | return action_values 104 | 105 | with tf.variable_scope('q_network') as scope: 106 | # Q network applied to state_ph 107 | q_action_values = generate_network(state_ph, trainable = True, reuse = False) 108 | # Q network applied to next_state_ph (for double Q learning) 109 | q_action_values_next = tf.stop_gradient(generate_network(next_state_ph, trainable = False, reuse = True)) 110 | 111 | # slow target network 112 | with tf.variable_scope('slow_target_network', reuse=False): 113 | # use stop_gradient to treat the output values as constant targets when doing backprop 114 | slow_target_action_values = tf.stop_gradient(generate_network(next_state_ph, trainable = False, reuse = False)) 115 | 116 | # isolate vars for each network 117 | q_network_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_network') 118 | slow_target_network_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_network') 119 | 120 | # update values for slowly-changing target network to match current critic network 121 | update_slow_target_ops = [] 122 | for i, slow_target_var in enumerate(slow_target_network_vars): 123 | update_slow_target_op = slow_target_var.assign(q_network_vars[i]) 124 | update_slow_target_ops.append(update_slow_target_op) 125 | 126 | update_slow_target_op = tf.group(*update_slow_target_ops, name='update_slow_target') 127 | 128 | # Q-learning targets y_i for (s,a) from experience replay 129 | # = r_i + gamma*Q_slow(s',argmax_{a}Q(s',a)) if s' is not terminal 130 | # = r_i if s' terminal 131 | # Note that we're using Q_slow(s',argmax_{a}Q(s',a)) instead of max_{a}Q_slow(s',a) to address the maximization bias problem via Double Q-Learning 132 | targets = reward_ph + is_not_terminal_ph * gamma * \ 133 | tf.gather_nd(slow_target_action_values, tf.stack((tf.range(minibatch_size), tf.cast(tf.argmax(q_action_values_next, axis=1), tf.int32)), axis=1)) 134 | 135 | # Estimated Q values for (s,a) from experience replay 136 | estim_taken_action_vales = tf.gather_nd(q_action_values, tf.stack((tf.range(minibatch_size), action_ph), axis=1)) 137 | 138 | # loss function (with regularization) 139 | loss = tf.reduce_mean(tf.square(targets - estim_taken_action_vales)) 140 | for var in q_network_vars: 141 | if not 'bias' in var.name: 142 | loss += l2_reg * 0.5 * tf.nn.l2_loss(var) 143 | 144 | # optimizer 145 | train_op = tf.train.AdamOptimizer(lr*lr_decay**episodes).minimize(loss) 146 | 147 | # initialize session 148 | sess = tf.Session() 149 | sess.run(tf.global_variables_initializer()) 150 | 151 | ## Training starts here 152 | 153 | total_steps = 0 154 | experience = deque(maxlen=replay_memory_capacity) 155 | 156 | epsilon = 
epsilon_start 157 | epsilon_linear_step = (epsilon_start-epsilon_end)/epsilon_decay_length 158 | 159 | for ep in range(num_episodes): 160 | 161 | total_reward = 0 162 | steps_in_ep = 0 163 | 164 | # Initial state 165 | observation = env.reset() 166 | # env.render() 167 | 168 | for t in range(max_steps_ep): 169 | 170 | # choose action according to epsilon-greedy policy wrt Q 171 | if np.random.random() < epsilon: 172 | action = np.random.randint(n_actions) 173 | else: 174 | q_s = sess.run(q_action_values, 175 | feed_dict = {state_ph: observation[None], is_training_ph: False}) 176 | action = np.argmax(q_s) 177 | 178 | # take step 179 | next_observation, reward, done, _info = env.step(action) 180 | # env.render() 181 | total_reward += reward 182 | 183 | # add this to experience replay buffer 184 | experience.append((observation, action, reward, next_observation, 185 | # is next_observation a terminal state? 186 | 0.0 if done else 1.0)) 187 | 188 | # update the slow target's weights to match the latest q network if it's time to do so 189 | if total_steps%update_slow_target_every == 0: 190 | _ = sess.run(update_slow_target_op) 191 | 192 | # update network weights to fit a minibatch of experience 193 | if total_steps%train_every == 0 and len(experience) >= minibatch_size: 194 | 195 | # grab N (s,a,r,s') tuples from experience 196 | minibatch = random.sample(experience, minibatch_size) 197 | 198 | # do a train_op with all the inputs required 199 | _ = sess.run(train_op, 200 | feed_dict = { 201 | state_ph: np.asarray([elem[0] for elem in minibatch]), 202 | action_ph: np.asarray([elem[1] for elem in minibatch]), 203 | reward_ph: np.asarray([elem[2] for elem in minibatch]), 204 | next_state_ph: np.asarray([elem[3] for elem in minibatch]), 205 | is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch]), 206 | is_training_ph: True}) 207 | 208 | observation = next_observation 209 | total_steps += 1 210 | steps_in_ep += 1 211 | 212 | # linearly decay epsilon from epsilon_start to epsilon_end over epsilon_decay_length steps 213 | if total_steps < epsilon_decay_length: 214 | epsilon -= epsilon_linear_step 215 | # then exponentially decay it every episode 216 | elif done: 217 | epsilon *= epsilon_decay_exp 218 | 219 | if total_steps == epsilon_decay_length: 220 | print('--------------------------------MOVING TO EXPONENTIAL EPSILON DECAY-----------------------------------------') 221 | 222 | if done: 223 | # Increment episode counter 224 | _ = sess.run(episode_inc_op) 225 | break 226 | 227 | print('Episode %2i, Reward: %7.3f, Steps: %i, Next eps: %7.3f'%(ep,total_reward,steps_in_ep, epsilon)) 228 | 229 | # Finalize and upload results 230 | writefile('info.json', json.dumps(info)) 231 | env.close() 232 | gym.upload(outdir) 233 | -------------------------------------------------------------------------------- /mountaincar/README.md: -------------------------------------------------------------------------------- 1 | # Mountain Car 2 | 3 | ## MountainCar-v0 4 | 5 | - A car is on a one-dimensional track, positioned between two "mountains". 6 | - The goal is to drive up the mountain on the right. 7 | - However, the car's engine is not strong enough to scale the mountain in a single pass. 8 | - Therefore, the only way to succeed is to drive back and forth to build up momentum. 9 | - Two reinforcement learning approaches: **Deep Q Networks** & **Policy Gradient**. 
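
Both DQN scripts act with an epsilon-greedy rule whose epsilon decays exponentially with the episode index. A minimal sketch of that rule (a standalone illustration, not code from the repo; the default constants mirror `mountain-car-v0-dqn1.py`):

```python
import math
import random

import numpy as np

def epsilon_greedy(q_values, episode, eps_min=0.1, eps_max=1.0, decay=0.001):
    """Pick an action from a vector of Q-values with decaying exploration."""
    epsilon = eps_min + (eps_max - eps_min) * math.exp(-decay * episode)
    if random.random() < epsilon:
        return random.randrange(len(q_values))  # explore: random action
    return int(np.argmax(q_values))             # exploit: greedy action
```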
10 | 11 | ![MountainCar-v0](../images/mountain-car-v0.gif) 12 | 13 | - The plot for rewards vs episodes, from policy gradient method:(mountaincar-policygradient.py): 14 | 15 | ![MountainCar-v0-Rewards](../images/mountaincar_pg_rewards.png) 16 | -------------------------------------------------------------------------------- /mountaincar/mountain-car-v0-dqn1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Environment - MountainCar-v0 3 | actions = 0(LEFT), 1(STAY), 2(RIGHT) 4 | state space (continuous) dimension = 2(car position, car velocity) 5 | 6 | ''' 7 | 8 | import tensorflow as tf 9 | import gym 10 | import numpy as np 11 | import math 12 | import random 13 | import copy 14 | 15 | env = gym.make("MountainCar-v0") 16 | 17 | learning_rate = 1e-2 18 | memory_size = 100000 19 | batch_size = 64 20 | gamma = 0.99 21 | epsilon_max = 1 22 | epsilon_min = 0.1 23 | 24 | tf.reset_default_graph() 25 | 26 | # computation graph 27 | observations = tf.placeholder(tf.float32, [None, 2], name="input_x") 28 | W1 = tf.get_variable("W1", shape=[2, 64], initializer=tf.contrib.layers.xavier_initializer()) 29 | layer1 = tf.nn.relu(tf.matmul(observations, W1)) 30 | W2 = tf.get_variable("W2", shape=[64, 2], initializer=tf.contrib.layers.xavier_initializer()) 31 | Qpredict = tf.matmul(layer1, W2) 32 | 33 | Qtarget = tf.placeholder(tf.float32, [None, 2], name="input_y") 34 | error = Qtarget - Qpredict 35 | 36 | # mean square error loss function 37 | loss = -tf.reduce_mean(tf.square(error)) 38 | adam = tf.train.AdamOptimizer(learning_rate).minimize(loss) 39 | 40 | init = tf.initialize_all_variables() 41 | 42 | # start the session 43 | with tf.Session() as sess: 44 | sess.run(init) 45 | state_memory = [] 46 | target_memory = [] 47 | for _ in range(1000): 48 | done = False 49 | state = env.reset() 50 | total_reward = 0 51 | while done == False: 52 | observation = np.reshape(state, [1, 2]) 53 | state_memory.append(observation) 54 | _Qpredict = sess.run(Qpredict, feed_dict={observations: observation}) 55 | epsilon = epsilon_min + (epsilon_max - epsilon_min)*(math.exp(-0.001*_)) 56 | action_temp = 0 57 | # chose action using e-greedy policy 58 | if random.random() < epsilon: 59 | action = random.randint(0, 1) 60 | # map action 1 to 2(RIGHT) 61 | if action == 1: 62 | action_temp = 2 63 | new_state, reward, done, info = env.step(action_temp) 64 | else: 65 | action = np.argmax(_Qpredict) 66 | new_state, reward, done, info = env.step(action) 67 | 68 | total_reward += reward 69 | #env.render() 70 | 71 | _Qout = sess.run(Qpredict, feed_dict={observations: np.reshape(new_state, [1, 2])}) 72 | _maxQout = np.max(_Qout) 73 | _Qtarget = _Qpredict[:] 74 | 75 | if done == False: 76 | update = reward + (gamma*_maxQout) 77 | else: 78 | update = reward 79 | 80 | _Qtarget[0][action] = update 81 | 82 | target_memory.append(_Qtarget) 83 | 84 | # experience replay 85 | sample_size = min(batch_size, len(state_memory)) 86 | 87 | state_memory_temp = np.vstack(copy.copy(state_memory)) 88 | target_memory_temp = np.vstack(copy.copy(target_memory)) 89 | 90 | temp_list = zip(state_memory_temp, target_memory_temp) 91 | random.shuffle(temp_list) 92 | _states, _targets = zip(*temp_list) 93 | sess.run(adam, feed_dict={observations: _states, Qtarget: _targets}) 94 | 95 | if state_memory >= memory_size: 96 | state_memory = [] 97 | target_memory = [] 98 | print "reward in episode ",_, " is ", total_reward 99 | -------------------------------------------------------------------------------- 
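
Two fixes are worth noting for `mountain-car-v0-dqn1.py` above: the loss negates the squared TD error (so the Adam step ends up increasing it), and the replay-buffer reset compares the list object itself against `memory_size` instead of its length. A minimal corrected sketch of those two spots, reusing the script's own names:

```python
# mean squared TD error, minimized directly (no leading minus sign)
loss = tf.reduce_mean(tf.square(Qtarget - Qpredict))
adam = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# clear the replay memory only once it actually reaches the configured capacity
if len(state_memory) >= memory_size:
    state_memory = []
    target_memory = []
```

The same sign issue appears in `mountain-car-v0-dqn2.py` below, and its target-network update (`t_W1 = tf.identity(W1)`) only rebinds Python names rather than assigning into the target variables; running `sess.run([t_W1.assign(W1), t_W2.assign(W2)])` would be the usual fix.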
/mountaincar/mountain-car-v0-dqn2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | MountainCar-v0 solution using a Full DQN with experience-replay 3 | and a separate Target network. 4 | ''' 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | import gym 9 | import random 10 | import copy 11 | import math 12 | 13 | env = gym.make("MountainCar-v0") 14 | render = True # set to True for rendering 15 | 16 | num_episodes = 10000 17 | batch_size = 64 18 | memory_size = 200 19 | H1 = 64 20 | D = 2 21 | learning_rate = 1e-2 22 | gamma = 0.99 23 | epsilon_max = 1.0 24 | epsilon_min = 0.01 25 | 26 | tf.reset_default_graph() 27 | 28 | # normal network 29 | observations = tf.placeholder(tf.float32, [None,D], name="input_x") 30 | W1 = tf.get_variable("W1", shape=[D, H1], 31 | initializer=tf.contrib.layers.xavier_initializer()) 32 | layer1 = tf.nn.relu(tf.matmul(observations,W1)) 33 | 34 | W2 = tf.get_variable("W2", shape=[H1, 2], 35 | initializer=tf.contrib.layers.xavier_initializer()) 36 | 37 | linear = tf.matmul(layer1, W2) 38 | #Qout = tf.nn.sigmoid(linear) 39 | Qout = linear 40 | 41 | Qtarget = tf.placeholder(tf.float32, [None, 2], name="Qtarget") 42 | 43 | # separate target network 44 | t_W1 = tf.get_variable("t_W1", shape=[D, H1], 45 | initializer=tf.contrib.layers.xavier_initializer()) 46 | t_layer1 = tf.nn.relu(tf.matmul(observations,t_W1)) 47 | 48 | t_W2 = tf.get_variable("t_W2", shape=[H1, 2], 49 | initializer=tf.contrib.layers.xavier_initializer()) 50 | t_linear = tf.matmul(t_layer1, t_W2) 51 | t_Qout = t_linear 52 | 53 | 54 | # error 55 | diffs = Qtarget - Qout 56 | loss = -tf.reduce_mean(tf.square(diffs)) 57 | adam = tf.train.AdamOptimizer(learning_rate).minimize(loss) 58 | 59 | init = tf.initialize_all_variables() 60 | 61 | with tf.Session() as sess: 62 | sess.run(init) 63 | memory_states = [] 64 | memory_targets = [] 65 | for _ in xrange(num_episodes): 66 | observation = env.reset() 67 | done = False 68 | ep_states = [] 69 | ep_targets = [] 70 | memory_states_temp = [] 71 | memory_targets_temp = [] 72 | i = 0 73 | total_reward = 0 74 | while done == False: 75 | i += 1 76 | #print i 77 | state = np.reshape(observation, [1, D]) 78 | #print state 79 | #ep_states.append(state) 80 | memory_states.append(state) 81 | #print memory_states 82 | Qvals = sess.run(Qout, feed_dict={observations: state}) 83 | epsilon = epsilon_min + (epsilon_max - epsilon_min)*(math.exp(-0.01*_)) 84 | if random.random() < epsilon: 85 | action = env.action_space.sample() 86 | #print "RANDOM" 87 | else: 88 | action = np.argmax(Qvals) 89 | #print "GREEDY" 90 | 91 | #take an e-greedy action 92 | new_state, reward, done, info = env.step(action) 93 | if render == True: 94 | env.render() 95 | 96 | total_reward += reward 97 | nextQvals = sess.run(t_Qout, feed_dict={observations: np.reshape(new_state,[1, D])}) 98 | old_state = state 99 | observation = new_state 100 | maxQvals = np.max(nextQvals) 101 | if done == False: 102 | update = reward + (gamma*maxQvals) 103 | #print total_reward 104 | else: 105 | update = reward 106 | targetQvals = Qvals 107 | targetQvals[0, action] = update 108 | #ep_targets.append(targetQvals) 109 | memory_targets.append(targetQvals) 110 | 111 | memory_states_temp = copy.copy(memory_states) 112 | memory_targets_temp = copy.copy(memory_targets) 113 | 114 | memory_states_temp = np.vstack(memory_states_temp) 115 | memory_targets_temp = np.vstack(memory_targets_temp) 116 | 117 | temp_list = zip(memory_states_temp, memory_targets_temp) 118 | 
random.shuffle(temp_list) 119 | ep_states, ep_targets = zip(*temp_list[:batch_size]) 120 | sess.run(adam, feed_dict={observations: ep_states, Qtarget: ep_targets}) 121 | if _ % memory_size == 0: 122 | memory_states = [] 123 | memory_targets = [] 124 | 125 | # update target network regularly but slowly 126 | # copy the weights from the normal network in current episode 127 | # to the target network 128 | if _ % 100 == 0: 129 | # update target network 130 | t_W1 = tf.identity(W1) 131 | t_W2 = tf.identity(W2) 132 | 133 | print "reward in episode ",_," is: ",total_reward 134 | 135 | 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /mountaincar/mountaincar-policygradient.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | from gym.wrappers import Monitor 5 | 6 | EPISODES = 10000 7 | episode_number = 0 8 | env = gym.make('MountainCar-v0') 9 | env.seed(1) 10 | env = env.unwrapped 11 | 12 | # create network graph 13 | D = env.observation_space.shape[0] 14 | A = env.action_space.n 15 | H = 10 16 | learning_rate = 0.02 17 | gamma = 0.995 18 | 19 | tf.reset_default_graph() 20 | input_x = tf.placeholder(tf.float32, [None, D], name="input_x") 21 | fc1 = tf.contrib.layers.fully_connected(inputs = input_x,\ 22 | num_outputs = H,\ 23 | activation_fn= tf.nn.relu,\ 24 | weights_initializer=tf.contrib.layers.xavier_initializer()) 25 | fc2 = tf.contrib.layers.fully_connected(inputs = fc1,\ 26 | num_outputs = A,\ 27 | activation_fn= tf.nn.relu,\ 28 | weights_initializer=tf.contrib.layers.xavier_initializer()) 29 | fc3 = tf.contrib.layers.fully_connected(inputs = fc2,\ 30 | num_outputs = A,\ 31 | activation_fn= None,\ 32 | weights_initializer=tf.contrib.layers.xavier_initializer()) 33 | 34 | output = tf.nn.softmax(fc3) 35 | 36 | tvars = tf.trainable_variables() 37 | input_y = tf.placeholder(tf.float32, [None, 3], name="input_y") 38 | discounted_rewards = tf.placeholder(tf.float32, name="discounted_rewards") 39 | neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(logits=fc3, labels=input_y) 40 | product = neg_log_likelihood * discounted_rewards 41 | loss = tf.reduce_mean(product) # no need for -ve sign if using tf.nn.softmax_cross_entr..... 
42 | # as it gives neg_log_likelihood 43 | 44 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss) 45 | 46 | init = tf.initialize_all_variables() 47 | 48 | def discounted_rewards_(r): 49 | discounted_r = np.zeros_like(r) 50 | running_add = 0 51 | for t in reversed(xrange(len(r))): 52 | running_add = running_add * gamma + r[t] 53 | discounted_r[t] = running_add 54 | 55 | return discounted_r 56 | 57 | def choose_action(output): 58 | action = np.random.choice(range(out.shape[1]), p=out.ravel()) 59 | return action 60 | 61 | # env = Monitor(env, 'mountain-car-policygradient-monitor/', force=True) 62 | 63 | with tf.Session() as sess: 64 | sess.run(init) 65 | 66 | xs, drs, ys = [], [], [] 67 | 68 | reward_sum = 0 69 | current_state = env.reset() 70 | render = False 71 | done = False 72 | goal_reached = False 73 | 74 | while not goal_reached: 75 | x = np.reshape(current_state, [1, D]) 76 | out = sess.run(output, feed_dict={input_x: x}) 77 | action = choose_action(out) 78 | xs.append(x) 79 | temp_y = np.zeros(A) 80 | temp_y[action] = 1 81 | ys.append(temp_y) 82 | next_state, reward, done, _ = env.step(action) 83 | drs.append(reward) 84 | reward_sum += reward 85 | 86 | # if episode ends, find discounted rewards and 87 | # find gradients for the episode 88 | if done: 89 | episode_number += 1 90 | epx = np.vstack(np.array(xs)) 91 | epy = np.vstack(np.array(ys)) 92 | epr = np.vstack(np.array(drs)) 93 | 94 | discounted_rs = discounted_rewards_(drs) 95 | discounted_rs -= np.mean(discounted_rs) 96 | discounted_rs /= np.std(discounted_rs) 97 | 98 | xs, ys, drs = [], [], [] 99 | 100 | sess.run([loss, optimizer], feed_dict={discounted_rewards: discounted_rs, input_x: epx, input_y: epy}) 101 | print "Reward in episode :", episode_number, "is :", reward_sum 102 | 103 | writer.writerow([reward_sum]) 104 | 105 | reward_sum = 0 106 | 107 | current_state = env.reset() 108 | 109 | current_state = next_state 110 | 111 | env.close() 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | --------------------------------------------------------------------------------
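
One gap in `mountaincar-policygradient.py` above: the training loop logs rewards with `writer.writerow([reward_sum])`, but no CSV writer is ever created and `csv` is never imported. A minimal sketch of the missing setup; the `mountaincar_pg_rewards.csv` filename is an assumption, not something taken from the repo:

```python
import csv

# create the csv writer that the training loop expects before entering the session
log_file = open('mountaincar_pg_rewards.csv', 'w')
writer = csv.writer(log_file)
```

Separately, `choose_action(output)` samples from the module-level `out` rather than its own argument; it only works because `out` happens to be global, and using the parameter would make the helper self-contained.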