├── README.md ├── actor-mimic ├── actor-mimic.py └── train_AMN.py ├── dqn_reg ├── dqn_reg__v4_train.py ├── dqn_reg_v4.py └── dqn_reg_v4.pyc ├── dqn_reg_models ├── network_file_49.pkl ├── network_file_50.pkl ├── q_network_reg_v4.py └── q_network_reg_v4.pyc ├── frame_prediction_atari ├── ae_dqn.py ├── ae_random.py ├── tensorboard ├── test_autoencoder_alter.py └── test_multistep_autoencoder.py ├── images ├── image_screenshot8_24.05.2017.png ├── image_screenshot9_24.05.2017.png ├── image_screenshot_23.05.2017.png ├── prediction_screenshot10_24.05.2017.png ├── prediction_screenshot11_24.05.2017.png ├── prediction_screenshot12_25.05.2017.png ├── prediction_screenshot13_25.05.2017.png ├── prediction_screenshot13_26.05.2017.png └── prediction_screenshot14_26.05.2017.png ├── misc ├── caffe_atari_cnn.py └── policy_gradients.py ├── simple_dqn ├── naive_nips_dqn.py └── nature_dqn.py └── weight_conversion ├── tf_pre_model.py ├── tf_pre_model.pyc ├── th2tf_weights.py ├── th_pre_model.py ├── th_pre_model.pyc ├── theano_params.py └── theano_weights.h5f /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement-Learning 2 | Contains implementations of various deep RL algorithms and papers, including: 3 | 4 | 1. Human-level control through deep reinforcement learning (Nature, 2015) and Playing Atari with Deep Reinforcement Learning (https://arxiv.org/abs/1312.5602) 5 | 2. Action-Conditional Video Prediction using Deep Networks in Atari Games (https://arxiv.org/abs/1507.08750) 6 | 3. Actor-Mimic: Deep Multitask and Transfer Reinforcement Learning (https://arxiv.org/abs/1511.06342) 7 | -------------------------------------------------------------------------------- /actor-mimic/actor-mimic.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | This is an implementation of the Actor-Mimic Network (AMN) described in the 4 | Actor-Mimic paper. The network is designed to distill expert 5 | policies that are trained with model-based prediction. Therefore, to be 6 | consistent with the paper, the AMN uses the same architecture 7 | as the individual policies. 8 | 9 | Expert code flow: 10 | 11 | experiment.run --> agent.step --> agent._do_training --> network.train --> which basically calls a minibatch update 12 | | | ---> combine all these in one file which describes the environment, samples (probably another file) 13 | -------------------------------------------- and calls training updates, referencing to a train function in AMN class 14 | """ 15 | 16 | import tensorflow as tf 17 | import numpy as np 18 | import random 19 | 20 | TEMP = 5 21 | 22 | class AMN: 23 | 24 | # initiate tensorboard summaries 25 | def __init__(self): 26 | 27 | num_actions = tf.placeholder("uint8", ()) 28 | self.Q_val = self.build_net(num_actions) / TEMP 29 | 30 | # load weights for expert 1, 2, 3... 
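# The expert networks in this repo are Theano/Lasagne DQNs that train_AMN.py unpickles;
# their policies are supplied through the `teacher` placeholder defined below, and the
# AMN is regressed onto them with a cross-entropy loss over its temperature-scaled
# Q-values (Q_val / TEMP above), i.e. the policy-regression objective of the
# Actor-Mimic paper.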
31 | # one hot encoded vector 32 | teacher = tf.placeholder("float", [BATCH, num_actions]) 33 | 34 | loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(teacher, self.Q_val)) 35 | train_step = tf.train.AdamOptimizer(1e-4).minimize(cost) 36 | # sample an action from 37 | 38 | def train_step(true_policy, sampled_state, sampled_action, num_actions): 39 | 40 | train_step.run(feed_dict = { 41 | teacher : true_policy, 42 | state : sampled_state, 43 | action : sampled_action 44 | num_actions : num_actions}) 45 | 46 | loss = cost.eval(feed_dict = { 47 | teacher : true_policy, 48 | state : sampled_state, 49 | action : sampled_action 50 | num_actions : num_actions}) 51 | 52 | def weight_variable(name, shape): 53 | initial = tf.contrib.layers.xavier_initializer() 54 | return tf.get_variable(name, shape, initial) 55 | 56 | def bias_variable(shape): 57 | initial = tf.constant(0.1, shape = shape) 58 | return tf.Variable(initial) 59 | 60 | def conv2d(x, W, stride): 61 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") 62 | 63 | def build_net(self, num_actions): 64 | 65 | state = tf.placeholder("float", [BATCH, 8, 84, 84]) 66 | action = tf.placeholder("float", [BATCH, 4]) 67 | 68 | s_in = tf.reshape(state, [-1, 4, 84, 84]) 69 | a_in = tf.reshape(action, [-1]) 70 | # add action embeddings 71 | 72 | #w1_embed = weight_variable("w1_embed", [-1, ]) 73 | 74 | w1_conv = weight_variable("w1_conv", [4, 4, 4, 64]) 75 | b1_conv = bias_variable([64]) 76 | 77 | # 22 x 22 78 | w2_conv = weight_variable("w2_conv", [4, 4, 64, 64]) 79 | b2_conv = bias_variable([64]) 80 | 81 | # 10 x 10 82 | w3_conv = weight_variable("w3_conv", [3, 3, 64, 64]) 83 | b3_conv = bias_variable([64]) 84 | 85 | # 8 x 8 86 | w_fc1 = weight_variable("w_fc1", [8, 512]) 87 | b_fc1 = bias_variable([512]) 88 | 89 | conv1 = tf.nn.relu(conv2d(s_in, w1_conv, 4) + b1_conv) 90 | conv2 = tf.nn.relu(conv2d(conv1, w2_conv, 2) + b2_conv) 91 | conv3 = tf.nn.relu(conv2d(conv2, w3_conv, 4) + b3_conv) 92 | 93 | conv3_reshaped = tf.reshape(conv3, [-1, _]) 94 | 95 | fc1 = tf.nn.relu(tf.matmul(conv3_reshaped, w_fc1) + b_fc1) 96 | fc1_reshaped = tf.reshape(fc1, [-1, 512*2]) 97 | 98 | latent_curr_true = fc1_reshaped[:,0:512] 99 | latent_next_true = fc1_reshaped[:,512:1024] 100 | 101 | w_fc2 = weight_variable("w_fc2", [512, num_actions]) 102 | b_fc2 = bias_variable([num_actions]) 103 | 104 | l_out = tf.nn.relu(tf.matmul(latent_curr_true, w_fc2) + b_fc2) 105 | 106 | return l_out 107 | 108 | def Qval_to_action(self, Qval): 109 | 110 | #self.Qvalue = Qval 111 | num = tf.exp(Qval / T) 112 | policy = num / tf.reduce_sum(num) 113 | 114 | return policy 115 | -------------------------------------------------------------------------------- /actor-mimic/train_AMN.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script describes the OpenAI gym environment for the source tasks, 3 | samples actions from either the AMN or the expert networks and provides 4 | the true policy and the sampled state action pairs to train the AMN 5 | 6 | """ 7 | 8 | import gym 9 | import numpy as np 10 | import tensorflow as tf 11 | from actor-mimic import AMN 12 | from scipy.misc import imresize 13 | import random 14 | from collections import deque 15 | import cv2 16 | 17 | net = AMN() 18 | 19 | game1_load_path = './pong_dqn_v4_reg_0.01/network_file_50.pkl' 20 | game2_load_path = './pong_dqn_v4_reg_0.01/network_file_50.pkl' 21 | #-------------------importing the pretrained 
models----------------------------# 22 | 23 | import theano 24 | import pickle 25 | 26 | print("unpickling first game...") 27 | pkl = open(game_1_load_path, 'rb') 28 | game_1 = pickle.load(pkl) 29 | pkl.close() 30 | 31 | print("unpickling second game...") 32 | pkl = open(game2_load_path, 'rb') 33 | game_2 = pickle.load(pkl) 34 | 35 | 36 | #-----------------------------------------------------------------------------# 37 | 38 | # implementing intially only for two games 39 | game = ['MsPacman-v0', 'Pong-v0'] # add accordingly 40 | 41 | num_exp = len(game) 42 | 43 | def preprocess(frame): 44 | 45 | gray_image = frame.mean(2) 46 | reshaped_image = imresize(gray_image, (84,84)) 47 | x = np.reshape(reshaped_image, [84,84,1]).astype(np.float32) 48 | x *= (1.0 / 255.0) # divide by 255 49 | 50 | return x 51 | 52 | def get_num_actions(game_id): 53 | 54 | 55 | def get_AMN_policy(s_t, num_actions): 56 | 57 | #num_actions = get_num_actions(game_id) 58 | q_vals = net.build_net.eval(feed_dict = {state : s_t, num_actions : num_actions}) 59 | 60 | one_hot = np.zeros(BATCH, num_actions) 61 | one_hot[:,np.argmax(q_vals, axis=1)] = 1 62 | AMN_policy = one_hot 63 | 64 | return AMN_policy 65 | 66 | def get_true_policy(state_batch, AMN_action_batch, game_id): 67 | 68 | true_policy = [] 69 | 70 | if game_id == 0: 71 | game = game_1 72 | else: 73 | game = game_2 74 | 75 | for i,s in enumerate(state_batch): 76 | game.state_shared.set_value(s) 77 | true_policy[i] = game._q_vals() 78 | 79 | return true_policy 80 | 81 | def rollout(state, action, encode): 82 | 83 | #sess.run(tf.initialize_all_variables()) 84 | saver = tf.train.Saver(tf.all_variables()) 85 | saver.restore(sess, load_path) 86 | print("variables restored and loaded...") 87 | 88 | # stores history for all games separately 89 | replay_memory = [] 90 | s_t = [] 91 | s_t1 = [] 92 | 93 | for i in range(num_exp): 94 | D = deque() 95 | replay_memory.append(D) 96 | s_t.append([]) 97 | s_t1.append([]) 98 | 99 | num_episodes = np.zeros(num_exp) 100 | k = 0 101 | 102 | while np.max(num_episodes) < MAX_EPISODES: 103 | 104 | game_id = random.randint(0, num_exp - 1) 105 | env = gym.make(game[game_id]) 106 | num_actions = env.action_space.n 107 | ob = env.reset() 108 | 109 | obf = preprocess(ob) 110 | s_t[game_id] = np.reshape(np.stack((obf, obf, obf, obf), axis=2), (84, 84, 4)) 111 | observations, actions = [], [] 112 | 113 | i = 0 114 | print("num of episodes ", num_episodes[game_id]) 115 | 116 | for t in range(10000): 117 | env.render() #optional 118 | 119 | q_val_AMN = get_AMN_policy(s_t[game_id], num_actions) 120 | # epsilon greedy policy 121 | if random.random() <= epsilon: 122 | action_index = random.randrange(ACTIONS) 123 | action[action_index] = 1 124 | else: 125 | action_index = np.argmax(q_val_AMN) # create instance from AMN class 126 | action[action_index] = 1 127 | 128 | ob, reward, done, info = env.step(action_index) 129 | 130 | obf = preprocess(ob) 131 | 132 | s_t1[game_id] = np.append(obf, s_t[:,:,0:3], axis = 2) 133 | ''' uncomment for training ''' 134 | 135 | replay_memory[game_id].append((s_t, action, obf)) 136 | if len(replay_memory[game_id]) > REPLAY_MEMORY: 137 | replay_memory[game_id].popleft() 138 | 139 | if train == True: 140 | 141 | minibatch = random.sample(replay_memory[game_id], BATCH) 142 | state_batch = [d[0] for d in minibatch] 143 | AMN_action_batch = [d[1] for d in minibatch] 144 | # get true action 145 | true_action = get_true_policy(state_batch, AMN_action_batch) 146 | #num_actions = get_num_actions(game_id) 147 | # minibatch update 
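# Distillation update: true_action holds the expert ("teacher") policy for the sampled
# states, obtained from the unpickled expert of the current game, while state_batch and
# AMN_action_batch come from the AMN's own epsilon-greedy rollout above; train_step then
# regresses the AMN's output onto that expert policy.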
148 | net.train_step(true_action, state_batch, AMN_action_batch, num_actions) 149 | 150 | s_t[game_id] = s_t1[game_id] 151 | 152 | if done: 153 | num_episodes[game_id] += 1 154 | break 155 | -------------------------------------------------------------------------------- /dqn_reg/dqn_reg__v4_train.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script describes the OpenAI gym environment for the source tasks, 3 | samples actions from either the AMN or the expert networks and provides 4 | the true policy and the sampled state action pairs to train the AMN 5 | 6 | After skipping frames we generate the following sequence: 7 | |-a_t-| |-a_t+1-| |-a_t+2-| |-a_t+3-| 8 | eps_start : action - ob - action - ob - action - ob - action - ob - action - ob - action - ob - action - ob - action - ob 9 | |-------------- state t ------------------| 10 | |---------------- state t+1 ---------------| 11 | |---------------- state t+2 ---------------| 12 | |---------------- state t+3 ---------------| 13 | |---------------- state t+4 ---------------| 14 | 15 | """ 16 | 17 | import gym 18 | import numpy as np 19 | import tensorflow as tf 20 | from dqn_reg_v4 import net_v4 21 | from scipy.misc import imresize 22 | import random 23 | from collections import deque 24 | import cv2 25 | import itertools 26 | 27 | BATCH = 32 28 | MAX_EPISODES = 10 29 | REPLAY_MEMORY = 1000 30 | ACTIONS = 4 31 | epsilon = 0.3 32 | 33 | net = net_v4(0.99, 10000, BATCH, 0.5) 34 | 35 | def get_minibatch(D, BATCH): 36 | 37 | batch_id = 0 38 | minibatch = [] 39 | 40 | 41 | while batch_id < BATCH: 42 | #print("D size", len(D)) 43 | idx = random.randrange(len(D) - 8) 44 | range_idx = np.arange(idx, idx + 8) 45 | action_idx = np.arange(idx + 3, idx + 7) 46 | end_idx = idx + 3 47 | 48 | state_sample = [s[0] for s in D[idx : idx + 8]] 49 | action_sample = [s[1] for s in D[idx + 3 : idx + 7]] 50 | reward_sample = D[idx + 3][2] 51 | done_sample = D[idx + 3][3] 52 | print("action sample size", action_sample[0]) 53 | minibatch.append((np.asarray(state_sample).transpose(3,1,2,0), action_sample, reward_sample, done_sample)) 54 | batch_id += 1 55 | 56 | return minibatch 57 | 58 | def preprocess(frame): 59 | 60 | gray_image = frame.mean(2) 61 | reshaped_image = imresize(gray_image, (84,84)) 62 | x = np.reshape(reshaped_image, [84,84,1]).astype(np.float32) 63 | x *= (1.0 / 255.0) # divide by 255 64 | 65 | return x 66 | 67 | def get_policy(D, obf): 68 | 69 | idx = len(D) - 7 70 | #range_idx = np.arange(idx, idx+7) 71 | 72 | state = [s[0] for s in D[idx : idx+7]] 73 | state.append(obf) 74 | #print("state shape input", state[0].shape) 75 | 76 | q_vals = net.q_val(np.asarray(state).transpose(3,1,2,0)) 77 | 78 | one_hot = np.zeros((BATCH, ACTIONS)) 79 | one_hot[:,np.argmax(q_vals, axis=1)] = 1 80 | AMN_policy = one_hot 81 | 82 | return AMN_policy 83 | 84 | def rollout(): 85 | 86 | sess = tf.InteractiveSession() 87 | sess.run(tf.initialize_all_variables()) 88 | saver = tf.train.Saver(tf.all_variables()) 89 | #saver.restore(sess, load_path) 90 | #print("variables restored and loaded...") 91 | 92 | env = gym.make('Pong-v0') 93 | ACTIONS = env.action_space.n 94 | 95 | # stores history for all games separately 96 | s_t = [] 97 | s_t1 = [] 98 | 99 | D = [] 100 | k = 0 101 | num_episodes = 0 102 | train = False 103 | 104 | while num_episodes < MAX_EPISODES: 105 | 106 | ob = env.reset() 107 | 108 | obf = preprocess(ob) 109 | s_t = np.reshape(np.stack((obf, obf, obf, obf), axis=2), (84, 84, 4)) 110 | observations, 
actions = [], [] 111 | REWARD = 0 112 | action_index = random.randrange(ACTIONS) 113 | 114 | i = 0 115 | print("num of episodes ", num_episodes) 116 | 117 | for t in range(10000): 118 | env.render() #optional 119 | 120 | ob, reward, done, info = env.step(action_index) 121 | 122 | REWARD += reward 123 | obf = preprocess(ob) 124 | #print("D lenght", len(D)) 125 | 126 | if i == 3: 127 | 128 | if len(D) > 8: 129 | q_val = get_policy(D, obf) 130 | # epsilon greedy policy 131 | if random.random() <= epsilon: 132 | action_index = random.randrange(ACTIONS) 133 | #action[action_index] = 1 134 | else: 135 | action_index = np.argmax(q_val) 136 | #action[action_index] = 1 137 | 138 | else: 139 | action_index = random.randrange(ACTIONS) 140 | 141 | #s_t1 = np.append(obf, s_t[:,:,0:3], axis = 2) 142 | D.append((obf, int(action_index), REWARD, done)) 143 | if len(D) > REPLAY_MEMORY: 144 | D.pop(0) 145 | 146 | if num_episodes > 2: 147 | train = True 148 | 149 | if train == True: 150 | 151 | print("training now...") 152 | minibatch = get_minibatch(D, BATCH) 153 | print("minibatch collected...") 154 | state_batch = [d[0] for d in minibatch] 155 | action_batch = [d[1] for d in minibatch] 156 | reward_batch = [d[2] for d in minibatch] 157 | done_batch = [d[3] for d in minibatch] 158 | print("minibatch state shape", np.asarray(state_batch).shape) 159 | # minibatch update 160 | net.train(np.asarray(state_batch).reshape(32,84,84,8), np.asarray(action_batch).reshape(32,4), 161 | np.asarray(reward_batch).reshape(32,1), np.asarray(done_batch).reshape(32,1)) 162 | 163 | if i == 3: 164 | #s_t = s_t1 165 | i = 0 166 | 167 | i += 1 168 | 169 | if done: 170 | num_episodes += 1 171 | break 172 | 173 | rollout() 174 | -------------------------------------------------------------------------------- /dqn_reg/dqn_reg_v4.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | The network is designed in order to distill expert 4 | policies which are trained on model based prediction. Therefore, to be 5 | consistent with the paper, the AMN comprises of the same architecture 6 | as the individual policies. 
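Concretely, net_v4 below combines a DQN loss with clipped TD error and a
latent-prediction regularizer: the encoder embedding of the observed next
state (f_true) is compared with a prediction made from the current state
embedding and the action embeddings (f_pred), and
0.5 * lambda_reg * ||f_true - f_pred||^2 is added to each sample's loss.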
7 | """ 8 | 9 | import tensorflow as tf 10 | import numpy as np 11 | import random 12 | 13 | 14 | class net_v4: 15 | 16 | # initiate tensorboard summaries 17 | def __init__(self, discount, clip_delta, batch, lambda_reg): 18 | 19 | self.discount = discount 20 | self.clip_delta = clip_delta 21 | self.batch = batch 22 | self.lambda_reg = lambda_reg 23 | 24 | #num_actions = tf.placeholder("uint8", ()) 25 | self.Q_val, self.f_pred, self.f_true = self.build_net(6) 26 | 27 | self.next_Q_val = tf.placeholder("float", [self.batch, 6]) 28 | #self.action = tf.placeholder("float", [self.batch, 4]) 29 | self.reward = tf.placeholder("float", [self.batch, ]) 30 | self.done = tf.placeholder("float", [self.batch, ]) 31 | 32 | target = self.reward + self.discount * \ 33 | tf.to_float((np.ones_like(self.done) - self.done)) * tf.reduce_max(self.next_Q_val, axis=1, keep_dims=True) 34 | 35 | # not yet clear what it does actually 36 | action_mask = np.equal(tf.reshape(np.arange(16), [1,-1]), tf.reshape(self.action[:,0], [-1,1])) 37 | out = tf.reshape(tf.reduce_sum((self.Q_val*action_mask), 1), [-1,1]) 38 | 39 | self.diff = target - out 40 | self.diff_reg = self.f_true - self.f_pred 41 | 42 | if self.clip_delta > 0: 43 | quadratic_part = tf.minimum(abs(self.diff), self.clip_delta) 44 | linear_part = abs(self.diff) - quadratic_part 45 | self.loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part 46 | else: 47 | self.loss = 0.5 * self.diff ** 2 48 | 49 | self.loss += tf.reduce_sum(0.5 * self.lambda_reg * (self.diff_reg ** 2), 1) 50 | self.loss = tf.reduce_sum(self.loss) 51 | tf.summary.scalar("loss", self.loss) 52 | 53 | optimizer = tf.train.AdamOptimizer(learning_rate = 0.00025) 54 | self.train_step = optimizer.minimize(self.loss) 55 | # sample an action from 56 | 57 | def q_val(self, state): 58 | 59 | return self.Q_val.eval(feed_dict = {self.state : state}) 60 | 61 | def train(self, state, action, reward, done, merged_summary_op): 62 | 63 | state_padded = np.zeros((state.shape[0], state.shape[1]+1, state.shape[2], state.shape[3])) 64 | state_padded[:,:-1] = state 65 | 66 | next_Q_val = self.Q_val.eval(feed_dict = {self.state : state_padded[:, 1:]}) 67 | 68 | self.train_step.run(feed_dict = { 69 | self.state : state_padded[:, :-1], 70 | self.action : action, 71 | self.next_Q_val : next_Q_val, 72 | self.reward : reward, 73 | self.done : done}) 74 | 75 | cost = self.loss.eval(feed_dict = { 76 | self.state : state, 77 | self.action : action, 78 | self.next_Q_val : next_Q_val, 79 | self.reward : reward, 80 | self.done : done}) 81 | diff_reg = self.diff_reg.eval(feed_dict = { 82 | self.state : state, 83 | self.action : action, 84 | self.next_Q_val : next_Q_val, 85 | self.reward : reward, 86 | self.done : done}) 87 | 88 | diff = self.diff.eval(feed_dict = { 89 | self.state : state, 90 | self.action : action, 91 | self.next_Q_val : next_Q_val, 92 | self.reward : reward, 93 | self.done : done}) 94 | #print("loss", np.sum(0.5 * self.lambda_reg * (diff_reg ** 2))) 95 | #print("loss diff", np.sum(diff)) 96 | print("loss total", cost) 97 | 98 | summary = merged_summary_op.eval(feed_dict = { 99 | self.state : state, 100 | self.action : action, 101 | self.next_Q_val : next_Q_val, 102 | self.reward : reward, 103 | self.done : done}) 104 | 105 | return summary 106 | 107 | def weight_variable(self, name, shape): 108 | initial = tf.contrib.layers.xavier_initializer() 109 | return tf.get_variable(name = name, shape = shape, initializer = initial) 110 | 111 | def bias_variable(self, shape): 112 | initial = tf.constant(0.1, 
shape = shape) 113 | return tf.Variable(initial) 114 | 115 | def conv2d(self, x, W, stride): 116 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 117 | 118 | def build_net(self, num_actions): 119 | 120 | self.state = tf.placeholder("float", [None, 84, 84, 8]) 121 | self.action = tf.placeholder("int32", [None, 4]) 122 | 123 | s_in = tf.reshape(self.state, [-1, 84, 84, 4]) 124 | a_in = tf.reshape(self.action, [-1, ]) 125 | # add action embeddings 126 | embeddings = tf.Variable(tf.random_uniform([num_actions, 256], -1.0, 1.0)) 127 | 128 | a_embed = tf.nn.embedding_lookup(embeddings, a_in) 129 | a_embed_reshaped = tf.reshape(a_embed, [-1, 4*256]) 130 | 131 | w1_conv = self.weight_variable("w1_conv", [8, 8, 4, 64]) 132 | b1_conv = self.bias_variable([64]) 133 | 134 | # 22 x 22 135 | w2_conv = self.weight_variable("w2_conv", [4, 4, 64, 64]) 136 | b2_conv = self.bias_variable([64]) 137 | 138 | # 10 x 10 139 | w3_conv = self.weight_variable("w3_conv", [3, 3, 64, 64]) 140 | b3_conv = self.bias_variable([64]) 141 | 142 | # 8 x 8 143 | w_fc1 = self.weight_variable("w_fc1", [3136, 512]) 144 | b_fc1 = self.bias_variable([512]) 145 | 146 | conv1 = tf.nn.relu(self.conv2d(s_in, w1_conv, 4) + b1_conv) 147 | conv2 = tf.nn.relu(self.conv2d(conv1, w2_conv, 2) + b2_conv) 148 | conv3 = tf.nn.relu(self.conv2d(conv2, w3_conv, 1) + b3_conv) 149 | 150 | print("conv1", conv1.shape) 151 | print("conv2", conv2.shape) 152 | print("conv3", conv3.shape) 153 | 154 | conv3_reshaped = tf.reshape(conv3, [-1, 7*7*64]) 155 | 156 | fc1 = tf.nn.relu(tf.matmul(conv3_reshaped, w_fc1) + b_fc1) 157 | fc1_reshaped = tf.reshape(fc1, [-1, 512*2]) 158 | 159 | l_curr_true = fc1_reshaped[:,0:512] 160 | l_next_true = fc1_reshaped[:,512:1024] 161 | 162 | w_fc_act = self.weight_variable("w_fc_act", [256*4, 512]) 163 | b_fc_act = self.bias_variable([512]) 164 | 165 | w_fc_curr = self.weight_variable("w_fc_curr", [512, 512]) 166 | b_fc_curr = self.bias_variable([512]) 167 | 168 | fc_act = tf.nn.relu(tf.matmul(a_embed_reshaped, w_fc_act) + b_fc_act) 169 | fc_curr = tf.nn.relu(tf.matmul(l_curr_true, w_fc_curr) + b_fc_curr) 170 | 171 | l_concat = tf.concat([fc_act, fc_curr], 1) 172 | 173 | w_fc_pred = self.weight_variable("w_fc_pred", [1024,512]) 174 | b_fc_pred = self.bias_variable([512]) 175 | 176 | fc_next_pred = tf.nn.relu(tf.matmul(l_concat, w_fc_pred) + b_fc_pred) 177 | 178 | w_fc2 = self.weight_variable("w_fc2", [512, num_actions]) 179 | b_fc2 = self.bias_variable([num_actions]) 180 | 181 | l_out = tf.nn.relu(tf.matmul(l_curr_true, w_fc2) + b_fc2) 182 | 183 | return l_out, fc_next_pred, l_next_true 184 | 185 | def Qval_to_action(self, Qval): 186 | 187 | #self.Qvalue = Qval 188 | num = tf.exp(Qval / T) 189 | policy = num / tf.reduce_sum(num) 190 | 191 | return policy 192 | -------------------------------------------------------------------------------- /dqn_reg/dqn_reg_v4.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/dqn_reg/dqn_reg_v4.pyc -------------------------------------------------------------------------------- /dqn_reg_models/network_file_49.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/dqn_reg_models/network_file_49.pkl -------------------------------------------------------------------------------- 
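The network_file_*.pkl files in this directory are pickled DeepQLearner instances (the class defined in q_network_reg_v4.py below), and train_AMN.py loads checkpoints of this form as expert policies. A minimal usage sketch, assuming Theano/Lasagne are installed and that q_network_reg_v4 (and its `updates` dependency) is importable under the same module name used when the object was pickled:

import pickle
import numpy as np

with open('network_file_50.pkl', 'rb') as f:
    expert = pickle.load(f)                       # a DeepQLearner instance

# for the latent_dnn_v4 network type, q_vals() expects num_frames*2 = 8 stacked 84x84 frames
state = np.zeros((8, 84, 84), dtype=np.float32)
print(expert.q_vals(state))                       # Q-values for a single state
print(expert.choose_action(state, epsilon=0.05))  # epsilon-greedy action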
/dqn_reg_models/network_file_50.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/dqn_reg_models/network_file_50.pkl -------------------------------------------------------------------------------- /dqn_reg_models/q_network_reg_v4.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for deep Q-learning as described in: 3 | 4 | Playing Atari with Deep Reinforcement Learning 5 | NIPS Deep Learning Workshop 2013 6 | 7 | and 8 | 9 | Human-level control through deep reinforcement learning. 10 | Nature, 518(7540):529-533, February 2015 11 | 12 | 13 | Author of Lasagne port: Nissan Pow 14 | Modifications: Nathan Sprague 15 | """ 16 | import lasagne 17 | import numpy as np 18 | import theano 19 | import theano.tensor as T 20 | from updates import deepmind_rmsprop 21 | 22 | 23 | class DeepQLearner: 24 | """ 25 | Deep Q-learning network using Lasagne. 26 | """ 27 | 28 | def __init__(self, input_width, input_height, num_actions, 29 | num_frames, discount, learning_rate, rho, 30 | rms_epsilon, momentum, clip_delta, freeze_interval, 31 | batch_size, network_type, update_rule, lambda_reg, 32 | batch_accumulator, pretrained_net, rng, input_scale=255.0): 33 | 34 | self.input_width = input_width 35 | self.input_height = input_height 36 | self.num_actions = num_actions 37 | self.num_frames = num_frames 38 | self.batch_size = batch_size 39 | self.discount = discount 40 | self.rho = rho 41 | self.lr = learning_rate 42 | self.rms_epsilon = rms_epsilon 43 | self.momentum = momentum 44 | self.clip_delta = clip_delta 45 | self.freeze_interval = freeze_interval 46 | self.rng = rng 47 | self.lambda_reg = lambda_reg 48 | 49 | lasagne.random.set_rng(self.rng) 50 | 51 | self.update_counter = 0 52 | 53 | self.l_in, self.l_act_in, self.l_out, self.pred_z, self.true_z = \ 54 | self.build_network(network_type, \ 55 | input_width, input_height, num_actions,\ 56 | num_frames, batch_size) 57 | 58 | if self.freeze_interval > 0: 59 | self.next_l_in, self.next_l_act_in, self.next_l_out, _d, _d = \ 60 | self.build_network(network_type, input_width, \ 61 | input_height, num_actions, num_frames, batch_size) 62 | self.reset_q_hat() 63 | 64 | states = T.tensor4('states') 65 | next_states = T.tensor4('next_states') 66 | rewards = T.col('rewards') 67 | actions = T.imatrix('actions') 68 | terminals = T.icol('terminals') 69 | 70 | # Shared variables for training from a minibatch of replayed 71 | # state transitions, each consisting of num_frames + 1 (due to 72 | # overlap) images, along with the chosen action and resulting 73 | # reward and terminal status. 74 | self.imgs_shared = theano.shared( 75 | np.zeros((batch_size, num_frames*2+1, input_height, input_width), 76 | dtype=theano.config.floatX)) 77 | self.rewards_shared = theano.shared( 78 | np.zeros((batch_size, 1), dtype=theano.config.floatX), 79 | broadcastable=(False, True)) 80 | self.actions_shared = theano.shared( 81 | np.zeros((batch_size, num_frames), dtype='int32') 82 | ) 83 | self.terminals_shared = theano.shared( 84 | np.zeros((batch_size, 1), dtype='int32'), 85 | broadcastable=(False, True)) 86 | 87 | # Shared variable for a single state, to calculate q_vals. 
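# Note the shared-variable pattern used throughout this class: the Theano functions
# compiled at the end of __init__ take no explicit inputs; train() and q_vals() copy
# minibatch data into these shared variables with set_value(), and the compiled
# functions read them through their `givens` mappings, so the data can stay resident
# in device memory between calls.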
88 | self.state_shared = theano.shared( 89 | np.zeros((num_frames*2, input_height, input_width), 90 | dtype=theano.config.floatX)) 91 | 92 | q_vals, z_pred, z_true = lasagne.layers.get_output( 93 | [self.l_out, self.pred_z, self.true_z], 94 | inputs = {self.l_in: states / input_scale, 95 | self.l_act_in: actions} 96 | ) 97 | 98 | if self.freeze_interval > 0: 99 | next_q_vals = lasagne.layers.get_output( 100 | self.next_l_out, 101 | {self.next_l_in: next_states / input_scale, 102 | self.next_l_act_in: actions} 103 | ) 104 | else: 105 | next_q_vals = lasagne.layers.get_output( 106 | self.l_out, 107 | {self.l_in: next_states / input_scale, 108 | self.l_act_in: actions} 109 | ) 110 | next_q_vals = theano.gradient.disconnected_grad(next_q_vals) 111 | 112 | terminalsX = terminals.astype(theano.config.floatX) 113 | actionmask = T.eq(T.arange(num_actions).reshape((1, -1)), 114 | actions[:, 0].reshape((-1, 1))).astype(theano.config.floatX) 115 | 116 | target = (rewards + 117 | (T.ones_like(terminalsX) - terminalsX) * 118 | self.discount * T.max(next_q_vals, axis=1, keepdims=True)) 119 | output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1)) 120 | diff = target - output 121 | diff_reg = z_true - z_pred 122 | 123 | if self.clip_delta > 0: 124 | # If we simply take the squared clipped diff as our loss, 125 | # then the gradient will be zero whenever the diff exceeds 126 | # the clip bounds. To avoid this, we extend the loss 127 | # linearly past the clip point to keep the gradient constant 128 | # in that regime. 129 | # 130 | # This is equivalent to declaring d loss/d q_vals to be 131 | # equal to the clipped diff, then backpropagating from 132 | # there, which is what the DeepMind implementation does. 133 | quadratic_part = T.minimum(abs(diff), self.clip_delta) 134 | linear_part = abs(diff) - quadratic_part 135 | loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part 136 | else: 137 | loss = 0.5 * diff ** 2 138 | 139 | loss = loss + 0.5 * self.lambda_reg * (diff_reg ** 2).sum(axis=1) 140 | 141 | if batch_accumulator == 'sum': 142 | loss = T.sum(loss) 143 | elif batch_accumulator == 'mean': 144 | loss = T.mean(loss) 145 | else: 146 | raise ValueError("Bad accumulator: {}".format(batch_accumulator)) 147 | 148 | params = lasagne.layers.helper.get_all_params([self.l_out, self.pred_z, self.true_z]) 149 | train_givens = { 150 | states: self.imgs_shared[:, :-1], 151 | next_states: self.imgs_shared[:, 1:], 152 | rewards: self.rewards_shared, 153 | actions: self.actions_shared, 154 | terminals: self.terminals_shared 155 | } 156 | 157 | if update_rule == 'deepmind_rmsprop': 158 | updates = deepmind_rmsprop(loss, params, self.lr, self.rho, 159 | self.rms_epsilon) 160 | elif update_rule == 'rmsprop': 161 | updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, 162 | self.rms_epsilon) 163 | elif update_rule == 'sgd': 164 | updates = lasagne.updates.sgd(loss, params, self.lr) 165 | else: 166 | raise ValueError("Unrecognized update: {}".format(update_rule)) 167 | 168 | if self.momentum > 0: 169 | updates = lasagne.updates.apply_momentum(updates, None, 170 | self.momentum) 171 | 172 | self._train = theano.function([], [loss], updates=updates, 173 | givens=train_givens) 174 | q_givens = { 175 | states: self.state_shared.reshape((1, 176 | self.num_frames*2, 177 | self.input_height, 178 | self.input_width)) 179 | } 180 | self._q_vals = theano.function([], q_vals[0], givens=q_givens) 181 | 182 | def build_network(self, network_type, input_width, input_height, 183 | output_dim, num_frames, 
batch_size): 184 | if network_type == "latent_dnn_v4": 185 | return self.build_latent_network_dnn_v4(input_width, input_height, 186 | output_dim, num_frames, 187 | batch_size) 188 | else: 189 | raise ValueError("Unrecognized network: {}".format(network_type)) 190 | 191 | def train(self, imgs, actions, rewards, terminals): 192 | """ 193 | Train one batch. 194 | 195 | Arguments: 196 | 197 | imgs - b x (2f) x h x w numpy array, where b is batch size, 198 | f is num frames, h is height and w is width. 199 | actions - b x 4 numpy array of integers 200 | rewards - b x 1 numpy array 201 | terminals - b x 1 numpy boolean array (currently ignored) 202 | 203 | Returns: average loss 204 | """ 205 | imgs_padded = np.zeros((imgs.shape[0], imgs.shape[1]+1, 206 | imgs.shape[2], imgs.shape[3]), dtype=np.float32) 207 | imgs_padded[:,:-1] = imgs 208 | 209 | self.imgs_shared.set_value(imgs_padded) 210 | self.actions_shared.set_value(actions) 211 | self.rewards_shared.set_value(rewards) 212 | self.terminals_shared.set_value(terminals) 213 | if (self.freeze_interval > 0 and 214 | self.update_counter % self.freeze_interval == 0): 215 | self.reset_q_hat() 216 | loss = self._train() 217 | self.update_counter += 1 218 | return np.sqrt(loss) 219 | 220 | def q_vals(self, state): 221 | self.state_shared.set_value(state) 222 | return self._q_vals() 223 | 224 | def choose_action(self, state, epsilon): 225 | if self.rng.rand() < epsilon: 226 | return self.rng.randint(0, self.num_actions) 227 | q_vals = self.q_vals(state) 228 | return np.argmax(q_vals) 229 | 230 | def reset_q_hat(self): 231 | all_params = lasagne.layers.helper.get_all_param_values(self.l_out) 232 | lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params) 233 | 234 | def build_latent_network_dnn_v4(self, input_width, input_height, output_dim, 235 | num_frames, batch_size): 236 | """ 237 | Build a large network consistent with the DeepMind Nature paper. 
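For 84x84 inputs, the (batch, num_frames*2, 84, 84) state tensor is reshaped to
(batch*2, num_frames, 84, 84) so that current and next states share the
convolutional encoder; with zero padding the spatial sizes go
84 -> (84-8)/4+1 = 20 -> (20-4)/2+1 = 9 -> (9-3)/1+1 = 7, i.e. 7*7*64 = 3136
features feeding the 512-unit dense embedding.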
238 | """ 239 | from lasagne.layers import dnn 240 | 241 | """ 242 | States input 243 | """ 244 | l_in = lasagne.layers.InputLayer( 245 | shape=(None, num_frames*2, input_width, input_height) 246 | ) 247 | 248 | """ 249 | Integer encoding input for actions 250 | """ 251 | l_act_in = lasagne.layers.InputLayer( 252 | shape=(None, 4) 253 | ) 254 | 255 | l_act_in_reshaped = lasagne.layers.ReshapeLayer( 256 | l_act_in, 257 | shape=(-1, ) 258 | ) 259 | 260 | """ 261 | Action embedding 262 | """ 263 | l_act_embed = lasagne.layers.EmbeddingLayer( 264 | l_act_in_reshaped, 265 | input_size=output_dim, 266 | output_size=256, 267 | W=lasagne.init.HeUniform() 268 | ) 269 | 270 | l_act_embed_reshaped = lasagne.layers.ReshapeLayer( 271 | l_act_embed, 272 | shape=(-1, num_frames*256) 273 | ) 274 | 275 | """ 276 | State embedding 277 | """ 278 | l_reshaped_in = lasagne.layers.ReshapeLayer( 279 | l_in, 280 | shape=(-1, num_frames, input_width, input_height) 281 | ) 282 | 283 | l_conv1 = dnn.Conv2DDNNLayer( 284 | l_reshaped_in, 285 | num_filters=32, 286 | filter_size=(8, 8), 287 | stride=(4, 4), 288 | nonlinearity=lasagne.nonlinearities.rectify, 289 | W=lasagne.init.HeUniform(), 290 | b=lasagne.init.Constant(.1) 291 | ) 292 | 293 | l_conv2 = dnn.Conv2DDNNLayer( 294 | l_conv1, 295 | num_filters=64, 296 | filter_size=(4, 4), 297 | stride=(2, 2), 298 | nonlinearity=lasagne.nonlinearities.rectify, 299 | W=lasagne.init.HeUniform(), 300 | b=lasagne.init.Constant(.1) 301 | ) 302 | 303 | l_conv3 = dnn.Conv2DDNNLayer( 304 | l_conv2, 305 | num_filters=64, 306 | filter_size=(3, 3), 307 | stride=(1, 1), 308 | nonlinearity=lasagne.nonlinearities.rectify, 309 | W=lasagne.init.HeUniform(), 310 | b=lasagne.init.Constant(.1) 311 | ) 312 | 313 | l_hidden1 = lasagne.layers.DenseLayer( 314 | l_conv3, 315 | num_units=512, 316 | nonlinearity=lasagne.nonlinearities.rectify, 317 | W=lasagne.init.HeUniform(), 318 | b=lasagne.init.Constant(.1) 319 | ) 320 | 321 | l_hidden_reshaped = lasagne.layers.ReshapeLayer( 322 | l_hidden1, 323 | shape=(-1, 512*2) 324 | ) 325 | 326 | """ 327 | "True" latent embeddings for current state and future state 328 | """ 329 | l_latent_1 = lasagne.layers.SliceLayer( 330 | l_hidden_reshaped, 331 | indices=slice(0, 512), 332 | axis=1 333 | ) 334 | 335 | l_out_3 = lasagne.layers.SliceLayer( 336 | l_hidden_reshaped, 337 | indices=slice(512, 1024), 338 | axis=1 339 | ) 340 | 341 | """ 342 | Future state latent embedding prediction using current 343 | state and future action embeddings 344 | """ 345 | l_act_project = lasagne.layers.DenseLayer( 346 | l_act_embed_reshaped, 347 | num_units=512, 348 | nonlinearity=lasagne.nonlinearities.rectify, 349 | W=lasagne.init.HeUniform(), 350 | b=lasagne.init.Constant(.1) 351 | ) 352 | 353 | l_state_project = lasagne.layers.DenseLayer( 354 | l_latent_1, 355 | num_units=512, 356 | nonlinearity=lasagne.nonlinearities.rectify, 357 | W=lasagne.init.HeUniform(), 358 | b=lasagne.init.Constant(.1) 359 | ) 360 | 361 | l_project_concat = lasagne.layers.ConcatLayer( 362 | [l_act_project, l_state_project], 363 | axis=1 364 | ) 365 | 366 | l_out_2 = lasagne.layers.DenseLayer( 367 | l_project_concat, 368 | num_units=512, 369 | nonlinearity=lasagne.nonlinearities.rectify, 370 | W=lasagne.init.HeUniform(), 371 | b=lasagne.init.Constant(.1) 372 | ) 373 | 374 | """ 375 | Action prediction based on current state 376 | """ 377 | l_out_1 = lasagne.layers.DenseLayer( 378 | l_latent_1, 379 | num_units=output_dim, 380 | nonlinearity=None, 381 | W=lasagne.init.HeUniform(), 382 | 
b=lasagne.init.Constant(.1) 383 | ) 384 | 385 | return l_in, l_act_in, l_out_1, l_out_2, l_out_3 386 | 387 | def main(): 388 | net = DeepQLearner(84, 84, 16, 4, .99, .00025, .95, .95, 10000, 389 | 32, 'nature_cuda') 390 | 391 | 392 | if __name__ == '__main__': 393 | main() 394 | -------------------------------------------------------------------------------- /dqn_reg_models/q_network_reg_v4.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/dqn_reg_models/q_network_reg_v4.pyc -------------------------------------------------------------------------------- /frame_prediction_atari/ae_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | ''' We use downsampled gray scale images - 84 X 84, 4 | consider only every 4th frame as input, applying 5 | the same action for the intermediate frames. 6 | Minibatch size is taken to be 32. Each input 7 | consists of a fixed memory of T = 4 to unroll 8 | each trajectory and pass in as an input. K, which 9 | is the prediction step parameter, taken to be 1''' 10 | 11 | ''' latest model is stored at /Downloads/models3/ ''' 12 | 13 | import argparse 14 | import os 15 | import sys 16 | import gym 17 | import numpy as np 18 | import tensorflow as tf 19 | from scipy.misc import imresize 20 | import random 21 | from collections import deque 22 | import cv2 23 | 24 | from baselines import deepq 25 | from baselines.common.atari_wrappers_deprecated import wrap_dqn 26 | from baselines.deepq.experiments.atari.model import model, dueling_model 27 | import baselines.common.tf_util as U 28 | from baselines.common.misc_util import ( 29 | boolean_flag, 30 | SimpleMonitor, 31 | ) 32 | 33 | # redundant as for now 34 | #flags = tf.app.flags 35 | #flags.DEFINE_boolean('train', True, 'Whether to do training or testing') 36 | #flags.DEFINE_string('env_name', 'PongNoFrameskip-v0', 'The name of gym environment to use') 37 | 38 | def parse_args(): 39 | parser = argparse.ArgumentParser("Run an already learned DQN model.") 40 | # Environment 41 | parser.add_argument("--env", type=str, required=True, help="name of the game") 42 | parser.add_argument("--is_train", default=True, help="name of the game") 43 | parser.add_argument("--model-dir", type=str, default=None, help="load model from this directory. 
") 44 | parser.add_argument("--video", type=str, default=None, help="Path to mp4 file where the video of first episode will be recorded.") 45 | boolean_flag(parser, "stochastic", default=True, help="whether or not to use stochastic actions according to models eps value") 46 | boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model") 47 | 48 | return parser.parse_args() 49 | 50 | args = parse_args() 51 | 52 | env = gym.make(args.env) 53 | env = wrap_dqn(env) 54 | 55 | epsilon = 0.35 56 | MAX_EPISODES = 100000 57 | BATCH = 32 # change to 1 while predicting 58 | max_iter = 10000 59 | ACTIONS = env.action_space.n 60 | FACTORS = 2048 61 | REPLAY_MEMORY = 1000000 62 | 63 | def weight_variable(shape): 64 | initial = tf.truncated_normal(shape, stddev = 0.01) 65 | return tf.Variable(initial) 66 | 67 | def bias_variable(shape): 68 | initial = tf.constant(0.0, shape = shape) 69 | return tf.Variable(initial) 70 | 71 | def conv2d(x, W, stride): 72 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") 73 | 74 | def conv2d_nopad(x, W, stride): 75 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 76 | 77 | def deconv2d(x, W, output_shape, stride): 78 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "SAME") 79 | 80 | def deconv2d_nopad(x, W, output_shape, stride): 81 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "VALID") 82 | 83 | def max_pool_2x2(x): 84 | return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 85 | 86 | class autoencoder(): 87 | 88 | def __init__(self, scope): 89 | 90 | self.scope = scope 91 | 92 | with tf.variable_scope(self.scope): 93 | #with tf.device('/gpu:0'): 94 | 95 | self.pred_frame = self.build_encoder() 96 | self.y = tf.placeholder("float", [BATCH, 84, 84]) 97 | self.loss = tf.square(tf.norm(self.y - self.pred_frame)) 98 | self.train_step = tf.train.AdamOptimizer(1e-4).minimize(self.loss) 99 | 100 | tf.summary.scalar("loss", self.loss) 101 | self.summary_writer = tf.summary.FileWriter(logs_path) 102 | self.summaries = tf.summary.merge_all() 103 | 104 | def build_encoder(self, ): 105 | 106 | # input - Batch X 84 X 84 X 4 107 | self.state = tf.placeholder("float", [None, 84, 84, 4]) 108 | self.action = tf.placeholder("float", [None, ACTIONS]) 109 | 110 | # 6 X 6 X 4 x 64 - stride 2 111 | W_conv1 = weight_variable([6, 6, 4, 64]) 112 | wconv = tf.get_variable("wconv", shape=[6, 6, 4, 64], initializer=tf.contrib.layers.xavier_initializer()) 113 | b_conv1 = bias_variable([64]) 114 | 115 | # 6 X 6 X 64 x 64 - stride 2 116 | W_conv2 = weight_variable([6, 6, 64, 64]) 117 | b_conv2 = bias_variable([64]) 118 | 119 | # 6 X 6 X 64 x 64 - stride 2 120 | W_conv3 = weight_variable([6, 6, 64, 64]) 121 | b_conv3 = bias_variable([64]) 122 | 123 | # _*16 ie. 
flattened output from conv3 124 | W_fc1 = weight_variable([10*10*64, 1024]) 125 | b_fc1 = bias_variable([1024]) 126 | 127 | #second fully connected layer - 2048 units 128 | W_fc2 = weight_variable([1024, 2048]) 129 | b_fc2 = bias_variable([2048]) 130 | 131 | #W_fc2 = weight_variable([256, ACTIONS]) 132 | #b_fc2 = bias_variable([ACTIONS]) 133 | 134 | conv1 = tf.nn.relu(conv2d_nopad(self.state, wconv, 2) + b_conv1) 135 | #padded_conv1 = tf.pad(conv1, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 136 | #print("padded shape", padded_conv1.shape) 137 | 138 | conv2 = tf.nn.relu(conv2d(conv1, W_conv2, 2) + b_conv2) 139 | #padded_conv2 = tf.pad(conv2, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 140 | 141 | conv3 = tf.nn.relu(conv2d(conv2, W_conv3, 2) + b_conv3) 142 | 143 | conv3_flat = tf.reshape(conv3, [-1, 10*10*64]) 144 | fc1 = tf.nn.relu(tf.matmul(conv3_flat, W_fc1) + b_fc1) 145 | fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2) 146 | 147 | # 6 X 6 X 4 x 64 - stride 2 148 | W_enc = weight_variable([FACTORS, 2048]) 149 | W_dec = weight_variable([2048, FACTORS]) 150 | W_action = weight_variable([FACTORS, ACTIONS]) 151 | b_interactions = bias_variable([2048]) 152 | 153 | #W_henc = tf.matmul(W_enc, fc2) 154 | #W_a = tf.matmul(W_action, action) 155 | #fc_interactions = tf.matmul(W_dec, tf.multiply(W_henc, W_a)) + b_interactions 156 | 157 | W_henc = tf.matmul(fc2, tf.transpose(W_enc)) 158 | W_a = tf.matmul(self.action, tf.transpose(W_action)) 159 | fc_interactions = tf.matmul(tf.multiply(W_henc, W_a), tf.transpose(W_dec)) + b_interactions 160 | 161 | # first fully connected layer after multiplicative interaction- 2048 162 | W_fc3 = weight_variable([2048, 1024]) 163 | b_fc3 = bias_variable([1024]) 164 | 165 | # second fully connected layer after multiplicative interaction- 1024 units 166 | W_fc4 = weight_variable([1024, 10*10*64]) 167 | b_fc4 = bias_variable([10*10*64]) 168 | 169 | fc3 = tf.nn.relu(tf.matmul(fc_interactions, W_fc3) + b_fc3) 170 | # TRYING OUT AN ALL CONV. NET 171 | #fc3 = tf.nn.relu(tf.matmul(fc2, W_fc3) + b_fc3) 172 | fc4 = tf.nn.relu(tf.matmul(fc3, W_fc4) + b_fc4) 173 | 174 | # reshaping into a 4-D matrix 175 | fc4_matrix = tf.reshape(fc4, [-1, 10, 10, 64]) 176 | 177 | # deconv variables 178 | W_deconv1 = weight_variable([6, 6, 64, 64]) 179 | b_deconv1 = bias_variable([64]) 180 | 181 | W_deconv2 = weight_variable([6, 6, 64, 64]) 182 | b_deconv2 = bias_variable([64]) 183 | 184 | W_deconv3 = weight_variable([6, 6, 1, 64]) 185 | b_deconv3 = bias_variable([1]) 186 | 187 | # output - 1 x 84 84 188 | deconv1 = tf.nn.relu(deconv2d(fc4_matrix, W_deconv1, (BATCH, 20, 20, 64), 2) + b_deconv1) 189 | deconv2 = tf.nn.relu(deconv2d(deconv1, W_deconv2, (BATCH, 40, 40, 64), 2) + b_deconv2) 190 | deconv3 = deconv2d_nopad(deconv2, W_deconv3, (BATCH, 84, 84, 1), 2) + b_deconv3 191 | 192 | 193 | #encode = tf.reshape(tf.image.resize_images(deconv3, [84, 84]), [-1, 84, 84]) 194 | encode = tf.reshape(deconv3, [-1, 84, 84]) 195 | 196 | return encode 197 | 198 | def predict(self, sess, s, a): 199 | """ 200 | Predicts the next state based on the current action. 201 | Args: 202 | sess: Tensorflow session 203 | s: State input of shape [batch_size, 4, 160, 160, 3] 204 | a : Action input of shape [batch_size, ACTIONS] 205 | Returns: 206 | Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated 207 | action values. 
208 | """ 209 | return sess.run(self.pred_frame, { self.state: s, self.action : a }) 210 | 211 | def update(self, sess, s, a, y, p, global_step): 212 | """ 213 | Updates the estimator towards the given targets. 214 | Args: 215 | sess: Tensorflow session object 216 | s: State input of shape [batch_size, 84, 84, 4] 217 | a: Chosen actions of shape [batch_size, ACTIONS] 218 | y: Targets of shape [batch_size, 84, 84] 219 | p : Predicted next observation frame of shape [batch_size, 84, 84] 220 | Returns: 221 | The calculated loss on the batch. 222 | """ 223 | feed_dict = { self.y : y, self.pred_frame : p, 224 | self.state : s, self.action : a } 225 | summaries, _, loss = sess.run( 226 | [self.summaries, self.train_step, self.loss], feed_dict) 227 | #print("summaries", summaries) 228 | if self.summary_writer: 229 | self.summary_writer.add_summary(summaries, global_step) 230 | return loss 231 | 232 | def rgb2gray(frame): 233 | 234 | r, g, b = frame[:,:,0], frame[:,:,1], frame[:,:,2] 235 | gray = 0.2989 * r + 0.5870 * g + 0.1140 * b 236 | 237 | return gray 238 | 239 | def preprocess(frame): 240 | 241 | gray_image = rgb2gray(frame) 242 | reshaped_image = cv2.resize(gray_image.astype(np.float32), (84, 84)) 243 | x = np.reshape(reshaped_image, [84,84,1]) 244 | x *= 1 / 255.0 245 | 246 | return x 247 | 248 | def rollout(sess, prediction_net, act): 249 | 250 | 251 | #tf.summary.scalar("Qval", encode) 252 | merged_summary_op = tf.summary.merge_all() 253 | 254 | sess.run(tf.variables_initializer(var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = "ae"))) 255 | saver = tf.train.Saver(var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = "ae")) 256 | print("ae varibles initialized and saver defined") 257 | 258 | checkpoint = tf.train.latest_checkpoint(checkpoint_dir) 259 | if checkpoint: 260 | saver.restore(sess, checkpoint) 261 | print("Loaded model checkpoint {}...".format(checkpoint)) 262 | 263 | #summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph()) 264 | 265 | D = deque() 266 | num_episodes = 0 267 | k = 0 268 | 269 | while num_episodes < MAX_EPISODES: 270 | s_t = env.reset() 271 | #ob = env.reset() 272 | #print("shape obf", np.asarray(obf).shape) 273 | 274 | #obf = preprocess(ob) 275 | #s_t = np.reshape(np.stack((obf, obf, obf, obf), axis=2), (84, 84, 4)) 276 | #observations, actions = [], [] 277 | 278 | #i = 0 279 | 280 | for t in range(10000): 281 | #env.render() #optional 282 | env.unwrapped.render() 283 | 284 | 285 | #action_id = env.action_space.sample() 286 | #action_id = random.randint(0,5) 287 | action_id = act(np.array(s_t)[None], stochastic=args.stochastic)[0] 288 | action_vector = np.zeros(ACTIONS) 289 | action_vector[action_id] = 1 290 | #actions.append(action_vector) 291 | 292 | s_t1, reward, done, info = env.step(action_id) 293 | #ob, reward, done, info = env.step(action_id) 294 | 295 | #obf = preprocess(ob) 296 | 297 | #s_t1 = np.append(obf, s_t[:,:,0:3], axis = 2) 298 | 299 | # if training, collect data and apply learning updates 300 | if args.is_train: 301 | # storing current state and the next frame 302 | D.append((np.array(s_t) / 255.0, action_vector, np.array(s_t1)[:, :, 3] / 255.0)) 303 | #D.append((s_t, action_vector, obf)) 304 | if len(D) > REPLAY_MEMORY: 305 | D.popleft() 306 | 307 | if num_episodes > 2: 308 | minibatch = random.sample(D, BATCH) 309 | action_batch = [d[1] for d in minibatch] 310 | state_batch = [d[0] for d in minibatch] 311 | target_batch = [d[2] for d in minibatch] 312 | target_batch = 
np.reshape(target_batch, (BATCH, 84, 84)) 313 | 314 | pred_batch = prediction_net.predict(sess, np.reshape(state_batch, (BATCH, 84, 84, 4)), np.reshape(action_batch, (BATCH, 6))) 315 | 316 | loss = prediction_net.update(sess, state_batch, action_batch, target_batch, pred_batch, k) 317 | 318 | #summary_writer.add_summary(summary, num_episodes) 319 | 320 | print("\riteration {} @ Episode {}/{}, loss {}".format(k, num_episodes, MAX_EPISODES, loss), end="") 321 | sys.stdout.flush() 322 | 323 | if k % 100000 == 0 and k != 0: 324 | print("\nsaving model now") 325 | saver.save(sess, save_path, global_step = t) 326 | 327 | k += 1 328 | # display the first frame of the minibatch 329 | cv2.imshow("prediction", pred_batch[0]) 330 | cv2.imshow("target", target_batch[0]) 331 | cv2.imshow("input", state_batch[0][:,:,0]) 332 | cv2.waitKey(5) 333 | 334 | else: 335 | #render video frames while testing 336 | prediction = prediction_net.predict(sess, np.reshape(np.asarray(s_t), (1, 84, 84, 4)), np.reshape(action_vector, (1, 6))) 337 | #print("prediction shape", prediction[0]) 338 | cv2.imshow("prediction", prediction[0]) 339 | cv2.waitKey(1) 340 | 341 | #k +=1 342 | #if i == 3: #maybe change to 4 343 | # i = 0 344 | #else: 345 | # i +=1 346 | 347 | s_t = s_t1 348 | 349 | if done: 350 | num_episodes += 1 351 | break 352 | 353 | 354 | #sess = tf.Session() 355 | 356 | checkpoint_dir = './checkpoints_ae/' 357 | save_path = './checkpoints_ae/' 358 | #save_path = '/home/manan/Downloads/models3/video_prediction.ckpt' 359 | #load_path='/home/manan/Downloads/models3/video_prediction.ckpt-302' 360 | logs_path = './logs/' 361 | 362 | #sess_dqn = U.make_session(4) 363 | #sess_dqn.as_default() 364 | #config=tf.ConfigProto(log_device_placement=True) 365 | 366 | with tf.Session() as sess: 367 | 368 | act = deepq.build_act( 369 | make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name), 370 | q_func=dueling_model if args.dueling else model, 371 | num_actions=env.action_space.n) 372 | 373 | U.load_state(os.path.join(args.model_dir, "saved")) 374 | #U.load_state('/tmp/models/model-atari-pong-1/saved') 375 | prediction_net = autoencoder("ae") 376 | rollout(sess, prediction_net, act) 377 | 378 | '''Pong : Actions 2,4 : up 379 | 3,5 : down 380 | 0,1 : no movement''' 381 | 382 | # basic code for simulating random policy 383 | '''for i_episode in range(2): 384 | observation = env.reset() 385 | ob = preprocess(observation) 386 | print(ob.shape) 387 | for t in range(10000) 388 | env.render() 389 | print(observation) 390 | if random.random() < epsilon: 391 | action = env.action_space.sample() 392 | else: 393 | action = 1 394 | observation, reward, done, info = env.step(action) 395 | #print(action) 396 | if done == True: 397 | print("Episode finished") 398 | break''' 399 | -------------------------------------------------------------------------------- /frame_prediction_atari/ae_random.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | ''' We use downsampled gray scale images - 84 X 84, 4 | consider only every 4th frame as input, applying 5 | the same action for the intermediate frames. 6 | Minibatch size is taken to be 32. Each input 7 | consists of a fixed memory of T = 4 to unroll 8 | each trajectory and pass in as an input. 
K, which 9 | is the prediction step parameter, taken to be 1''' 10 | 11 | ''' latest model is stored at /Downloads/models3/ ''' 12 | 13 | import sys 14 | import gym 15 | import numpy as np 16 | import tensorflow as tf 17 | from scipy.misc import imresize 18 | import random 19 | from collections import deque 20 | import cv2 21 | 22 | flags = tf.app.flags 23 | flags.DEFINE_boolean('train', True, 'Whether to do training or testing') 24 | flags.DEFINE_string('env_name', 'Pong-v0', 'The name of gym environment to use') 25 | 26 | env = gym.make(flags.FLAGS.env_name) 27 | 28 | epsilon = 0.35 29 | MAX_EPISODES = 10000 30 | BATCH = 32 # change to 1 while predicting 31 | max_iter = 10000 32 | ACTIONS = env.action_space.n 33 | FACTORS = 2048 34 | REPLAY_MEMORY = 1000000 35 | 36 | def weight_variable(shape): 37 | initial = tf.truncated_normal(shape, stddev = 0.01) 38 | return tf.Variable(initial) 39 | 40 | def bias_variable(shape): 41 | initial = tf.constant(0.0, shape = shape) 42 | return tf.Variable(initial) 43 | 44 | def conv2d(x, W, stride): 45 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") 46 | 47 | def conv2d_nopad(x, W, stride): 48 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 49 | 50 | def deconv2d(x, W, output_shape, stride): 51 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "SAME") 52 | 53 | def deconv2d_nopad(x, W, output_shape, stride): 54 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "VALID") 55 | 56 | def max_pool_2x2(x): 57 | return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 58 | 59 | class autoencoder(): 60 | 61 | def __init__(self, ): 62 | 63 | self.pred_frame = self.build_encoder() 64 | self.y = tf.placeholder("float", [BATCH, 84, 84]) 65 | self.loss = tf.square(tf.norm(self.y - self.pred_frame)) 66 | self.train_step = tf.train.AdamOptimizer(1e-4).minimize(self.loss) 67 | 68 | self.summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph()) 69 | self.summaries = tf.summary.merge_all(tf.summary.scalar("loss", self.loss)) 70 | 71 | def build_encoder(self, ): 72 | 73 | # input - Batch X 84 X 84 X 4 74 | self.state = tf.placeholder("float", [None, 84, 84, 4]) 75 | self.action = tf.placeholder("float", [None, ACTIONS]) 76 | 77 | # 6 X 6 X 4 x 64 - stride 2 78 | W_conv1 = weight_variable([6, 6, 4, 64]) 79 | wconv = tf.get_variable("wconv", shape=[6, 6, 4, 64], initializer=tf.contrib.layers.xavier_initializer()) 80 | b_conv1 = bias_variable([64]) 81 | 82 | # 6 X 6 X 64 x 64 - stride 2 83 | W_conv2 = weight_variable([6, 6, 64, 64]) 84 | b_conv2 = bias_variable([64]) 85 | 86 | # 6 X 6 X 64 x 64 - stride 2 87 | W_conv3 = weight_variable([6, 6, 64, 64]) 88 | b_conv3 = bias_variable([64]) 89 | 90 | # _*16 ie. 
flattened output from conv3 91 | W_fc1 = weight_variable([10*10*64, 1024]) 92 | b_fc1 = bias_variable([1024]) 93 | 94 | #second fully connected layer - 2048 units 95 | W_fc2 = weight_variable([1024, 2048]) 96 | b_fc2 = bias_variable([2048]) 97 | 98 | #W_fc2 = weight_variable([256, ACTIONS]) 99 | #b_fc2 = bias_variable([ACTIONS]) 100 | 101 | conv1 = tf.nn.relu(conv2d_nopad(self.state, wconv, 2) + b_conv1) 102 | #padded_conv1 = tf.pad(conv1, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 103 | #print("padded shape", padded_conv1.shape) 104 | 105 | conv2 = tf.nn.relu(conv2d(conv1, W_conv2, 2) + b_conv2) 106 | #padded_conv2 = tf.pad(conv2, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 107 | 108 | conv3 = tf.nn.relu(conv2d(conv2, W_conv3, 2) + b_conv3) 109 | 110 | conv3_flat = tf.reshape(conv3, [-1, 10*10*64]) 111 | fc1 = tf.nn.relu(tf.matmul(conv3_flat, W_fc1) + b_fc1) 112 | fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2) 113 | 114 | # 6 X 6 X 4 x 64 - stride 2 115 | W_enc = weight_variable([FACTORS, 2048]) 116 | W_dec = weight_variable([2048, FACTORS]) 117 | W_action = weight_variable([FACTORS, ACTIONS]) 118 | b_interactions = bias_variable([2048]) 119 | 120 | #W_henc = tf.matmul(W_enc, fc2) 121 | #W_a = tf.matmul(W_action, action) 122 | #fc_interactions = tf.matmul(W_dec, tf.multiply(W_henc, W_a)) + b_interactions 123 | 124 | W_henc = tf.matmul(fc2, tf.transpose(W_enc)) 125 | W_a = tf.matmul(self.action, tf.transpose(W_action)) 126 | fc_interactions = tf.matmul(tf.multiply(W_henc, W_a), tf.transpose(W_dec)) + b_interactions 127 | 128 | # first fully connected layer after multiplicative interaction- 2048 129 | W_fc3 = weight_variable([2048, 1024]) 130 | b_fc3 = bias_variable([1024]) 131 | 132 | # second fully connected layer after multiplicative interaction- 1024 units 133 | W_fc4 = weight_variable([1024, 10*10*64]) 134 | b_fc4 = bias_variable([10*10*64]) 135 | 136 | #fc3 = tf.nn.relu(tf.matmul(fc_interactions, W_fc3) + b_fc3) 137 | # TRYING OUT AN ALL CONV. NET 138 | fc3 = tf.nn.relu(tf.matmul(fc2, W_fc3) + b_fc3) 139 | fc4 = tf.nn.relu(tf.matmul(fc3, W_fc4) + b_fc4) 140 | 141 | # reshaping into a 4-D matrix 142 | fc4_matrix = tf.reshape(fc4, [-1, 10, 10, 64]) 143 | 144 | # deconv variables 145 | W_deconv1 = weight_variable([6, 6, 64, 64]) 146 | b_deconv1 = bias_variable([64]) 147 | 148 | W_deconv2 = weight_variable([6, 6, 64, 64]) 149 | b_deconv2 = bias_variable([64]) 150 | 151 | W_deconv3 = weight_variable([6, 6, 1, 64]) 152 | b_deconv3 = bias_variable([1]) 153 | 154 | # output - 1 x 84 84 155 | deconv1 = tf.nn.relu(deconv2d(fc4_matrix, W_deconv1, (BATCH, 20, 20, 64), 2) + b_deconv1) 156 | deconv2 = tf.nn.relu(deconv2d(deconv1, W_deconv2, (BATCH, 40, 40, 64), 2) + b_deconv2) 157 | deconv3 = deconv2d_nopad(deconv2, W_deconv3, (BATCH, 84, 84, 1), 2) + b_deconv3 158 | 159 | 160 | #encode = tf.reshape(tf.image.resize_images(deconv3, [84, 84]), [-1, 84, 84]) 161 | encode = tf.reshape(deconv3, [-1, 84, 84]) 162 | 163 | return encode 164 | 165 | def predict(self, sess, s, a): 166 | """ 167 | Predicts the next state based on the current action. 168 | Args: 169 | sess: Tensorflow session 170 | s: State input of shape [batch_size, 4, 160, 160, 3] 171 | a : Action input of shape [batch_size, ACTIONS] 172 | Returns: 173 | Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated 174 | action values. 
175 | """ 176 | return sess.run(self.pred_frame, { self.state: s, self.action : a }) 177 | 178 | def update(self, sess, s, a, y, p): 179 | """ 180 | Updates the estimator towards the given targets. 181 | Args: 182 | sess: Tensorflow session object 183 | s: State input of shape [batch_size, 84, 84, 4] 184 | a: Chosen actions of shape [batch_size, ACTIONS] 185 | y: Targets of shape [batch_size, 84, 84] 186 | p : Predicted next observation frame of shape [batch_size, 84, 84] 187 | Returns: 188 | The calculated loss on the batch. 189 | """ 190 | feed_dict = { self.y : y, self.pred_frame : p, 191 | self.state : s, self.action : a } 192 | _, loss = sess.run( 193 | [self.train_step, self.loss], feed_dict) 194 | #if self.summary_writer: 195 | # self.summary_writer.add_summary(summaries, global_step) 196 | return loss 197 | 198 | def rgb2gray(frame): 199 | 200 | r, g, b = frame[:,:,0], frame[:,:,1], frame[:,:,2] 201 | gray = 0.2989 * r + 0.5870 * g + 0.1140 * b 202 | 203 | return gray 204 | 205 | def preprocess(frame): 206 | 207 | gray_image = rgb2gray(frame) 208 | reshaped_image = cv2.resize(gray_image.astype(np.float32), (84, 84)) 209 | x = np.reshape(reshaped_image, [84,84,1]) 210 | x *= 1 / 255.0 211 | 212 | return x 213 | 214 | def rollout(sess, prediction_net): 215 | 216 | 217 | #tf.summary.scalar("Qval", encode) 218 | merged_summary_op = tf.summary.merge_all() 219 | 220 | sess.run(tf.initialize_all_variables()) 221 | saver = tf.train.Saver(tf.all_variables()) 222 | 223 | checkpoint = tf.train.latest_checkpoint(checkpoint_dir) 224 | if checkpoint: 225 | saver.restore(sess, checkpoint) 226 | print("Loaded model checkpoint {}...".format(checkpoint)) 227 | 228 | #summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph()) 229 | 230 | D = deque() 231 | num_episodes = 0 232 | k = 0 233 | 234 | while num_episodes < MAX_EPISODES: 235 | ob = env.reset() 236 | 237 | obf = preprocess(ob) 238 | s_t = np.reshape(np.stack((obf, obf, obf, obf), axis=2), (84, 84, 4)) 239 | observations, actions = [], [] 240 | 241 | i = 0 242 | 243 | for t in range(10000): 244 | env.render() #optional 245 | 246 | if i == 0: 247 | #action_id = env.action_space.sample() 248 | #action_id = random.randint(0,5) 249 | action_id = 0 250 | action_vector = np.zeros(ACTIONS) 251 | action_vector[action_id] = 1 252 | #actions.append(action_vector) 253 | 254 | ob, reward, done, info = env.step(action_id) 255 | 256 | obf = preprocess(ob) 257 | 258 | s_t1 = np.append(obf, s_t[:,:,0:3], axis = 2) 259 | 260 | # if training, collect data and apply learning updates 261 | if flags.FLAGS.train: 262 | # storing current state and the next frame 263 | D.append((s_t, action_vector, obf)) 264 | if len(D) > REPLAY_MEMORY: 265 | D.popleft() 266 | 267 | if num_episodes > 32: 268 | minibatch = random.sample(D, BATCH) 269 | action_batch = [d[1] for d in minibatch] 270 | state_batch = [d[0] for d in minibatch] 271 | target_batch = [d[2] for d in minibatch] 272 | target_batch = np.reshape(target_batch, (BATCH, 84, 84)) 273 | 274 | pred_batch = prediction_net.predict(sess, np.reshape(state_batch, (BATCH, 84, 84, 4)), np.reshape(action_batch, (BATCH, 6))) 275 | 276 | loss = prediction_net.update(sess, state_batch, action_batch, target_batch, pred_batch) 277 | 278 | #summary_writer.add_summary(summary, num_episodes) 279 | 280 | print("\riteration {} @ Episode {}/{}, loss {}".format(k, num_episodes, MAX_EPISODES, loss), end="") 281 | sys.stdout.flush() 282 | 283 | if k % 1000 == 0: 284 | print("\nsaving model now") 285 | saver.save(sess, 
save_path, global_step = t) 286 | 287 | # display the first frame of the minibatch 288 | cv2.imshow("prediction", pred_batch[0]) 289 | cv2.imshow("target", target_batch[0]) 290 | cv2.imshow("input", state_batch[0][:,:,0]) 291 | cv2.waitKey(5) 292 | 293 | else: 294 | #render video frames while testing 295 | prediction = prediction_net.predict(sess, np.reshape(s_t, (1, 84, 84, 4)), np.reshape(action_vector, (1, 6))) 296 | #print("prediction shape", prediction[0]) 297 | cv2.imshow("prediction", prediction[0]) 298 | cv2.waitKey(1) 299 | 300 | k += 1 301 | 302 | if i == 3: #maybe change to 4 303 | i = 0 304 | else: 305 | i +=1 306 | 307 | s_t = s_t1 308 | 309 | if done: 310 | num_episodes += 1 311 | break 312 | 313 | 314 | sess = tf.InteractiveSession() 315 | 316 | checkpoint_dir = '/home/manan/Downloads/models3/' 317 | save_path = '/home/manan/Downloads/models/video_prediction.ckpt' 318 | #save_path = '/home/manan/Downloads/models3/video_prediction.ckpt' 319 | #load_path='/home/manan/Downloads/models3/video_prediction.ckpt-302' 320 | logs_path = '/tmp/tensorboard_example' 321 | 322 | prediction_net = autoencoder() 323 | rollout(sess, prediction_net) 324 | '''Pong : Actions 2,4 : up 325 | 3,5 : down 326 | 0,1 : no movement''' 327 | 328 | # basic code for simulating random policy 329 | '''for i_episode in range(2): 330 | observation = env.reset() 331 | ob = preprocess(observation) 332 | print(ob.shape) 333 | for t in range(10000) 334 | env.render() 335 | print(observation) 336 | if random.random() < epsilon: 337 | action = env.action_space.sample() 338 | else: 339 | action = 1 340 | observation, reward, done, info = env.step(action) 341 | #print(action) 342 | if done == True: 343 | print("Episode finished") 344 | break''' 345 | -------------------------------------------------------------------------------- /frame_prediction_atari/tensorboard: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/frame_prediction_atari/tensorboard -------------------------------------------------------------------------------- /frame_prediction_atari/test_autoencoder_alter.py: -------------------------------------------------------------------------------- 1 | 2 | ''' We use downsampled gray scale images - 84 X 84, 3 | consider only every 4th frame as input, applying 4 | the same action for the intermediate frames. 5 | Minibatch size is taken to be 32. Each input 6 | consists of a fixed memory of T = 4 to unroll 7 | each trajectory and pass in as an input. 
K, which 8 | is the prediction step parameter, taken to be 1''' 9 | 10 | ''' latest model is stored at /Downloads/models3/ ''' 11 | 12 | import gym 13 | import numpy as np 14 | import tensorflow as tf 15 | from scipy.misc import imresize 16 | import random 17 | from collections import deque 18 | import cv2 19 | 20 | epsilon = 0.35 21 | MAX_EPISODES = 10000 22 | BATCH = 8 23 | max_iter = 10000 24 | ACTIONS = 6 25 | FACTORS = 2048 26 | REPLAY_MEMORY = 50 27 | 28 | def weight_variable(shape): 29 | initial = tf.truncated_normal(shape, stddev = 0.01) 30 | return tf.Variable(initial) 31 | 32 | def bias_variable(shape): 33 | initial = tf.constant(0.0, shape = shape) 34 | return tf.Variable(initial) 35 | 36 | def conv2d(x, W, stride): 37 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") 38 | 39 | def conv2d_nopad(x, W, stride): 40 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 41 | 42 | def deconv2d(x, W, output_shape, stride): 43 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "SAME") 44 | 45 | def deconv2d_nopad(x, W, output_shape, stride): 46 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "VALID") 47 | 48 | def max_pool_2x2(x): 49 | return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 50 | 51 | def autoencoder(): 52 | 53 | # input - Batch X 84 X 84 X 4 54 | state = tf.placeholder("float", [BATCH, 84, 84, 4]) 55 | action = tf.placeholder("float", [BATCH, ACTIONS]) 56 | 57 | # 6 X 6 X 4 x 64 - stride 2 58 | W_conv1 = weight_variable([6, 6, 4, 64]) 59 | wconv = tf.get_variable("wconv", shape=[6, 6, 4, 64], initializer=tf.contrib.layers.xavier_initializer()) 60 | b_conv1 = bias_variable([64]) 61 | 62 | # 6 X 6 X 64 x 64 - stride 2 63 | W_conv2 = weight_variable([6, 6, 64, 64]) 64 | b_conv2 = bias_variable([64]) 65 | 66 | # 6 X 6 X 64 x 64 - stride 2 67 | W_conv3 = weight_variable([6, 6, 64, 64]) 68 | b_conv3 = bias_variable([64]) 69 | 70 | # _*16 ie. 
flattened output from conv3 71 | W_fc1 = weight_variable([10*10*64, 1024]) 72 | b_fc1 = bias_variable([1024]) 73 | 74 | #second fully connected layer - 2048 units 75 | W_fc2 = weight_variable([1024, 2048]) 76 | b_fc2 = bias_variable([2048]) 77 | 78 | #W_fc2 = weight_variable([256, ACTIONS]) 79 | #b_fc2 = bias_variable([ACTIONS]) 80 | 81 | conv1 = tf.nn.relu(conv2d_nopad(state, wconv, 2) + b_conv1) 82 | #padded_conv1 = tf.pad(conv1, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 83 | #print("padded shape", padded_conv1.shape) 84 | 85 | conv2 = tf.nn.relu(conv2d(conv1, W_conv2, 2) + b_conv2) 86 | #padded_conv2 = tf.pad(conv2, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 87 | 88 | conv3 = tf.nn.relu(conv2d(conv2, W_conv3, 2) + b_conv3) 89 | 90 | conv3_flat = tf.reshape(conv3, [-1, 10*10*64]) 91 | fc1 = tf.nn.relu(tf.matmul(conv3_flat, W_fc1) + b_fc1) 92 | fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2) 93 | 94 | # 6 X 6 X 4 x 64 - stride 2 95 | W_enc = weight_variable([FACTORS, 2048]) 96 | W_dec = weight_variable([2048, FACTORS]) 97 | W_action = weight_variable([FACTORS, ACTIONS]) 98 | b_interactions = bias_variable([2048]) 99 | 100 | #W_henc = tf.matmul(W_enc, fc2) 101 | #W_a = tf.matmul(W_action, action) 102 | #fc_interactions = tf.matmul(W_dec, tf.multiply(W_henc, W_a)) + b_interactions 103 | 104 | W_henc = tf.matmul(fc2, tf.transpose(W_enc)) 105 | W_a = tf.matmul(action, tf.transpose(W_action)) 106 | fc_interactions = tf.matmul(tf.multiply(W_henc, W_a), tf.transpose(W_dec)) + b_interactions 107 | 108 | # first fully connected layer after multiplicative interaction- 2048 109 | W_fc3 = weight_variable([2048, 1024]) 110 | b_fc3 = bias_variable([1024]) 111 | 112 | # second fully connected layer after multiplicative interaction- 1024 units 113 | W_fc4 = weight_variable([1024, 10*10*64]) 114 | b_fc4 = bias_variable([10*10*64]) 115 | 116 | #fc3 = tf.nn.relu(tf.matmul(fc_interactions, W_fc3) + b_fc3) 117 | # TRYING OUT AN ALL CONV. 
NET 118 | fc3 = tf.nn.relu(tf.matmul(fc_interactions, W_fc3) + b_fc3) 119 | fc4 = tf.nn.relu(tf.matmul(fc3, W_fc4) + b_fc4) 120 | 121 | # reshaping into a 4-D matrix 122 | fc4_matrix = tf.reshape(fc4, [-1, 10, 10, 64]) 123 | 124 | # deconv variables 125 | W_deconv1 = weight_variable([6, 6, 64, 64]) 126 | b_deconv1 = bias_variable([64]) 127 | 128 | W_deconv2 = weight_variable([6, 6, 64, 64]) 129 | b_deconv2 = bias_variable([64]) 130 | 131 | W_deconv3 = weight_variable([6, 6, 1, 64]) 132 | b_deconv3 = bias_variable([1]) 133 | 134 | # output - 1 x 84 84 135 | deconv1 = tf.nn.relu(deconv2d(fc4_matrix, W_deconv1, (BATCH, 20, 20, 64), 2) + b_deconv1) 136 | deconv2 = tf.nn.relu(deconv2d(deconv1, W_deconv2, (BATCH, 40, 40, 64), 2) + b_deconv2) 137 | deconv3 = deconv2d_nopad(deconv2, W_deconv3, (BATCH, 84, 84, 1), 2) + b_deconv3 138 | 139 | 140 | #encode = tf.reshape(tf.image.resize_images(deconv3, [84, 84]), [-1, 84, 84]) 141 | encode = tf.reshape(deconv3, [-1, 84, 84]) 142 | 143 | return state, action, encode 144 | 145 | def preprocess(frame): 146 | gray_image = frame.mean(2) 147 | reshaped_image = imresize(gray_image, (84,84)) 148 | x = np.reshape(reshaped_image, [84,84,1]).astype(np.float32) 149 | x *= (1.0 / 128.0) 150 | # divide by 255 151 | ''' clipping code here ''' 152 | 153 | return x 154 | 155 | def rollout(state, action, encode): 156 | 157 | # reshape the predicted frame 158 | '''reshape code here''' 159 | 160 | y = tf.placeholder("float", [BATCH, 84, 84]) 161 | pred_frame = encode 162 | cost = tf.square(tf.norm(y - pred_frame)) 163 | train_step = tf.train.RMSPropOptimizer(1e-4).minimize(cost) 164 | 165 | print("working") 166 | sess.run(tf.initialize_all_variables()) 167 | saver = tf.train.Saver(tf.all_variables()) 168 | #saver.restore(sess, load_path) 169 | #print("variables restored and loaded...") 170 | 171 | D = deque() 172 | num_episodes = 0 173 | k = 0 174 | 175 | while num_episodes < MAX_EPISODES: 176 | ob = env.reset() 177 | 178 | obf = preprocess(ob) 179 | s_t = np.reshape(np.stack((obf, obf, obf, obf), axis=2), (84, 84, 4)) 180 | observations, actions = [], [] 181 | 182 | i = 1 183 | print("num of episodes", num_episodes) 184 | 185 | for t in range(10000): 186 | env.render() #optional 187 | 188 | if i == 1: 189 | #action_id = env.action_space.sample() 190 | action_id = 0 191 | action_vector = np.zeros(ACTIONS) 192 | action_vector[action_id] = 1 193 | actions.append(action_vector) 194 | #print("action size sample", action_vector) 195 | 196 | ob, reward, done, info = env.step(action_id) 197 | #if i == 1: 198 | # cv2.imshow("image", preprocess(ob)) 199 | # cv2.waitKey() 200 | #i += 1 201 | 202 | obf = preprocess(ob) 203 | s_t1 = np.append(obf, s_t[:,:,0:3], axis = 2) 204 | #observations.append(s_t1) 205 | ''' uncomment for training ''' 206 | if i == 1: 207 | observations.append(s_t1) 208 | #D.append((s_t, action_vector, obf)) 209 | #if len(D) > REPLAY_MEMORY: 210 | # D.popleft() 211 | 212 | if i == 3: #maybe change to 4 213 | i = 1 214 | else: 215 | i +=1 216 | 217 | ''' comment for training ''' 218 | '''prediction = encode.eval(feed_dict = {state : np.reshape(s_t, (1, 84, 84, 4)), action : np.reshape(action_vector, (1, 6))}) 219 | print("prediction shape", prediction[0]) 220 | cv2.imshow("prediction", prediction[0]) 221 | cv2.waitKey(1)''' 222 | 223 | s_t = s_t1 224 | 225 | #D.append((observations, actions)) 226 | #print("observations length", D[0][0].shape) 227 | 228 | #print("deque length", len(D[0][0])) 229 | 230 | ''' uncomment for training ''' 231 | #k = 0 232 | #while k < 
max_iter: 233 | if num_episodes > 32: 234 | 235 | minibatch = random.sample(D, BATCH) 236 | action_batch = [d[1] for d in minibatch] 237 | state_batch = [d[0] for d in minibatch] 238 | 239 | #print("state_batch shape" + str(state_batch[0][0].shape)) 240 | # the first frame of the second set of observations 241 | idx = random.randint(1, 300) 242 | target_batch = [d[idx][:,:,0] for d in state_batch] 243 | #print("target_batch shape" + str(target_batch[0].shape)) 244 | # the first set of 4 frames 245 | input_batch = [d[idx-1] for d in state_batch] 246 | #print("input_batch shape" + str(input_batch[0].shape)) 247 | action_input_batch = [d[0] for d in action_batch] 248 | 249 | # unroll 250 | for j in range(3): 251 | pred_batch = encode.eval(feed_dict = {action : np.reshape(action_input_batch, (BATCH, 6)), 252 | state : np.reshape(input_batch, (BATCH, 84, 84, 4))}) 253 | 254 | train_step.run(feed_dict = { 255 | y : target_batch, 256 | pred_frame : pred_batch, 257 | state : input_batch, 258 | action : action_input_batch}) 259 | loss = cost.eval(feed_dict = {y : target_batch, 260 | pred_frame : pred_batch, 261 | state : input_batch, 262 | action : action_input_batch}) 263 | 264 | print("iteration : ", k) 265 | print("loss : ", loss) 266 | #print("j is :", j) 267 | 268 | if k % 1000 == 0: 269 | print("saving model now") 270 | saver.save(sess, save_path, global_step = t) 271 | 272 | #if k == max_iter - 1: 273 | cv2.imshow("prediction", pred_batch[0]) 274 | cv2.imshow("target", target_batch[0]) 275 | cv2.imshow("input", input_batch[0][:,:,0]) 276 | #if k % 500 == 0: 277 | #cv2.imwrite('prediction%s.jpg' %k, pred_batch[0]) 278 | cv2.waitKey(5) 279 | 280 | k += 1 281 | 282 | pred_batch = np.reshape(pred_batch, (BATCH, 84, 84, 1)) 283 | target_batch = [d[idx][:,:,j+1] for d in state_batch] 284 | temp = [d[:,:,0:3] for d in input_batch] 285 | #print("pred_batch shape", pred_batch.shape) 286 | #print("temp shape", len(temp), temp[0].shape) 287 | input_batch = np.append(pred_batch, temp, axis = 3) 288 | 289 | if done: 290 | num_episodes += 1 291 | D.append((observations, actions)) 292 | if len(D) > REPLAY_MEMORY: 293 | D.popleft() 294 | break 295 | 296 | env = gym.make('Pong-v0') 297 | sess = tf.InteractiveSession() 298 | save_path = '/home/manan/Downloads/models3/video_prediction.ckpt' 299 | load_path='/home/manan/Downloads/models3/video_prediction.ckpt-1000' 300 | 301 | state, action, encode = autoencoder() 302 | rollout(state, action, encode) 303 | '''Pong : Actions 2,4 : up 304 | 3,5 : down 305 | 0,1 : no movement''' 306 | 307 | '''for i_episode in range(2): 308 | observation = env.reset() 309 | ob = preprocess(observation) 310 | print(ob.shape) 311 | for t in range(10000) 312 | env.render() 313 | print(observation) 314 | if random.random() < epsilon: 315 | action = env.action_space.sample() 316 | else: 317 | action = 1 318 | observation, reward, done, info = env.step(action) 319 | #print(action) 320 | if done == True: 321 | print("Episode finished") 322 | break''' 323 | -------------------------------------------------------------------------------- /frame_prediction_atari/test_multistep_autoencoder.py: -------------------------------------------------------------------------------- 1 | 2 | ''' We use downsampled gray scale images - 84 X 84, 3 | consider only every 4th frame as input, applying 4 | the same action for the intermediate frames. 5 | Minibatch size is taken to be 32. Each input 6 | consists of a fixed memory of T = 4 to unroll 7 | each trajectory and pass in as an input. 
K, which 8 | is the prediction step parameter, taken to be 1''' 9 | 10 | ''' latest model is stored at /Downloads/models3/ ''' 11 | 12 | import gym 13 | import numpy as np 14 | import tensorflow as tf 15 | from scipy.misc import imresize 16 | import random 17 | from collections import deque 18 | import cv2 19 | 20 | epsilon = 0.35 21 | MAX_EPISODES = 10000 22 | BATCH = 32 23 | max_iter = 10000 24 | ACTIONS = 6 25 | FACTORS = 2048 26 | REPLAY_MEMORY = 1000000 27 | num_steps = 3 28 | H = 4 29 | 30 | def weight_variable(shape): 31 | initial = tf.truncated_normal(shape, stddev = 0.01) 32 | return tf.Variable(initial) 33 | 34 | def bias_variable(shape): 35 | initial = tf.constant(0.0, shape = shape) 36 | return tf.Variable(initial) 37 | 38 | def conv2d(x, W, stride): 39 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") 40 | 41 | def conv2d_nopad(x, W, stride): 42 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 43 | 44 | def deconv2d(x, W, output_shape, stride): 45 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "SAME") 46 | 47 | def deconv2d_nopad(x, W, output_shape, stride): 48 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "VALID") 49 | 50 | def max_pool_2x2(x): 51 | return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 52 | 53 | def autoencoder(): 54 | 55 | # input - Batch X 84 X 84 X 4 56 | state = tf.placeholder("float", [BATCH, 84, 84, 4]) 57 | action = tf.placeholder("float", [BATCH, ACTIONS]) 58 | 59 | # 6 X 6 X 4 x 64 - stride 2 60 | W_conv1 = weight_variable([6, 6, 4, 64]) 61 | wconv = tf.get_variable("wconv", shape=[6, 6, 4, 64], initializer=tf.contrib.layers.xavier_initializer()) 62 | b_conv1 = bias_variable([64]) 63 | 64 | # 6 X 6 X 64 x 64 - stride 2 65 | W_conv2 = weight_variable([6, 6, 64, 64]) 66 | b_conv2 = bias_variable([64]) 67 | 68 | # 6 X 6 X 64 x 64 - stride 2 69 | W_conv3 = weight_variable([6, 6, 64, 64]) 70 | b_conv3 = bias_variable([64]) 71 | 72 | # _*16 ie. 
flattened output from conv3 73 | W_fc1 = weight_variable([10*10*64, 1024]) 74 | b_fc1 = bias_variable([1024]) 75 | 76 | #second fully connected layer - 2048 units 77 | W_fc2 = weight_variable([1024, 2048]) 78 | b_fc2 = bias_variable([2048]) 79 | 80 | #W_fc2 = weight_variable([256, ACTIONS]) 81 | #b_fc2 = bias_variable([ACTIONS]) 82 | 83 | conv1 = tf.nn.relu(conv2d_nopad(state, wconv, 2) + b_conv1) 84 | #padded_conv1 = tf.pad(conv1, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 85 | #print("padded shape", padded_conv1.shape) 86 | 87 | conv2 = tf.nn.relu(conv2d(conv1, W_conv2, 2) + b_conv2) 88 | #padded_conv2 = tf.pad(conv2, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 89 | 90 | conv3 = tf.nn.relu(conv2d(conv2, W_conv3, 2) + b_conv3) 91 | 92 | conv3_flat = tf.reshape(conv3, [-1, 10*10*64]) 93 | fc1 = tf.nn.relu(tf.matmul(conv3_flat, W_fc1) + b_fc1) 94 | fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2) 95 | 96 | # 6 X 6 X 4 x 64 - stride 2 97 | W_enc = weight_variable([FACTORS, 2048]) 98 | W_dec = weight_variable([2048, FACTORS]) 99 | W_action = weight_variable([FACTORS, ACTIONS]) 100 | b_interactions = bias_variable([2048]) 101 | 102 | #W_henc = tf.matmul(W_enc, fc2) 103 | #W_a = tf.matmul(W_action, action) 104 | #fc_interactions = tf.matmul(W_dec, tf.multiply(W_henc, W_a)) + b_interactions 105 | 106 | W_henc = tf.matmul(fc_interactions, tf.transpose(W_enc)) 107 | W_a = tf.matmul(action, tf.transpose(W_action)) 108 | fc_interactions = tf.matmul(tf.multiply(W_henc, W_a), tf.transpose(W_dec)) + b_interactions 109 | 110 | # first fully connected layer after multiplicative interaction- 2048 111 | W_fc3 = weight_variable([2048, 1024]) 112 | b_fc3 = bias_variable([1024]) 113 | 114 | # second fully connected layer after multiplicative interaction- 1024 units 115 | W_fc4 = weight_variable([1024, 10*10*64]) 116 | b_fc4 = bias_variable([10*10*64]) 117 | 118 | #fc3 = tf.nn.relu(tf.matmul(fc_interactions, W_fc3) + b_fc3) 119 | # TRYING OUT AN ALL CONV. 
NET 120 | fc3 = tf.nn.relu(tf.matmul(fc2, W_fc3) + b_fc3) 121 | fc4 = tf.nn.relu(tf.matmul(fc3, W_fc4) + b_fc4) 122 | 123 | # reshaping into a 4-D matrix 124 | fc4_matrix = tf.reshape(fc4, [-1, 10, 10, 64]) 125 | 126 | # deconv variables 127 | W_deconv1 = weight_variable([6, 6, 64, 64]) 128 | b_deconv1 = bias_variable([64]) 129 | 130 | W_deconv2 = weight_variable([6, 6, 64, 64]) 131 | b_deconv2 = bias_variable([64]) 132 | 133 | W_deconv3 = weight_variable([6, 6, 1, 64]) 134 | b_deconv3 = bias_variable([1]) 135 | 136 | # output - 1 x 84 84 137 | deconv1 = tf.nn.relu(deconv2d(fc4_matrix, W_deconv1, (BATCH, 20, 20, 64), 2) + b_deconv1) 138 | deconv2 = tf.nn.relu(deconv2d(deconv1, W_deconv2, (BATCH, 40, 40, 64), 2) + b_deconv2) 139 | deconv3 = deconv2d_nopad(deconv2, W_deconv3, (BATCH, 84, 84, 1), 2) + b_deconv3 140 | 141 | 142 | #encode = tf.reshape(tf.image.resize_images(deconv3, [84, 84]), [-1, 84, 84]) 143 | encode = tf.reshape(deconv3, [-1, 84, 84]) 144 | 145 | return state, action, encode 146 | 147 | def preprocess(frame): 148 | gray_image = frame.mean(2) 149 | reshaped_image = imresize(gray_image, (84,84)) 150 | x = np.reshape(reshaped_image, [84,84,1]).astype(np.float32) 151 | x *= (1.0 / 255.0) 152 | # divide by 255 153 | ''' clipping code here ''' 154 | 155 | return x 156 | 157 | def rollout(state, action, encode): 158 | 159 | # reshape the predicted frame 160 | '''reshape code here''' 161 | 162 | y = tf.placeholder("float", [BATCH, 84, 84]) 163 | pred_frame = encode 164 | cost = tf.square(tf.norm(y - pred_frame)) 165 | train_step = tf.train.AdamOptimizer(1e-4).minimize(cost) 166 | 167 | print("working") 168 | sess.run(tf.initialize_all_variables()) 169 | saver = tf.train.Saver(tf.all_variables()) 170 | #saver.restore(sess, load_path) 171 | #print("variables restored and loaded...") 172 | 173 | D = deque() 174 | num_episodes = 0 175 | k = 0 176 | 177 | while num_episodes < MAX_EPISODES: 178 | ob = env.reset() 179 | 180 | obf = preprocess(ob) 181 | s = () 182 | for i in range(num_steps + 4): 183 | s += obf 184 | s_t = np.reshape(np.stack(s, axis=2), (84, 84, H)) 185 | observations, actions = [], [] 186 | 187 | i = 0 188 | print("num of episodes", num_episodes) 189 | 190 | for t in range(10000): 191 | env.render() #optional 192 | 193 | if i == 0: 194 | #action_id = env.action_space.sample() 195 | action_id = 0 196 | action_vector = np.zeros(ACTIONS) 197 | action_vector[action_id] = 1 198 | #actions.append(action_vector) 199 | #print("action size sample", action_vector) 200 | 201 | ob, reward, done, info = env.step(action_id) 202 | #if i == 1: 203 | # cv2.imshow("image", preprocess(ob)) 204 | # cv2.waitKey() 205 | #i += 1 206 | 207 | obf = preprocess(ob) 208 | 209 | #observations.append(s_t1) 210 | ''' uncomment for training ''' 211 | #if i == 0: 212 | s_t = np.append(obf, s_t[:,:,0:num_steps + H - 1], axis = 2) 213 | x_t = s_t[:,:,num_steps + H - 1:] 214 | y_t = s_t[:,:,0:num_steps + H -1] 215 | D.append((x_t, action_vector, y_t)) 216 | if len(D) > REPLAY_MEMORY: 217 | D.popleft() 218 | 219 | if i == H - 1: #maybe change to 4 220 | i = 0 221 | else: 222 | i +=1 223 | 224 | ''' comment for training ''' 225 | '''prediction = encode.eval(feed_dict = {state : np.reshape(s_t, (1, 84, 84, 4)), action : np.reshape(action_vector, (1, 6))}) 226 | print("prediction shape", prediction[0]) 227 | cv2.imshow("prediction", prediction[0]) 228 | cv2.waitKey(1)''' 229 | 230 | s_t = s_t1 231 | 232 | #D.append((observations, actions)) 233 | #print("observations length", D[0][0].shape) 234 | 235 | 
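# Training (below, once enough episodes are stored): train() samples a
# minibatch of (stacked input frames, one-hot action, target frames) tuples
# from D and unrolls the predictor num_steps times, appending each predicted
# frame back onto the input stack before making the next prediction.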
#print("deque length", len(D[0][0])) 236 | 237 | ''' uncomment for training ''' 238 | #k = 0 239 | #while k < max_iter: 240 | if num_episodes > 20: 241 | 242 | minibatch = random.sample(D, BATCH) 243 | train(minibatch, k) 244 | 245 | k += 1 246 | 247 | if done: 248 | num_episodes += 1 249 | break 250 | 251 | def train(minibatch, k): 252 | 253 | action_batch = [d[1] for d in minibatch] 254 | x_batch = [d[0] for d in minibatch] 255 | y_batch = [d[2] for d in minibatch] 256 | target_batch = np.reshape(y_batch, (BATCH, 84, 84)) 257 | 258 | # include changing actions in predictions as well 259 | for i in range(num_steps): 260 | pred_batch = encode.eval(feed_dict = {action : np.reshape(action_batch, (BATCH, 6)), 261 | state : np.reshape(x_batch, (BATCH, 84, 84, H))}) 262 | x_batch.pop(0) 263 | x_batch.append(pred_batch) 264 | train_step.run(feed_dict = { 265 | y : target_batch[:,:,:,i], 266 | pred_frame : pred_batch, 267 | state : x_batch, 268 | action : action_batch}) 269 | loss = cost.eval(feed_dict = {y : target_batch[:,:,:,i], 270 | pred_frame : pred_batch, 271 | state : x_batch, 272 | action : action_batch}) 273 | 274 | print("iteration : ", k) 275 | print("loss : ", loss) 276 | 277 | if k % 1000 == 0: 278 | print("saving model now") 279 | saver.save(sess, save_path, global_step = t) 280 | 281 | #if k == max_iter - 1: 282 | cv2.imshow("prediction", pred_batch[0]) 283 | cv2.imshow("target", target_batch[0]) 284 | cv2.imshow("input", state_batch[0][:,:,0]) 285 | #if k % 500 == 0: 286 | #cv2.imwrite('prediction%s.jpg' %k, pred_batch[0]) 287 | cv2.waitKey(5) 288 | 289 | env = gym.make('MsPacman-v0') 290 | sess = tf.InteractiveSession() 291 | save_path = '/home/manan/Downloads/models3/video_prediction.ckpt' 292 | load_path='/home/manan/Downloads/models3/video_prediction.ckpt-1000' 293 | 294 | state, action, encode = autoencoder() 295 | rollout(state, action, encode) 296 | '''Pong : Actions 2,4 : up 297 | 3,5 : down 298 | 0,1 : no movement''' 299 | 300 | '''for i_episode in range(2): 301 | observation = env.reset() 302 | ob = preprocess(observation) 303 | print(ob.shape) 304 | for t in range(10000) 305 | env.render() 306 | print(observation) 307 | if random.random() < epsilon: 308 | action = env.action_space.sample() 309 | else: 310 | action = 1 311 | observation, reward, done, info = env.step(action) 312 | #print(action) 313 | if done == True: 314 | print("Episode finished") 315 | break''' 316 | -------------------------------------------------------------------------------- /images/image_screenshot8_24.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/image_screenshot8_24.05.2017.png -------------------------------------------------------------------------------- /images/image_screenshot9_24.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/image_screenshot9_24.05.2017.png -------------------------------------------------------------------------------- /images/image_screenshot_23.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/image_screenshot_23.05.2017.png 
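All three frame-prediction scripts above condition the encoded frame features on the chosen action through the same multiplicative interaction (the W_enc, W_action and W_dec weights): the hidden features and the one-hot action are each projected into a common factor space, multiplied element-wise, and decoded back to the hidden size before the deconvolution stack. A minimal NumPy sketch of just that transformation follows; the shapes mirror the scripts (FACTORS = 2048, hidden size 2048, ACTIONS = 6), while the function name and the random placeholder weights are illustrative only, not trained parameters.

import numpy as np

BATCH, HIDDEN, FACTORS, ACTIONS = 32, 2048, 2048, 6

rng = np.random.RandomState(0)
W_enc = rng.randn(FACTORS, HIDDEN) * 0.01      # projects frame features into factor space
W_action = rng.randn(FACTORS, ACTIONS) * 0.01  # projects the one-hot action into the same space
W_dec = rng.randn(HIDDEN, FACTORS) * 0.01      # maps the gated factors back to the hidden size
b_interactions = np.zeros(HIDDEN)

def action_conditioned_features(h, a):
    """h: [BATCH, HIDDEN] encoder output (fc2); a: [BATCH, ACTIONS] one-hot actions."""
    h_factors = h.dot(W_enc.T)                  # [BATCH, FACTORS]
    a_factors = a.dot(W_action.T)               # [BATCH, FACTORS]
    gated = h_factors * a_factors               # element-wise multiplicative interaction
    return gated.dot(W_dec.T) + b_interactions  # [BATCH, HIDDEN], input to the decoder stack

h = rng.randn(BATCH, HIDDEN)
a = np.eye(ACTIONS)[rng.randint(ACTIONS, size=BATCH)]
print(action_conditioned_features(h, a).shape)  # (32, 2048)

Note that the scripts differ in whether this interaction is actually wired into the decoder: test_autoencoder_alter.py feeds fc_interactions into fc3, while the other two currently bypass it and feed fc2 directly, as the "TRYING OUT AN ALL CONV. NET" comments indicate.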
-------------------------------------------------------------------------------- /images/prediction_screenshot10_24.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/prediction_screenshot10_24.05.2017.png -------------------------------------------------------------------------------- /images/prediction_screenshot11_24.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/prediction_screenshot11_24.05.2017.png -------------------------------------------------------------------------------- /images/prediction_screenshot12_25.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/prediction_screenshot12_25.05.2017.png -------------------------------------------------------------------------------- /images/prediction_screenshot13_25.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/prediction_screenshot13_25.05.2017.png -------------------------------------------------------------------------------- /images/prediction_screenshot13_26.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/prediction_screenshot13_26.05.2017.png -------------------------------------------------------------------------------- /images/prediction_screenshot14_26.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/prediction_screenshot14_26.05.2017.png -------------------------------------------------------------------------------- /misc/caffe_atari_cnn.py: -------------------------------------------------------------------------------- 1 | def add_cnn(n, data, act, batch_size, T, K, num_step, mode='train'): 2 | # data : batch_size x T x 3 x height x width 3 | n.x_flat = L.Flatten(data, axis=1, end_axis=2) 4 | # n.x_flat : batch_size x T*3 x height x width 5 | n.act_flat = L.Flatten(act, axis=1, end_axis=2) 6 | if mode == 'train': 7 | x = L.Slice(n.x_flat, axis=1, ntop=T) 8 | # x : T layers of size : batch_size x 3 x height x width 9 | act_slice = L.Slice(n.act_flat, axis=1, ntop=T-1) 10 | x_set = () 11 | label_set = () 12 | x_hat_set = () 13 | silence_set = () 14 | for i in range(T): 15 | t = tag(i+1) 16 | # n.tops[x1] : batch_size x 3 x height x width 17 | # n.tops[x2] : batch_size x 3 x height x width 18 | n.tops['x'+t] = x[i] 19 | if i < K: 20 | # storing just the first four frames in x_set 21 | x_set += (x[i],) 22 | if i < T - 1: 23 | n.tops['act'+t] = act_slice[i] 24 | if i < K - 1: 25 | silence_set += (n.tops['act'+t],) 26 | if i >= K: 27 | # storing the fifth frame as the label 28 | label_set += (x[i],) 29 | # not important for 1 step prediction, 30 | # produces : batch_size x 3 x height x width 31 | n.label = L.Concat(*label_set, axis=0) 32 | # converting to list 33 | input_list = 
list(x_set) 34 | # not important as no. of steps is 1 35 | for step in range(0, num_step): 36 | step_tag = tag(step + 1) if step > 0 else '' 37 | t = tag(step + K) 38 | tp = tag(step + K + 1) 39 | input_tuple = tuple(input_list) 40 | # concatenating all 4 frames together 41 | n.tops['input'+step_tag] = L.Concat(*input_tuple, axis=1) 42 | # passing through the feed-forward net 43 | top = add_conv_enc(n, n.tops['input'+step_tag], tag=step_tag) 44 | n.tops['x_hat'+tp] = add_decoder(n, top, n.tops['act'+t], flatten=False, 45 | tag=step_tag) 46 | # using the predicted values to form the input for the next prediction 47 | input_list.pop(0) 48 | input_list.append(n.tops['x_hat'+tp]) 49 | else: 50 | top = add_conv_enc(n, n.x_flat) 51 | n.tops['x_hat'+tag(K+1)] = add_decoder(n, top, n.act_flat, flatten=False) 52 | if mode == 'train': 53 | x_hat = () 54 | # for 1 step prediciton, just runs once for i = 4 55 | for i in range(K, T): 56 | t = tag(i+1) 57 | # prediction for the 5th frame comes from the net 58 | x_hat += (n.tops['x_hat'+t],) 59 | # concatenate all predictions 60 | n.x_hat = L.Concat(*x_hat, axis=0) 61 | n.silence = L.Silence(*silence_set, ntop=0) 62 | # takes the predcition for the 5th frame and output label 63 | # both are of size batch_size x 3 x height x width 64 | n.l2_loss = L.EuclideanLoss(n.x_hat, n.label) 65 | return n 66 | -------------------------------------------------------------------------------- /misc/policy_gradients.py: -------------------------------------------------------------------------------- 1 | 2 | """ implements a simple policy gradient (actor critic technically) agent """ 3 | 4 | import argparse 5 | import gym 6 | import time 7 | from gym.spaces import Discrete 8 | import numpy as np 9 | from scipy.signal import lfilter 10 | from scipy.misc import imsave, imresize 11 | import tensorflow as tf 12 | import tensorflow.contrib.slim as slim 13 | 14 | parser = argparse.ArgumentParser(description=None) 15 | parser.add_argument('-e', '--env', default='Breakout-v3', type=str, help='gym environment') 16 | parser.add_argument('-b', '--batch_size', default=10000, type=int, help='batch size to use during learning') 17 | parser.add_argument('-l', '--learning_rate', default=1e-3, type=float, help='used for Adam') 18 | parser.add_argument('-g', '--discount', default=0.99, type=float, help='reward discount rate to use') 19 | parser.add_argument('-n', '--hidden_size', default=20, type=int, help='number of hidden units in net') 20 | parser.add_argument('-c', '--gradient_clip', default=40.0, type=float, help='clip at this max norm of gradient') 21 | parser.add_argument('-v', '--value_scale', default=0.5, type=float, help='scale of value function regression in loss') 22 | parser.add_argument('-t', '--entropy_scale', default=0, type=float, help='scale of entropy penalty in loss') 23 | parser.add_argument('-m', '--max_steps', default=10000, type=int, help='max number of steps to run for') 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | # ----------------------------------------------------------------------------- 28 | def process_frame(frame): 29 | """ Atari specific preprocessing, consistent with DeepMind """ 30 | reshaped_screen = frame.astype(np.float32).mean(2) # grayscale 31 | resized_screen = imresize(reshaped_screen, (84, 110)) # downsample 32 | x = resized_screen[18:102, :] # crop top/bottom 33 | x = imresize(x, (42, 42)).astype(np.float32) # downsample 34 | x *= (1.0 / 255.0) # place in [0,1] 35 | x = np.reshape(x, [42, 42, 1]) # introduce channel 36 | return x 
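# NOTE: process_frame() returns a (42, 42, 1) float32 array scaled into [0, 1];
# rollout() below concatenates the previous and current processed frames along
# the channel axis, which is why the input placeholder x defined later has
# shape (None, 42, 42, 2).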
37 | 38 | def policy_spec(x): 39 | net = slim.conv2d(x, args.hidden_size, [5, 5], stride=2, padding='SAME', activation_fn=tf.nn.elu, scope='conv1') 40 | net = slim.conv2d(net, args.hidden_size, [5, 5], stride=2, padding='SAME', activation_fn=tf.nn.elu, scope='conv2') 41 | net = slim.flatten(net) 42 | action_logits = slim.fully_connected(net, num_actions, activation_fn=None, scope='fc_act') 43 | value_function = slim.fully_connected(net, 1, activation_fn=None, scope='fc_value') 44 | return action_logits, value_function 45 | 46 | def rollout(n, max_steps_per_episode=4500): 47 | """ gather a single episode with current policy """ 48 | 49 | observations, actions, rewards, discounted_rewards = [], [], [], [] 50 | ob = env.reset() 51 | ep_steps = 0 52 | num_episodes = 0 53 | ep_start_pointer = 0 54 | prev_obf = None 55 | while True: 56 | 57 | # we concatenate the previous frame to get some motion information 58 | obf_now = process_frame(ob) 59 | obf_before = obf_now if prev_obf is None else prev_obf 60 | obf = np.concatenate((obf_before, obf_now), axis=2) 61 | #obf = obf_now - obf_before 62 | prev_obf = obf_now 63 | 64 | # run the policy 65 | action = sess.run(action_index, feed_dict={x: np.expand_dims(obf, 0)}) # intro a batch dim 66 | action = action[0][0] # strip batch and #of samples from tf.multinomial 67 | 68 | # execute the action 69 | ob, reward, done, info = env.step(action) 70 | ep_steps += 1 71 | 72 | observations.append(obf) 73 | actions.append(action) 74 | rewards.append(reward) 75 | 76 | if done or ep_steps >= max_steps_per_episode: 77 | num_episodes += 1 78 | ep_steps = 0 79 | prev_obf = None 80 | discounted_rewards.append(discount(rewards[ep_start_pointer:], args.discount)) 81 | ep_start_pointer = len(rewards) 82 | ob = env.reset() 83 | if len(rewards) >= n: break 84 | 85 | return np.stack(observations), np.stack(actions), np.stack(rewards), np.concatenate(discounted_rewards), {'num_episodes':num_episodes} 86 | 87 | def discount(x, gamma): 88 | return lfilter([1],[1,-gamma],x[::-1])[::-1] 89 | # ----------------------------------------------------------------------------- 90 | 91 | # create the environment 92 | env = gym.make(args.env) 93 | num_actions = env.action_space.n 94 | 95 | # compile the model 96 | x = tf.placeholder(tf.float32, (None,) + (42,42,2), name='x') 97 | action_logits, value_function = policy_spec(x) 98 | action_index = tf.multinomial(action_logits - tf.reduce_max(action_logits, 1, keep_dims=True), 1) # take 1 sample 99 | # compile the loss: 1) the policy gradient 100 | sampled_actions = tf.placeholder(tf.int32, (None,), name='sampled_actions') 101 | discounted_reward = tf.placeholder(tf.float32, (None,), name='discounted_reward') 102 | pg_loss = tf.reduce_mean((discounted_reward - value_function) * tf.nn.sparse_softmax_cross_entropy_with_logits(logits=action_logits, labels=sampled_actions)) 103 | # and 2) the baseline (value function) regression piece 104 | value_loss = args.value_scale * tf.reduce_mean(tf.square(discounted_reward - value_function)) 105 | # and 3) entropy regularization 106 | action_log_prob = tf.nn.log_softmax(action_logits) 107 | entropy_loss = -args.entropy_scale * tf.reduce_sum(action_log_prob*tf.exp(action_log_prob)) 108 | # add up and minimize 109 | loss = pg_loss + value_loss + entropy_loss 110 | # create the optimizer 111 | optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) 112 | grads = tf.gradients(loss, tf.trainable_variables()) 113 | grads, _ = tf.clip_by_global_norm(grads, args.gradient_clip) # gradient clipping 
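# clip_by_global_norm rescales the whole gradient list when its global norm
# exceeds args.gradient_clip (40.0 by default here): every gradient is scaled
# by gradient_clip / max(global_norm, gradient_clip), preserving the update
# direction while bounding its overall magnitude.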
114 | grads_and_vars = list(zip(grads, tf.trainable_variables())) 115 | train_op = optimizer.apply_gradients(grads_and_vars) 116 | 117 | # tf init 118 | sess = tf.Session() 119 | sess.run(tf.initialize_all_variables()) 120 | n = 0 121 | mean_rewards = [] 122 | while n <= 100: # loop forever 123 | n += 1 124 | 125 | # collect a batch of data from rollouts and do forward/backward/update 126 | t0 = time.time() 127 | observations, actions, rewards, discounted_reward_np, info = rollout(args.batch_size) 128 | t1 = time.time() 129 | sess.run(train_op, feed_dict={x:observations, sampled_actions:actions, discounted_reward:discounted_reward_np}) 130 | t2 = time.time() 131 | 132 | average_reward = np.sum(rewards)/info['num_episodes'] 133 | mean_rewards.append(average_reward) 134 | print('step %d: collected %d frames in %fs, mean episode reward = %f (%d eps), update in %fs' % \ 135 | (n, observations.shape[0], t1-t0, average_reward, info['num_episodes'], t2-t1)) 136 | 137 | print(args) 138 | print('total average reward: %f +/- %f (min %f, max %f)' % \ 139 | (np.mean(mean_rewards), np.std(mean_rewards), np.min(mean_rewards), np.max(mean_rewards))) 140 | -------------------------------------------------------------------------------- /simple_dqn/naive_nips_dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import tensorflow as tf 3 | import tensorflow.contrib.slim as slim 4 | import numpy as np 5 | from scipy.misc import imresize 6 | from collections import deque 7 | import sys 8 | import random 9 | 10 | INITIAL_EPSILON = 0.1 11 | FINAL_EPSILON = 0.05 12 | REPLAY_MEMORY = 10000 13 | max_episodes = 5 14 | BATCH = 2 15 | GAMMA = 0.99 16 | TRAIN = 1 17 | 18 | 19 | def weight_variable(shape): 20 | initial = tf.truncated_normal(shape, stddev = 0.01) 21 | return tf.Variable(initial) 22 | 23 | def bias_variable(shape): 24 | initial = tf.constant(0.01, shape = shape) 25 | return tf.Variable(initial) 26 | 27 | def conv2d(x, W, stride): 28 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") 29 | 30 | def max_pool_2x2(x): 31 | return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 32 | 33 | def rgb2gray(frame): 34 | 35 | r, g, b = frame[:,:,0], frame[:,:,1], frame[:,:,2] 36 | gray = 0.2989 * r + 0.5870 * g + 0.1140 * b 37 | 38 | return gray 39 | 40 | def preprocess(frame): 41 | 42 | gray_image = rgb2gray(frame) 43 | reshaped_image = cv2.resize(gray_image.astype(np.float32), (84, 84)) 44 | x = np.reshape(reshaped_image, [84,84,1]) 45 | x *= 1 / 255.0 46 | 47 | return x 48 | 49 | class q_network(): 50 | 51 | def __init__(self, scope): 52 | 53 | self.scope = scope 54 | with tf.variable_scope(self.scope): 55 | 56 | self.build_net() 57 | 58 | 59 | def build_net(): 60 | 61 | x = tf.placeholder("float", [None, 84, 84, 4]) 62 | #print(x.shape) 63 | conv1 = tf.layers.conv2d(x, 32, [5, 5], padding="same", activation=tf.nn.relu) 64 | print(conv1.shape) 65 | pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) 66 | conv2 = tf.layers.conv2d(pool1, 64, [5, 5], padding="same", activation=tf.nn.relu) 67 | pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) 68 | print("passes pool2", pool2.shape) 69 | #pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64 * 9]) 70 | pool2_flat = tf.reshape(pool2, [-1, 64]) 71 | dense = tf.layers.dense(inputs=pool2_flat, units=512, activation=tf.nn.relu) 72 | #print("passes dense", dense.shape) 73 | action_logits = tf.layers.dense(inputs=dense, 
units=ACTIONS) 74 | print("passes logits", action_logits.shape) 75 | #conv1 = slim.conv2d(x, 10, [5,5], stride=2, padding='SAME', activation_fn=tf.nn.relu) 76 | #conv2 = slim.conv2d(conv1, 10, [5,5], stride=2, padding='SAME', activation_fn=tf.nn.relu) 77 | #net = slim.flatten(conv2) 78 | #action_logits = slim.fully_connected(net, ACTIONS, activation_fn=None) 79 | 80 | return action_logits 81 | 82 | 83 | def rollout(sess, max_iter=5000): 84 | 85 | observations, actions, rewards = [], [], [] 86 | ob = env.reset() 87 | ep_steps = 0 88 | num_episodes = 0 89 | epsilon = INITIAL_EPSILON 90 | 91 | ob_now = preprocess(ob) 92 | ob_prev = None 93 | t=0 94 | 95 | D = deque() 96 | a = tf.placeholder("float", [None, ACTIONS]) 97 | y = tf.placeholder("float", [None]) 98 | #s = tf.placeholder("float", [None, 84, 84, 2]) 99 | s = tf.placeholder("float", [None, 4, 4, 2]) 100 | 101 | readout_action = tf.reduce_sum(tf.multiply(dqn(s), a), reduction_indices = 1) 102 | cost = tf.reduce_mean(tf.square(y - readout_action)) 103 | train_step = tf.train.AdamOptimizer(1e-6).minimize(cost) 104 | 105 | sess.run(tf.initialize_all_variables()) 106 | 107 | while True or num_episodes < max_episodes: 108 | 109 | ob_before = ob_now if ob_prev is None else ob_prev 110 | obf_prev = np.concatenate((ob_before, ob_now), 2) 111 | ob_prev = ob_now 112 | 113 | action_index = 0 114 | action = np.zeros(ACTIONS, np.int32) 115 | print("action is ", action) 116 | if random.random() <= epsilon: 117 | action_index = random.randrange(ACTIONS) 118 | action[action_index] = 1 119 | else: 120 | action_index = np.argmax(dqn(obf_prev.astype(np.float32))) 121 | action[action_index] = 1 122 | 123 | if epsilon > FINAL_EPSILON: 124 | epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / 1000 125 | 126 | ob, reward, done, info = env.step(np.argmax(action)) 127 | ep_steps += 1 128 | 129 | ob_now = preprocess(ob) 130 | ob_before = ob_now if ob_prev is None else ob_prev 131 | obf_now = np.concatenate((ob_before, ob_now), 2) 132 | 133 | #observations.append(ob) 134 | #actions.append(action) 135 | #rewards.append(reward) 136 | 137 | D.append((obf_prev, action, reward, obf_now, done)) 138 | if len(D) > REPLAY_MEMORY: 139 | D.popleft() 140 | if t >= TRAIN: 141 | #training starts 142 | minibatch = random.sample(D, BATCH) 143 | 144 | # get the batch variables 145 | obf_prev_batch = [d[0] for d in minibatch] 146 | action_batch = [d[1] for d in minibatch] 147 | reward_batch = [d[2] for d in minibatch] 148 | obf_now_batch = [d[3] for d in minibatch] 149 | 150 | target_batch = [] 151 | #obf_batch = np.concatenate(ob_before, ob_now), 2) 152 | for i in range(0, len(minibatch)): 153 | 154 | if minibatch[i][4]: 155 | target_batch.append(reward_batch[i]) 156 | 157 | else: 158 | print("obf_prev_batch shape ", len(obf_prev_batch)) 159 | target_batch.append(reward_batch[i] + GAMMA*sess.run(tf.reduce_max(dqn(obf_now_batch[i].astype(np.float32))))) 160 | 161 | #print("obf_prev_batch", obf_prev_batch) 162 | obff = np.zeros((len(obf_prev_batch), 4, 4, 2)) 163 | for i,x in enumerate(obf_prev_batch): 164 | obff[i] = x 165 | #readout_t = s.eval(feed_dict = {s : obff})[0] 166 | print("reward", reward_batch) 167 | print("target_batch", target_batch[0]) 168 | target = np.zeros((len(target_batch))) 169 | for i,x in enumerate(target_batch): 170 | target[i] = x 171 | 172 | #print("reward", reward_batch[1]) 173 | train_step.run(feed_dict = { 174 | s : obff, 175 | a : action_batch, 176 | y : target}) 177 | 178 | 179 | ob_prev = ob_now 180 | t += 1 181 | 182 | if done or ep_steps >= max_iter: 
183 | num_episodes += 1 184 | ep_steps = 0 185 | ob_prev = None 186 | ob = env.reset() 187 | 188 | env = gym.make('Pong-v0') 189 | 190 | #x = tf.placeholder(tf.float32, name ) 191 | #sampled_actions = tf.placeholder(tf.int32) 192 | #discounted_reward = tf.placeholder(tf.float32) 193 | 194 | #action_logits = dqn(x) 195 | ACTIONS = env.action_space.n 196 | sess = tf.InteractiveSession() 197 | #s, action_logits = dqn() 198 | rollout(sess) 199 | #print(env.action_space.n) 200 | for i_episode in range(1): 201 | observation = env.reset() 202 | ob = preprocess(observation) 203 | obf = [] 204 | print(ob.shape) 205 | for t in range(10000): 206 | env.render() 207 | #print(observation) 208 | action = env.action_space.sample() 209 | observation, reward, done, info = env.step(action) 210 | obf.append(preprocess(observation)) 211 | #print(action) 212 | if done == True: 213 | print("Episode finished") 214 | break 215 | 216 | #print(obf[0]) 217 | #print(len(obf)) 218 | #obff = np.zeros((len(obf), 84, 84, 1)) 219 | #for i,x in enumerate(obf): 220 | # obff[i] = x 221 | #print(obff) 222 | -------------------------------------------------------------------------------- /simple_dqn/nature_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import gym 3 | from gym import wrappers 4 | import tensorflow as tf 5 | import tensorflow.contrib.slim as slim 6 | import numpy as np 7 | from scipy.misc import imresize 8 | from collections import deque 9 | import sys 10 | import os 11 | import random 12 | import cv2 13 | 14 | flags = tf.app.flags 15 | flags.DEFINE_boolean('train', True, 'Whether to do training or testing') 16 | flags.DEFINE_string('env_name', 'Pong', 'The name of gym environment to use') 17 | 18 | env = gym.make(flags.FLAGS.env_name + 'NoFrameskip-v0') 19 | 20 | ACTIONS = env.action_space.n 21 | INITIAL_EPSILON = 1. 
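# epsilon is annealed linearly inside rollout(): starting from INITIAL_EPSILON,
# it is reduced by (INITIAL_EPSILON - FINAL_EPSILON) / 100000 on every
# environment step until it reaches FINAL_EPSILON.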
22 | FINAL_EPSILON = 0.05 23 | REPLAY_MEMORY = 1000000 24 | max_episodes = 100000 25 | BATCH = 32 26 | GAMMA = 0.99 27 | max_iter = 5000 28 | 29 | def rgb2gray(frame): 30 | 31 | r, g, b = frame[:,:,0], frame[:,:,1], frame[:,:,2] 32 | gray = 0.2989 * r + 0.5870 * g + 0.1140 * b 33 | 34 | return gray 35 | 36 | def preprocess(frame): 37 | 38 | gray_image = rgb2gray(frame) 39 | reshaped_image = cv2.resize(gray_image.astype(np.float32), (84, 84)) 40 | x = np.reshape(reshaped_image, [84,84,1]) 41 | x *= 1 / 255.0 42 | 43 | return x 44 | 45 | def weight_variable(name, shape): 46 | initial = tf.contrib.layers.xavier_initializer() 47 | return tf.get_variable(name = name, shape = shape, initializer = initial) 48 | 49 | def bias_variable(shape): 50 | initial = tf.constant(0.01, shape = shape) 51 | return tf.Variable(initial) 52 | 53 | def conv2d(x, W, stride): 54 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 55 | 56 | def max_pool_2x2(x): 57 | return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 58 | 59 | class dqn(): 60 | 61 | def __init__(self, clip_delta, scope, discount): 62 | 63 | self.clip_delta = clip_delta 64 | self.scope = scope 65 | self.discount = discount 66 | 67 | with tf.variable_scope(self.scope): 68 | 69 | self.net = self.build_net() 70 | 71 | self.y = tf.placeholder("float", [None]) 72 | self.diff = self.y - tf.reduce_max(self.net, axis = 1) 73 | 74 | if self.clip_delta > 0: 75 | quadratic_part = tf.minimum(abs(self.diff), self.clip_delta) 76 | linear_part = abs(self.diff) - quadratic_part 77 | self.loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part 78 | else: 79 | self.loss = 0.5 * self.diff ** 2 80 | 81 | self.loss = tf.reduce_mean(self.loss) 82 | self.train_step = tf.train.AdamOptimizer(0.00025).minimize(self.loss) 83 | self.summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph()) 84 | self.summaries = tf.summary.merge_all(tf.summary.scalar("loss", self.loss)) 85 | 86 | def build_net(self, ): 87 | 88 | # input - Batch X 84 X 84 X 4 89 | self.s = tf.placeholder("float", [None, 84, 84, 4]) 90 | 91 | # 8 X 8 X 4 x 32 - stride 4 92 | W_conv1 = weight_variable("w1", [8, 8, 4, 32]) 93 | b_conv1 = bias_variable([32]) 94 | 95 | # 4 X 4 X 32 x 64 - stride 2 96 | W_conv2 = weight_variable("w2", [4, 4, 32, 64]) 97 | b_conv2 = bias_variable([64]) 98 | 99 | # 3 X 3 X 64 x 64 - stride 1 100 | W_conv3 = weight_variable("w3", [3, 3, 64, 64]) 101 | b_conv3 = bias_variable([64]) 102 | 103 | # 3*3*64 ie. flattened output from conv3 104 | W_fc1 = weight_variable("w4",[3136, 512]) 105 | b_fc1 = bias_variable([512]) 106 | 107 | W_fc2 = weight_variable("w5",[512, ACTIONS]) 108 | b_fc2 = bias_variable([ACTIONS]) 109 | 110 | conv1 = tf.nn.relu(conv2d(self.s, W_conv1, 4) + b_conv1) 111 | conv2 = tf.nn.relu(conv2d(conv1, W_conv2, 2) + b_conv2) 112 | conv3 = tf.nn.relu(conv2d(conv2, W_conv3, 1) + b_conv3) 113 | 114 | # flatten the output from conv3 layer 115 | conv3_flat = tf.reshape(conv3, [-1, 3136]) 116 | 117 | # add two fully connected layers 118 | fc1 = tf.nn.relu(tf.matmul(conv3_flat, W_fc1) + b_fc1) 119 | out_fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2) 120 | 121 | return out_fc2 122 | 123 | def copy_model_parameters(sess, estimator1, estimator2): 124 | """ 125 | Copies the model parameters of one estimator to another. 
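Here it is used in rollout() to refresh the target network from the online Q-network every 10000 global steps.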
126 | Args: 127 | sess: Tensorflow session instance 128 | estimator1: Estimator to copy the paramters from 129 | estimator2: Estimator to copy the parameters to 130 | """ 131 | e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)] 132 | e1_params = sorted(e1_params, key=lambda v: v.name) 133 | e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)] 134 | e2_params = sorted(e2_params, key=lambda v: v.name) 135 | 136 | update_ops = [] 137 | for e1_v, e2_v in zip(e1_params, e2_params): 138 | op = e2_v.assign(e1_v) 139 | update_ops.append(op) 140 | 141 | sess.run(update_ops) 142 | 143 | def rollout(sess, q_network, target_network): 144 | 145 | merged_summary_op = tf.summary.merge_all() 146 | q_summary = tf.Summary() 147 | 148 | num_episodes = 0 149 | epsilon = INITIAL_EPSILON 150 | 151 | replay_memory = deque() 152 | 153 | sess.run(tf.initialize_all_variables()) 154 | saver = tf.train.Saver(tf.all_variables()) 155 | 156 | checkpoint = tf.train.latest_checkpoint(checkpoint_dir) 157 | checkpoint_path = os.path.join(checkpoint_dir, "model") 158 | if checkpoint: 159 | saver.restore(sess, checkpoint) 160 | print("Loaded model checkpoint {}...".format(checkpoint)) 161 | 162 | print("collecting initial rollouts...") 163 | i = 0 164 | global_step = 0 165 | 166 | while num_episodes < max_episodes: 167 | 168 | ob = env.reset() 169 | steps_per_episode = 0 170 | reward_per_episode = 0 171 | 172 | ob_flkr = preprocess(ob) 173 | obf_flkr = np.reshape(np.stack((ob_flkr, ob_flkr), axis=2), (84, 84, 2)) 174 | obf = np.amax((obf_flkr[:,:,0], obf_flkr[:,:,1]), (0)) 175 | state = np.reshape(np.stack((obf, obf, obf, obf), axis=2), (84, 84, 4)) 176 | 177 | action_index = np.argmax(sess.run(q_network.net, feed_dict = {q_network.s : state.reshape((1, 84, 84, 4)) })) 178 | loss_per_episode = 0 179 | reward_per_episode = 0 180 | 181 | for t in range(10000): 182 | 183 | if epsilon > FINAL_EPSILON: 184 | epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / 100000 185 | 186 | ob, reward, done, info = env.step(action_index) 187 | 188 | REWARD = reward 189 | if reward > 1: 190 | REWARD = 1 191 | if reward < -1: 192 | REWARD = -1 193 | 194 | reward_per_episode += reward 195 | ob_flkr = preprocess(ob) 196 | obf_flkr = np.append(ob_flkr, obf_flkr[:,:,0:1], axis = 2) 197 | obf = np.amax((obf_flkr[:,:,0], obf_flkr[:,:,1]), (0)).reshape((84,84,1)) 198 | 199 | 200 | if i == 3: 201 | action_index = 0 202 | action = np.zeros(ACTIONS, np.int32) 203 | 204 | if random.random() <= epsilon: 205 | action_index = random.randrange(ACTIONS) 206 | action[action_index] = 1 207 | else: 208 | action_index = np.argmax(sess.run(q_network.net, feed_dict = {q_network.s : state.reshape((1, 84, 84, 4)) })) 209 | action[action_index] = 1 210 | 211 | next_state = np.append(obf, state[:,:,0:3], axis = 2) 212 | 213 | replay_memory.append((state, action, REWARD, next_state, done)) 214 | 215 | if len(replay_memory) > REPLAY_MEMORY: 216 | replay_memory.popleft() 217 | 218 | if global_step > 50: 219 | 220 | #training starts 221 | minibatch = random.sample(replay_memory, BATCH) 222 | 223 | # get the batch variables 224 | state_batch = [d[0] for d in minibatch] 225 | action_batch = [d[1] for d in minibatch] 226 | reward_batch = [d[2] for d in minibatch] 227 | next_state_batch = [d[3] for d in minibatch] 228 | done_batch = [d[4] for d in minibatch] 229 | 230 | next_q_value = sess.run(target_network.net, feed_dict = {target_network.s : next_state_batch}) 231 | q_value = sess.run(q_network.net, 
feed_dict = {q_network.s : state_batch}) 232 | 233 | target_batch = np.asarray(reward_batch) + q_network.discount * \ 234 | (np.ones_like(done_batch) - done_batch) * \ 235 | np.max(next_q_value, axis=1) 236 | 237 | _, loss = sess.run([q_network.train_step, q_network.loss], feed_dict = { \ 238 | q_network.s : state_batch, \ 239 | q_network.y : target_batch}) 240 | 241 | print("\riteration {} @ episode {}/{}".format(global_step, num_episodes, max_episodes), end="") 242 | sys.stdout.flush() 243 | 244 | if global_step % 10000 == 0: 245 | print("\nsaving model now") 246 | saver.save(sess, checkpoint_path) 247 | print("\nupdating target network...") 248 | copy_model_parameters(sess, q_network, target_network) 249 | 250 | steps_per_episode += 1 251 | loss_per_episode += loss 252 | 253 | global_step += 1 254 | state = next_state 255 | 256 | i += 1 257 | 258 | if i == 4: 259 | i = 0 260 | 261 | if done or steps_per_episode >= max_iter: 262 | num_episodes += 1 263 | if global_step > 50: 264 | print("\nloss per episode {}".format(loss_per_episode / steps_per_episode)) 265 | print("\nreward per episode {}".format(reward_per_episode)) 266 | Q = np.amax(sess.run(q_network.net, feed_dict = {q_network.s : state.reshape((1, 84, 84, 4)) })) 267 | 268 | q_summary.value.add(simple_value=steps_per_episode, node_name="episode_lengths", tag="episode_lengths") 269 | q_summary.value.add(simple_value=Q, node_name="q_value", tag="q_value") 270 | q_summary.value.add(simple_value=reward_per_episode, node_name="episode_reward", tag="episode_reward") 271 | q_network.summary_writer.add_summary(q_summary, global_step) 272 | q_network.summary_writer.flush() 273 | 274 | ob = env.reset() 275 | break 276 | 277 | monitor_dir = os.path.abspath("./{}-experiment/".format(flags.FLAGS.env_name)) 278 | checkpoint_dir = os.path.abspath("./dqn/") 279 | logs_path = os.path.abspath("./tensorboard_example/") 280 | 281 | env = wrappers.Monitor(env, monitor_dir, force=True) 282 | 283 | #load_path='/home/manan/Downloads/models/pong.ckpt-2920000-2940000' 284 | #save_path = '/home/manan/Downloads/models2/pong.ckpt' 285 | 286 | sess = tf.InteractiveSession() 287 | 288 | q_network = dqn(1.0, scope="q_net", discount=0.99) 289 | target_network = dqn(1.0, scope="target_network", discount=0.99) 290 | 291 | rollout(sess, q_network, target_network) 292 | -------------------------------------------------------------------------------- /weight_conversion/tf_pre_model.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential 2 | from keras.layers import Dense, Activation, Flatten, Convolution2D 3 | 4 | class tensorflow_model: 5 | def __init__(self, ): 6 | 7 | model = Sequential() 8 | model.add(Convolution2D(32, 8, strides=(4,4), input_shape=(84, 84, 4), data_format="channels_last")) 9 | #model.layers[1].set_weights(param_values["w1"]) 10 | model.add(Activation('relu')) 11 | model.add(Convolution2D(64, 4, strides=(2,2))) 12 | model.add(Activation('relu')) 13 | model.add(Convolution2D(64, 3, strides=(1,1))) 14 | model.add(Activation('relu')) 15 | model.add(Flatten()) 16 | model.add(Dense(512)) 17 | model.add(Activation('relu')) 18 | model.add(Dense(6)) 19 | -------------------------------------------------------------------------------- /weight_conversion/tf_pre_model.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/weight_conversion/tf_pre_model.pyc -------------------------------------------------------------------------------- /weight_conversion/th2tf_weights.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | from keras import backend as K 5 | from keras.utils.layer_utils import convert_all_kernels_in_model 6 | 7 | ''' IMPORT YOUR SCRIPT FILE HERE TO CREATE YOUR MODEL LATER ''' 8 | from th_pre_model import theano_model 9 | from tf_pre_model import tensorflow_model 10 | 11 | ''' BACKEND must be TENSORFLOW 12 | This is a script to convert Theano models (Theano Backend, TH dim ordering) 13 | to the other possible backend / dim ordering combinations. 14 | Given weights and model for TH-kernels-TH-dim-ordering, produces a folder with 15 | - TH-kernels-TF-dim-ordering 16 | - TF-kernels-TH-dim-ordering 17 | - TF-kernels-TF-dim-ordering 18 | Needs 3 important inputs: 19 | 1) Theano model (model with TH dim ordering) 20 | 2) Tensorflow model (model with TF dim ordering) 21 | 3) Weight file for Theano model (theano-kernels-th-dim-ordering) 22 | Supports : Multiple weights for same model (auto converts different weights for same model) 23 | Usage: 24 | 1) Place script in the same directory as the weight file directory. If you want to place somewhere 25 | else, then you must provide absolute path to the weight files below instead of relative paths. 26 | 2) Edit the script to create your model : 27 | a) Import your model building script above (in the imports section) 28 | b) Set `th_dim_model` = ... (create your th dim model here and set it to th_dim_model) 29 | c) Set `tf_dim_model` = ... (create your tf dim model here and set it to tf_dim_model) 30 | d) Add the path to the weight files in `model_weights`. 31 | Note : The weight files must be for the Theano model (theano kernels, th dim ordering) 32 | 3) Run the script. 33 | 4) Use the weight files in the created folders : ["tf-kernels-tf-dim/", "tf-kernels-th-dim/", "th-kernels-tf-dim/"] 34 | ''' 35 | 36 | K.set_image_dim_ordering('th') 37 | th_dim_model = theano_model() # Create your theano model here with TH dim ordering 38 | 39 | K.set_image_dim_ordering('tf') 40 | tf_dim_model = tensorflow_model() # Create your tensorflow model with TF dimordering here 41 | 42 | model_weights = ['theano_weights.h5f'] # Add names of theano model weight file paths here. 43 | # These weights are assumed to be for theano backend 44 | # (th kernels) with th dim ordering! 45 | # ('w1 shape', (32, 4, 8, 8)) 46 | 47 | """ 48 | No need to edit anything below this. Simply run the script now after 49 | editing the above 3 inputs. 
50 | """ 51 | 52 | 53 | def shuffle_rows(original_w, nb_last_conv, nb_rows_dense): 54 | ''' Note : 55 | This algorithm to shuffle dense layer rows was provided by Kent Sommers (@kentsommer) 56 | in a gist : https://gist.github.com/kentsommer/e872f65926f1a607b94c2b464a63d0d3 57 | ''' 58 | converted_w = np.zeros(original_w.shape) 59 | count = 0 60 | for index in range(original_w.shape[0]): 61 | if (index % nb_last_conv) == 0 and index != 0: 62 | count += 1 63 | new_index = ((index % nb_last_conv) * nb_rows_dense) + count 64 | print("index from " + str(index) + " -> " + str(new_index)) 65 | converted_w[index] = original_w[new_index] 66 | 67 | return converted_w 68 | 69 | 70 | first_dense = True 71 | nb_last_conv = 0 72 | 73 | for dirpath in ["tf-kernels-tf-dim-ordering/", "tf-kernels-th-dim-ordering/", "th-kernels-tf-dim-ordering/"]: 74 | if not os.path.exists(dirpath): 75 | os.makedirs(dirpath) 76 | 77 | # Converts (theano kernels, th dim ordering) to (tensorflow kernels, th dim ordering) 78 | K.set_image_dim_ordering('tf') 79 | for weight_fn in model_weights: 80 | th_dim_model.load_weights(weight_fn) 81 | convert_all_kernels_in_model(th_dim_model) 82 | 83 | th_dim_model.save_weights("tf-kernels-th-dim-ordering/%s" % weight_fn, overwrite=True) 84 | print("Done tf-kernels-th-dim %s" % weight_fn) 85 | 86 | 87 | # Converts (theano kernels, th dim ordering) to (tensorflow kernels, tf dim ordering) 88 | K.set_image_dim_ordering('th') 89 | for weight_fn in model_weights: 90 | th_dim_model.load_weights(weight_fn) # th-kernels-th-dim 91 | convert_all_kernels_in_model(th_dim_model) # tf-kernels-th-dim 92 | 93 | count_dense = 0 94 | for layer in th_dim_model.layers: 95 | if layer.__class__.__name__ == "Dense": 96 | count_dense += 1 97 | 98 | if count_dense == 1: 99 | first_dense = False # If there is only 1 dense, no need to perform row shuffle in Dense layer 100 | 101 | print("Nb layers : ", len(th_dim_model.layers)) 102 | 103 | for index, th_layer in enumerate(th_dim_model.layers): 104 | if th_layer.__class__.__name__ in ['Convolution1D', 105 | 'Convolution2D', 106 | 'Convolution3D', 107 | 'AtrousConvolution2D', 108 | 'Deconvolution2D']: 109 | weights = th_layer.get_weights() # tf-kernels-th-dim 110 | weights[0] = weights[0].transpose((2, 3, 1, 0)) 111 | tf_dim_model.layers[index].set_weights(weights) # tf-kernels-tf-dim 112 | 113 | nb_last_conv = th_layer.nb_filter # preserve last number of convolutions to use with dense layers 114 | print("Converted layer %d : %s" % (index + 1, th_layer.name)) 115 | else: 116 | if th_layer.__class__.__name__ == "Dense" and first_dense: 117 | weights = th_layer.get_weights() 118 | nb_rows_dense_layer = weights[0].shape[0] // nb_last_conv 119 | 120 | print("Magic Number 1 : ", nb_last_conv) 121 | print("Magic nunber 2 : ", nb_rows_dense_layer) 122 | 123 | weights[0] = shuffle_rows(weights[0], nb_last_conv, nb_rows_dense_layer) 124 | tf_dim_model.layers[index].set_weights(weights) 125 | 126 | first_dense = False 127 | print("Shuffled Dense Weights layer and saved %d : %s" % (index + 1, th_layer.name)) 128 | else: 129 | tf_dim_model.layers[index].set_weights(th_layer.get_weights()) 130 | print("Saved layer %d : %s" % (index + 1, th_layer.name)) 131 | 132 | 133 | tf_dim_model.save_weights("tf-kernels-tf-dim-ordering/%s" % weight_fn, overwrite=True) 134 | print("Done tf-kernels-tf-dim %s" % weight_fn) 135 | 136 | 137 | # Converts (theano kernels, th dim ordering) to (theano kernels, tf dim ordering) 138 | for weight_fn in model_weights: 139 | 
th_dim_model.load_weights(weight_fn) 140 | 141 | for index, th_layer in enumerate(th_dim_model.layers): 142 | if th_layer.__class__.__name__ in ['Convolution1D', 143 | 'Convolution2D', 144 | 'Convolution3D', 145 | 'AtrousConvolution2D', 146 | 'Deconvolution2D']: 147 | weights = th_layer.get_weights() 148 | weights[0] = weights[0].transpose((2, 3, 1, 0)) 149 | tf_dim_model.layers[index].set_weights(weights) 150 | else: 151 | tf_dim_model.layers[index].set_weights(th_layer.get_weights()) 152 | 153 | print("Changed dim %d : %s" % (index + 1, th_layer.name)) 154 | 155 | tf_dim_model.save_weights("th-kernels-tf-dim-ordering/%s" % weight_fn, overwrite=True) 156 | print("Done th-kernels-tf-dim %s" % weight_fn) 157 | -------------------------------------------------------------------------------- /weight_conversion/th_pre_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["KERAS_BACKEND"] = "theano" # must be set before keras is imported 3 | from keras.models import Sequential 4 | from keras.layers import Dense, Activation, Flatten, Convolution2D 5 | 6 | 7 | def theano_model(): 8 | 9 | model = Sequential() 10 | model.add(Convolution2D(32, 8, strides=(4,4), input_shape=(4, 84, 84), data_format="channels_first")) 11 | model.add(Activation('relu')) 12 | model.add(Convolution2D(64, 4, strides=(2,2))) 13 | model.add(Activation('relu')) 14 | model.add(Convolution2D(64, 3, strides=(1,1))) 15 | model.add(Activation('relu')) 16 | model.add(Flatten()) 17 | model.add(Dense(512)) 18 | model.add(Activation('relu')) 19 | model.add(Dense(6)) 20 | return model 21 | -------------------------------------------------------------------------------- /weight_conversion/th_pre_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/weight_conversion/th_pre_model.pyc -------------------------------------------------------------------------------- /weight_conversion/theano_params.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['THEANO_FLAGS'] = "device=gpu0" 3 | 4 | import theano 5 | import numpy 6 | import pickle 7 | 8 | pkl = open('./pong_dqn_v4_reg_0.01/network_file_50.pkl', 'rb') 9 | data = pickle.load(pkl) 10 | 11 | import lasagne 12 | 13 | params = lasagne.layers.get_all_params(data.l_out) 14 | param_values = {} 15 | 16 | id = 0 17 | 18 | for p in params: 19 | 20 | if str(p) == "W": 21 | param_values["w%d" %(id+1)] = p.get_value().T 22 | 23 | if str(p) == "b": 24 | param_values["b%d" %(id+1)] = p.get_value() # store biases under their own key so they do not overwrite the weights 25 | id += 1 26 | 27 | pkl.close() 28 | pkl = open('./pong_dqn_v4_reg_0.01/network_params.pkl', 'wb') 29 | pickle.dump(param_values, pkl) 30 | 31 | pkl.close() 32 | -------------------------------------------------------------------------------- /weight_conversion/theano_weights.h5f: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/weight_conversion/theano_weights.h5f --------------------------------------------------------------------------------
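
A note on the layout conversion performed by th2tf_weights.py: conv kernels move from Theano's (out_channels, in_channels, rows, cols) layout to TensorFlow's (rows, cols, in_channels, out_channels) via transpose((2, 3, 1, 0)), and the first Dense layer after Flatten additionally needs its rows permuted, because channels-first and channels-last flattening enumerate the same features in different orders. The sketch below only spells out that index bookkeeping; it is not a drop-in replacement for shuffle_rows, the (64, 7, 7) conv output shape and 512-unit head are assumptions based on the DQN architecture used elsewhere in this repo, and the correlation/convolution kernel flip is handled separately by convert_all_kernels_in_model.

import numpy as np

def convert_conv_kernel(w_th):
    # Theano layout (out_ch, in_ch, rows, cols) -> TF layout (rows, cols, in_ch, out_ch),
    # the same transpose((2, 3, 1, 0)) used in th2tf_weights.py
    return w_th.transpose(2, 3, 1, 0)

def convert_first_dense(w_th, c, h, w):
    # rows of the first Dense weight matrix index the flattened conv output;
    # channels-first flattening orders features as (c, h, w), channels-last as (h, w, c),
    # so each row must move to the position its feature occupies after reordering
    w_tf = np.empty_like(w_th)
    for ci in range(c):
        for hi in range(h):
            for wi in range(w):
                w_tf[hi * w * c + wi * c + ci] = w_th[ci * h * w + hi * w + wi]
    return w_tf

# self-check with random data: both flatten orders should give the same Dense output
rng = np.random.RandomState(0)
c, h, w, units = 64, 7, 7, 512                          # assumed conv output / head size
feat = rng.rand(c, h, w).astype("float32")
w_th = rng.rand(c * h * w, units).astype("float32")
w_tf = convert_first_dense(w_th, c, h, w)
out_th = feat.reshape(-1).dot(w_th)                     # channels-first flatten
out_tf = feat.transpose(1, 2, 0).reshape(-1).dot(w_tf)  # channels-last flatten
print("max abs difference:", np.abs(out_th - out_tf).max())  # agrees up to floating-point rounding

shuffle_rows in th2tf_weights.py is meant to produce this kind of permutation with a running counter; the triple loop here simply makes the index mapping explicit at the cost of speed.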
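
Going back to the DQN training loop further up: the minibatch update regresses the online network onto a target built from the target network's Q-values, with the bootstrap term masked out on terminal transitions. The snippet below is only an illustrative restatement of that one step in plain numpy (the toy batch values are made up), not part of the repository code.

import numpy as np

def bellman_targets(rewards, next_q_values, dones, discount=0.99):
    # y = r + discount * max_a' Q_target(s', a'), with the bootstrap term
    # removed when the transition ended the episode
    terminal_mask = 1.0 - np.asarray(dones, dtype=np.float32)
    return np.asarray(rewards, dtype=np.float32) + discount * terminal_mask * next_q_values.max(axis=1)

# toy minibatch of three transitions with four actions each
next_q = np.array([[0.1, 0.5, 0.2, 0.0],
                   [1.0, 0.3, 0.2, 0.1],
                   [0.0, 0.0, 0.4, 0.2]], dtype=np.float32)
print(bellman_targets([1.0, 0.0, -1.0], next_q, [False, False, True]))
# -> [ 1.495  0.99  -1.  ]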