├── README.md ├── actor-mimic ├── actor-mimic.py └── train_AMN.py ├── dqn_reg ├── dqn_reg__v4_train.py ├── dqn_reg_v4.py └── dqn_reg_v4.pyc ├── dqn_reg_models ├── network_file_49.pkl ├── network_file_50.pkl ├── q_network_reg_v4.py └── q_network_reg_v4.pyc ├── frame_prediction_atari ├── ae_dqn.py ├── ae_random.py ├── tensorboard ├── test_autoencoder_alter.py └── test_multistep_autoencoder.py ├── images ├── image_screenshot8_24.05.2017.png ├── image_screenshot9_24.05.2017.png ├── image_screenshot_23.05.2017.png ├── prediction_screenshot10_24.05.2017.png ├── prediction_screenshot11_24.05.2017.png ├── prediction_screenshot12_25.05.2017.png ├── prediction_screenshot13_25.05.2017.png ├── prediction_screenshot13_26.05.2017.png └── prediction_screenshot14_26.05.2017.png ├── misc ├── caffe_atari_cnn.py └── policy_gradients.py ├── simple_dqn ├── naive_nips_dqn.py └── nature_dqn.py └── weight_conversion ├── tf_pre_model.py ├── tf_pre_model.pyc ├── th2tf_weights.py ├── th_pre_model.py ├── th_pre_model.pyc ├── theano_params.py └── theano_weights.h5f /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement-Learning 2 | Contains implementations of various deep RL algorithms and papers, including: 3 | 4 | 1. Human-level control through deep reinforcement learning (Nature, 2015) and Playing Atari with Deep Reinforcement Learning (https://arxiv.org/abs/1312.5602) 5 | 2. Action-Conditional Video Prediction using Deep Networks in Atari Games (https://arxiv.org/abs/1507.08750) 6 | 3. Actor-Mimic: Deep Multitask and Transfer Reinforcement Learning (https://arxiv.org/abs/1511.06342) 7 | -------------------------------------------------------------------------------- /actor-mimic/actor-mimic.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | This is an implementation of the Actor-Mimic Network (AMN) described in the 4 | Actor-Mimic paper. The network is designed to distill expert 5 | policies that are trained with model-based prediction. Therefore, to be 6 | consistent with the paper, the AMN uses the same architecture 7 | as the individual policies. 8 | 9 | Expert code flow: 10 | 11 | experiment.run --> agent.step --> agent._do_training --> network.train --> which basically calls a minibatch update 12 | | | ---> combine all these in one file which describes the environment, samples (probably another file) 13 | -------------------------------------------- and calls training updates, referencing to a train function in AMN class 14 | """ 15 | 16 | import tensorflow as tf 17 | import numpy as np 18 | import random 19 | 20 | TEMP = 5 21 | 22 | class AMN: 23 | 24 | # initiate tensorboard summaries 25 | def __init__(self): 26 | 27 | num_actions = tf.placeholder("uint8", ()) 28 | self.Q_val = self.build_net(num_actions) / TEMP 29 | 30 | # load weights for expert 1, 2, 3... 
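# The expert networks in this repo are Theano/Lasagne DQNs that train_AMN.py unpickles;
# their policies are supplied through the `teacher` placeholder defined below, and the
# AMN is regressed onto them with a cross-entropy loss over its temperature-scaled
# Q-values (Q_val / TEMP above), i.e. the policy-regression objective of the
# Actor-Mimic paper.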
31 | # one hot encoded vector 32 | teacher = tf.placeholder("float", [BATCH, num_actions]) 33 | 34 | loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(teacher, self.Q_val)) 35 | train_step = tf.train.AdamOptimizer(1e-4).minimize(cost) 36 | # sample an action from 37 | 38 | def train_step(true_policy, sampled_state, sampled_action, num_actions): 39 | 40 | train_step.run(feed_dict = { 41 | teacher : true_policy, 42 | state : sampled_state, 43 | action : sampled_action 44 | num_actions : num_actions}) 45 | 46 | loss = cost.eval(feed_dict = { 47 | teacher : true_policy, 48 | state : sampled_state, 49 | action : sampled_action 50 | num_actions : num_actions}) 51 | 52 | def weight_variable(name, shape): 53 | initial = tf.contrib.layers.xavier_initializer() 54 | return tf.get_variable(name, shape, initial) 55 | 56 | def bias_variable(shape): 57 | initial = tf.constant(0.1, shape = shape) 58 | return tf.Variable(initial) 59 | 60 | def conv2d(x, W, stride): 61 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") 62 | 63 | def build_net(self, num_actions): 64 | 65 | state = tf.placeholder("float", [BATCH, 8, 84, 84]) 66 | action = tf.placeholder("float", [BATCH, 4]) 67 | 68 | s_in = tf.reshape(state, [-1, 4, 84, 84]) 69 | a_in = tf.reshape(action, [-1]) 70 | # add action embeddings 71 | 72 | #w1_embed = weight_variable("w1_embed", [-1, ]) 73 | 74 | w1_conv = weight_variable("w1_conv", [4, 4, 4, 64]) 75 | b1_conv = bias_variable([64]) 76 | 77 | # 22 x 22 78 | w2_conv = weight_variable("w2_conv", [4, 4, 64, 64]) 79 | b2_conv = bias_variable([64]) 80 | 81 | # 10 x 10 82 | w3_conv = weight_variable("w3_conv", [3, 3, 64, 64]) 83 | b3_conv = bias_variable([64]) 84 | 85 | # 8 x 8 86 | w_fc1 = weight_variable("w_fc1", [8, 512]) 87 | b_fc1 = bias_variable([512]) 88 | 89 | conv1 = tf.nn.relu(conv2d(s_in, w1_conv, 4) + b1_conv) 90 | conv2 = tf.nn.relu(conv2d(conv1, w2_conv, 2) + b2_conv) 91 | conv3 = tf.nn.relu(conv2d(conv2, w3_conv, 4) + b3_conv) 92 | 93 | conv3_reshaped = tf.reshape(conv3, [-1, _]) 94 | 95 | fc1 = tf.nn.relu(tf.matmul(conv3_reshaped, w_fc1) + b_fc1) 96 | fc1_reshaped = tf.reshape(fc1, [-1, 512*2]) 97 | 98 | latent_curr_true = fc1_reshaped[:,0:512] 99 | latent_next_true = fc1_reshaped[:,512:1024] 100 | 101 | w_fc2 = weight_variable("w_fc2", [512, num_actions]) 102 | b_fc2 = bias_variable([num_actions]) 103 | 104 | l_out = tf.nn.relu(tf.matmul(latent_curr_true, w_fc2) + b_fc2) 105 | 106 | return l_out 107 | 108 | def Qval_to_action(self, Qval): 109 | 110 | #self.Qvalue = Qval 111 | num = tf.exp(Qval / T) 112 | policy = num / tf.reduce_sum(num) 113 | 114 | return policy 115 | -------------------------------------------------------------------------------- /actor-mimic/train_AMN.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script describes the OpenAI gym environment for the source tasks, 3 | samples actions from either the AMN or the expert networks and provides 4 | the true policy and the sampled state action pairs to train the AMN 5 | 6 | """ 7 | 8 | import gym 9 | import numpy as np 10 | import tensorflow as tf 11 | from actor-mimic import AMN 12 | from scipy.misc import imresize 13 | import random 14 | from collections import deque 15 | import cv2 16 | 17 | net = AMN() 18 | 19 | game1_load_path = './pong_dqn_v4_reg_0.01/network_file_50.pkl' 20 | game2_load_path = './pong_dqn_v4_reg_0.01/network_file_50.pkl' 21 | #-------------------importing the pretrained 
models----------------------------# 22 | 23 | import theano 24 | import pickle 25 | 26 | print("unpickling first game...") 27 | pkl = open(game_1_load_path, 'rb') 28 | game_1 = pickle.load(pkl) 29 | pkl.close() 30 | 31 | print("unpickling second game...") 32 | pkl = open(game2_load_path, 'rb') 33 | game_2 = pickle.load(pkl) 34 | 35 | 36 | #-----------------------------------------------------------------------------# 37 | 38 | # implementing intially only for two games 39 | game = ['MsPacman-v0', 'Pong-v0'] # add accordingly 40 | 41 | num_exp = len(game) 42 | 43 | def preprocess(frame): 44 | 45 | gray_image = frame.mean(2) 46 | reshaped_image = imresize(gray_image, (84,84)) 47 | x = np.reshape(reshaped_image, [84,84,1]).astype(np.float32) 48 | x *= (1.0 / 255.0) # divide by 255 49 | 50 | return x 51 | 52 | def get_num_actions(game_id): 53 | 54 | 55 | def get_AMN_policy(s_t, num_actions): 56 | 57 | #num_actions = get_num_actions(game_id) 58 | q_vals = net.build_net.eval(feed_dict = {state : s_t, num_actions : num_actions}) 59 | 60 | one_hot = np.zeros(BATCH, num_actions) 61 | one_hot[:,np.argmax(q_vals, axis=1)] = 1 62 | AMN_policy = one_hot 63 | 64 | return AMN_policy 65 | 66 | def get_true_policy(state_batch, AMN_action_batch, game_id): 67 | 68 | true_policy = [] 69 | 70 | if game_id == 0: 71 | game = game_1 72 | else: 73 | game = game_2 74 | 75 | for i,s in enumerate(state_batch): 76 | game.state_shared.set_value(s) 77 | true_policy[i] = game._q_vals() 78 | 79 | return true_policy 80 | 81 | def rollout(state, action, encode): 82 | 83 | #sess.run(tf.initialize_all_variables()) 84 | saver = tf.train.Saver(tf.all_variables()) 85 | saver.restore(sess, load_path) 86 | print("variables restored and loaded...") 87 | 88 | # stores history for all games separately 89 | replay_memory = [] 90 | s_t = [] 91 | s_t1 = [] 92 | 93 | for i in range(num_exp): 94 | D = deque() 95 | replay_memory.append(D) 96 | s_t.append([]) 97 | s_t1.append([]) 98 | 99 | num_episodes = np.zeros(num_exp) 100 | k = 0 101 | 102 | while np.max(num_episodes) < MAX_EPISODES: 103 | 104 | game_id = random.randint(0, num_exp - 1) 105 | env = gym.make(game[game_id]) 106 | num_actions = env.action_space.n 107 | ob = env.reset() 108 | 109 | obf = preprocess(ob) 110 | s_t[game_id] = np.reshape(np.stack((obf, obf, obf, obf), axis=2), (84, 84, 4)) 111 | observations, actions = [], [] 112 | 113 | i = 0 114 | print("num of episodes ", num_episodes[game_id]) 115 | 116 | for t in range(10000): 117 | env.render() #optional 118 | 119 | q_val_AMN = get_AMN_policy(s_t[game_id], num_actions) 120 | # epsilon greedy policy 121 | if random.random() <= epsilon: 122 | action_index = random.randrange(ACTIONS) 123 | action[action_index] = 1 124 | else: 125 | action_index = np.argmax(q_val_AMN) # create instance from AMN class 126 | action[action_index] = 1 127 | 128 | ob, reward, done, info = env.step(action_index) 129 | 130 | obf = preprocess(ob) 131 | 132 | s_t1[game_id] = np.append(obf, s_t[:,:,0:3], axis = 2) 133 | ''' uncomment for training ''' 134 | 135 | replay_memory[game_id].append((s_t, action, obf)) 136 | if len(replay_memory[game_id]) > REPLAY_MEMORY: 137 | replay_memory[game_id].popleft() 138 | 139 | if train == True: 140 | 141 | minibatch = random.sample(replay_memory[game_id], BATCH) 142 | state_batch = [d[0] for d in minibatch] 143 | AMN_action_batch = [d[1] for d in minibatch] 144 | # get true action 145 | true_action = get_true_policy(state_batch, AMN_action_batch) 146 | #num_actions = get_num_actions(game_id) 147 | # minibatch update 
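# Distillation update: true_action holds the expert ("teacher") policy for the sampled
# states, obtained from the unpickled expert of the current game, while state_batch and
# AMN_action_batch come from the AMN's own epsilon-greedy rollout above; train_step then
# regresses the AMN's output onto that expert policy.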
148 | net.train_step(true_action, state_batch, AMN_action_batch, num_actions) 149 | 150 | s_t[game_id] = s_t1[game_id] 151 | 152 | if done: 153 | num_episodes[game_id] += 1 154 | break 155 | -------------------------------------------------------------------------------- /dqn_reg/dqn_reg__v4_train.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script describes the OpenAI gym environment for the source tasks, 3 | samples actions from either the AMN or the expert networks and provides 4 | the true policy and the sampled state action pairs to train the AMN 5 | 6 | After skipping frames we generate the following sequence: 7 | |-a_t-| |-a_t+1-| |-a_t+2-| |-a_t+3-| 8 | eps_start : action - ob - action - ob - action - ob - action - ob - action - ob - action - ob - action - ob - action - ob 9 | |-------------- state t ------------------| 10 | |---------------- state t+1 ---------------| 11 | |---------------- state t+2 ---------------| 12 | |---------------- state t+3 ---------------| 13 | |---------------- state t+4 ---------------| 14 | 15 | """ 16 | 17 | import gym 18 | import numpy as np 19 | import tensorflow as tf 20 | from dqn_reg_v4 import net_v4 21 | from scipy.misc import imresize 22 | import random 23 | from collections import deque 24 | import cv2 25 | import itertools 26 | 27 | BATCH = 32 28 | MAX_EPISODES = 10 29 | REPLAY_MEMORY = 1000 30 | ACTIONS = 4 31 | epsilon = 0.3 32 | 33 | net = net_v4(0.99, 10000, BATCH, 0.5) 34 | 35 | def get_minibatch(D, BATCH): 36 | 37 | batch_id = 0 38 | minibatch = [] 39 | 40 | 41 | while batch_id < BATCH: 42 | #print("D size", len(D)) 43 | idx = random.randrange(len(D) - 8) 44 | range_idx = np.arange(idx, idx + 8) 45 | action_idx = np.arange(idx + 3, idx + 7) 46 | end_idx = idx + 3 47 | 48 | state_sample = [s[0] for s in D[idx : idx + 8]] 49 | action_sample = [s[1] for s in D[idx + 3 : idx + 7]] 50 | reward_sample = D[idx + 3][2] 51 | done_sample = D[idx + 3][3] 52 | print("action sample size", action_sample[0]) 53 | minibatch.append((np.asarray(state_sample).transpose(3,1,2,0), action_sample, reward_sample, done_sample)) 54 | batch_id += 1 55 | 56 | return minibatch 57 | 58 | def preprocess(frame): 59 | 60 | gray_image = frame.mean(2) 61 | reshaped_image = imresize(gray_image, (84,84)) 62 | x = np.reshape(reshaped_image, [84,84,1]).astype(np.float32) 63 | x *= (1.0 / 255.0) # divide by 255 64 | 65 | return x 66 | 67 | def get_policy(D, obf): 68 | 69 | idx = len(D) - 7 70 | #range_idx = np.arange(idx, idx+7) 71 | 72 | state = [s[0] for s in D[idx : idx+7]] 73 | state.append(obf) 74 | #print("state shape input", state[0].shape) 75 | 76 | q_vals = net.q_val(np.asarray(state).transpose(3,1,2,0)) 77 | 78 | one_hot = np.zeros((BATCH, ACTIONS)) 79 | one_hot[:,np.argmax(q_vals, axis=1)] = 1 80 | AMN_policy = one_hot 81 | 82 | return AMN_policy 83 | 84 | def rollout(): 85 | 86 | sess = tf.InteractiveSession() 87 | sess.run(tf.initialize_all_variables()) 88 | saver = tf.train.Saver(tf.all_variables()) 89 | #saver.restore(sess, load_path) 90 | #print("variables restored and loaded...") 91 | 92 | env = gym.make('Pong-v0') 93 | ACTIONS = env.action_space.n 94 | 95 | # stores history for all games separately 96 | s_t = [] 97 | s_t1 = [] 98 | 99 | D = [] 100 | k = 0 101 | num_episodes = 0 102 | train = False 103 | 104 | while num_episodes < MAX_EPISODES: 105 | 106 | ob = env.reset() 107 | 108 | obf = preprocess(ob) 109 | s_t = np.reshape(np.stack((obf, obf, obf, obf), axis=2), (84, 84, 4)) 110 | observations, 
actions = [], [] 111 | REWARD = 0 112 | action_index = random.randrange(ACTIONS) 113 | 114 | i = 0 115 | print("num of episodes ", num_episodes) 116 | 117 | for t in range(10000): 118 | env.render() #optional 119 | 120 | ob, reward, done, info = env.step(action_index) 121 | 122 | REWARD += reward 123 | obf = preprocess(ob) 124 | #print("D lenght", len(D)) 125 | 126 | if i == 3: 127 | 128 | if len(D) > 8: 129 | q_val = get_policy(D, obf) 130 | # epsilon greedy policy 131 | if random.random() <= epsilon: 132 | action_index = random.randrange(ACTIONS) 133 | #action[action_index] = 1 134 | else: 135 | action_index = np.argmax(q_val) 136 | #action[action_index] = 1 137 | 138 | else: 139 | action_index = random.randrange(ACTIONS) 140 | 141 | #s_t1 = np.append(obf, s_t[:,:,0:3], axis = 2) 142 | D.append((obf, int(action_index), REWARD, done)) 143 | if len(D) > REPLAY_MEMORY: 144 | D.pop(0) 145 | 146 | if num_episodes > 2: 147 | train = True 148 | 149 | if train == True: 150 | 151 | print("training now...") 152 | minibatch = get_minibatch(D, BATCH) 153 | print("minibatch collected...") 154 | state_batch = [d[0] for d in minibatch] 155 | action_batch = [d[1] for d in minibatch] 156 | reward_batch = [d[2] for d in minibatch] 157 | done_batch = [d[3] for d in minibatch] 158 | print("minibatch state shape", np.asarray(state_batch).shape) 159 | # minibatch update 160 | net.train(np.asarray(state_batch).reshape(32,84,84,8), np.asarray(action_batch).reshape(32,4), 161 | np.asarray(reward_batch).reshape(32,1), np.asarray(done_batch).reshape(32,1)) 162 | 163 | if i == 3: 164 | #s_t = s_t1 165 | i = 0 166 | 167 | i += 1 168 | 169 | if done: 170 | num_episodes += 1 171 | break 172 | 173 | rollout() 174 | -------------------------------------------------------------------------------- /dqn_reg/dqn_reg_v4.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | The network is designed in order to distill expert 4 | policies which are trained on model based prediction. Therefore, to be 5 | consistent with the paper, the AMN comprises of the same architecture 6 | as the individual policies. 
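Concretely, net_v4 below combines a DQN loss with clipped TD error and a
latent-prediction regularizer: the encoder embedding of the observed next
state (f_true) is compared with a prediction made from the current state
embedding and the action embeddings (f_pred), and
0.5 * lambda_reg * ||f_true - f_pred||^2 is added to each sample's loss.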
7 | """ 8 | 9 | import tensorflow as tf 10 | import numpy as np 11 | import random 12 | 13 | 14 | class net_v4: 15 | 16 | # initiate tensorboard summaries 17 | def __init__(self, discount, clip_delta, batch, lambda_reg): 18 | 19 | self.discount = discount 20 | self.clip_delta = clip_delta 21 | self.batch = batch 22 | self.lambda_reg = lambda_reg 23 | 24 | #num_actions = tf.placeholder("uint8", ()) 25 | self.Q_val, self.f_pred, self.f_true = self.build_net(6) 26 | 27 | self.next_Q_val = tf.placeholder("float", [self.batch, 6]) 28 | #self.action = tf.placeholder("float", [self.batch, 4]) 29 | self.reward = tf.placeholder("float", [self.batch, ]) 30 | self.done = tf.placeholder("float", [self.batch, ]) 31 | 32 | target = self.reward + self.discount * \ 33 | tf.to_float((np.ones_like(self.done) - self.done)) * tf.reduce_max(self.next_Q_val, axis=1, keep_dims=True) 34 | 35 | # not yet clear what it does actually 36 | action_mask = np.equal(tf.reshape(np.arange(16), [1,-1]), tf.reshape(self.action[:,0], [-1,1])) 37 | out = tf.reshape(tf.reduce_sum((self.Q_val*action_mask), 1), [-1,1]) 38 | 39 | self.diff = target - out 40 | self.diff_reg = self.f_true - self.f_pred 41 | 42 | if self.clip_delta > 0: 43 | quadratic_part = tf.minimum(abs(self.diff), self.clip_delta) 44 | linear_part = abs(self.diff) - quadratic_part 45 | self.loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part 46 | else: 47 | self.loss = 0.5 * self.diff ** 2 48 | 49 | self.loss += tf.reduce_sum(0.5 * self.lambda_reg * (self.diff_reg ** 2), 1) 50 | self.loss = tf.reduce_sum(self.loss) 51 | tf.summary.scalar("loss", self.loss) 52 | 53 | optimizer = tf.train.AdamOptimizer(learning_rate = 0.00025) 54 | self.train_step = optimizer.minimize(self.loss) 55 | # sample an action from 56 | 57 | def q_val(self, state): 58 | 59 | return self.Q_val.eval(feed_dict = {self.state : state}) 60 | 61 | def train(self, state, action, reward, done, merged_summary_op): 62 | 63 | state_padded = np.zeros((state.shape[0], state.shape[1]+1, state.shape[2], state.shape[3])) 64 | state_padded[:,:-1] = state 65 | 66 | next_Q_val = self.Q_val.eval(feed_dict = {self.state : state_padded[:, 1:]}) 67 | 68 | self.train_step.run(feed_dict = { 69 | self.state : state_padded[:, :-1], 70 | self.action : action, 71 | self.next_Q_val : next_Q_val, 72 | self.reward : reward, 73 | self.done : done}) 74 | 75 | cost = self.loss.eval(feed_dict = { 76 | self.state : state, 77 | self.action : action, 78 | self.next_Q_val : next_Q_val, 79 | self.reward : reward, 80 | self.done : done}) 81 | diff_reg = self.diff_reg.eval(feed_dict = { 82 | self.state : state, 83 | self.action : action, 84 | self.next_Q_val : next_Q_val, 85 | self.reward : reward, 86 | self.done : done}) 87 | 88 | diff = self.diff.eval(feed_dict = { 89 | self.state : state, 90 | self.action : action, 91 | self.next_Q_val : next_Q_val, 92 | self.reward : reward, 93 | self.done : done}) 94 | #print("loss", np.sum(0.5 * self.lambda_reg * (diff_reg ** 2))) 95 | #print("loss diff", np.sum(diff)) 96 | print("loss total", cost) 97 | 98 | summary = merged_summary_op.eval(feed_dict = { 99 | self.state : state, 100 | self.action : action, 101 | self.next_Q_val : next_Q_val, 102 | self.reward : reward, 103 | self.done : done}) 104 | 105 | return summary 106 | 107 | def weight_variable(self, name, shape): 108 | initial = tf.contrib.layers.xavier_initializer() 109 | return tf.get_variable(name = name, shape = shape, initializer = initial) 110 | 111 | def bias_variable(self, shape): 112 | initial = tf.constant(0.1, 
shape = shape) 113 | return tf.Variable(initial) 114 | 115 | def conv2d(self, x, W, stride): 116 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 117 | 118 | def build_net(self, num_actions): 119 | 120 | self.state = tf.placeholder("float", [None, 84, 84, 8]) 121 | self.action = tf.placeholder("int32", [None, 4]) 122 | 123 | s_in = tf.reshape(self.state, [-1, 84, 84, 4]) 124 | a_in = tf.reshape(self.action, [-1, ]) 125 | # add action embeddings 126 | embeddings = tf.Variable(tf.random_uniform([num_actions, 256], -1.0, 1.0)) 127 | 128 | a_embed = tf.nn.embedding_lookup(embeddings, a_in) 129 | a_embed_reshaped = tf.reshape(a_embed, [-1, 4*256]) 130 | 131 | w1_conv = self.weight_variable("w1_conv", [8, 8, 4, 64]) 132 | b1_conv = self.bias_variable([64]) 133 | 134 | # 22 x 22 135 | w2_conv = self.weight_variable("w2_conv", [4, 4, 64, 64]) 136 | b2_conv = self.bias_variable([64]) 137 | 138 | # 10 x 10 139 | w3_conv = self.weight_variable("w3_conv", [3, 3, 64, 64]) 140 | b3_conv = self.bias_variable([64]) 141 | 142 | # 8 x 8 143 | w_fc1 = self.weight_variable("w_fc1", [3136, 512]) 144 | b_fc1 = self.bias_variable([512]) 145 | 146 | conv1 = tf.nn.relu(self.conv2d(s_in, w1_conv, 4) + b1_conv) 147 | conv2 = tf.nn.relu(self.conv2d(conv1, w2_conv, 2) + b2_conv) 148 | conv3 = tf.nn.relu(self.conv2d(conv2, w3_conv, 1) + b3_conv) 149 | 150 | print("conv1", conv1.shape) 151 | print("conv2", conv2.shape) 152 | print("conv3", conv3.shape) 153 | 154 | conv3_reshaped = tf.reshape(conv3, [-1, 7*7*64]) 155 | 156 | fc1 = tf.nn.relu(tf.matmul(conv3_reshaped, w_fc1) + b_fc1) 157 | fc1_reshaped = tf.reshape(fc1, [-1, 512*2]) 158 | 159 | l_curr_true = fc1_reshaped[:,0:512] 160 | l_next_true = fc1_reshaped[:,512:1024] 161 | 162 | w_fc_act = self.weight_variable("w_fc_act", [256*4, 512]) 163 | b_fc_act = self.bias_variable([512]) 164 | 165 | w_fc_curr = self.weight_variable("w_fc_curr", [512, 512]) 166 | b_fc_curr = self.bias_variable([512]) 167 | 168 | fc_act = tf.nn.relu(tf.matmul(a_embed_reshaped, w_fc_act) + b_fc_act) 169 | fc_curr = tf.nn.relu(tf.matmul(l_curr_true, w_fc_curr) + b_fc_curr) 170 | 171 | l_concat = tf.concat([fc_act, fc_curr], 1) 172 | 173 | w_fc_pred = self.weight_variable("w_fc_pred", [1024,512]) 174 | b_fc_pred = self.bias_variable([512]) 175 | 176 | fc_next_pred = tf.nn.relu(tf.matmul(l_concat, w_fc_pred) + b_fc_pred) 177 | 178 | w_fc2 = self.weight_variable("w_fc2", [512, num_actions]) 179 | b_fc2 = self.bias_variable([num_actions]) 180 | 181 | l_out = tf.nn.relu(tf.matmul(l_curr_true, w_fc2) + b_fc2) 182 | 183 | return l_out, fc_next_pred, l_next_true 184 | 185 | def Qval_to_action(self, Qval): 186 | 187 | #self.Qvalue = Qval 188 | num = tf.exp(Qval / T) 189 | policy = num / tf.reduce_sum(num) 190 | 191 | return policy 192 | -------------------------------------------------------------------------------- /dqn_reg/dqn_reg_v4.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/dqn_reg/dqn_reg_v4.pyc -------------------------------------------------------------------------------- /dqn_reg_models/network_file_49.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/dqn_reg_models/network_file_49.pkl -------------------------------------------------------------------------------- 
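The network_file_*.pkl files in this directory are pickled DeepQLearner instances (the class defined in q_network_reg_v4.py below), and train_AMN.py loads checkpoints of this form as expert policies. A minimal usage sketch, assuming Theano/Lasagne are installed and that q_network_reg_v4 (and its `updates` dependency) is importable under the same module name used when the object was pickled:

import pickle
import numpy as np

with open('network_file_50.pkl', 'rb') as f:
    expert = pickle.load(f)                       # a DeepQLearner instance

# for the latent_dnn_v4 network type, q_vals() expects num_frames*2 = 8 stacked 84x84 frames
state = np.zeros((8, 84, 84), dtype=np.float32)
print(expert.q_vals(state))                       # Q-values for a single state
print(expert.choose_action(state, epsilon=0.05))  # epsilon-greedy action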
/dqn_reg_models/network_file_50.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/dqn_reg_models/network_file_50.pkl -------------------------------------------------------------------------------- /dqn_reg_models/q_network_reg_v4.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for deep Q-learning as described in: 3 | 4 | Playing Atari with Deep Reinforcement Learning 5 | NIPS Deep Learning Workshop 2013 6 | 7 | and 8 | 9 | Human-level control through deep reinforcement learning. 10 | Nature, 518(7540):529-533, February 2015 11 | 12 | 13 | Author of Lasagne port: Nissan Pow 14 | Modifications: Nathan Sprague 15 | """ 16 | import lasagne 17 | import numpy as np 18 | import theano 19 | import theano.tensor as T 20 | from updates import deepmind_rmsprop 21 | 22 | 23 | class DeepQLearner: 24 | """ 25 | Deep Q-learning network using Lasagne. 26 | """ 27 | 28 | def __init__(self, input_width, input_height, num_actions, 29 | num_frames, discount, learning_rate, rho, 30 | rms_epsilon, momentum, clip_delta, freeze_interval, 31 | batch_size, network_type, update_rule, lambda_reg, 32 | batch_accumulator, pretrained_net, rng, input_scale=255.0): 33 | 34 | self.input_width = input_width 35 | self.input_height = input_height 36 | self.num_actions = num_actions 37 | self.num_frames = num_frames 38 | self.batch_size = batch_size 39 | self.discount = discount 40 | self.rho = rho 41 | self.lr = learning_rate 42 | self.rms_epsilon = rms_epsilon 43 | self.momentum = momentum 44 | self.clip_delta = clip_delta 45 | self.freeze_interval = freeze_interval 46 | self.rng = rng 47 | self.lambda_reg = lambda_reg 48 | 49 | lasagne.random.set_rng(self.rng) 50 | 51 | self.update_counter = 0 52 | 53 | self.l_in, self.l_act_in, self.l_out, self.pred_z, self.true_z = \ 54 | self.build_network(network_type, \ 55 | input_width, input_height, num_actions,\ 56 | num_frames, batch_size) 57 | 58 | if self.freeze_interval > 0: 59 | self.next_l_in, self.next_l_act_in, self.next_l_out, _d, _d = \ 60 | self.build_network(network_type, input_width, \ 61 | input_height, num_actions, num_frames, batch_size) 62 | self.reset_q_hat() 63 | 64 | states = T.tensor4('states') 65 | next_states = T.tensor4('next_states') 66 | rewards = T.col('rewards') 67 | actions = T.imatrix('actions') 68 | terminals = T.icol('terminals') 69 | 70 | # Shared variables for training from a minibatch of replayed 71 | # state transitions, each consisting of num_frames + 1 (due to 72 | # overlap) images, along with the chosen action and resulting 73 | # reward and terminal status. 74 | self.imgs_shared = theano.shared( 75 | np.zeros((batch_size, num_frames*2+1, input_height, input_width), 76 | dtype=theano.config.floatX)) 77 | self.rewards_shared = theano.shared( 78 | np.zeros((batch_size, 1), dtype=theano.config.floatX), 79 | broadcastable=(False, True)) 80 | self.actions_shared = theano.shared( 81 | np.zeros((batch_size, num_frames), dtype='int32') 82 | ) 83 | self.terminals_shared = theano.shared( 84 | np.zeros((batch_size, 1), dtype='int32'), 85 | broadcastable=(False, True)) 86 | 87 | # Shared variable for a single state, to calculate q_vals. 
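# Note the shared-variable pattern used throughout this class: the Theano functions
# compiled at the end of __init__ take no explicit inputs; train() and q_vals() copy
# minibatch data into these shared variables with set_value(), and the compiled
# functions read them through their `givens` mappings, so the data can stay resident
# in device memory between calls.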
88 | self.state_shared = theano.shared( 89 | np.zeros((num_frames*2, input_height, input_width), 90 | dtype=theano.config.floatX)) 91 | 92 | q_vals, z_pred, z_true = lasagne.layers.get_output( 93 | [self.l_out, self.pred_z, self.true_z], 94 | inputs = {self.l_in: states / input_scale, 95 | self.l_act_in: actions} 96 | ) 97 | 98 | if self.freeze_interval > 0: 99 | next_q_vals = lasagne.layers.get_output( 100 | self.next_l_out, 101 | {self.next_l_in: next_states / input_scale, 102 | self.next_l_act_in: actions} 103 | ) 104 | else: 105 | next_q_vals = lasagne.layers.get_output( 106 | self.l_out, 107 | {self.l_in: next_states / input_scale, 108 | self.l_act_in: actions} 109 | ) 110 | next_q_vals = theano.gradient.disconnected_grad(next_q_vals) 111 | 112 | terminalsX = terminals.astype(theano.config.floatX) 113 | actionmask = T.eq(T.arange(num_actions).reshape((1, -1)), 114 | actions[:, 0].reshape((-1, 1))).astype(theano.config.floatX) 115 | 116 | target = (rewards + 117 | (T.ones_like(terminalsX) - terminalsX) * 118 | self.discount * T.max(next_q_vals, axis=1, keepdims=True)) 119 | output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1)) 120 | diff = target - output 121 | diff_reg = z_true - z_pred 122 | 123 | if self.clip_delta > 0: 124 | # If we simply take the squared clipped diff as our loss, 125 | # then the gradient will be zero whenever the diff exceeds 126 | # the clip bounds. To avoid this, we extend the loss 127 | # linearly past the clip point to keep the gradient constant 128 | # in that regime. 129 | # 130 | # This is equivalent to declaring d loss/d q_vals to be 131 | # equal to the clipped diff, then backpropagating from 132 | # there, which is what the DeepMind implementation does. 133 | quadratic_part = T.minimum(abs(diff), self.clip_delta) 134 | linear_part = abs(diff) - quadratic_part 135 | loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part 136 | else: 137 | loss = 0.5 * diff ** 2 138 | 139 | loss = loss + 0.5 * self.lambda_reg * (diff_reg ** 2).sum(axis=1) 140 | 141 | if batch_accumulator == 'sum': 142 | loss = T.sum(loss) 143 | elif batch_accumulator == 'mean': 144 | loss = T.mean(loss) 145 | else: 146 | raise ValueError("Bad accumulator: {}".format(batch_accumulator)) 147 | 148 | params = lasagne.layers.helper.get_all_params([self.l_out, self.pred_z, self.true_z]) 149 | train_givens = { 150 | states: self.imgs_shared[:, :-1], 151 | next_states: self.imgs_shared[:, 1:], 152 | rewards: self.rewards_shared, 153 | actions: self.actions_shared, 154 | terminals: self.terminals_shared 155 | } 156 | 157 | if update_rule == 'deepmind_rmsprop': 158 | updates = deepmind_rmsprop(loss, params, self.lr, self.rho, 159 | self.rms_epsilon) 160 | elif update_rule == 'rmsprop': 161 | updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, 162 | self.rms_epsilon) 163 | elif update_rule == 'sgd': 164 | updates = lasagne.updates.sgd(loss, params, self.lr) 165 | else: 166 | raise ValueError("Unrecognized update: {}".format(update_rule)) 167 | 168 | if self.momentum > 0: 169 | updates = lasagne.updates.apply_momentum(updates, None, 170 | self.momentum) 171 | 172 | self._train = theano.function([], [loss], updates=updates, 173 | givens=train_givens) 174 | q_givens = { 175 | states: self.state_shared.reshape((1, 176 | self.num_frames*2, 177 | self.input_height, 178 | self.input_width)) 179 | } 180 | self._q_vals = theano.function([], q_vals[0], givens=q_givens) 181 | 182 | def build_network(self, network_type, input_width, input_height, 183 | output_dim, num_frames, 
batch_size): 184 | if network_type == "latent_dnn_v4": 185 | return self.build_latent_network_dnn_v4(input_width, input_height, 186 | output_dim, num_frames, 187 | batch_size) 188 | else: 189 | raise ValueError("Unrecognized network: {}".format(network_type)) 190 | 191 | def train(self, imgs, actions, rewards, terminals): 192 | """ 193 | Train one batch. 194 | 195 | Arguments: 196 | 197 | imgs - b x (2f) x h x w numpy array, where b is batch size, 198 | f is num frames, h is height and w is width. 199 | actions - b x 4 numpy array of integers 200 | rewards - b x 1 numpy array 201 | terminals - b x 1 numpy boolean array (currently ignored) 202 | 203 | Returns: average loss 204 | """ 205 | imgs_padded = np.zeros((imgs.shape[0], imgs.shape[1]+1, 206 | imgs.shape[2], imgs.shape[3]), dtype=np.float32) 207 | imgs_padded[:,:-1] = imgs 208 | 209 | self.imgs_shared.set_value(imgs_padded) 210 | self.actions_shared.set_value(actions) 211 | self.rewards_shared.set_value(rewards) 212 | self.terminals_shared.set_value(terminals) 213 | if (self.freeze_interval > 0 and 214 | self.update_counter % self.freeze_interval == 0): 215 | self.reset_q_hat() 216 | loss = self._train() 217 | self.update_counter += 1 218 | return np.sqrt(loss) 219 | 220 | def q_vals(self, state): 221 | self.state_shared.set_value(state) 222 | return self._q_vals() 223 | 224 | def choose_action(self, state, epsilon): 225 | if self.rng.rand() < epsilon: 226 | return self.rng.randint(0, self.num_actions) 227 | q_vals = self.q_vals(state) 228 | return np.argmax(q_vals) 229 | 230 | def reset_q_hat(self): 231 | all_params = lasagne.layers.helper.get_all_param_values(self.l_out) 232 | lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params) 233 | 234 | def build_latent_network_dnn_v4(self, input_width, input_height, output_dim, 235 | num_frames, batch_size): 236 | """ 237 | Build a large network consistent with the DeepMind Nature paper. 
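For 84x84 inputs, the (batch, num_frames*2, 84, 84) state tensor is reshaped to
(batch*2, num_frames, 84, 84) so that current and next states share the
convolutional encoder; with zero padding the spatial sizes go
84 -> (84-8)/4+1 = 20 -> (20-4)/2+1 = 9 -> (9-3)/1+1 = 7, i.e. 7*7*64 = 3136
features feeding the 512-unit dense embedding.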
238 | """ 239 | from lasagne.layers import dnn 240 | 241 | """ 242 | States input 243 | """ 244 | l_in = lasagne.layers.InputLayer( 245 | shape=(None, num_frames*2, input_width, input_height) 246 | ) 247 | 248 | """ 249 | Integer encoding input for actions 250 | """ 251 | l_act_in = lasagne.layers.InputLayer( 252 | shape=(None, 4) 253 | ) 254 | 255 | l_act_in_reshaped = lasagne.layers.ReshapeLayer( 256 | l_act_in, 257 | shape=(-1, ) 258 | ) 259 | 260 | """ 261 | Action embedding 262 | """ 263 | l_act_embed = lasagne.layers.EmbeddingLayer( 264 | l_act_in_reshaped, 265 | input_size=output_dim, 266 | output_size=256, 267 | W=lasagne.init.HeUniform() 268 | ) 269 | 270 | l_act_embed_reshaped = lasagne.layers.ReshapeLayer( 271 | l_act_embed, 272 | shape=(-1, num_frames*256) 273 | ) 274 | 275 | """ 276 | State embedding 277 | """ 278 | l_reshaped_in = lasagne.layers.ReshapeLayer( 279 | l_in, 280 | shape=(-1, num_frames, input_width, input_height) 281 | ) 282 | 283 | l_conv1 = dnn.Conv2DDNNLayer( 284 | l_reshaped_in, 285 | num_filters=32, 286 | filter_size=(8, 8), 287 | stride=(4, 4), 288 | nonlinearity=lasagne.nonlinearities.rectify, 289 | W=lasagne.init.HeUniform(), 290 | b=lasagne.init.Constant(.1) 291 | ) 292 | 293 | l_conv2 = dnn.Conv2DDNNLayer( 294 | l_conv1, 295 | num_filters=64, 296 | filter_size=(4, 4), 297 | stride=(2, 2), 298 | nonlinearity=lasagne.nonlinearities.rectify, 299 | W=lasagne.init.HeUniform(), 300 | b=lasagne.init.Constant(.1) 301 | ) 302 | 303 | l_conv3 = dnn.Conv2DDNNLayer( 304 | l_conv2, 305 | num_filters=64, 306 | filter_size=(3, 3), 307 | stride=(1, 1), 308 | nonlinearity=lasagne.nonlinearities.rectify, 309 | W=lasagne.init.HeUniform(), 310 | b=lasagne.init.Constant(.1) 311 | ) 312 | 313 | l_hidden1 = lasagne.layers.DenseLayer( 314 | l_conv3, 315 | num_units=512, 316 | nonlinearity=lasagne.nonlinearities.rectify, 317 | W=lasagne.init.HeUniform(), 318 | b=lasagne.init.Constant(.1) 319 | ) 320 | 321 | l_hidden_reshaped = lasagne.layers.ReshapeLayer( 322 | l_hidden1, 323 | shape=(-1, 512*2) 324 | ) 325 | 326 | """ 327 | "True" latent embeddings for current state and future state 328 | """ 329 | l_latent_1 = lasagne.layers.SliceLayer( 330 | l_hidden_reshaped, 331 | indices=slice(0, 512), 332 | axis=1 333 | ) 334 | 335 | l_out_3 = lasagne.layers.SliceLayer( 336 | l_hidden_reshaped, 337 | indices=slice(512, 1024), 338 | axis=1 339 | ) 340 | 341 | """ 342 | Future state latent embedding prediction using current 343 | state and future action embeddings 344 | """ 345 | l_act_project = lasagne.layers.DenseLayer( 346 | l_act_embed_reshaped, 347 | num_units=512, 348 | nonlinearity=lasagne.nonlinearities.rectify, 349 | W=lasagne.init.HeUniform(), 350 | b=lasagne.init.Constant(.1) 351 | ) 352 | 353 | l_state_project = lasagne.layers.DenseLayer( 354 | l_latent_1, 355 | num_units=512, 356 | nonlinearity=lasagne.nonlinearities.rectify, 357 | W=lasagne.init.HeUniform(), 358 | b=lasagne.init.Constant(.1) 359 | ) 360 | 361 | l_project_concat = lasagne.layers.ConcatLayer( 362 | [l_act_project, l_state_project], 363 | axis=1 364 | ) 365 | 366 | l_out_2 = lasagne.layers.DenseLayer( 367 | l_project_concat, 368 | num_units=512, 369 | nonlinearity=lasagne.nonlinearities.rectify, 370 | W=lasagne.init.HeUniform(), 371 | b=lasagne.init.Constant(.1) 372 | ) 373 | 374 | """ 375 | Action prediction based on current state 376 | """ 377 | l_out_1 = lasagne.layers.DenseLayer( 378 | l_latent_1, 379 | num_units=output_dim, 380 | nonlinearity=None, 381 | W=lasagne.init.HeUniform(), 382 | 
b=lasagne.init.Constant(.1) 383 | ) 384 | 385 | return l_in, l_act_in, l_out_1, l_out_2, l_out_3 386 | 387 | def main(): 388 | net = DeepQLearner(84, 84, 16, 4, .99, .00025, .95, .95, 10000, 389 | 32, 'nature_cuda') 390 | 391 | 392 | if __name__ == '__main__': 393 | main() 394 | -------------------------------------------------------------------------------- /dqn_reg_models/q_network_reg_v4.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/dqn_reg_models/q_network_reg_v4.pyc -------------------------------------------------------------------------------- /frame_prediction_atari/ae_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | ''' We use downsampled gray scale images - 84 X 84, 4 | consider only every 4th frame as input, applying 5 | the same action for the intermediate frames. 6 | Minibatch size is taken to be 32. Each input 7 | consists of a fixed memory of T = 4 to unroll 8 | each trajectory and pass in as an input. K, which 9 | is the prediction step parameter, taken to be 1''' 10 | 11 | ''' latest model is stored at /Downloads/models3/ ''' 12 | 13 | import argparse 14 | import os 15 | import sys 16 | import gym 17 | import numpy as np 18 | import tensorflow as tf 19 | from scipy.misc import imresize 20 | import random 21 | from collections import deque 22 | import cv2 23 | 24 | from baselines import deepq 25 | from baselines.common.atari_wrappers_deprecated import wrap_dqn 26 | from baselines.deepq.experiments.atari.model import model, dueling_model 27 | import baselines.common.tf_util as U 28 | from baselines.common.misc_util import ( 29 | boolean_flag, 30 | SimpleMonitor, 31 | ) 32 | 33 | # redundant as for now 34 | #flags = tf.app.flags 35 | #flags.DEFINE_boolean('train', True, 'Whether to do training or testing') 36 | #flags.DEFINE_string('env_name', 'PongNoFrameskip-v0', 'The name of gym environment to use') 37 | 38 | def parse_args(): 39 | parser = argparse.ArgumentParser("Run an already learned DQN model.") 40 | # Environment 41 | parser.add_argument("--env", type=str, required=True, help="name of the game") 42 | parser.add_argument("--is_train", default=True, help="name of the game") 43 | parser.add_argument("--model-dir", type=str, default=None, help="load model from this directory. 
") 44 | parser.add_argument("--video", type=str, default=None, help="Path to mp4 file where the video of first episode will be recorded.") 45 | boolean_flag(parser, "stochastic", default=True, help="whether or not to use stochastic actions according to models eps value") 46 | boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model") 47 | 48 | return parser.parse_args() 49 | 50 | args = parse_args() 51 | 52 | env = gym.make(args.env) 53 | env = wrap_dqn(env) 54 | 55 | epsilon = 0.35 56 | MAX_EPISODES = 100000 57 | BATCH = 32 # change to 1 while predicting 58 | max_iter = 10000 59 | ACTIONS = env.action_space.n 60 | FACTORS = 2048 61 | REPLAY_MEMORY = 1000000 62 | 63 | def weight_variable(shape): 64 | initial = tf.truncated_normal(shape, stddev = 0.01) 65 | return tf.Variable(initial) 66 | 67 | def bias_variable(shape): 68 | initial = tf.constant(0.0, shape = shape) 69 | return tf.Variable(initial) 70 | 71 | def conv2d(x, W, stride): 72 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") 73 | 74 | def conv2d_nopad(x, W, stride): 75 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 76 | 77 | def deconv2d(x, W, output_shape, stride): 78 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "SAME") 79 | 80 | def deconv2d_nopad(x, W, output_shape, stride): 81 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "VALID") 82 | 83 | def max_pool_2x2(x): 84 | return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 85 | 86 | class autoencoder(): 87 | 88 | def __init__(self, scope): 89 | 90 | self.scope = scope 91 | 92 | with tf.variable_scope(self.scope): 93 | #with tf.device('/gpu:0'): 94 | 95 | self.pred_frame = self.build_encoder() 96 | self.y = tf.placeholder("float", [BATCH, 84, 84]) 97 | self.loss = tf.square(tf.norm(self.y - self.pred_frame)) 98 | self.train_step = tf.train.AdamOptimizer(1e-4).minimize(self.loss) 99 | 100 | tf.summary.scalar("loss", self.loss) 101 | self.summary_writer = tf.summary.FileWriter(logs_path) 102 | self.summaries = tf.summary.merge_all() 103 | 104 | def build_encoder(self, ): 105 | 106 | # input - Batch X 84 X 84 X 4 107 | self.state = tf.placeholder("float", [None, 84, 84, 4]) 108 | self.action = tf.placeholder("float", [None, ACTIONS]) 109 | 110 | # 6 X 6 X 4 x 64 - stride 2 111 | W_conv1 = weight_variable([6, 6, 4, 64]) 112 | wconv = tf.get_variable("wconv", shape=[6, 6, 4, 64], initializer=tf.contrib.layers.xavier_initializer()) 113 | b_conv1 = bias_variable([64]) 114 | 115 | # 6 X 6 X 64 x 64 - stride 2 116 | W_conv2 = weight_variable([6, 6, 64, 64]) 117 | b_conv2 = bias_variable([64]) 118 | 119 | # 6 X 6 X 64 x 64 - stride 2 120 | W_conv3 = weight_variable([6, 6, 64, 64]) 121 | b_conv3 = bias_variable([64]) 122 | 123 | # _*16 ie. 
flattened output from conv3 124 | W_fc1 = weight_variable([10*10*64, 1024]) 125 | b_fc1 = bias_variable([1024]) 126 | 127 | #second fully connected layer - 2048 units 128 | W_fc2 = weight_variable([1024, 2048]) 129 | b_fc2 = bias_variable([2048]) 130 | 131 | #W_fc2 = weight_variable([256, ACTIONS]) 132 | #b_fc2 = bias_variable([ACTIONS]) 133 | 134 | conv1 = tf.nn.relu(conv2d_nopad(self.state, wconv, 2) + b_conv1) 135 | #padded_conv1 = tf.pad(conv1, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 136 | #print("padded shape", padded_conv1.shape) 137 | 138 | conv2 = tf.nn.relu(conv2d(conv1, W_conv2, 2) + b_conv2) 139 | #padded_conv2 = tf.pad(conv2, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 140 | 141 | conv3 = tf.nn.relu(conv2d(conv2, W_conv3, 2) + b_conv3) 142 | 143 | conv3_flat = tf.reshape(conv3, [-1, 10*10*64]) 144 | fc1 = tf.nn.relu(tf.matmul(conv3_flat, W_fc1) + b_fc1) 145 | fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2) 146 | 147 | # 6 X 6 X 4 x 64 - stride 2 148 | W_enc = weight_variable([FACTORS, 2048]) 149 | W_dec = weight_variable([2048, FACTORS]) 150 | W_action = weight_variable([FACTORS, ACTIONS]) 151 | b_interactions = bias_variable([2048]) 152 | 153 | #W_henc = tf.matmul(W_enc, fc2) 154 | #W_a = tf.matmul(W_action, action) 155 | #fc_interactions = tf.matmul(W_dec, tf.multiply(W_henc, W_a)) + b_interactions 156 | 157 | W_henc = tf.matmul(fc2, tf.transpose(W_enc)) 158 | W_a = tf.matmul(self.action, tf.transpose(W_action)) 159 | fc_interactions = tf.matmul(tf.multiply(W_henc, W_a), tf.transpose(W_dec)) + b_interactions 160 | 161 | # first fully connected layer after multiplicative interaction- 2048 162 | W_fc3 = weight_variable([2048, 1024]) 163 | b_fc3 = bias_variable([1024]) 164 | 165 | # second fully connected layer after multiplicative interaction- 1024 units 166 | W_fc4 = weight_variable([1024, 10*10*64]) 167 | b_fc4 = bias_variable([10*10*64]) 168 | 169 | fc3 = tf.nn.relu(tf.matmul(fc_interactions, W_fc3) + b_fc3) 170 | # TRYING OUT AN ALL CONV. NET 171 | #fc3 = tf.nn.relu(tf.matmul(fc2, W_fc3) + b_fc3) 172 | fc4 = tf.nn.relu(tf.matmul(fc3, W_fc4) + b_fc4) 173 | 174 | # reshaping into a 4-D matrix 175 | fc4_matrix = tf.reshape(fc4, [-1, 10, 10, 64]) 176 | 177 | # deconv variables 178 | W_deconv1 = weight_variable([6, 6, 64, 64]) 179 | b_deconv1 = bias_variable([64]) 180 | 181 | W_deconv2 = weight_variable([6, 6, 64, 64]) 182 | b_deconv2 = bias_variable([64]) 183 | 184 | W_deconv3 = weight_variable([6, 6, 1, 64]) 185 | b_deconv3 = bias_variable([1]) 186 | 187 | # output - 1 x 84 84 188 | deconv1 = tf.nn.relu(deconv2d(fc4_matrix, W_deconv1, (BATCH, 20, 20, 64), 2) + b_deconv1) 189 | deconv2 = tf.nn.relu(deconv2d(deconv1, W_deconv2, (BATCH, 40, 40, 64), 2) + b_deconv2) 190 | deconv3 = deconv2d_nopad(deconv2, W_deconv3, (BATCH, 84, 84, 1), 2) + b_deconv3 191 | 192 | 193 | #encode = tf.reshape(tf.image.resize_images(deconv3, [84, 84]), [-1, 84, 84]) 194 | encode = tf.reshape(deconv3, [-1, 84, 84]) 195 | 196 | return encode 197 | 198 | def predict(self, sess, s, a): 199 | """ 200 | Predicts the next state based on the current action. 201 | Args: 202 | sess: Tensorflow session 203 | s: State input of shape [batch_size, 4, 160, 160, 3] 204 | a : Action input of shape [batch_size, ACTIONS] 205 | Returns: 206 | Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated 207 | action values. 
208 | """ 209 | return sess.run(self.pred_frame, { self.state: s, self.action : a }) 210 | 211 | def update(self, sess, s, a, y, p, global_step): 212 | """ 213 | Updates the estimator towards the given targets. 214 | Args: 215 | sess: Tensorflow session object 216 | s: State input of shape [batch_size, 84, 84, 4] 217 | a: Chosen actions of shape [batch_size, ACTIONS] 218 | y: Targets of shape [batch_size, 84, 84] 219 | p : Predicted next observation frame of shape [batch_size, 84, 84] 220 | Returns: 221 | The calculated loss on the batch. 222 | """ 223 | feed_dict = { self.y : y, self.pred_frame : p, 224 | self.state : s, self.action : a } 225 | summaries, _, loss = sess.run( 226 | [self.summaries, self.train_step, self.loss], feed_dict) 227 | #print("summaries", summaries) 228 | if self.summary_writer: 229 | self.summary_writer.add_summary(summaries, global_step) 230 | return loss 231 | 232 | def rgb2gray(frame): 233 | 234 | r, g, b = frame[:,:,0], frame[:,:,1], frame[:,:,2] 235 | gray = 0.2989 * r + 0.5870 * g + 0.1140 * b 236 | 237 | return gray 238 | 239 | def preprocess(frame): 240 | 241 | gray_image = rgb2gray(frame) 242 | reshaped_image = cv2.resize(gray_image.astype(np.float32), (84, 84)) 243 | x = np.reshape(reshaped_image, [84,84,1]) 244 | x *= 1 / 255.0 245 | 246 | return x 247 | 248 | def rollout(sess, prediction_net, act): 249 | 250 | 251 | #tf.summary.scalar("Qval", encode) 252 | merged_summary_op = tf.summary.merge_all() 253 | 254 | sess.run(tf.variables_initializer(var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = "ae"))) 255 | saver = tf.train.Saver(var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = "ae")) 256 | print("ae varibles initialized and saver defined") 257 | 258 | checkpoint = tf.train.latest_checkpoint(checkpoint_dir) 259 | if checkpoint: 260 | saver.restore(sess, checkpoint) 261 | print("Loaded model checkpoint {}...".format(checkpoint)) 262 | 263 | #summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph()) 264 | 265 | D = deque() 266 | num_episodes = 0 267 | k = 0 268 | 269 | while num_episodes < MAX_EPISODES: 270 | s_t = env.reset() 271 | #ob = env.reset() 272 | #print("shape obf", np.asarray(obf).shape) 273 | 274 | #obf = preprocess(ob) 275 | #s_t = np.reshape(np.stack((obf, obf, obf, obf), axis=2), (84, 84, 4)) 276 | #observations, actions = [], [] 277 | 278 | #i = 0 279 | 280 | for t in range(10000): 281 | #env.render() #optional 282 | env.unwrapped.render() 283 | 284 | 285 | #action_id = env.action_space.sample() 286 | #action_id = random.randint(0,5) 287 | action_id = act(np.array(s_t)[None], stochastic=args.stochastic)[0] 288 | action_vector = np.zeros(ACTIONS) 289 | action_vector[action_id] = 1 290 | #actions.append(action_vector) 291 | 292 | s_t1, reward, done, info = env.step(action_id) 293 | #ob, reward, done, info = env.step(action_id) 294 | 295 | #obf = preprocess(ob) 296 | 297 | #s_t1 = np.append(obf, s_t[:,:,0:3], axis = 2) 298 | 299 | # if training, collect data and apply learning updates 300 | if args.is_train: 301 | # storing current state and the next frame 302 | D.append((np.array(s_t) / 255.0, action_vector, np.array(s_t1)[:, :, 3] / 255.0)) 303 | #D.append((s_t, action_vector, obf)) 304 | if len(D) > REPLAY_MEMORY: 305 | D.popleft() 306 | 307 | if num_episodes > 2: 308 | minibatch = random.sample(D, BATCH) 309 | action_batch = [d[1] for d in minibatch] 310 | state_batch = [d[0] for d in minibatch] 311 | target_batch = [d[2] for d in minibatch] 312 | target_batch = 
np.reshape(target_batch, (BATCH, 84, 84)) 313 | 314 | pred_batch = prediction_net.predict(sess, np.reshape(state_batch, (BATCH, 84, 84, 4)), np.reshape(action_batch, (BATCH, 6))) 315 | 316 | loss = prediction_net.update(sess, state_batch, action_batch, target_batch, pred_batch, k) 317 | 318 | #summary_writer.add_summary(summary, num_episodes) 319 | 320 | print("\riteration {} @ Episode {}/{}, loss {}".format(k, num_episodes, MAX_EPISODES, loss), end="") 321 | sys.stdout.flush() 322 | 323 | if k % 100000 == 0 and k != 0: 324 | print("\nsaving model now") 325 | saver.save(sess, save_path, global_step = t) 326 | 327 | k += 1 328 | # display the first frame of the minibatch 329 | cv2.imshow("prediction", pred_batch[0]) 330 | cv2.imshow("target", target_batch[0]) 331 | cv2.imshow("input", state_batch[0][:,:,0]) 332 | cv2.waitKey(5) 333 | 334 | else: 335 | #render video frames while testing 336 | prediction = prediction_net.predict(sess, np.reshape(np.asarray(s_t), (1, 84, 84, 4)), np.reshape(action_vector, (1, 6))) 337 | #print("prediction shape", prediction[0]) 338 | cv2.imshow("prediction", prediction[0]) 339 | cv2.waitKey(1) 340 | 341 | #k +=1 342 | #if i == 3: #maybe change to 4 343 | # i = 0 344 | #else: 345 | # i +=1 346 | 347 | s_t = s_t1 348 | 349 | if done: 350 | num_episodes += 1 351 | break 352 | 353 | 354 | #sess = tf.Session() 355 | 356 | checkpoint_dir = './checkpoints_ae/' 357 | save_path = './checkpoints_ae/' 358 | #save_path = '/home/manan/Downloads/models3/video_prediction.ckpt' 359 | #load_path='/home/manan/Downloads/models3/video_prediction.ckpt-302' 360 | logs_path = './logs/' 361 | 362 | #sess_dqn = U.make_session(4) 363 | #sess_dqn.as_default() 364 | #config=tf.ConfigProto(log_device_placement=True) 365 | 366 | with tf.Session() as sess: 367 | 368 | act = deepq.build_act( 369 | make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name), 370 | q_func=dueling_model if args.dueling else model, 371 | num_actions=env.action_space.n) 372 | 373 | U.load_state(os.path.join(args.model_dir, "saved")) 374 | #U.load_state('/tmp/models/model-atari-pong-1/saved') 375 | prediction_net = autoencoder("ae") 376 | rollout(sess, prediction_net, act) 377 | 378 | '''Pong : Actions 2,4 : up 379 | 3,5 : down 380 | 0,1 : no movement''' 381 | 382 | # basic code for simulating random policy 383 | '''for i_episode in range(2): 384 | observation = env.reset() 385 | ob = preprocess(observation) 386 | print(ob.shape) 387 | for t in range(10000) 388 | env.render() 389 | print(observation) 390 | if random.random() < epsilon: 391 | action = env.action_space.sample() 392 | else: 393 | action = 1 394 | observation, reward, done, info = env.step(action) 395 | #print(action) 396 | if done == True: 397 | print("Episode finished") 398 | break''' 399 | -------------------------------------------------------------------------------- /frame_prediction_atari/ae_random.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | ''' We use downsampled gray scale images - 84 X 84, 4 | consider only every 4th frame as input, applying 5 | the same action for the intermediate frames. 6 | Minibatch size is taken to be 32. Each input 7 | consists of a fixed memory of T = 4 to unroll 8 | each trajectory and pass in as an input. 
K, which 9 | is the prediction step parameter, taken to be 1''' 10 | 11 | ''' latest model is stored at /Downloads/models3/ ''' 12 | 13 | import sys 14 | import gym 15 | import numpy as np 16 | import tensorflow as tf 17 | from scipy.misc import imresize 18 | import random 19 | from collections import deque 20 | import cv2 21 | 22 | flags = tf.app.flags 23 | flags.DEFINE_boolean('train', True, 'Whether to do training or testing') 24 | flags.DEFINE_string('env_name', 'Pong-v0', 'The name of gym environment to use') 25 | 26 | env = gym.make(flags.FLAGS.env_name) 27 | 28 | epsilon = 0.35 29 | MAX_EPISODES = 10000 30 | BATCH = 32 # change to 1 while predicting 31 | max_iter = 10000 32 | ACTIONS = env.action_space.n 33 | FACTORS = 2048 34 | REPLAY_MEMORY = 1000000 35 | 36 | def weight_variable(shape): 37 | initial = tf.truncated_normal(shape, stddev = 0.01) 38 | return tf.Variable(initial) 39 | 40 | def bias_variable(shape): 41 | initial = tf.constant(0.0, shape = shape) 42 | return tf.Variable(initial) 43 | 44 | def conv2d(x, W, stride): 45 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") 46 | 47 | def conv2d_nopad(x, W, stride): 48 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 49 | 50 | def deconv2d(x, W, output_shape, stride): 51 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "SAME") 52 | 53 | def deconv2d_nopad(x, W, output_shape, stride): 54 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "VALID") 55 | 56 | def max_pool_2x2(x): 57 | return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 58 | 59 | class autoencoder(): 60 | 61 | def __init__(self, ): 62 | 63 | self.pred_frame = self.build_encoder() 64 | self.y = tf.placeholder("float", [BATCH, 84, 84]) 65 | self.loss = tf.square(tf.norm(self.y - self.pred_frame)) 66 | self.train_step = tf.train.AdamOptimizer(1e-4).minimize(self.loss) 67 | 68 | self.summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph()) 69 | self.summaries = tf.summary.merge_all(tf.summary.scalar("loss", self.loss)) 70 | 71 | def build_encoder(self, ): 72 | 73 | # input - Batch X 84 X 84 X 4 74 | self.state = tf.placeholder("float", [None, 84, 84, 4]) 75 | self.action = tf.placeholder("float", [None, ACTIONS]) 76 | 77 | # 6 X 6 X 4 x 64 - stride 2 78 | W_conv1 = weight_variable([6, 6, 4, 64]) 79 | wconv = tf.get_variable("wconv", shape=[6, 6, 4, 64], initializer=tf.contrib.layers.xavier_initializer()) 80 | b_conv1 = bias_variable([64]) 81 | 82 | # 6 X 6 X 64 x 64 - stride 2 83 | W_conv2 = weight_variable([6, 6, 64, 64]) 84 | b_conv2 = bias_variable([64]) 85 | 86 | # 6 X 6 X 64 x 64 - stride 2 87 | W_conv3 = weight_variable([6, 6, 64, 64]) 88 | b_conv3 = bias_variable([64]) 89 | 90 | # _*16 ie. 
flattened output from conv3 91 | W_fc1 = weight_variable([10*10*64, 1024]) 92 | b_fc1 = bias_variable([1024]) 93 | 94 | #second fully connected layer - 2048 units 95 | W_fc2 = weight_variable([1024, 2048]) 96 | b_fc2 = bias_variable([2048]) 97 | 98 | #W_fc2 = weight_variable([256, ACTIONS]) 99 | #b_fc2 = bias_variable([ACTIONS]) 100 | 101 | conv1 = tf.nn.relu(conv2d_nopad(self.state, wconv, 2) + b_conv1) 102 | #padded_conv1 = tf.pad(conv1, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 103 | #print("padded shape", padded_conv1.shape) 104 | 105 | conv2 = tf.nn.relu(conv2d(conv1, W_conv2, 2) + b_conv2) 106 | #padded_conv2 = tf.pad(conv2, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 107 | 108 | conv3 = tf.nn.relu(conv2d(conv2, W_conv3, 2) + b_conv3) 109 | 110 | conv3_flat = tf.reshape(conv3, [-1, 10*10*64]) 111 | fc1 = tf.nn.relu(tf.matmul(conv3_flat, W_fc1) + b_fc1) 112 | fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2) 113 | 114 | # 6 X 6 X 4 x 64 - stride 2 115 | W_enc = weight_variable([FACTORS, 2048]) 116 | W_dec = weight_variable([2048, FACTORS]) 117 | W_action = weight_variable([FACTORS, ACTIONS]) 118 | b_interactions = bias_variable([2048]) 119 | 120 | #W_henc = tf.matmul(W_enc, fc2) 121 | #W_a = tf.matmul(W_action, action) 122 | #fc_interactions = tf.matmul(W_dec, tf.multiply(W_henc, W_a)) + b_interactions 123 | 124 | W_henc = tf.matmul(fc2, tf.transpose(W_enc)) 125 | W_a = tf.matmul(self.action, tf.transpose(W_action)) 126 | fc_interactions = tf.matmul(tf.multiply(W_henc, W_a), tf.transpose(W_dec)) + b_interactions 127 | 128 | # first fully connected layer after multiplicative interaction- 2048 129 | W_fc3 = weight_variable([2048, 1024]) 130 | b_fc3 = bias_variable([1024]) 131 | 132 | # second fully connected layer after multiplicative interaction- 1024 units 133 | W_fc4 = weight_variable([1024, 10*10*64]) 134 | b_fc4 = bias_variable([10*10*64]) 135 | 136 | #fc3 = tf.nn.relu(tf.matmul(fc_interactions, W_fc3) + b_fc3) 137 | # TRYING OUT AN ALL CONV. NET 138 | fc3 = tf.nn.relu(tf.matmul(fc2, W_fc3) + b_fc3) 139 | fc4 = tf.nn.relu(tf.matmul(fc3, W_fc4) + b_fc4) 140 | 141 | # reshaping into a 4-D matrix 142 | fc4_matrix = tf.reshape(fc4, [-1, 10, 10, 64]) 143 | 144 | # deconv variables 145 | W_deconv1 = weight_variable([6, 6, 64, 64]) 146 | b_deconv1 = bias_variable([64]) 147 | 148 | W_deconv2 = weight_variable([6, 6, 64, 64]) 149 | b_deconv2 = bias_variable([64]) 150 | 151 | W_deconv3 = weight_variable([6, 6, 1, 64]) 152 | b_deconv3 = bias_variable([1]) 153 | 154 | # output - 1 x 84 84 155 | deconv1 = tf.nn.relu(deconv2d(fc4_matrix, W_deconv1, (BATCH, 20, 20, 64), 2) + b_deconv1) 156 | deconv2 = tf.nn.relu(deconv2d(deconv1, W_deconv2, (BATCH, 40, 40, 64), 2) + b_deconv2) 157 | deconv3 = deconv2d_nopad(deconv2, W_deconv3, (BATCH, 84, 84, 1), 2) + b_deconv3 158 | 159 | 160 | #encode = tf.reshape(tf.image.resize_images(deconv3, [84, 84]), [-1, 84, 84]) 161 | encode = tf.reshape(deconv3, [-1, 84, 84]) 162 | 163 | return encode 164 | 165 | def predict(self, sess, s, a): 166 | """ 167 | Predicts the next state based on the current action. 168 | Args: 169 | sess: Tensorflow session 170 | s: State input of shape [batch_size, 4, 160, 160, 3] 171 | a : Action input of shape [batch_size, ACTIONS] 172 | Returns: 173 | Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated 174 | action values. 
175 | """ 176 | return sess.run(self.pred_frame, { self.state: s, self.action : a }) 177 | 178 | def update(self, sess, s, a, y, p): 179 | """ 180 | Updates the estimator towards the given targets. 181 | Args: 182 | sess: Tensorflow session object 183 | s: State input of shape [batch_size, 84, 84, 4] 184 | a: Chosen actions of shape [batch_size, ACTIONS] 185 | y: Targets of shape [batch_size, 84, 84] 186 | p : Predicted next observation frame of shape [batch_size, 84, 84] 187 | Returns: 188 | The calculated loss on the batch. 189 | """ 190 | feed_dict = { self.y : y, self.pred_frame : p, 191 | self.state : s, self.action : a } 192 | _, loss = sess.run( 193 | [self.train_step, self.loss], feed_dict) 194 | #if self.summary_writer: 195 | # self.summary_writer.add_summary(summaries, global_step) 196 | return loss 197 | 198 | def rgb2gray(frame): 199 | 200 | r, g, b = frame[:,:,0], frame[:,:,1], frame[:,:,2] 201 | gray = 0.2989 * r + 0.5870 * g + 0.1140 * b 202 | 203 | return gray 204 | 205 | def preprocess(frame): 206 | 207 | gray_image = rgb2gray(frame) 208 | reshaped_image = cv2.resize(gray_image.astype(np.float32), (84, 84)) 209 | x = np.reshape(reshaped_image, [84,84,1]) 210 | x *= 1 / 255.0 211 | 212 | return x 213 | 214 | def rollout(sess, prediction_net): 215 | 216 | 217 | #tf.summary.scalar("Qval", encode) 218 | merged_summary_op = tf.summary.merge_all() 219 | 220 | sess.run(tf.initialize_all_variables()) 221 | saver = tf.train.Saver(tf.all_variables()) 222 | 223 | checkpoint = tf.train.latest_checkpoint(checkpoint_dir) 224 | if checkpoint: 225 | saver.restore(sess, checkpoint) 226 | print("Loaded model checkpoint {}...".format(checkpoint)) 227 | 228 | #summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph()) 229 | 230 | D = deque() 231 | num_episodes = 0 232 | k = 0 233 | 234 | while num_episodes < MAX_EPISODES: 235 | ob = env.reset() 236 | 237 | obf = preprocess(ob) 238 | s_t = np.reshape(np.stack((obf, obf, obf, obf), axis=2), (84, 84, 4)) 239 | observations, actions = [], [] 240 | 241 | i = 0 242 | 243 | for t in range(10000): 244 | env.render() #optional 245 | 246 | if i == 0: 247 | #action_id = env.action_space.sample() 248 | #action_id = random.randint(0,5) 249 | action_id = 0 250 | action_vector = np.zeros(ACTIONS) 251 | action_vector[action_id] = 1 252 | #actions.append(action_vector) 253 | 254 | ob, reward, done, info = env.step(action_id) 255 | 256 | obf = preprocess(ob) 257 | 258 | s_t1 = np.append(obf, s_t[:,:,0:3], axis = 2) 259 | 260 | # if training, collect data and apply learning updates 261 | if flags.FLAGS.train: 262 | # storing current state and the next frame 263 | D.append((s_t, action_vector, obf)) 264 | if len(D) > REPLAY_MEMORY: 265 | D.popleft() 266 | 267 | if num_episodes > 32: 268 | minibatch = random.sample(D, BATCH) 269 | action_batch = [d[1] for d in minibatch] 270 | state_batch = [d[0] for d in minibatch] 271 | target_batch = [d[2] for d in minibatch] 272 | target_batch = np.reshape(target_batch, (BATCH, 84, 84)) 273 | 274 | pred_batch = prediction_net.predict(sess, np.reshape(state_batch, (BATCH, 84, 84, 4)), np.reshape(action_batch, (BATCH, 6))) 275 | 276 | loss = prediction_net.update(sess, state_batch, action_batch, target_batch, pred_batch) 277 | 278 | #summary_writer.add_summary(summary, num_episodes) 279 | 280 | print("\riteration {} @ Episode {}/{}, loss {}".format(k, num_episodes, MAX_EPISODES, loss), end="") 281 | sys.stdout.flush() 282 | 283 | if k % 1000 == 0: 284 | print("\nsaving model now") 285 | saver.save(sess, 
save_path, global_step = t) 286 | 287 | # display the first frame of the minibatch 288 | cv2.imshow("prediction", pred_batch[0]) 289 | cv2.imshow("target", target_batch[0]) 290 | cv2.imshow("input", state_batch[0][:,:,0]) 291 | cv2.waitKey(5) 292 | 293 | else: 294 | #render video frames while testing 295 | prediction = prediction_net.predict(sess, np.reshape(s_t, (1, 84, 84, 4)), np.reshape(action_vector, (1, 6))) 296 | #print("prediction shape", prediction[0]) 297 | cv2.imshow("prediction", prediction[0]) 298 | cv2.waitKey(1) 299 | 300 | k += 1 301 | 302 | if i == 3: #maybe change to 4 303 | i = 0 304 | else: 305 | i +=1 306 | 307 | s_t = s_t1 308 | 309 | if done: 310 | num_episodes += 1 311 | break 312 | 313 | 314 | sess = tf.InteractiveSession() 315 | 316 | checkpoint_dir = '/home/manan/Downloads/models3/' 317 | save_path = '/home/manan/Downloads/models/video_prediction.ckpt' 318 | #save_path = '/home/manan/Downloads/models3/video_prediction.ckpt' 319 | #load_path='/home/manan/Downloads/models3/video_prediction.ckpt-302' 320 | logs_path = '/tmp/tensorboard_example' 321 | 322 | prediction_net = autoencoder() 323 | rollout(sess, prediction_net) 324 | '''Pong : Actions 2,4 : up 325 | 3,5 : down 326 | 0,1 : no movement''' 327 | 328 | # basic code for simulating random policy 329 | '''for i_episode in range(2): 330 | observation = env.reset() 331 | ob = preprocess(observation) 332 | print(ob.shape) 333 | for t in range(10000) 334 | env.render() 335 | print(observation) 336 | if random.random() < epsilon: 337 | action = env.action_space.sample() 338 | else: 339 | action = 1 340 | observation, reward, done, info = env.step(action) 341 | #print(action) 342 | if done == True: 343 | print("Episode finished") 344 | break''' 345 | -------------------------------------------------------------------------------- /frame_prediction_atari/tensorboard: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/frame_prediction_atari/tensorboard -------------------------------------------------------------------------------- /frame_prediction_atari/test_autoencoder_alter.py: -------------------------------------------------------------------------------- 1 | 2 | ''' We use downsampled gray scale images - 84 X 84, 3 | consider only every 4th frame as input, applying 4 | the same action for the intermediate frames. 5 | Minibatch size is taken to be 32. Each input 6 | consists of a fixed memory of T = 4 to unroll 7 | each trajectory and pass in as an input. 
K, which 8 | is the prediction step parameter, taken to be 1''' 9 | 10 | ''' latest model is stored at /Downloads/models3/ ''' 11 | 12 | import gym 13 | import numpy as np 14 | import tensorflow as tf 15 | from scipy.misc import imresize 16 | import random 17 | from collections import deque 18 | import cv2 19 | 20 | epsilon = 0.35 21 | MAX_EPISODES = 10000 22 | BATCH = 8 23 | max_iter = 10000 24 | ACTIONS = 6 25 | FACTORS = 2048 26 | REPLAY_MEMORY = 50 27 | 28 | def weight_variable(shape): 29 | initial = tf.truncated_normal(shape, stddev = 0.01) 30 | return tf.Variable(initial) 31 | 32 | def bias_variable(shape): 33 | initial = tf.constant(0.0, shape = shape) 34 | return tf.Variable(initial) 35 | 36 | def conv2d(x, W, stride): 37 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") 38 | 39 | def conv2d_nopad(x, W, stride): 40 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 41 | 42 | def deconv2d(x, W, output_shape, stride): 43 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "SAME") 44 | 45 | def deconv2d_nopad(x, W, output_shape, stride): 46 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "VALID") 47 | 48 | def max_pool_2x2(x): 49 | return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 50 | 51 | def autoencoder(): 52 | 53 | # input - Batch X 84 X 84 X 4 54 | state = tf.placeholder("float", [BATCH, 84, 84, 4]) 55 | action = tf.placeholder("float", [BATCH, ACTIONS]) 56 | 57 | # 6 X 6 X 4 x 64 - stride 2 58 | W_conv1 = weight_variable([6, 6, 4, 64]) 59 | wconv = tf.get_variable("wconv", shape=[6, 6, 4, 64], initializer=tf.contrib.layers.xavier_initializer()) 60 | b_conv1 = bias_variable([64]) 61 | 62 | # 6 X 6 X 64 x 64 - stride 2 63 | W_conv2 = weight_variable([6, 6, 64, 64]) 64 | b_conv2 = bias_variable([64]) 65 | 66 | # 6 X 6 X 64 x 64 - stride 2 67 | W_conv3 = weight_variable([6, 6, 64, 64]) 68 | b_conv3 = bias_variable([64]) 69 | 70 | # _*16 ie. 
flattened output from conv3 71 | W_fc1 = weight_variable([10*10*64, 1024]) 72 | b_fc1 = bias_variable([1024]) 73 | 74 | #second fully connected layer - 2048 units 75 | W_fc2 = weight_variable([1024, 2048]) 76 | b_fc2 = bias_variable([2048]) 77 | 78 | #W_fc2 = weight_variable([256, ACTIONS]) 79 | #b_fc2 = bias_variable([ACTIONS]) 80 | 81 | conv1 = tf.nn.relu(conv2d_nopad(state, wconv, 2) + b_conv1) 82 | #padded_conv1 = tf.pad(conv1, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 83 | #print("padded shape", padded_conv1.shape) 84 | 85 | conv2 = tf.nn.relu(conv2d(conv1, W_conv2, 2) + b_conv2) 86 | #padded_conv2 = tf.pad(conv2, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 87 | 88 | conv3 = tf.nn.relu(conv2d(conv2, W_conv3, 2) + b_conv3) 89 | 90 | conv3_flat = tf.reshape(conv3, [-1, 10*10*64]) 91 | fc1 = tf.nn.relu(tf.matmul(conv3_flat, W_fc1) + b_fc1) 92 | fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2) 93 | 94 | # 6 X 6 X 4 x 64 - stride 2 95 | W_enc = weight_variable([FACTORS, 2048]) 96 | W_dec = weight_variable([2048, FACTORS]) 97 | W_action = weight_variable([FACTORS, ACTIONS]) 98 | b_interactions = bias_variable([2048]) 99 | 100 | #W_henc = tf.matmul(W_enc, fc2) 101 | #W_a = tf.matmul(W_action, action) 102 | #fc_interactions = tf.matmul(W_dec, tf.multiply(W_henc, W_a)) + b_interactions 103 | 104 | W_henc = tf.matmul(fc2, tf.transpose(W_enc)) 105 | W_a = tf.matmul(action, tf.transpose(W_action)) 106 | fc_interactions = tf.matmul(tf.multiply(W_henc, W_a), tf.transpose(W_dec)) + b_interactions 107 | 108 | # first fully connected layer after multiplicative interaction- 2048 109 | W_fc3 = weight_variable([2048, 1024]) 110 | b_fc3 = bias_variable([1024]) 111 | 112 | # second fully connected layer after multiplicative interaction- 1024 units 113 | W_fc4 = weight_variable([1024, 10*10*64]) 114 | b_fc4 = bias_variable([10*10*64]) 115 | 116 | #fc3 = tf.nn.relu(tf.matmul(fc_interactions, W_fc3) + b_fc3) 117 | # TRYING OUT AN ALL CONV. 
NET 118 | fc3 = tf.nn.relu(tf.matmul(fc_interactions, W_fc3) + b_fc3) 119 | fc4 = tf.nn.relu(tf.matmul(fc3, W_fc4) + b_fc4) 120 | 121 | # reshaping into a 4-D matrix 122 | fc4_matrix = tf.reshape(fc4, [-1, 10, 10, 64]) 123 | 124 | # deconv variables 125 | W_deconv1 = weight_variable([6, 6, 64, 64]) 126 | b_deconv1 = bias_variable([64]) 127 | 128 | W_deconv2 = weight_variable([6, 6, 64, 64]) 129 | b_deconv2 = bias_variable([64]) 130 | 131 | W_deconv3 = weight_variable([6, 6, 1, 64]) 132 | b_deconv3 = bias_variable([1]) 133 | 134 | # output - 1 x 84 84 135 | deconv1 = tf.nn.relu(deconv2d(fc4_matrix, W_deconv1, (BATCH, 20, 20, 64), 2) + b_deconv1) 136 | deconv2 = tf.nn.relu(deconv2d(deconv1, W_deconv2, (BATCH, 40, 40, 64), 2) + b_deconv2) 137 | deconv3 = deconv2d_nopad(deconv2, W_deconv3, (BATCH, 84, 84, 1), 2) + b_deconv3 138 | 139 | 140 | #encode = tf.reshape(tf.image.resize_images(deconv3, [84, 84]), [-1, 84, 84]) 141 | encode = tf.reshape(deconv3, [-1, 84, 84]) 142 | 143 | return state, action, encode 144 | 145 | def preprocess(frame): 146 | gray_image = frame.mean(2) 147 | reshaped_image = imresize(gray_image, (84,84)) 148 | x = np.reshape(reshaped_image, [84,84,1]).astype(np.float32) 149 | x *= (1.0 / 128.0) 150 | # divide by 255 151 | ''' clipping code here ''' 152 | 153 | return x 154 | 155 | def rollout(state, action, encode): 156 | 157 | # reshape the predicted frame 158 | '''reshape code here''' 159 | 160 | y = tf.placeholder("float", [BATCH, 84, 84]) 161 | pred_frame = encode 162 | cost = tf.square(tf.norm(y - pred_frame)) 163 | train_step = tf.train.RMSPropOptimizer(1e-4).minimize(cost) 164 | 165 | print("working") 166 | sess.run(tf.initialize_all_variables()) 167 | saver = tf.train.Saver(tf.all_variables()) 168 | #saver.restore(sess, load_path) 169 | #print("variables restored and loaded...") 170 | 171 | D = deque() 172 | num_episodes = 0 173 | k = 0 174 | 175 | while num_episodes < MAX_EPISODES: 176 | ob = env.reset() 177 | 178 | obf = preprocess(ob) 179 | s_t = np.reshape(np.stack((obf, obf, obf, obf), axis=2), (84, 84, 4)) 180 | observations, actions = [], [] 181 | 182 | i = 1 183 | print("num of episodes", num_episodes) 184 | 185 | for t in range(10000): 186 | env.render() #optional 187 | 188 | if i == 1: 189 | #action_id = env.action_space.sample() 190 | action_id = 0 191 | action_vector = np.zeros(ACTIONS) 192 | action_vector[action_id] = 1 193 | actions.append(action_vector) 194 | #print("action size sample", action_vector) 195 | 196 | ob, reward, done, info = env.step(action_id) 197 | #if i == 1: 198 | # cv2.imshow("image", preprocess(ob)) 199 | # cv2.waitKey() 200 | #i += 1 201 | 202 | obf = preprocess(ob) 203 | s_t1 = np.append(obf, s_t[:,:,0:3], axis = 2) 204 | #observations.append(s_t1) 205 | ''' uncomment for training ''' 206 | if i == 1: 207 | observations.append(s_t1) 208 | #D.append((s_t, action_vector, obf)) 209 | #if len(D) > REPLAY_MEMORY: 210 | # D.popleft() 211 | 212 | if i == 3: #maybe change to 4 213 | i = 1 214 | else: 215 | i +=1 216 | 217 | ''' comment for training ''' 218 | '''prediction = encode.eval(feed_dict = {state : np.reshape(s_t, (1, 84, 84, 4)), action : np.reshape(action_vector, (1, 6))}) 219 | print("prediction shape", prediction[0]) 220 | cv2.imshow("prediction", prediction[0]) 221 | cv2.waitKey(1)''' 222 | 223 | s_t = s_t1 224 | 225 | #D.append((observations, actions)) 226 | #print("observations length", D[0][0].shape) 227 | 228 | #print("deque length", len(D[0][0])) 229 | 230 | ''' uncomment for training ''' 231 | #k = 0 232 | #while k < 
max_iter: 233 | if num_episodes > 32: 234 | 235 | minibatch = random.sample(D, BATCH) 236 | action_batch = [d[1] for d in minibatch] 237 | state_batch = [d[0] for d in minibatch] 238 | 239 | #print("state_batch shape" + str(state_batch[0][0].shape)) 240 | # the first frame of the second set of observations 241 | idx = random.randint(1, 300) 242 | target_batch = [d[idx][:,:,0] for d in state_batch] 243 | #print("target_batch shape" + str(target_batch[0].shape)) 244 | # the first set of 4 frames 245 | input_batch = [d[idx-1] for d in state_batch] 246 | #print("input_batch shape" + str(input_batch[0].shape)) 247 | action_input_batch = [d[0] for d in action_batch] 248 | 249 | # unroll 250 | for j in range(3): 251 | pred_batch = encode.eval(feed_dict = {action : np.reshape(action_input_batch, (BATCH, 6)), 252 | state : np.reshape(input_batch, (BATCH, 84, 84, 4))}) 253 | 254 | train_step.run(feed_dict = { 255 | y : target_batch, 256 | pred_frame : pred_batch, 257 | state : input_batch, 258 | action : action_input_batch}) 259 | loss = cost.eval(feed_dict = {y : target_batch, 260 | pred_frame : pred_batch, 261 | state : input_batch, 262 | action : action_input_batch}) 263 | 264 | print("iteration : ", k) 265 | print("loss : ", loss) 266 | #print("j is :", j) 267 | 268 | if k % 1000 == 0: 269 | print("saving model now") 270 | saver.save(sess, save_path, global_step = t) 271 | 272 | #if k == max_iter - 1: 273 | cv2.imshow("prediction", pred_batch[0]) 274 | cv2.imshow("target", target_batch[0]) 275 | cv2.imshow("input", input_batch[0][:,:,0]) 276 | #if k % 500 == 0: 277 | #cv2.imwrite('prediction%s.jpg' %k, pred_batch[0]) 278 | cv2.waitKey(5) 279 | 280 | k += 1 281 | 282 | pred_batch = np.reshape(pred_batch, (BATCH, 84, 84, 1)) 283 | target_batch = [d[idx][:,:,j+1] for d in state_batch] 284 | temp = [d[:,:,0:3] for d in input_batch] 285 | #print("pred_batch shape", pred_batch.shape) 286 | #print("temp shape", len(temp), temp[0].shape) 287 | input_batch = np.append(pred_batch, temp, axis = 3) 288 | 289 | if done: 290 | num_episodes += 1 291 | D.append((observations, actions)) 292 | if len(D) > REPLAY_MEMORY: 293 | D.popleft() 294 | break 295 | 296 | env = gym.make('Pong-v0') 297 | sess = tf.InteractiveSession() 298 | save_path = '/home/manan/Downloads/models3/video_prediction.ckpt' 299 | load_path='/home/manan/Downloads/models3/video_prediction.ckpt-1000' 300 | 301 | state, action, encode = autoencoder() 302 | rollout(state, action, encode) 303 | '''Pong : Actions 2,4 : up 304 | 3,5 : down 305 | 0,1 : no movement''' 306 | 307 | '''for i_episode in range(2): 308 | observation = env.reset() 309 | ob = preprocess(observation) 310 | print(ob.shape) 311 | for t in range(10000) 312 | env.render() 313 | print(observation) 314 | if random.random() < epsilon: 315 | action = env.action_space.sample() 316 | else: 317 | action = 1 318 | observation, reward, done, info = env.step(action) 319 | #print(action) 320 | if done == True: 321 | print("Episode finished") 322 | break''' 323 | -------------------------------------------------------------------------------- /frame_prediction_atari/test_multistep_autoencoder.py: -------------------------------------------------------------------------------- 1 | 2 | ''' We use downsampled gray scale images - 84 X 84, 3 | consider only every 4th frame as input, applying 4 | the same action for the intermediate frames. 5 | Minibatch size is taken to be 32. Each input 6 | consists of a fixed memory of T = 4 to unroll 7 | each trajectory and pass in as an input. 
K, which 8 | is the prediction step parameter, taken to be 1''' 9 | 10 | ''' latest model is stored at /Downloads/models3/ ''' 11 | 12 | import gym 13 | import numpy as np 14 | import tensorflow as tf 15 | from scipy.misc import imresize 16 | import random 17 | from collections import deque 18 | import cv2 19 | 20 | epsilon = 0.35 21 | MAX_EPISODES = 10000 22 | BATCH = 32 23 | max_iter = 10000 24 | ACTIONS = 6 25 | FACTORS = 2048 26 | REPLAY_MEMORY = 1000000 27 | num_steps = 3 28 | H = 4 29 | 30 | def weight_variable(shape): 31 | initial = tf.truncated_normal(shape, stddev = 0.01) 32 | return tf.Variable(initial) 33 | 34 | def bias_variable(shape): 35 | initial = tf.constant(0.0, shape = shape) 36 | return tf.Variable(initial) 37 | 38 | def conv2d(x, W, stride): 39 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") 40 | 41 | def conv2d_nopad(x, W, stride): 42 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 43 | 44 | def deconv2d(x, W, output_shape, stride): 45 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "SAME") 46 | 47 | def deconv2d_nopad(x, W, output_shape, stride): 48 | return tf.nn.conv2d_transpose(x, W, output_shape, strides = [1, stride, stride, 1], padding = "VALID") 49 | 50 | def max_pool_2x2(x): 51 | return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 52 | 53 | def autoencoder(): 54 | 55 | # input - Batch X 84 X 84 X 4 56 | state = tf.placeholder("float", [BATCH, 84, 84, 4]) 57 | action = tf.placeholder("float", [BATCH, ACTIONS]) 58 | 59 | # 6 X 6 X 4 x 64 - stride 2 60 | W_conv1 = weight_variable([6, 6, 4, 64]) 61 | wconv = tf.get_variable("wconv", shape=[6, 6, 4, 64], initializer=tf.contrib.layers.xavier_initializer()) 62 | b_conv1 = bias_variable([64]) 63 | 64 | # 6 X 6 X 64 x 64 - stride 2 65 | W_conv2 = weight_variable([6, 6, 64, 64]) 66 | b_conv2 = bias_variable([64]) 67 | 68 | # 6 X 6 X 64 x 64 - stride 2 69 | W_conv3 = weight_variable([6, 6, 64, 64]) 70 | b_conv3 = bias_variable([64]) 71 | 72 | # _*16 ie. 
flattened output from conv3 73 | W_fc1 = weight_variable([10*10*64, 1024]) 74 | b_fc1 = bias_variable([1024]) 75 | 76 | #second fully connected layer - 2048 units 77 | W_fc2 = weight_variable([1024, 2048]) 78 | b_fc2 = bias_variable([2048]) 79 | 80 | #W_fc2 = weight_variable([256, ACTIONS]) 81 | #b_fc2 = bias_variable([ACTIONS]) 82 | 83 | conv1 = tf.nn.relu(conv2d_nopad(state, wconv, 2) + b_conv1) 84 | #padded_conv1 = tf.pad(conv1, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 85 | #print("padded shape", padded_conv1.shape) 86 | 87 | conv2 = tf.nn.relu(conv2d(conv1, W_conv2, 2) + b_conv2) 88 | #padded_conv2 = tf.pad(conv2, [[0, 0], [2, 2], [2, 2], [0, 0]], "CONSTANT") 89 | 90 | conv3 = tf.nn.relu(conv2d(conv2, W_conv3, 2) + b_conv3) 91 | 92 | conv3_flat = tf.reshape(conv3, [-1, 10*10*64]) 93 | fc1 = tf.nn.relu(tf.matmul(conv3_flat, W_fc1) + b_fc1) 94 | fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2) 95 | 96 | # 6 X 6 X 4 x 64 - stride 2 97 | W_enc = weight_variable([FACTORS, 2048]) 98 | W_dec = weight_variable([2048, FACTORS]) 99 | W_action = weight_variable([FACTORS, ACTIONS]) 100 | b_interactions = bias_variable([2048]) 101 | 102 | #W_henc = tf.matmul(W_enc, fc2) 103 | #W_a = tf.matmul(W_action, action) 104 | #fc_interactions = tf.matmul(W_dec, tf.multiply(W_henc, W_a)) + b_interactions 105 | 106 | W_henc = tf.matmul(fc_interactions, tf.transpose(W_enc)) 107 | W_a = tf.matmul(action, tf.transpose(W_action)) 108 | fc_interactions = tf.matmul(tf.multiply(W_henc, W_a), tf.transpose(W_dec)) + b_interactions 109 | 110 | # first fully connected layer after multiplicative interaction- 2048 111 | W_fc3 = weight_variable([2048, 1024]) 112 | b_fc3 = bias_variable([1024]) 113 | 114 | # second fully connected layer after multiplicative interaction- 1024 units 115 | W_fc4 = weight_variable([1024, 10*10*64]) 116 | b_fc4 = bias_variable([10*10*64]) 117 | 118 | #fc3 = tf.nn.relu(tf.matmul(fc_interactions, W_fc3) + b_fc3) 119 | # TRYING OUT AN ALL CONV. 
NET 120 | fc3 = tf.nn.relu(tf.matmul(fc2, W_fc3) + b_fc3) 121 | fc4 = tf.nn.relu(tf.matmul(fc3, W_fc4) + b_fc4) 122 | 123 | # reshaping into a 4-D matrix 124 | fc4_matrix = tf.reshape(fc4, [-1, 10, 10, 64]) 125 | 126 | # deconv variables 127 | W_deconv1 = weight_variable([6, 6, 64, 64]) 128 | b_deconv1 = bias_variable([64]) 129 | 130 | W_deconv2 = weight_variable([6, 6, 64, 64]) 131 | b_deconv2 = bias_variable([64]) 132 | 133 | W_deconv3 = weight_variable([6, 6, 1, 64]) 134 | b_deconv3 = bias_variable([1]) 135 | 136 | # output - 1 x 84 84 137 | deconv1 = tf.nn.relu(deconv2d(fc4_matrix, W_deconv1, (BATCH, 20, 20, 64), 2) + b_deconv1) 138 | deconv2 = tf.nn.relu(deconv2d(deconv1, W_deconv2, (BATCH, 40, 40, 64), 2) + b_deconv2) 139 | deconv3 = deconv2d_nopad(deconv2, W_deconv3, (BATCH, 84, 84, 1), 2) + b_deconv3 140 | 141 | 142 | #encode = tf.reshape(tf.image.resize_images(deconv3, [84, 84]), [-1, 84, 84]) 143 | encode = tf.reshape(deconv3, [-1, 84, 84]) 144 | 145 | return state, action, encode 146 | 147 | def preprocess(frame): 148 | gray_image = frame.mean(2) 149 | reshaped_image = imresize(gray_image, (84,84)) 150 | x = np.reshape(reshaped_image, [84,84,1]).astype(np.float32) 151 | x *= (1.0 / 255.0) 152 | # divide by 255 153 | ''' clipping code here ''' 154 | 155 | return x 156 | 157 | def rollout(state, action, encode): 158 | 159 | # reshape the predicted frame 160 | '''reshape code here''' 161 | 162 | y = tf.placeholder("float", [BATCH, 84, 84]) 163 | pred_frame = encode 164 | cost = tf.square(tf.norm(y - pred_frame)) 165 | train_step = tf.train.AdamOptimizer(1e-4).minimize(cost) 166 | 167 | print("working") 168 | sess.run(tf.initialize_all_variables()) 169 | saver = tf.train.Saver(tf.all_variables()) 170 | #saver.restore(sess, load_path) 171 | #print("variables restored and loaded...") 172 | 173 | D = deque() 174 | num_episodes = 0 175 | k = 0 176 | 177 | while num_episodes < MAX_EPISODES: 178 | ob = env.reset() 179 | 180 | obf = preprocess(ob) 181 | s = () 182 | for i in range(num_steps + 4): 183 | s += obf 184 | s_t = np.reshape(np.stack(s, axis=2), (84, 84, H)) 185 | observations, actions = [], [] 186 | 187 | i = 0 188 | print("num of episodes", num_episodes) 189 | 190 | for t in range(10000): 191 | env.render() #optional 192 | 193 | if i == 0: 194 | #action_id = env.action_space.sample() 195 | action_id = 0 196 | action_vector = np.zeros(ACTIONS) 197 | action_vector[action_id] = 1 198 | #actions.append(action_vector) 199 | #print("action size sample", action_vector) 200 | 201 | ob, reward, done, info = env.step(action_id) 202 | #if i == 1: 203 | # cv2.imshow("image", preprocess(ob)) 204 | # cv2.waitKey() 205 | #i += 1 206 | 207 | obf = preprocess(ob) 208 | 209 | #observations.append(s_t1) 210 | ''' uncomment for training ''' 211 | #if i == 0: 212 | s_t = np.append(obf, s_t[:,:,0:num_steps + H - 1], axis = 2) 213 | x_t = s_t[:,:,num_steps + H - 1:] 214 | y_t = s_t[:,:,0:num_steps + H -1] 215 | D.append((x_t, action_vector, y_t)) 216 | if len(D) > REPLAY_MEMORY: 217 | D.popleft() 218 | 219 | if i == H - 1: #maybe change to 4 220 | i = 0 221 | else: 222 | i +=1 223 | 224 | ''' comment for training ''' 225 | '''prediction = encode.eval(feed_dict = {state : np.reshape(s_t, (1, 84, 84, 4)), action : np.reshape(action_vector, (1, 6))}) 226 | print("prediction shape", prediction[0]) 227 | cv2.imshow("prediction", prediction[0]) 228 | cv2.waitKey(1)''' 229 | 230 | s_t = s_t1 231 | 232 | #D.append((observations, actions)) 233 | #print("observations length", D[0][0].shape) 234 | 235 | 
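# Training (below, once enough episodes are stored): train() samples a
# minibatch of (stacked input frames, one-hot action, target frames) tuples
# from D and unrolls the predictor num_steps times, appending each predicted
# frame back onto the input stack before making the next prediction.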
#print("deque length", len(D[0][0])) 236 | 237 | ''' uncomment for training ''' 238 | #k = 0 239 | #while k < max_iter: 240 | if num_episodes > 20: 241 | 242 | minibatch = random.sample(D, BATCH) 243 | train(minibatch, k) 244 | 245 | k += 1 246 | 247 | if done: 248 | num_episodes += 1 249 | break 250 | 251 | def train(minibatch, k): 252 | 253 | action_batch = [d[1] for d in minibatch] 254 | x_batch = [d[0] for d in minibatch] 255 | y_batch = [d[2] for d in minibatch] 256 | target_batch = np.reshape(y_batch, (BATCH, 84, 84)) 257 | 258 | # include changing actions in predictions as well 259 | for i in range(num_steps): 260 | pred_batch = encode.eval(feed_dict = {action : np.reshape(action_batch, (BATCH, 6)), 261 | state : np.reshape(x_batch, (BATCH, 84, 84, H))}) 262 | x_batch.pop(0) 263 | x_batch.append(pred_batch) 264 | train_step.run(feed_dict = { 265 | y : target_batch[:,:,:,i], 266 | pred_frame : pred_batch, 267 | state : x_batch, 268 | action : action_batch}) 269 | loss = cost.eval(feed_dict = {y : target_batch[:,:,:,i], 270 | pred_frame : pred_batch, 271 | state : x_batch, 272 | action : action_batch}) 273 | 274 | print("iteration : ", k) 275 | print("loss : ", loss) 276 | 277 | if k % 1000 == 0: 278 | print("saving model now") 279 | saver.save(sess, save_path, global_step = t) 280 | 281 | #if k == max_iter - 1: 282 | cv2.imshow("prediction", pred_batch[0]) 283 | cv2.imshow("target", target_batch[0]) 284 | cv2.imshow("input", state_batch[0][:,:,0]) 285 | #if k % 500 == 0: 286 | #cv2.imwrite('prediction%s.jpg' %k, pred_batch[0]) 287 | cv2.waitKey(5) 288 | 289 | env = gym.make('MsPacman-v0') 290 | sess = tf.InteractiveSession() 291 | save_path = '/home/manan/Downloads/models3/video_prediction.ckpt' 292 | load_path='/home/manan/Downloads/models3/video_prediction.ckpt-1000' 293 | 294 | state, action, encode = autoencoder() 295 | rollout(state, action, encode) 296 | '''Pong : Actions 2,4 : up 297 | 3,5 : down 298 | 0,1 : no movement''' 299 | 300 | '''for i_episode in range(2): 301 | observation = env.reset() 302 | ob = preprocess(observation) 303 | print(ob.shape) 304 | for t in range(10000) 305 | env.render() 306 | print(observation) 307 | if random.random() < epsilon: 308 | action = env.action_space.sample() 309 | else: 310 | action = 1 311 | observation, reward, done, info = env.step(action) 312 | #print(action) 313 | if done == True: 314 | print("Episode finished") 315 | break''' 316 | -------------------------------------------------------------------------------- /images/image_screenshot8_24.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/image_screenshot8_24.05.2017.png -------------------------------------------------------------------------------- /images/image_screenshot9_24.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/image_screenshot9_24.05.2017.png -------------------------------------------------------------------------------- /images/image_screenshot_23.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/image_screenshot_23.05.2017.png 
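All three frame-prediction scripts above condition the encoded frame features on the chosen action through the same multiplicative interaction (the W_enc, W_action and W_dec weights): the hidden features and the one-hot action are each projected into a common factor space, multiplied element-wise, and decoded back to the hidden size before the deconvolution stack. A minimal NumPy sketch of just that transformation follows; the shapes mirror the scripts (FACTORS = 2048, hidden size 2048, ACTIONS = 6), while the function name and the random placeholder weights are illustrative only, not trained parameters.

import numpy as np

BATCH, HIDDEN, FACTORS, ACTIONS = 32, 2048, 2048, 6

rng = np.random.RandomState(0)
W_enc = rng.randn(FACTORS, HIDDEN) * 0.01      # projects frame features into factor space
W_action = rng.randn(FACTORS, ACTIONS) * 0.01  # projects the one-hot action into the same space
W_dec = rng.randn(HIDDEN, FACTORS) * 0.01      # maps the gated factors back to the hidden size
b_interactions = np.zeros(HIDDEN)

def action_conditioned_features(h, a):
    """h: [BATCH, HIDDEN] encoder output (fc2); a: [BATCH, ACTIONS] one-hot actions."""
    h_factors = h.dot(W_enc.T)                  # [BATCH, FACTORS]
    a_factors = a.dot(W_action.T)               # [BATCH, FACTORS]
    gated = h_factors * a_factors               # element-wise multiplicative interaction
    return gated.dot(W_dec.T) + b_interactions  # [BATCH, HIDDEN], input to the decoder stack

h = rng.randn(BATCH, HIDDEN)
a = np.eye(ACTIONS)[rng.randint(ACTIONS, size=BATCH)]
print(action_conditioned_features(h, a).shape)  # (32, 2048)

Note that the scripts differ in whether this interaction is actually wired into the decoder: test_autoencoder_alter.py feeds fc_interactions into fc3, while the other two currently bypass it and feed fc2 directly, as the "TRYING OUT AN ALL CONV. NET" comments indicate.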
-------------------------------------------------------------------------------- /images/prediction_screenshot10_24.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/prediction_screenshot10_24.05.2017.png -------------------------------------------------------------------------------- /images/prediction_screenshot11_24.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/prediction_screenshot11_24.05.2017.png -------------------------------------------------------------------------------- /images/prediction_screenshot12_25.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/prediction_screenshot12_25.05.2017.png -------------------------------------------------------------------------------- /images/prediction_screenshot13_25.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/prediction_screenshot13_25.05.2017.png -------------------------------------------------------------------------------- /images/prediction_screenshot13_26.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/prediction_screenshot13_26.05.2017.png -------------------------------------------------------------------------------- /images/prediction_screenshot14_26.05.2017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/images/prediction_screenshot14_26.05.2017.png -------------------------------------------------------------------------------- /misc/caffe_atari_cnn.py: -------------------------------------------------------------------------------- 1 | def add_cnn(n, data, act, batch_size, T, K, num_step, mode='train'): 2 | # data : batch_size x T x 3 x height x width 3 | n.x_flat = L.Flatten(data, axis=1, end_axis=2) 4 | # n.x_flat : batch_size x T*3 x height x width 5 | n.act_flat = L.Flatten(act, axis=1, end_axis=2) 6 | if mode == 'train': 7 | x = L.Slice(n.x_flat, axis=1, ntop=T) 8 | # x : T layers of size : batch_size x 3 x height x width 9 | act_slice = L.Slice(n.act_flat, axis=1, ntop=T-1) 10 | x_set = () 11 | label_set = () 12 | x_hat_set = () 13 | silence_set = () 14 | for i in range(T): 15 | t = tag(i+1) 16 | # n.tops[x1] : batch_size x 3 x height x width 17 | # n.tops[x2] : batch_size x 3 x height x width 18 | n.tops['x'+t] = x[i] 19 | if i < K: 20 | # storing just the first four frames in x_set 21 | x_set += (x[i],) 22 | if i < T - 1: 23 | n.tops['act'+t] = act_slice[i] 24 | if i < K - 1: 25 | silence_set += (n.tops['act'+t],) 26 | if i >= K: 27 | # storing the fifth frame as the label 28 | label_set += (x[i],) 29 | # not important for 1 step prediction, 30 | # produces : batch_size x 3 x height x width 31 | n.label = L.Concat(*label_set, axis=0) 32 | # converting to list 33 | input_list = 
list(x_set) 34 | # not important as no. of steps is 1 35 | for step in range(0, num_step): 36 | step_tag = tag(step + 1) if step > 0 else '' 37 | t = tag(step + K) 38 | tp = tag(step + K + 1) 39 | input_tuple = tuple(input_list) 40 | # concatenating all 4 frames together 41 | n.tops['input'+step_tag] = L.Concat(*input_tuple, axis=1) 42 | # passing through the feed-forward net 43 | top = add_conv_enc(n, n.tops['input'+step_tag], tag=step_tag) 44 | n.tops['x_hat'+tp] = add_decoder(n, top, n.tops['act'+t], flatten=False, 45 | tag=step_tag) 46 | # using the predicted values to form the input for the next prediction 47 | input_list.pop(0) 48 | input_list.append(n.tops['x_hat'+tp]) 49 | else: 50 | top = add_conv_enc(n, n.x_flat) 51 | n.tops['x_hat'+tag(K+1)] = add_decoder(n, top, n.act_flat, flatten=False) 52 | if mode == 'train': 53 | x_hat = () 54 | # for 1 step prediciton, just runs once for i = 4 55 | for i in range(K, T): 56 | t = tag(i+1) 57 | # prediction for the 5th frame comes from the net 58 | x_hat += (n.tops['x_hat'+t],) 59 | # concatenate all predictions 60 | n.x_hat = L.Concat(*x_hat, axis=0) 61 | n.silence = L.Silence(*silence_set, ntop=0) 62 | # takes the predcition for the 5th frame and output label 63 | # both are of size batch_size x 3 x height x width 64 | n.l2_loss = L.EuclideanLoss(n.x_hat, n.label) 65 | return n 66 | -------------------------------------------------------------------------------- /misc/policy_gradients.py: -------------------------------------------------------------------------------- 1 | 2 | """ implements a simple policy gradient (actor critic technically) agent """ 3 | 4 | import argparse 5 | import gym 6 | import time 7 | from gym.spaces import Discrete 8 | import numpy as np 9 | from scipy.signal import lfilter 10 | from scipy.misc import imsave, imresize 11 | import tensorflow as tf 12 | import tensorflow.contrib.slim as slim 13 | 14 | parser = argparse.ArgumentParser(description=None) 15 | parser.add_argument('-e', '--env', default='Breakout-v3', type=str, help='gym environment') 16 | parser.add_argument('-b', '--batch_size', default=10000, type=int, help='batch size to use during learning') 17 | parser.add_argument('-l', '--learning_rate', default=1e-3, type=float, help='used for Adam') 18 | parser.add_argument('-g', '--discount', default=0.99, type=float, help='reward discount rate to use') 19 | parser.add_argument('-n', '--hidden_size', default=20, type=int, help='number of hidden units in net') 20 | parser.add_argument('-c', '--gradient_clip', default=40.0, type=float, help='clip at this max norm of gradient') 21 | parser.add_argument('-v', '--value_scale', default=0.5, type=float, help='scale of value function regression in loss') 22 | parser.add_argument('-t', '--entropy_scale', default=0, type=float, help='scale of entropy penalty in loss') 23 | parser.add_argument('-m', '--max_steps', default=10000, type=int, help='max number of steps to run for') 24 | args = parser.parse_args() 25 | print(args) 26 | 27 | # ----------------------------------------------------------------------------- 28 | def process_frame(frame): 29 | """ Atari specific preprocessing, consistent with DeepMind """ 30 | reshaped_screen = frame.astype(np.float32).mean(2) # grayscale 31 | resized_screen = imresize(reshaped_screen, (84, 110)) # downsample 32 | x = resized_screen[18:102, :] # crop top/bottom 33 | x = imresize(x, (42, 42)).astype(np.float32) # downsample 34 | x *= (1.0 / 255.0) # place in [0,1] 35 | x = np.reshape(x, [42, 42, 1]) # introduce channel 36 | return x 
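# NOTE: process_frame() returns a (42, 42, 1) float32 array scaled into [0, 1];
# rollout() below concatenates the previous and current processed frames along
# the channel axis, which is why the input placeholder x defined later has
# shape (None, 42, 42, 2).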
37 | 38 | def policy_spec(x): 39 | net = slim.conv2d(x, args.hidden_size, [5, 5], stride=2, padding='SAME', activation_fn=tf.nn.elu, scope='conv1') 40 | net = slim.conv2d(net, args.hidden_size, [5, 5], stride=2, padding='SAME', activation_fn=tf.nn.elu, scope='conv2') 41 | net = slim.flatten(net) 42 | action_logits = slim.fully_connected(net, num_actions, activation_fn=None, scope='fc_act') 43 | value_function = slim.fully_connected(net, 1, activation_fn=None, scope='fc_value') 44 | return action_logits, value_function 45 | 46 | def rollout(n, max_steps_per_episode=4500): 47 | """ gather a single episode with current policy """ 48 | 49 | observations, actions, rewards, discounted_rewards = [], [], [], [] 50 | ob = env.reset() 51 | ep_steps = 0 52 | num_episodes = 0 53 | ep_start_pointer = 0 54 | prev_obf = None 55 | while True: 56 | 57 | # we concatenate the previous frame to get some motion information 58 | obf_now = process_frame(ob) 59 | obf_before = obf_now if prev_obf is None else prev_obf 60 | obf = np.concatenate((obf_before, obf_now), axis=2) 61 | #obf = obf_now - obf_before 62 | prev_obf = obf_now 63 | 64 | # run the policy 65 | action = sess.run(action_index, feed_dict={x: np.expand_dims(obf, 0)}) # intro a batch dim 66 | action = action[0][0] # strip batch and #of samples from tf.multinomial 67 | 68 | # execute the action 69 | ob, reward, done, info = env.step(action) 70 | ep_steps += 1 71 | 72 | observations.append(obf) 73 | actions.append(action) 74 | rewards.append(reward) 75 | 76 | if done or ep_steps >= max_steps_per_episode: 77 | num_episodes += 1 78 | ep_steps = 0 79 | prev_obf = None 80 | discounted_rewards.append(discount(rewards[ep_start_pointer:], args.discount)) 81 | ep_start_pointer = len(rewards) 82 | ob = env.reset() 83 | if len(rewards) >= n: break 84 | 85 | return np.stack(observations), np.stack(actions), np.stack(rewards), np.concatenate(discounted_rewards), {'num_episodes':num_episodes} 86 | 87 | def discount(x, gamma): 88 | return lfilter([1],[1,-gamma],x[::-1])[::-1] 89 | # ----------------------------------------------------------------------------- 90 | 91 | # create the environment 92 | env = gym.make(args.env) 93 | num_actions = env.action_space.n 94 | 95 | # compile the model 96 | x = tf.placeholder(tf.float32, (None,) + (42,42,2), name='x') 97 | action_logits, value_function = policy_spec(x) 98 | action_index = tf.multinomial(action_logits - tf.reduce_max(action_logits, 1, keep_dims=True), 1) # take 1 sample 99 | # compile the loss: 1) the policy gradient 100 | sampled_actions = tf.placeholder(tf.int32, (None,), name='sampled_actions') 101 | discounted_reward = tf.placeholder(tf.float32, (None,), name='discounted_reward') 102 | pg_loss = tf.reduce_mean((discounted_reward - value_function) * tf.nn.sparse_softmax_cross_entropy_with_logits(logits=action_logits, labels=sampled_actions)) 103 | # and 2) the baseline (value function) regression piece 104 | value_loss = args.value_scale * tf.reduce_mean(tf.square(discounted_reward - value_function)) 105 | # and 3) entropy regularization 106 | action_log_prob = tf.nn.log_softmax(action_logits) 107 | entropy_loss = -args.entropy_scale * tf.reduce_sum(action_log_prob*tf.exp(action_log_prob)) 108 | # add up and minimize 109 | loss = pg_loss + value_loss + entropy_loss 110 | # create the optimizer 111 | optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) 112 | grads = tf.gradients(loss, tf.trainable_variables()) 113 | grads, _ = tf.clip_by_global_norm(grads, args.gradient_clip) # gradient clipping 
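# clip_by_global_norm rescales the whole gradient list when its global norm
# exceeds args.gradient_clip (40.0 by default here): every gradient is scaled
# by gradient_clip / max(global_norm, gradient_clip), preserving the update
# direction while bounding its overall magnitude.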
114 | grads_and_vars = list(zip(grads, tf.trainable_variables())) 115 | train_op = optimizer.apply_gradients(grads_and_vars) 116 | 117 | # tf init 118 | sess = tf.Session() 119 | sess.run(tf.initialize_all_variables()) 120 | n = 0 121 | mean_rewards = [] 122 | while n <= 100: # loop forever 123 | n += 1 124 | 125 | # collect a batch of data from rollouts and do forward/backward/update 126 | t0 = time.time() 127 | observations, actions, rewards, discounted_reward_np, info = rollout(args.batch_size) 128 | t1 = time.time() 129 | sess.run(train_op, feed_dict={x:observations, sampled_actions:actions, discounted_reward:discounted_reward_np}) 130 | t2 = time.time() 131 | 132 | average_reward = np.sum(rewards)/info['num_episodes'] 133 | mean_rewards.append(average_reward) 134 | print('step %d: collected %d frames in %fs, mean episode reward = %f (%d eps), update in %fs' % \ 135 | (n, observations.shape[0], t1-t0, average_reward, info['num_episodes'], t2-t1)) 136 | 137 | print(args) 138 | print('total average reward: %f +/- %f (min %f, max %f)' % \ 139 | (np.mean(mean_rewards), np.std(mean_rewards), np.min(mean_rewards), np.max(mean_rewards))) 140 | -------------------------------------------------------------------------------- /simple_dqn/naive_nips_dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import tensorflow as tf 3 | import tensorflow.contrib.slim as slim 4 | import numpy as np 5 | from scipy.misc import imresize 6 | from collections import deque 7 | import sys 8 | import random 9 | 10 | INITIAL_EPSILON = 0.1 11 | FINAL_EPSILON = 0.05 12 | REPLAY_MEMORY = 10000 13 | max_episodes = 5 14 | BATCH = 2 15 | GAMMA = 0.99 16 | TRAIN = 1 17 | 18 | 19 | def weight_variable(shape): 20 | initial = tf.truncated_normal(shape, stddev = 0.01) 21 | return tf.Variable(initial) 22 | 23 | def bias_variable(shape): 24 | initial = tf.constant(0.01, shape = shape) 25 | return tf.Variable(initial) 26 | 27 | def conv2d(x, W, stride): 28 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") 29 | 30 | def max_pool_2x2(x): 31 | return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 32 | 33 | def rgb2gray(frame): 34 | 35 | r, g, b = frame[:,:,0], frame[:,:,1], frame[:,:,2] 36 | gray = 0.2989 * r + 0.5870 * g + 0.1140 * b 37 | 38 | return gray 39 | 40 | def preprocess(frame): 41 | 42 | gray_image = rgb2gray(frame) 43 | reshaped_image = cv2.resize(gray_image.astype(np.float32), (84, 84)) 44 | x = np.reshape(reshaped_image, [84,84,1]) 45 | x *= 1 / 255.0 46 | 47 | return x 48 | 49 | class q_network(): 50 | 51 | def __init__(self, scope): 52 | 53 | self.scope = scope 54 | with tf.variable_scope(self.scope): 55 | 56 | self.build_net() 57 | 58 | 59 | def build_net(): 60 | 61 | x = tf.placeholder("float", [None, 84, 84, 4]) 62 | #print(x.shape) 63 | conv1 = tf.layers.conv2d(x, 32, [5, 5], padding="same", activation=tf.nn.relu) 64 | print(conv1.shape) 65 | pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) 66 | conv2 = tf.layers.conv2d(pool1, 64, [5, 5], padding="same", activation=tf.nn.relu) 67 | pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) 68 | print("passes pool2", pool2.shape) 69 | #pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64 * 9]) 70 | pool2_flat = tf.reshape(pool2, [-1, 64]) 71 | dense = tf.layers.dense(inputs=pool2_flat, units=512, activation=tf.nn.relu) 72 | #print("passes dense", dense.shape) 73 | action_logits = tf.layers.dense(inputs=dense, 
units=ACTIONS) 74 | print("passes logits", action_logits.shape) 75 | #conv1 = slim.conv2d(x, 10, [5,5], stride=2, padding='SAME', activation_fn=tf.nn.relu) 76 | #conv2 = slim.conv2d(conv1, 10, [5,5], stride=2, padding='SAME', activation_fn=tf.nn.relu) 77 | #net = slim.flatten(conv2) 78 | #action_logits = slim.fully_connected(net, ACTIONS, activation_fn=None) 79 | 80 | return action_logits 81 | 82 | 83 | def rollout(sess, max_iter=5000): 84 | 85 | observations, actions, rewards = [], [], [] 86 | ob = env.reset() 87 | ep_steps = 0 88 | num_episodes = 0 89 | epsilon = INITIAL_EPSILON 90 | 91 | ob_now = preprocess(ob) 92 | ob_prev = None 93 | t=0 94 | 95 | D = deque() 96 | a = tf.placeholder("float", [None, ACTIONS]) 97 | y = tf.placeholder("float", [None]) 98 | #s = tf.placeholder("float", [None, 84, 84, 2]) 99 | s = tf.placeholder("float", [None, 4, 4, 2]) 100 | 101 | readout_action = tf.reduce_sum(tf.multiply(dqn(s), a), reduction_indices = 1) 102 | cost = tf.reduce_mean(tf.square(y - readout_action)) 103 | train_step = tf.train.AdamOptimizer(1e-6).minimize(cost) 104 | 105 | sess.run(tf.initialize_all_variables()) 106 | 107 | while True or num_episodes < max_episodes: 108 | 109 | ob_before = ob_now if ob_prev is None else ob_prev 110 | obf_prev = np.concatenate((ob_before, ob_now), 2) 111 | ob_prev = ob_now 112 | 113 | action_index = 0 114 | action = np.zeros(ACTIONS, np.int32) 115 | print("action is ", action) 116 | if random.random() <= epsilon: 117 | action_index = random.randrange(ACTIONS) 118 | action[action_index] = 1 119 | else: 120 | action_index = np.argmax(dqn(obf_prev.astype(np.float32))) 121 | action[action_index] = 1 122 | 123 | if epsilon > FINAL_EPSILON: 124 | epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / 1000 125 | 126 | ob, reward, done, info = env.step(np.argmax(action)) 127 | ep_steps += 1 128 | 129 | ob_now = preprocess(ob) 130 | ob_before = ob_now if ob_prev is None else ob_prev 131 | obf_now = np.concatenate((ob_before, ob_now), 2) 132 | 133 | #observations.append(ob) 134 | #actions.append(action) 135 | #rewards.append(reward) 136 | 137 | D.append((obf_prev, action, reward, obf_now, done)) 138 | if len(D) > REPLAY_MEMORY: 139 | D.popleft() 140 | if t >= TRAIN: 141 | #training starts 142 | minibatch = random.sample(D, BATCH) 143 | 144 | # get the batch variables 145 | obf_prev_batch = [d[0] for d in minibatch] 146 | action_batch = [d[1] for d in minibatch] 147 | reward_batch = [d[2] for d in minibatch] 148 | obf_now_batch = [d[3] for d in minibatch] 149 | 150 | target_batch = [] 151 | #obf_batch = np.concatenate(ob_before, ob_now), 2) 152 | for i in range(0, len(minibatch)): 153 | 154 | if minibatch[i][4]: 155 | target_batch.append(reward_batch[i]) 156 | 157 | else: 158 | print("obf_prev_batch shape ", len(obf_prev_batch)) 159 | target_batch.append(reward_batch[i] + GAMMA*sess.run(tf.reduce_max(dqn(obf_now_batch[i].astype(np.float32))))) 160 | 161 | #print("obf_prev_batch", obf_prev_batch) 162 | obff = np.zeros((len(obf_prev_batch), 4, 4, 2)) 163 | for i,x in enumerate(obf_prev_batch): 164 | obff[i] = x 165 | #readout_t = s.eval(feed_dict = {s : obff})[0] 166 | print("reward", reward_batch) 167 | print("target_batch", target_batch[0]) 168 | target = np.zeros((len(target_batch))) 169 | for i,x in enumerate(target_batch): 170 | target[i] = x 171 | 172 | #print("reward", reward_batch[1]) 173 | train_step.run(feed_dict = { 174 | s : obff, 175 | a : action_batch, 176 | y : target}) 177 | 178 | 179 | ob_prev = ob_now 180 | t += 1 181 | 182 | if done or ep_steps >= max_iter: 
183 | num_episodes += 1 184 | ep_steps = 0 185 | ob_prev = None 186 | ob = env.reset() 187 | 188 | env = gym.make('Pong-v0') 189 | 190 | #x = tf.placeholder(tf.float32, name ) 191 | #sampled_actions = tf.placeholder(tf.int32) 192 | #discounted_reward = tf.placeholder(tf.float32) 193 | 194 | #action_logits = dqn(x) 195 | ACTIONS = env.action_space.n 196 | sess = tf.InteractiveSession() 197 | #s, action_logits = dqn() 198 | rollout(sess) 199 | #print(env.action_space.n) 200 | for i_episode in range(1): 201 | observation = env.reset() 202 | ob = preprocess(observation) 203 | obf = [] 204 | print(ob.shape) 205 | for t in range(10000): 206 | env.render() 207 | #print(observation) 208 | action = env.action_space.sample() 209 | observation, reward, done, info = env.step(action) 210 | obf.append(preprocess(observation)) 211 | #print(action) 212 | if done == True: 213 | print("Episode finished") 214 | break 215 | 216 | #print(obf[0]) 217 | #print(len(obf)) 218 | #obff = np.zeros((len(obf), 84, 84, 1)) 219 | #for i,x in enumerate(obf): 220 | # obff[i] = x 221 | #print(obff) 222 | -------------------------------------------------------------------------------- /simple_dqn/nature_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import gym 3 | from gym import wrappers 4 | import tensorflow as tf 5 | import tensorflow.contrib.slim as slim 6 | import numpy as np 7 | from scipy.misc import imresize 8 | from collections import deque 9 | import sys 10 | import os 11 | import random 12 | import cv2 13 | 14 | flags = tf.app.flags 15 | flags.DEFINE_boolean('train', True, 'Whether to do training or testing') 16 | flags.DEFINE_string('env_name', 'Pong', 'The name of gym environment to use') 17 | 18 | env = gym.make(flags.FLAGS.env_name + 'NoFrameskip-v0') 19 | 20 | ACTIONS = env.action_space.n 21 | INITIAL_EPSILON = 1. 
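# epsilon is annealed linearly inside rollout(): starting from INITIAL_EPSILON,
# it is reduced by (INITIAL_EPSILON - FINAL_EPSILON) / 100000 on every
# environment step until it reaches FINAL_EPSILON.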
22 | FINAL_EPSILON = 0.05 23 | REPLAY_MEMORY = 1000000 24 | max_episodes = 100000 25 | BATCH = 32 26 | GAMMA = 0.99 27 | max_iter = 5000 28 | 29 | def rgb2gray(frame): 30 | 31 | r, g, b = frame[:,:,0], frame[:,:,1], frame[:,:,2] 32 | gray = 0.2989 * r + 0.5870 * g + 0.1140 * b 33 | 34 | return gray 35 | 36 | def preprocess(frame): 37 | 38 | gray_image = rgb2gray(frame) 39 | reshaped_image = cv2.resize(gray_image.astype(np.float32), (84, 84)) 40 | x = np.reshape(reshaped_image, [84,84,1]) 41 | x *= 1 / 255.0 42 | 43 | return x 44 | 45 | def weight_variable(name, shape): 46 | initial = tf.contrib.layers.xavier_initializer() 47 | return tf.get_variable(name = name, shape = shape, initializer = initial) 48 | 49 | def bias_variable(shape): 50 | initial = tf.constant(0.01, shape = shape) 51 | return tf.Variable(initial) 52 | 53 | def conv2d(x, W, stride): 54 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 55 | 56 | def max_pool_2x2(x): 57 | return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME") 58 | 59 | class dqn(): 60 | 61 | def __init__(self, clip_delta, scope, discount): 62 | 63 | self.clip_delta = clip_delta 64 | self.scope = scope 65 | self.discount = discount 66 | 67 | with tf.variable_scope(self.scope): 68 | 69 | self.net = self.build_net() 70 | 71 | self.y = tf.placeholder("float", [None]) 72 | self.diff = self.y - tf.reduce_max(self.net, axis = 1) 73 | 74 | if self.clip_delta > 0: 75 | quadratic_part = tf.minimum(abs(self.diff), self.clip_delta) 76 | linear_part = abs(self.diff) - quadratic_part 77 | self.loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part 78 | else: 79 | self.loss = 0.5 * self.diff ** 2 80 | 81 | self.loss = tf.reduce_mean(self.loss) 82 | self.train_step = tf.train.AdamOptimizer(0.00025).minimize(self.loss) 83 | self.summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph()) 84 | self.summaries = tf.summary.merge_all(tf.summary.scalar("loss", self.loss)) 85 | 86 | def build_net(self, ): 87 | 88 | # input - Batch X 84 X 84 X 4 89 | self.s = tf.placeholder("float", [None, 84, 84, 4]) 90 | 91 | # 8 X 8 X 4 x 32 - stride 4 92 | W_conv1 = weight_variable("w1", [8, 8, 4, 32]) 93 | b_conv1 = bias_variable([32]) 94 | 95 | # 4 X 4 X 32 x 64 - stride 2 96 | W_conv2 = weight_variable("w2", [4, 4, 32, 64]) 97 | b_conv2 = bias_variable([64]) 98 | 99 | # 3 X 3 X 64 x 64 - stride 1 100 | W_conv3 = weight_variable("w3", [3, 3, 64, 64]) 101 | b_conv3 = bias_variable([64]) 102 | 103 | # 3*3*64 ie. flattened output from conv3 104 | W_fc1 = weight_variable("w4",[3136, 512]) 105 | b_fc1 = bias_variable([512]) 106 | 107 | W_fc2 = weight_variable("w5",[512, ACTIONS]) 108 | b_fc2 = bias_variable([ACTIONS]) 109 | 110 | conv1 = tf.nn.relu(conv2d(self.s, W_conv1, 4) + b_conv1) 111 | conv2 = tf.nn.relu(conv2d(conv1, W_conv2, 2) + b_conv2) 112 | conv3 = tf.nn.relu(conv2d(conv2, W_conv3, 1) + b_conv3) 113 | 114 | # flatten the output from conv3 layer 115 | conv3_flat = tf.reshape(conv3, [-1, 3136]) 116 | 117 | # add two fully connected layers 118 | fc1 = tf.nn.relu(tf.matmul(conv3_flat, W_fc1) + b_fc1) 119 | out_fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2) 120 | 121 | return out_fc2 122 | 123 | def copy_model_parameters(sess, estimator1, estimator2): 124 | """ 125 | Copies the model parameters of one estimator to another. 
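Here it is used in rollout() to refresh the target network from the online Q-network every 10000 global steps.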
126 | Args: 127 | sess: Tensorflow session instance 128 | estimator1: Estimator to copy the paramters from 129 | estimator2: Estimator to copy the parameters to 130 | """ 131 | e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)] 132 | e1_params = sorted(e1_params, key=lambda v: v.name) 133 | e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)] 134 | e2_params = sorted(e2_params, key=lambda v: v.name) 135 | 136 | update_ops = [] 137 | for e1_v, e2_v in zip(e1_params, e2_params): 138 | op = e2_v.assign(e1_v) 139 | update_ops.append(op) 140 | 141 | sess.run(update_ops) 142 | 143 | def rollout(sess, q_network, target_network): 144 | 145 | merged_summary_op = tf.summary.merge_all() 146 | q_summary = tf.Summary() 147 | 148 | num_episodes = 0 149 | epsilon = INITIAL_EPSILON 150 | 151 | replay_memory = deque() 152 | 153 | sess.run(tf.initialize_all_variables()) 154 | saver = tf.train.Saver(tf.all_variables()) 155 | 156 | checkpoint = tf.train.latest_checkpoint(checkpoint_dir) 157 | checkpoint_path = os.path.join(checkpoint_dir, "model") 158 | if checkpoint: 159 | saver.restore(sess, checkpoint) 160 | print("Loaded model checkpoint {}...".format(checkpoint)) 161 | 162 | print("collecting initial rollouts...") 163 | i = 0 164 | global_step = 0 165 | 166 | while num_episodes < max_episodes: 167 | 168 | ob = env.reset() 169 | steps_per_episode = 0 170 | reward_per_episode = 0 171 | 172 | ob_flkr = preprocess(ob) 173 | obf_flkr = np.reshape(np.stack((ob_flkr, ob_flkr), axis=2), (84, 84, 2)) 174 | obf = np.amax((obf_flkr[:,:,0], obf_flkr[:,:,1]), (0)) 175 | state = np.reshape(np.stack((obf, obf, obf, obf), axis=2), (84, 84, 4)) 176 | 177 | action_index = np.argmax(sess.run(q_network.net, feed_dict = {q_network.s : state.reshape((1, 84, 84, 4)) })) 178 | loss_per_episode = 0 179 | reward_per_episode = 0 180 | 181 | for t in range(10000): 182 | 183 | if epsilon > FINAL_EPSILON: 184 | epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / 100000 185 | 186 | ob, reward, done, info = env.step(action_index) 187 | 188 | REWARD = reward 189 | if reward > 1: 190 | REWARD = 1 191 | if reward < -1: 192 | REWARD = -1 193 | 194 | reward_per_episode += reward 195 | ob_flkr = preprocess(ob) 196 | obf_flkr = np.append(ob_flkr, obf_flkr[:,:,0:1], axis = 2) 197 | obf = np.amax((obf_flkr[:,:,0], obf_flkr[:,:,1]), (0)).reshape((84,84,1)) 198 | 199 | 200 | if i == 3: 201 | action_index = 0 202 | action = np.zeros(ACTIONS, np.int32) 203 | 204 | if random.random() <= epsilon: 205 | action_index = random.randrange(ACTIONS) 206 | action[action_index] = 1 207 | else: 208 | action_index = np.argmax(sess.run(q_network.net, feed_dict = {q_network.s : state.reshape((1, 84, 84, 4)) })) 209 | action[action_index] = 1 210 | 211 | next_state = np.append(obf, state[:,:,0:3], axis = 2) 212 | 213 | replay_memory.append((state, action, REWARD, next_state, done)) 214 | 215 | if len(replay_memory) > REPLAY_MEMORY: 216 | replay_memory.popleft() 217 | 218 | if global_step > 50: 219 | 220 | #training starts 221 | minibatch = random.sample(replay_memory, BATCH) 222 | 223 | # get the batch variables 224 | state_batch = [d[0] for d in minibatch] 225 | action_batch = [d[1] for d in minibatch] 226 | reward_batch = [d[2] for d in minibatch] 227 | next_state_batch = [d[3] for d in minibatch] 228 | done_batch = [d[4] for d in minibatch] 229 | 230 | next_q_value = sess.run(target_network.net, feed_dict = {target_network.s : next_state_batch}) 231 | q_value = sess.run(q_network.net, 
feed_dict = {q_network.s : state_batch}) 232 | 233 | target_batch = np.asarray(reward_batch) + q_network.discount * \ 234 | (np.ones_like(done_batch) - done_batch) * \ 235 | np.max(next_q_value, axis=1) 236 | 237 | _, loss = sess.run([q_network.train_step, q_network.loss], feed_dict = { \ 238 | q_network.s : state_batch, \ 239 | q_network.y : target_batch}) 240 | 241 | print("\riteration {} @ episode {}/{}".format(global_step, num_episodes, max_episodes), end="") 242 | sys.stdout.flush() 243 | 244 | if global_step % 10000 == 0: 245 | print("\nsaving model now") 246 | saver.save(sess, checkpoint_path) 247 | print("\nupdating target network...") 248 | copy_model_parameters(sess, q_network, target_network) 249 | 250 | steps_per_episode += 1 251 | loss_per_episode += loss 252 | 253 | global_step += 1 254 | state = next_state 255 | 256 | i += 1 257 | 258 | if i == 4: 259 | i = 0 260 | 261 | if done or steps_per_episode >= max_iter: 262 | num_episodes += 1 263 | if global_step > 50: 264 | print("\nloss per episode {}".format(loss_per_episode / steps_per_episode)) 265 | print("\nreward per episode {}".format(reward_per_episode)) 266 | Q = np.amax(sess.run(q_network.net, feed_dict = {q_network.s : state.reshape((1, 84, 84, 4)) })) 267 | 268 | q_summary.value.add(simple_value=steps_per_episode, node_name="episode_lengths", tag="episode_lengths") 269 | q_summary.value.add(simple_value=Q, node_name="q_value", tag="q_value") 270 | q_summary.value.add(simple_value=reward_per_episode, node_name="episode_reward", tag="episode_reward") 271 | q_network.summary_writer.add_summary(q_summary, global_step) 272 | q_network.summary_writer.flush() 273 | 274 | ob = env.reset() 275 | break 276 | 277 | monitor_dir = os.path.abspath("./{}-experiment/".format(flags.FLAGS.env_name)) 278 | checkpoint_dir = os.path.abspath("./dqn/") 279 | logs_path = os.path.abspath("./tensorboard_example/") 280 | 281 | env = wrappers.Monitor(env, monitor_dir, force=True) 282 | 283 | #load_path='/home/manan/Downloads/models/pong.ckpt-2920000-2940000' 284 | #save_path = '/home/manan/Downloads/models2/pong.ckpt' 285 | 286 | sess = tf.InteractiveSession() 287 | 288 | q_network = dqn(1.0, scope="q_net", discount=0.99) 289 | target_network = dqn(1.0, scope="target_network", discount=0.99) 290 | 291 | rollout(sess, q_network, target_network) 292 | -------------------------------------------------------------------------------- /weight_conversion/tf_pre_model.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential 2 | from keras.layers import Dense, Activation, Flatten, Convolution2D 3 | 4 | class tensorflow_model: 5 | def __init__(self, ): 6 | 7 | model = Sequential() 8 | model.add(Convolution2D(32, 8, strides=(4,4), input_shape=(84, 84, 4), data_format="channels_last")) 9 | #model.layers[1].set_weights(param_values["w1"]) 10 | model.add(Activation('relu')) 11 | model.add(Convolution2D(64, 4, strides=(2,2))) 12 | model.add(Activation('relu')) 13 | model.add(Convolution2D(64, 3, strides=(1,1))) 14 | model.add(Activation('relu')) 15 | model.add(Flatten()) 16 | model.add(Dense(512)) 17 | model.add(Activation('relu')) 18 | model.add(Dense(6)) 19 | -------------------------------------------------------------------------------- /weight_conversion/tf_pre_model.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/weight_conversion/tf_pre_model.pyc -------------------------------------------------------------------------------- /weight_conversion/th2tf_weights.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | from keras import backend as K 5 | from keras.utils.layer_utils import convert_all_kernels_in_model 6 | 7 | ''' IMPORT YOUR SCRIPT FILE HERE TO CREATE YOUR MODEL LATER ''' 8 | from th_pre_model import theano_model 9 | from tf_pre_model import tensorflow_model 10 | 11 | ''' BACKEND must be TENSORFLOW 12 | This is a script to convert Theano models (Theano Backend, TH dim ordering) 13 | to the other possible backend / dim ordering combinations. 14 | Given weights and model for TH-kernels-TH-dim-ordering, produces a folder with 15 | - TH-kernels-TF-dim-ordering 16 | - TF-kernels-TH-dim-ordering 17 | - TF-kernels-TF-dim-ordering 18 | Needs 3 important inputs: 19 | 1) Theano model (model with TH dim ordering) 20 | 2) Tensorflow model (model with TF dim ordering) 21 | 3) Weight file for Theano model (theano-kernels-th-dim-ordering) 22 | Supports : Multiple weights for same model (auto converts different weights for same model) 23 | Usage: 24 | 1) Place script in the same directory as the weight file directory. If you want to place somewhere 25 | else, then you must provide absolute path to the weight files below instead of relative paths. 26 | 2) Edit the script to create your model : 27 | a) Import your model building script above (in the imports section) 28 | b) Set `th_dim_model` = ... (create your th dim model here and set it to th_dim_model) 29 | c) Set `tf_dim_model` = ... (create your tf dim model here and set it to tf_dim_model) 30 | d) Add the path to the weight files in `model_weights`. 31 | Note : The weight files must be for the Theano model (theano kernels, th dim ordering) 32 | 3) Run the script. 33 | 4) Use the weight files in the created folders : ["tf-kernels-tf-dim/", "tf-kernels-th-dim/", "th-kernels-tf-dim/"] 34 | ''' 35 | 36 | K.set_image_dim_ordering('th') 37 | th_dim_model = theano_model() # Create your theano model here with TH dim ordering 38 | 39 | K.set_image_dim_ordering('tf') 40 | tf_dim_model = tensorflow_model() # Create your tensorflow model with TF dimordering here 41 | 42 | model_weights = ['theano_weights.h5f'] # Add names of theano model weight file paths here. 43 | # These weights are assumed to be for theano backend 44 | # (th kernels) with th dim ordering! 45 | # ('w1 shape', (32, 4, 8, 8)) 46 | 47 | """ 48 | No need to edit anything below this. Simply run the script now after 49 | editing the above 3 inputs. 
50 | """ 51 | 52 | 53 | def shuffle_rows(original_w, nb_last_conv, nb_rows_dense): 54 | ''' Note : 55 | This algorithm to shuffle dense layer rows was provided by Kent Sommers (@kentsommer) 56 | in a gist : https://gist.github.com/kentsommer/e872f65926f1a607b94c2b464a63d0d3 57 | ''' 58 | converted_w = np.zeros(original_w.shape) 59 | count = 0 60 | for index in range(original_w.shape[0]): 61 | if (index % nb_last_conv) == 0 and index != 0: 62 | count += 1 63 | new_index = ((index % nb_last_conv) * nb_rows_dense) + count 64 | print("index from " + str(index) + " -> " + str(new_index)) 65 | converted_w[index] = original_w[new_index] 66 | 67 | return converted_w 68 | 69 | 70 | first_dense = True 71 | nb_last_conv = 0 72 | 73 | for dirpath in ["tf-kernels-tf-dim-ordering/", "tf-kernels-th-dim-ordering/", "th-kernels-tf-dim-ordering/"]: 74 | if not os.path.exists(dirpath): 75 | os.makedirs(dirpath) 76 | 77 | # Converts (theano kernels, th dim ordering) to (tensorflow kernels, th dim ordering) 78 | K.set_image_dim_ordering('tf') 79 | for weight_fn in model_weights: 80 | th_dim_model.load_weights(weight_fn) 81 | convert_all_kernels_in_model(th_dim_model) 82 | 83 | th_dim_model.save_weights("tf-kernels-th-dim-ordering/%s" % weight_fn, overwrite=True) 84 | print("Done tf-kernels-th-dim %s" % weight_fn) 85 | 86 | 87 | # Converts (theano kernels, th dim ordering) to (tensorflow kernels, tf dim ordering) 88 | K.set_image_dim_ordering('th') 89 | for weight_fn in model_weights: 90 | th_dim_model.load_weights(weight_fn) # th-kernels-th-dim 91 | convert_all_kernels_in_model(th_dim_model) # tf-kernels-th-dim 92 | 93 | count_dense = 0 94 | for layer in th_dim_model.layers: 95 | if layer.__class__.__name__ == "Dense": 96 | count_dense += 1 97 | 98 | if count_dense == 1: 99 | first_dense = False # If there is only 1 dense, no need to perform row shuffle in Dense layer 100 | 101 | print("Nb layers : ", len(th_dim_model.layers)) 102 | 103 | for index, th_layer in enumerate(th_dim_model.layers): 104 | if th_layer.__class__.__name__ in ['Convolution1D', 105 | 'Convolution2D', 106 | 'Convolution3D', 107 | 'AtrousConvolution2D', 108 | 'Deconvolution2D']: 109 | weights = th_layer.get_weights() # tf-kernels-th-dim 110 | weights[0] = weights[0].transpose((2, 3, 1, 0)) 111 | tf_dim_model.layers[index].set_weights(weights) # tf-kernels-tf-dim 112 | 113 | nb_last_conv = th_layer.nb_filter # preserve last number of convolutions to use with dense layers 114 | print("Converted layer %d : %s" % (index + 1, th_layer.name)) 115 | else: 116 | if th_layer.__class__.__name__ == "Dense" and first_dense: 117 | weights = th_layer.get_weights() 118 | nb_rows_dense_layer = weights[0].shape[0] // nb_last_conv 119 | 120 | print("Magic Number 1 : ", nb_last_conv) 121 | print("Magic nunber 2 : ", nb_rows_dense_layer) 122 | 123 | weights[0] = shuffle_rows(weights[0], nb_last_conv, nb_rows_dense_layer) 124 | tf_dim_model.layers[index].set_weights(weights) 125 | 126 | first_dense = False 127 | print("Shuffled Dense Weights layer and saved %d : %s" % (index + 1, th_layer.name)) 128 | else: 129 | tf_dim_model.layers[index].set_weights(th_layer.get_weights()) 130 | print("Saved layer %d : %s" % (index + 1, th_layer.name)) 131 | 132 | 133 | tf_dim_model.save_weights("tf-kernels-tf-dim-ordering/%s" % weight_fn, overwrite=True) 134 | print("Done tf-kernels-tf-dim %s" % weight_fn) 135 | 136 | 137 | # Converts (theano kernels, th dim ordering) to (theano kernels, tf dim ordering) 138 | for weight_fn in model_weights: 139 | 
th_dim_model.load_weights(weight_fn) 140 | 141 | for index, th_layer in enumerate(th_dim_model.layers): 142 | if th_layer.__class__.__name__ in ['Convolution1D', 143 | 'Convolution2D', 144 | 'Convolution3D', 145 | 'AtrousConvolution2D', 146 | 'Deconvolution2D']: 147 | weights = th_layer.get_weights() 148 | weights[0] = weights[0].transpose((2, 3, 1, 0)) 149 | tf_dim_model.layers[index].set_weights(weights) 150 | else: 151 | tf_dim_model.layers[index].set_weights(th_layer.get_weights()) 152 | 153 | print("Changed dim %d : %s" % (index + 1, th_layer.name)) 154 | 155 | tf_dim_model.save_weights("th-kernels-tf-dim-ordering/%s" % weight_fn, overwrite=True) 156 | print("Done th-kernels-tf-dim %s" % weight_fn) 157 | -------------------------------------------------------------------------------- /weight_conversion/th_pre_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["KERAS_BACKEND"] = "theano" # must be set before keras is imported 3 | from keras.models import Sequential 4 | from keras.layers import Dense, Activation, Flatten, Convolution2D 5 | 6 | 7 | def theano_model(): 8 | 9 | model = Sequential() 10 | model.add(Convolution2D(32, 8, strides=(4,4), input_shape=(4, 84, 84), data_format="channels_first")) 11 | model.add(Activation('relu')) 12 | model.add(Convolution2D(64, 4, strides=(2,2))) 13 | model.add(Activation('relu')) 14 | model.add(Convolution2D(64, 3, strides=(1,1))) 15 | model.add(Activation('relu')) 16 | model.add(Flatten()) 17 | model.add(Dense(512)) 18 | model.add(Activation('relu')) 19 | model.add(Dense(6)) 20 | return model 21 | -------------------------------------------------------------------------------- /weight_conversion/th_pre_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/weight_conversion/th_pre_model.pyc -------------------------------------------------------------------------------- /weight_conversion/theano_params.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['THEANO_FLAGS'] = "device=gpu0" 3 | 4 | import theano 5 | import numpy 6 | import pickle 7 | 8 | pkl = open('./pong_dqn_v4_reg_0.01/network_file_50.pkl', 'rb') 9 | data = pickle.load(pkl) 10 | 11 | import lasagne 12 | 13 | params = lasagne.layers.get_all_params(data.l_out) 14 | param_values = {} 15 | 16 | id = 0 17 | 18 | for p in params: 19 | 20 | if str(p) == "W": 21 | param_values["w%d" %(id+1)] = p.get_value().T 22 | 23 | if str(p) == "b": 24 | param_values["b%d" %(id+1)] = p.get_value() # store biases under their own key so they do not overwrite the weights 25 | id += 1 26 | 27 | pkl.close() 28 | pkl = open('./pong_dqn_v4_reg_0.01/network_params.pkl', 'wb') 29 | pickle.dump(param_values, pkl) 30 | 31 | pkl.close() 32 | -------------------------------------------------------------------------------- /weight_conversion/theano_weights.h5f: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manantomar/Reinforcement-Learning/3bcb241e05c9d32df9a69f20b18347780e4a5be7/weight_conversion/theano_weights.h5f --------------------------------------------------------------------------------
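
A note on the layout conversion performed by th2tf_weights.py: conv kernels move from Theano's (out_channels, in_channels, rows, cols) layout to TensorFlow's (rows, cols, in_channels, out_channels) via transpose((2, 3, 1, 0)), and the first Dense layer after Flatten additionally needs its rows permuted, because channels-first and channels-last flattening enumerate the same features in different orders. The sketch below only spells out that index bookkeeping; it is not a drop-in replacement for shuffle_rows, the (64, 7, 7) conv output shape and 512-unit head are assumptions based on the DQN architecture used elsewhere in this repo, and the correlation/convolution kernel flip is handled separately by convert_all_kernels_in_model.

import numpy as np

def convert_conv_kernel(w_th):
    # Theano layout (out_ch, in_ch, rows, cols) -> TF layout (rows, cols, in_ch, out_ch),
    # the same transpose((2, 3, 1, 0)) used in th2tf_weights.py
    return w_th.transpose(2, 3, 1, 0)

def convert_first_dense(w_th, c, h, w):
    # rows of the first Dense weight matrix index the flattened conv output;
    # channels-first flattening orders features as (c, h, w), channels-last as (h, w, c),
    # so each row must move to the position its feature occupies after reordering
    w_tf = np.empty_like(w_th)
    for ci in range(c):
        for hi in range(h):
            for wi in range(w):
                w_tf[hi * w * c + wi * c + ci] = w_th[ci * h * w + hi * w + wi]
    return w_tf

# self-check with random data: both flatten orders should give the same Dense output
rng = np.random.RandomState(0)
c, h, w, units = 64, 7, 7, 512                          # assumed conv output / head size
feat = rng.rand(c, h, w).astype("float32")
w_th = rng.rand(c * h * w, units).astype("float32")
w_tf = convert_first_dense(w_th, c, h, w)
out_th = feat.reshape(-1).dot(w_th)                     # channels-first flatten
out_tf = feat.transpose(1, 2, 0).reshape(-1).dot(w_tf)  # channels-last flatten
print("max abs difference:", np.abs(out_th - out_tf).max())  # agrees up to floating-point rounding

shuffle_rows in th2tf_weights.py is meant to produce this kind of permutation with a running counter; the triple loop here simply makes the index mapping explicit at the cost of speed.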
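
Going back to the DQN training loop further up: the minibatch update regresses the online network onto a target built from the target network's Q-values, with the bootstrap term masked out on terminal transitions. The snippet below is only an illustrative restatement of that one step in plain numpy (the toy batch values are made up), not part of the repository code.

import numpy as np

def bellman_targets(rewards, next_q_values, dones, discount=0.99):
    # y = r + discount * max_a' Q_target(s', a'), with the bootstrap term
    # removed when the transition ended the episode
    terminal_mask = 1.0 - np.asarray(dones, dtype=np.float32)
    return np.asarray(rewards, dtype=np.float32) + discount * terminal_mask * next_q_values.max(axis=1)

# toy minibatch of three transitions with four actions each
next_q = np.array([[0.1, 0.5, 0.2, 0.0],
                   [1.0, 0.3, 0.2, 0.1],
                   [0.0, 0.0, 0.4, 0.2]], dtype=np.float32)
print(bellman_targets([1.0, 0.0, -1.0], next_q, [False, False, True]))
# -> [ 1.495  0.99  -1.  ]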