├── 00_atari_dqn.py ├── 01_dqn.py ├── 02_ddqn.py ├── 03_priority_replay.py ├── 04_dueling.py ├── 05_multistep_td.py ├── 06_distributional_rl.py ├── 07_noisynet.py ├── README.md ├── images ├── ddqn.png ├── distributional_algorithm2.png ├── distributional_learn.png ├── distributional_project.png ├── distributional_projected.png ├── distributional_rl.png ├── dqn.png ├── dqn_algorithm.png ├── dqn_net.png ├── dueling_detail.png ├── dueling_details.png ├── dueling_netarch.png ├── gym_cartpole_v0.gif ├── noisy_net_algorithm.png ├── p2.png ├── rlblog_images │ ├── IS.jpg │ ├── LSTM.png │ ├── PPO.png │ ├── README.md │ ├── RNN-unrolled.png │ ├── ppo.png │ ├── r1.png │ └── r2.png └── sards.png └── tutorial_blogs ├── Building_Rainbow_Step_by_Step_with_TensorFlow2.0.md └── gym_tutorial.md /00_atari_dqn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Q-Network(DQN) for Atari Game, which has convolutional layers to handle images input and other preprocessings. 3 | 4 | Using: 5 | TensorFlow 2.0 6 | Numpy 1.16.2 7 | Gym 0.12.1 8 | """ 9 | 10 | import tensorflow as tf 11 | print(tf.__version__) 12 | 13 | import gym 14 | import time 15 | import numpy as np 16 | import tensorflow.keras.layers as kl 17 | import tensorflow.keras.optimizers as ko 18 | 19 | np.random.seed(1) 20 | tf.random.set_seed(1) 21 | 22 | # Minor change from cs234:reinforcement learning, assignment 2 -> utils/preprocess.py 23 | def greyscale(state): 24 | """ 25 | Preprocess state (210, 160, 3) image into 26 | a (80, 80, 1) image in grey scale 27 | """ 28 | state = np.reshape(state, [210, 160, 3]).astype(np.float32) 29 | # grey scale 30 | state = state[:, :, 0] * 0.299 + state[:, :, 1] * 0.587 + state[:, :, 2] * 0.114 31 | # karpathy 32 | state = state[35:195] # crop 33 | state = state[::2,::2] # downsample by factor of 2 34 | state = state[:, :, np.newaxis] 35 | return state.astype(np.float32) 36 | 37 | 38 | class Model(tf.keras.Model): 39 | def __init__(self, num_actions): 40 | super().__init__(name='dqn') 41 | self.conv1 = kl.Conv2D(32, kernel_size=(8, 8), strides=4, activation='relu') 42 | self.conv2 = kl.Conv2D(64, kernel_size=(4, 4), strides=2, activation='relu') 43 | self.conv3 = kl.Conv2D(64, kernel_size=(3, 3), strides=1, activation='relu') 44 | self.flat = kl.Flatten() 45 | self.fc1 = kl.Dense(512, activation='relu') 46 | self.fc2 = kl.Dense(num_actions) 47 | 48 | def call(self, inputs): 49 | # x = tf.convert_to_tensor(inputs, dtype=tf.float32) 50 | x = self.conv1(inputs) 51 | x = self.conv2(x) 52 | x = self.conv3(x) 53 | x = self.flat(x) 54 | x = self.fc1(x) 55 | x = self.fc2(x) 56 | return x 57 | 58 | def action_value(self, obs): 59 | q_values = self.predict(obs) 60 | best_action = np.argmax(q_values, axis=-1) 61 | return best_action[0], q_values[0] 62 | 63 | 64 | class DQNAgent: 65 | def __init__(self, model, target_model, env, buffer_size=1000, learning_rate=.001, epsilon=.1, gamma=.9, 66 | batch_size=4, target_update_iter=20, train_nums=100, start_learning=10): 67 | self.model = model 68 | self.target_model = target_model 69 | self.model.compile(optimizer=ko.Adam(), loss='mse') 70 | 71 | # parameters 72 | self.env = env # gym environment 73 | self.lr = learning_rate # learning step 74 | self.epsilon = epsilon # e-greedy when exploring 75 | self.gamma = gamma # discount rate 76 | self.batch_size = batch_size # batch_size 77 | self.target_update_iter = target_update_iter # target update period 78 | self.train_nums = train_nums # total training steps 79 | self.num_in_buffer = 0 
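        # --- editor's note (illustrative, not part of the original file) ---
        # greyscale() above maps a raw Atari frame to the network's input shape:
        #     frame = self.env.reset()          # (210, 160, 3) uint8 RGB frame
        #     x = greyscale(frame)              # (80, 80, 1)  float32, cropped and 2x downsampled
        #     batch = x[None]                   # (1, 80, 80, 1), the batched shape Model.predict() expects
        # which is why the replay arrays below are allocated with shape (buffer_size, 80, 80, 1).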
# transitions num in buffer 80 | self.buffer_size = buffer_size # replay buffer size 81 | self.start_learning = start_learning # step to begin learning(save transitions before that step) 82 | 83 | # replay buffer 84 | self.obs = np.empty((self.buffer_size,) + greyscale(self.env.reset()).shape) 85 | self.actions = np.empty((self.buffer_size), dtype=np.int8) 86 | self.rewards = np.empty((self.buffer_size), dtype=np.float32) 87 | self.dones = np.empty((self.buffer_size), dtype=np.bool) 88 | self.next_states = np.empty((self.buffer_size,) + greyscale(self.env.reset()).shape) 89 | self.next_idx = 0 90 | 91 | 92 | # To test whether the model works 93 | def test(self, render=True): 94 | obs, done, ep_reward = self.env.reset(), False, 0 95 | while not done: 96 | obs = greyscale(obs) 97 | # Using [None] to extend its dimension [80, 80, 1] -> [1, 80, 80, 1] 98 | action, _ = self.model.action_value(obs[None]) 99 | obs, reward, done, info = self.env.step(action) 100 | ep_reward += reward 101 | if render: # visually 102 | self.env.render() 103 | time.sleep(0.05) 104 | self.env.close() 105 | return ep_reward 106 | 107 | def train(self): 108 | obs = self.env.reset() 109 | obs = greyscale(obs)[None] 110 | for t in range(self.train_nums): 111 | best_action, q_values = self.model.action_value(obs) 112 | action = self.get_action(best_action) 113 | next_obs, reward, done, info = self.env.step(action) 114 | next_obs = greyscale(next_obs)[None] 115 | self.store_transition(obs, action, reward, next_obs, done) 116 | self.num_in_buffer += 1 117 | 118 | if t > self.start_learning: # start learning 119 | losses = self.train_step(t) 120 | 121 | if t % self.target_update_iter == 0: 122 | self.update_target_model() 123 | 124 | obs = next_obs 125 | 126 | def train_step(self, t): 127 | idxes = self.sample(self.batch_size) 128 | self.s_batch = self.obs[idxes] 129 | self.a_batch = self.actions[idxes] 130 | self.r_batch = self.rewards[idxes] 131 | self.ns_batch = self.next_states[idxes] 132 | self.done_batch = self.dones[idxes] 133 | 134 | target_q = self.r_batch + self.gamma * \ 135 | np.amax(self.get_target_value(self.ns_batch), axis=1) * (1 - self.done_batch) 136 | target_f = self.model.predict(self.s_batch) 137 | for i, val in enumerate(self.a_batch): 138 | target_f[i][val] = target_q[i] 139 | 140 | losses = self.model.train_on_batch(self.s_batch, target_f) 141 | 142 | return losses 143 | 144 | 145 | 146 | # def loss_function(self, q, target_q): 147 | # n_actions = self.env.action_space.n 148 | # print('action in loss', self.a_batch) 149 | # actions = to_categorical(self.a_batch, n_actions) 150 | # q = np.sum(np.multiply(q, actions), axis=1) 151 | # self.loss = kls.mean_squared_error(q, target_q) 152 | 153 | 154 | def store_transition(self, obs, action, reward, next_state, done): 155 | n_idx = self.next_idx % self.buffer_size 156 | self.obs[n_idx] = obs 157 | self.actions[n_idx] = action 158 | self.rewards[n_idx] = reward 159 | self.next_states[n_idx] = next_state 160 | self.dones[n_idx] = done 161 | self.next_idx = (self.next_idx + 1) % self.buffer_size 162 | 163 | def sample(self, n): 164 | assert n < self.num_in_buffer 165 | res = [] 166 | while True: 167 | num = np.random.randint(0, self.num_in_buffer) 168 | if num not in res: 169 | res.append(num) 170 | if len(res) == n: 171 | break 172 | return res 173 | 174 | def get_action(self, best_action): 175 | if np.random.rand() < self.epsilon: 176 | return self.env.action_space.sample() 177 | return best_action 178 | 179 | def update_target_model(self): 180 | 
print('update_target_mdoel') 181 | self.target_model.set_weights(self.model.get_weights()) 182 | 183 | def get_target_value(self, obs): 184 | return self.target_model.predict(obs) 185 | 186 | if __name__ == '__main__': 187 | env = gym.make("Pong-v0") 188 | obs = env.reset() 189 | num_actions = env.action_space.n 190 | model = Model(num_actions) 191 | target_model = Model(num_actions) 192 | agent = DQNAgent(model, target_model, env) 193 | # reward = agent.test() 194 | agent.train() 195 | -------------------------------------------------------------------------------- /01_dqn.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple version of Deep Q-Network(DQN) including the main tactics mentioned in DeepMind's original paper: 3 | - Experience Replay 4 | - Target Network 5 | To play CartPole-v0. 6 | 7 | > Note: DQN can only handle discrete-env which have a discrete action space, like up, down, left, right. 8 | As for the CartPole-v0 environment, its state(the agent's observation) is a 1-D vector not a 3-D image like 9 | Atari, so in that simple example, there is no need to use the convolutional layer, just fully-connected layer. 10 | 11 | Using: 12 | TensorFlow 2.0 13 | Numpy 1.16.2 14 | Gym 0.12.1 15 | """ 16 | 17 | import tensorflow as tf 18 | print(tf.__version__) 19 | 20 | import gym 21 | import time 22 | import numpy as np 23 | import tensorflow.keras.layers as kl 24 | import tensorflow.keras.optimizers as ko 25 | 26 | np.random.seed(1) 27 | tf.random.set_seed(1) 28 | 29 | # Neural Network Model Defined at Here. 30 | class Model(tf.keras.Model): 31 | def __init__(self, num_actions): 32 | super().__init__(name='basic_dqn') 33 | # you can try different kernel initializer 34 | self.fc1 = kl.Dense(32, activation='relu', kernel_initializer='he_uniform') 35 | self.fc2 = kl.Dense(32, activation='relu', kernel_initializer='he_uniform') 36 | self.logits = kl.Dense(num_actions, name='q_values') 37 | 38 | # forward propagation 39 | def call(self, inputs): 40 | x = self.fc1(inputs) 41 | x = self.fc2(x) 42 | x = self.logits(x) 43 | return x 44 | 45 | # a* = argmax_a' Q(s, a') 46 | def action_value(self, obs): 47 | q_values = self.predict(obs) 48 | best_action = np.argmax(q_values, axis=-1) 49 | return best_action[0], q_values[0] 50 | 51 | # To test whether the model works 52 | def test_model(): 53 | env = gym.make('CartPole-v0') 54 | print('num_actions: ', env.action_space.n) 55 | model = Model(env.action_space.n) 56 | 57 | obs = env.reset() 58 | print('obs_shape: ', obs.shape) 59 | 60 | # tensorflow 2.0: no feed_dict or tf.Session() needed at all 61 | best_action, q_values = model.action_value(obs[None]) 62 | print('res of test model: ', best_action, q_values) # 0 [ 0.00896799 -0.02111824] 63 | 64 | 65 | class DQNAgent: # Deep Q-Network 66 | def __init__(self, model, target_model, env, buffer_size=100, learning_rate=.0015, epsilon=.1, epsilon_dacay=0.995, 67 | min_epsilon=.01, gamma=.95, batch_size=4, target_update_iter=400, train_nums=5000, start_learning=10): 68 | self.model = model 69 | self.target_model = target_model 70 | # print(id(self.model), id(self.target_model)) # to make sure the two models don't update simultaneously 71 | # gradient clip 72 | opt = ko.Adam(learning_rate=learning_rate, clipvalue=10.0) # do gradient clip 73 | self.model.compile(optimizer=opt, loss='mse') 74 | 75 | # parameters 76 | self.env = env # gym environment 77 | self.lr = learning_rate # learning step 78 | self.epsilon = epsilon # e-greedy when exploring 79 | 
self.epsilon_decay = epsilon_dacay # epsilon decay rate 80 | self.min_epsilon = min_epsilon # minimum epsilon 81 | self.gamma = gamma # discount rate 82 | self.batch_size = batch_size # batch_size 83 | self.target_update_iter = target_update_iter # target network update period 84 | self.train_nums = train_nums # total training steps 85 | self.num_in_buffer = 0 # transition's num in buffer 86 | self.buffer_size = buffer_size # replay buffer size 87 | self.start_learning = start_learning # step to begin learning(no update before that step) 88 | 89 | # replay buffer params [(s, a, r, ns, done), ...] 90 | self.obs = np.empty((self.buffer_size,) + self.env.reset().shape) 91 | self.actions = np.empty((self.buffer_size), dtype=np.int8) 92 | self.rewards = np.empty((self.buffer_size), dtype=np.float32) 93 | self.dones = np.empty((self.buffer_size), dtype=np.bool) 94 | self.next_states = np.empty((self.buffer_size,) + self.env.reset().shape) 95 | self.next_idx = 0 96 | 97 | def train(self): 98 | # initialize the initial observation of the agent 99 | obs = self.env.reset() 100 | for t in range(1, self.train_nums): 101 | best_action, q_values = self.model.action_value(obs[None]) # input the obs to the network model 102 | action = self.get_action(best_action) # get the real action 103 | next_obs, reward, done, info = self.env.step(action) # take the action in the env to return s', r, done 104 | self.store_transition(obs, action, reward, next_obs, done) # store that transition into replay butter 105 | self.num_in_buffer = min(self.num_in_buffer + 1, self.buffer_size) 106 | 107 | if t > self.start_learning: # start learning 108 | losses = self.train_step() 109 | if t % 1000 == 0: 110 | print('losses each 1000 steps: ', losses) 111 | 112 | if t % self.target_update_iter == 0: 113 | self.update_target_model() 114 | if done: 115 | obs = self.env.reset() 116 | else: 117 | obs = next_obs 118 | 119 | def train_step(self): 120 | idxes = self.sample(self.batch_size) 121 | s_batch = self.obs[idxes] 122 | a_batch = self.actions[idxes] 123 | r_batch = self.rewards[idxes] 124 | ns_batch = self.next_states[idxes] 125 | done_batch = self.dones[idxes] 126 | 127 | target_q = r_batch + self.gamma * np.amax(self.get_target_value(ns_batch), axis=1) * (1 - done_batch) 128 | target_f = self.model.predict(s_batch) 129 | for i, val in enumerate(a_batch): 130 | target_f[i][val] = target_q[i] 131 | 132 | losses = self.model.train_on_batch(s_batch, target_f) 133 | 134 | return losses 135 | 136 | def evalation(self, env, render=True): 137 | obs, done, ep_reward = env.reset(), False, 0 138 | # one episode until done 139 | while not done: 140 | action, q_values = self.model.action_value(obs[None]) # Using [None] to extend its dimension (4,) -> (1, 4) 141 | obs, reward, done, info = env.step(action) 142 | ep_reward += reward 143 | if render: # visually show 144 | env.render() 145 | time.sleep(0.05) 146 | env.close() 147 | return ep_reward 148 | 149 | # store transitions into replay butter 150 | def store_transition(self, obs, action, reward, next_state, done): 151 | n_idx = self.next_idx % self.buffer_size 152 | self.obs[n_idx] = obs 153 | self.actions[n_idx] = action 154 | self.rewards[n_idx] = reward 155 | self.next_states[n_idx] = next_state 156 | self.dones[n_idx] = done 157 | self.next_idx = (self.next_idx + 1) % self.buffer_size 158 | 159 | # sample n different indexes 160 | def sample(self, n): 161 | assert n < self.num_in_buffer 162 | res = [] 163 | while True: 164 | num = np.random.randint(0, self.num_in_buffer) 165 | if 
num not in res: 166 | res.append(num) 167 | if len(res) == n: 168 | break 169 | return res 170 | 171 | # e-greedy 172 | def get_action(self, best_action): 173 | if np.random.rand() < self.epsilon: 174 | return self.env.action_space.sample() 175 | return best_action 176 | 177 | # assign the current network parameters to target network 178 | def update_target_model(self): 179 | self.target_model.set_weights(self.model.get_weights()) 180 | 181 | def get_target_value(self, obs): 182 | return self.target_model.predict(obs) 183 | 184 | def e_decay(self): 185 | self.epsilon *= self.epsilon_decay 186 | 187 | if __name__ == '__main__': 188 | test_model() 189 | 190 | env = gym.make("CartPole-v0") 191 | num_actions = env.action_space.n 192 | model = Model(num_actions) 193 | target_model = Model(num_actions) 194 | agent = DQNAgent(model, target_model, env) 195 | # test before 196 | rewards_sum = agent.evalation(env) 197 | print("Before Training: %d out of 200" % rewards_sum) # 9 out of 200 198 | 199 | agent.train() 200 | # test after 201 | rewards_sum = agent.evalation(env) 202 | print("After Training: %d out of 200" % rewards_sum) # 200 out of 200 203 | -------------------------------------------------------------------------------- /02_ddqn.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple version of Double Deep Q-Network(DDQN), minor change to DQN. 3 | To play CartPole-v0. 4 | 5 | Using: 6 | TensorFlow 2.0 7 | Numpy 1.16.2 8 | Gym 0.12.1 9 | """ 10 | 11 | import tensorflow as tf 12 | print(tf.__version__) 13 | 14 | import gym 15 | import time 16 | import numpy as np 17 | import tensorflow.keras.layers as kl 18 | import tensorflow.keras.optimizers as ko 19 | 20 | np.random.seed(1) 21 | tf.random.set_seed(1) 22 | 23 | # Neural Network Model Defined at Here. 
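# (editor's sketch, not part of the original file) The "minor change" this file makes to DQN
# is the bootstrap target. Vanilla DQN selects and evaluates the next action with the target net:
#     y = r + gamma * max_a Q_target(s', a)
# Double DQN selects with the online net and evaluates with the target net:
#     y = r + gamma * Q_target(s', argmax_a Q_online(s', a))
# A minimal NumPy version of both targets; the helper names here are illustrative only.
def dqn_targets(r, q_target_next, done, gamma):
    return r + gamma * np.max(q_target_next, axis=1) * (1 - done)

def ddqn_targets(r, q_online_next, q_target_next, done, gamma):
    a_star = np.argmax(q_online_next, axis=1)                                  # select with online net
    return r + gamma * q_target_next[np.arange(len(r)), a_star] * (1 - done)   # evaluate with target net
# DDQNAgent.train_step() below performs exactly this decoupled selection/evaluation.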
24 | class Model(tf.keras.Model): 25 | def __init__(self, num_actions): 26 | super().__init__(name='basic_ddqn') 27 | # you can try different kernel initializer 28 | self.fc1 = kl.Dense(32, activation='relu', kernel_initializer='he_uniform') 29 | self.fc2 = kl.Dense(32, activation='relu', kernel_initializer='he_uniform') 30 | self.logits = kl.Dense(num_actions, name='q_values') 31 | 32 | # forward propagation 33 | def call(self, inputs): 34 | x = self.fc1(inputs) 35 | x = self.fc2(x) 36 | x = self.logits(x) 37 | return x 38 | 39 | # a* = argmax_a' Q(s, a') 40 | def action_value(self, obs): 41 | q_values = self.predict(obs) 42 | best_action = np.argmax(q_values, axis=-1) 43 | return best_action if best_action.shape[0] > 1 else best_action[0], q_values[0] 44 | 45 | # To test whether the model works 46 | def test_model(): 47 | env = gym.make('CartPole-v0') 48 | print('num_actions: ', env.action_space.n) 49 | model = Model(env.action_space.n) 50 | 51 | obs = env.reset() 52 | print('obs_shape: ', obs.shape) 53 | 54 | # tensorflow 2.0: no feed_dict or tf.Session() needed at all 55 | best_action, q_values = model.action_value(obs[None]) 56 | print('res of test model: ', best_action, q_values) # 0 [ 0.00896799 -0.02111824] 57 | 58 | 59 | class DDQNAgent: # Double Deep Q-Network 60 | def __init__(self, model, target_model, env, buffer_size=200, learning_rate=.0015, epsilon=.1, epsilon_dacay=0.995, 61 | min_epsilon=.01, gamma=.9, batch_size=8, target_update_iter=200, train_nums=5000, start_learning=100): 62 | self.model = model 63 | self.target_model = target_model 64 | # gradient clip 65 | opt = ko.Adam(learning_rate=learning_rate, clipvalue=10.0) 66 | self.model.compile(optimizer=opt, loss='mse') 67 | 68 | # parameters 69 | self.env = env # gym environment 70 | self.lr = learning_rate # learning step 71 | self.epsilon = epsilon # e-greedy when exploring 72 | self.epsilon_decay = epsilon_dacay # epsilon decay rate 73 | self.min_epsilon = min_epsilon # minimum epsilon 74 | self.gamma = gamma # discount rate 75 | self.batch_size = batch_size # batch_size 76 | self.target_update_iter = target_update_iter # target network update period 77 | self.train_nums = train_nums # total training steps 78 | self.num_in_buffer = 0 # transition's num in buffer 79 | self.buffer_size = buffer_size # replay buffer size 80 | self.start_learning = start_learning # step to begin learning(no update before that step) 81 | 82 | # replay buffer params [(s, a, r, ns, done), ...] 
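        # (editor's note) np.bool in the dtype arguments below is valid on the pinned NumPy 1.16.2,
        # but the alias was deprecated in NumPy 1.20 and removed in 1.24; on newer NumPy, use the
        # built-in bool or np.bool_ instead, e.g.
        #     self.dones = np.empty((self.buffer_size), dtype=np.bool_)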
83 | self.obs = np.empty((self.buffer_size,) + self.env.reset().shape) 84 | self.actions = np.empty((self.buffer_size), dtype=np.int8) 85 | self.rewards = np.empty((self.buffer_size), dtype=np.float32) 86 | self.dones = np.empty((self.buffer_size), dtype=np.bool) 87 | self.next_states = np.empty((self.buffer_size,) + self.env.reset().shape) 88 | self.next_idx = 0 89 | 90 | def train(self): 91 | # initialize the initial observation of the agent 92 | obs = self.env.reset() 93 | for t in range(1, self.train_nums): 94 | best_action, q_values = self.model.action_value(obs[None]) # input the obs to the network model 95 | action = self.get_action(best_action) # get the real action 96 | next_obs, reward, done, info = self.env.step(action) # take the action in the env to return s', r, done 97 | self.store_transition(obs, action, reward, next_obs, done) # store that transition into replay butter 98 | self.num_in_buffer = min(self.num_in_buffer + 1, self.buffer_size) 99 | 100 | if t > self.start_learning: # start learning 101 | losses = self.train_step() 102 | if t % 1000 == 0: 103 | print('losses each 1000 steps: ', losses) 104 | 105 | if t % self.target_update_iter == 0: 106 | self.update_target_model() 107 | if done: 108 | obs = self.env.reset() 109 | else: 110 | obs = next_obs 111 | 112 | def train_step(self): 113 | idxes = self.sample(self.batch_size) 114 | s_batch = self.obs[idxes] 115 | a_batch = self.actions[idxes] 116 | r_batch = self.rewards[idxes] 117 | ns_batch = self.next_states[idxes] 118 | done_batch = self.dones[idxes] 119 | # Double Q-Learning, decoupling selection and evaluation of the bootstrap action 120 | # selection with the current DQN model 121 | best_action_idxes, _ = self.model.action_value(ns_batch) 122 | target_q = self.get_target_value(ns_batch) 123 | # evaluation with the target DQN model 124 | target_q = r_batch + self.gamma * target_q[np.arange(target_q.shape[0]), best_action_idxes] * (1 - done_batch) 125 | target_f = self.model.predict(s_batch) 126 | for i, val in enumerate(a_batch): 127 | target_f[i][val] = target_q[i] 128 | 129 | losses = self.model.train_on_batch(s_batch, target_f) 130 | 131 | return losses 132 | 133 | def evalation(self, env, render=True): 134 | obs, done, ep_reward = env.reset(), False, 0 135 | # one episode until done 136 | while not done: 137 | action, q_values = self.model.action_value(obs[None]) # Using [None] to extend its dimension (4,) -> (1, 4) 138 | obs, reward, done, info = env.step(action) 139 | ep_reward += reward 140 | if render: # visually show 141 | env.render() 142 | time.sleep(0.05) 143 | env.close() 144 | return ep_reward 145 | 146 | # store transitions into replay butter 147 | def store_transition(self, obs, action, reward, next_state, done): 148 | n_idx = self.next_idx % self.buffer_size 149 | self.obs[n_idx] = obs 150 | self.actions[n_idx] = action 151 | self.rewards[n_idx] = reward 152 | self.next_states[n_idx] = next_state 153 | self.dones[n_idx] = done 154 | self.next_idx = (self.next_idx + 1) % self.buffer_size 155 | 156 | # sample n different indexes 157 | def sample(self, n): 158 | assert n < self.num_in_buffer 159 | res = [] 160 | while True: 161 | num = np.random.randint(0, self.num_in_buffer) 162 | if num not in res: 163 | res.append(num) 164 | if len(res) == n: 165 | break 166 | return res 167 | 168 | # e-greedy 169 | def get_action(self, best_action): 170 | if np.random.rand() < self.epsilon: 171 | return self.env.action_space.sample() 172 | return best_action 173 | 174 | # assign the current network parameters to 
target network 175 | def update_target_model(self): 176 | self.target_model.set_weights(self.model.get_weights()) 177 | 178 | def get_target_value(self, obs): 179 | return self.target_model.predict(obs) 180 | 181 | def e_decay(self): 182 | self.epsilon *= self.epsilon_decay 183 | 184 | if __name__ == '__main__': 185 | test_model() 186 | 187 | env = gym.make("CartPole-v0") 188 | num_actions = env.action_space.n 189 | model = Model(num_actions) 190 | target_model = Model(num_actions) 191 | agent = DDQNAgent(model, target_model, env) 192 | # test before 193 | rewards_sum = agent.evalation(env) 194 | print("Before Training: %d out of 200" % rewards_sum) # 9 out of 200 195 | 196 | agent.train() 197 | # test after 198 | # env = gym.wrappers.Monitor(env, './recording', force=True) # to record the process 199 | rewards_sum = agent.evalation(env) 200 | print("After Training: %d out of 200" % rewards_sum) # 200 out of 200 201 | -------------------------------------------------------------------------------- /03_priority_replay.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple version of Prioritized Experience Replay based on Double DQN. 3 | To play CartPole-v0. 4 | 5 | Using: 6 | TensorFlow 2.0 7 | Numpy 1.16.2 8 | Gym 0.12.1 9 | """ 10 | 11 | import tensorflow as tf 12 | print(tf.__version__) 13 | 14 | import gym 15 | import time 16 | import numpy as np 17 | import tensorflow.keras.layers as kl 18 | import tensorflow.keras.optimizers as ko 19 | 20 | np.random.seed(1) 21 | tf.random.set_seed(1) 22 | 23 | # Neural Network Model Defined at Here. 24 | class Model(tf.keras.Model): 25 | def __init__(self, num_actions): 26 | super().__init__(name='basic_prddqn') 27 | # you can try different kernel initializer 28 | self.fc1 = kl.Dense(32, activation='relu', kernel_initializer='he_uniform') 29 | self.fc2 = kl.Dense(32, activation='relu', kernel_initializer='he_uniform') 30 | self.logits = kl.Dense(num_actions, name='q_values') 31 | 32 | # forward propagation 33 | def call(self, inputs): 34 | x = self.fc1(inputs) 35 | x = self.fc2(x) 36 | x = self.logits(x) 37 | return x 38 | 39 | # a* = argmax_a' Q(s, a') 40 | def action_value(self, obs): 41 | q_values = self.predict(obs) 42 | best_action = np.argmax(q_values, axis=-1) 43 | return best_action if best_action.shape[0] > 1 else best_action[0], q_values[0] 44 | 45 | 46 | # To test whether the model works 47 | def test_model(): 48 | env = gym.make('CartPole-v0') 49 | print('num_actions: ', env.action_space.n) 50 | model = Model(env.action_space.n) 51 | 52 | obs = env.reset() 53 | print('obs_shape: ', obs.shape) 54 | 55 | # tensorflow 2.0 eager mode: no feed_dict or tf.Session() needed at all 56 | best_action, q_values = model.action_value(obs[None]) 57 | print('res of test model: ', best_action, q_values) # 0 [ 0.00896799 -0.02111824] 58 | 59 | 60 | # replay buffer 61 | class SumTree: 62 | # little modified from https://github.com/jaromiru/AI-blog/blob/master/SumTree.py 63 | def __init__(self, capacity): 64 | self.capacity = capacity # N, the size of replay buffer, so as to the number of sum tree's leaves 65 | self.tree = np.zeros(2 * capacity - 1) # equation, to calculate the number of nodes in a sum tree 66 | self.transitions = np.empty(capacity, dtype=object) 67 | self.next_idx = 0 68 | 69 | @property 70 | def total_p(self): 71 | return self.tree[0] 72 | 73 | def add(self, priority, transition): 74 | idx = self.next_idx + self.capacity - 1 75 | self.transitions[self.next_idx] = transition 76 | 
self.update(idx, priority) 77 | self.next_idx = (self.next_idx + 1) % self.capacity 78 | 79 | def update(self, idx, priority): 80 | change = priority - self.tree[idx] 81 | self.tree[idx] = priority 82 | self._propagate(idx, change) # O(logn) 83 | 84 | def _propagate(self, idx, change): 85 | parent = (idx - 1) // 2 86 | self.tree[parent] += change 87 | if parent != 0: 88 | self._propagate(parent, change) 89 | 90 | def get_leaf(self, s): 91 | idx = self._retrieve(0, s) # from root 92 | trans_idx = idx - self.capacity + 1 93 | return idx, self.tree[idx], self.transitions[trans_idx] 94 | 95 | def _retrieve(self, idx, s): 96 | left = 2 * idx + 1 97 | right = left + 1 98 | if left >= len(self.tree): 99 | return idx 100 | if s <= self.tree[left]: 101 | return self._retrieve(left, s) 102 | else: 103 | return self._retrieve(right, s - self.tree[left]) 104 | 105 | 106 | class PERAgent: # Double DQN with Proportional Prioritization 107 | def __init__(self, model, target_model, env, learning_rate=.0012, epsilon=.1, epsilon_dacay=0.995, min_epsilon=.01, 108 | gamma=.9, batch_size=8, target_update_iter=400, train_nums=5000, buffer_size=200, replay_period=20, 109 | alpha=0.4, beta=0.4, beta_increment_per_sample=0.001): 110 | self.model = model 111 | self.target_model = target_model 112 | # gradient clip 113 | opt = ko.Adam(learning_rate=learning_rate) # , clipvalue=10.0 114 | self.model.compile(optimizer=opt, loss=self._per_loss) # loss=self._per_loss 115 | 116 | # parameters 117 | self.env = env # gym environment 118 | self.lr = learning_rate # learning step 119 | self.epsilon = epsilon # e-greedy when exploring 120 | self.epsilon_decay = epsilon_dacay # epsilon decay rate 121 | self.min_epsilon = min_epsilon # minimum epsilon 122 | self.gamma = gamma # discount rate 123 | self.batch_size = batch_size # minibatch k 124 | self.target_update_iter = target_update_iter # target network update period 125 | self.train_nums = train_nums # total training steps 126 | 127 | # replay buffer params [(s, a, r, ns, done), ...] 
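        # --- editor's note (illustrative): how the SumTree above behaves ---
        # Leaves store priorities, internal nodes store sums, so total_p is the root value and a
        # uniform draw s in [0, total_p) lands on leaf j with probability p_j / total_p.
        # For example, with capacity 4 and priorities 1, 2, 3, 4 added in order:
        #     st = SumTree(4)
        #     for p in (1., 2., 3., 4.):
        #         st.add(p, ('transition', p))
        #     st.total_p        # -> 10.0
        #     st.get_leaf(2.5)  # -> (4, 2.0, ('transition', 2.0)), i.e. the priority-2 leaf
        # get_leaf() descends from the root in O(log N), which is what sum_tree_sample() relies on.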
128 | self.b_obs = np.empty((self.batch_size,) + self.env.reset().shape) 129 | self.b_actions = np.empty(self.batch_size, dtype=np.int8) 130 | self.b_rewards = np.empty(self.batch_size, dtype=np.float32) 131 | self.b_next_states = np.empty((self.batch_size,) + self.env.reset().shape) 132 | self.b_dones = np.empty(self.batch_size, dtype=np.bool) 133 | 134 | self.replay_buffer = SumTree(buffer_size) # sum-tree data structure 135 | self.buffer_size = buffer_size # replay buffer size N 136 | self.replay_period = replay_period # replay period K 137 | self.alpha = alpha # priority parameter, alpha=[0, 0.4, 0.5, 0.6, 0.7, 0.8] 138 | self.beta = beta # importance sampling parameter, beta=[0, 0.4, 0.5, 0.6, 1] 139 | self.beta_increment_per_sample = beta_increment_per_sample 140 | self.num_in_buffer = 0 # total number of transitions stored in buffer 141 | self.margin = 0.01 # pi = |td_error| + margin 142 | self.p1 = 1 # initialize priority for the first transition 143 | # self.is_weight = np.empty((None, 1)) 144 | self.is_weight = np.power(self.buffer_size, -self.beta) # because p1 == 1 145 | self.abs_error_upper = 1 146 | 147 | def _per_loss(self, y_target, y_pred): 148 | return tf.reduce_mean(self.is_weight * tf.math.squared_difference(y_target, y_pred)) 149 | 150 | def train(self): 151 | # initialize the initial observation of the agent 152 | obs = self.env.reset() 153 | for t in range(1, self.train_nums): 154 | best_action, q_values = self.model.action_value(obs[None]) # input the obs to the network model 155 | action = self.get_action(best_action) # get the real action 156 | next_obs, reward, done, info = self.env.step(action) # take the action in the env to return s', r, done 157 | if t == 1: 158 | p = self.p1 159 | else: 160 | p = np.max(self.replay_buffer.tree[-self.replay_buffer.capacity:]) 161 | self.store_transition(p, obs, action, reward, next_obs, done) # store that transition into replay butter 162 | self.num_in_buffer = min(self.num_in_buffer + 1, self.buffer_size) 163 | 164 | if t > self.buffer_size: 165 | # if t % self.replay_period == 0: # transition sampling and update 166 | losses = self.train_step() 167 | if t % 1000 == 0: 168 | print('losses each 1000 steps: ', losses) 169 | 170 | if t % self.target_update_iter == 0: 171 | self.update_target_model() 172 | if done: 173 | obs = self.env.reset() # one episode end 174 | else: 175 | obs = next_obs 176 | 177 | def train_step(self): 178 | idxes, self.is_weight = self.sum_tree_sample(self.batch_size) 179 | # Double Q-Learning 180 | best_action_idxes, _ = self.model.action_value(self.b_next_states) # get actions through the current network 181 | target_q = self.get_target_value(self.b_next_states) # get target q-value through the target network 182 | # get td_targets of batch states 183 | td_target = self.b_rewards + \ 184 | self.gamma * target_q[np.arange(target_q.shape[0]), best_action_idxes] * (1 - self.b_dones) 185 | predict_q = self.model.predict(self.b_obs) 186 | td_predict = predict_q[np.arange(predict_q.shape[0]), self.b_actions] 187 | abs_td_error = np.abs(td_target - td_predict) + self.margin 188 | clipped_error = np.where(abs_td_error < self.abs_error_upper, abs_td_error, self.abs_error_upper) 189 | ps = np.power(clipped_error, self.alpha) 190 | # priorities update 191 | for idx, p in zip(idxes, ps): 192 | self.replay_buffer.update(idx, p) 193 | 194 | for i, val in enumerate(self.b_actions): 195 | predict_q[i][val] = td_target[i] 196 | 197 | target_q = predict_q # just to change a more explicit name 198 | losses = 
self.model.train_on_batch(self.b_obs, target_q) 199 | 200 | return losses 201 | 202 | # proportional prioritization sampling 203 | def sum_tree_sample(self, k): 204 | idxes = [] 205 | is_weights = np.empty((k, 1)) 206 | self.beta = min(1., self.beta + self.beta_increment_per_sample) 207 | # calculate max_weight 208 | min_prob = np.min(self.replay_buffer.tree[-self.replay_buffer.capacity:]) / self.replay_buffer.total_p 209 | max_weight = np.power(self.buffer_size * min_prob, -self.beta) 210 | segment = self.replay_buffer.total_p / k 211 | for i in range(k): 212 | s = np.random.uniform(segment * i, segment * (i + 1)) 213 | idx, p, t = self.replay_buffer.get_leaf(s) 214 | idxes.append(idx) 215 | self.b_obs[i], self.b_actions[i], self.b_rewards[i], self.b_next_states[i], self.b_dones[i] = t 216 | # P(j) 217 | sampling_probabilities = p / self.replay_buffer.total_p # where p = p ** self.alpha 218 | is_weights[i, 0] = np.power(self.buffer_size * sampling_probabilities, -self.beta) / max_weight 219 | return idxes, is_weights 220 | 221 | def evaluation(self, env, render=True): 222 | obs, done, ep_reward = env.reset(), False, 0 223 | # one episode until done 224 | while not done: 225 | action, q_values = self.model.action_value(obs[None]) # Using [None] to extend its dimension (4,) -> (1, 4) 226 | obs, reward, done, info = env.step(action) 227 | ep_reward += reward 228 | if render: # visually show 229 | env.render() 230 | time.sleep(0.05) 231 | env.close() 232 | return ep_reward 233 | 234 | # store transitions into replay butter, now sum tree. 235 | def store_transition(self, priority, obs, action, reward, next_state, done): 236 | transition = [obs, action, reward, next_state, done] 237 | self.replay_buffer.add(priority, transition) 238 | 239 | # rank-based prioritization sampling 240 | def rand_based_sample(self, k): 241 | pass 242 | 243 | # e-greedy 244 | def get_action(self, best_action): 245 | if np.random.rand() < self.epsilon: 246 | return self.env.action_space.sample() 247 | return best_action 248 | 249 | # assign the current network parameters to target network 250 | def update_target_model(self): 251 | self.target_model.set_weights(self.model.get_weights()) 252 | 253 | def get_target_value(self, obs): 254 | return self.target_model.predict(obs) 255 | 256 | def e_decay(self): 257 | self.epsilon *= self.epsilon_decay 258 | 259 | 260 | if __name__ == '__main__': 261 | test_model() 262 | 263 | env = gym.make("CartPole-v0") 264 | num_actions = env.action_space.n 265 | model = Model(num_actions) 266 | target_model = Model(num_actions) 267 | agent = PERAgent(model, target_model, env) 268 | # test before 269 | rewards_sum = agent.evaluation(env) 270 | print("Before Training: %d out of 200" % rewards_sum) # 9 out of 200 271 | 272 | agent.train() 273 | # test after 274 | # env = gym.wrappers.Monitor(env, './recording', force=True) 275 | rewards_sum = agent.evaluation(env) 276 | print("After Training: %d out of 200" % rewards_sum) # 200 out of 200 277 | -------------------------------------------------------------------------------- /04_dueling.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple version of Dueling Double DQN with Prioritized Experience Replay. Just slightly modify the network architecture. 3 | To play CartPole-v0. 
4 | 5 | Using: 6 | TensorFlow 2.0 7 | Numpy 1.16.2 8 | Gym 0.12.1 9 | """ 10 | 11 | import tensorflow as tf 12 | print(tf.__version__) 13 | 14 | import gym 15 | import time 16 | import numpy as np 17 | import tensorflow.keras.layers as kl 18 | import tensorflow.keras.optimizers as ko 19 | 20 | np.random.seed(1) 21 | tf.random.set_seed(1) 22 | 23 | # Neural Network Model Defined at Here. 24 | class Model(tf.keras.Model): 25 | def __init__(self, num_actions): 26 | super().__init__(name='basic_prdddqn') 27 | # you can try different kernel initializer 28 | self.shared_fc1 = kl.Dense(16, activation='relu', kernel_initializer='he_uniform') 29 | self.shared_fc2 = kl.Dense(32, activation='relu', kernel_initializer='he_uniform') 30 | # there is a trick that combining the two streams' fc layer, then 31 | # the output of that layer is a |A| + 1 dimension tensor: |V|A1|A2| ... |An| 32 | # output[:, 0] is state value, output[:, 1:] is action advantage 33 | self.val_adv_fc = kl.Dense(num_actions + 1, activation='relu', kernel_initializer='he_uniform') 34 | 35 | # forward propagation 36 | def call(self, inputs): 37 | x = self.shared_fc1(inputs) 38 | x = self.shared_fc2(x) 39 | val_adv = self.val_adv_fc(x) 40 | # average version, you can also try the max version. 41 | outputs = tf.expand_dims(val_adv[:, 0], -1) + (val_adv[:, 1:] - tf.reduce_mean(val_adv[:, 1:], -1, keepdims=True)) 42 | return outputs 43 | 44 | # a* = argmax_a' Q(s, a') 45 | def action_value(self, obs): 46 | q_values = self.predict(obs) 47 | best_action = np.argmax(q_values, axis=-1) 48 | return best_action if best_action.shape[0] > 1 else best_action[0], q_values[0] 49 | 50 | 51 | # To test whether the model works 52 | def test_model(): 53 | env = gym.make('CartPole-v0') 54 | print('num_actions: ', env.action_space.n) 55 | model = Model(env.action_space.n) 56 | 57 | obs = env.reset() 58 | print('obs_shape: ', obs.shape) 59 | 60 | # tensorflow 2.0 eager mode: no feed_dict or tf.Session() needed at all 61 | best_action, q_values = model.action_value(obs[None]) 62 | print('res of test model: ', best_action, q_values) # 0 [ 0.00896799 -0.02111824] 63 | 64 | 65 | # replay buffer 66 | class SumTree: 67 | # little modified from https://github.com/jaromiru/AI-blog/blob/master/SumTree.py 68 | def __init__(self, capacity): 69 | self.capacity = capacity # N, the size of replay buffer, so as to the number of sum tree's leaves 70 | self.tree = np.zeros(2 * capacity - 1) # equation, to calculate the number of nodes in a sum tree 71 | self.transitions = np.empty(capacity, dtype=object) 72 | self.next_idx = 0 73 | 74 | @property 75 | def total_p(self): 76 | return self.tree[0] 77 | 78 | def add(self, priority, transition): 79 | idx = self.next_idx + self.capacity - 1 80 | self.transitions[self.next_idx] = transition 81 | self.update(idx, priority) 82 | self.next_idx = (self.next_idx + 1) % self.capacity 83 | 84 | def update(self, idx, priority): 85 | change = priority - self.tree[idx] 86 | self.tree[idx] = priority 87 | self._propagate(idx, change) # O(logn) 88 | 89 | def _propagate(self, idx, change): 90 | parent = (idx - 1) // 2 91 | self.tree[parent] += change 92 | if parent != 0: 93 | self._propagate(parent, change) 94 | 95 | def get_leaf(self, s): 96 | idx = self._retrieve(0, s) # from root 97 | trans_idx = idx - self.capacity + 1 98 | return idx, self.tree[idx], self.transitions[trans_idx] 99 | 100 | def _retrieve(self, idx, s): 101 | left = 2 * idx + 1 102 | right = left + 1 103 | if left >= len(self.tree): 104 | return idx 105 | if s <= 
self.tree[left]: 106 | return self._retrieve(left, s) 107 | else: 108 | return self._retrieve(right, s - self.tree[left]) 109 | 110 | 111 | class DDDQNAgent: # Dueling Double DQN with Proportional Prioritization 112 | def __init__(self, model, target_model, env, learning_rate=.001, epsilon=.1, epsilon_dacay=0.995, min_epsilon=.01, 113 | gamma=.9, batch_size=8, target_update_iter=400, train_nums=5000, buffer_size=300, replay_period=20, 114 | alpha=0.4, beta=0.4, beta_increment_per_sample=0.001): 115 | self.model = model 116 | self.target_model = target_model 117 | # gradient clip 118 | opt = ko.Adam(learning_rate=learning_rate, clipvalue=10.0) #, clipvalue=10.0 119 | self.model.compile(optimizer=opt, loss=self._per_loss) #loss=self._per_loss 120 | 121 | # parameters 122 | self.env = env # gym environment 123 | self.lr = learning_rate # learning step 124 | self.epsilon = epsilon # e-greedy when exploring 125 | self.epsilon_decay = epsilon_dacay # epsilon decay rate 126 | self.min_epsilon = min_epsilon # minimum epsilon 127 | self.gamma = gamma # discount rate 128 | self.batch_size = batch_size # minibatch k 129 | self.target_update_iter = target_update_iter # target network update period 130 | self.train_nums = train_nums # total training steps 131 | 132 | # replay buffer params [(s, a, r, ns, done), ...] 133 | self.b_obs = np.empty((self.batch_size,) + self.env.reset().shape) 134 | self.b_actions = np.empty(self.batch_size, dtype=np.int8) 135 | self.b_rewards = np.empty(self.batch_size, dtype=np.float32) 136 | self.b_next_states = np.empty((self.batch_size,) + self.env.reset().shape) 137 | self.b_dones = np.empty(self.batch_size, dtype=np.bool) 138 | 139 | self.replay_buffer = SumTree(buffer_size) # sum-tree data structure 140 | self.buffer_size = buffer_size # replay buffer size N 141 | self.replay_period = replay_period # replay period K 142 | self.alpha = alpha # priority parameter, alpha=[0, 0.4, 0.5, 0.6, 0.7, 0.8] 143 | self.beta = beta # importance sampling parameter, beta=[0, 0.4, 0.5, 0.6, 1] 144 | self.beta_increment_per_sample = beta_increment_per_sample 145 | self.num_in_buffer = 0 # total number of transitions stored in buffer 146 | self.margin = 0.01 # pi = |td_error| + margin 147 | self.p1 = 1 # initialize priority for the first transition 148 | # self.is_weight = np.empty((None, 1)) 149 | self.is_weight = np.power(self.buffer_size, -self.beta) # because p1 == 1 150 | self.abs_error_upper = 1 151 | 152 | def _per_loss(self, y_target, y_pred): 153 | return tf.reduce_mean(self.is_weight * tf.math.squared_difference(y_target, y_pred)) 154 | 155 | def train(self): 156 | # initialize the initial observation of the agent 157 | obs = self.env.reset() 158 | for t in range(1, self.train_nums): 159 | best_action, q_values = self.model.action_value(obs[None]) # input the obs to the network model 160 | action = self.get_action(best_action) # get the real action 161 | next_obs, reward, done, info = self.env.step(action) # take the action in the env to return s', r, done 162 | if t == 1: 163 | p = self.p1 164 | else: 165 | p = np.max(self.replay_buffer.tree[-self.replay_buffer.capacity:]) 166 | self.store_transition(p, obs, action, reward, next_obs, done) # store that transition into replay butter 167 | self.num_in_buffer = min(self.num_in_buffer + 1, self.buffer_size) 168 | 169 | if t > self.buffer_size: 170 | # if t % self.replay_period == 0: # transition sampling and update 171 | losses = self.train_step() 172 | if t % 1000 == 0: 173 | print('losses each 1000 steps: ', losses) 174 | 175 
| if t % self.target_update_iter == 0: 176 | self.update_target_model() 177 | if done: 178 | obs = self.env.reset() # one episode end 179 | else: 180 | obs = next_obs 181 | 182 | def train_step(self): 183 | idxes, self.is_weight = self.sum_tree_sample(self.batch_size) 184 | # Double Q-Learning 185 | best_action_idxes, _ = self.model.action_value(self.b_next_states) # get actions through the current network 186 | target_q = self.get_target_value(self.b_next_states) # get target q-value through the target network 187 | # get td_targets of batch states 188 | td_target = self.b_rewards + \ 189 | self.gamma * target_q[np.arange(target_q.shape[0]), best_action_idxes] * (1 - self.b_dones) 190 | predict_q = self.model.predict(self.b_obs) 191 | td_predict = predict_q[np.arange(predict_q.shape[0]), self.b_actions] 192 | abs_td_error = np.abs(td_target - td_predict) + self.margin 193 | clipped_error = np.where(abs_td_error < self.abs_error_upper, abs_td_error, self.abs_error_upper) 194 | ps = np.power(clipped_error, self.alpha) 195 | # priorities update 196 | for idx, p in zip(idxes, ps): 197 | self.replay_buffer.update(idx, p) 198 | 199 | for i, val in enumerate(self.b_actions): 200 | predict_q[i][val] = td_target[i] 201 | 202 | target_q = predict_q # just to change a more explicit name 203 | losses = self.model.train_on_batch(self.b_obs, target_q) 204 | 205 | return losses 206 | 207 | # proportional prioritization sampling 208 | def sum_tree_sample(self, k): 209 | idxes = [] 210 | is_weights = np.empty((k, 1)) 211 | self.beta = min(1., self.beta + self.beta_increment_per_sample) 212 | # calculate max_weight 213 | min_prob = np.min(self.replay_buffer.tree[-self.replay_buffer.capacity:]) / self.replay_buffer.total_p 214 | max_weight = np.power(self.buffer_size * min_prob, -self.beta) 215 | segment = self.replay_buffer.total_p / k 216 | for i in range(k): 217 | s = np.random.uniform(segment * i, segment * (i + 1)) 218 | idx, p, t = self.replay_buffer.get_leaf(s) 219 | idxes.append(idx) 220 | self.b_obs[i], self.b_actions[i], self.b_rewards[i], self.b_next_states[i], self.b_dones[i] = t 221 | # P(j) 222 | sampling_probabilities = p / self.replay_buffer.total_p # where p = p ** self.alpha 223 | is_weights[i, 0] = np.power(self.buffer_size * sampling_probabilities, -self.beta) / max_weight 224 | return idxes, is_weights 225 | 226 | def evaluation(self, env, render=True): 227 | obs, done, ep_reward = env.reset(), False, 0 228 | # one episode until done 229 | while not done: 230 | action, q_values = self.model.action_value(obs[None]) # Using [None] to extend its dimension (4,) -> (1, 4) 231 | obs, reward, done, info = env.step(action) 232 | ep_reward += reward 233 | if render: # visually show 234 | env.render() 235 | time.sleep(0.05) 236 | env.close() 237 | return ep_reward 238 | 239 | # store transitions into replay butter, now sum tree. 
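    # --- editor's note (illustrative): the importance-sampling correction computed above ---
    # sum_tree_sample() implements the PER weight
    #     w_j = (N * P(j)) ** (-beta) / max_i w_i,    with P(j) = p_j / total_p,
    # where beta is annealed toward 1 by beta_increment_per_sample. Dividing by the weight of the
    # lowest-probability leaf (max_weight) keeps every w_j <= 1, so the weights only ever scale
    # gradients down. They reach the loss through self.is_weight inside _per_loss() above.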
240 | def store_transition(self, priority, obs, action, reward, next_state, done): 241 | transition = [obs, action, reward, next_state, done] 242 | self.replay_buffer.add(priority, transition) 243 | 244 | # rank-based prioritization sampling 245 | def rand_based_sample(self, k): 246 | pass 247 | 248 | # e-greedy 249 | def get_action(self, best_action): 250 | if np.random.rand() < self.epsilon: 251 | return self.env.action_space.sample() 252 | return best_action 253 | 254 | # assign the current network parameters to target network 255 | def update_target_model(self): 256 | self.target_model.set_weights(self.model.get_weights()) 257 | 258 | def get_target_value(self, obs): 259 | return self.target_model.predict(obs) 260 | 261 | def e_decay(self): 262 | self.epsilon *= self.epsilon_decay 263 | 264 | 265 | if __name__ == '__main__': 266 | test_model() 267 | 268 | env = gym.make("CartPole-v0") 269 | num_actions = env.action_space.n 270 | model = Model(num_actions) 271 | target_model = Model(num_actions) 272 | agent = DDDQNAgent(model, target_model, env) 273 | # test before 274 | rewards_sum = agent.evaluation(env) 275 | print("Before Training: %d out of 200" % rewards_sum) # 9 out of 200 276 | 277 | agent.train() 278 | # test after 279 | # env = gym.wrappers.Monitor(env, './recording', force=True) 280 | rewards_sum = agent.evaluation(env) 281 | print("After Training: %d out of 200" % rewards_sum) # 200 out of 200 282 | -------------------------------------------------------------------------------- /05_multistep_td.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple version of Multi-Step TD Learning Based on Dueling Double DQN with Prioritized Experience Replay. 3 | To play CartPole-v0. 4 | 5 | Using: 6 | TensorFlow 2.0 7 | Numpy 1.16.2 8 | Gym 0.12.1 9 | """ 10 | 11 | import tensorflow as tf 12 | print(tf.__version__) 13 | 14 | import gym 15 | import time 16 | import numpy as np 17 | import tensorflow.keras.layers as kl 18 | import tensorflow.keras.optimizers as ko 19 | 20 | from collections import deque 21 | 22 | np.random.seed(1) 23 | tf.random.set_seed(1) 24 | 25 | # Neural Network Model Defined at Here. 26 | class Model(tf.keras.Model): 27 | def __init__(self, num_actions): 28 | super().__init__(name='basic_nstepTD') 29 | # you can try different kernel initializer 30 | self.shared_fc1 = kl.Dense(16, activation='relu', kernel_initializer='he_uniform') 31 | self.shared_fc2 = kl.Dense(32, activation='relu', kernel_initializer='he_uniform') 32 | # there is a trick that combining the two streams' fc layer, then 33 | # the output of that layer is a |A| + 1 dimension tensor: |V|A1|A2| ... |An| 34 | # output[:, 0] is state value, output[:, 1:] is action advantage 35 | self.val_adv_fc = kl.Dense(num_actions + 1, activation='relu', kernel_initializer='he_uniform') 36 | 37 | # forward propagation 38 | def call(self, inputs): 39 | x = self.shared_fc1(inputs) 40 | x = self.shared_fc2(x) 41 | val_adv = self.val_adv_fc(x) 42 | # average version, you can also try the max version. 
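        # (editor's note) the line below is the "average" dueling aggregation
        #     Q(s, a) = V(s) + ( A(s, a) - mean_a' A(s, a') )
        # with val_adv[:, 0] as V(s) and val_adv[:, 1:] as the advantages. The "max" variant from
        # the dueling paper subtracts the maximum advantage instead, e.g.
        #     outputs = tf.expand_dims(val_adv[:, 0], -1) + \
        #               (val_adv[:, 1:] - tf.reduce_max(val_adv[:, 1:], -1, keepdims=True))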
43 | outputs = tf.expand_dims(val_adv[:, 0], -1) + (val_adv[:, 1:] - tf.reduce_mean(val_adv[:, 1:], -1, keepdims=True)) 44 | return outputs 45 | 46 | # a* = argmax_a' Q(s, a') 47 | def action_value(self, obs): 48 | q_values = self.predict(obs) 49 | best_action = np.argmax(q_values, axis=-1) 50 | return best_action if best_action.shape[0] > 1 else best_action[0], q_values[0] 51 | 52 | 53 | # To test whether the model works 54 | def test_model(): 55 | env = gym.make('CartPole-v0') 56 | print('num_actions: ', env.action_space.n) 57 | model = Model(env.action_space.n) 58 | 59 | obs = env.reset() 60 | print('obs_shape: ', obs.shape) 61 | 62 | # tensorflow 2.0 eager mode: no feed_dict or tf.Session() needed at all 63 | best_action, q_values = model.action_value(obs[None]) 64 | print('res of test model: ', best_action, q_values) # 0 [ 0.00896799 -0.02111824] 65 | 66 | 67 | # replay buffer 68 | class SumTree: 69 | # little modified from https://github.com/jaromiru/AI-blog/blob/master/SumTree.py 70 | def __init__(self, capacity): 71 | self.capacity = capacity # N, the size of replay buffer, so as to the number of sum tree's leaves 72 | self.tree = np.zeros(2 * capacity - 1) # equation, to calculate the number of nodes in a sum tree 73 | self.transitions = np.empty(capacity, dtype=object) 74 | self.next_idx = 0 75 | 76 | @property 77 | def total_p(self): 78 | return self.tree[0] 79 | 80 | def add(self, priority, transition): 81 | idx = self.next_idx + self.capacity - 1 82 | self.transitions[self.next_idx] = transition 83 | self.update(idx, priority) 84 | self.next_idx = (self.next_idx + 1) % self.capacity 85 | 86 | def update(self, idx, priority): 87 | change = priority - self.tree[idx] 88 | self.tree[idx] = priority 89 | self._propagate(idx, change) # O(logn) 90 | 91 | def _propagate(self, idx, change): 92 | parent = (idx - 1) // 2 93 | self.tree[parent] += change 94 | if parent != 0: 95 | self._propagate(parent, change) 96 | 97 | def get_leaf(self, s): 98 | idx = self._retrieve(0, s) # from root 99 | trans_idx = idx - self.capacity + 1 100 | return idx, self.tree[idx], self.transitions[trans_idx] 101 | 102 | def _retrieve(self, idx, s): 103 | left = 2 * idx + 1 104 | right = left + 1 105 | if left >= len(self.tree): 106 | return idx 107 | if s <= self.tree[left]: 108 | return self._retrieve(left, s) 109 | else: 110 | return self._retrieve(right, s - self.tree[left]) 111 | 112 | 113 | class MSTDAgent: # Multi-Step TD Learning Based on Dueling Double DQN with Proportional Prioritization 114 | def __init__(self, model, target_model, env, learning_rate=.0008, epsilon=.1, epsilon_dacay=0.995, min_epsilon=.01, 115 | gamma=.9, batch_size=8, target_update_iter=400, train_nums=5000, buffer_size=300, replay_period=20, 116 | alpha=0.4, beta=0.4, beta_increment_per_sample=0.001, n_step=3): 117 | self.model = model 118 | self.target_model = target_model 119 | # gradient clip 120 | opt = ko.Adam(learning_rate=learning_rate, clipvalue=10.0) # , clipvalue=10.0 121 | self.model.compile(optimizer=opt, loss=self._per_loss) # loss=self._per_loss 122 | 123 | # parameters 124 | self.env = env # gym environment 125 | self.lr = learning_rate # learning step 126 | self.epsilon = epsilon # e-greedy when exploring 127 | self.epsilon_decay = epsilon_dacay # epsilon decay rate 128 | self.min_epsilon = min_epsilon # minimum epsilon 129 | self.gamma = gamma # discount rate 130 | self.batch_size = batch_size # minibatch k 131 | self.target_update_iter = target_update_iter # target network update period 132 | self.train_nums = 
train_nums # total training steps 133 | 134 | # replay buffer params [(s, a, r, ns, done), ...] 135 | self.b_obs = np.empty((self.batch_size,) + self.env.reset().shape) 136 | self.b_actions = np.empty(self.batch_size, dtype=np.int8) 137 | self.b_rewards = np.empty(self.batch_size, dtype=np.float32) 138 | self.b_next_states = np.empty((self.batch_size,) + self.env.reset().shape) 139 | self.b_dones = np.empty(self.batch_size, dtype=np.bool) 140 | 141 | self.replay_buffer = SumTree(buffer_size) # sum-tree data structure 142 | self.buffer_size = buffer_size # replay buffer size N 143 | self.replay_period = replay_period # replay period K 144 | self.alpha = alpha # priority parameter, alpha=[0, 0.4, 0.5, 0.6, 0.7, 0.8] 145 | self.beta = beta # importance sampling parameter, beta=[0, 0.4, 0.5, 0.6, 1] 146 | self.beta_increment_per_sample = beta_increment_per_sample 147 | self.num_in_buffer = 0 # total number of transitions stored in buffer 148 | self.margin = 0.01 # pi = |td_error| + margin 149 | self.p1 = 1 # initialize priority for the first transition 150 | # self.is_weight = np.empty((None, 1)) 151 | self.is_weight = np.power(self.buffer_size, -self.beta) # because p1 == 1 152 | self.abs_error_upper = 1 153 | 154 | # multi step TD learning 155 | self.n_step = n_step 156 | self.n_step_buffer = deque(maxlen=n_step) 157 | 158 | def _per_loss(self, y_target, y_pred): 159 | return tf.reduce_mean(self.is_weight * tf.math.squared_difference(y_target, y_pred)) 160 | 161 | def train(self): 162 | # initialize the initial observation of the agent 163 | obs = self.env.reset() 164 | for t in range(1, self.train_nums): 165 | best_action, q_values = self.model.action_value(obs[None]) # input the obs to the network model 166 | action = self.get_action(best_action) # get the real action 167 | next_obs, reward, done, info = self.env.step(action) # take the action in the env to return s', r, done 168 | 169 | # n-step replay buffer 170 | # minor modified from github.com/medipixel/rl_algorithms/blob/master/algorithms/common/helper_functions.py 171 | temp_transition = [obs, action, reward, next_obs, done] 172 | self.n_step_buffer.append(temp_transition) 173 | if len(self.n_step_buffer) == self.n_step: # fill the n-step buffer for the first translation 174 | # add a multi step transition 175 | reward, next_obs, done = self.get_n_step_info(self.n_step_buffer, self.gamma) 176 | obs, action = self.n_step_buffer[0][:2] 177 | 178 | if t == 1: 179 | p = self.p1 180 | else: 181 | p = np.max(self.replay_buffer.tree[-self.replay_buffer.capacity:]) 182 | self.store_transition(p, obs, action, reward, next_obs, done) # store that transition into replay butter 183 | self.num_in_buffer = min(self.num_in_buffer + 1, self.buffer_size) 184 | 185 | if t > self.buffer_size: 186 | # if t % self.replay_period == 0: # transition sampling and update 187 | losses = self.train_step() 188 | if t % 1000 == 0: 189 | print('losses each 1000 steps: ', losses) 190 | 191 | if t % self.target_update_iter == 0: 192 | self.update_target_model() 193 | if done: 194 | obs = self.env.reset() # one episode end 195 | else: 196 | obs = next_obs 197 | 198 | def train_step(self): 199 | idxes, self.is_weight = self.sum_tree_sample(self.batch_size) 200 | assert len(idxes) == self.b_next_states.shape[0] 201 | 202 | # Double Q-Learning 203 | best_action_idxes, _ = self.model.action_value(self.b_next_states) # get actions through the current network 204 | target_q = self.get_target_value(self.b_next_states) # get target q-value through the target network 205 | # 
get td_targets of batch states 206 | td_target = self.b_rewards + \ 207 | self.gamma * target_q[np.arange(target_q.shape[0]), best_action_idxes] * (1 - self.b_dones) 208 | predict_q = self.model.predict(self.b_obs) 209 | td_predict = predict_q[np.arange(predict_q.shape[0]), self.b_actions] 210 | abs_td_error = np.abs(td_target - td_predict) + self.margin 211 | clipped_error = np.where(abs_td_error < self.abs_error_upper, abs_td_error, self.abs_error_upper) 212 | ps = np.power(clipped_error, self.alpha) 213 | # priorities update 214 | for idx, p in zip(idxes, ps): 215 | self.replay_buffer.update(idx, p) 216 | 217 | for i, val in enumerate(self.b_actions): 218 | predict_q[i][val] = td_target[i] 219 | 220 | target_q = predict_q # just to change a more explicit name 221 | losses = self.model.train_on_batch(self.b_obs, target_q) 222 | 223 | return losses 224 | 225 | # proportional prioritization sampling 226 | def sum_tree_sample(self, k): 227 | idxes = [] 228 | is_weights = np.empty((k, 1)) 229 | self.beta = min(1., self.beta + self.beta_increment_per_sample) 230 | # calculate max_weight 231 | min_prob = np.min(self.replay_buffer.tree[-self.replay_buffer.capacity:]) / self.replay_buffer.total_p 232 | max_weight = np.power(self.buffer_size * min_prob, -self.beta) 233 | segment = self.replay_buffer.total_p / k 234 | for i in range(k): 235 | s = np.random.uniform(segment * i, segment * (i + 1)) 236 | idx, p, t = self.replay_buffer.get_leaf(s) 237 | idxes.append(idx) 238 | self.b_obs[i], self.b_actions[i], self.b_rewards[i], self.b_next_states[i], self.b_dones[i] = t 239 | # P(j) 240 | sampling_probabilities = p / self.replay_buffer.total_p # where p = p ** self.alpha 241 | is_weights[i, 0] = np.power(self.buffer_size * sampling_probabilities, -self.beta) / max_weight 242 | return idxes, is_weights 243 | 244 | def evaluation(self, env, render=True): 245 | obs, done, ep_reward = env.reset(), False, 0 246 | # one episode until done 247 | while not done: 248 | action, q_values = self.model.action_value(obs[None]) # Using [None] to extend its dimension (4,) -> (1, 4) 249 | obs, reward, done, info = env.step(action) 250 | ep_reward += reward 251 | if render: # visually show 252 | env.render() 253 | time.sleep(0.05) 254 | env.close() 255 | return ep_reward 256 | 257 | # store transitions into replay butter, now sum tree. 
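    # --- editor's note (illustrative): what the n-step machinery below computes ---
    # get_n_step_info() folds the n_step_buffer deque into a single transition
    #     ( s_t, a_t, R = r_t + gamma*r_{t+1} + ... + gamma**(n-1)*r_{t+n-1}, s_{t+n}, done ),
    # truncating the sum at an episode boundary inside the window. The textbook n-step target then
    # bootstraps with gamma**n:
    #     y = R + gamma**n * Q_target(s_{t+n}, argmax_a Q_online(s_{t+n}, a))
    # Note that train_step() above multiplies the bootstrap term by a single self.gamma; swapping in
    # self.gamma ** self.n_step would match the standard n-step formulation.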
258 | def store_transition(self, priority, obs, action, reward, next_state, done): 259 | transition = [obs, action, reward, next_state, done] 260 | self.replay_buffer.add(priority, transition) 261 | 262 | # minor modified from https://github.com/medipixel/rl_algorithms/blob/master/algorithms/common/helper_functions.py 263 | def get_n_step_info(self, n_step_buffer, gamma): 264 | """Return n step reward, next state, and done.""" 265 | # info of the last transition 266 | reward, next_state, done = n_step_buffer[-1][-3:] 267 | 268 | for transition in reversed(list(n_step_buffer)[:-1]): 269 | r, n_s, d = transition[-3:] 270 | 271 | reward = r + gamma * reward * (1 - d) 272 | next_state, done = (n_s, d) if d else (next_state, done) 273 | 274 | return reward, next_state, done 275 | 276 | 277 | # rank-based prioritization sampling 278 | def rand_based_sample(self, k): 279 | pass 280 | 281 | # e-greedy 282 | def get_action(self, best_action): 283 | if np.random.rand() < self.epsilon: 284 | return self.env.action_space.sample() 285 | return best_action 286 | 287 | # assign the current network parameters to target network 288 | def update_target_model(self): 289 | self.target_model.set_weights(self.model.get_weights()) 290 | 291 | def get_target_value(self, obs): 292 | return self.target_model.predict(obs) 293 | 294 | def e_decay(self): 295 | self.epsilon *= self.epsilon_decay 296 | 297 | 298 | if __name__ == '__main__': 299 | test_model() 300 | 301 | env = gym.make("CartPole-v0") 302 | num_actions = env.action_space.n 303 | model = Model(num_actions) 304 | target_model = Model(num_actions) 305 | agent = MSTDAgent(model, target_model, env) 306 | # test before 307 | rewards_sum = agent.evaluation(env) 308 | print("Before Training: %d out of 200" % rewards_sum) # 9 out of 200 309 | 310 | agent.train() 311 | # test after 312 | # env = gym.wrappers.Monitor(env, './recording', force=True) 313 | rewards_sum = agent.evaluation(env) 314 | print("After Training: %d out of 200" % rewards_sum) # 200 out of 200 315 | -------------------------------------------------------------------------------- /06_distributional_rl.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple version of Distributional RL Based on Multi-Step Dueling Double DQN with Prioritized Experience Replay. 3 | To play CartPole-v0. 4 | 5 | Using: 6 | TensorFlow 2.0 7 | Numpy 1.16.2 8 | Gym 0.12.1 9 | """ 10 | 11 | import tensorflow as tf 12 | print(tf.__version__) 13 | 14 | import gym 15 | import time 16 | import numpy as np 17 | import tensorflow.keras.layers as kl 18 | import tensorflow.keras.optimizers as ko 19 | 20 | from collections import deque 21 | 22 | np.random.seed(1) 23 | tf.random.set_seed(1) 24 | 25 | # Neural Network Model Defined at Here. 26 | class Model(tf.keras.Model): 27 | def __init__(self, num_actions, num_atoms): 28 | super().__init__(name='basic_distributional_rl') 29 | self.num_actions = num_actions 30 | self.num_atoms = num_atoms 31 | # you can try different kernel initializer 32 | self.shared_fc1 = kl.Dense(16, activation='relu', kernel_initializer='he_uniform') 33 | self.shared_fc2 = kl.Dense(32, activation='relu', kernel_initializer='he_uniform') 34 | # still use the dueling network architecture, but now: 35 | # V | v_0| v_1| ... | v_N-1| 36 | # A |a1_0|a1_1| ... |a1_N-1| 37 | # |a2_0|a2_1| ... |a2_N-1| 38 | # . . . 39 | # . . . 40 | # |an_0|an_1| ... 
|an_N-1| 41 | # the output of that layer is a (|A| + 1) * N dimension tensor 42 | # each column is a |A| + 1 dimension tensor for each atom. 43 | self.val_adv_fc = kl.Dense((num_actions + 1) * num_atoms, activation='relu', kernel_initializer='he_uniform') 44 | 45 | # forward propagation 46 | def call(self, inputs): 47 | x = self.shared_fc1(inputs) 48 | x = self.shared_fc2(x) 49 | val_adv = self.val_adv_fc(x) 50 | # average version, you can also try the max version. 51 | val_adv = tf.reshape(val_adv, [-1, self.num_actions + 1, self.num_atoms]) 52 | outputs = tf.expand_dims(val_adv[:, 0], 1) + (val_adv[:, 1:] - tf.reduce_mean(val_adv[:, 1:], -1, keepdims=True)) 53 | # you may need tf.nn.log_softmax() 54 | outputs = tf.nn.softmax(outputs, axis=-1) 55 | 56 | return outputs 57 | 58 | # a* = argmax_a' Q(s, a') 59 | def action_value(self, obs, support_z): 60 | r_distribute = self.predict(obs) 61 | q_values = np.sum(r_distribute * support_z, axis=-1) 62 | best_action = np.argmax(q_values, axis=-1) 63 | return best_action if best_action.shape[0] > 1 else best_action[0], q_values[0] 64 | 65 | 66 | # To test whether the model works 67 | def test_model(): 68 | num_atoms = 11 69 | support_z = np.linspace(-5.0, 5.0, num_atoms) 70 | env = gym.make('CartPole-v0') 71 | print('num_actions: ', env.action_space.n) 72 | model = Model(env.action_space.n, num_atoms) 73 | 74 | obs = env.reset() 75 | print('obs_shape: ', obs.shape) 76 | 77 | # tensorflow 2.0 eager mode: no feed_dict or tf.Session() needed at all 78 | best_action, q_values = model.action_value(obs[None], support_z) 79 | print('res of test model: ', best_action, q_values) # 0 [ 0.00896799 -0.02111824] 80 | 81 | 82 | # replay buffer 83 | class SumTree: 84 | # little modified from https://github.com/jaromiru/AI-blog/blob/master/SumTree.py 85 | def __init__(self, capacity): 86 | self.capacity = capacity # N, the size of replay buffer, so as to the number of sum tree's leaves 87 | self.tree = np.zeros(2 * capacity - 1) # equation, to calculate the number of nodes in a sum tree 88 | self.transitions = np.empty(capacity, dtype=object) 89 | self.next_idx = 0 90 | 91 | @property 92 | def total_p(self): 93 | return self.tree[0] 94 | 95 | def add(self, priority, transition): 96 | idx = self.next_idx + self.capacity - 1 97 | self.transitions[self.next_idx] = transition 98 | self.update(idx, priority) 99 | self.next_idx = (self.next_idx + 1) % self.capacity 100 | 101 | def update(self, idx, priority): 102 | change = priority - self.tree[idx] 103 | self.tree[idx] = priority 104 | self._propagate(idx, change) # O(logn) 105 | 106 | def _propagate(self, idx, change): 107 | parent = (idx - 1) // 2 108 | self.tree[parent] += change 109 | if parent != 0: 110 | self._propagate(parent, change) 111 | 112 | def get_leaf(self, s): 113 | idx = self._retrieve(0, s) # from root 114 | trans_idx = idx - self.capacity + 1 115 | return idx, self.tree[idx], self.transitions[trans_idx] 116 | 117 | def _retrieve(self, idx, s): 118 | left = 2 * idx + 1 119 | right = left + 1 120 | if left >= len(self.tree): 121 | return idx 122 | if s <= self.tree[left]: 123 | return self._retrieve(left, s) 124 | else: 125 | return self._retrieve(right, s - self.tree[left]) 126 | 127 | 128 | class DISTAgent: # Distributional RL Based on Multi-Step Dueling Double DQN with Proportional Prioritization 129 | def __init__(self, model, target_model, env, learning_rate=.001, epsilon=.1, epsilon_dacay=0.995, min_epsilon=.01, 130 | gamma=.9, batch_size=8, target_update_iter=400, train_nums=5000, 
buffer_size=300, replay_period=20, 131 | alpha=0.4, beta=0.4, beta_increment_per_sample=0.001, n_step=3, atom_num=11, vmin=-3.0, vmax=3.0): 132 | self.model = model 133 | self.target_model = target_model 134 | # gradient clip 135 | opt = ko.Adam(learning_rate=learning_rate, clipvalue=10.0) # , clipvalue=10.0 136 | self.model.compile(optimizer=opt, loss=self._per_loss) # loss=self._per_loss 137 | 138 | # parameters 139 | self.env = env # gym environment 140 | self.lr = learning_rate # learning step 141 | self.epsilon = epsilon # e-greedy when exploring 142 | self.epsilon_decay = epsilon_dacay # epsilon decay rate 143 | self.min_epsilon = min_epsilon # minimum epsilon 144 | self.gamma = gamma # discount rate 145 | self.batch_size = batch_size # minibatch k 146 | self.target_update_iter = target_update_iter # target network update period 147 | self.train_nums = train_nums # total training steps 148 | 149 | # replay buffer params [(s, a, r, ns, done), ...] 150 | self.b_obs = np.empty((self.batch_size,) + self.env.reset().shape) 151 | self.b_actions = np.empty(self.batch_size, dtype=np.int8) 152 | self.b_rewards = np.empty(self.batch_size, dtype=np.float32) 153 | self.b_next_states = np.empty((self.batch_size,) + self.env.reset().shape) 154 | self.b_dones = np.empty(self.batch_size, dtype=np.bool) 155 | 156 | self.replay_buffer = SumTree(buffer_size) # sum-tree data structure 157 | self.buffer_size = buffer_size # replay buffer size N 158 | self.replay_period = replay_period # replay period K 159 | self.alpha = alpha # priority parameter, alpha=[0, 0.4, 0.5, 0.6, 0.7, 0.8] 160 | self.beta = beta # importance sampling parameter, beta=[0, 0.4, 0.5, 0.6, 1] 161 | self.beta_increment_per_sample = beta_increment_per_sample 162 | self.num_in_buffer = 0 # total number of transitions stored in buffer 163 | self.margin = 0.01 # pi = |td_error| + margin 164 | self.p1 = 1 # initialize priority for the first transition 165 | # self.is_weight = np.empty((None, 1)) 166 | self.is_weight = np.power(self.buffer_size, -self.beta) # because p1 == 1 167 | self.abs_error_upper = 1 168 | 169 | # multi step TD learning 170 | self.n_step = n_step 171 | self.n_step_buffer = deque(maxlen=n_step) 172 | 173 | # distributional rl 174 | self.atom_num = atom_num 175 | self.vmin = vmin 176 | self.vmax = vmax 177 | self.support_z = np.expand_dims(np.linspace(vmin, vmax, atom_num), 0) 178 | self.delta_z = (vmax - vmin) / (atom_num - 1) 179 | 180 | def _per_loss(self, y_target, y_pred): 181 | return tf.reduce_mean(self.is_weight * tf.math.squared_difference(y_target, y_pred)) 182 | 183 | def _kl_loss(self, y_target, y_pred): # cross_entropy loss 184 | return tf.reduce_mean(self.is_weight * tf.nn.softmax_cross_entropy_with_logits(labels=y_pred, logits=y_target)) 185 | 186 | def train(self): 187 | # initialize the initial observation of the agent 188 | obs = self.env.reset() 189 | for t in range(1, self.train_nums): 190 | best_action, _ = self.model.action_value(obs[None], self.support_z) # input the obs to the network model 191 | action = self.get_action(best_action) # get the real action 192 | next_obs, reward, done, info = self.env.step(action) # take the action in the env to return s', r, done 193 | 194 | # n-step replay buffer 195 | # minor modified from github.com/medipixel/rl_algorithms/blob/master/algorithms/common/helper_functions.py 196 | temp_transition = [obs, action, reward, next_obs, done] 197 | self.n_step_buffer.append(temp_transition) 198 | if len(self.n_step_buffer) == self.n_step: # fill the n-step buffer for 
the first translation 199 | # add a multi step transition 200 | reward, next_obs, done = self.get_n_step_info(self.n_step_buffer, self.gamma) 201 | obs, action = self.n_step_buffer[0][:2] 202 | 203 | if t == 1: 204 | p = self.p1 205 | else: 206 | p = np.max(self.replay_buffer.tree[-self.replay_buffer.capacity:]) 207 | self.store_transition(p, obs, action, reward, next_obs, done) # store that transition into replay butter 208 | self.num_in_buffer = min(self.num_in_buffer + 1, self.buffer_size) 209 | 210 | if t > self.buffer_size: 211 | # if t % self.replay_period == 0: # transition sampling and update 212 | losses = self.train_step() 213 | if t % 1000 == 0: 214 | print('losses each 1000 steps: ', losses) 215 | 216 | if t % self.target_update_iter == 0: 217 | self.update_target_model() 218 | if done: 219 | obs = self.env.reset() # one episode end 220 | else: 221 | obs = next_obs 222 | 223 | def train_step(self): 224 | idxes, self.is_weight = self.sum_tree_sample(self.batch_size) 225 | assert len(idxes) == self.b_next_states.shape[0] 226 | 227 | # Double Q-Learning 228 | best_action_idxes, _ = self.model.action_value(self.b_next_states, self.support_z) # get actions through the current network 229 | target_distrib = self.get_target_value(self.b_next_states) # get target distrib through the target network 230 | target_q = np.sum(target_distrib * self.support_z, axis=-1) 231 | 232 | # get td_targets of batch states 233 | td_target = self.b_rewards + \ 234 | self.gamma * target_q[np.arange(target_q.shape[0]), best_action_idxes] * (1 - self.b_dones) 235 | 236 | predict_distrib = self.model.predict(self.b_obs) 237 | predict_q = np.sum(predict_distrib * self.support_z, axis=-1) 238 | 239 | td_predict = predict_q[np.arange(predict_q.shape[0]), self.b_actions] 240 | abs_td_error = np.abs(td_target - td_predict) + self.margin 241 | clipped_error = np.where(abs_td_error < self.abs_error_upper, abs_td_error, self.abs_error_upper) 242 | ps = np.power(clipped_error, self.alpha) 243 | # priorities update 244 | for idx, p in zip(idxes, ps): 245 | self.replay_buffer.update(idx, p) 246 | 247 | Tdistrib = target_distrib[np.arange(target_distrib.shape[0]), best_action_idxes] 248 | perjected_distrib = self.projected_distrib(Tdistrib) 249 | 250 | # perjected_distrib = np.c_[perjected_distrib, self.b_actions] 251 | null_distrib = np.zeros_like(predict_distrib) 252 | for i, val in enumerate(self.b_actions): 253 | null_distrib[i][val] = perjected_distrib[i] 254 | 255 | # print(self.b_actions) 256 | # print(perjected_distrib[:, -1].astype(np.int32)) 257 | # print(predict_distrib[np.arange(self.batch_size), perjected_distrib[:, -1].astype(np.int32)]) 258 | 259 | losses = self.model.train_on_batch(self.b_obs, null_distrib) 260 | 261 | return losses 262 | 263 | # proportional prioritization sampling 264 | def sum_tree_sample(self, k): 265 | idxes = [] 266 | is_weights = np.empty((k, 1)) 267 | self.beta = min(1., self.beta + self.beta_increment_per_sample) 268 | # calculate max_weight 269 | min_prob = np.min(self.replay_buffer.tree[-self.replay_buffer.capacity:]) / self.replay_buffer.total_p 270 | max_weight = np.power(self.buffer_size * min_prob, -self.beta) 271 | segment = self.replay_buffer.total_p / k 272 | for i in range(k): 273 | s = np.random.uniform(segment * i, segment * (i + 1)) 274 | idx, p, t = self.replay_buffer.get_leaf(s) 275 | idxes.append(idx) 276 | self.b_obs[i], self.b_actions[i], self.b_rewards[i], self.b_next_states[i], self.b_dones[i] = t 277 | # P(j) 278 | sampling_probabilities = p / 
self.replay_buffer.total_p # where p = p ** self.alpha 279 | is_weights[i, 0] = np.power(self.buffer_size * sampling_probabilities, -self.beta) / max_weight 280 | return idxes, is_weights 281 | 282 | def evaluation(self, env, render=True): 283 | obs, done, ep_reward = env.reset(), False, 0 284 | # one episode until done 285 | while not done: 286 | action, q_values = self.model.action_value(obs[None], self.support_z) # Using [None] to extend its dimension (4,) -> (1, 4) 287 | obs, reward, done, info = env.step(action) 288 | ep_reward += reward 289 | if render: # visually show 290 | env.render() 291 | time.sleep(0.05) 292 | env.close() 293 | return ep_reward 294 | 295 | # store transitions into replay butter, now sum tree. 296 | def store_transition(self, priority, obs, action, reward, next_state, done): 297 | transition = [obs, action, reward, next_state, done] 298 | self.replay_buffer.add(priority, transition) 299 | 300 | # rank-based prioritization sampling 301 | def rand_based_sample(self, k): 302 | pass 303 | 304 | # e-greedy 305 | def get_action(self, best_action): 306 | if np.random.rand() < self.epsilon: 307 | return self.env.action_space.sample() 308 | return best_action 309 | 310 | # assign the current network parameters to target network 311 | def update_target_model(self): 312 | self.target_model.set_weights(self.model.get_weights()) 313 | 314 | def get_target_value(self, obs): 315 | return self.target_model.predict(obs) 316 | 317 | def e_decay(self): 318 | self.epsilon *= self.epsilon_decay 319 | 320 | # minor modified from https://github.com/medipixel/rl_algorithms/blob/master/algorithms/common/helper_functions.py 321 | def get_n_step_info(self, n_step_buffer, gamma): 322 | """Return n step reward, next state, and done.""" 323 | # info of the last transition 324 | reward, next_state, done = n_step_buffer[-1][-3:] 325 | 326 | for transition in reversed(list(n_step_buffer)[:-1]): 327 | r, n_s, d = transition[-3:] 328 | 329 | reward = r + gamma * reward * (1 - d) 330 | next_state, done = (n_s, d) if d else (next_state, done) 331 | 332 | return reward, next_state, done 333 | 334 | def projected_distrib(self, Tdistrib): 335 | Tz = np.broadcast_to(self.support_z, [self.batch_size, self.support_z.shape[1]]) 336 | Tz = (self.gamma ** self.n_step) * Tz 337 | # stupid tensorflow don't know how to broad cast, angry!!! 
338 | for i in range(self.batch_size): 339 | Tz[i] += self.b_rewards[i] 340 | Tz = tf.clip_by_value(Tz, self.vmin, self.vmax) 341 | m = np.zeros_like(Tdistrib) 342 | b = (Tz - self.vmin) / self.delta_z 343 | l, u = tf.cast(tf.math.floor(b), tf.int32), tf.cast(tf.math.ceil(b), tf.int32) 344 | assert m.shape == l.shape 345 | Ldistrib = Tdistrib * (tf.cast(u, tf.float64) - b) 346 | Udistrib = Tdistrib * (b - tf.cast(l, tf.float64)) 347 | for i in range(self.batch_size): 348 | np.add.at(m[i], np.asarray(l)[i], Ldistrib[i]) 349 | np.add.at(m[i], np.asarray(u)[i], Udistrib[i]) 350 | return tf.clip_by_value(m, 0.0, 1.0) 351 | 352 | 353 | 354 | if __name__ == '__main__': 355 | test_model() 356 | 357 | num_atoms = 11 358 | env = gym.make("CartPole-v0") 359 | num_actions = env.action_space.n 360 | model = Model(num_actions, num_atoms) 361 | target_model = Model(num_actions, num_atoms) 362 | agent = DISTAgent(model, target_model, env, atom_num=num_atoms) 363 | # test before 364 | rewards_sum = agent.evaluation(env) 365 | print("Before Training: %d out of 200" % rewards_sum) # 9 out of 200 366 | 367 | agent.train() 368 | # test after 369 | # env = gym.wrappers.Monitor(env, './recording', force=True) 370 | rewards_sum = agent.evaluation(env) 371 | print("After Training: %d out of 200" % rewards_sum) # 200 out of 200 372 | -------------------------------------------------------------------------------- /07_noisynet.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple version of Multi-Step TD Learning Based on Dueling Double DQN with Prioritized Experience Replay. 3 | To play CartPole-v0. 4 | 5 | Using: 6 | TensorFlow 2.0 7 | Numpy 1.16.2 8 | Gym 0.12.1 9 | """ 10 | 11 | import tensorflow as tf 12 | print(tf.__version__) 13 | 14 | import gym 15 | import time 16 | import numpy as np 17 | import tensorflow.keras.layers as kl 18 | import tensorflow.keras.optimizers as ko 19 | 20 | 21 | from collections import deque 22 | 23 | np.random.seed(1) 24 | tf.random.set_seed(1) 25 | 26 | # Neural Network Model Defined at Here. 27 | class Model(tf.keras.Model): 28 | def __init__(self, num_actions): 29 | super().__init__(name='basic_nstepTD') 30 | # you can try different kernel initializer 31 | self.shared_fc1 = kl.Dense(16, activation='relu', kernel_initializer='he_uniform') 32 | self.shared_fc2 = kl.Dense(32, activation='relu', kernel_initializer='he_uniform') 33 | # there is a trick that combining the two streams' fc layer, then 34 | # the output of that layer is a |A| + 1 dimension tensor: |V|A1|A2| ... |An| 35 | # output[:, 0] is state value, output[:, 1:] is action advantage 36 | self.val_adv_fc = NoisyDense(num_actions + 1, input_dim=32) 37 | 38 | # forward propagation 39 | def call(self, inputs): 40 | x = self.shared_fc1(inputs) 41 | x = self.shared_fc2(x) 42 | val_adv = self.val_adv_fc(x) 43 | # average version, you can also try the max version. 
44 | outputs = tf.expand_dims(val_adv[:, 0], -1) + (val_adv[:, 1:] - tf.reduce_mean(val_adv[:, 1:], -1, keepdims=True)) 45 | return outputs 46 | 47 | # a* = argmax_a' Q(s, a') 48 | def action_value(self, obs): 49 | q_values = self.predict(obs) 50 | best_action = np.argmax(q_values, axis=-1) 51 | return best_action if best_action.shape[0] > 1 else best_action[0], q_values[0] 52 | 53 | 54 | # Factorized Gaussian Noise Layer 55 | # Reference from https://github.com/Kaixhin/Rainbow/blob/master/model.py 56 | class NoisyDense(kl.Layer): 57 | def __init__(self, units, input_dim, std_init=0.5): 58 | super().__init__() 59 | self.units = units 60 | self.std_init = std_init 61 | self.reset_noise(input_dim) 62 | mu_range = 1 / np.sqrt(input_dim) 63 | mu_initializer = tf.random_uniform_initializer(-mu_range, mu_range) 64 | sigma_initializer = tf.constant_initializer(self.std_init / np.sqrt(self.units)) 65 | 66 | self.weight_mu = tf.Variable(initial_value=mu_initializer(shape=(input_dim, units), dtype='float32'), 67 | trainable=True) 68 | 69 | self.weight_sigma = tf.Variable(initial_value=sigma_initializer(shape=(input_dim, units), dtype='float32'), 70 | trainable=True) 71 | 72 | self.bias_mu = tf.Variable(initial_value=mu_initializer(shape=(units,), dtype='float32'), 73 | trainable=True) 74 | 75 | self.bias_sigma = tf.Variable(initial_value=sigma_initializer(shape=(units,), dtype='float32'), 76 | trainable=True) 77 | 78 | def call(self, inputs): 79 | # output = tf.tensordot(inputs, self.kernel, 1) 80 | # tf.nn.bias_add(output, self.bias) 81 | # return output 82 | self.kernel = self.weight_mu + self.weight_sigma * self.weights_eps 83 | self.bias = self.bias_mu + self.bias_sigma * self.bias_eps 84 | return tf.matmul(inputs, self.kernel) + self.bias 85 | 86 | def _scale_noise(self, dim): 87 | noise = tf.random.normal([dim]) 88 | return tf.sign(noise) * tf.sqrt(tf.abs(noise)) 89 | 90 | def reset_noise(self, input_shape): 91 | eps_in = self._scale_noise(input_shape) 92 | eps_out = self._scale_noise(self.units) 93 | self.weights_eps = tf.multiply(tf.expand_dims(eps_in, 1), eps_out) 94 | self.bias_eps = eps_out 95 | 96 | 97 | # To test whether the model works 98 | def test_model(): 99 | env = gym.make('CartPole-v0') 100 | print('num_actions: ', env.action_space.n) 101 | model = Model(env.action_space.n) 102 | 103 | obs = env.reset() 104 | print('obs_shape: ', obs.shape) 105 | 106 | # tensorflow 2.0 eager mode: no feed_dict or tf.Session() needed at all 107 | best_action, q_values = model.action_value(obs[None]) 108 | print('res of test model: ', best_action, q_values) # 0 [ 0.00896799 -0.02111824] 109 | 110 | 111 | # replay buffer 112 | class SumTree: 113 | # little modified from https://github.com/jaromiru/AI-blog/blob/master/SumTree.py 114 | def __init__(self, capacity): 115 | self.capacity = capacity # N, the size of replay buffer, so as to the number of sum tree's leaves 116 | self.tree = np.zeros(2 * capacity - 1) # equation, to calculate the number of nodes in a sum tree 117 | self.transitions = np.empty(capacity, dtype=object) 118 | self.next_idx = 0 119 | 120 | @property 121 | def total_p(self): 122 | return self.tree[0] 123 | 124 | def add(self, priority, transition): 125 | idx = self.next_idx + self.capacity - 1 126 | self.transitions[self.next_idx] = transition 127 | self.update(idx, priority) 128 | self.next_idx = (self.next_idx + 1) % self.capacity 129 | 130 | def update(self, idx, priority): 131 | change = priority - self.tree[idx] 132 | self.tree[idx] = priority 133 | self._propagate(idx, 
change) # O(logn) 134 | 135 | def _propagate(self, idx, change): 136 | parent = (idx - 1) // 2 137 | self.tree[parent] += change 138 | if parent != 0: 139 | self._propagate(parent, change) 140 | 141 | def get_leaf(self, s): 142 | idx = self._retrieve(0, s) # from root 143 | trans_idx = idx - self.capacity + 1 144 | return idx, self.tree[idx], self.transitions[trans_idx] 145 | 146 | def _retrieve(self, idx, s): 147 | left = 2 * idx + 1 148 | right = left + 1 149 | if left >= len(self.tree): 150 | return idx 151 | if s <= self.tree[left]: 152 | return self._retrieve(left, s) 153 | else: 154 | return self._retrieve(right, s - self.tree[left]) 155 | 156 | 157 | class NoisyAgent: # Multi-Step TD Learning Based on Dueling Double DQN with Proportional Prioritization 158 | def __init__(self, model, target_model, env, learning_rate=.005, epsilon=.1, epsilon_dacay=0.995, min_epsilon=.01, 159 | gamma=.9, batch_size=8, target_update_iter=400, train_nums=4000, buffer_size=300, replay_period=20, 160 | alpha=0.4, beta=0.4, beta_increment_per_sample=0.001, n_step=3): 161 | self.model = model 162 | self.target_model = target_model 163 | # gradient clip 164 | opt = ko.Adam(learning_rate=learning_rate, clipvalue=10.0) # , clipvalue=10.0 165 | self.model.compile(optimizer=opt, loss=self._per_loss) # loss=self._per_loss 166 | 167 | # parameters 168 | self.env = env # gym environment 169 | self.lr = learning_rate # learning step 170 | self.epsilon = epsilon # e-greedy when exploring 171 | self.epsilon_decay = epsilon_dacay # epsilon decay rate 172 | self.min_epsilon = min_epsilon # minimum epsilon 173 | self.gamma = gamma # discount rate 174 | self.batch_size = batch_size # minibatch k 175 | self.target_update_iter = target_update_iter # target network update period 176 | self.train_nums = train_nums # total training steps 177 | 178 | # replay buffer params [(s, a, r, ns, done), ...] 
179 | self.b_obs = np.empty((self.batch_size,) + self.env.reset().shape) 180 | self.b_actions = np.empty(self.batch_size, dtype=np.int8) 181 | self.b_rewards = np.empty(self.batch_size, dtype=np.float32) 182 | self.b_next_states = np.empty((self.batch_size,) + self.env.reset().shape) 183 | self.b_dones = np.empty(self.batch_size, dtype=np.bool) 184 | 185 | self.replay_buffer = SumTree(buffer_size) # sum-tree data structure 186 | self.buffer_size = buffer_size # replay buffer size N 187 | self.replay_period = replay_period # replay period K 188 | self.alpha = alpha # priority parameter, alpha=[0, 0.4, 0.5, 0.6, 0.7, 0.8] 189 | self.beta = beta # importance sampling parameter, beta=[0, 0.4, 0.5, 0.6, 1] 190 | self.beta_increment_per_sample = beta_increment_per_sample 191 | self.num_in_buffer = 0 # total number of transitions stored in buffer 192 | self.margin = 0.01 # pi = |td_error| + margin 193 | self.p1 = 1 # initialize priority for the first transition 194 | # self.is_weight = np.empty((None, 1)) 195 | self.is_weight = np.power(self.buffer_size, -self.beta) # because p1 == 1 196 | self.abs_error_upper = 1 197 | 198 | # multi step TD learning 199 | self.n_step = n_step 200 | self.n_step_buffer = deque(maxlen=n_step) 201 | 202 | def _per_loss(self, y_target, y_pred): 203 | return tf.reduce_mean(self.is_weight * tf.math.squared_difference(y_target, y_pred)) 204 | 205 | def train(self): 206 | # initialize the initial observation of the agent 207 | obs = self.env.reset() 208 | for t in range(1, self.train_nums): 209 | action, q_values = self.model.action_value(obs[None]) # input the obs to the network model 210 | # action = self.get_action(best_action) # get the real action -- no need the e-greedy exploration 211 | next_obs, reward, done, info = self.env.step(action) # take the action in the env to return s', r, done 212 | 213 | # n-step replay buffer 214 | #################################################################################################### 215 | # minor modified from github.com/medipixel/rl_algorithms/blob/master/algorithms/common/helper_functions.py 216 | temp_transition = [obs, action, reward, next_obs, done] 217 | self.n_step_buffer.append(temp_transition) 218 | if len(self.n_step_buffer) == self.n_step: # fill the n-step buffer for the first translation 219 | # add a multi step transition 220 | reward, next_obs, done = self.get_n_step_info(self.n_step_buffer, self.gamma) 221 | obs, action = self.n_step_buffer[0][:2] 222 | #################################################################################################### 223 | 224 | if t == 1: 225 | p = self.p1 226 | else: 227 | p = np.max(self.replay_buffer.tree[-self.replay_buffer.capacity:]) 228 | self.store_transition(p, obs, action, reward, next_obs, done) # store that transition into replay butter 229 | self.num_in_buffer = min(self.num_in_buffer + 1, self.buffer_size) 230 | 231 | if t > self.buffer_size: 232 | # if t % self.replay_period == 0: # transition sampling and update 233 | losses = self.train_step() 234 | if t % 1000 == 0: 235 | print('losses each 1000 steps: ', losses) 236 | 237 | if t % self.target_update_iter == 0: 238 | self.update_target_model() 239 | if done: 240 | obs = self.env.reset() # one episode end 241 | else: 242 | obs = next_obs 243 | 244 | def train_step(self): 245 | idxes, self.is_weight = self.sum_tree_sample(self.batch_size) 246 | assert len(idxes) == self.b_next_states.shape[0] 247 | 248 | # Double Q-Learning 249 | best_action_idxes, _ = self.model.action_value(self.b_next_states) # get 
actions through the current network 250 | target_q = self.get_target_value(self.b_next_states) # get target q-value through the target network 251 | # get td_targets of batch states 252 | td_target = self.b_rewards + \ 253 | self.gamma * target_q[np.arange(target_q.shape[0]), best_action_idxes] * (1 - self.b_dones) 254 | predict_q = self.model.predict(self.b_obs) 255 | td_predict = predict_q[np.arange(predict_q.shape[0]), self.b_actions] 256 | abs_td_error = np.abs(td_target - td_predict) + self.margin 257 | clipped_error = np.where(abs_td_error < self.abs_error_upper, abs_td_error, self.abs_error_upper) 258 | ps = np.power(clipped_error, self.alpha) 259 | # priorities update 260 | for idx, p in zip(idxes, ps): 261 | self.replay_buffer.update(idx, p) 262 | 263 | for i, val in enumerate(self.b_actions): 264 | predict_q[i][val] = td_target[i] 265 | 266 | target_q = predict_q # just to change a more explicit name 267 | losses = self.model.train_on_batch(self.b_obs, target_q) 268 | 269 | return losses 270 | 271 | # proportional prioritization sampling 272 | def sum_tree_sample(self, k): 273 | idxes = [] 274 | is_weights = np.empty((k, 1)) 275 | self.beta = min(1., self.beta + self.beta_increment_per_sample) 276 | # calculate max_weight 277 | min_prob = np.min(self.replay_buffer.tree[-self.replay_buffer.capacity:]) / self.replay_buffer.total_p 278 | max_weight = np.power(self.buffer_size * min_prob, -self.beta) 279 | segment = self.replay_buffer.total_p / k 280 | for i in range(k): 281 | s = np.random.uniform(segment * i, segment * (i + 1)) 282 | idx, p, t = self.replay_buffer.get_leaf(s) 283 | idxes.append(idx) 284 | self.b_obs[i], self.b_actions[i], self.b_rewards[i], self.b_next_states[i], self.b_dones[i] = t 285 | # P(j) 286 | sampling_probabilities = p / self.replay_buffer.total_p # where p = p ** self.alpha 287 | is_weights[i, 0] = np.power(self.buffer_size * sampling_probabilities, -self.beta) / max_weight 288 | return idxes, is_weights 289 | 290 | def evaluation(self, env, render=True): 291 | obs, done, ep_reward = env.reset(), False, 0 292 | # one episode until done 293 | while not done: 294 | action, q_values = self.model.action_value(obs[None]) # Using [None] to extend its dimension (4,) -> (1, 4) 295 | obs, reward, done, info = env.step(action) 296 | ep_reward += reward 297 | if render: # visually show 298 | env.render() 299 | time.sleep(0.05) 300 | env.close() 301 | return ep_reward 302 | 303 | # store transitions into replay butter, now sum tree. 
304 | def store_transition(self, priority, obs, action, reward, next_state, done): 305 | transition = [obs, action, reward, next_state, done] 306 | self.replay_buffer.add(priority, transition) 307 | 308 | # minor modified from https://github.com/medipixel/rl_algorithms/blob/master/algorithms/common/helper_functions.py 309 | def get_n_step_info(self, n_step_buffer, gamma): 310 | """Return n step reward, next state, and done.""" 311 | # info of the last transition 312 | reward, next_state, done = n_step_buffer[-1][-3:] 313 | 314 | for transition in reversed(list(n_step_buffer)[:-1]): 315 | r, n_s, d = transition[-3:] 316 | 317 | reward = r + gamma * reward * (1 - d) 318 | next_state, done = (n_s, d) if d else (next_state, done) 319 | 320 | return reward, next_state, done 321 | 322 | 323 | # rank-based prioritization sampling 324 | def rand_based_sample(self, k): 325 | pass 326 | 327 | # e-greedy 328 | def get_action(self, best_action): 329 | if np.random.rand() < self.epsilon: 330 | return self.env.action_space.sample() 331 | return best_action 332 | 333 | # assign the current network parameters to target network 334 | def update_target_model(self): 335 | self.target_model.set_weights(self.model.get_weights()) 336 | 337 | def get_target_value(self, obs): 338 | return self.target_model.predict(obs) 339 | 340 | def e_decay(self): 341 | self.epsilon *= self.epsilon_decay 342 | 343 | 344 | if __name__ == '__main__': 345 | test_model() 346 | 347 | env = gym.make("CartPole-v0") 348 | num_actions = env.action_space.n 349 | model = Model(num_actions) 350 | target_model = Model(num_actions) 351 | agent = NoisyAgent(model, target_model, env) 352 | # test before 353 | rewards_sum = agent.evaluation(env) 354 | print("Before Training: %d out of 200" % rewards_sum) # 9 out of 200 355 | 356 | agent.train() 357 | # test after 358 | # env = gym.wrappers.Monitor(env, './recording', force=True) 359 | rewards_sum = agent.evaluation(env) 360 | print("After Training: %d out of 200" % rewards_sum) # 200 out of 200 361 | 362 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## TensorFlow 2.0 for Deep Reinforcement Learning 2 | 3 | 4 | This is a simple tutorial of deep reinforcement learning with tensorflow 2.0, which has simple demos and detailed model implementations to help beginners get start in this research region. 5 | 6 | ### How to install TensorFlow 2.0 7 | ``` 8 | $ conda create --name tensorflow_2_0 9 | $ conda activate tensorflow_2_0 10 | $ pip install tensorflow==2.0.0-b1 # pip install tensorflow-gpu==2.0.0-b1 for GPU version 11 | ``` 12 | Test: 13 | ``` 14 | >>> import tensorflow as tf 15 | >>> tf.__version__ 16 | '2.0.0-beta1' 17 | ``` 18 | 19 | ### TensorFlow 2.0 Tutorial 20 | * [Official Tutorial](https://www.tensorflow.org/tutorials/) 21 | * [Hands-on ML2](https://github.com/ageron/handson-ml2) 22 | * [Summary of some of the new features in TensorFlow 2.0](https://colab.research.google.com/github/zaidalyafeai/Notebooks/blob/master/TF_2_0.ipynb) 23 | * [Model building with TensorFlow 2.0](https://colab.research.google.com/drive/17u-pRZJnKN0gO5XZmq8n5A2bKGrfKEUg) 24 | 25 | 26 | ### Python Tutorial 27 | * Welcome to visit my [Fast Py3 Repo](https://github.com/Huixxi/Fast-Py3). This is a fast python3 tutorial. 
28 | 29 | ### Gym Tutorial 30 | * [Basic Gym](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/tutorial_blogs/gym_tutorial.md) 31 | 32 | ### Reinforcement Learning 33 | Book notes ... 34 | 35 | ### Deep Reinforcement Learning 36 | * Rainbow 37 | ([Building Rainbow Step by Step with TensorFlow2.0](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/tutorial_blogs/Building_Rainbow_Step_by_Step_with_TensorFlow2.0.md)) 38 | * [Deep Q-Network](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/01_dqn.py) 39 | * +[Double DQN](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/02_ddqn.py) 40 | * +[Prioritized Experience Replay](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/03_priority_replay.py) 41 | * +[Dueling Network](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/04_dueling.py) 42 | * +[Multi-Step Q-Learning](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/05_multistep_td.py) 43 | * +[Distributional RL](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/06_distributional_rl.py)(Failed Yet, But I Will Try My BestTo Make It Work Soon!) 44 | * +[Noisy Network](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/07_noisynet.py)(Failed Yet, But I Will Try My Best To Make It Work Soon!) 45 | 46 | ### Paper Reading 47 | * Welcome to visit my personal blog website: [HU's Blog](https://huixxi.github.io/). There is a list of **RL Paper Overviews**. 48 | 49 | Flag Counter 50 | -------------------------------------------------------------------------------- /images/ddqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/ddqn.png -------------------------------------------------------------------------------- /images/distributional_algorithm2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/distributional_algorithm2.png -------------------------------------------------------------------------------- /images/distributional_learn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/distributional_learn.png -------------------------------------------------------------------------------- /images/distributional_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/distributional_project.png -------------------------------------------------------------------------------- /images/distributional_projected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/distributional_projected.png -------------------------------------------------------------------------------- 
/images/distributional_rl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/distributional_rl.png -------------------------------------------------------------------------------- /images/dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/dqn.png -------------------------------------------------------------------------------- /images/dqn_algorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/dqn_algorithm.png -------------------------------------------------------------------------------- /images/dqn_net.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/dqn_net.png -------------------------------------------------------------------------------- /images/dueling_detail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/dueling_detail.png -------------------------------------------------------------------------------- /images/dueling_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/dueling_details.png -------------------------------------------------------------------------------- /images/dueling_netarch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/dueling_netarch.png -------------------------------------------------------------------------------- /images/gym_cartpole_v0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/gym_cartpole_v0.gif -------------------------------------------------------------------------------- /images/noisy_net_algorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/noisy_net_algorithm.png -------------------------------------------------------------------------------- /images/p2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/p2.png -------------------------------------------------------------------------------- /images/rlblog_images/IS.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/rlblog_images/IS.jpg -------------------------------------------------------------------------------- /images/rlblog_images/LSTM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/rlblog_images/LSTM.png -------------------------------------------------------------------------------- /images/rlblog_images/PPO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/rlblog_images/PPO.png -------------------------------------------------------------------------------- /images/rlblog_images/README.md: -------------------------------------------------------------------------------- 1 | To save some pictures used in my blogs. 2 | -------------------------------------------------------------------------------- /images/rlblog_images/RNN-unrolled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/rlblog_images/RNN-unrolled.png -------------------------------------------------------------------------------- /images/rlblog_images/ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/rlblog_images/ppo.png -------------------------------------------------------------------------------- /images/rlblog_images/r1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/rlblog_images/r1.png -------------------------------------------------------------------------------- /images/rlblog_images/r2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/rlblog_images/r2.png -------------------------------------------------------------------------------- /images/sards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/d1c191e7bfbb44357a4066ced3b96fa8c847875a/images/sards.png -------------------------------------------------------------------------------- /tutorial_blogs/Building_Rainbow_Step_by_Step_with_TensorFlow2.0.md: -------------------------------------------------------------------------------- 1 | # Building Rainbow Step by Step with TensorFlow 2.0 2 | *[Rainbow: Combining Improvements in Deep Reinforcement Learning](https://arxiv.org/pdf/1710.02298.pdf)* 3 | `Journal: The Thirty-Second AAAI Conference on Artificial Intelligence (AAAI-18)` 4 | `Year: 2017` 5 | `Institute: DeepMind` 6 | `Author: Matteo Hessel, Joseph Modayil, Hado van Hasselt` 7 | `#`*Deep Reinforcement Learning* 8 | 9 | **

Abstract

**
This paper examines six main extensions to the DQN algorithm and empirically studies their combination. (It is a good paper that summarizes several important techniques for alleviating the problems remaining in DQN and provides some valuable insights into this research area.)
[Baseline: Deep Q-Network(DQN) Algorithm Implementation in CS234 Assignment 2](https://github.com/Huixxi/CS234-Reinforcement-Learning/tree/master/assignment%202)

## INTRODUCTION
Because the traditional tabular methods are not applicable in arbitrarily large state spaces, we turn to approximate solution methods (`linear approximator & nonlinear approximator`, `value-function approximation & policy approximation`), whose goal is to find a good approximate solution using limited computational resources. We can use a `linear function`, `multi-layer artificial neural networks`, or a `decision tree` as a parameterized function to approximate the value function or the policy. (For more, read Chapter 9 of Sutton's book [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/the-book-2nd.html).)

The following methods are all `value-function approximation` and `gradient-based` (they use gradients to update the parameters), and they all use **experience replay** and a **target network** to eliminate the correlations present in the sequence of observations.

## 1>Linear
Using a linear function to approximate the value function (usually the action value):
$$
\hat v(s, w) \doteq w^Tx(s) \doteq \sum \limits_{i=1}^d w_i x_i
$$
$w$ is the parameter vector, $x(s)$ is called a *feature vector* representing state $s$, and most of the time the state $s$ consists of the images (frames) observed by the agent. So a linear approximator implemented with *tensorflow* can be just a single fully-connected layer.
``` python
import tensorflow as tf
import tensorflow.contrib.layers as layers  # TF 1.x contrib layers

# state: a sequence of images (frames)
inputs = tf.layers.flatten(state)
# scope, which is used to distinguish q_params and target_q_params
out = layers.fully_connected(inputs, num_actions, scope=scope, reuse=reuse)
```
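Since the rest of this repository targets TensorFlow 2.0, the same linear approximator can also be sketched with Keras. This is only an illustrative sketch (the class name `LinearModel` is mine, not code from this repo):
``` python
import tensorflow as tf
import tensorflow.keras.layers as kl

# A linear Q-value approximator: flatten the observation, then one
# fully-connected layer with no activation, so q(s, .) = W x(s) + b.
class LinearModel(tf.keras.Model):
    def __init__(self, num_actions):
        super().__init__(name='linear_q')
        self.flat = kl.Flatten()
        self.out = kl.Dense(num_actions)  # no activation -> linear in the features

    def call(self, inputs):
        return self.out(self.flat(inputs))
```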

## 2>Nonlinear-DQN
***Deep Q-Network.*** The main difference of *DQN* from a *linear approximator* is the architecture used to compute the *q_value*: it is nonlinear.

![](https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/master/images/dqn.png)

And the total algorithm is as follows:

![Paper: Human-level control through deep reinforcement learning.](https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/master/images/dqn_algorithm.png)

The approximator of the DeepMind DQN implemented with *tensorflow*, as described in their [Nature paper](https://www.nature.com/articles/nature14236), can be:
``` python
import tensorflow as tf
import tensorflow.contrib.layers as layers  # TF 1.x contrib layers

with tf.variable_scope(scope, reuse=reuse) as _:
    conv1 = layers.conv2d(state, num_outputs=32, kernel_size=(8, 8), stride=4, activation_fn=tf.nn.relu)
    conv2 = layers.conv2d(conv1, num_outputs=64, kernel_size=(4, 4), stride=2, activation_fn=tf.nn.relu)
    conv3 = layers.conv2d(conv2, num_outputs=64, kernel_size=(3, 3), stride=1, activation_fn=tf.nn.relu)
    full_inputs = layers.flatten(conv3)
    full_layer = layers.fully_connected(full_inputs, num_outputs=512, activation_fn=tf.nn.relu)
    out = layers.fully_connected(full_layer, num_outputs=num_actions)
```

[Do DQN from scratch(basic version)](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/01_dqn.py)



## 3>Nonlinear-DDQN
***Double DQN.*** The main difference of *DDQN* from *DQN* is the way the target q value is calculated.
As a reminder,
In Q-Learning:
$$
Q(s,a) \leftarrow Q(s,a) + \alpha[r + \lambda \max_{a'}Q(s',a') - Q(s,a)]
$$
$$
Y_t^{Q} = R_{t+1} + \lambda \max_{a'}Q(S_{t+1},a') = R_{t+1} + \lambda Q(S_{t+1},\arg\max_{a'}Q(S_{t+1},a'))
$$
In DQN:

![](https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/master/images/ddqn.png)

where $\theta_{i-1}$ denotes the target network parameters, usually written as $\theta_t^-$.
$$
Y_t^{DQN} = R_{t+1} + \lambda \max_{a'}Q(S_{t+1},a';\theta_t^-)
$$
Deep Q-learning has a known problem: "It is known to sometimes learn unrealistically high action values because it includes a maximization step over estimated action values, which tends to prefer overestimated to underestimated values", as stated in the [DDQN paper](https://arxiv.org/pdf/1509.06461.pdf).
The idea of Double Q-learning is to reduce overestimations by decomposing the max operation in the target into action selection and action evaluation.
$$
Y_t^{DoubleQ} = R_{t+1} + \lambda Q(S_{t+1}, \arg\max_{a'}Q(S_{t+1},a';\theta_t);\theta_t^-)
$$
Implement with *tensorflow* (the minimal possible change to DQN in cs234 assignment 2):
``` python
# DQN
q_samp = tf.where(self.done_mask, self.r, self.r + self.config.gamma * tf.reduce_max(target_q, axis=1))
actions = tf.one_hot(self.a, num_actions)
q = tf.reduce_sum(tf.multiply(q, actions), axis=1)
self.loss = tf.reduce_mean(tf.squared_difference(q_samp, q))

# DDQN: select the greedy action with the online network q, evaluate it with target_q
max_q_idxes = tf.argmax(q, axis=1)
max_actions = tf.one_hot(max_q_idxes, num_actions)
q_samp = tf.where(self.done_mask, self.r, self.r + self.config.gamma * tf.reduce_sum(tf.multiply(target_q, max_actions), axis=1))
actions = tf.one_hot(self.a, num_actions)
q = tf.reduce_sum(tf.multiply(q, actions), axis=1)
self.loss = tf.reduce_mean(tf.squared_difference(q_samp, q))
```

[Do Double DQN from scratch(basic version)](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/02_ddqn.py)



## 4>Prioritized experience replay
***Prioritized experience replay.*** Improve data efficiency by replaying more often transitions from which there is more to learn.
***And the total algorithm is as follows:***

![Paper: Prioritized Experience Replay](https://raw.githubusercontent.com/Huixxi/CS234-Reinforcement-Learning/master/rl_images/prior_replay.png)

* Prioritizing with Temporal-Difference(TD) Error
TD-Error: how far the value is from its next-step bootstrap estimate $$ r + \lambda V(s') - V(s) $$
where the value $r + \lambda V(s')$ is known as the TD target.
Experiences with high magnitude TD error also appear to be replayed more often. TD-errors have also been used as a prioritization mechanism for determining where to focus resources, for example when choosing where to explore or which features to select. However, the TD-error can be a poor estimate in some circumstances as well, e.g. when rewards are noisy.

* Stochastic Prioritization
Because `greedy prioritization` results in high-error transitions being replayed too frequently, causing a lack of diversity that can lead to `over-fitting`, `Stochastic Prioritization` is introduced to add diversity and find a balance between greedy prioritization and uniform random sampling.
We ensure that the probability of being sampled is monotonic in a transition's priority, while guaranteeing a non-zero probability even for the lowest-priority transition. Concretely, the probability of sampling transition $i$ is defined as
$$
P(i) = \frac{p_i^{\alpha}}{\sum_k p_k^{\alpha}}
$$
where $p_i > 0$ is the priority of transition $i$. The exponent $\alpha$ determines how much prioritization is used, with $\alpha = 0$ corresponding to the uniform case.
(Note: $P(i)$ is not computed explicitly when drawing a transition (experience) from the replay buffer (the sum tree), which samples according to the priorities $p_i$ themselves. So don't be confused by it; here $P(i)$ is used to calculate the `Importance Sampling(IS) Weight`.)
  * proportional prioritization: $p_i = |\delta_i| + \epsilon$
  * rank-based prioritization: $p_i = \frac{1}{rank(i)}$ , where $rank(i)$ is the rank of transition $i$ when sorted according to $\delta_i$.
* Importance Sampling(IS)
Prioritized replay introduces a bias because it changes this distribution in an uncontrolled fashion. This can be corrected by using importance-sampling (IS) weights:
$$
w_i = \big(\frac{1}{N} \cdot \frac{1}{P(i)}\big)^\beta
$$
that fully compensate for the non-uniform probabilities $P(i)$ if $\beta = 1$. These weights can be folded into the Q-learning update by using $w_i\delta_i$ instead of $\delta_i$. For stability reasons, we always normalize weights by $1 / \max_i w_i$ so that they only scale the update downwards.
The IS exponent $\beta$ is annealed from $\beta_0$ to $1$, which means its effect is felt more strongly at the end of the stochastic process; this is because the unbiased nature of the updates in RL is most important near convergence.

[Do Double DQN with prioritized experience replay from scratch(basic version)](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/03_priority_replay.py)


## 5>Dueling network architecture
***Dueling network architecture.*** Generalize across actions by separately representing state values and action advantages.
The dueling network is a neural network architecture designed for value-based RL which has an $|A|$-dimensional output, one Q-value for each action. It features two streams of computation, the **state value** and **action advantage** streams, sharing a convolutional encoder and merged by a special aggregator layer.

![](https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/master/images/dueling_netarch.png)

The aggregator can be expressed as:
$$
Q(s, a; \theta, \alpha, \beta) = V(s; \theta, \beta) + \big(A(s, a; \theta, \alpha) - \frac{1}{|A|} \sum_{a'}A(s, a'; \theta, \alpha)\big)
$$
where $\theta, \beta, \alpha$ are, respectively, the parameters of the shared convolutional encoder, the value stream, and the action advantage stream.
**The details of the dueling network architecture for Atari:**
![](https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/master/images/dueling_detail.png)

Since both the value and the advantage stream propagate gradients to the last convolutional layer in the backward pass, we rescale the combined gradient entering the last convolutional layer by $1/\sqrt{2}$. This simple heuristic mildly increases stability. In addition, we clip the gradients to have their norm less than or equal to $10$.

**Other tricks:**

* ***Human Starts:*** Using 100 starting points sampled from a human expert's trajectory.
* ***Saliency maps:*** To better understand the roles of the value and the advantage streams.

[Do Dueling Double DQN with prioritized experience replay from scratch(basic version)](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/04_dueling.py)
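For a quick, self-contained illustration of the aggregator above (independent of the linked file), a dueling head in TensorFlow 2.0 might look like the following sketch; the class name `DuelingHead` and the layer layout are mine, not code from `04_dueling.py`:
``` python
import tensorflow as tf
import tensorflow.keras.layers as kl

class DuelingHead(tf.keras.Model):
    """Merges a state-value stream and an advantage stream into Q-values."""
    def __init__(self, num_actions):
        super().__init__(name='dueling_head')
        self.value = kl.Dense(1)                 # V(s)
        self.advantage = kl.Dense(num_actions)   # A(s, a)

    def call(self, features):
        v = self.value(features)        # shape (batch, 1)
        a = self.advantage(features)    # shape (batch, |A|)
        # Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a'))
        return v + (a - tf.reduce_mean(a, axis=-1, keepdims=True))
```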

## 6>Multi-step bootstrapping
***Multi-step bootstrap targets.*** Shift the bias-variance tradeoff and help to propagate newly observed rewards faster to earlier visited states.
The best methods are often intermediate between two extremes: *n-step TD learning methods* lie between **Monte Carlo** and **one-step TD methods**.

* Monte Carlo methods perform an update for each state based on the entire sequence of observed rewards from that state until the end of the episode:
$$
G_t \doteq R_{t+1} + \gamma R_{t+2} + \gamma^2R_{t+3} + \dots + \gamma^{T-t-1}R_T
$$

* The update of one-step TD methods (also called TD(0) methods), on the other hand, is based on just the one next reward, bootstrapping from the value of the state one step later as a proxy for the remaining rewards:
$$
G_{t:t+1} \doteq R_{t+1} + \gamma V_t(S_{t+1})
$$

* Now, n-step TD methods perform a tradeoff: they update each state after **n** time steps, based on the **n** next rewards, bootstrapping from the value of the state **n** steps later as a proxy for the remaining rewards:
$$
G_{t:t+n} \doteq R_{t+1} + \gamma R_{t+2} + \dots + \gamma^{n-1}R_{t+n} + \gamma^n V_{t+n-1}(S_{t+n})
$$


We know that Q-learning is a kind of TD learning. All the previous implementations are based on TD(0) updates. Now we are going to implement an n-step deep Q-learning method; the main difference is how the *target* Q value is calculated.
In one-step DQN, the target is:
$$
q_{target} = R_{t+1} + \gamma_{t+1} \max q_{\bar{\theta}}(S_{t+1}, a')
$$

In one-step Double DQN, the target is:
$$
q_{target} = R_{t+1} + \gamma_{t+1} q_{\bar{\theta}}(S_{t+1}, \arg\max_{a'} q_{\theta}(S_{t+1}, a'))
$$

In multi-step Double DQN, the n-step return and the target are:
$$
R^{(n)}\_t = \sum\limits_{k=0}^{n-1} \gamma_t^{(k)}R_{t+k+1}
$$

$$
q_{target} = R^{(n)}\_t + \gamma_{t}^n q_{\bar{\theta}}(S_{t+n}, \arg\max_{a'} q_{\theta}(S_{t+n}, a'))
$$

(The algorithm looks easy to implement and stable, but it brings a lot of fluctuation and seems sensitive to the learning rate when used to train the agent to play CartPole-v0. So if you check this model, you may want to pay a bit more attention to it.)

[Do Multi-Step Dueling Double DQN with prioritized experience replay from scratch(basic version)](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/05_multistep_td.py)



## 7>Distributional Q-learning
***Distributional Q-learning.*** Learn a categorical distribution of discounted returns, instead of its expectation.
In Q-learning:
$$
Q(s, a) = \sum\limits_{i=0}^{n} p_{r_i}r_i(s, a) + \gamma \sum\limits_{s' \in S} P(s'|s, a)\max_{a' \in A(s')}Q(s', a')
$$
$$
Q(s, a) = E_{s, a}[ r(s, a) ] + \gamma E_{s, a, s'}[ \max_{a' \in A(s')}Q(s', a') ] \\
$$
$$
Q(s, a) = E_{s, a, s'}[ r(s, a) + \gamma \max_{a' \in A(s')}Q(s', a') ]
$$
where $Q(s, a)$ is the expectation of the discounted returns.
Now, in distributional RL, instead of calculating the expectation, we work directly with the full distribution of the returns obtained from state $s$ and action $a$ while following the current policy $\pi$, denoted by a random variable $Z(s, a)$.

![](https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/master/images/distributional_rl.png)

Here $z_i - z_{i-1} = \Delta z = (V_{max} - V_{min}) / (N - 1)$, where we assume that the range of the return is from $V_{min}$ to $V_{max}$, $N$ is the number of atoms, and each atom is a pair $(z_i, p_i(s, a))$. Now, for each state-action pair $(s, a)$, there is a corresponding distribution of its returns, not just a single expected value.
202 | 
203 | 
204 | 
205 | ## 7>Distributional Q-learning
206 | ***Distributional Q-learning.*** Learn a categorical distribution of discounted returns, instead of its expectation.
207 | In Q-learning:
208 | $$
209 | Q(s, a) = \sum\limits_{i=0}^{n} p_{r_i}r_i(s, a) + \gamma \sum\limits_{s' \in S} P(s'|s, a)\max_{a' \in A(s')}Q(s', a')
210 | $$
211 | $$
212 | Q(s, a) = E_{s, a}[ r(s, a) ] + \gamma E_{s, a, s'}[ \max_{a' \in A(s')}Q(s', a') ]
213 | $$
214 | $$
215 | Q(s, a) = E_{s, a, s'}[ r(s, a) + \gamma \max_{a' \in A(s')}Q(s', a') ]
216 | $$
217 | where $Q(s, a)$ is the expectation of the discounted return.
218 | Now, in distributional RL, instead of calculating the expectation, we work directly with the full distribution of the return obtained by starting in state $s$, taking action $a$ and following the current policy $\pi$, denoted by a random variable $Z(s, a)$.
219 | 
220 | ![](https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/master/images/distributional_rl.png)
221 | 
222 | We assume that the return lies in the range $[V_{min}, V_{max}]$ and discretize it into $N$ atoms $z_i$ with spacing $z_i - z_{i-1} = \Delta z = (V_{max} - V_{min}) / (N - 1)$, so the distribution is represented by the atom/probability pairs $(z_i, p_i(s, a))$. Now, for each state-action pair $(s, a)$, there is a corresponding distribution over its returns, not just an expected value. We still calculate the action value of $(s, a)$ as $Q(s, a) = E[Z(s, a)]$. Even though we still use the expected value to act, what we are going to optimize is the distribution:
223 | $$
224 | \sup_{s, a} dist(R(s, a) + \gamma Z_{\bar{\theta}}(s', a^\*), Z_{\theta}(s, a)) \\
225 | a^\* = \arg\max_{a'}Q(s', a') = \arg\max_{a'}E[Z(s', a')]
226 | $$
227 | The difference is obvious: we still use a deep neural network for function approximation, but where traditional DQN outputs, for each input state $s$, an $|A|$-dimensional vector whose elements are the action values $q(s, a)$, the distributional network outputs an $|A| \times N$ matrix, in which each row is an $N$-dimensional vector representing the return distribution of $Z(s, a)$. We then calculate the action value of $(s, a)$ through:
228 | $$
229 | q(s, a) = E[Z(s, a)] = \sum\limits_{i=1}^{N} p_i(s, a) z_i
230 | $$
231 | ***KL Divergence***
232 | Now, we need to minimize the distance between the current distribution and the target distribution.
233 | ***Note:*** the following content is mainly from this great blog: https://mtomassoli.github.io/2017/12/08/distributional_rl/#kl-divergence
234 | If $p$ and $q$ are two distributions with the same support (i.e. their pdfs are non-zero at the same points), then their KL divergence is defined as follows (continuous and discrete cases):
235 | $$
236 | KL(p||q) = \int p(x) \log \frac{p(x)}{q(x)}dx \\
237 | KL(p||q) = \sum\limits_{i=1}^{N} p(x_i) \log\frac{p(x_i)}{q(x_i)} = \sum\limits_{i=1}^{N} p(x_i)[ \log{p(x_i)} - \log{q(x_i)}]
238 | $$
239 | "Now say we’re using DQN and extract $(s, a, r, s')$ from the replay buffer. A “sample of the target distribution” is $r + \gamma Z_{\bar{\theta}}(s', a^\*)$. We want to move $Z_{\theta}(s, a)$ towards this target (by keeping the target fixed)."
240 | 
241 | ![](https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/master/images/distributional_learn.png)
242 | 
243 | ![](https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/master/images/distributional_projected.png)
244 | 
245 | Then, with $m$ denoting the target distribution projected onto the support $\{z_i\}$, the KL loss is:
246 | $$
247 | KL(m||p_{\theta}) = \sum\limits_{i=1}^{N} m_i \log\frac{m_i}{p_{\theta, i}} = \sum\limits_{i=1}^{N} m_i[ \log{m_i} - \log{p_{\theta, i}}] = H(m, p_{\theta}) - H(m)
248 | $$
249 | The gradient of the KL loss is:
250 | $$
251 | \nabla_{\theta} KL(m||p_{\theta}) = \nabla_{\theta} \sum\limits_{i=1}^{N} m_i \log\frac{m_i}{p_{\theta, i}} = \nabla_{\theta}[H(m, p_{\theta}) - H(m)] = \nabla_{\theta}H(m, p_{\theta})
252 | $$
253 | So we can just use the *cross-entropy*:
254 | $$
255 | H(m, p_{\theta}) = - \sum\limits_{i=1}^{N} m_i \log{p_i(s, a; \theta)}
256 | $$
257 | as the loss function.
258 | 
259 | ***The total algorithm is as follows:***
260 | 
261 | ![](https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/master/images/distributional_algorithm2.png)
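To make the projection and the cross-entropy loss concrete, here is a minimal NumPy sketch for a single transition. The support bounds, atom count and helper names are assumptions made for this illustration; it is not the code in `06_distributional_rl.py`:

```python
import numpy as np

V_MIN, V_MAX, N_ATOMS, GAMMA = -10.0, 10.0, 51, 0.99
z = np.linspace(V_MIN, V_MAX, N_ATOMS)             # fixed support z_1 ... z_N
delta_z = (V_MAX - V_MIN) / (N_ATOMS - 1)

def project_target(reward, done, next_probs):
    """Project r + gamma * z onto the support z and return the target distribution m."""
    m = np.zeros(N_ATOMS)
    for j in range(N_ATOMS):
        # Bellman update of atom j, clipped to [V_MIN, V_MAX]
        tz = np.clip(reward + (0.0 if done else GAMMA * z[j]), V_MIN, V_MAX)
        b = (tz - V_MIN) / delta_z                  # fractional index of tz on the support
        l, u = int(np.floor(b)), int(np.ceil(b))
        if l == u:                                  # tz falls exactly on an atom
            m[l] += next_probs[j]
        else:                                       # split the mass between the two neighbours
            m[l] += next_probs[j] * (u - b)
            m[u] += next_probs[j] * (b - l)
    return m

def cross_entropy(m, probs):
    """H(m, p_theta) = -sum_i m_i log p_i, the loss actually minimized."""
    return -np.sum(m * np.log(probs + 1e-8))

# usage: uniform next-state distribution, one sampled transition
next_probs = np.ones(N_ATOMS) / N_ATOMS
m = project_target(reward=1.0, done=False, next_probs=next_probs)
print(m.sum())                                      # ~1.0, still a valid distribution
```

Each shifted atom $r + \gamma z_j$ generally falls between two atoms of the fixed support, so its probability mass is split between the two neighbours in proportion to the distance, which is exactly the projection shown in the figures above.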
262 | 
263 | [Do Distributional RL Based on Multi-Step Dueling Double DQN with Prioritized Experience Replay from scratch(basic version)](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/06_distributional_rl.py)
264 | I am sorry to say that this is actually a failed implementation and should be treated only as a reference, but I still hope it is helpful to someone, and I promise I will try my best to fix it. Furthermore, I really hope someone can check my code, find what is wrong, or even contribute to make it work; thanks a lot.
265 | 
266 | 
267 | 
268 | ## 8>Noisy DQN
269 | ***Noisy DQN.*** Use stochastic network layers for exploration.
270 | So far, all the exploration we have used is $\epsilon$-greedy, but in games such as Montezuma’s Revenge, where many actions must be executed before the first reward is collected, the limitations of $\epsilon$-greedy exploration are clear. Noisy Nets propose a noisy linear layer that combines a deterministic and a noisy stream.
271 | A normal linear layer with $p$ inputs and $q$ outputs is represented by:
272 | $$
273 | y = wx + b
274 | $$
275 | The corresponding noisy linear layer is:
276 | $$
277 | y = (\mu^w + \sigma^w \odot \epsilon^w)x + (\mu^b + \sigma^b \odot \epsilon^b)
278 | $$
279 | where $\mu^w + \sigma^w \odot \epsilon^w$ and $\mu^b + \sigma^b \odot \epsilon^b$ replace $w$ and $b$, respectively. The parameters $\mu^w \in R^{q \times p}$, $\mu^b \in R^q$, $\sigma^w \in R^{q\times p}$ and $\sigma^b \in R^q$ are learnable, whereas $\epsilon^w \in R^{q\times p}$ and $\epsilon^b \in R^q$ are noise random variables. There are two kinds of Gaussian noise:
280 | 
281 | * **Independent Gaussian Noise:**
282 | The noise applied to each weight and bias is independent, where each entry $\epsilon^w_{i,j}$ (respectively each entry $\epsilon^b_j$) of the random matrix $\epsilon^w$ (respectively of the random vector $\epsilon^b$) is drawn from a unit Gaussian distribution. This means that for each noisy linear layer, there are $pq + q$ noise variables (for $p$ inputs to the layer and $q$ outputs).
283 | 
284 | * **Factorised Gaussian Noise:**
285 | By factorising $\epsilon^w_{i,j}$, we can use $p$ unit Gaussian variables $\epsilon_i$ for the noise of the inputs and $q$ unit Gaussian variables $\epsilon_j$ for the noise of the outputs (thus $p + q$ unit Gaussian variables in total). Each $\epsilon^w_{i,j}$ and $\epsilon^b_j$ can then be written as:
286 | $$
287 | \epsilon^w_{i,j} = f(\epsilon_i)f(\epsilon_j) \\
288 | \epsilon^b_j = f(\epsilon_j)
289 | $$
290 | where $f$ is a real-valued function. The paper uses $f(x) = \mathrm{sgn}(x) \sqrt{|x|}$. Note that
291 | for the bias $\epsilon^b_j$ one could have set $f(x) = x$, but the authors decided to keep the same output noise for weights and biases.
292 | 
293 | ***The total algorithm is as follows:***
294 | ![Paper:Noisy Networks for Exploration](https://raw.githubusercontent.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/master/images/noisy_net_algorithm.png)
295 | 
296 | [Do Noisy Network Based on Multi-Step Dueling Double DQN with Prioritized Experience Replay from scratch(basic version)](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/07_noisynet.py)
297 | It runs well, but too slowly (I don't know why...).
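For reference, here is a minimal sketch of a factorised-Gaussian noisy dense layer in TF2. The initialisation details follow my reading of the Noisy Networks paper and the shapes are transposed to fit `tf.matmul`; this is an independent illustration, not the layer used in `07_noisynet.py`:

```python
import tensorflow as tf


class NoisyDense(tf.keras.layers.Layer):
    """y = (mu_w + sigma_w * eps_w) x + (mu_b + sigma_b * eps_b), factorised Gaussian noise."""
    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        self.p, self.q = int(input_shape[-1]), self.units
        sigma_0 = 0.5 / self.p ** 0.5                       # sigma_0 / sqrt(p), with sigma_0 = 0.5
        mu_init = tf.random_uniform_initializer(-1.0 / self.p ** 0.5, 1.0 / self.p ** 0.5)
        self.mu_w = self.add_weight('mu_w', shape=(self.p, self.q), initializer=mu_init)
        self.mu_b = self.add_weight('mu_b', shape=(self.q,), initializer=mu_init)
        self.sigma_w = self.add_weight('sigma_w', shape=(self.p, self.q),
                                       initializer=tf.constant_initializer(sigma_0))
        self.sigma_b = self.add_weight('sigma_b', shape=(self.q,),
                                       initializer=tf.constant_initializer(sigma_0))

    @staticmethod
    def _f(x):
        return tf.sign(x) * tf.sqrt(tf.abs(x))              # f(x) = sgn(x) * sqrt(|x|)

    def call(self, inputs):
        eps_in = self._f(tf.random.normal((self.p, 1)))     # p noise variables for the inputs
        eps_out = self._f(tf.random.normal((1, self.q)))    # q noise variables for the outputs
        eps_w = eps_in * eps_out                            # eps_w[i, j] = f(eps_i) * f(eps_j)
        eps_b = tf.squeeze(eps_out, axis=0)                 # eps_b[j] = f(eps_j)
        w = self.mu_w + self.sigma_w * eps_w
        b = self.mu_b + self.sigma_b * eps_b
        return tf.matmul(inputs, w) + b


layer = NoisyDense(4)
print(layer(tf.ones((2, 3))).shape)  # (2, 4)
```

Because $\mu$ and $\sigma$ are learned, the network can shrink the noise wherever exploration is no longer useful, which is the point of Noisy Nets.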
298 | 
299 | 
300 | ## 9>Rainbow
301 | Finally, we get the integrated agent: Rainbow. It uses a **multi-step** distributional loss:
302 | $$
303 | D_{KL}(\Phi_z d_t^{(n)} || d_t)
304 | $$
305 | where $\Phi_z$ is the projection onto the support $z$, and the target distribution $d_t^{(n)}$ is:
306 | $$
307 | d_t^{(n)} = (R_t^{(n)} + \gamma_t^{(n)} z,\ p_{\bar{\theta}}(S_{t+n}, a^{\*}\_{t+n}))
308 | $$
309 | **Double Q-learning** is used: the greedy action $a^\*\_{t+n}$ at $S_{t+n}$ is selected by the *online network* and evaluated by the *target network*.
310 | 
311 | Rainbow **prioritizes transitions** by the KL loss instead of the absolute TD error, which may be more robust to noisy stochastic environments because the loss can continue to decrease even when the returns are not deterministic:
312 | $$
313 | p_t \propto (D_{KL}(\Phi_z d_t^{(n)} || d_t))^w
314 | $$
315 | 
316 | The network architecture is a **dueling network architecture** adapted for use with return **distributions**. The network has a shared representation $f_{\xi}(s)$, which is then fed into a value stream $v_{\eta}$ with $N_{atoms}$ outputs, and into an advantage stream $a_{\psi}$ with $N_{atoms} \times N_{actions}$ outputs, where $a_{\psi}^i(f_{\xi}(s), a)$ denotes the output corresponding to atom $i$ and action $a$. For each atom $z^i$, the value and advantage streams are aggregated, as in dueling DQN, and then passed through a softmax layer to obtain the normalised parametric distributions used to estimate the returns’ distributions:
317 | $$
318 | p_{\theta}^i(s, a) = \frac{\exp(v_{\eta}^i(\phi) + a_{\psi}^i(\phi, a) - \bar{a}\_{\psi}^i(s))}{\sum_j \exp(v_{\eta}^j(\phi) + a_{\psi}^j(\phi, a) - \bar{a}\_{\psi}^j(s))}
319 | $$
320 | where $\phi = f_{\xi}(s)$ and $\bar{a}\_{\psi}^i(s) = \frac{1}{N_{actions}}\sum_{a'}a_{\psi}^i(\phi, a')$.
321 | 
322 | Then all linear layers are replaced with their noisy equivalents (factorised Gaussian noise version).
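A minimal sketch of this per-atom dueling aggregation followed by the softmax over atoms (the shapes and tensor names are made up just to show the tensor algebra; it is not the repository's implementation):

```python
import tensorflow as tf

def rainbow_logits_to_dist(value_logits, adv_logits):
    """Combine dueling streams per atom and normalise with a softmax over atoms.

    value_logits : (batch, 1, n_atoms)          -- value stream outputs
    adv_logits   : (batch, n_actions, n_atoms)  -- advantage stream outputs
    returns      : (batch, n_actions, n_atoms)  -- p_theta(s, a), sums to 1 over atoms
    """
    adv_mean = tf.reduce_mean(adv_logits, axis=1, keepdims=True)  # mean over actions
    logits = value_logits + adv_logits - adv_mean                 # per-atom dueling aggregation
    return tf.nn.softmax(logits, axis=-1)                         # normalise over atoms

# usage with made-up shapes: 2 actions, 51 atoms
probs = rainbow_logits_to_dist(tf.random.normal((1, 1, 51)),
                               tf.random.normal((1, 2, 51)))
z = tf.linspace(-10.0, 10.0, 51)                                  # fixed support
q_values = tf.reduce_sum(probs * z, axis=-1)                      # Q(s, a) = sum_i p_i z_i
print(q_values.shape)  # (1, 2)
```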
323 | 
324 | Done, and thanks for reading. I hope it can be helpful to someone.
325 | Any suggestion is more than welcome, thanks again.
326 | 
327 | 
328 | ## REFERENCES
329 | **Blogs:**
330 | [1.Self Learning AI-Agents III: Deep (Double) Q-Learning (Blog)](https://towardsdatascience.com/deep-double-q-learning-7fca410b193a)
331 | [2.Deep Q Network (DQN) Algorithm Explained (Blog, in Chinese)](https://blog.csdn.net/qq_30615903/article/details/80744083)
332 | [3.Improvements in Deep Q Learning: Dueling Double DQN, Prioritized Experience Replay, and fixed… (Blog)](https://www.freecodecamp.org/news/improvements-in-deep-q-learning-dueling-double-dqn-prioritized-experience-replay-and-fixed-58b130cc5682/)
333 | [4.Let’s make a DQN: Double Learning and Prioritized Experience Replay (Blog)](https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/)
334 | [5.Distributional RL](https://mtomassoli.github.io/2017/12/08/distributional_rl/#ref-2017arXiv171010044D)
335 | 
336 | 
337 | **Books:**
338 | [1.Reinforcement Learning: An Introduction (Chapters 6, 7, 9)](http://incompleteideas.net/book/the-book-2nd.html)
339 | 
340 | 
341 | **Papers:**
342 | [1.Rainbow: Combining Improvements in Deep Reinforcement Learning](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/viewFile/17204/16680)
343 | [2.Human-level control through deep reinforcement learning](https://daiwk.github.io/assets/dqn.pdf)
344 | [3.Implementing the Deep Q-Network](https://arxiv.org/pdf/1711.07478.pdf)
345 | [4.Deep Reinforcement Learning with Double Q-learning](https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/view/12389/11847)
346 | [5.Prioritized Experience Replay](https://arxiv.org/pdf/1511.05952.pdf)
347 | [6.Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/pdf/1511.06581.pdf)
348 | [7.Understanding Multi-Step Deep Reinforcement Learning: A Systematic Study of the DQN Target](https://arxiv.org/pdf/1901.07510.pdf)
349 | [8.Distributed Prioritized Experience Replay](https://arxiv.org/pdf/1803.00933.pdf)
350 | [9.A Distributional Perspective on Reinforcement Learning](https://arxiv.org/pdf/1707.06887.pdf)
351 | [10.Noisy Networks for Exploration](https://arxiv.org/pdf/1706.10295.pdf)
352 | 
353 | 
354 | **GitHub Repos:**
355 | [1.inoryy/tensorflow2-deep-reinforcement-learning for the whole TF2 Network Architecture](https://github.com/inoryy/tensorflow2-deep-reinforcement-learning)
356 | [2.keras-rl for Dueling Network](https://github.com/germain-hug/Deep-RL-Keras/blob/master/DDQN/agent.py#L49)
357 | [3.jaromiru/AI-blog for Prioritized Experience Replay](https://github.com/jaromiru/AI-blog/blob/master/SumTree.py)
358 | [4.rl_algorithms for Multi-Step TD Learning](https://github.com/medipixel/rl_algorithms/)
359 | [5.Kaixhin/Rainbow for Distributional RL & Noisy Net](https://github.com/Kaixhin/Rainbow/blob/master/model.py#L10)
360 | [6.keras for Noisy Net](https://github.com/keras-team/keras/blob/master/keras/layers/core.py#L796)
361 | [7.dopamine for Rainbow](https://github.com/google/dopamine/blob/master/dopamine/agents/rainbow/rainbow_agent.py)
362 | [8.TensorFlow guide](https://www.tensorflow.org/guide/keras/custom_layers_and_models)
363 | 
364 | 
365 | 
366 | 
367 | 
368 | 
369 | 
370 | 
--------------------------------------------------------------------------------
/tutorial_blogs/gym_tutorial.md:
--------------------------------------------------------------------------------
1 | # OpenAI Gym: An Introduction
2 | Official Docs: http://gym.openai.com/docs/
3 | Github: https://github.com/openai/gym
4 | 
5 | ## Installation
6 | * Simply install `gym` using `pip3`:
7 | `pip3 install gym`
8 | 
9 | * Full installation containing all environments:
10 | `pip3 install gym[all]`
11 | You can ignore the build failure message for `mujoco-py`, which needs a license.
12 | 
13 | ## Environment
14 | Check all environments in gym using:
15 | * `print(gym.envs.registry.all())`
16 | * `print([env.id for env in gym.envs.registry.all()]) # list version`
17 | 
18 | `['Copy-v0', 'RepeatCopy-v0', 'ReversedAddition-v0', 'ReversedAddition3-v0', 'DuplicatedInput-v0', 'Reverse-v0', 'CartPole-v0', 'CartPole-v1', 'MountainCar-v0', ...`
19 | 
20 | ## Basic Usage
21 | Take the "CartPole-v0" environment as an example:
22 | ```python
23 | import gym
24 | import time
25 | 
26 | env = gym.make("CartPole-v0")  # set up an environment for the agent
27 | initial_observation = env.reset()
28 | done = False
29 | 
30 | # one episode; when done is True, the loop ends.
31 | while not done:
32 |     env.render()  # make the environment visible
33 |     action = env.action_space.sample()  # randomly sample an action from the action space
34 |     next_observation, reward, done, info = env.step(action)
35 |     time.sleep(0.1)  # for better display effect
36 | 
37 | env.close()  # close the environment
38 | ```
39 | Here the agent is a random agent that just takes a random action at each step. You can replace it with a **linear agent** or a **neural network agent** that accepts the observation and returns an action instead of sampling randomly from the action space (see the sketch after the list below).
40 | Note that `env.step(action)` takes an action and returns four things:
41 | * **observation (object):** an environment-specific object representing your observation of the environment.
42 | * **reward (float):** amount of reward achieved by the previous action.
43 | * **done (boolean):** whether it’s time to reset the environment again.
44 | * **info (dict):** diagnostic information useful for debugging.
45 | 
46 | ![](https://github.com/Huixxi/TensorFlow2.0-for-Deep-Reinforcement-Learning/blob/master/images/sards.png)
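As a small illustration of such a non-random agent, the random action above can be replaced by a hand-written policy. The name `simple_policy` and the rule it implements are made up for this sketch:

```python
import gym

def simple_policy(observation):
    """A hand-written CartPole policy: push in the direction the pole is leaning."""
    pole_angle = observation[2]        # observation = [position, velocity, angle, angular velocity]
    return 1 if pole_angle > 0 else 0  # 1 = push right, 0 = push left

env = gym.make("CartPole-v0")
observation = env.reset()
done, total_reward = False, 0.0

while not done:
    action = simple_policy(observation)                # instead of env.action_space.sample()
    observation, reward, done, info = env.step(action)
    total_reward += reward

env.close()
print("episode return:", total_reward)
```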
47 | 
48 | ## Spaces
49 | (Copied from the official docs, still taking "CartPole-v0" as the example.)
50 | Every environment comes with an `action_space` and an `observation_space`. These attributes are of type `Space`, and they describe the format of valid actions and observations:
51 | ```python
52 | import gym
53 | env = gym.make('CartPole-v0')
54 | print(env.action_space)
55 | #> Discrete(2)
56 | print(env.observation_space)
57 | #> Box(4,)
58 | ```
59 | The `Discrete` space allows a fixed range of non-negative numbers, so in this case valid `actions` are either `0` or `1`. The `Box` space represents an `n`-dimensional box, so valid `observations` will be an array of `4` numbers. We can also check the `Box`’s bounds:
60 | ```python
61 | print(env.observation_space.high)
62 | #> array([ 2.4 , inf, 0.20943951, inf])
63 | print(env.observation_space.low)
64 | #> array([-2.4 , -inf, -0.20943951, -inf])
65 | ```
66 | `Box` and `Discrete` are the most common `Space`s. You can sample from a `Space` or check that something belongs to it:
67 | ```python
68 | from gym import spaces
69 | space = spaces.Discrete(8) # Set with 8 elements {0, 1, 2, ..., 7}
70 | x = space.sample()
71 | assert space.contains(x)
72 | assert space.n == 8
73 | ```
74 | For `CartPole-v0`, one of the actions applies force to the left and the other applies force to the right.
75 | 
--------------------------------------------------------------------------------