├── Char00 Conventional Algorithms
│   ├── Q-learning.py
│   ├── Sarsa.py
│   └── gridworld.py
├── Char01 DQN
│   ├── DQN.py
│   ├── DQN
│   │   └── pic
│   │       ├── finish_episode.jpg
│   │       ├── readme.md
│   │       └── value_loss.jpg
│   ├── DQN_CartPole-v0.py
│   ├── DQN_MountainCar-v0.py
│   ├── DQN_mountain_car_v1.py
│   ├── naiveDQN.py
│   └── readme.md
├── Char02 Policy Gradient
│   ├── PolicyGradient.py
│   ├── REINFORCE.py
│   ├── REINFORCE_with_Baseline.py
│   ├── Run_Model.py
│   ├── naive-policy-gradient.py
│   └── pytorch_MountainCar-v0.py
├── Char03 Actor-Critic
│   ├── AC_CartPole-v0.py
│   └── AC_MountainCar-v0.py
├── Char04 A2C
│   ├── A2C.py
│   └── multiprocessing_env.py
├── Char05 DDPG
│   ├── DDPG.py
│   ├── DDPG_exp.jpg
│   └── README.md
├── Char07 PPO
│   ├── PPO2.py
│   ├── PPO_CartPole_v0.py
│   ├── PPO_MountainCar-v0.py
│   ├── PPO_pendulum.py
│   └── readme.md
├── Char08 ACER
│   └── readme.md
├── Char09 SAC
│   ├── SAC.py
│   ├── SAC_BipedalWalker-v2.py
│   ├── SAC_dual_Q_net.py
│   ├── SAC_ep_r_curve.png
│   └── test_agent.py
├── Char10 TD3
│   ├── Episode_reward_TD3_BipedakWalker.png
│   ├── TD3.py
│   ├── TD3_BipedalWalker-v2.py
│   ├── TD3_Pendulum-v0.png
│   ├── expTD3.pyPendulum-v0.
│   │   ├── actor.pth
│   │   ├── actor_target.pth
│   │   ├── critic_1.pth
│   │   ├── critic_1_target.pth
│   │   ├── critic_2.pth
│   │   └── critic_2_target.pth
│   └── expTD3_BipedalWalker-v2.pyBipedalWalker-v2.
│       ├── actor.pth
│       ├── actor_target.pth
│       ├── critic_1.pth
│       ├── critic_1_target.pth
│       ├── critic_2.pth
│       └── critic_2_target.pth
├── LICENSE
├── More
│   ├── Application in real world
│   │   └── README.md
│   ├── MARL
│   │   └── README.md
│   ├── plot.py
│   └── readme.md
├── figures
│   └── test.png
├── readme.md
└── requirements.txt
/Char00 Conventional Algorithms/Q-learning.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import time
5 | 
6 | ALPHA = 0.1
7 | GAMMA = 0.95
8 | EPSILION = 0.9
9 | N_STATE = 20
10 | ACTIONS = ['left', 'right']
11 | MAX_EPISODES = 200
12 | FRESH_TIME = 0.1
13 | 
14 | def build_q_table(n_state, actions):
15 |     q_table = pd.DataFrame(
16 |         np.zeros((n_state, len(actions))),
17 |         np.arange(n_state),
18 |         actions
19 |     )
20 |     return q_table
21 | 
22 | def choose_action(state, q_table):
23 |     #epslion - greedy policy
24 |     state_action = q_table.loc[state,:]
25 |     if np.random.uniform()>EPSILION or (state_action==0).all():
26 |         action_name = np.random.choice(ACTIONS)
27 |     else:
28 |         action_name = state_action.idxmax()
29 |     return action_name
30 | 
31 | def get_env_feedback(state, action):
32 |     if action=='right':
33 |         if state == N_STATE-2:
34 |             next_state = 'terminal'
35 |             reward = 1
36 |         else:
37 |             next_state = state+1
38 |             reward = -0.5
39 |     else:
40 |         if state == 0:
41 |             next_state = 0
42 | 
43 |         else:
44 |             next_state = state-1
45 |         reward = -0.5
46 |     return next_state, reward
47 | 
48 | def update_env(state,episode, step_counter):
49 |     env = ['-'] *(N_STATE-1)+['T']
50 |     if state =='terminal':
51 |         print("Episode {}, the total step is {}".format(episode+1, step_counter))
52 |         final_env = ['-'] *(N_STATE-1)+['T']
53 |         return True, step_counter
54 |     else:
55 |         env[state]='*'
56 |         env = ''.join(env)
57 |         print(env)
58 |         time.sleep(FRESH_TIME)
59 |         return False, step_counter
60 | 
61 | 
62 | def q_learning():
63 |     q_table = build_q_table(N_STATE, ACTIONS)
64 |     step_counter_times = []
65 |     for episode in range(MAX_EPISODES):
66 |         state = 0
67 |         is_terminal = False
68 |         step_counter = 0
69 |         update_env(state, episode, step_counter)
70 |         while not is_terminal:
71 |             action = choose_action(state,q_table)
72 |             next_state, reward = get_env_feedback(state, action)
73 |             next_q =
q_table.loc[state, action] 74 | if next_state == 'terminal': 75 | is_terminal = True 76 | q_target = reward 77 | else: 78 | delta = reward + GAMMA*q_table.iloc[next_state,:].max()-q_table.loc[state, action] 79 | q_table.loc[state, action] += ALPHA*delta 80 | state = next_state 81 | is_terminal,steps = update_env(state, episode, step_counter+1) 82 | step_counter+=1 83 | if is_terminal: 84 | step_counter_times.append(steps) 85 | 86 | return q_table, step_counter_times 87 | 88 | def main(): 89 | q_table, step_counter_times= q_learning() 90 | print("Q table\n{}\n".format(q_table)) 91 | print('end') 92 | 93 | plt.plot(step_counter_times,'g-') 94 | plt.ylabel("steps") 95 | plt.show() 96 | print("The step_counter_times is {}".format(step_counter_times)) 97 | 98 | main() 99 | -------------------------------------------------------------------------------- /Char00 Conventional Algorithms/Sarsa.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import time 6 | 7 | ALPHA = 0.1 8 | GAMMA = 0.95 9 | EPSILION = 0.9 10 | N_STATE = 6 11 | ACTIONS = ['left', 'right'] 12 | MAX_EPISODES = 200 13 | FRESH_TIME = 0.1 14 | 15 | def build_q_table(n_state, actions): 16 | q_table = pd.DataFrame( 17 | np.zeros((n_state, len(actions))), 18 | np.arange(n_state), 19 | actions 20 | ) 21 | return q_table 22 | 23 | def choose_action(state, q_table): 24 | #epslion - greedy policy 25 | state_action = q_table.loc[state,:] 26 | if np.random.uniform()>EPSILION or (state_action==0).all(): 27 | action_name = np.random.choice(ACTIONS) 28 | else: 29 | action_name = state_action.idxmax() 30 | return action_name 31 | 32 | def get_env_feedback(state, action): 33 | if action=='right': 34 | if state == N_STATE-2: 35 | next_state = 'terminal' 36 | reward = 1 37 | else: 38 | next_state = state+1 39 | reward = -0.5 40 | else: 41 | if state == 0: 42 | next_state = 0 43 | 44 | else: 45 | next_state = state-1 46 | reward = -0.5 47 | return next_state, reward 48 | 49 | def update_env(state,episode, step_counter): 50 | env = ['-'] *(N_STATE-1)+['T'] 51 | if state =='terminal': 52 | print("Episode {}, the total step is {}".format(episode+1, step_counter)) 53 | final_env = ['-'] *(N_STATE-1)+['T'] 54 | return True, step_counter 55 | else: 56 | env[state]='*' 57 | env = ''.join(env) 58 | print(env) 59 | time.sleep(FRESH_TIME) 60 | return False, step_counter 61 | 62 | 63 | def sarsa_learning(): 64 | q_table = build_q_table(N_STATE, ACTIONS) 65 | step_counter_times = [] 66 | for episode in range(MAX_EPISODES): 67 | state = 0 68 | is_terminal = False 69 | step_counter = 0 70 | update_env(state, episode, step_counter) 71 | while not is_terminal: 72 | action = choose_action(state,q_table) 73 | next_state, reward = get_env_feedback(state, action) 74 | if next_state != 'terminal': 75 | next_action = choose_action(next_state, q_table) #sarsa update method 76 | else: 77 | next_action = action 78 | next_q = q_table.loc[state, action] 79 | 80 | if next_state == 'terminal': 81 | is_terminal = True 82 | q_target = reward 83 | else: 84 | delta = reward + GAMMA*q_table.loc[next_state,next_action]-q_table.loc[state, action] 85 | q_table.loc[state, action] += ALPHA*delta 86 | state = next_state 87 | is_terminal,steps = update_env(state, episode, step_counter+1) 88 | step_counter+=1 89 | if is_terminal: 90 | step_counter_times.append(steps) 91 | 92 | return q_table, step_counter_times 93 | 94 | def main(): 95 | q_table, step_counter_times= 
sarsa_learning() 96 | print("Q table\n{}\n".format(q_table)) 97 | print('end') 98 | 99 | plt.plot(step_counter_times,'g-') 100 | plt.ylabel("steps") 101 | plt.show() 102 | print("The step_counter_times is {}".format(step_counter_times)) 103 | 104 | main() 105 | -------------------------------------------------------------------------------- /Char00 Conventional Algorithms/gridworld.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | class GridWorld: 5 | 6 | def __init__(self, tot_row, tot_col): 7 | self.action_space_size = 4 8 | self.world_row = tot_row 9 | self.world_col = tot_col 10 | #The world is a matrix of size row x col x 2 11 | #The first layer contains the obstacles 12 | #The second layer contains the rewards 13 | #self.world_matrix = np.zeros((tot_row, tot_col, 2)) 14 | self.transition_matrix = np.ones((self.action_space_size, self.action_space_size))/ self.action_space_size 15 | #self.transition_array = np.ones(self.action_space_size) / self.action_space_size 16 | self.reward_matrix = np.zeros((tot_row, tot_col)) 17 | self.state_matrix = np.zeros((tot_row, tot_col)) 18 | self.position = [np.random.randint(tot_row), np.random.randint(tot_col)] 19 | 20 | #def setTransitionArray(self, transition_array): 21 | #if(transition_array.shape != self.transition_array): 22 | #raise ValueError('The shape of the two matrices must be the same.') 23 | #self.transition_array = transition_array 24 | 25 | def setTransitionMatrix(self, transition_matrix): 26 | '''Set the reward matrix. 27 | 28 | The transition matrix here is intended as a matrix which has a line 29 | for each action and the element of the row are the probabilities to 30 | executes each action when a command is given. For example: 31 | [[0.55, 0.25, 0.10, 0.10] 32 | [0.25, 0.25, 0.25, 0.25] 33 | [0.30, 0.20, 0.40, 0.10] 34 | [0.10, 0.20, 0.10, 0.60]] 35 | 36 | This matrix defines the transition rules for all the 4 possible actions. 37 | The first row corresponds to the probabilities of executing each one of 38 | the 4 actions when the policy orders to the robot to go UP. In this case 39 | the transition model says that with a probability of 0.55 the robot will 40 | go UP, with a probaiblity of 0.25 RIGHT, 0.10 DOWN and 0.10 LEFT. 41 | ''' 42 | if(transition_matrix.shape != self.transition_matrix.shape): 43 | raise ValueError('The shape of the two matrices must be the same.') 44 | self.transition_matrix = transition_matrix 45 | 46 | def setRewardMatrix(self, reward_matrix): 47 | '''Set the reward matrix. 48 | 49 | ''' 50 | if(reward_matrix.shape != self.reward_matrix.shape): 51 | raise ValueError('The shape of the matrix does not match with the shape of the world.') 52 | self.reward_matrix = reward_matrix 53 | 54 | def setStateMatrix(self, state_matrix): 55 | '''Set the obstacles in the world. 56 | 57 | The input to the function is a matrix with the 58 | same size of the world 59 | -1 for states which are not walkable. 
60 | +1 for terminal states 61 | 0 for all the walkable states (non terminal) 62 | The following matrix represents the 4x3 world 63 | used in the series "dissecting reinforcement learning" 64 | [[0, 0, 0, +1] 65 | [0, -1, 0, +1] 66 | [0, 0, 0, 0]] 67 | ''' 68 | if(state_matrix.shape != self.state_matrix.shape): 69 | raise ValueError('The shape of the matrix does not match with the shape of the world.') 70 | self.state_matrix = state_matrix 71 | 72 | def setPosition(self, index_row=None, index_col=None): 73 | ''' Set the position of the robot in a specific state. 74 | 75 | ''' 76 | if(index_row is None or index_col is None): self.position = [np.random.randint(tot_row), np.random.randint(tot_col)] 77 | else: self.position = [index_row, index_col] 78 | 79 | def render(self): 80 | ''' Print the current world in the terminal. 81 | 82 | O represents the robot position 83 | - respresent empty states. 84 | # represents obstacles 85 | * represents terminal states 86 | ''' 87 | graph = "" 88 | for row in range(self.world_row): 89 | row_string = "" 90 | for col in range(self.world_col): 91 | if(self.position == [row, col]): row_string += u" \u25CB " # u" \u25CC " 92 | else: 93 | if(self.state_matrix[row, col] == 0): row_string += ' - ' 94 | elif(self.state_matrix[row, col] == -1): row_string += ' # ' 95 | elif(self.state_matrix[row, col] == +1): row_string += ' * ' 96 | row_string += '\n' 97 | graph += row_string 98 | print(graph) 99 | 100 | def reset(self, exploring_starts=False): 101 | ''' Set the position of the robot in the bottom left corner. 102 | 103 | It returns the first observation 104 | ''' 105 | if exploring_starts: 106 | while(True): 107 | row = np.random.randint(0, self.world_row) 108 | col = np.random.randint(0, self.world_col) 109 | if(self.state_matrix[row, col] == 0): break 110 | self.position = [row, col] 111 | else: 112 | self.position = [self.world_row-1, 0] 113 | #reward = self.reward_matrix[self.position[0], self.position[1]] 114 | return self.position 115 | 116 | def step(self, action): 117 | ''' One step in the world. 118 | 119 | [observation, reward, done = env.step(action)] 120 | The robot moves one step in the world based on the action given. 
121 | The action can be 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT 122 | @return observation the position of the robot after the step 123 | @return reward the reward associated with the next state 124 | @return done True if the state is terminal 125 | ''' 126 | if(action >= self.action_space_size): 127 | raise ValueError('The action is not included in the action space.') 128 | 129 | #Based on the current action and the probability derived 130 | #from the trasition model it chooses a new actio to perform 131 | action = np.random.choice(4, 1, p=self.transition_matrix[int(action),:]) 132 | #action = self.transition_model(action) 133 | 134 | #Generating a new position based on the current position and action 135 | if(action == 0): new_position = [self.position[0]-1, self.position[1]] #UP 136 | elif(action == 1): new_position = [self.position[0], self.position[1]+1] #RIGHT 137 | elif(action == 2): new_position = [self.position[0]+1, self.position[1]] #DOWN 138 | elif(action == 3): new_position = [self.position[0], self.position[1]-1] #LEFT 139 | else: raise ValueError('The action is not included in the action space.') 140 | 141 | #Check if the new position is a valid position 142 | #print(self.state_matrix) 143 | if (new_position[0]>=0 and new_position[0]=0 and new_position[1]= MEMORY_CAPACITY: 128 | dqn.learn() 129 | if done: 130 | print("episode: {} , the episode reward is {}".format(i, round(ep_reward, 3))) 131 | if done: 132 | break 133 | state = next_state 134 | r = copy.copy(reward) 135 | reward_list.append(r) 136 | ax.set_xlim(0,300) 137 | #ax.cla() 138 | ax.plot(reward_list, 'g-', label='total_loss') 139 | plt.pause(0.001) 140 | 141 | 142 | if __name__ == '__main__': 143 | main() 144 | -------------------------------------------------------------------------------- /Char01 DQN/DQN/pic/finish_episode.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char01 DQN/DQN/pic/finish_episode.jpg -------------------------------------------------------------------------------- /Char01 DQN/DQN/pic/readme.md: -------------------------------------------------------------------------------- 1 | readme 2 | -------------------------------------------------------------------------------- /Char01 DQN/DQN/pic/value_loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char01 DQN/DQN/pic/value_loss.jpg -------------------------------------------------------------------------------- /Char01 DQN/DQN_CartPole-v0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import namedtuple 4 | from itertools import count 5 | 6 | import os, time 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | import gym 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | from torch.distributions import Normal, Categorical 16 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 17 | from tensorboardX import SummaryWriter 18 | 19 | # Hyper-parameters 20 | seed = 1 21 | render = False 22 | num_episodes = 2000 23 | env = gym.make('CartPole-v0').unwrapped 24 | num_state = env.observation_space.shape[0] 25 | num_action = 
env.action_space.n 26 | torch.manual_seed(seed) 27 | env.seed(seed) 28 | 29 | Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state']) 30 | 31 | class Net(nn.Module): 32 | def __init__(self): 33 | super(Net, self).__init__() 34 | self.fc1 = nn.Linear(num_state, 100) 35 | self.fc2 = nn.Linear(100, num_action) 36 | 37 | def forward(self, x): 38 | x = F.relu(self.fc1(x)) 39 | action_value = self.fc2(x) 40 | return action_value 41 | 42 | class DQN(): 43 | 44 | capacity = 8000 45 | learning_rate = 1e-3 46 | memory_count = 0 47 | batch_size = 256 48 | gamma = 0.995 49 | update_count = 0 50 | 51 | def __init__(self): 52 | super(DQN, self).__init__() 53 | self.target_net, self.act_net = Net(), Net() 54 | self.memory = [None]*self.capacity 55 | self.optimizer = optim.Adam(self.act_net.parameters(), self.learning_rate) 56 | self.loss_func = nn.MSELoss() 57 | self.writer = SummaryWriter('./DQN/logs') 58 | 59 | 60 | def select_action(self,state): 61 | state = torch.tensor(state, dtype=torch.float).unsqueeze(0) 62 | value = self.act_net(state) 63 | action_max_value, index = torch.max(value, 1) 64 | action = index.item() 65 | if np.random.rand(1) >= 0.9: # epslion greedy 66 | action = np.random.choice(range(num_action), 1).item() 67 | return action 68 | 69 | def store_transition(self,transition): 70 | index = self.memory_count % self.capacity 71 | self.memory[index] = transition 72 | self.memory_count += 1 73 | return self.memory_count >= self.capacity 74 | 75 | def update(self): 76 | if self.memory_count >= self.capacity: 77 | state = torch.tensor([t.state for t in self.memory]).float() 78 | action = torch.LongTensor([t.action for t in self.memory]).view(-1,1).long() 79 | reward = torch.tensor([t.reward for t in self.memory]).float() 80 | next_state = torch.tensor([t.next_state for t in self.memory]).float() 81 | 82 | reward = (reward - reward.mean()) / (reward.std() + 1e-7) 83 | with torch.no_grad(): 84 | target_v = reward + self.gamma * self.target_net(next_state).max(1)[0] 85 | 86 | #Update... 
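# The TD targets above were computed once for the whole replay buffer with the frozen
# target network: target_v = reward + gamma * max_a' Q_target(next_state, a'), under no_grad.
# The loop below then visits the buffer in shuffled minibatches: gather(1, action) picks
# Q(s, a) of the acting network for the stored actions, the MSE loss pulls those values
# toward the detached targets, and every 100 updates the target network is re-synchronized
# from the acting network.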
87 | for index in BatchSampler(SubsetRandomSampler(range(len(self.memory))), batch_size=self.batch_size, drop_last=False): 88 | v = (self.act_net(state).gather(1, action))[index] 89 | loss = self.loss_func(target_v[index].unsqueeze(1), (self.act_net(state).gather(1, action))[index]) 90 | self.optimizer.zero_grad() 91 | loss.backward() 92 | self.optimizer.step() 93 | self.writer.add_scalar('loss/value_loss', loss, self.update_count) 94 | self.update_count +=1 95 | if self.update_count % 100 ==0: 96 | self.target_net.load_state_dict(self.act_net.state_dict()) 97 | else: 98 | print("Memory Buff is too less") 99 | def main(): 100 | 101 | agent = DQN() 102 | for i_ep in range(num_episodes): 103 | state = env.reset() 104 | if render: env.render() 105 | for t in range(10000): 106 | action = agent.select_action(state) 107 | next_state, reward, done, info = env.step(action) 108 | if render: env.render() 109 | transition = Transition(state, action, reward, next_state) 110 | agent.store_transition(transition) 111 | state = next_state 112 | if done or t >=9999: 113 | agent.writer.add_scalar('live/finish_step', t+1, global_step=i_ep) 114 | agent.update() 115 | if i_ep % 10 == 0: 116 | print("episodes {}, step is {} ".format(i_ep, t)) 117 | break 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /Char01 DQN/DQN_MountainCar-v0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import namedtuple 4 | from itertools import count 5 | 6 | import os, time 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | import gym 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | from torch.distributions import Normal, Categorical 16 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 17 | from tensorboardX import SummaryWriter 18 | 19 | # Hyper-parameters 20 | seed = 1 21 | render = False 22 | num_episodes = 400000 23 | env = gym.make('MountainCar-v0').unwrapped 24 | num_state = env.observation_space.shape[0] 25 | num_action = env.action_space.n 26 | torch.manual_seed(seed) 27 | env.seed(seed) 28 | 29 | Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state']) 30 | 31 | class Net(nn.Module): 32 | def __init__(self): 33 | super(Net, self).__init__() 34 | self.fc1 = nn.Linear(num_state, 100) 35 | self.fc2 = nn.Linear(100, num_action) 36 | 37 | def forward(self, x): 38 | x = F.relu(self.fc1(x)) 39 | action_prob = self.fc2(x) 40 | return action_prob 41 | 42 | class DQN(): 43 | 44 | capacity = 8000 45 | learning_rate = 1e-3 46 | memory_count = 0 47 | batch_size = 256 48 | gamma = 0.995 49 | update_count = 0 50 | 51 | def __init__(self): 52 | super(DQN, self).__init__() 53 | self.target_net, self.act_net = Net(), Net() 54 | self.memory = [None]*self.capacity 55 | self.optimizer = optim.Adam(self.act_net.parameters(), self.learning_rate) 56 | self.loss_func = nn.MSELoss() 57 | self.writer = SummaryWriter('./DQN/logs') 58 | 59 | 60 | def select_action(self,state): 61 | state = torch.tensor(state, dtype=torch.float).unsqueeze(0) 62 | value = self.act_net(state) 63 | action_max_value, index = torch.max(value, 1) 64 | action = index.item() 65 | if np.random.rand(1) >= 0.9: # epslion greedy 66 | action = np.random.choice(range(num_action), 1).item() 67 | return action 68 | 69 | def store_transition(self,transition): 70 | index 
= self.memory_count % self.capacity 71 | self.memory[index] = transition 72 | self.memory_count += 1 73 | return self.memory_count >= self.capacity 74 | 75 | def update(self): 76 | if self.memory_count >= self.capacity: 77 | state = torch.tensor([t.state for t in self.memory]).float() 78 | action = torch.LongTensor([t.action for t in self.memory]).view(-1,1).long() 79 | reward = torch.tensor([t.reward for t in self.memory]).float() 80 | next_state = torch.tensor([t.next_state for t in self.memory]).float() 81 | 82 | reward = (reward - reward.mean()) / (reward.std() + 1e-7) 83 | with torch.no_grad(): 84 | target_v = reward + self.gamma * self.target_net(next_state).max(1)[0] 85 | 86 | #Update... 87 | for index in BatchSampler(SubsetRandomSampler(range(len(self.memory))), batch_size=self.batch_size, drop_last=False): 88 | v = (self.act_net(state).gather(1, action))[index] 89 | loss = self.loss_func(target_v[index].unsqueeze(1), (self.act_net(state).gather(1, action))[index]) 90 | self.optimizer.zero_grad() 91 | loss.backward() 92 | self.optimizer.step() 93 | self.writer.add_scalar('loss/value_loss', loss, self.update_count) 94 | self.update_count +=1 95 | if self.update_count % 100 ==0: 96 | self.target_net.load_state_dict(self.act_net.state_dict()) 97 | else: 98 | print("Memory Buff is too less") 99 | def main(): 100 | 101 | agent = DQN() 102 | for i_ep in range(num_episodes): 103 | state = env.reset() 104 | if render: env.render() 105 | for t in range(10000): 106 | action = agent.select_action(state) 107 | next_state, reward, done, info = env.step(action) 108 | if render: env.render() 109 | transition = Transition(state, action, reward, next_state) 110 | agent.store_transition(transition) 111 | state = next_state 112 | if done or t >=9999: 113 | agent.writer.add_scalar('live/finish_step', t+1, global_step=i_ep) 114 | agent.update() 115 | if i_ep % 10 == 0: 116 | print("episodes {}, step is {} ".format(i_ep, t)) 117 | break 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /Char01 DQN/DQN_mountain_car_v1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import optim 6 | import matplotlib.pyplot as plt 7 | import gym 8 | 9 | 10 | #hyper parameters 11 | EPSILON = 0.9 12 | GAMMA = 0.9 13 | LR = 0.01 14 | MEMORY_CAPACITY = 2000 15 | Q_NETWORK_ITERATION = 100 16 | BATCH_SIZE = 32 17 | 18 | EPISODES = 400 19 | env = gym.make('MountainCar-v0') 20 | env = env.unwrapped 21 | NUM_STATES = env.observation_space.shape[0] # 2 22 | NUM_ACTIONS = env.action_space.n 23 | 24 | 25 | class Net(nn.Module): 26 | def __init__(self): 27 | super(Net, self).__init__() 28 | 29 | self.fc1 = nn.Linear(NUM_STATES, 30) 30 | self.fc1.weight.data.normal_(0, 0.1) 31 | self.fc2 = nn.Linear(30, NUM_ACTIONS) 32 | self.fc2.weight.data.normal_(0, 0.1) 33 | 34 | 35 | def forward(self, x): 36 | x = self.fc1(x) 37 | x = F.relu(x) 38 | x = self.fc2(x) 39 | 40 | return x 41 | 42 | class Dqn(): 43 | def __init__(self): 44 | self.eval_net, self.target_net = Net(), Net() 45 | self.memory = np.zeros((MEMORY_CAPACITY, NUM_STATES *2 +2)) 46 | # state, action ,reward and next state 47 | self.memory_counter = 0 48 | self.learn_counter = 0 49 | self.optimizer = optim.Adam(self.eval_net.parameters(), LR) 50 | self.loss = nn.MSELoss() 51 | 52 | self.fig, self.ax = plt.subplots() 53 | 54 | def 
store_trans(self, state, action, reward, next_state): 55 | if self.memory_counter % 500 ==0: 56 | print("The experience pool collects {} time experience".format(self.memory_counter)) 57 | index = self.memory_counter % MEMORY_CAPACITY 58 | trans = np.hstack((state, [action], [reward], next_state)) 59 | self.memory[index,] = trans 60 | self.memory_counter += 1 61 | 62 | def choose_action(self, state): 63 | # notation that the function return the action's index nor the real action 64 | # EPSILON 65 | state = torch.unsqueeze(torch.FloatTensor(state) ,0) 66 | if np.random.randn() <= EPSILON: 67 | action_value = self.eval_net.forward(state) 68 | action = torch.max(action_value, 1)[1].data.numpy() # get action whose q is max 69 | action = action[0] #get the action index 70 | else: 71 | action = np.random.randint(0,NUM_ACTIONS) 72 | return action 73 | 74 | def plot(self, ax, x): 75 | ax.cla() 76 | ax.set_xlabel("episode") 77 | ax.set_ylabel("total reward") 78 | ax.plot(x, 'b-') 79 | plt.pause(0.000000000000001) 80 | 81 | def learn(self): 82 | # learn 100 times then the target network update 83 | if self.learn_counter % Q_NETWORK_ITERATION ==0: 84 | self.target_net.load_state_dict(self.eval_net.state_dict()) 85 | self.learn_counter+=1 86 | 87 | sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE) 88 | batch_memory = self.memory[sample_index, :] 89 | batch_state = torch.FloatTensor(batch_memory[:, :NUM_STATES]) 90 | #note that the action must be a int 91 | batch_action = torch.LongTensor(batch_memory[:, NUM_STATES:NUM_STATES+1].astype(int)) 92 | batch_reward = torch.FloatTensor(batch_memory[:, NUM_STATES+1: NUM_STATES+2]) 93 | batch_next_state = torch.FloatTensor(batch_memory[:, -NUM_STATES:]) 94 | 95 | q_eval = self.eval_net(batch_state).gather(1, batch_action) 96 | q_next = self.target_net(batch_next_state).detach() 97 | q_target = batch_reward + GAMMA*q_next.max(1)[0].view(BATCH_SIZE, 1) 98 | 99 | loss = self.loss(q_eval, q_target) 100 | self.optimizer.zero_grad() 101 | loss.backward() 102 | self.optimizer.step() 103 | 104 | 105 | 106 | def main(): 107 | net = Dqn() 108 | print("The DQN is collecting experience...") 109 | step_counter_list = [] 110 | for episode in range(EPISODES): 111 | state = env.reset() 112 | step_counter = 0 113 | while True: 114 | step_counter +=1 115 | env.render() 116 | action = net.choose_action(state) 117 | next_state, reward, done, info = env.step(action) 118 | reward = reward * 100 if reward >0 else reward * 5 119 | net.store_trans(state, action, reward, next_state) 120 | 121 | if net.memory_counter >= MEMORY_CAPACITY: 122 | net.learn() 123 | if done: 124 | print("episode {}, the reward is {}".format(episode, round(reward, 3))) 125 | if done: 126 | step_counter_list.append(step_counter) 127 | net.plot(net.ax, step_counter_list) 128 | break 129 | 130 | state = next_state 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /Char01 DQN/naiveDQN.py: -------------------------------------------------------------------------------- 1 | # Nota that this network won't work because the reward is always 1 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import gym 8 | 9 | # hyper-parameters 10 | BATCH_SIZE = 128 11 | LR = 0.01 12 | GAMMA = 0.90 13 | EPISILO = 0.9 14 | MEMORY_CAPACITY = 20000 15 | Q_NETWORK_ITERATION = 100 16 | 17 | env = gym.make("CartPole-v0") 18 | env = env.unwrapped 19 | NUM_ACTIONS = env.action_space.n 20 
| NUM_STATES = env.observation_space.shape[0] 21 | ENV_A_SHAPE = 0 if isinstance(env.action_space.sample(), int) else env.action_space.sample.shape 22 | class Net(nn.Module): 23 | """docstring for Net""" 24 | def __init__(self): 25 | super(Net, self).__init__() 26 | self.fc1 = nn.Linear(NUM_STATES, 50) 27 | self.fc1.weight.data.normal_(0,0.1) 28 | self.fc2 = nn.Linear(50,30) 29 | self.fc2.weight.data.normal_(0,0.1) 30 | self.out = nn.Linear(30,NUM_ACTIONS) 31 | self.out.weight.data.normal_(0,0.1) 32 | 33 | def forward(self,x): 34 | x = self.fc1(x) 35 | x = F.relu(x) 36 | x = self.fc2(x) 37 | x = F.relu(x) 38 | action_prob = self.out(x) 39 | return action_prob 40 | 41 | class DQN(): 42 | """docstring for DQN""" 43 | def __init__(self): 44 | super(DQN, self).__init__() 45 | self.eval_net, self.target_net = Net(), Net() 46 | 47 | self.learn_step_counter = 0 48 | self.memory_counter = 0 49 | self.memory = np.zeros((MEMORY_CAPACITY, NUM_STATES * 2 + 2)) 50 | # why the NUM_STATE*2 +2 51 | # When we store the memory, we put the state, action, reward and next_state in the memory 52 | # here reward and action is a number, state is a ndarray 53 | self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR) 54 | self.loss_func = nn.MSELoss() 55 | 56 | def choose_action(self, state): 57 | state = torch.unsqueeze(torch.FloatTensor(state), 0) # get a 1D array 58 | if np.random.randn() <= EPISILO:# greedy policy 59 | action_value = self.eval_net.forward(state) 60 | action = torch.max(action_value, 1)[1].data.numpy() 61 | action = action[0] if ENV_A_SHAPE == 0 else action.reshape(ENV_A_SHAPE) 62 | else: # random policy 63 | action = np.random.randint(0,NUM_ACTIONS) 64 | action = action if ENV_A_SHAPE ==0 else action.reshape(ENV_A_SHAPE) 65 | return action 66 | 67 | 68 | def store_transition(self, state, action, reward, next_state): 69 | transition = np.hstack((state, [action, reward], next_state)) 70 | index = self.memory_counter % MEMORY_CAPACITY 71 | self.memory[index, :] = transition 72 | self.memory_counter += 1 73 | 74 | 75 | def learn(self): 76 | 77 | #update the parameters 78 | if self.learn_step_counter % Q_NETWORK_ITERATION ==0: 79 | self.target_net.load_state_dict(self.eval_net.state_dict()) 80 | self.learn_step_counter+=1 81 | 82 | #sample batch from memory 83 | sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE) 84 | batch_memory = self.memory[sample_index, :] 85 | batch_state = torch.FloatTensor(batch_memory[:, :NUM_STATES]) 86 | batch_action = torch.LongTensor(batch_memory[:, NUM_STATES:NUM_STATES+1].astype(int)) 87 | batch_reward = torch.FloatTensor(batch_memory[:, NUM_STATES+1:NUM_STATES+2]) 88 | batch_next_state = torch.FloatTensor(batch_memory[:,-NUM_STATES:]) 89 | 90 | #q_eval 91 | q_eval = self.eval_net(batch_state).gather(1, batch_action) 92 | q_next = self.target_net(batch_next_state).detach() 93 | q_target = batch_reward + GAMMA * q_next.max(1)[0].view(BATCH_SIZE, 1) 94 | loss = self.loss_func(q_eval, q_target) 95 | 96 | self.optimizer.zero_grad() 97 | loss.backward() 98 | self.optimizer.step() 99 | 100 | def main(): 101 | dqn = DQN() 102 | episodes = 400 103 | print("Collecting Experience....") 104 | for i in range(episodes): 105 | state = env.reset() 106 | ep_reward = 0 107 | while True: 108 | env.render() 109 | action = dqn.choose_action(state) 110 | next_state, reward, done, info = env.step(action) 111 | 112 | dqn.store_transition(state, action, reward, next_state) 113 | ep_reward += reward 114 | 115 | if dqn.memory_counter >= MEMORY_CAPACITY: 116 | dqn.learn() 
117 |                 if done:
118 |                     print("episode: {} , the episode reward is {}".format(i, round(ep_reward, 3)))
119 |             if done:
120 |                 break
121 |             state = next_state
122 | 
123 | if __name__ == '__main__':
124 |     main()
125 | 
--------------------------------------------------------------------------------
/Char01 DQN/readme.md:
--------------------------------------------------------------------------------
1 | # Requirements:
2 | 
3 | - tensorflow 1.10
4 | - pytorch 0.4.1
5 | - tensorboardX
6 | - gym
7 | 
8 | ## Tips for the MountainCar-v0 env:
9 | 
10 | The reward in MountainCar-v0 is very sparse: the agent gets no informative signal until the car actually reaches the top of the mountain. If no sample reaches the top during training, the network basically cannot learn. You can therefore reshape the reward, for example by making it positively correlated with the car's current position. A more advanced approach is inverse reinforcement learning (e.g. using a GAN).
11 | 
12 | ![value_loss](DQN/pic/value_loss.jpg)
13 | ![step](DQN/pic/finish_episode.jpg)
14 | Above is the value loss for DQN. The loss grows to about 1e13, yet the network still works well: as training goes on, target_net and act_net become very different, so the computed loss gets very large. The earlier loss was small because the sparse reward produced only small updates to the two networks.
15 | 
--------------------------------------------------------------------------------
/Char02 Policy Gradient/PolicyGradient.py:
--------------------------------------------------------------------------------
1 | 
2 | import argparse
3 | import gym
4 | import numpy as np
5 | from itertools import count
6 | 
7 | import torch
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 | import torch.optim as optim
11 | import matplotlib.pyplot as plt
12 | from torch.distributions import Categorical
13 | 
14 | 
15 | 
16 | parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
17 | parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
18 |                     help='discount factor (default: 0.99)')
19 | parser.add_argument('--seed', type=int, default=543, metavar='N',
20 |                     help='random seed (default: 543)')
21 | parser.add_argument('--render', action='store_true',
22 |                     help='render the environment')
23 | parser.add_argument('--log-interval', type=int, default=10, metavar='N',
24 |                     help='interval between training status logs (default: 10)')
25 | args = parser.parse_args()
26 | 
27 | 
28 | env = gym.make('CartPole-v0')
29 | env.seed(args.seed)
30 | torch.manual_seed(args.seed)
31 | 
32 | 
33 | class Policy(nn.Module):
34 |     def __init__(self):
35 |         super(Policy, self).__init__()
36 |         self.affine1 = nn.Linear(4, 128)
37 |         self.affine2 = nn.Linear(128, 2)
38 | 
39 |         self.saved_log_probs = []
40 |         self.rewards = []
41 | 
42 |     def forward(self, x):
43 |         x = F.relu(self.affine1(x))
44 |         action_scores = self.affine2(x)
45 |         return F.softmax(action_scores, dim=1)
46 | 
47 | 
48 | policy = Policy()
49 | optimizer = optim.Adam(policy.parameters(), lr=1e-2)
50 | eps = np.finfo(np.float32).eps.item()
51 | 
52 | 
53 | def select_action(state):
54 |     state = torch.from_numpy(state).float().unsqueeze(0)
55 |     probs = policy(state)
56 |     m = Categorical(probs)
57 |     action = m.sample()
58 |     policy.saved_log_probs.append(m.log_prob(action))
59 |     return action.item()
60 | 
61 | 
62 | def finish_episode():
63 |     R = 0
64 |     policy_loss = []
65 |     rewards = []
66 |     for r in policy.rewards[::-1]:
67 |         R
= r + args.gamma * R 68 | rewards.insert(0, R) 69 | rewards = torch.tensor(rewards) 70 | rewards = (rewards - rewards.mean()) / (rewards.std() + eps) 71 | for log_prob, reward in zip(policy.saved_log_probs, rewards): 72 | policy_loss.append(-log_prob * reward) 73 | optimizer.zero_grad() 74 | policy_loss = torch.cat(policy_loss).sum() 75 | policy_loss.backward() 76 | optimizer.step() 77 | del policy.rewards[:] 78 | del policy.saved_log_probs[:] 79 | 80 | 81 | def main(): 82 | running_reward = 10 83 | for i_episode in count(1): 84 | state = env.reset() 85 | for t in range(10000): # Don't infinite loop while learning 86 | action = select_action(state) 87 | state, reward, done, _ = env.step(action) 88 | if args.render: 89 | env.render() 90 | policy.rewards.append(reward) 91 | if done: 92 | break 93 | 94 | running_reward = running_reward * 0.99 + t * 0.01 95 | finish_episode() 96 | if i_episode % args.log_interval == 0: 97 | print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format( 98 | i_episode, t, running_reward)) 99 | if running_reward > env.spec.reward_threshold: 100 | print("Solved! Running reward is now {} and " 101 | "the last episode runs to {} time steps!".format(running_reward, t)) 102 | break 103 | 104 | 105 | if __name__ == '__main__': 106 | main() 107 | -------------------------------------------------------------------------------- /Char02 Policy Gradient/REINFORCE.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import numpy as np 4 | from itertools import count 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | from torch.distributions import Categorical 11 | 12 | 13 | parser = argparse.ArgumentParser(description='PyTorch REINFORCE example') 14 | parser.add_argument('--gamma', type=float, default=0.99, metavar='G', 15 | help='discount factor (default: 0.99)') 16 | parser.add_argument('--seed', type=int, default=543, metavar='N', 17 | help='random seed (default: 543)') 18 | parser.add_argument('--render', action='store_true', 19 | help='render the environment') 20 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 21 | help='interval between training status logs (default: 10)') 22 | args = parser.parse_args() 23 | 24 | 25 | env = gym.make('CartPole-v0') 26 | env.seed(args.seed) 27 | torch.manual_seed(args.seed) 28 | 29 | 30 | class Policy(nn.Module): 31 | def __init__(self): 32 | super(Policy, self).__init__() 33 | self.affine1 = nn.Linear(4, 128) 34 | self.affine2 = nn.Linear(128, 2) 35 | 36 | self.saved_log_probs = [] 37 | self.rewards = [] 38 | 39 | def forward(self, x): 40 | x = F.relu(self.affine1(x)) 41 | action_scores = self.affine2(x) 42 | return F.softmax(action_scores, dim=1) 43 | 44 | 45 | policy = Policy() 46 | optimizer = optim.Adam(policy.parameters(), lr=1e-2) 47 | eps = np.finfo(np.float32).eps.item() 48 | 49 | 50 | def select_action(state): 51 | state = torch.from_numpy(state).float().unsqueeze(0) 52 | probs = policy(state) 53 | m = Categorical(probs) 54 | action = m.sample() 55 | policy.saved_log_probs.append(m.log_prob(action)) 56 | return action.item() 57 | 58 | 59 | def finish_episode(): 60 | R = 0 61 | policy_loss = [] 62 | rewards = [] 63 | for r in policy.rewards[::-1]: 64 | R = r + args.gamma * R 65 | rewards.insert(0, R) 66 | rewards = torch.tensor(rewards) 67 | rewards = (rewards - rewards.mean()) / (rewards.std() + eps) 68 | for log_prob, reward in 
zip(policy.saved_log_probs, rewards): 69 | policy_loss.append(-log_prob * reward) 70 | optimizer.zero_grad() 71 | policy_loss = torch.cat(policy_loss).sum() 72 | policy_loss.backward() 73 | optimizer.step() 74 | del policy.rewards[:] 75 | del policy.saved_log_probs[:] 76 | 77 | 78 | def main(): 79 | running_reward = 10 80 | for i_episode in count(1): 81 | state = env.reset() 82 | for t in range(10000): # Don't infinite loop while learning 83 | action = select_action(state) 84 | state, reward, done, _ = env.step(action) 85 | if args.render: 86 | env.render() 87 | policy.rewards.append(reward) 88 | if done: 89 | break 90 | 91 | running_reward = running_reward * 0.99 + t * 0.01 92 | finish_episode() 93 | if i_episode % args.log_interval == 0: 94 | print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format( 95 | i_episode, t, running_reward)) 96 | if running_reward > env.spec.reward_threshold: 97 | print("Solved! Running reward is now {} and " 98 | "the last episode runs to {} time steps!".format(running_reward, t)) 99 | break 100 | 101 | 102 | if __name__ == '__main__': 103 | main() 104 | -------------------------------------------------------------------------------- /Char02 Policy Gradient/REINFORCE_with_Baseline.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import gym 7 | import torch 8 | from torch.distributions import Categorical 9 | import torch.optim as optim 10 | from copy import deepcopy 11 | import argparse 12 | import matplotlib.pyplot as plt 13 | from tensorboardX import SummaryWriter 14 | from torch.nn.utils import clip_grad_norm_ 15 | render = False 16 | 17 | #parser = argparse.ArgumentParser(description='PyTorch REINFORCE example with baseline') 18 | # parser.add_argument('--render', action='store_true', default=True, 19 | # help='render the environment') 20 | #args = parser.parse_args() 21 | 22 | #神经网络的输出: Net1输入state,输出action_prob; Net2 输入state,输出各个action_reward 23 | class Policy(nn.Module): 24 | def __init__(self,n_states, n_hidden, n_output): 25 | super(Policy, self).__init__() 26 | self.linear1 = nn.Linear(n_states, n_hidden) 27 | self.linear2 = nn.Linear(n_hidden, n_output) 28 | 29 | #这是policy的参数 30 | self.reward = [] 31 | self.log_act_probs = [] 32 | self.Gt = [] 33 | self.sigma = [] 34 | #这是state_action_func的参数 35 | # self.Reward = [] 36 | # self.s_value = [] 37 | 38 | def forward(self, x): 39 | x = F.relu(self.linear1(x)) 40 | output = F.softmax(self.linear2(x), dim= 1) 41 | # self.act_probs.append(action_probs) 42 | return output 43 | 44 | 45 | 46 | env = gym.make('CartPole-v0') 47 | # writer = SummaryWriter('./yingyingying') 48 | # state = env.reset() 49 | n_states = env.observation_space.shape[0] 50 | n_actions = env.action_space.n 51 | 52 | policy = Policy(n_states, 128, n_actions) 53 | s_value_func = Policy(n_states, 128, 1) 54 | 55 | 56 | alpha_theta = 1e-3 57 | optimizer_theta = optim.Adam(policy.parameters(), lr=alpha_theta) 58 | # alpha_w = 1e-3 #初始化 59 | # optimizer_w = optim.Adam(policy.parameters(), lr=alpha_w) 60 | gamma = 0.99 61 | 62 | 63 | 64 | seed = 1 65 | env.seed(seed) 66 | torch.manual_seed(seed) 67 | live_time = [] 68 | 69 | def loop_episode(): 70 | 71 | state = env.reset() 72 | if render: env.render() 73 | policy_loss = [] 74 | s_value = [] 75 | state_sequence = [] 76 | log_act_prob = [] 77 | for t in range(1000): 78 | state = torch.from_numpy(state).unsqueeze(0).float() # 在第0维增加一个维度,将数据组织成[N , .....] 
形式 79 | state_sequence.append(deepcopy(state)) 80 | action_probs = policy(state) 81 | m = Categorical(action_probs) 82 | action = m.sample() 83 | m_log_prob = m.log_prob(action) 84 | log_act_prob.append(m_log_prob) 85 | # policy.log_act_probs.append(m_log_prob) 86 | action = action.item() 87 | next_state, re, done, _ = env.step(action) 88 | if render: env.render() 89 | policy.reward.append(re) 90 | if done: 91 | live_time.append(t) 92 | break 93 | state = next_state 94 | 95 | R = 0 96 | Gt = [] 97 | 98 | # get Gt value 99 | for r in policy.reward[::-1]: 100 | R = r + gamma * R 101 | Gt.insert(0, R) 102 | # s_value_func.sigma.insert(0,sigma) 103 | # policy.Gt.insert(0,R) 104 | 105 | 106 | # update step by step 107 | for i in range(len(Gt)): 108 | 109 | 110 | 111 | G = Gt[i] 112 | V = s_value_func(state_sequence[i]) 113 | delta = G - V 114 | 115 | # update value network 116 | alpha_w = 1e-3 # 初始化 117 | 118 | optimizer_w = optim.Adam(policy.parameters(), lr=alpha_w) 119 | optimizer_w.zero_grad() 120 | policy_loss_w =-delta 121 | policy_loss_w.backward(retain_graph = True) 122 | clip_grad_norm_(policy_loss_w, 0.1) 123 | optimizer_w.step() 124 | 125 | # update policy network 126 | optimizer_theta.zero_grad() 127 | policy_loss_theta = - log_act_prob[i] * delta 128 | policy_loss_theta.backward(retain_graph = True) 129 | clip_grad_norm_(policy_loss_theta, 0.1) 130 | optimizer_theta.step() 131 | 132 | del policy.log_act_probs[:] 133 | del policy.reward[:] 134 | 135 | 136 | def plot(live_time): 137 | plt.ion() 138 | plt.grid() 139 | plt.plot(live_time, 'g-') 140 | plt.xlabel('running step') 141 | plt.ylabel('live time') 142 | plt.pause(0.000001) 143 | 144 | 145 | 146 | if __name__ == '__main__': 147 | 148 | #生成若干episode 149 | # graph_data = torch.autograd.Variable(torch.ones(1,4)) 150 | # writer.add_graph(policy, (graph_data, )) 151 | for i_episode in range(1000): 152 | loop_episode() 153 | plot(live_time) 154 | #policy.plot(live_time) 155 | -------------------------------------------------------------------------------- /Char02 Policy Gradient/Run_Model.py: -------------------------------------------------------------------------------- 1 | # MountainCar V0 2 | # 3 | import numpy as np 4 | import gym 5 | import matplotlib.pyplot as plt 6 | from itertools import count 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | from torch.optim import adam 12 | from torch.distributions import Categorical 13 | 14 | env = gym.make('MountainCar-v0') 15 | env = env.unwrapped 16 | env.seed(1) 17 | 18 | torch.manual_seed(1) 19 | plt.ion() 20 | 21 | 22 | #Hyperparameters 23 | learning_rate = 0.02 24 | gamma = 0.995 25 | episodes = 1000 26 | 27 | eps = np.finfo(np.float32).eps.item() 28 | 29 | action_space = env.action_space.n 30 | state_space = env.observation_space.shape[0] 31 | 32 | class Policy(nn.Module): 33 | def __init__(self): 34 | super(Policy, self).__init__() 35 | 36 | self.fc1 = nn.Linear(state_space, 20) 37 | #self.fc2 = nn.Linear(128,64) 38 | self.fc3 = nn.Linear(20, action_space) 39 | 40 | self.gamma = gamma 41 | self.saved_log_probs = [] 42 | self.rewards = [] 43 | 44 | def forward(self, x): 45 | 46 | x = F.relu(self.fc1(x)) 47 | #x = F.relu(self.fc2(x)) 48 | x = F.softmax(self.fc3(x), dim=1) 49 | 50 | return x 51 | 52 | policy = torch.load('policyNet.pkl') 53 | 54 | def plot(steps): 55 | ax = plt.subplot(111) 56 | ax.cla() 57 | ax.set_title('Training') 58 | ax.set_xlabel('Episode') 59 | ax.set_ylabel('Run Time') 60 | ax.plot(steps) 61 | RunTime = len(steps) 
62 | path = './PG_MountainCar-v0/'+'RunTime'+str(RunTime)+'.jpg' 63 | if len(steps) % 100 == 0: 64 | #plt.savefig(path) 65 | pass 66 | plt.pause(0.0000001) 67 | 68 | def selct_action(state): 69 | state = torch.from_numpy(state).float().unsqueeze(0) 70 | probs = policy(state) 71 | c = Categorical(probs) 72 | action = c.sample() 73 | 74 | 75 | #policy.saved_log_probs.append(c.log_prob(action)) 76 | action = action.item() 77 | return action 78 | 79 | def run_Model(): 80 | running_reward = 0 81 | steps = [] 82 | for episode in count(60000): 83 | state = env.reset() 84 | 85 | for t in range(10000): 86 | action = selct_action(state) 87 | state, reward ,done, info = env.step(action) 88 | env.render() 89 | #policy.rewards.append(reward) 90 | 91 | if done: 92 | print("Episode {}, live time = {}".format(episode, t)) 93 | steps.append(t) 94 | plot(steps) 95 | break 96 | if episode % 50 == 0: 97 | pass 98 | #torch.save(policy, 'policyNet.pkl') 99 | 100 | running_reward = running_reward * policy.gamma - t*0.01 101 | #finish_episode() 102 | 103 | if __name__ == '__main__': 104 | run_Model() -------------------------------------------------------------------------------- /Char02 Policy Gradient/naive-policy-gradient.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import matplotlib.pyplot as plt 4 | from itertools import count 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.autograd import Variable 10 | from torch.distributions import Categorical 11 | 12 | 13 | #Hyperparameters 14 | learning_rate = 0.01 15 | gamma = 0.98 16 | 17 | num_episode = 5000 18 | batch_size = 32 19 | 20 | 21 | env = gym.make('CartPole-v0') 22 | state_space = env.observation_space.shape[0] 23 | action_space = env.action_space.n 24 | 25 | def plot_durations(episode_durations): 26 | plt.ion() 27 | plt.figure(2) 28 | plt.clf() 29 | duration_t = torch.FloatTensor(episode_durations) 30 | plt.title('Training') 31 | plt.xlabel('Episodes') 32 | plt.ylabel('Duration') 33 | plt.plot(duration_t.numpy()) 34 | 35 | if len(duration_t) >= 100: 36 | means = duration_t.unfold(0,100,1).mean(1).view(-1) 37 | means = torch.cat((torch.zeros(99), means)) 38 | plt.plot(means.numpy()) 39 | 40 | plt.pause(0.00001) 41 | 42 | class Policy(nn.Module): 43 | 44 | def __init__(self): 45 | super(Policy, self).__init__() 46 | 47 | self.state_space = state_space 48 | self.action_space = action_space 49 | 50 | self.fc1 = nn.Linear(self.state_space, 128) 51 | self.fc2 = nn.Linear(128, self.action_space) 52 | 53 | def forward(self, x): 54 | x = self.fc1(x) 55 | #x = F.dropout(x, 0.5) 56 | x = F.relu(x) 57 | x = F.softmax(self.fc2(x), dim=-1) 58 | 59 | return x 60 | 61 | policy = Policy() 62 | optimizer = torch.optim.Adam(policy.parameters(), lr=learning_rate) 63 | 64 | 65 | 66 | def train(): 67 | 68 | episode_durations = [] 69 | #Batch_history 70 | state_pool = [] 71 | action_pool = [] 72 | reward_pool = [] 73 | steps = 0 74 | 75 | for episode in range(num_episode): 76 | state = env.reset() 77 | state = torch.from_numpy(state).float() 78 | state = Variable(state) 79 | 80 | env.render() 81 | 82 | for t in count(): 83 | probs = policy(state) 84 | c = Categorical(probs) 85 | action = c.sample() 86 | 87 | action = action.data.numpy().astype('int32') 88 | next_state, reward, done, info = env.step(action) 89 | reward = 0 if done else reward # correct the reward 90 | env.render() 91 | 92 | state_pool.append(state) 93 | 
action_pool.append(float(action)) 94 | reward_pool.append(reward) 95 | 96 | state = next_state 97 | state = torch.from_numpy(state).float() 98 | state = Variable(state) 99 | 100 | steps += 1 101 | 102 | if done: 103 | episode_durations.append(t+1) 104 | plot_durations(episode_durations) 105 | break 106 | 107 | # update policy 108 | if episode >0 and episode % batch_size == 0: 109 | 110 | r = 0 111 | ''' 112 | for i in reversed(range(steps)): 113 | if reward_pool[i] == 0: 114 | running_add = 0 115 | else: 116 | running_add = running_add * gamma +reward_pool[i] 117 | reward_pool[i] = running_add 118 | ''' 119 | for i in reversed(range(steps)): 120 | if reward_pool[i] == 0: 121 | r = 0 122 | else: 123 | r = r * gamma + reward_pool[i] 124 | reward_pool[i] = r 125 | 126 | #Normalize reward 127 | reward_mean = np.mean(reward_pool) 128 | reward_std = np.std(reward_pool) 129 | reward_pool = (reward_pool-reward_mean)/reward_std 130 | 131 | #gradiend desent 132 | optimizer.zero_grad() 133 | 134 | for i in range(steps): 135 | state = state_pool[i] 136 | action = Variable(torch.FloatTensor([action_pool[i]])) 137 | reward = reward_pool[i] 138 | 139 | probs = policy(state) 140 | c = Categorical(probs) 141 | 142 | loss = -c.log_prob(action) * reward 143 | loss.backward() 144 | 145 | optimizer.step() 146 | 147 | # clear the batch pool 148 | state_pool = [] 149 | action_pool = [] 150 | reward_pool = [] 151 | steps = 0 152 | 153 | train() 154 | -------------------------------------------------------------------------------- /Char02 Policy Gradient/pytorch_MountainCar-v0.py: -------------------------------------------------------------------------------- 1 | # MountainCar V0 2 | 3 | import numpy as np 4 | import gym 5 | import matplotlib.pyplot as plt 6 | from itertools import count 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | from torch.optim import adam 12 | from torch.distributions import Categorical 13 | 14 | env = gym.make('MountainCar-v0') 15 | env = env.unwrapped 16 | env.seed(1) 17 | 18 | torch.manual_seed(1) 19 | plt.ion() 20 | 21 | 22 | #Hyperparameters 23 | learning_rate = 0.02 24 | gamma = 0.995 25 | episodes = 1000 26 | 27 | eps = np.finfo(np.float32).eps.item() 28 | 29 | action_space = env.action_space.n 30 | state_space = env.observation_space.shape[0] 31 | 32 | 33 | class Policy(nn.Module): 34 | def __init__(self): 35 | super(Policy, self).__init__() 36 | 37 | self.fc1 = nn.Linear(state_space, 20) 38 | #self.fc2 = nn.Linear(128,64) 39 | self.fc3 = nn.Linear(20, action_space) 40 | 41 | self.gamma = gamma 42 | self.saved_log_probs = [] 43 | self.rewards = [] 44 | 45 | def forward(self, x): 46 | 47 | x = F.relu(self.fc1(x)) 48 | #x = F.relu(self.fc2(x)) 49 | x = F.softmax(self.fc3(x), dim=1) 50 | 51 | return x 52 | 53 | policy = Policy() 54 | optimizer = adam.Adam(policy.parameters(), lr=learning_rate) 55 | 56 | def selct_action(state): 57 | state = torch.from_numpy(state).float().unsqueeze(0) 58 | probs = policy(state) 59 | c = Categorical(probs) 60 | action = c.sample() 61 | 62 | 63 | policy.saved_log_probs.append(c.log_prob(action)) 64 | action = action.item() 65 | return action 66 | 67 | def finish_episode(): 68 | R = 0 69 | policy_loss = [] 70 | rewards = [] 71 | 72 | for r in policy.rewards[::-1]: 73 | R = r + policy.gamma * R 74 | rewards.insert(0, R) 75 | 76 | # Formalize reward 77 | rewards = torch.tensor(rewards) 78 | rewards = (rewards - rewards.mean())/(rewards.std() + eps) 79 | 80 | # get loss 81 | for reward, log_prob in zip(rewards, 
policy.saved_log_probs): 82 | policy_loss.append(-log_prob * reward) 83 | 84 | optimizer.zero_grad() 85 | policy_loss = torch.cat(policy_loss).sum() 86 | policy_loss.backward() 87 | optimizer.step() 88 | 89 | 90 | 91 | del policy.rewards[:] 92 | del policy.saved_log_probs[:] 93 | 94 | def plot(steps): 95 | ax = plt.subplot(111) 96 | ax.cla() 97 | ax.set_title('Training') 98 | ax.set_xlabel('Episode') 99 | ax.set_ylabel('Run Time') 100 | ax.plot(steps) 101 | RunTime = len(steps) 102 | path = './PG_MountainCar-v0/'+'RunTime'+str(RunTime)+'.jpg' 103 | if len(steps) % 100 == 0: 104 | plt.savefig(path) 105 | plt.pause(0.0000001) 106 | 107 | 108 | 109 | def main(): 110 | 111 | running_reward = 0 112 | steps = [] 113 | for episode in count(60000): 114 | state = env.reset() 115 | 116 | for t in range(10000): 117 | action = selct_action(state) 118 | state, reward ,done, info = env.step(action) 119 | env.render() 120 | policy.rewards.append(reward) 121 | 122 | if done: 123 | print("Episode {}, live time = {}".format(episode, t)) 124 | steps.append(t) 125 | plot(steps) 126 | break 127 | if episode % 50 == 0: 128 | torch.save(policy, 'policyNet.pkl') 129 | 130 | running_reward = running_reward * policy.gamma - t*0.01 131 | finish_episode() 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /Char03 Actor-Critic/AC_CartPole-v0.py: -------------------------------------------------------------------------------- 1 | import gym, os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from itertools import count 5 | from collections import namedtuple 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from torch.distributions import Categorical 12 | 13 | #Parameters 14 | env = gym.make('CartPole-v0') 15 | env = env.unwrapped 16 | 17 | env.seed(1) 18 | torch.manual_seed(1) 19 | 20 | state_space = env.observation_space.shape[0] 21 | action_space = env.action_space.n 22 | 23 | 24 | #Hyperparameters 25 | learning_rate = 0.01 26 | gamma = 0.99 27 | episodes = 20000 28 | render = False 29 | eps = np.finfo(np.float32).eps.item() 30 | SavedAction = namedtuple('SavedAction', ['log_prob', 'value']) 31 | 32 | class Policy(nn.Module): 33 | def __init__(self): 34 | super(Policy, self).__init__() 35 | self.fc1 = nn.Linear(state_space, 32) 36 | 37 | self.action_head = nn.Linear(32, action_space) 38 | self.value_head = nn.Linear(32, 1) # Scalar Value 39 | 40 | self.save_actions = [] 41 | self.rewards = [] 42 | os.makedirs('./AC_CartPole-v0', exist_ok=True) 43 | 44 | def forward(self, x): 45 | x = F.relu(self.fc1(x)) 46 | action_score = self.action_head(x) 47 | state_value = self.value_head(x) 48 | 49 | return F.softmax(action_score, dim=-1), state_value 50 | 51 | model = Policy() 52 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 53 | 54 | def plot(steps): 55 | ax = plt.subplot(111) 56 | ax.cla() 57 | ax.grid() 58 | ax.set_title('Training') 59 | ax.set_xlabel('Episode') 60 | ax.set_ylabel('Run Time') 61 | ax.plot(steps) 62 | RunTime = len(steps) 63 | 64 | path = './AC_CartPole-v0/' + 'RunTime' + str(RunTime) + '.jpg' 65 | if len(steps) % 200 == 0: 66 | plt.savefig(path) 67 | plt.pause(0.0000001) 68 | 69 | def select_action(state): 70 | state = torch.from_numpy(state).float() 71 | probs, state_value = model(state) 72 | m = Categorical(probs) 73 | action = m.sample() 74 | model.save_actions.append(SavedAction(m.log_prob(action), state_value)) 75 
| 76 | return action.item() 77 | 78 | 79 | def finish_episode(): 80 | R = 0 81 | save_actions = model.save_actions 82 | policy_loss = [] 83 | value_loss = [] 84 | rewards = [] 85 | 86 | for r in model.rewards[::-1]: 87 | R = r + gamma * R 88 | rewards.insert(0, R) 89 | 90 | rewards = torch.tensor(rewards) 91 | rewards = (rewards - rewards.mean()) / (rewards.std() + eps) 92 | 93 | for (log_prob , value), r in zip(save_actions, rewards): 94 | reward = r - value.item() 95 | policy_loss.append(-log_prob * reward) 96 | value_loss.append(F.smooth_l1_loss(value, torch.tensor([r]))) 97 | 98 | optimizer.zero_grad() 99 | loss = torch.stack(policy_loss).sum() + torch.stack(value_loss).sum() 100 | loss.backward() 101 | optimizer.step() 102 | 103 | del model.rewards[:] 104 | del model.save_actions[:] 105 | 106 | def main(): 107 | running_reward = 10 108 | live_time = [] 109 | for i_episode in count(episodes): 110 | state = env.reset() 111 | for t in count(): 112 | action = select_action(state) 113 | state, reward, done, info = env.step(action) 114 | if render: env.render() 115 | model.rewards.append(reward) 116 | 117 | if done or t >= 1000: 118 | break 119 | running_reward = running_reward * 0.99 + t * 0.01 120 | live_time.append(t) 121 | plot(live_time) 122 | if i_episode % 100 == 0: 123 | modelPath = './AC_CartPole_Model/ModelTraing'+str(i_episode)+'Times.pkl' 124 | torch.save(model, modelPath) 125 | finish_episode() 126 | 127 | if __name__ == '__main__': 128 | main() 129 | -------------------------------------------------------------------------------- /Char03 Actor-Critic/AC_MountainCar-v0.py: -------------------------------------------------------------------------------- 1 | import gym, os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from itertools import count 5 | from collections import namedtuple 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn import functional as F 10 | from torch.optim import Adam 11 | from torch.distributions import Categorical 12 | from torch.nn.functional import smooth_l1_loss 13 | 14 | #Hyperparameters 15 | LEARNING_RATE = 0.01 16 | GAMMA = 0.995 17 | NUM_EPISODES = 50000 18 | RENDER = False 19 | #env info 20 | env = gym.make('MountainCar-v0') 21 | env = env.unwrapped 22 | 23 | env.seed(1) 24 | torch.manual_seed(1) 25 | 26 | num_state = env.observation_space.shape[0] 27 | num_action = env.action_space.n 28 | eps = np.finfo(np.float32).eps.item() 29 | plt.ion() 30 | saveAction = namedtuple('SavedActions',['probs', 'action_values']) 31 | 32 | class Module(nn.Module): 33 | def __init__(self): 34 | super(Module, self).__init__() 35 | self.fc1 = nn.Linear(num_state, 128) 36 | #self.fc2 = nn.Linear(64, 128) 37 | 38 | self.action_head = nn.Linear(128, num_action) 39 | self.value_head = nn.Linear(128, 1) 40 | self.policy_action_value = [] 41 | self.rewards = [] 42 | 43 | self.gamma = GAMMA 44 | os.makedirs('/AC_MountainCar-v0_Model/', exist_ok=True) 45 | 46 | 47 | def forward(self, x): 48 | x = F.relu(self.fc1(x)) 49 | #x = F.relu(self.fc2(x)) 50 | 51 | probs = F.softmax(self.action_head(x)) 52 | value = self.value_head(x) 53 | return probs, value 54 | 55 | policy = Module() 56 | optimizer = Adam(policy.parameters(), lr=LEARNING_RATE) 57 | 58 | def plot(steps): 59 | ax = plt.subplot(111) 60 | ax.cla() 61 | ax.grid() 62 | ax.set_title('Training') 63 | ax.set_xlabel('Episode') 64 | ax.set_ylabel('Run Time') 65 | ax.plot(steps) 66 | RunTime = len(steps) 67 | path = './AC_MountainCar-v0/' + 'RunTime' + str(RunTime) + '.jpg' 68 | if len(steps) % 200 
== 0: 69 | plt.savefig(path) 70 | plt.pause(0.0000001) 71 | 72 | 73 | def select_action(state): 74 | state = torch.from_numpy(state).float() 75 | probs, value = policy(state) 76 | c = Categorical(probs) 77 | action = c.sample() 78 | log_prob = c.log_prob(action) 79 | 80 | 81 | policy.policy_action_value.append(saveAction(log_prob, value)) 82 | action = action.item() 83 | return action 84 | 85 | 86 | def finish_episode(): 87 | rewards = [] 88 | saveActions = policy.policy_action_value 89 | policy_loss = [] 90 | value_loss = [] 91 | R = 0 92 | 93 | for r in policy.rewards[::-1]: 94 | R = r + policy.gamma * R 95 | rewards.insert(0, R) 96 | 97 | # Normalize the reward 98 | rewards = torch.tensor(rewards) 99 | rewards = (rewards - rewards.mean()) / (rewards.std() + eps) 100 | 101 | #Figure out loss 102 | for (log_prob, value), r in zip(saveActions, rewards): 103 | reward = r - value.item() 104 | policy_loss.append(-log_prob * reward) 105 | value_loss.append(smooth_l1_loss(value, torch.tensor([r]) )) 106 | 107 | optimizer.zero_grad() 108 | loss = torch.stack(policy_loss).sum() + torch.stack(policy_loss).sum() 109 | loss.backward() 110 | optimizer.step() 111 | 112 | del policy.rewards[:] 113 | del policy.policy_action_value[:] 114 | 115 | 116 | def main(): 117 | run_steps = [] 118 | for i_episode in range(NUM_EPISODES): 119 | state = env.reset() 120 | if RENDER: env.render() 121 | 122 | for t in count(): 123 | action = select_action(state) 124 | state , reward, done, _ = env.step(action) 125 | reward = state[0] + reward 126 | if RENDER: env.render() 127 | policy.rewards.append(reward) 128 | 129 | if done: 130 | run_steps.append(t) 131 | print("Epiosde {} , run step is {} ".format(i_episode+1 , t+1)) 132 | break 133 | 134 | finish_episode() 135 | plot(run_steps) 136 | 137 | if i_episode % 100 == 0 and i_episode !=0: 138 | 139 | modelPath = './AC_MountainCar-v0_Model/ModelTraing' + str(i_episode) + 'Times.pkl' 140 | torch.save(policy, modelPath) 141 | 142 | 143 | 144 | if __name__ == '__main__': 145 | main() 146 | -------------------------------------------------------------------------------- /Char04 A2C/A2C.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | import random 4 | 5 | import gym 6 | import numpy as np 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | import torch.nn.functional as F 12 | from torch.distributions import Categorical 13 | 14 | import matplotlib.pyplot as plt 15 | 16 | use_cuda = torch.cuda.is_available() 17 | device = torch.device("cuda" if use_cuda else "cpu") 18 | 19 | from multiprocessing_env import SubprocVecEnv 20 | 21 | num_envs = 8 22 | env_name = "CartPole-v0" 23 | 24 | def make_env(): 25 | def _thunk(): 26 | env = gym.make(env_name) 27 | return env 28 | return _thunk 29 | 30 | plt.ion() 31 | envs = [make_env() for i in range(num_envs)] 32 | envs = SubprocVecEnv(envs) # 8 env 33 | 34 | env = gym.make(env_name) # a single env 35 | 36 | class ActorCritic(nn.Module): 37 | def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0): 38 | super(ActorCritic, self).__init__() 39 | 40 | self.critic = nn.Sequential( 41 | nn.Linear(num_inputs, hidden_size), 42 | nn.ReLU(), 43 | nn.Linear(hidden_size, 1) 44 | ) 45 | 46 | self.actor = nn.Sequential( 47 | nn.Linear(num_inputs, hidden_size), 48 | nn.ReLU(), 49 | nn.Linear(hidden_size, num_outputs), 50 | nn.Softmax(dim=1), 51 | ) 52 | 53 | def forward(self, x): 54 | value = self.critic(x) 55 | probs = self.actor(x) 56 | dist = 
Categorical(probs) 57 | return dist, value 58 | 59 | 60 | def test_env(vis=False): 61 | state = env.reset() 62 | if vis: env.render() 63 | done = False 64 | total_reward = 0 65 | while not done: 66 | state = torch.FloatTensor(state).unsqueeze(0).to(device) 67 | dist, _ = model(state) 68 | next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0]) 69 | state = next_state 70 | if vis: env.render() 71 | total_reward += reward 72 | return total_reward 73 | 74 | 75 | def compute_returns(next_value, rewards, masks, gamma=0.99): 76 | R = next_value 77 | returns = [] 78 | for step in reversed(range(len(rewards))): 79 | R = rewards[step] + gamma * R * masks[step] 80 | returns.insert(0, R) 81 | return returns 82 | 83 | def plot(frame_idx, rewards): 84 | plt.plot(rewards,'b-') 85 | plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) 86 | plt.pause(0.0001) 87 | 88 | 89 | num_inputs = envs.observation_space.shape[0] 90 | num_outputs = envs.action_space.n 91 | 92 | #Hyper params: 93 | hidden_size = 256 94 | lr = 1e-3 95 | num_steps = 5 96 | 97 | model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device) 98 | optimizer = optim.Adam(model.parameters()) 99 | 100 | 101 | max_frames = 20000 102 | frame_idx = 0 103 | test_rewards = [] 104 | 105 | 106 | state = envs.reset() 107 | 108 | while frame_idx < max_frames: 109 | 110 | log_probs = [] 111 | values = [] 112 | rewards = [] 113 | masks = [] 114 | entropy = 0 115 | 116 | # rollout trajectory 117 | for _ in range(num_steps): 118 | state = torch.FloatTensor(state).to(device) 119 | dist, value = model(state) 120 | 121 | action = dist.sample() 122 | next_state, reward, done, _ = envs.step(action.cpu().numpy()) 123 | 124 | log_prob = dist.log_prob(action) 125 | entropy += dist.entropy().mean() 126 | 127 | log_probs.append(log_prob) 128 | values.append(value) 129 | rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device)) 130 | masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device)) 131 | 132 | state = next_state 133 | frame_idx += 1 134 | 135 | if frame_idx % 100 == 0: 136 | test_rewards.append(np.mean([test_env() for _ in range(10)])) 137 | plot(frame_idx, test_rewards) 138 | 139 | next_state = torch.FloatTensor(next_state).to(device) 140 | _, next_value = model(next_state) 141 | returns = compute_returns(next_value, rewards, masks) 142 | 143 | log_probs = torch.cat(log_probs) 144 | returns = torch.cat(returns).detach() 145 | values = torch.cat(values) 146 | 147 | advantage = returns - values 148 | 149 | actor_loss = -(log_probs * advantage.detach()).mean() 150 | critic_loss = advantage.pow(2).mean() 151 | 152 | loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy 153 | 154 | optimizer.zero_grad() 155 | loss.backward() 156 | optimizer.step() 157 | 158 | #test_env(True) 159 | -------------------------------------------------------------------------------- /Char04 A2C/multiprocessing_env.py: -------------------------------------------------------------------------------- 1 | #This code is from openai baseline 2 | #https://github.com/openai/baselines/tree/master/baselines/common/vec_env 3 | 4 | import numpy as np 5 | from multiprocessing import Process, Pipe 6 | 7 | def worker(remote, parent_remote, env_fn_wrapper): 8 | parent_remote.close() 9 | env = env_fn_wrapper.x() 10 | while True: 11 | cmd, data = remote.recv() 12 | if cmd == 'step': 13 | ob, reward, done, info = env.step(data) 14 | if done: 15 | ob = env.reset() 16 | remote.send((ob, reward, done, info)) 17 | elif cmd == 'reset': 18 | ob = env.reset() 19 | 
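# Each worker process owns one environment and services commands arriving on its end
# of a Pipe: 'step' (which auto-resets the env when done), 'reset', 'reset_task',
# 'close', and 'get_spaces'. The parent-side SubprocVecEnv defined below batches these
# calls, so the A2C rollout receives stacked observations, rewards and done flags
# from num_envs environments at once.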
remote.send(ob) 20 | elif cmd == 'reset_task': 21 | ob = env.reset_task() 22 | remote.send(ob) 23 | elif cmd == 'close': 24 | remote.close() 25 | break 26 | elif cmd == 'get_spaces': 27 | remote.send((env.observation_space, env.action_space)) 28 | else: 29 | raise NotImplementedError 30 | 31 | class VecEnv(object): 32 | """ 33 | An abstract asynchronous, vectorized environment. 34 | """ 35 | def __init__(self, num_envs, observation_space, action_space): 36 | self.num_envs = num_envs 37 | self.observation_space = observation_space 38 | self.action_space = action_space 39 | 40 | def reset(self): 41 | """ 42 | Reset all the environments and return an array of 43 | observations, or a tuple of observation arrays. 44 | If step_async is still doing work, that work will 45 | be cancelled and step_wait() should not be called 46 | until step_async() is invoked again. 47 | """ 48 | pass 49 | 50 | def step_async(self, actions): 51 | """ 52 | Tell all the environments to start taking a step 53 | with the given actions. 54 | Call step_wait() to get the results of the step. 55 | You should not call this if a step_async run is 56 | already pending. 57 | """ 58 | pass 59 | 60 | def step_wait(self): 61 | """ 62 | Wait for the step taken with step_async(). 63 | Returns (obs, rews, dones, infos): 64 | - obs: an array of observations, or a tuple of 65 | arrays of observations. 66 | - rews: an array of rewards 67 | - dones: an array of "episode done" booleans 68 | - infos: a sequence of info objects 69 | """ 70 | pass 71 | 72 | def close(self): 73 | """ 74 | Clean up the environments' resources. 75 | """ 76 | pass 77 | 78 | def step(self, actions): 79 | self.step_async(actions) 80 | return self.step_wait() 81 | 82 | 83 | class CloudpickleWrapper(object): 84 | """ 85 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 86 | """ 87 | def __init__(self, x): 88 | self.x = x 89 | def __getstate__(self): 90 | import cloudpickle 91 | return cloudpickle.dumps(self.x) 92 | def __setstate__(self, ob): 93 | import pickle 94 | self.x = pickle.loads(ob) 95 | 96 | 97 | class SubprocVecEnv(VecEnv): 98 | def __init__(self, env_fns, spaces=None): 99 | """ 100 | envs: list of gym environments to run in subprocesses 101 | """ 102 | self.waiting = False 103 | self.closed = False 104 | nenvs = len(env_fns) 105 | self.nenvs = nenvs 106 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 107 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 108 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 109 | for p in self.ps: 110 | p.daemon = True # if the main process crashes, we should not cause things to hang 111 | p.start() 112 | for remote in self.work_remotes: 113 | remote.close() 114 | 115 | self.remotes[0].send(('get_spaces', None)) 116 | observation_space, action_space = self.remotes[0].recv() 117 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 118 | 119 | def step_async(self, actions): 120 | for remote, action in zip(self.remotes, actions): 121 | remote.send(('step', action)) 122 | self.waiting = True 123 | 124 | def step_wait(self): 125 | results = [remote.recv() for remote in self.remotes] 126 | self.waiting = False 127 | obs, rews, dones, infos = zip(*results) 128 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 129 | 130 | def reset(self): 131 | for remote in self.remotes: 132 | remote.send(('reset', None)) 133 | return np.stack([remote.recv() for remote in 
self.remotes]) 134 | 135 | def reset_task(self): 136 | for remote in self.remotes: 137 | remote.send(('reset_task', None)) 138 | return np.stack([remote.recv() for remote in self.remotes]) 139 | 140 | def close(self): 141 | if self.closed: 142 | return 143 | if self.waiting: 144 | for remote in self.remotes: 145 | remote.recv() 146 | for remote in self.remotes: 147 | remote.send(('close', None)) 148 | for p in self.ps: 149 | p.join() 150 | self.closed = True 151 | 152 | def __len__(self): 153 | return self.nenvs 154 | -------------------------------------------------------------------------------- /Char05 DDPG/DDPG.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from itertools import count 3 | 4 | import os, sys, random 5 | import numpy as np 6 | 7 | import gym 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | from torch.distributions import Normal 13 | from tensorboardX import SummaryWriter 14 | 15 | ''' 16 | Implementation of Deep Deterministic Policy Gradients (DDPG) with pytorch 17 | riginal paper: https://arxiv.org/abs/1509.02971 18 | Not the author's implementation ! 19 | ''' 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--mode', default='train', type=str) # mode = 'train' or 'test' 23 | # OpenAI gym environment name, # ['BipedalWalker-v2', 'Pendulum-v0'] or any continuous environment 24 | # Note that DDPG is feasible about hyper-parameters. 25 | # You should fine-tuning if you change to another environment. 26 | parser.add_argument("--env_name", default="Pendulum-v0") 27 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 28 | parser.add_argument('--target_update_interval', default=1, type=int) 29 | parser.add_argument('--test_iteration', default=10, type=int) 30 | 31 | parser.add_argument('--learning_rate', default=1e-4, type=float) 32 | parser.add_argument('--gamma', default=0.99, type=int) # discounted factor 33 | parser.add_argument('--capacity', default=1000000, type=int) # replay buffer size 34 | parser.add_argument('--batch_size', default=100, type=int) # mini batch size 35 | parser.add_argument('--seed', default=False, type=bool) 36 | parser.add_argument('--random_seed', default=9527, type=int) 37 | # optional parameters 38 | 39 | parser.add_argument('--sample_frequency', default=2000, type=int) 40 | parser.add_argument('--render', default=False, type=bool) # show UI or not 41 | parser.add_argument('--log_interval', default=50, type=int) # 42 | parser.add_argument('--load', default=False, type=bool) # load model 43 | parser.add_argument('--render_interval', default=100, type=int) # after render_interval, the env.render() will work 44 | parser.add_argument('--exploration_noise', default=0.1, type=float) 45 | parser.add_argument('--max_episode', default=100000, type=int) # num of games 46 | parser.add_argument('--print_log', default=5, type=int) 47 | parser.add_argument('--update_iteration', default=200, type=int) 48 | args = parser.parse_args() 49 | 50 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 51 | script_name = os.path.basename(__file__) 52 | env = gym.make(args.env_name) 53 | 54 | if args.seed: 55 | env.seed(args.random_seed) 56 | torch.manual_seed(args.random_seed) 57 | np.random.seed(args.random_seed) 58 | 59 | state_dim = env.observation_space.shape[0] 60 | action_dim = env.action_space.shape[0] 61 | max_action = float(env.action_space.high[0]) 62 | min_Val = 
torch.tensor(1e-7).float().to(device) # min value 63 | 64 | directory = './exp' + script_name + args.env_name +'./' 65 | 66 | class Replay_buffer(): 67 | ''' 68 | Code based on: 69 | https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 70 | Expects tuples of (state, next_state, action, reward, done) 71 | ''' 72 | def __init__(self, max_size=args.capacity): 73 | self.storage = [] 74 | self.max_size = max_size 75 | self.ptr = 0 76 | 77 | def push(self, data): 78 | if len(self.storage) == self.max_size: 79 | self.storage[int(self.ptr)] = data 80 | self.ptr = (self.ptr + 1) % self.max_size 81 | else: 82 | self.storage.append(data) 83 | 84 | def sample(self, batch_size): 85 | ind = np.random.randint(0, len(self.storage), size=batch_size) 86 | x, y, u, r, d = [], [], [], [], [] 87 | 88 | for i in ind: 89 | X, Y, U, R, D = self.storage[i] 90 | x.append(np.array(X, copy=False)) 91 | y.append(np.array(Y, copy=False)) 92 | u.append(np.array(U, copy=False)) 93 | r.append(np.array(R, copy=False)) 94 | d.append(np.array(D, copy=False)) 95 | 96 | return np.array(x), np.array(y), np.array(u), np.array(r).reshape(-1, 1), np.array(d).reshape(-1, 1) 97 | 98 | 99 | class Actor(nn.Module): 100 | def __init__(self, state_dim, action_dim, max_action): 101 | super(Actor, self).__init__() 102 | 103 | self.l1 = nn.Linear(state_dim, 400) 104 | self.l2 = nn.Linear(400, 300) 105 | self.l3 = nn.Linear(300, action_dim) 106 | 107 | self.max_action = max_action 108 | 109 | def forward(self, x): 110 | x = F.relu(self.l1(x)) 111 | x = F.relu(self.l2(x)) 112 | x = self.max_action * torch.tanh(self.l3(x)) 113 | return x 114 | 115 | 116 | class Critic(nn.Module): 117 | def __init__(self, state_dim, action_dim): 118 | super(Critic, self).__init__() 119 | 120 | self.l1 = nn.Linear(state_dim + action_dim, 400) 121 | self.l2 = nn.Linear(400 , 300) 122 | self.l3 = nn.Linear(300, 1) 123 | 124 | def forward(self, x, u): 125 | x = F.relu(self.l1(torch.cat([x, u], 1))) 126 | x = F.relu(self.l2(x)) 127 | x = self.l3(x) 128 | return x 129 | 130 | 131 | class DDPG(object): 132 | def __init__(self, state_dim, action_dim, max_action): 133 | self.actor = Actor(state_dim, action_dim, max_action).to(device) 134 | self.actor_target = Actor(state_dim, action_dim, max_action).to(device) 135 | self.actor_target.load_state_dict(self.actor.state_dict()) 136 | self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4) 137 | 138 | self.critic = Critic(state_dim, action_dim).to(device) 139 | self.critic_target = Critic(state_dim, action_dim).to(device) 140 | self.critic_target.load_state_dict(self.critic.state_dict()) 141 | self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3) 142 | self.replay_buffer = Replay_buffer() 143 | self.writer = SummaryWriter(directory) 144 | 145 | self.num_critic_update_iteration = 0 146 | self.num_actor_update_iteration = 0 147 | self.num_training = 0 148 | 149 | def select_action(self, state): 150 | state = torch.FloatTensor(state.reshape(1, -1)).to(device) 151 | return self.actor(state).cpu().data.numpy().flatten() 152 | 153 | def update(self): 154 | 155 | for it in range(args.update_iteration): 156 | # Sample replay buffer 157 | x, y, u, r, d = self.replay_buffer.sample(args.batch_size) 158 | state = torch.FloatTensor(x).to(device) 159 | action = torch.FloatTensor(u).to(device) 160 | next_state = torch.FloatTensor(y).to(device) 161 | done = torch.FloatTensor(1-d).to(device) 162 | reward = torch.FloatTensor(r).to(device) 163 | 164 | # Compute the target Q value 165 | 
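# Bellman backup using the slow-moving target networks (standard DDPG):
#   y = r + gamma * (1 - done) * Q_target(s', mu_target(s'))
# `done` was already flipped to (1 - d) when the batch was built above, so terminal
# transitions contribute only their immediate reward.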
target_Q = self.critic_target(next_state, self.actor_target(next_state)) 166 | target_Q = reward + (done * args.gamma * target_Q).detach() 167 | 168 | # Get current Q estimate 169 | current_Q = self.critic(state, action) 170 | 171 | # Compute critic loss 172 | critic_loss = F.mse_loss(current_Q, target_Q) 173 | self.writer.add_scalar('Loss/critic_loss', critic_loss, global_step=self.num_critic_update_iteration) 174 | # Optimize the critic 175 | self.critic_optimizer.zero_grad() 176 | critic_loss.backward() 177 | self.critic_optimizer.step() 178 | 179 | # Compute actor loss 180 | actor_loss = -self.critic(state, self.actor(state)).mean() 181 | self.writer.add_scalar('Loss/actor_loss', actor_loss, global_step=self.num_actor_update_iteration) 182 | 183 | # Optimize the actor 184 | self.actor_optimizer.zero_grad() 185 | actor_loss.backward() 186 | self.actor_optimizer.step() 187 | 188 | # Update the frozen target models 189 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): 190 | target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) 191 | 192 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 193 | target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) 194 | 195 | self.num_actor_update_iteration += 1 196 | self.num_critic_update_iteration += 1 197 | 198 | def save(self): 199 | torch.save(self.actor.state_dict(), directory + 'actor.pth') 200 | torch.save(self.critic.state_dict(), directory + 'critic.pth') 201 | # print("====================================") 202 | # print("Model has been saved...") 203 | # print("====================================") 204 | 205 | def load(self): 206 | self.actor.load_state_dict(torch.load(directory + 'actor.pth')) 207 | self.critic.load_state_dict(torch.load(directory + 'critic.pth')) 208 | print("====================================") 209 | print("model has been loaded...") 210 | print("====================================") 211 | 212 | def main(): 213 | agent = DDPG(state_dim, action_dim, max_action) 214 | ep_r = 0 215 | if args.mode == 'test': 216 | agent.load() 217 | for i in range(args.test_iteration): 218 | state = env.reset() 219 | for t in count(): 220 | action = agent.select_action(state) 221 | next_state, reward, done, info = env.step(np.float32(action)) 222 | ep_r += reward 223 | env.render() 224 | if done or t >= args.max_length_of_trajectory: 225 | print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t)) 226 | ep_r = 0 227 | break 228 | state = next_state 229 | 230 | elif args.mode == 'train': 231 | if args.load: agent.load() 232 | total_step = 0 233 | for i in range(args.max_episode): 234 | total_reward = 0 235 | step =0 236 | state = env.reset() 237 | for t in count(): 238 | action = agent.select_action(state) 239 | action = (action + np.random.normal(0, args.exploration_noise, size=env.action_space.shape[0])).clip( 240 | env.action_space.low, env.action_space.high) 241 | 242 | next_state, reward, done, info = env.step(action) 243 | if args.render and i >= args.render_interval : env.render() 244 | agent.replay_buffer.push((state, next_state, action, reward, np.float(done))) 245 | 246 | state = next_state 247 | if done: 248 | break 249 | step += 1 250 | total_reward += reward 251 | total_step += step+1 252 | print("Total T:{} Episode: \t{} Total Reward: \t{:0.2f}".format(total_step, i, total_reward)) 253 | agent.update() 254 | # "Total T: %d Episode Num: %d Episode T: %d Reward: %f 255 | 
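# Checkpoint the actor/critic weights every `log_interval` episodes; agent.update()
# above has already run `update_iteration` gradient steps on the replay buffer
# gathered so far. Note that np.float, used when pushing `done` into the buffer,
# is deprecated since NumPy 1.20 and removed in 1.24; plain float(done) is the
# drop-in replacement.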
256 | if i % args.log_interval == 0: 257 | agent.save() 258 | else: 259 | raise NameError("mode wrong!!!") 260 | 261 | if __name__ == '__main__': 262 | main() 263 | -------------------------------------------------------------------------------- /Char05 DDPG/DDPG_exp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char05 DDPG/DDPG_exp.jpg -------------------------------------------------------------------------------- /Char05 DDPG/README.md: -------------------------------------------------------------------------------- 1 | # DDPG 2 | - Original paper: https://arxiv.org/abs/1509.02971 3 | - OPENAI Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/ 4 | 5 | **Note that DDPG is feasible about hyper-parameters. You should fine-tuning if you change to another environment.** 6 | 7 | Episode reward in Pendulum-v0: 8 | 9 | ![ep_r](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char05%20DDPG/DDPG_exp.jpg) 10 | 11 | -------------------------------------------------------------------------------- /Char07 PPO/PPO2.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import pickle 4 | from collections import namedtuple 5 | 6 | import os 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | import gym 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | from torch.distributions import Normal 16 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 17 | 18 | # Parameters 19 | parser = argparse.ArgumentParser(description='Solve the Pendulum-v0 with PPO') 20 | parser.add_argument( 21 | '--gamma', type=float, default=0.9, metavar='G', help='discount factor (default: 0.9)') 22 | parser.add_argument('--seed', type=int, default=0, metavar='N', help='random seed (default: 0)') 23 | parser.add_argument('--render', action='store_true', default=True, help='render the environment') 24 | parser.add_argument( 25 | '--log-interval', 26 | type=int, 27 | default=10, 28 | metavar='N', 29 | help='interval between training status logs (default: 10)') 30 | args = parser.parse_args() 31 | 32 | env = gym.make('Pendulum-v0').unwrapped 33 | num_state = env.observation_space.shape[0] 34 | num_action = env.action_space.shape[0] 35 | torch.manual_seed(args.seed) 36 | env.seed(args.seed) 37 | 38 | Transition = namedtuple('Transition',['state', 'aciton', 'reward', 'a_log_prob', 'next_state']) 39 | TrainRecord = namedtuple('TrainRecord',['episode', 'reward']) 40 | 41 | class Actor(nn.Module): 42 | def __init__(self): 43 | super(Actor, self).__init__() 44 | self.fc1 = nn.Linear(num_state, 64) 45 | self.fc2 = nn.Linear(64,8) 46 | self.mu_head = nn.Linear(8, 1) 47 | self.sigma_head = nn.Linear(8, 1) 48 | 49 | def forward(self, x): 50 | x = F.leaky_relu(self.fc1(x)) 51 | x = F.leaky_relu(self.fc2(x)) 52 | 53 | mu = self.mu_head(x) 54 | sigma = self.sigma_head(x) 55 | 56 | return mu, sigma 57 | 58 | class Critic(nn.Module): 59 | def __init__(self): 60 | super(Critic, self).__init__() 61 | self.fc1 = nn.Linear(num_state, 64) 62 | self.fc2 = nn.Linear(64, 8) 63 | self.state_value= nn.Linear(8, 1) 64 | 65 | def forward(self, x): 66 | x = F.leaky_relu(self.fc1(x)) 67 | x = F.leaky_relu(self.fc2(x)) 68 | value = self.state_value(x) 69 | return value 70 | 71 | 
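# PPO agent for the continuous Pendulum task. The actor outputs (mu, sigma) of a
# Gaussian policy and the critic a state value; update() optimizes the clipped
# surrogate objective
#   L_clip = E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ],  r_t = pi_new / pi_old
# with eps = clip_param, plus a smooth-L1 value loss, for ppo_epoch passes over the
# buffer once buffer_capacity transitions have been collected.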
class PPO(): 72 | clip_param = 0.2 73 | max_grad_norm = 0.5 74 | ppo_epoch = 10 75 | buffer_capacity = 1000 76 | batch_size = 8 77 | 78 | def __init__(self): 79 | super(PPO, self).__init__() 80 | self.actor_net = Actor().float() 81 | self.critic_net = Critic().float() 82 | self.buffer = [] 83 | self.counter = 0 84 | self.training_step = 0 85 | 86 | self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 1e-3) 87 | self.critic_net_optimizer = optim.Adam(self.critic_net.parameters(), 4e-3) 88 | if not os.path.exists('../param'): 89 | os.makedirs('../param/net_param') 90 | os.makedirs('../param/img') 91 | 92 | def select_action(self, state): 93 | state = torch.from_numpy(state).float().unsqueeze(0) 94 | with torch.no_grad(): 95 | mu, sigma = self.actor_net(state) 96 | dist = Normal(mu, sigma) 97 | action = dist.sample() 98 | action_log_prob = dist.log_prob(action) 99 | action = action.clamp(-2, 2) 100 | return action.item(), action_log_prob.item() 101 | 102 | 103 | def get_value(self, state): 104 | state = torch.from_numpy(state) 105 | with torch.no_grad(): 106 | value = self.critic_net(state) 107 | return value.item() 108 | 109 | def save_param(self): 110 | torch.save(self.actor_net.state_dict(), '../param/net_param/actor_net'+str(time.time())[:10],+'.pkl') 111 | torch.save(self.critic_net.state_dict(), '../param/net_param/critic_net'+str(time.time())[:10],+'.pkl') 112 | 113 | def store_transition(self, transition): 114 | self.buffer.append(transition) 115 | self.counter+=1 116 | return counter % self.buffer_capacity == 0 117 | 118 | def update(self): 119 | self.training_step +=1 120 | 121 | state = torch.tensor([t.state for t in self.buffer ], dtype=torch.float) 122 | action = torch.tensor([t.action for t in self.buffer], dtype=torch.float).view(-1, 1) 123 | reward = torch.tensor([t.reward for t in self.buffer], dtype=torch.float).view(-1, 1) 124 | next_state = torch.tensor([t.next_state for t in self.buffer], dtype=torch.float) 125 | old_action_log_prob = torch.tensor([t.a_log_prob for t in self.buffer], dtype=torch.float).view(-1, 1) 126 | 127 | reward = (reward - reward.mean())/(reward.std() + 1e-10) 128 | with torch.no_grad(): 129 | target_v = reward + args.gamma * self.critic_net(next_state) 130 | 131 | advantage = (target_v - self.critic_net(state)).detach() 132 | for _ in range(self.ppo_epoch): # iteration ppo_epoch 133 | for index in BatchSampler(SubsetRandomSampler(range(self.buffer_capacity), self.batch_size, True)): 134 | # epoch iteration, PPO core!!! 
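# Re-evaluate the stored states under the current policy and form the probability
# ratio r_t = exp(log pi_new(a|s) - log pi_old(a|s)) for the clipped surrogate.
# The enclosing BatchSampler(...) call nests its arguments one level too deep;
# the intended construction (a sketch, matching PPO_pendulum.py in this folder) is
#   BatchSampler(SubsetRandomSampler(range(self.buffer_capacity)), self.batch_size, False)
# since SubsetRandomSampler only takes the index range, while batch_size and
# drop_last belong to BatchSampler.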
135 | mu, sigma = self.actor_net(state[index]) 136 | n = Normal(mu, sigma) 137 | action_log_prob = n.log_prob(action[index]) 138 | ratio = torch.exp(action_log_prob - old_action_log_prob) 139 | 140 | L1 = ratio * advantage[index] 141 | L2 = torch.clamp(ratio, 1-self.clip_param, 1+self.clip_param) * advantage[index] 142 | action_loss = -torch.min(L1, L2).mean() # MAX->MIN desent 143 | self.actor_optimizer.zero_grad() 144 | action_loss.backward() 145 | nn.utils.clip_grad_norm_(self.actor_net.parameters(), self.max_grad_norm) 146 | self.actor_optimizer.step() 147 | 148 | value_loss = F.smooth_l1_loss(self.critic_net(state[index]), target_v[index]) 149 | self.critic_net_optimizer.zero_grad() 150 | value_loss.backward() 151 | nn.utils.clip_grad_norm_(self.critic_net.parameters(), self.max_grad_norm) 152 | self.critic_net_optimizer.step() 153 | 154 | del self.buffer[:] 155 | 156 | def main(): 157 | 158 | agent = PPO() 159 | 160 | training_records = [] 161 | running_reward = -1000 162 | 163 | for i_epoch in range(1000): 164 | score = 0 165 | state = env.reset() 166 | if args.render: env.render() 167 | for t in range(200): 168 | action, action_log_prob = agent.select_action(state) 169 | next_state, reward, done, info = env.step(action) 170 | trans = Transition(state, action, reward, action_log_prob, next_state) 171 | if args.render: env.render() 172 | if agent.store_transition(trans): 173 | agent.update() 174 | score += reward 175 | state = next_state 176 | 177 | running_reward = running_reward * 0.9 + score * 0.1 178 | training_records.append(TrainingRecord(i_epoch, running_reward)) 179 | if i_epoch % 10 ==0: 180 | print("Epoch {}, Moving average score is: {:.2f} ".format(i_epoch, running_reward)) 181 | if running_reward > -200: 182 | print("Solved! Moving average score is now {}!".format(running_reward)) 183 | env.close() 184 | agent.save_param() 185 | break 186 | 187 | if __name__ == '__main__': 188 | main() 189 | -------------------------------------------------------------------------------- /Char07 PPO/PPO_CartPole_v0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import namedtuple 4 | from itertools import count 5 | 6 | import os, time 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | import gym 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | from torch.distributions import Normal, Categorical 16 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 17 | from tensorboardX import SummaryWriter 18 | 19 | # Parameters 20 | gamma = 0.99 21 | render = False 22 | seed = 1 23 | log_interval = 10 24 | 25 | env = gym.make('CartPole-v0').unwrapped 26 | num_state = env.observation_space.shape[0] 27 | num_action = env.action_space.n 28 | torch.manual_seed(seed) 29 | env.seed(seed) 30 | Transition = namedtuple('Transition', ['state', 'action', 'a_log_prob', 'reward', 'next_state']) 31 | 32 | class Actor(nn.Module): 33 | def __init__(self): 34 | super(Actor, self).__init__() 35 | self.fc1 = nn.Linear(num_state, 100) 36 | self.action_head = nn.Linear(100, num_action) 37 | 38 | def forward(self, x): 39 | x = F.relu(self.fc1(x)) 40 | action_prob = F.softmax(self.action_head(x), dim=1) 41 | return action_prob 42 | 43 | 44 | class Critic(nn.Module): 45 | def __init__(self): 46 | super(Critic, self).__init__() 47 | self.fc1 = nn.Linear(num_state, 100) 48 | self.state_value = nn.Linear(100, 1) 49 | 50 | def 
forward(self, x): 51 | x = F.relu(self.fc1(x)) 52 | value = self.state_value(x) 53 | return value 54 | 55 | 56 | class PPO(): 57 | clip_param = 0.2 58 | max_grad_norm = 0.5 59 | ppo_update_time = 10 60 | buffer_capacity = 1000 61 | batch_size = 32 62 | 63 | def __init__(self): 64 | super(PPO, self).__init__() 65 | self.actor_net = Actor() 66 | self.critic_net = Critic() 67 | self.buffer = [] 68 | self.counter = 0 69 | self.training_step = 0 70 | self.writer = SummaryWriter('../exp') 71 | 72 | self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 1e-3) 73 | self.critic_net_optimizer = optim.Adam(self.critic_net.parameters(), 3e-3) 74 | if not os.path.exists('../param'): 75 | os.makedirs('../param/net_param') 76 | os.makedirs('../param/img') 77 | 78 | def select_action(self, state): 79 | state = torch.from_numpy(state).float().unsqueeze(0) 80 | with torch.no_grad(): 81 | action_prob = self.actor_net(state) 82 | c = Categorical(action_prob) 83 | action = c.sample() 84 | return action.item(), action_prob[:,action.item()].item() 85 | 86 | def get_value(self, state): 87 | state = torch.from_numpy(state) 88 | with torch.no_grad(): 89 | value = self.critic_net(state) 90 | return value.item() 91 | 92 | def save_param(self): 93 | torch.save(self.actor_net.state_dict(), '../param/net_param/actor_net' + str(time.time())[:10], +'.pkl') 94 | torch.save(self.critic_net.state_dict(), '../param/net_param/critic_net' + str(time.time())[:10], +'.pkl') 95 | 96 | def store_transition(self, transition): 97 | self.buffer.append(transition) 98 | self.counter += 1 99 | 100 | 101 | def update(self, i_ep): 102 | state = torch.tensor([t.state for t in self.buffer], dtype=torch.float) 103 | action = torch.tensor([t.action for t in self.buffer], dtype=torch.long).view(-1, 1) 104 | reward = [t.reward for t in self.buffer] 105 | # update: don't need next_state 106 | #reward = torch.tensor([t.reward for t in self.buffer], dtype=torch.float).view(-1, 1) 107 | #next_state = torch.tensor([t.next_state for t in self.buffer], dtype=torch.float) 108 | old_action_log_prob = torch.tensor([t.a_log_prob for t in self.buffer], dtype=torch.float).view(-1, 1) 109 | 110 | R = 0 111 | Gt = [] 112 | for r in reward[::-1]: 113 | R = r + gamma * R 114 | Gt.insert(0, R) 115 | Gt = torch.tensor(Gt, dtype=torch.float) 116 | #print("The agent is updateing....") 117 | for i in range(self.ppo_update_time): 118 | for index in BatchSampler(SubsetRandomSampler(range(len(self.buffer))), self.batch_size, False): 119 | if self.training_step % 1000 ==0: 120 | print('I_ep {} ,train {} times'.format(i_ep,self.training_step)) 121 | #with torch.no_grad(): 122 | Gt_index = Gt[index].view(-1, 1) 123 | V = self.critic_net(state[index]) 124 | delta = Gt_index - V 125 | advantage = delta.detach() 126 | # epoch iteration, PPO core!!! 
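# Despite its name, a_log_prob stores the *probability* pi_old(a|s) returned by
# select_action, so the importance ratio below is computed directly as
# pi_new(a|s) / pi_old(a|s) rather than as an exp of a log-prob difference. The
# advantage uses the full-episode discounted return Gt as target with V(s) as
# baseline, and the actor and critic are updated by separate Adam optimizers
# under gradient-norm clipping.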
127 | action_prob = self.actor_net(state[index]).gather(1, action[index]) # new policy 128 | 129 | ratio = (action_prob/old_action_log_prob[index]) 130 | surr1 = ratio * advantage 131 | surr2 = torch.clamp(ratio, 1 - self.clip_param, 1 + self.clip_param) * advantage 132 | 133 | # update actor network 134 | action_loss = -torch.min(surr1, surr2).mean() # MAX->MIN desent 135 | self.writer.add_scalar('loss/action_loss', action_loss, global_step=self.training_step) 136 | self.actor_optimizer.zero_grad() 137 | action_loss.backward() 138 | nn.utils.clip_grad_norm_(self.actor_net.parameters(), self.max_grad_norm) 139 | self.actor_optimizer.step() 140 | 141 | #update critic network 142 | value_loss = F.mse_loss(Gt_index, V) 143 | self.writer.add_scalar('loss/value_loss', value_loss, global_step=self.training_step) 144 | self.critic_net_optimizer.zero_grad() 145 | value_loss.backward() 146 | nn.utils.clip_grad_norm_(self.critic_net.parameters(), self.max_grad_norm) 147 | self.critic_net_optimizer.step() 148 | self.training_step += 1 149 | 150 | del self.buffer[:] # clear experience 151 | 152 | 153 | def main(): 154 | agent = PPO() 155 | for i_epoch in range(1000): 156 | state = env.reset() 157 | if render: env.render() 158 | 159 | for t in count(): 160 | action, action_prob = agent.select_action(state) 161 | next_state, reward, done, _ = env.step(action) 162 | trans = Transition(state, action, action_prob, reward, next_state) 163 | if render: env.render() 164 | agent.store_transition(trans) 165 | state = next_state 166 | 167 | if done : 168 | if len(agent.buffer) >= agent.batch_size:agent.update(i_epoch) 169 | agent.writer.add_scalar('liveTime/livestep', t, global_step=i_epoch) 170 | break 171 | 172 | if __name__ == '__main__': 173 | main() 174 | print("end") 175 | -------------------------------------------------------------------------------- /Char07 PPO/PPO_MountainCar-v0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import namedtuple 4 | from itertools import count 5 | 6 | import os, time 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | import gym 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | from torch.distributions import Normal, Categorical 16 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 17 | from tensorboardX import SummaryWriter 18 | 19 | # Parameters 20 | env_name = 'MountainCar-v0' 21 | gamma = 0.99 22 | render = False 23 | seed = 1 24 | log_interval = 10 25 | 26 | env = gym.make(env_name).unwrapped 27 | num_state = env.observation_space.shape[0] 28 | num_action = env.action_space.n 29 | torch.manual_seed(seed) 30 | env.seed(seed) 31 | Transition = namedtuple('Transition', ['state', 'action', 'a_log_prob', 'reward', 'next_state']) 32 | 33 | 34 | class Actor(nn.Module): 35 | def __init__(self): 36 | super(Actor, self).__init__() 37 | self.fc1 = nn.Linear(num_state, 128) 38 | self.action_head = nn.Linear(128, num_action) 39 | 40 | def forward(self, x): 41 | x = F.relu(self.fc1(x)) 42 | action_prob = F.softmax(self.action_head(x), dim=1) 43 | return action_prob 44 | 45 | 46 | class Critic(nn.Module): 47 | def __init__(self): 48 | super(Critic, self).__init__() 49 | self.fc1 = nn.Linear(num_state, 128) 50 | self.state_value = nn.Linear(128, 1) 51 | 52 | def forward(self, x): 53 | x = F.relu(self.fc1(x)) 54 | value = self.state_value(x) 55 | return value 56 | 57 | 58 | class 
PPO(): 59 | clip_param = 0.2 60 | max_grad_norm = 0.5 61 | ppo_update_time = 10 62 | buffer_capacity = 8000 63 | batch_size = 32 64 | 65 | def __init__(self): 66 | super(PPO, self).__init__() 67 | self.actor_net = Actor() 68 | self.critic_net = Critic() 69 | self.buffer = [] 70 | self.counter = 0 71 | self.training_step = 0 72 | self.writer = SummaryWriter('../exp') 73 | 74 | self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 1e-3) 75 | self.critic_net_optimizer = optim.Adam(self.critic_net.parameters(), 3e-3) 76 | if not os.path.exists('../param'): 77 | os.makedirs('../param/net_param') 78 | os.makedirs('../param/img') 79 | 80 | def select_action(self, state): 81 | state = torch.from_numpy(state).float().unsqueeze(0) 82 | with torch.no_grad(): 83 | action_prob = self.actor_net(state) 84 | c = Categorical(action_prob) 85 | action = c.sample() 86 | return action.item(), action_prob[:, action.item()].item() 87 | 88 | def get_value(self, state): 89 | state = torch.from_numpy(state) 90 | with torch.no_grad(): 91 | value = self.critic_net(state) 92 | return value.item() 93 | 94 | def save_param(self): 95 | torch.save(self.actor_net.state_dict(), '../param/net_param/actor_net' + str(time.time())[:10], +'.pkl') 96 | torch.save(self.critic_net.state_dict(), '../param/net_param/critic_net' + str(time.time())[:10], +'.pkl') 97 | 98 | def store_transition(self, transition): 99 | self.buffer.append(transition) 100 | self.counter += 1 101 | 102 | def update(self, i_ep): 103 | state = torch.tensor([t.state for t in self.buffer], dtype=torch.float) 104 | action = torch.tensor([t.action for t in self.buffer], dtype=torch.long).view(-1, 1) 105 | reward = [t.reward for t in self.buffer] 106 | # update: don't need next_state 107 | # reward = torch.tensor([t.reward for t in self.buffer], dtype=torch.float).view(-1, 1) 108 | # next_state = torch.tensor([t.next_state for t in self.buffer], dtype=torch.float) 109 | old_action_log_prob = torch.tensor([t.a_log_prob for t in self.buffer], dtype=torch.float).view(-1, 1) 110 | 111 | R = 0 112 | Gt = [] 113 | for r in reward[::-1]: 114 | R = r + gamma * R 115 | Gt.insert(0, R) 116 | Gt = torch.tensor(Gt, dtype=torch.float) 117 | # print("The agent is updateing....") 118 | for i in range(self.ppo_update_time): 119 | for index in BatchSampler(SubsetRandomSampler(range(len(self.buffer))), self.batch_size, False): 120 | if self.training_step % 1000 == 0: 121 | print('I_ep {} ,train {} times'.format(i_ep, self.training_step)) 122 | # with torch.no_grad(): 123 | Gt_index = Gt[index].view(-1, 1) 124 | V = self.critic_net(state[index]) 125 | delta = Gt_index - V 126 | advantage = delta.detach() 127 | # epoch iteration, PPO core!!! 
128 | action_prob = self.actor_net(state[index]).gather(1, action[index]) # new policy 129 | 130 | ratio = (action_prob / old_action_log_prob[index]) 131 | surr1 = ratio * advantage 132 | surr2 = torch.clamp(ratio, 1 - self.clip_param, 1 + self.clip_param) * advantage 133 | 134 | # update actor network 135 | action_loss = -torch.min(surr1, surr2).mean() # MAX->MIN desent 136 | self.writer.add_scalar('loss/action_loss', action_loss, global_step=self.training_step) 137 | self.actor_optimizer.zero_grad() 138 | action_loss.backward() 139 | nn.utils.clip_grad_norm_(self.actor_net.parameters(), self.max_grad_norm) 140 | self.actor_optimizer.step() 141 | 142 | # update critic network 143 | value_loss = F.mse_loss(Gt_index, V) 144 | self.writer.add_scalar('loss/value_loss', value_loss, global_step=self.training_step) 145 | self.critic_net_optimizer.zero_grad() 146 | value_loss.backward() 147 | nn.utils.clip_grad_norm_(self.critic_net.parameters(), self.max_grad_norm) 148 | self.critic_net_optimizer.step() 149 | self.training_step += 1 150 | 151 | del self.buffer[:] # clear experience 152 | 153 | 154 | def main(): 155 | agent = PPO() 156 | for i_epoch in range(1000): 157 | state = env.reset() 158 | if render: env.render() 159 | 160 | for t in count(): 161 | action, action_prob = agent.select_action(state) 162 | next_state, reward, done, _ = env.step(action) 163 | trans = Transition(state, action, action_prob, reward, next_state) 164 | if render: env.render() 165 | agent.store_transition(trans) 166 | state = next_state 167 | 168 | if done: 169 | if len(agent.buffer) >= agent.batch_size: agent.update(i_epoch) 170 | agent.writer.add_scalar('Steptime/steptime', t, global_step=i_epoch) 171 | break 172 | 173 | 174 | if __name__ == '__main__': 175 | main() 176 | print("end") 177 | -------------------------------------------------------------------------------- /Char07 PPO/PPO_pendulum.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import namedtuple 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | import gym 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | from torch.distributions import Normal 13 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 14 | 15 | parser = argparse.ArgumentParser(description='Solve the Pendulum-v0 with PPO') 16 | parser.add_argument( 17 | '--gamma', type=float, default=0.9, metavar='G', help='discount factor (default: 0.9)') 18 | parser.add_argument('--seed', type=int, default=0, metavar='N', help='random seed (default: 0)') 19 | parser.add_argument('--render', action='store_true', help='render the environment') 20 | parser.add_argument( 21 | '--log-interval', 22 | type=int, 23 | default=10, 24 | metavar='N', 25 | help='interval between training status logs (default: 10)') 26 | args = parser.parse_args() 27 | 28 | torch.manual_seed(args.seed) 29 | 30 | TrainingRecord = namedtuple('TrainingRecord', ['ep', 'reward']) 31 | Transition = namedtuple('Transition', ['s', 'a', 'a_log_p', 'r', 's_']) 32 | 33 | 34 | class ActorNet(nn.Module): 35 | 36 | def __init__(self): 37 | super(ActorNet, self).__init__() 38 | self.fc = nn.Linear(3, 100) 39 | self.mu_head = nn.Linear(100, 1) 40 | self.sigma_head = nn.Linear(100, 1) 41 | 42 | def forward(self, x): 43 | x = F.relu(self.fc(x)) 44 | mu = 2.0 * F.tanh(self.mu_head(x)) 45 | sigma = F.softplus(self.sigma_head(x)) 46 | return (mu, sigma) 47 | 48 | 49 | class 
CriticNet(nn.Module): 50 | 51 | def __init__(self): 52 | super(CriticNet, self).__init__() 53 | self.fc = nn.Linear(3, 100) 54 | self.v_head = nn.Linear(100, 1) 55 | 56 | def forward(self, x): 57 | x = F.relu(self.fc(x)) 58 | state_value = self.v_head(x) 59 | return state_value 60 | 61 | 62 | class Agent(): 63 | 64 | clip_param = 0.2 65 | max_grad_norm = 0.5 66 | ppo_epoch = 10 67 | buffer_capacity, batch_size = 1000, 32 68 | 69 | def __init__(self): 70 | self.training_step = 0 71 | self.anet = ActorNet().float() 72 | self.cnet = CriticNet().float() 73 | self.buffer = [] 74 | self.counter = 0 75 | 76 | self.optimizer_a = optim.Adam(self.anet.parameters(), lr=1e-4) 77 | self.optimizer_c = optim.Adam(self.cnet.parameters(), lr=3e-4) 78 | 79 | def select_action(self, state): 80 | state = torch.from_numpy(state).float().unsqueeze(0) 81 | with torch.no_grad(): 82 | (mu, sigma) = self.anet(state) 83 | dist = Normal(mu, sigma) 84 | action = dist.sample() 85 | action_log_prob = dist.log_prob(action) 86 | action = action.clamp(-2.0, 2.0) 87 | return action.item(), action_log_prob.item() 88 | 89 | def get_value(self, state): 90 | 91 | state = torch.from_numpy(state).float().unsqueeze(0) 92 | with torch.no_grad(): 93 | state_value = self.cnet(state) 94 | return state_value.item() 95 | 96 | def save_param(self): 97 | torch.save(self.anet.state_dict(), 'param/ppo_anet_params.pkl') 98 | torch.save(self.cnet.state_dict(), 'param/ppo_cnet_params.pkl') 99 | 100 | def store(self, transition): 101 | self.buffer.append(transition) 102 | self.counter += 1 103 | return self.counter % self.buffer_capacity == 0 104 | 105 | def update(self): 106 | self.training_step += 1 107 | 108 | s = torch.tensor([t.s for t in self.buffer], dtype=torch.float) 109 | a = torch.tensor([t.a for t in self.buffer], dtype=torch.float).view(-1, 1) 110 | r = torch.tensor([t.r for t in self.buffer], dtype=torch.float).view(-1, 1) 111 | s_ = torch.tensor([t.s_ for t in self.buffer], dtype=torch.float) 112 | 113 | old_action_log_probs = torch.tensor( 114 | [t.a_log_p for t in self.buffer], dtype=torch.float).view(-1, 1) 115 | 116 | r = (r - r.mean()) / (r.std() + 1e-5) 117 | with torch.no_grad(): 118 | target_v = r + args.gamma * self.cnet(s_) 119 | 120 | adv = (target_v - self.cnet(s)).detach() 121 | 122 | for _ in range(self.ppo_epoch): 123 | for index in BatchSampler( 124 | SubsetRandomSampler(range(self.buffer_capacity)), self.batch_size, False): 125 | 126 | (mu, sigma) = self.anet(s[index]) 127 | dist = Normal(mu, sigma) 128 | action_log_probs = dist.log_prob(a[index]) 129 | ratio = torch.exp(action_log_probs - old_action_log_probs[index]) 130 | 131 | surr1 = ratio * adv[index] 132 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 133 | 1.0 + self.clip_param) * adv[index] 134 | action_loss = -torch.min(surr1, surr2).mean() 135 | 136 | self.optimizer_a.zero_grad() 137 | action_loss.backward() 138 | nn.utils.clip_grad_norm_(self.anet.parameters(), self.max_grad_norm) 139 | self.optimizer_a.step() 140 | 141 | value_loss = F.smooth_l1_loss(self.cnet(s[index]), target_v[index]) 142 | self.optimizer_c.zero_grad() 143 | value_loss.backward() 144 | nn.utils.clip_grad_norm_(self.cnet.parameters(), self.max_grad_norm) 145 | self.optimizer_c.step() 146 | 147 | del self.buffer[:] 148 | 149 | 150 | def main(): 151 | env = gym.make('Pendulum-v0') 152 | env.seed(args.seed) 153 | 154 | agent = Agent() 155 | 156 | training_records = [] 157 | running_reward = -1000 158 | state = env.reset() 159 | for i_ep in range(1000): 160 | score = 0 161 | state = 
env.reset() 162 | 163 | for t in range(200): 164 | action, action_log_prob = agent.select_action(state) 165 | state_, reward, done, _ = env.step([action]) 166 | if args.render: 167 | env.render() 168 | if agent.store(Transition(state, action, action_log_prob, (reward + 8) / 8, state_)): 169 | agent.update() 170 | score += reward 171 | state = state_ 172 | 173 | running_reward = running_reward * 0.9 + score * 0.1 174 | training_records.append(TrainingRecord(i_ep, running_reward)) 175 | 176 | if i_ep % args.log_interval == 0: 177 | print('Ep {}\tMoving average score: {:.2f}\t'.format(i_ep, running_reward)) 178 | if running_reward > -200: 179 | print("Solved! Moving average score is now {}!".format(running_reward)) 180 | env.close() 181 | agent.save_param() 182 | with open('log/ppo_training_records.pkl', 'wb') as f: 183 | pickle.dump(training_records, f) 184 | break 185 | 186 | plt.plot([r.ep for r in training_records], [r.reward for r in training_records]) 187 | plt.title('PPO') 188 | plt.xlabel('Episode') 189 | plt.ylabel('Moving averaged episode reward') 190 | plt.savefig("img/ppo.png") 191 | plt.show() 192 | 193 | 194 | if __name__ == '__main__': 195 | main() 196 | -------------------------------------------------------------------------------- /Char07 PPO/readme.md: -------------------------------------------------------------------------------- 1 | # PPO 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Openai Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | 6 | Notice: This is not the author's implementation! 7 | -------------------------------------------------------------------------------- /Char08 ACER/readme.md: -------------------------------------------------------------------------------- 1 | #ACER 2 | 3 | actor-critic with experience replay 4 | -------------------------------------------------------------------------------- /Char09 SAC/SAC.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import namedtuple 4 | from itertools import count 5 | 6 | import os 7 | import numpy as np 8 | 9 | 10 | import gym 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | from torch.distributions import Normal 16 | from torch.autograd import grad 17 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 18 | from tensorboardX import SummaryWriter 19 | 20 | 21 | ''' 22 | Implementation of soft actor critic 23 | Original paper: https://arxiv.org/abs/1801.01290 24 | Not the author's implementation ! 
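This version follows the original 2018 formulation: a separate state-value
network V(s) with a slowly-updated target copy, a single Q network (the paper
uses two), and a Gaussian policy squashed through tanh. The entropy temperature
is fixed at 1 (no automatic tuning).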
25 | ''' 26 | 27 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 28 | parser = argparse.ArgumentParser() 29 | 30 | 31 | parser.add_argument("--env_name", default="Pendulum-v0") # OpenAI gym environment name 32 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 33 | parser.add_argument('--target_update_interval', default=1, type=int) 34 | parser.add_argument('--gradient_steps', default=1, type=int) 35 | 36 | 37 | parser.add_argument('--learning_rate', default=3e-4, type=int) 38 | parser.add_argument('--gamma', default=0.99, type=int) # discount gamma 39 | parser.add_argument('--capacity', default=10000, type=int) # replay buffer size 40 | parser.add_argument('--iteration', default=100000, type=int) # num of games 41 | parser.add_argument('--batch_size', default=128, type=int) # mini batch size 42 | parser.add_argument('--seed', default=1, type=int) 43 | 44 | # optional parameters 45 | parser.add_argument('--num_hidden_layers', default=2, type=int) 46 | parser.add_argument('--num_hidden_units_per_layer', default=256, type=int) 47 | parser.add_argument('--sample_frequency', default=256, type=int) 48 | parser.add_argument('--activation', default='Relu', type=str) 49 | parser.add_argument('--render', default=False, type=bool) # show UI or not 50 | parser.add_argument('--log_interval', default=2000, type=int) # 51 | parser.add_argument('--load', default=False, type=bool) # load model 52 | args = parser.parse_args() 53 | 54 | class NormalizedActions(gym.ActionWrapper): 55 | def _action(self, action): 56 | low = self.action_space.low 57 | high = self.action_space.high 58 | 59 | action = low + (action + 1.0) * 0.5 * (high - low) 60 | action = np.clip(action, low, high) 61 | 62 | return action 63 | 64 | def _reverse_action(self, action): 65 | low = self.action_space.low 66 | high = self.action_space.high 67 | 68 | action = 2 * (action - low) / (high - low) - 1 69 | action = np.clip(action, low, high) 70 | 71 | return action 72 | 73 | 74 | env = NormalizedActions(gym.make(args.env_name)) 75 | 76 | # Set seeds 77 | env.seed(args.seed) 78 | torch.manual_seed(args.seed) 79 | np.random.seed(args.seed) 80 | 81 | state_dim = env.observation_space.shape[0] 82 | action_dim = env.action_space.shape[0] 83 | max_action = float(env.action_space.high[0]) 84 | min_Val = torch.tensor(1e-7).float() 85 | Transition = namedtuple('Transition', ['s', 'a', 'r', 's_', 'd']) 86 | 87 | class Actor(nn.Module): 88 | def __init__(self, state_dim, min_log_std=-20, max_log_std=2): 89 | super(Actor, self).__init__() 90 | self.fc1 = nn.Linear(state_dim, 256) 91 | self.fc2 = nn.Linear(256, 256) 92 | self.mu_head = nn.Linear(256, 1) 93 | self.log_std_head = nn.Linear(256, 1) 94 | self.max_action = max_action 95 | 96 | self.min_log_std = min_log_std 97 | self.max_log_std = max_log_std 98 | 99 | def forward(self, x): 100 | x = F.relu(self.fc1(x)) 101 | x = F.relu(self.fc2(x)) 102 | mu = self.mu_head(x) 103 | log_std_head = F.relu(self.log_std_head(x)) 104 | log_std_head = torch.clamp(log_std_head, self.min_log_std, self.max_log_std) 105 | return mu, log_std_head 106 | 107 | 108 | class Critic(nn.Module): 109 | def __init__(self, state_dim): 110 | super(Critic, self).__init__() 111 | self.fc1 = nn.Linear(state_dim, 256) 112 | self.fc2 = nn.Linear(256, 256) 113 | self.fc3 = nn.Linear(256, 1) 114 | 115 | def forward(self, x): 116 | x = F.relu(self.fc1(x)) 117 | x = F.relu(self.fc2(x)) 118 | x = self.fc3(x) 119 | return x 120 | 121 | 122 | class Q(nn.Module): 123 | def __init__(self, 
state_dim, action_dim): 124 | super(Q, self).__init__() 125 | self.fc1 = nn.Linear(state_dim + action_dim, 256) 126 | self.fc2 = nn.Linear(256, 256) 127 | self.fc3 = nn.Linear(256, 1) 128 | 129 | def forward(self, s, a): 130 | s = s.reshape(-1, state_dim) 131 | a = a.reshape(-1, action_dim) 132 | x = torch.cat((s, a), -1) # combination s and a 133 | x = F.relu(self.fc1(x)) 134 | x = F.relu(self.fc2(x)) 135 | x = self.fc3(x) 136 | return x 137 | 138 | 139 | class SAC(): 140 | def __init__(self): 141 | super(SAC, self).__init__() 142 | 143 | self.policy_net = Actor(state_dim).to(device) 144 | self.value_net = Critic(state_dim).to(device) 145 | self.Q_net = Q(state_dim, action_dim).to(device) 146 | self.Target_value_net = Critic(state_dim).to(device) 147 | 148 | self.replay_buffer = [Transition] * args.capacity 149 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=args.learning_rate) 150 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=args.learning_rate) 151 | self.Q_optimizer = optim.Adam(self.Q_net.parameters(), lr=args.learning_rate) 152 | self.num_transition = 0 # pointer of replay buffer 153 | self.num_training = 1 154 | self.writer = SummaryWriter('./exp-SAC') 155 | 156 | self.value_criterion = nn.MSELoss() 157 | self.Q_criterion = nn.MSELoss() 158 | 159 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 160 | target_param.data.copy_(param.data) 161 | 162 | os.makedirs('./SAC_model/', exist_ok=True) 163 | 164 | def select_action(self, state): 165 | state = torch.FloatTensor(state).to(device) 166 | mu, log_sigma = self.policy_net(state) 167 | sigma = torch.exp(log_sigma) 168 | dist = Normal(mu, sigma) 169 | z = dist.sample() 170 | action = torch.tanh(z).detach().cpu().numpy() 171 | return action.item() # return a scalar, float32 172 | 173 | def store(self, s, a, r, s_, d): 174 | index = self.num_transition % args.capacity 175 | transition = Transition(s, a, r, s_, d) 176 | self.replay_buffer[index] = transition 177 | self.num_transition += 1 178 | 179 | def get_action_log_prob(self, state): 180 | 181 | batch_mu, batch_log_sigma = self.policy_net(state) 182 | batch_sigma = torch.exp(batch_log_sigma) 183 | dist = Normal(batch_mu, batch_sigma) 184 | z = dist.sample() 185 | action = torch.tanh(z) 186 | log_prob = dist.log_prob(z) - torch.log(1 - action.pow(2) + min_Val) 187 | return action, log_prob, z, batch_mu, batch_log_sigma 188 | 189 | 190 | def update(self): 191 | if self.num_training % 500 == 0: 192 | print("Training ... 
{} ".format(self.num_training)) 193 | s = torch.tensor([t.s for t in self.replay_buffer]).float().to(device) 194 | a = torch.tensor([t.a for t in self.replay_buffer]).to(device) 195 | r = torch.tensor([t.r for t in self.replay_buffer]).to(device) 196 | s_ = torch.tensor([t.s_ for t in self.replay_buffer]).float().to(device) 197 | d = torch.tensor([t.d for t in self.replay_buffer]).float().to(device) 198 | 199 | for _ in range(args.gradient_steps): 200 | #for index in BatchSampler(SubsetRandomSampler(range(args.capacity)), args.batch_size, False): 201 | index = np.random.choice(range(args.capacity), args.batch_size, replace=False) 202 | bn_s = s[index] 203 | bn_a = a[index].reshape(-1, 1) 204 | bn_r = r[index].reshape(-1, 1) 205 | bn_s_ = s_[index] 206 | bn_d = d[index].reshape(-1, 1) 207 | 208 | 209 | target_value = self.Target_value_net(bn_s_) 210 | next_q_value = bn_r + (1 - bn_d) * args.gamma * target_value 211 | 212 | excepted_value = self.value_net(bn_s) 213 | excepted_Q = self.Q_net(bn_s, bn_a) 214 | 215 | sample_action, log_prob, z, batch_mu, batch_log_sigma = self.get_action_log_prob(bn_s) 216 | excepted_new_Q = self.Q_net(bn_s, sample_action) 217 | next_value = excepted_new_Q - log_prob 218 | 219 | # !!!Note that the actions are sampled according to the current policy, 220 | # instead of replay buffer. (From original paper) 221 | 222 | V_loss = self.value_criterion(excepted_value, next_value.detach()) # J_V 223 | V_loss = V_loss.mean() 224 | 225 | # Single Q_net this is different from original paper!!! 226 | Q_loss = self.Q_criterion(excepted_Q, next_q_value.detach()) # J_Q 227 | Q_loss = Q_loss.mean() 228 | 229 | log_policy_target = excepted_new_Q - excepted_value 230 | 231 | pi_loss = log_prob * (log_prob- log_policy_target).detach() 232 | pi_loss = pi_loss.mean() 233 | 234 | self.writer.add_scalar('Loss/V_loss', V_loss, global_step=self.num_training) 235 | self.writer.add_scalar('Loss/Q_loss', Q_loss, global_step=self.num_training) 236 | self.writer.add_scalar('Loss/pi_loss', pi_loss, global_step=self.num_training) 237 | # mini batch gradient descent 238 | self.value_optimizer.zero_grad() 239 | V_loss.backward(retain_graph=True) 240 | nn.utils.clip_grad_norm_(self.value_net.parameters(), 0.5) 241 | self.value_optimizer.step() 242 | 243 | self.Q_optimizer.zero_grad() 244 | Q_loss.backward(retain_graph = True) 245 | nn.utils.clip_grad_norm_(self.Q_net.parameters(), 0.5) 246 | self.Q_optimizer.step() 247 | 248 | self.policy_optimizer.zero_grad() 249 | pi_loss.backward(retain_graph = True) 250 | nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.5) 251 | self.policy_optimizer.step() 252 | 253 | # soft update 254 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 255 | target_param.data.copy_(target_param * (1 - args.tau) + param * args.tau) 256 | 257 | self.num_training += 1 258 | 259 | def save(self): 260 | torch.save(self.policy_net.state_dict(), './SAC_model/policy_net.pth') 261 | torch.save(self.value_net.state_dict(), './SAC_model/value_net.pth') 262 | torch.save(self.Q_net.state_dict(), './SAC_model/Q_net.pth') 263 | print("====================================") 264 | print("Model has been saved...") 265 | print("====================================") 266 | 267 | def load(self): 268 | torch.load(self.policy_net.state_dict(), './SAC_model/policy_net.pth') 269 | torch.load(self.value_net.state_dict(), './SAC_model/value_net.pth') 270 | torch.load(self.Q_net.state_dict(), './SAC_model/Q_net.pth') 271 | print() 272 | 273 | def 
main(): 274 | 275 | agent = SAC() 276 | if args.load: agent.load() 277 | if args.render: env.render() 278 | print("====================================") 279 | print("Collection Experience...") 280 | print("====================================") 281 | 282 | ep_r = 0 283 | for i in range(args.iteration): 284 | state = env.reset() 285 | for t in range(200): 286 | action = agent.select_action(state) 287 | next_state, reward, done, info = env.step(np.float32(action)) 288 | ep_r += reward 289 | if args.render: env.render() 290 | agent.store(state, action, reward, next_state, done) 291 | 292 | if agent.num_transition >= args.capacity: 293 | agent.update() 294 | 295 | state = next_state 296 | if done or t == 199: 297 | if i % 10 == 0: 298 | print("Ep_i {}, the ep_r is {}, the t is {}".format(i, ep_r, t)) 299 | break 300 | if i % args.log_interval == 0: 301 | agent.save() 302 | agent.writer.add_scalar('ep_r', ep_r, global_step=i) 303 | ep_r = 0 304 | 305 | 306 | if __name__ == '__main__': 307 | main() -------------------------------------------------------------------------------- /Char09 SAC/SAC_BipedalWalker-v2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import namedtuple 3 | from itertools import count 4 | import pickle 5 | 6 | import os, random 7 | import numpy as np 8 | 9 | import gym 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torch.distributions import Normal 15 | from tensorboardX import SummaryWriter 16 | 17 | 18 | ''' 19 | Implementation of soft actor critic, dual Q network version 20 | Original paper: https://arxiv.org/abs/1801.01290 21 | Not the author's implementation ! 22 | ''' 23 | 24 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 25 | parser = argparse.ArgumentParser() 26 | 27 | parser.add_argument("--env_name", default="BipedalWalker-v2") # OpenAI gym environment name 28 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 29 | parser.add_argument('--target_update_interval', default=1, type=int) 30 | parser.add_argument('--gradient_steps', default=1, type=int) 31 | 32 | parser.add_argument('--learning_rate', default=3e-4, type=float) 33 | parser.add_argument('--gamma', default=0.99, type=int) # discount gamma 34 | parser.add_argument('--capacity', default=1000000, type=int) # replay buffer size 35 | parser.add_argument('--iteration', default=100000, type=int) # num of games 36 | parser.add_argument('--batch_size', default=128, type=int) # mini batch size 37 | parser.add_argument('--seed', default=1, type=int) 38 | 39 | # optional parameters 40 | parser.add_argument('--num_hidden_layers', default=2, type=int) 41 | parser.add_argument('--num_hidden_units_per_layer', default=256, type=int) 42 | parser.add_argument('--sample_frequency', default=256, type=int) 43 | parser.add_argument('--activation', default='Relu', type=str) 44 | parser.add_argument('--render', default=False, type=bool) # show UI or not 45 | parser.add_argument('--log_interval', default=50, type=int) # 46 | parser.add_argument('--load', default=False, type=bool) # load model 47 | parser.add_argument('--render_interval', default=100, type=int) # after render_interval, the env.render() will work 48 | args = parser.parse_args() 49 | 50 | class NormalizedActions(gym.ActionWrapper): 51 | def _action(self, action): 52 | low = self.action_space.low 53 | high = self.action_space.high 54 | 55 | action = low + (action + 
1.0) * 0.5 * (high - low) 56 | action = np.clip(action, low, high) 57 | 58 | return action 59 | 60 | def _reverse_action(self, action): 61 | low = self.action_space.low 62 | high = self.action_space.high 63 | 64 | action = 2 * (action - low) / (high - low) - 1 65 | action = np.clip(action, low, high) 66 | 67 | return action 68 | 69 | 70 | env = NormalizedActions(gym.make(args.env_name)) 71 | 72 | # Set seeds 73 | env.seed(args.seed) 74 | torch.manual_seed(args.seed) 75 | np.random.seed(args.seed) 76 | 77 | state_dim = env.observation_space.shape[0] 78 | action_dim = env.action_space.shape[0] 79 | max_action = float(env.action_space.high[0]) 80 | min_Val = torch.tensor(1e-7).float().to(device) 81 | 82 | class Replay_buffer(): 83 | def __init__(self, capacity): 84 | self.capacity = capacity 85 | self.state_pool = torch.zeros(self.capacity, state_dim).float().to(device) 86 | self.action_pool = torch.zeros(self.capacity, action_dim).float().to(device) 87 | self.reward_pool = torch.zeros(self.capacity, 1).float().to(device) 88 | self.next_state_pool = torch.zeros(self.capacity, state_dim).float().to(device) 89 | self.done_pool = torch.zeros(self.capacity, 1).float().to(device) 90 | self.num_transition = 0 91 | 92 | def push(self, s, a, r, s_, d): 93 | index = self.num_transition % self.capacity 94 | s = torch.tensor(s).float().to(device) 95 | a = torch.tensor(a).float().to(device) 96 | r = torch.tensor(r).float().to(device) 97 | s_ = torch.tensor(s_).float().to(device) 98 | d = torch.tensor(d).float().to(device) 99 | for pool, ele in zip([self.state_pool, self.action_pool, self.reward_pool, self.next_state_pool, self.done_pool], 100 | [s, a, r, s_, d]): 101 | pool[index] = ele 102 | self.num_transition += 1 103 | 104 | def sample(self, batch_size): 105 | index = np.random.choice(range(self.capacity), batch_size, replace=False) 106 | bn_s, bn_a, bn_r, bn_s_, bn_d = self.state_pool[index], self.action_pool[index], self.reward_pool[index],\ 107 | self.next_state_pool[index], self.done_pool[index] 108 | 109 | return bn_s, bn_a, bn_r, bn_s_, bn_d 110 | 111 | class Actor(nn.Module): 112 | def __init__(self, state_dim, action_dim=action_dim, min_log_std=-10, max_log_std=2): 113 | super(Actor, self).__init__() 114 | self.fc1 = nn.Linear(state_dim, 256) 115 | self.fc2 = nn.Linear(256, 512) 116 | self.mu_head = nn.Linear(512, action_dim) 117 | self.log_std_head = nn.Linear(512, action_dim) 118 | self.max_action = max_action 119 | 120 | self.min_log_std = min_log_std 121 | self.max_log_std = max_log_std 122 | 123 | def forward(self, x): 124 | x = F.relu(self.fc1(x)) 125 | x = F.relu(self.fc2(x)) 126 | mu = self.mu_head(x) 127 | log_std_head = F.relu(self.log_std_head(x)) 128 | log_std_head = torch.clamp(log_std_head, self.min_log_std, self.max_log_std) 129 | return mu, log_std_head 130 | 131 | 132 | class Critic(nn.Module): 133 | def __init__(self, state_dim): 134 | super(Critic, self).__init__() 135 | self.fc1 = nn.Linear(state_dim, 256) 136 | self.fc2 = nn.Linear(256, 256) 137 | self.fc3 = nn.Linear(256, 1) 138 | 139 | def forward(self, x): 140 | x = F.relu(self.fc1(x)) 141 | x = F.relu(self.fc2(x)) 142 | x = self.fc3(x) 143 | return x 144 | 145 | 146 | class Q(nn.Module): 147 | def __init__(self, state_dim, action_dim): 148 | super(Q, self).__init__() 149 | self.fc1 = nn.Linear(state_dim + action_dim, 256) 150 | self.fc2 = nn.Linear(256, 256) 151 | self.fc3 = nn.Linear(256, 1) 152 | 153 | def forward(self, s, a): 154 | s = s.reshape(-1, state_dim) 155 | a = a.reshape(-1, action_dim) 156 | x = 
torch.cat((s, a), -1) # combination s and a 157 | x = F.relu(self.fc1(x)) 158 | x = F.relu(self.fc2(x)) 159 | x = self.fc3(x) 160 | return x 161 | 162 | 163 | class SAC(): 164 | def __init__(self): 165 | super(SAC, self).__init__() 166 | 167 | self.policy_net = Actor(state_dim).to(device) 168 | self.value_net = Critic(state_dim).to(device) 169 | self.Target_value_net = Critic(state_dim).to(device) 170 | self.Q_net1 = Q(state_dim, action_dim).to(device) 171 | self.Q_net2 = Q(state_dim, action_dim).to(device) 172 | 173 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=args.learning_rate) 174 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=args.learning_rate) 175 | self.Q1_optimizer = optim.Adam(self.Q_net1.parameters(), lr=args.learning_rate) 176 | self.Q2_optimizer = optim.Adam(self.Q_net2.parameters(), lr=args.learning_rate) 177 | 178 | self.replay_buffer = Replay_buffer(args.capacity) 179 | self.num_transition = 0 # pointer of replay buffer 180 | self.num_training = 0 181 | self.writer = SummaryWriter('./exp-SAC_dual_Q_network') 182 | 183 | self.value_criterion = nn.MSELoss() 184 | self.Q1_criterion = nn.MSELoss() 185 | self.Q2_criterion = nn.MSELoss() 186 | 187 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 188 | target_param.data.copy_(param.data) 189 | 190 | os.makedirs('./SAC_model/', exist_ok=True) 191 | 192 | def select_action(self, state): 193 | state = torch.FloatTensor(state).to(device) 194 | mu, log_sigma = self.policy_net(state) 195 | sigma = torch.exp(log_sigma) 196 | dist = Normal(mu, sigma) 197 | z = dist.sample() 198 | action = torch.tanh(z).detach().cpu().numpy() 199 | return action # return a scalar, float32 200 | 201 | def evaluate(self, state): 202 | batch_mu, batch_log_sigma = self.policy_net(state) 203 | batch_sigma = torch.exp(batch_log_sigma) 204 | dist = Normal(batch_mu, batch_sigma) 205 | noise = Normal(0, 1) 206 | 207 | z = noise.sample() 208 | action = torch.tanh(batch_mu + batch_sigma*z.to(device)) 209 | log_prob = dist.log_prob(batch_mu + batch_sigma * z.to(device)) - torch.log(1 - action.pow(2) + min_Val) 210 | return action, log_prob, z, batch_mu, batch_log_sigma 211 | 212 | def update(self): 213 | if self.num_training % 500 == 0: 214 | print("Training ... \t{} times ".format(self.num_training)) 215 | 216 | for _ in range(args.gradient_steps): 217 | bn_s, bn_a, bn_r, bn_s_, bn_d = self.replay_buffer.sample(args.batch_size) 218 | 219 | target_value = self.Target_value_net(bn_s_) 220 | next_q_value = bn_r + (1 - bn_d) * args.gamma * target_value 221 | 222 | excepted_value = self.value_net(bn_s) 223 | excepted_Q1 = self.Q_net1(bn_s, bn_a) 224 | excepted_Q2 = self.Q_net2(bn_s, bn_a) 225 | sample_action, log_prob, z, batch_mu, batch_log_sigma = self.evaluate(bn_s) 226 | excepted_new_Q = torch.min(self.Q_net1(bn_s, sample_action), self.Q_net2(bn_s, sample_action)) 227 | next_value = excepted_new_Q - log_prob 228 | 229 | # !!!Note that the actions are sampled according to the current policy, 230 | # instead of replay buffer. 
(From original paper) 231 | V_loss = self.value_criterion(excepted_value, next_value.detach()).mean() # J_V 232 | 233 | # Dual Q net 234 | Q1_loss = self.Q1_criterion(excepted_Q1, next_q_value.detach()).mean() # J_Q 235 | Q2_loss = self.Q2_criterion(excepted_Q2, next_q_value.detach()).mean() 236 | 237 | pi_loss = (log_prob - excepted_new_Q).mean() # according to original paper 238 | 239 | self.writer.add_scalar('Loss/V_loss', V_loss, global_step=self.num_training) 240 | self.writer.add_scalar('Loss/Q1_loss', Q1_loss, global_step=self.num_training) 241 | self.writer.add_scalar('Loss/Q2_loss', Q2_loss, global_step=self.num_training) 242 | self.writer.add_scalar('Loss/policy_loss', pi_loss, global_step=self.num_training) 243 | 244 | # mini batch gradient descent 245 | self.value_optimizer.zero_grad() 246 | V_loss.backward(retain_graph=True) 247 | nn.utils.clip_grad_norm_(self.value_net.parameters(), 0.5) 248 | self.value_optimizer.step() 249 | 250 | self.Q1_optimizer.zero_grad() 251 | Q1_loss.backward(retain_graph = True) 252 | nn.utils.clip_grad_norm_(self.Q_net1.parameters(), 0.5) 253 | self.Q1_optimizer.step() 254 | 255 | self.Q2_optimizer.zero_grad() 256 | Q2_loss.backward(retain_graph = True) 257 | nn.utils.clip_grad_norm_(self.Q_net2.parameters(), 0.5) 258 | self.Q2_optimizer.step() 259 | 260 | self.policy_optimizer.zero_grad() 261 | pi_loss.backward(retain_graph = True) 262 | nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.5) 263 | self.policy_optimizer.step() 264 | 265 | # update target v net update 266 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 267 | target_param.data.copy_(target_param * (1 - args.tau) + param * args.tau) 268 | 269 | self.num_training += 1 270 | 271 | def save(self): 272 | torch.save(self.policy_net.state_dict(), './SAC_model/policy_net.pth') 273 | torch.save(self.value_net.state_dict(), './SAC_model/value_net.pth') 274 | torch.save(self.Q_net1.state_dict(), './SAC_model/Q_net1.pth') 275 | torch.save(self.Q_net2.state_dict(), './SAC_model/Q_net2.pth') 276 | print("====================================") 277 | print("Model has been saved...") 278 | print("====================================") 279 | 280 | def load(self): 281 | self.policy_net.load_state_dict(torch.load('./SAC_model/policy_net.pth')) 282 | self.value_net.load_state_dict(torch.load( './SAC_model/value_net.pth')) 283 | self.Q_net1.load_state_dict(torch.load('./SAC_model/Q_net1.pth')) 284 | self.Q_net2.load_state_dict(torch.load('./SAC_model/Q_net2.pth')) 285 | print("====================================") 286 | print("model has been loaded...") 287 | print("====================================") 288 | 289 | 290 | def main(): 291 | agent = SAC() 292 | if args.load: agent.load() 293 | print("====================================") 294 | print("Collection Experience...") 295 | print("====================================") 296 | 297 | ep_r = 0 298 | for i in range(args.iteration): 299 | state = env.reset() 300 | for t in range(200): 301 | action = agent.select_action(state) 302 | next_state, reward, done, info = env.step(np.float32(action)) 303 | ep_r += reward 304 | if args.render and i >= args.render_interval : env.render() 305 | agent.replay_buffer.push(state, action, reward, next_state, done) 306 | 307 | state = next_state 308 | if done: 309 | if agent.replay_buffer.num_transition >= args.capacity: 310 | agent.update() 311 | if i > 100: 312 | print("Ep_i \t{}, the ep_r is \t{}, the step is \t{}".format(i, ep_r, t)) 313 | break 314 | if i % 
args.log_interval == 0: 315 | agent.save() 316 | agent.writer.add_scalar('ep_r', ep_r, global_step=i) 317 | ep_r = 0 318 | 319 | 320 | if __name__ == '__main__': 321 | main() 322 | -------------------------------------------------------------------------------- /Char09 SAC/SAC_dual_Q_net.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import namedtuple 3 | from itertools import count 4 | 5 | import os 6 | import numpy as np 7 | 8 | 9 | import gym 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torch.distributions import Normal 15 | from tensorboardX import SummaryWriter 16 | 17 | 18 | ''' 19 | Implementation of soft actor critic, dual Q network version 20 | Original paper: https://arxiv.org/abs/1801.01290 21 | Not the author's implementation ! 22 | ''' 23 | 24 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 25 | parser = argparse.ArgumentParser() 26 | 27 | 28 | parser.add_argument("--env_name", default="Pendulum-v0") # OpenAI gym environment name 29 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 30 | parser.add_argument('--target_update_interval', default=1, type=int) 31 | parser.add_argument('--gradient_steps', default=1, type=int) 32 | 33 | 34 | parser.add_argument('--learning_rate', default=3e-4, type=int) 35 | parser.add_argument('--gamma', default=0.99, type=int) # discount gamma 36 | parser.add_argument('--capacity', default=10000, type=int) # replay buffer size 37 | parser.add_argument('--iteration', default=100000, type=int) # num of games 38 | parser.add_argument('--batch_size', default=128, type=int) # mini batch size 39 | parser.add_argument('--seed', default=1, type=int) 40 | 41 | # optional parameters 42 | parser.add_argument('--num_hidden_layers', default=2, type=int) 43 | parser.add_argument('--num_hidden_units_per_layer', default=256, type=int) 44 | parser.add_argument('--sample_frequency', default=256, type=int) 45 | parser.add_argument('--activation', default='Relu', type=str) 46 | parser.add_argument('--render', default=False, type=bool) # show UI or not 47 | parser.add_argument('--log_interval', default=2000, type=int) # 48 | parser.add_argument('--load', default=False, type=bool) # load model 49 | args = parser.parse_args() 50 | 51 | class NormalizedActions(gym.ActionWrapper): 52 | def _action(self, action): 53 | low = self.action_space.low 54 | high = self.action_space.high 55 | 56 | action = low + (action + 1.0) * 0.5 * (high - low) 57 | action = np.clip(action, low, high) 58 | 59 | return action 60 | 61 | def _reverse_action(self, action): 62 | low = self.action_space.low 63 | high = self.action_space.high 64 | 65 | action = 2 * (action - low) / (high - low) - 1 66 | action = np.clip(action, low, high) 67 | 68 | return action 69 | 70 | 71 | env = NormalizedActions(gym.make(args.env_name)) 72 | 73 | # Set seeds 74 | env.seed(args.seed) 75 | torch.manual_seed(args.seed) 76 | np.random.seed(args.seed) 77 | 78 | state_dim = env.observation_space.shape[0] 79 | action_dim = env.action_space.shape[0] 80 | max_action = float(env.action_space.high[0]) 81 | min_Val = torch.tensor(1e-7).float().to(device) 82 | Transition = namedtuple('Transition', ['s', 'a', 'r', 's_', 'd']) 83 | 84 | class Actor(nn.Module): 85 | def __init__(self, state_dim, min_log_std=-10, max_log_std=2): 86 | super(Actor, self).__init__() 87 | self.fc1 = nn.Linear(state_dim, 256) 88 | self.fc2 = 
nn.Linear(256, 256) 89 | self.mu_head = nn.Linear(256, 1) 90 | self.log_std_head = nn.Linear(256, 1) 91 | self.max_action = max_action 92 | 93 | self.min_log_std = min_log_std 94 | self.max_log_std = max_log_std 95 | 96 | def forward(self, x): 97 | x = F.relu(self.fc1(x)) 98 | x = F.relu(self.fc2(x)) 99 | mu = self.mu_head(x) 100 | log_std_head = F.relu(self.log_std_head(x)) 101 | log_std_head = torch.clamp(log_std_head, self.min_log_std, self.max_log_std) 102 | return mu, log_std_head 103 | 104 | 105 | class Critic(nn.Module): 106 | def __init__(self, state_dim): 107 | super(Critic, self).__init__() 108 | self.fc1 = nn.Linear(state_dim, 256) 109 | self.fc2 = nn.Linear(256, 256) 110 | self.fc3 = nn.Linear(256, 1) 111 | 112 | def forward(self, x): 113 | x = F.relu(self.fc1(x)) 114 | x = F.relu(self.fc2(x)) 115 | x = self.fc3(x) 116 | return x 117 | 118 | 119 | class Q(nn.Module): 120 | def __init__(self, state_dim, action_dim): 121 | super(Q, self).__init__() 122 | self.fc1 = nn.Linear(state_dim + action_dim, 256) 123 | self.fc2 = nn.Linear(256, 256) 124 | self.fc3 = nn.Linear(256, 1) 125 | 126 | def forward(self, s, a): 127 | s = s.reshape(-1, state_dim) 128 | a = a.reshape(-1, action_dim) 129 | x = torch.cat((s, a), -1) # combination s and a 130 | x = F.relu(self.fc1(x)) 131 | x = F.relu(self.fc2(x)) 132 | x = self.fc3(x) 133 | return x 134 | 135 | 136 | class SAC(): 137 | def __init__(self): 138 | super(SAC, self).__init__() 139 | 140 | self.policy_net = Actor(state_dim).to(device) 141 | self.value_net = Critic(state_dim).to(device) 142 | self.Target_value_net = Critic(state_dim).to(device) 143 | self.Q_net1 = Q(state_dim, action_dim).to(device) 144 | self.Q_net2 = Q(state_dim, action_dim).to(device) 145 | 146 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=args.learning_rate) 147 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=args.learning_rate) 148 | self.Q1_optimizer = optim.Adam(self.Q_net1.parameters(), lr=args.learning_rate) 149 | self.Q2_optimizer = optim.Adam(self.Q_net2.parameters(), lr=args.learning_rate) 150 | 151 | self.replay_buffer = [Transition] * args.capacity 152 | self.num_transition = 0 # pointer of replay buffer 153 | self.num_training = 1 154 | self.writer = SummaryWriter('./exp-SAC_dual_Q_network') 155 | 156 | self.value_criterion = nn.MSELoss() 157 | self.Q1_criterion = nn.MSELoss() 158 | self.Q2_criterion = nn.MSELoss() 159 | 160 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 161 | target_param.data.copy_(param.data) 162 | 163 | os.makedirs('./SAC_model/', exist_ok=True) 164 | 165 | def select_action(self, state): 166 | state = torch.FloatTensor(state).to(device) 167 | mu, log_sigma = self.policy_net(state) 168 | sigma = torch.exp(log_sigma) 169 | dist = Normal(mu, sigma) 170 | z = dist.sample() 171 | action = torch.tanh(z).detach().cpu().numpy() 172 | return action.item() # return a scalar, float32 173 | 174 | def store(self, s, a, r, s_, d): 175 | index = self.num_transition % args.capacity 176 | transition = Transition(s, a, r, s_, d) 177 | self.replay_buffer[index] = transition 178 | self.num_transition += 1 179 | 180 | def evaluate(self, state): 181 | batch_mu, batch_log_sigma = self.policy_net(state) 182 | batch_sigma = torch.exp(batch_log_sigma) 183 | dist = Normal(batch_mu, batch_sigma) 184 | noise = Normal(0, 1) 185 | 186 | z = noise.sample() 187 | action = torch.tanh(batch_mu + batch_sigma*z.to(device)) 188 | log_prob = dist.log_prob(batch_mu + batch_sigma * 
z.to(device)) - torch.log(1 - action.pow(2) + min_Val) 189 | return action, log_prob, z, batch_mu, batch_log_sigma 190 | 191 | def update(self): 192 | if self.num_training % 500 == 0: 193 | print("Training ... {} times ".format(self.num_training)) 194 | s = torch.tensor([t.s for t in self.replay_buffer]).float().to(device) 195 | a = torch.tensor([t.a for t in self.replay_buffer]).to(device) 196 | r = torch.tensor([t.r for t in self.replay_buffer]).to(device) 197 | s_ = torch.tensor([t.s_ for t in self.replay_buffer]).float().to(device) 198 | d = torch.tensor([t.d for t in self.replay_buffer]).float().to(device) 199 | 200 | for _ in range(args.gradient_steps): 201 | #for index in BatchSampler(SubsetRandomSampler(range(args.capacity)), args.batch_size, False): 202 | index = np.random.choice(range(args.capacity), args.batch_size, replace=False) 203 | bn_s = s[index] 204 | bn_a = a[index].reshape(-1, 1) 205 | bn_r = r[index].reshape(-1, 1) 206 | bn_s_ = s_[index] 207 | bn_d = d[index].reshape(-1, 1) 208 | 209 | target_value = self.Target_value_net(bn_s_) 210 | next_q_value = bn_r + (1 - bn_d) * args.gamma * target_value 211 | 212 | excepted_value = self.value_net(bn_s) 213 | excepted_Q1 = self.Q_net1(bn_s, bn_a) 214 | excepted_Q2 = self.Q_net2(bn_s, bn_a) 215 | sample_action, log_prob, z, batch_mu, batch_log_sigma = self.evaluate(bn_s) 216 | excepted_new_Q = torch.min(self.Q_net1(bn_s, sample_action), self.Q_net2(bn_s, sample_action)) 217 | next_value = excepted_new_Q - log_prob 218 | 219 | # !!!Note that the actions are sampled according to the current policy, 220 | # instead of replay buffer. (From original paper) 221 | V_loss = self.value_criterion(excepted_value, next_value.detach()).mean() # J_V 222 | 223 | # Dual Q net 224 | Q1_loss = self.Q1_criterion(excepted_Q1, next_q_value.detach()).mean() # J_Q 225 | Q2_loss = self.Q2_criterion(excepted_Q2, next_q_value.detach()).mean() 226 | 227 | pi_loss = (log_prob - excepted_new_Q).mean() # according to original paper 228 | 229 | self.writer.add_scalar('Loss/V_loss', V_loss, global_step=self.num_training) 230 | self.writer.add_scalar('Loss/Q1_loss', Q1_loss, global_step=self.num_training) 231 | self.writer.add_scalar('Loss/Q2_loss', Q2_loss, global_step=self.num_training) 232 | self.writer.add_scalar('Loss/policy_loss', pi_loss, global_step=self.num_training) 233 | 234 | # mini batch gradient descent 235 | self.value_optimizer.zero_grad() 236 | V_loss.backward(retain_graph=True) 237 | nn.utils.clip_grad_norm_(self.value_net.parameters(), 0.5) 238 | self.value_optimizer.step() 239 | 240 | self.Q1_optimizer.zero_grad() 241 | Q1_loss.backward(retain_graph = True) 242 | nn.utils.clip_grad_norm_(self.Q_net1.parameters(), 0.5) 243 | self.Q1_optimizer.step() 244 | 245 | self.Q2_optimizer.zero_grad() 246 | Q2_loss.backward(retain_graph = True) 247 | nn.utils.clip_grad_norm_(self.Q_net2.parameters(), 0.5) 248 | self.Q2_optimizer.step() 249 | 250 | self.policy_optimizer.zero_grad() 251 | pi_loss.backward(retain_graph = True) 252 | nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.5) 253 | self.policy_optimizer.step() 254 | 255 | # update target v net update 256 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 257 | target_param.data.copy_(target_param * (1 - args.tau) + param * args.tau) 258 | 259 | self.num_training += 1 260 | 261 | def save(self): 262 | torch.save(self.policy_net.state_dict(), './SAC_model/policy_net.pth') 263 | torch.save(self.value_net.state_dict(), './SAC_model/value_net.pth') 
264 | torch.save(self.Q_net1.state_dict(), './SAC_model/Q_net1.pth') 265 | torch.save(self.Q_net2.state_dict(), './SAC_model/Q_net2.pth') 266 | print("====================================") 267 | print("Model has been saved...") 268 | print("====================================") 269 | 270 | def load(self): 271 | self.policy_net.load_state_dict(torch.load('./SAC_model/policy_net.pth')) 272 | self.value_net.load_state_dict(torch.load( './SAC_model/value_net.pth')) 273 | self.Q_net1.load_state_dict(torch.load('./SAC_model/Q_net1.pth')) 274 | self.Q_net2.load_state_dict(torch.load('./SAC_model/Q_net2.pth')) 275 | print("model has been load") 276 | 277 | 278 | def main(): 279 | 280 | agent = SAC() 281 | if args.load: agent.load() 282 | if args.render: env.render() 283 | print("====================================") 284 | print("Collection Experience...") 285 | print("====================================") 286 | 287 | ep_r = 0 288 | for i in range(args.iteration): 289 | state = env.reset() 290 | for t in range(200): 291 | action = agent.select_action(state) 292 | next_state, reward, done, info = env.step(np.float32(action)) 293 | ep_r += reward 294 | if args.render: env.render() 295 | agent.store(state, action, reward, next_state, done) 296 | 297 | if agent.num_transition >= args.capacity: 298 | agent.update() 299 | 300 | state = next_state 301 | if done or t == 199: 302 | if i % 10 == 0: 303 | print("Ep_i {}, the ep_r is {}, the t is {}".format(i, ep_r, t)) 304 | break 305 | if i % args.log_interval == 0: 306 | agent.save() 307 | agent.writer.add_scalar('ep_r', ep_r, global_step=i) 308 | ep_r = 0 309 | 310 | 311 | if __name__ == '__main__': 312 | main() 313 | -------------------------------------------------------------------------------- /Char09 SAC/SAC_ep_r_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char09 SAC/SAC_ep_r_curve.png -------------------------------------------------------------------------------- /Char09 SAC/test_agent.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import namedtuple 3 | from itertools import count 4 | import pickle 5 | import os 6 | import numpy as np 7 | 8 | 9 | import gym 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torch.distributions import Normal 15 | from tensorboardX import SummaryWriter 16 | 17 | 18 | ''' 19 | Implementation of soft actor critic, dual Q network version 20 | Original paper: https://arxiv.org/abs/1801.01290 21 | Not the author's implementation ! 
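Rollout / evaluation helper for the SAC agents in this folder. A typical invocation, assuming the checkpoints written by save() already exist under ./SAC_model/ (policy_net.pth, value_net.pth, Q_net1.pth, Q_net2.pth):

    python test_agent.py --mode test --env_name BipedalWalker-v2

With --mode test the saved networks are loaded and episodes are rendered; any other --mode value falls back to the training loop defined in main() below.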
22 | ''' 23 | 24 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 25 | parser = argparse.ArgumentParser() 26 | 27 | 28 | parser.add_argument("--env_name", default="BipedalWalker-v2") # OpenAI gym environment name 29 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 30 | parser.add_argument('--target_update_interval', default=1, type=int) 31 | parser.add_argument('--gradient_steps', default=1, type=int) 32 | parser.add_argument('--mode', default='test', type=str) # test or train 33 | 34 | parser.add_argument('--learning_rate', default=3e-4, type=int) 35 | parser.add_argument('--gamma', default=0.99, type=int) # discount gamma 36 | parser.add_argument('--capacity', default=10000, type=int) # replay buffer size 37 | parser.add_argument('--iteration', default=100000, type=int) # num of games 38 | parser.add_argument('--batch_size', default=128, type=int) # mini batch size 39 | parser.add_argument('--seed', default=1, type=int) 40 | 41 | # optional parameters 42 | parser.add_argument('--num_hidden_layers', default=2, type=int) 43 | parser.add_argument('--num_hidden_units_per_layer', default=256, type=int) 44 | parser.add_argument('--sample_frequency', default=256, type=int) 45 | parser.add_argument('--activation', default='Relu', type=str) 46 | parser.add_argument('--render', default=False, type=bool) # show UI or not 47 | parser.add_argument('--log_interval', default=50, type=int) # 48 | parser.add_argument('--load', default=False, type=bool) # load model 49 | parser.add_argument('--render_interval', default=100, type=int) # after render_interval, the env.render() will work 50 | args = parser.parse_args() 51 | 52 | class NormalizedActions(gym.ActionWrapper): 53 | def _action(self, action): 54 | low = self.action_space.low 55 | high = self.action_space.high 56 | 57 | action = low + (action + 1.0) * 0.5 * (high - low) 58 | action = np.clip(action, low, high) 59 | 60 | return action 61 | 62 | def _reverse_action(self, action): 63 | low = self.action_space.low 64 | high = self.action_space.high 65 | 66 | action = 2 * (action - low) / (high - low) - 1 67 | action = np.clip(action, low, high) 68 | 69 | return action 70 | 71 | 72 | env = NormalizedActions(gym.make(args.env_name)) 73 | 74 | # Set seeds 75 | env.seed(args.seed) 76 | torch.manual_seed(args.seed) 77 | np.random.seed(args.seed) 78 | 79 | state_dim = env.observation_space.shape[0] 80 | action_dim = env.action_space.shape[0] 81 | max_action = float(env.action_space.high[0]) 82 | min_Val = torch.tensor(1e-7).float().to(device) 83 | Transition = namedtuple('Transition', ['s', 'a', 'r', 's_', 'd']) 84 | 85 | class Actor(nn.Module): 86 | def __init__(self, state_dim, action_dim=action_dim, min_log_std=-10, max_log_std=2): 87 | super(Actor, self).__init__() 88 | self.fc1 = nn.Linear(state_dim, 256) 89 | self.fc2 = nn.Linear(256, 512) 90 | self.mu_head = nn.Linear(512, action_dim) 91 | self.log_std_head = nn.Linear(512, action_dim) 92 | self.max_action = max_action 93 | 94 | self.min_log_std = min_log_std 95 | self.max_log_std = max_log_std 96 | 97 | def forward(self, x): 98 | x = F.relu(self.fc1(x)) 99 | x = F.relu(self.fc2(x)) 100 | mu = self.mu_head(x) 101 | log_std_head = F.relu(self.log_std_head(x)) 102 | log_std_head = torch.clamp(log_std_head, self.min_log_std, self.max_log_std) 103 | return mu, log_std_head 104 | 105 | 106 | class Critic(nn.Module): 107 | def __init__(self, state_dim): 108 | super(Critic, self).__init__() 109 | self.fc1 = nn.Linear(state_dim, 256) 110 | self.fc2 = 
nn.Linear(256, 256) 111 | self.fc3 = nn.Linear(256, 1) 112 | 113 | def forward(self, x): 114 | x = F.relu(self.fc1(x)) 115 | x = F.relu(self.fc2(x)) 116 | x = self.fc3(x) 117 | return x 118 | 119 | 120 | class Q(nn.Module): 121 | def __init__(self, state_dim, action_dim): 122 | super(Q, self).__init__() 123 | self.fc1 = nn.Linear(state_dim + action_dim, 256) 124 | self.fc2 = nn.Linear(256, 256) 125 | self.fc3 = nn.Linear(256, 1) 126 | 127 | def forward(self, s, a): 128 | s = s.reshape(-1, state_dim) 129 | a = a.reshape(-1, action_dim) 130 | x = torch.cat((s, a), -1) # combination s and a 131 | x = F.relu(self.fc1(x)) 132 | x = F.relu(self.fc2(x)) 133 | x = self.fc3(x) 134 | return x 135 | 136 | 137 | class SAC(): 138 | def __init__(self): 139 | super(SAC, self).__init__() 140 | 141 | self.policy_net = Actor(state_dim).to(device) 142 | self.value_net = Critic(state_dim).to(device) 143 | self.Target_value_net = Critic(state_dim).to(device) 144 | self.Q_net1 = Q(state_dim, action_dim).to(device) 145 | self.Q_net2 = Q(state_dim, action_dim).to(device) 146 | 147 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=args.learning_rate) 148 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=args.learning_rate) 149 | self.Q1_optimizer = optim.Adam(self.Q_net1.parameters(), lr=args.learning_rate) 150 | self.Q2_optimizer = optim.Adam(self.Q_net2.parameters(), lr=args.learning_rate) 151 | 152 | self.replay_buffer = [Transition] * args.capacity 153 | self.num_transition = 0 # pointer of replay buffer 154 | self.num_training = 0 155 | self.writer = SummaryWriter('./test_agent') 156 | 157 | self.value_criterion = nn.MSELoss() 158 | self.Q1_criterion = nn.MSELoss() 159 | self.Q2_criterion = nn.MSELoss() 160 | 161 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 162 | target_param.data.copy_(param.data) 163 | 164 | os.makedirs('./SAC_model/', exist_ok=True) 165 | 166 | def select_action(self, state): 167 | state = torch.FloatTensor(state).to(device) 168 | mu, log_sigma = self.policy_net(state) 169 | sigma = torch.exp(log_sigma) 170 | dist = Normal(mu, sigma) 171 | z = dist.sample() 172 | action = torch.tanh(z).detach().cpu().numpy() 173 | return action # return a scalar, float32 174 | 175 | def store(self, s, a, r, s_, d): 176 | index = self.num_transition % args.capacity 177 | transition = Transition(s, a, r, s_, d) 178 | self.replay_buffer[index] = transition 179 | self.num_transition += 1 180 | 181 | def evaluate(self, state): 182 | batch_mu, batch_log_sigma = self.policy_net(state) 183 | batch_sigma = torch.exp(batch_log_sigma) 184 | dist = Normal(batch_mu, batch_sigma) 185 | noise = Normal(0, 1) 186 | 187 | z = noise.sample() 188 | action = torch.tanh(batch_mu + batch_sigma*z.to(device)) 189 | log_prob = dist.log_prob(batch_mu + batch_sigma * z.to(device)) - torch.log(1 - action.pow(2) + min_Val) 190 | return action, log_prob, z, batch_mu, batch_log_sigma 191 | 192 | def update(self): 193 | if self.num_training % 500 == 0: 194 | print("Training ... 
\t{} times ".format(self.num_training)) 195 | s = torch.tensor([t.s for t in self.replay_buffer]).float().to(device) 196 | a = torch.tensor([t.a for t in self.replay_buffer]).to(device) 197 | r = torch.tensor([t.r for t in self.replay_buffer]).to(device) 198 | s_ = torch.tensor([t.s_ for t in self.replay_buffer]).float().to(device) 199 | d = torch.tensor([t.d for t in self.replay_buffer]).float().to(device) 200 | 201 | for _ in range(args.gradient_steps): 202 | #for index in BatchSampler(SubsetRandomSampler(range(args.capacity)), args.batch_size, False): 203 | index = np.random.choice(range(args.capacity), args.batch_size, replace=False) 204 | bn_s = s[index].reshape(-1, state_dim) 205 | bn_a = a[index].reshape(-1, action_dim) 206 | bn_r = r[index].reshape(-1, 1) 207 | bn_s_ = s_[index].reshape(-1, state_dim) 208 | bn_d = d[index].reshape(-1, 1) 209 | 210 | target_value = self.Target_value_net(bn_s_) 211 | next_q_value = bn_r + (1 - bn_d) * args.gamma * target_value 212 | 213 | excepted_value = self.value_net(bn_s) 214 | excepted_Q1 = self.Q_net1(bn_s, bn_a) 215 | excepted_Q2 = self.Q_net2(bn_s, bn_a) 216 | sample_action, log_prob, z, batch_mu, batch_log_sigma = self.evaluate(bn_s) 217 | excepted_new_Q = torch.min(self.Q_net1(bn_s, sample_action), self.Q_net2(bn_s, sample_action)) 218 | next_value = excepted_new_Q - log_prob 219 | 220 | # !!!Note that the actions are sampled according to the current policy, 221 | # instead of replay buffer. (From original paper) 222 | V_loss = self.value_criterion(excepted_value, next_value.detach()).mean() # J_V 223 | 224 | # Dual Q net 225 | Q1_loss = self.Q1_criterion(excepted_Q1, next_q_value.detach()).mean() # J_Q 226 | Q2_loss = self.Q2_criterion(excepted_Q2, next_q_value.detach()).mean() 227 | 228 | pi_loss = (log_prob - excepted_new_Q).mean() # according to original paper 229 | 230 | self.writer.add_scalar('Loss/V_loss', V_loss, global_step=self.num_training) 231 | self.writer.add_scalar('Loss/Q1_loss', Q1_loss, global_step=self.num_training) 232 | self.writer.add_scalar('Loss/Q2_loss', Q2_loss, global_step=self.num_training) 233 | self.writer.add_scalar('Loss/policy_loss', pi_loss, global_step=self.num_training) 234 | 235 | # mini batch gradient descent 236 | self.value_optimizer.zero_grad() 237 | V_loss.backward(retain_graph=True) 238 | nn.utils.clip_grad_norm_(self.value_net.parameters(), 0.5) 239 | self.value_optimizer.step() 240 | 241 | self.Q1_optimizer.zero_grad() 242 | Q1_loss.backward(retain_graph = True) 243 | nn.utils.clip_grad_norm_(self.Q_net1.parameters(), 0.5) 244 | self.Q1_optimizer.step() 245 | 246 | self.Q2_optimizer.zero_grad() 247 | Q2_loss.backward(retain_graph = True) 248 | nn.utils.clip_grad_norm_(self.Q_net2.parameters(), 0.5) 249 | self.Q2_optimizer.step() 250 | 251 | self.policy_optimizer.zero_grad() 252 | pi_loss.backward(retain_graph = True) 253 | nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.5) 254 | self.policy_optimizer.step() 255 | 256 | # update target v net update 257 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 258 | target_param.data.copy_(target_param * (1 - args.tau) + param * args.tau) 259 | 260 | self.num_training += 1 261 | 262 | def save(self): 263 | torch.save(self.policy_net.state_dict(), './SAC_model/policy_net.pth') 264 | torch.save(self.value_net.state_dict(), './SAC_model/value_net.pth') 265 | torch.save(self.Q_net1.state_dict(), './SAC_model/Q_net1.pth') 266 | torch.save(self.Q_net2.state_dict(), './SAC_model/Q_net1.pth') 267 | 
print("====================================") 268 | print("Model has been saved...") 269 | print("====================================") 270 | 271 | def load(self): 272 | self.policy_net.load_state_dict(torch.load('./SAC_model/policy_net.pth')) 273 | self.value_net.load_state_dict(torch.load( './SAC_model/value_net.pth')) 274 | self.Q_net1.load_state_dict(torch.load('./SAC_model/Q_net1.pth')) 275 | self.Q_net2.load_state_dict(torch.load('./SAC_model/Q_net1.pth')) 276 | print("model has been load") 277 | 278 | 279 | def main(): 280 | agent = SAC() 281 | ep_r = 0 282 | if args.mode == 'test': 283 | agent.load() 284 | for i in range(args.iteration): 285 | state = env.reset() 286 | for t in count(): 287 | action = agent.select_action(state) 288 | next_state, reward, done, info = env.step(np.float32(action)) 289 | ep_r += reward 290 | env.render() 291 | if done: 292 | break 293 | state = next_state 294 | else: 295 | print("====================================") 296 | print("Collection Experience...") 297 | print("====================================") 298 | 299 | 300 | for i in range(args.iteration): 301 | state = env.reset() 302 | for t in range(200): 303 | action = agent.select_action(state) 304 | next_state, reward, done, info = env.step(np.float32(action)) 305 | ep_r += reward 306 | if args.render and i >= args.render_interval : env.render() 307 | agent.store(state, action, reward, next_state, done) 308 | 309 | if agent.num_transition >= args.capacity: 310 | agent.update() 311 | 312 | state = next_state 313 | if done: 314 | if i > 100: 315 | print("Ep_i \t{}, the ep_r is \t{}, the step is \t{}".format(i, ep_r, t)) 316 | break 317 | if i % args.log_interval == 0: 318 | agent.save() 319 | agent.writer.add_scalar('ep_r', ep_r, global_step=i) 320 | ep_r = 0 321 | 322 | 323 | if __name__ == '__main__': 324 | main() 325 | -------------------------------------------------------------------------------- /Char10 TD3/Episode_reward_TD3_BipedakWalker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/Episode_reward_TD3_BipedakWalker.png -------------------------------------------------------------------------------- /Char10 TD3/TD3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import namedtuple 3 | from itertools import count 4 | 5 | import os, sys, random 6 | import numpy as np 7 | 8 | import gym 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | from torch.distributions import Normal 14 | from tensorboardX import SummaryWriter 15 | 16 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 17 | parser = argparse.ArgumentParser() 18 | 19 | parser.add_argument('--mode', default='train', type=str) # mode = 'train' or 'test' 20 | parser.add_argument("--env_name", default="Pendulum-v0") # OpenAI gym environment name, BipedalWalker-v2 21 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 22 | parser.add_argument('--target_update_interval', default=1, type=int) 23 | parser.add_argument('--iteration', default=5, type=int) 24 | 25 | parser.add_argument('--learning_rate', default=3e-4, type=float) 26 | parser.add_argument('--gamma', default=0.99, type=int) # discounted factor 27 | parser.add_argument('--capacity', default=50000, type=int) # replay buffer 
size 28 | parser.add_argument('--num_iteration', default=100000, type=int) # num of games 29 | parser.add_argument('--batch_size', default=100, type=int) # mini batch size 30 | parser.add_argument('--seed', default=1, type=int) 31 | 32 | # optional parameters 33 | parser.add_argument('--num_hidden_layers', default=2, type=int) 34 | parser.add_argument('--sample_frequency', default=256, type=int) 35 | parser.add_argument('--activation', default='Relu', type=str) 36 | parser.add_argument('--render', default=False, type=bool) # show UI or not 37 | parser.add_argument('--log_interval', default=50, type=int) # 38 | parser.add_argument('--load', default=False, type=bool) # load model 39 | parser.add_argument('--render_interval', default=100, type=int) # after render_interval, the env.render() will work 40 | parser.add_argument('--policy_noise', default=0.2, type=float) 41 | parser.add_argument('--noise_clip', default=0.5, type=float) 42 | parser.add_argument('--policy_delay', default=2, type=int) 43 | parser.add_argument('--exploration_noise', default=0.1, type=float) 44 | parser.add_argument('--max_episode', default=2000, type=int) 45 | parser.add_argument('--print_log', default=5, type=int) 46 | args = parser.parse_args() 47 | 48 | 49 | 50 | # Set seeds 51 | # env.seed(args.seed) 52 | # torch.manual_seed(args.seed) 53 | # np.random.seed(args.seed) 54 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 55 | script_name = os.path.basename(__file__) 56 | env = gym.make(args.env_name) 57 | 58 | state_dim = env.observation_space.shape[0] 59 | action_dim = env.action_space.shape[0] 60 | max_action = float(env.action_space.high[0]) 61 | min_Val = torch.tensor(1e-7).float().to(device) # min value 62 | 63 | directory = './exp' + script_name + args.env_name +'./' 64 | ''' 65 | Implementation of TD3 with pytorch 66 | Original paper: https://arxiv.org/abs/1802.09477 67 | Not the author's implementation ! 
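Update rule implemented in TD3.update() below, written with the module's own names (an informal sketch of the code, not a line-by-line specification):

    noise       = clamp(Normal(0, args.policy_noise), -args.noise_clip, args.noise_clip)
    next_action = clamp(actor_target(next_state) + noise, -max_action, max_action)
    target_Q    = reward + (1 - done) * args.gamma * min(critic_1_target(next_state, next_action),
                                                         critic_2_target(next_state, next_action))
    critic_i loss = MSE(critic_i(state, action), target_Q)        for i = 1, 2
    actor loss    = -critic_1(state, actor(state)).mean()         every args.policy_delay critic updates

followed by Polyak averaging of all three target networks with coefficient args.tau.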
68 | ''' 69 | 70 | class Replay_buffer(): 71 | ''' 72 | Code based on: 73 | https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 74 | Expects tuples of (state, next_state, action, reward, done) 75 | ''' 76 | def __init__(self, max_size=args.capacity): 77 | self.storage = [] 78 | self.max_size = max_size 79 | self.ptr = 0 80 | 81 | def push(self, data): 82 | if len(self.storage) == self.max_size: 83 | self.storage[int(self.ptr)] = data 84 | self.ptr = (self.ptr + 1) % self.max_size 85 | else: 86 | self.storage.append(data) 87 | 88 | def sample(self, batch_size): 89 | ind = np.random.randint(0, len(self.storage), size=batch_size) 90 | x, y, u, r, d = [], [], [], [], [] 91 | 92 | for i in ind: 93 | X, Y, U, R, D = self.storage[i] 94 | x.append(np.array(X, copy=False)) 95 | y.append(np.array(Y, copy=False)) 96 | u.append(np.array(U, copy=False)) 97 | r.append(np.array(R, copy=False)) 98 | d.append(np.array(D, copy=False)) 99 | 100 | return np.array(x), np.array(y), np.array(u), np.array(r).reshape(-1, 1), np.array(d).reshape(-1, 1) 101 | 102 | 103 | class Actor(nn.Module): 104 | 105 | def __init__(self, state_dim, action_dim, max_action): 106 | super(Actor, self).__init__() 107 | 108 | self.fc1 = nn.Linear(state_dim, 400) 109 | self.fc2 = nn.Linear(400, 300) 110 | self.fc3 = nn.Linear(300, action_dim) 111 | 112 | self.max_action = max_action 113 | 114 | def forward(self, state): 115 | a = F.relu(self.fc1(state)) 116 | a = F.relu(self.fc2(a)) 117 | a = torch.tanh(self.fc3(a)) * self.max_action 118 | return a 119 | 120 | 121 | class Critic(nn.Module): 122 | 123 | def __init__(self, state_dim, action_dim): 124 | super(Critic, self).__init__() 125 | 126 | self.fc1 = nn.Linear(state_dim + action_dim, 400) 127 | self.fc2 = nn.Linear(400, 300) 128 | self.fc3 = nn.Linear(300, 1) 129 | 130 | def forward(self, state, action): 131 | state_action = torch.cat([state, action], 1) 132 | 133 | q = F.relu(self.fc1(state_action)) 134 | q = F.relu(self.fc2(q)) 135 | q = self.fc3(q) 136 | return q 137 | 138 | 139 | class TD3(): 140 | def __init__(self, state_dim, action_dim, max_action): 141 | 142 | self.actor = Actor(state_dim, action_dim, max_action).to(device) 143 | self.actor_target = Actor(state_dim, action_dim, max_action).to(device) 144 | self.critic_1 = Critic(state_dim, action_dim).to(device) 145 | self.critic_1_target = Critic(state_dim, action_dim).to(device) 146 | self.critic_2 = Critic(state_dim, action_dim).to(device) 147 | self.critic_2_target = Critic(state_dim, action_dim).to(device) 148 | 149 | self.actor_optimizer = optim.Adam(self.actor.parameters()) 150 | self.critic_1_optimizer = optim.Adam(self.critic_1.parameters()) 151 | self.critic_2_optimizer = optim.Adam(self.critic_2.parameters()) 152 | 153 | self.actor_target.load_state_dict(self.actor.state_dict()) 154 | self.critic_1_target.load_state_dict(self.critic_1.state_dict()) 155 | self.critic_2_target.load_state_dict(self.critic_2.state_dict()) 156 | 157 | self.max_action = max_action 158 | self.memory = Replay_buffer(args.capacity) 159 | self.writer = SummaryWriter(directory) 160 | self.num_critic_update_iteration = 0 161 | self.num_actor_update_iteration = 0 162 | self.num_training = 0 163 | 164 | def select_action(self, state): 165 | state = torch.tensor(state.reshape(1, -1)).float().to(device) 166 | return self.actor(state).cpu().data.numpy().flatten() 167 | 168 | def update(self, num_iteration): 169 | 170 | if self.num_training % 500 == 0: 171 | print("====================================") 172 | 
print("model has been trained for {} times...".format(self.num_training)) 173 | print("====================================") 174 | for i in range(num_iteration): 175 | x, y, u, r, d = self.memory.sample(args.batch_size) 176 | state = torch.FloatTensor(x).to(device) 177 | action = torch.FloatTensor(u).to(device) 178 | next_state = torch.FloatTensor(y).to(device) 179 | done = torch.FloatTensor(d).to(device) 180 | reward = torch.FloatTensor(r).to(device) 181 | 182 | # Select next action according to target policy: 183 | noise = torch.ones_like(action).data.normal_(0, args.policy_noise).to(device) 184 | noise = noise.clamp(-args.noise_clip, args.noise_clip) 185 | next_action = (self.actor_target(next_state) + noise) 186 | next_action = next_action.clamp(-self.max_action, self.max_action) 187 | 188 | # Compute target Q-value: 189 | target_Q1 = self.critic_1_target(next_state, next_action) 190 | target_Q2 = self.critic_2_target(next_state, next_action) 191 | target_Q = torch.min(target_Q1, target_Q2) 192 | target_Q = reward + ((1 - done) * args.gamma * target_Q).detach() 193 | 194 | # Optimize Critic 1: 195 | current_Q1 = self.critic_1(state, action) 196 | loss_Q1 = F.mse_loss(current_Q1, target_Q) 197 | self.critic_1_optimizer.zero_grad() 198 | loss_Q1.backward() 199 | self.critic_1_optimizer.step() 200 | self.writer.add_scalar('Loss/Q1_loss', loss_Q1, global_step=self.num_critic_update_iteration) 201 | 202 | # Optimize Critic 2: 203 | current_Q2 = self.critic_2(state, action) 204 | loss_Q2 = F.mse_loss(current_Q2, target_Q) 205 | self.critic_2_optimizer.zero_grad() 206 | loss_Q2.backward() 207 | self.critic_2_optimizer.step() 208 | self.writer.add_scalar('Loss/Q2_loss', loss_Q2, global_step=self.num_critic_update_iteration) 209 | # Delayed policy updates: 210 | if i % args.policy_delay == 0: 211 | # Compute actor loss: 212 | actor_loss = - self.critic_1(state, self.actor(state)).mean() 213 | 214 | # Optimize the actor 215 | self.actor_optimizer.zero_grad() 216 | actor_loss.backward() 217 | self.actor_optimizer.step() 218 | self.writer.add_scalar('Loss/actor_loss', actor_loss, global_step=self.num_actor_update_iteration) 219 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 220 | target_param.data.copy_(((1- args.tau) * target_param.data) + args.tau * param.data) 221 | 222 | for param, target_param in zip(self.critic_1.parameters(), self.critic_1_target.parameters()): 223 | target_param.data.copy_(((1 - args.tau) * target_param.data) + args.tau * param.data) 224 | 225 | for param, target_param in zip(self.critic_2.parameters(), self.critic_2_target.parameters()): 226 | target_param.data.copy_(((1 - args.tau) * target_param.data) + args.tau * param.data) 227 | 228 | self.num_actor_update_iteration += 1 229 | self.num_critic_update_iteration += 1 230 | self.num_training += 1 231 | 232 | def save(self): 233 | torch.save(self.actor.state_dict(), directory+'actor.pth') 234 | torch.save(self.actor_target.state_dict(), directory+'actor_target.pth') 235 | torch.save(self.critic_1.state_dict(), directory+'critic_1.pth') 236 | torch.save(self.critic_1_target.state_dict(), directory+'critic_1_target.pth') 237 | torch.save(self.critic_2.state_dict(), directory+'critic_2.pth') 238 | torch.save(self.critic_2_target.state_dict(), directory+'critic_2_target.pth') 239 | print("====================================") 240 | print("Model has been saved...") 241 | print("====================================") 242 | 243 | def load(self): 244 | 
self.actor.load_state_dict(torch.load(directory + 'actor.pth')) 245 | self.actor_target.load_state_dict(torch.load(directory + 'actor_target.pth')) 246 | self.critic_1.load_state_dict(torch.load(directory + 'critic_1.pth')) 247 | self.critic_1_target.load_state_dict(torch.load(directory + 'critic_1_target.pth')) 248 | self.critic_2.load_state_dict(torch.load(directory + 'critic_2.pth')) 249 | self.critic_2_target.load_state_dict(torch.load(directory + 'critic_2_target.pth')) 250 | print("====================================") 251 | print("model has been loaded...") 252 | print("====================================") 253 | 254 | 255 | def main(): 256 | agent = TD3(state_dim, action_dim, max_action) 257 | ep_r = 0 258 | 259 | if args.mode == 'test': 260 | agent.load() 261 | for i in range(args.iteration): 262 | state = env.reset() 263 | for t in count(): 264 | action = agent.select_action(state) 265 | next_state, reward, done, info = env.step(np.float32(action)) 266 | ep_r += reward 267 | env.render() 268 | if done or t ==2000 : 269 | print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t)) 270 | break 271 | state = next_state 272 | 273 | elif args.mode == 'train': 274 | print("====================================") 275 | print("Collection Experience...") 276 | print("====================================") 277 | if args.load: agent.load() 278 | for i in range(args.num_iteration): 279 | state = env.reset() 280 | for t in range(2000): 281 | 282 | action = agent.select_action(state) 283 | action = action + np.random.normal(0, args.exploration_noise, size=env.action_space.shape[0]) 284 | action = action.clip(env.action_space.low, env.action_space.high) 285 | next_state, reward, done, info = env.step(action) 286 | ep_r += reward 287 | if args.render and i >= args.render_interval : env.render() 288 | agent.memory.push((state, next_state, action, reward, np.float(done))) 289 | if i+1 % 10 == 0: 290 | print('Episode {}, The memory size is {} '.format(i, len(agent.memory.storage))) 291 | if len(agent.memory.storage) >= args.capacity-1: 292 | agent.update(10) 293 | 294 | state = next_state 295 | if done or t == args.max_episode -1: 296 | agent.writer.add_scalar('ep_r', ep_r, global_step=i) 297 | if i % args.print_log == 0: 298 | print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t)) 299 | ep_r = 0 300 | break 301 | 302 | if i % args.log_interval == 0: 303 | agent.save() 304 | 305 | else: 306 | raise NameError("mode wrong!!!") 307 | 308 | if __name__ == '__main__': 309 | main() 310 | -------------------------------------------------------------------------------- /Char10 TD3/TD3_BipedalWalker-v2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import namedtuple 3 | from itertools import count 4 | 5 | import os, sys, random 6 | import numpy as np 7 | 8 | import gym 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | from torch.distributions import Normal 14 | from tensorboardX import SummaryWriter 15 | 16 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 17 | parser = argparse.ArgumentParser() 18 | 19 | parser.add_argument('--mode', default='train', type=str) # mode = 'train' or 'test' 20 | # OpenAI gym environment name, # ['BipedalWalker-v2', 'Pendulum-v0'] or any continuous environment 21 | # Note that if you want test in another game, you should fine-tuning. 
22 | parser.add_argument("--env_name", default="BipedalWalker-v2") 23 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 24 | parser.add_argument('--target_update_interval', default=1, type=int) 25 | parser.add_argument('--test_iteration', default=10, type=int) 26 | 27 | parser.add_argument('--learning_rate', default=3e-4, type=float) 28 | parser.add_argument('--gamma', default=0.99, type=float) # discount factor 29 | parser.add_argument('--capacity', default=50000, type=int) # replay buffer size 30 | parser.add_argument('--num_iteration', default=100000, type=int) # num of games 31 | parser.add_argument('--batch_size', default=100, type=int) # mini batch size 32 | parser.add_argument('--seed', default=False, type=bool) 33 | parser.add_argument('--random_seed', default=9527, type=int) 34 | # optional parameters 35 | parser.add_argument('--num_hidden_layers', default=2, type=int) 36 | parser.add_argument('--sample_frequency', default=256, type=int) 37 | parser.add_argument('--render', default=False, type=bool) # show UI or not 38 | parser.add_argument('--log_interval', default=50, type=int) # save the model every log_interval episodes 39 | parser.add_argument('--load', default=False, type=bool) # load model 40 | parser.add_argument('--render_interval', default=100, type=int) # env.render() is only called after render_interval episodes 41 | parser.add_argument('--policy_noise', default=0.2, type=float) # std of the noise added to the target policy (target policy smoothing) 42 | parser.add_argument('--noise_clip', default=0.5, type=float) # clipping range of the target policy noise 43 | parser.add_argument('--policy_delay', default=2, type=int) # frequency of delayed actor and target updates 44 | parser.add_argument('--exploration_noise', default=0.1, type=float) # std of the Gaussian exploration noise 45 | parser.add_argument('--max_episode', default=2000, type=int) 46 | parser.add_argument('--print_log', default=5, type=int) 47 | args = parser.parse_args() 48 | 49 | 50 | 51 | 52 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 53 | script_name = os.path.basename(__file__) 54 | env = gym.make(args.env_name) 55 | if args.seed: 56 | env.seed(args.random_seed) 57 | torch.manual_seed(args.random_seed) 58 | np.random.seed(args.random_seed) 59 | 60 | state_dim = env.observation_space.shape[0] 61 | action_dim = env.action_space.shape[0] 62 | max_action = float(env.action_space.high[0]) 63 | min_Val = torch.tensor(1e-7).float().to(device) # min value 64 | 65 | directory = './exp' + script_name + args.env_name +'./' 66 | ''' 67 | Implementation of TD3 with pytorch 68 | Original paper: https://arxiv.org/abs/1802.09477 69 | Not the author's implementation !
70 | ''' 71 | 72 | class Replay_buffer(): 73 | ''' 74 | Code based on: 75 | https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 76 | Expects tuples of (state, next_state, action, reward, done) 77 | ''' 78 | def __init__(self, max_size=args.capacity): 79 | self.storage = [] 80 | self.max_size = max_size 81 | self.ptr = 0 82 | 83 | def push(self, data): 84 | if len(self.storage) == self.max_size: 85 | self.storage[int(self.ptr)] = data 86 | self.ptr = (self.ptr + 1) % self.max_size 87 | else: 88 | self.storage.append(data) 89 | 90 | def sample(self, batch_size): 91 | ind = np.random.randint(0, len(self.storage), size=batch_size) 92 | x, y, u, r, d = [], [], [], [], [] 93 | 94 | for i in ind: 95 | X, Y, U, R, D = self.storage[i] 96 | x.append(np.array(X, copy=False)) 97 | y.append(np.array(Y, copy=False)) 98 | u.append(np.array(U, copy=False)) 99 | r.append(np.array(R, copy=False)) 100 | d.append(np.array(D, copy=False)) 101 | 102 | return np.array(x), np.array(y), np.array(u), np.array(r).reshape(-1, 1), np.array(d).reshape(-1, 1) 103 | 104 | 105 | class Actor(nn.Module): 106 | 107 | def __init__(self, state_dim, action_dim, max_action): 108 | super(Actor, self).__init__() 109 | 110 | self.fc1 = nn.Linear(state_dim, 400) 111 | self.fc2 = nn.Linear(400, 300) 112 | self.fc3 = nn.Linear(300, action_dim) 113 | 114 | self.max_action = max_action 115 | 116 | def forward(self, state): 117 | a = F.relu(self.fc1(state)) 118 | a = F.relu(self.fc2(a)) 119 | a = torch.tanh(self.fc3(a)) * self.max_action 120 | return a 121 | 122 | 123 | class Critic(nn.Module): 124 | 125 | def __init__(self, state_dim, action_dim): 126 | super(Critic, self).__init__() 127 | 128 | self.fc1 = nn.Linear(state_dim + action_dim, 400) 129 | self.fc2 = nn.Linear(400, 300) 130 | self.fc3 = nn.Linear(300, 1) 131 | 132 | def forward(self, state, action): 133 | state_action = torch.cat([state, action], 1) 134 | 135 | q = F.relu(self.fc1(state_action)) 136 | q = F.relu(self.fc2(q)) 137 | q = self.fc3(q) 138 | return q 139 | 140 | 141 | class TD3(): 142 | def __init__(self, state_dim, action_dim, max_action): 143 | 144 | self.actor = Actor(state_dim, action_dim, max_action).to(device) 145 | self.actor_target = Actor(state_dim, action_dim, max_action).to(device) 146 | self.critic_1 = Critic(state_dim, action_dim).to(device) 147 | self.critic_1_target = Critic(state_dim, action_dim).to(device) 148 | self.critic_2 = Critic(state_dim, action_dim).to(device) 149 | self.critic_2_target = Critic(state_dim, action_dim).to(device) 150 | 151 | self.actor_optimizer = optim.Adam(self.actor.parameters()) 152 | self.critic_1_optimizer = optim.Adam(self.critic_1.parameters()) 153 | self.critic_2_optimizer = optim.Adam(self.critic_2.parameters()) 154 | 155 | self.actor_target.load_state_dict(self.actor.state_dict()) 156 | self.critic_1_target.load_state_dict(self.critic_1.state_dict()) 157 | self.critic_2_target.load_state_dict(self.critic_2.state_dict()) 158 | 159 | self.max_action = max_action 160 | self.memory = Replay_buffer(args.capacity) 161 | self.writer = SummaryWriter(directory) 162 | self.num_critic_update_iteration = 0 163 | self.num_actor_update_iteration = 0 164 | self.num_training = 0 165 | 166 | def select_action(self, state): 167 | state = torch.tensor(state.reshape(1, -1)).float().to(device) 168 | return self.actor(state).cpu().data.numpy().flatten() 169 | 170 | def update(self, num_iteration): 171 | 172 | if self.num_training % 500 == 0: 173 | print("====================================") 174 | 
print("model has been trained for {} times...".format(self.num_training)) 175 | print("====================================") 176 | for i in range(num_iteration): 177 | x, y, u, r, d = self.memory.sample(args.batch_size) 178 | state = torch.FloatTensor(x).to(device) 179 | action = torch.FloatTensor(u).to(device) 180 | next_state = torch.FloatTensor(y).to(device) 181 | done = torch.FloatTensor(d).to(device) 182 | reward = torch.FloatTensor(r).to(device) 183 | 184 | # Select next action according to target policy: 185 | noise = torch.ones_like(action).data.normal_(0, args.policy_noise).to(device) 186 | noise = noise.clamp(-args.noise_clip, args.noise_clip) 187 | next_action = (self.actor_target(next_state) + noise) 188 | next_action = next_action.clamp(-self.max_action, self.max_action) 189 | 190 | # Compute target Q-value: 191 | target_Q1 = self.critic_1_target(next_state, next_action) 192 | target_Q2 = self.critic_2_target(next_state, next_action) 193 | target_Q = torch.min(target_Q1, target_Q2) 194 | target_Q = reward + ((1 - done) * args.gamma * target_Q).detach() 195 | 196 | # Optimize Critic 1: 197 | current_Q1 = self.critic_1(state, action) 198 | loss_Q1 = F.mse_loss(current_Q1, target_Q) 199 | self.critic_1_optimizer.zero_grad() 200 | loss_Q1.backward() 201 | self.critic_1_optimizer.step() 202 | self.writer.add_scalar('Loss/Q1_loss', loss_Q1, global_step=self.num_critic_update_iteration) 203 | 204 | # Optimize Critic 2: 205 | current_Q2 = self.critic_2(state, action) 206 | loss_Q2 = F.mse_loss(current_Q2, target_Q) 207 | self.critic_2_optimizer.zero_grad() 208 | loss_Q2.backward() 209 | self.critic_2_optimizer.step() 210 | self.writer.add_scalar('Loss/Q2_loss', loss_Q2, global_step=self.num_critic_update_iteration) 211 | # Delayed policy updates: 212 | if i % args.policy_delay == 0: 213 | # Compute actor loss: 214 | actor_loss = - self.critic_1(state, self.actor(state)).mean() 215 | 216 | # Optimize the actor 217 | self.actor_optimizer.zero_grad() 218 | actor_loss.backward() 219 | self.actor_optimizer.step() 220 | self.writer.add_scalar('Loss/actor_loss', actor_loss, global_step=self.num_actor_update_iteration) 221 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 222 | target_param.data.copy_(((1- args.tau) * target_param.data) + args.tau * param.data) 223 | 224 | for param, target_param in zip(self.critic_1.parameters(), self.critic_1_target.parameters()): 225 | target_param.data.copy_(((1 - args.tau) * target_param.data) + args.tau * param.data) 226 | 227 | for param, target_param in zip(self.critic_2.parameters(), self.critic_2_target.parameters()): 228 | target_param.data.copy_(((1 - args.tau) * target_param.data) + args.tau * param.data) 229 | 230 | self.num_actor_update_iteration += 1 231 | self.num_critic_update_iteration += 1 232 | self.num_training += 1 233 | 234 | def save(self): 235 | torch.save(self.actor.state_dict(), directory+'actor.pth') 236 | torch.save(self.actor_target.state_dict(), directory+'actor_target.pth') 237 | torch.save(self.critic_1.state_dict(), directory+'critic_1.pth') 238 | torch.save(self.critic_1_target.state_dict(), directory+'critic_1_target.pth') 239 | torch.save(self.critic_2.state_dict(), directory+'critic_2.pth') 240 | torch.save(self.critic_2_target.state_dict(), directory+'critic_2_target.pth') 241 | print("====================================") 242 | print("Model has been saved...") 243 | print("====================================") 244 | 245 | def load(self): 246 | 
self.actor.load_state_dict(torch.load(directory + 'actor.pth')) 247 | self.actor_target.load_state_dict(torch.load(directory + 'actor_target.pth')) 248 | self.critic_1.load_state_dict(torch.load(directory + 'critic_1.pth')) 249 | self.critic_1_target.load_state_dict(torch.load(directory + 'critic_1_target.pth')) 250 | self.critic_2.load_state_dict(torch.load(directory + 'critic_2.pth')) 251 | self.critic_2_target.load_state_dict(torch.load(directory + 'critic_2_target.pth')) 252 | print("====================================") 253 | print("model has been loaded...") 254 | print("====================================") 255 | 256 | 257 | def main(): 258 | agent = TD3(state_dim, action_dim, max_action) 259 | ep_r = 0 260 | 261 | if args.mode == 'test': 262 | agent.load() 263 | for i in range(args.test_iteration): 264 | state = env.reset() 265 | for t in count(): 266 | action = agent.select_action(state) 267 | next_state, reward, done, info = env.step(np.float32(action)) 268 | ep_r += reward 269 | env.render() 270 | if done or t == 2000: 271 | print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t)) 272 | ep_r = 0 273 | break 274 | state = next_state 275 | 276 | 277 | elif args.mode == 'train': 278 | print("====================================") 279 | print("Collection Experience...") 280 | print("====================================") 281 | if args.load: agent.load() 282 | for i in range(args.num_iteration): 283 | state = env.reset() 284 | for t in range(2000): 285 | 286 | action = agent.select_action(state) 287 | action = action + np.random.normal(0, args.exploration_noise, size=env.action_space.shape[0]) 288 | action = action.clip(env.action_space.low, env.action_space.high) 289 | next_state, reward, done, info = env.step(action) 290 | ep_r += reward 291 | if args.render and i >= args.render_interval: env.render() 292 | agent.memory.push((state, next_state, action, reward, float(done))) 293 | if (i + 1) % 10 == 0: 294 | print('Episode {}, The memory size is {} '.format(i, len(agent.memory.storage))) 295 | if len(agent.memory.storage) >= args.capacity-1: 296 | agent.update(10) 297 | 298 | state = next_state 299 | if done or t == args.max_episode - 1: 300 | agent.writer.add_scalar('ep_r', ep_r, global_step=i) 301 | if i % args.print_log == 0: 302 | print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t)) 303 | ep_r = 0 304 | break 305 | 306 | if i % args.log_interval == 0: 307 | agent.save() 308 | 309 | else: 310 | raise NameError("mode wrong!!!") 311 | 312 | if __name__ == '__main__': 313 | main() 314 | -------------------------------------------------------------------------------- /Char10 TD3/TD3_Pendulum-v0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/TD3_Pendulum-v0.png -------------------------------------------------------------------------------- /Char10 TD3/expTD3.pyPendulum-v0./actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3.pyPendulum-v0./actor.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3.pyPendulum-v0./actor_target.pth: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3.pyPendulum-v0./actor_target.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3.pyPendulum-v0./critic_1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3.pyPendulum-v0./critic_1.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3.pyPendulum-v0./critic_1_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3.pyPendulum-v0./critic_1_target.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3.pyPendulum-v0./critic_2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3.pyPendulum-v0./critic_2.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3.pyPendulum-v0./critic_2_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3.pyPendulum-v0./critic_2_target.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./actor.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./actor_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./actor_target.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_1.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_1_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_1_target.pth -------------------------------------------------------------------------------- 
/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_2.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_2_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_2_target.pth -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Johnny He 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /More/Application in real world/README.md: -------------------------------------------------------------------------------- 1 | # In this repo we introduce some applications of RL in the real world 2 | 3 | 4 | This [post](https://towardsdatascience.com/advanced-reinforcement-learning-6d769f529eb3) gives you an idea of the applications of reinforcement learning in industry. 5 | 6 | Here I upload some papers about this topic. 7 | -------------------------------------------------------------------------------- /More/MARL/README.md: -------------------------------------------------------------------------------- 1 | # Multi-agent reinforcement learning 2 | 3 | Multi-Agent Reinforcement Learning is a very interesting research area, which has strong connections with single-agent RL, multi-agent systems, game theory, evolutionary computation and optimization theory. 4 | 5 | 6 | 1. Learning to Communicate with Deep Multi-Agent Reinforcement Learning in PyTorch 7 | 8 | You can click [here](https://github.com/sweetice/learning-to-communicate-pytorch) to read the code. 9 | 10 | 11 | 12 | 2. MARL papers 13 | 14 | Paper list of multi-agent reinforcement learning (MARL) 15 | 16 | You can click [here](https://github.com/LantaoYu/MARL-Papers) to read the repo.
17 | -------------------------------------------------------------------------------- /More/plot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | import re 6 | import os 7 | sns.set(style='darkgrid') 8 | 9 | def get_info(filename): 10 | filename = filename.replace('.npy', '') # remove the .npy extension 11 | algo, env, seed = re.split('_', filename) 12 | seed = int(seed) 13 | return algo, env, seed 14 | 15 | 16 | def get_file_name(path='./'): 17 | file_names = [] 18 | for _, __, file_name in os.walk(path): 19 | file_names += file_name 20 | data_name = [f for f in file_names if '.npy' in f] 21 | return data_name 22 | 23 | def exact_data(file_name, steps): 24 | ''' 25 | extract data from a single .npy file 26 | :param file_name: path of the .npy result file 27 | :return: a DataFrame including time, seed, algo_name, avg_reward 28 | ''' 29 | avg_reward = np.load(file_name).reshape(-1, 1) 30 | algo, env_name, seed = get_info(file_name) 31 | df = pd.DataFrame(avg_reward) 32 | df.columns = ['Average Return'] 33 | df['Time Steps (1e6)'] = steps 34 | df['Algorithm'] = algo 35 | df['env'] = env_name 36 | df['seed'] = seed 37 | return df 38 | 39 | 40 | if __name__ == '__main__': 41 | file_names = get_file_name('./') 42 | _, env_name, __ = get_info(file_names[0]) 43 | df = pd.DataFrame([]) 44 | steps = np.linspace(0, 1, 201) 45 | for file in file_names: 46 | data = exact_data(file, steps) 47 | df = pd.concat([df, data], axis=0) 48 | sns.lineplot(x='Time Steps (1e6)', y='Average Return', data=df, hue='Algorithm', ci=90) 49 | plt.title(env_name) 50 | plt.savefig(env_name + '.svg') 51 | plt.show() 52 | -------------------------------------------------------------------------------- /More/readme.md: -------------------------------------------------------------------------------- 1 | # More 2 | 3 | This folder gives you more insights about RL. 4 | -------------------------------------------------------------------------------- /figures/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/figures/test.png -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | **Status:** Active (under active development, breaking changes may occur) 2 | 3 | This repository implements classic and state-of-the-art deep reinforcement learning algorithms. Its aim is to provide clear PyTorch code for people to learn these algorithms. 4 | 5 | In the future, more state-of-the-art algorithms will be added and the existing code will also be maintained.
6 | 7 | ![demo](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/figures/grid.gif) 8 | 9 | ## Requirements 10 | - python <=3.6 11 | - tensorboardX 12 | - gym >= 0.10 13 | - pytorch >= 0.4 14 | 15 | **Note that TensorFlow 1.12 (installed below) does not support Python 3.7** 16 | 17 | ## Installation 18 | 19 | ``` 20 | pip install -r requirements.txt 21 | ``` 22 | 23 | If that fails: 24 | 25 | - Install gym 26 | 27 | ``` 28 | pip install gym 29 | ``` 30 | 31 | 32 | 33 | - Install PyTorch 34 | ```bash 35 | please go to the official website to install it: https://pytorch.org/ 36 | 37 | We recommend using an Anaconda virtual environment to manage your packages 38 | 39 | ``` 40 | 41 | - Install tensorboardX 42 | ```bash 43 | pip install tensorboardX 44 | pip install tensorflow==1.12 45 | ``` 46 | 47 | - Test 48 | ``` 49 | cd Char10\ TD3/ 50 | python TD3_BipedalWalker-v2.py --mode test 51 | ``` 52 | 53 | You should see a BipedalWalker window if the installation succeeded. 54 | 55 | BipedalWalker: 56 | 57 | ![](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/figures/test.png) 58 | 59 | - Install openai-baselines (**Optional**) 60 | 61 | ```bash 62 | # clone the openai baselines 63 | git clone https://github.com/openai/baselines.git 64 | cd baselines 65 | pip install -e . 66 | 67 | ``` 68 | 69 | ## DQN 70 | 71 | Here I uploaded two DQN models, trained on CartPole-v0 and MountainCar-v0. 72 | 73 | ### Tips for MountainCar-v0 74 | 75 | This is a sparse binary reward task: only when the car reaches the top of the mountain is there a non-zero reward. In general it may take on the order of 1e5 steps with a stochastic policy. You can add a shaping reward term, for example one that is positively related to the car's current position. Of course, a more advanced approach is inverse reinforcement learning. 76 | 77 | ![value_loss](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char01%20DQN/DQN/pic/value_loss.jpg) 78 | ![step](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char01%20DQN/DQN/pic/finish_episode.jpg) 79 | This is the value loss for DQN. We can see that the loss increased to about 1e13, yet the network still works well. This is because the target_net and act_net drift apart as training goes on, so the computed loss grows large; the earlier loss was small because the reward was very sparse, resulting in only small updates to the two networks. 80 | 81 | ### Papers Related to the DQN 82 | 83 | 84 | 1. Playing Atari with Deep Reinforcement Learning [[arxiv]](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/1.dqn.ipynb) 85 | 2. Deep Reinforcement Learning with Double Q-learning [[arxiv]](https://arxiv.org/abs/1509.06461) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/2.double%20dqn.ipynb) 86 | 3. Dueling Network Architectures for Deep Reinforcement Learning [[arxiv]](https://arxiv.org/abs/1511.06581) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/3.dueling%20dqn.ipynb) 87 | 4. Prioritized Experience Replay [[arxiv]](https://arxiv.org/abs/1511.05952) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/4.prioritized%20dqn.ipynb) 88 | 5. Noisy Networks for Exploration [[arxiv]](https://arxiv.org/abs/1706.10295) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/5.noisy%20dqn.ipynb) 89 | 6. 
A Distributional Perspective on Reinforcement Learning [[arxiv]](https://arxiv.org/pdf/1707.06887.pdf) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/6.categorical%20dqn.ipynb) 90 | 7. Rainbow: Combining Improvements in Deep Reinforcement Learning [[arxiv]](https://arxiv.org/abs/1710.02298) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/7.rainbow%20dqn.ipynb) 91 | 8. Distributional Reinforcement Learning with Quantile Regression [[arxiv]](https://arxiv.org/pdf/1710.10044.pdf) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/8.quantile%20regression%20dqn.ipynb) 92 | 9. Hierarchical Deep Reinforcement Learning: Integrating Temporal Abstraction and Intrinsic Motivation [[arxiv]](https://arxiv.org/abs/1604.06057) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/9.hierarchical%20dqn.ipynb) 93 | 10. Neural Episodic Control [[arxiv]](https://arxiv.org/pdf/1703.01988.pdf) [[code]](#) 94 | 95 | 96 | ## Policy Gradient 97 | 98 | 99 | Use the following command to run a saved model 100 | 101 | 102 | ``` 103 | python Run_Model.py 104 | ``` 105 | 106 | 107 | Use the following command to train the model 108 | 109 | 110 | ``` 111 | python pytorch_MountainCar-v0.py 112 | ``` 113 | 114 | 115 | 116 | > policyNet.pkl 117 | 118 | This is a model that I have trained. 119 | 120 | 121 | ## Actor-Critic 122 | 123 | This is an algorithmic framework, and the classic REINFORCE method is stored under Actor-Critic. 124 | 125 | ## DDPG 126 | Episode reward in Pendulum-v0: 127 | 128 | ![ep_r](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char05%20DDPG/DDPG_exp.jpg) 129 | 130 | 131 | ## PPO 132 | 133 | - Original paper: https://arxiv.org/abs/1707.06347 134 | - OpenAI Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 135 | 136 | 137 | ## A2C 138 | 139 | Advantage Actor-Critic (A2C): a paper in 2017 pointed out that the difference in performance between A2C and A3C is not obvious. 140 | 141 | The Asynchronous Advantage Actor Critic method (A3C) has been very influential since the paper was published. The algorithm combines a few key ideas: 142 | 143 | - An updating scheme that operates on fixed-length segments of experience (say, 20 timesteps) and uses these segments to compute estimators of the returns and advantage function (a minimal sketch of this computation is given after this list). 144 | - Architectures that share layers between the policy and value function. 145 | - Asynchronous updates.
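A minimal sketch of the segment-based estimators described above (illustrative only: the function name, the toy numbers, and the plain-NumPy style are assumptions of this sketch, not code taken from `A2C.py`). One fixed-length rollout segment is turned into discounted returns by bootstrapping from the critic's value of the state that follows the segment, and the critic's predictions are subtracted to obtain advantages:

```python
import numpy as np

# Illustrative sketch, not taken from A2C.py in this repo.
def segment_returns_and_advantages(rewards, values, bootstrap_value, dones, gamma=0.99):
    """Discounted returns and advantages for one fixed-length rollout segment."""
    T = len(rewards)
    returns = np.zeros(T, dtype=np.float32)
    running = bootstrap_value                      # V(s_T) predicted by the critic
    for t in reversed(range(T)):
        # Zero the bootstrap wherever an episode terminated inside the segment.
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    advantages = returns - values                  # baseline-subtracted signal for the policy gradient
    return returns, advantages

# Toy 4-step segment, purely for illustration:
rewards = np.array([1.0, 0.0, 0.0, 1.0], dtype=np.float32)
values = np.array([0.5, 0.4, 0.3, 0.2], dtype=np.float32)   # critic predictions V(s_t)
dones = np.zeros(4, dtype=np.float32)
returns, advantages = segment_returns_and_advantages(rewards, values, 0.1, dones)
print(returns, advantages)
```

Masking the bootstrap with `(1 - done)` keeps returns from leaking across episode boundaries when several short episodes fall inside one segment.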
146 | 147 | ## A3C 148 | 149 | Original paper: https://arxiv.org/abs/1602.01783 150 | 151 | ## SAC 152 | 153 | **This is not the paper authors' implementation!** 154 | 155 | Episode reward in Pendulum-v0: 156 | 157 | ![ep_r](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char09%20SAC/SAC_ep_r_curve.png) 158 | 159 | ## TD3 160 | 161 | **This is not the paper authors' implementation!** 162 | 163 | Episode reward in Pendulum-v0: 164 | 165 | ![ep_r](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char10%20TD3/TD3_Pendulum-v0.png) 166 | 167 | Episode reward in BipedalWalker-v2: 168 | ![ep_r](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char10%20TD3/Episode_reward_TD3_BipedakWalker.png) 169 | 170 | If you want to test your trained model: 171 | 172 | ``` 173 | python TD3_BipedalWalker-v2.py --mode test 174 | ``` 175 | 176 | ## Papers Related to Deep Reinforcement Learning 177 | [01] [A Brief Survey of Deep Reinforcement Learning](https://arxiv.org/abs/1708.05866) 178 | [02] [The Beta Policy for Continuous Control Reinforcement Learning](https://www.ri.cmu.edu/wp-content/uploads/2017/06/thesis-Chou.pdf) 179 | [03] [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) 180 | [04] [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461) 181 | [05] [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581) 182 | [06] [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) 183 | [07] [Continuous Deep Q-Learning with Model-based Acceleration](https://arxiv.org/abs/1603.00748) 184 | [08] [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/abs/1602.01783) 185 | [09] [Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477) 186 | [10] [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) 187 | [11] [Scalable trust-region method for deep reinforcement learning using Kronecker-factored approximation](https://arxiv.org/abs/1708.05144) 188 | [12] [High-Dimensional Continuous Control Using Generalized Advantage Estimation](https://arxiv.org/abs/1506.02438) 189 | [13] [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor](https://arxiv.org/abs/1801.01290) 190 | [14] [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/abs/1802.09477) 191 | 192 | ## TO DO 193 | - [x] DDPG 194 | - [x] SAC 195 | - [x] TD3 196 | 197 | 198 | # Best RL courses 199 | - [OpenAI's spinning up](https://spinningup.openai.com/) 200 | - [David Silver's course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 201 | - [Berkeley deep RL](http://rll.berkeley.edu/deeprlcourse/) 202 | - [Practical RL](https://github.com/yandexdataschool/Practical_RL) 203 | - [Deep Reinforcement Learning by Hung-yi Lee](https://www.youtube.com/playlist?list=PLJV_el3uVTsODxQFgzMzPLa16h6B8kWM_) 204 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.0 2 | torchvision 3 | tensorflow==1.15.2 4 | tensorboardX 5 | gym 6 | gym[atari] 7 | --------------------------------------------------------------------------------