├── README.md
└── algorithm
    ├── policy gradient
    │   ├── A2C.py
    │   ├── A3C
    │   │   ├── main.py
    │   │   ├── model.py
    │   │   └── utils.py
    │   ├── Actor_Critic.py
    │   ├── DDPG
    │   │   ├── experience_replay.py
    │   │   ├── main.py
    │   │   └── model.py
    │   ├── DDPG_discrete
    │   │   ├── experience_replay.py
    │   │   ├── gumbel_softmax.py
    │   │   ├── main.py
    │   │   └── model.py
    │   ├── REINFORCE.py
    │   ├── SAC
    │   │   ├── experience_replay.py
    │   │   ├── main.py
    │   │   └── model.py
    │   ├── TD3
    │   │   ├── experience_replay.py
    │   │   ├── main.py
    │   │   └── model.py
    │   ├── TRPO
    │   │   ├── main.py
    │   │   └── model.py
    │   └── baseline_REINFORCE.py
    └── value-based
        ├── DQN.py
        ├── DoubleDQN.py
        ├── DuelingDQN.py
        └── Sarsa.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Deep-RL-with-pytorch
Practice implementations of deep reinforcement learning algorithms by a beginner.
Test environments are Gym CartPole-v0 for discrete action spaces and Gym Pendulum-v0 for continuous action spaces.
Under active development.
Includes: DQN, REINFORCE, baseline REINFORCE, Actor-Critic, Double DQN, Dueling DQN, Sarsa, DDPG, DDPG for discrete action spaces, A2C, A3C, TD3, SAC, TRPO

## 2020-9-19 implement
### algorithm:
1. DQN
2. REINFORCE
### components:
1. experience replay
## 2020-9-20 implement
### algorithm:
1. baseline-REINFORCE
2. Actor-Critic

Add CUDA support
## 2021-1-15 implement
### algorithm:
1. Double DQN
2. Dueling DQN
## 2021-1-19 implement
### algorithm:
1. Sarsa
## 2021-1-23 implement
### algorithm:
1. DDPG
2. DDPG for discrete action spaces using the Gumbel-Softmax trick
## 2021-1-26 implement
### algorithm:
1. A2C
## 2021-1-27 implement
### algorithm:
1. A3C
## 2021-2-4 implement
### algorithm:
1. TD3
2. SAC
## 2021-2-25 implement
### algorithm:
1. TRPO (natural policy gradient)

Known issue: the Hessian matrix may not be positive definite at the beginning of training, but training usually still converges.
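A common mitigation for this (shown below only as a rough sketch, not necessarily how `TRPO/main.py` handles it; the function and argument names are illustrative) is to add a damping term to the Hessian-vector product inside the conjugate-gradient solver, so the linear system being solved stays positive definite:

```python
# Sketch: conjugate gradient with a damped Hessian-vector product.
# hvp_fn(v) is assumed to return H @ v (e.g. via double backprop through the KL).
import torch as th

def conjugate_gradient(hvp_fn, b, iters=10, damping=0.1, tol=1e-10):
    """Approximately solve (H + damping * I) x = b."""
    x = th.zeros_like(b)
    r = b.clone()                      # residual (x starts at zero, so r = b)
    p = b.clone()                      # initial search direction
    rs_old = th.dot(r, r)
    for _ in range(iters):
        Ap = hvp_fn(p) + damping * p   # damping keeps the system well conditioned
        alpha = rs_old / th.dot(p, Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        rs_new = th.dot(r, r)
        if rs_new < tol:
            break
        p = r + (rs_new / rs_old) * p
        rs_old = rs_new
    return x
```

Larger `damping` values give more conservative but numerically safer update directions.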
--------------------------------------------------------------------------------
/algorithm/policy gradient/A2C.py:
--------------------------------------------------------------------------------
import gym
import torch as th
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.categorical import Categorical

lr = 0.001
gamma = 0.9
hidden = 32
env = gym.make('CartPole-v0')
device = "cuda" if th.cuda.is_available() else "cpu"
env = env.unwrapped
n_action = env.action_space.n
n_state = env.observation_space.shape[0]


class actor(nn.Module):  # policy network: state -> action probabilities
    def __init__(self):
        super(actor, self).__init__()
        self.fc1 = nn.Linear(n_state, hidden)
        self.fc2 = nn.Linear(hidden, n_action)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        prob = self.softmax(x)
        return prob


class Q(nn.Module):  # action-value network: state -> Q(s, a) for every action
    def __init__(self):
        super(Q, self).__init__()
        self.q = nn.Sequential(nn.Linear(n_state, hidden),
                               nn.ReLU(),
                               nn.Linear(hidden, n_action))

    def forward(self, x):
        return self.q(x)


class V(nn.Module):  # state-value network: state -> V(s)
    def __init__(self):
        super(V, self).__init__()
        self.v = nn.Sequential(nn.Linear(n_state, hidden),
                               nn.ReLU(),
                               nn.Linear(hidden, 1))

    def forward(self, x):
        return self.v(x)


class critic(nn.Module):  # advantage A(s, a) = Q(s, a) - V(s)
    def __init__(self):
        super(critic, self).__init__()
        self.v = V()
        self.q = Q()

    def forward(self, x):
        v = self.v(x)
        q = self.q(x)
        advantage = q - v  # V(s) broadcasts across all actions
        return advantage


class AC():
    def __init__(self):
        self.actor = actor().to(device)
        self.critic = critic().to(device)

        self.Aoptimizer = th.optim.Adam(self.actor.parameters(), lr=lr)
        self.Qoptimizer = th.optim.Adam(self.critic.q.parameters(), lr=lr)
        self.Voptimizer = th.optim.Adam(self.critic.v.parameters(), lr=lr)

    def choose_action(self, s):
        s = th.FloatTensor(s).to(device)
        a_prob = self.actor(s)
        dist = Categorical(a_prob)
        action = dist.sample().item()
        return action

    def actor_learn(self, s, a, A):
        # policy gradient step: maximize log pi(a|s) * advantage
        s = th.FloatTensor(s).to(device)
        a_prob = self.actor(s)[a]
        loss = -(th.log(a_prob) * A.detach())

        self.Aoptimizer.zero_grad()
        loss.backward()
        self.Aoptimizer.step()

    def critic_learn(self, transition):  # transition = [s, [r], [a], s_, [done]]
        s = th.FloatTensor(transition[0]).to(device)
        r = transition[1][0]
        s_ = th.FloatTensor(transition[3]).to(device)
        done = transition[4][0]
        a = transition[2][0]

        q = self.critic.q(s)[a]
        v = self.critic.v(s)
        A = q - v  # advantage passed back to the actor

        if not done:
            # bootstrap both targets from the next state
            q_target = th.max(self.critic.q(s_)) * gamma + r
            v_target = self.critic.v(s_) * gamma + r
            loss_q = (q - q_target.detach()) ** 2
            loss_v = (v - v_target.detach()) ** 2
        else:
            # terminal state: the target is just the reward
            q_target = r
            v_target = r
            loss_q = (q - q_target) ** 2
            loss_v = (v - v_target) ** 2

        self.Qoptimizer.zero_grad()
        loss_q.backward()
        self.Qoptimizer.step()
        self.Voptimizer.zero_grad()
        loss_v.backward()
        self.Voptimizer.step()
        return A


ac = AC()

for episode in range(10000):
    t = 0
    s = env.reset()
    total_reward = 0
    while t < 300:
        t += 1
        a = ac.choose_action(s)
        s_, r, done, _ = env.step(a)
        total_reward += r
        transition = [s, [r], [a], s_, [done]]

        A = ac.critic_learn(transition)
        ac.actor_learn(s, a, A)
        if done:
            break
        s = s_
    if episode % 10 == 0:
        print("Episode:" + format(episode) + ",score:" + format(total_reward))
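Note (not part of A2C.py above): many A2C implementations drop the separate Q network and estimate the advantage with a one-step TD error from a single value network, A(s, a) ≈ r + gamma * V(s') - V(s). A minimal sketch of that alternative, assuming `v_net` maps a state tensor to a scalar value (names are illustrative only):

```python
import torch as th

def td_error_advantage(v_net, s, r, s_, done, gamma=0.9):
    """One-step TD-error advantage; detach it before using it in the actor loss."""
    with th.no_grad():
        bootstrap = 0.0 if done else gamma * v_net(s_)
    return r + bootstrap - v_net(s)
```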
--------------------------------------------------------------------------------
/algorithm/policy gradient/A3C/main.py:
--------------------------------------------------------------------------------
import gym
import torch as th
import torch.multiprocessing as mp

from utils import SharedAdam
from model import Worker, Net

if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    n_action = env.action_space.n
    n_state = env.observation_space.shape[0]

    # global (shared) network: workers read its weights and push gradients into it
    global_net = Net(n_state, n_action)
    global_net.share_memory()
    optA = SharedAdam(global_net.policy.parameters(), lr=1e-4, betas=(0.92, 0.999))  # actor optimizer
    optC = SharedAdam(global_net.v.parameters(), lr=1e-4, betas=(0.92, 0.999))       # critic optimizer

    workers = [Worker(global_net, optA, optC, str(i)) for i in range(8)]
    [w.start() for w in workers]
    [w.join() for w in workers]
--------------------------------------------------------------------------------
/algorithm/policy gradient/A3C/model.py:
--------------------------------------------------------------------------------
import gym
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp

from utils import update_global

max_lstep = 200        # cap on environment steps per local rollout
update_interval = 10   # push local gradients to the global net every N steps


class Net(nn.Module):
    def __init__(self, n_state, n_action):
        super(Net, self).__init__()
        self.n_state = n_state
        self.n_action = n_action
        # state-value head
        self.v = nn.Sequential(
            nn.Linear(n_state, 256),
            nn.ReLU(),
            nn.Linear(256, 1))
        # policy head
        self.policy = nn.Sequential(
            nn.Linear(n_state, 256),
            nn.ReLU(),
            nn.Linear(256, n_action))

    def forward(self, x):
        value = self.v(x)
        prob = F.softmax(self.policy(x), dim=-1)
        return value, prob


class Worker(mp.Process):
    def __init__(self, gnet, optA, optC, name):
        super(Worker, self).__init__()
        self.name = 'Worker ' + name
        self.optA = optA
        self.optC = optC
        self.env = gym.make('CartPole-v0')
        # local copy of the network, periodically synced with the global net
        self.lnet = Net(self.env.observation_space.shape[0], self.env.action_space.n)
        self.gnet = gnet
        self.queue = []
        self.max_episode = 1000
        self.cur_episode = 0

    def choose_action(self, s):
        _, prob = self.lnet(th.Tensor(s))
        dist = th.distributions.categorical.Categorical(prob)
        a = dist.sample().item()
        return a

    def run(self):
        buffer_a, buffer_s, buffer_r, buffer_s_, buffer_d = [], [], [], [], []
        while self.cur_episode