├── .DS_Store ├── .idea ├── .gitignore ├── DRL_pytorch.iml ├── deployment.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── other.xml ├── vcs.xml └── webServers.xml ├── Actor_Critic ├── A3C │ ├── Pendulum_A3C_1.png │ ├── __pycache__ │ │ ├── agent_a3c.cpython-37.pyc │ │ ├── agent_a3c.cpython-38.pyc │ │ ├── untils.cpython-37.pyc │ │ ├── untils.cpython-38.pyc │ │ ├── worker.cpython-37.pyc │ │ └── worker.cpython-38.pyc │ ├── a3c_main.py │ ├── agent_a3c.py │ ├── untils.py │ └── worker.py └── SAC │ ├── sac_agent.py │ ├── sac_main.py │ └── sac_network.py ├── BlackBox_optimazation ├── Hill_Climbing │ ├── __pycache__ │ │ └── agent_HC.cpython-36.pyc │ ├── agent_HC.py │ └── main_hillClimb.py └── cross_entropy_method │ ├── CEM.png │ ├── __pycache__ │ └── agent_cem.cpython-37.pyc │ ├── agent_cem.py │ ├── checkpoint.pth │ └── main_cem.py ├── DDPGs ├── DDPG │ ├── DDPG_agent.py │ ├── DDPG_main.py │ ├── DDPG_model.py │ ├── ddpg_1.py │ └── model_save │ │ ├── actor1.pth │ │ ├── actor2.pth │ │ ├── checkpoint_actor.pth │ │ ├── checkpoint_critic.pth │ │ ├── critic1.pth │ │ └── critic2.pth └── TD3 │ ├── TD3_agent.py │ ├── TD3_main.py │ ├── TD3_model.py │ ├── TD3_new.py │ ├── TD3_solved.png │ ├── __pycache__ │ ├── TD3_model.cpython-38.pyc │ └── TD3_new.cpython-38.pyc │ ├── models │ ├── TD3_actor.pth │ └── TD3_critic.pth │ ├── scores_saved.csv │ └── test.py ├── DQNs ├── .DS_Store ├── DDQN │ ├── .DS_Store │ ├── DQN_main.py │ ├── Deep_Q_Network.ipynb │ ├── __pycache__ │ │ ├── ddqn_v3.cpython-38.pyc │ │ └── model_dueling.cpython-38.pyc │ ├── ddqn_v1.py │ ├── ddqn_v2.py │ ├── ddqn_v3.py │ ├── dqn.py │ ├── images │ │ ├── Total Average reward scores plot.png │ │ ├── ddqn_agent_scores.png │ │ ├── ddqn_testing_scores.png │ │ ├── double_dqn_v1.png │ │ ├── dueling-ddqn_testing.png │ │ ├── dueling-ddqn_training.png │ │ ├── runningResult.png │ │ └── runningResult_1.png │ ├── model.py │ ├── model_dueling.py │ ├── models │ │ ├── checkpoint.pth │ │ ├── dueling_model.pth │ │ └── org_dqn.pth │ ├── old_agent.py │ ├── play_env.py │ └── test.py ├── DQN_PER │ ├── .DS_Store │ ├── PER_memory.py │ ├── Plots │ │ ├── cnn_per.png │ │ ├── epsilon_1.png │ │ ├── epsilon_2.png │ │ ├── epsilon_exp-1.png │ │ ├── epsilon_exp-2.png │ │ ├── epsilon_exp-3.png │ │ ├── epsilon_linear-1.png │ │ ├── train_1.png │ │ ├── train_2.png │ │ ├── train_DQN_per.png │ │ ├── train_exp-1.png │ │ ├── train_exp-2.png │ │ ├── train_exp-3.png │ │ ├── train_exp.png │ │ └── train_linear-1.png │ ├── SumTree.py │ ├── __pycache__ │ │ ├── PER_memory.cpython-38.pyc │ │ ├── SumTree.cpython-38.pyc │ │ ├── atari_wappers.cpython-38.pyc │ │ ├── dqn_model.cpython-38.pyc │ │ └── dqn_per.cpython-38.pyc │ ├── atari_wappers.py │ ├── dqn_model.py │ ├── dqn_per.py │ ├── main_dqn_per.py │ └── train_20210520.log ├── DQN_cnn │ ├── .DS_Store │ ├── Models │ │ ├── CNN_model|03-29#19:21.pth │ │ ├── CNN_model|03-30#11:19.pth │ │ ├── CNN_model|03-30#21:05.pth │ │ ├── CNN_model|03-31#19:32.pth │ │ ├── dqnCNN_model_0324.pth │ │ └── dqn_model.pth │ ├── Plots │ │ ├── test-score|03-25#20:00.png │ │ ├── test-score|03-26#09:15.png │ │ ├── test-score|03-26#09:45.png │ │ ├── train-score|03-29#19:21.png │ │ ├── train-score|03-30#11:19.png │ │ ├── train-score|03-30#21:05.png │ │ └── train-score|03-31#19:32.png │ ├── ReadMe.md │ ├── __pycache__ │ │ ├── atari_wappers.cpython-38.pyc │ │ ├── cnn_model.cpython-38.pyc │ │ └── dqn_agent.cpython-38.pyc │ ├── atari_wappers.py │ ├── cnn_model.py │ ├── dqn_agent.py │ ├── image │ │ ├── pic-0.jpg │ │ ├── pic-100.jpg │ │ ├── 
pic-140.jpg │ │ ├── pic-152.jpg │ │ ├── pic-167.jpg │ │ ├── pic-185.jpg │ │ ├── pic-200.jpg │ │ ├── pic-204.jpg │ │ ├── pic-227.jpg │ │ ├── pic-300.jpg │ │ ├── pic-400.jpg │ │ ├── pic-500.jpg │ │ ├── pic-600.jpg │ │ ├── pic-674.jpg │ │ ├── pic-683.jpg │ │ ├── pic-696.jpg │ │ ├── pic-700.jpg │ │ ├── pic-714.jpg │ │ ├── pic-733.jpg │ │ ├── pic-756.jpg │ │ ├── pic-800.jpg │ │ ├── pic-900.jpg │ │ ├── pic-902.jpg │ │ ├── pic-909.jpg │ │ ├── pic-920.jpg │ │ ├── pic-936.jpg │ │ └── pic-956.jpg │ ├── log │ │ ├── train_20210326.log │ │ ├── train_20210329.log │ │ ├── train_20210329_1.log │ │ ├── train_20210330.log │ │ └── train_20210331.log │ ├── main_dqn_atari.py │ ├── main_test.py │ ├── play_atari.py │ └── train_20210401.log └── DQN_iws │ └── ref_ddqn_iws.py ├── Evaluation_Algorithms └── CartPole.py ├── Games_play_train └── atari.py └── Policy_Gradient ├── .DS_Store ├── PGs ├── __pycache__ │ ├── agent_PG.cpython-37.pyc │ ├── model.cpython-36.pyc │ └── model.cpython-37.pyc ├── agent_PG.py ├── main_PG.py ├── model.py └── models │ ├── PPO_model-1.pth │ ├── PPO_new.pth │ ├── PPOv2_model-1.pth │ ├── pg_model_1.pth │ ├── pg_model_2.pth │ ├── pg_model_3.pth │ ├── pg_model_4.pth │ ├── reinforce_model_2.pth │ ├── reinforce_model_3.pth │ ├── reinforce_model_4.pth │ ├── reinforce_model_5.pth │ └── reinforce_model_6.pth ├── PPO ├── .DS_Store ├── PPO_model.py ├── PPO_v1.py ├── PPO_v2.py ├── __pycache__ │ ├── PPO_model.cpython-37.pyc │ ├── PPO_model.cpython-38.pyc │ ├── PPO_v1.cpython-38.pyc │ └── PPO_v2.cpython-38.pyc ├── board │ ├── .DS_Store │ └── logs │ │ ├── .DS_Store │ │ ├── events.out.tfevents.1608693869.bogon.80327.0 │ │ ├── events.out.tfevents.1608694041.bogon.80355.0 │ │ ├── events.out.tfevents.1608778854.bogon.82580.0 │ │ ├── events.out.tfevents.1608779119.bogon.82611.0 │ │ ├── events.out.tfevents.1608779166.bogon.82627.0 │ │ ├── events.out.tfevents.1608779638.bogon.82655.0 │ │ ├── events.out.tfevents.1608779657.bogon.82666.0 │ │ ├── events.out.tfevents.1608780330.bogon.82692.0 │ │ └── events.out.tfevents.1608780689.bogon.82718.0 ├── cartPole_ppo-v1_1.png ├── main_PPO.py └── models │ ├── PPO_new.pth │ ├── cartPole_ppo.pth │ └── cartPole_ppo_20201222.pth ├── PPO_cnn └── cnn_ppo.py ├── envTest.py └── results&plots ├── .DS_Store ├── PPO-A vs. 
PPO-R.png ├── PPO-A vs.PPO-R_1.png ├── PPO-A_train_5_times.png ├── PPO-A_train_5_times1.png ├── PPO-A_train_5times_2.png ├── PPO-entropy_5times.png ├── PPO_cartPole_20201222.png ├── PPO_cartPole_test.png ├── PPO_cartPole_train.png ├── PPO_comparison.png ├── PPO_comparison_1.png ├── PPO_comparison_2.png ├── PPO_comparison_3.png ├── PPO_comparison_4.png ├── PPO_multiple_1.png ├── PPO_with_entropy.png ├── PPO_with_entropy_1.png ├── cartpole_reinforce.png ├── diff_algorithm.png ├── pg_1.png ├── pg_2.png ├── pg_3.png ├── reinforce_1.png ├── reinforce_2.png ├── reinforce_3.png ├── reinforce_4.png ├── reinforce_5.png ├── reinforce_vs_pg.png ├── rf-vs-pg_1.png ├── rf-vs-pg_2.png ├── rf-vs-pg_3.png └── rf-vs-pg_4.png /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/.DS_Store -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/DRL_pytorch.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/webServers.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | -------------------------------------------------------------------------------- /Actor_Critic/A3C/Pendulum_A3C_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/Pendulum_A3C_1.png -------------------------------------------------------------------------------- /Actor_Critic/A3C/__pycache__/agent_a3c.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/agent_a3c.cpython-37.pyc -------------------------------------------------------------------------------- /Actor_Critic/A3C/__pycache__/agent_a3c.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/agent_a3c.cpython-38.pyc -------------------------------------------------------------------------------- /Actor_Critic/A3C/__pycache__/untils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/untils.cpython-37.pyc -------------------------------------------------------------------------------- /Actor_Critic/A3C/__pycache__/untils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/untils.cpython-38.pyc -------------------------------------------------------------------------------- /Actor_Critic/A3C/__pycache__/worker.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/worker.cpython-37.pyc -------------------------------------------------------------------------------- /Actor_Critic/A3C/__pycache__/worker.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/worker.cpython-38.pyc -------------------------------------------------------------------------------- /Actor_Critic/A3C/a3c_main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from Actor_Critic.A3C.agent_a3c import A3C 5 | 6 | 7 | def get_env_prop(env_name, continuous): 8 | env = gym.make(env_name) 9 | state_dim = env.observation_space.shape[0] 10 | if continuous: 11 | action_dim = env.action_space.shape[0] 12 | else: 13 | action_dim = env.action_space.n 14 | 15 | return env,state_dim, action_dim 16 | 17 | 18 | def train_a3c(env_name,continuous): 19 | env,state_size,action_size = get_env_prop(env_name,continuous) 20 | agent = A3C(env,continuous,state_size,action_size) 21 | scores = agent.train_worker() 22 | return scores 23 | 24 | 25 | def train_agent_for_env(env_name,continuous): 26 | env = gym.make(env_name) 27 | 28 | state_dim = env.observation_space.shape[0] 29 | if continuous: 30 | action_dim = env.action_space.shape[0] 31 | else: 32 | action_dim = env.action_space.n 33 | 34 | agent = A3C(env, continuous,state_dim,action_dim) 35 | scores = agent.train_worker() 36 | 37 | return agent,scores 38 | 39 | 40 | def plot_scores(scores,filename): 41 | fig = plt.figure() 42 | ax = fig.add_subplot(111) 43 | plt.plot(np.arange(1, len(scores) + 1), scores) 44 | plt.ylabel('Score') 45 | plt.xlabel('Episode #') 46 
| plt.savefig(filename) 47 | plt.show() 48 | 49 | 50 | if __name__ == "__main__": 51 | # env = gym.make("Pendulum-v0") 52 | # train_scores = train_a3c(env,True) 53 | 54 | # train A3C on discrete env : CartPole 55 | scores_cartPole = train_agent_for_env("CartPole-v0",False) 56 | plot_scores(scores_cartPole,"cartPole_trainPlot.png") 57 | 58 | # train A3C on continuous env : continuous 59 | # a3c_mCar = train_agent_for_env("MountainCarContinuous-v0", True) 60 | 61 | -------------------------------------------------------------------------------- /Actor_Critic/A3C/agent_a3c.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import torch.optim as optim 4 | import multiprocessing as mp 5 | from multiprocessing import Process 6 | from Actor_Critic.A3C.untils import ValueNetwork,ActorDiscrete,ActorContinous 7 | from Actor_Critic.A3C.worker import Worker 8 | 9 | GAMMA = 0.9 10 | LR = 1e-4 11 | GLOBAL_MAX_EPISODE = 5000 12 | 13 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 14 | 15 | 16 | class A3C(): 17 | def __init__(self,env,continuous,state_size,action_size): 18 | self.max_episode=GLOBAL_MAX_EPISODE 19 | self.global_episode = mp.Value('i', 0) # 进程之间共享的变量 20 | self.global_epi_rew = mp.Value('d',0) 21 | self.rew_queue = mp.Queue() 22 | self.worker_num = mp.cpu_count() 23 | 24 | # define the global networks 25 | self.global_valueNet= ValueNetwork(state_size,1).to(device) 26 | # global 的网络参数放入 shared memory,以便复制给各个进程中的 worker网络 27 | self.global_valueNet.share_memory() 28 | 29 | if continuous: 30 | self.global_policyNet = ActorContinous(state_size, action_size).to(device) 31 | else: 32 | self.global_policyNet = ActorDiscrete(state_size, action_size).to(device) 33 | self.global_policyNet.share_memory() 34 | 35 | # global optimizer 36 | self.global_optimizer_policy = optim.Adam(self.global_policyNet.parameters(), lr=LR) 37 | self.global_optimizer_value = optim.Adam(self.global_valueNet.parameters(),lr=LR) 38 | 39 | # define the workers 40 | self.workers=[Worker(env,continuous,state_size,action_size,i, 41 | self.global_valueNet,self.global_optimizer_value, 42 | self.global_policyNet,self.global_optimizer_policy, 43 | self.global_episode,self.global_epi_rew,self.rew_queue, 44 | self.max_episode,GAMMA) 45 | for i in range(self.worker_num)] 46 | 47 | def train_worker(self): 48 | scores=[] 49 | [w.start() for w in self.workers] 50 | while True: 51 | r = self.rew_queue.get() 52 | if r is not None: 53 | scores.append(r) 54 | else: 55 | break 56 | [w.join() for w in self.workers] 57 | 58 | return scores 59 | 60 | def save_model(self): 61 | torch.save(self.global_valueNet.state_dict(), "a3c_value_model.pth") 62 | torch.save(self.global_policyNet.state_dict(), "a3c_policy_model.pth") 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /Actor_Critic/A3C/untils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from collections import namedtuple 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.distributions import Categorical 6 | from torch.distributions import Normal 7 | 8 | 9 | class ValueNetwork(nn.Module): 10 | 11 | def __init__(self, input_dim, output_dim): 12 | super(ValueNetwork, self).__init__() 13 | self.fc1 = nn.Linear(input_dim, 256) 14 | self.fc2 = nn.Linear(256, output_dim) 15 | 16 | def forward(self, state): 17 | value = F.relu(self.fc1(state)) 18 | value = 
self.fc2(value) 19 | 20 | return value 21 | 22 | 23 | class ActorDiscrete(nn.Module): 24 | """ 25 | 用于离散动作空间的策略网络 26 | """ 27 | def __init__(self,state_size,action_size): 28 | super(ActorDiscrete, self).__init__() 29 | self.seed = torch.manual_seed(0) 30 | self.fc1 = nn.Linear(state_size, 128) 31 | # self.fc2 = nn.Linear(64,128) 32 | self.fc2= nn.Linear(128, action_size) 33 | 34 | def forward(self, x): 35 | """ 36 | Build a network that maps state -> action probs. 37 | """ 38 | 39 | x=F.relu(self.fc1(x)) 40 | out = F.softmax(self.fc2(x),dim=1) 41 | return out 42 | 43 | def act(self,state): 44 | """ 45 | 返回 action 和 action的概率 46 | """ 47 | # probs for each action (2d tensor) 48 | probs = self.forward(state) 49 | m = Categorical(probs) 50 | action = m.sample() 51 | 52 | # return action for current state, and the corresponding probability 53 | return action.item(),probs[:,action.item()].item() 54 | 55 | 56 | class ActorContinous(nn.Module): 57 | """ 58 | 用于连续动作空间的策略网络 59 | """ 60 | def __init__(self,state_size,action_size): 61 | super(ActorContinous, self).__init__() 62 | self.fc1 = nn.Linear(state_size, 128) 63 | self.fc2 = nn.Linear(128,128) 64 | self.mu_head = nn.Linear(128, action_size) 65 | self.sigma_head = nn.Linear(128, action_size) 66 | 67 | def forward(self, x): 68 | x = F.relu(self.fc1(x)) 69 | x = F.relu(self.fc2(x)) 70 | mu = 2.0 * torch.tanh(self.mu_head(x)) 71 | sigma = F.softplus(self.sigma_head(x)) 72 | return (mu, sigma) 73 | 74 | def act(self,state): 75 | """ 76 | 返回 action 和 action 的 log prob 77 | """ 78 | with torch.no_grad(): 79 | (mu, sigma) = self.policy(state) # 2d tensors 80 | dist = Normal(mu, sigma) 81 | action = dist.sample() 82 | action_log_prob = dist.log_prob(action) 83 | 84 | return action.numpy()[0], action_log_prob.numpy()[0] 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /Actor_Critic/A3C/worker.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.multiprocessing as mp 3 | import torch 4 | import torch.nn.functional as F 5 | from torch.distributions import Normal 6 | from Actor_Critic.A3C.untils import ValueNetwork,ActorDiscrete,ActorContinous 7 | 8 | 9 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 10 | 11 | 12 | class Worker(mp.Process): 13 | def __init__(self,env,continuous,state_size,action_size,id, global_valueNet,global_value_optimizer, 14 | global_policyNet,global_policy_optimizer, 15 | global_epi,global_epi_rew,rew_queue, 16 | max_epi,gamma): 17 | super(Worker, self).__init__() 18 | # define env for individual worker 19 | self.env = env 20 | self.continuous = continuous 21 | self.name = str(id) 22 | self.env.seed(id) 23 | self.state_size = state_size 24 | self.action_size = action_size 25 | self.memory=[] 26 | 27 | # passing global settings to worker 28 | self.global_valueNet,self.global_value_optimizer = global_valueNet,global_value_optimizer 29 | self.global_policyNet,self.global_policy_optimizer = global_policyNet,global_policy_optimizer 30 | self.global_epi,self.global_epi_rew = global_epi,global_epi_rew 31 | self.rew_queue = rew_queue 32 | self.max_epi = max_epi 33 | # self.batch_size = batch_size 34 | self.gamma = gamma 35 | 36 | # define local net for individual worker 37 | self.local_policyNet = ActorDiscrete(self.state_size,self.action_size).to(device) 38 | if self.continuous: 39 | self.local_policyNet = ActorContinous(self.state_size,self.action_size).to(device) 40 | 
self.local_valueNet = ValueNetwork(self.state_size,1).to(device) 41 | 42 | def sync_global(self): 43 | self.local_valueNet.load_state_dict(self.global_valueNet.state_dict()) 44 | self.local_policyNet.load_state_dict(self.global_policyNet.state_dict()) 45 | 46 | def calculate_loss(self): 47 | # get experiences from current trajectory 48 | states = torch.tensor([t[0] for t in self.memory], dtype=torch.float) 49 | log_probs = torch.tensor([t[1] for t in self.memory], dtype=torch.float) 50 | 51 | # -- calculate discount future rewards for every time step 52 | rewards = [t[2] for t in self.memory] 53 | fur_Rewards = [] 54 | for i in range(len(rewards)): 55 | discount = [self.gamma ** i for i in range(len(rewards) - i)] 56 | f_rewards = rewards[i:] 57 | fur_Rewards.append(sum(d * f for d, f in zip(discount, f_rewards))) 58 | fur_Rewards = torch.tensor(fur_Rewards, dtype=torch.float).view(-1, 1) 59 | 60 | # calculate loss for critic 61 | V = self.local_valueNet(states) 62 | value_loss = F.mse_loss(fur_Rewards, V) 63 | 64 | # compute entropy for policy loss 65 | (mu, sigma) = self.local_policyNet(states) 66 | dist = Normal(mu, sigma) 67 | entropy = 0.5 + 0.5 * math.log(2 * math.pi) + torch.log(dist.scale) # exploration 68 | 69 | # calculate loss for actor 70 | advantage = (fur_Rewards - V).detach() 71 | policy_loss = -advantage * log_probs 72 | policy_loss = (policy_loss - 0.005 * entropy).mean() 73 | 74 | return value_loss,policy_loss 75 | 76 | def update_global(self): 77 | value_loss, policy_loss = self.calculate_loss() 78 | 79 | self.global_value_optimizer.zero_grad() 80 | value_loss.backward() 81 | # propagate local gradients to global parameters 82 | for local_params, global_params in zip(self.local_valueNet.parameters(), self.global_valueNet.parameters()): 83 | global_params._grad = local_params._grad 84 | self.global_value_optimizer.step() 85 | 86 | self.global_policy_optimizer.zero_grad() 87 | policy_loss.backward() 88 | # propagate local gradients to global parameters 89 | for local_params, global_params in zip(self.local_policyNet.parameters(), self.global_policyNet.parameters()): 90 | global_params._grad = local_params._grad 91 | self.global_policy_optimizer.step() 92 | 93 | self.memory=[] # clear trajectory 94 | 95 | def run(self): 96 | while self.global_epi.value < self.max_epi: 97 | state = self.env.reset() 98 | total_reward=0 99 | while True: 100 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 101 | action, prob = self.local_policyNet.act(state) # 离散空间取直接prob,连续空间取log prob 102 | next_state, reward, done, _ = self.env.step(action) 103 | self.memory.append([state,action,reward,next_state,done]) 104 | total_reward += reward 105 | state = next_state 106 | 107 | if done: 108 | # recoding global episode and episode reward 109 | with self.global_epi.get_lock(): 110 | self.global_epi.value += 1 111 | with self.global_epi_rew.get_lock(): 112 | if self.global_epi_rew.value == 0.: 113 | self.global_epi_rew.value = total_reward 114 | else: 115 | # Moving average reward 116 | self.global_epi_rew.value = self.global_epi_rew.value * 0.99 + total_reward * 0.01 117 | self.rew_queue.put(self.global_epi_rew.value) 118 | 119 | print("w{} | episode: {}\t , episode reward:{:.4} \t " 120 | .format(self.name,self.global_epi.value,self.global_epi_rew.value)) 121 | break 122 | 123 | # update and sync with the global net when finishing an episode 124 | self.update_global() 125 | self.sync_global() 126 | 127 | self.rew_queue.put(None) 128 | 129 | 130 | 131 | 
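A minimal standalone sketch (not a file from this repository): the per-step discounted future reward that Worker.calculate_loss builds with a nested loop can equivalently be accumulated in a single backward pass, G_t = r_t + gamma * G_{t+1}. The helper name below is illustrative only.

import torch

def discounted_returns(rewards, gamma=0.9):
    # G_t = r_t + gamma * G_{t+1}, accumulated from the last step backwards
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.append(g)
    returns.reverse()
    return torch.tensor(returns, dtype=torch.float).view(-1, 1)

# example: rewards [1, 1, 1] with gamma = 0.9 give returns [2.71, 1.90, 1.00]
print(discounted_returns([1.0, 1.0, 1.0]))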
-------------------------------------------------------------------------------- /Actor_Critic/SAC/sac_agent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/SAC/sac_agent.py -------------------------------------------------------------------------------- /Actor_Critic/SAC/sac_main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import arrow 3 | import gym 4 | import numpy as np 5 | import pandas as pd 6 | from matplotlib import pyplot as plt 7 | 8 | 9 | def output_scores(start_time,i_episode,scores_deque,score,solve_limit): 10 | print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}' 11 | .format(i_episode, np.mean(scores_deque), score), end="") 12 | if i_episode % 100 == 0: 13 | print('\rEpisode {}\tAverage Score: {:.2f}\t Running time til now :{}' 14 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time)) 15 | if np.mean(scores_deque) >= solve_limit: 16 | print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}\t Total running time :{}' 17 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time)) 18 | return True 19 | 20 | return False 21 | 22 | 23 | def plot_scores(scores,filename): 24 | plt.plot(np.arange(1, len(scores) + 1), scores) 25 | plt.ylabel('Score') 26 | plt.xlabel('Episode #') 27 | plt.savefig(filename) 28 | plt.show() 29 | 30 | 31 | def get_env_prop(env_name, continuous): 32 | env = gym.make(env_name) 33 | state_dim = env.observation_space.shape[0] 34 | if continuous: 35 | action_dim = env.action_space.shape[0] 36 | else: 37 | action_dim = env.action_space.n 38 | 39 | return env,state_dim, action_dim 40 | 41 | 42 | if __name__=="__main__": 43 | env,state_dim,action_dim = get_env_prop("CartPole-v0",False) 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /Actor_Critic/SAC/sac_network.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/SAC/sac_network.py -------------------------------------------------------------------------------- /BlackBox_optimazation/Hill_Climbing/__pycache__/agent_HC.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/BlackBox_optimazation/Hill_Climbing/__pycache__/agent_HC.cpython-36.pyc -------------------------------------------------------------------------------- /BlackBox_optimazation/Hill_Climbing/agent_HC.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from collections import deque 4 | 5 | 6 | def hill_climbing(env,policy,n_episodes=1000, max_t=1000, gamma=1.0, print_every=100, noise_scale=1e-2): 7 | """Implementation of hill climbing with adaptive noise scaling. 
8 | 9 | Params 10 | ====== 11 | n_episodes (int): maximum number of training episodes 12 | max_t (int): maximum number of timesteps per episode 13 | gamma (float): discount rate 14 | print_every (int): how often to print average score (over last 100 episodes) 15 | noise_scale (float): standard deviation of additive noise 16 | """ 17 | scores_deque = deque(maxlen=100) 18 | scores = [] # 用于存储各 episode 的得分(总奖励) 19 | best_R = -np.Inf 20 | best_w = policy.w 21 | 22 | for i_episode in range(1, n_episodes + 1): 23 | rewards = [] # 每个episode 重置奖励队列 24 | state = env.reset() 25 | for t in range(max_t): 26 | action = policy.act(state) 27 | state, reward, done, _ = env.step(action) 28 | rewards.append(reward) # 把当前 时间步的奖励加入 rewards 队列 29 | if done: 30 | break 31 | # 设定折扣率 32 | discounts = [gamma ** i for i in range(len(rewards) + 1)] 33 | # 计算当前episode的折扣累计总奖励 34 | R = sum([a * b for a, b in zip(discounts, rewards)]) 35 | 36 | scores_deque.append(sum(rewards)) # 把当前episode的累计奖励(无折扣)加入 scores 队列 37 | scores.append(sum(rewards)) 38 | 39 | # ------- 参数搜索 ----- # 40 | if R >= best_R: # found better weights 41 | best_R = R 42 | best_w = policy.w 43 | noise_scale = max(1e-3, noise_scale / 2) # 缩小搜索范围(下限为 0.001) 44 | policy.w += noise_scale * np.random.rand(*policy.w.shape) 45 | else: # did not find better weights 46 | noise_scale = min(2, noise_scale * 2) # 扩大搜索范围(上限为2) 47 | policy.w = best_w + noise_scale * np.random.rand(*policy.w.shape) 48 | # --------------------- # 49 | 50 | if i_episode % print_every == 0: 51 | print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque))) 52 | if np.mean(scores_deque) >= 195.0: 53 | print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - 100, 54 | np.mean(scores_deque))) 55 | policy.w = best_w 56 | break 57 | 58 | return scores -------------------------------------------------------------------------------- /BlackBox_optimazation/Hill_Climbing/main_hillClimb.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from collections import deque 4 | import matplotlib.pyplot as plt 5 | from CartPole.Hill_Climbing.agent_HC import hill_climbing 6 | 7 | 8 | class Policy(): 9 | """ 10 | 策略函数是一个单层线性神经网络 P(A)=softmax(W*S) 11 | 输出层加入了激活函数softmax,为了把输出值转换成概率(0-1),但没有中间隐藏层,即没有非线性变换 12 | 输入节点数:s_size ;输出节点数:a_size 13 | 参数矩阵 w 的维度 tate_space x action_space 14 | """ 15 | def __init__(self, s_size=4, a_size=2): 16 | self.w = 1e-4 * np.random.rand(s_size, a_size) # weights for simple linear policy: state_space x action_space 17 | 18 | def forward(self, state): 19 | x = np.dot(state, self.w) 20 | return np.exp(x) / sum(np.exp(x)) 21 | 22 | def act(self, state): 23 | probs = self.forward(state) 24 | # action = np.random.choice(2, p=probs) # option 1: stochastic policy 25 | action = np.argmax(probs) # option 2: deterministic policy 26 | return action 27 | 28 | 29 | if __name__=="__main__": 30 | env = gym.make('CartPole-v0') 31 | policy=Policy() 32 | 33 | print(policy.w) 34 | 35 | # 训练智能体:更新 policy (参数w) 36 | scores = hill_climbing(env,policy) 37 | 38 | # 观察训练好的智能体 39 | state = env.reset() 40 | img = plt.imshow(env.render(mode='rgb_array')) 41 | for t in range(200): 42 | action = policy.act(state) 43 | img.set_data(env.render(mode='rgb_array')) 44 | 45 | state, reward, done, _ = env.step(action) 46 | if done: 47 | break 48 | 49 | env.close() 50 | 51 | # 画累计奖励曲线 52 | fig = plt.figure() 53 | ax = fig.add_subplot(111) 54 | plt.plot(np.arange(1, len(scores) + 1), scores) 
55 | plt.ylabel('Score') 56 | plt.xlabel('Episode #') 57 | plt.show() -------------------------------------------------------------------------------- /BlackBox_optimazation/cross_entropy_method/CEM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/BlackBox_optimazation/cross_entropy_method/CEM.png -------------------------------------------------------------------------------- /BlackBox_optimazation/cross_entropy_method/__pycache__/agent_cem.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/BlackBox_optimazation/cross_entropy_method/__pycache__/agent_cem.cpython-37.pyc -------------------------------------------------------------------------------- /BlackBox_optimazation/cross_entropy_method/agent_cem.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 8 | 9 | 10 | class Agent(nn.Module): 11 | def __init__(self, env, h_size=16): 12 | super(Agent, self).__init__() 13 | self.env = env 14 | # state, hidden layer, action sizes 15 | self.s_size = env.observation_space.shape[0] 16 | self.h_size = h_size 17 | self.a_size = env.action_space.shape[0] 18 | # define layers 19 | self.fc1 = nn.Linear(self.s_size, self.h_size) 20 | self.fc2 = nn.Linear(self.h_size, self.a_size) 21 | 22 | def set_weights(self, weights): 23 | s_size = self.s_size 24 | h_size = self.h_size 25 | a_size = self.a_size 26 | # separate the weights for each layer 27 | fc1_end = (s_size * h_size) + h_size 28 | fc1_W = torch.from_numpy(weights[:s_size * h_size].reshape(s_size, h_size)) 29 | fc1_b = torch.from_numpy(weights[s_size * h_size:fc1_end]) 30 | fc2_W = torch.from_numpy(weights[fc1_end:fc1_end + (h_size * a_size)].reshape(h_size, a_size)) 31 | fc2_b = torch.from_numpy(weights[fc1_end + (h_size * a_size):]) 32 | # set the weights for each layer 33 | self.fc1.weight.data.copy_(fc1_W.view_as(self.fc1.weight.data)) 34 | self.fc1.bias.data.copy_(fc1_b.view_as(self.fc1.bias.data)) 35 | self.fc2.weight.data.copy_(fc2_W.view_as(self.fc2.weight.data)) 36 | self.fc2.bias.data.copy_(fc2_b.view_as(self.fc2.bias.data)) 37 | 38 | def get_weights_dim(self): 39 | return (self.s_size + 1) * self.h_size + (self.h_size + 1) * self.a_size 40 | 41 | def forward(self, x): 42 | x = F.relu(self.fc1(x)) 43 | x = F.tanh(self.fc2(x)) 44 | return x.cpu().data 45 | 46 | def evaluate(self, weights, gamma=1.0, max_t=5000): 47 | self.set_weights(weights) 48 | episode_return = 0.0 49 | state = self.env.reset() 50 | for t in range(max_t): 51 | state = torch.from_numpy(state).float().to(device) 52 | action = self.forward(state) 53 | state, reward, done, _ = self.env.step(action) 54 | episode_return += reward * math.pow(gamma, t) 55 | if done: 56 | break 57 | return episode_return 58 | 59 | 60 | -------------------------------------------------------------------------------- /BlackBox_optimazation/cross_entropy_method/checkpoint.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/BlackBox_optimazation/cross_entropy_method/checkpoint.pth -------------------------------------------------------------------------------- /BlackBox_optimazation/cross_entropy_method/main_cem.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import math 3 | import numpy as np 4 | from collections import deque 5 | import matplotlib.pyplot as plt 6 | import torch 7 | from MountCar_continuous.cross_entropy_method.agent_cem import Agent 8 | 9 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 10 | 11 | 12 | def cem(agent,n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, elite_frac=0.2, sigma=0.5): 13 | """PyTorch implementation of a cross-entropy method. 14 | 15 | Params 16 | ====== 17 | n_iterations (int): maximum number of training iterations 18 | max_t (int): maximum number of timesteps per episode 19 | gamma (float): discount rate 20 | print_every (int): how often to print average score (over last 100 episodes) 21 | pop_size (int): size of population at each iteration 22 | elite_frac (float): percentage of top performers to use in update 23 | sigma (float): standard deviation of additive noise 24 | """ 25 | n_elite = int(pop_size * elite_frac) 26 | 27 | scores_deque = deque(maxlen=100) 28 | scores = [] 29 | best_weight = sigma * np.random.randn(agent.get_weights_dim()) 30 | 31 | for i_iteration in range(1, n_iterations + 1): 32 | weights_pop = [best_weight + (sigma * np.random.randn(agent.get_weights_dim())) for i in range(pop_size)] 33 | rewards = np.array([agent.evaluate(weights, gamma, max_t) for weights in weights_pop]) 34 | 35 | elite_idxs = rewards.argsort()[-n_elite:] 36 | elite_weights = [weights_pop[i] for i in elite_idxs] 37 | best_weight = np.array(elite_weights).mean(axis=0) 38 | 39 | reward = agent.evaluate(best_weight, gamma=1.0) 40 | scores_deque.append(reward) 41 | scores.append(reward) 42 | 43 | torch.save(agent.state_dict(), 'checkpoint.pth') 44 | 45 | if i_iteration % print_every == 0: 46 | print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, np.mean(scores_deque))) 47 | 48 | if np.mean(scores_deque) >= 90.0: 49 | print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(i_iteration - 100, 50 | np.mean(scores_deque))) 51 | break 52 | return scores 53 | 54 | 55 | 56 | def watch_trained_agent(agent): 57 | # load the weights from file 58 | agent.load_state_dict(torch.load('checkpoint.pth')) 59 | 60 | state = env.reset() 61 | img = plt.imshow(env.render(mode='rgb_array')) 62 | while True: 63 | state = torch.from_numpy(state).float().to(device) 64 | with torch.no_grad(): 65 | action = agent(state) 66 | img.set_data(env.render(mode='rgb_array')) 67 | plt.axis('off') 68 | next_state, reward, done, _ = env.step(action) 69 | state = next_state 70 | if done: 71 | break 72 | 73 | env.close() 74 | 75 | 76 | if __name__=="__main__": 77 | env = gym.make('MountainCarContinuous-v0') 78 | env.seed(101) 79 | np.random.seed(101) 80 | agent = Agent(env).to(device) 81 | 82 | # --- train and plot scores --- # 83 | scores = cem(agent) 84 | 85 | # plot the scores 86 | fig = plt.figure() 87 | ax = fig.add_subplot(111) 88 | plt.plot(np.arange(1, len(scores) + 1), scores) 89 | plt.ylabel('Score') 90 | plt.xlabel('Episode #') 91 | plt.show() 92 | 93 | # --- watch a pre-trained agent --- # 94 | watch_trained_agent(agent) 95 | 96 | 97 | 
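A minimal standalone sketch (not a file from this repository): the sample-evaluate-select-average update that cem() above applies to agent weights, run here on a toy objective so the elite-selection step is easy to follow. The objective, variable names, and hyperparameter values are illustrative only, chosen to mirror pop_size, elite_frac, and sigma in main_cem.py.

import numpy as np

rng = np.random.default_rng(0)
best_w = np.zeros(5)                       # current mean of the search distribution
pop_size, elite_frac, sigma = 50, 0.2, 0.5
n_elite = int(pop_size * elite_frac)

def score(w):
    # toy stand-in for agent.evaluate(weights): maximized at w == 1
    return -np.sum((w - 1.0) ** 2)

for _ in range(20):
    pop = [best_w + sigma * rng.standard_normal(best_w.shape) for _ in range(pop_size)]
    rewards = np.array([score(w) for w in pop])
    elite_idxs = rewards.argsort()[-n_elite:]            # indices of the top performers
    best_w = np.array([pop[i] for i in elite_idxs]).mean(axis=0)

print(best_w)   # drifts toward the optimum [1, 1, 1, 1, 1]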
-------------------------------------------------------------------------------- /DDPGs/DDPG/DDPG_main.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import gym 5 | import torch 6 | from DDPGs.DDPG.DDPG_agent import DDPGAgent 7 | 8 | 9 | def ddpg(env,agent,n_episodes=2000, max_t=700): 10 | scores_deque = deque(maxlen=100) 11 | scores = [] 12 | 13 | for i_episode in range(1, n_episodes+1): 14 | state = env.reset() 15 | agent.reset() 16 | score = 0 17 | while True: 18 | # 智能体生成与当前 state 对应的 action (行动策略) 19 | action = agent.act(state) 20 | # 与环境交互,得到 sars' 21 | next_state, reward, done, _ = env.step(action) 22 | # 把当前时间步的经验元组传给 agent 23 | agent.step(i_episode,state, action, reward, next_state, done) 24 | state = next_state 25 | score += reward 26 | if done: 27 | break 28 | scores_deque.append(score) 29 | scores.append(score) 30 | 31 | print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}' 32 | .format(i_episode, np.mean(scores_deque), score),end="") 33 | if i_episode % 100 == 0: 34 | torch.save(agent.actor_local.state_dict(), 'model_save/actor2.pth') 35 | torch.save(agent.critic_local.state_dict(), 'model_save/critic2.pth') 36 | print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque))) 37 | 38 | return scores 39 | 40 | 41 | def watch_agent(agent,filename_actor,filename_crtic): 42 | agent.actor_local.load_state_dict(torch.load(filename_actor)) 43 | agent.critic_local.load_state_dict(torch.load(filename_crtic)) 44 | state = env.reset() 45 | for t in range(1000): 46 | action = agent.act(state, noise=False) 47 | print(action) 48 | env.render() 49 | state, reward, done, _ = env.step(action) 50 | if done: 51 | break 52 | env.close() 53 | 54 | 55 | def plot_scores(scores): 56 | fig = plt.figure() 57 | ax = fig.add_subplot(111) 58 | plt.plot(np.arange(1, len(scores.size()) + 1), scores) 59 | plt.ylabel('Score') 60 | plt.xlabel('Episode #') 61 | plt.show() 62 | 63 | 64 | if __name__=="__main__": 65 | env = gym.make('BipedalWalker-v2') 66 | env.seed(10) 67 | 68 | # 初始化 ddpg agent 69 | agent=DDPGAgent(state_size=env.observation_space.shape[0], action_size=env.action_space.shape[0], seed=10) 70 | # 训练并保存 scores 71 | scores=ddpg(env,agent) 72 | plot_scores(scores) 73 | 74 | # watch_agent(agent,"actor1.pth","critic1.pth") 75 | 76 | 77 | -------------------------------------------------------------------------------- /DDPGs/DDPG/DDPG_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | def hidden_init(layer): 9 | fan_in = layer.weight.data.size()[0] 10 | lim = 1. / np.sqrt(fan_in) 11 | return (-lim, lim) 12 | 13 | 14 | class Actor(nn.Module): 15 | """Actor (Policy) Model. 
16 | """ 17 | 18 | def __init__(self, state_size, action_size, seed, fc1_units=256,fc2_units=256): 19 | """ 20 | single layer MLP network 21 | ====== 22 | Input dim: state_size 23 | Output dim: action_size 24 | """ 25 | super(Actor, self).__init__() 26 | self.seed = torch.manual_seed(seed) 27 | self.fc1 = nn.Linear(state_size, fc1_units) 28 | self.fc2 = nn.Linear(fc1_units, fc2_units) 29 | self.fc3=nn.Linear(fc2_units,action_size) 30 | 31 | self.reset_parameters() 32 | 33 | def reset_parameters(self): 34 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1)) 35 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 36 | self.fc3.weight.data.uniform_(-3e-3, 3e-3) 37 | 38 | def forward(self, state): 39 | """Build an actor (policy) network that maps states -> actions.""" 40 | x = torch.relu(self.fc1(state)) 41 | x = torch.relu(self.fc2(x)) 42 | return torch.tanh(self.fc3(x)) 43 | 44 | 45 | class Critic(nn.Module): 46 | """Critic (Value) Model. 47 | """ 48 | 49 | def __init__(self, state_size, action_size, seed, fcs1_units=256, fc2_units=256,fc3_units=128): 50 | """ 51 | ====== 52 | Input dim: state_size 53 | Output dim: 1 54 | """ 55 | super(Critic, self).__init__() 56 | self.seed = torch.manual_seed(seed) 57 | self.fcs1 = nn.Linear(state_size, fcs1_units) 58 | self.fc2 = nn.Linear(fcs1_units+action_size, fc2_units) 59 | self.fc3 = nn.Linear(fc2_units, fc3_units) 60 | self.fc4 = nn.Linear(fc3_units, 1) 61 | self.reset_parameters() 62 | 63 | def reset_parameters(self): 64 | self.fcs1.weight.data.uniform_(*hidden_init(self.fcs1)) 65 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 66 | self.fc3.weight.data.uniform_(*hidden_init(self.fc3)) 67 | self.fc4.weight.data.uniform_(-3e-3, 3e-3) 68 | 69 | def forward(self, state, action): 70 | """Build a critic (value) network that maps (state, action) pairs -> Q-values.""" 71 | xs = F.leaky_relu(self.fcs1(state)) 72 | x = torch.cat((xs, action), dim=1) 73 | x = F.leaky_relu(self.fc2(x)) 74 | x = F.leaky_relu(self.fc3(x)) 75 | return self.fc4(x) 76 | -------------------------------------------------------------------------------- /DDPGs/DDPG/model_save/actor1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/actor1.pth -------------------------------------------------------------------------------- /DDPGs/DDPG/model_save/actor2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/actor2.pth -------------------------------------------------------------------------------- /DDPGs/DDPG/model_save/checkpoint_actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/checkpoint_actor.pth -------------------------------------------------------------------------------- /DDPGs/DDPG/model_save/checkpoint_critic.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/checkpoint_critic.pth 
-------------------------------------------------------------------------------- /DDPGs/DDPG/model_save/critic1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/critic1.pth -------------------------------------------------------------------------------- /DDPGs/DDPG/model_save/critic2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/critic2.pth -------------------------------------------------------------------------------- /DDPGs/TD3/TD3_main.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import gym 6 | import torch 7 | import arrow 8 | import os 9 | from DDPGs.TD3.TD3_new import TD3 10 | 11 | RESUME= True 12 | SAVE_MODEL_EVERY = 5 13 | load_checkpoint_patch=["models/checkpoint/actor_10.pth","models/checkpoint/critic_10.pth"] 14 | 15 | 16 | def output_scores(start_time,i_episode,scores_deque,score): 17 | print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}' 18 | .format(i_episode, np.mean(scores_deque), score), end="") 19 | if i_episode % 100 == 0: 20 | print('\rEpisode {}\tAverage Score: {:.2f}\t Running time til now :{}' 21 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time)) 22 | if np.mean(scores_deque) >= 300: 23 | print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}\t Total running time :{}' 24 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time)) 25 | return True 26 | 27 | return False 28 | 29 | 30 | def watch_smart_agent(agent,filename_actor,filename_crtic): 31 | agent.actor.load_state_dict(torch.load(filename_actor)) 32 | agent.critic.load_state_dict(torch.load(filename_crtic)) 33 | state = env.reset() 34 | for t in range(1000): 35 | action = agent.select_action(state) 36 | print(action) 37 | env.render() 38 | state, reward, done, _ = env.step(action) 39 | if done: 40 | break 41 | env.close() 42 | 43 | 44 | def watch_random_agent(): 45 | 46 | for _ in range(5): 47 | env.reset() 48 | while True: 49 | env.render() 50 | next_state, reward, done, _ =env.step(env.action_space.sample()) 51 | if done: 52 | break 53 | 54 | env.close() 55 | 56 | 57 | def plot_scores(scores,filename): 58 | fig = plt.figure() 59 | ax = fig.add_subplot(111) 60 | plt.plot(np.arange(1, len(scores) + 1), scores) 61 | plt.ylabel('Score') 62 | plt.xlabel('Episode #') 63 | plt.savefig(filename) 64 | plt.show() 65 | 66 | 67 | def save_check_point(agent,i_episode): 68 | # setting the check point for training 69 | checkpoint_actor = { 70 | "net": agent.actor.state_dict(), 71 | 'optimizer': agent.actor_optimizer.state_dict(), 72 | "epoch": i_episode 73 | } 74 | checkpoint_critic = { 75 | "net": agent.critic.state_dict(), 76 | "optimizer": agent.critic_optimizer.state_dict(), 77 | "epoch": i_episode 78 | } 79 | if not os.path.isdir("models/checkpoint"): 80 | os.mkdir("models/checkpoint") 81 | torch.save(checkpoint_actor, 'models/checkpoint/actor_%s.pth' % (str(i_episode))) 82 | torch.save(checkpoint_critic, 'models/checkpoint/critic_%s.pth' % (str(i_episode))) 83 | 84 | 85 | def load_check_point(agent): 86 | "load saved checkpoints to resume training" 87 
| checkpoint_actor = torch.load(load_checkpoint_patch[0]) # 加载断点 88 | checkpoint_critic = torch.load(load_checkpoint_patch[1]) 89 | 90 | agent.actor.load_state_dict(checkpoint_actor['net']) # 加载模型可学习参数 91 | agent.critic.load_state_dict(checkpoint_critic['net']) 92 | 93 | agent.actor_optimizer.load_state_dict(checkpoint_actor['optimizer']) # 加载优化器参数 94 | agent.critic_optimizer.load_state_dict(checkpoint_critic['optimizer']) # 加载优化器参数 95 | 96 | start_epoch = checkpoint_actor['epoch'] # 设置开始的epoch 97 | return start_epoch 98 | 99 | 100 | def train_td3(env,agent,n_episodes): 101 | start_epoch = 1 102 | 103 | if RESUME: # 加载 check point 中保存的模型参数继续训练 104 | start_epoch=load_check_point(agent) 105 | 106 | scores_deque = deque(maxlen=100) 107 | scores = [] 108 | start_time = arrow.now() 109 | for i_episode in range(start_epoch, n_episodes + 1): 110 | state = env.reset() 111 | total_reward = 0 112 | time_step = 0 113 | 114 | # loop over time steps 115 | while True: 116 | # 智能体选择动作(根据当前策略) 117 | action = agent.select_action(state) 118 | next_state, reward, done, _ = env.step(action) 119 | agent.save_exp(state, action, next_state, reward, done) 120 | if agent.mode==1: 121 | agent.train(time_step) 122 | time_step += 1 123 | state = next_state 124 | total_reward += reward 125 | if done: 126 | break 127 | 128 | # recording scores 129 | scores.append([i_episode,total_reward]) 130 | scores_deque.append(total_reward) 131 | finished = output_scores(start_time, i_episode, scores_deque, total_reward) 132 | if finished: 133 | agent.save('models', 'TD3_v2') 134 | break 135 | 136 | if i_episode% SAVE_MODEL_EVERY ==0: 137 | save_check_point(agent, i_episode) 138 | # 同时保存 scores,存为 scv 文件 139 | scores_df=pd.DataFrame(data=scores,columns=['episode','score']) 140 | scores_df.to_csv('scores_saved.csv',index=False) 141 | 142 | if agent.mode==0: 143 | agent.train(time_step) 144 | 145 | return scores 146 | 147 | 148 | if __name__=="__main__": 149 | env = gym.make('BipedalWalker-v3') 150 | env.seed(10) 151 | state_dim = env.observation_space.shape[0] 152 | action_dim = env.action_space.shape[0] 153 | max_action = float(env.action_space.high[0]) 154 | 155 | agent_0 = TD3(state_dim,action_dim,max_action,env,0) # mode=0:update per episode 156 | agent_1 = TD3(state_dim, action_dim, max_action, env, 1) # mode=1: update per time step 157 | # scores=train_td3(env,agent_1,1000) 158 | 159 | # 观察未经训练的随机智能体 160 | #watch_random_agent() 161 | watch_smart_agent(agent_0,"models/TD3_actor.pth","models/TD3_critic.pth") 162 | 163 | 164 | -------------------------------------------------------------------------------- /DDPGs/TD3/TD3_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | def hidden_init(layer): 8 | fan_in = layer.weight.data.size()[0] 9 | lim = 1. 
/ np.sqrt(fan_in) 10 | return (-lim, lim) 11 | 12 | # Actor Neural Network 13 | class Actor(nn.Module): 14 | def __init__(self, state_dim, action_dim, max_action): 15 | super(Actor, self).__init__() 16 | 17 | self.l1 = nn.Linear(state_dim, 400) 18 | self.l2 = nn.Linear(400, 300) 19 | self.l3 = nn.Linear(300, action_dim) 20 | 21 | self.max_action = max_action 22 | 23 | def forward(self, x): 24 | x = F.relu(self.l1(x)) 25 | x = F.relu(self.l2(x)) 26 | x = self.max_action * torch.tanh(self.l3(x)) 27 | return x 28 | 29 | 30 | # Q1-Q2-Critic Neural Network 31 | class Critic(nn.Module): 32 | def __init__(self, state_dim, action_dim): 33 | super(Critic, self).__init__() 34 | 35 | # Q1 architecture 36 | self.l1 = nn.Linear(state_dim + action_dim, 400) 37 | self.l2 = nn.Linear(400, 300) 38 | self.l3 = nn.Linear(300, 1) 39 | 40 | # Q2 architecture 41 | self.l4 = nn.Linear(state_dim + action_dim, 400) 42 | self.l5 = nn.Linear(400, 300) 43 | self.l6 = nn.Linear(300, 1) 44 | 45 | def forward(self, x, u): 46 | xu = torch.cat([x, u], 1) 47 | x1 = F.relu(self.l1(xu)) 48 | x1 = F.relu(self.l2(x1)) 49 | x1 = self.l3(x1) 50 | 51 | x2 = F.relu(self.l4(xu)) 52 | x2 = F.relu(self.l5(x2)) 53 | x2 = self.l6(x2) 54 | return x1, x2 55 | 56 | def Q1(self, x, u): 57 | xu = torch.cat([x, u], 1) 58 | 59 | x1 = F.relu(self.l1(xu)) 60 | x1 = F.relu(self.l2(x1)) 61 | x1 = self.l3(x1) 62 | return x1 63 | -------------------------------------------------------------------------------- /DDPGs/TD3/TD3_solved.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/TD3_solved.png -------------------------------------------------------------------------------- /DDPGs/TD3/__pycache__/TD3_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/__pycache__/TD3_model.cpython-38.pyc -------------------------------------------------------------------------------- /DDPGs/TD3/__pycache__/TD3_new.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/__pycache__/TD3_new.cpython-38.pyc -------------------------------------------------------------------------------- /DDPGs/TD3/models/TD3_actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/models/TD3_actor.pth -------------------------------------------------------------------------------- /DDPGs/TD3/models/TD3_critic.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/models/TD3_critic.pth -------------------------------------------------------------------------------- /DDPGs/TD3/scores_saved.csv: -------------------------------------------------------------------------------- 1 | episode,score 2 | 10,-110.23470465514899 3 | 11,-99.79645054419306 4 | 12,-126.96190521016625 5 | 13,-145.1785128976382 6 | 14,-104.07223475725621 7 | 
15,-115.51990479428935 8 | 16,-121.89401543392783 9 | 17,-101.55811828293544 10 | 18,-99.89300219319254 11 | 19,-104.98346411872812 12 | 20,-106.23650016517124 13 | 21,-103.84864901668085 14 | 22,-121.26231449909034 15 | 23,-111.10637092719374 16 | 24,-114.66147317025639 17 | 25,-109.04674660607814 18 | 26,-106.12839938371035 19 | 27,-127.84833024115571 20 | 28,-112.4956240406665 21 | 29,-107.88297384519461 22 | 30,-99.92317202252215 23 | 31,-125.0179119318615 24 | 32,-100.36371986921576 25 | 33,-110.05038883724445 26 | 34,-132.021171753202 27 | 35,-133.880572424944 28 | 36,-100.11141411138209 29 | 37,-122.84088047947732 30 | 38,-103.55589092614429 31 | 39,-121.33897605525831 32 | 40,-230.96539978320874 33 | 41,-124.41352611894291 34 | 42,-119.04664112562759 35 | 43,-115.88990889360397 36 | 44,-116.13402150126659 37 | 45,-153.34740936961384 38 | 46,-107.6970551122332 39 | 47,-103.65690659839511 40 | 48,-107.19409725924586 41 | 49,-95.42425468958133 42 | 50,-103.23654880642972 43 | 51,-106.54915425148904 44 | 52,-114.40587754233572 45 | 53,-117.05561482745843 46 | 54,-113.98186717139261 47 | 55,-107.9192592421826 48 | 56,-100.75576811566108 49 | 57,-101.08801850037774 50 | 58,-132.08337249626365 51 | 59,-108.36035750377272 52 | 60,-111.05887943139943 53 | 61,-121.58251865927255 54 | 62,-141.45334827651286 55 | 63,-114.22801798708173 56 | 64,-119.39000501374326 57 | 65,-132.97221690919855 58 | 66,-123.59944656251093 59 | 67,-101.49795907883284 60 | 68,-103.72652107741114 61 | 69,-98.53701366942036 62 | 70,-100.89222275623676 63 | 71,-143.13936392812377 64 | 72,-101.36843625940055 65 | 73,-100.09964251577696 66 | 74,-98.95479499937969 67 | 75,-104.2556218599781 68 | -------------------------------------------------------------------------------- /DDPGs/TD3/test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | scores=np.array([[1,-1.2],[2,-3.4],[3,3.6]]) 5 | 6 | 7 | df=pd.DataFrame(data=scores,columns=['episode','score']) 8 | 9 | print(df) 10 | -------------------------------------------------------------------------------- /DQNs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/.DS_Store -------------------------------------------------------------------------------- /DQNs/DDQN/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/.DS_Store -------------------------------------------------------------------------------- /DQNs/DDQN/DQN_main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import arrow 3 | import torch 4 | import numpy as np 5 | from collections import deque 6 | # import matplotlib 7 | # matplotlib.use('TkAgg') 8 | import matplotlib.pyplot as plt 9 | from DQNs.DDQN.ddqn_v3 import AgentV3 10 | 11 | 12 | def dqn(agent,model_file,n_episodes=2000, max_t=1000, 13 | eps_start=1.0, eps_end=0.01, eps_decay=0.995, 14 | beta_start=0.4): 15 | """Deep Q-Learning. 
16 | 17 | 18 | Params 19 | ====== 20 | n_episodes (int): maximum number of training episodes 21 | max_t (int): maximum number of timesteps per episode 22 | eps_start (float): starting value of epsilon, for epsilon-greedy action selection 23 | eps_end (float): minimum value of epsilon 24 | eps_decay (float): multiplicative factor (per episode) for decreasing epsilon 25 | """ 26 | scores = [] # list containing scores from each episode 27 | scores_window = deque(maxlen=100) # last 100 scores 28 | eps = eps_start # initialize epsilon 29 | beta=beta_start 30 | 31 | start_time=arrow.now() 32 | for i_episode in range(1, n_episodes + 1): 33 | state = env.reset() 34 | score = 0 35 | episode_loss=[] 36 | for t in range(max_t): 37 | # 在当前状态下获取要采取的 action 38 | action = agent.act(state, eps) 39 | # 与环境交互获取 (s',r,done) 40 | next_state, reward, done, _ = env.step(action) 41 | # 构建 sarsa 序列,传给智能体 42 | loss=agent.step(state, action, reward, next_state, done) 43 | if loss is not None: 44 | episode_loss.append(loss) 45 | state = next_state 46 | score += reward 47 | if done: 48 | break 49 | scores_window.append(score) # save most recent score 50 | scores.append(score) # save most recent score 51 | eps = max(eps_end, eps_decay * eps) # decrease epsilon 52 | 53 | # beta = beta/beta_incre if beta= 200.0: 61 | print('\nEnvironment solved in {:d} episodes! \t Average Score: {:.2f}'.format(i_episode - 100, 62 | np.mean(scores_window))) 63 | torch.save(agent.qnetwork_local.state_dict(), model_file) 64 | print('\nTotal running time:{}'.format(arrow.now() - start_time)) 65 | break 66 | return scores 67 | 68 | 69 | def watch_agent(agent): 70 | 71 | state = env.reset() 72 | for j in range(500): 73 | action = agent.act(state) 74 | env.render() 75 | state, reward, done, _ = env.step(action) 76 | if done: 77 | break 78 | env.close() 79 | 80 | 81 | def watch_random_agent(): 82 | 83 | for _ in range(3): 84 | env.reset() 85 | while True: 86 | env.render() 87 | next_state, reward, done, _ =env.step(env.action_space.sample()) 88 | if done: 89 | break 90 | 91 | env.close() 92 | 93 | 94 | def trained_agent_test(filename,episode_num=500,max_t=1000,eps=0.01): 95 | """ 96 | :param filename: 97 | :param episode_num: 98 | :param max_t: 99 | :param eps: 100 | :return: 101 | """ 102 | # agent = Agent(state_size=8, action_size=4, seed=0) 103 | agent_v3 = AgentV3(state_size=8, action_size=4, seed=0) 104 | agent_v3.qnetwork_local.load_state_dict(torch.load(filename)) 105 | 106 | watch_agent(agent_v3) 107 | 108 | scores=[] 109 | scores_window = deque(maxlen=100) 110 | start_time=arrow.now() 111 | for i_episode in range(episode_num): 112 | state = env.reset() 113 | score = 0 114 | for t in range(max_t): 115 | # 直接采用贪婪策略 116 | action = agent_v3.act(state) 117 | next_state, reward, done, _ = env.step(action) 118 | state = next_state 119 | score += reward 120 | if done: 121 | break 122 | scores.append(score) 123 | scores_window.append(score) 124 | print('\rEpisode {}\t Average Score: {:.2f}'.format(i_episode, np.mean(scores_window)),end="") 125 | if i_episode % 100 == 0: 126 | print('\rEpisode {}\t Average Score: {:.2f}'.format(i_episode, np.mean(scores_window))) 127 | print('\rRunning time:{}\n'.format(arrow.now()-start_time)) 128 | return scores 129 | 130 | 131 | def plot_scores(scores,filename): 132 | # plot the scores 133 | fig = plt.figure() 134 | ax = fig.add_subplot(1, 1, 1) 135 | # ax.plot(np.arange(len(scores_1)), scores_1) 136 | ax.plot(np.arange(len(scores)), scores) 137 | # rolling_mean = pd.Series(scores).rolling(100).mean() 138 | 
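    # plot the raw per-episode scores, then label the axes and write the figure to filename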
plt.ylabel('Score') 139 | plt.xlabel('Episode #') 140 | plt.savefig(filename) 141 | 142 | 143 | if __name__=="__main__": 144 | env = gym.make('LunarLander-v2') 145 | env.seed(0) 146 | 147 | # 训练 ddqn agent 并获取平均累计奖励 148 | agent_v3 = AgentV3(state_size=8, action_size=4, seed=0) 149 | print("\n\nTraining ddqn agent:\n-------------------------------------------------------------\n") 150 | train_scores = dqn(agent_v3,'dueling_model.pth') 151 | # plot_scores(train_scores,'images/dueling-ddqn_training.png') 152 | 153 | # 观察未经训练的随机智能体 154 | #watch_random_agent() 155 | # 用训练好的智能体跑分并绘制奖励曲线 156 | # test_scores=trained_agent_test('models/dueling_model.pth') 157 | # plot_scores(test_scores,'images/dueling-ddqn_testing.png') 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /DQNs/DDQN/__pycache__/ddqn_v3.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/__pycache__/ddqn_v3.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DDQN/__pycache__/model_dueling.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/__pycache__/model_dueling.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DDQN/ddqn_v1.py: -------------------------------------------------------------------------------- 1 | """ 2 | 对经典 DQN 的改进 3 | 1. Double DQN 4 | """ 5 | 6 | import numpy as np 7 | import random 8 | from collections import namedtuple, deque 9 | import torch 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | from LunarLander.DQN.model import QNetwork 13 | 14 | 15 | BUFFER_SIZE = int(1e5) # replay buffer size 16 | BATCH_SIZE = 64 # minibatch size 17 | GAMMA = 0.99 # discount factor 18 | TAU = 1e-3 # for soft update of target parameters 19 | LR = 5e-4 # learning rate 20 | UPDATE_EVERY = 4 # how often to update the network 21 | 22 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 23 | 24 | 25 | class AgentV2(): 26 | """Interacts with and learns from the environment.""" 27 | 28 | def __init__(self, state_size, action_size, seed): 29 | """Initialize an Agent object. 30 | 31 | Params 32 | ====== 33 | state_size (int): dimension of each state 34 | action_size (int): dimension of each action 35 | seed (int): random seed 36 | """ 37 | self.state_size = state_size 38 | self.action_size = action_size 39 | self.seed = random.seed(seed) 40 | 41 | # Q-Network 42 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) 43 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) 44 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) 45 | 46 | # Replay memory 47 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) 48 | # Initialize time step (for updating every UPDATE_EVERY steps) 49 | self.t_step = 0 50 | 51 | def step(self, state, action, reward, next_state, done): 52 | # Save experience in replay memory 53 | self.memory.add(state, action, reward, next_state, done) 54 | 55 | # Learn every UPDATE_EVERY time steps. 
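        # the counter wraps modulo UPDATE_EVERY, so a learning step is triggered on every UPDATE_EVERY-th call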
56 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 57 | if self.t_step == 0: 58 | # If enough samples are available in memory, get random subset and learn 59 | if len(self.memory) > BATCH_SIZE: 60 | experiences = self.memory.sample() 61 | self.learn(experiences, GAMMA) 62 | 63 | def act(self, state, eps=0.): 64 | """Returns actions for given state as per current policy. 65 | 66 | Params 67 | ====== 68 | state (array_like): current state 69 | eps (float): epsilon, for epsilon-greedy action selection 70 | """ 71 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 72 | self.qnetwork_local.eval() 73 | with torch.no_grad(): 74 | action_values = self.qnetwork_local(state) 75 | self.qnetwork_local.train() 76 | 77 | # Epsilon-greedy action selection 78 | if random.random() > eps: 79 | return np.argmax(action_values.cpu().data.numpy()) 80 | else: 81 | return random.choice(np.arange(self.action_size)) 82 | 83 | def learn(self, experiences, gamma): 84 | """Update value parameters using given batch of experience tuples. 85 | 86 | Params 87 | ====== 88 | experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 89 | gamma (float): discount factor 90 | """ 91 | # 从 experiences 取得所有时间步的 (s,a,r,s',done)的序列,均为列向量 [BATCH_SIZE,1] 92 | states, actions, rewards, next_states, dones = experiences 93 | 94 | # ----------计算 Q targets------------------------------ # 95 | # 从local网络的 Q estimated 取最大值对应的动作序列 96 | Q_expected_next_max = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1) # shape:[BATCH_SIZE,1](.unsqueeze(1)转换成列向量) 97 | # Double TD3:这些动作序列输入target网络得到对应的 Q 估计值,而不是直接让 target 网络选取最大Q(避免了 overestimated 问题) 98 | Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_expected_next_max) 99 | # 根据公式计算 Q 目标 100 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) 101 | 102 | # --------- Get expected Q values from local model----------------- # 103 | # 找到每个 (state,action) 对应的q值,输出为一个q(s,a)序列 104 | # print(self.qnetwork_local(states)) # shape:[BATCH_SIZE,4] 105 | Q_expected = self.qnetwork_local(states).gather(1, actions) # shape:[BATCH_SIZE,1] 106 | 107 | # -------------训练 local网络-------------------------------- # 108 | # Compute loss 109 | loss = F.mse_loss(Q_expected, Q_targets) # 用Q估计值和Q目标计算均方差损失函数,都为列向量 110 | # Minimize the loss 111 | self.optimizer.zero_grad() # 先把原来的梯度清零 112 | loss.backward() 113 | self.optimizer.step() 114 | 115 | # ------------------- update target network ------------------- # 116 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 117 | 118 | def soft_update(self, local_model, target_model, tau): 119 | """Soft update model parameters. 120 | θ_target = τ*θ_local + (1 - τ)*θ_target 121 | 122 | Params 123 | ====== 124 | local_model (PyTorch model): weights will be copied from 125 | target_model (PyTorch model): weights will be copied to 126 | tau (float): interpolation parameter 127 | """ 128 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 129 | target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) 130 | 131 | 132 | class ReplayBuffer: 133 | """Fixed-size buffer to store experience tuples.""" 134 | 135 | def __init__(self, action_size, buffer_size, batch_size, seed): 136 | """Initialize a ReplayBuffer object. 
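        Experiences are kept in a deque with maxlen=buffer_size, so the oldest entries are dropped once the buffer is full.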
137 | 138 | Params 139 | ====== 140 | action_size (int): dimension of each action 141 | buffer_size (int): maximum size of buffer 142 | batch_size (int): size of each training batch 143 | seed (int): random seed 144 | """ 145 | self.action_size = action_size 146 | self.memory = deque(maxlen=buffer_size) 147 | self.batch_size = batch_size 148 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 149 | self.seed = random.seed(seed) 150 | 151 | def add(self, state, action, reward, next_state, done): 152 | """Add a new experience to memory.""" 153 | e = self.experience(state, action, reward, next_state, done) 154 | self.memory.append(e) 155 | 156 | def sample(self): 157 | """Randomly sample a batch of experiences from memory.""" 158 | experiences = random.sample(self.memory, k=self.batch_size) 159 | 160 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 161 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) 162 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 163 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to( 164 | device) 165 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to( 166 | device) 167 | 168 | return (states, actions, rewards, next_states, dones) 169 | 170 | def __len__(self): 171 | """Return the current size of internal memory.""" 172 | return len(self.memory) 173 | -------------------------------------------------------------------------------- /DQNs/DDQN/ddqn_v3.py: -------------------------------------------------------------------------------- 1 | """ 2 | 对经典 DQN 的改进 3 | 1. Double DQN 4 | 2. Dueling Network 5 | """ 6 | import numpy as np 7 | import random 8 | from collections import namedtuple, deque 9 | import torch 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | 13 | from DQNs.DDQN.model_dueling import QNetwork 14 | 15 | BUFFER_SIZE = int(1e4) # replay buffer size 16 | BATCH_SIZE = 64 # minibatch size 17 | GAMMA = 0.99 # discount factor 18 | TAU = 1e-3 # for soft update of target parameters 19 | LR = 5e-3 # learning rate 20 | UPDATE_EVERY = 4 # how often to update the network 21 | E=1e-8 # small number to add to the priority of experience 22 | 23 | 24 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 25 | 26 | 27 | class AgentV3(): 28 | """Interacts with and learns from the environment.""" 29 | 30 | def __init__(self, state_size, action_size, seed): 31 | """Initialize an Agent object. 
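        Builds the local and target dueling Q-networks, the Adam optimizer and a uniform replay buffer.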
32 | 33 | Params 34 | ====== 35 | state_size (int): dimension of each state 36 | action_size (int): dimension of each action 37 | seed (int): random seed 38 | """ 39 | self.state_size = state_size 40 | self.action_size = action_size 41 | # self.seed = random.seed(seed) 42 | 43 | # Q-Network 44 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) 45 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) 46 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) 47 | 48 | # Replay memory 49 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) 50 | # Initialize time step (for updating every UPDATE_EVERY steps) 51 | self.t_step = 0 52 | 53 | def step(self, state, action, reward, next_state, done): 54 | # Save experience in replay memory 55 | self.memory.add(state, action, reward, next_state, done) 56 | 57 | # Learn every UPDATE_EVERY time steps. 58 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 59 | if self.t_step == 0: 60 | # If enough samples are available in memory, get random subset and learn 61 | if len(self.memory) > BATCH_SIZE: 62 | experiences = self.memory.sample() 63 | 64 | loss=self.learn(experiences, GAMMA) 65 | return loss 66 | 67 | def act(self, state, eps=0.): 68 | """Returns actions for given state as per current policy. 69 | 70 | Params 71 | ====== 72 | state (array_like): current state 73 | eps (float): epsilon, for epsilon-greedy action selection 74 | """ 75 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 76 | self.qnetwork_local.eval() 77 | with torch.no_grad(): 78 | action_values = self.qnetwork_local(state) 79 | self.qnetwork_local.train() 80 | 81 | # Epsilon-greedy action selection 82 | if random.random() > eps: 83 | return np.argmax(action_values.cpu().data.numpy()) 84 | else: 85 | return random.choice(np.arange(self.action_size)) 86 | 87 | def learn(self, experiences, gamma): 88 | """Update value parameters using given batch of experience tuples. 
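        Double DQN target: the greedy next action is selected by the local network and evaluated by the target network; the MSE loss is returned as a numpy scalar.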
89 | 90 | Params 91 | ====== 92 | experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 93 | gamma (float): discount factor 94 | """ 95 | # 从 experiences 取得所有时间步的 (s,a,r,s',done)的序列 96 | states, actions, rewards, next_states, dones = experiences 97 | 98 | # ----------计算 Q targets------------------------------ # 99 | # 从local网络的 Q estimated 取最大值对应的动作序列 100 | Q_expected_next_max = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1) # shape:[BATCH_SIZE,1](.unsqueeze(1)转换成列向量) 101 | # Double TD3:这些动作序列输入target网络得到对应的 Q 估计值,而不是直接让 target 网络选取最大Q(避免了 overestimated 问题) 102 | Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_expected_next_max) 103 | # 根据公式计算 Q 目标 104 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) 105 | 106 | # --------- Get expected Q values from local model----------------- # 107 | # 找到每个 (state,action) 对应的q值,输出为一个q(s,a)序列 108 | # print(self.qnetwork_local(states)) # shape:[BATCH_SIZE,4] 109 | Q_expected = self.qnetwork_local(states).gather(1, actions) # shape:[BATCH_SIZE,1] 110 | 111 | # -------------训练 local网络-------------------------------- # 112 | # Compute loss 113 | loss = F.mse_loss(Q_expected, Q_targets) # 用Q估计值和Q目标计算均方差损失函数,都为列向量 114 | # Minimize the loss 115 | self.optimizer.zero_grad() # 先把原来的梯度清零 116 | loss.backward() 117 | self.optimizer.step() 118 | 119 | # ------------------- update target network ------------------- # 120 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 121 | 122 | return loss.cpu().detach().numpy() 123 | 124 | def soft_update(self, local_model, target_model, tau): 125 | """Soft update model parameters. 126 | θ_target = τ*θ_local + (1 - τ)*θ_target 127 | 128 | Params 129 | ====== 130 | local_model (PyTorch model): weights will be copied from 131 | target_model (PyTorch model): weights will be copied to 132 | tau (float): interpolation parameter 133 | """ 134 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 135 | target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) 136 | 137 | 138 | class ReplayBuffer: 139 | """Fixed-size buffer to store experience tuples.""" 140 | 141 | def __init__(self, action_size, buffer_size, batch_size, seed): 142 | """Initialize a ReplayBuffer object. 
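        Sampling is uniform over the stored experiences; prioritized replay is implemented separately under DQNs/DQN_PER.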
143 | 144 | Params 145 | ====== 146 | action_size (int): dimension of each action 147 | buffer_size (int): maximum size of buffer 148 | batch_size (int): size of each training batch 149 | seed (int): random seed 150 | """ 151 | # self.action_size = action_size 152 | self.memory = deque(maxlen=buffer_size) 153 | self.batch_size = batch_size 154 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 155 | # self.seed = random.seed(seed) 156 | 157 | def add(self, state, action, reward, next_state, done): 158 | """Add a new experience to memory.""" 159 | e = self.experience(state, action, reward, next_state, done) 160 | self.memory.append(e) 161 | 162 | def sample(self): 163 | """Randomly sample a batch of experiences from memory.""" 164 | experiences = random.sample(self.memory, k=self.batch_size) 165 | 166 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 167 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) 168 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 169 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to( 170 | device) 171 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to( 172 | device) 173 | 174 | return (states, actions, rewards, next_states, dones) 175 | 176 | def __len__(self): 177 | """Return the current size of internal memory.""" 178 | return len(self.memory) -------------------------------------------------------------------------------- /DQNs/DDQN/dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import namedtuple, deque 4 | from LunarLander.DQN.model import QNetwork 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | 10 | BUFFER_SIZE = int(1e5) # replay buffer size 11 | BATCH_SIZE = 64 # minibatch size 12 | GAMMA = 0.99 # discount factor 13 | TAU = 1e-3 # for soft update of target parameters 14 | LR = 5e-4 # learning rate 15 | UPDATE_EVERY = 4 # how often to update the network 16 | 17 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 18 | 19 | 20 | class Agent(): 21 | """Interacts with and learns from the environment.""" 22 | 23 | def __init__(self, state_size, action_size, seed): 24 | """Initialize an Agent object. 25 | 26 | Params 27 | ====== 28 | state_size (int): dimension of each state 29 | action_size (int): dimension of each action 30 | seed (int): random seed 31 | """ 32 | self.state_size = state_size 33 | self.action_size = action_size 34 | self.seed = random.seed(seed) 35 | 36 | # Q-Network 37 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) 38 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) 39 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) 40 | 41 | # Replay memory 42 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) 43 | # Initialize time step (for updating every UPDATE_EVERY steps) 44 | self.t_step = 0 45 | 46 | def step(self, state, action, reward, next_state, done): 47 | # Save experience in replay memory 48 | self.memory.add(state, action, reward, next_state, done) 49 | 50 | # Learn every UPDATE_EVERY time steps. 
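        # unlike the DDQN agents, learn() here does not return the training loss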
51 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 52 | if self.t_step == 0: 53 | # If enough samples are available in memory, get random subset and learn 54 | if len(self.memory) > BATCH_SIZE: 55 | experiences = self.memory.sample() 56 | self.learn(experiences, GAMMA) 57 | 58 | def act(self, state, eps=0.): 59 | """Returns actions for given state as per current policy. 60 | 61 | Params 62 | ====== 63 | state (array_like): current state 64 | eps (float): epsilon, for epsilon-greedy action selection 65 | """ 66 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 67 | self.qnetwork_local.eval() 68 | with torch.no_grad(): 69 | action_values = self.qnetwork_local(state) 70 | self.qnetwork_local.train() 71 | 72 | # Epsilon-greedy action selection 73 | if random.random() > eps: 74 | return np.argmax(action_values.cpu().data.numpy()) 75 | else: 76 | return random.choice(np.arange(self.action_size)) 77 | 78 | def learn(self, experiences, gamma): 79 | """Update value parameters using given batch of experience tuples. 80 | 81 | Params 82 | ====== 83 | experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 84 | gamma (float): discount factor 85 | """ 86 | # 从 experiences 取得所有时间步的 (s,a,r,s',done)的序列,均为列向量 [BATCH_SIZE,1] 87 | states, actions, rewards, next_states, dones = experiences 88 | # ------计算每个经验元组对应的Q目标序列 89 | # Get max predicted Q values (for next states) from target model 90 | # print(self.qnetwork_target(next_states)) # shape:[BATCH_SIZE,4] 91 | Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # shape:[BATCH_SIZE,1] 92 | # Compute Q targets for current states 93 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # shape:[BATCH_SIZE,1] 94 | 95 | # --------- Get expected Q values from local model 96 | # 找到每个 (state,action) 对应的q值,输出为一个q(s,a)序列 97 | # print(self.qnetwork_local(states)) # shape:[BATCH_SIZE,4] 98 | Q_expected = self.qnetwork_local(states).gather(1, actions) # shape:[BATCH_SIZE,1] 99 | 100 | # Compute loss 101 | loss = F.mse_loss(Q_expected, Q_targets) # 用Q估计值和Q目标计算均方差损失函数,都为列向量 102 | # Minimize the loss 103 | self.optimizer.zero_grad() # 先把原来的梯度清零 104 | loss.backward() 105 | self.optimizer.step() 106 | 107 | # ------------------- update target network ------------------- # 108 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 109 | 110 | def soft_update(self, local_model, target_model, tau): 111 | """Soft update model parameters. 112 | θ_target = τ*θ_local + (1 - τ)*θ_target 113 | 114 | Params 115 | ====== 116 | local_model (PyTorch model): weights will be copied from 117 | target_model (PyTorch model): weights will be copied to 118 | tau (float): interpolation parameter 119 | """ 120 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 121 | target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) 122 | 123 | 124 | class ReplayBuffer: 125 | """Fixed-size buffer to store experience tuples.""" 126 | 127 | def __init__(self, action_size, buffer_size, batch_size, seed): 128 | """Initialize a ReplayBuffer object. 
129 | 130 | Params 131 | ====== 132 | action_size (int): dimension of each action 133 | buffer_size (int): maximum size of buffer 134 | batch_size (int): size of each training batch 135 | seed (int): random seed 136 | """ 137 | self.action_size = action_size 138 | self.memory = deque(maxlen=buffer_size) 139 | self.batch_size = batch_size 140 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 141 | self.seed = random.seed(seed) 142 | 143 | def add(self, state, action, reward, next_state, done): 144 | """Add a new experience to memory.""" 145 | e = self.experience(state, action, reward, next_state, done) 146 | self.memory.append(e) 147 | 148 | def sample(self): 149 | """Randomly sample a batch of experiences from memory.""" 150 | experiences = random.sample(self.memory, k=self.batch_size) 151 | 152 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 153 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device) 154 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 155 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to( 156 | device) 157 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to( 158 | device) 159 | 160 | return (states, actions, rewards, next_states, dones) 161 | 162 | def __len__(self): 163 | """Return the current size of internal memory.""" 164 | return len(self.memory) -------------------------------------------------------------------------------- /DQNs/DDQN/images/Total Average reward scores plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/Total Average reward scores plot.png -------------------------------------------------------------------------------- /DQNs/DDQN/images/ddqn_agent_scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/ddqn_agent_scores.png -------------------------------------------------------------------------------- /DQNs/DDQN/images/ddqn_testing_scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/ddqn_testing_scores.png -------------------------------------------------------------------------------- /DQNs/DDQN/images/double_dqn_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/double_dqn_v1.png -------------------------------------------------------------------------------- /DQNs/DDQN/images/dueling-ddqn_testing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/dueling-ddqn_testing.png 
-------------------------------------------------------------------------------- /DQNs/DDQN/images/dueling-ddqn_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/dueling-ddqn_training.png -------------------------------------------------------------------------------- /DQNs/DDQN/images/runningResult.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/runningResult.png -------------------------------------------------------------------------------- /DQNs/DDQN/images/runningResult_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/runningResult_1.png -------------------------------------------------------------------------------- /DQNs/DDQN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class QNetwork(nn.Module): 7 | """Actor (Policy) Model.""" 8 | 9 | def __init__(self, state_size, action_size, seed): 10 | """Initialize parameters and build model. 11 | Params 12 | ====== 13 | state_size (int): Dimension of each state 14 | action_size (int): Dimension of each action 15 | seed (int): Random seed 16 | """ 17 | super(QNetwork, self).__init__() 18 | self.seed = torch.manual_seed(seed) 19 | self.fc1=nn.Linear(state_size,64) 20 | self.fc2=nn.Linear(64,64) 21 | self.fc3=nn.Linear(64,action_size) 22 | 23 | 24 | def forward(self, state): 25 | """Build a network that maps state -> action values.""" 26 | out=self.fc1(state) 27 | out=F.relu(out) 28 | out=self.fc2(out) 29 | out=F.relu(out) 30 | q_a=self.fc3(out) 31 | 32 | return q_a 33 | -------------------------------------------------------------------------------- /DQNs/DDQN/model_dueling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | H_1=64 6 | H_2=64 7 | 8 | class QNetwork(nn.Module): 9 | """Dueling Architecture""" 10 | 11 | def __init__(self, state_size, action_size, seed): 12 | """Initialize parameters and build model. 
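        After the shared first layer the network splits into separate advantage and state-value streams, which forward() recombines into Q(s,a).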
13 | Params 14 | ====== 15 | state_size (int): Dimension of each state 16 | action_size (int): Dimension of each action 17 | seed (int): Random seed 18 | """ 19 | super(QNetwork, self).__init__() 20 | self.action_size=action_size 21 | self.seed = torch.manual_seed(seed) 22 | self.fc1=nn.Linear(state_size,H_1) 23 | 24 | self.fc2_adv = nn.Linear(H_1,H_2) 25 | self.fc2_v = nn.Linear(H_1, H_2) 26 | 27 | self.fc3_adv = nn.Linear(H_2,action_size) 28 | self.fc3_v = nn.Linear(H_2, 1) 29 | 30 | 31 | def forward(self, state): 32 | # first hidden layer 33 | h1=F.relu(self.fc1(state)) 34 | 35 | # dueling start in second layer 36 | h2_adv = F.relu(self.fc2_adv(h1)) 37 | h2_v = F.relu(self.fc2_v(h1)) 38 | 39 | # final advantage value 40 | adv = self.fc3_adv(h2_adv) 41 | # final state value 42 | v = self.fc3_v(h2_v).expand(state.size(0), self.action_size) # 从1维扩展到 action_size维 43 | 44 | # calculate final Q(s,a) value for output 45 | out_q=v+adv-adv.mean(1).unsqueeze(1).expand(state.size(0), self.action_size) 46 | 47 | return out_q 48 | 49 | 50 | -------------------------------------------------------------------------------- /DQNs/DDQN/models/checkpoint.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/models/checkpoint.pth -------------------------------------------------------------------------------- /DQNs/DDQN/models/dueling_model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/models/dueling_model.pth -------------------------------------------------------------------------------- /DQNs/DDQN/models/org_dqn.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/models/org_dqn.pth -------------------------------------------------------------------------------- /DQNs/DDQN/play_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import torch 4 | import numpy as np 5 | from collections import deque 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | env = gym.make('LunarLander-v2') 10 | env.seed(0) 11 | print('State shape: ', env.observation_space.shape) 12 | print('Number of actions: ', env.action_space.n) 13 | 14 | 15 | # 观察一个未经训练的随机智能体 16 | state = env.reset() 17 | for _ in range(10000): 18 | env.render() 19 | next_state, reward, done, _ =env.step(env.action_space.sample()) 20 | # print(reward) 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /DQNs/DDQN/test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import torch 4 | from collections import namedtuple, deque 5 | 6 | lst = [1,2,3,4,0,6] 7 | 8 | print( np.array(lst) / 2.5) 9 | print( np.array(lst) / 2.5 + 0.0001) 10 | 11 | s1 = np.array([[1,2,3,4]]) 12 | s2 = np.vstack([[3,4,5,6]]) 13 | print(np.vstack([s1,s2])) 14 | 15 | 16 | 17 | 18 | target_org=np.array([[ 0.0910, -0.0224, -0.0552, -0.0192], 19 | [ 0.0908, -0.0209, -0.0553, -0.0181], 20 | [ 0.0922, -0.0219, -0.0546, -0.0206], 21 | [ 0.0913, -0.0211, -0.0548, -0.0182], 22 | [ 0.0910, -0.0211, 
-0.0554, -0.0187]]) 23 | target_org=torch.tensor(target_org) 24 | # print(target_org.shape) 25 | # # 按行取最大值 26 | # print(target_org.detach().max(1)) 27 | # print(target_org.detach().max(1)[0]) 28 | # # 转换成列向量 29 | # print(target_org.detach().max(1)[0].unsqueeze(1)) 30 | 31 | 32 | local_org=np.array([[ 0.0936, -0.0768, -0.1730, -0.0238], 33 | [ 0.0930, -0.0620, -0.1845, -0.0077], 34 | [ 0.0986, -0.0473, -0.1868, 0.0110], 35 | [ 0.0946, -0.0752, -0.1726, -0.0264], 36 | [ 0.0979, -0.0497, -0.1886, 0.0097]]) 37 | local_org=torch.tensor(local_org) 38 | actions=torch.tensor(np.array( 39 | [[3], 40 | [1], 41 | [2], 42 | [0], 43 | [0]])) 44 | # print(actions) 45 | # print(local_org.shape) 46 | # print(local_org.gather(1, actions.long())) 47 | 48 | 49 | b=torch.tensor(np.array([ 0.0932, -0.0206, -0.0541, -0.0204])) 50 | action=torch.LongTensor([0]) 51 | print(b.gather(0,action)) 52 | 53 | memory=deque(maxlen=10) 54 | exp=namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 55 | e1=exp(0.34,1,3.56,0.56,False) 56 | memory.append(e1) 57 | e2=exp(3.34,0,8.56,-2.3,False) 58 | memory.append(e2) 59 | memory.append(exp(4.6,0,8.56,-2.3,False)) 60 | memory.append(exp(8.7,0,8.56,-4.3,False)) 61 | memory.append(exp(2.2,0,-0.8,-2.3,False)) 62 | 63 | 64 | 65 | # print(memory) 66 | # print(memory[0].state) 67 | # print(len(memory)) 68 | # # 69 | # sample_inds=np.random.choice(len(memory), 3, p=[0.1,0.2,0.2,0.4,0.1],replace=False) 70 | # print(sample_inds) 71 | 72 | 73 | # env = gym.make('LunarLander-v2') 74 | # env.seed(0) 75 | # print('State shape: ', env.observation_space.shape) 76 | # print('Number of actions: ', env.action_space.n) 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /DQNs/DQN_PER/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/.DS_Store -------------------------------------------------------------------------------- /DQNs/DQN_PER/PER_memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from SumTree import SumTree 4 | # from DQNs.DQN_PER.SumTree import SumTree 5 | 6 | 7 | class Memory: # stored as ( s, a, r, s_ ) in SumTree 8 | e = 0.0001 9 | alpha = 0.6 10 | beta = 0.4 11 | beta_increment_per_sampling = 0.001 12 | 13 | def __init__(self, capacity): 14 | self.tree = SumTree(capacity) 15 | self.capacity = capacity 16 | 17 | # 根据 TD-error 计算优先级 18 | def _get_priority(self, error): 19 | return (np.abs(error) + self.e) ** self.alpha 20 | 21 | # 存储一条经验和相应优先级 22 | def add(self, error, sample): 23 | p = self._get_priority(error) 24 | self.tree.add(p, sample) 25 | 26 | def batch_sample(self, n): 27 | batch = [] 28 | idxs = [] 29 | segment = self.tree.total() / n 30 | priorities = [] 31 | 32 | # beta 随着sample的次数增加而增大(??),上限为 1.0 33 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling]) 34 | 35 | # 把叶子节点分成n个采样区间(n为样本数量) 36 | for i in range(n): 37 | a = segment * i 38 | b = segment * (i + 1) 39 | s = random.uniform(a, b) 40 | (idx, p, data) = self.tree.get(s) 41 | priorities.append(p) 42 | batch.append(data) 43 | idxs.append(idx) 44 | 45 | # 采样概率 46 | sampling_probabilities = np.array(priorities) / self.tree.total() + self.e 47 | # 样本权重: IS weight 48 | is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta) 49 | 
is_weight /= is_weight.max() 50 | 51 | return batch, idxs, is_weight 52 | 53 | def update(self, idx, error): 54 | p = self._get_priority(error) 55 | self.tree.update(idx, p) 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/cnn_per.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/cnn_per.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/epsilon_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_1.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/epsilon_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_2.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/epsilon_exp-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_exp-1.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/epsilon_exp-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_exp-2.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/epsilon_exp-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_exp-3.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/epsilon_linear-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_linear-1.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_1.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_2.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_DQN_per.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_DQN_per.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_exp-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_exp-1.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_exp-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_exp-2.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_exp-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_exp-3.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_exp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_exp.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/Plots/train_linear-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_linear-1.png -------------------------------------------------------------------------------- /DQNs/DQN_PER/SumTree.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | 4 | # SumTree 5 | # a binary tree data structure where the parent’s value is the sum of its children 6 | class SumTree: 7 | write = 0 8 | 9 | def __init__(self, capacity): 10 | self.capacity = capacity 11 | self.tree = numpy.zeros(2 * capacity - 1) 12 | self.data = numpy.zeros(capacity, dtype=object) 13 | self.n_entries = 0 14 | 15 | def total(self): 16 | return self.tree[0] 17 | 18 | # 从叶子节点到根节点向上传播,更新整棵树 19 | def _propagate(self, idx, change): 20 | parent = (idx - 1) // 2 21 | self.tree[parent] += change 22 | 23 | if parent != 0: 24 | self._propagate(parent, change) 25 | 26 | # 更新目标节点的 priority 27 | def update(self, idx, p): 28 | change = p - self.tree[idx] 29 | 30 | self.tree[idx] = p 31 | self._propagate(idx, change) 32 | 33 | # 存储样本和对应节点的 priority (只有叶子节点可以存储,上面节点的值都是下层的求和) 34 | def add(self, p, data): 35 | # 计算叶子节点的 index 36 | idx = self.write + self.capacity - 1 37 | 38 | self.data[self.write] = data 39 | self.update(idx, p) 40 | 41 | # 如果叶子节点已满,则从第一个开始清空重新存储 42 | self.write += 1 43 | if self.write >= self.capacity: 44 | self.write = 0 45 | 46 | if self.n_entries < self.capacity: 47 | self.n_entries += 1 48 | 49 | # 从根节点开始搜索,找到对应的叶子节点 50 | def _retrieve(self, idx, s): 51 | left = 2 * idx + 1 52 | right = left + 1 53 | 54 | if left >= len(self.tree): 55 | return idx 56 | 57 | if s 
<= self.tree[left]: 58 | return self._retrieve(left, s) 59 | else: 60 | return self._retrieve(right, s - self.tree[left]) 61 | 62 | # 采样方法,取得样本和对应的 priority 63 | def get(self, s): 64 | # 找到叶子节点的索引 65 | idx = self._retrieve(0, s) # s:在每个区间随机取的值 66 | # 找到样本的索引 67 | dataIdx = idx - self.capacity + 1 68 | 69 | return (idx, self.tree[idx], self.data[dataIdx]) 70 | -------------------------------------------------------------------------------- /DQNs/DQN_PER/__pycache__/PER_memory.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/PER_memory.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DQN_PER/__pycache__/SumTree.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/SumTree.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DQN_PER/__pycache__/atari_wappers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/atari_wappers.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DQN_PER/__pycache__/dqn_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/dqn_model.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DQN_PER/__pycache__/dqn_per.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/dqn_per.cpython-38.pyc -------------------------------------------------------------------------------- /DQNs/DQN_PER/atari_wappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import gym 3 | import gym.spaces 4 | import numpy as np 5 | import collections 6 | 7 | 8 | class MaxAndSkipEnv(gym.Wrapper): 9 | """ 10 | Combines the repetition of actions during k frames and pixels from two consecutive frames. 11 | """ 12 | def __init__(self, env=None, skip=4): 13 | super(MaxAndSkipEnv, self).__init__(env) 14 | self._obs_buffer = collections.deque(maxlen=2) 15 | self._skip = skip 16 | 17 | def step(self, action): 18 | total_reward = 0.0 19 | done = None 20 | for _ in range(self._skip): 21 | obs, reward, done, info = self.env.step(action) 22 | self._obs_buffer.append(obs) 23 | total_reward += reward 24 | if done: 25 | break 26 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 27 | return max_frame, total_reward, done, info 28 | 29 | def reset(self): 30 | self._obs_buffer.clear() 31 | obs = self.env.reset() 32 | self._obs_buffer.append(obs) 33 | return obs 34 | 35 | 36 | class FireResetEnv(gym.Wrapper): 37 | """ 38 | Presses fire button for environments that require it for the game to start. 
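    The overridden reset() presses FIRE (action 1) and, if the episode terminates immediately, retries with action 2 before returning the first observation.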
39 | Also checks for some corner cases in some games 40 | """ 41 | def __init__(self,env=None): 42 | """For environments where the user need to press FIRE for the game to start.""" 43 | super(FireResetEnv, self).__init__(env) 44 | assert env.unwrapped.get_action_meanings()[1]=="FIRE" 45 | assert len(env.unwrapped.get_action_meanings()) >= 3 46 | 47 | def step(self,action): 48 | 49 | return self.env.step(action) 50 | 51 | def reset(self): 52 | 53 | self.env.reset() 54 | 55 | obs,_,done,_ = self.env.step(1) 56 | if done: 57 | self.env.reset() 58 | obs, _, done, _ = self.env.step(2) 59 | if done: 60 | self.env.reset() 61 | return obs 62 | 63 | 64 | class ProcessFrame84(gym.ObservationWrapper): 65 | """ 66 | converts input image of 210x160 rgb to grayscale 84x84 67 | """ 68 | def __init__(self, env=None): 69 | super(ProcessFrame84, self).__init__(env) 70 | 71 | self.observation_space = gym.spaces.Box( 72 | low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) 73 | 74 | def observation(self, obs): 75 | 76 | return ProcessFrame84.process(obs) 77 | @staticmethod 78 | def process(frame): 79 | if frame.size == 210 * 160 * 3: 80 | img = np.reshape(frame, [210, 160, 3]).astype( 81 | np.float32) 82 | elif frame.size == 250 * 160 * 3: 83 | img = np.reshape(frame, [250, 160, 3]).astype( 84 | np.float32) 85 | else: 86 | assert False, "Unknown resolution." 87 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + \ 88 | img[:, :, 2] * 0.114 89 | resized_screen = cv2.resize( 90 | img, (84, 110), interpolation=cv2.INTER_AREA) 91 | x_t = resized_screen[18:102, :] 92 | x_t = np.reshape(x_t, [84, 84, 1]) 93 | return x_t.astype(np.uint8) 94 | 95 | 96 | class ImageToPyTorch(gym.ObservationWrapper): 97 | def __init__(self, env): 98 | super(ImageToPyTorch, self).__init__(env) 99 | old_shape = self.observation_space.shape 100 | new_shape = (old_shape[-1], old_shape[0], old_shape[1]) 101 | self.observation_space = gym.spaces.Box( 102 | low=0.0, high=1.0, shape=new_shape, dtype=np.float32) 103 | def observation(self, observation): 104 | return np.moveaxis(observation, 2, 0) 105 | 106 | 107 | class BufferWrapper(gym.ObservationWrapper): 108 | def __init__(self, env, n_steps, dtype=np.float32): 109 | super(BufferWrapper, self).__init__(env) 110 | self.dtype = dtype 111 | old_space = env.observation_space 112 | self.observation_space = gym.spaces.Box( 113 | old_space.low.repeat(n_steps, axis=0), 114 | old_space.high.repeat(n_steps, axis=0), dtype=dtype) 115 | def reset(self): 116 | self.buffer = np.zeros_like( 117 | self.observation_space.low, dtype=self.dtype) 118 | return self.observation(self.env.reset()) 119 | def observation(self, observation): 120 | self.buffer[:-1] = self.buffer[1:] 121 | self.buffer[-1] = observation 122 | return self.buffer 123 | 124 | 125 | class ScaledFloatFrame(gym.ObservationWrapper): 126 | def observation(self, obs): 127 | return np.array(obs).astype(np.float32) / 255.0 128 | 129 | 130 | def make_env(env_name): 131 | env = gym.make(env_name) 132 | env = MaxAndSkipEnv(env) 133 | env = FireResetEnv(env) 134 | env = ProcessFrame84(env) 135 | env = ImageToPyTorch(env) 136 | env = BufferWrapper(env, 4) 137 | env = ScaledFloatFrame(env) 138 | 139 | return env 140 | 141 | 142 | if __name__ == "__main__": 143 | env_name = "Pong-v0" 144 | 145 | env = make_env(env_name) 146 | print(env.reset().shape) 147 | print(env.observation_space) 148 | env.render() -------------------------------------------------------------------------------- /DQNs/DQN_PER/dqn_model.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class MLP_Model(nn.Module): 8 | def __init__(self, state_size, action_size): 9 | super(MLP_Model, self).__init__() 10 | self.fc1=nn.Linear(state_size,128) 11 | self.fc2=nn.Linear(128,256) 12 | self.fc3=nn.Linear(256,action_size) 13 | 14 | def forward(self, state): 15 | """Build a network that maps state -> action values.""" 16 | out=self.fc1(state) 17 | out=F.relu(out) 18 | out=self.fc2(out) 19 | out=F.relu(out) 20 | q_a=self.fc3(out) 21 | 22 | return q_a 23 | 24 | 25 | class CNN_Model (nn.Module): 26 | def __init__(self, input_shape, n_actions): 27 | super(CNN_Model, self).__init__() 28 | self.conv = nn.Sequential( 29 | # input_shape 的第一个维度为 输入的 channel 数,比如输入为(4,84,84)时,channel = 4 30 | nn.Conv2d(input_shape[0], 128, kernel_size=8, stride=4), 31 | nn.ReLU(), 32 | nn.Conv2d(128, 256, kernel_size=4, stride=2), 33 | nn.ReLU(), 34 | nn.Conv2d(256, 128, kernel_size=3, stride=1), 35 | nn.ReLU() 36 | ) 37 | conv_out_size = self._get_conv_out(input_shape) 38 | self.fc = nn.Sequential( 39 | nn.Linear(conv_out_size, 512), 40 | nn.ReLU(), 41 | nn.Linear(512, n_actions) 42 | ) 43 | 44 | def _get_conv_out(self, input_shape): 45 | o = self.conv(torch.zeros((1, *input_shape))) 46 | return int(np.prod(o.size())) 47 | 48 | def forward(self, x): 49 | conv_out = self.conv(x) 50 | conv_out = conv_out.view(x.size()[0], -1) 51 | return self.fc(conv_out) 52 | -------------------------------------------------------------------------------- /DQNs/DQN_PER/main_dqn_per.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" 3 | 4 | import gym 5 | import arrow 6 | import torch 7 | import numpy as np 8 | from matplotlib import pyplot as plt 9 | from collections import deque 10 | from dqn_per import Agent_dqn 11 | import atari_wappers 12 | # from DQNs.DQN_PER.dqn_per import Agent_dqn 13 | # from DQNs.DQN_PER import atari_wappers 14 | 15 | 16 | def train_agent(agent,state_size,n_episodes ): 17 | scores_window = deque(maxlen=100) # last 100 scores 18 | scores , eps_lst = [],[] 19 | 20 | start_time = arrow.now() 21 | for i_episode in range(1, n_episodes + 1): 22 | state = env.reset() 23 | score = 0 24 | 25 | while True: 26 | action,epsilon = agent.act(state,i_episode) 27 | next_state, reward, done, _ = env.step(action) 28 | 29 | ## add sample and train agent 30 | sarsd = (state, action, reward, next_state, done) 31 | agent.step(sarsd) 32 | 33 | state = next_state 34 | score += reward 35 | if done: 36 | break 37 | 38 | scores_window.append(score) # save most recent score 39 | scores.append(score) # save most recent score 40 | eps_lst.append(epsilon) 41 | 42 | print('\rEpisode {} \t Average Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="") 43 | if i_episode % 100 == 0: 44 | print('\rEpisode {}\t Average Score: {:.2f}'.format(i_episode,np.mean(scores_window))) 45 | print('\rRunning time:{}\n'.format(arrow.now() - start_time)) 46 | # if np.mean(scores_window) >= 195.0: 47 | # print('\nEnvironment solved in {:d} episodes! 
\t Average Score: {:.2f}'.format(i_episode - 100, 48 | # np.mean(scores_window))) 49 | # # torch.save(agent.qnetwork_local.state_dict(), model_file) 50 | # print('\nTotal running time:{}'.format(arrow.now() - start_time)) 51 | # break 52 | 53 | return scores,eps_lst 54 | 55 | 56 | def plot_curves(data,plot_name,filename): 57 | fig = plt.figure() 58 | ax = fig.add_subplot(1, 1, 1) 59 | ax.plot(np.arange(len(data)), data) 60 | plt.ylabel(plot_name) 61 | plt.xlabel('Episode #') 62 | plt.savefig(filename) 63 | 64 | 65 | if __name__=="__main__": 66 | env = atari_wappers.make_env("SpaceInvaders-v0") 67 | state_size, action_size = env.observation_space.shape, env.action_space.n 68 | 69 | cnn_agent = Agent_dqn(state_size,action_size,'CNN','True','nonlinear') 70 | train_scores, _ = train_agent(cnn_agent, state_size, 2500) 71 | plot_curves(train_scores, 'Scores', 'Plots/cnn_per.png') 72 | 73 | # env = gym.make('CartPole-v0') 74 | # env.seed(0) 75 | # state_size, action_size = env.observation_space.shape[0], env.action_space.n 76 | # mlp_agent = Agent_dqn(state_size, action_size,'MLP','True','nonlinear') 77 | # train_scores,eps_lst = train_agent(mlp_agent,state_size,2500) 78 | # plot_curves(train_scores,'Scores','Plots/train_exp-3.png') 79 | # if mlp_agent.eps_decay: 80 | # plot_curves(eps_lst,'Epsilon', 'Plots/epsilon_exp-3.png') 81 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/.DS_Store -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Models/CNN_model|03-29#19:21.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/CNN_model|03-29#19:21.pth -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Models/CNN_model|03-30#11:19.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/CNN_model|03-30#11:19.pth -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Models/CNN_model|03-30#21:05.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/CNN_model|03-30#21:05.pth -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Models/CNN_model|03-31#19:32.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/CNN_model|03-31#19:32.pth -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Models/dqnCNN_model_0324.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/dqnCNN_model_0324.pth -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Models/dqn_model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/dqn_model.pth -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/test-score|03-25#20:00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/test-score|03-25#20:00.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/test-score|03-26#09:15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/test-score|03-26#09:15.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/test-score|03-26#09:45.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/test-score|03-26#09:45.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/train-score|03-29#19:21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/train-score|03-29#19:21.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/train-score|03-30#11:19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/train-score|03-30#11:19.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/train-score|03-30#21:05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/train-score|03-30#21:05.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/Plots/train-score|03-31#19:32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/train-score|03-31#19:32.png -------------------------------------------------------------------------------- /DQNs/DQN_cnn/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | ### To start the project 3 | 4 | #### 0. 
Basic settings 5 | * `env_name`: (str) name of the gym Atari env that you want to play with 6 | * `run_mode` : (str: train/test) 7 | 8 | #### 1. To train an agent using DDQN with a CNN network 9 | * `train_episode` 10 | * `learning_rate` 11 | * `buffer_size` 12 | * `batch_size` 13 | * `gamma` 14 | * `update_every` 15 | * `eps_decay` 16 | 17 | e.g. 18 | ``` 19 | python main_dqn_atari.py SpaceInvaders-v0 train --learning_rate 1e-3 20 | ``` 21 | 22 | To run in the background and save a log file: 23 | ``` 24 | nohup python -u main_dqn_atari.py SpaceInvaders-v0 train --learning_rate 1e-3 > train_20210326.log 2>&1 & 25 | ``` 26 | 27 | #### 2. To test a trained agent 28 | * `test_episode` (int) number of episodes you want to test the agent for 29 | * `test_model_file` (str) path of the model file corresponding to the trained agent you want to test 30 | * `test_video_play` (str: yes/no) whether you want to watch the gameplay video during testing 31 | 32 | e.g. 33 | ``` 34 | python main_dqn_atari.py SpaceInvaders-v0 test --test_episode 500 --test_model_file Models/dqnCNN_model_0324.pth --test_video_play no 35 | ``` 36 | 37 |
-------------------------------------------------------------------------------- /DQNs/DQN_cnn/__pycache__/atari_wappers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/__pycache__/atari_wappers.cpython-38.pyc
-------------------------------------------------------------------------------- /DQNs/DQN_cnn/__pycache__/cnn_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/__pycache__/cnn_model.cpython-38.pyc
-------------------------------------------------------------------------------- /DQNs/DQN_cnn/__pycache__/dqn_agent.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/__pycache__/dqn_agent.cpython-38.pyc
-------------------------------------------------------------------------------- /DQNs/DQN_cnn/atari_wappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import gym 3 | import gym.spaces 4 | import numpy as np 5 | import collections 6 | 7 | 8 | class MaxAndSkipEnv(gym.Wrapper): 9 | """ 10 | Repeats each action for `skip` frames and returns the pixel-wise max over the last two frames. 11 | """ 12 | def __init__(self, env=None, skip=4): 13 | super(MaxAndSkipEnv, self).__init__(env) 14 | self._obs_buffer = collections.deque(maxlen=2) 15 | self._skip = skip 16 | 17 | def step(self, action): 18 | total_reward = 0.0 19 | done = None 20 | for _ in range(self._skip): 21 | obs, reward, done, info = self.env.step(action) 22 | self._obs_buffer.append(obs) 23 | total_reward += reward 24 | if done: 25 | break 26 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 27 | return max_frame, total_reward, done, info 28 | 29 | def reset(self): 30 | self._obs_buffer.clear() 31 | obs = self.env.reset() 32 | self._obs_buffer.append(obs) 33 | return obs 34 | 35 | 36 | class FireResetEnv(gym.Wrapper): 37 | """ 38 | Presses the FIRE button in environments that require it for the game to start.
39 | Also checks for some corner cases in some games 40 | """ 41 | def __init__(self,env=None): 42 | """For environments where the user need to press FIRE for the game to start.""" 43 | super(FireResetEnv, self).__init__(env) 44 | assert env.unwrapped.get_action_meanings()[1]=="FIRE" 45 | assert len(env.unwrapped.get_action_meanings()) >= 3 46 | 47 | def step(self,action): 48 | 49 | return self.env.step(action) 50 | 51 | def reset(self): 52 | 53 | self.env.reset() 54 | 55 | obs,_,done,_ = self.env.step(1) 56 | if done: 57 | self.env.reset() 58 | obs, _, done, _ = self.env.step(2) 59 | if done: 60 | self.env.reset() 61 | return obs 62 | 63 | 64 | class ProcessFrame84(gym.ObservationWrapper): 65 | """ 66 | converts input image of 210x160 rgb to grayscale 84x84 67 | """ 68 | def __init__(self, env=None): 69 | super(ProcessFrame84, self).__init__(env) 70 | 71 | self.observation_space = gym.spaces.Box( 72 | low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) 73 | 74 | def observation(self, obs): 75 | 76 | return ProcessFrame84.process(obs) 77 | @staticmethod 78 | def process(frame): 79 | if frame.size == 210 * 160 * 3: 80 | img = np.reshape(frame, [210, 160, 3]).astype( 81 | np.float32) 82 | elif frame.size == 250 * 160 * 3: 83 | img = np.reshape(frame, [250, 160, 3]).astype( 84 | np.float32) 85 | else: 86 | assert False, "Unknown resolution." 87 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + \ 88 | img[:, :, 2] * 0.114 89 | resized_screen = cv2.resize( 90 | img, (84, 110), interpolation=cv2.INTER_AREA) 91 | x_t = resized_screen[18:102, :] 92 | x_t = np.reshape(x_t, [84, 84, 1]) 93 | return x_t.astype(np.uint8) 94 | 95 | 96 | class ImageToPyTorch(gym.ObservationWrapper): 97 | def __init__(self, env): 98 | super(ImageToPyTorch, self).__init__(env) 99 | old_shape = self.observation_space.shape 100 | new_shape = (old_shape[-1], old_shape[0], old_shape[1]) 101 | self.observation_space = gym.spaces.Box( 102 | low=0.0, high=1.0, shape=new_shape, dtype=np.float32) 103 | def observation(self, observation): 104 | return np.moveaxis(observation, 2, 0) 105 | 106 | 107 | class BufferWrapper(gym.ObservationWrapper): 108 | def __init__(self, env, n_steps, dtype=np.float32): 109 | super(BufferWrapper, self).__init__(env) 110 | self.dtype = dtype 111 | old_space = env.observation_space 112 | self.observation_space = gym.spaces.Box( 113 | old_space.low.repeat(n_steps, axis=0), 114 | old_space.high.repeat(n_steps, axis=0), dtype=dtype) 115 | def reset(self): 116 | self.buffer = np.zeros_like( 117 | self.observation_space.low, dtype=self.dtype) 118 | return self.observation(self.env.reset()) 119 | def observation(self, observation): 120 | self.buffer[:-1] = self.buffer[1:] 121 | self.buffer[-1] = observation 122 | return self.buffer 123 | 124 | 125 | class ScaledFloatFrame(gym.ObservationWrapper): 126 | def observation(self, obs): 127 | return np.array(obs).astype(np.float32) / 255.0 128 | 129 | 130 | def make_env(env_name): 131 | env = gym.make(env_name) 132 | env = MaxAndSkipEnv(env) 133 | env = FireResetEnv(env) 134 | env = ProcessFrame84(env) 135 | env = ImageToPyTorch(env) 136 | env = BufferWrapper(env, 4) 137 | env = ScaledFloatFrame(env) 138 | 139 | return env 140 | 141 | 142 | if __name__ == "__main__": 143 | env_name = "Pong-v0" 144 | 145 | env = make_env(env_name) 146 | print(env.reset().shape) 147 | print(env.observation_space) 148 | env.render() -------------------------------------------------------------------------------- /DQNs/DQN_cnn/cnn_model.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import atari_wappers 6 | 7 | 8 | class CNN_Model (nn.Module): 9 | def __init__(self, input_shape, n_actions): 10 | super(CNN_Model, self).__init__() 11 | self.conv = nn.Sequential( 12 | # input_shape 的第一个维度为 输入的 channel 数,比如输入为(4,84,84)时,channel = 4 13 | nn.Conv2d(input_shape[0], 128, kernel_size=8, stride=4), 14 | nn.ReLU(), 15 | nn.Conv2d(128, 256, kernel_size=4, stride=2), 16 | nn.ReLU(), 17 | nn.Conv2d(256, 128, kernel_size=3, stride=1), 18 | nn.ReLU() 19 | ) 20 | conv_out_size = self._get_conv_out(input_shape) 21 | self.fc = nn.Sequential( 22 | nn.Linear(conv_out_size, 512), 23 | nn.ReLU(), 24 | nn.Linear(512, n_actions) 25 | ) 26 | 27 | def _get_conv_out(self, input_shape): 28 | o = self.conv(torch.zeros((1, *input_shape))) 29 | return int(np.prod(o.size())) 30 | 31 | def forward(self, x): 32 | conv_out = self.conv(x) 33 | conv_out = conv_out.view(x.size()[0], -1) 34 | return self.fc(conv_out) 35 | 36 | 37 | if __name__ == "__main__": 38 | env = atari_wappers.make_env("SpaceInvaders-v0") 39 | state_size, action_size = env.observation_space.shape, env.action_space.n 40 | print(state_size, action_size) 41 | model = CNN_Model(state_size, action_size) 42 | 43 | state = env.reset() 44 | obs = env.reset() 45 | obs1 = env.reset() 46 | t = torch.tensor([obs, obs1]) 47 | print("x.shape", t.shape) 48 | 49 | q_value = model.forward(t) 50 | actions = torch.tensor([[0,1]]) 51 | print(q_value) 52 | print(q_value.gather(1,actions)) 53 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import namedtuple, deque 4 | import torch 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from cnn_model import CNN_Model 8 | 9 | TAU = 1e-3 # for soft update of target parameters 10 | EPS_start=1.0 11 | EPS_end=0.01 12 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | class ReplayBuffer: 16 | """Fixed-size buffer to store experience tuples.""" 17 | 18 | def __init__(self, action_size, buffer_size, batch_size): 19 | """Initialize a ReplayBuffer object. 
20 | 21 | Params 22 | ====== 23 | action_size (int): dimension of each action 24 | buffer_size (int): maximum size of buffer 25 | batch_size (int): size of each training batch 26 | seed (int): random seed 27 | """ 28 | self.action_size = action_size 29 | self.memory = deque(maxlen=buffer_size) 30 | # 使用 deque(maxlen=N) 构造函数会新建一个固定大小的队列。当新的元素加入并且这个队列已满的时候, 最老的元素会自动被移除掉 31 | self.batch_size = batch_size 32 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 33 | 34 | def add(self, state, action, reward, next_state, done): 35 | """ 36 | Add a new experience to the memory 37 | :param state: 38 | :param p: sample probability for this experience 39 | :return: 40 | """ 41 | e = self.experience(state, action, reward, next_state, done) 42 | self.memory.append(e) 43 | 44 | def clean_buffer(self): 45 | self.memory.clear() 46 | 47 | def sample(self): 48 | """Randomly sample a batch of experiences from memory.""" 49 | experiences = random.sample(self.memory, k=self.batch_size) 50 | 51 | states = torch.tensor([e.state for e in experiences if e is not None]).float().to(device) 52 | actions = torch.tensor([[e.action for e in experiences if e is not None]]).long().to(device) 53 | rewards = torch.tensor([e.reward for e in experiences if e is not None]).float().to(device) 54 | next_states = torch.tensor([e.next_state for e in experiences if e is not None]).float().to( 55 | device) 56 | dones = torch.from_numpy(np.array([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to( 57 | device) 58 | return (states, actions, rewards, next_states, dones) 59 | 60 | def __len__(self): 61 | """Return the current size of internal memory.""" 62 | return len(self.memory) 63 | 64 | 65 | class Agent_dqn(): 66 | def __init__(self, input_channel,action_size,learning_rate=5e-3,buffer_size=int(1e4),batch_size=32): 67 | """Initialize an Agent object. 
68 | 69 | Params 70 | ====== 71 | state_size (int): dimension of each state 72 | action_size (int): dimension of each action 73 | seed (int): random seed 74 | """ 75 | self.action_size = action_size 76 | 77 | # Q-Network 78 | self.qnetwork_local = CNN_Model(input_channel,action_size).to(device) 79 | self.qnetwork_target = CNN_Model(input_channel,action_size).to(device) 80 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), learning_rate) 81 | 82 | # Replay memory 83 | self.batch_size = batch_size 84 | self.memory = ReplayBuffer(action_size, buffer_size,batch_size) 85 | # Initialize time step (for updating every UPDATE_EVERY steps) 86 | self.t_step = 0 87 | self.episode = 0 88 | self.epsilon = EPS_start 89 | 90 | def act(self,state,i_episode,eps_decay): 91 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 92 | self.qnetwork_local.eval() 93 | with torch.no_grad(): 94 | action_values = self.qnetwork_local(state) 95 | self.qnetwork_local.train() 96 | 97 | " Epsilon-greedy action selection" 98 | if i_episode>self.episode: 99 | # update EPS every new episode 100 | self.epsilon = max(EPS_end, eps_decay * self.epsilon) 101 | self.episode = i_episode 102 | # epsilon greedy policy 103 | if random.random() > self.epsilon: 104 | return np.argmax(action_values.cpu().data.numpy()) 105 | else: 106 | return random.choice(np.arange(self.action_size)) 107 | 108 | def act_greedy_policy(self,state): 109 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 110 | self.qnetwork_local.eval() 111 | with torch.no_grad(): 112 | action_values = self.qnetwork_local(state) 113 | return np.argmax(action_values.cpu().data.numpy()) 114 | 115 | def step(self,sarsd,gamma,update_every): 116 | state, action, reward, next_state, done = sarsd 117 | self.t_step += 1 118 | 119 | # add an experience for current time step 120 | self.memory.add(state, action, reward, next_state, done) 121 | 122 | # Learn every UPDATE_EVERY time steps 123 | if (self.t_step+1) % update_every==0: 124 | if self.memory.__len__()>self.batch_size: 125 | batch_exps = self.memory.sample() 126 | loss = self.learn(batch_exps,gamma) 127 | return loss 128 | 129 | def learn(self,exps,gamma): 130 | # fetch the batch (s,a,r,s',done) from experiences batch 131 | states,actions,rewards,next_states,dones = exps 132 | print(states.shape) 133 | 134 | # ------------------ calculate loss —------------------------- # 135 | 136 | # calculate Q targets 137 | expected_next_max_actions = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(0) 138 | Q_expected_next = self.qnetwork_target(next_states).gather(1, expected_next_max_actions) 139 | Q_targets = rewards + (gamma * Q_expected_next * (1 - dones)) 140 | 141 | # get expected Q for current state 142 | Q_expected = self.qnetwork_local(states).gather(1, actions) 143 | 144 | loss = F.mse_loss(Q_expected, Q_targets) 145 | 146 | # ---------------- update local Q net -------------------- # 147 | self.optimizer.zero_grad() 148 | loss.backward() 149 | self.optimizer.step() 150 | # print(next(self.qnetwork_local.parameters()).is_cuda) 151 | 152 | # ---------------- update target Q net -------------------- # 153 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) 154 | 155 | return loss.cpu().detach().numpy() 156 | 157 | def soft_update(self, local_model, target_model, tau): 158 | """Soft update model parameters. 
159 | θ_target = τ*θ_local + (1 - τ)*θ_target 160 | 161 | Params 162 | ====== 163 | local_model (PyTorch model): weights will be copied from 164 | target_model (PyTorch model): weights will be copied to 165 | tau (float): interpolation parameter 166 | """ 167 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 168 | target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-0.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-100.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-100.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-140.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-140.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-152.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-152.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-167.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-167.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-185.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-185.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-200.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-200.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-204.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-204.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-227.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-227.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-300.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-300.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-400.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-400.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-500.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-500.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-600.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-600.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-674.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-674.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-683.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-683.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-696.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-696.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-700.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-700.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-714.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-714.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-733.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-733.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-756.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-756.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-800.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-800.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-900.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-900.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-902.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-902.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-909.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-909.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-920.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-920.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-936.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-936.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/image/pic-956.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-956.jpg -------------------------------------------------------------------------------- /DQNs/DQN_cnn/log/train_20210326.log: -------------------------------------------------------------------------------- 1 | nohup: ignoring input 2 | #################################################### 3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN 4 | #################################################### 5 | 6 | Training Parameters : 7 | Train episode : 2000 8 | Network update every 5 time step 9 | Replay buffer size : 5000 10 | Batch size : 32 11 | Learning rate : 0.001 12 | GAMMA : 0.99 
13 | Epsilon decay rate : 0.995 14 | 15 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/log/train_20210329.log: -------------------------------------------------------------------------------- 1 | nohup: ignoring input 2 | #################################################### 3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN 4 | #################################################### 5 | 6 | Training Parameters : 7 | Train episode : 2000 8 | Network update every 5 time step 9 | Replay buffer size : 5000 10 | Batch size : 32 11 | Learning rate : 0.001 12 | GAMMA : 0.99 13 | Epsilon decay rate : 0.995 14 | 15 | Episode 100 Loss 38.24846267700195 Average Score: 136.55 16 | Running time till now :0:11:03.313392 17 | 18 | Episode 200 Loss 13.040627479553223 Average Score: 153.55 19 | Running time till now :0:22:44.250463 20 | 21 | Episode 300 Loss 15.12213134765625 Average Score: 193.75 22 | Running time till now :0:35:31.581827 23 | 24 | Episode 400 Loss 52.153236389160156 Average Score: 205.35 25 | Running time till now :0:49:19.173927 26 | 27 | Episode 500 Loss 21.199983596801758 Average Score: 230.80 28 | Running time till now :1:03:42.341828 29 | 30 | Episode 600 Loss 44.75456237792969 Average Score: 237.10 31 | Running time till now :1:18:31.195379 32 | 33 | Episode 700 Loss 71.21875762939453 Average Score: 279.30 34 | Running time till now :1:33:07.172067 35 | 36 | Episode 800 Loss 46.80872344970703 Average Score: 250.20 37 | Running time till now :1:45:39.041750 38 | 39 | Episode 900 Loss 69.49663543701172 Average Score: 270.55 40 | Running time till now :1:59:20.249699 41 | 42 | Episode 1000 Loss 97.96715545654297 Average Score: 270.80 43 | Running time till now :2:13:20.709739 44 | 45 | Episode 1100 Loss 82.20999145507812 Average Score: 280.75 46 | Running time till now :2:28:11.418844 47 | 48 | Episode 1200 Loss 29.77111053466797 Average Score: 270.15 49 | Running time till now :2:42:38.161003 50 | 51 | Episode 1300 Loss 33.50057601928711 Average Score: 263.00 52 | Running time till now :2:55:55.026575 53 | 54 | Episode 1400 Loss 32.226627349853516 Average Score: 296.65 55 | Running time till now :3:10:45.828023 56 | 57 | Episode 1500 Loss 30.3413143157959 Average Score: 280.10 58 | Running time till now :3:26:07.034734 59 | 60 | Episode 1600 Loss 30.96596336364746 Average Score: 271.00 61 | Running time till now :3:40:35.273112 62 | 63 | Episode 1700 Loss 32.25701904296875 Average Score: 255.85 64 | Running time till now :3:53:36.508000 65 | 66 | Episode 1800 Loss 28.328149795532227 Average Score: 293.50 67 | Running time till now :4:09:19.669146 68 | 69 | Episode 1900 Loss 29.688913345336914 Average Score: 259.10 70 | Running time till now :4:23:57.511495 71 | 72 | Episode 2000 Loss 27.258968353271484 Average Score: 261.15 73 | Running time till now :4:38:51.220961 74 | 75 | Training finished, total running time:4:38:51.232527. 76 | Model saved. 
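The run logged above decays epsilon once per episode via `epsilon = max(EPS_end, eps_decay * epsilon)` in `dqn_agent.py`, starting from EPS_start = 1.0 with a floor of EPS_end = 0.01. Below is a minimal sketch (not part of the repo) of how that schedule plays out for the decay rate 0.995 listed in the log header; the episode range and print interval are only illustrative:
```
EPS_START, EPS_END, EPS_DECAY = 1.0, 0.01, 0.995  # values from dqn_agent.py and the log header above

eps = EPS_START
for episode in range(1, 2001):
    # one multiplicative decay step per episode, clipped at the floor
    eps = max(EPS_END, EPS_DECAY * eps)
    if episode % 500 == 0:
        print(f"episode {episode}: epsilon = {eps:.3f}")

# epsilon is ~0.08 by episode 500 and reaches the 0.01 floor after roughly
# 920 episodes (0.995 ** 919 ~= 0.01), so the second half of a 2000-episode
# run is already almost purely greedy.
```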
77 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/log/train_20210329_1.log: -------------------------------------------------------------------------------- 1 | nohup: ignoring input 2 | #################################################### 3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN 4 | #################################################### 5 | 6 | Training Parameters : 7 | Train episode : 1000 8 | Network update every 5 time step 9 | Replay buffer size : 5000 10 | Batch size : 64 11 | Learning rate : 0.005 12 | GAMMA : 0.99 13 | Epsilon decay rate : 0.995 14 | 15 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/log/train_20210330.log: -------------------------------------------------------------------------------- 1 | nohup: ignoring input 2 | #################################################### 3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN 4 | #################################################### 5 | 6 | Training Parameters : 7 | Train episode : 1000 8 | Network update every 5 time step 9 | Replay buffer size : 3500 10 | Batch size : 64 11 | Learning rate : 0.005 12 | GAMMA : 0.99 13 | Epsilon decay rate : 0.995 14 | 15 | Episode 100 Loss 71.22970581054688 Average Score: 180.00 16 | Running time till now :0:22:27.094323 17 | 18 | Episode 200 Loss 29.8588924407959 Average Score: 152.60 19 | Running time till now :0:44:21.409593 20 | 21 | Episode 300 Loss 61.14106369018555 Average Score: 209.75 22 | Running time till now :1:09:07.883698 23 | 24 | Episode 400 Loss 30.508338928222656 Average Score: 218.05 25 | Running time till now :1:35:38.577875 26 | 27 | Episode 500 Loss 89.18991088867188 Average Score: 245.05 28 | Running time till now :2:03:52.850307 29 | 30 | Episode 600 Loss 21.991769790649414 Average Score: 262.35 31 | Running time till now :2:30:57.078511 32 | 33 | Episode 700 Loss 23.49405860900879 Average Score: 254.65 34 | Running time till now :2:56:23.053378 35 | 36 | Episode 800 Loss 81.24069213867188 Average Score: 263.45 37 | Running time till now :3:21:37.061215 38 | 39 | Episode 900 Loss 24.93558692932129 Average Score: 284.15 40 | Running time till now :3:50:02.799914 41 | 42 | Episode 1000 Loss 85.55946350097656 Average Score: 268.20 43 | Running time till now :4:16:22.288483 44 | 45 | Training finished, total running time:4:16:22.299132. 46 | Model saved. 
47 | Traceback (most recent call last): 48 | File "main_dqn_atari.py", line 178, in 49 | dqn_agent = Agent_dqn(state_size,action_size) 50 | File "/home/lesreg/Remote_Pros/DRL_pytorch/DQNs/DQN_cnn/dqn_agent.py", line 79, in __init__ 51 | self.qnetwork_local = CNN_Model(input_channel,action_size).to(device) 52 | File "/home/lesreg/Remote_Pros/DRL_pytorch/DQNs/DQN_cnn/cnn_model.py", line 20, in __init__ 53 | conv_out_size = self._get_conv_out(input_shape) 54 | File "/home/lesreg/Remote_Pros/DRL_pytorch/DQNs/DQN_cnn/cnn_model.py", line 28, in _get_conv_out 55 | o = self.conv(torch.zeros((1, *input_shape))) 56 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl 57 | result = self.forward(*input, **kwargs) 58 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/container.py", line 119, in forward 59 | input = module(input) 60 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl 61 | result = self.forward(*input, **kwargs) 62 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 399, in forward 63 | return self._conv_forward(input, self.weight, self.bias) 64 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 396, in _conv_forward 65 | self.padding, self.dilation, self.groups) 66 | RuntimeError: Calculated padded input size per channel: (160 x 3). Kernel size: (8 x 8). Kernel size can't be greater than actual input size 67 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/log/train_20210331.log: -------------------------------------------------------------------------------- 1 | nohup: ignoring input 2 | #################################################### 3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN 4 | #################################################### 5 | 6 | Training Parameters : 7 | Train episode : 2000 8 | Network update every 10 time step 9 | Replay buffer size : 3500 10 | Batch size : 128 11 | Learning rate : 0.0005 12 | GAMMA : 0.99 13 | Epsilon decay rate : 0.995 14 | 15 | Episode 100 Loss 37.57309341430664 Average Score: 144.10 16 | Running time till now :0:20:57.373758 17 | 18 | Episode 200 Loss 45.944541931152344 Average Score: 149.85 19 | Running time till now :0:43:08.088817 20 | 21 | Episode 300 Loss 77.53382110595703 Average Score: 200.00 22 | Running time till now :1:08:21.291658 23 | 24 | Episode 400 Loss 94.59493255615234 Average Score: 200.65 25 | Running time till now :1:34:55.924278 26 | 27 | Episode 500 Loss 20.661224365234375 Average Score: 228.70 28 | Running time till now :2:02:23.143794 29 | 30 | Episode 600 Loss 38.10764694213867 Average Score: 259.20 31 | Running time till now :2:32:13.928401 32 | 33 | Episode 700 Loss 21.809246063232422 Average Score: 238.65 34 | Running time till now :2:58:31.344456 35 | 36 | Episode 800 Loss 27.276247024536133 Average Score: 252.15 37 | Running time till now :3:22:48.359756 38 | 39 | Episode 900 Loss 64.65150451660156 Average Score: 273.00 40 | Running time till now :3:50:41.210916 41 | 42 | Episode 1000 Loss 25.74323272705078 Average Score: 266.45 43 | Running time till now :4:16:18.361667 44 | 45 | Episode 1100 Loss 18.910884857177734 Average Score: 277.10 46 | Running time till now :4:44:24.187721 47 | 48 | Episode 1200 Loss 26.118581771850586 Average Score: 267.65 49 | Running time till now :5:09:37.085516 
50 | 51 | Episode 1300 Loss 25.362396240234375 Average Score: 251.85 52 | Running time till now :5:34:53.701273 53 | 54 | Episode 1400 Loss 26.500167846679688 Average Score: 281.05 55 | Running time till now :6:00:55.716864 56 | 57 | Episode 1500 Loss 32.66218185424805 Average Score: 257.55 58 | Running time till now :6:25:39.960819 59 | 60 | Episode 1600 Loss 52.91573715209961 Average Score: 268.40 61 | Running time till now :6:50:39.701043 62 | 63 | Episode 1700 Loss 43.722801208496094 Average Score: 265.95 64 | Running time till now :7:13:50.829814 65 | 66 | Episode 1800 Loss 49.69996643066406 Average Score: 262.20 67 | Running time till now :7:39:07.592524 68 | 69 | Episode 1900 Loss 84.68921661376953 Average Score: 266.25 70 | Running time till now :8:06:00.022165 71 | 72 | Episode 2000 Loss 24.432580947875977 Average Score: 259.90 73 | Running time till now :8:37:15.948752 74 | 75 | Training finished, total running time:8:37:15.969363. 76 | Model saved. 77 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/main_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import arrow 4 | import torch 5 | import gym 6 | import numpy as np 7 | from matplotlib import pyplot as plt 8 | from collections import deque 9 | from DQNs.DQN_cnn.dqn_agent import Agent_dqn 10 | from DQNs.DQN_cnn import atari_wappers 11 | 12 | 13 | def plot_scores(scores,filename): 14 | fig = plt.figure() 15 | ax = fig.add_subplot(1, 1, 1) 16 | ax.plot(np.arange(len(scores)), scores) 17 | plt.ylabel('Score') 18 | plt.xlabel('Episode #') 19 | plt.savefig(filename) 20 | 21 | 22 | 23 | def train_agent(env,agent,n_episode,eps_decay,gamma,update_every): 24 | scores = [] # list containing scores from each episode 25 | scores_window = deque(maxlen=100) # last 100 scores 26 | 27 | start_time = arrow.now() 28 | for i_episode in range(1, n_episode + 1): 29 | state = env.reset() 30 | print(state.shape) 31 | score = 0 32 | episode_loss=[] 33 | while True: 34 | # # check the memory usage of system, clean replay buffer if too high 35 | # if (sys_mem.used / sys_mem.total) >= 0.03: 36 | # agent.memory.clean_buffer() 37 | # print('Buffer cleaned on episode {}'.format(i_episode)) 38 | # get action 39 | action = agent.act(state,i_episode,eps_decay) 40 | # interact with env (one step) 41 | next_state, reward, done, _ = env.step(action) 42 | # train the agent 43 | sarsd = (state, action, reward, next_state,done) 44 | loss = agent.step(sarsd,gamma,update_every) 45 | # update status 46 | state = next_state 47 | score += reward 48 | # break the loop if current episode is over 49 | if done: 50 | break 51 | if loss is not None: 52 | episode_loss.append(loss) 53 | 54 | # update rewards and scores every episode 55 | scores_window.append(score) 56 | scores.append(score) 57 | 58 | # print('\rEpisode {}\t Loss {} \t Average Score: {:.2f}'.format(i_episode, np.mean(episode_loss), 59 | # np.mean(scores_window)), end="") 60 | # 61 | # if i_episode > 25: 62 | # print('Replay Buffer size: {}'.format(agent.memory.__len__())) 63 | # print('Memory used: ',sys_mem.used) 64 | # print('Memory used rate: ',sys_mem.used/sys_mem.total) 65 | 66 | if i_episode % 100 == 0: 67 | print('\rEpisode {}\t Loss {} \t Average Score: {:.2f}'.format(i_episode, np.mean(episode_loss), 68 | np.mean(scores_window))) 69 | print('\rRunning time till now :{}\n'.format(arrow.now() - start_time)) 70 | 71 | 72 | print("Training finished, total running time:{}. 
\n Model saved.".format(arrow.now()-start_time)) 73 | 74 | return scores 75 | 76 | 77 | 78 | 79 | 80 | if __name__ =="__main__": 81 | env = atari_wappers.make_env("SpaceInvaders-v0") 82 | state_size, action_size = env.observation_space.shape, env.action_space.n 83 | dqn_agent = Agent_dqn(state_size,action_size) 84 | train_agent(env,dqn_agent,1,0.98,0.995,5) 85 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/play_atari.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from collections import deque 4 | import os 5 | import torch 6 | # from skimage import io 7 | from DQNs.DQN_cnn.dqn_agent import Agent_dqn 8 | from DQNs.DQN_cnn import atari_wappers 9 | 10 | 11 | # def save_films(state,step): 12 | # if not os.path.exists('./image'): 13 | # os.makedirs('./image') 14 | # img_name = './image/pic-%d.jpg' % step 15 | # io.imsave(img_name, state) 16 | 17 | 18 | def random_play(): 19 | for step in range(5000): 20 | env.render() 21 | action = 1 22 | state, reward, done, info = env.step(action) 23 | 24 | if step % 100 == 0: 25 | print(state.shape) 26 | # print(state) 27 | save_films(state, step) 28 | 29 | if reward > 0: 30 | print(reward, step) 31 | save_films(state, step) 32 | 33 | if done: 34 | print('dead in %d steps' % step) 35 | break 36 | 37 | 38 | def random_test(env): 39 | socres = [] 40 | scores_window = deque(maxlen=100) 41 | 42 | for i_episode in range(100): 43 | state = env.reset() 44 | score = 0 45 | while True: 46 | action = np.random.choice(env.action_space.n,1)[0] 47 | state, reward, done, info = env.step(action) 48 | score += reward 49 | if done: 50 | break 51 | socres.append(score) 52 | scores_window.append(score) 53 | 54 | if i_episode % 10 == 0: 55 | print('Episode {},\t Average score : {} '.format(i_episode, np.mean(scores_window))) 56 | 57 | 58 | def trained_agent_test(env,agent): 59 | socres = [] 60 | scores_window = deque(maxlen=100) 61 | 62 | for i_episode in range(5000): 63 | state = env.reset() 64 | score = 0 65 | 66 | while True: 67 | action = agent.act(state) 68 | env.render() 69 | state, reward, done, _ = env.step(action) 70 | score += reward 71 | if done: 72 | break 73 | socres.append(score) 74 | scores_window.append(score) 75 | 76 | if i_episode % 100 == 0: 77 | print('Episode {},\r Average score : {} '.format(i_episode,np.mean(scores_window))) 78 | 79 | 80 | if __name__ =="__main__": 81 | 82 | env = gym.make('SpaceInvaders-v0') 83 | random_test(env) 84 | 85 | # env = atari_wappers.make_env("SpaceInvaders-v0") 86 | # state_size, action_size = env.observation_space.shape, env.action_space.n 87 | # dqn_agent = Agent_dqn(state_size, action_size) 88 | # 89 | # dqn_agent.qnetwork_local.load_state_dict(torch.load("dqn_model.pth")) 90 | # trained_agent_test(env,dqn_agent) 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /DQNs/DQN_cnn/train_20210401.log: -------------------------------------------------------------------------------- 1 | nohup: ignoring input 2 | #################################################### 3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN 4 | #################################################### 5 | 6 | Training Parameters : 7 | Train episode : 5000 8 | Network update every 5 time step 9 | Replay buffer size : 3000 10 | Batch size : 128 11 | Learning rate : 0.0005 12 | GAMMA : 0.99 13 | Epsilon decay rate : 0.995 14 | 15 | 
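Not part of the original repo: a minimal sketch for turning training logs like the ones above back into an average-score curve when the raw scores list was not saved. It assumes the `Episode N  Loss L  Average Score: S` line format used by these logs; the input log path and output image path are placeholders:
```
import re
import matplotlib.pyplot as plt

# matches lines such as "Episode 100   Loss 37.57   Average Score: 144.10"
LINE_RE = re.compile(r"Episode\s+(\d+)\s+Loss\s+[\d.]+\s+Average Score:\s+([\d.]+)")

def parse_avg_scores(log_path):
    episodes, avg_scores = [], []
    with open(log_path) as f:
        for line in f:
            m = LINE_RE.search(line)
            if m:
                episodes.append(int(m.group(1)))
                avg_scores.append(float(m.group(2)))
    return episodes, avg_scores

if __name__ == "__main__":
    episodes, avg_scores = parse_avg_scores("log/train_20210331.log")  # placeholder path
    plt.plot(episodes, avg_scores)
    plt.xlabel("Episode #")
    plt.ylabel("Average Score (last 100 episodes)")
    plt.savefig("Plots/avg_score_from_log.png")  # placeholder output file
```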
-------------------------------------------------------------------------------- /Evaluation_Algorithms/CartPole.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluate agents trained with different algorithms on the CartPole environment and compare their reward curves. 3 | Algorithms to test: 1. PPO 4 | 2. DDPG/TD3 5 | 3. DQN 6 | 4. A3C/A2C 7 | """ 8 | 9 | import torch 10 | import gym 11 | import numpy as np 12 | import pandas as pd 13 | from collections import deque 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | def plot_scores(scores,file_name,multi_time=False): 18 | "Plot one reward curve per training run (multiple trials of the same agent)" 19 | if multi_time: 20 | x=np.arange(1, len(scores[0]) + 1) 21 | for n in range(len(scores)): 22 | rolling_mean = pd.Series(scores[n]).rolling(100).mean() 23 | plt.plot(x,rolling_mean,label="trial_"+str(n+1)) 24 | else: 25 | x = np.arange(1, len(scores) + 1) 26 | rolling_mean = pd.Series(scores).rolling(100).mean() 27 | plt.plot(x, rolling_mean) 28 | 29 | plt.ylabel('Score') 30 | plt.xlabel('Episode #') 31 | plt.legend() 32 | plt.savefig(file_name) 33 | plt.show() 34 | 35 | 36 | def plot_diff_agent(scores_2d,file_name): 37 | "Plot the training curves of several different agents in one figure" 38 | for name,scores in scores_2d: 39 | x = np.arange(1, len(scores) + 1) 40 | rolling_mean = pd.Series(scores).rolling(100).mean() 41 | plt.plot(x, rolling_mean,label=name) 42 | plt.ylabel('Score') 43 | plt.xlabel('Episode #') 44 | plt.legend() 45 | plt.savefig(file_name) 46 | plt.show() 47 | 48 |
-------------------------------------------------------------------------------- /Games_play_train/atari.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import os 3 | from skimage import io 4 | 5 | env = gym.make('SpaceInvaders-v0') 6 | #env = gym.make("PongDeterministic-v4") 7 | status = env.reset() 8 | 9 | 10 | print('observation space:', env.observation_space) 11 | print('action space:', env.action_space) 12 | 13 | 14 | def save_films(state,step): 15 | if not os.path.exists('./image'): 16 | os.makedirs('./image') 17 | img_name = './image/pic-%d.jpg' % step 18 | io.imsave(img_name, state) 19 | 20 | 21 | for step in range(5000): 22 | env.render() 23 | action = 1 24 | state, reward, done, info = env.step(action) 25 | 26 | if step % 100 == 0: 27 | print(state.shape) 28 | # print(state) 29 | save_films(state, step) 30 | 31 | if reward > 0: 32 | print(reward, step) 33 | save_films(state, step) 34 | 35 | if done: 36 | print('dead in %d steps' % step) 37 | break
-------------------------------------------------------------------------------- /Policy_Gradient/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/.DS_Store
-------------------------------------------------------------------------------- /Policy_Gradient/PGs/__pycache__/agent_PG.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/__pycache__/agent_PG.cpython-37.pyc
-------------------------------------------------------------------------------- /Policy_Gradient/PGs/__pycache__/model.cpython-36.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /Policy_Gradient/PGs/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /Policy_Gradient/PGs/agent_PG.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from collections import deque 4 | import torch 5 | import torch.optim as optim 6 | from CartPole.Policy_Gradient.model import Policy 7 | 8 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 9 | 10 | GAMMA=1.0 11 | LR=0.001 12 | 13 | 14 | class Agent_PG(): 15 | 16 | def __init__(self, state_size, action_size,type): 17 | self.policy=Policy(state_size,action_size).to(device) 18 | self.optimizer=optim.Adam(self.policy.parameters(), lr=LR) 19 | self.type=type 20 | 21 | def reinforce_loss(self,log_probs,rewards): 22 | "------根据 Reinforce 算法计算的损失函数---------" 23 | # calculate discount rewards 24 | discounts=[GAMMA**i for i in range(len(rewards))] 25 | R=sum([g*r for g,r in zip(discounts,rewards)]) 26 | 27 | loss_arr=[] 28 | for log_prob in log_probs: 29 | loss_arr.append(-log_prob * R) 30 | 31 | policy_loss=torch.cat(loss_arr).sum() # 把n个1d tensor 组成的list 拼接成一个完整的 tensor(1d,size:n) 32 | # print(policy_loss) 33 | return policy_loss 34 | 35 | def pg_loss(self,log_probs,rewards): 36 | """---- 37 | Reinforce 的改进版本: 38 | 1.Credit Assignment:对每个 a(t) 计算未来累积折扣回报 R 39 | 2.对每个t的回报R进行 batch normalization 40 | ------""" 41 | # calculate the (discounted) future rewards 42 | furRewards_dis = [] 43 | for i in range(len(rewards)): 44 | discount = [GAMMA ** i for i in range(len(rewards) - i)] 45 | f_rewards = rewards[i:] 46 | furRewards_dis.append(sum(d * f for d, f in zip(discount, f_rewards))) 47 | # print(furRewards_dis) 48 | 49 | # -- Normalize reward 50 | mean = np.mean(furRewards_dis) 51 | std = np.std(furRewards_dis) + 1.0e-10 52 | rewards_normalized = (furRewards_dis - mean) / std 53 | 54 | # -- calculate policy loss 55 | loss_arr = [] 56 | for i in range(len(rewards_normalized)): 57 | loss_arr.append(-log_probs[i]*rewards_normalized[i]) 58 | # print(loss_arr) 59 | 60 | policy_loss = torch.cat(loss_arr).sum() 61 | # print(policy_loss,"----------\n") 62 | 63 | return policy_loss 64 | 65 | def train(self,env): 66 | state = env.reset() 67 | log_probs = [] 68 | rewards = [] 69 | # --- collect log probs and rewards for a single trajectory 70 | while True: 71 | # convert state to tensor 72 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) # 升维 1d->2d 73 | result_dic = self.policy.act(state) 74 | next_state, reward, done, _ = env.step(result_dic['action']) 75 | log_probs.append(result_dic['log_prob']) 76 | rewards.append(reward) 77 | state = next_state 78 | if done: 79 | break 80 | total_reward = sum(rewards) 81 | 82 | # --- update policy after one completed trajectory 83 | # calculate loss 84 | loss = self.reinforce_loss(log_probs, rewards) 85 | if self.type=="reinforce": 86 | loss = self.reinforce_loss(log_probs, rewards) 87 | elif self.type=="pg": 88 | loss = 
self.pg_loss(log_probs, rewards) 89 | 90 | # backprop the loss to update policy network 91 | self.optimizer.zero_grad() 92 | loss.backward() 93 | self.optimizer.step() 94 | 95 | return total_reward 96 | 97 | 98 | if __name__=="__main__": 99 | env = gym.make('CartPole-v0') 100 | agent=Agent_PG(state_size=4,action_size=2,type='pg') 101 | n_episode=2000 102 | 103 | scores_deque = deque(maxlen=100) 104 | scores = [] 105 | for i_episode in range(1,n_episode+1): 106 | Reward=agent.train(env) 107 | 108 | scores_deque.append(Reward) 109 | scores.append(Reward) 110 | if i_episode % 100 == 0: 111 | print('Episode {}\t Average Score: {:.2f}'.format(i_episode, np.mean(scores_deque))) -------------------------------------------------------------------------------- /Policy_Gradient/PGs/main_PG.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import pandas as pd 5 | from collections import deque 6 | import matplotlib.pyplot as plt 7 | from CartPole.Policy_Gradient.agent_PG import Agent_PG 8 | from CartPole.Policy_Gradient.PPO_with_R import PPO_v1 9 | from CartPole.Policy_Gradient.PPO_with_A import PPO_V2 10 | 11 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 12 | # model_file="models/pg_model_3.pth" 13 | # plot_file="results&plots/pg_3.png" 14 | 15 | 16 | def watch_smart_agent(agent,model_name): 17 | agent.policy.load_state_dict(torch.load(model_name)) 18 | state = env.reset() 19 | for t in range(1000): 20 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 21 | action,_ = agent.policy.act(state) 22 | env.render() 23 | state, reward, done, _ = env.step(action) 24 | if done: 25 | print("done in time step {}".format(t+1)) 26 | break 27 | env.close() 28 | 29 | 30 | def plot_scores(scores,file_name,multi_time=False): 31 | "绘制多次训练多条曲线" 32 | if multi_time: 33 | x=np.arange(1, len(scores[0]) + 1) 34 | for n in range(len(scores)): 35 | rolling_mean = pd.Series(scores[n]).rolling(100).mean() 36 | plt.plot(x,rolling_mean,label="trial_"+str(n+1)) 37 | else: 38 | x = np.arange(1, len(scores) + 1) 39 | rolling_mean = pd.Series(scores).rolling(100).mean() 40 | plt.plot(x, rolling_mean) 41 | 42 | plt.ylabel('Score') 43 | plt.xlabel('Episode #') 44 | plt.legend() 45 | plt.savefig(file_name) 46 | plt.show() 47 | 48 | 49 | def plot_diff_agent(scores_2d,file_name): 50 | " 绘制多种不同agent的训练曲线:多曲线图" 51 | for name,scores in scores_2d: 52 | x = np.arange(1, len(scores) + 1) 53 | rolling_mean = pd.Series(scores).rolling(100).mean() 54 | plt.plot(x, rolling_mean,label=name) 55 | plt.ylabel('Score') 56 | plt.xlabel('Episode #') 57 | plt.legend() 58 | plt.savefig(file_name) 59 | plt.show() 60 | 61 | 62 | def agent_test(agent,n_episode,model_name): 63 | agent.policy.load_state_dict(torch.load(model_name)) 64 | scores = [] 65 | for i_episode in range(1, n_episode + 1): 66 | rewards=[] 67 | state = env.reset() 68 | while True: 69 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) # 升维 1d->2d 70 | action, _ = agent.policy.act(state) 71 | state, reward, done, _ = env.step(action) 72 | rewards.append(reward) 73 | if done: 74 | break 75 | scores.append(sum(rewards)) 76 | 77 | return scores 78 | 79 | 80 | def train_agent(env,agent,n_episode,model_file): 81 | scores_deque = deque(maxlen=100) 82 | scores = [] 83 | 84 | for i_episode in range(1, n_episode + 1): 85 | total_reward=agent.train(env) 86 | # record scores(total rewards) per episode 87 | scores_deque.append(total_reward) 88 | 
scores.append(total_reward) 89 | 90 | print('\r Episode {}\tAverage Score: {:.2f}\tScore: {:.2f}' 91 | .format(i_episode, np.mean(scores_deque), total_reward), end="") 92 | if i_episode % 100 == 0: 93 | print('\n Episode {}\t Average Score: {:.2f}\n'.format(i_episode,np.mean(scores_deque))) 94 | if np.mean(scores_deque) >= 195.0: 95 | print('\n Environment solved in {:d} episodes!\tAverage Score: {:.2f}\n----------\n'.format(i_episode, 96 | np.mean(scores_deque))) 97 | torch.save(agent.policy.state_dict(),model_file) 98 | break 99 | 100 | return scores 101 | 102 | 103 | def train_agent_multi_times(env, agent, n_episode, train_time, file): 104 | " 一个 agent 训练多次并绘制所有的奖励曲线,考察特定 policy gradient 算法的稳定性" 105 | scores_2d = [] 106 | for n in range(train_time): 107 | scores = [] 108 | for i_episode in range(1, n_episode + 1): 109 | total_reward = agent.train(env) 110 | scores.append(total_reward) 111 | 112 | print('Trial {} finished. \t Avg score for the last 100 episode: {}' 113 | .format((n + 1), np.mean(scores[-100:]))) 114 | scores_2d.append(scores) 115 | 116 | plot_scores(scores_2d, file,multi_time=True) 117 | 118 | 119 | def train_diff_agents(env,agents,n_episode,file): 120 | " 训练多种算法的不同agent, 绘制奖励曲线对比性能 " 121 | scores_2d=[] 122 | for name in agents.keys(): 123 | scores = [] 124 | for i_episode in range(1, n_episode + 1): 125 | total_reward = agents[name].train(env) 126 | scores.append(total_reward) 127 | scores_2d.append((name,scores)) 128 | print('Training agent {} finished. \t Avg score for the last 100 episode: {}'\ 129 | .format(name,np.mean(scores[-100:]))) 130 | 131 | plot_diff_agent(scores_2d,file) 132 | 133 | 134 | if __name__=="__main__": 135 | env = gym.make('CartPole-v0') 136 | 137 | agent_pg = Agent_PG(state_size=4,action_size=2,type="pg") 138 | agent_rf=Agent_PG(state_size=4,action_size=2,type="reinforce") 139 | ppo_R=PPO_v1(state_size=4,action_size=2) 140 | 141 | ppo_without_entropy=PPO_V2(state_size=4,action_size=2,add_entropy=False) 142 | ppo_with_entropy=PPO_V2(state_size=4,action_size=2,add_entropy=True) 143 | 144 | #train_scores = train_agent(env, ppo_with_entropy, 2000, 'PGs/models/PPO_new.pth') 145 | #plot_scores(train_scores, 'PGs/results&plots/PPO_with_entropy_1.png') 146 | 147 | # agents={'PPO with R':ppo_R, 148 | # 'PPO with A':ppo_with_entropy, 149 | # 'Policy Gradient':agent_pg, 150 | # 'Reinforce':agent_rf} 151 | 152 | ppo_agents={'PPO_R':ppo_R,'PPO_A_org':ppo_without_entropy,'PPO_A_entropy':ppo_with_entropy} 153 | 154 | train_diff_agents(env, ppo_agents, 1500, '../results&plots/PPO_comparison_4.png') 155 | # train_agent_multi_times(env,ppo_with_entropy,1300,5,'PGs/results&plots/PPO-entropy_5times.png') 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | -------------------------------------------------------------------------------- /Policy_Gradient/PGs/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Categorical 5 | import numpy as np 6 | 7 | 8 | class Policy(nn.Module): 9 | "single Policy network for Reinforce and PG" 10 | 11 | def __init__(self,state_size,action_size): 12 | super(Policy, self).__init__() 13 | self.seed = torch.manual_seed(0) 14 | self.fc1 = nn.Linear(state_size, 24) 15 | self.fc2 = nn.Linear(24, 36) 16 | self.fc3 = nn.Linear(36, action_size) 17 | 18 | def forward(self, x): 19 | """ 20 | Build a network that maps state -> action probs. 
21 | """ 22 | 23 | out=F.relu(self.fc1(x)) 24 | out = F.relu(self.fc2(out)) 25 | out = F.softmax(self.fc3(out),dim=1) 26 | 27 | return out 28 | 29 | def act(self,state): 30 | # probs for each action (2d tensor) 31 | probs = self.forward(state) 32 | m = Categorical(probs) 33 | action = m.sample() 34 | # return action for current state, and the corresponding probability 35 | 36 | result_dic={"action":action.item(),"log_prob":m.log_prob(action) 37 | ,"prob":probs[:,action.item()].item()} 38 | return result_dic 39 | 40 | 41 | class Actor(nn.Module): 42 | """Policy network for PPO_R""" 43 | "Also used as the actor of the Actor-Critic model for PPO_A" 44 | 45 | def __init__(self,state_size,action_size): 46 | super(Actor, self).__init__() 47 | self.seed = torch.manual_seed(0) 48 | self.fc1 = nn.Linear(state_size, 128) 49 | # self.fc2 = nn.Linear(64,128) 50 | self.fc2= nn.Linear(128, action_size) 51 | 52 | def forward(self, x): 53 | """ 54 | Build a network that maps state -> action probs. 55 | """ 56 | 57 | x=F.relu(self.fc1(x)) 58 | out = F.softmax(self.fc2(x),dim=1) 59 | return out 60 | 61 | def act(self,state): 62 | # probs for each action (2d tensor) 63 | probs = self.forward(state) 64 | m = Categorical(probs) 65 | action = m.sample() 66 | # return action for current state, and the corresponding probability 67 | 68 | result_dic={"action":action.item(),"log_prob":m.log_prob(action) 69 | ,"prob":probs[:,action.item()].item()} 70 | return result_dic 71 | 72 | 73 | class Critic(nn.Module): 74 | " Critic network for PPO" 75 | 76 | def __init__(self,state_size): 77 | super(Critic, self).__init__() 78 | self.fc1=nn.Linear(state_size,128) 79 | # self.fc2=nn.Linear(64,128) 80 | self.fc2=nn.Linear(128,1) 81 | 82 | def forward(self,x): 83 | x=F.relu(self.fc1(x)) 84 | state_value = self.fc2(x) 85 | return state_value 86 | 87 | 88 | if __name__=="__main__": 89 | 90 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 91 | state = np.array([-0.04456399, 0.04653909, 0.01326909, -0.02099827]) 92 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 93 | 94 | policy=Policy(state_size=4,action_size=2).to(device) 95 | result_dic = policy.act(state) # act() returns a dict, not a tuple 96 | print(result_dic["action"], result_dic["log_prob"]) 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/PPO_model-1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/PPO_model-1.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/PPO_new.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/PPO_new.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/PPOv2_model-1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/PPOv2_model-1.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/pg_model_1.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/pg_model_1.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/pg_model_2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/pg_model_2.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/pg_model_3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/pg_model_3.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/pg_model_4.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/pg_model_4.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/reinforce_model_2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_2.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/reinforce_model_3.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_3.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/reinforce_model_4.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_4.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/reinforce_model_5.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_5.pth -------------------------------------------------------------------------------- /Policy_Gradient/PGs/models/reinforce_model_6.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_6.pth -------------------------------------------------------------------------------- /Policy_Gradient/PPO/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/.DS_Store -------------------------------------------------------------------------------- /Policy_Gradient/PPO/PPO_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Categorical 5 | from torch.distributions import Normal 6 | 7 | 8 | class ActorDiscrete(nn.Module): 9 | """ 10 | Policy network for a discrete action space 11 | """ 12 | def __init__(self,state_size,action_size): 13 | super(ActorDiscrete, self).__init__() 14 | self.seed = torch.manual_seed(0) 15 | self.fc1 = nn.Linear(state_size, 128) 16 | # self.fc2 = nn.Linear(64,128) 17 | self.fc2= nn.Linear(128, action_size) 18 | 19 | def forward(self, x): 20 | """ 21 | Build a network that maps state -> action probs. 22 | """ 23 | 24 | x=F.relu(self.fc1(x)) 25 | out = F.softmax(self.fc2(x),dim=1) 26 | return out 27 | 28 | def act(self,state): 29 | """ 30 | Return the action and its probability 31 | """ 32 | # probs for each action (2d tensor) 33 | probs = self.forward(state) 34 | m = Categorical(probs) 35 | action = m.sample() 36 | ## return action for current state, and the corresponding probability 37 | # result_dic={"action":action.item(),"log_prob":m.log_prob(action) 38 | # ,"prob":probs[:,action.item()].item()} 39 | 40 | return action.item(),probs[:,action.item()].item() 41 | 42 | 43 | class ActorContinous(nn.Module): 44 | """ 45 | Policy network for a continuous action space 46 | """ 47 | def __init__(self,state_size,action_size): 48 | super(ActorContinous, self).__init__() 49 | self.fc1 = nn.Linear(state_size, 128) 50 | self.fc2 = nn.Linear(128,128) 51 | self.mu_head = nn.Linear(128, action_size) 52 | self.sigma_head = nn.Linear(128, action_size) 53 | 54 | def forward(self, x): 55 | x = F.relu(self.fc1(x)) 56 | x = F.relu(self.fc2(x)) 57 | mu = 2.0 * torch.tanh(self.mu_head(x)) 58 | sigma = F.softplus(self.sigma_head(x)) 59 | return (mu, sigma) 60 | 61 | def act(self,state): 62 | """ 63 | Return the action and its log probability 64 | """ 65 | with torch.no_grad(): 66 | (mu, sigma) = self.forward(state) # 2d tensors 67 | dist = Normal(mu, sigma) 68 | action = dist.sample() 69 | action_log_prob = dist.log_prob(action) 70 | 71 | return action.numpy()[0], action_log_prob.numpy()[0] 72 | 73 | 74 | class Critic(nn.Module): 75 | " Critic network for PPO" 76 | 77 | def __init__(self,state_size): 78 | super(Critic, self).__init__() 79 | self.fc1=nn.Linear(state_size,128) 80 | # self.fc2=nn.Linear(64,128) 81 | self.fc2=nn.Linear(128,1) 82 | 83 | def forward(self,x): 84 | x=F.relu(self.fc1(x)) 85 | state_value = self.fc2(x) 86 | return state_value 87 | 88 | 89 | if __name__=="__main__": 90 | pass 91 | 92 | 93 | -------------------------------------------------------------------------------- /Policy_Gradient/PPO/PPO_v1.py: -------------------------------------------------------------------------------- 1 | """ 2 | PPO_v1: the loss is computed directly from discounted cumulative rewards; there is no critic, only a policy network 3 | """ 4 | import numpy as np 5 | import gym 6 | from collections import namedtuple 7 | from collections import deque 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn as nn 11 | from torch.distributions import Normal 12 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 13 | from Policy_Gradient.PPO.PPO_model import ActorContinous,ActorDiscrete,Critic 14 | 15 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 16 | 
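# Hyperparameters used below by PPO_v1:
# GAMMA - discount factor applied to future rewards
# LR - Adam learning rate for the policy network
# BATCH_SIZE - mini-batch size sampled from the stored trajectory
# CLIP - epsilon of the PPO clipped-surrogate range [1-CLIP, 1+CLIP]
# UPDATE_TIME - number of passes over the trajectory per learn() call
# max_grad_norm - gradient-norm clipping threshold for the policy update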
GAMMA=0.99 17 | LR=0.001 18 | BATCH_SIZE=32 19 | CLIP=0.2 20 | UPDATE_TIME=10 21 | max_grad_norm=0.5 22 | Transition = namedtuple('Transition', ['state', 'action', 'prob', 'reward']) 23 | 24 | 25 | class PPO_v1(): 26 | 27 | def __init__(self, state_size, action_size,continuous=False): 28 | self.policy = ActorDiscrete(state_size, action_size).to(device) 29 | self.continuous = continuous 30 | if self.continuous: 31 | self.policy = ActorContinous(state_size, action_size).to(device) 32 | self.optimizer=optim.Adam(self.policy.parameters(), lr=LR) 33 | self.trajectory=[] 34 | 35 | def update_policy(self,exps,i_episode): 36 | """ 37 | update the policy for each sampled transition group 38 | called by learn() multiple times for one episode 39 | """ 40 | states,actions,old_probs,f_Rewrds=exps 41 | # get action probs from new policy 42 | if self.continuous: 43 | (mus, sigmas) = self.policy(states) 44 | dists = Normal(mus, sigmas) 45 | new_probs = dists.log_prob(actions) 46 | ratios = torch.exp(new_probs - old_probs) 47 | else: 48 | new_probs = self.policy(states).gather(1, actions) 49 | ratios = new_probs / old_probs 50 | 51 | # calculate clipped surrogate function 52 | surr1 = ratios * f_Rewrds 53 | surr2 = torch.clamp(ratios, 1 - CLIP, 1 + CLIP) * f_Rewrds 54 | policy_loss=-torch.min(surr1,surr2).mean() 55 | 56 | # update policy network 57 | self.optimizer.zero_grad() 58 | policy_loss.backward() 59 | nn.utils.clip_grad_norm_(self.policy.parameters(), max_grad_norm) 60 | self.optimizer.step() 61 | 62 | # self.traintime_counter+=1 63 | 64 | def learn(self,i_episode): 65 | """ 66 | the agent learns after finishing each episode, 67 | from the experiences of this trajectory 68 | :return: 69 | """ 70 | states=torch.cat([t.state for t in self.trajectory]) 71 | actions=torch.tensor([t.action for t in self.trajectory],dtype=torch.long).view(-1,1) 72 | old_probs=torch.tensor([t.prob for t in self.trajectory],dtype=torch.float).view(-1,1) 73 | 74 | # -- calculate discount future rewards for every time step 75 | rewards = [t.reward for t in self.trajectory] 76 | fur_Rewards = [] 77 | for i in range(len(rewards)): 78 | discount = [GAMMA ** i for i in range(len(rewards) - i)] 79 | f_rewards = rewards[i:] 80 | fur_Rewards.append(sum(d * f for d, f in zip(discount, f_rewards))) 81 | fur_Rewards=torch.tensor(fur_Rewards,dtype=torch.float).view(-1,1) 82 | 83 | for i in range(UPDATE_TIME): 84 | # -- repeat the following update loop several times 85 | # shuffle the transitions in the trajectory into random mini-batches 86 | for index_set in BatchSampler(SubsetRandomSampler(range(len(self.trajectory))), BATCH_SIZE, False): 87 | exps=(states[index_set],actions[index_set],old_probs[index_set],fur_Rewards[index_set]) 88 | # -- update the policy network for every mini-batch 89 | self.update_policy(exps,i_episode) 90 | 91 | del self.trajectory[:] # clear trajectory 92 | 93 | 94 | def train(self,env,i_episode): 95 | state = env.reset() 96 | total_reward=0 97 | while True: 98 | # self.timesetp_counter+=1 99 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) # add batch dim: 1d -> 2d 100 | action, prob = self.policy.act(state) # discrete actions return the raw prob, continuous actions return the log prob 101 | next_state, reward, done, _ = env.step(action) 102 | 103 | # --store transition in this current trajectory 104 | self.trajectory.append(Transition(state,action,prob,reward)) 105 | state=next_state 106 | total_reward+=reward 107 | if done: 108 | break 109 | # -- agent learns after finishing the current episode, if there are enough transitions 110 | if BATCH_SIZE <= len(self.trajectory): 111 
| self.learn(i_episode) 112 | 113 | return total_reward 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /Policy_Gradient/PPO/PPO_v2.py: -------------------------------------------------------------------------------- 1 | """ 2 | PPO_v2: the loss is computed from the advantage function; includes a critic network 3 | """ 4 | import random 5 | import numpy as np 6 | import gym 7 | from collections import namedtuple 8 | from collections import deque 9 | import torch 10 | import torch.optim as optim 11 | from torch.distributions import Normal 12 | import torch.nn.functional as F 13 | import torch.nn as nn 14 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 15 | from Policy_Gradient.PPO.PPO_model import ActorContinous,ActorDiscrete,Critic 16 | from torch.utils.tensorboard import SummaryWriter 17 | 18 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 19 | writer = SummaryWriter('./board/logs') 20 | 21 | GAMMA=0.99 22 | LR_a=0.001 23 | LR_c=0.003 24 | BATCH_SIZE=32 25 | CLIP=0.2 26 | BETA=0.01 27 | UPDATE_TIME=10 28 | max_grad_norm=0.5 29 | 30 | Transition = namedtuple('Transition', ['state', 'action', 'prob', 'reward']) 31 | 32 | 33 | class Memory(): 34 | def __init__(self): 35 | self.trajectory=[] 36 | self.Transition = namedtuple('Transition', ['state', 'action', 'prob', 'reward']) 37 | 38 | def add(self,state,action,prob,reward): 39 | # state = torch.from_numpy(state).float().unsqueeze(0).to(device) 40 | self.trajectory.append(self.Transition(state,action,prob,reward)) 41 | 42 | def clean_buffer(self): 43 | del self.trajectory[:] 44 | 45 | def get_trajectory(self): 46 | states = torch.cat([t.state for t in self.trajectory]) 47 | actions = torch.tensor([t.action for t in self.trajectory], dtype=torch.long).view(-1, 1) 48 | probs = torch.tensor([t.prob for t in self.trajectory], dtype=torch.float).view(-1, 1) 49 | rewards = [t.reward for t in self.trajectory] 50 | return states,actions,probs,rewards 51 | 52 | def __len__(self): 53 | return len(self.trajectory) 54 | 55 | 56 | class PPO_v2(): 57 | def __init__(self,state_size, action_size,continuous=False,add_entropy=True): 58 | 59 | self.critic = Critic(state_size) 60 | self.policy = ActorDiscrete(state_size, action_size).to(device) 61 | self.continuous = continuous 62 | if self.continuous: 63 | self.policy = ActorContinous(state_size, action_size).to(device) 64 | 65 | self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=LR_a) 66 | self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=LR_c) 67 | 68 | self.memory = Memory() 69 | self.train_step = 0 70 | self.add_entropy=add_entropy 71 | 72 | def policy_loss(self,states,actions, 73 | old_probs,f_Rewrds,V): 74 | 75 | # get action probs from new policy and calculate the ratio 76 | if self.continuous: 77 | (mus, sigmas) = self.policy(states) 78 | dists = Normal(mus, sigmas) 79 | new_probs = dists.log_prob(actions) 80 | ratios = torch.exp(new_probs - old_probs) 81 | else: 82 | new_probs = self.policy(states).gather(1, actions) 83 | ratios = new_probs / old_probs 84 | 85 | # calculate the advantage from the critic's value estimate 86 | advantage = (f_Rewrds - V).detach() 87 | 88 | # calculate clipped surrogate function 89 | surr1 = ratios * advantage 90 | surr2 = torch.clamp(ratios, 1 - CLIP, 1 + CLIP) * advantage 91 | policy_loss = -torch.min(surr1, surr2) 92 | 93 | if self.add_entropy: 94 | # include a regularization term; this steers new_policy towards 0.5 95 | # add in 1.e-10 to avoid log(0) which gives nan 96 | entropy=
-(new_probs*torch.log(old_probs+1.e-10)+ (1.0-new_probs)*torch.log(1.0-old_probs+1.e-10)) 97 | policy_loss+=BETA*entropy 98 | 99 | policy_loss=torch.mean(policy_loss) 100 | 101 | return policy_loss 102 | 103 | def critic_loss(self,f_Rewrds, V): 104 | return F.mse_loss(f_Rewrds, V) 105 | 106 | def update_policy(self,exps,i_episode): 107 | states, actions, old_probs, f_Rewrds = exps 108 | V = self.critic(states) 109 | 110 | # -- update policy(actor) network -- # 111 | policy_loss = self.policy_loss(states,actions,old_probs,f_Rewrds,V) 112 | # self.writer.add_scalar('loss/policy_loss', policy_loss, global_step=self.train_step) 113 | # update parameters 114 | self.policy_optimizer.zero_grad() 115 | policy_loss.backward() 116 | # nn.utils.clip_grad_norm_(self.policy.parameters(), max_grad_norm) 117 | self.policy_optimizer.step() 118 | 119 | # -- update value(critic) network -- # 120 | value_loss = self.critic_loss(f_Rewrds,V) 121 | # self.writer.add_scalar('loss/value_loss', value_loss, global_step=self.train_step) 122 | self.critic_optimizer.zero_grad() 123 | value_loss.backward() 124 | # nn.utils.clip_grad_norm_(self.critic.parameters(), max_grad_norm) 125 | self.critic_optimizer.step() 126 | 127 | self.train_step+=1 128 | writer.add_scalar('loss/policy_loss',policy_loss.item(),i_episode) 129 | writer.add_scalar('loss/value_loss', value_loss.item(), i_episode) 130 | writer.flush() 131 | 132 | def learn(self,i_episode): 133 | """ 134 | the agent learns after finishing each episode, 135 | from the experiences of this trajectory 136 | :return: 137 | """ 138 | # states=torch.cat([t.state for t in self.memory.trajectory]) 139 | # actions=torch.tensor([t.action for t in self.memory.trajectory],dtype=torch.long).view(-1,1) 140 | # old_probs=torch.tensor([t.prob for t in self.memory.trajectory],dtype=torch.float).view(-1,1) 141 | # rewards = [t.reward for t in self.memory.trajectory] 142 | 143 | states, actions, old_probs, rewards = self.memory.get_trajectory() 144 | # -- calculate discount future rewards for every time step 145 | fur_Rewards = [] 146 | for i in range(len(rewards)): 147 | discount = [GAMMA ** i for i in range(len(rewards) - i)] 148 | f_rewards = rewards[i:] 149 | fur_Rewards.append(sum(d * f for d, f in zip(discount, f_rewards))) 150 | fur_Rewards=torch.tensor(fur_Rewards,dtype=torch.float).view(-1,1) 151 | 152 | for i in range(UPDATE_TIME): 153 | # -- repeat the following update loop several times 154 | # shuffle the transitions in the trajectory into random mini-batches 155 | for index_set in BatchSampler(SubsetRandomSampler(range(self.memory.__len__())), BATCH_SIZE, False): 156 | exps=(states[index_set],actions[index_set],old_probs[index_set],fur_Rewards[index_set]) 157 | # -- update the policy network for every mini-batch 158 | self.update_policy(exps,i_episode) 159 | 160 | self.memory.clean_buffer() 161 | 162 | def train(self,env,i_episode): 163 | state = env.reset() 164 | total_reward=0 165 | while True: 166 | # self.timesetp_counter+=1 167 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) # add batch dim: 1d -> 2d 168 | action,prob = self.policy.act(state) # discrete actions return the raw prob, continuous actions return the log prob 169 | next_state, reward, done, _ = env.step(action) 170 | # --store transition in this current trajectory 171 | self.memory.add(state,action,prob,reward) 172 | state=next_state 173 | total_reward+=reward 174 | if done: 175 | break 176 | # -- agent learns after finishing the current episode, if there are enough transitions 177 | if BATCH_SIZE <= self.memory.__len__(): 178 | self.learn(i_episode) 179 | 180 | return 
total_reward 181 | 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /Policy_Gradient/PPO/__pycache__/PPO_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/__pycache__/PPO_model.cpython-37.pyc -------------------------------------------------------------------------------- /Policy_Gradient/PPO/__pycache__/PPO_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/__pycache__/PPO_model.cpython-38.pyc -------------------------------------------------------------------------------- /Policy_Gradient/PPO/__pycache__/PPO_v1.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/__pycache__/PPO_v1.cpython-38.pyc -------------------------------------------------------------------------------- /Policy_Gradient/PPO/__pycache__/PPO_v2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/__pycache__/PPO_v2.cpython-38.pyc -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/.DS_Store -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/.DS_Store -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608693869.bogon.80327.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608693869.bogon.80327.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608694041.bogon.80355.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608694041.bogon.80355.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608778854.bogon.82580.0: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608778854.bogon.82580.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779119.bogon.82611.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779119.bogon.82611.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779166.bogon.82627.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779166.bogon.82627.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779638.bogon.82655.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779638.bogon.82655.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779657.bogon.82666.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779657.bogon.82666.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608780330.bogon.82692.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608780330.bogon.82692.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/board/logs/events.out.tfevents.1608780689.bogon.82718.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608780689.bogon.82718.0 -------------------------------------------------------------------------------- /Policy_Gradient/PPO/cartPole_ppo-v1_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/cartPole_ppo-v1_1.png -------------------------------------------------------------------------------- /Policy_Gradient/PPO/main_PPO.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import arrow 3 | import gym 4 | import numpy as np 5 | import pandas as pd 6 | from collections import deque 7 | import 
matplotlib.pyplot as plt 8 | from Policy_Gradient.PPO.PPO_v2 import PPO_v2 9 | from Policy_Gradient.PPO.PPO_v1 import PPO_v1 10 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 11 | 12 | 13 | def output_scores(start_time,i_episode,scores_deque,score,solve_limit): 14 | print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}' 15 | .format(i_episode, np.mean(scores_deque), score), end="") 16 | if i_episode % 100 == 0: 17 | print('\rEpisode {}\tAverage Score: {:.2f}\t Running time so far: {}' 18 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time)) 19 | if np.mean(scores_deque) >= solve_limit: 20 | print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}\t Total running time: {}' 21 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time)) 22 | return True 23 | 24 | return False 25 | 26 | 27 | def plot_scores(scores,filename): 28 | plt.plot(np.arange(1, len(scores) + 1), scores) 29 | plt.ylabel('Score') 30 | plt.xlabel('Episode #') 31 | plt.savefig(filename) 32 | plt.show() 33 | 34 | 35 | def get_env_prop(env_name, continuous): 36 | env = gym.make(env_name) 37 | state_dim = env.observation_space.shape[0] 38 | if continuous: 39 | action_dim = env.action_space.shape[0] 40 | else: 41 | action_dim = env.action_space.n 42 | 43 | return env,state_dim, action_dim 44 | 45 | 46 | def train_agent_for_env(env_name,continuous,n_episode,model_file,solve_limit): 47 | """ 48 | continuous: whether the action space is continuous (True/False) 49 | model_file: path for saving the trained model 50 | solve_limit: score threshold at which the environment counts as solved 51 | """ 52 | env, state_dim, action_dim = get_env_prop(env_name,continuous) 53 | 54 | agent = PPO_v1(state_dim,action_dim,continuous) 55 | scores_deque = deque(maxlen=100) 56 | scores = [] 57 | 58 | start_time = arrow.now() 59 | for i_episode in range(1, n_episode + 1): 60 | total_reward = agent.train(env,i_episode) 61 | # record scores(total rewards) per episode 62 | scores_deque.append(total_reward) 63 | scores.append(total_reward) 64 | solved = output_scores(start_time, i_episode, scores_deque, total_reward,solve_limit) 65 | if solved: 66 | torch.save(agent.policy.state_dict(), model_file) 67 | break 68 | 69 | return agent, scores 70 | 71 | 72 | def watch_random_agent(env_name,continuous): 73 | env, state_dim, action_dim = get_env_prop(env_name, continuous) 74 | for _ in range(5): 75 | env.reset() 76 | while True: 77 | env.render() 78 | next_state, reward, done, _ =env.step(env.action_space.sample()) 79 | if done: 80 | break 81 | 82 | env.close() 83 | 84 | 85 | def watch_smart_agent(env_name,continuous,model_name,n_episode): 86 | env,state_dim, action_dim = get_env_prop(env_name,continuous) 87 | agent=PPO_v1(state_dim,action_dim,continuous) 88 | agent.policy.load_state_dict(torch.load(model_name)) 89 | 90 | scores =[] 91 | for i_episode in range(1, n_episode + 1): 92 | rewards = [] 93 | state = env.reset() 94 | while True: 95 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 96 | action, _ = agent.policy.act(state) 97 | env.render() 98 | state, reward, done, _ = env.step(action) 99 | rewards.append(reward) 100 | if done: 101 | break 102 | scores.append(sum(rewards)) 103 | return scores 104 | 105 | 106 | if __name__=="__main__": 107 | """train PPO agent in CartPole (discrete action space)""" 108 | # agent_cartPole,scores_1 = train_agent_for_env('CartPole-v0',False,2000, 109 | # 'models/cartPole_ppo-v1_1.pth',195) 110 | # plot_scores(scores_1,'cartPole_ppo-v1_1.png') 111 | 112 | # watch an untrained random agent 113 | # watch_random_agent('CartPole-v0',False) 114 | # 
test the trained agent 115 | # test_scores=watch_smart_agent('CartPole-v0',False,'models/PPO_new.pth',100) 116 | # plot_scores(test_scores,"PPO_cartPole_test.png") 117 | 118 | """train PPO agent in MountainCarContinuous (continuous action space)""" 119 | agent_mCar, scores_2 = train_agent_for_env('MountainCarContinuous-v0', True, 2000, 120 | 'models/mCar_ppo-v1.pth',95) 121 | plot_scores(scores_2, 'mCar_ppo-v1_1.png') 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /Policy_Gradient/PPO/models/PPO_new.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/models/PPO_new.pth -------------------------------------------------------------------------------- /Policy_Gradient/PPO/models/cartPole_ppo.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/models/cartPole_ppo.pth -------------------------------------------------------------------------------- /Policy_Gradient/PPO/models/cartPole_ppo_20201222.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/models/cartPole_ppo_20201222.pth -------------------------------------------------------------------------------- /Policy_Gradient/PPO_cnn/cnn_ppo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class ActorCritic(torch.nn.Module): 8 | def __init__(self, input_shape, output_shape): 9 | super(ActorCritic, self).__init__() 10 | self.conv1 = nn.Conv2d(input_shape, 32, 3, stride=2, padding=1) 11 | self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 12 | self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 13 | self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 14 | 15 | self.critic_linear = nn.Linear(256, 1) 16 | self.actor_linear = nn.Linear(256, output_shape) 17 | # forward() is not implemented in this file; see the hedged completion sketch at the end of this document 18 | -------------------------------------------------------------------------------- /Policy_Gradient/envTest.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from collections import deque 4 | import matplotlib.pyplot as plt 5 | 6 | env = gym.make('CartPole-v0') 7 | env.seed(0) 8 | 9 | print('observation space:', env.observation_space) 10 | print('action space:', env.action_space) 11 | print('action size:', env.action_space.n) 12 | 13 | # watch an untrained random agent 14 | state = env.reset() 15 | print(state) 16 | 17 | done=False 18 | for _ in range(5000): 19 | env.render() 20 | if not done: 21 | next_state, reward, done, _ =env.step(env.action_space.sample()) 22 | print(next_state, reward) 23 | else: 24 | break 25 | 26 | env.close() 27 | -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/.DS_Store -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO-A vs. PPO-R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A vs. PPO-R.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO-A vs.PPO-R_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A vs.PPO-R_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO-A_train_5_times.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A_train_5_times.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO-A_train_5_times1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A_train_5_times1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO-A_train_5times_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A_train_5times_2.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO-entropy_5times.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-entropy_5times.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_cartPole_20201222.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_cartPole_20201222.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_cartPole_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_cartPole_test.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_cartPole_train.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_cartPole_train.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_comparison_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_comparison_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison_2.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_comparison_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison_3.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_comparison_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison_4.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_multiple_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_multiple_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_with_entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_with_entropy.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/PPO_with_entropy_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_with_entropy_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/cartpole_reinforce.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/cartpole_reinforce.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/diff_algorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/diff_algorithm.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/pg_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/pg_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/pg_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/pg_2.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/pg_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/pg_3.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/reinforce_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/reinforce_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_2.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/reinforce_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_3.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/reinforce_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_4.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/reinforce_5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_5.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/reinforce_vs_pg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_vs_pg.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/rf-vs-pg_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/rf-vs-pg_1.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/rf-vs-pg_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/rf-vs-pg_2.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/rf-vs-pg_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/rf-vs-pg_3.png -------------------------------------------------------------------------------- /Policy_Gradient/results&plots/rf-vs-pg_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/rf-vs-pg_4.png --------------------------------------------------------------------------------
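The ActorCritic skeleton in /Policy_Gradient/PPO_cnn/cnn_ppo.py above stops after declaring four stride-2 convolutions and two 256-unit heads; it has no flatten step, no forward(), and no action-sampling helper, so a PPO training loop cannot use it yet. The sketch below is one hedged way it could be finished, not the repository's implementation: the 84x84 input resolution, the 4-frame channel stack, the intermediate fc projection to 256 features, and the act() helper are all assumptions introduced here.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCriticSketch(nn.Module):
    """Hypothetical completion of the ActorCritic skeleton in cnn_ppo.py (assumptions noted above)."""

    def __init__(self, in_channels, n_actions, input_size=84):
        super(ActorCriticSketch, self).__init__()
        # same conv stack as cnn_ppo.py: four 3x3 convolutions with stride 2, padding 1
        self.conv1 = nn.Conv2d(in_channels, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        # infer the flattened feature size for the assumed input resolution
        with torch.no_grad():
            dummy = torch.zeros(1, in_channels, input_size, input_size)
            conv_out = self._convs(dummy).view(1, -1).size(1)
        self.fc = nn.Linear(conv_out, 256)
        # heads keep the shapes used in cnn_ppo.py
        self.critic_linear = nn.Linear(256, 1)           # state value V(s)
        self.actor_linear = nn.Linear(256, n_actions)    # action logits

    def _convs(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        return F.relu(self.conv4(x))

    def forward(self, x):
        x = self._convs(x).view(x.size(0), -1)
        x = F.relu(self.fc(x))
        return self.actor_linear(x), self.critic_linear(x)

    def act(self, state):
        # sample an action and return (action, prob, value), mirroring the
        # (action, prob) convention of ActorDiscrete in PPO_model.py
        logits, value = self.forward(state)
        probs = F.softmax(logits, dim=1)
        m = Categorical(probs)
        action = m.sample()
        return action.item(), probs[0, action.item()].item(), value


if __name__ == "__main__":
    net = ActorCriticSketch(in_channels=4, n_actions=6)  # e.g. 4 stacked Atari frames
    obs = torch.zeros(1, 4, 84, 84)
    action, prob, value = net.act(obs)
    print(action, prob, value.item())

Inferring the flattened convolution size with a dummy forward pass avoids hard-wiring the 256-feature assumption to one specific input resolution, while the two heads keep the same shapes as the original file.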