├── .DS_Store
├── .idea
│   ├── .gitignore
│   ├── DRL_pytorch.iml
│   ├── deployment.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── other.xml
│   ├── vcs.xml
│   └── webServers.xml
├── Actor_Critic
│   ├── A3C
│   │   ├── Pendulum_A3C_1.png
│   │   ├── __pycache__
│   │   │   ├── agent_a3c.cpython-37.pyc
│   │   │   ├── agent_a3c.cpython-38.pyc
│   │   │   ├── untils.cpython-37.pyc
│   │   │   ├── untils.cpython-38.pyc
│   │   │   ├── worker.cpython-37.pyc
│   │   │   └── worker.cpython-38.pyc
│   │   ├── a3c_main.py
│   │   ├── agent_a3c.py
│   │   ├── untils.py
│   │   └── worker.py
│   └── SAC
│       ├── sac_agent.py
│       ├── sac_main.py
│       └── sac_network.py
├── BlackBox_optimazation
│   ├── Hill_Climbing
│   │   ├── __pycache__
│   │   │   └── agent_HC.cpython-36.pyc
│   │   ├── agent_HC.py
│   │   └── main_hillClimb.py
│   └── cross_entropy_method
│       ├── CEM.png
│       ├── __pycache__
│       │   └── agent_cem.cpython-37.pyc
│       ├── agent_cem.py
│       ├── checkpoint.pth
│       └── main_cem.py
├── DDPGs
│   ├── DDPG
│   │   ├── DDPG_agent.py
│   │   ├── DDPG_main.py
│   │   ├── DDPG_model.py
│   │   ├── ddpg_1.py
│   │   └── model_save
│   │       ├── actor1.pth
│   │       ├── actor2.pth
│   │       ├── checkpoint_actor.pth
│   │       ├── checkpoint_critic.pth
│   │       ├── critic1.pth
│   │       └── critic2.pth
│   └── TD3
│       ├── TD3_agent.py
│       ├── TD3_main.py
│       ├── TD3_model.py
│       ├── TD3_new.py
│       ├── TD3_solved.png
│       ├── __pycache__
│       │   ├── TD3_model.cpython-38.pyc
│       │   └── TD3_new.cpython-38.pyc
│       ├── models
│       │   ├── TD3_actor.pth
│       │   └── TD3_critic.pth
│       ├── scores_saved.csv
│       └── test.py
├── DQNs
│   ├── .DS_Store
│   ├── DDQN
│   │   ├── .DS_Store
│   │   ├── DQN_main.py
│   │   ├── Deep_Q_Network.ipynb
│   │   ├── __pycache__
│   │   │   ├── ddqn_v3.cpython-38.pyc
│   │   │   └── model_dueling.cpython-38.pyc
│   │   ├── ddqn_v1.py
│   │   ├── ddqn_v2.py
│   │   ├── ddqn_v3.py
│   │   ├── dqn.py
│   │   ├── images
│   │   │   ├── Total Average reward scores plot.png
│   │   │   ├── ddqn_agent_scores.png
│   │   │   ├── ddqn_testing_scores.png
│   │   │   ├── double_dqn_v1.png
│   │   │   ├── dueling-ddqn_testing.png
│   │   │   ├── dueling-ddqn_training.png
│   │   │   ├── runningResult.png
│   │   │   └── runningResult_1.png
│   │   ├── model.py
│   │   ├── model_dueling.py
│   │   ├── models
│   │   │   ├── checkpoint.pth
│   │   │   ├── dueling_model.pth
│   │   │   └── org_dqn.pth
│   │   ├── old_agent.py
│   │   ├── play_env.py
│   │   └── test.py
│   ├── DQN_PER
│   │   ├── .DS_Store
│   │   ├── PER_memory.py
│   │   ├── Plots
│   │   │   ├── cnn_per.png
│   │   │   ├── epsilon_1.png
│   │   │   ├── epsilon_2.png
│   │   │   ├── epsilon_exp-1.png
│   │   │   ├── epsilon_exp-2.png
│   │   │   ├── epsilon_exp-3.png
│   │   │   ├── epsilon_linear-1.png
│   │   │   ├── train_1.png
│   │   │   ├── train_2.png
│   │   │   ├── train_DQN_per.png
│   │   │   ├── train_exp-1.png
│   │   │   ├── train_exp-2.png
│   │   │   ├── train_exp-3.png
│   │   │   ├── train_exp.png
│   │   │   └── train_linear-1.png
│   │   ├── SumTree.py
│   │   ├── __pycache__
│   │   │   ├── PER_memory.cpython-38.pyc
│   │   │   ├── SumTree.cpython-38.pyc
│   │   │   ├── atari_wappers.cpython-38.pyc
│   │   │   ├── dqn_model.cpython-38.pyc
│   │   │   └── dqn_per.cpython-38.pyc
│   │   ├── atari_wappers.py
│   │   ├── dqn_model.py
│   │   ├── dqn_per.py
│   │   ├── main_dqn_per.py
│   │   └── train_20210520.log
│   ├── DQN_cnn
│   │   ├── .DS_Store
│   │   ├── Models
│   │   │   ├── CNN_model|03-29#19:21.pth
│   │   │   ├── CNN_model|03-30#11:19.pth
│   │   │   ├── CNN_model|03-30#21:05.pth
│   │   │   ├── CNN_model|03-31#19:32.pth
│   │   │   ├── dqnCNN_model_0324.pth
│   │   │   └── dqn_model.pth
│   │   ├── Plots
│   │   │   ├── test-score|03-25#20:00.png
│   │   │   ├── test-score|03-26#09:15.png
│   │   │   ├── test-score|03-26#09:45.png
│   │   │   ├── train-score|03-29#19:21.png
│   │   │   ├── train-score|03-30#11:19.png
│   │   │   ├── train-score|03-30#21:05.png
│   │   │   └── train-score|03-31#19:32.png
│   │   ├── ReadMe.md
│   │   ├── __pycache__
│   │   │   ├── atari_wappers.cpython-38.pyc
│   │   │   ├── cnn_model.cpython-38.pyc
│   │   │   └── dqn_agent.cpython-38.pyc
│   │   ├── atari_wappers.py
│   │   ├── cnn_model.py
│   │   ├── dqn_agent.py
│   │   ├── image
│   │   │   ├── pic-0.jpg
│   │   │   ├── pic-100.jpg
│   │   │   ├── pic-140.jpg
│   │   │   ├── pic-152.jpg
│   │   │   ├── pic-167.jpg
│   │   │   ├── pic-185.jpg
│   │   │   ├── pic-200.jpg
│   │   │   ├── pic-204.jpg
│   │   │   ├── pic-227.jpg
│   │   │   ├── pic-300.jpg
│   │   │   ├── pic-400.jpg
│   │   │   ├── pic-500.jpg
│   │   │   ├── pic-600.jpg
│   │   │   ├── pic-674.jpg
│   │   │   ├── pic-683.jpg
│   │   │   ├── pic-696.jpg
│   │   │   ├── pic-700.jpg
│   │   │   ├── pic-714.jpg
│   │   │   ├── pic-733.jpg
│   │   │   ├── pic-756.jpg
│   │   │   ├── pic-800.jpg
│   │   │   ├── pic-900.jpg
│   │   │   ├── pic-902.jpg
│   │   │   ├── pic-909.jpg
│   │   │   ├── pic-920.jpg
│   │   │   ├── pic-936.jpg
│   │   │   └── pic-956.jpg
│   │   ├── log
│   │   │   ├── train_20210326.log
│   │   │   ├── train_20210329.log
│   │   │   ├── train_20210329_1.log
│   │   │   ├── train_20210330.log
│   │   │   └── train_20210331.log
│   │   ├── main_dqn_atari.py
│   │   ├── main_test.py
│   │   ├── play_atari.py
│   │   └── train_20210401.log
│   └── DQN_iws
│       └── ref_ddqn_iws.py
├── Evaluation_Algorithms
│   └── CartPole.py
├── Games_play_train
│   └── atari.py
└── Policy_Gradient
    ├── .DS_Store
    ├── PGs
    │   ├── __pycache__
    │   │   ├── agent_PG.cpython-37.pyc
    │   │   ├── model.cpython-36.pyc
    │   │   └── model.cpython-37.pyc
    │   ├── agent_PG.py
    │   ├── main_PG.py
    │   ├── model.py
    │   └── models
    │       ├── PPO_model-1.pth
    │       ├── PPO_new.pth
    │       ├── PPOv2_model-1.pth
    │       ├── pg_model_1.pth
    │       ├── pg_model_2.pth
    │       ├── pg_model_3.pth
    │       ├── pg_model_4.pth
    │       ├── reinforce_model_2.pth
    │       ├── reinforce_model_3.pth
    │       ├── reinforce_model_4.pth
    │       ├── reinforce_model_5.pth
    │       └── reinforce_model_6.pth
    ├── PPO
    │   ├── .DS_Store
    │   ├── PPO_model.py
    │   ├── PPO_v1.py
    │   ├── PPO_v2.py
    │   ├── __pycache__
    │   │   ├── PPO_model.cpython-37.pyc
    │   │   ├── PPO_model.cpython-38.pyc
    │   │   ├── PPO_v1.cpython-38.pyc
    │   │   └── PPO_v2.cpython-38.pyc
    │   ├── board
    │   │   ├── .DS_Store
    │   │   └── logs
    │   │       ├── .DS_Store
    │   │       ├── events.out.tfevents.1608693869.bogon.80327.0
    │   │       ├── events.out.tfevents.1608694041.bogon.80355.0
    │   │       ├── events.out.tfevents.1608778854.bogon.82580.0
    │   │       ├── events.out.tfevents.1608779119.bogon.82611.0
    │   │       ├── events.out.tfevents.1608779166.bogon.82627.0
    │   │       ├── events.out.tfevents.1608779638.bogon.82655.0
    │   │       ├── events.out.tfevents.1608779657.bogon.82666.0
    │   │       ├── events.out.tfevents.1608780330.bogon.82692.0
    │   │       └── events.out.tfevents.1608780689.bogon.82718.0
    │   ├── cartPole_ppo-v1_1.png
    │   ├── main_PPO.py
    │   └── models
    │       ├── PPO_new.pth
    │       ├── cartPole_ppo.pth
    │       └── cartPole_ppo_20201222.pth
    ├── PPO_cnn
    │   └── cnn_ppo.py
    ├── envTest.py
    └── results&plots
        ├── .DS_Store
        ├── PPO-A vs. PPO-R.png
        ├── PPO-A vs.PPO-R_1.png
        ├── PPO-A_train_5_times.png
        ├── PPO-A_train_5_times1.png
        ├── PPO-A_train_5times_2.png
        ├── PPO-entropy_5times.png
        ├── PPO_cartPole_20201222.png
        ├── PPO_cartPole_test.png
        ├── PPO_cartPole_train.png
        ├── PPO_comparison.png
        ├── PPO_comparison_1.png
        ├── PPO_comparison_2.png
        ├── PPO_comparison_3.png
        ├── PPO_comparison_4.png
        ├── PPO_multiple_1.png
        ├── PPO_with_entropy.png
        ├── PPO_with_entropy_1.png
        ├── cartpole_reinforce.png
        ├── diff_algorithm.png
        ├── pg_1.png
        ├── pg_2.png
        ├── pg_3.png
        ├── reinforce_1.png
        ├── reinforce_2.png
        ├── reinforce_3.png
        ├── reinforce_4.png
        ├── reinforce_5.png
        ├── reinforce_vs_pg.png
        ├── rf-vs-pg_1.png
        ├── rf-vs-pg_2.png
        ├── rf-vs-pg_3.png
        └── rf-vs-pg_4.png
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/.DS_Store
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.idea/DRL_pytorch.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/.idea/deployment.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/other.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/webServers.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
--------------------------------------------------------------------------------
/Actor_Critic/A3C/Pendulum_A3C_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/Pendulum_A3C_1.png
--------------------------------------------------------------------------------
/Actor_Critic/A3C/__pycache__/agent_a3c.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/agent_a3c.cpython-37.pyc
--------------------------------------------------------------------------------
/Actor_Critic/A3C/__pycache__/agent_a3c.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/agent_a3c.cpython-38.pyc
--------------------------------------------------------------------------------
/Actor_Critic/A3C/__pycache__/untils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/untils.cpython-37.pyc
--------------------------------------------------------------------------------
/Actor_Critic/A3C/__pycache__/untils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/untils.cpython-38.pyc
--------------------------------------------------------------------------------
/Actor_Critic/A3C/__pycache__/worker.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/worker.cpython-37.pyc
--------------------------------------------------------------------------------
/Actor_Critic/A3C/__pycache__/worker.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/A3C/__pycache__/worker.cpython-38.pyc
--------------------------------------------------------------------------------
/Actor_Critic/A3C/a3c_main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from Actor_Critic.A3C.agent_a3c import A3C
5 |
6 |
7 | def get_env_prop(env_name, continuous):
8 | env = gym.make(env_name)
9 | state_dim = env.observation_space.shape[0]
10 | if continuous:
11 | action_dim = env.action_space.shape[0]
12 | else:
13 | action_dim = env.action_space.n
14 |
15 | return env,state_dim, action_dim
16 |
17 |
18 | def train_a3c(env_name,continuous):
19 | env,state_size,action_size = get_env_prop(env_name,continuous)
20 | agent = A3C(env,continuous,state_size,action_size)
21 | scores = agent.train_worker()
22 | return scores
23 |
24 |
25 | def train_agent_for_env(env_name,continuous):
26 | env = gym.make(env_name)
27 |
28 | state_dim = env.observation_space.shape[0]
29 | if continuous:
30 | action_dim = env.action_space.shape[0]
31 | else:
32 | action_dim = env.action_space.n
33 |
34 | agent = A3C(env, continuous,state_dim,action_dim)
35 | scores = agent.train_worker()
36 |
37 | return agent,scores
38 |
39 |
40 | def plot_scores(scores,filename):
41 | fig = plt.figure()
42 | ax = fig.add_subplot(111)
43 | plt.plot(np.arange(1, len(scores) + 1), scores)
44 | plt.ylabel('Score')
45 | plt.xlabel('Episode #')
46 | plt.savefig(filename)
47 | plt.show()
48 |
49 |
50 | if __name__ == "__main__":
51 | # env = gym.make("Pendulum-v0")
52 | # train_scores = train_a3c(env,True)
53 |
54 | # train A3C on discrete env : CartPole
55 | agent, scores_cartPole = train_agent_for_env("CartPole-v0",False)
56 | plot_scores(scores_cartPole,"cartPole_trainPlot.png")
57 |
58 | # train A3C on a continuous env : MountainCarContinuous
59 | # a3c_mCar = train_agent_for_env("MountainCarContinuous-v0", True)
60 |
61 |
--------------------------------------------------------------------------------
/Actor_Critic/A3C/agent_a3c.py:
--------------------------------------------------------------------------------
1 | import random
2 | import torch
3 | import torch.optim as optim
4 | import multiprocessing as mp
5 | from multiprocessing import Process
6 | from Actor_Critic.A3C.untils import ValueNetwork,ActorDiscrete,ActorContinous
7 | from Actor_Critic.A3C.worker import Worker
8 |
9 | GAMMA = 0.9
10 | LR = 1e-4
11 | GLOBAL_MAX_EPISODE = 5000
12 |
13 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
14 |
15 |
16 | class A3C():
17 | def __init__(self,env,continuous,state_size,action_size):
18 | self.max_episode=GLOBAL_MAX_EPISODE
19 | self.global_episode = mp.Value('i', 0)  # counter shared across worker processes
20 | self.global_epi_rew = mp.Value('d',0)
21 | self.rew_queue = mp.Queue()
22 | self.worker_num = mp.cpu_count()
23 |
24 | # define the global networks
25 | self.global_valueNet= ValueNetwork(state_size,1).to(device)
26 | # put the global network's parameters into shared memory so each worker process can copy them
27 | self.global_valueNet.share_memory()
28 |
29 | if continuous:
30 | self.global_policyNet = ActorContinous(state_size, action_size).to(device)
31 | else:
32 | self.global_policyNet = ActorDiscrete(state_size, action_size).to(device)
33 | self.global_policyNet.share_memory()
34 |
35 | # global optimizer
36 | self.global_optimizer_policy = optim.Adam(self.global_policyNet.parameters(), lr=LR)
37 | self.global_optimizer_value = optim.Adam(self.global_valueNet.parameters(),lr=LR)
38 |
39 | # define the workers
40 | self.workers=[Worker(env,continuous,state_size,action_size,i,
41 | self.global_valueNet,self.global_optimizer_value,
42 | self.global_policyNet,self.global_optimizer_policy,
43 | self.global_episode,self.global_epi_rew,self.rew_queue,
44 | self.max_episode,GAMMA)
45 | for i in range(self.worker_num)]
46 |
47 | def train_worker(self):
48 | scores=[]
49 | [w.start() for w in self.workers]
50 | while True:
51 | r = self.rew_queue.get()
52 | if r is not None:
53 | scores.append(r)
54 | else:
55 | break
56 | [w.join() for w in self.workers]
57 |
58 | return scores
59 |
60 | def save_model(self):
61 | torch.save(self.global_valueNet.state_dict(), "a3c_value_model.pth")
62 | torch.save(self.global_policyNet.state_dict(), "a3c_policy_model.pth")
63 |
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
/Actor_Critic/A3C/untils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from collections import namedtuple
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.distributions import Categorical
6 | from torch.distributions import Normal
7 |
8 |
9 | class ValueNetwork(nn.Module):
10 |
11 | def __init__(self, input_dim, output_dim):
12 | super(ValueNetwork, self).__init__()
13 | self.fc1 = nn.Linear(input_dim, 256)
14 | self.fc2 = nn.Linear(256, output_dim)
15 |
16 | def forward(self, state):
17 | value = F.relu(self.fc1(state))
18 | value = self.fc2(value)
19 |
20 | return value
21 |
22 |
23 | class ActorDiscrete(nn.Module):
24 | """
25 | Policy network for discrete action spaces.
26 | """
27 | def __init__(self,state_size,action_size):
28 | super(ActorDiscrete, self).__init__()
29 | self.seed = torch.manual_seed(0)
30 | self.fc1 = nn.Linear(state_size, 128)
31 | # self.fc2 = nn.Linear(64,128)
32 | self.fc2= nn.Linear(128, action_size)
33 |
34 | def forward(self, x):
35 | """
36 | Build a network that maps state -> action probs.
37 | """
38 |
39 | x=F.relu(self.fc1(x))
40 | out = F.softmax(self.fc2(x),dim=1)
41 | return out
42 |
43 | def act(self,state):
44 | """
45 | Return the action and its probability.
46 | """
47 | # probs for each action (2d tensor)
48 | probs = self.forward(state)
49 | m = Categorical(probs)
50 | action = m.sample()
51 |
52 | # return action for current state, and the corresponding probability
53 | return action.item(),probs[:,action.item()].item()
54 |
55 |
56 | class ActorContinous(nn.Module):
57 | """
58 | Policy network for continuous action spaces.
59 | """
60 | def __init__(self,state_size,action_size):
61 | super(ActorContinous, self).__init__()
62 | self.fc1 = nn.Linear(state_size, 128)
63 | self.fc2 = nn.Linear(128,128)
64 | self.mu_head = nn.Linear(128, action_size)
65 | self.sigma_head = nn.Linear(128, action_size)
66 |
67 | def forward(self, x):
68 | x = F.relu(self.fc1(x))
69 | x = F.relu(self.fc2(x))
70 | mu = 2.0 * torch.tanh(self.mu_head(x))
71 | sigma = F.softplus(self.sigma_head(x))
72 | return (mu, sigma)
73 |
74 | def act(self,state):
75 | """
76 | Return the action and its log probability.
77 | """
78 | with torch.no_grad():
79 | (mu, sigma) = self.forward(state)  # 2d tensors
80 | dist = Normal(mu, sigma)
81 | action = dist.sample()
82 | action_log_prob = dist.log_prob(action)
83 |
84 | return action.numpy()[0], action_log_prob.numpy()[0]
85 |
86 |
87 |
88 |
89 |
90 |
91 |
--------------------------------------------------------------------------------
/Actor_Critic/A3C/worker.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch.multiprocessing as mp
3 | import torch
4 | import torch.nn.functional as F
5 | from torch.distributions import Normal
6 | from Actor_Critic.A3C.untils import ValueNetwork,ActorDiscrete,ActorContinous
7 |
8 |
9 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
10 |
11 |
12 | class Worker(mp.Process):
13 | def __init__(self,env,continuous,state_size,action_size,id, global_valueNet,global_value_optimizer,
14 | global_policyNet,global_policy_optimizer,
15 | global_epi,global_epi_rew,rew_queue,
16 | max_epi,gamma):
17 | super(Worker, self).__init__()
18 | # define env for individual worker
19 | self.env = env
20 | self.continuous = continuous
21 | self.name = str(id)
22 | self.env.seed(id)
23 | self.state_size = state_size
24 | self.action_size = action_size
25 | self.memory=[]
26 |
27 | # passing global settings to worker
28 | self.global_valueNet,self.global_value_optimizer = global_valueNet,global_value_optimizer
29 | self.global_policyNet,self.global_policy_optimizer = global_policyNet,global_policy_optimizer
30 | self.global_epi,self.global_epi_rew = global_epi,global_epi_rew
31 | self.rew_queue = rew_queue
32 | self.max_epi = max_epi
33 | # self.batch_size = batch_size
34 | self.gamma = gamma
35 |
36 | # define local net for individual worker
37 | self.local_policyNet = ActorDiscrete(self.state_size,self.action_size).to(device)
38 | if self.continuous:
39 | self.local_policyNet = ActorContinous(self.state_size,self.action_size).to(device)
40 | self.local_valueNet = ValueNetwork(self.state_size,1).to(device)
41 |
42 | def sync_global(self):
43 | self.local_valueNet.load_state_dict(self.global_valueNet.state_dict())
44 | self.local_policyNet.load_state_dict(self.global_policyNet.state_dict())
45 |
46 | def calculate_loss(self):
47 | # get experiences from current trajectory
48 | states = torch.tensor([t[0] for t in self.memory], dtype=torch.float)
49 | log_probs = torch.tensor([t[1] for t in self.memory], dtype=torch.float)
50 |
51 | # -- calculate discount future rewards for every time step
52 | rewards = [t[2] for t in self.memory]
53 | fur_Rewards = []
54 | for i in range(len(rewards)):
55 | discount = [self.gamma ** i for i in range(len(rewards) - i)]
56 | f_rewards = rewards[i:]
57 | fur_Rewards.append(sum(d * f for d, f in zip(discount, f_rewards)))
58 | fur_Rewards = torch.tensor(fur_Rewards, dtype=torch.float).view(-1, 1)
59 |
60 | # calculate loss for critic
61 | V = self.local_valueNet(states)
62 | value_loss = F.mse_loss(fur_Rewards, V)
63 |
64 | # compute entropy for policy loss
65 | (mu, sigma) = self.local_policyNet(states)
66 | dist = Normal(mu, sigma)
67 | entropy = 0.5 + 0.5 * math.log(2 * math.pi) + torch.log(dist.scale) # exploration
68 |
69 | # calculate loss for actor
70 | advantage = (fur_Rewards - V).detach()
71 | policy_loss = -advantage * log_probs
72 | policy_loss = (policy_loss - 0.005 * entropy).mean()
73 |
74 | return value_loss,policy_loss
75 |
76 | def update_global(self):
77 | value_loss, policy_loss = self.calculate_loss()
78 |
79 | self.global_value_optimizer.zero_grad()
80 | value_loss.backward()
81 | # propagate local gradients to global parameters
82 | for local_params, global_params in zip(self.local_valueNet.parameters(), self.global_valueNet.parameters()):
83 | global_params._grad = local_params._grad
84 | self.global_value_optimizer.step()
85 |
86 | self.global_policy_optimizer.zero_grad()
87 | policy_loss.backward()
88 | # propagate local gradients to global parameters
89 | for local_params, global_params in zip(self.local_policyNet.parameters(), self.global_policyNet.parameters()):
90 | global_params._grad = local_params._grad
91 | self.global_policy_optimizer.step()
92 |
93 | self.memory=[] # clear trajectory
94 |
95 | def run(self):
96 | while self.global_epi.value < self.max_epi:
97 | state = self.env.reset()
98 | total_reward=0
99 | while True:
100 | state = torch.from_numpy(state).float().unsqueeze(0).to(device)
101 | action, prob = self.local_policyNet.act(state)  # discrete actions return the raw prob, continuous actions the log prob
102 | next_state, reward, done, _ = self.env.step(action)
103 | self.memory.append([state, prob, reward, next_state, done])  # keep the (log) prob at index 1, where calculate_loss reads it
104 | total_reward += reward
105 | state = next_state
106 |
107 | if done:
108 | # recoding global episode and episode reward
109 | with self.global_epi.get_lock():
110 | self.global_epi.value += 1
111 | with self.global_epi_rew.get_lock():
112 | if self.global_epi_rew.value == 0.:
113 | self.global_epi_rew.value = total_reward
114 | else:
115 | # Moving average reward
116 | self.global_epi_rew.value = self.global_epi_rew.value * 0.99 + total_reward * 0.01
117 | self.rew_queue.put(self.global_epi_rew.value)
118 |
119 | print("w{} | episode: {}\t , episode reward:{:.4} \t "
120 | .format(self.name,self.global_epi.value,self.global_epi_rew.value))
121 | break
122 |
123 | # update and sync with the global net when finishing an episode
124 | self.update_global()
125 | self.sync_global()
126 |
127 | self.rew_queue.put(None)
128 |
129 |
130 |
131 |
--------------------------------------------------------------------------------
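
A quick sanity check on the discounted-return loop in Worker.calculate_loss, as a standalone sketch (gamma and the rewards below are made up purely for illustration):

    # Standalone sketch of the discounted-return computation in Worker.calculate_loss
    gamma = 0.9
    rewards = [1.0, 1.0, 1.0]
    fur_Rewards = []
    for i in range(len(rewards)):
        discount = [gamma ** k for k in range(len(rewards) - i)]
        fur_Rewards.append(sum(d * r for d, r in zip(discount, rewards[i:])))
    print(fur_Rewards)  # approximately [2.71, 1.9, 1.0]
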
/Actor_Critic/SAC/sac_agent.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/SAC/sac_agent.py
--------------------------------------------------------------------------------
/Actor_Critic/SAC/sac_main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import arrow
3 | import gym
4 | import numpy as np
5 | import pandas as pd
6 | from matplotlib import pyplot as plt
7 |
8 |
9 | def output_scores(start_time,i_episode,scores_deque,score,solve_limit):
10 | print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'
11 | .format(i_episode, np.mean(scores_deque), score), end="")
12 | if i_episode % 100 == 0:
13 | print('\rEpisode {}\tAverage Score: {:.2f}\t Running time til now :{}'
14 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time))
15 | if np.mean(scores_deque) >= solve_limit:
16 | print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}\t Total running time :{}'
17 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time))
18 | return True
19 |
20 | return False
21 |
22 |
23 | def plot_scores(scores,filename):
24 | plt.plot(np.arange(1, len(scores) + 1), scores)
25 | plt.ylabel('Score')
26 | plt.xlabel('Episode #')
27 | plt.savefig(filename)
28 | plt.show()
29 |
30 |
31 | def get_env_prop(env_name, continuous):
32 | env = gym.make(env_name)
33 | state_dim = env.observation_space.shape[0]
34 | if continuous:
35 | action_dim = env.action_space.shape[0]
36 | else:
37 | action_dim = env.action_space.n
38 |
39 | return env,state_dim, action_dim
40 |
41 |
42 | if __name__=="__main__":
43 | env,state_dim,action_dim = get_env_prop("CartPole-v0",False)
44 |
45 |
46 |
47 |
48 |
--------------------------------------------------------------------------------
/Actor_Critic/SAC/sac_network.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Actor_Critic/SAC/sac_network.py
--------------------------------------------------------------------------------
/BlackBox_optimazation/Hill_Climbing/__pycache__/agent_HC.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/BlackBox_optimazation/Hill_Climbing/__pycache__/agent_HC.cpython-36.pyc
--------------------------------------------------------------------------------
/BlackBox_optimazation/Hill_Climbing/agent_HC.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from collections import deque
4 |
5 |
6 | def hill_climbing(env,policy,n_episodes=1000, max_t=1000, gamma=1.0, print_every=100, noise_scale=1e-2):
7 | """Implementation of hill climbing with adaptive noise scaling.
8 |
9 | Params
10 | ======
11 | n_episodes (int): maximum number of training episodes
12 | max_t (int): maximum number of timesteps per episode
13 | gamma (float): discount rate
14 | print_every (int): how often to print average score (over last 100 episodes)
15 | noise_scale (float): standard deviation of additive noise
16 | """
17 | scores_deque = deque(maxlen=100)
18 | scores = []  # stores each episode's total (undiscounted) reward
19 | best_R = -np.Inf
20 | best_w = policy.w
21 |
22 | for i_episode in range(1, n_episodes + 1):
23 | rewards = []  # reset the reward list for each episode
24 | state = env.reset()
25 | for t in range(max_t):
26 | action = policy.act(state)
27 | state, reward, done, _ = env.step(action)
28 | rewards.append(reward)  # collect the reward for the current time step
29 | if done:
30 | break
31 | # discount factors for each time step
32 | discounts = [gamma ** i for i in range(len(rewards) + 1)]
33 | # discounted return of the current episode
34 | R = sum([a * b for a, b in zip(discounts, rewards)])
35 |
36 | scores_deque.append(sum(rewards))  # append the episode's undiscounted return to the 100-episode window
37 | scores.append(sum(rewards))
38 |
39 | # ------- parameter search ------- #
40 | if R >= best_R:  # found better weights
41 | best_R = R
42 | best_w = policy.w
43 | noise_scale = max(1e-3, noise_scale / 2)  # shrink the search radius (lower bound 1e-3)
44 | policy.w += noise_scale * np.random.rand(*policy.w.shape)
45 | else:  # did not find better weights
46 | noise_scale = min(2, noise_scale * 2)  # widen the search radius (upper bound 2)
47 | policy.w = best_w + noise_scale * np.random.rand(*policy.w.shape)
48 | # --------------------- #
49 |
50 | if i_episode % print_every == 0:
51 | print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
52 | if np.mean(scores_deque) >= 195.0:
53 | print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - 100,
54 | np.mean(scores_deque)))
55 | policy.w = best_w
56 | break
57 |
58 | return scores
--------------------------------------------------------------------------------
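
The hill-climbing routine above adapts its noise scale: the search radius is halved (floored at 1e-3) whenever the new weights match or beat the best discounted return so far, and doubled (capped at 2) otherwise. A minimal sketch of just that schedule, with made-up returns:

    # Sketch of the adaptive noise-scaling schedule from hill_climbing (returns are made up)
    noise_scale, best_R = 1e-2, float("-inf")
    for R in [10.0, 12.0, 9.0, 15.0]:
        if R >= best_R:  # improvement: shrink the search radius
            best_R = R
            noise_scale = max(1e-3, noise_scale / 2)
        else:            # no improvement: widen the search radius
            noise_scale = min(2, noise_scale * 2)
    print(noise_scale)  # 0.0025
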
/BlackBox_optimazation/Hill_Climbing/main_hillClimb.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from collections import deque
4 | import matplotlib.pyplot as plt
5 | from BlackBox_optimazation.Hill_Climbing.agent_HC import hill_climbing
6 |
7 |
8 | class Policy():
9 | """
10 | The policy is a single-layer linear network: P(A) = softmax(W * S).
11 | A softmax on the output turns the scores into probabilities (0-1); there is no hidden layer, i.e. no nonlinear transformation.
12 | Input nodes: s_size; output nodes: a_size.
13 | The weight matrix w has shape state_space x action_space.
14 | """
15 | def __init__(self, s_size=4, a_size=2):
16 | self.w = 1e-4 * np.random.rand(s_size, a_size) # weights for simple linear policy: state_space x action_space
17 |
18 | def forward(self, state):
19 | x = np.dot(state, self.w)
20 | return np.exp(x) / sum(np.exp(x))
21 |
22 | def act(self, state):
23 | probs = self.forward(state)
24 | # action = np.random.choice(2, p=probs) # option 1: stochastic policy
25 | action = np.argmax(probs) # option 2: deterministic policy
26 | return action
27 |
28 |
29 | if __name__=="__main__":
30 | env = gym.make('CartPole-v0')
31 | policy=Policy()
32 |
33 | print(policy.w)
34 |
35 | # train the agent: update the policy weights w
36 | scores = hill_climbing(env,policy)
37 |
38 | # watch the trained agent
39 | state = env.reset()
40 | img = plt.imshow(env.render(mode='rgb_array'))
41 | for t in range(200):
42 | action = policy.act(state)
43 | img.set_data(env.render(mode='rgb_array'))
44 |
45 | state, reward, done, _ = env.step(action)
46 | if done:
47 | break
48 |
49 | env.close()
50 |
51 | # plot the episode score curve
52 | fig = plt.figure()
53 | ax = fig.add_subplot(111)
54 | plt.plot(np.arange(1, len(scores) + 1), scores)
55 | plt.ylabel('Score')
56 | plt.xlabel('Episode #')
57 | plt.show()
--------------------------------------------------------------------------------
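
A toy forward pass of the linear softmax policy defined above (the weights and state below are made up; the real policy starts from small random weights):

    import numpy as np

    # state_space x action_space weight matrix, as in Policy
    w = np.array([[0.1, -0.1],
                  [0.2,  0.0],
                  [0.0,  0.3],
                  [-0.2, 0.1]])
    state = np.array([0.5, -0.5, 1.0, 0.0])
    x = state.dot(w)                    # [-0.05, 0.25]
    probs = np.exp(x) / sum(np.exp(x))  # ~[0.43, 0.57]
    print(probs.argmax())               # deterministic action: 1
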
/BlackBox_optimazation/cross_entropy_method/CEM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/BlackBox_optimazation/cross_entropy_method/CEM.png
--------------------------------------------------------------------------------
/BlackBox_optimazation/cross_entropy_method/__pycache__/agent_cem.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/BlackBox_optimazation/cross_entropy_method/__pycache__/agent_cem.cpython-37.pyc
--------------------------------------------------------------------------------
/BlackBox_optimazation/cross_entropy_method/agent_cem.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.autograd import Variable
6 |
7 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
8 |
9 |
10 | class Agent(nn.Module):
11 | def __init__(self, env, h_size=16):
12 | super(Agent, self).__init__()
13 | self.env = env
14 | # state, hidden layer, action sizes
15 | self.s_size = env.observation_space.shape[0]
16 | self.h_size = h_size
17 | self.a_size = env.action_space.shape[0]
18 | # define layers
19 | self.fc1 = nn.Linear(self.s_size, self.h_size)
20 | self.fc2 = nn.Linear(self.h_size, self.a_size)
21 |
22 | def set_weights(self, weights):
23 | s_size = self.s_size
24 | h_size = self.h_size
25 | a_size = self.a_size
26 | # separate the weights for each layer
27 | fc1_end = (s_size * h_size) + h_size
28 | fc1_W = torch.from_numpy(weights[:s_size * h_size].reshape(s_size, h_size))
29 | fc1_b = torch.from_numpy(weights[s_size * h_size:fc1_end])
30 | fc2_W = torch.from_numpy(weights[fc1_end:fc1_end + (h_size * a_size)].reshape(h_size, a_size))
31 | fc2_b = torch.from_numpy(weights[fc1_end + (h_size * a_size):])
32 | # set the weights for each layer
33 | self.fc1.weight.data.copy_(fc1_W.view_as(self.fc1.weight.data))
34 | self.fc1.bias.data.copy_(fc1_b.view_as(self.fc1.bias.data))
35 | self.fc2.weight.data.copy_(fc2_W.view_as(self.fc2.weight.data))
36 | self.fc2.bias.data.copy_(fc2_b.view_as(self.fc2.bias.data))
37 |
38 | def get_weights_dim(self):
39 | return (self.s_size + 1) * self.h_size + (self.h_size + 1) * self.a_size
40 |
41 | def forward(self, x):
42 | x = F.relu(self.fc1(x))
43 | x = F.tanh(self.fc2(x))
44 | return x.cpu().data
45 |
46 | def evaluate(self, weights, gamma=1.0, max_t=5000):
47 | self.set_weights(weights)
48 | episode_return = 0.0
49 | state = self.env.reset()
50 | for t in range(max_t):
51 | state = torch.from_numpy(state).float().to(device)
52 | action = self.forward(state)
53 | state, reward, done, _ = self.env.step(action)
54 | episode_return += reward * math.pow(gamma, t)
55 | if done:
56 | break
57 | return episode_return
58 |
59 |
60 |
--------------------------------------------------------------------------------
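
The flat weight vector handled by set_weights/get_weights_dim packs fc1 first (weights, then biases) and fc2 second. A small arithmetic check, assuming the MountainCarContinuous-v0 dimensions (s_size=2, a_size=1) and the default h_size=16:

    # How the flat weight vector splits across the two layers
    s_size, h_size, a_size = 2, 16, 1
    fc1_end = s_size * h_size + h_size          # 48: fc1 weights + biases
    total = fc1_end + h_size * a_size + a_size  # 65 == (s_size+1)*h_size + (h_size+1)*a_size
    print(fc1_end, total)                       # 48 65
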
/BlackBox_optimazation/cross_entropy_method/checkpoint.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/BlackBox_optimazation/cross_entropy_method/checkpoint.pth
--------------------------------------------------------------------------------
/BlackBox_optimazation/cross_entropy_method/main_cem.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import math
3 | import numpy as np
4 | from collections import deque
5 | import matplotlib.pyplot as plt
6 | import torch
7 | from BlackBox_optimazation.cross_entropy_method.agent_cem import Agent
8 |
9 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
10 |
11 |
12 | def cem(agent,n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, elite_frac=0.2, sigma=0.5):
13 | """PyTorch implementation of a cross-entropy method.
14 |
15 | Params
16 | ======
17 | n_iterations (int): maximum number of training iterations
18 | max_t (int): maximum number of timesteps per episode
19 | gamma (float): discount rate
20 | print_every (int): how often to print average score (over last 100 episodes)
21 | pop_size (int): size of population at each iteration
22 | elite_frac (float): percentage of top performers to use in update
23 | sigma (float): standard deviation of additive noise
24 | """
25 | n_elite = int(pop_size * elite_frac)
26 |
27 | scores_deque = deque(maxlen=100)
28 | scores = []
29 | best_weight = sigma * np.random.randn(agent.get_weights_dim())
30 |
31 | for i_iteration in range(1, n_iterations + 1):
32 | weights_pop = [best_weight + (sigma * np.random.randn(agent.get_weights_dim())) for i in range(pop_size)]
33 | rewards = np.array([agent.evaluate(weights, gamma, max_t) for weights in weights_pop])
34 |
35 | elite_idxs = rewards.argsort()[-n_elite:]
36 | elite_weights = [weights_pop[i] for i in elite_idxs]
37 | best_weight = np.array(elite_weights).mean(axis=0)
38 |
39 | reward = agent.evaluate(best_weight, gamma=1.0)
40 | scores_deque.append(reward)
41 | scores.append(reward)
42 |
43 | torch.save(agent.state_dict(), 'checkpoint.pth')
44 |
45 | if i_iteration % print_every == 0:
46 | print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, np.mean(scores_deque)))
47 |
48 | if np.mean(scores_deque) >= 90.0:
49 | print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(i_iteration - 100,
50 | np.mean(scores_deque)))
51 | break
52 | return scores
53 |
54 |
55 |
56 | def watch_trained_agent(agent):
57 | # load the weights from file
58 | agent.load_state_dict(torch.load('checkpoint.pth'))
59 |
60 | state = env.reset()
61 | img = plt.imshow(env.render(mode='rgb_array'))
62 | while True:
63 | state = torch.from_numpy(state).float().to(device)
64 | with torch.no_grad():
65 | action = agent(state)
66 | img.set_data(env.render(mode='rgb_array'))
67 | plt.axis('off')
68 | next_state, reward, done, _ = env.step(action)
69 | state = next_state
70 | if done:
71 | break
72 |
73 | env.close()
74 |
75 |
76 | if __name__=="__main__":
77 | env = gym.make('MountainCarContinuous-v0')
78 | env.seed(101)
79 | np.random.seed(101)
80 | agent = Agent(env).to(device)
81 |
82 | # --- train and plot scores --- #
83 | scores = cem(agent)
84 |
85 | # plot the scores
86 | fig = plt.figure()
87 | ax = fig.add_subplot(111)
88 | plt.plot(np.arange(1, len(scores) + 1), scores)
89 | plt.ylabel('Score')
90 | plt.xlabel('Episode #')
91 | plt.show()
92 |
93 | # --- watch a pre-trained agent --- #
94 | watch_trained_agent(agent)
95 |
96 |
97 |
--------------------------------------------------------------------------------
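
The elite-selection step in cem() keeps the indices of the n_elite highest returns (rewards.argsort()[-n_elite:]) and averages their weight vectors. A toy example, with made-up returns and dummy weight vectors:

    import numpy as np

    rewards = np.array([3.0, -1.0, 7.0, 5.0])
    n_elite = 2
    elite_idxs = rewards.argsort()[-n_elite:]               # indices of the two best returns: [3 2]
    weights_pop = [np.full(4, float(i)) for i in range(4)]  # dummy candidate weight vectors
    best_weight = np.array([weights_pop[i] for i in elite_idxs]).mean(axis=0)
    print(elite_idxs, best_weight)                          # [3 2] [2.5 2.5 2.5 2.5]
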
/DDPGs/DDPG/DDPG_main.py:
--------------------------------------------------------------------------------
1 | from collections import deque
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import gym
5 | import torch
6 | from DDPGs.DDPG.DDPG_agent import DDPGAgent
7 |
8 |
9 | def ddpg(env,agent,n_episodes=2000, max_t=700):
10 | scores_deque = deque(maxlen=100)
11 | scores = []
12 |
13 | for i_episode in range(1, n_episodes+1):
14 | state = env.reset()
15 | agent.reset()
16 | score = 0
17 | while True:
18 | # the agent picks an action for the current state (behaviour policy)
19 | action = agent.act(state)
20 | # interact with the environment to get (s, a, r, s', done)
21 | next_state, reward, done, _ = env.step(action)
22 | # hand this time step's experience tuple to the agent
23 | agent.step(i_episode,state, action, reward, next_state, done)
24 | state = next_state
25 | score += reward
26 | if done:
27 | break
28 | scores_deque.append(score)
29 | scores.append(score)
30 |
31 | print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'
32 | .format(i_episode, np.mean(scores_deque), score),end="")
33 | if i_episode % 100 == 0:
34 | torch.save(agent.actor_local.state_dict(), 'model_save/actor2.pth')
35 | torch.save(agent.critic_local.state_dict(), 'model_save/critic2.pth')
36 | print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
37 |
38 | return scores
39 |
40 |
41 | def watch_agent(agent,filename_actor,filename_critic):
42 | agent.actor_local.load_state_dict(torch.load(filename_actor))
43 | agent.critic_local.load_state_dict(torch.load(filename_critic))
44 | state = env.reset()
45 | for t in range(1000):
46 | action = agent.act(state, noise=False)
47 | print(action)
48 | env.render()
49 | state, reward, done, _ = env.step(action)
50 | if done:
51 | break
52 | env.close()
53 |
54 |
55 | def plot_scores(scores):
56 | fig = plt.figure()
57 | ax = fig.add_subplot(111)
58 | plt.plot(np.arange(1, len(scores) + 1), scores)
59 | plt.ylabel('Score')
60 | plt.xlabel('Episode #')
61 | plt.show()
62 |
63 |
64 | if __name__=="__main__":
65 | env = gym.make('BipedalWalker-v2')
66 | env.seed(10)
67 |
68 | # initialise the DDPG agent
69 | agent=DDPGAgent(state_size=env.observation_space.shape[0], action_size=env.action_space.shape[0], seed=10)
70 | # train and record the scores
71 | scores=ddpg(env,agent)
72 | plot_scores(scores)
73 |
74 | # watch_agent(agent,"actor1.pth","critic1.pth")
75 |
76 |
77 |
--------------------------------------------------------------------------------
/DDPGs/DDPG/DDPG_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 |
8 | def hidden_init(layer):
9 | fan_in = layer.weight.data.size()[0]
10 | lim = 1. / np.sqrt(fan_in)
11 | return (-lim, lim)
12 |
13 |
14 | class Actor(nn.Module):
15 | """Actor (Policy) Model.
16 | """
17 |
18 | def __init__(self, state_size, action_size, seed, fc1_units=256,fc2_units=256):
19 | """
20 | MLP with two hidden layers and a tanh output
21 | ======
22 | Input dim: state_size
23 | Output dim: action_size
24 | """
25 | super(Actor, self).__init__()
26 | self.seed = torch.manual_seed(seed)
27 | self.fc1 = nn.Linear(state_size, fc1_units)
28 | self.fc2 = nn.Linear(fc1_units, fc2_units)
29 | self.fc3=nn.Linear(fc2_units,action_size)
30 |
31 | self.reset_parameters()
32 |
33 | def reset_parameters(self):
34 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
35 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
36 | self.fc3.weight.data.uniform_(-3e-3, 3e-3)
37 |
38 | def forward(self, state):
39 | """Build an actor (policy) network that maps states -> actions."""
40 | x = torch.relu(self.fc1(state))
41 | x = torch.relu(self.fc2(x))
42 | return torch.tanh(self.fc3(x))
43 |
44 |
45 | class Critic(nn.Module):
46 | """Critic (Value) Model.
47 | """
48 |
49 | def __init__(self, state_size, action_size, seed, fcs1_units=256, fc2_units=256,fc3_units=128):
50 | """
51 | ======
52 | Input dim: state_size
53 | Output dim: 1
54 | """
55 | super(Critic, self).__init__()
56 | self.seed = torch.manual_seed(seed)
57 | self.fcs1 = nn.Linear(state_size, fcs1_units)
58 | self.fc2 = nn.Linear(fcs1_units+action_size, fc2_units)
59 | self.fc3 = nn.Linear(fc2_units, fc3_units)
60 | self.fc4 = nn.Linear(fc3_units, 1)
61 | self.reset_parameters()
62 |
63 | def reset_parameters(self):
64 | self.fcs1.weight.data.uniform_(*hidden_init(self.fcs1))
65 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
66 | self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
67 | self.fc4.weight.data.uniform_(-3e-3, 3e-3)
68 |
69 | def forward(self, state, action):
70 | """Build a critic (value) network that maps (state, action) pairs -> Q-values."""
71 | xs = F.leaky_relu(self.fcs1(state))
72 | x = torch.cat((xs, action), dim=1)
73 | x = F.leaky_relu(self.fc2(x))
74 | x = F.leaky_relu(self.fc3(x))
75 | return self.fc4(x)
76 |
--------------------------------------------------------------------------------
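
A small shape check for the networks above (dimensions are arbitrary, chosen only for illustration): the critic embeds the state first and concatenates the action before its second layer, while the actor squashes its output to [-1, 1] with tanh.

    import torch
    from DDPGs.DDPG.DDPG_model import Actor, Critic

    state_size, action_size, batch = 24, 4, 16
    actor = Actor(state_size, action_size, seed=0)
    critic = Critic(state_size, action_size, seed=0)
    s = torch.randn(batch, state_size)
    a = actor(s)      # (16, 4), values in [-1, 1]
    q = critic(s, a)  # (16, 1)
    print(a.shape, q.shape)
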
/DDPGs/DDPG/model_save/actor1.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/actor1.pth
--------------------------------------------------------------------------------
/DDPGs/DDPG/model_save/actor2.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/actor2.pth
--------------------------------------------------------------------------------
/DDPGs/DDPG/model_save/checkpoint_actor.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/checkpoint_actor.pth
--------------------------------------------------------------------------------
/DDPGs/DDPG/model_save/checkpoint_critic.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/checkpoint_critic.pth
--------------------------------------------------------------------------------
/DDPGs/DDPG/model_save/critic1.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/critic1.pth
--------------------------------------------------------------------------------
/DDPGs/DDPG/model_save/critic2.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/DDPG/model_save/critic2.pth
--------------------------------------------------------------------------------
/DDPGs/TD3/TD3_main.py:
--------------------------------------------------------------------------------
1 | from collections import deque
2 | import numpy as np
3 | import pandas as pd
4 | import matplotlib.pyplot as plt
5 | import gym
6 | import torch
7 | import arrow
8 | import os
9 | from DDPGs.TD3.TD3_new import TD3
10 |
11 | RESUME= True
12 | SAVE_MODEL_EVERY = 5
13 | load_checkpoint_path=["models/checkpoint/actor_10.pth","models/checkpoint/critic_10.pth"]
14 |
15 |
16 | def output_scores(start_time,i_episode,scores_deque,score):
17 | print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'
18 | .format(i_episode, np.mean(scores_deque), score), end="")
19 | if i_episode % 100 == 0:
20 | print('\rEpisode {}\tAverage Score: {:.2f}\t Running time til now :{}'
21 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time))
22 | if np.mean(scores_deque) >= 300:
23 | print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}\t Total running time :{}'
24 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time))
25 | return True
26 |
27 | return False
28 |
29 |
30 | def watch_smart_agent(agent,filename_actor,filename_critic):
31 | agent.actor.load_state_dict(torch.load(filename_actor))
32 | agent.critic.load_state_dict(torch.load(filename_critic))
33 | state = env.reset()
34 | for t in range(1000):
35 | action = agent.select_action(state)
36 | print(action)
37 | env.render()
38 | state, reward, done, _ = env.step(action)
39 | if done:
40 | break
41 | env.close()
42 |
43 |
44 | def watch_random_agent():
45 |
46 | for _ in range(5):
47 | env.reset()
48 | while True:
49 | env.render()
50 | next_state, reward, done, _ =env.step(env.action_space.sample())
51 | if done:
52 | break
53 |
54 | env.close()
55 |
56 |
57 | def plot_scores(scores,filename):
58 | fig = plt.figure()
59 | ax = fig.add_subplot(111)
60 | plt.plot(np.arange(1, len(scores) + 1), scores)
61 | plt.ylabel('Score')
62 | plt.xlabel('Episode #')
63 | plt.savefig(filename)
64 | plt.show()
65 |
66 |
67 | def save_check_point(agent,i_episode):
68 | # setting the check point for training
69 | checkpoint_actor = {
70 | "net": agent.actor.state_dict(),
71 | 'optimizer': agent.actor_optimizer.state_dict(),
72 | "epoch": i_episode
73 | }
74 | checkpoint_critic = {
75 | "net": agent.critic.state_dict(),
76 | "optimizer": agent.critic_optimizer.state_dict(),
77 | "epoch": i_episode
78 | }
79 | if not os.path.isdir("models/checkpoint"):
80 | os.mkdir("models/checkpoint")
81 | torch.save(checkpoint_actor, 'models/checkpoint/actor_%s.pth' % (str(i_episode)))
82 | torch.save(checkpoint_critic, 'models/checkpoint/critic_%s.pth' % (str(i_episode)))
83 |
84 |
85 | def load_check_point(agent):
86 | """Load saved checkpoints to resume training."""
87 | checkpoint_actor = torch.load(load_checkpoint_path[0])  # load the saved checkpoints
88 | checkpoint_critic = torch.load(load_checkpoint_path[1])
89 |
90 | agent.actor.load_state_dict(checkpoint_actor['net'])  # restore the learnable model parameters
91 | agent.critic.load_state_dict(checkpoint_critic['net'])
92 |
93 | agent.actor_optimizer.load_state_dict(checkpoint_actor['optimizer'])  # restore the optimizer state
94 | agent.critic_optimizer.load_state_dict(checkpoint_critic['optimizer'])
95 |
96 | start_epoch = checkpoint_actor['epoch']  # episode to resume from
97 | return start_epoch
98 |
99 |
100 | def train_td3(env,agent,n_episodes):
101 | start_epoch = 1
102 |
103 | if RESUME:  # resume training from the model parameters saved in the checkpoint
104 | start_epoch=load_check_point(agent)
105 |
106 | scores_deque = deque(maxlen=100)
107 | scores = []
108 | start_time = arrow.now()
109 | for i_episode in range(start_epoch, n_episodes + 1):
110 | state = env.reset()
111 | total_reward = 0
112 | time_step = 0
113 |
114 | # loop over time steps
115 | while True:
116 | # the agent selects an action according to its current policy
117 | action = agent.select_action(state)
118 | next_state, reward, done, _ = env.step(action)
119 | agent.save_exp(state, action, next_state, reward, done)
120 | if agent.mode==1:
121 | agent.train(time_step)
122 | time_step += 1
123 | state = next_state
124 | total_reward += reward
125 | if done:
126 | break
127 |
128 | # recording scores
129 | scores.append([i_episode,total_reward])
130 | scores_deque.append(total_reward)
131 | finished = output_scores(start_time, i_episode, scores_deque, total_reward)
132 | if finished:
133 | agent.save('models', 'TD3_v2')
134 | break
135 |
136 | if i_episode% SAVE_MODEL_EVERY ==0:
137 | save_check_point(agent, i_episode)
138 | # also save the scores to a CSV file
139 | scores_df=pd.DataFrame(data=scores,columns=['episode','score'])
140 | scores_df.to_csv('scores_saved.csv',index=False)
141 |
142 | if agent.mode==0:
143 | agent.train(time_step)
144 |
145 | return scores
146 |
147 |
148 | if __name__=="__main__":
149 | env = gym.make('BipedalWalker-v3')
150 | env.seed(10)
151 | state_dim = env.observation_space.shape[0]
152 | action_dim = env.action_space.shape[0]
153 | max_action = float(env.action_space.high[0])
154 |
155 | agent_0 = TD3(state_dim,action_dim,max_action,env,0) # mode=0:update per episode
156 | agent_1 = TD3(state_dim, action_dim, max_action, env, 1) # mode=1: update per time step
157 | # scores=train_td3(env,agent_1,1000)
158 |
159 | # watch an untrained, random agent
160 | #watch_random_agent()
161 | watch_smart_agent(agent_0,"models/TD3_actor.pth","models/TD3_critic.pth")
162 |
163 |
164 |
--------------------------------------------------------------------------------
/DDPGs/TD3/TD3_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 |
6 |
7 | def hidden_init(layer):
8 | fan_in = layer.weight.data.size()[0]
9 | lim = 1. / np.sqrt(fan_in)
10 | return (-lim, lim)
11 |
12 | # Actor Neural Network
13 | class Actor(nn.Module):
14 | def __init__(self, state_dim, action_dim, max_action):
15 | super(Actor, self).__init__()
16 |
17 | self.l1 = nn.Linear(state_dim, 400)
18 | self.l2 = nn.Linear(400, 300)
19 | self.l3 = nn.Linear(300, action_dim)
20 |
21 | self.max_action = max_action
22 |
23 | def forward(self, x):
24 | x = F.relu(self.l1(x))
25 | x = F.relu(self.l2(x))
26 | x = self.max_action * torch.tanh(self.l3(x))
27 | return x
28 |
29 |
30 | # Q1-Q2-Critic Neural Network
31 | class Critic(nn.Module):
32 | def __init__(self, state_dim, action_dim):
33 | super(Critic, self).__init__()
34 |
35 | # Q1 architecture
36 | self.l1 = nn.Linear(state_dim + action_dim, 400)
37 | self.l2 = nn.Linear(400, 300)
38 | self.l3 = nn.Linear(300, 1)
39 |
40 | # Q2 architecture
41 | self.l4 = nn.Linear(state_dim + action_dim, 400)
42 | self.l5 = nn.Linear(400, 300)
43 | self.l6 = nn.Linear(300, 1)
44 |
45 | def forward(self, x, u):
46 | xu = torch.cat([x, u], 1)
47 | x1 = F.relu(self.l1(xu))
48 | x1 = F.relu(self.l2(x1))
49 | x1 = self.l3(x1)
50 |
51 | x2 = F.relu(self.l4(xu))
52 | x2 = F.relu(self.l5(x2))
53 | x2 = self.l6(x2)
54 | return x1, x2
55 |
56 | def Q1(self, x, u):
57 | xu = torch.cat([x, u], 1)
58 |
59 | x1 = F.relu(self.l1(xu))
60 | x1 = F.relu(self.l2(x1))
61 | x1 = self.l3(x1)
62 | return x1
63 |
--------------------------------------------------------------------------------
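
A small shape check for the twin critic above (dimensions are arbitrary): forward() returns both Q estimates, used for the clipped double-Q target, while Q1() returns only the first head, which TD3 typically uses for the delayed actor update.

    import torch
    from DDPGs.TD3.TD3_model import Critic

    state_dim, action_dim, batch = 24, 4, 32
    critic = Critic(state_dim, action_dim)
    s, a = torch.randn(batch, state_dim), torch.randn(batch, action_dim)
    q1, q2 = critic(s, a)      # both heads, each of shape (32, 1)
    q1_only = critic.Q1(s, a)  # first head only
    print(q1.shape, q2.shape, q1_only.shape)
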
/DDPGs/TD3/TD3_solved.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/TD3_solved.png
--------------------------------------------------------------------------------
/DDPGs/TD3/__pycache__/TD3_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/__pycache__/TD3_model.cpython-38.pyc
--------------------------------------------------------------------------------
/DDPGs/TD3/__pycache__/TD3_new.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/__pycache__/TD3_new.cpython-38.pyc
--------------------------------------------------------------------------------
/DDPGs/TD3/models/TD3_actor.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/models/TD3_actor.pth
--------------------------------------------------------------------------------
/DDPGs/TD3/models/TD3_critic.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DDPGs/TD3/models/TD3_critic.pth
--------------------------------------------------------------------------------
/DDPGs/TD3/scores_saved.csv:
--------------------------------------------------------------------------------
1 | episode,score
2 | 10,-110.23470465514899
3 | 11,-99.79645054419306
4 | 12,-126.96190521016625
5 | 13,-145.1785128976382
6 | 14,-104.07223475725621
7 | 15,-115.51990479428935
8 | 16,-121.89401543392783
9 | 17,-101.55811828293544
10 | 18,-99.89300219319254
11 | 19,-104.98346411872812
12 | 20,-106.23650016517124
13 | 21,-103.84864901668085
14 | 22,-121.26231449909034
15 | 23,-111.10637092719374
16 | 24,-114.66147317025639
17 | 25,-109.04674660607814
18 | 26,-106.12839938371035
19 | 27,-127.84833024115571
20 | 28,-112.4956240406665
21 | 29,-107.88297384519461
22 | 30,-99.92317202252215
23 | 31,-125.0179119318615
24 | 32,-100.36371986921576
25 | 33,-110.05038883724445
26 | 34,-132.021171753202
27 | 35,-133.880572424944
28 | 36,-100.11141411138209
29 | 37,-122.84088047947732
30 | 38,-103.55589092614429
31 | 39,-121.33897605525831
32 | 40,-230.96539978320874
33 | 41,-124.41352611894291
34 | 42,-119.04664112562759
35 | 43,-115.88990889360397
36 | 44,-116.13402150126659
37 | 45,-153.34740936961384
38 | 46,-107.6970551122332
39 | 47,-103.65690659839511
40 | 48,-107.19409725924586
41 | 49,-95.42425468958133
42 | 50,-103.23654880642972
43 | 51,-106.54915425148904
44 | 52,-114.40587754233572
45 | 53,-117.05561482745843
46 | 54,-113.98186717139261
47 | 55,-107.9192592421826
48 | 56,-100.75576811566108
49 | 57,-101.08801850037774
50 | 58,-132.08337249626365
51 | 59,-108.36035750377272
52 | 60,-111.05887943139943
53 | 61,-121.58251865927255
54 | 62,-141.45334827651286
55 | 63,-114.22801798708173
56 | 64,-119.39000501374326
57 | 65,-132.97221690919855
58 | 66,-123.59944656251093
59 | 67,-101.49795907883284
60 | 68,-103.72652107741114
61 | 69,-98.53701366942036
62 | 70,-100.89222275623676
63 | 71,-143.13936392812377
64 | 72,-101.36843625940055
65 | 73,-100.09964251577696
66 | 74,-98.95479499937969
67 | 75,-104.2556218599781
68 |
--------------------------------------------------------------------------------
/DDPGs/TD3/test.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | scores=np.array([[1,-1.2],[2,-3.4],[3,3.6]])
5 |
6 |
7 | df=pd.DataFrame(data=scores,columns=['episode','score'])
8 |
9 | print(df)
10 |
--------------------------------------------------------------------------------
/DQNs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/.DS_Store
--------------------------------------------------------------------------------
/DQNs/DDQN/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/.DS_Store
--------------------------------------------------------------------------------
/DQNs/DDQN/DQN_main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import arrow
3 | import torch
4 | import numpy as np
5 | from collections import deque
6 | # import matplotlib
7 | # matplotlib.use('TkAgg')
8 | import matplotlib.pyplot as plt
9 | from DQNs.DDQN.ddqn_v3 import AgentV3
10 |
11 |
12 | def dqn(agent,model_file,n_episodes=2000, max_t=1000,
13 | eps_start=1.0, eps_end=0.01, eps_decay=0.995,
14 | beta_start=0.4):
15 | """Deep Q-Learning.
16 |
17 |
18 | Params
19 | ======
20 | n_episodes (int): maximum number of training episodes
21 | max_t (int): maximum number of timesteps per episode
22 | eps_start (float): starting value of epsilon, for epsilon-greedy action selection
23 | eps_end (float): minimum value of epsilon
24 | eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
25 | """
26 | scores = [] # list containing scores from each episode
27 | scores_window = deque(maxlen=100) # last 100 scores
28 | eps = eps_start # initialize epsilon
29 | beta=beta_start
30 |
31 | start_time=arrow.now()
32 | for i_episode in range(1, n_episodes + 1):
33 | state = env.reset()
34 | score = 0
35 | episode_loss=[]
36 | for t in range(max_t):
37 | # choose the action to take in the current state
38 | action = agent.act(state, eps)
39 | # interact with the environment to get (s', r, done)
40 | next_state, reward, done, _ = env.step(action)
41 | # build the (s, a, r, s', done) tuple and pass it to the agent
42 | loss=agent.step(state, action, reward, next_state, done)
43 | if loss is not None:
44 | episode_loss.append(loss)
45 | state = next_state
46 | score += reward
47 | if done:
48 | break
49 | scores_window.append(score) # save most recent score
50 | scores.append(score) # save most recent score
51 | eps = max(eps_end, eps_decay * eps) # decrease epsilon
52 |
53 | # beta = beta/beta_incre if beta < 1.0 else 1.0
60 | if np.mean(scores_window) >= 200.0:
61 | print('\nEnvironment solved in {:d} episodes! \t Average Score: {:.2f}'.format(i_episode - 100,
62 | np.mean(scores_window)))
63 | torch.save(agent.qnetwork_local.state_dict(), model_file)
64 | print('\nTotal running time:{}'.format(arrow.now() - start_time))
65 | break
66 | return scores
67 |
68 |
69 | def watch_agent(agent):
70 |
71 | state = env.reset()
72 | for j in range(500):
73 | action = agent.act(state)
74 | env.render()
75 | state, reward, done, _ = env.step(action)
76 | if done:
77 | break
78 | env.close()
79 |
80 |
81 | def watch_random_agent():
82 |
83 | for _ in range(3):
84 | env.reset()
85 | while True:
86 | env.render()
87 | next_state, reward, done, _ =env.step(env.action_space.sample())
88 | if done:
89 | break
90 |
91 | env.close()
92 |
93 |
94 | def trained_agent_test(filename,episode_num=500,max_t=1000,eps=0.01):
95 | """
96 | :param filename:
97 | :param episode_num:
98 | :param max_t:
99 | :param eps:
100 | :return:
101 | """
102 | # agent = Agent(state_size=8, action_size=4, seed=0)
103 | agent_v3 = AgentV3(state_size=8, action_size=4, seed=0)
104 | agent_v3.qnetwork_local.load_state_dict(torch.load(filename))
105 |
106 | watch_agent(agent_v3)
107 |
108 | scores=[]
109 | scores_window = deque(maxlen=100)
110 | start_time=arrow.now()
111 | for i_episode in range(episode_num):
112 | state = env.reset()
113 | score = 0
114 | for t in range(max_t):
115 | # act greedily (no exploration)
116 | action = agent_v3.act(state)
117 | next_state, reward, done, _ = env.step(action)
118 | state = next_state
119 | score += reward
120 | if done:
121 | break
122 | scores.append(score)
123 | scores_window.append(score)
124 | print('\rEpisode {}\t Average Score: {:.2f}'.format(i_episode, np.mean(scores_window)),end="")
125 | if i_episode % 100 == 0:
126 | print('\rEpisode {}\t Average Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
127 | print('\rRunning time:{}\n'.format(arrow.now()-start_time))
128 | return scores
129 |
130 |
131 | def plot_scores(scores,filename):
132 | # plot the scores
133 | fig = plt.figure()
134 | ax = fig.add_subplot(1, 1, 1)
135 | # ax.plot(np.arange(len(scores_1)), scores_1)
136 | ax.plot(np.arange(len(scores)), scores)
137 | # rolling_mean = pd.Series(scores).rolling(100).mean()
138 | plt.ylabel('Score')
139 | plt.xlabel('Episode #')
140 | plt.savefig(filename)
141 |
142 |
143 | if __name__=="__main__":
144 | env = gym.make('LunarLander-v2')
145 | env.seed(0)
146 |
147 | # train the ddqn agent and collect the average cumulative rewards
148 | agent_v3 = AgentV3(state_size=8, action_size=4, seed=0)
149 | print("\n\nTraining ddqn agent:\n-------------------------------------------------------------\n")
150 | train_scores = dqn(agent_v3,'dueling_model.pth')
151 | # plot_scores(train_scores,'images/dueling-ddqn_training.png')
152 |
153 | # watch an untrained random agent
154 | #watch_random_agent()
155 | # score the trained agent and plot its reward curve
156 | # test_scores=trained_agent_test('models/dueling_model.pth')
157 | # plot_scores(test_scores,'images/dueling-ddqn_testing.png')
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
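For reference, a minimal sketch (not part of the repo) of the per-episode epsilon schedule that dqn() applies with its default arguments, eps = max(eps_end, eps_decay * eps); the printed checkpoints are illustrative:

eps_start, eps_end, eps_decay = 1.0, 0.01, 0.995   # defaults of dqn() above

eps = eps_start
for i_episode in range(1, 1001):
    eps = max(eps_end, eps_decay * eps)            # same update as in dqn()
    if i_episode in (100, 500, 1000):
        # roughly 0.61 after 100 episodes, ~0.08 after 500, and the 0.01 floor by episode 1000
        print(i_episode, round(eps, 3))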
--------------------------------------------------------------------------------
/DQNs/DDQN/__pycache__/ddqn_v3.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/__pycache__/ddqn_v3.cpython-38.pyc
--------------------------------------------------------------------------------
/DQNs/DDQN/__pycache__/model_dueling.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/__pycache__/model_dueling.cpython-38.pyc
--------------------------------------------------------------------------------
/DQNs/DDQN/ddqn_v1.py:
--------------------------------------------------------------------------------
1 | """
2 | Improvements over the classic DQN:
3 | 1. Double DQN
4 | """
5 |
6 | import numpy as np
7 | import random
8 | from collections import namedtuple, deque
9 | import torch
10 | import torch.nn.functional as F
11 | import torch.optim as optim
12 | from DQNs.DDQN.model import QNetwork
13 |
14 |
15 | BUFFER_SIZE = int(1e5) # replay buffer size
16 | BATCH_SIZE = 64 # minibatch size
17 | GAMMA = 0.99 # discount factor
18 | TAU = 1e-3 # for soft update of target parameters
19 | LR = 5e-4 # learning rate
20 | UPDATE_EVERY = 4 # how often to update the network
21 |
22 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
23 |
24 |
25 | class AgentV2():
26 | """Interacts with and learns from the environment."""
27 |
28 | def __init__(self, state_size, action_size, seed):
29 | """Initialize an Agent object.
30 |
31 | Params
32 | ======
33 | state_size (int): dimension of each state
34 | action_size (int): dimension of each action
35 | seed (int): random seed
36 | """
37 | self.state_size = state_size
38 | self.action_size = action_size
39 | self.seed = random.seed(seed)
40 |
41 | # Q-Network
42 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
43 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
44 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
45 |
46 | # Replay memory
47 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
48 | # Initialize time step (for updating every UPDATE_EVERY steps)
49 | self.t_step = 0
50 |
51 | def step(self, state, action, reward, next_state, done):
52 | # Save experience in replay memory
53 | self.memory.add(state, action, reward, next_state, done)
54 |
55 | # Learn every UPDATE_EVERY time steps.
56 | self.t_step = (self.t_step + 1) % UPDATE_EVERY
57 | if self.t_step == 0:
58 | # If enough samples are available in memory, get random subset and learn
59 | if len(self.memory) > BATCH_SIZE:
60 | experiences = self.memory.sample()
61 | self.learn(experiences, GAMMA)
62 |
63 | def act(self, state, eps=0.):
64 | """Returns actions for given state as per current policy.
65 |
66 | Params
67 | ======
68 | state (array_like): current state
69 | eps (float): epsilon, for epsilon-greedy action selection
70 | """
71 | state = torch.from_numpy(state).float().unsqueeze(0).to(device)
72 | self.qnetwork_local.eval()
73 | with torch.no_grad():
74 | action_values = self.qnetwork_local(state)
75 | self.qnetwork_local.train()
76 |
77 | # Epsilon-greedy action selection
78 | if random.random() > eps:
79 | return np.argmax(action_values.cpu().data.numpy())
80 | else:
81 | return random.choice(np.arange(self.action_size))
82 |
83 | def learn(self, experiences, gamma):
84 | """Update value parameters using given batch of experience tuples.
85 |
86 | Params
87 | ======
88 | experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
89 | gamma (float): discount factor
90 | """
91 | # unpack the (s, a, r, s', done) sequences from experiences; each is a column vector of shape [BATCH_SIZE, 1]
92 | states, actions, rewards, next_states, dones = experiences
93 |
94 | # ---------- compute Q targets -------------------------- #
95 | # actions that maximize the local network's Q estimates for the next states
96 | Q_expected_next_max = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1) # shape:[BATCH_SIZE,1] (.unsqueeze(1) makes it a column vector)
97 | # Double DQN: evaluate these actions with the target network instead of letting the target network pick its own max Q (reduces overestimation)
98 | Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_expected_next_max)
99 | # compute the Q targets from the Bellman update
100 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
101 |
102 | # --------- Get expected Q values from local model----------------- #
103 | # Q-value of each (state, action) pair, output as a sequence of q(s, a)
104 | # print(self.qnetwork_local(states)) # shape:[BATCH_SIZE,4]
105 | Q_expected = self.qnetwork_local(states).gather(1, actions) # shape:[BATCH_SIZE,1]
106 |
107 | # ------------- train the local network --------------------- #
108 | # Compute loss
109 | loss = F.mse_loss(Q_expected, Q_targets) # MSE loss between the Q estimates and the Q targets (both column vectors)
110 | # Minimize the loss
111 | self.optimizer.zero_grad() # clear the previous gradients first
112 | loss.backward()
113 | self.optimizer.step()
114 |
115 | # ------------------- update target network ------------------- #
116 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)
117 |
118 | def soft_update(self, local_model, target_model, tau):
119 | """Soft update model parameters.
120 | θ_target = τ*θ_local + (1 - τ)*θ_target
121 |
122 | Params
123 | ======
124 | local_model (PyTorch model): weights will be copied from
125 | target_model (PyTorch model): weights will be copied to
126 | tau (float): interpolation parameter
127 | """
128 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
129 | target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
130 |
131 |
132 | class ReplayBuffer:
133 | """Fixed-size buffer to store experience tuples."""
134 |
135 | def __init__(self, action_size, buffer_size, batch_size, seed):
136 | """Initialize a ReplayBuffer object.
137 |
138 | Params
139 | ======
140 | action_size (int): dimension of each action
141 | buffer_size (int): maximum size of buffer
142 | batch_size (int): size of each training batch
143 | seed (int): random seed
144 | """
145 | self.action_size = action_size
146 | self.memory = deque(maxlen=buffer_size)
147 | self.batch_size = batch_size
148 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
149 | self.seed = random.seed(seed)
150 |
151 | def add(self, state, action, reward, next_state, done):
152 | """Add a new experience to memory."""
153 | e = self.experience(state, action, reward, next_state, done)
154 | self.memory.append(e)
155 |
156 | def sample(self):
157 | """Randomly sample a batch of experiences from memory."""
158 | experiences = random.sample(self.memory, k=self.batch_size)
159 |
160 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
161 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
162 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
163 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(
164 | device)
165 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(
166 | device)
167 |
168 | return (states, actions, rewards, next_states, dones)
169 |
170 | def __len__(self):
171 | """Return the current size of internal memory."""
172 | return len(self.memory)
173 |
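A self-contained sketch (not from the repo) contrasting the vanilla DQN target with the Double DQN target computed in learn() above; q_local and q_target are made-up outputs of the two networks for a batch of three next states:

import torch

# Hypothetical Q-value outputs for 3 next states and 4 actions.
q_local = torch.tensor([[0.1, 0.5, 0.2, 0.3],
                        [0.4, 0.1, 0.9, 0.0],
                        [0.2, 0.2, 0.6, 0.1]])
q_target = torch.tensor([[0.3, 0.2, 0.1, 0.4],
                         [0.5, 0.6, 0.2, 0.1],
                         [0.1, 0.8, 0.3, 0.2]])

# Vanilla DQN: the target network both selects and evaluates the action.
dqn_next = q_target.max(1)[0].unsqueeze(1)       # shape [3, 1]

# Double DQN (as in learn()): the local network selects the action,
# the target network evaluates it, which reduces overestimation.
best_actions = q_local.argmax(1).unsqueeze(1)    # shape [3, 1]
ddqn_next = q_target.gather(1, best_actions)     # shape [3, 1]

print(dqn_next.squeeze(1))   # tensor([0.4000, 0.6000, 0.8000])
print(ddqn_next.squeeze(1))  # tensor([0.2000, 0.2000, 0.3000])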
--------------------------------------------------------------------------------
/DQNs/DDQN/ddqn_v3.py:
--------------------------------------------------------------------------------
1 | """
2 | Improvements over the classic DQN:
3 | 1. Double DQN
4 | 2. Dueling Network
5 | """
6 | import numpy as np
7 | import random
8 | from collections import namedtuple, deque
9 | import torch
10 | import torch.nn.functional as F
11 | import torch.optim as optim
12 |
13 | from DQNs.DDQN.model_dueling import QNetwork
14 |
15 | BUFFER_SIZE = int(1e4) # replay buffer size
16 | BATCH_SIZE = 64 # minibatch size
17 | GAMMA = 0.99 # discount factor
18 | TAU = 1e-3 # for soft update of target parameters
19 | LR = 5e-3 # learning rate
20 | UPDATE_EVERY = 4 # how often to update the network
21 | E=1e-8 # small number to add to the priority of experience
22 |
23 |
24 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
25 |
26 |
27 | class AgentV3():
28 | """Interacts with and learns from the environment."""
29 |
30 | def __init__(self, state_size, action_size, seed):
31 | """Initialize an Agent object.
32 |
33 | Params
34 | ======
35 | state_size (int): dimension of each state
36 | action_size (int): dimension of each action
37 | seed (int): random seed
38 | """
39 | self.state_size = state_size
40 | self.action_size = action_size
41 | # self.seed = random.seed(seed)
42 |
43 | # Q-Network
44 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
45 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
46 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
47 |
48 | # Replay memory
49 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
50 | # Initialize time step (for updating every UPDATE_EVERY steps)
51 | self.t_step = 0
52 |
53 | def step(self, state, action, reward, next_state, done):
54 | # Save experience in replay memory
55 | self.memory.add(state, action, reward, next_state, done)
56 |
57 | # Learn every UPDATE_EVERY time steps.
58 | self.t_step = (self.t_step + 1) % UPDATE_EVERY
59 | if self.t_step == 0:
60 | # If enough samples are available in memory, get random subset and learn
61 | if len(self.memory) > BATCH_SIZE:
62 | experiences = self.memory.sample()
63 |
64 | loss=self.learn(experiences, GAMMA)
65 | return loss
66 |
67 | def act(self, state, eps=0.):
68 | """Returns actions for given state as per current policy.
69 |
70 | Params
71 | ======
72 | state (array_like): current state
73 | eps (float): epsilon, for epsilon-greedy action selection
74 | """
75 | state = torch.from_numpy(state).float().unsqueeze(0).to(device)
76 | self.qnetwork_local.eval()
77 | with torch.no_grad():
78 | action_values = self.qnetwork_local(state)
79 | self.qnetwork_local.train()
80 |
81 | # Epsilon-greedy action selection
82 | if random.random() > eps:
83 | return np.argmax(action_values.cpu().data.numpy())
84 | else:
85 | return random.choice(np.arange(self.action_size))
86 |
87 | def learn(self, experiences, gamma):
88 | """Update value parameters using given batch of experience tuples.
89 |
90 | Params
91 | ======
92 | experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
93 | gamma (float): discount factor
94 | """
95 | # unpack the (s, a, r, s', done) sequences from experiences
96 | states, actions, rewards, next_states, dones = experiences
97 |
98 | # ---------- compute Q targets -------------------------- #
99 | # actions that maximize the local network's Q estimates for the next states
100 | Q_expected_next_max = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1) # shape:[BATCH_SIZE,1] (.unsqueeze(1) makes it a column vector)
101 | # Double DQN: evaluate these actions with the target network instead of letting the target network pick its own max Q (reduces overestimation)
102 | Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_expected_next_max)
103 | # compute the Q targets from the Bellman update
104 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
105 |
106 | # --------- Get expected Q values from local model----------------- #
107 | # Q-value of each (state, action) pair, output as a sequence of q(s, a)
108 | # print(self.qnetwork_local(states)) # shape:[BATCH_SIZE,4]
109 | Q_expected = self.qnetwork_local(states).gather(1, actions) # shape:[BATCH_SIZE,1]
110 |
111 | # ------------- train the local network --------------------- #
112 | # Compute loss
113 | loss = F.mse_loss(Q_expected, Q_targets) # MSE loss between the Q estimates and the Q targets (both column vectors)
114 | # Minimize the loss
115 | self.optimizer.zero_grad() # clear the previous gradients first
116 | loss.backward()
117 | self.optimizer.step()
118 |
119 | # ------------------- update target network ------------------- #
120 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)
121 |
122 | return loss.cpu().detach().numpy()
123 |
124 | def soft_update(self, local_model, target_model, tau):
125 | """Soft update model parameters.
126 | θ_target = τ*θ_local + (1 - τ)*θ_target
127 |
128 | Params
129 | ======
130 | local_model (PyTorch model): weights will be copied from
131 | target_model (PyTorch model): weights will be copied to
132 | tau (float): interpolation parameter
133 | """
134 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
135 | target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
136 |
137 |
138 | class ReplayBuffer:
139 | """Fixed-size buffer to store experience tuples."""
140 |
141 | def __init__(self, action_size, buffer_size, batch_size, seed):
142 | """Initialize a ReplayBuffer object.
143 |
144 | Params
145 | ======
146 | action_size (int): dimension of each action
147 | buffer_size (int): maximum size of buffer
148 | batch_size (int): size of each training batch
149 | seed (int): random seed
150 | """
151 | # self.action_size = action_size
152 | self.memory = deque(maxlen=buffer_size)
153 | self.batch_size = batch_size
154 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
155 | # self.seed = random.seed(seed)
156 |
157 | def add(self, state, action, reward, next_state, done):
158 | """Add a new experience to memory."""
159 | e = self.experience(state, action, reward, next_state, done)
160 | self.memory.append(e)
161 |
162 | def sample(self):
163 | """Randomly sample a batch of experiences from memory."""
164 | experiences = random.sample(self.memory, k=self.batch_size)
165 |
166 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
167 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
168 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
169 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(
170 | device)
171 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(
172 | device)
173 |
174 | return (states, actions, rewards, next_states, dones)
175 |
176 | def __len__(self):
177 | """Return the current size of internal memory."""
178 | return len(self.memory)
--------------------------------------------------------------------------------
/DQNs/DDQN/dqn.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | from collections import namedtuple, deque
4 | from DQNs.DDQN.model import QNetwork
5 |
6 | import torch
7 | import torch.nn.functional as F
8 | import torch.optim as optim
9 |
10 | BUFFER_SIZE = int(1e5) # replay buffer size
11 | BATCH_SIZE = 64 # minibatch size
12 | GAMMA = 0.99 # discount factor
13 | TAU = 1e-3 # for soft update of target parameters
14 | LR = 5e-4 # learning rate
15 | UPDATE_EVERY = 4 # how often to update the network
16 |
17 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
18 |
19 |
20 | class Agent():
21 | """Interacts with and learns from the environment."""
22 |
23 | def __init__(self, state_size, action_size, seed):
24 | """Initialize an Agent object.
25 |
26 | Params
27 | ======
28 | state_size (int): dimension of each state
29 | action_size (int): dimension of each action
30 | seed (int): random seed
31 | """
32 | self.state_size = state_size
33 | self.action_size = action_size
34 | self.seed = random.seed(seed)
35 |
36 | # Q-Network
37 | self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
38 | self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
39 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
40 |
41 | # Replay memory
42 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
43 | # Initialize time step (for updating every UPDATE_EVERY steps)
44 | self.t_step = 0
45 |
46 | def step(self, state, action, reward, next_state, done):
47 | # Save experience in replay memory
48 | self.memory.add(state, action, reward, next_state, done)
49 |
50 | # Learn every UPDATE_EVERY time steps.
51 | self.t_step = (self.t_step + 1) % UPDATE_EVERY
52 | if self.t_step == 0:
53 | # If enough samples are available in memory, get random subset and learn
54 | if len(self.memory) > BATCH_SIZE:
55 | experiences = self.memory.sample()
56 | self.learn(experiences, GAMMA)
57 |
58 | def act(self, state, eps=0.):
59 | """Returns actions for given state as per current policy.
60 |
61 | Params
62 | ======
63 | state (array_like): current state
64 | eps (float): epsilon, for epsilon-greedy action selection
65 | """
66 | state = torch.from_numpy(state).float().unsqueeze(0).to(device)
67 | self.qnetwork_local.eval()
68 | with torch.no_grad():
69 | action_values = self.qnetwork_local(state)
70 | self.qnetwork_local.train()
71 |
72 | # Epsilon-greedy action selection
73 | if random.random() > eps:
74 | return np.argmax(action_values.cpu().data.numpy())
75 | else:
76 | return random.choice(np.arange(self.action_size))
77 |
78 | def learn(self, experiences, gamma):
79 | """Update value parameters using given batch of experience tuples.
80 |
81 | Params
82 | ======
83 | experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
84 | gamma (float): discount factor
85 | """
86 | # unpack the (s, a, r, s', done) sequences from experiences; each is a column vector of shape [BATCH_SIZE, 1]
87 | states, actions, rewards, next_states, dones = experiences
88 | # ------ compute the Q-target sequence for each experience tuple
89 | # Get max predicted Q values (for next states) from target model
90 | # print(self.qnetwork_target(next_states)) # shape:[BATCH_SIZE,4]
91 | Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # shape:[BATCH_SIZE,1]
92 | # Compute Q targets for current states
93 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # shape:[BATCH_SIZE,1]
94 |
95 | # --------- Get expected Q values from local model
96 | # Q-value of each (state, action) pair, output as a sequence of q(s, a)
97 | # print(self.qnetwork_local(states)) # shape:[BATCH_SIZE,4]
98 | Q_expected = self.qnetwork_local(states).gather(1, actions) # shape:[BATCH_SIZE,1]
99 |
100 | # Compute loss
101 | loss = F.mse_loss(Q_expected, Q_targets) # MSE loss between the Q estimates and the Q targets (both column vectors)
102 | # Minimize the loss
103 | self.optimizer.zero_grad() # clear the previous gradients first
104 | loss.backward()
105 | self.optimizer.step()
106 |
107 | # ------------------- update target network ------------------- #
108 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)
109 |
110 | def soft_update(self, local_model, target_model, tau):
111 | """Soft update model parameters.
112 | θ_target = τ*θ_local + (1 - τ)*θ_target
113 |
114 | Params
115 | ======
116 | local_model (PyTorch model): weights will be copied from
117 | target_model (PyTorch model): weights will be copied to
118 | tau (float): interpolation parameter
119 | """
120 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
121 | target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
122 |
123 |
124 | class ReplayBuffer:
125 | """Fixed-size buffer to store experience tuples."""
126 |
127 | def __init__(self, action_size, buffer_size, batch_size, seed):
128 | """Initialize a ReplayBuffer object.
129 |
130 | Params
131 | ======
132 | action_size (int): dimension of each action
133 | buffer_size (int): maximum size of buffer
134 | batch_size (int): size of each training batch
135 | seed (int): random seed
136 | """
137 | self.action_size = action_size
138 | self.memory = deque(maxlen=buffer_size)
139 | self.batch_size = batch_size
140 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
141 | self.seed = random.seed(seed)
142 |
143 | def add(self, state, action, reward, next_state, done):
144 | """Add a new experience to memory."""
145 | e = self.experience(state, action, reward, next_state, done)
146 | self.memory.append(e)
147 |
148 | def sample(self):
149 | """Randomly sample a batch of experiences from memory."""
150 | experiences = random.sample(self.memory, k=self.batch_size)
151 |
152 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
153 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
154 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
155 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(
156 | device)
157 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(
158 | device)
159 |
160 | return (states, actions, rewards, next_states, dones)
161 |
162 | def __len__(self):
163 | """Return the current size of internal memory."""
164 | return len(self.memory)
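A tiny sketch (illustrative only) of the soft update rule documented in soft_update(), θ_target = τ*θ_local + (1 - τ)*θ_target, applied to a single made-up parameter:

import torch

tau = 1e-3                          # TAU above

local_param = torch.tensor([1.0])   # stand-in for one local-network weight
target_param = torch.tensor([0.0])  # stand-in for the matching target-network weight

# Same in-place blend as soft_update(): the target drifts slowly toward the local weights.
target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
print(target_param)                 # tensor([0.0010])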
--------------------------------------------------------------------------------
/DQNs/DDQN/images/Total Average reward scores plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/Total Average reward scores plot.png
--------------------------------------------------------------------------------
/DQNs/DDQN/images/ddqn_agent_scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/ddqn_agent_scores.png
--------------------------------------------------------------------------------
/DQNs/DDQN/images/ddqn_testing_scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/ddqn_testing_scores.png
--------------------------------------------------------------------------------
/DQNs/DDQN/images/double_dqn_v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/double_dqn_v1.png
--------------------------------------------------------------------------------
/DQNs/DDQN/images/dueling-ddqn_testing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/dueling-ddqn_testing.png
--------------------------------------------------------------------------------
/DQNs/DDQN/images/dueling-ddqn_training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/dueling-ddqn_training.png
--------------------------------------------------------------------------------
/DQNs/DDQN/images/runningResult.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/runningResult.png
--------------------------------------------------------------------------------
/DQNs/DDQN/images/runningResult_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/images/runningResult_1.png
--------------------------------------------------------------------------------
/DQNs/DDQN/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class QNetwork(nn.Module):
7 | """Actor (Policy) Model."""
8 |
9 | def __init__(self, state_size, action_size, seed):
10 | """Initialize parameters and build model.
11 | Params
12 | ======
13 | state_size (int): Dimension of each state
14 | action_size (int): Dimension of each action
15 | seed (int): Random seed
16 | """
17 | super(QNetwork, self).__init__()
18 | self.seed = torch.manual_seed(seed)
19 | self.fc1=nn.Linear(state_size,64)
20 | self.fc2=nn.Linear(64,64)
21 | self.fc3=nn.Linear(64,action_size)
22 |
23 |
24 | def forward(self, state):
25 | """Build a network that maps state -> action values."""
26 | out=self.fc1(state)
27 | out=F.relu(out)
28 | out=self.fc2(out)
29 | out=F.relu(out)
30 | q_a=self.fc3(out)
31 |
32 | return q_a
33 |
--------------------------------------------------------------------------------
/DQNs/DDQN/model_dueling.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | H_1=64
6 | H_2=64
7 |
8 | class QNetwork(nn.Module):
9 | """Dueling Architecture"""
10 |
11 | def __init__(self, state_size, action_size, seed):
12 | """Initialize parameters and build model.
13 | Params
14 | ======
15 | state_size (int): Dimension of each state
16 | action_size (int): Dimension of each action
17 | seed (int): Random seed
18 | """
19 | super(QNetwork, self).__init__()
20 | self.action_size=action_size
21 | self.seed = torch.manual_seed(seed)
22 | self.fc1=nn.Linear(state_size,H_1)
23 |
24 | self.fc2_adv = nn.Linear(H_1,H_2)
25 | self.fc2_v = nn.Linear(H_1, H_2)
26 |
27 | self.fc3_adv = nn.Linear(H_2,action_size)
28 | self.fc3_v = nn.Linear(H_2, 1)
29 |
30 |
31 | def forward(self, state):
32 | # first hidden layer
33 | h1=F.relu(self.fc1(state))
34 |
35 | # dueling start in second layer
36 | h2_adv = F.relu(self.fc2_adv(h1))
37 | h2_v = F.relu(self.fc2_v(h1))
38 |
39 | # final advantage value
40 | adv = self.fc3_adv(h2_adv)
41 | # final state value
42 | v = self.fc3_v(h2_v).expand(state.size(0), self.action_size) # expand from 1 dim to action_size dims
43 |
44 | # calculate final Q(s,a) value for output
45 | out_q=v+adv-adv.mean(1).unsqueeze(1).expand(state.size(0), self.action_size)
46 |
47 | return out_q
48 |
49 |
50 |
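A small sketch (made-up numbers, not from the repo) of the aggregation in forward() above, Q(s,a) = V(s) + A(s,a) - mean_a A(s,a), for a batch of two states and four actions:

import torch

v = torch.tensor([[1.0], [2.0]])              # state values, shape [2, 1]
adv = torch.tensor([[0.5, -0.5, 1.5, 0.5],
                    [0.0, 1.0, -1.0, 0.0]])   # advantages, shape [2, 4]

# Same aggregation as forward(): expand V across actions, subtract the mean advantage.
q = v.expand(-1, adv.size(1)) + adv - adv.mean(1, keepdim=True)
print(q)
# row 0: mean advantage 0.5 -> [1.0, 0.0, 2.0, 1.0]
# row 1: mean advantage 0.0 -> [2.0, 3.0, 1.0, 2.0]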
--------------------------------------------------------------------------------
/DQNs/DDQN/models/checkpoint.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/models/checkpoint.pth
--------------------------------------------------------------------------------
/DQNs/DDQN/models/dueling_model.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/models/dueling_model.pth
--------------------------------------------------------------------------------
/DQNs/DDQN/models/org_dqn.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DDQN/models/org_dqn.pth
--------------------------------------------------------------------------------
/DQNs/DDQN/play_env.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import random
3 | import torch
4 | import numpy as np
5 | from collections import deque
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | env = gym.make('LunarLander-v2')
10 | env.seed(0)
11 | print('State shape: ', env.observation_space.shape)
12 | print('Number of actions: ', env.action_space.n)
13 |
14 |
15 | # watch an untrained random agent
16 | state = env.reset()
17 | for _ in range(10000):
18 | env.render()
19 | next_state, reward, done, _ =env.step(env.action_space.sample())
20 | # print(reward)
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/DQNs/DDQN/test.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import torch
4 | from collections import namedtuple, deque
5 |
6 | lst = [1,2,3,4,0,6]
7 |
8 | print( np.array(lst) / 2.5)
9 | print( np.array(lst) / 2.5 + 0.0001)
10 |
11 | s1 = np.array([[1,2,3,4]])
12 | s2 = np.vstack([[3,4,5,6]])
13 | print(np.vstack([s1,s2]))
14 |
15 |
16 |
17 |
18 | target_org=np.array([[ 0.0910, -0.0224, -0.0552, -0.0192],
19 | [ 0.0908, -0.0209, -0.0553, -0.0181],
20 | [ 0.0922, -0.0219, -0.0546, -0.0206],
21 | [ 0.0913, -0.0211, -0.0548, -0.0182],
22 | [ 0.0910, -0.0211, -0.0554, -0.0187]])
23 | target_org=torch.tensor(target_org)
24 | # print(target_org.shape)
25 | # # max along each row
26 | # print(target_org.detach().max(1))
27 | # print(target_org.detach().max(1)[0])
28 | # # convert to a column vector
29 | # print(target_org.detach().max(1)[0].unsqueeze(1))
30 |
31 |
32 | local_org=np.array([[ 0.0936, -0.0768, -0.1730, -0.0238],
33 | [ 0.0930, -0.0620, -0.1845, -0.0077],
34 | [ 0.0986, -0.0473, -0.1868, 0.0110],
35 | [ 0.0946, -0.0752, -0.1726, -0.0264],
36 | [ 0.0979, -0.0497, -0.1886, 0.0097]])
37 | local_org=torch.tensor(local_org)
38 | actions=torch.tensor(np.array(
39 | [[3],
40 | [1],
41 | [2],
42 | [0],
43 | [0]]))
44 | # print(actions)
45 | # print(local_org.shape)
46 | # print(local_org.gather(1, actions.long()))
47 |
48 |
49 | b=torch.tensor(np.array([ 0.0932, -0.0206, -0.0541, -0.0204]))
50 | action=torch.LongTensor([0])
51 | print(b.gather(0,action))
52 |
53 | memory=deque(maxlen=10)
54 | exp=namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
55 | e1=exp(0.34,1,3.56,0.56,False)
56 | memory.append(e1)
57 | e2=exp(3.34,0,8.56,-2.3,False)
58 | memory.append(e2)
59 | memory.append(exp(4.6,0,8.56,-2.3,False))
60 | memory.append(exp(8.7,0,8.56,-4.3,False))
61 | memory.append(exp(2.2,0,-0.8,-2.3,False))
62 |
63 |
64 |
65 | # print(memory)
66 | # print(memory[0].state)
67 | # print(len(memory))
68 | # #
69 | # sample_inds=np.random.choice(len(memory), 3, p=[0.1,0.2,0.2,0.4,0.1],replace=False)
70 | # print(sample_inds)
71 |
72 |
73 | # env = gym.make('LunarLander-v2')
74 | # env.seed(0)
75 | # print('State shape: ', env.observation_space.shape)
76 | # print('Number of actions: ', env.action_space.n)
77 |
78 |
79 |
80 |
81 |
--------------------------------------------------------------------------------
/DQNs/DQN_PER/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/.DS_Store
--------------------------------------------------------------------------------
/DQNs/DQN_PER/PER_memory.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | from SumTree import SumTree
4 | # from DQNs.DQN_PER.SumTree import SumTree
5 |
6 |
7 | class Memory: # stored as ( s, a, r, s_ ) in SumTree
8 | e = 0.0001
9 | alpha = 0.6
10 | beta = 0.4
11 | beta_increment_per_sampling = 0.001
12 |
13 | def __init__(self, capacity):
14 | self.tree = SumTree(capacity)
15 | self.capacity = capacity
16 |
17 | # compute the priority from the TD error
18 | def _get_priority(self, error):
19 | return (np.abs(error) + self.e) ** self.alpha
20 |
21 | # store one experience together with its priority
22 | def add(self, error, sample):
23 | p = self._get_priority(error)
24 | self.tree.add(p, sample)
25 |
26 | def batch_sample(self, n):
27 | batch = []
28 | idxs = []
29 | segment = self.tree.total() / n
30 | priorities = []
31 |
32 | # beta is increased with every sampling call, capped at 1.0
33 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
34 |
35 | # split the total priority mass into n sampling segments (n = number of samples)
36 | for i in range(n):
37 | a = segment * i
38 | b = segment * (i + 1)
39 | s = random.uniform(a, b)
40 | (idx, p, data) = self.tree.get(s)
41 | priorities.append(p)
42 | batch.append(data)
43 | idxs.append(idx)
44 |
45 | # sampling probabilities
46 | sampling_probabilities = np.array(priorities) / self.tree.total() + self.e
47 | # importance-sampling (IS) weights
48 | is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
49 | is_weight /= is_weight.max()
50 |
51 | return batch, idxs, is_weight
52 |
53 | def update(self, idx, error):
54 | p = self._get_priority(error)
55 | self.tree.update(idx, p)
56 |
57 |
58 |
59 |
60 |
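A worked example (numbers are illustrative) of the formulas Memory uses above: priority p = (|error| + e)^alpha, sampling probability P(i) = p_i / sum(p), and IS weight w = (N * P(i))^(-beta) normalized by its maximum:

import numpy as np

e, alpha, beta = 0.0001, 0.6, 0.4         # the class constants above

errors = np.array([0.05, 0.5, 1.0, 2.0])  # hypothetical TD errors of 4 stored transitions
priorities = (np.abs(errors) + e) ** alpha
probs = priorities / priorities.sum()     # larger |error| -> higher chance of being sampled

n_entries = len(errors)
is_weight = np.power(n_entries * probs, -beta)
is_weight /= is_weight.max()              # normalize so the largest weight is 1.0

print(np.round(probs, 3))
print(np.round(is_weight, 3))             # frequently-sampled transitions get smaller weights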
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/cnn_per.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/cnn_per.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/epsilon_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_1.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/epsilon_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_2.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/epsilon_exp-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_exp-1.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/epsilon_exp-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_exp-2.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/epsilon_exp-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_exp-3.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/epsilon_linear-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/epsilon_linear-1.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/train_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_1.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/train_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_2.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/train_DQN_per.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_DQN_per.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/train_exp-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_exp-1.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/train_exp-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_exp-2.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/train_exp-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_exp-3.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/train_exp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_exp.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/Plots/train_linear-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/Plots/train_linear-1.png
--------------------------------------------------------------------------------
/DQNs/DQN_PER/SumTree.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
3 |
4 | # SumTree
5 | # a binary tree data structure where the parent’s value is the sum of its children
6 | class SumTree:
7 | write = 0
8 |
9 | def __init__(self, capacity):
10 | self.capacity = capacity
11 | self.tree = numpy.zeros(2 * capacity - 1)
12 | self.data = numpy.zeros(capacity, dtype=object)
13 | self.n_entries = 0
14 |
15 | def total(self):
16 | return self.tree[0]
17 |
18 | # propagate a change from a leaf up to the root, updating the whole tree
19 | def _propagate(self, idx, change):
20 | parent = (idx - 1) // 2
21 | self.tree[parent] += change
22 |
23 | if parent != 0:
24 | self._propagate(parent, change)
25 |
26 | # update the priority of the target node
27 | def update(self, idx, p):
28 | change = p - self.tree[idx]
29 |
30 | self.tree[idx] = p
31 | self._propagate(idx, change)
32 |
33 | # store a sample and its priority (only leaf nodes store samples; every internal node holds the sum of its children)
34 | def add(self, p, data):
35 | # compute the leaf node's index
36 | idx = self.write + self.capacity - 1
37 |
38 | self.data[self.write] = data
39 | self.update(idx, p)
40 |
41 | # once the leaves are full, wrap around and overwrite from the first one
42 | self.write += 1
43 | if self.write >= self.capacity:
44 | self.write = 0
45 |
46 | if self.n_entries < self.capacity:
47 | self.n_entries += 1
48 |
49 | # search from the root down to the matching leaf node
50 | def _retrieve(self, idx, s):
51 | left = 2 * idx + 1
52 | right = left + 1
53 |
54 | if left >= len(self.tree):
55 | return idx
56 |
57 | if s <= self.tree[left]:
58 | return self._retrieve(left, s)
59 | else:
60 | return self._retrieve(right, s - self.tree[left])
61 |
62 | # sampling: return the sample and its corresponding priority
63 | def get(self, s):
64 | # find the index of the leaf node
65 | idx = self._retrieve(0, s) # s: the value drawn uniformly from one segment
66 | # find the index of the stored sample
67 | dataIdx = idx - self.capacity + 1
68 |
69 | return (idx, self.tree[idx], self.data[dataIdx])
70 |
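A short usage sketch (not in the repo; assumes SumTree.py is on the import path) showing how add/total/get fit together, with arbitrary priorities:

from SumTree import SumTree

tree = SumTree(capacity=4)
for i, p in enumerate([1.0, 2.0, 3.0, 4.0]):
    tree.add(p, ('transition', i))   # leaf priority plus the stored sample

print(tree.total())                  # 10.0 -- the root holds the sum of all leaf priorities
idx, p, data = tree.get(6.5)
print(p, data)                       # 4.0 ('transition', 3): 6.5 falls in the last leaf's range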
--------------------------------------------------------------------------------
/DQNs/DQN_PER/__pycache__/PER_memory.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/PER_memory.cpython-38.pyc
--------------------------------------------------------------------------------
/DQNs/DQN_PER/__pycache__/SumTree.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/SumTree.cpython-38.pyc
--------------------------------------------------------------------------------
/DQNs/DQN_PER/__pycache__/atari_wappers.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/atari_wappers.cpython-38.pyc
--------------------------------------------------------------------------------
/DQNs/DQN_PER/__pycache__/dqn_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/dqn_model.cpython-38.pyc
--------------------------------------------------------------------------------
/DQNs/DQN_PER/__pycache__/dqn_per.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_PER/__pycache__/dqn_per.cpython-38.pyc
--------------------------------------------------------------------------------
/DQNs/DQN_PER/atari_wappers.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import gym
3 | import gym.spaces
4 | import numpy as np
5 | import collections
6 |
7 |
8 | class MaxAndSkipEnv(gym.Wrapper):
9 | """
10 | Repeats each action for k frames and takes the pixel-wise max over the last two frames.
11 | """
12 | def __init__(self, env=None, skip=4):
13 | super(MaxAndSkipEnv, self).__init__(env)
14 | self._obs_buffer = collections.deque(maxlen=2)
15 | self._skip = skip
16 |
17 | def step(self, action):
18 | total_reward = 0.0
19 | done = None
20 | for _ in range(self._skip):
21 | obs, reward, done, info = self.env.step(action)
22 | self._obs_buffer.append(obs)
23 | total_reward += reward
24 | if done:
25 | break
26 | max_frame = np.max(np.stack(self._obs_buffer), axis=0)
27 | return max_frame, total_reward, done, info
28 |
29 | def reset(self):
30 | self._obs_buffer.clear()
31 | obs = self.env.reset()
32 | self._obs_buffer.append(obs)
33 | return obs
34 |
35 |
36 | class FireResetEnv(gym.Wrapper):
37 | """
38 | Presses fire button for environments that require it for the game to start.
39 | Also checks for some corner cases in some games
40 | """
41 | def __init__(self,env=None):
42 | """For environments where the user need to press FIRE for the game to start."""
43 | super(FireResetEnv, self).__init__(env)
44 | assert env.unwrapped.get_action_meanings()[1]=="FIRE"
45 | assert len(env.unwrapped.get_action_meanings()) >= 3
46 |
47 | def step(self,action):
48 |
49 | return self.env.step(action)
50 |
51 | def reset(self):
52 |
53 | self.env.reset()
54 |
55 | obs,_,done,_ = self.env.step(1)
56 | if done:
57 | self.env.reset()
58 | obs, _, done, _ = self.env.step(2)
59 | if done:
60 | self.env.reset()
61 | return obs
62 |
63 |
64 | class ProcessFrame84(gym.ObservationWrapper):
65 | """
66 | converts input image of 210x160 rgb to grayscale 84x84
67 | """
68 | def __init__(self, env=None):
69 | super(ProcessFrame84, self).__init__(env)
70 |
71 | self.observation_space = gym.spaces.Box(
72 | low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
73 |
74 | def observation(self, obs):
75 |
76 | return ProcessFrame84.process(obs)
77 | @staticmethod
78 | def process(frame):
79 | if frame.size == 210 * 160 * 3:
80 | img = np.reshape(frame, [210, 160, 3]).astype(
81 | np.float32)
82 | elif frame.size == 250 * 160 * 3:
83 | img = np.reshape(frame, [250, 160, 3]).astype(
84 | np.float32)
85 | else:
86 | assert False, "Unknown resolution."
87 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + \
88 | img[:, :, 2] * 0.114
89 | resized_screen = cv2.resize(
90 | img, (84, 110), interpolation=cv2.INTER_AREA)
91 | x_t = resized_screen[18:102, :]
92 | x_t = np.reshape(x_t, [84, 84, 1])
93 | return x_t.astype(np.uint8)
94 |
95 |
96 | class ImageToPyTorch(gym.ObservationWrapper):
97 | def __init__(self, env):
98 | super(ImageToPyTorch, self).__init__(env)
99 | old_shape = self.observation_space.shape
100 | new_shape = (old_shape[-1], old_shape[0], old_shape[1])
101 | self.observation_space = gym.spaces.Box(
102 | low=0.0, high=1.0, shape=new_shape, dtype=np.float32)
103 | def observation(self, observation):
104 | return np.moveaxis(observation, 2, 0)
105 |
106 |
107 | class BufferWrapper(gym.ObservationWrapper):
108 | def __init__(self, env, n_steps, dtype=np.float32):
109 | super(BufferWrapper, self).__init__(env)
110 | self.dtype = dtype
111 | old_space = env.observation_space
112 | self.observation_space = gym.spaces.Box(
113 | old_space.low.repeat(n_steps, axis=0),
114 | old_space.high.repeat(n_steps, axis=0), dtype=dtype)
115 | def reset(self):
116 | self.buffer = np.zeros_like(
117 | self.observation_space.low, dtype=self.dtype)
118 | return self.observation(self.env.reset())
119 | def observation(self, observation):
120 | self.buffer[:-1] = self.buffer[1:]
121 | self.buffer[-1] = observation
122 | return self.buffer
123 |
124 |
125 | class ScaledFloatFrame(gym.ObservationWrapper):
126 | def observation(self, obs):
127 | return np.array(obs).astype(np.float32) / 255.0
128 |
129 |
130 | def make_env(env_name):
131 | env = gym.make(env_name)
132 | env = MaxAndSkipEnv(env)
133 | env = FireResetEnv(env)
134 | env = ProcessFrame84(env)
135 | env = ImageToPyTorch(env)
136 | env = BufferWrapper(env, 4)
137 | env = ScaledFloatFrame(env)
138 |
139 | return env
140 |
141 |
142 | if __name__ == "__main__":
143 | env_name = "Pong-v0"
144 |
145 | env = make_env(env_name)
146 | print(env.reset().shape)
147 | print(env.observation_space)
148 | env.render()
--------------------------------------------------------------------------------
/DQNs/DQN_PER/dqn_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 |
6 |
7 | class MLP_Model(nn.Module):
8 | def __init__(self, state_size, action_size):
9 | super(MLP_Model, self).__init__()
10 | self.fc1=nn.Linear(state_size,128)
11 | self.fc2=nn.Linear(128,256)
12 | self.fc3=nn.Linear(256,action_size)
13 |
14 | def forward(self, state):
15 | """Build a network that maps state -> action values."""
16 | out=self.fc1(state)
17 | out=F.relu(out)
18 | out=self.fc2(out)
19 | out=F.relu(out)
20 | q_a=self.fc3(out)
21 |
22 | return q_a
23 |
24 |
25 | class CNN_Model (nn.Module):
26 | def __init__(self, input_shape, n_actions):
27 | super(CNN_Model, self).__init__()
28 | self.conv = nn.Sequential(
29 | # the first dimension of input_shape is the number of input channels, e.g. channel = 4 for an input of (4, 84, 84)
30 | nn.Conv2d(input_shape[0], 128, kernel_size=8, stride=4),
31 | nn.ReLU(),
32 | nn.Conv2d(128, 256, kernel_size=4, stride=2),
33 | nn.ReLU(),
34 | nn.Conv2d(256, 128, kernel_size=3, stride=1),
35 | nn.ReLU()
36 | )
37 | conv_out_size = self._get_conv_out(input_shape)
38 | self.fc = nn.Sequential(
39 | nn.Linear(conv_out_size, 512),
40 | nn.ReLU(),
41 | nn.Linear(512, n_actions)
42 | )
43 |
44 | def _get_conv_out(self, input_shape):
45 | o = self.conv(torch.zeros((1, *input_shape)))
46 | return int(np.prod(o.size()))
47 |
48 | def forward(self, x):
49 | conv_out = self.conv(x)
50 | conv_out = conv_out.view(x.size()[0], -1)
51 | return self.fc(conv_out)
52 |
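A quick shape check (illustrative; assumes dqn_model.py is on the import path) for CNN_Model with the (4, 84, 84) stacked-frame observations that atari_wappers.make_env produces; n_actions=6 is just an example:

import torch
from dqn_model import CNN_Model

model = CNN_Model(input_shape=(4, 84, 84), n_actions=6)
dummy = torch.zeros(2, 4, 84, 84)   # a batch of 2 stacked-frame observations
print(model(dummy).shape)           # torch.Size([2, 6]) -- one Q-value per action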
--------------------------------------------------------------------------------
/DQNs/DQN_PER/main_dqn_per.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
3 |
4 | import gym
5 | import arrow
6 | import torch
7 | import numpy as np
8 | from matplotlib import pyplot as plt
9 | from collections import deque
10 | from dqn_per import Agent_dqn
11 | import atari_wappers
12 | # from DQNs.DQN_PER.dqn_per import Agent_dqn
13 | # from DQNs.DQN_PER import atari_wappers
14 |
15 |
16 | def train_agent(agent,state_size,n_episodes ):
17 | scores_window = deque(maxlen=100) # last 100 scores
18 | scores , eps_lst = [],[]
19 |
20 | start_time = arrow.now()
21 | for i_episode in range(1, n_episodes + 1):
22 | state = env.reset()
23 | score = 0
24 |
25 | while True:
26 | action,epsilon = agent.act(state,i_episode)
27 | next_state, reward, done, _ = env.step(action)
28 |
29 | ## add sample and train agent
30 | sarsd = (state, action, reward, next_state, done)
31 | agent.step(sarsd)
32 |
33 | state = next_state
34 | score += reward
35 | if done:
36 | break
37 |
38 | scores_window.append(score) # save most recent score
39 | scores.append(score) # save most recent score
40 | eps_lst.append(epsilon)
41 |
42 | print('\rEpisode {} \t Average Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
43 | if i_episode % 100 == 0:
44 | print('\rEpisode {}\t Average Score: {:.2f}'.format(i_episode,np.mean(scores_window)))
45 | print('\rRunning time:{}\n'.format(arrow.now() - start_time))
46 | # if np.mean(scores_window) >= 195.0:
47 | # print('\nEnvironment solved in {:d} episodes! \t Average Score: {:.2f}'.format(i_episode - 100,
48 | # np.mean(scores_window)))
49 | # # torch.save(agent.qnetwork_local.state_dict(), model_file)
50 | # print('\nTotal running time:{}'.format(arrow.now() - start_time))
51 | # break
52 |
53 | return scores,eps_lst
54 |
55 |
56 | def plot_curves(data,plot_name,filename):
57 | fig = plt.figure()
58 | ax = fig.add_subplot(1, 1, 1)
59 | ax.plot(np.arange(len(data)), data)
60 | plt.ylabel(plot_name)
61 | plt.xlabel('Episode #')
62 | plt.savefig(filename)
63 |
64 |
65 | if __name__=="__main__":
66 | env = atari_wappers.make_env("SpaceInvaders-v0")
67 | state_size, action_size = env.observation_space.shape, env.action_space.n
68 |
69 | cnn_agent = Agent_dqn(state_size,action_size,'CNN','True','nonlinear')
70 | train_scores, _ = train_agent(cnn_agent, state_size, 2500)
71 | plot_curves(train_scores, 'Scores', 'Plots/cnn_per.png')
72 |
73 | # env = gym.make('CartPole-v0')
74 | # env.seed(0)
75 | # state_size, action_size = env.observation_space.shape[0], env.action_space.n
76 | # mlp_agent = Agent_dqn(state_size, action_size,'MLP','True','nonlinear')
77 | # train_scores,eps_lst = train_agent(mlp_agent,state_size,2500)
78 | # plot_curves(train_scores,'Scores','Plots/train_exp-3.png')
79 | # if mlp_agent.eps_decay:
80 | # plot_curves(eps_lst,'Epsilon', 'Plots/epsilon_exp-3.png')
81 |
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/.DS_Store
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Models/CNN_model|03-29#19:21.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/CNN_model|03-29#19:21.pth
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Models/CNN_model|03-30#11:19.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/CNN_model|03-30#11:19.pth
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Models/CNN_model|03-30#21:05.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/CNN_model|03-30#21:05.pth
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Models/CNN_model|03-31#19:32.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/CNN_model|03-31#19:32.pth
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Models/dqnCNN_model_0324.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/dqnCNN_model_0324.pth
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Models/dqn_model.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Models/dqn_model.pth
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Plots/test-score|03-25#20:00.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/test-score|03-25#20:00.png
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Plots/test-score|03-26#09:15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/test-score|03-26#09:15.png
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Plots/test-score|03-26#09:45.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/test-score|03-26#09:45.png
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Plots/train-score|03-29#19:21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/train-score|03-29#19:21.png
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Plots/train-score|03-30#11:19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/train-score|03-30#11:19.png
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Plots/train-score|03-30#21:05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/train-score|03-30#21:05.png
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/Plots/train-score|03-31#19:32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/Plots/train-score|03-31#19:32.png
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/ReadMe.md:
--------------------------------------------------------------------------------
1 |
2 | ### To start the project
3 |
4 | #### 0. Basic settings
5 | * `env_name`: (str) name of the gym Atari env that you want to play with
6 | * `run_mode` : (str: train/test) whether to train a new agent or test a saved model
7 |
8 | #### 1. To train an agent using DDQN with a CNN network
9 | * `train_episode`
10 | * `learning_rate`
11 | * `buffer_size`
12 | * `batch_size`
13 | * `gamma`
14 | * `update_every`
15 | * `eps_decay`
16 |
17 | e.g.
18 | ```
19 | python main_dqn_atari.py SpaceInvaders-v0 train --learning_rate 1e-3
20 | ```
21 |
22 | To run in the background and save a log file:
23 | ```
24 | nohup python -u main_dqn_atari.py SpaceInvaders-v0 train --learning_rate 1e-3 > train_20210326.log 2>&1 &
25 | ```
26 |
27 | #### 2. To test a trained agent
28 | * `test_episode` (int) number of episodes you want to test the agent for
29 | * `test_model_file` (str) path of the model file corresponding to the trained agent you want to test
30 | * `test_video_play` (str: yes/no) whether you want to watch the game render during testing
31 |
32 | e.g.
33 | ```
34 | python main_dqn_atari.py SpaceInvaders-v0 test --test_episode 500 --test_model_file Models/dqnCNN_model_0324.pth --test_video_play no
35 | ```
36 |
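The snippet below is a minimal sketch of how the settings above could be wired together with `argparse`; it is not the actual `main_dqn_atari.py`, and the default values are illustrative only (they mirror the hyper-parameters that appear in the training logs).

```
import argparse

def build_parser():
    parser = argparse.ArgumentParser(description="DDQN with a CNN network on Atari")
    # basic settings (section 0)
    parser.add_argument("env_name", type=str, help="gym Atari env name, e.g. SpaceInvaders-v0")
    parser.add_argument("run_mode", type=str, choices=["train", "test"])
    # training hyper-parameters (section 1)
    parser.add_argument("--train_episode", type=int, default=2000)
    parser.add_argument("--learning_rate", type=float, default=1e-3)
    parser.add_argument("--buffer_size", type=int, default=5000)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--update_every", type=int, default=5)
    parser.add_argument("--eps_decay", type=float, default=0.995)
    # testing options (section 2)
    parser.add_argument("--test_episode", type=int, default=100)
    parser.add_argument("--test_model_file", type=str, default="Models/dqn_model.pth")
    parser.add_argument("--test_video_play", type=str, choices=["yes", "no"], default="no")
    return parser

if __name__ == "__main__":
    args = build_parser().parse_args()
    print(args)
```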
37 |
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/__pycache__/atari_wappers.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/__pycache__/atari_wappers.cpython-38.pyc
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/__pycache__/cnn_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/__pycache__/cnn_model.cpython-38.pyc
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/__pycache__/dqn_agent.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/__pycache__/dqn_agent.cpython-38.pyc
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/atari_wappers.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import gym
3 | import gym.spaces
4 | import numpy as np
5 | import collections
6 |
7 |
8 | class MaxAndSkipEnv(gym.Wrapper):
9 | """
10 |     Repeats each chosen action for `skip` consecutive frames and returns the pixel-wise max over the last two frames (removes Atari sprite flicker).
11 | """
12 | def __init__(self, env=None, skip=4):
13 | super(MaxAndSkipEnv, self).__init__(env)
14 | self._obs_buffer = collections.deque(maxlen=2)
15 | self._skip = skip
16 |
17 | def step(self, action):
18 | total_reward = 0.0
19 | done = None
20 | for _ in range(self._skip):
21 | obs, reward, done, info = self.env.step(action)
22 | self._obs_buffer.append(obs)
23 | total_reward += reward
24 | if done:
25 | break
26 | max_frame = np.max(np.stack(self._obs_buffer), axis=0)
27 | return max_frame, total_reward, done, info
28 |
29 | def reset(self):
30 | self._obs_buffer.clear()
31 | obs = self.env.reset()
32 | self._obs_buffer.append(obs)
33 | return obs
34 |
35 |
36 | class FireResetEnv(gym.Wrapper):
37 | """
38 |     Presses the FIRE button on reset for environments that require it to start the game,
39 |     and handles corner cases where that initial step immediately ends the episode.
40 | """
41 | def __init__(self,env=None):
42 | """For environments where the user need to press FIRE for the game to start."""
43 | super(FireResetEnv, self).__init__(env)
44 | assert env.unwrapped.get_action_meanings()[1]=="FIRE"
45 | assert len(env.unwrapped.get_action_meanings()) >= 3
46 |
47 | def step(self,action):
48 |
49 | return self.env.step(action)
50 |
51 | def reset(self):
52 |
53 | self.env.reset()
54 |
55 | obs,_,done,_ = self.env.step(1)
56 | if done:
57 | self.env.reset()
58 | obs, _, done, _ = self.env.step(2)
59 | if done:
60 | self.env.reset()
61 | return obs
62 |
63 |
64 | class ProcessFrame84(gym.ObservationWrapper):
65 | """
66 |     Converts the 210x160 RGB observation to a cropped 84x84 grayscale image.
67 | """
68 | def __init__(self, env=None):
69 | super(ProcessFrame84, self).__init__(env)
70 |
71 | self.observation_space = gym.spaces.Box(
72 | low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
73 |
74 | def observation(self, obs):
75 |
76 | return ProcessFrame84.process(obs)
77 | @staticmethod
78 | def process(frame):
79 | if frame.size == 210 * 160 * 3:
80 | img = np.reshape(frame, [210, 160, 3]).astype(
81 | np.float32)
82 | elif frame.size == 250 * 160 * 3:
83 | img = np.reshape(frame, [250, 160, 3]).astype(
84 | np.float32)
85 | else:
86 | assert False, "Unknown resolution."
87 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + \
88 | img[:, :, 2] * 0.114
89 | resized_screen = cv2.resize(
90 | img, (84, 110), interpolation=cv2.INTER_AREA)
91 | x_t = resized_screen[18:102, :]
92 | x_t = np.reshape(x_t, [84, 84, 1])
93 | return x_t.astype(np.uint8)
94 |
95 |
96 | class ImageToPyTorch(gym.ObservationWrapper):
97 | def __init__(self, env):
98 | super(ImageToPyTorch, self).__init__(env)
99 | old_shape = self.observation_space.shape
100 | new_shape = (old_shape[-1], old_shape[0], old_shape[1])
101 | self.observation_space = gym.spaces.Box(
102 | low=0.0, high=1.0, shape=new_shape, dtype=np.float32)
103 | def observation(self, observation):
104 | return np.moveaxis(observation, 2, 0)
105 |
106 |
107 | class BufferWrapper(gym.ObservationWrapper):
108 | def __init__(self, env, n_steps, dtype=np.float32):
109 | super(BufferWrapper, self).__init__(env)
110 | self.dtype = dtype
111 | old_space = env.observation_space
112 | self.observation_space = gym.spaces.Box(
113 | old_space.low.repeat(n_steps, axis=0),
114 | old_space.high.repeat(n_steps, axis=0), dtype=dtype)
115 | def reset(self):
116 | self.buffer = np.zeros_like(
117 | self.observation_space.low, dtype=self.dtype)
118 | return self.observation(self.env.reset())
119 | def observation(self, observation):
120 | self.buffer[:-1] = self.buffer[1:]
121 | self.buffer[-1] = observation
122 | return self.buffer
123 |
124 |
125 | class ScaledFloatFrame(gym.ObservationWrapper):
126 | def observation(self, obs):
127 | return np.array(obs).astype(np.float32) / 255.0
128 |
129 |
130 | def make_env(env_name):
131 | env = gym.make(env_name)
132 | env = MaxAndSkipEnv(env)
133 | env = FireResetEnv(env)
134 | env = ProcessFrame84(env)
135 | env = ImageToPyTorch(env)
136 | env = BufferWrapper(env, 4)
137 | env = ScaledFloatFrame(env)
138 |
139 | return env
140 |
141 |
142 | if __name__ == "__main__":
143 | env_name = "Pong-v0"
144 |
145 | env = make_env(env_name)
146 | print(env.reset().shape)
147 | print(env.observation_space)
148 | env.render()
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/cnn_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import atari_wappers
6 |
7 |
8 | class CNN_Model (nn.Module):
9 | def __init__(self, input_shape, n_actions):
10 | super(CNN_Model, self).__init__()
11 | self.conv = nn.Sequential(
12 |             # the first dimension of input_shape is the number of input channels, e.g. for an input of shape (4, 84, 84) the channel count is 4
13 | nn.Conv2d(input_shape[0], 128, kernel_size=8, stride=4),
14 | nn.ReLU(),
15 | nn.Conv2d(128, 256, kernel_size=4, stride=2),
16 | nn.ReLU(),
17 | nn.Conv2d(256, 128, kernel_size=3, stride=1),
18 | nn.ReLU()
19 | )
20 | conv_out_size = self._get_conv_out(input_shape)
21 | self.fc = nn.Sequential(
22 | nn.Linear(conv_out_size, 512),
23 | nn.ReLU(),
24 | nn.Linear(512, n_actions)
25 | )
26 |
27 | def _get_conv_out(self, input_shape):
28 | o = self.conv(torch.zeros((1, *input_shape)))
29 | return int(np.prod(o.size()))
30 |
31 | def forward(self, x):
32 | conv_out = self.conv(x)
33 | conv_out = conv_out.view(x.size()[0], -1)
34 | return self.fc(conv_out)
35 |
36 |
37 | if __name__ == "__main__":
38 | env = atari_wappers.make_env("SpaceInvaders-v0")
39 | state_size, action_size = env.observation_space.shape, env.action_space.n
40 | print(state_size, action_size)
41 | model = CNN_Model(state_size, action_size)
42 |
43 | state = env.reset()
44 | obs = env.reset()
45 | obs1 = env.reset()
46 |     t = torch.tensor(np.array([obs, obs1]))  # batch of two stacked observations
47 |     print("x.shape", t.shape)
48 | 
49 |     q_value = model.forward(t)
50 |     actions = torch.tensor([[0], [1]])  # one action index per batch sample, shaped (batch, 1) for gather
51 |     print(q_value)
52 |     print(q_value.gather(1, actions))
53 |
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/dqn_agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | from collections import namedtuple, deque
4 | import torch
5 | import torch.nn.functional as F
6 | import torch.optim as optim
7 | from cnn_model import CNN_Model
8 |
9 | TAU = 1e-3 # for soft update of target parameters
10 | EPS_start=1.0
11 | EPS_end=0.01
12 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
13 |
14 |
15 | class ReplayBuffer:
16 | """Fixed-size buffer to store experience tuples."""
17 |
18 | def __init__(self, action_size, buffer_size, batch_size):
19 | """Initialize a ReplayBuffer object.
20 |
21 | Params
22 | ======
23 | action_size (int): dimension of each action
24 | buffer_size (int): maximum size of buffer
25 | batch_size (int): size of each training batch
27 | """
28 | self.action_size = action_size
29 | self.memory = deque(maxlen=buffer_size)
30 |         # deque(maxlen=N) creates a fixed-size queue: once it is full, appending a new element automatically drops the oldest one
31 | self.batch_size = batch_size
32 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
33 |
34 | def add(self, state, action, reward, next_state, done):
35 | """
36 | Add a new experience to the memory
37 | :param state:
38 | :param p: sample probability for this experience
39 | :return:
40 | """
41 | e = self.experience(state, action, reward, next_state, done)
42 | self.memory.append(e)
43 |
44 | def clean_buffer(self):
45 | self.memory.clear()
46 |
47 | def sample(self):
48 | """Randomly sample a batch of experiences from memory."""
49 | experiences = random.sample(self.memory, k=self.batch_size)
50 |
51 |         states = torch.tensor(np.array([e.state for e in experiences if e is not None])).float().to(device)
52 |         # actions, rewards and dones are shaped (batch_size, 1) so that gather() and the
53 |         # TD-target broadcasting in learn() line up with the (batch_size, 1) Q-value columns
54 |         actions = torch.tensor([[e.action] for e in experiences if e is not None]).long().to(device)
55 |         rewards = torch.tensor([[e.reward] for e in experiences if e is not None]).float().to(device)
56 |         next_states = torch.tensor(np.array([e.next_state for e in experiences if e is not None])).float().to(device)
57 |         dones = torch.from_numpy(np.array([[e.done] for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
58 | return (states, actions, rewards, next_states, dones)
59 |
60 | def __len__(self):
61 | """Return the current size of internal memory."""
62 | return len(self.memory)
63 |
64 |
65 | class Agent_dqn():
66 | def __init__(self, input_channel,action_size,learning_rate=5e-3,buffer_size=int(1e4),batch_size=32):
67 | """Initialize an Agent object.
68 |
69 | Params
70 | ======
71 |             input_channel (tuple): shape of the stacked input observation, e.g. (4, 84, 84)
72 |             action_size (int): number of discrete actions
73 |             learning_rate, buffer_size, batch_size: optimizer and replay-buffer hyper-parameters
74 | """
75 | self.action_size = action_size
76 |
77 | # Q-Network
78 | self.qnetwork_local = CNN_Model(input_channel,action_size).to(device)
79 | self.qnetwork_target = CNN_Model(input_channel,action_size).to(device)
80 | self.optimizer = optim.Adam(self.qnetwork_local.parameters(), learning_rate)
81 |
82 | # Replay memory
83 | self.batch_size = batch_size
84 | self.memory = ReplayBuffer(action_size, buffer_size,batch_size)
85 | # Initialize time step (for updating every UPDATE_EVERY steps)
86 | self.t_step = 0
87 | self.episode = 0
88 | self.epsilon = EPS_start
89 |
90 | def act(self,state,i_episode,eps_decay):
91 | state = torch.from_numpy(state).float().unsqueeze(0).to(device)
92 | self.qnetwork_local.eval()
93 | with torch.no_grad():
94 | action_values = self.qnetwork_local(state)
95 | self.qnetwork_local.train()
96 |
97 | " Epsilon-greedy action selection"
98 | if i_episode>self.episode:
99 | # update EPS every new episode
100 | self.epsilon = max(EPS_end, eps_decay * self.epsilon)
101 | self.episode = i_episode
102 | # epsilon greedy policy
103 | if random.random() > self.epsilon:
104 | return np.argmax(action_values.cpu().data.numpy())
105 | else:
106 | return random.choice(np.arange(self.action_size))
107 |
108 | def act_greedy_policy(self,state):
109 | state = torch.from_numpy(state).float().unsqueeze(0).to(device)
110 | self.qnetwork_local.eval()
111 | with torch.no_grad():
112 | action_values = self.qnetwork_local(state)
113 | return np.argmax(action_values.cpu().data.numpy())
114 |
115 | def step(self,sarsd,gamma,update_every):
116 | state, action, reward, next_state, done = sarsd
117 | self.t_step += 1
118 |
119 | # add an experience for current time step
120 | self.memory.add(state, action, reward, next_state, done)
121 |
122 | # Learn every UPDATE_EVERY time steps
123 | if (self.t_step+1) % update_every==0:
124 | if self.memory.__len__()>self.batch_size:
125 | batch_exps = self.memory.sample()
126 | loss = self.learn(batch_exps,gamma)
127 | return loss
128 |
129 | def learn(self,exps,gamma):
130 | # fetch the batch (s,a,r,s',done) from experiences batch
131 | states,actions,rewards,next_states,dones = exps
132 |         # print(states.shape)  # debug
133 |
134 | # ------------------ calculate loss —------------------------- #
135 |
136 |         # calculate Q targets (double DQN: the local net picks the best next action, the target net evaluates it)
137 |         expected_next_max_actions = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1)
138 | Q_expected_next = self.qnetwork_target(next_states).gather(1, expected_next_max_actions)
139 | Q_targets = rewards + (gamma * Q_expected_next * (1 - dones))
140 |
141 | # get expected Q for current state
142 | Q_expected = self.qnetwork_local(states).gather(1, actions)
143 |
144 | loss = F.mse_loss(Q_expected, Q_targets)
145 |
146 | # ---------------- update local Q net -------------------- #
147 | self.optimizer.zero_grad()
148 | loss.backward()
149 | self.optimizer.step()
150 | # print(next(self.qnetwork_local.parameters()).is_cuda)
151 |
152 | # ---------------- update target Q net -------------------- #
153 | self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)
154 |
155 | return loss.cpu().detach().numpy()
156 |
157 | def soft_update(self, local_model, target_model, tau):
158 | """Soft update model parameters.
159 | θ_target = τ*θ_local + (1 - τ)*θ_target
160 |
161 | Params
162 | ======
163 | local_model (PyTorch model): weights will be copied from
164 | target_model (PyTorch model): weights will be copied to
165 | tau (float): interpolation parameter
166 | """
167 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
168 | target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-0.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-100.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-100.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-140.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-140.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-152.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-152.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-167.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-167.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-185.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-185.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-200.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-200.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-204.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-204.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-227.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-227.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-300.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-300.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-400.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-400.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-500.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-500.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-600.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-600.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-674.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-674.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-683.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-683.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-696.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-696.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-700.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-700.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-714.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-714.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-733.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-733.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-756.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-756.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-800.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-800.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-900.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-900.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-902.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-902.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-909.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-909.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-920.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-920.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-936.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-936.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/image/pic-956.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/DQNs/DQN_cnn/image/pic-956.jpg
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/log/train_20210326.log:
--------------------------------------------------------------------------------
1 | nohup: ignoring input
2 | ####################################################
3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN
4 | ####################################################
5 |
6 | Training Parameters :
7 | Train episode : 2000
8 | Network update every 5 time step
9 | Replay buffer size : 5000
10 | Batch size : 32
11 | Learning rate : 0.001
12 | GAMMA : 0.99
13 | Epsilon decay rate : 0.995
14 |
15 |
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/log/train_20210329.log:
--------------------------------------------------------------------------------
1 | nohup: ignoring input
2 | ####################################################
3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN
4 | ####################################################
5 |
6 | Training Parameters :
7 | Train episode : 2000
8 | Network update every 5 time step
9 | Replay buffer size : 5000
10 | Batch size : 32
11 | Learning rate : 0.001
12 | GAMMA : 0.99
13 | Epsilon decay rate : 0.995
14 |
15 |
Episode 100 Loss 38.24846267700195 Average Score: 136.55
16 |
Running time till now :0:11:03.313392
17 |
18 |
Episode 200 Loss 13.040627479553223 Average Score: 153.55
19 |
Running time till now :0:22:44.250463
20 |
21 |
Episode 300 Loss 15.12213134765625 Average Score: 193.75
22 |
Running time till now :0:35:31.581827
23 |
24 |
Episode 400 Loss 52.153236389160156 Average Score: 205.35
25 |
Running time till now :0:49:19.173927
26 |
27 |
Episode 500 Loss 21.199983596801758 Average Score: 230.80
28 |
Running time till now :1:03:42.341828
29 |
30 |
Episode 600 Loss 44.75456237792969 Average Score: 237.10
31 |
Running time till now :1:18:31.195379
32 |
33 |
Episode 700 Loss 71.21875762939453 Average Score: 279.30
34 |
Running time till now :1:33:07.172067
35 |
36 |
Episode 800 Loss 46.80872344970703 Average Score: 250.20
37 |
Running time till now :1:45:39.041750
38 |
39 |
Episode 900 Loss 69.49663543701172 Average Score: 270.55
40 |
Running time till now :1:59:20.249699
41 |
42 |
Episode 1000 Loss 97.96715545654297 Average Score: 270.80
43 |
Running time till now :2:13:20.709739
44 |
45 |
Episode 1100 Loss 82.20999145507812 Average Score: 280.75
46 |
Running time till now :2:28:11.418844
47 |
48 |
Episode 1200 Loss 29.77111053466797 Average Score: 270.15
49 |
Running time till now :2:42:38.161003
50 |
51 |
Episode 1300 Loss 33.50057601928711 Average Score: 263.00
52 |
Running time till now :2:55:55.026575
53 |
54 |
Episode 1400 Loss 32.226627349853516 Average Score: 296.65
55 |
Running time till now :3:10:45.828023
56 |
57 |
Episode 1500 Loss 30.3413143157959 Average Score: 280.10
58 |
Running time till now :3:26:07.034734
59 |
60 |
Episode 1600 Loss 30.96596336364746 Average Score: 271.00
61 |
Running time till now :3:40:35.273112
62 |
63 |
Episode 1700 Loss 32.25701904296875 Average Score: 255.85
64 |
Running time till now :3:53:36.508000
65 |
66 |
Episode 1800 Loss 28.328149795532227 Average Score: 293.50
67 |
Running time till now :4:09:19.669146
68 |
69 |
Episode 1900 Loss 29.688913345336914 Average Score: 259.10
70 |
Running time till now :4:23:57.511495
71 |
72 |
Episode 2000 Loss 27.258968353271484 Average Score: 261.15
73 |
Running time till now :4:38:51.220961
74 |
75 | Training finished, total running time:4:38:51.232527.
76 | Model saved.
77 |
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/log/train_20210329_1.log:
--------------------------------------------------------------------------------
1 | nohup: ignoring input
2 | ####################################################
3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN
4 | ####################################################
5 |
6 | Training Parameters :
7 | Train episode : 1000
8 | Network update every 5 time step
9 | Replay buffer size : 5000
10 | Batch size : 64
11 | Learning rate : 0.005
12 | GAMMA : 0.99
13 | Epsilon decay rate : 0.995
14 |
15 |
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/log/train_20210330.log:
--------------------------------------------------------------------------------
1 | nohup: ignoring input
2 | ####################################################
3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN
4 | ####################################################
5 |
6 | Training Parameters :
7 | Train episode : 1000
8 | Network update every 5 time step
9 | Replay buffer size : 3500
10 | Batch size : 64
11 | Learning rate : 0.005
12 | GAMMA : 0.99
13 | Epsilon decay rate : 0.995
14 |
15 |
Episode 100 Loss 71.22970581054688 Average Score: 180.00
16 |
Running time till now :0:22:27.094323
17 |
18 |
Episode 200 Loss 29.8588924407959 Average Score: 152.60
19 |
Running time till now :0:44:21.409593
20 |
21 |
Episode 300 Loss 61.14106369018555 Average Score: 209.75
22 |
Running time till now :1:09:07.883698
23 |
24 |
Episode 400 Loss 30.508338928222656 Average Score: 218.05
25 |
Running time till now :1:35:38.577875
26 |
27 |
Episode 500 Loss 89.18991088867188 Average Score: 245.05
28 |
Running time till now :2:03:52.850307
29 |
30 |
Episode 600 Loss 21.991769790649414 Average Score: 262.35
31 |
Running time till now :2:30:57.078511
32 |
33 |
Episode 700 Loss 23.49405860900879 Average Score: 254.65
34 |
Running time till now :2:56:23.053378
35 |
36 |
Episode 800 Loss 81.24069213867188 Average Score: 263.45
37 |
Running time till now :3:21:37.061215
38 |
39 |
Episode 900 Loss 24.93558692932129 Average Score: 284.15
40 |
Running time till now :3:50:02.799914
41 |
42 |
Episode 1000 Loss 85.55946350097656 Average Score: 268.20
43 |
Running time till now :4:16:22.288483
44 |
45 | Training finished, total running time:4:16:22.299132.
46 | Model saved.
47 | Traceback (most recent call last):
48 | File "main_dqn_atari.py", line 178, in
49 | dqn_agent = Agent_dqn(state_size,action_size)
50 | File "/home/lesreg/Remote_Pros/DRL_pytorch/DQNs/DQN_cnn/dqn_agent.py", line 79, in __init__
51 | self.qnetwork_local = CNN_Model(input_channel,action_size).to(device)
52 | File "/home/lesreg/Remote_Pros/DRL_pytorch/DQNs/DQN_cnn/cnn_model.py", line 20, in __init__
53 | conv_out_size = self._get_conv_out(input_shape)
54 | File "/home/lesreg/Remote_Pros/DRL_pytorch/DQNs/DQN_cnn/cnn_model.py", line 28, in _get_conv_out
55 | o = self.conv(torch.zeros((1, *input_shape)))
56 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
57 | result = self.forward(*input, **kwargs)
58 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/container.py", line 119, in forward
59 | input = module(input)
60 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
61 | result = self.forward(*input, **kwargs)
62 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 399, in forward
63 | return self._conv_forward(input, self.weight, self.bias)
64 | File "/root/anaconda3/envs/gym_py36/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 396, in _conv_forward
65 | self.padding, self.dilation, self.groups)
66 | RuntimeError: Calculated padded input size per channel: (160 x 3). Kernel size: (8 x 8). Kernel size can't be greater than actual input size
67 |
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/log/train_20210331.log:
--------------------------------------------------------------------------------
1 | nohup: ignoring input
2 | ####################################################
3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN
4 | ####################################################
5 |
6 | Training Parameters :
7 | Train episode : 2000
8 | Network update every 10 time step
9 | Replay buffer size : 3500
10 | Batch size : 128
11 | Learning rate : 0.0005
12 | GAMMA : 0.99
13 | Epsilon decay rate : 0.995
14 |
15 |
Episode 100 Loss 37.57309341430664 Average Score: 144.10
16 |
Running time till now :0:20:57.373758
17 |
18 |
Episode 200 Loss 45.944541931152344 Average Score: 149.85
19 |
Running time till now :0:43:08.088817
20 |
21 |
Episode 300 Loss 77.53382110595703 Average Score: 200.00
22 |
Running time till now :1:08:21.291658
23 |
24 |
Episode 400 Loss 94.59493255615234 Average Score: 200.65
25 |
Running time till now :1:34:55.924278
26 |
27 |
Episode 500 Loss 20.661224365234375 Average Score: 228.70
28 |
Running time till now :2:02:23.143794
29 |
30 |
Episode 600 Loss 38.10764694213867 Average Score: 259.20
31 |
Running time till now :2:32:13.928401
32 |
33 |
Episode 700 Loss 21.809246063232422 Average Score: 238.65
34 |
Running time till now :2:58:31.344456
35 |
36 |
Episode 800 Loss 27.276247024536133 Average Score: 252.15
37 |
Running time till now :3:22:48.359756
38 |
39 |
Episode 900 Loss 64.65150451660156 Average Score: 273.00
40 |
Running time till now :3:50:41.210916
41 |
42 |
Episode 1000 Loss 25.74323272705078 Average Score: 266.45
43 |
Running time till now :4:16:18.361667
44 |
45 |
Episode 1100 Loss 18.910884857177734 Average Score: 277.10
46 |
Running time till now :4:44:24.187721
47 |
48 |
Episode 1200 Loss 26.118581771850586 Average Score: 267.65
49 |
Running time till now :5:09:37.085516
50 |
51 |
Episode 1300 Loss 25.362396240234375 Average Score: 251.85
52 |
Running time till now :5:34:53.701273
53 |
54 |
Episode 1400 Loss 26.500167846679688 Average Score: 281.05
55 |
Running time till now :6:00:55.716864
56 |
57 |
Episode 1500 Loss 32.66218185424805 Average Score: 257.55
58 |
Running time till now :6:25:39.960819
59 |
60 |
Episode 1600 Loss 52.91573715209961 Average Score: 268.40
61 |
Running time till now :6:50:39.701043
62 |
63 |
Episode 1700 Loss 43.722801208496094 Average Score: 265.95
64 |
Running time till now :7:13:50.829814
65 |
66 |
Episode 1800 Loss 49.69996643066406 Average Score: 262.20
67 |
Running time till now :7:39:07.592524
68 |
69 |
Episode 1900 Loss 84.68921661376953 Average Score: 266.25
70 |
Running time till now :8:06:00.022165
71 |
72 |
Episode 2000 Loss 24.432580947875977 Average Score: 259.90
73 |
Running time till now :8:37:15.948752
74 |
75 | Training finished, total running time:8:37:15.969363.
76 | Model saved.
77 |
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/main_test.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import arrow
4 | import torch
5 | import gym
6 | import numpy as np
7 | from matplotlib import pyplot as plt
8 | from collections import deque
9 | from DQNs.DQN_cnn.dqn_agent import Agent_dqn
10 | from DQNs.DQN_cnn import atari_wappers
11 |
12 |
13 | def plot_scores(scores,filename):
14 | fig = plt.figure()
15 | ax = fig.add_subplot(1, 1, 1)
16 | ax.plot(np.arange(len(scores)), scores)
17 | plt.ylabel('Score')
18 | plt.xlabel('Episode #')
19 | plt.savefig(filename)
20 |
21 |
22 |
23 | def train_agent(env,agent,n_episode,eps_decay,gamma,update_every):
24 | scores = [] # list containing scores from each episode
25 | scores_window = deque(maxlen=100) # last 100 scores
26 |
27 | start_time = arrow.now()
28 | for i_episode in range(1, n_episode + 1):
29 | state = env.reset()
30 | print(state.shape)
31 | score = 0
32 | episode_loss=[]
33 | while True:
34 | # # check the memory usage of system, clean replay buffer if too high
35 | # if (sys_mem.used / sys_mem.total) >= 0.03:
36 | # agent.memory.clean_buffer()
37 | # print('Buffer cleaned on episode {}'.format(i_episode))
38 | # get action
39 | action = agent.act(state,i_episode,eps_decay)
40 | # interact with env (one step)
41 | next_state, reward, done, _ = env.step(action)
42 | # train the agent
43 | sarsd = (state, action, reward, next_state,done)
44 | loss = agent.step(sarsd,gamma,update_every)
45 | # update status
46 | state = next_state
47 | score += reward
48 | # break the loop if current episode is over
49 | if done:
50 | break
51 | if loss is not None:
52 | episode_loss.append(loss)
53 |
54 | # update rewards and scores every episode
55 | scores_window.append(score)
56 | scores.append(score)
57 |
58 | # print('\rEpisode {}\t Loss {} \t Average Score: {:.2f}'.format(i_episode, np.mean(episode_loss),
59 | # np.mean(scores_window)), end="")
60 | #
61 | # if i_episode > 25:
62 | # print('Replay Buffer size: {}'.format(agent.memory.__len__()))
63 | # print('Memory used: ',sys_mem.used)
64 | # print('Memory used rate: ',sys_mem.used/sys_mem.total)
65 |
66 | if i_episode % 100 == 0:
67 | print('\rEpisode {}\t Loss {} \t Average Score: {:.2f}'.format(i_episode, np.mean(episode_loss),
68 | np.mean(scores_window)))
69 | print('\rRunning time till now :{}\n'.format(arrow.now() - start_time))
70 |
71 |
72 | print("Training finished, total running time:{}. \n Model saved.".format(arrow.now()-start_time))
73 |
74 | return scores
75 |
76 |
77 |
78 |
79 |
80 | if __name__ =="__main__":
81 | env = atari_wappers.make_env("SpaceInvaders-v0")
82 | state_size, action_size = env.observation_space.shape, env.action_space.n
83 | dqn_agent = Agent_dqn(state_size,action_size)
84 | train_agent(env,dqn_agent,1,0.98,0.995,5)
85 |
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/play_atari.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from collections import deque
4 | import os
5 | import torch
6 | # from skimage import io
7 | from DQNs.DQN_cnn.dqn_agent import Agent_dqn
8 | from DQNs.DQN_cnn import atari_wappers
9 |
10 |
11 | # def save_films(state,step):
12 | # if not os.path.exists('./image'):
13 | # os.makedirs('./image')
14 | # img_name = './image/pic-%d.jpg' % step
15 | # io.imsave(img_name, state)
16 |
17 |
18 | def random_play():
19 | for step in range(5000):
20 | env.render()
21 | action = 1
22 | state, reward, done, info = env.step(action)
23 |
24 | if step % 100 == 0:
25 | print(state.shape)
26 | # print(state)
27 | save_films(state, step)
28 |
29 | if reward > 0:
30 | print(reward, step)
31 | save_films(state, step)
32 |
33 | if done:
34 | print('dead in %d steps' % step)
35 | break
36 |
37 |
38 | def random_test(env):
39 |     scores = []
40 | scores_window = deque(maxlen=100)
41 |
42 | for i_episode in range(100):
43 | state = env.reset()
44 | score = 0
45 | while True:
46 | action = np.random.choice(env.action_space.n,1)[0]
47 | state, reward, done, info = env.step(action)
48 | score += reward
49 | if done:
50 | break
51 |         scores.append(score)
52 | scores_window.append(score)
53 |
54 | if i_episode % 10 == 0:
55 | print('Episode {},\t Average score : {} '.format(i_episode, np.mean(scores_window)))
56 |
57 |
58 | def trained_agent_test(env,agent):
59 |     scores = []
60 | scores_window = deque(maxlen=100)
61 |
62 | for i_episode in range(5000):
63 | state = env.reset()
64 | score = 0
65 |
66 | while True:
67 |             action = agent.act_greedy_policy(state)  # greedy action from the trained local network
68 | env.render()
69 | state, reward, done, _ = env.step(action)
70 | score += reward
71 | if done:
72 | break
73 |         scores.append(score)
74 | scores_window.append(score)
75 |
76 | if i_episode % 100 == 0:
77 |             print('Episode {},\t Average score : {} '.format(i_episode,np.mean(scores_window)))
78 |
79 |
80 | if __name__ =="__main__":
81 |
82 | env = gym.make('SpaceInvaders-v0')
83 | random_test(env)
84 |
85 | # env = atari_wappers.make_env("SpaceInvaders-v0")
86 | # state_size, action_size = env.observation_space.shape, env.action_space.n
87 | # dqn_agent = Agent_dqn(state_size, action_size)
88 | #
89 | # dqn_agent.qnetwork_local.load_state_dict(torch.load("dqn_model.pth"))
90 | # trained_agent_test(env,dqn_agent)
91 |
92 |
93 |
94 |
--------------------------------------------------------------------------------
/DQNs/DQN_cnn/train_20210401.log:
--------------------------------------------------------------------------------
1 | nohup: ignoring input
2 | ####################################################
3 | Start Training on SpaceInvaders-v0 environment using DQN with CNN
4 | ####################################################
5 |
6 | Training Parameters :
7 | Train episode : 5000
8 | Network update every 5 time step
9 | Replay buffer size : 3000
10 | Batch size : 128
11 | Learning rate : 0.0005
12 | GAMMA : 0.99
13 | Epsilon decay rate : 0.995
14 |
15 |
--------------------------------------------------------------------------------
/Evaluation_Algorithms/CartPole.py:
--------------------------------------------------------------------------------
1 | """
2 | Test the performance of agents trained with several algorithms in the CartPole environment and compare their reward curves.
3 | Algorithms under test: 1. PPO
4 |                        2. DDPG/TD3
5 |                        3. DQN
6 |                        4. A3C/A2C
7 | """
8 |
9 | import torch
10 | import gym
11 | import numpy as np
12 | import pandas as pd
13 | from collections import deque
14 | import matplotlib.pyplot as plt
15 |
16 |
17 | def plot_scores(scores,file_name,multi_time=False):
18 | "绘制多次训练多条曲线"
19 | if multi_time:
20 | x=np.arange(1, len(scores[0]) + 1)
21 | for n in range(len(scores)):
22 | rolling_mean = pd.Series(scores[n]).rolling(100).mean()
23 | plt.plot(x,rolling_mean,label="trial_"+str(n+1))
24 | else:
25 | x = np.arange(1, len(scores) + 1)
26 | rolling_mean = pd.Series(scores).rolling(100).mean()
27 | plt.plot(x, rolling_mean)
28 |
29 | plt.ylabel('Score')
30 | plt.xlabel('Episode #')
31 | plt.legend()
32 | plt.savefig(file_name)
33 | plt.show()
34 |
35 |
36 | def plot_diff_agent(scores_2d,file_name):
37 | " 绘制多种不同agent的训练曲线:多曲线图"
38 | for name,scores in scores_2d:
39 | x = np.arange(1, len(scores) + 1)
40 | rolling_mean = pd.Series(scores).rolling(100).mean()
41 | plt.plot(x, rolling_mean,label=name)
42 | plt.ylabel('Score')
43 | plt.xlabel('Episode #')
44 | plt.legend()
45 | plt.savefig(file_name)
46 | plt.show()
47 |
48 |
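# A minimal, self-contained usage sketch for the plotting helpers above. The random scores
# below are placeholders for the per-episode rewards produced by the training scripts
# elsewhere in this repo, and the output file name is illustrative only.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo_runs = [("PPO", list(rng.integers(0, 200, size=300))),
                 ("DQN", list(rng.integers(0, 200, size=300)))]
    # one smoothed (100-episode rolling mean) curve per agent, all in a single figure
    plot_diff_agent(demo_runs, "cartpole_demo_comparison.png")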
--------------------------------------------------------------------------------
/Games_play_train/atari.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import os
3 | from skimage import io
4 |
5 | env = gym.make('SpaceInvaders-v0')
6 | #env = gym.make("PongDeterministic-v4")
7 | status = env.reset()
8 |
9 |
10 | print('observation space:', env.observation_space)
11 | print('action space:', env.action_space)
12 |
13 |
14 | def save_films(state,step):
15 | if not os.path.exists('./image'):
16 | os.makedirs('./image')
17 | img_name = './image/pic-%d.jpg' % step
18 | io.imsave(img_name, state)
19 |
20 |
21 | for step in range(5000):
22 | env.render()
23 | action =1
24 | state, reward, done, info = env.step(action)
25 |
26 | if step%100 ==0 :
27 | print(state.shape)
28 | # print(state)
29 | save_films(state,step)
30 |
31 | if reward >0:
32 | print(reward,step)
33 | save_films(state,step)
34 |
35 | if done:
36 | print('dead in %d steps' % step)
37 | break
--------------------------------------------------------------------------------
/Policy_Gradient/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/.DS_Store
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/__pycache__/agent_PG.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/__pycache__/agent_PG.cpython-37.pyc
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/__pycache__/model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/__pycache__/model.cpython-37.pyc
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/agent_PG.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from collections import deque
4 | import torch
5 | import torch.optim as optim
6 | from CartPole.Policy_Gradient.model import Policy
7 |
8 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
9 |
10 | GAMMA=1.0
11 | LR=0.001
12 |
13 |
14 | class Agent_PG():
15 |
16 | def __init__(self, state_size, action_size,type):
17 | self.policy=Policy(state_size,action_size).to(device)
18 | self.optimizer=optim.Adam(self.policy.parameters(), lr=LR)
19 | self.type=type
20 |
21 | def reinforce_loss(self,log_probs,rewards):
22 | "------根据 Reinforce 算法计算的损失函数---------"
23 | # calculate discount rewards
24 | discounts=[GAMMA**i for i in range(len(rewards))]
25 | R=sum([g*r for g,r in zip(discounts,rewards)])
26 |
27 | loss_arr=[]
28 | for log_prob in log_probs:
29 | loss_arr.append(-log_prob * R)
30 |
31 |         policy_loss=torch.cat(loss_arr).sum()  # concatenate the list of n 1-d tensors into a single 1-d tensor of size n, then sum
32 | # print(policy_loss)
33 | return policy_loss
34 |
35 | def pg_loss(self,log_probs,rewards):
36 | """----
37 | Reinforce 的改进版本:
38 | 1.Credit Assignment:对每个 a(t) 计算未来累积折扣回报 R
39 | 2.对每个t的回报R进行 batch normalization
40 | ------"""
41 | # calculate the (discounted) future rewards
42 | furRewards_dis = []
43 | for i in range(len(rewards)):
44 | discount = [GAMMA ** i for i in range(len(rewards) - i)]
45 | f_rewards = rewards[i:]
46 | furRewards_dis.append(sum(d * f for d, f in zip(discount, f_rewards)))
47 | # print(furRewards_dis)
48 |
49 | # -- Normalize reward
50 | mean = np.mean(furRewards_dis)
51 | std = np.std(furRewards_dis) + 1.0e-10
52 | rewards_normalized = (furRewards_dis - mean) / std
53 |
54 | # -- calculate policy loss
55 | loss_arr = []
56 | for i in range(len(rewards_normalized)):
57 | loss_arr.append(-log_probs[i]*rewards_normalized[i])
58 | # print(loss_arr)
59 |
60 | policy_loss = torch.cat(loss_arr).sum()
61 | # print(policy_loss,"----------\n")
62 |
63 | return policy_loss
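    # Worked example of the credit-assignment step above (illustrative, using the module
    # default GAMMA = 1.0): for a trajectory with rewards [1, 2, 3] the future returns are
    #     R_0 = 1 + 2 + 3 = 6,   R_1 = 2 + 3 = 5,   R_2 = 3
    # so later actions are only credited with the rewards that follow them. Normalization
    # then rescales [6, 5, 3] to roughly [1.07, 0.27, -1.34] (zero mean, unit std), and these
    # values weight the corresponding -log_prob terms in the loss.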
64 |
65 | def train(self,env):
66 | state = env.reset()
67 | log_probs = []
68 | rewards = []
69 | # --- collect log probs and rewards for a single trajectory
70 | while True:
71 | # convert state to tensor
72 |             state = torch.from_numpy(state).float().unsqueeze(0).to(device)  # add a batch dimension: 1-d state -> 2-d tensor
73 | result_dic = self.policy.act(state)
74 | next_state, reward, done, _ = env.step(result_dic['action'])
75 | log_probs.append(result_dic['log_prob'])
76 | rewards.append(reward)
77 | state = next_state
78 | if done:
79 | break
80 | total_reward = sum(rewards)
81 |
82 |         # --- update policy after one completed trajectory
83 |         # calculate loss according to the configured variant
84 |         if self.type == "pg":
85 |             loss = self.pg_loss(log_probs, rewards)
86 |         else:
87 |             # default: vanilla Reinforce
88 |             loss = self.reinforce_loss(log_probs, rewards)
89 |
90 | # backprop the loss to update policy network
91 | self.optimizer.zero_grad()
92 | loss.backward()
93 | self.optimizer.step()
94 |
95 | return total_reward
96 |
97 |
98 | if __name__=="__main__":
99 | env = gym.make('CartPole-v0')
100 | agent=Agent_PG(state_size=4,action_size=2,type='pg')
101 | n_episode=2000
102 |
103 | scores_deque = deque(maxlen=100)
104 | scores = []
105 | for i_episode in range(1,n_episode+1):
106 | Reward=agent.train(env)
107 |
108 | scores_deque.append(Reward)
109 | scores.append(Reward)
110 | if i_episode % 100 == 0:
111 | print('Episode {}\t Average Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/main_PG.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import gym
3 | import numpy as np
4 | import pandas as pd
5 | from collections import deque
6 | import matplotlib.pyplot as plt
7 | from CartPole.Policy_Gradient.agent_PG import Agent_PG
8 | from CartPole.Policy_Gradient.PPO_with_R import PPO_v1
9 | from CartPole.Policy_Gradient.PPO_with_A import PPO_V2
10 |
11 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
12 | # model_file="models/pg_model_3.pth"
13 | # plot_file="results&plots/pg_3.png"
14 |
15 |
16 | def watch_smart_agent(agent,model_name):
17 | agent.policy.load_state_dict(torch.load(model_name))
18 | state = env.reset()
19 | for t in range(1000):
20 | state = torch.from_numpy(state).float().unsqueeze(0).to(device)
21 |         action = agent.policy.act(state)['action']  # Policy.act returns a dict with 'action', 'log_prob' and 'prob'
22 | env.render()
23 | state, reward, done, _ = env.step(action)
24 | if done:
25 | print("done in time step {}".format(t+1))
26 | break
27 | env.close()
28 |
29 |
30 | def plot_scores(scores,file_name,multi_time=False):
31 | "绘制多次训练多条曲线"
32 | if multi_time:
33 | x=np.arange(1, len(scores[0]) + 1)
34 | for n in range(len(scores)):
35 | rolling_mean = pd.Series(scores[n]).rolling(100).mean()
36 | plt.plot(x,rolling_mean,label="trial_"+str(n+1))
37 | else:
38 | x = np.arange(1, len(scores) + 1)
39 | rolling_mean = pd.Series(scores).rolling(100).mean()
40 | plt.plot(x, rolling_mean)
41 |
42 | plt.ylabel('Score')
43 | plt.xlabel('Episode #')
44 | plt.legend()
45 | plt.savefig(file_name)
46 | plt.show()
47 |
48 |
49 | def plot_diff_agent(scores_2d,file_name):
50 | " 绘制多种不同agent的训练曲线:多曲线图"
51 | for name,scores in scores_2d:
52 | x = np.arange(1, len(scores) + 1)
53 | rolling_mean = pd.Series(scores).rolling(100).mean()
54 | plt.plot(x, rolling_mean,label=name)
55 | plt.ylabel('Score')
56 | plt.xlabel('Episode #')
57 | plt.legend()
58 | plt.savefig(file_name)
59 | plt.show()
60 |
61 |
62 | def agent_test(agent,n_episode,model_name):
63 | agent.policy.load_state_dict(torch.load(model_name))
64 | scores = []
65 | for i_episode in range(1, n_episode + 1):
66 | rewards=[]
67 | state = env.reset()
68 | while True:
69 |             state = torch.from_numpy(state).float().unsqueeze(0).to(device)  # add a batch dimension: 1-d state -> 2-d tensor
70 |             action = agent.policy.act(state)['action']
71 | state, reward, done, _ = env.step(action)
72 | rewards.append(reward)
73 | if done:
74 | break
75 | scores.append(sum(rewards))
76 |
77 | return scores
78 |
79 |
80 | def train_agent(env,agent,n_episode,model_file):
81 | scores_deque = deque(maxlen=100)
82 | scores = []
83 |
84 | for i_episode in range(1, n_episode + 1):
85 | total_reward=agent.train(env)
86 | # record scores(total rewards) per episode
87 | scores_deque.append(total_reward)
88 | scores.append(total_reward)
89 |
90 | print('\r Episode {}\tAverage Score: {:.2f}\tScore: {:.2f}'
91 | .format(i_episode, np.mean(scores_deque), total_reward), end="")
92 | if i_episode % 100 == 0:
93 | print('\n Episode {}\t Average Score: {:.2f}\n'.format(i_episode,np.mean(scores_deque)))
94 | if np.mean(scores_deque) >= 195.0:
95 | print('\n Environment solved in {:d} episodes!\tAverage Score: {:.2f}\n----------\n'.format(i_episode,
96 | np.mean(scores_deque)))
97 | torch.save(agent.policy.state_dict(),model_file)
98 | break
99 |
100 | return scores
101 |
102 |
103 | def train_agent_multi_times(env, agent, n_episode, train_time, file):
104 | " 一个 agent 训练多次并绘制所有的奖励曲线,考察特定 policy gradient 算法的稳定性"
105 | scores_2d = []
106 | for n in range(train_time):
107 | scores = []
108 | for i_episode in range(1, n_episode + 1):
109 | total_reward = agent.train(env)
110 | scores.append(total_reward)
111 |
112 | print('Trial {} finished. \t Avg score for the last 100 episodes: {}'
113 | .format((n + 1), np.mean(scores[-100:])))
114 | scores_2d.append(scores)
115 |
116 | plot_scores(scores_2d, file,multi_time=True)
117 |
118 |
119 | def train_diff_agents(env,agents,n_episode,file):
120 | " 训练多种算法的不同agent, 绘制奖励曲线对比性能 "
121 | scores_2d=[]
122 | for name in agents.keys():
123 | scores = []
124 | for i_episode in range(1, n_episode + 1):
125 | total_reward = agents[name].train(env)
126 | scores.append(total_reward)
127 | scores_2d.append((name,scores))
128 | print('Training agent {} finished. \t Avg score for the last 100 episodes: {}'\
129 | .format(name,np.mean(scores[-100:])))
130 |
131 | plot_diff_agent(scores_2d,file)
132 |
133 |
134 | if __name__=="__main__":
135 | env = gym.make('CartPole-v0')
136 |
137 | agent_pg = Agent_PG(state_size=4,action_size=2,type="pg")
138 | agent_rf=Agent_PG(state_size=4,action_size=2,type="reinforce")
139 | ppo_R=PPO_v1(state_size=4,action_size=2)
140 |
141 | ppo_without_entropy=PPO_V2(state_size=4,action_size=2,add_entropy=False)
142 | ppo_with_entropy=PPO_V2(state_size=4,action_size=2,add_entropy=True)
143 |
144 | #train_scores = train_agent(env, ppo_with_entropy, 2000, 'PGs/models/PPO_new.pth')
145 | #plot_scores(train_scores, 'PGs/results&plots/PPO_with_entropy_1.png')
146 |
147 | # agents={'PPO with R':ppo_R,
148 | # 'PPO with A':ppo_with_entropy,
149 | # 'Policy Gradient':agent_pg,
150 | # 'Reinforce':agent_rf}
151 |
152 | ppo_agents={'PPO_R':ppo_R,'PPO_A_org':ppo_without_entropy,'PPO_A_entropy':ppo_with_entropy}
153 |
154 | train_diff_agents(env, ppo_agents, 1500, '../results&plots/PPO_comparison_4.png')
155 | # train_agent_multi_times(env,ppo_with_entropy,1300,5,'PGs/results&plots/PPO-entropy_5times.png')
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
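Note on the rolling-mean plots produced by plot_scores and plot_diff_agent: pandas' rolling(100).mean() is NaN for the first 99 episodes, so the smoothed curves only start appearing at episode 100. A minimal sketch with made-up scores; the min_periods=1 variant is an assumption of this sketch, not what main_PG.py uses:

import numpy as np
import pandas as pd

scores = np.random.randint(10, 200, size=300).astype(float)         # hypothetical episode scores
default_mean = pd.Series(scores).rolling(100).mean()                # NaN for episodes 1..99
padded_mean = pd.Series(scores).rolling(100, min_periods=1).mean()  # defined from episode 1

print(int(default_mean.isna().sum()))   # 99
print(int(padded_mean.isna().sum()))    # 0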
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.distributions import Categorical
5 | import numpy as np
6 |
7 |
8 | class Policy(nn.Module):
9 | "single Policy network for Reinforce and PG"
10 |
11 | def __init__(self,state_size,action_size):
12 | super(Policy, self).__init__()
13 | self.seed = torch.manual_seed(0)
14 | self.fc1 = nn.Linear(state_size, 24)
15 | self.fc2 = nn.Linear(24, 36)
16 | self.fc3 = nn.Linear(36, action_size)
17 |
18 | def forward(self, x):
19 | """
20 | Build a network that maps state -> action probs.
21 | """
22 |
23 | out=F.relu(self.fc1(x))
24 | out = F.relu(self.fc2(out))
25 | out = F.softmax(self.fc3(out),dim=1)
26 |
27 | return out
28 |
29 | def act(self,state):
30 | # probs for each action (2d tensor)
31 | probs = self.forward(state)
32 | m = Categorical(probs)
33 | action = m.sample()
34 | # return action for current state, and the corresponding probability
35 |
36 | result_dic={"action":action.item(),"log_prob":m.log_prob(action)
37 | ,"prob":probs[:,action.item()].item()}
38 | return result_dic
39 |
40 |
41 | class Actor(nn.Module):
42 | """Policy netwrok for PPO_R"""
43 | "Actor_Critic model for PPO_A"
44 |
45 | def __init__(self,state_size,action_size):
46 | super(Actor, self).__init__()
47 | self.seed = torch.manual_seed(0)
48 | self.fc1 = nn.Linear(state_size, 128)
49 | # self.fc2 = nn.Linear(64,128)
50 | self.fc2= nn.Linear(128, action_size)
51 |
52 | def forward(self, x):
53 | """
54 | Build a network that maps state -> action probs.
55 | """
56 |
57 | x=F.relu(self.fc1(x))
58 | out = F.softmax(self.fc2(x),dim=1)
59 | return out
60 |
61 | def act(self,state):
62 | # probs for each action (2d tensor)
63 | probs = self.forward(state)
64 | m = Categorical(probs)
65 | action = m.sample()
66 | # return action for current state, and the corresponding probability
67 |
68 | result_dic={"action":action.item(),"log_prob":m.log_prob(action)
69 | ,"prob":probs[:,action.item()].item()}
70 | return result_dic
71 |
72 |
73 | class Critic(nn.Module):
74 | " Actor_Critic model for PPO"
75 |
76 | def __init__(self,state_size):
77 | super(Critic, self).__init__()
78 | self.fc1=nn.Linear(state_size,128)
79 | # self.fc2=nn.Linear(64,128)
80 | self.fc2=nn.Linear(128,1)
81 |
82 | def forward(self,x):
83 | x=F.relu(self.fc1(x))
84 | state_value = self.fc2(x)
85 | return state_value
86 |
87 |
88 | if __name__=="__main__":
89 |
90 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
91 | state = np.array([-0.04456399, 0.04653909, 0.01326909, -0.02099827])
92 | state = torch.from_numpy(state).float().unsqueeze(0).to(device)
93 |
94 | policy=Policy(state_size=4,action_size=2).to(device)
95 | result = policy.act(state)
96 | print(result["action"], result["log_prob"])
97 |
98 |
99 |
100 |
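Policy.act returns the sampled action together with its log-probability, which is exactly what a REINFORCE-style update consumes. A minimal sketch of one such update on a single CartPole episode, using the standard estimator -sum_t log pi(a_t|s_t) * G; this is a generic illustration, not necessarily the loss implemented in agent_PG.py, and it assumes the repository root is on PYTHONPATH:

import gym
import torch
import torch.optim as optim
from Policy_Gradient.PGs.model import Policy

env = gym.make('CartPole-v0')
policy = Policy(state_size=4, action_size=2)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

state, log_probs, rewards = env.reset(), [], []
while True:
    s = torch.from_numpy(state).float().unsqueeze(0)
    out = policy.act(s)                    # {"action": int, "log_prob": tensor, "prob": float}
    state, reward, done, _ = env.step(out["action"])
    log_probs.append(out["log_prob"])
    rewards.append(reward)
    if done:
        break

G = sum(rewards)                           # undiscounted return, kept simple for the sketch
loss = -torch.cat(log_probs).sum() * G     # REINFORCE policy-gradient estimator
optimizer.zero_grad()
loss.backward()
optimizer.step()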
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/models/PPO_model-1.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/PPO_model-1.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/models/PPO_new.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/PPO_new.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/models/PPOv2_model-1.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/PPOv2_model-1.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/models/pg_model_1.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/pg_model_1.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/models/pg_model_2.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/pg_model_2.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/models/pg_model_3.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/pg_model_3.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/models/pg_model_4.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/pg_model_4.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/models/reinforce_model_2.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_2.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/models/reinforce_model_3.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_3.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/models/reinforce_model_4.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_4.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/models/reinforce_model_5.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_5.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PGs/models/reinforce_model_6.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PGs/models/reinforce_model_6.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/.DS_Store
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/PPO_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.distributions import Categorical
5 | from torch.distributions import Normal
6 |
7 |
8 | class ActorDiscrete(nn.Module):
9 | """
10 | Policy network for a discrete action space
11 | """
12 | def __init__(self,state_size,action_size):
13 | super(ActorDiscrete, self).__init__()
14 | self.seed = torch.manual_seed(0)
15 | self.fc1 = nn.Linear(state_size, 128)
16 | # self.fc2 = nn.Linear(64,128)
17 | self.fc2= nn.Linear(128, action_size)
18 |
19 | def forward(self, x):
20 | """
21 | Build a network that maps state -> action probs.
22 | """
23 |
24 | x=F.relu(self.fc1(x))
25 | out = F.softmax(self.fc2(x),dim=1)
26 | return out
27 |
28 | def act(self,state):
29 | """
30 | Return the action and its probability
31 | """
32 | # probs for each action (2d tensor)
33 | probs = self.forward(state)
34 | m = Categorical(probs)
35 | action = m.sample()
36 | ## return action for current state, and the corresponding probability
37 | # result_dic={"action":action.item(),"log_prob":m.log_prob(action)
38 | # ,"prob":probs[:,action.item()].item()}
39 |
40 | return action.item(),probs[:,action.item()].item()
41 |
42 |
43 | class ActorContinous(nn.Module):
44 | """
45 | Policy network for a continuous action space
46 | """
47 | def __init__(self,state_size,action_size):
48 | super(ActorContinous, self).__init__()
49 | self.fc1 = nn.Linear(state_size, 128)
50 | self.fc2 = nn.Linear(128,128)
51 | self.mu_head = nn.Linear(128, action_size)
52 | self.sigma_head = nn.Linear(128, action_size)
53 |
54 | def forward(self, x):
55 | x = F.relu(self.fc1(x))
56 | x = F.relu(self.fc2(x))
57 | mu = 2.0 * torch.tanh(self.mu_head(x))
58 | sigma = F.softplus(self.sigma_head(x))
59 | return (mu, sigma)
60 |
61 | def act(self,state):
62 | """
63 | Return the action and its log-probability
64 | """
65 | with torch.no_grad():
66 | (mu, sigma) = self.forward(state) # 2d tensors
67 | dist = Normal(mu, sigma)
68 | action = dist.sample()
69 | action_log_prob = dist.log_prob(action)
70 |
71 | return action.numpy()[0], action_log_prob.numpy()[0]
72 |
73 |
74 | class Critic(nn.Module):
75 | " Actor_Critic model for PPO"
76 |
77 | def __init__(self,state_size):
78 | super(Critic, self).__init__()
79 | self.fc1=nn.Linear(state_size,128)
80 | # self.fc2=nn.Linear(64,128)
81 | self.fc2=nn.Linear(128,1)
82 |
83 | def forward(self,x):
84 | x=F.relu(self.fc1(x))
85 | state_value = self.fc2(x)
86 | return state_value
87 |
88 |
89 | if __name__=="__main__":
90 | pass
91 |
92 |
93 |
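ActorContinous parameterises a diagonal Gaussian policy: the mean is squashed into [-2, 2] by 2.0 * torch.tanh (matching, e.g., Pendulum's action range) and the standard deviation comes from a softplus head, so it stays positive. A quick shape and range check with made-up sizes, assuming the repository root is on PYTHONPATH:

import torch
from Policy_Gradient.PPO.PPO_model import ActorContinous

actor = ActorContinous(state_size=3, action_size=1)
state = torch.randn(1, 3)                  # a single hypothetical state

mu, sigma = actor(state)
print(mu.shape, sigma.shape)               # torch.Size([1, 1]) torch.Size([1, 1])
print(float(mu.abs().max()) <= 2.0)        # True: the mean is bounded by the tanh scaling
print(bool((sigma > 0).all()))             # True: softplus keeps sigma positive

action, log_prob = actor.act(state)        # numpy arrays of shape (1,)
print(action.shape, log_prob.shape)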
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/PPO_v1.py:
--------------------------------------------------------------------------------
1 | """
2 | PPO_V1: the loss is computed directly from discounted cumulative rewards; no critic, only a policy network
3 | """
4 | import numpy as np
5 | import gym
6 | from collections import namedtuple
7 | from collections import deque
8 | import torch
9 | import torch.optim as optim
10 | import torch.nn as nn
11 | from torch.distributions import Normal
12 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
13 | from Policy_Gradient.PPO.PPO_model import ActorContinous,ActorDiscrete,Critic
14 |
15 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
16 | GAMMA=0.99
17 | LR=0.001
18 | BATCH_SIZE=32
19 | CLIP=0.2
20 | UPDATE_TIME=10
21 | max_grad_norm=0.5
22 | Transition = namedtuple('Transition', ['state', 'action', 'prob', 'reward'])
23 |
24 |
25 | class PPO_v1():
26 |
27 | def __init__(self, state_size, action_size,continuous=False):
28 | self.policy = ActorDiscrete(state_size, action_size).to(device)
29 | self.continuous = continuous
30 | if self.continuous:
31 | self.policy = ActorContinous(state_size, action_size).to(device)
32 | self.optimizer=optim.Adam(self.policy.parameters(), lr=LR)
33 | self.trajectory=[]
34 |
35 | def update_policy(self,exps,i_episode):
36 | """
37 | Update the policy for each sampled group of transitions.
38 | Called by learn() multiple times per episode.
39 | """
40 | states,actions,old_probs,f_Rewrds=exps
41 | # get action probs from new policy
42 | if self.continuous:
43 | (mus, sigmas) = self.policy(states)
44 | dists = Normal(mus, sigmas)
45 | new_probs = dists.log_prob(actions)
46 | ratios = torch.exp(new_probs - old_probs)
47 | else:
48 | new_probs = self.policy(states).gather(1, actions)
49 | ratios = new_probs / old_probs
50 |
51 | # calculate clipped surrogate function
52 | surr1 = ratios * f_Rewrds
53 | surr2 = torch.clamp(ratios, 1 - CLIP, 1 + CLIP) * f_Rewrds
54 | policy_loss=-torch.min(surr1,surr2).mean()
55 |
56 | # update policy network
57 | self.optimizer.zero_grad()
58 | policy_loss.backward()
59 | nn.utils.clip_grad_norm_(self.policy.parameters(), max_grad_norm)
60 | self.optimizer.step()
61 |
62 | # self.traintime_counter+=1
63 |
64 | def learn(self,i_episode):
65 | """
66 | The agent learns after finishing each episode,
67 | using the experiences collected in this trajectory.
68 | :return:
69 | """
70 | states=torch.cat([t.state for t in self.trajectory])
71 | actions=torch.tensor([t.action for t in self.trajectory],dtype=torch.long).view(-1,1)
72 | old_probs=torch.tensor([t.prob for t in self.trajectory],dtype=torch.float).view(-1,1)
73 |
74 | # -- calculate discount future rewards for every time step
75 | rewards = [t.reward for t in self.trajectory]
76 | fur_Rewards = []
77 | for i in range(len(rewards)):
78 | discount = [GAMMA ** k for k in range(len(rewards) - i)]
79 | f_rewards = rewards[i:]
80 | fur_Rewards.append(sum(d * f for d, f in zip(discount, f_rewards)))
81 | fur_Rewards=torch.tensor(fur_Rewards,dtype=torch.float).view(-1,1)
82 |
83 | for i in range(UPDATE_TIME):
84 | # -- repeat the following update loop several times
85 | # shuffle the transitions in the trajectory into random mini-batches
86 | for index_set in BatchSampler(SubsetRandomSampler(range(len(self.trajectory))), BATCH_SIZE, False):
87 | exps=(states[index_set],actions[index_set],old_probs[index_set],fur_Rewards[index_set])
88 | # -- update policy network for every sub groups
89 | self.update_policy(exps,i_episode)
90 |
91 | del self.trajectory[:] # clear trajectory
92 |
93 |
94 | def train(self,env,i_episode):
95 | state = env.reset()
96 | total_reward=0
97 | while True:
98 | # self.timesetp_counter+=1
99 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) # expand dims 1d -> 2d
100 | action, prob = self.policy.act(state) # discrete action space: raw prob; continuous: log prob
101 | next_state, reward, done, _ = env.step(action)
102 |
103 | # --store transition in this current trajectory
104 | self.trajectory.append(Transition(state,action,prob,reward))
105 | state=next_state
106 | total_reward+=reward
107 | if done:
108 | break
109 | # -- the agent learns after finishing the current episode, if there are enough transitions
110 | if BATCH_SIZE <= len(self.trajectory):
111 | self.learn(i_episode)
112 |
113 | return total_reward
114 |
115 |
116 |
117 |
118 |
119 |
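The heart of PPO_v1.update_policy is the clipped surrogate objective, L = E[min(r_t * R_t, clip(r_t, 1-CLIP, 1+CLIP) * R_t)], where r_t is the probability ratio and, in this variant, the discounted future reward R_t stands in for the advantage. A toy computation with made-up ratios and rewards shows what the clamp does:

import torch

CLIP = 0.2
ratios = torch.tensor([[0.5], [1.0], [1.6]])      # hypothetical new_prob / old_prob
returns = torch.tensor([[10.0], [10.0], [10.0]])  # hypothetical discounted future rewards

surr1 = ratios * returns
surr2 = torch.clamp(ratios, 1 - CLIP, 1 + CLIP) * returns
loss = -torch.min(surr1, surr2).mean()

print(surr2.squeeze().tolist())   # [8.0, 10.0, 12.0]: ratios clipped to [0.8, 1.2]
print(loss.item())                # -(5.0 + 10.0 + 12.0) / 3 = -9.0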
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/PPO_v2.py:
--------------------------------------------------------------------------------
1 | """
2 | PPO_V2: the loss is computed from the advantage function; includes a critic network
3 | """
4 | import random
5 | import numpy as np
6 | import gym
7 | from collections import namedtuple
8 | from collections import deque
9 | import torch
10 | import torch.optim as optim
11 | from torch.distributions import Normal
12 | import torch.nn.functional as F
13 | import torch.nn as nn
14 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
15 | from Policy_Gradient.PPO.PPO_model import ActorContinous,ActorDiscrete,Critic
16 | from torch.utils.tensorboard import SummaryWriter
17 |
18 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
19 | writer = SummaryWriter('./board/logs')
20 |
21 | GAMMA=0.99
22 | LR_a=0.001
23 | LR_c=0.003
24 | BATCH_SIZE=32
25 | CLIP=0.2
26 | BETA=0.01
27 | UPDATE_TIME=10
28 | max_grad_norm=0.5
29 |
30 | Transition = namedtuple('Transition', ['state', 'action', 'prob', 'reward'])
31 |
32 |
33 | class Memory():
34 | def __init__(self):
35 | self.trajectory=[]
36 | self.Transition = namedtuple('Transition', ['state', 'action', 'prob', 'reward'])
37 |
38 | def add(self,state,action,prob,reward):
39 | # state = torch.from_numpy(state).float().unsqueeze(0).to(device)
40 | self.trajectory.append(self.Transition(state,action,prob,reward))
41 |
42 | def clean_buffer(self):
43 | del self.trajectory[:]
44 |
45 | def get_trajectory(self):
46 | states = torch.cat([t.state for t in self.trajectory])
47 | actions = torch.tensor([t.action for t in self.trajectory], dtype=torch.long).view(-1, 1)
48 | probs = torch.tensor([t.prob for t in self.trajectory], dtype=torch.float).view(-1, 1)
49 | rewards = [t.reward for t in self.trajectory]
50 | return states,actions,probs,rewards
51 |
52 | def __len__(self):
53 | return len(self.trajectory)
54 |
55 |
56 | class PPO_v2():
57 | def __init__(self,state_size, action_size,continuous=False,add_entropy=True):
58 |
59 | self.critic = Critic(state_size)
60 | self.policy = ActorDiscrete(state_size, action_size).to(device)
61 | self.continuous = continuous
62 | if self.continuous:
63 | self.policy = ActorContinous(state_size, action_size).to(device)
64 |
65 | self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=LR_a)
66 | self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=LR_c)
67 |
68 | self.memory = Memory()
69 | self.train_step = 0
70 | self.add_entropy=add_entropy
71 |
72 | def policy_loss(self,states,actions,
73 | old_probs,f_Rewrds,V):
74 |
75 | # get action probs from new policy and calculate the ratio
76 | if self.continuous:
77 | (mus, sigmas) = self.policy(states)
78 | dists = Normal(mus, sigmas)
79 | new_probs = dists.log_prob(actions)
80 | ratios = torch.exp(new_probs - old_probs)
81 | else:
82 | new_probs = self.policy(states).gather(1, actions)
83 | ratios = new_probs / old_probs
84 |
85 | # calculate the advantage using the critic's value estimate
86 | advantage = (f_Rewrds - V).detach()
87 |
88 | # calculate clipped surrogate function
89 | surr1 = ratios * advantage
90 | surr2 = torch.clamp(ratios, 1 - CLIP, 1 + CLIP) * advantage
91 | policy_loss = -torch.min(surr1, surr2)
92 |
93 | if self.add_entropy:
94 | # include an entropy regularization term; this steers new_probs towards 0.5
95 | # add 1.e-10 to avoid log(0), which gives nan
96 | entropy= -(new_probs*torch.log(old_probs+1.e-10)+ (1.0-new_probs)*torch.log(1.0-old_probs+1.e-10))
97 | policy_loss+=BETA*entropy
98 |
99 | policy_loss=torch.mean(policy_loss)
100 |
101 | return policy_loss
102 |
103 | def critic_loss(self,f_Rewrds, V):
104 | return F.mse_loss(f_Rewrds, V)
105 |
106 | def update_policy(self,exps,i_episode):
107 | states, actions, old_probs, f_Rewrds = exps
108 | V = self.critic(states)
109 |
110 | # -- update policy(actor) network -- #
111 | policy_loss = self.policy_loss(states,actions,old_probs,f_Rewrds,V)
112 | # self.writer.add_scalar('loss/policy_loss', policy_loss, global_step=self.train_step)
113 | # update parameters
114 | self.policy_optimizer.zero_grad()
115 | policy_loss.backward()
116 | # nn.utils.clip_grad_norm_(self.policy.parameters(), max_grad_norm)
117 | self.policy_optimizer.step()
118 |
119 | # -- update value(critic) network -- #
120 | value_loss = self.critic_loss(f_Rewrds,V)
121 | # self.writer.add_scalar('loss/value_loss', value_loss, global_step=self.train_step)
122 | self.critic_optimizer.zero_grad()
123 | value_loss.backward()
124 | # nn.utils.clip_grad_norm_(self.critic.parameters(), max_grad_norm)
125 | self.critic_optimizer.step()
126 |
127 | self.train_step+=1
128 | writer.add_scalar('loss/policy_loss',policy_loss.item(),i_episode)
129 | writer.add_scalar('loss/value_loss', value_loss.item(), i_episode)
130 | writer.flush()
131 |
132 | def learn(self,i_episode):
133 | """
134 | The agent learns after finishing each episode,
135 | using the experiences collected in this trajectory.
136 | :return:
137 | """
138 | # states=torch.cat([t.state for t in self.memory.trajectory])
139 | # actions=torch.tensor([t.action for t in self.memory.trajectory],dtype=torch.long).view(-1,1)
140 | # old_probs=torch.tensor([t.prob for t in self.memory.trajectory],dtype=torch.float).view(-1,1)
141 | # rewards = [t.reward for t in self.memory.trajectory]
142 |
143 | states, actions, old_probs, rewards = self.memory.get_trajectory()
144 | # -- calculate discount future rewards for every time step
145 | fur_Rewards = []
146 | for i in range(len(rewards)):
147 | discount = [GAMMA ** k for k in range(len(rewards) - i)]
148 | f_rewards = rewards[i:]
149 | fur_Rewards.append(sum(d * f for d, f in zip(discount, f_rewards)))
150 | fur_Rewards=torch.tensor(fur_Rewards,dtype=torch.float).view(-1,1)
151 |
152 | for i in range(UPDATE_TIME):
153 | # -- repeat the following update loop several times
154 | # shuffle the transitions in the trajectory into random mini-batches
155 | for index_set in BatchSampler(SubsetRandomSampler(range(self.memory.__len__())), BATCH_SIZE, False):
156 | exps=(states[index_set],actions[index_set],old_probs[index_set],fur_Rewards[index_set])
157 | # -- update policy network for every sub groups
158 | self.update_policy(exps,i_episode)
159 |
160 | self.memory.clean_buffer()
161 |
162 | def train(self,env,i_episode):
163 | state = env.reset()
164 | total_reward=0
165 | while True:
166 | # self.timesetp_counter+=1
167 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) # expand dims 1d -> 2d
168 | action,prob = self.policy.act(state) # discrete action space: raw prob; continuous: log prob
169 | next_state, reward, done, _ = env.step(action)
170 | # --store transition in this current trajectory
171 | self.memory.add(state,action,prob,reward)
172 | state=next_state
173 | total_reward+=reward
174 | if done:
175 | break
176 | # -- the agent learns after finishing the current episode, if there are enough transitions
177 | if BATCH_SIZE <= self.memory.__len__():
178 | self.learn(i_episode)
179 |
180 | return total_reward
181 |
182 |
183 |
184 |
185 |
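Both PPO_v1.learn and PPO_v2.learn compute the discounted future reward for every time step with a nested loop, which is quadratic in the episode length. An equivalent linear-time sketch accumulates from the back; it produces the same values and is offered only as a possible simplification, not as what the repository does:

GAMMA = 0.99

def discounted_future_rewards(rewards, gamma=GAMMA):
    """Return [G_0, G_1, ...] with G_t = sum_k gamma**k * r_{t+k}."""
    returns, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    return list(reversed(returns))

print(discounted_future_rewards([1.0, 1.0, 1.0], gamma=0.5))   # [1.75, 1.5, 1.0]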
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/__pycache__/PPO_model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/__pycache__/PPO_model.cpython-37.pyc
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/__pycache__/PPO_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/__pycache__/PPO_model.cpython-38.pyc
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/__pycache__/PPO_v1.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/__pycache__/PPO_v1.cpython-38.pyc
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/__pycache__/PPO_v2.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/__pycache__/PPO_v2.cpython-38.pyc
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/board/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/.DS_Store
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/board/logs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/.DS_Store
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608693869.bogon.80327.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608693869.bogon.80327.0
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608694041.bogon.80355.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608694041.bogon.80355.0
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608778854.bogon.82580.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608778854.bogon.82580.0
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779119.bogon.82611.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779119.bogon.82611.0
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779166.bogon.82627.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779166.bogon.82627.0
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779638.bogon.82655.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779638.bogon.82655.0
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779657.bogon.82666.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608779657.bogon.82666.0
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608780330.bogon.82692.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608780330.bogon.82692.0
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608780689.bogon.82718.0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/board/logs/events.out.tfevents.1608780689.bogon.82718.0
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/cartPole_ppo-v1_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/cartPole_ppo-v1_1.png
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/main_PPO.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import arrow
3 | import gym
4 | import numpy as np
5 | import pandas as pd
6 | from collections import deque
7 | import matplotlib.pyplot as plt
8 | from Policy_Gradient.PPO.PPO_v2 import PPO_v2
9 | from Policy_Gradient.PPO.PPO_v1 import PPO_v1
10 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
11 |
12 |
13 | def output_scores(start_time,i_episode,scores_deque,score,solve_limit):
14 | print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'
15 | .format(i_episode, np.mean(scores_deque), score), end="")
16 | if i_episode % 100 == 0:
17 | print('\rEpisode {}\tAverage Score: {:.2f}\t Running time til now :{}'
18 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time))
19 | if np.mean(scores_deque) >= solve_limit:
20 | print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}\t Total running time: {}'
21 | .format(i_episode, np.mean(scores_deque),arrow.now()-start_time))
22 | return True
23 |
24 | return False
25 |
26 |
27 | def plot_scores(scores,filename):
28 | plt.plot(np.arange(1, len(scores) + 1), scores)
29 | plt.ylabel('Score')
30 | plt.xlabel('Episode #')
31 | plt.savefig(filename)
32 | plt.show()
33 |
34 |
35 | def get_env_prop(env_name, continuous):
36 | env = gym.make(env_name)
37 | state_dim = env.observation_space.shape[0]
38 | if continuous:
39 | action_dim = env.action_space.shape[0]
40 | else:
41 | action_dim = env.action_space.n
42 |
43 | return env,state_dim, action_dim
44 |
45 |
46 | def train_agent_for_env(env_name,continuous,n_episode,model_file,solve_limit):
47 | """
48 | continuous: whether the action space is continuous (True/False)
49 | model_file: path where the trained model is saved
50 | solve_limit: score threshold at which the environment counts as solved
51 | """
52 | env, state_dim, action_dim = get_env_prop(env_name,continuous)
53 |
54 | agent = PPO_v1(state_dim,action_dim,continuous)
55 | scores_deque = deque(maxlen=100)
56 | scores = []
57 |
58 | start_time = arrow.now()
59 | for i_episode in range(1, n_episode + 1):
60 | total_reward = agent.train(env,i_episode)
61 | # record scores(total rewards) per episode
62 | scores_deque.append(total_reward)
63 | scores.append(total_reward)
64 | solved = output_scores(start_time, i_episode, scores_deque, total_reward,solve_limit)
65 | if solved:
66 | torch.save(agent.policy.state_dict(), model_file)
67 | break
68 |
69 | return agent, scores
70 |
71 |
72 | def watch_random_agent(env_name,continuous):
73 | env, state_dim, action_dim = get_env_prop(env_name, continuous)
74 | for _ in range(5):
75 | env.reset()
76 | while True:
77 | env.render()
78 | next_state, reward, done, _ =env.step(env.action_space.sample())
79 | if done:
80 | break
81 |
82 | env.close()
83 |
84 |
85 | def watch_smart_agent(env_name,continuous,model_name,n_episode):
86 | env,state_dim, action_dim = get_env_prop(env_name,continuous)
87 | agent=PPO_v1(state_dim,action_dim,continuous)
88 | agent.policy.load_state_dict(torch.load(model_name))
89 |
90 | scores =[]
91 | for i_episode in range(1, n_episode + 1):
92 | rewards = []
93 | state = env.reset()
94 | while True:
95 | state = torch.from_numpy(state).float().unsqueeze(0).to(device)
96 | action, _ = agent.policy.act(state)
97 | env.render()
98 | state, reward, done, _ = env.step(action)
99 | rewards.append(reward)
100 | if done:
101 | break
102 | scores.append(sum(rewards))
103 | return scores
104 |
105 |
106 | if __name__=="__main__":
107 | """train PPO agent in CartPole (discrete action space)"""
108 | # agent_cartPole,scores_1 = train_agent_for_env('CartPole-v0',False,2000,
109 | # 'models/cartPole_ppo-v1_1.pth',195)
110 | # plot_scores(scores_1,'cartPole_ppo-v1_1.png')
111 |
112 | # watch an untrained random agent
113 | # watch_random_agent('CartPole-v0',False)
114 | # test the trained agent
115 | # test_scores=watch_smart_agent('CartPole-v0',False,'models/PPO_new.pth',100)
116 | # plot_scores(test_scores,"PPO_cartPole_test.png")
117 |
118 | """train PPO agent in MountainCarContinuous (continuous action space)"""
119 | agent_mCar, scores_2 = train_agent_for_env('MountainCarContinuous-v0', True, 2000,
120 | 'models/mCar_ppo-v1.pth',95)
121 | plot_scores(scores_2, 'mCar_ppo-v1_1.png')
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
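watch_smart_agent reloads a checkpoint written by train_agent_for_env; if the checkpoint was saved on a GPU machine and is reloaded on a CPU-only one, torch.load needs map_location. A minimal sketch, assuming the script is run from Policy_Gradient/PPO so that the committed models/cartPole_ppo.pth path resolves:

import torch
from Policy_Gradient.PPO.PPO_v1 import PPO_v1

agent = PPO_v1(state_size=4, action_size=2, continuous=False)   # CartPole-v0 dimensions
state_dict = torch.load('models/cartPole_ppo.pth', map_location=torch.device('cpu'))
agent.policy.load_state_dict(state_dict)
agent.policy.eval()   # inference only; act() still samples from the Categorical distribution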
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/models/PPO_new.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/models/PPO_new.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/models/cartPole_ppo.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/models/cartPole_ppo.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PPO/models/cartPole_ppo_20201222.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/PPO/models/cartPole_ppo_20201222.pth
--------------------------------------------------------------------------------
/Policy_Gradient/PPO_cnn/cnn_ppo.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 |
6 |
7 | class ActorCritic(torch.nn.Module):
8 | def __init__(self, input_shape, output_shape):
9 | super(ActorCritic, self).__init__()
10 | self.conv1 = nn.Conv2d(input_shape, 32, 3, stride=2, padding=1)
11 | self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
12 | self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
13 | self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
14 |
15 | self.critic_linear = nn.Linear(256, 1)
16 | self.actor_linear = nn.Linear(256, output_shape)
17 |
18 |
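cnn_ppo.py stops after declaring its layers: there is no forward pass yet, and the 256-unit heads imply that the flattened convolutional features are first mapped to a 256-dimensional vector. A possible completion under that assumption; the fc projection and the 4x84x84 input size are hypothetical, not taken from the repository:

import torch
import torch.nn as nn
import torch.nn.functional as F

class ActorCriticSketch(nn.Module):
    """Hypothetical completion: conv stack -> 256-d feature -> actor and critic heads."""
    def __init__(self, input_shape, output_shape, conv_out=32 * 6 * 6):  # 6x6 assumes 84x84 frames
        super().__init__()
        self.conv1 = nn.Conv2d(input_shape, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.fc = nn.Linear(conv_out, 256)          # hypothetical projection to 256 features
        self.critic_linear = nn.Linear(256, 1)
        self.actor_linear = nn.Linear(256, output_shape)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.relu(self.fc(x.flatten(start_dim=1)))
        return self.actor_linear(x), self.critic_linear(x)   # action logits, state value

logits, value = ActorCriticSketch(4, 6)(torch.zeros(1, 4, 84, 84))  # hypothetical 4x84x84 input
print(logits.shape, value.shape)   # torch.Size([1, 6]) torch.Size([1, 1])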
--------------------------------------------------------------------------------
/Policy_Gradient/envTest.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from collections import deque
4 | import matplotlib.pyplot as plt
5 |
6 | env = gym.make('CartPole-v0')
7 | env.seed(0)
8 |
9 | print('observation space:', env.observation_space)
10 | print('action space:', env.action_space)
11 | print('action space size:', env.action_space.n)
12 |
13 | # watch an untrained random agent
14 | state = env.reset()
15 | print(state)
16 |
17 | done=False
18 | for _ in range(5000):
19 | env.render()
20 | if not done:
21 | next_state, reward, done, _ =env.step(env.action_space.sample())
22 | print(next_state, reward)
23 | else:
24 | break
25 |
26 | env.close()
27 |
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/.DS_Store
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO-A vs. PPO-R.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A vs. PPO-R.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO-A vs.PPO-R_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A vs.PPO-R_1.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO-A_train_5_times.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A_train_5_times.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO-A_train_5_times1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A_train_5_times1.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO-A_train_5times_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-A_train_5times_2.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO-entropy_5times.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO-entropy_5times.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO_cartPole_20201222.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_cartPole_20201222.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO_cartPole_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_cartPole_test.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO_cartPole_train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_cartPole_train.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO_comparison_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison_1.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO_comparison_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison_2.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO_comparison_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison_3.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO_comparison_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_comparison_4.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO_multiple_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_multiple_1.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO_with_entropy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_with_entropy.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/PPO_with_entropy_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/PPO_with_entropy_1.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/cartpole_reinforce.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/cartpole_reinforce.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/diff_algorithm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/diff_algorithm.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/pg_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/pg_1.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/pg_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/pg_2.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/pg_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/pg_3.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/reinforce_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_1.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/reinforce_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_2.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/reinforce_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_3.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/reinforce_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_4.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/reinforce_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_5.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/reinforce_vs_pg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/reinforce_vs_pg.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/rf-vs-pg_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/rf-vs-pg_1.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/rf-vs-pg_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/rf-vs-pg_2.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/rf-vs-pg_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/rf-vs-pg_3.png
--------------------------------------------------------------------------------
/Policy_Gradient/results&plots/rf-vs-pg_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantum-Cheese/DeepReinforcementLearning_Pytorch/3a80a41ed1f33ad82807496d43947fc9d670184b/Policy_Gradient/results&plots/rf-vs-pg_4.png
--------------------------------------------------------------------------------