├── .DS_Store
├── .gitattributes
├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── Torch-rl.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── other.xml
│   └── vcs.xml
├── LICENSE
├── README.md
└── Torch_rl
    ├── .DS_Store
    ├── Hierarchical_RL
    │   ├── HIRO.py
    │   └── __init__.py
    ├── ImitationLearning
    │   ├── Behavior_Clone.py
    │   ├── GAIL.py
    │   ├── __init__.py
    │   └── core_IL.py
    ├── __init__.py
    ├── agent
    │   ├── .DS_Store
    │   ├── A3C.py
    │   ├── DDPG.py
    │   ├── DDPG_2.py
    │   ├── DQN.py
    │   ├── DRQN.py
    │   ├── PPO.py
    │   ├── SAC.py
    │   ├── TD3.py
    │   ├── TRPO.py
    │   ├── core_policy.py
    │   └── core_value.py
    ├── algorithm
    │   ├── PPO_LSTM.py
    │   ├── PPO_Lagrangian.py
    │   ├── SPPO.py
    │   └── __init__.py
    ├── common
    │   ├── Policy_for_DQN.py
    │   ├── distribution.py
    │   ├── logger.py
    │   ├── loss.py
    │   ├── memory.py
    │   └── util.py
    ├── example
    │   ├── agent_example
    │   │   ├── RUN_Catrpole_with_DQN.py
    │   │   ├── RUN_Pendulum_with_PPO.py
    │   │   ├── RUN_Pendulum_with_TD3.py
    │   │   ├── RUN_mountaincar_with_DQN.py
    │   │   └── __init__.py
    │   └── algorithm_example
    │       ├── RUN_Pendulum_with_PPO_LSTM.py
    │       ├── RUN_Pendulum_with_PPO_largrangian.py
    │       └── __init__.py
    ├── model
    │   ├── GNN_layer.py
    │   ├── GNN_network.py
    │   ├── Network.py
    │   └── special_model.py
    ├── temp_file
    │   ├── PPO.py
    │   ├── PPO2.py
    │   └── __init__.py
    └── test_file
        ├── __init__.py
        ├── run_DP.py
        ├── run_HIRO.py
        ├── run_dp_dqn.py
        ├── testbackward.py
        ├── testtt.py
        └── testttttt.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/.DS_Store
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | *.pyc
3 | savedate/.DS_Store
4 | *.txt
5 | *.csv
6 | *.iml
7 | *.xml
8 | .idea/Torch-rl.iml
9 | .idea/misc.xml
10 | *.xml
11 | *.iml
12 | *.iml
13 | *.xml
14 | Torch_rl/agent/.DS_Store
15 | *.pkl
16 | *.0
17 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Default ignored files
3 | /workspace.xml
--------------------------------------------------------------------------------
/.idea/Torch-rl.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/other.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Zee-MAC
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Torch-rl
2 | ## introduction
3 | PyTorch and TensorFlow are the two dominant deep-learning frameworks, and excellent RL code already exists on both sides. On the PyTorch side, however, few RL implementations manage to combine a clean framework with ease of use.
4 | This project borrows the framework design of Keras-RL and the implementation ideas of baselines to build a set of RL implementations based on Torch.
5 |
6 | The aim is to build a Torch DRL framework in the simplest, fastest, and most practical way, so that it can be used directly and save learning time. Everyone is welcome to join and build it together.
7 |
8 | **This repository supports both CPU and GPU; MPI is not implemented yet.** The algorithms are written in (loosely) PEP 8 style and are commented.
9 |
10 | ## Repository structure
11 | + agent
12 | contains the agent core (the interaction process with the environment) and all the RL algorithms
13 | + common
14 | contains the logger, loss functions, replay memory, and DQN policies
15 | + model
16 | contains all the deep-network implementations
17 | + savedata
18 | stores the training results
19 | run_xxxxxxxxxxx scripts are training examples.
20 |
21 | ## Current progress
22 | The interaction and algorithm framework is built on Keras-RL, and the logger is adapted from the baselines logger, so the training process can be monitored directly through txt, CSV, and TensorBoard output; a minimal sketch of the logging setup is shown below.
23 |
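As a rough sketch of how that logging is wired (following the pattern in `Torch_rl/ImitationLearning/core_IL.py`; the output directory and metric names below are only placeholders):

```python
from Torch_rl.common import logger

# choose the output backends, as core_IL.py does
configlist = ["stdout", "log", "tensorboard", "csv"]
logger.configure("./savedata/", configlist)

# inside a training loop, scalar metrics are pushed with record_tabular
# (the same call used in ImitationLearning/Behavior_Clone.py)
logger.record_tabular("steps", 100)
logger.record_tabular("reward", 1.0)
```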
24 | #### Algorithms:
25 |
26 | + DQN (including Double DQN and Dueling DQN) [source code](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/agent/DQN.py)
27 |
28 | + DRQN [source code](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/agent/DRQN.py)
29 |
30 | + DDPG [source code](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/agent/DDPG.py)
31 |
32 | + PPO [source code ](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/agent/PPO3.py)
33 |
34 | + Batch-PPO [source code ](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/agent/PPO.py)
35 |
36 | + TD3 [source code ](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/agent/TD3.py)
37 |
38 | #### Networks:
39 |
40 | Fully connected networks, CNNs, LSTMs, and CNN-LSTMs can be built quickly; a rough usage sketch is given below.
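A rough usage sketch (the `DenseNet` constructor arguments and the `BoltzmannQPolicy` instantiation below are assumptions; the agent keyword arguments follow the signature in `Torch_rl/agent/DQN.py`):

```python
import gym
from Torch_rl import DQN, DenseNet  # re-exported in Torch_rl/__init__.py
from Torch_rl.common.Policy_for_DQN import BoltzmannQPolicy

env = gym.make("CartPole-v0")

# NOTE: assumed constructor arguments; see Torch_rl/model/Network.py for the real signature
model = DenseNet(env.observation_space.shape[0], env.action_space.n)

# keyword arguments mirror Torch_rl/agent/DQN.py; the policy construction is an assumption
agent = DQN(env, model, BoltzmannQPolicy(),
            gamma=0.90, lr=1e-3, batch_size=32,
            buffer_size=50000, learning_starts=1000,
            double_dqn=True, dueling_dqn=False)

# the training loop itself is provided by Agent_value_based in agent/core_value.py
# (not shown inline in this dump); see the scripts under Torch_rl/example/agent_example/
```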
41 |
42 |
43 |
44 |
45 |
46 | ## example
47 |
48 | There are some simple training examples:
49 |
50 | [RUN_Catrpole_with_DQN.py](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/RUN_Catrpole_with_DQN.py)
51 |
52 | [RUN_Pendulum_with_DDPG.py](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/RUN_Pendulum_with_DDPG.py)
53 |
54 | [RUN_Pendulum_with_PPO.py](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/RUN_Pendulum_with_PPO.py)
55 |
56 | ## Tutorial: to be further updated......
57 |
58 |
59 |
--------------------------------------------------------------------------------
/Torch_rl/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/.DS_Store
--------------------------------------------------------------------------------
/Torch_rl/Hierarchical_RL/HIRO.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from Torch_rl.common.memory import ReplayMemory
4 | from Torch_rl.agent.core_value import Agent_value_based
5 | from copy import deepcopy
6 | from torch.optim import Adam
7 | from torch import nn
8 | import torch.nn.functional as F
9 | from Torch_rl.common.loss import huber_loss
10 | from torch.autograd import Variable
11 | from gym import spaces as Space
12 | from Torch_rl.common.Policy_for_DQN import BoltzmannQPolicy
13 |
14 | class HIRO_Agent(Agent_value_based):
15 | def __init__(self, env,
16 | H_policy, H_model, L_policy, L_model,
17 | goal = Space.Box(low=-1, high=1, shape=(1,), dtype=np.float32),
18 | # step for H_model
19 | step_interval = 10, H_train_interval=100, H_train_time = 100,
20 | ## hyper-parameter
21 | gamma=0.90, H_lr=1e-3, L_lr = 1e-3, batch_size=32, buffer_size=50000, learning_starts=1000,
22 | H_target_network_update_freq=500, L_target_network_update_freq=500,
23 | decay=False, decay_rate=0.9,
24 | ## prioritized_replay
25 | ##
26 | path=None):
27 | """
28 |
29 | :param env:
30 | :param H_policy:
31 | :param H_model:
32 | :param L_policy:
33 | :param L_model:
34 | :param goal:
35 | :param step_interval:
36 | :param gamma:
37 | :param H_lr:
38 | :param L_lr:
39 | :param batch_size:
40 | :param buffer_size:
41 | :param learning_starts:
42 | :param H_target_network_update_freq:
43 | :param L_target_network_update_freq:
44 | :param decay:
45 | :param decay_rate:
46 | :param path:
47 | """
48 |
49 | self.env = env
50 | self.gamma = gamma
51 | self.batch_size = batch_size
52 | self.learning_starts = learning_starts
53 | self.step_interval = step_interval
54 |
55 | # self.replay_buffer = ReplayMemory(buffer_size)
56 | # generate policy
57 | if H_policy == "DDPG" and isinstance(goal, Space.Box) and len(H_model) == 2:
58 | from Torch_rl.agent.DDPG import DDPG_Agent
59 | if isinstance(H_lr,list):
60 | ac_lr = H_lr[0]
61 | cr_lr = H_lr[1]
62 | else:
63 | ac_lr = H_lr
64 | cr_lr = H_lr
65 | if isinstance(H_target_network_update_freq,list):
66 | actor_target_network_update_freq = H_target_network_update_freq[0]
67 | critic_target_network_update_freq = H_target_network_update_freq[1]
68 | else:
69 | actor_target_network_update_freq = H_target_network_update_freq
70 | critic_target_network_update_freq = H_target_network_update_freq
71 | self.H_agent = DDPG_Agent(env, H_model[0], H_model[1],
72 | actor_lr=ac_lr, critic_lr=cr_lr,
73 | actor_target_network_update_freq=actor_target_network_update_freq,
74 | critic_target_network_update_freq=critic_target_network_update_freq,
75 | ## hyper-parameter
76 | gamma=gamma, batch_size=batch_size, buffer_size=buffer_size, learning_starts=learning_starts,
77 | ## decay
78 | decay=decay, decay_rate=decay_rate,
79 | )
80 | self.H_main_net = self.H_agent.actor
81 |
82 | if H_policy == "PPO" and isinstance(goal, Space.Box):
83 | from Torch_rl.agent.PPO import PPO_Agent
84 | self.H_agent = PPO_Agent()
85 |
86 | if L_policy == "DQN":
87 | from Torch_rl.agent.DQN import DQN_Agent
88 | self.L_agent = DQN_Agent(env, L_model, BoltzmannQPolicy,
89 | gamma=gamma, lr=L_lr, batch_size=batch_size, buffer_size=buffer_size, learning_starts=learning_starts,
90 | target_network_update_freq=L_target_network_update_freq,
91 | decay=decay, decay_rate=decay_rate,
92 | double_dqn=True, dueling_dqn=False, dueling_way="native")
93 | self.L_main_net = self.L_agent.Q_net
94 |
95 | def forward(self, observation):
96 | observation = observation.astype(np.float32)
97 | observation = torch.from_numpy(observation)
98 | if self.step % self.step_interval == 0:
99 | goal = self.H_agent.forward(observation)
100 | if isinstance(goal,tuple):
101 | self.goal, Q = goal[0], goal[1]
102 | else:
103 | self.goal = goal
104 | L_observation = torch.cat((observation, self.goal), dim=0)
105 | action = self.L_agent.forward(L_observation)
106 | if isinstance(action, tuple):
107 | action, Q = action[0], action[1]
108 | else:
109 | action = action
110 |
111 |
112 |
113 | return action
114 |
115 | def backward(self, sample_):
116 | self.L_agent.backward(sample_)
117 | if self.step % self.step_interval == 0:
118 | self.L_agent.replay_buffer.sample(self.batch_size)
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 | def load_weights(self, filepath):
130 | pass
131 |
132 | def save_weights(self, filepath, overwrite=False):
133 | pass
134 |
135 |
--------------------------------------------------------------------------------
/Torch_rl/Hierarchical_RL/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/Hierarchical_RL/__init__.py
--------------------------------------------------------------------------------
/Torch_rl/ImitationLearning/Behavior_Clone.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from Torch_rl.ImitationLearning.core_IL import Agent_IL
4 | from copy import deepcopy
5 | from torch import nn
6 | from Torch_rl.common import logger
7 | from Torch_rl.common.memory import ReplayMemory
8 | from torch.optim import Adam
9 |
10 | class BC_Agent(Agent_IL):
11 |
12 | def __init__(self, env, base_algorithm, policy_network, value_network = None,
13 | batch_size=32, lr=1e-4,
14 | path=None):
15 | self.env = env
16 | self.base_algorithm = base_algorithm
17 | self.policy_network = policy_network
18 | self.value_network = value_network
19 | self.batch_size = batch_size
20 |
21 | self.loss_cal = nn.MSELoss()
22 | self.policy_model_optim = Adam(self.policy_network.parameters(), lr=lr)
23 | if self.value_network is not None:
24 | self.value_model_optim = Adam(self.value_network.parameters(), lr=lr)
25 |
26 | super(BC_Agent, self).__init__(path)
27 |
28 | def training_with_data(self, expert_data, max_imitation_learning_step, training_ways):
29 |
30 | self.step = 0
31 |
32 | while self.step < max_imitation_learning_step:
33 | if training_ways == "random":
34 | samples = expert_data.sample(self.batch_size)
35 | elif training_ways == "episode":
36 | samples = expert_data.sample_episode()
37 | elif training_ways == "fragment":
38 | samples = expert_data.sample_fragment(self.batch_size)
39 |
40 | actions = self.policy_network.forward(samples["s"])
41 | loss = self.loss_cal(actions, samples["a"])
42 | self.policy_model_optim.zero_grad()
43 | loss.backward()
44 | self.policy_model_optim.step()
45 |
46 | def training_with_policy(self, expert_policy, max_imitation_learning_step=1e5,
47 | max_ep_cycle=2000, buffer_size=32):
48 | self.step = 0
49 | s = self.env.reset()
50 | loss_BC = 0
51 | ep_step, ep_reward, ep_loss = 0, 0, 0
52 | expert_action_set,policy_action_set = [],[]
53 |
54 | for _ in range(max_imitation_learning_step):
55 | self.step += 1
56 | ep_step += 1
57 | a_expert = expert_policy(s)
58 | a_policy = self.policy_network.forward(s)
59 |
60 | expert_action_set.append(torch.tensor(a_expert))
61 | policy_action_set.append(a_policy)
62 | s_, r, done, info = self.env.step(a_policy)
63 | ep_reward += r
64 | sample = {"s": s, "a": a_policy, "a_expert":a_expert, "s_": s_, "r": r, "tr": done}
65 | s = s_[:]
66 |
67 | if len(policy_action_set) > buffer_size:
68 |
69 | loss = self.loss_cal(torch.stack(policy_action_set), torch.stack(expert_action_set))
70 | ep_loss += loss.cpu().detach().numpy()
71 | self.policy_model_optim.zero_grad()
72 | loss.backward()
73 | self.policy_model_optim.step()
74 |
75 | if done or ep_step>max_ep_cycle:
76 | ep_step = 0
77 | logger.record_tabular("steps", self.step)
78 | logger.record_tabular("loss", ep_loss)
79 | logger.record_tabular("reward", ep_reward)
80 |
81 |
82 |
83 |
84 |
85 |
86 |
--------------------------------------------------------------------------------
/Torch_rl/ImitationLearning/GAIL.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from Torch_rl.ImitationLearning.core_IL import Agent_IL
4 | from copy import deepcopy
5 | from torch.optim import Adam
6 | from torch import nn
7 | from Torch_rl.common.memory import ReplayMemory
8 | from types import MethodType,FunctionType
9 |
10 |
11 | class GAIL_Agent(Agent_IL):
12 | def __init__(self, env, base_algorithm, adversary_model, policy_network, value_network = None,
13 | Adversary_lr=1e-4, ent_coeff = 1e-3, batch_size=32,
14 | ##
15 | path=None):
16 |
17 | self.env = env
18 |
19 | self.policy_network = policy_network
20 | self.value_network = value_network
21 | self.dist = base_algorithm.dist
22 | self.base_algorithm = base_algorithm
23 | self.adversary_model = adversary_model
24 | self.adversary_model_optim = Adam(self.adversary_model.parameters(), lr=Adversary_lr)
25 |
26 | self.entcoeff = ent_coeff
27 | self.batch_size = batch_size
28 | self.loss_calculator = nn.CrossEntropyLoss()
29 |
30 |
31 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
32 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]
33 |
34 | def training_with_data(self, expert_data, max_imitation_learning_episode, training_ways):
35 |
36 | self.episode = 0
37 |
38 | while self.episode < max_imitation_learning_episode:
39 | if training_ways == "random":
40 | samples = expert_data.sample(self.batch_size)
41 | elif training_ways == "episode":
42 | samples = expert_data.sample_episode()
43 | elif training_ways == "fragment":
44 | samples = expert_data.sample_fragment(self.batch_size)
45 | self.episode +=1
46 | expert_action = samples["a"]
47 | generator_action = self.policy_network.forward(samples["s"])
48 | if self.value_network is not None:
49 | Q = self.value_network.forward(samples["s"])
50 | if self.gpu:
51 | expert_action.cuda()
52 | generator_action.cuda()
53 | for key in samples.keys():
54 | samples[key] = samples[key].cuda()
55 |
56 | IL_reward = self.Discriminator_training(samples, expert_action, generator_action)
57 | # for flag,rew in enumerate(IL_reward):
58 | # sample_new = {"s": samples["s"][flag], "a": generator_action, "s_": samples["s_"][flag], "r": rew, "tr": samples["tr"][flag]}
59 | samples["r"] = IL_reward
60 | samples["value"] = Q
61 | samples["logp"] = -1.9189 * np.ones_like(IL_reward)
62 |
63 | self.base_algorithm.backward(samples)
64 |
65 |
66 |
67 | def training_with_policy(self, expert_policy, max_imitation_learning_step):
68 |
69 | self.step = 0
70 | s = self.env.reset()
71 | buffer = ReplayMemory(self.batch_size, ["value", "logp"])
72 | expert_action_set,generator_action_set=[],[]
73 | while self.step < max_imitation_learning_step:
74 | expert_action = expert_policy(s)
75 | generator_action = self.policy_network.forward(s)
76 | s_, r, done, info = self.env.step(generator_action.cpu().squeeze(0).numpy())
77 | Q = self.value_network.forward(s)
78 | IL_reward = self.Discriminator_training(s, expert_action, generator_action)
79 | sample_ = {
80 | "s": s,
81 | "a": generator_action.squeeze(0),
82 | "r": IL_reward,
83 | "tr": torch.tensor([int(done)]),
84 | "s_":torch.from_numpy(s_),
85 | "logp": -1.9189,
86 | "value": Q}
87 |
88 | buffer.push(sample_)
89 | # expert_action_set.append(expert_action)
90 | # generator_action_set.append(generator_action)
91 |
92 | if self.step % self.batch_size==0 and self.step>1:
93 | self.base_algorithm.update(buffer.memory)
94 |
95 |
96 |
97 |
98 | def Discriminator_training(self,sample, expert_action, generator_action):
99 | expert_input = torch.cat((sample["s"],expert_action), dim=1)
100 | advertise_judgement = self.adversary_model.forward(expert_input)
101 | expert_acc = self.loss_calculator(advertise_judgement, torch.ones_like(advertise_judgement))
102 |
103 | generator_input = torch.cat((sample["s"], generator_action), dim=1)
104 | generator_judgement = self.adversary_model.forward(generator_input)
105 | generator_acc = self.loss_calculator(generator_judgement, torch.zeros_like(generator_judgement))
106 |
107 | logits = torch.cat([advertise_judgement, generator_judgement], dim=1)
108 | entropy = (- logits*torch.log(logits) - (1-logits)*torch.log(1-logits)).mean()
109 |
110 | entropy_loss = - self.entcoeff * entropy
111 | total_loss = expert_acc + generator_acc + entropy_loss
112 | self.adversary_model_optim.zero_grad()
113 | total_loss.backward()
114 | self.adversary_model_optim.step()
115 | IL_reward = -torch.log(1 - generator_judgement + 1e-8)
116 |
117 | return IL_reward
118 |
119 |
120 |
121 | def cuda(self):
122 | self.policy_network.to_gpu()
123 | self.value_network.to_gpu()
124 | self.adversary_model.to_gpu()
125 | self.loss_calculator = self.loss_calculator.cuda()
126 | self.gpu = True
127 |
128 |
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/Torch_rl/ImitationLearning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/ImitationLearning/__init__.py
--------------------------------------------------------------------------------
/Torch_rl/ImitationLearning/core_IL.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from abc import ABC
3 | import numpy as np
4 | import torch
5 | from Torch_rl.common import logger
6 | from Torch_rl.common.logger import CSVOutputFormat
7 |
8 |
9 | class Agent_IL(ABC):
10 |
11 | """
12 | Abstract base class for all implemented imitation_learning algorithms.
13 |
14 | the class contains the following methods
15 |
16 | we define the episode as the agent finished the training
17 | and the step as the agent interact with the env once
18 | """
19 |
20 | def __init__(self, path):
21 | self.step = 0
22 | self.episode = 0
23 | """
24 | config the logfile
25 | """
26 | configlist = ["stdout", "log", 'tensorboard', "csv"]
27 | if path is None:
28 | path = "./"
29 | logger.configure(path, configlist)
30 | self.csvwritter = CSVOutputFormat(path + "record_trajectory.csv")
31 | loggerCEN = logger.get_current().output_formats[configlist.index('tensorboard')]
32 | self.writer = loggerCEN.writer
33 | self.path = path
34 |
35 | def training_with_data(self,expert_policy, max_imitation_learning_step=1e5,
36 | max_ep_cycle=2000, buffer_size=32, learning_start = 1000):
37 | raise NotImplementedError()
38 |
39 | def training_with_policy(self):
40 | raise NotImplementedError()
41 |
42 |
43 |
--------------------------------------------------------------------------------
/Torch_rl/__init__.py:
--------------------------------------------------------------------------------
1 | from .common import logger
2 | # agent
3 | from Torch_rl.agent.DQN import DQN_Agent as DQN
4 | from Torch_rl.agent.DRQN import DRQN_Agent as DRQN
5 | from Torch_rl.agent.DDPG import DDPG_Agent as DDPG
6 | from Torch_rl.temp_file.PPO import PPO_Agent as Batch_PPO
7 | from Torch_rl.agent.PPO import PPO_Agent as PPO
8 | from Torch_rl.agent.TD3 import TD3_Agent as TD3
9 | from Torch_rl.Hierarchical_RL.HIRO import HIRO_Agent as HIRO
10 |
11 | # network
12 | from Torch_rl.model.Network import DenseNet
13 | from Torch_rl.model.Network import LSTM_Dense
14 | from Torch_rl.model.Network import CNN_2D_Dense
15 | from Torch_rl.model.Network import CNN_2D_LSTM_Dense
16 |
17 |
--------------------------------------------------------------------------------
/Torch_rl/agent/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/agent/.DS_Store
--------------------------------------------------------------------------------
/Torch_rl/agent/A3C.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/agent/A3C.py
--------------------------------------------------------------------------------
/Torch_rl/agent/DDPG.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from Torch_rl.common.memory import ReplayMemory
4 | from Torch_rl.agent.core_value import Agent_value_based
5 | from copy import deepcopy
6 | from torch.optim import Adam
7 | from torch import nn
8 | from Torch_rl.common.loss import huber_loss
9 | from torch.autograd import Variable
10 |
11 | class actor_critic(nn.Module):
12 | def __init__(self, actor, critic):
13 | super(actor_critic, self).__init__()
14 | self.actor = actor
15 | self.critic = critic
16 |
17 | def forward(self, obs):
18 | a = self.actor(obs)
19 | input = torch.cat((obs, a), dim=-1)
20 | Q = self.critic(input)
21 | return Q
22 |
23 |
24 | class DDPG_Agent(Agent_value_based):
25 | def __init__(self, env, actor_model, critic_model,
26 | actor_lr=1e-4, critic_lr=1e-3,
27 | actor_target_network_update_freq=1000, critic_target_network_update_freq=1000,
28 | actor_training_freq=1, critic_training_freq=1,
29 | ## hyper-parameter
30 | gamma=0.99, batch_size=32, buffer_size=50000, learning_starts=1000,
31 | ## lr_decay
32 | decay=False, decay_rate=0.9, critic_l2_reg=1e-2, clip_norm =None,
33 | ##
34 | path=None):
35 |
36 | self.gpu = False
37 | self.env = env
38 |
39 | self.gamma = gamma
40 | self.batch_size = batch_size
41 | self.learning_starts = learning_starts
42 |
43 | self.replay_buffer = ReplayMemory(buffer_size)
44 |
45 | self.actor_training_freq, self.critic_training_freq = actor_training_freq, critic_training_freq
46 | self.actor_target_network_update_freq = actor_target_network_update_freq
47 | self.critic_target_network_update_freq = critic_target_network_update_freq
48 | self.actor = actor_model
49 | self.critic = critic_model
50 | self.target_actor = deepcopy(actor_model)
51 | self.target_critic = deepcopy(critic_model)
52 |
53 | self.actor_critic = actor_critic(self.actor, self.critic)
54 |
55 | actor_optim = Adam(self.actor.parameters(), lr=actor_lr)
56 | critic_optim = Adam(self.critic.parameters(), lr=critic_lr, weight_decay=critic_l2_reg)
57 | if decay:
58 | self.actor_optim = torch.optim.lr_scheduler.ExponentialLR(actor_optim, decay_rate, last_epoch=-1)
59 | self.critic_optim = torch.optim.lr_scheduler.ExponentialLR(critic_optim, decay_rate, last_epoch=-1)
60 | else:
61 | self.actor_optim = actor_optim
62 | self.critic_optim = critic_optim
63 |
64 | super(DDPG_Agent, self).__init__(path)
65 | example_input = Variable(torch.rand(100, self.env.observation_space.shape[0]))
66 | self.writer.add_graph(self.actor_critic, input_to_model=example_input)
67 | self.forward_step_show_list = []
68 | self.backward_step_show_list =[]
69 | self.forward_ep_show_list = []
70 | self.backward_ep_show_list = []
71 |
72 | def forward(self, observation):
73 | observation = observation[np.newaxis, :].astype(np.float32)
74 | observation = torch.from_numpy(observation)
75 | action = self.actor.forward(observation)
76 | action = torch.normal(action, torch.ones_like(action))
77 | Q = self.critic(torch.cat((observation, action), dim=1)).squeeze().detach().numpy()
78 | return action.cpu().squeeze(0).detach().numpy(), Q, {}
79 |
80 | def backward(self, sample_):
81 | self.replay_buffer.push(sample_)
82 | if self.step > self.learning_starts and self.learning:
83 | sample = self.replay_buffer.sample(self.batch_size)
84 | if self.gpu:
85 | for key in sample.keys():
86 | sample[key] = sample[key].cuda()
87 | assert len(sample["s"]) == self.batch_size
88 | "update the critic "
89 | if self.step % self.critic_training_freq == 0:
90 | input = torch.cat((sample["s"], sample["a"]), -1)
91 | Q = self.critic(input)
92 | target_a = self.target_actor(sample["s_"])
93 | target_input = torch.cat((sample["s_"], target_a), -1)
94 | targetQ = self.target_critic(target_input)
95 | targetQ = targetQ.squeeze(1)
96 | Q = Q.squeeze(1)
97 | expected_q_values = sample["r"] + self.gamma * targetQ * (1.0 - sample["tr"])
98 | loss = torch.mean(huber_loss(expected_q_values - Q))
99 | self.critic_optim.zero_grad()
100 | loss.backward()
101 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1, norm_type=2)
102 | self.critic_optim.step()
103 | "training the actor"
104 | if self.step % self.actor_training_freq == 0:
105 | # Q = self.critic(torch.cat((sample["s"], self.actor(sample["s"])), -1))
106 | Q = self.actor_critic.forward(sample["s"])
107 | Q = -torch.mean(Q)
108 | self.actor_optim.zero_grad()
109 | Q.backward()
110 | torch.nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 1, norm_type=2)
111 | self.actor_optim.step()
112 | if self.step % self.actor_target_network_update_freq == 0:
113 | self.target_actor_net_update()
114 | if self.step % self.critic_target_network_update_freq == 0:
115 | self.target_critic_net_update()
116 | loss = loss.data.numpy()
117 | return loss, {}
118 | return 0, {}
119 |
120 | def target_actor_net_update(self):
121 | self.target_actor.load_state_dict(self.actor.state_dict())
122 |
123 | def target_critic_net_update(self):
124 | self.target_critic.load_state_dict(self.critic.state_dict())
125 |
126 | def load_weights(self, filepath):
127 | model = torch.load(filepath)
128 | self.actor.load_state_dict(model["actor"])
129 | self.critic.load_state_dict(model["critic"])
130 | self.target_actor.load_state_dict(model["target_actor"])
131 | self.target_critic.load_state_dict(model["target_critic"])
132 | self.actor_optim.load_state_dict(model["actor_optim"])
133 | self.critic_optim.load_state_dict(model["critic_optim"])
134 |
135 |
136 | def save_weights(self, filepath, overwrite=False):
137 | torch.save({"actor": self.actor, "critic":self.critic,
138 | "target_actor": self.target_actor,"target_critic": self.target_critic,
139 | "actor_optim": self.actor_optim, "critic_optim": self.critic_optim
140 | }, filepath + "DDPG.pkl")
141 |
142 | def cuda(self):
143 | self.actor.to_gpu()
144 | self.critic.to_gpu()
145 | self.target_actor = deepcopy(self.actor)
146 | self.target_critic = deepcopy(self.critic)
147 | self.gpu = True
148 |
149 |
150 |
--------------------------------------------------------------------------------
/Torch_rl/agent/DDPG_2.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from Torch_rl.common.memory import ReplayMemory
4 | from Torch_rl.agent.core_value import Agent_value_based
5 | from copy import deepcopy
6 | from torch.optim import Adam
7 | from torch import nn
8 | from Torch_rl.common.loss import huber_loss
9 | from torch.autograd import Variable
10 |
11 | class actor_critic(nn.Module):
12 | def __init__(self, actor, critic, GCN_enable):
13 | super(actor_critic, self).__init__()
14 | self.actor = actor
15 | self.critic = critic
16 | self.GCN_enable = GCN_enable
17 |
18 | def forward(self, obs):
19 | a = self.actor(obs)
20 | if self.GCN_enable:
21 | Q = self.critic(obs, a)
22 | else:
23 | input = torch.cat((obs, a), dim=-1)
24 | Q = self.critic(input)
25 | return Q
26 |
27 |
28 | class DDPG_Agent(Agent_value_based):
29 | def __init__(self, env, actor_model, critic_model,
30 | actor_lr=1e-4, critic_lr=1e-3,
31 | actor_target_network_update_freq=1000, critic_target_network_update_freq=1000,
32 | actor_training_freq=1, critic_training_freq=1, sperate_critic = False,
33 | ## hyper-parameter
34 | gamma=0.99, batch_size=32, buffer_size=50000, learning_starts=1000,
35 | ## lr_decay
36 | decay=False, decay_rate=0.9, critic_l2_reg=1e-2, clip_norm =None,
37 | ##
38 | path=None):
39 |
40 | self.gpu = False
41 | self.env = env
42 | self.sperate_critic = sperate_critic
43 | self.gamma = gamma
44 | self.batch_size = batch_size
45 | self.learning_starts = learning_starts
46 |
47 | self.replay_buffer = ReplayMemory(buffer_size)
48 |
49 | self.actor_training_freq, self.critic_training_freq = actor_training_freq, critic_training_freq
50 | self.actor_target_network_update_freq = actor_target_network_update_freq
51 | self.critic_target_network_update_freq = critic_target_network_update_freq
52 | self.actor = actor_model
53 | self.critic = critic_model
54 | self.target_actor = deepcopy(actor_model)
55 | self.target_critic = deepcopy(critic_model)
56 |
57 | self.actor_critic = actor_critic(self.actor, self.critic, self.sperate_critic)
58 |
59 | actor_optim = Adam(self.actor.parameters(), lr=actor_lr)
60 | critic_optim = Adam(self.critic.parameters(), lr=critic_lr, weight_decay=critic_l2_reg)
61 | if decay:
62 | self.actor_optim = torch.optim.lr_scheduler.ExponentialLR(actor_optim, decay_rate, last_epoch=-1)
63 | self.critic_optim = torch.optim.lr_scheduler.ExponentialLR(critic_optim, decay_rate, last_epoch=-1)
64 | else:
65 | self.actor_optim = actor_optim
66 | self.critic_optim = critic_optim
67 |
68 | super(DDPG_Agent, self).__init__(path)
69 | #example_input = Variable(torch.rand(100, self.env.observation_space.shape[0]))
70 | #self.writer.add_graph(self.actor_critic, input_to_model=example_input)
71 | self.forward_step_show_list = []
72 | self.backward_step_show_list =[]
73 | self.forward_ep_show_list = []
74 | self.backward_ep_show_list = []
75 |
76 | def forward(self, observation):
77 | observation = observation[np.newaxis, :].astype(np.float32)
78 | observation = torch.from_numpy(observation)
79 | action = self.actor.forward(observation)
80 | action = torch.normal(action, torch.ones_like(action))
81 | if self.sperate_critic:
82 | Q = self.critic.forward(observation, action).squeeze().detach().numpy()
83 | else:
84 | Q = self.critic(torch.cat((observation, action), dim=1)).squeeze().detach().numpy()
85 | return action.cpu().squeeze(0).detach().numpy(), Q, {}
86 |
87 | def backward(self, sample_):
88 | self.replay_buffer.push(sample_)
89 | if self.step > self.learning_starts and self.learning:
90 | sample = self.replay_buffer.sample(self.batch_size)
91 | if self.gpu:
92 | for key in sample.keys():
93 | sample[key] = sample[key].cuda()
94 | assert len(sample["s"]) == self.batch_size
95 | "update the critic "
96 | if self.step % self.critic_training_freq == 0:
97 | if self.sperate_critic:
98 | Q = self.critic.forward(sample["s"], sample["a"])
99 | else:
100 | input = torch.cat((sample["s"], sample["a"]), -1)
101 | Q = self.critic.forward(input)
102 | target_a = self.target_actor(sample["s_"])
103 | if self.sperate_critic:
104 | targetQ = self.target_critic(sample["s_"], target_a)
105 | else:
106 | target_input = torch.cat((sample["s_"], target_a), -1)
107 | targetQ = self.target_critic(target_input)
108 | targetQ = targetQ.squeeze(1)
109 | Q = Q.squeeze(1)
110 | expected_q_values = sample["r"] + self.gamma * targetQ * (1.0 - sample["tr"])
111 | loss = torch.mean(huber_loss(expected_q_values - Q))
112 | self.critic_optim.zero_grad()
113 | loss.backward()
114 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1, norm_type=2)
115 | self.critic_optim.step()
116 | "training the actor"
117 | if self.step % self.actor_training_freq == 0:
118 | Q = self.actor_critic.forward(sample["s"])
119 | Q = -torch.mean(Q)
120 | self.actor_optim.zero_grad()
121 | Q.backward()
122 | torch.nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 1, norm_type=2)
123 | self.actor_optim.step()
124 | if self.step % self.actor_target_network_update_freq == 0:
125 | self.target_actor_net_update()
126 | if self.step % self.critic_target_network_update_freq == 0:
127 | self.target_critic_net_update()
128 | loss = loss.data.numpy()
129 | return loss, {}
130 | return 0, {}
131 |
132 | def target_actor_net_update(self):
133 | self.target_actor.load_state_dict(self.actor.state_dict())
134 |
135 | def target_critic_net_update(self):
136 | self.target_critic.load_state_dict(self.critic.state_dict())
137 |
138 | def load_weights(self, filepath):
139 | model = torch.load(filepath)
140 | self.actor.load_state_dict(model["actor"])
141 | self.critic.load_state_dict(model["critic"])
142 | self.target_actor.load_state_dict(model["target_actor"])
143 | self.target_critic.load_state_dict(model["target_critic"])
144 | self.actor_optim.load_state_dict(model["actor_optim"])
145 | self.critic_optim.load_state_dict(model["critic_optim"])
146 |
147 |
148 | def save_weights(self, filepath, overwrite=False):
149 | torch.save({"actor": self.actor, "critic":self.critic,
150 | "target_actor": self.target_actor,"target_critic": self.target_critic,
151 | "actor_optim": self.actor_optim, "critic_optim": self.critic_optim
152 | }, filepath + "DDPG.pkl")
153 |
154 | def cuda(self):
155 | self.actor.to_gpu()
156 | self.critic.to_gpu()
157 | self.target_actor = deepcopy(self.actor)
158 | self.target_critic = deepcopy(self.critic)
159 | self.gpu = True
160 |
161 |
162 |
--------------------------------------------------------------------------------
/Torch_rl/agent/DQN.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from Torch_rl.common.memory import ReplayMemory
4 | from Torch_rl.agent.core_value import Agent_value_based
5 | from copy import deepcopy
6 | from torch.optim import Adam
7 | from torch import nn
8 | import torch.nn.functional as F
9 | from Torch_rl.common.loss import huber_loss
10 | from torch.autograd import Variable
11 |
12 | class Dueling_dqn(nn.Module):
13 | def __init__(self, model, dueling_way):
14 | super(Dueling_dqn, self).__init__()
15 | self.dueling_way = dueling_way
16 | self.model_layer = model.linears[:-1]
17 | layer_infor = model.layer_infor
18 | self.A_est = nn.Linear(layer_infor[-2], layer_infor[-1])
19 | self.V_est = nn.Linear(layer_infor[-2], 1)
20 |
21 | def forward(self, obs):
22 | x = obs
23 | for layer in self.model_layer:
24 | x = layer(x)
25 | A = F.relu(self.A_est(x))
26 | V = self.V_est(x)
27 | if self.dueling_way == "native":
28 | A = A
29 | elif self.dueling_way == "mean":
30 | A = A - torch.max(A)
31 | elif self.dueling_way == "avg":
32 | A = A - torch.mean(A)
33 | return V + A
34 |
35 |
36 | class DQN_Agent(Agent_value_based):
37 | def __init__(self, env, model, policy,
38 | ## hyper-parameter
39 | gamma=0.90, lr=1e-3, batch_size=32, buffer_size=50000, learning_starts=1000,
40 | target_network_update_freq=500,
41 | ## decay
42 | decay=False, decay_rate=0.9,
43 | ## DDqn && DuelingDQN
44 | double_dqn=True, dueling_dqn=False, dueling_way="native",
45 | ## prioritized_replay
46 | prioritized_replay=False,
47 | prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None,
48 | prioritized_replay_eps=1e-6, param_noise=False,
49 | ##
50 | path=None):
51 |
52 | """
53 |
54 | :param env: the GYM environment
55 | :param model: the Torch NN model
56 | :param policy: the policy when choosing action
57 | :param ep: the MAX episode time
58 | :param step: the MAx step time
59 | .........................hyper-parameter..................................
60 | :param gamma:
61 | :param lr:
62 | :param batchsize:
63 | :param buffer_size:
64 | :param target_network_update_freq:
65 | .........................further improve way..................................
66 | :param double_dqn: whether enable DDQN
67 | :param dueling_dqn: whether dueling DDQN
68 | :param dueling_way: the Dueling DQN method
69 | it can choose one of the following three keys (matching the checks in Dueling_dqn.forward)
70 | `avg`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-Avg_a(A(s,a;theta)))
71 | `mean`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-max_a(A(s,a;theta)))
72 | `native`: Q(s,a;theta) = V(s;theta) + A(s,a;theta)
73 | .........................prioritized-part..................................
74 | :param prioritized_replay: (bool) if True prioritized replay buffer will be used.
75 | :param prioritized_replay_alpha: (float)alpha parameter for prioritized replay buffer.
76 | It determines how much prioritization is used, with alpha=0 corresponding to the uniform case.
77 | :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
78 | :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial
79 | value to 1.0. If set to None equals to max_timesteps.
80 | :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
81 | .........................imitation_learning_part..................................
82 | :param imitation_learning_policy: To initial the network with the given policy
83 | which is supervised way to training the network
84 | :param IL_time: supervised training times
85 | :param network_kwargs:
86 | """
87 | self.gpu = False
88 | self.env = env
89 | self.policy = policy
90 |
91 | self.gamma = gamma
92 | self.batch_size = batch_size
93 | self.learning_starts = learning_starts
94 | self.target_network_update_freq = target_network_update_freq
95 | self.double_dqn = double_dqn
96 |
97 | if dueling_dqn:
98 | self.Q_net = Dueling_dqn(model, dueling_way)
99 | else:
100 | self.Q_net = model
101 |
102 | self.target_Q_net = deepcopy(self.Q_net)
103 |
104 | q_net_optim = Adam(self.Q_net.parameters(), lr=lr)
105 | if decay:
106 | self.optim = torch.optim.lr_scheduler.ExponentialLR(q_net_optim, decay_rate, last_epoch=-1)
107 | else:
108 | self.optim = q_net_optim
109 |
110 | self.replay_buffer = ReplayMemory(buffer_size)
111 | self.learning = False
112 | super(DQN_Agent, self).__init__(path)
113 | example_input = Variable(torch.rand((100,)+self.env.observation_space.shape))
114 | self.writer.add_graph(self.Q_net, input_to_model=example_input)
115 | self.forward_step_show_list = []
116 | self.backward_step_show_list =[]
117 | self.forward_ep_show_list = []
118 | self.backward_ep_show_list = []
119 |
120 | def forward(self, observation):
121 | observation = observation[np.newaxis, :].astype(np.float32)
122 | observation = torch.from_numpy(observation)
123 | Q_value = self.Q_net.forward(observation)
124 | Q_value = Q_value.cpu().squeeze().detach().numpy()
125 | if self.policy is not None:
126 | action = self.policy.select_action(Q_value)
127 | else:
128 | action = np.argmax(Q_value)
129 | return action, np.max(Q_value), {}
130 |
131 | def backward(self, sample_):
132 | self.replay_buffer.push(sample_)
133 | if self.step > self.learning_starts and self.learning:
134 | sample = self.replay_buffer.sample(self.batch_size)
135 | if self.gpu:
136 | for key in sample.keys():
137 | sample[key] = sample[key].cuda()
138 | assert len(sample["s"]) == self.batch_size
139 | a = sample["a"].long().unsqueeze(1)
140 | Q = self.Q_net(sample["s"]).gather(1, a)
141 | if self.double_dqn:
142 | _, next_actions = self.Q_net(sample["s_"]).max(1, keepdim=True)
143 | targetQ = self.target_Q_net(sample["s_"]).gather(1, next_actions)
144 | else:
145 | _, next_actions = self.target_Q_net(sample["s_"]).max(1, keepdim=True)
146 | targetQ = self.target_Q_net(sample["s_"]).gather(1, next_actions)
147 | targetQ = targetQ.squeeze(1)
148 | Q = Q.squeeze(1)
149 | expected_q_values = sample["r"] + self.gamma * targetQ * (1.0 - sample["tr"])
150 | loss = torch.mean(huber_loss(expected_q_values-Q))
151 | self.optim.zero_grad()
152 | loss.backward()
153 | torch.nn.utils.clip_grad_norm_(self.Q_net.parameters(), 1, norm_type=2)
154 | self.optim.step()
155 | if self.step % self.target_network_update_freq == 0:
156 | self.target_net_update()
157 | loss = loss.data.numpy()
158 | return loss, {}
159 | return 0, {}
160 |
161 | def target_net_update(self):
162 | self.target_Q_net.load_state_dict(self.Q_net.state_dict())
163 |
164 | def load_weights(self, filepath):
165 | model = torch.load(filepath+'DQN.pkl')
166 | self.Q_net.load_state_dict(model["Q_net"].state_dict())
167 | self.target_Q_net.load_state_dict(model["target_Q_net"].state_dict())
168 | # self.optim.load_state_dict(model["optim"])
169 |
170 | def save_weights(self, filepath, overwrite=True):
171 | torch.save({"Q_net": self.Q_net,
172 | "target_Q_net": self.target_Q_net,
173 | "optim": self.optim
174 | }, filepath+"DQN.pkl")
175 |
176 |
177 | def cuda(self):
178 | self.Q_net.to_gpu()
179 | self.target_Q_net = deepcopy(self.Q_net)
180 | self.gpu = True
181 |
182 |
183 |
184 |
185 |
186 |
187 |
--------------------------------------------------------------------------------
/Torch_rl/agent/DRQN.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from Torch_rl.common.memory import ReplayMemory_Sequence
4 | from Torch_rl.agent.core_value import Agent_value_based
5 | from copy import deepcopy
6 | from torch.optim import Adam
7 | from torch import nn
8 | import torch.nn.functional as F
9 | from Torch_rl.common.loss import huber_loss
10 | from torch.autograd import Variable
11 |
12 | class Dueling_dqn(nn.Module):
13 | def __init__(self, model, dueling_way):
14 | super(Dueling_dqn, self).__init__()
15 | self.dueling_way = dueling_way
16 | self.model_layer = model.linears[:-1]
17 | layer_infor = model.layer_infor
18 | self.A_est = nn.Linear(layer_infor[-2], layer_infor[-1])
19 | self.V_est = nn.Linear(layer_infor[-2], 1)
20 |
21 | def forward(self, obs):
22 | x = obs
23 | for layer in self.model_layer:
24 | x = layer(x)
25 | A = F.relu(self.A_est(x))
26 | V = self.V_est(x)
27 | if self.dueling_way == "native":
28 | A = A
29 | elif self.dueling_way == "mean":
30 | A = A - torch.max(A)
31 | elif self.dueling_way == "avg":
32 | A = A - torch.mean(A)
33 | return V + A
34 |
35 |
36 | class DRQN_Agent(Agent_value_based):
37 | def __init__(self, env, model, policy,
38 | ## hyper-parameter
39 | gamma=0.90, lr=1e-3, learning_starts=1000,
40 | target_network_update_freq=500,
41 | ## memory
42 | batch_size=32, buffer_size=50000, max_seq_len=2000, replay_len=100,
43 | ## decay
44 | decay=False, decay_rate=0.9,
45 | ## DDqn && DuelingDQN
46 | double_dqn=True, dueling_dqn=False, dueling_way="native",
47 | ## prioritized_replay
48 | prioritized_replay=False,
49 | prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None,
50 | prioritized_replay_eps=1e-6, param_noise=False,
51 | ##
52 | path=None):
53 |
54 | """
55 |
56 | :param env: the GYM environment
57 | :param model: the Torch NN model
58 | :param policy: the policy when choosing action
59 | :param ep: the MAX episode time
60 | :param step: the MAx step time
61 | .........................hyper-parameter..................................
62 | :param gamma:
63 | :param lr:
64 | :param batchsize:
65 | :param buffer_size:
66 | :param target_network_update_freq:
67 | .........................further improve way..................................
68 | :param double_dqn: whether enable DDQN
69 | :param dueling_dqn: whether dueling DDQN
70 | :param dueling_way: the Dueling DQN method
71 | it can choose one of the following three keys (matching the checks in Dueling_dqn.forward)
72 | `avg`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-Avg_a(A(s,a;theta)))
73 | `mean`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-max_a(A(s,a;theta)))
74 | `native`: Q(s,a;theta) = V(s;theta) + A(s,a;theta)
75 | .........................prioritized-part..................................
76 | :param prioritized_replay: (bool) if True prioritized replay buffer will be used.
77 | :param prioritized_replay_alpha: (float)alpha parameter for prioritized replay buffer.
78 | It determines how much prioritization is used, with alpha=0 corresponding to the uniform case.
79 | :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
80 | :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial
81 | value to 1.0. If set to None equals to max_timesteps.
82 | :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
83 | .........................imitation_learning_part..................................
84 | :param imitation_learning_policy: To initial the network with the given policy
85 | which is supervised way to training the network
86 | :param IL_time: supervised training times
87 | :param network_kwargs:
88 | """
89 | self.gpu = False
90 | self.env = env
91 | self.policy = policy
92 |
93 | self.gamma = gamma
94 | self.batch_size = batch_size
95 | self.learning_starts = learning_starts
96 | self.target_network_update_freq = target_network_update_freq
97 | self.double_dqn = double_dqn
98 | if dueling_dqn:
99 | self.Q_net = Dueling_dqn(model, dueling_way)
100 | else:
101 | self.Q_net = model
102 |
103 | self.target_Q_net = deepcopy(self.Q_net)
104 |
105 | q_net_optim = Adam(self.Q_net.parameters(), lr=lr)
106 | if decay:
107 | self.optim = torch.optim.lr_scheduler.ExponentialLR(q_net_optim, decay_rate, last_epoch=-1)
108 | else:
109 | self.optim = q_net_optim
110 |
111 | self.replay_buffer = ReplayMemory_Sequence(buffer_size, max_seq_len, other_record=["h","c"])
112 |
113 | self.replay_buffer.batch_size = batch_size
114 | self.replay_buffer.sequence_len = replay_len
115 | if replay_len < max_seq_len:
116 | self.replay_sample = self.replay_buffer.sample_sequence
117 | else:
118 | self.replay_sample = self.replay_buffer.sample_ep
119 | self.learning = False
120 | super(DRQN_Agent, self).__init__(path)
121 | example_input = Variable(torch.rand((replay_len, 100)+self.env.observation_space.shape))
122 | self.writer.add_graph(self.Q_net, input_to_model=example_input)
123 | self.forward_step_show_list = []
124 | self.backward_step_show_list =[]
125 | self.forward_ep_show_list = []
126 | self.backward_ep_show_list = []
127 |
128 | self.h_state = model.init_H_C(1)
129 |
130 | def forward(self, observation):
131 | observation = observation[np.newaxis, np.newaxis, :].astype(np.float32)
132 | observation = torch.from_numpy(observation)
133 | Q_value, self.h_state = self.Q_net.forward(observation, self.h_state)
134 | Q_value = Q_value.cpu().squeeze().detach().numpy()
135 | if self.policy is not None:
136 | action = self.policy.select_action(Q_value)
137 | else:
138 | action = np.argmax(Q_value)
139 | return action, np.max(Q_value), {}
140 |
141 | def backward(self, sample_):
142 | sample_["h"] = self.h_state[0].detach().numpy()
143 | sample_["c"] = self.h_state[1].detach().numpy()
144 | self.replay_buffer.push(sample_)
145 | if self.step > self.learning_starts and self.learning:
146 | sample = self.replay_sample()
147 | if self.gpu:
148 | for key in sample.keys():
149 | sample[key] = sample[key].cuda()
150 | assert sample["s"].size(1) == self.batch_size
151 | a = sample["a"].long()
152 | Q, H = self.Q_net(sample["s"])
153 | Q = Q.gather(2, a)
154 | if self.double_dqn:
155 | Q_next, H = self.Q_net(sample["s_"])
156 | _, next_actions = Q_next.max(2, keepdim=True)
157 | Qtarget_next, H = self.target_Q_net(sample["s_"])
158 | targetQ = Qtarget_next.gather(2, next_actions)
159 | else:
160 | Qtarget_next, H = self.target_Q_net(sample["s_"])
161 | targetQ, next_actions = Qtarget_next.max(2, keepdim=True)
162 |
163 | expected_q_values = sample["r"] + self.gamma * targetQ * (1.0 - sample["tr"])
164 | loss = torch.mean(huber_loss(expected_q_values-Q))
165 | self.optim.zero_grad()
166 | loss.backward()
167 | torch.nn.utils.clip_grad_norm_(self.Q_net.parameters(), 1, norm_type=2)
168 | self.optim.step()
169 | if self.step % self.target_network_update_freq == 0:
170 | self.target_net_update()
171 | loss = loss.data.numpy()
172 | return loss, {}
173 | return 0, {}
174 |
175 | def target_net_update(self):
176 | self.target_Q_net.load_state_dict(self.Q_net.state_dict())
177 |
178 | def load_weights(self, filepath):
179 | model = torch.load(filepath+'DQN.pkl')
180 | self.Q_net.load_state_dict(model["Q_net"].state_dict())
181 | self.target_Q_net.load_state_dict(model["target_Q_net"].state_dict())
182 | # self.optim.load_state_dict(model["optim"])
183 |
184 | def save_weights(self, filepath, overwrite=True):
185 | torch.save({"Q_net": self.Q_net,
186 | "target_Q_net": self.target_Q_net,
187 | "optim": self.optim
188 | }, filepath+"DQN.pkl")
189 |
190 | def cuda(self):
191 | self.Q_net.to_gpu()
192 | self.target_Q_net = deepcopy(self.Q_net)
193 | self.gpu = True
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
--------------------------------------------------------------------------------
/Torch_rl/agent/PPO.py:
--------------------------------------------------------------------------------
1 |
2 | from Torch_rl.agent.core_policy import Agent_policy_based
3 | import torch.nn as nn
4 | from copy import deepcopy
5 | from Torch_rl.common.distribution import *
6 | from torch.optim import Adam
7 | from torch.autograd import Variable
8 | from Torch_rl.common.util import get_gae
9 |
10 | class PPO_Agent(Agent_policy_based):
11 | def __init__(self, env, policy_model, value_model,
12 | lr=5e-4, ent_coef=0.01, vf_coef=0.5,
13 | ## hyper-parawmeter
14 | gamma=0.99, lam=0.95, cliprange=0.2, batch_size=64, value_train_round=10,
15 | running_step=2048, running_ep=20, value_regular=0.01,
16 | ## decay
17 | decay=False, decay_rate=0.9, lstm_enable=False,
18 | ##
19 | path=None):
20 | self.gpu = False
21 | self.env = env
22 | self.gamma = gamma
23 | self.lam = lam
24 | self.ent_coef = ent_coef
25 | self.vf_coef = vf_coef
26 | self.cliprange = cliprange
27 |
28 | self.value_train_step = value_train_round
29 |
30 | self.sample_rollout = running_step
31 | self.sample_ep = running_ep
32 | self.batch_size = batch_size
33 | self.lstm_enable = lstm_enable
34 |
35 | self.loss_cal = torch.nn.SmoothL1Loss()
36 |
37 | self.policy = policy_model
38 | if value_model == "shared":
39 | self.value = policy_model
40 | elif value_model == "copy":
41 | self.value = deepcopy(policy_model)
42 | else:
43 | self.value = value_model
44 |
45 | self.dist = make_pdtype(env.action_space, policy_model)
46 |
47 | self.policy_model_optim = Adam(self.policy.parameters(), lr=lr)
48 | self.value_model_optim = Adam(self.value.parameters(), lr=lr, weight_decay=value_regular)
49 | if decay:
50 | self.policy_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.policy_model_optim, decay_rate,
51 | last_epoch=-1)
52 | self.value_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.value_model_optim, decay_rate,
53 | last_epoch=-1)
54 |
55 | #torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2)
56 | #torch.nn.utils.clip_grad_norm_(self.value.parameters(), 1, norm_type=2)
57 |
58 | super(PPO_Agent, self).__init__(path)
59 | example_input = Variable(torch.rand((100,)+self.env.observation_space.shape))
60 | self.writer.add_graph(self.policy, input_to_model=example_input)
61 |
62 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
63 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]
64 |
65 | self.training_round = 0
66 | self.running_step = 0
67 | self.record_sample = None
68 | self.training_step = 0
69 |
70 |
71 | def update(self, sample):
72 |
73 | returns, advants = get_gae(sample["r"], sample["tr"], sample["value"], self.gamma,
74 | self.lam)
75 | sample["advs"] = advants.unsqueeze(1)
76 | sample["return"] = returns.unsqueeze(1)
77 |
78 | step_len = len(sample["s"])
79 | if self.lstm_enable:
80 | flagin = [time for time in range(step_len) if sample["tr"][time]==1]
81 | time_round = len(flagin)
82 | array_index = []
83 | for train_time in range(int(time_round)-1):
84 | array_index.append(range(flagin[train_time], flagin[train_time+1]))
85 | else:
86 | time_round = np.ceil(step_len/self.batch_size)
87 | time_left = time_round*self.batch_size-step_len
88 | array = list(range(step_len)) +list(range(int(time_left)))
89 | array_index = []
90 | for train_time in range(int(time_round)):
91 | array_index.append(array[train_time * self.batch_size: (train_time + 1) * self.batch_size])
92 | loss_re, pgloss_re, enloss_re, vfloss_re = [], [], [], []
93 |
94 | for key in sample.keys():
95 | temp = torch.stack(list(sample[key]), 0)
96 | if self.gpu:
97 | sample[key] = temp.cuda()
98 | else:
99 | sample[key] = temp
100 | for train_time in range(int(time_round)):
101 | index = array_index[train_time]
102 | # for index in range(step_len):
103 | training_s = sample["s"][index].detach()
104 | training_a = sample["a"][index].detach()
105 | training_r = sample["r"][index].detach()
106 | R = sample["return"][index].detach()
107 | old_value = sample["value"][index].detach()
108 | old_neglogp = sample["logp"][index].detach()
109 | advs = sample["advs"][index].detach()
110 |
111 | " CALCULATE THE LOSS"
112 | " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss"
113 |
114 | " the value loss"
115 | value_now = self.value.forward(training_s)
116 | # value loss
117 | value_clip = old_value + torch.clamp(old_value - value_now, min=-self.cliprange,
118 | max=self.cliprange) # Clipped value
119 | vf_loss1 = self.loss_cal(value_now, R) # Unclipped loss
120 | vf_loss2 = self.loss_cal(value_clip, R) # clipped loss
121 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
122 |
123 | #generate Policy gradient loss
124 | outcome = self.policy.forward(training_s)
125 | new_policy = self.dist(outcome)
126 | new_neg_lop = new_policy.log_prob(training_a)
127 | ratio = torch.exp(new_neg_lop - old_neglogp)
128 | pg_loss1 = -advs * ratio
129 | pg_loss2 = -advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
130 | pg_loss = .5 * torch.max(pg_loss1, pg_loss2).mean()
131 |
132 | # entropy
133 | entropy = new_policy.entropy().mean()
134 | loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef
135 |
136 | self.policy_model_optim.zero_grad()
137 | pg_loss.backward()
138 | self.policy_model_optim.step()
139 | for _ in range(self.value_train_step):
140 | value_now = self.value.forward(training_s)
141 | # value loss
142 |                 value_clip = old_value + torch.clamp(value_now - old_value, min=-self.cliprange,
143 |                                                      max=self.cliprange) # Clipped value
144 | vf_loss1 = self.loss_cal(value_now, R) # Unclipped loss
145 | vf_loss2 = self.loss_cal(value_clip, R) # clipped loss
146 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
147 | self.value_model_optim.zero_grad()
148 | vf_loss1.backward()
149 | self.value_model_optim.step()
150 | # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"])
151 | # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange)
152 |             loss_re.append(loss.cpu().detach().numpy())
153 | pgloss_re.append(pg_loss.cpu().detach().numpy())
154 | enloss_re.append(entropy.cpu().detach().numpy())
155 | vfloss_re.append(vf_loss1.cpu().detach().numpy())
156 | return np.sum(loss_re), {"pg_loss": np.sum(pgloss_re),
157 | "entropy": np.sum(enloss_re),
158 | "vf_loss": np.sum(vfloss_re)}
159 |
160 | def load_weights(self, filepath):
161 | model = torch.load(filepath+"/PPO.pkl")
162 | self.policy.load_state_dict(model["policy"].state_dict())
163 | self.value.load_state_dict(model["value"].state_dict())
164 |
165 |
166 | def save_weights(self, filepath, overwrite=False):
167 | torch.save({"policy": self.policy,"value": self.value}, filepath + "/PPO.pkl")
168 |
169 | def policy_behavior_clone(self, sample_):
170 | action_label = sample_["a"].squeeze()
171 | if self.gpu:
172 | action_predict = self.policy(sample_["s"].cuda())
173 | action_label = action_label.cuda()
174 | else:
175 | action_predict = self.policy(sample_["s"])
176 | loss_bc = self.loss_cal(action_label, action_predict)
177 | del action_label
178 | del action_predict
179 | loss = loss_bc
180 | self.policy_model_optim.zero_grad()
181 | loss.backward()
182 | nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2)
183 | self.policy_model_optim.step()
184 | return loss.cpu().detach().numpy()
185 |
186 | def value_pretrain(self, record_sample, new_sample_len):
187 | train_times = int(np.floor(new_sample_len/128))
188 | round_loss = 0
189 | for io in range(train_times-1):
190 | index = list(range(128 * io, 128 * (io + 1)))
191 | if self.gpu:
192 |                 predict = torch.from_numpy(np.array(record_sample["s"])[index]).cuda()
193 |                 label = torch.from_numpy(np.array(record_sample["return"]))[index].cuda()
194 |             else:
195 |                 predict = torch.from_numpy(np.array(record_sample["s"])[index])
196 |                 label = torch.from_numpy(np.array(record_sample["return"]))[index]
197 |             value_now = self.value.forward(predict)
198 |             # value loss
199 |             vf_loss = self.loss_cal(value_now, label) # Unclipped loss
200 |             del predict
201 |             del label
202 | self.value_model_optim.zero_grad()
203 | vf_loss.backward()
204 | self.value_model_optim.step()
205 | round_loss += vf_loss.cpu().detach().numpy()
206 | return round_loss
207 |
208 | def cuda(self):
209 | self.policy.to_gpu()
210 | self.value.to_gpu()
211 | self.loss_cal = self.loss_cal.cuda()
212 | self.gpu = True
--------------------------------------------------------------------------------
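
Note on the PPO update above: the policy and value losses follow the standard clipped-surrogate formulation. The snippet below is a minimal, self-contained sketch of those two terms for reference only; the function names and the mean-squared-error choice are illustrative and are not part of this repository (the agent's update() uses its own self.loss_cal instead).

import torch

def clipped_surrogate_loss(new_logp, old_logp, advs, cliprange=0.2):
    # ratio = pi_new(a|s) / pi_old(a|s); maximizing min(...) equals minimizing the max of the negated terms
    ratio = torch.exp(new_logp - old_logp)
    unclipped = -advs * ratio
    clipped = -advs * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)
    return torch.max(unclipped, clipped).mean()

def clipped_value_loss(value_now, old_value, returns, cliprange=0.2):
    # keep the new value prediction within +/- cliprange of the old prediction
    value_clip = old_value + torch.clamp(value_now - old_value, -cliprange, cliprange)
    return 0.5 * torch.max((value_now - returns).pow(2).mean(),
                           (value_clip - returns).pow(2).mean())
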
/Torch_rl/agent/SAC.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/agent/SAC.py
--------------------------------------------------------------------------------
/Torch_rl/agent/TD3.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from Torch_rl.common.memory import ReplayMemory
4 | from Torch_rl.agent.core_value import Agent_value_based
5 | from copy import deepcopy
6 | from torch.optim import Adam
7 | from torch import nn
8 | from Torch_rl.common.loss import huber_loss
9 | from torch.autograd import Variable
10 | from Torch_rl.common.util import csv_record
11 |
12 |
13 | class critic_build(nn.Module):
14 | def __init__(self, critic):
15 | super(critic_build, self).__init__()
16 | self.critic_q1 = deepcopy(critic)
17 | self.critic_q2 = deepcopy(critic)
18 |
19 | def forward(self, obs):
20 | Q1 = self.critic_q1(obs)
21 | Q2 = self.critic_q2(obs)
22 | return Q1, Q2
23 |
24 |
25 | class actor_critic(nn.Module):
26 | def __init__(self, actor, critic):
27 | super(actor_critic, self).__init__()
28 | self.actor = actor
29 | self.critic = critic
30 |
31 | def forward(self, obs):
32 | a = self.actor(obs)
33 | input = torch.cat((obs, a), dim=-1)
34 | Q1, Q2 = self.critic(input)
35 | return Q1
36 |
37 |
38 |
39 | class TD3_Agent(Agent_value_based):
40 | def __init__(self, env, actor_model, critic_model,
41 | actor_lr=1e-4, critic_lr=3e-4,
42 | actor_target_network_update_freq=0.1, critic_target_network_update_freq=0.1,
43 | actor_training_freq=2, critic_training_freq=1,
44 | ## hyper-parameter
45 | gamma=0.99, batch_size=32, buffer_size=50000, learning_starts=1000,
46 | ## decay
47 | decay=False, decay_rate=0.9, l2_regulization=0.01,
48 | ##
49 | path=None):
50 |
51 | self.gpu = False
52 | self.env = env
53 | self.gamma = gamma
54 | self.batch_size = batch_size
55 | self.learning_starts = learning_starts
56 | self.actor_training_freq, self.critic_training_freq = actor_training_freq, critic_training_freq
57 | self.actor_target_network_update_freq = actor_target_network_update_freq
58 | self.critic_target_network_update_freq = critic_target_network_update_freq
59 |
60 | self.replay_buffer = ReplayMemory(buffer_size)
61 | self.actor = actor_model
62 | self.critic = critic_build(critic_model)
63 |
64 | self.actor_critic = actor_critic(self.actor, self.critic)
65 |
66 | self.target_actor = deepcopy(self.actor)
67 | self.target_critic = deepcopy(self.critic)
68 |
69 | actor_optim = Adam(self.actor.parameters(), lr=actor_lr)
70 | critic_optim = Adam(self.critic.parameters(), lr=critic_lr, weight_decay=l2_regulization)
71 | if decay:
72 | self.actor_optim = torch.optim.lr_scheduler.ExponentialLR(actor_optim, decay_rate, last_epoch=-1)
73 | self.critic_optim = torch.optim.lr_scheduler.ExponentialLR(critic_optim, decay_rate, last_epoch=-1)
74 | else:
75 | self.actor_optim = actor_optim
76 | self.critic_optim = critic_optim
77 |
78 | torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1, norm_type=2)
79 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1, norm_type=2)
80 |
81 |
82 | super(TD3_Agent, self).__init__(path)
83 | example_input = Variable(torch.rand(100, self.env.observation_space.shape[0]))
84 | self.writer.add_graph(self.actor_critic, input_to_model=example_input)
85 | self.forward_step_show_list = []
86 | self.backward_step_show_list = []
87 | self.forward_ep_show_list = []
88 | self.backward_ep_show_list = []
89 |
90 | def forward(self, observation):
91 | observation = observation.astype(np.float32)
92 | observation = torch.from_numpy(observation)
93 | action = self.actor.forward(observation)
94 | csv_record(action.detach().numpy(),"./")
95 | action = torch.normal(action,torch.ones_like(action))
96 | Q, _ = self.critic(torch.cat((observation, action),axis=0))
97 | action = action.data.numpy()
98 | return action, Q.detach().numpy(), {}
99 |
100 | def backward(self, sample_):
101 | self.replay_buffer.push(sample_)
102 | if self.step > self.learning_starts and self.learning:
103 | sample = self.replay_buffer.sample(self.batch_size)
104 | if self.gpu:
105 | for key in sample.keys():
106 | sample[key] = sample[key].cuda()
107 | assert len(sample["s"]) == self.batch_size
108 | "update the critic "
109 | if self.step % self.critic_training_freq == 0:
110 | target_a = self.target_actor(sample["s_"])
111 | target_input = torch.cat((sample["s_"], target_a), -1)
112 | Q1, Q2 = self.target_critic(target_input)
113 | target_Q = torch.min(Q1, Q2)
114 | expected_q_values = sample["r"] + self.gamma * target_Q * (1.0 - sample["tr"])
115 |
116 | input = torch.cat((sample["s"], sample["a"]), -1)
117 | Q1, Q2 = self.critic(input)
118 | loss = torch.mean(huber_loss(expected_q_values - Q1))+torch.mean(huber_loss(expected_q_values - Q2))
119 | self.critic.zero_grad()
120 | loss.backward()
121 | self.critic_optim.step()
122 | "training the actor"
123 | if self.step % self.actor_training_freq == 0:
124 | Q = self.actor_critic(sample["s"])
125 | Q = -torch.mean(Q)
126 | self.actor.zero_grad()
127 | Q.backward()
128 | self.actor_optim.step()
129 | self.target_net_update()
130 | loss = loss.data.numpy()
131 | return loss, {}
132 | return 0, {}
133 |
134 | def target_net_update(self):
135 | if self.actor_target_network_update_freq>1:
136 | if self.step % self.actor_target_network_update_freq == 0:
137 | self.target_actor.load_state_dict(self.actor.state_dict())
138 | else:
139 | for param, target_param in zip(self.actor.parameters(), self.target_actor.parameters()):
140 | target_param.data.copy_(self.actor_target_network_update_freq * param.data +
141 | (1 - self.actor_target_network_update_freq) * target_param.data)
142 | if self.critic_target_network_update_freq>1:
143 | if self.step % self.critic_target_network_update_freq == 0:
144 | self.target_critic.load_state_dict(self.critic.state_dict())
145 | else:
146 | for param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()):
147 | target_param.data.copy_(self.critic_target_network_update_freq * param.data +
148 | (1 - self.critic_target_network_update_freq) * target_param.data)
149 |
150 |
151 | def load_weights(self, filepath):
152 | model = torch.load(filepath + "TD3.pkl")
153 |         self.actor.load_state_dict(model["actor"].state_dict())
154 |         self.critic.load_state_dict(model["critic"].state_dict())
155 |         self.target_actor.load_state_dict(model["target_actor"].state_dict())
156 |         self.target_critic.load_state_dict(model["target_critic"].state_dict())
157 |         self.actor_optim.load_state_dict(model["actor_optim"].state_dict())
158 |         self.critic_optim.load_state_dict(model["critic_optim"].state_dict())
159 |
160 |
161 | def save_weights(self, filepath, overwrite=False):
162 | torch.save({"actor": self.actor, "critic":self.critic,
163 | "target_actor": self.target_actor,"target_critic": self.target_critic,
164 | "actor_optim": self.actor_optim, "critic_optim": self.critic_optim
165 | }, filepath + "TD3.pkl")
166 |
167 | def cuda(self):
168 | self.actor.to_gpu()
169 | self.critic.to_gpu()
170 | self.target_actor = deepcopy(self.actor)
171 | self.target_critic = deepcopy(self.critic)
172 | self.gpu = True
173 |
174 |
--------------------------------------------------------------------------------
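
In TD3_Agent.target_net_update() above, an *_target_network_update_freq value greater than 1 is treated as a hard-copy period (in steps), while a value in (0, 1) is treated as the Polyak coefficient for a soft update. A minimal standalone sketch of that soft update, with illustrative names only (not a helper that exists in this repository):

import torch

def soft_update(net, target_net, tau=0.1):
    # target <- tau * online + (1 - tau) * target, applied parameter-wise
    with torch.no_grad():
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
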
/Torch_rl/agent/TRPO.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from Torch_rl.agent.core_policy import Agent_policy_based
4 | from Torch_rl.common.memory import ReplayMemory
5 | from copy import deepcopy
6 | # from Torch_rl.common.distribution import *
7 | from torch.optim import Adam
8 | from torch.autograd import Variable
9 | from gym import spaces
10 | from Torch_rl.common.util import csv_record
11 | from Torch_rl.common.util import gae
12 | from Torch_rl.common.distribution import *
13 |
14 |
15 | class TRPO_Agent(Agent_policy_based):
16 | def __init__(self, env, policy_model, value_model,
17 | lr=1e-3, ent_coef=0.01, vf_coef=0.5,
18 |                  ## hyper-parameter
19 | gamma=0.90, lam=0.95, cliprange=0.2,
20 | buffer_size=50000, learning_starts=1000, running_step=2000, batch_training_round=20,
21 | value_regular=0.01,
22 | ## decay
23 | decay=False, decay_rate=0.9,
24 | ##
25 | path=None):
26 |
27 | self.env = env
28 | self.gamma = gamma
29 | self.lam = lam
30 | self.ent_coef = ent_coef
31 | self.vf_coef = vf_coef
32 | self.cliprange = cliprange
33 |
34 | self.learning_starts = learning_starts
35 | self.batch_training_round = batch_training_round
36 | self.run_step = running_step
37 | self.sample_training_step = self.batch_training_round * self.run_step
38 |
39 | self.replay_buffer = ReplayMemory(buffer_size, ["value", "logp"])
40 | self.loss_cal = torch.nn.MSELoss()
41 |
42 | self.dist = make_pdtype(env.action_space, policy_model)
43 |
44 | self.policy_model = policy_model
45 | if value_model == "shared":
46 | self.value_model = policy_model
47 | elif value_model == "copy":
48 | self.value_model = deepcopy(policy_model)
49 | else:
50 | self.value_model = value_model
51 |
52 | policy_model_optim = Adam(self.policy_model.parameters(), lr=lr)
53 | value_model_optim = Adam(self.value_model.parameters(), lr=lr, weight_decay=value_regular)
54 | if decay:
55 | self.policy_model_optim = torch.optim.lr_scheduler.ExponentialLR(policy_model_optim, decay_rate,
56 | last_epoch=-1)
57 | self.value_model_optim = torch.optim.lr_scheduler.ExponentialLR(value_model_optim, decay_rate,
58 | last_epoch=-1)
59 | else:
60 | self.policy_model_optim = policy_model_optim
61 | self.value_model_optim = value_model_optim
62 |
63 | torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1, norm_type=2)
64 | torch.nn.utils.clip_grad_norm_(self.value_model.parameters(), 1, norm_type=2)
65 |
66 | self.run_policy = deepcopy(self.policy_model)
67 | self.run_value = deepcopy(self.value_model)
68 |
69 | super(TRPO_Agent, self).__init__(path)
70 | example_input = Variable(torch.rand((100,)+ self.env.observation_space.shape))
71 | self.writer.add_graph(self.policy_model, input_to_model=example_input)
72 | self.forward_step_show_list = []
73 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
74 | self.forward_ep_show_list = []
75 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]
76 |
77 | self.training_round = 0
78 | self.training_step = 0
79 | self.running_step = 0
80 | self.record_sample = None
81 | self.train_ticks = np.tile(np.arange(self.run_step), self.batch_training_round)
82 |
83 | def forward(self, observation):
84 | observation = observation[np.newaxis, :].astype(np.float32)
85 | observation = torch.from_numpy(observation)
86 | with torch.no_grad():
87 | outcome = self.run_policy.forward(observation)
88 | self.pd = self.dist(outcome)
89 | self.action = self.pd.sample()
90 | self.Q = self.run_value.forward(observation)
91 | return self.action.squeeze(0).detach().numpy(), self.Q.squeeze(0).data.numpy(), {}
92 |
93 | def backward(self, sample_):
94 | sample_["logp"] = self.pd.log_prob(self.action)
95 | sample_["value"] = self.Q
96 | self.replay_buffer.push(sample_)
97 | self.running_step += 1
98 | """"""""""""""
99 | "training part"
100 | "in each step, we train for batch batch_training_times"
101 | """"""""""""""
102 | if self.step > self.learning_starts:
103 | if self.running_step % self.run_step == 0 and self.training_step == 0:
104 | " sample advantage generate "
105 | with torch.no_grad():
106 | sample = self.replay_buffer.recent_step_sample(self.running_step)
107 | last_value = self.value_model.forward(sample["s_"][-1])
108 | self.record_sample = gae(sample, last_value, self.gamma, self.lam)
109 | self.running_step = 0
110 |
111 | if self.training_step < self.sample_training_step and self.record_sample is not None:
112 | pg_loss_re = 0
113 | entropy_re = 0
114 | vf_loss_re = 0
115 | loss_re = 0
116 | for _ in range(self.batch_training_round):
117 | index = self.train_ticks[self.training_step]
118 | S = self.record_sample["s"][index].detach()
119 | A = self.record_sample["a"][index].detach()
120 | old_log = self.record_sample["logp"][index].detach()
121 | advs = self.record_sample["advs"][index].detach()
122 | value = self.record_sample["value"][index].detach()
123 | returns = self.record_sample["return"][index].detach()
124 | # generate Policy gradient loss
125 | outcome = self.run_policy.forward(S)
126 | new_policy = self.dist(outcome)
127 | new_lop = new_policy.log_prob(A)
128 | ratio = torch.exp(new_lop - old_log)
129 | pg_loss1 = advs * ratio
130 | pg_loss2 = advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
131 | pg_loss = -.5 * torch.min(pg_loss1, pg_loss2).mean()
132 | # value loss
133 | value_now = self.run_value.forward(S)
134 | value_clip = value + torch.clamp(value_now - value, min=-self.cliprange,
135 | max=self.cliprange) # Clipped value
136 | vf_loss1 = self.loss_cal(value_now, returns) # Unclipped loss
137 | vf_loss2 = self.loss_cal(value_clip, returns) # clipped loss
138 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
139 | # vf_loss = 0.5 * vf_loss1
140 | # entropy
141 | entropy = new_policy.entropy().mean()
142 | loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef
143 | # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"])
144 | # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange)
145 |
146 | self.value_model_optim.zero_grad()
147 | loss.backward(retain_graph=True)
148 | self.value_model_optim.step()
149 |
150 | self.policy_model_optim.zero_grad()
151 | loss.backward()
152 | self.policy_model_optim.step()
153 |
154 | self.training_step += 1
155 | pg_loss_re += pg_loss.data.numpy()
156 | entropy_re += entropy.data.numpy()
157 | vf_loss_re += vf_loss.data.numpy()
158 | loss_re += loss.data.numpy()
159 |
160 | if self.training_step == self.sample_training_step:
161 |                 print("the " + str(self.episode) + " round of training has finished")
162 | self.run_policy.load_state_dict(self.policy_model.state_dict())
163 | self.run_value.load_state_dict(self.value_model.state_dict())
164 | self.training_step = 0
165 | self.record_sample = None
166 | return loss_re, {"pg_loss": pg_loss_re, "entropy": entropy_re, "vf_loss": vf_loss_re}
167 | return 0, {"pg_loss": 0, "entropy": 0, "vf_loss": 0}
168 |
169 | def load_weights(self, filepath):
170 |         model = torch.load(filepath + "TRPO.pkl")
171 | self.policy_model.load_state_dict(model["policy_model"].state_dict())
172 | self.value_model.load_state_dict(model["value_model"].state_dict())
173 |
174 | def save_weights(self, filepath, overwrite=False):
175 | torch.save({"policy_model": self.policy_model, "value_model": self.value_model}, filepath + "TRPO.pkl")
--------------------------------------------------------------------------------
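
Both TRPO.py above and the PPO agents rely on generalized advantage estimation through the gae/get_gae helpers in Torch_rl.common.util, which are not shown in this listing. The following is a textbook GAE recursion for reference, assuming 1-D reward/done/value tensors; the repository's helpers may differ in detail.

import torch

def gae_advantages(rewards, dones, values, last_value, gamma=0.99, lam=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    # A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
    advs = torch.zeros_like(rewards)
    next_adv, next_value = 0.0, last_value
    for t in reversed(range(rewards.shape[0])):
        mask = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * mask - values[t]
        next_adv = delta + gamma * lam * mask * next_adv
        advs[t] = next_adv
        next_value = values[t]
    returns = advs + values
    return returns, advs
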
/Torch_rl/agent/core_policy.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from abc import ABC
3 | import numpy as np
4 | import torch
5 | from Torch_rl.common import logger
6 | from Torch_rl.common.logger import CSVOutputFormat
7 | from Torch_rl.common.memory import ReplayMemory
8 | from Torch_rl.common.distribution import *
9 |
10 | from Torch_rl.common.util import csv_record
11 |
12 |
13 |
14 | class Agent_policy_based(ABC):
15 |     """
16 |     Abstract base class for all policy-based agents
17 |     (parent class of every implemented algorithm).
18 | 
19 |     It provides:
20 |     - `runner`        generate samples according to the current policy
21 |     - `learning`      train the networks on the collected samples
22 |     - `load_weights`  load the network weights
23 |     - `save_weights`  save the network weights
24 |     - `layers`        the network layers
25 |     - `forward`       the forward pass
26 |     - `backward`      the backward pass (network update)
27 |     An `episode` is defined as one complete run of the environment.
28 |     A `step` is defined as one interaction with the environment.
29 |     """
30 | def __init__(self, path):
31 | self.step = 0
32 | self.episode = 0
33 | """
34 | config the logfile
35 | """
36 | configlist = ["stdout", "log", 'tensorboard', "csv"]
37 | if path is None:
38 | path = "./"
39 | logger.configure(path, configlist)
40 | self.csvwritter = CSVOutputFormat(path+"record_trajectory.csv")
41 | loggerCEN = logger.get_current().output_formats[configlist.index('tensorboard')]
42 | self.writer = loggerCEN.writer
43 | self.path = path
44 |
45 | def imitation_learning(self):
46 | pass
47 |
48 | def train(self, max_step=None, max_ep_cycle=2000, verbose=2, learning_start=1000, render=False, record_ep_inter=None):
49 | self.learning = True
50 | print("the train phase ........")
51 | self.interact(max_step=max_step, max_ep_cycle=max_ep_cycle, learning_start=learning_start, render=render,
52 | verbose=verbose, record_ep_inter=record_ep_inter)
53 |
54 | def test(self, max_step=None, max_ep_cycle=2000, verbose=2, render=False, record_ep_inter=None):
55 | self.learning = False
56 | self.learning_starts = 0
57 | self.step = 0
58 | self.episode = 0
59 | print("the test phase ........")
60 | self.interact(max_step=max_step, max_ep_cycle=max_ep_cycle, render=render,
61 | verbose=verbose, record_ep_inter=record_ep_inter)
62 |
63 | def interact(self, max_step=50000, max_ep_cycle=2000, train_rollout=10,learning_start=1000,
64 | render = False, verbose=1, record_ep_inter=None):
65 | '''
66 | :param max_step:
67 | :param max_ep_time:
68 | :param max_ep_cycle: max step in per circle
69 | .........................show parameter..................................
70 | :param verbose
71 | if verbose == 1 show every ep
72 | if verbose == 2 show every step
73 | :param record_ep_inter
74 | record_ep_interact data
75 | :return: None
76 | '''
77 | # if IL_time is not None:
78 | self.render = render
79 |
80 |         # .....................initial record...........................#
81 | rollout = 0
82 | now_best_reward = -np.inf
83 |
84 | self.dist = make_pdtype(self.env.action_space, self.policy)
85 | sample_generate = self.runner(self.sample_rollout, self.sample_ep, max_ep_cycle, record_ep_inter, lstm_enable=self.lstm_enable)
86 | while self.step < max_step:
87 | sample = next(sample_generate)
88 | logger.record_tabular("01.step", self.step)
89 | logger.record_tabular("02.episode",self.episode)
90 | logger.record_tabular("03.rollout", rollout)
91 | logger.record_tabular("04.rollout/ep", sample["ep_used"])
92 | logger.record_tabular("05.rollout/step", sum(sample["ep_step_used"]))
93 | logger.record_tabular("06.mean_episode_reward", np.mean(sample["ep_reward"]))
94 | logger.record_tabular("07.mean_step_reward", np.mean(sample["buffer"]["r"]))
95 | logger.record_tabular("08.mean_ep_step_used", np.mean(sample["ep_step_used"]))
96 | logger.dump_tabular()
97 | csv_record(sample["ep_reward"], self.path)
98 | record_sample = sample["buffer"]
99 |
100 | rollout += 1
101 |
102 | if self.step > learning_start and self.learning:
103 | ep_show = {}
104 | if self.backward_ep_show_list:
105 | for key in self.backward_ep_show_list:
106 | ep_show[key] = 0
107 | rollout_loss = 0
108 | for time in range(train_rollout):
109 | loss, other_infor = self.update(record_sample)
110 | if verbose == 1:
111 | logger.record_tabular("06.train_rollout", time)
112 | logger.record_tabular("07.loss", loss)
113 | flag = 10
114 | if self.backward_step_show_list:
115 | for key in self.backward_step_show_list:
116 | logger.record_tabular(str(flag) +"."+ key, other_infor[key])
117 | flag += 1
118 | logger.dump_tabular()
119 | rollout_loss += loss
120 | if self.backward_ep_show_list:
121 | for key in self.backward_ep_show_list:
122 | ep_show[key] += other_infor[key]
123 | if verbose == 2:
124 | logger.record_tabular("06.rollouts/loss", rollout_loss)
125 | logger.record_tabular("07.rollouts/episode_Q_value", torch.mean(
126 | torch.tensor(sample["ep_Q_value"])).cpu().detach().numpy())
127 | # logger.record_tabular("05.episode_loss_per_step", rollout_loss / samole["step_used"])
128 | # logger.record_tabular("06.episode_Q_value", sample["ep_Q_value"])
129 | # logger.record_tabular("07.episode_Q_value_per_ep", np.mean(sample["ep_Q_value"]))
130 |
131 | flag = 10
132 | if self.backward_ep_show_list:
133 | for key in self.backward_ep_show_list:
134 | logger.record_tabular(str(flag) + "." + key, ep_show[key])
135 | flag += 1
136 | logger.dump_tabular()
137 | if np.mean(sample["ep_reward"])>now_best_reward:
138 | self.save_weights(self.path)
139 | print("the best mean ep reward is ", np.mean(sample["ep_reward"]), "the weight is saved")
140 | now_best_reward = np.mean(sample["ep_reward"])
141 |
142 | def runner(self, sample_step=None, sample_ep=None, max_ep_step=2000, record_ep_inter=None, lstm_enable=False):
143 | if sample_step is not None:
144 | buffer = ReplayMemory(sample_step, ["value", "logp","info"])
145 | else:
146 | buffer = ReplayMemory(sample_ep*max_ep_step, ["value", "logp","info"])
147 | s = self.env.reset()
148 | ep_reward, ep_Q_value, ep_step_used = [], [], []
149 | ep_r, ep_q, ep_cycle = 0, 0, 0
150 | while True:
151 | s = torch.from_numpy(s.astype(np.float32))
152 | with torch.no_grad():
153 | outcome = self.policy.forward(s.unsqueeze(0))
154 | Q = self.value.forward(s.unsqueeze(0))
155 | pd = self.dist(outcome)
156 | a = pd.sample()
157 | s_, r, done, info = self.env.step(a.cpu().squeeze(0).numpy())
158 | if self.render:
159 | self.env.render()
160 | ep_r += r
161 | ep_q += Q
162 | ep_cycle +=1
163 | self.step += 1
164 | logp = pd.log_prob(a)
165 | sample_ = {
166 | "s": s,
167 | "a": a.squeeze(0),
168 | "r": torch.tensor(np.array([r]).astype(np.float32)),
169 | "tr": torch.tensor([int(done)]),
170 | "s_":torch.from_numpy(s_),
171 | "logp": logp.squeeze(0),
172 | "value": Q.squeeze(0),
173 | "info": info}
174 | buffer.push(sample_)
175 | s = deepcopy(s_)
176 |
177 | if record_ep_inter is not None:
178 | if self.episode % record_ep_inter == 0:
179 | kvs = {"s": s, "a": a, "s_": s_, "r": r,
180 | "tr": done, "ep": self.episode, "step": self.step, "ep_step": ep_cycle}
181 | self.csvwritter.writekvs(kvs)
182 |
183 | if done:
184 | s = self.env.reset()
185 | self.episode += 1
186 | ep_reward.append(ep_r)
187 | ep_Q_value.append(ep_q)
188 | ep_step_used.append(ep_cycle)
189 | ep_r, ep_q, ep_cycle = 0, 0, 0
190 | if lstm_enable:
191 | self.policy.reset_h()
192 |
193 | if sample_step is not None:
194 | if self.step > 0 and self.step % sample_step==0:
195 | s_ = torch.from_numpy(s_[np.newaxis,:].astype(np.float32))
196 | with torch.no_grad():
197 | last_Q = self.value.forward(s_).squeeze()
198 | #print("now is we have sampled for :", self.step , "and" , self.episode,"\n",
199 | # "this round have sampled for " + str(sample_step) + " steps, ", len(ep_reward), "episode",
200 | # "and the mean reward per step is", np.mean(buffer.memory["r"]),
201 | # "the mean ep reward is ", np.mean(ep_reward))
202 | yield {"buffer": buffer.memory,
203 | "ep_reward": ep_reward,
204 | "ep_Q_value": ep_Q_value,
205 | "ep_step_used": ep_step_used,
206 | "ep_used": len(ep_reward),
207 | "step_used": sample_step,
208 | "last_Q" : last_Q
209 | }
210 | ep_reward, ep_Q_value, ep_step_used = [], [], []
211 | if sample_step is not None:
212 | buffer = ReplayMemory(sample_step, ["value", "logp","info"])
213 | else:
214 | buffer = ReplayMemory(sample_ep * max_ep_step, ["value", "logp","info"])
215 |
216 | else:
217 | if self.step > 0 and self.episode % sample_ep==0:
218 | s_ = torch.from_numpy(s_.astype(np.float32))
219 | last_Q = self.value.forward(s_)
220 | #print("now is we have sampled for :", self.step , "and" , self.episode,"\n",
221 | # "this round have sampled for " + str(sample_step) + " steps, ", len(ep_reward), "episode",
222 | # "and the mean reward per step is", np.mean(buffer.memory["r"]),
223 | # "the mean ep reward is ", np.mean(ep_reward))
224 | yield {"buffer": buffer.memory,
225 | "ep_reward": ep_reward,
226 | "ep_Q_value": ep_Q_value,
227 | "ep_step_used": ep_step_used,
228 | "ep_used": sample_ep,
229 | "step_used": len(buffer.memory["tr"]),
230 | "last_Q": last_Q
231 | }
232 |                     ep_reward, ep_Q_value, ep_step_used = [], [], []
233 | if sample_step is not None:
234 | buffer = ReplayMemory(sample_step, ["value", "logp","info"])
235 | else:
236 | buffer = ReplayMemory(sample_ep * max_ep_step, ["value", "logp","info"])
237 |
238 |
239 | def update(self, sample):
240 | """Updates the agent after having executed the action returned by `forward`.
241 | If the policy is implemented by a neural network, this corresponds to a weight update using back-prop.
242 |
243 | # Argument
244 | reward (float): The observed reward after executing the action returned by `forward`.
245 | terminal (boolean): `True` if the new state of the environment is terminal.
246 |
247 | # Returns
248 | List of metrics values
249 | """
250 | raise NotImplementedError()
251 |
252 | def load_weights(self, filepath):
253 | """Loads the weights of an agent from an HDF5 file.
254 |
255 | # Arguments
256 | filepath (str): The path to the HDF5 file.
257 | """
258 | raise NotImplementedError()
259 |
260 | def save_weights(self, filepath, overwrite=False):
261 | """Saves the weights of an agent as an HDF5 file.
262 |
263 | # Arguments
264 | filepath (str): The path to where the weights should be saved.
265 | overwrite (boolean): If `False` and `filepath` already exists, raises an error.
266 | """
267 | raise NotImplementedError()
268 |
269 | def cuda(self):
270 | """
271 | use the cuda
272 | """
273 | raise NotImplementedError()
274 |
275 |
276 | def Imitation_Learning(self, step_time, data=None, policy=None,learning_start=1000,
277 | buffer_size = 5000, value_training_round = 10, value_training_fre = 2500,
278 | verbose=2,render = False):
279 | '''
280 | :param data: the data is a list, and each element is a dict with 5 keys s,a,r,s_,tr
281 | sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
282 | :param policy:
283 | :return:
284 | '''
285 | if data is not None and policy is not None:
286 |             raise Exception("Imitation learning needs exactly one guidance source (data or policy); please check the input")
287 |
288 | if data is not None:
289 |             for time in range(step_time):
290 | self.step += 1
291 | loss = self.backward(data[time])
292 | if verbose == 1:
293 | logger.record_tabular("steps", self.step)
294 | logger.record_tabular("loss", loss)
295 | logger.dumpkvs()
296 |
297 | if policy is not None:
298 | buffer = ReplayMemory(buffer_size)
299 | s = self.env.reset()
300 | loss_BC = 0
301 | ep_step,ep_reward = 0, 0
302 | for _ in range(step_time):
303 | self.step += 1
304 | ep_step += 1
305 | a = policy(self.env)
306 | s_, r, done, info = self.env.step(a)
307 | #print(r,info)
308 | ep_reward += r
309 | if render:
310 | self.env.render()
311 | sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
312 | buffer.push(sample)
313 | s = s_[:]
314 | if self.step > learning_start:
315 | sample_ = buffer.sample(self.batch_size)
316 | loss = self.policy_behavior_clone(sample_)
317 | if self.step % value_training_fre==0:
318 | record_sample = {}
319 | for key in buffer.memory.keys():
320 | record_sample[key] = np.array(buffer.memory[key]).astype(np.float32)[-value_training_fre:]
321 | record_sample["value"] = self.value.forward(torch.from_numpy(record_sample["s"]))
322 | returns, advants = get_gae(record_sample["r"], record_sample["tr"], record_sample["value"],
323 | self.gamma, self.lam)
324 | record_sample["advs"] = advants
325 | record_sample["return"] = returns
326 | for round_ in range(value_training_round):
327 | loss_value = self.value_pretrain(record_sample, value_training_fre)
328 | print(round_, loss_value)
329 |
330 | if verbose == 1:
331 | logger.record_tabular("learning_steps", self.step)
332 | logger.record_tabular("loss", loss)
333 |                         logger.record_tabular("reward", r)
334 | logger.dumpkvs()
335 | loss_BC += loss
336 | if done:
337 | if verbose == 2:
338 | logger.record_tabular("learning_steps", self.step)
339 | logger.record_tabular("step_used", ep_step)
340 | logger.record_tabular("loss", loss_BC/ep_step)
341 | logger.record_tabular("ep_reward",ep_reward )
342 | logger.dumpkvs()
343 |
344 | s = self.env.reset()
345 | loss_BC = 0
346 | ep_step,ep_reward = 0, 0
347 |
348 | def policy_behavior_clone(self, sample_):
349 | raise NotImplementedError()
350 |
351 |     def value_pretrain(self, record_sample, new_sample_len):
352 | raise NotImplementedError()
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
--------------------------------------------------------------------------------
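
For orientation, a concrete agent built on Agent_policy_based is expected to set the attributes that interact()/runner() read (env, policy, value, sample_rollout, sample_ep, lstm_enable, and the *_show_list fields) and to override update(), load_weights(), save_weights() and cuda(). The skeleton below is a hedged illustration of that contract, not a working agent from this repository:

from Torch_rl.agent.core_policy import Agent_policy_based

class MinimalPolicyAgent(Agent_policy_based):
    def __init__(self, env, policy_model, value_model, path=None):
        self.env = env
        self.policy = policy_model          # used by runner() to sample actions
        self.value = value_model            # used by runner() to estimate V(s)
        self.sample_rollout = 2048          # steps per rollout handed to update()
        self.sample_ep = 20                 # alternative: episodes per rollout
        self.lstm_enable = False
        self.backward_step_show_list = []
        self.backward_ep_show_list = []
        super(MinimalPolicyAgent, self).__init__(path)

    def update(self, sample):               # one training pass over a rollout
        return 0.0, {}

    def load_weights(self, filepath):
        pass

    def save_weights(self, filepath, overwrite=False):
        pass

    def cuda(self):
        pass
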
/Torch_rl/agent/core_value.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from abc import ABC
3 | import numpy as np
4 | from Torch_rl.common import logger
5 | from Torch_rl.common.logger import CSVOutputFormat
6 | import torch
7 |
8 |
9 | class Agent_value_based(ABC):
10 |     """
11 |     Abstract base class for all value-based agents
12 |     (parent class of every implemented algorithm).
13 | 
14 |     It provides:
15 |     - `forward`       the forward pass: compute the action to take
16 |     - `backward`      the backward pass: update the networks
17 |     - `load_weights`  load the network weights
18 |     - `save_weights`  save the network weights
19 |     - `layers`        the network layers
20 | 
21 |     Conventions:
22 |     - an `episode` is one complete run of the environment
23 |     - a `step` is one interaction with the environment
24 |     """
25 | def __init__(self, path):
26 | self.step = 0
27 | self.episode = 0
28 | """
29 | config the logfile
30 | """
31 | configlist = ["stdout", "log", 'tensorboard', "csv"]
32 | if path is None:
33 | path = "./"
34 |
35 | logger.configure(path, configlist)
36 | self.csvwritter = CSVOutputFormat(path+"record_trajectory.csv")
37 | loggerCEN = logger.get_current().output_formats[configlist.index('tensorboard')]
38 | self.writer = loggerCEN.writer
39 | self.path = path
40 |
41 | def imitation_learning(self):
42 | pass
43 |
44 | def train(self, max_step=None, max_ep_cycle=2000, verbose=2, render=False, record_ep_inter=None):
45 | self.learning = True
46 | print("the train phase ........")
47 | self.interact(max_step=max_step, max_ep_cycle=max_ep_cycle, render=render,
48 | verbose=verbose, record_ep_inter=record_ep_inter)
49 |
50 | def test(self, max_step=None, max_ep_cycle=2000, verbose=2, render=False, record_ep_inter=None):
51 | self.learning = False
52 | self.learning_starts = 0
53 | self.step = 0
54 | self.episode = 0
55 | print("the test phase ........")
56 | self.interact(max_step=max_step, max_ep_cycle=max_ep_cycle, render=render,
57 | verbose=verbose, record_ep_inter=record_ep_inter)
58 |
59 | def interact(self, max_step=50000, max_ep_cycle=2000, render = False,
60 | verbose=1, record_ep_inter=None):
61 | '''
62 | :param max_step:
63 | :param max_ep_time:
64 | :param max_ep_cycle: max step in per circle
65 | .........................show parameter..................................
66 | :param verbose
67 | if verbose == 1 show every ep
68 | if verbose == 2 show every step
69 | :param record_ep_inter
70 | record_ep_interact data
71 | :return: None
72 | '''
73 | # if IL_time is not None:
74 |
75 |         # .....................initial record...........................#
76 | ep_reward = []
77 | ep_Q_value = []
78 | ep_loss = []
79 | now_best_reward = -np.inf
80 | while self.step < max_step:
81 | s = self.env.reset()
82 | 'reset the ep record'
83 | ep_r, ep_q, ep_l = 0, 0, 0
84 | 'reset the RL flag'
85 | ep_cycle, done = 0, 0
86 | ep_show={}
87 | if self.backward_ep_show_list:
88 | for key in self.backward_ep_show_list:
89 | ep_show[key] = 0
90 | self.episode += 1
91 | while done == 0 and ep_cycle < max_ep_cycle:
92 | self.step += 1
93 | ep_cycle += 1
94 | 'the interaction part'
95 | a, Q, info_forward = self.forward(s)
96 | # print(a)
97 | s_, r, done, info = self.env.step(a)
98 | sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
99 | s = deepcopy(s_)
100 | loss, info_backward = self.backward(sample)
101 | if render:
102 | self.env.render()
103 | 'the record part'
104 |
105 | if verbose == 1 and self.step > self.learning_starts:
106 | logger.record_tabular("steps", self.step)
107 | logger.record_tabular("episodes", self.episode)
108 | logger.record_tabular("loss", loss)
109 | logger.record_tabular("reward", r)
110 | logger.record_tabular("Q", Q)
111 | if self.forward_step_show_list:
112 | for key in self.forward_step_show_list:
113 | logger.record_tabular(key, info_forward[key])
114 | if self.backward_step_show_list:
115 | for key in self.backward_step_show_list:
116 | logger.record_tabular(key, info_backward[key])
117 | logger.dump_tabular()
118 | if record_ep_inter is not None:
119 | if self.episode % record_ep_inter == 0:
120 | kvs = {"s": s, "a": a, "s_": s_, "r": r,
121 | "tr": done, "ep": self.episode, "step": self.step, "ep_step": ep_cycle}
122 | self.csvwritter.writekvs(kvs)
123 | ep_r += r
124 | ep_q += Q
125 | ep_l += loss
126 | if self.backward_ep_show_list:
127 | for key in self.backward_ep_show_list:
128 | ep_show[key] += info_backward[key]
129 | if done:
130 | ep_reward.append(ep_r)
131 | ep_Q_value.append(ep_q)
132 | ep_loss.append(ep_l)
133 | mean_100ep_reward = round(np.mean(ep_reward[-101:-1]), 1)
134 | if verbose == 2 and self.step > self.learning_starts:
135 | logger.record_tabular("01.steps", self.step)
136 | logger.record_tabular("02.episodes", self.episode)
137 | logger.record_tabular("03.episode_reward", ep_reward[-1])
138 | # logger.record_tabular("04.episode_reward_per_step", ep_reward[-1] / ep_cycle)
139 | logger.record_tabular("05.episode_loss", ep_l)
140 | # logger.record_tabular("06.episode_loss_per_step", ep_l / ep_cycle)
141 | # logger.record_tabular("07.episode_Q_value", ep_q)
142 | logger.record_tabular("08.episode_Q_value_per_step", ep_q / ep_cycle)
143 | # logger.record_tabular("09.mean 100 episode reward", mean_100ep_reward)
144 | # logger.record_tabular("10.step_used", ep_cycle)
145 | flag = 11
146 | if self.forward_ep_show_list:
147 | for key in self.forward_ep_show_list:
148 | logger.record_tabular(str(flag) + "." + key, info_forward[key])
149 | flag += 1
150 | if self.backward_ep_show_list:
151 | for key in self.backward_ep_show_list:
152 | logger.record_tabular(str(flag) + "." + key, ep_show[key])
153 | flag += 1
154 | logger.dump_tabular()
155 | if np.mean(ep_r)>now_best_reward:
156 | self.save_weights(self.path)
157 | print("the best mean ep reward is ", np.mean(ep_r), "the weight is saved")
158 | now_best_reward = np.mean(ep_r)
159 |
160 |
161 | def forward(self, observation):
162 |         """Takes an observation from the environment and returns the action to be taken next.
163 | If the policy is implemented by a neural network, this corresponds to a forward (inference) pass.
164 |
165 | # Argument
166 | observation (object): The current observation from the environment.
167 |
168 | # Returns
169 | The next action to be executed in the environment.
170 | """
171 | raise NotImplementedError()
172 |
173 | def backward(self, sample):
174 | """Updates the agent after having executed the action returned by `forward`.
175 | If the policy is implemented by a neural network, this corresponds to a weight update using back-prop.
176 |
177 | # Argument
178 | reward (float): The observed reward after executing the action returned by `forward`.
179 | terminal (boolean): `True` if the new state of the environment is terminal.
180 |
181 | # Returns
182 | List of metrics values
183 | """
184 | raise NotImplementedError()
185 |
186 | def load_weights(self, filepath):
187 | """Loads the weights of an agent from an HDF5 file.
188 |
189 | # Arguments
190 | filepath (str): The path to the HDF5 file.
191 | """
192 | raise NotImplementedError()
193 |
194 | def save_weights(self, filepath, overwrite=False):
195 | """Saves the weights of an agent as an HDF5 file.
196 |
197 | # Arguments
198 | filepath (str): The path to where the weights should be saved.
199 | overwrite (boolean): If `False` and `filepath` already exists, raises an error.
200 | """
201 | raise NotImplementedError()
202 |
203 | def cuda(self):
204 | """
205 | use the cuda
206 | """
207 | raise NotImplementedError()
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
--------------------------------------------------------------------------------
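
The interaction loop in Agent_value_based.interact() only assumes that forward() returns (action, Q_estimate, info_dict) and backward() returns (loss, info_dict), plus the bookkeeping attributes the concrete agents above set in their constructors. A hedged, do-nothing illustration of that interface (not part of the repository):

from Torch_rl.agent.core_value import Agent_value_based

class RandomValueAgent(Agent_value_based):
    def __init__(self, env, path=None):
        self.env = env
        self.learning_starts = 0
        self.forward_step_show_list, self.forward_ep_show_list = [], []
        self.backward_step_show_list, self.backward_ep_show_list = [], []
        super(RandomValueAgent, self).__init__(path)

    def forward(self, observation):
        # (action, Q estimate, extra info) -- here a random action and a dummy Q
        return self.env.action_space.sample(), 0.0, {}

    def backward(self, sample):
        # (loss, extra info) -- a real agent would push to a buffer and train here
        return 0.0, {}

    def save_weights(self, filepath, overwrite=False):
        pass
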
/Torch_rl/algorithm/PPO_LSTM.py:
--------------------------------------------------------------------------------
1 |
2 | from Torch_rl.agent.core_policy import Agent_policy_based
3 | import torch.nn as nn
4 | from copy import deepcopy
5 | from Torch_rl.common.distribution import *
6 | from torch.optim import Adam
7 | from torch.autograd import Variable
8 | from Torch_rl.common.memory import ReplayMemory
9 |
10 |
11 | class PPO_Agent(Agent_policy_based):
12 | def __init__(self, env, policy_model, value_model,
13 | lr=5e-4, ent_coef=0.01, vf_coef=0.5,
14 |                  ## hyper-parameter
15 | gamma=0.99, lam=0.95, cliprange=0.2, batch_size=64, value_train_round=200,
16 | running_step=2048, running_ep=20, value_regular=0.01, buffer_size=50000,
17 | ## decay
18 | decay=False, decay_rate=0.9, lstm_enable=False,
19 | ##
20 | path=None):
21 | self.gpu = False
22 | self.env = env
23 | self.gamma = gamma
24 | self.lam = lam
25 | self.ent_coef = ent_coef
26 | self.vf_coef = vf_coef
27 | self.cliprange = cliprange
28 |
29 | self.value_train_step = value_train_round
30 |
31 | self.sample_rollout = running_step
32 | self.sample_ep = running_ep
33 | self.batch_size = batch_size
34 | self.lstm_enable = lstm_enable
35 | self.replay_buffer = ReplayMemory(buffer_size, other_record=["value", "return"])
36 |
37 | self.loss_cal = torch.nn.SmoothL1Loss()
38 |
39 | self.policy = policy_model
40 | if value_model == "shared":
41 | self.value = policy_model
42 | elif value_model == "copy":
43 | self.value = deepcopy(policy_model)
44 | else:
45 | self.value = value_model
46 |
47 | self.dist = make_pdtype(env.action_space, policy_model)
48 |
49 | self.policy_model_optim = Adam(self.policy.parameters(), lr=lr)
50 | self.value_model_optim = Adam(self.value.parameters(), lr=lr, weight_decay=value_regular)
51 | if decay:
52 | self.policy_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.policy_model_optim, decay_rate,
53 | last_epoch=-1)
54 | self.value_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.value_model_optim, decay_rate,
55 | last_epoch=-1)
56 |
57 | super(PPO_Agent, self).__init__(path)
58 | #example_input = Variable(torch.rand((100,)+self.env.observation_space.shape))
59 | #self.writer.add_graph(self.policy, input_to_model=example_input)
60 |
61 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
62 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]
63 |
64 | self.training_round = 0
65 | self.running_step = 0
66 | self.record_sample = None
67 | self.training_step = 0
68 | self.lstm_enable = True
69 |
70 | def update(self, sample):
71 | step_len = len(sample["s"])
72 | for ki in range(step_len):
73 | sample_ = {
74 | "s": sample["s"][ki].cpu().numpy(),
75 | "a": sample["a"][ki].cpu().numpy(),
76 | "r": sample["r"][ki].cpu().numpy(),
77 | "tr": sample["tr"][ki].cpu().numpy(),
78 | "s_": sample["s_"][ki].cpu().numpy(),
79 | "value": sample["value"][ki].cpu().numpy(),
80 | "return": sample["return"][ki].cpu().numpy()
81 | }
82 | self.replay_buffer.push(sample_)
83 | '''
84 | train the value part
85 | '''
86 | vfloss_re = []
87 | for _ in range(self.value_train_step):
88 |             train_value_sample = self.replay_buffer.sample(self.batch_size)
89 |             for key in train_value_sample.keys():
90 |                 if self.gpu:
91 |                     train_value_sample[key] = train_value_sample[key].cuda()
92 |                 else:
93 |                     train_value_sample[key] = train_value_sample[key]
94 |             old_value = train_value_sample["value"]
95 |             training_s = train_value_sample["s"]
96 |             R = train_value_sample["return"].squeeze()
97 |             value_now = self.value.forward(training_s).squeeze()
98 |             # value loss
99 |             value_clip = old_value + torch.clamp(value_now - old_value, min=-self.cliprange,
100 |                                                  max=self.cliprange) # Clipped value
101 | vf_loss1 = self.loss_cal(value_now, R) # Unclipped loss
102 | # vf_loss2 = self.loss_cal(value_clip, R) # clipped loss
103 | # vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
104 | self.value_model_optim.zero_grad()
105 | vf_loss1.backward()
106 | self.value_model_optim.step()
107 | vfloss_re.append(vf_loss1.cpu().detach().numpy())
108 |
109 | '''
110 | train the policy part
111 | '''
112 |
113 | for key in sample.keys():
114 | temp = torch.stack(list(sample[key]), 0).squeeze()
115 | if self.gpu:
116 | sample[key] = temp.cuda()
117 | else:
118 | sample[key] = temp
119 |
120 | array_index = []
121 | if self.lstm_enable:
122 | for time in range(step_len):
123 | array_index.append([time])
124 |                 "reset the hidden state before training"
125 | self.policy.reset_h()
126 | time_round = step_len
127 | else:
128 | time_round = np.ceil(step_len / self.batch_size)
129 | time_left = time_round * self.batch_size - step_len
130 | array = list(range(step_len)) + list(range(int(time_left)))
131 | array_index = []
132 | for train_time in range(int(time_round)):
133 | array_index.append(array[train_time * self.batch_size: (train_time + 1) * self.batch_size])
134 |
135 | loss_re, pgloss_re, enloss_re = [], [], []
136 | for train_time in range(int(time_round)):
137 | index = array_index[train_time]
138 | training_s = sample["s"][index].detach()
139 | training_a = sample["a"][index].detach()
140 | old_neglogp = sample["logp"][index].detach()
141 | advs = sample["advs"][index].detach()
142 |
143 | " CALCULATE THE LOSS"
144 | " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss"
145 |
146 | #generate Policy gradient loss
147 | outcome = self.policy.forward(training_s).squeeze()
148 | # new_neg_lop = torch.empty(size=(self.batch_size,))
149 | # for time in range(self.batch_size):
150 | # new_policy = self.dist(outcome[time])
151 | # new_neg_lop[time] = new_policy.log_prob(training_a[time])
152 | new_policy = self.dist(outcome)
153 | new_neg_lop = new_policy.log_prob(training_a)
154 | ratio = torch.exp(new_neg_lop - old_neglogp)
155 | pg_loss1 = -advs * ratio
156 | pg_loss2 = -advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
157 | pg_loss = .5 * torch.max(pg_loss1, pg_loss2).mean()
158 |
159 | # entropy
160 | entropy = new_policy.entropy().mean()
161 | # loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef
162 | loss = pg_loss - entropy * self.ent_coef
163 | self.policy_model_optim.zero_grad()
164 | loss.backward()
165 | self.policy_model_optim.step()
166 | # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"])
167 | # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange)
168 |             loss_re.append(loss.cpu().detach().numpy())
169 | pgloss_re.append(pg_loss.cpu().detach().numpy())
170 | enloss_re.append(entropy.cpu().detach().numpy())
171 | if self.lstm_enable:
172 | if sample["tr"][index] == 1:
173 | self.policy.reset_h()
174 | return np.sum(loss_re), {"pg_loss": np.sum(pgloss_re),
175 | "entropy": np.sum(enloss_re),
176 | "vf_loss": np.sum(vfloss_re)}
177 |
178 |
179 | def load_weights(self, filepath):
180 | model = torch.load(filepath+"/PPO.pkl")
181 | self.policy.load_state_dict(model["policy"].state_dict())
182 | self.value.load_state_dict(model["value"].state_dict())
183 |
184 |
185 | def save_weights(self, filepath, overwrite=False):
186 | torch.save({"policy": self.policy,"value": self.value}, filepath + "/PPO.pkl")
187 |
188 | def policy_behavior_clone(self, sample_):
189 | action_label = sample_["a"].squeeze()
190 | if self.gpu:
191 | action_predict = self.policy(sample_["s"].cuda())
192 | action_label = action_label.cuda()
193 | else:
194 | action_predict = self.policy(sample_["s"])
195 | loss_bc = self.loss_cal(action_label, action_predict)
196 | del action_label
197 | del action_predict
198 | loss = loss_bc
199 | self.policy_model_optim.zero_grad()
200 | loss.backward()
201 | nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2)
202 | self.policy_model_optim.step()
203 | return loss.cpu().detach().numpy()
204 |
205 | def value_pretrain(self, record_sample, new_sample_len):
206 | train_times = int(np.floor(new_sample_len/128))
207 | round_loss = 0
208 | for io in range(train_times-1):
209 | index = list(range(128 * io, 128 * (io + 1)))
210 | if self.gpu:
211 |                 predict = torch.from_numpy(np.array(record_sample["s"])[index]).cuda()
212 |                 label = torch.from_numpy(np.array(record_sample["return"]))[index].cuda()
213 |             else:
214 |                 predict = torch.from_numpy(np.array(record_sample["s"])[index])
215 |                 label = torch.from_numpy(np.array(record_sample["return"]))[index]
216 |             value_now = self.value.forward(predict)
217 |             # value loss
218 |             vf_loss = self.loss_cal(value_now, label) # Unclipped loss
219 |             del predict
220 |             del label
221 | self.value_model_optim.zero_grad()
222 | vf_loss.backward()
223 | self.value_model_optim.step()
224 | round_loss += vf_loss.cpu().detach().numpy()
225 | return round_loss
226 |
227 | def cuda(self, device=None):
228 | self.policy.to_gpu(device)
229 | self.value.to_gpu(device)
230 | self.loss_cal = self.loss_cal.cuda(device)
231 | self.gpu = True
--------------------------------------------------------------------------------
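
PPO_LSTM assumes the policy network keeps its recurrent hidden state between forward() calls and exposes a reset_h() method that the agent calls at episode boundaries. A hedged sketch of that interface follows; the layer sizes and class name are illustrative and are not the repository's model classes.

import torch
from torch import nn

class RecurrentPolicy(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden=64):
        super(RecurrentPolicy, self).__init__()
        self.lstm = nn.LSTM(obs_dim, hidden, batch_first=True)
        self.head = nn.Linear(hidden, act_dim)
        self.h = None                                  # carried (h, c) state

    def forward(self, obs):                            # obs: (batch, obs_dim)
        out, self.h = self.lstm(obs.unsqueeze(1), self.h)
        self.h = tuple(t.detach() for t in self.h)     # truncate BPTT between calls
        return self.head(out.squeeze(1))

    def reset_h(self):
        self.h = None                                  # clear state at episode end
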
/Torch_rl/algorithm/PPO_Lagrangian.py:
--------------------------------------------------------------------------------
1 |
2 | from Torch_rl.agent.core_policy import Agent_policy_based
3 | import torch.nn as nn
4 | from copy import deepcopy
5 | from Torch_rl.common.distribution import *
6 | from torch.optim import Adam
7 | from torch.autograd import Variable
8 | from Torch_rl.common.util import get_gae
9 |
10 |
11 | class ui_model(torch.nn.Module):
12 |     def __init__(self):
13 |         super(ui_model, self).__init__()
14 |         self.ui = torch.nn.Parameter(torch.ones(1), requires_grad=True)
15 |     def forward(self, x):
16 |         return self.ui * x
17 |
18 | class PPO_LAGRANGIAN_Agent(Agent_policy_based):
19 | def __init__(self, env, policy_model, value_model,
20 | lr=5e-4, ent_coef=0.01, vf_coef=0.5,
21 |                  ## hyper-parameter
22 | gamma=0.99, lam=0.95, cliprange=0.2, batch_size=64, value_train_round=10,
23 | running_step=2048, running_ep=20, value_regular=0.01,
24 | ## decay
25 | decay=False, decay_rate=0.9, lstm_enable=False,
26 | ##
27 | path=None):
28 | self.gpu = False
29 | self.env = env
30 | self.gamma = gamma
31 | self.lam = lam
32 | self.ent_coef = ent_coef
33 | self.vf_coef = vf_coef
34 | self.cliprange = cliprange
35 |
36 | self.value_train_step = value_train_round
37 |
38 | self.sample_rollout = running_step
39 | self.sample_ep = running_ep
40 | self.batch_size = batch_size
41 | self.lstm_enable = lstm_enable
42 |
43 | self.loss_cal = torch.nn.SmoothL1Loss()
44 |
45 | self.policy = policy_model
46 | if value_model == "shared":
47 | self.value = policy_model
48 | elif value_model == "copy":
49 | self.value = deepcopy(policy_model)
50 | else:
51 | self.value = value_model
52 |
53 | self.cost_value = deepcopy(self.value)
54 |
55 | self.dist = make_pdtype(env.action_space, policy_model)
56 |
57 | self.policy_model_optim = Adam(self.policy.parameters(), lr=lr)
58 | self.value_model_optim = Adam(self.value.parameters(), lr=lr, weight_decay=value_regular)
59 | self.cost_value_model_optim = Adam(self.cost_value.parameters(), lr=lr, weight_decay=value_regular)
60 | if decay:
61 | self.policy_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.policy_model_optim, decay_rate,
62 | last_epoch=-1)
63 | self.value_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.value_model_optim, decay_rate,
64 | last_epoch=-1)
65 |             self.cost_value_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.cost_value_model_optim, decay_rate,
66 | last_epoch=-1)
67 |
68 | #torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2)
69 | #torch.nn.utils.clip_grad_norm_(self.value.parameters(), 1, norm_type=2)
70 |
71 | super(PPO_LAGRANGIAN_Agent, self).__init__(path)
72 | example_input = Variable(torch.rand((100,)+self.env.observation_space.shape))
73 | self.writer.add_graph(self.policy, input_to_model=example_input)
74 |
75 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss","cost_value"]
76 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss","cost_value"]
77 |
78 | self.training_round = 0
79 | self.running_step = 0
80 | self.record_sample = None
81 | self.training_step = 0
82 |
83 | self.ui = ui_model()
84 | self.ui_optim = Adam(self.ui.parameters(), lr=lr)
85 |
86 |
87 | def update(self, sample):
88 |
89 | returns, advants = get_gae(sample["r"], sample["tr"], sample["value"], self.gamma,
90 | self.lam)
91 | sample["advs"] = advants.unsqueeze(1)
92 | sample["return"] = returns.unsqueeze(1)
93 |
94 |         sample["cost"] = []
95 |         for info in sample["info"]:
96 |             sample["cost"].append(torch.tensor([info["cost"]], dtype=torch.float32))
97 | 
98 |         sample["cost_value"] = list(self.cost_value.forward(torch.stack(list(sample["s"]), 0)))
99 |
100 | returns, advants = get_gae(sample["cost"], sample["tr"], sample["cost_value"], self.gamma,
101 | self.lam)
102 | sample["cost_advs"] = advants.unsqueeze(1)
103 | sample["cost_return"] = returns.unsqueeze(1)
104 |
105 |
106 | step_len = len(sample["s"])
107 | if self.lstm_enable:
108 | flagin = [time for time in range(step_len) if sample["tr"][time]==1]
109 | time_round = len(flagin)
110 | array_index = []
111 | for train_time in range(int(time_round)-1):
112 | array_index.append(range(flagin[train_time], flagin[train_time+1]))
113 | else:
114 | time_round = np.ceil(step_len/self.batch_size)
115 | time_left = time_round*self.batch_size-step_len
116 | array = list(range(step_len)) +list(range(int(time_left)))
117 | array_index = []
118 | for train_time in range(int(time_round)):
119 | array_index.append(array[train_time * self.batch_size: (train_time + 1) * self.batch_size])
120 | loss_re, pgloss_re, enloss_re, vfloss_re = [], [], [], []
121 |
122 |         for key in sample.keys():
123 |             if key == "info":
124 |                 continue  # info is a list of dicts and cannot be stacked
125 |             temp = torch.stack(list(sample[key]), 0)
126 |             sample[key] = temp.cuda() if self.gpu else temp
127 | 
128 | for train_time in range(int(time_round)):
129 | index = array_index[train_time]
130 | # for index in range(step_len):
131 | training_s = sample["s"][index].detach()
132 | training_a = sample["a"][index].detach()
133 | training_r = sample["r"][index].detach()
134 | R = sample["return"][index].detach()
135 | old_value = sample["value"][index].detach()
136 | old_neglogp = sample["logp"][index].detach()
137 | advs = sample["advs"][index].detach()
138 | c_advs = sample["cost_advs"][index].detach()
139 | c_value = sample["cost_value"][index].detach()
140 | cost = sample["cost"][index].detach()
141 |
142 | " CALCULATE THE LOSS"
143 | " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss"
144 |
145 | " the value loss"
146 | value_now = self.value.forward(training_s)
147 | # value loss
148 |             value_clip = old_value + torch.clamp(value_now - old_value, min=-self.cliprange,
149 |                                                  max=self.cliprange) # Clipped value
150 | vf_loss1 = self.loss_cal(value_now, R) # Unclipped loss
151 | vf_loss2 = self.loss_cal(value_clip, R) # clipped loss
152 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
153 |
154 | # generate Policy gradient loss
155 | outcome = self.policy.forward(training_s)
156 | new_policy = self.dist(outcome)
157 | new_neg_lop = new_policy.log_prob(training_a)
158 | ratio = torch.exp(new_neg_lop - old_neglogp)
159 | pg_loss1 = -advs * ratio
160 | pg_loss2 = -advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
161 | pg_loss = .5 * torch.max(pg_loss1, pg_loss2).mean()
162 |
163 | # generate Policy gradient loss
164 | c_pg_loss1 = -c_advs * ratio
165 | c_pg_loss2 = -c_advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
166 | c_pg_loss = .5 * torch.max(c_pg_loss1, c_pg_loss2).mean()
167 |
168 |
169 |
170 | # entropy
171 | entropy = new_policy.entropy().mean()
172 |             loss = pg_loss - self.ui(c_pg_loss) - entropy * self.ent_coef + vf_loss * self.vf_coef
173 |
174 | self.policy_model_optim.zero_grad()
175 | pg_loss.backward()
176 | self.policy_model_optim.step()
177 | for _ in range(self.value_train_step):
178 | value_now = self.value.forward(training_s)
179 | # value loss
180 |                 value_clip = old_value + torch.clamp(value_now - old_value, min=-self.cliprange,
181 |                                                      max=self.cliprange) # Clipped value
182 | vf_loss1 = self.loss_cal(value_now, R) # Unclipped loss
183 | vf_loss2 = self.loss_cal(value_clip, R) # clipped loss
184 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
185 | self.value_model_optim.zero_grad()
186 | vf_loss1.backward()
187 | self.value_model_optim.step()
188 |
189 | cost_now = self.cost_value.forward(training_s)
190 | cost_vloss = self.loss_cal(cost_now, cost)
191 |
192 | self.cost_value_model_optim.zero_grad()
193 | cost_vloss.backward()
194 | self.cost_value_model_optim.step()
195 |
196 |
197 |             loss_re.append(loss.cpu().detach().numpy())
198 | pgloss_re.append(pg_loss.cpu().detach().numpy())
199 | enloss_re.append(entropy.cpu().detach().numpy())
200 | vfloss_re.append(vf_loss1.cpu().detach().numpy())
201 | "training the weights ui"
202 |         cost = self.ui(sample["cost_value"].detach()).mean()
203 |         self.ui_optim.zero_grad()
204 |         cost.backward()
205 |         self.ui_optim.step()
206 | 
207 |
208 |
209 |         return np.sum(loss_re), {"pg_loss": np.sum(pgloss_re), "entropy": np.sum(enloss_re),
210 |                                  "vf_loss": np.sum(vfloss_re),
211 |                                  "cost_value": cost.cpu().detach().numpy()}
212 |
213 | def load_weights(self, filepath):
214 | model = torch.load(filepath+"/PPO.pkl")
215 | self.policy.load_state_dict(model["policy"].state_dict())
216 | self.value.load_state_dict(model["value"].state_dict())
217 |
218 |
219 | def save_weights(self, filepath, overwrite=False):
220 | torch.save({"policy": self.policy,"value": self.value}, filepath + "/PPO.pkl")
221 |
222 |
223 | def cuda(self):
224 | self.policy.to_gpu()
225 | self.value.to_gpu()
226 | self.loss_cal = self.loss_cal.cuda()
227 | self.gpu = True
228 |
229 |
230 |
231 |
232 |
--------------------------------------------------------------------------------
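
The ui parameter in PPO_Lagrangian plays the role of a Lagrange multiplier on the expected cost. For reference, a common way to update such a multiplier is dual ascent on the constraint violation, clamped at zero. The sketch below is illustrative only; cost_limit, the learning rate, and the class name are assumptions and are not taken from this repository.

import torch

class LagrangeMultiplier(torch.nn.Module):
    def __init__(self, init=1.0, lr=1e-3, cost_limit=0.0):
        super(LagrangeMultiplier, self).__init__()
        self.lam = torch.nn.Parameter(torch.tensor(float(init)))
        self.optim = torch.optim.Adam([self.lam], lr=lr)
        self.cost_limit = cost_limit

    def update(self, mean_episode_cost):
        # gradient step on -lam * (J_cost - d): lam grows while the constraint is violated
        loss = -self.lam * (float(mean_episode_cost) - self.cost_limit)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        with torch.no_grad():
            self.lam.clamp_(min=0.0)
        return float(self.lam)
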
/Torch_rl/algorithm/SPPO.py:
--------------------------------------------------------------------------------
1 |
2 | from Torch_rl.agent.core_policy import Agent_policy_based
3 | import torch.nn as nn
4 | from copy import deepcopy
5 | from Torch_rl.common.distribution import *
6 | from torch.optim import Adam
7 | from torch.autograd import Variable
8 | from Torch_rl.common.memory import ReplayMemory
9 |
10 |
11 | class PPO_Agent(Agent_policy_based):
12 | def __init__(self, env, policy_model, value_model,
13 | lr=5e-4, ent_coef=0.01, vf_coef=0.5,
14 |                  ## hyper-parameter
15 | gamma=0.99, lam=0.95, cliprange=0.2, batch_size=64, value_train_round=200,
16 | running_step=2048, running_ep=20, value_regular=0.01, buffer_size=50000,
17 | ## decay
18 | decay=False, decay_rate=0.9, lstm_enable=False,
19 | ##
20 | path=None):
21 | self.gpu = False
22 | self.env = env
23 | self.gamma = gamma
24 | self.lam = lam
25 | self.ent_coef = ent_coef
26 | self.vf_coef = vf_coef
27 | self.cliprange = cliprange
28 |
29 | self.value_train_step = value_train_round
30 |
31 | self.sample_rollout = running_step
32 | self.sample_ep = running_ep
33 | self.batch_size = batch_size
34 | self.lstm_enable = lstm_enable
35 | self.replay_buffer = ReplayMemory(buffer_size, other_record=["value", "return"])
36 |
37 | self.loss_cal = torch.nn.SmoothL1Loss()
38 |
39 | self.policy = policy_model
40 | if value_model == "shared":
41 | self.value = policy_model
42 | elif value_model == "copy":
43 | self.value = deepcopy(policy_model)
44 | else:
45 | self.value = value_model
46 |
47 | self.dist = make_pdtype(env.action_space, policy_model)
48 |
49 | self.policy_model_optim = Adam(self.policy.parameters(), lr=lr)
50 | self.value_model_optim = Adam(self.value.parameters(), lr=lr, weight_decay=value_regular)
51 | if decay:
52 | self.policy_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.policy_model_optim, decay_rate,
53 | last_epoch=-1)
54 | self.value_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.value_model_optim, decay_rate,
55 | last_epoch=-1)
56 |
57 | #torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2)
58 | #torch.nn.utils.clip_grad_norm_(self.value.parameters(), 1, norm_type=2)
59 |
60 | super(PPO_Agent, self).__init__(path)
61 | #example_input = Variable(torch.rand((100,)+self.env.observation_space.shape))
62 | #self.writer.add_graph(self.policy, input_to_model=example_input)
63 |
64 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
65 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]
66 |
67 | self.training_round = 0
68 | self.running_step = 0
69 | self.record_sample = None
70 | self.training_step = 0
71 |
72 |
73 | def update(self, sample):
74 | step_len = len(sample["s"])
75 | for ki in range(step_len):
76 | sample_ = {
77 | "s": sample["s"][ki].cpu().numpy(),
78 | "a": sample["a"][ki].cpu().numpy(),
79 | "r": sample["r"][ki].cpu().numpy(),
80 | "tr": sample["tr"][ki].cpu().numpy(),
81 | "s_": sample["s_"][ki].cpu().numpy(),
82 | "value": sample["value"][ki].cpu().numpy(),
83 | "return": sample["return"][ki].cpu().numpy()
84 | }
85 | self.replay_buffer.push(sample_)
86 | '''
87 | train the value part
88 | '''
89 | vfloss_re = []
90 | for _ in range(self.value_train_step):
91 |             train_value_sample = self.replay_buffer.sample(self.batch_size)
92 |             for key in train_value_sample.keys():
93 |                 if self.gpu:
94 |                     train_value_sample[key] = train_value_sample[key].cuda()
95 |                 else:
96 |                     train_value_sample[key] = train_value_sample[key]
97 |             old_value = train_value_sample["value"]
98 |             training_s = train_value_sample["s"]
99 |             R = train_value_sample["return"]
100 | value_now = self.value.forward(training_s).squeeze()
101 | # value loss
102 |             value_clip = old_value + torch.clamp(value_now - old_value, min=-self.cliprange,
103 | max=self.cliprange) # Clipped value
104 | vf_loss1 = self.loss_cal(value_now, R) # Unclipped loss
105 | vf_loss2 = self.loss_cal(value_clip, R) # clipped loss
106 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
107 | self.value_model_optim.zero_grad()
108 |             vf_loss.backward()
109 | self.value_model_optim.step()
110 | vfloss_re.append(vf_loss1.cpu().detach().numpy())
111 |
112 | '''
113 | train the policy part
114 | '''
115 |
116 | for key in sample.keys():
117 | temp = torch.stack(list(sample[key]), 0).squeeze()
118 | if self.gpu:
119 | sample[key] = temp.cuda()
120 | else:
121 | sample[key] = temp
122 |
123 |         # chunk the rollout indices into batch_size pieces, wrapping around to pad the last one
124 | time_round = np.ceil(step_len / self.batch_size)
125 | time_left = time_round * self.batch_size - step_len
126 | array = list(range(step_len)) + list(range(int(time_left)))
127 | array_index = []
128 | for train_time in range(int(time_round)):
129 | array_index.append(array[train_time * self.batch_size: (train_time + 1) * self.batch_size])
130 |
131 | loss_re, pgloss_re, enloss_re = [], [], []
132 | for train_time in range(int(time_round)):
133 | index = array_index[train_time]
134 | # for index in range(step_len):
135 | training_s = sample["s"][index].detach()
136 | training_a = sample["a"][index].detach()
137 | old_neglogp = sample["logp"][index].detach()
138 | advs = sample["advs"][index].detach()
139 |
140 | " CALCULATE THE LOSS"
141 | " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss"
142 |
143 | #generate Policy gradient loss
144 | outcome = self.policy.forward(training_s).squeeze()
145 | # new_neg_lop = torch.empty(size=(self.batch_size,))
146 | # for time in range(self.batch_size):
147 | # new_policy = self.dist(outcome[time])
148 | # new_neg_lop[time] = new_policy.log_prob(training_a[time])
149 | new_policy = self.dist(outcome)
150 | new_neg_lop = new_policy.log_prob(training_a)
151 | ratio = torch.exp(new_neg_lop - old_neglogp)
152 | pg_loss1 = -advs * ratio
153 | pg_loss2 = -advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
154 | pg_loss = .5 * torch.max(pg_loss1, pg_loss2).mean()
155 |
156 | # entropy
157 | entropy = new_policy.entropy().mean()
158 | # loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef
159 | loss = pg_loss - entropy * self.ent_coef
160 | self.policy_model_optim.zero_grad()
161 | loss.backward()
162 | self.policy_model_optim.step()
163 | # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"])
164 | # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange)
165 |             loss_re.append(loss.cpu().detach().numpy())
166 | pgloss_re.append(pg_loss.cpu().detach().numpy())
167 | enloss_re.append(entropy.cpu().detach().numpy())
168 |
169 | return np.sum(loss_re), {"pg_loss": np.sum(pgloss_re),
170 | "entropy": np.sum(enloss_re),
171 | "vf_loss": np.sum(vfloss_re)}
172 |
173 |
174 | def load_weights(self, filepath):
175 | model = torch.load(filepath+"/PPO.pkl")
176 | self.policy.load_state_dict(model["policy"].state_dict())
177 | self.value.load_state_dict(model["value"].state_dict())
178 |
179 |
180 | def save_weights(self, filepath, overwrite=False):
181 | torch.save({"policy": self.policy,"value": self.value}, filepath + "/PPO.pkl")
182 |
183 | def policy_behavior_clone(self, sample_):
184 | action_label = sample_["a"].squeeze()
185 | if self.gpu:
186 | action_predict = self.policy(sample_["s"].cuda())
187 | action_label = action_label.cuda()
188 | else:
189 | action_predict = self.policy(sample_["s"])
190 | loss_bc = self.loss_cal(action_label, action_predict)
191 | del action_label
192 | del action_predict
193 | loss = loss_bc
194 | self.policy_model_optim.zero_grad()
195 | loss.backward()
196 | nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2)
197 | self.policy_model_optim.step()
198 | return loss.cpu().detach().numpy()
199 |
200 | def value_pretrain(self, record_sample, new_sample_len):
201 | train_times = int(np.floor(new_sample_len/128))
202 | round_loss = 0
203 | for io in range(train_times-1):
204 | index = list(range(128 * io, 128 * (io + 1)))
205 |             if self.gpu:
206 |                 predict = torch.from_numpy(np.array(record_sample["s"])[index]).cuda()
207 |                 label = torch.from_numpy(np.array(record_sample["return"]))[index].cuda()
208 |             else:
209 |                 predict = torch.from_numpy(np.array(record_sample["s"])[index])
210 |                 label = torch.from_numpy(np.array(record_sample["return"]))[index]
211 |             value_now = self.value.forward(predict)
212 |             # value loss
213 |             vf_loss = self.loss_cal(value_now, label)  # Unclipped loss
214 |             del predict
215 |             del label
216 | self.value_model_optim.zero_grad()
217 | vf_loss.backward()
218 | self.value_model_optim.step()
219 | round_loss += vf_loss.cpu().detach().numpy()
220 | return round_loss
221 |
222 | def cuda(self, device=None):
223 | self.policy.to_gpu(device)
224 | self.value.to_gpu(device)
225 | self.loss_cal = self.loss_cal.cuda(device)
226 | self.gpu = True
--------------------------------------------------------------------------------
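
SPPO.update above splits each rollout into fixed-size mini-batches and pads the last chunk by wrapping back to the first indices. A stand-alone illustration of that index bookkeeping (the helper name chunk_indices is made up for this example):

import numpy as np

def chunk_indices(step_len, batch_size):
    # same arithmetic as SPPO.update: round up to full batches, wrap around to fill the remainder
    time_round = int(np.ceil(step_len / batch_size))
    time_left = time_round * batch_size - step_len
    array = list(range(step_len)) + list(range(time_left))
    return [array[i * batch_size:(i + 1) * batch_size] for i in range(time_round)]

print(chunk_indices(step_len=10, batch_size=4))
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 0, 1]]
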
/Torch_rl/algorithm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/algorithm/__init__.py
--------------------------------------------------------------------------------
/Torch_rl/common/Policy_for_DQN.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def get_object_config(o):
4 | if o is None:
5 | return None
6 |
7 | config = {
8 | 'class_name': o.__class__.__name__,
9 | 'config': o.get_config()
10 | }
11 | return config
12 |
13 |
14 | class Policy(object):
15 | """Abstract base class for all implemented policies.
16 |
17 | Each policy helps with selection of action to take on an environment.
18 |
19 | Do not use this abstract base class directly but instead use one of the concrete policies implemented.
20 | To implement your own policy, you have to implement the following methods:
21 |
22 | - `select_action`
23 |
24 | # Arguments
25 | agent (rl.core.Agent): Agent used
26 | """
27 | def _set_agent(self, agent):
28 | self.agent = agent
29 |
30 | @property
31 | def metrics_names(self):
32 | return []
33 |
34 | @property
35 | def metrics(self):
36 | return []
37 |
38 | def select_action(self, **kwargs):
39 | raise NotImplementedError()
40 |
41 | def get_config(self):
42 | """Return configuration of the policy
43 |
44 | # Returns
45 | Configuration as dict
46 | """
47 | return {}
48 |
49 |
50 | class LinearAnnealedPolicy(Policy):
51 | """Implement the linear annealing policy
52 |
53 | Linear Annealing Policy computes a current threshold value and
54 | transfers it to an inner policy which chooses the action. The threshold
55 |     value follows a linear function that decreases over time."""
56 | def __init__(self, inner_policy, attr, value_max, value_min, value_test, nb_steps):
57 | if not hasattr(inner_policy, attr):
58 | raise ValueError('Policy does not have attribute "{}".'.format(attr))
59 |
60 | super(LinearAnnealedPolicy, self).__init__()
61 |
62 | self.inner_policy = inner_policy
63 | self.attr = attr
64 | self.value_max = value_max
65 | self.value_min = value_min
66 | self.value_test = value_test
67 | self.nb_steps = nb_steps
68 |
69 | def get_current_value(self):
70 | """Return current annealing value
71 |
72 | # Returns
73 | Value to use in annealing
74 | """
75 | if self.agent.training:
76 | # Linear annealed: f(x) = ax + b.
77 | a = -float(self.value_max - self.value_min) / float(self.nb_steps)
78 | b = float(self.value_max)
79 | value = max(self.value_min, a * float(self.agent.step) + b)
80 | else:
81 | value = self.value_test
82 | return value
83 |
84 | def select_action(self, **kwargs):
85 | """Choose an action to perform
86 |
87 | # Returns
88 | Action to take (int)
89 | """
90 | setattr(self.inner_policy, self.attr, self.get_current_value())
91 | return self.inner_policy.select_action(**kwargs)
92 |
93 | @property
94 | def metrics_names(self):
95 | """Return names of metrics
96 |
97 | # Returns
98 | List of metric names
99 | """
100 | return ['mean_{}'.format(self.attr)]
101 |
102 | @property
103 | def metrics(self):
104 | """Return metrics values
105 |
106 | # Returns
107 | List of metric values
108 | """
109 |
110 | return [getattr(self.inner_policy, self.attr)]
111 |
112 | def get_config(self):
113 | """Return configurations of LinearAnnealedPolicy
114 |
115 | # Returns
116 | Dict of config
117 | """
118 | config = super(LinearAnnealedPolicy, self).get_config()
119 | config['attr'] = self.attr
120 | config['value_max'] = self.value_max
121 | config['value_min'] = self.value_min
122 | config['value_test'] = self.value_test
123 | config['nb_steps'] = self.nb_steps
124 | config['inner_policy'] = get_object_config(self.inner_policy)
125 | return config
126 |
127 | class SoftmaxPolicy(Policy):
128 |     """ Implement softmax policy for multinomial distribution
129 |
130 | Simple Policy
131 |
132 |     - takes an action according to the probability distribution
133 |
134 | """
135 | def select_action(self, nb_actions, probs):
136 | """Return the selected action
137 |
138 | # Arguments
139 |             probs (np.ndarray) : Probability for each action
140 |
141 | # Returns
142 | action
143 |
144 | """
145 | action = np.random.choice(range(nb_actions), p=probs)
146 | return action
147 |
148 | class EpsGreedyQPolicy(Policy):
149 | """Implement the epsilon greedy policy
150 |
151 | Eps Greedy policy either:
152 |
153 | - takes a random action with probability epsilon
154 |     - takes the current best action with probability (1 - epsilon)
155 | """
156 | def __init__(self, eps=.1):
157 | super(EpsGreedyQPolicy, self).__init__()
158 | self.eps = eps
159 |
160 | def select_action(self, q_values):
161 | """Return the selected action
162 |
163 | # Arguments
164 | q_values (np.ndarray): List of the estimations of Q for each action
165 |
166 | # Returns
167 |             Selected action
168 | """
169 | assert q_values.ndim == 1
170 | nb_actions = q_values.shape[0]
171 |
172 | if np.random.uniform() < self.eps:
173 | action = np.random.randint(0, nb_actions)
174 | else:
175 | action = np.argmax(q_values)
176 | return action
177 |
178 | def get_config(self):
179 | """Return configurations of EpsGreedyQPolicy
180 |
181 | # Returns
182 | Dict of config
183 | """
184 | config = super(EpsGreedyQPolicy, self).get_config()
185 | config['eps'] = self.eps
186 | return config
187 |
188 |
189 | class GreedyQPolicy(Policy):
190 | """Implement the greedy policy
191 |
192 | Greedy policy returns the current best action according to q_values
193 | """
194 | def select_action(self, q_values):
195 | """Return the selected action
196 |
197 | # Arguments
198 | q_values (np.ndarray): List of the estimations of Q for each action
199 |
200 | # Returns
201 |             Selected action
202 | """
203 | assert q_values.ndim == 1
204 | action = np.argmax(q_values)
205 | return action
206 |
207 |
208 |
209 | class BoltzmannQPolicy(Policy):
210 | """Implement the Boltzmann Q Policy
211 |
212 | Boltzmann Q Policy builds a probability law on q values and returns
213 | an action selected randomly according to this law.
214 | """
215 | def __init__(self, tau=1., clip=(-500., 500.)):
216 | super(BoltzmannQPolicy, self).__init__()
217 | self.tau = tau
218 | self.clip = clip
219 |
220 | def select_action(self, q_values):
221 | """Return the selected action
222 |
223 | # Arguments
224 | q_values (np.ndarray): List of the estimations of Q for each action
225 |
226 | # Returns
227 |             Selected action
228 | """
229 | assert q_values.ndim == 1
230 | q_values = q_values.astype('float64')
231 | nb_actions = q_values.shape[0]
232 |
233 | exp_values = np.exp(np.clip(q_values / self.tau, self.clip[0], self.clip[1]))
234 | probs = exp_values / np.sum(exp_values)
235 | action = np.random.choice(range(nb_actions), p=probs)
236 | return action
237 |
238 | def get_config(self):
239 | """Return configurations of BoltzmannQPolicy
240 |
241 | # Returns
242 | Dict of config
243 | """
244 | config = super(BoltzmannQPolicy, self).get_config()
245 | config['tau'] = self.tau
246 | config['clip'] = self.clip
247 | return config
248 |
249 |
250 | class MaxBoltzmannQPolicy(Policy):
251 | """
252 |     A combination of the eps-greedy and Boltzmann Q-policies.
253 |
254 | Wiering, M.: Explorations in Efficient Reinforcement Learning.
255 | PhD thesis, University of Amsterdam, Amsterdam (1999)
256 |
257 | https://pure.uva.nl/ws/files/3153478/8461_UBA003000033.pdf
258 | """
259 | def __init__(self, eps=.1, tau=1., clip=(-500., 500.)):
260 | super(MaxBoltzmannQPolicy, self).__init__()
261 | self.eps = eps
262 | self.tau = tau
263 | self.clip = clip
264 |
265 | def select_action(self, q_values):
266 | """Return the selected action
267 | The selected action follows the BoltzmannQPolicy with probability epsilon
268 | or return the Greedy Policy with probability (1 - epsilon)
269 |
270 | # Arguments
271 | q_values (np.ndarray): List of the estimations of Q for each action
272 |
273 | # Returns
274 |             Selected action
275 | """
276 | assert q_values.ndim == 1
277 | q_values = q_values.astype('float64')
278 | nb_actions = q_values.shape[0]
279 |
280 | if np.random.uniform() < self.eps:
281 | exp_values = np.exp(np.clip(q_values / self.tau, self.clip[0], self.clip[1]))
282 | probs = exp_values / np.sum(exp_values)
283 | action = np.random.choice(range(nb_actions), p=probs)
284 | else:
285 | action = np.argmax(q_values)
286 | return action
287 |
288 | def get_config(self):
289 | """Return configurations of MaxBoltzmannQPolicy
290 |
291 | # Returns
292 | Dict of config
293 | """
294 | config = super(MaxBoltzmannQPolicy, self).get_config()
295 | config['eps'] = self.eps
296 | config['tau'] = self.tau
297 | config['clip'] = self.clip
298 | return config
299 |
300 |
301 | class BoltzmannGumbelQPolicy(Policy):
302 | """Implements Boltzmann-Gumbel exploration (BGE) adapted for Q learning
303 | based on the paper Boltzmann Exploration Done Right
304 | (https://arxiv.org/pdf/1705.10257.pdf).
305 |
306 | BGE is invariant with respect to the mean of the rewards but not their
307 | variance. The parameter C, which defaults to 1, can be used to correct for
308 | this, and should be set to the least upper bound on the standard deviation
309 | of the rewards.
310 |
311 | BGE is only available for training, not testing. For testing purposes, you
312 | can achieve approximately the same result as BGE after training for N steps
313 | on K actions with parameter C by using the BoltzmannQPolicy and setting
314 | tau = C/sqrt(N/K)."""
315 |
316 | def __init__(self, C=1.0):
317 | assert C > 0, "BoltzmannGumbelQPolicy C parameter must be > 0, not " + repr(C)
318 | super(BoltzmannGumbelQPolicy, self).__init__()
319 | self.C = C
320 | self.action_counts = None
321 |
322 | def select_action(self, q_values):
323 | """Return the selected action
324 |
325 | # Arguments
326 | q_values (np.ndarray): List of the estimations of Q for each action
327 |
328 | # Returns
329 |             Selected action
330 | """
331 | # We can't use BGE during testing, since we don't have access to the
332 | # action_counts at the end of training.
333 | assert self.agent.training, "BoltzmannGumbelQPolicy should only be used for training, not testing"
334 |
335 | assert q_values.ndim == 1, q_values.ndim
336 | q_values = q_values.astype('float64')
337 |
338 | # If we are starting training, we should reset the action_counts.
339 | # Otherwise, action_counts should already be initialized, since we
340 | # always do so when we begin training.
341 | if self.agent.step == 0:
342 | self.action_counts = np.ones(q_values.shape)
343 | assert self.action_counts is not None, self.agent.step
344 | assert self.action_counts.shape == q_values.shape, (self.action_counts.shape, q_values.shape)
345 |
346 | beta = self.C/np.sqrt(self.action_counts)
347 | Z = np.random.gumbel(size=q_values.shape)
348 |
349 | perturbation = beta * Z
350 | perturbed_q_values = q_values + perturbation
351 | action = np.argmax(perturbed_q_values)
352 |
353 | self.action_counts[action] += 1
354 | return action
355 |
356 | def get_config(self):
357 | """Return configurations of BoltzmannGumbelQPolicy
358 |
359 | # Returns
360 | Dict of config
361 | """
362 | config = super(BoltzmannGumbelQPolicy, self).get_config()
363 | config['C'] = self.C
364 | return config
365 |
--------------------------------------------------------------------------------
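
The policies in Policy_for_DQN.py only need a 1-D array of Q-values (LinearAnnealedPolicy and BoltzmannGumbelQPolicy additionally expect an attached agent so they can read agent.step and agent.training). A quick usage sketch, assuming the package is importable as Torch_rl:

import numpy as np
from Torch_rl.common.Policy_for_DQN import EpsGreedyQPolicy, BoltzmannQPolicy

q_values = np.array([0.1, 0.5, 0.2])
print(EpsGreedyQPolicy(eps=0.1).select_action(q_values))   # argmax with probability 0.9
print(BoltzmannQPolicy(tau=1.0).select_action(q_values))   # sampled from softmax(Q / tau)
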
/Torch_rl/common/distribution.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | def make_pdtype(ac_space,actor):
5 | from gym import spaces
6 | if isinstance(ac_space, spaces.Box):
7 | shape = ac_space.shape[0]
8 | layer_infor = []
9 | for name, param in actor.named_parameters():
10 | if "weight" in name:
11 | layer_infor.append(list(param.size()))
12 | output_layer = layer_infor[-1][0]
13 | return DiagGaussianPd_type(shape, output_layer)
14 | elif isinstance(ac_space, spaces.Discrete):
15 | return CategoricalPd(ac_space.n, actor)
16 | elif isinstance(ac_space, spaces.MultiDiscrete):
17 | return MultiCategoricalPd(ac_space.nvec)
18 | elif isinstance(ac_space, spaces.MultiBinary):
19 | return BernoulliPd(ac_space.n, actor)
20 | else:
21 | raise NotImplementedError
22 |
23 |
24 | class DiagGaussianPd_type():
25 | def __init__(self, shape, output_layer):
26 | self.shape = shape
27 | self.output_layer = output_layer
28 |
29 | def __call__(self, output, *args, **kwargs):
30 | if self.output_layer == self.shape:
31 | self.mean = output
32 | self.logstd = torch.ones_like(output)
33 | elif self.output_layer == self.shape*2:
34 | self.mean = torch.index_select(output.cpu(), -1, torch.arange(0, self.shape))
35 | self.logstd = torch.index_select(output.cpu(), -1, torch.arange(self.shape, self.shape*2))
36 | self.std = torch.exp(self.logstd)
37 | return DiagGaussianPd(self.mean, self.std)
38 |
39 |
40 |
41 | class Pd(object):
42 | """
43 | A particular probability distribution
44 | """
45 | def log_prob(self, x):
46 | return torch.sum(self.pd.log_prob(x), dim=-1, keepdim=True)
47 |
48 | def sample(self):
49 | return self.pd.sample()
50 |
51 | def neglogp(self, x):
52 | return -self.pd.log_prob(x)
53 |
54 | def kl(self, other):
55 |         return torch.distributions.kl.kl_divergence(self.pd, other.pd)
56 |
57 | def entropy(self):
58 | return self.pd.entropy()
59 |
60 |
61 |
62 | class DiagGaussianPd(Pd):
63 | def __init__(self, mean, std):
64 | from torch.distributions import Normal
65 | self.pd = Normal(mean, std)
66 |
67 |
68 | class CategoricalPd(Pd):
69 |
70 |
71 | def neglogp(self, x):
72 | # Usually it's easier to define the negative logprob
73 | raise NotImplementedError
74 |
75 | def kl(self, other):
76 | raise NotImplementedError
77 |
78 | def entropy(self):
79 | raise NotImplementedError
80 |
81 | def sample(self):
82 | raise NotImplementedError
83 | class MultiCategoricalPd(Pd):
84 | def flatparam(self):
85 | raise NotImplementedError
86 |
87 | def mode(self):
88 | raise NotImplementedError
89 |
90 | def neglogp(self, x):
91 | # Usually it's easier to define the negative logprob
92 | raise NotImplementedError
93 |
94 | def kl(self, other):
95 | raise NotImplementedError
96 |
97 | def entropy(self):
98 | raise NotImplementedError
99 |
100 | def sample(self):
101 | raise NotImplementedError
102 |
103 | class BernoulliPd(Pd):
104 | def flatparam(self):
105 | raise NotImplementedError
106 |
107 | def mode(self):
108 | raise NotImplementedError
109 |
110 | def neglogp(self, x):
111 | # Usually it's easier to define the negative logprob
112 | raise NotImplementedError
113 |
114 | def kl(self, other):
115 | raise NotImplementedError
116 |
117 | def entropy(self):
118 | raise NotImplementedError
119 |
120 | def sample(self):
121 | raise NotImplementedError
--------------------------------------------------------------------------------
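
For a Box action space, make_pdtype above inspects the actor's last weight matrix: if its output width equals the action dimension the network output is taken as the mean (with the log-std fixed to ones), and if it is twice the action dimension the output is split into mean and log-std. A minimal sketch with a hypothetical two-layer actor:

import torch
import torch.nn as nn
from gym import spaces
from Torch_rl.common.distribution import make_pdtype

action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,))
actor = nn.Sequential(nn.Linear(3, 16), nn.Tanh(), nn.Linear(16, 2))  # hypothetical actor

dist_type = make_pdtype(action_space, actor)   # -> DiagGaussianPd_type
pd = dist_type(actor(torch.randn(5, 3)))       # wraps torch.distributions.Normal
actions = pd.sample()                          # shape (5, 2)
print(pd.log_prob(actions).shape)              # torch.Size([5, 1]); summed over action dims
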
/Torch_rl/common/loss.py:
--------------------------------------------------------------------------------
1 |
2 | import torch
3 |
4 |
5 | def huber_loss(x, delta=1.0):
6 | """Reference: https://en.wikipedia.org/wiki/Huber_loss"""
7 | return torch.where(
8 | torch.abs(x) < delta,
9 | torch.pow(x, 2) * 0.5,
10 | delta * (torch.abs(x) - 0.5 * delta)
11 | )
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
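
A worked example of huber_loss: the loss is quadratic for |x| < delta and linear beyond it, so with delta = 1 an error of 0.5 gives 0.125 and an error of 2.0 gives 1.5:

import torch
from Torch_rl.common.loss import huber_loss

x = torch.tensor([0.5, 2.0])
print(huber_loss(x, delta=1.0))   # tensor([0.1250, 1.5000])
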
/Torch_rl/common/memory.py:
--------------------------------------------------------------------------------
1 |
2 | import torch
3 | import random
4 | import numpy as np
5 | from abc import ABC
6 | from copy import deepcopy
7 | from collections import deque
8 |
9 | class Memory(ABC):
10 |
11 | def __init__(self, capacity, other_record=None):
12 | self.capacity = capacity
13 | self.memory = {"s": deque(maxlen=capacity),
14 | "a": deque(maxlen=capacity),
15 | "s_": deque(maxlen=capacity),
16 | "r": deque(maxlen=capacity),
17 | "tr": deque(maxlen=capacity)}
18 | if other_record is not None:
19 | for key in other_record:
20 |                 self.memory[key] = deque(maxlen=capacity)
21 | self.position = 0
22 |
23 | def push(self, sample):
24 | raise NotImplementedError()
25 |
26 | def sample(self, batch_size):
27 | raise NotImplementedError()
28 |
29 | class ReplayMemory(Memory):
30 | def __init__(self, capacity, other_record=None):
31 | super(ReplayMemory, self).__init__(capacity, other_record=other_record)
32 |
33 | def push(self, sample):
34 | """Saves a transition."""
35 | for key in sample.keys():
36 | self.memory[key].append(sample[key])
37 | self.position = (self.position + 1) % self.capacity
38 | def sample(self, batch_size):
39 | sample_index = random.sample(range(len(self.memory["s"])), batch_size)
40 | sample = {}
41 | for key in self.memory.keys():
42 | sample[key] = []
43 | for key in self.memory.keys():
44 | for index in sample_index:
45 | sample[key].append(self.memory[key][index])
46 | sample[key] = np.array(sample[key],dtype=np.float32)
47 | sample[key] = torch.from_numpy(sample[key])
48 | return sample
49 |
50 | def recent_step_sample(self, batch_size):
51 | sample = {}
52 | for key in self.memory.keys():
53 | sample[key] = []
54 | for key in self.memory.keys():
55 | sample[key] = self.memory[key][-batch_size:]
56 | sample[key] = np.array(sample[key], dtype=np.float32)
57 | sample[key] = torch.from_numpy(sample[key])
58 | return sample
59 |
60 | def sample_episode(self):
61 | flag = []
62 | for f,i in enumerate(self.memory["tr"]):
63 | if i:
64 | flag.append(f)
65 | end_place = 0
66 | while end_place == 0:
67 | end_place = random.choice(list(range(len(flag))))
68 | sample = {}
69 | for key in self.memory.keys():
70 | sample[key] = []
71 | for key in self.memory.keys():
72 | sample[key] = self.memory[key][flag[end_place-1]:flag[end_place]]
73 | sample[key] = np.array(sample[key], dtype=np.float32)
74 | sample[key] = torch.from_numpy(sample[key])
75 | return sample
76 |
77 | def sample_fragment(self,length):
78 | end_place = 0
79 |         while end_place < length:
80 | end_place = random.choice(list(range(self.position)))
81 | sample = {}
82 | for key in self.memory.keys():
83 | sample[key] = []
84 | for key in self.memory.keys():
85 | sample[key] = self.memory[key][end_place-length:end_place]
86 | sample[key] = np.array(sample[key], dtype=np.float32)
87 | sample[key] = torch.from_numpy(sample[key])
88 | return sample
89 |
90 |
91 |
92 | def __len__(self):
93 |         return len(self.memory["s"])
94 |
95 |
96 | class ReplayMemory_HIRO(Memory):
97 | def __init__(self, capacity, other_record=None):
98 |         super(ReplayMemory_HIRO, self).__init__(capacity, other_record)
99 | self.memory = {"s": [],"g":[], "a": [], "s_": [], "r": [], "tr": []}
100 | def push(self, sample):
101 | """Saves a transition."""
102 | for key in self.memory.keys():
103 | self.memory[key].append(sample[key])
104 | if len(self.memory["s"]) > self.capacity:
105 | for key in self.memory.keys():
106 | del self.memory[key][0]
107 | self.position = (self.position + 1) % self.capacity
108 |
109 | def sample(self, batch_size):
110 | sample_index = random.sample(range(len(self.memory["s"])), batch_size)
111 | sample = {"s": [], "a": [], "s_": [], "r": [], "tr": []}
112 | for key in sample.keys():
113 | for index in sample_index:
114 | if key == "s":
115 | temp = np.array(self.memory["s"][index]+self.memory["g"][index], dtype=np.float32)
116 | sample[key].append(torch.from_numpy(temp))
117 | else:
118 | temp = np.array(self.memory[key][index], dtype=np.float32)
119 | sample[key].append(torch.from_numpy(temp))
120 | return sample
121 | def H_sample(self, batch_size):
122 | sample_index = random.sample(range(len(self.memory["s"])), batch_size)
123 | sample = {"s": [], "g": [], "s_": [], "r": [], "tr": []}
124 | for key in sample.keys():
125 | for index in sample_index:
126 | temp = np.array(self.memory[key][index], dtype=np.float32)
127 | sample[key].append(torch.from_numpy(temp))
128 | return sample
129 |
130 |
131 | class ReplayMemory_Sequence():
132 | def __init__(self, capacity, max_seq_len, other_record=None):
133 | self.capacity = capacity
134 | Sequence = {"s": [],
135 | "a": [],
136 | "s_": [],
137 | "r": [],
138 | "tr": []}
139 | if other_record is not None:
140 | for key in other_record:
141 | Sequence[key] = []
142 | self.Sequence = Sequence
143 | self.memory = [deepcopy(Sequence)]
144 | self.position = 0
145 | self.max_position = 0
146 | self.max_seq_len = max_seq_len
147 | self.batch = 32
148 | self.sequence_len = 100
149 |
150 | def push(self, sample):
151 | """Saves a transition."""
152 | for key in self.memory[self.position].keys():
153 |             if isinstance(sample[key], np.ndarray):
154 | self.memory[self.position][key].append(sample[key])
155 | else:
156 | self.memory[self.position][key].append(np.array(sample[key]))
157 | if sample["tr"] == 1:
158 | self.position = (self.position + 1) % self.capacity
159 | if self.max_position <= self.capacity:
160 | self.memory.append(deepcopy(self.Sequence))
161 | self.max_position += 1
162 |
163 | def sample_ep(self, batch_size=None):
164 | if batch_size is not None:
165 | self.batch_size = batch_size
166 | sample_index = random.sample(range(self.max_position), self.batch_size)
167 | sample = {}
168 | for key in self.Sequence.keys():
169 | sample[key] = torch.empty((self.max_seq_len, self.batch_size, self.memory[0][key][0].shape[0]), dtype=torch.float32)
170 |         for flag, index in enumerate(sample_index):
171 |             ep_len = len(self.memory[index]['s'])
172 |             if ep_len > self.max_seq_len:
173 |                 print("the episode is longer than max_seq_len and will be truncated")
174 |             for key in self.Sequence.keys():
175 |                 for time_step in range(min(ep_len, self.max_seq_len)):
176 |                     sample[key][time_step, flag, :] = \
177 |                         torch.from_numpy(self.memory[index][key][time_step])
178 |                 # zero-pad the tail of episodes shorter than max_seq_len
179 |                 for time_step in range(ep_len, self.max_seq_len):
180 |                     sample[key][time_step, flag, :] = \
181 |                         torch.zeros(self.memory[index][key][0].shape[0])
182 |
183 | return sample
184 |
185 | def sample_sequence(self, batch_size=None, sequence_len=None):
186 | if batch_size is not None:
187 | self.batch_size = batch_size
188 | if sequence_len is not None:
189 | self.sequence_len = sequence_len
190 | sample = {}
191 | for key in self.Sequence.keys():
192 | temp_len = self.memory[0][key][0].size
193 | sample[key] = torch.empty((self.sequence_len, self.batch_size, temp_len), dtype=torch.float32)
194 | for loop in range(self.batch_size):
195 | index = random.sample(range(self.max_position), 1)[0]
196 | ep_len = len(self.memory[index]['s'])
197 | if ep_len <= self.sequence_len:
198 | for time_step in range(ep_len):
199 | sample[key][time_step, loop, :] = \
200 | torch.from_numpy(self.memory[index][key][time_step])
201 | for time_step in range(ep_len, self.sequence_len):
202 | sample[key][time_step, loop, :] = torch.zeros(temp_len)
203 | else:
204 | start_ = random.sample(range(0, ep_len - self.sequence_len), 1)[0]
205 | end_ = start_ + self.sequence_len
206 | for (time_step, time) in enumerate(range(start_, end_)):
207 | sample[key][time_step, loop, :] = \
208 |                         torch.from_numpy(self.memory[index][key][time])
209 | return sample
210 |
211 | def recent_ep_sample(self):
212 | return self.memory[self.position]
213 |
214 | def __len__(self):
215 | return len(self.memory)
--------------------------------------------------------------------------------
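
Typical use of ReplayMemory is one push per environment step followed by sample, which stacks every recorded field into a float32 tensor. A minimal sketch with random transitions (the observation and action shapes are arbitrary):

import numpy as np
from Torch_rl.common.memory import ReplayMemory

buffer = ReplayMemory(capacity=1000)
for t in range(100):
    buffer.push({"s": np.random.rand(3), "a": np.random.rand(1),
                 "s_": np.random.rand(3), "r": np.array(1.0), "tr": np.array(0.0)})
batch = buffer.sample(batch_size=32)
print(batch["s"].shape)   # torch.Size([32, 3])
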
/Torch_rl/common/util.py:
--------------------------------------------------------------------------------
1 | import csv
2 | def csv_record(data,path):
3 | with open(path+"record.csv", "a+") as csvfile:
4 | writer = csv.writer(csvfile)
5 | writer.writerow(data)
6 |
7 | import torch
8 | def gae(sample, last_value, gamma, lam):
9 | running_step = len(sample["s"])
10 | sample["advs"] = torch.zeros((running_step), dtype=torch.float32)
11 | value_cal = torch.cat(sample["value"]).squeeze()
12 |
13 | last_gaelam = 0
14 | last_return = 0
15 |
16 | value = torch.cat((value_cal, last_value))
17 | for t in reversed(range(running_step)):
18 | # sample["return"][t] = last_return = sample["r"][t] + gamma * last_return * (1-sample["tr"][t])
19 | delta = sample["r"][t] + gamma * value[t+1] * (1-sample["tr"][t]) - value[t]
20 | last_gaelam = delta + gamma * lam * (1-sample["tr"][t]) * last_gaelam
21 | sample["advs"][t] = last_gaelam
22 | sample["return"] = sample["advs"]+value_cal
23 |
24 | adv = sample["advs"] # Normalize the advantages
25 | adv = (adv - torch.mean(adv))/(torch.std(adv)+1e-8)
26 | sample["advs"] = adv
27 | # mean_ep_reward = torch.sum(sample["r"])/torch.sum(torch.eq(sample["tr"],1))
28 | # print("the runner have sampled "+str(running_step)+" data and the mean_ep_reward is ", mean_ep_reward)
29 | return sample
30 |
31 | def generate_reture(sample, last_value, gamma, lam):
32 | running_step = sample["s"].size()[0]
33 | sample["advs"] = torch.zeros((running_step, 1), dtype=torch.float32)
34 | sample["return"] = torch.zeros((running_step, 1), dtype=torch.float32)
35 |
36 | last_return = 0
37 | for t in reversed(range(running_step)):
38 | sample["return"][t] = last_return = sample["r"][t] + gamma * last_return * (1 - sample["tr"][t])
39 |
40 | r = sample["return"]
41 | r = (r - torch.mean(r)) / (torch.std(r) + 1e-8)
42 | sample["return"] = r
43 | sample["advs"] = sample["return"]-sample["value"]
44 | mean_ep_reward = torch.sum(sample["r"]) / torch.sum(torch.eq(sample["tr"], 1))
45 |     print("the runner has sampled " + str(running_step) + " data and the mean_ep_reward is ", mean_ep_reward)
46 | return sample
47 |
48 |
49 |
50 | def get_gae(rewards, masks, values, gamma, lamda):
51 | rewards = torch.Tensor(rewards)
52 | masks = torch.Tensor(masks)
53 | returns = torch.zeros_like(rewards)
54 | advants = torch.zeros_like(rewards)
55 |
56 | running_returns = 0
57 | previous_value = 0
58 | running_advants = 0
59 |
60 | for t in reversed(range(0, len(rewards))):
61 | running_returns = rewards[t] + gamma * running_returns * (1-masks[t])
62 | running_tderror = rewards[t] + gamma * previous_value * (1-masks[t]) - \
63 | values[t]
64 | running_advants = running_tderror + gamma * lamda * \
65 | running_advants * (1-masks[t])
66 |
67 | returns[t] = running_returns
68 | previous_value = values[t]
69 | advants[t] = running_advants
70 |
71 | advants = (advants - advants.mean()) / advants.std()
72 | return returns, advants
--------------------------------------------------------------------------------
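
get_gae above returns the discounted returns and normalized GAE advantages for a rollout; masks[t] = 1 marks a terminal step, since every bootstrap term is multiplied by (1 - masks[t]). A toy three-step episode:

import torch
from Torch_rl.common.util import get_gae

rewards = [1.0, 1.0, 1.0]
masks = [0.0, 0.0, 1.0]                    # the episode terminates at the last step
values = torch.tensor([0.5, 0.5, 0.5])     # critic estimates for the three states
returns, advants = get_gae(rewards, masks, values, gamma=0.99, lamda=0.95)
print(returns.shape, advants.shape)        # torch.Size([3]) torch.Size([3])
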
/Torch_rl/example/agent_example/RUN_Catrpole_with_DQN.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import time
3 | from Torch_rl.agent.DQN import DQN_Agent
4 | from Torch_rl.model.Network import DenseNet
5 | from torch import nn
6 | from Torch_rl.common.Policy_for_DQN import MaxBoltzmannQPolicy
7 | #%%
8 | envID = "CartPole-v0"
9 | env = gym.make(envID)
10 |
11 | nowtime = time.strftime('%y%m%d%H%M',time.localtime())
12 | path = "savedate" + '/' + envID + "dqn" + nowtime+'/'
13 | #%%
14 |
15 | policy = MaxBoltzmannQPolicy()
16 | model = DenseNet(env.observation_space.shape[0], env.action_space.n, hidden_activate=nn.Tanh())
17 |
18 | Agent = DQN_Agent(env, model, policy, gamma=0.99, lr=1e-3, path=path)
19 |
20 | Agent.train(max_step=50000, render=False, verbose=2)
21 | Agent.test(max_step=10000, render=True, verbose=2)
22 |
--------------------------------------------------------------------------------
/Torch_rl/example/agent_example/RUN_Pendulum_with_PPO.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import time
3 | from Torch_rl import PPO
4 | from Torch_rl.model.Network import DenseNet
5 | from torch import nn
6 |
7 | #%%
8 | envID ="Pendulum-v0"
9 | env = gym.make(envID)
10 |
11 | nowtime = time.strftime('%y%m%d%H%M',time.localtime())
12 | path = "../savedate" + '/' + envID + "ppo" + nowtime+'/'
13 | #%%
14 | policy_model = DenseNet(env.observation_space.shape[0], env.action_space.shape[0],
15 | hidden_activate=nn.Tanh(), hidden_layer=[64, 64])
16 | value_model = DenseNet(env.observation_space.shape[0], 1,
17 | hidden_activate=nn.Tanh(), hidden_layer=[64, 64])
18 |
19 | Agent = PPO(env, policy_model, value_model, gamma=0.90,
20 | lr=1e-4, running_step=512, batch_size=64, value_train_round=10, path=path)
21 |
22 | Agent.train(max_step=1500000, render=False, verbose=0, record_ep_inter=1)
23 | Agent.test(max_step=10000, render=True, verbose=2)
24 |
--------------------------------------------------------------------------------
/Torch_rl/example/agent_example/RUN_Pendulum_with_TD3.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import time
3 | from Torch_rl import TD3
4 | from Torch_rl.model.Network import DenseNet
5 | from torch import nn
6 | #%%
7 | envID = "Pendulum-v0"
8 | env = gym.make(envID)
9 |
10 | nowtime = time.strftime('%y%m%d%H%M', time.localtime())
11 | path = "savedate" + '/' + envID + "-td3-" + nowtime+'/'
12 | #%%
13 | actor = DenseNet(env.observation_space.shape[0], env.action_space.shape[0],
14 | hidden_activate=nn.ReLU(), hidden_layer=[64, 64])
15 | critic = DenseNet(env.observation_space.shape[0]+env.action_space.shape[0], 1,
16 | hidden_activate=nn.ReLU(), hidden_layer=[64, 64])
17 | Agent = TD3(env, actor, critic, gamma=0.99, path=path)
18 |
19 | Agent.train(max_step=50000, render=False, verbose=2)
20 | Agent.test(max_step=10000, render=True, verbose=2)
21 |
22 |
23 |
--------------------------------------------------------------------------------
/Torch_rl/example/agent_example/RUN_mountaincar_with_DQN.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import time
3 | from Torch_rl import DQN
4 | from Torch_rl.model.Network import DenseNet
5 | from torch import nn
6 | from Torch_rl.common.Policy_for_DQN import EpsGreedyQPolicy
7 | #%%
8 | envID = "MountainCar-v0"
9 | env = gym.make(envID)
10 |
11 | nowtime = time.strftime('%y%m%d%H%M',time.localtime())
12 | path = "savedate" + '/' + envID + "-dqn-" + nowtime+'/'
13 | #%%
14 |
15 | policy = EpsGreedyQPolicy()
16 | model = DenseNet(env.observation_space.shape[0], env.action_space.n, hidden_activate=nn.Tanh())
17 |
18 | Agent = DQN(env, model, policy, gamma=0.90, lr=1e-3, path=path)
19 |
20 | Agent.train(max_step=100000, render=False, verbose=2)
21 | Agent.save_weights(path)
22 | Agent.test(max_step=10000, render=True, verbose=2)
23 |
--------------------------------------------------------------------------------
/Torch_rl/example/agent_example/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/example/agent_example/__init__.py
--------------------------------------------------------------------------------
/Torch_rl/example/algorithm_example/RUN_Pendulum_with_PPO_LSTM.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import time
3 | from Torch_rl import PPO
4 | from Torch_rl.model.Network import DenseNet, LSTM_Dense_Hin
5 | from torch import nn
6 |
7 | #%%
8 | envID ="Pendulum-v0"
9 | env = gym.make(envID)
10 |
11 | nowtime = time.strftime('%y%m%d%H%M',time.localtime())
12 | path = "../savedate" + '/' + envID + "ppo" + nowtime+'/'
13 | #%%
14 | policy_model = LSTM_Dense_Hin(env.observation_space.shape[0], env.action_space.shape[0],
15 | lstm_unit=64, lstm_layer=1, dense_layer=[64],
16 | hidden_activate=nn.Tanh())
17 | value_model = DenseNet(env.observation_space.shape[0], 1,
18 | hidden_activate=nn.Tanh(), hidden_layer=[64, 64])
19 |
20 | Agent = PPO(env, policy_model, value_model, gamma=0.90,
21 |             lr=1e-4, running_step=2048, batch_size=64, value_train_round=10, path=path, lstm_enable=True)
22 |
23 | Agent.train(max_step=1500000, render=False, verbose=0, record_ep_inter=1)
24 | Agent.test(max_step=10000, render=True, verbose=2)
25 |
--------------------------------------------------------------------------------
/Torch_rl/example/algorithm_example/RUN_Pendulum_with_PPO_largrangian.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import time
3 | from Torch_rl.algorithm.PPO_Lagrangian import PPO_LAGRANGIAN_Agent
4 | from Torch_rl.model.Network import DenseNet, LSTM_Dense_Hin
5 | from torch import nn
6 |
7 | #%%
8 | envID ="Pendulum-v0"
9 | env = gym.make(envID)
10 |
11 | nowtime = time.strftime('%y%m%d%H%M',time.localtime())
12 | path = "../savedate" + '/' + envID + "ppo" + nowtime+'/'
13 | #%%
14 | policy_model = LSTM_Dense_Hin(env.observation_space.shape[0], env.action_space.shape[0],
15 | lstm_unit=64, lstm_layer=1, dense_layer=[64],
16 | hidden_activate=nn.Tanh())
17 | value_model = DenseNet(env.observation_space.shape[0], 1,
18 | hidden_activate=nn.Tanh(), hidden_layer=[64, 64])
19 |
20 | Agent = PPO_LAGRANGIAN_Agent(env, policy_model, value_model, gamma=0.90,
21 |                              lr=1e-4, running_step=2048, batch_size=64, value_train_round=10, path=path, lstm_enable=True)
22 |
23 | Agent.train(max_step=1500000, render=False, verbose=0, record_ep_inter=1)
24 | Agent.test(max_step=10000, render=True, verbose=2)
25 |
--------------------------------------------------------------------------------
/Torch_rl/example/algorithm_example/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/example/algorithm_example/__init__.py
--------------------------------------------------------------------------------
/Torch_rl/model/GNN_layer.py:
--------------------------------------------------------------------------------
1 |
2 | import math
3 | import numpy as np
4 | import torch
5 | from torch import nn
6 | import scipy.sparse as sp
7 | from torch.nn.parameter import Parameter
8 | from torch.nn.modules.module import Module
9 | from scipy.sparse import coo_matrix
10 | import torch.nn.functional as F
11 |
12 |
13 | " The GCN layer"
14 | def normalize(adj):
15 | """Row-normalize sparse matrix"""
16 | rowsum = np.array(adj.sum(1))
17 | r_inv = np.power(rowsum, -1).flatten()
18 | r_inv[np.isinf(r_inv)] = 0.
19 | r_mat_inv = np.diag(r_inv)
20 | mx = r_mat_inv.dot(adj)
21 | return mx
22 |
23 | def normalize_sparse(adj):
24 | """Symmetrically normalize adjacency matrix."""
25 | adj = sp.coo_matrix(adj)
26 | rowsum = np.array(adj.sum(1))
27 | d_inv_sqrt = np.power(rowsum, -0.5).flatten()
28 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
29 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
30 | return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()
31 |
32 | class GraphConvolution(Module):
33 | def __init__(self, adj, in_features, out_features,
34 | activate=nn.ReLU(), sparse_inputs=False, chebyshev_polynomials = 0, dropout = 0.5, bias=True):
35 | super(GraphConvolution, self).__init__()
36 | self.in_features = in_features
37 | self.out_features = out_features
38 | self.activate = activate
39 | self.weight = Parameter(torch.FloatTensor(in_features, out_features))
40 | if bias:
41 | self.bias = Parameter(torch.FloatTensor(out_features))
42 | else:
43 | self.register_parameter('bias', None)
44 | self.reset_parameters()
45 |         self.sparse_inputs = sparse_inputs
46 |
47 | if sparse_inputs:
48 | adj = adj.toarray()
49 | if chebyshev_polynomials>0:
50 | T_K = []
51 | T_K.append(np.eye(adj.shape[0]))
52 | laplacian = np.eye(adj.shape[0]) - normalize(adj)
53 | largest_eigval, _ = np.linalg.eig(laplacian)
54 | scaled_laplacian = (2. / largest_eigval[0]) * laplacian - np.eye(adj.shape[0])
55 | T_K.append(scaled_laplacian)
56 | for i in range(2, chebyshev_polynomials+1):
57 | T_K.append(2 * np.dot(scaled_laplacian,T_K[-1])-T_K[-2])
58 | self.T_k = T_K
59 | else:
60 | self.T_k = [normalize(adj)]
61 | if sparse_inputs:
62 | self.adj = [coo_matrix(T) for T in self.T_k]
63 | else:
64 | self.adj = self.T_k
65 |
66 | def reset_parameters(self):
67 | stdv = 1. / math.sqrt(self.weight.size(1))
68 | self.weight.data.uniform_(-stdv, stdv)
69 | if self.bias is not None:
70 | self.bias.data.uniform_(-stdv, stdv)
71 |
72 | def forward(self, input):
73 | support = torch.mm(input, self.weight)
74 | output = torch.zeros_like(support)
75 | if self.sparse_inputs:
76 | for adj in self.adj:
77 | output = output + torch.sparse.mm(adj, support)
78 | else:
79 | for adj in self.adj:
80 | output = output + torch.mm(adj, support)
81 | if self.bias is not None:
82 | return self.activate(output + self.bias)
83 | else:
84 | return self.activate(output)
85 |
86 | def __repr__(self):
87 | return self.__class__.__name__ + ' (' \
88 | + str(self.in_features) + ' -> ' \
89 | + str(self.out_features) + ')'
90 |
91 | "The Graph SAGE"
92 |
93 |
94 |
95 |
96 | "The Graph Attention Network"
97 |
98 | "the following code was implemented by https://github.com/Diego999/pyGAT"
99 |
100 | class GraphAttentionLayer(nn.Module):
101 | """
102 | Simple GAT layer, similar to https://arxiv.org/abs/1710.10903
103 | """
104 |
105 | def __init__(self, in_features, out_features, dropout, alpha, concat=True):
106 | super(GraphAttentionLayer, self).__init__()
107 | self.dropout = dropout
108 | self.in_features = in_features
109 | self.out_features = out_features
110 | self.alpha = alpha
111 | self.concat = concat
112 |
113 | self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
114 | nn.init.xavier_uniform_(self.W.data, gain=1.414)
115 | self.a = nn.Parameter(torch.zeros(size=(2*out_features, 1)))
116 | nn.init.xavier_uniform_(self.a.data, gain=1.414)
117 |
118 | self.leakyrelu = nn.LeakyReLU(self.alpha)
119 |
120 | def forward(self, input, adj):
121 |
122 | h = torch.mm(input, self.W)
123 | N = h.size()[0]
124 |
125 | a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * self.out_features)
126 | e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))
127 |
128 | zero_vec = -9e15*torch.ones_like(e)
129 | attention = torch.where(adj > 0, e, zero_vec)
130 | attention = F.softmax(attention, dim=1)
131 | attention = F.dropout(attention, self.dropout, training=self.training)
132 | h_prime = torch.matmul(attention, h)
133 |
134 | if self.concat:
135 | return F.elu(h_prime)
136 | else:
137 | return h_prime
138 |
139 | def __repr__(self):
140 | return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')'
141 |
142 |
143 | class SpecialSpmmFunction(torch.autograd.Function):
144 | """Special function for only sparse region backpropataion layer."""
145 |
146 | @staticmethod
147 | def forward(ctx, indices, values, shape, b):
148 | assert indices.requires_grad == False
149 | a = torch.sparse_coo_tensor(indices, values, shape)
150 | ctx.save_for_backward(a, b)
151 | ctx.N = shape[0]
152 | return torch.matmul(a, b)
153 |
154 | @staticmethod
155 | def backward(ctx, grad_output):
156 | a, b = ctx.saved_tensors
157 | grad_values = grad_b = None
158 | if ctx.needs_input_grad[1]:
159 | grad_a_dense = grad_output.matmul(b.t())
160 | edge_idx = a._indices()[0, :] * ctx.N + a._indices()[1, :]
161 | grad_values = grad_a_dense.view(-1)[edge_idx]
162 | if ctx.needs_input_grad[3]:
163 | grad_b = a.t().matmul(grad_output)
164 | return None, grad_values, None, grad_b
165 |
166 |
167 | class SpecialSpmm(nn.Module):
168 | def forward(self, indices, values, shape, b):
169 | return SpecialSpmmFunction.apply(indices, values, shape, b)
170 |
171 |
172 | class SpGraphAttentionLayer(nn.Module):
173 | """
174 | Sparse version GAT layer, similar to https://arxiv.org/abs/1710.10903
175 | """
176 |
177 | def __init__(self, in_features, out_features, dropout, alpha, concat=True):
178 | super(SpGraphAttentionLayer, self).__init__()
179 | self.in_features = in_features
180 | self.out_features = out_features
181 | self.alpha = alpha
182 | self.concat = concat
183 |
184 | self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
185 | nn.init.xavier_normal_(self.W.data, gain=1.414)
186 |
187 | self.a = nn.Parameter(torch.zeros(size=(1, 2 * out_features)))
188 | nn.init.xavier_normal_(self.a.data, gain=1.414)
189 |
190 | self.dropout = nn.Dropout(dropout)
191 | self.leakyrelu = nn.LeakyReLU(self.alpha)
192 | self.special_spmm = SpecialSpmm()
193 |
194 | def forward(self, input, adj):
195 | dv = 'cuda' if input.is_cuda else 'cpu'
196 |
197 | N = input.size()[0]
198 | edge = adj.nonzero().t()
199 |
200 | h = torch.mm(input, self.W)
201 | # h: N x out
202 | assert not torch.isnan(h).any()
203 |
204 | # Self-attention on the nodes - Shared attention mechanism
205 | edge_h = torch.cat((h[edge[0, :], :], h[edge[1, :], :]), dim=1).t()
206 | # edge: 2*D x E
207 |
208 | edge_e = torch.exp(-self.leakyrelu(self.a.mm(edge_h).squeeze()))
209 | assert not torch.isnan(edge_e).any()
210 | # edge_e: E
211 |
212 | e_rowsum = self.special_spmm(edge, edge_e, torch.Size([N, N]), torch.ones(size=(N, 1), device=dv))
213 | # e_rowsum: N x 1
214 |
215 | edge_e = self.dropout(edge_e)
216 | # edge_e: E
217 |
218 | h_prime = self.special_spmm(edge, edge_e, torch.Size([N, N]), h)
219 | assert not torch.isnan(h_prime).any()
220 | # h_prime: N x out
221 |
222 | h_prime = h_prime.div(e_rowsum)
223 | # h_prime: N x out
224 | assert not torch.isnan(h_prime).any()
225 |
226 | if self.concat:
227 | # if this layer is not last layer,
228 | return F.elu(h_prime)
229 | else:
230 | # if this layer is last layer,
231 | return h_prime
232 |
233 | def __repr__(self):
234 | return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')'
--------------------------------------------------------------------------------
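
A toy forward pass through the dense GraphAttentionLayer defined above, on a 4-node graph, just to show the expected shapes (entries of adj greater than zero are treated as edges):

import torch
from Torch_rl.model.GNN_layer import GraphAttentionLayer

adj = torch.tensor([[1., 1., 0., 0.],
                    [1., 1., 1., 0.],
                    [0., 1., 1., 1.],
                    [0., 0., 1., 1.]])
features = torch.randn(4, 8)               # 4 nodes, 8 input features each
layer = GraphAttentionLayer(in_features=8, out_features=16, dropout=0.0, alpha=0.2, concat=True)
print(layer(features, adj).shape)          # torch.Size([4, 16])
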
/Torch_rl/model/GNN_network.py:
--------------------------------------------------------------------------------
1 |
2 | import math
3 | import numpy as np
4 | import torch
5 | from torch import nn
6 | import scipy.sparse as sp
7 | from torch.nn.parameter import Parameter
8 | from torch.nn.modules.module import Module
9 | from scipy.sparse import coo_matrix
10 |
11 |
12 |
13 | " The GCN Part"
14 | def normalize(adj):
15 | """Row-normalize sparse matrix"""
16 | rowsum = np.array(adj.sum(1))
17 | r_inv = np.power(rowsum, -1).flatten()
18 | r_inv[np.isinf(r_inv)] = 0.
19 | r_mat_inv = np.diag(r_inv)
20 | mx = r_mat_inv.dot(adj)
21 | return mx
22 |
23 | def normalize_sparse(adj):
24 | """Symmetrically normalize adjacency matrix."""
25 | adj = sp.coo_matrix(adj)
26 | rowsum = np.array(adj.sum(1))
27 | d_inv_sqrt = np.power(rowsum, -0.5).flatten()
28 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
29 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
30 | return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()
31 |
32 | class GraphConvolution(Module):
33 | def __init__(self, adj, in_features, out_features,
34 | activate=nn.ReLU(), sparse_inputs=False, chebyshev_polynomials = 0, dropout = 0.5, bias=True):
35 | super(GraphConvolution, self).__init__()
36 | self.in_features = in_features
37 | self.out_features = out_features
38 | self.activate = activate
39 | self.weight = Parameter(torch.FloatTensor(in_features, out_features))
40 | if bias:
41 | self.bias = Parameter(torch.FloatTensor(out_features))
42 | else:
43 | self.register_parameter('bias', None)
44 | self.reset_parameters()
45 |         self.sparse_inputs = sparse_inputs
46 |
47 | if sparse_inputs:
48 | adj = adj.toarray()
49 | if chebyshev_polynomials>0:
50 | T_K = []
51 | T_K.append(np.eye(adj.shape[0]))
52 | laplacian = np.eye(adj.shape[0]) - normalize(adj)
53 | largest_eigval, _ = np.linalg.eig(laplacian)
54 | scaled_laplacian = (2. / largest_eigval[0]) * laplacian - np.eye(adj.shape[0])
55 | T_K.append(scaled_laplacian)
56 | for i in range(2, chebyshev_polynomials+1):
57 | T_K.append(2 * np.dot(scaled_laplacian,T_K[-1])-T_K[-2])
58 | self.T_k = T_K
59 | else:
60 | self.T_k = [normalize(adj)]
61 | if sparse_inputs:
62 | self.adj = [coo_matrix(T) for T in self.T_k]
63 | else:
64 | self.adj = self.T_k
65 |
66 | def reset_parameters(self):
67 | stdv = 1. / math.sqrt(self.weight.size(1))
68 | self.weight.data.uniform_(-stdv, stdv)
69 | if self.bias is not None:
70 | self.bias.data.uniform_(-stdv, stdv)
71 |
72 | def forward(self, input):
73 | support = torch.mm(input, self.weight)
74 | output = torch.zeros_like(support)
75 | if self.sparse_inputs:
76 | for adj in self.adj:
77 | output = output + torch.sparse.mm(adj, support)
78 | else:
79 | for adj in self.adj:
80 | output = output + torch.mm(adj, support)
81 | if self.bias is not None:
82 | return self.activate(output + self.bias)
83 | else:
84 | return self.activate(output)
85 |
86 | def __repr__(self):
87 | return self.__class__.__name__ + ' (' \
88 | + str(self.in_features) + ' -> ' \
89 | + str(self.out_features) + ')'
90 |
91 |
92 |
93 | "The Graph SAGE"
94 |
95 |
96 |
97 |
98 | "The Graph Attention Network"
99 |
--------------------------------------------------------------------------------
/Torch_rl/model/special_model.py:
--------------------------------------------------------------------------------
1 |
2 | import torch
3 | import numpy as np
4 | from torch import nn
5 | import torch.nn.functional as F
6 | from collections import OrderedDict
7 | from torch.distributions import Normal, Categorical
8 | from torch.autograd import Variable
9 | from copy import deepcopy
10 |
11 |
12 | class Multi_in(nn.Module):
13 | def __init__(self, observation_size, action_size, hidden_up_layer=[64, 64], hidden_down_layer=[64, 64],
14 | hidden_activate=nn.ReLU(), output_activate=None,
15 | BatchNorm = False):
16 | super(Multi_in, self).__init__()
17 |
18 | self.up_layer1 = nn.Linear(observation_size, hidden_up_layer[0], bias=True)
19 | self.up_layer2 = nn.Linear(hidden_up_layer[0], hidden_up_layer[1], bias=True)
20 | self.down_layer1 = nn.Linear(hidden_up_layer[1]*3, hidden_down_layer[0], bias=True)
21 |         self.down_layer2 = nn.Linear(hidden_down_layer[0], hidden_down_layer[1], bias=True)
22 |         self.output_layer = nn.Linear(hidden_down_layer[1], action_size+1, bias=True)
23 |
24 | self.hidden_activate = hidden_activate
25 | self.output_activate = output_activate
26 |
27 | self.gpu = False
28 |
29 | def forward(self, x1,x2,x3):
30 |
31 | x1 = self.hidden_activate(self.up_layer1(x1))
32 | x1 = self.hidden_activate(self.up_layer2(x1))
33 |
34 | x2 = self.hidden_activate(self.up_layer1(x2))
35 | x2 = self.hidden_activate(self.up_layer2(x2))
36 |
37 | x3 = self.hidden_activate(self.up_layer1(x3))
38 | x3 = self.hidden_activate(self.up_layer2(x3))
39 | x = torch.cat([x1,x2,x3], dim=-1)
40 | x = self.hidden_activate(self.down_layer1(x))
41 | x = self.hidden_activate(self.down_layer2(x))
42 |         x = self.hidden_activate(self.output_layer(x))
43 | Q = x[0]+torch.mean(x[1:])
44 | return Q
45 |
46 |
--------------------------------------------------------------------------------
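
A quick shape check for Multi_in in special_model.py: the three observation inputs pass through shared up-layers, are concatenated, and a dueling-style head combines the outputs into a single Q value. The sizes below are arbitrary and the call assumes the default layer widths:

import torch
from Torch_rl.model.special_model import Multi_in

net = Multi_in(observation_size=6, action_size=4)
x1, x2, x3 = torch.randn(6), torch.randn(6), torch.randn(6)
print(net(x1, x2, x3))   # a scalar Q value for this unbatched input
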
/Torch_rl/temp_file/PPO.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from Torch_rl.agent.core_value import Agent_value_based
4 | from Torch_rl.common.memory import ReplayMemory
5 | from copy import deepcopy
6 | from Torch_rl.common.distribution import *
7 | from torch.optim import Adam
8 | from torch.autograd import Variable
9 | import random
10 | from Torch_rl.common.util import csv_record
11 | from Torch_rl.common.util import generate_reture,gae
12 |
13 |
14 | class PPO_Agent(Agent_value_based):
15 | def __init__(self, env, policy_model, value_model,
16 | lr=1e-4, ent_coef=0.01, vf_coef=0.5,
17 |                  ## hyper-parameter
18 | gamma=0.99, lam=0.95, cliprange=0.2, batch_size = 32,
19 | buffer_size=50000, learning_starts=1000, running_step="synchronization", batch_training_round=10,
20 | value_regular=0.01, train_value_round = 1,
21 | ## decay
22 | decay=False, decay_rate=0.9,
23 | ##
24 | path=None):
25 |
26 | self.env = env
27 | self.gamma = gamma
28 | self.lam = lam
29 | self.ent_coef = ent_coef
30 | self.vf_coef = vf_coef
31 | self.cliprange = cliprange
32 | self.batch_size = batch_size
33 | self.batch_training_round = batch_training_round
34 | self.learning_starts = learning_starts
35 | self.train_value_round = train_value_round
36 |         if running_step == "synchronization":
37 | self.run_step = 1
38 | else:
39 | self.run_step = running_step
40 |
41 |
42 | self.replay_buffer = ReplayMemory(buffer_size)
43 | self.loss_cal = torch.nn.MSELoss()
44 |
45 | self.policy_model = policy_model
46 | if value_model == "shared":
47 | self.value_model = policy_model
48 | elif value_model == "copy":
49 | self.value_model = deepcopy(policy_model)
50 | else:
51 | self.value_model = value_model
52 |
53 | self.run_policy_model,self.run_value_model = deepcopy(self.policy_model), deepcopy(self.value_model)
54 |
55 | self.dist = make_pdtype(env.action_space, policy_model)
56 |
57 | policy_model_optim = Adam(self.policy_model.parameters(), lr=lr)
58 | value_model_optim = Adam(self.value_model.parameters(), lr=lr, weight_decay=value_regular)
59 | if decay:
60 | self.policy_model_optim = torch.optim.lr_scheduler.ExponentialLR(policy_model_optim, decay_rate,
61 | last_epoch=-1)
62 | self.value_model_optim = torch.optim.lr_scheduler.ExponentialLR(value_model_optim, decay_rate,
63 | last_epoch=-1)
64 | else:
65 | self.policy_model_optim = policy_model_optim
66 | self.value_model_optim = value_model_optim
67 |
68 | torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1, norm_type=2)
69 | torch.nn.utils.clip_grad_norm_(self.value_model.parameters(), 1, norm_type=2)
70 |
71 | super(PPO_Agent, self).__init__(path)
72 | example_input = Variable(torch.rand(100, self.env.observation_space.shape[0]))
73 | self.writer.add_graph(self.policy_model, input_to_model=example_input)
74 | self.forward_step_show_list = []
75 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"]
76 | self.forward_ep_show_list = []
77 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"]
78 |
79 | self.training_round = 0
80 | self.running_step = 0
81 | self.record_sample = None
82 | self.loss_record = {"pg_loss": [], "entropy": [], "vf_loss": [], "loss": []}
83 |
84 | def forward(self, observation):
85 | observation = observation[np.newaxis, :].astype(np.float32)
86 | observation = torch.from_numpy(observation)
87 | outcome = self.policy_model.forward(observation)
88 | self.pd = self.dist(outcome)
89 | self.action = self.pd.sample()
90 | self.Q = self.value_model.forward(observation).squeeze()
91 | return self.action.squeeze(0).detach().numpy(), self.Q.squeeze(0).detach().numpy(), {}
92 |
93 | def backward(self, sample_):
94 | self.replay_buffer.push(sample_)
95 | self.running_step += 1
96 | """"""""""""""
97 | "training part"
98 | """"""""""""""
99 | if self.step > self.learning_starts and self.learning:
100 | if self.record_sample is None and self.running_step > self.run_step:
101 | print("***************************************")
102 | print("In the ", self.episode, "ep")
103 | sample = self.replay_buffer.recent_step_sample(self.running_step)
104 | " sample advantage generate "
105 | sample["value"] = self.value_model.forward(sample["s"]).squeeze()
106 | last_value = self.value_model.forward(sample["s_"][-1])
107 | self.record_sample = gae(sample, last_value, self.gamma, self.lam)
108 | " sample log_probabilty generate"
109 | outcome = self.policy_model.forward(sample["s"])
110 | self.pd = self.dist(outcome)
111 | sample["logp"] = self.pd.log_prob(sample["a"])
112 | self.loss_record = {"pg_loss": [], "entropy": [], "vf_loss": [], "loss": []}
113 | self.running_step = 0
114 | if self.record_sample is not None:
115 |                 print("the learning has started...........")
116 | while self.training_round < self.batch_training_round:
117 | start = (self.batch_size * self.training_round) % self.record_sample["s"].size()[0]
118 | if start+self.batch_size >= self.record_sample["s"].size()[0]:
119 | end = self.record_sample["s"].size()[0]
120 | else:
121 | end = start+self.batch_size
122 | index = np.arange(start, end)
123 | S = self.record_sample["s"][index]
124 | A = self.record_sample["a"][index]
125 | old_log = self.record_sample["logp"][index].detach()
126 | advs = self.record_sample["advs"][index].detach()
127 | value = self.record_sample["value"][index].detach()
128 | returns = self.record_sample["return"][index].detach()
129 |
130 |                     " training the value model"
131 |
132 | value_now = self.value_model.forward(S)
133 | value_clip = value + torch.clamp(value_now - value, min=-self.cliprange, max=self.cliprange) # Clipped value
134 | vf_loss1 = self.loss_cal(value_now, returns) # Unclipped loss
135 | vf_loss2 = self.loss_cal(value_clip, returns) # clipped loss
136 |                     vf_loss = .5 * torch.max(vf_loss1, vf_loss2)  # value loss
137 |                     # vf_loss = 0.5 * vf_loss1
138 | " CALCULATE THE LOSS"
139 | " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss"
140 |
141 | #generate Policy gradient loss
142 | outcome = self.policy_model.forward(S)
143 | new_policy = self.dist(outcome)
144 | new_lop = new_policy.log_prob(A)
145 | ratio = torch.exp(new_lop-old_log)
146 | pg_loss1 = advs * ratio
147 | pg_loss2 = advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
148 | pg_loss = -.5 * torch.min(pg_loss1, pg_loss2).mean()
149 |
150 | # entropy
151 | entropy = new_policy.entropy().mean()
152 | loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef
153 |
154 | self.value_model_optim.zero_grad()
155 | loss.backward(retain_graph=True)
156 | self.value_model_optim.step()
157 |
158 | self.policy_model_optim.zero_grad()
159 | loss.backward()
160 | self.policy_model_optim.step()
161 |
162 |
163 | # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"])
164 | # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange)
165 | self.training_round += 1
166 | print("round:", self.training_round,
167 | "pg_loss:", pg_loss.data.numpy(), "entropy:", entropy.data.numpy(), "vf_loss", vf_loss.data.numpy())
168 | self.loss_record["pg_loss"].append(pg_loss.data.numpy())
169 | self.loss_record["entropy"].append(entropy.data.numpy())
170 | self.loss_record["vf_loss"].append(vf_loss.data.numpy())
171 | self.loss_record["loss"].append(loss.data.numpy())
172 | self.training_round = 0
173 | self.record_sample = None
174 |
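
Both PPO variants in `temp_file` defer advantage computation to `gae` from `Torch_rl.common.util`. Its actual interface lives in `common/util.py`; the sketch below only illustrates what generalized advantage estimation does with a single rollout, with function name, arguments, and tensor layout assumed rather than taken from the library:

```python
import torch

def gae_sketch(rewards, values, last_value, dones, gamma=0.99, lam=0.95):
    """Generalized Advantage Estimation over one rollout (illustrative only)."""
    T = rewards.shape[0]
    advs = torch.zeros(T)
    running = 0.0
    next_value = last_value
    for t in reversed(range(T)):
        mask = 1.0 - dones[t].float()                       # stop bootstrapping at episode ends
        delta = rewards[t] + gamma * next_value * mask - values[t]
        running = delta + gamma * lam * mask * running
        advs[t] = running
        next_value = values[t]
    returns = advs + values                                 # regression targets for the value model
    return advs, returns
```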
--------------------------------------------------------------------------------
/Torch_rl/temp_file/PPO2.py:
--------------------------------------------------------------------------------
102 |         if self.step > self.learning_starts:
103 | if self.running_step % self.run_step == 0 and self.training_step == 0:
104 | " sample advantage generate "
105 | with torch.no_grad():
106 | sample = self.replay_buffer.recent_step_sample(self.running_step)
107 | last_value = self.value_model.forward(sample["s_"][-1])
108 | self.record_sample = gae(sample, last_value, self.gamma, self.lam)
109 | self.running_step = 0
110 |
111 | if self.training_step < self.sample_training_step and self.record_sample is not None:
112 | pg_loss_re = 0
113 | entropy_re = 0
114 | vf_loss_re = 0
115 | loss_re = 0
116 | for _ in range(self.batch_training_round):
117 | index = self.train_ticks[self.training_step]
118 | S = self.record_sample["s"][index].detach()
119 | A = self.record_sample["a"][index].detach()
120 | old_log = self.record_sample["logp"][index].detach()
121 | advs = self.record_sample["advs"][index].detach()
122 | value = self.record_sample["value"][index].detach()
123 | returns = self.record_sample["return"][index].detach()
124 | # generate Policy gradient loss
125 | outcome = self.run_policy.forward(S)
126 | new_policy = self.dist(outcome)
127 | new_lop = new_policy.log_prob(A)
128 | ratio = torch.exp(new_lop - old_log)
129 | pg_loss1 = advs * ratio
130 | pg_loss2 = advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange)
131 | pg_loss = -.5 * torch.min(pg_loss1, pg_loss2).mean()
132 | # value loss
133 | value_now = self.run_value.forward(S)
134 | value_clip = value + torch.clamp(value_now - value, min=-self.cliprange,
135 | max=self.cliprange) # Clipped value
136 | vf_loss1 = self.loss_cal(value_now, returns) # Unclipped loss
137 | vf_loss2 = self.loss_cal(value_clip, returns) # clipped loss
138 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2)
139 | # vf_loss = 0.5 * vf_loss1
140 | # entropy
141 | entropy = new_policy.entropy().mean()
142 | loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef
143 | # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"])
144 | # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange)
145 |
146 | self.value_model_optim.zero_grad()
147 | loss.backward(retain_graph=True)
148 | self.value_model_optim.step()
149 |
150 | self.policy_model_optim.zero_grad()
151 | loss.backward()
152 | self.policy_model_optim.step()
153 |
154 | self.training_step += 1
155 | pg_loss_re += pg_loss.data.numpy()
156 | entropy_re += entropy.data.numpy()
157 | vf_loss_re += vf_loss.data.numpy()
158 | loss_re += loss.data.numpy()
159 |
160 | if self.training_step == self.sample_training_step:
161 |                 print("the " + str(self.episode) + " round of training has finished")
162 | self.run_policy.load_state_dict(self.policy_model.state_dict())
163 | self.run_value.load_state_dict(self.value_model.state_dict())
164 | self.training_step = 0
165 | self.record_sample = None
166 | return loss_re, {"pg_loss": pg_loss_re, "entropy": entropy_re, "vf_loss": vf_loss_re}
167 | return 0, {"pg_loss": 0, "entropy": 0, "vf_loss": 0}
168 |
169 | def load_weights(self, filepath):
170 |         model = torch.load(filepath + "PPO.pkl")
171 | self.policy_model.load_state_dict(model["policy_model"].state_dict())
172 | self.value_model.load_state_dict(model["value_model"].state_dict())
173 |
174 | def save_weights(self, filepath, overwrite=False):
175 | torch.save({"policy_model": self.policy_model,"value_model": self.value_model}, filepath + "PPO.pkl")
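
The agents in this folder follow the same wiring as the runner scripts in `test_file`: build a gym environment, wrap the policy and value networks in `DenseNet`, and let the agent's `train` loop drive `forward`/`backward`. A hedged sketch of driving this `PPO_Agent`; the environment ID, network sizes, and step budget are illustrative assumptions:

```python
import gym
import time
from torch import nn
from Torch_rl.model.Network import DenseNet
from Torch_rl.temp_file.PPO import PPO_Agent

envID = "Pendulum-v0"                         # illustrative environment
env = gym.make(envID)
nowtime = time.strftime('%y%m%d%H%M', time.localtime())
path = "savedate" + '/' + envID + "-PPO-" + nowtime + '/'

# policy outputs the action-distribution parameters, value estimates V(s)
policy = DenseNet(env.observation_space.shape[0], env.action_space.shape[0], hidden_activate=nn.Tanh())
value = DenseNet(env.observation_space.shape[0], 1, hidden_activate=nn.Tanh())

agent = PPO_Agent(env, policy, value, gamma=0.99, lam=0.95, cliprange=0.2, path=path)
agent.train(max_step=50000, render=False, verbose=2)
agent.save_weights(path)
```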
--------------------------------------------------------------------------------
/Torch_rl/temp_file/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/temp_file/__init__.py
--------------------------------------------------------------------------------
/Torch_rl/test_file/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/test_file/__init__.py
--------------------------------------------------------------------------------
/Torch_rl/test_file/run_DP.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import time
3 | from Torch_rl.agent.DQN import DQN_Agent
4 | from Torch_rl.model.Network import DenseNet
5 | from torch import nn
6 | from Torch_rl.common.Policy_for_DQN import BoltzmannQPolicy
7 | #%%
8 | envID = 'D_place_action-v0'
9 | env = gym.make(envID)
10 |
11 | nowtime = time.strftime('%y%m%d%H%M', time.localtime())
12 | path = "savedate" + '/' + envID + "-DQN-" + nowtime+'/'
13 | #%%
14 |
15 | policy = BoltzmannQPolicy()
16 | model = DenseNet(env.observation_space.shape[0], env.action_space.n, hidden_activate=nn.Tanh())
17 | Agent = DQN_Agent(env, model, policy, gamma=0.99, path=path)
18 |
19 | Agent.train(max_step=10000, render=True, verbose=2)
20 | Agent.test(max_step=10000, render=True, verbose=2)
21 |
22 |
23 |
--------------------------------------------------------------------------------
/Torch_rl/test_file/run_HIRO.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import time
3 | from Hierarchical_RL.HIRO import HIRO_Agent
4 | from model.Network import DenseNet
5 | from torch import nn
6 | from common.Policy_for_DQN import BoltzmannQPolicy
7 | #%%
8 | envID = 'D_place_action-v0'
9 | env = gym.make(envID)
10 |
11 | nowtime = time.strftime('%y%m%d%H%M', time.localtime())
12 | path = "savedate" + '/' + envID + "-DQN-" + nowtime+'/'
13 | #%%
14 | goal = gym.spaces.Box(low=-1, high=1, shape=(5,))
15 | H_model = DenseNet(env.observation_space.shape[0], 32, hidden_activate=nn.Tanh())
16 | L_model = DenseNet(env.observation_space.shape[0]+goal.shape[0], env.action_space.n, hidden_activate=nn.Tanh())
17 | Agent = HIRO_Agent(env, "DDPG", H_model, "DQN", L_model,goal=goal, gamma=0.99, path=path)
18 |
19 | Agent.train(max_step=10000, render=True, verbose=2)
20 | Agent.test(max_step=10000, render=True, verbose=2)
21 |
22 |
23 |
--------------------------------------------------------------------------------
/Torch_rl/test_file/run_dp_dqn.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import time
3 | from agent.DQN import DQN_Agent
4 | from model.Network import DenseNet
5 | from torch import nn
6 | from common.Policy_for_DQN import EpsGreedyQPolicy
7 | #%%
8 | envID = 'D_place_action-v0'
9 | env = gym.make(envID)
10 |
11 | nowtime = time.strftime('%y%m%d%H%M',time.localtime())
12 | path = "savedate" + '/' + envID + "-dqn-" + nowtime+'/'
13 | #%%
14 |
15 | policy = EpsGreedyQPolicy()
16 | model = DenseNet(env.observation_space.shape[0], env.action_space.n, hidden_activate=nn.Tanh())
17 |
18 | Agent = DQN_Agent(env, model, policy, gamma=0.90, lr=1e-3, path=path)
19 |
20 | # Agent.train(max_step=1e6, render=False, verbose=2)
21 | # Agent.save_weights(path)
22 | #%%
23 | path = "savedate" + '/' + envID + "-dqn-" + "2002191728" + '/'
24 | Agent.load_weights(path)
25 | Agent.test(max_step=10000, render=True, verbose=2)
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/Torch_rl/test_file/testbackward.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from Torch_rl.model.Network import DenseNet
4 | from torch import nn
5 | import torch
6 | from torch.optim import Adam
7 | policy_model = DenseNet(12, 2,
8 | hidden_activate=nn.ReLU(), hidden_layer=[64, 64])
9 | input = torch.rand(size=(32,12))
10 | output = torch.rand(size=(32,2))
11 | loss_cal1 = torch.nn.SmoothL1Loss()
12 | loss_cal2 = torch.nn.MSELoss()
13 | policy_model_optim = Adam(policy_model.parameters(), lr=1e-4)
14 | for time in range(100):
15 | y = policy_model.forward(input)
16 | loss = loss_cal1(y, output)
17 | policy_model_optim.zero_grad()
18 | loss.backward(retain_graph=True)
19 | policy_model_optim.step()
20 | loss = loss_cal2(y, output)
21 | policy_model_optim.zero_grad()
22 | loss.backward()
23 | policy_model_optim.step()
--------------------------------------------------------------------------------
/Torch_rl/test_file/testtt.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from model.Network import DenseNet
3 | from torch import nn
4 | from torch.optim import Adam
5 |
6 | actor = DenseNet(5, 2, hidden_activate=nn.ReLU())
7 | critic = DenseNet(7, 1, hidden_activate=nn.ReLU())
8 |
9 | class actor_critic(nn.Module):
10 | def __init__(self, actor, critic):
11 | super(actor_critic, self).__init__()
12 | self.actor = actor
13 | self.critic = critic
14 |
15 | def forward(self, obs):
16 | a = self.actor(obs)
17 | input = torch.cat((obs, a), axis=-1)
18 | Q = self.critic(input)
19 | return Q
20 |
21 | actor_optim = Adam(actor.parameters(), lr=1e-1)
22 | critic_optim = Adam(critic.parameters(), lr=1e-1)
23 |
24 | input = torch.rand(10, 5)
25 | tgt = torch.rand(10, 1)
26 | loss_fun = torch.nn.MSELoss()
27 |
28 | a = actor(input)
29 | innn = torch.cat((input, a), axis=-1)
30 | b = critic(innn)
31 |
32 | actor.zero_grad()
33 | torch.mean(b).backward()
34 | actor_optim.step()
35 |
36 | ab = actor(input)
37 | bb = critic(innn)
38 |
39 | totalmodel = actor_critic(actor, critic)
40 | totalmodel(input)
--------------------------------------------------------------------------------
/Torch_rl/test_file/testttttt.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.autograd as autograd   # automatic differentiation module
3 | import torch.nn as nn               # neural network modules
4 | import torch.nn.functional as F     # common functional ops for nn
5 | import torch.optim as optim         # optimizers
6 |
7 | training_data = [
8 | ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
9 | ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
10 | ]
11 | word_to_ix = {}  # word-to-index dictionary
12 | for sent, tags in training_data:
13 | for word in sent:
14 | if word not in word_to_ix:
15 | word_to_ix[word] = len(word_to_ix)
16 | print(word_to_ix)
17 | tag_to_ix = {"DET": 0, "NN": 1, "V": 2}  # hand-built part-of-speech tag dictionary
18 |
19 | def prepare_sequence(seq, to_ix):
20 | idxs = [to_ix[w] for w in seq]
21 | tensor = torch.LongTensor(idxs)
22 | return autograd.Variable(tensor)
23 |
24 | inputs = prepare_sequence(training_data[0][0], word_to_ix)
25 |
26 |
27 | class LSTMTagger(nn.Module):
28 |
29 | def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
30 | super(LSTMTagger, self).__init__()
31 | self.hidden_dim = hidden_dim
32 |
33 | self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
34 |
35 | self.lstm = nn.LSTM(embedding_dim, hidden_dim)
36 |
37 | self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
38 | self.hidden = self.init_hidden()
39 |
40 | def init_hidden(self):
41 | return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
42 | autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))
43 |
44 | def forward(self, sentence):
45 | embeds = self.word_embeddings(sentence)
46 | lstm_out, self.hidden = self.lstm(
47 | embeds.view(len(sentence), 1, -1), self.hidden)
48 | tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
49 |         tag_scores = F.log_softmax(tag_space, dim=1)
50 | return tag_scores
51 |
52 |
53 | model = LSTMTagger(10,100, len(word_to_ix), len(tag_to_ix))
54 | loss_function = nn.NLLLoss()
55 | optimizer = optim.SGD(model.parameters(), lr=0.1)
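
The script stops after building the model, loss, and optimizer; the training loop from the standard PyTorch LSTM-tagger tutorial that this snippet mirrors would continue roughly as follows (the epoch count is arbitrary):

```python
# a few epochs over the toy corpus
for epoch in range(100):
    for sentence, tags in training_data:
        model.zero_grad()
        model.hidden = model.init_hidden()        # reset the LSTM state for each sentence

        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        tag_scores = model(sentence_in)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# inspect the predicted tag scores for the first training sentence
print(model(prepare_sequence(training_data[0][0], word_to_ix)))
```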
--------------------------------------------------------------------------------