├── .DS_Store ├── .gitattributes ├── .gitignore ├── .idea ├── .gitignore ├── Torch-rl.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── other.xml └── vcs.xml ├── LICENSE ├── README.md └── Torch_rl ├── .DS_Store ├── Hierarchical_RL ├── HIRO.py └── __init__.py ├── ImitationLearning ├── Behavior_Clone.py ├── GAIL.py ├── __init__.py └── core_IL.py ├── __init__.py ├── agent ├── .DS_Store ├── A3C.py ├── DDPG.py ├── DDPG_2.py ├── DQN.py ├── DRQN.py ├── PPO.py ├── SAC.py ├── TD3.py ├── TRPO.py ├── core_policy.py └── core_value.py ├── algorithm ├── PPO_LSTM.py ├── PPO_Lagrangian.py ├── SPPO.py └── __init__.py ├── common ├── Policy_for_DQN.py ├── distribution.py ├── logger.py ├── loss.py ├── memory.py └── util.py ├── example ├── agent_example │ ├── RUN_Catrpole_with_DQN.py │ ├── RUN_Pendulum_with_PPO.py │ ├── RUN_Pendulum_with_TD3.py │ ├── RUN_mountaincar_with_DQN.py │ └── __init__.py └── algorithm_example │ ├── RUN_Pendulum_with_PPO_LSTM.py │ ├── RUN_Pendulum_with_PPO_largrangian.py │ └── __init__.py ├── model ├── GNN_layer.py ├── GNN_network.py ├── Network.py └── special_model.py ├── temp_file ├── PPO.py ├── PPO2.py └── __init__.py └── test_file ├── __init__.py ├── run_DP.py ├── run_HIRO.py ├── run_dp_dqn.py ├── testbackward.py ├── testtt.py └── testttttt.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.pyc 3 | savedate/.DS_Store 4 | *.txt 5 | *.csv 6 | *.iml 7 | *.xml 8 | .idea/Torch-rl.iml 9 | .idea/misc.xml 10 | *.xml 11 | *.iml 12 | *.iml 13 | *.xml 14 | Torch_rl/agent/.DS_Store 15 | *.pkl 16 | *.0 17 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Default ignored files 3 | /workspace.xml -------------------------------------------------------------------------------- /.idea/Torch-rl.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | 12 | 14 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- 
/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Zee-MAC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Torch-rl 2 | ## Introduction 3 | Torch and TensorFlow are today's two main deep-learning frameworks. Excellent RL code already exists on both the TF and the Torch side, but in terms of usability there are few Torch RL implementations that combine a clean framework with easy-to-use code. 4 | This project borrows the framework of Keras-RL and the implementation ideas of the OpenAI baselines to build a Torch-based RL library. 5 | 6 | The aim is to build a Torch DRL framework in the simplest, fastest and most practical way, so that it can be used directly and saves everyone time when learning. Contributions are welcome; let's build it together. 7 | 8 | **The repository supports both CPU and GPU; MPI is not implemented yet.** The algorithms are written (loosely) following PEP 8 and come with comments. 9 | 10 | ## Repository structure 11 | + agent 12 | the agent core (the interaction loop with the environment) and all reinforcement-learning algorithms 13 | + common 14 | logging, loss functions, the replay buffer and the DQN exploration policies 15 | + model 16 | all deep-network implementations 17 | + savedata 18 | stores training results 19 | run_xxxxxxxxxxx training example scripts. 20 | 21 | ## Current progress 22 | The interaction and algorithm framework is built on Keras-RL, and the logger file is adapted from baselines, so training can be monitored directly through txt, CSV and TensorBoard output. 23 | 24 | #### Algorithms: 25 | 26 | + DQN (including Double DQN and Dueling DQN) [source code](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/agent/DQN.py) 27 | 28 | + DRQN [source code](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/agent/DRQN.py) 29 | 30 | + DDPG [source code](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/agent/DDPG.py) 31 | 32 | + PPO [source code ](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/agent/PPO3.py) 33 | 34 | + Batch-PPO [source code ](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/agent/PPO.py) 35 | 36 | + TD3 [source code ](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/agent/TD3.py) 37 | 38 | #### Networks: 39 | 40 | Fully connected networks, CNN, LSTM and CNN-LSTM models can be built quickly. 41 | 42 | 43 | 44 | 45 | 46 | ## Examples 47 | 48 | Some simple training examples are provided: 49 | 50 | [RUN_Catrpole_with_DQN.py](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/RUN_Catrpole_with_DQN.py) 51 | 52 | [RUN_Pendulum_with_DDPG.py](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/RUN_Pendulum_with_DDPG.py) 53 | 54 | [RUN_Pendulum_with_PPO.py](https://github.com/zachary2wave/Torch-rl/blob/master/Torch_rl/RUN_Pendulum_with_PPO.py) 55 | 56 | ## Tutorial: further updates pending...... 57 | 58 | 59 | 
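As a quick orientation, the sketch below shows how the pieces described in the README fit together. It is an illustrative sketch and an assumption, not a file from the repository: it uses only the DQN_Agent constructor visible in Torch_rl/agent/DQN.py further down in this dump, substitutes a plain torch.nn.Sequential for the repository's DenseNet builder (Torch_rl/model/Network.py is not reproduced here), and leaves the training loop, which lives in agent/core_value.py and the example scripts, as a comment rather than guessing its API.

```python
# Hypothetical quick-start sketch; not part of the repository.
import gym
import torch.nn as nn
from Torch_rl.agent.DQN import DQN_Agent

env = gym.make("CartPole-v0")
obs_dim = env.observation_space.shape[0]   # 4 for CartPole
n_actions = env.action_space.n             # 2 for CartPole

# Stand-in Q-network mapping an observation to one Q-value per action.
# The repository's own DenseNet (Torch_rl/model/Network.py) would normally be used here.
q_net = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, n_actions))

agent = DQN_Agent(env, q_net, policy=None,   # policy=None makes forward() fall back to greedy argmax
                  gamma=0.99, lr=1e-3, batch_size=32, buffer_size=50000,
                  learning_starts=1000, target_network_update_freq=500,
                  double_dqn=True, dueling_dqn=False)

# agent.forward(obs) returns (action, max Q, info); agent.backward(sample) consumes a
# dict with keys "s", "a", "s_", "r", "tr" and trains once the agent's step counter
# exceeds learning_starts. The episode/step loop that connects these two calls to the
# environment is inherited from Agent_value_based (agent/core_value.py) and is driven
# by the RUN_* scripts under example/agent_example/.
```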
-------------------------------------------------------------------------------- /Torch_rl/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/.DS_Store -------------------------------------------------------------------------------- /Torch_rl/Hierarchical_RL/HIRO.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from Torch_rl.common.memory import ReplayMemory 4 | from Torch_rl.agent.core_value import Agent_value_based 5 | from copy import deepcopy 6 | from torch.optim import Adam 7 | from torch import nn 8 | import torch.nn.functional as F 9 | from Torch_rl.common.loss import huber_loss 10 | from torch.autograd import Variable 11 | from gym import spaces as Space 12 | from Torch_rl.common.Policy_for_DQN import BoltzmannQPolicy 13 | 14 | class HIRO_Agent(Agent_value_based): 15 | def __init__(self, env, 16 | H_policy, H_model, L_policy, L_model, 17 | goal = Space.Box(low=-1, high=1, shape=(1,), dtype=np.float32), 18 | # step for H_model 19 | step_interval = 10, H_train_interval=100, H_train_time = 100, 20 | ## hyper-parameter 21 | gamma=0.90, H_lr=1e-3, L_lr = 1e-3, batch_size=32, buffer_size=50000, learning_starts=1000, 22 | H_target_network_update_freq=500, L_target_network_update_freq=500, 23 | decay=False, decay_rate=0.9, 24 | ## prioritized_replay 25 | ## 26 | path=None): 27 | """ 28 | 29 | :param env: 30 | :param H_policy: 31 | :param H_model: 32 | :param L_policy: 33 | :param L_model: 34 | :param goal: 35 | :param step_interval: 36 | :param gamma: 37 | :param H_lr: 38 | :param L_lr: 39 | :param batch_size: 40 | :param buffer_size: 41 | :param learning_starts: 42 | :param H_target_network_update_freq: 43 | :param L_target_network_update_freq: 44 | :param decay: 45 | :param decay_rate: 46 | :param path: 47 | """ 48 | 49 | self.env = env 50 | self.gamma = gamma 51 | self.batch_size = batch_size 52 | self.learning_starts = learning_starts 53 | self.step_interval = step_interval 54 | 55 | # self.replay_buffer = ReplayMemory(buffer_size) 56 | # generate policy 57 | if H_policy == "DDPG" and isinstance(goal, Space.Box) and len(H_model) == 2: 58 | from agent.DDPG import DDPG_Agent 59 | if isinstance(H_lr,list): 60 | ac_lr = H_lr[0] 61 | cr_lr = H_lr[1] 62 | else: 63 | ac_lr = H_lr 64 | cr_lr = H_lr 65 | if isinstance(H_target_network_update_freq,list): 66 | actor_target_network_update_freq = H_target_network_update_freq[0] 67 | critic_target_network_update_freq = H_target_network_update_freq[1] 68 | else: 69 | actor_target_network_update_freq = H_target_network_update_freq 70 | critic_target_network_update_freq = H_target_network_update_freq 71 | self.H_agent = DDPG_Agent(env, H_model[0], H_model[1], 72 | actor_lr=ac_lr, critic_lr=cr_lr, 73 | actor_target_network_update_freq=actor_target_network_update_freq, 74 | critic_target_network_update_freq=critic_target_network_update_freq, 75 | ## hyper-parameter 76 | gamma=gamma, batch_size=batch_size, buffer_size=buffer_size, learning_starts=learning_starts, 77 | ## decay 78 | decay=decay, decay_rate=decay_rate, 79 | ) 80 | self.H_main_net = self.H_agent.actor 81 | 82 | if H_policy == "PPO" and isinstance(goal, Space.Box): 83 | from agent.PPO import PPO_Agent 84 | self.high_agent = PPO_Agent() 85 | 86 | if L_policy == "DQN": 87 | from agent.DQN import DQN_Agent 88 | self.L_agent = DQN_Agent(env, L_model, BoltzmannQPolicy, 89 | 
gamma=gamma, lr=L_lr, batch_size=batch_size, buffer_size=buffer_size, learning_starts=learning_starts, 90 | target_network_update_freq=L_target_network_update_freq, 91 | decay=decay, decay_rate=decay_rate, 92 | double_dqn=True, dueling_dqn=False, dueling_way="native") 93 | self.L_main_net = self.L_agent.Q_net 94 | 95 | def forward(self, observation): 96 | observation = observation.astype(np.float32) 97 | observation = torch.from_numpy(observation) 98 | if self.step % self.step_interval == 0: 99 | goal = self.high_agent.forward(observation) 100 | if isinstance(goal,tuple): 101 | self.goal, Q = goal[0], goal[1] 102 | else: 103 | self.goal = goal 104 | L_observation = torch.cat(inputs=(observation, self.goal), dimension=0) 105 | action = self.L_agent.forward(L_observation) 106 | if isinstance(action, tuple): 107 | action, Q = action[0], action[1] 108 | else: 109 | action = action 110 | 111 | 112 | 113 | return action 114 | 115 | def backward(self, sample_): 116 | self.L_agent.backward(sample_) 117 | if self.step % self.step_interval == 0: 118 | self.L_agent.replay_buffer.sample(self.batch_size) 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | def load_weights(self, filepath): 130 | pass 131 | 132 | def save_weights(self, filepath, overwrite=False): 133 | pass 134 | 135 | -------------------------------------------------------------------------------- /Torch_rl/Hierarchical_RL/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/Hierarchical_RL/__init__.py -------------------------------------------------------------------------------- /Torch_rl/ImitationLearning/Behavior_Clone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from Torch_rl.ImitationLearning.core_IL import Agent_IL 4 | from copy import deepcopy 5 | from torch import nn 6 | from Torch_rl.common import logger 7 | from Torch_rl.common.memory import ReplayMemory 8 | from torch.optim import Adam 9 | 10 | class BC_Agent(Agent_IL): 11 | 12 | def __init__(self, env, base_algorithm, policy_network, value_network = None, 13 | batch_size=32, lr=1e-4, 14 | path=None): 15 | self.env = env 16 | self.base_algorithm = base_algorithm 17 | self.policy_network = policy_network 18 | self.value_network = value_network 19 | self.batch_size = batch_size 20 | 21 | self.loss_cal = nn.MSELoss() 22 | self.policy_model_optim = Adam(self.policy_network.parameters(), lr=lr) 23 | if self.value_network is not None: 24 | self.value_model_optim = Adam(self.value_network.parameters(), lr=lr) 25 | 26 | super(BC_Agent, self).__init__(path) 27 | 28 | def training_with_data(self, expert_data, max_imitation_learning_step, training_ways): 29 | 30 | self.step = 0 31 | 32 | while self.step < max_imitation_learning_step: 33 | if training_ways == "random": 34 | samples = expert_data.sample(self.batch_size) 35 | elif training_ways == "episode": 36 | samples = expert_data.sample_episode() 37 | elif training_ways == "fragment": 38 | samples = expert_data.sample_fragment(self.batch_size) 39 | 40 | actions = self.policy_network.forward(samples["s"]) 41 | loss = self.loss_cal(actions, samples["a"]) 42 | self.policy_model_optim.zero_grad() 43 | loss.backward() 44 | self.policy_model_optim.step() 45 | 46 | def training_with_policy(self, expert_policy, max_imitation_learning_step=1e5, 47 | max_ep_cycle=2000, buffer_size=32): 48 | self.step = 
0 49 | s = self.env.reset() 50 | loss_BC = 0 51 | ep_step, ep_reward, ep_loss = 0, 0, 0 52 | expert_action_set,policy_action_set = [],[] 53 | 54 | for _ in range(max_imitation_learning_step): 55 | self.step += 1 56 | ep_step += 1 57 | a_expert = expert_policy(s) 58 | a_policy = self.policy_network.forward(s) 59 | 60 | expert_action_set.append(torch.tensor(a_expert)) 61 | policy_action_set.append(a_policy) 62 | s_, r, done, info = self.env.step(a_policy) 63 | ep_reward += r 64 | sample = {"s": s, "a": a_policy, "a_expert":a_expert, "s_": s_, "r": r, "tr": done} 65 | s = s_[:] 66 | 67 | if len(policy_action_set) > buffer_size: 68 | 69 | loss = self.loss_cal(expert_action_set, policy_action_set) 70 | ep_loss += loss.cpu().detach().numpy() 71 | self.policy_model_optim.zero_grad() 72 | loss.backward() 73 | self.policy_model_optim.step() 74 | 75 | if done or ep_step>max_ep_cycle: 76 | ep_step = 0 77 | logger.record_tabular("steps", self.step) 78 | logger.record_tabular("loss", ep_loss) 79 | logger.record_tabular("loss", ep_reward) 80 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /Torch_rl/ImitationLearning/GAIL.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from Torch_rl.ImitationLearning.core_IL import Agent_IL 4 | from copy import deepcopy 5 | from torch.optim import Adam 6 | from torch import nn 7 | from Torch_rl.common.memory import ReplayMemory 8 | from types import MethodType,FunctionType 9 | 10 | 11 | class GAIL_Agent(Agent_IL): 12 | def __init__(self, env, base_algorithm, adversary_model, policy_network, value_network = None, 13 | Adversary_lr=1e-4, ent_coeff = 1e-3, batch_size=32, 14 | ## 15 | path=None): 16 | 17 | self.env = env 18 | 19 | self.policy_network = policy_network 20 | self.value_network = value_network 21 | self.dist = base_algorithm.dist 22 | self.base_algorithm = base_algorithm 23 | self.adversary_model = adversary_model 24 | self.adversary_model_optim = Adam(self.adversary_model.parameters(), lr=Adversary_lr) 25 | 26 | self.entcoeff = ent_coeff 27 | self.batch_size = batch_size 28 | self.loss_calculator = nn.CrossEntropyLoss() 29 | 30 | 31 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"] 32 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"] 33 | 34 | def training_with_data(self, expert_data, max_imitation_learning_episode, training_ways): 35 | 36 | self.episode = 0 37 | 38 | while self.step < max_imitation_learning_episode: 39 | if training_ways == "random": 40 | samples = expert_data.sample(self.batch_size) 41 | elif training_ways == "episode": 42 | samples = expert_data.sample_episode() 43 | elif training_ways == "fragment": 44 | samples = expert_data.sample_fragment(self.batch_size) 45 | self.episode +=1 46 | expert_action = samples["a"] 47 | generator_action = self.policy_network.forward(samples["s"]) 48 | if self.value_network is not None: 49 | Q = self.value_network.forward(samples["s"]) 50 | if self.gpu: 51 | expert_action.cuda() 52 | generator_action.cuda() 53 | for key in samples.keys(): 54 | samples[key] = samples[key].cuda() 55 | 56 | IL_reward = self.Discriminator_training(samples, expert_action, generator_action) 57 | # for flag,rew in enumerate(IL_reward): 58 | # sample_new = {"s": samples["s"][flag], "a": generator_action, "s_": samples["s_"][flag], "r": rew, "tr": samples["tr"][flag]} 59 | samples["r"] = IL_reward 60 | samples["value"] = Q 61 | samples["logp"] = -1.9189 * 
np.ones_like(IL_reward) 62 | 63 | self.base_algorithm.backward(samples) 64 | 65 | 66 | 67 | def training_with_policy(self, expert_policy, max_imitation_learning_step): 68 | 69 | self.step = 0 70 | s = self.env.reset() 71 | buffer = ReplayMemory(self.batch_size, ["value", "logp"]) 72 | expert_action_set,generator_action_set=[],[] 73 | while self.step < max_imitation_learning_step: 74 | expert_action = expert_policy(s) 75 | generator_action = self.policy_network.forward(s) 76 | s_, r, done, info = self.env.step(generator_action.cpu().squeeze(0).numpy()) 77 | Q = self.value_network.forward(s) 78 | IL_reward = self.Discriminator_training(s, expert_action, generator_action) 79 | sample_ = { 80 | "s": s, 81 | "a": generator_action.squeeze(0), 82 | "r": IL_reward, 83 | "tr": torch.tensor([int(done)]), 84 | "s_":torch.from_numpy(s_), 85 | "logp": -1.9189, 86 | "value": Q} 87 | 88 | buffer.push(sample_) 89 | # expert_action_set.append(expert_action) 90 | # generator_action_set.append(generator_action) 91 | 92 | if self.step % self.batch_size==0 and self.step>1: 93 | self.base_algorithm.update(buffer.memory) 94 | 95 | 96 | 97 | 98 | def Discriminator_training(self,sample, expert_action, generator_action): 99 | expert_input = torch.cat((sample["s"],expert_action), dim=1) 100 | advertise_judgement = self.adversary_model.forward(expert_input) 101 | expert_acc = self.loss_cal(advertise_judgement, torch.ones_like(advertise_judgement)) 102 | 103 | generator_input = torch.cat((sample["s"], generator_action), dim=1) 104 | generator_judgement = self.adversary_model.forward(generator_input) 105 | generator_acc = self.loss_calculator(generator_judgement, torch.zeros_like(generator_judgement)) 106 | 107 | logits = torch.cat([advertise_judgement, generator_judgement], dim = 1) 108 | entropy = - logits*torch.log(logits) -(1-logits)*torch.log(1-logits) 109 | 110 | entropy_loss = - self.entcoeff * entropy 111 | total_loss = expert_acc + generator_acc - self.entcoeff * entropy 112 | self.adversary_model_optim.zero_grad() 113 | total_loss.backward() 114 | self.adversary_model_optim.step() 115 | IL_reward = -torch.log(1 - generator_judgement + 1e-8) 116 | 117 | return IL_reward 118 | 119 | 120 | 121 | def cuda(self): 122 | self.policy_network.to_gpu() 123 | self.value_network.to_gpu() 124 | self.adversary_model.to_gpu() 125 | self.loss_cal = self.loss_cal.cuda() 126 | self.gpu = True 127 | 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /Torch_rl/ImitationLearning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/ImitationLearning/__init__.py -------------------------------------------------------------------------------- /Torch_rl/ImitationLearning/core_IL.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from abc import ABC 3 | import numpy as np 4 | import torch 5 | from Torch_rl.common import logger 6 | from Torch_rl.common.logger import CSVOutputFormat 7 | 8 | 9 | class Agent_IL(ABC): 10 | 11 | """ 12 | Abstract base class for all implemented imitation_learning algorithms. 
13 | 14 | the class contains the following methods 15 | 16 | we define the episode as the agent finished the training 17 | and the step as the agent interact with the env once 18 | """ 19 | 20 | def __init__(self, path): 21 | self.step = 0 22 | self.episode = 0 23 | """ 24 | config the logfile 25 | """ 26 | configlist = ["stdout", "log", 'tensorboard', "csv"] 27 | if path is None: 28 | path = "./" 29 | logger.configure(path, configlist) 30 | self.csvwritter = CSVOutputFormat(path + "record_trajectory.csv") 31 | loggerCEN = logger.get_current().output_formats[configlist.index('tensorboard')] 32 | self.writer = loggerCEN.writer 33 | self.path = path 34 | 35 | def training_with_data(self,expert_policy, max_imitation_learning_step=1e5, 36 | max_ep_cycle=2000, buffer_size=32, learning_start = 1000): 37 | raise NotImplementedError() 38 | 39 | def training_with_policy(self): 40 | raise NotImplementedError() 41 | 42 | 43 | -------------------------------------------------------------------------------- /Torch_rl/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import logger 2 | # agent 3 | from Torch_rl.agent.DQN import DQN_Agent as DQN 4 | from Torch_rl.agent.DRQN import DRQN_Agent as DRQN 5 | from Torch_rl.agent.DDPG import DDPG_Agent as DDPG 6 | from Torch_rl.temp_file.PPO import PPO_Agent as Batch_PPO 7 | from Torch_rl.agent.PPO import PPO_Agent as PPO 8 | from Torch_rl.agent.TD3 import TD3_Agent as TD3 9 | from Torch_rl.Hierarchical_RL.HIRO import HIRO_Agent as HIRO 10 | 11 | # network 12 | from Torch_rl.model.Network import DenseNet 13 | from Torch_rl.model.Network import LSTM_Dense 14 | from Torch_rl.model.Network import CNN_2D_Dense 15 | from Torch_rl.model.Network import CNN_2D_LSTM_Dense 16 | 17 | -------------------------------------------------------------------------------- /Torch_rl/agent/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/agent/.DS_Store -------------------------------------------------------------------------------- /Torch_rl/agent/A3C.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/agent/A3C.py -------------------------------------------------------------------------------- /Torch_rl/agent/DDPG.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from Torch_rl.common.memory import ReplayMemory 4 | from Torch_rl.agent.core_value import Agent_value_based 5 | from copy import deepcopy 6 | from torch.optim import Adam 7 | from torch import nn 8 | from Torch_rl.common.loss import huber_loss 9 | from torch.autograd import Variable 10 | 11 | class actor_critic(nn.Module): 12 | def __init__(self, actor, critic): 13 | super(actor_critic, self).__init__() 14 | self.actor = actor 15 | self.critic = critic 16 | 17 | def forward(self, obs): 18 | a = self.actor(obs) 19 | input = torch.cat((obs, a), axis=-1) 20 | Q = self.critic(input) 21 | return Q 22 | 23 | 24 | class DDPG_Agent(Agent_value_based): 25 | def __init__(self, env, actor_model, critic_model, 26 | actor_lr=1e-4, critic_lr=1e-3, 27 | actor_target_network_update_freq=1000, critic_target_network_update_freq=1000, 28 | actor_training_freq=1, critic_training_freq=1, 29 | ## hyper-parameter 30 | 
gamma=0.99, batch_size=32, buffer_size=50000, learning_starts=1000, 31 | ## lr_decay 32 | decay=False, decay_rate=0.9, critic_l2_reg=1e-2, clip_norm =None, 33 | ## 34 | path=None): 35 | 36 | self.gpu = False 37 | self.env = env 38 | 39 | self.gamma = gamma 40 | self.batch_size = batch_size 41 | self.learning_starts = learning_starts 42 | 43 | self.replay_buffer = ReplayMemory(buffer_size) 44 | 45 | self.actor_training_freq, self.critic_training_freq = actor_training_freq, critic_training_freq 46 | self.actor_target_network_update_freq = actor_target_network_update_freq 47 | self.critic_target_network_update_freq = critic_target_network_update_freq 48 | self.actor = actor_model 49 | self.critic = critic_model 50 | self.target_actor = deepcopy(actor_model) 51 | self.target_critic = deepcopy(critic_model) 52 | 53 | self.actor_critic = actor_critic(self.actor, self.critic) 54 | 55 | actor_optim = Adam(self.actor.parameters(), lr=actor_lr) 56 | critic_optim = Adam(self.critic.parameters(), lr=critic_lr, weight_decay=critic_l2_reg) 57 | if decay: 58 | self.actor_optim = torch.optim.lr_scheduler.ExponentialLR(actor_optim, decay_rate, last_epoch=-1) 59 | self.critic_optim = torch.optim.lr_scheduler.ExponentialLR(critic_optim, decay_rate, last_epoch=-1) 60 | else: 61 | self.actor_optim = actor_optim 62 | self.critic_optim = critic_optim 63 | 64 | super(DDPG_Agent, self).__init__(path) 65 | example_input = Variable(torch.rand(100, self.env.observation_space.shape[0])) 66 | self.writer.add_graph(self.actor_critic, input_to_model=example_input) 67 | self.forward_step_show_list = [] 68 | self.backward_step_show_list =[] 69 | self.forward_ep_show_list = [] 70 | self.backward_ep_show_list = [] 71 | 72 | def forward(self, observation): 73 | observation = observation[np.newaxis, :].astype(np.float32) 74 | observation = torch.from_numpy(observation) 75 | action = self.actor.forward(observation) 76 | action = torch.normal(action, torch.ones_like(action)) 77 | Q = self.critic(torch.cat((observation, action), dim=1)).squeeze().detach().numpy() 78 | return action.cpu().squeeze(0).detach().numpy(), Q, {} 79 | 80 | def backward(self, sample_): 81 | self.replay_buffer.push(sample_) 82 | if self.step > self.learning_starts and self.learning: 83 | sample = self.replay_buffer.sample(self.batch_size) 84 | if self.gpu: 85 | for key in sample.keys(): 86 | sample[key] = sample[key].cuda() 87 | assert len(sample["s"]) == self.batch_size 88 | "update the critic " 89 | if self.step % self.critic_training_freq == 0: 90 | input = torch.cat((sample["s"], sample["a"]), -1) 91 | Q = self.critic(input) 92 | target_a = self.target_actor(sample["s_"]) 93 | target_input = torch.cat((sample["s_"], target_a), -1) 94 | targetQ = self.target_critic(target_input) 95 | targetQ = targetQ.squeeze(1) 96 | Q = Q.squeeze(1) 97 | expected_q_values = sample["r"] + self.gamma * targetQ * (1.0 - sample["tr"]) 98 | loss = torch.mean(huber_loss(expected_q_values - Q)) 99 | self.critic_optim.zero_grad() 100 | loss.backward() 101 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1, norm_type=2) 102 | self.critic_optim.step() 103 | "training the actor" 104 | if self.step % self.actor_training_freq == 0: 105 | # Q = self.critic(torch.cat((sample["s"], self.actor(sample["s"])), -1)) 106 | Q = self.actor_critic.forward(sample["s"]) 107 | Q = -torch.mean(Q) 108 | self.actor_optim.zero_grad() 109 | Q.backward() 110 | torch.nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 1, norm_type=2) 111 | self.actor_optim.step() 112 | if self.step % 
self.actor_target_network_update_freq == 0: 113 | self.target_actor_net_update() 114 | if self.step % self.critic_target_network_update_freq == 0: 115 | self.target_critic_net_update() 116 | loss = loss.data.numpy() 117 | return loss, {} 118 | return 0, {} 119 | 120 | def target_actor_net_update(self): 121 | self.target_actor.load_state_dict(self.actor.state_dict()) 122 | 123 | def target_critic_net_update(self): 124 | self.target_critic.load_state_dict(self.critic.state_dict()) 125 | 126 | def load_weights(self, filepath): 127 | model = torch.load(filepath) 128 | self.actor.load_state_dict(model["actor"]) 129 | self.critic.load_state_dict(model["critic"]) 130 | self.target_actor.load_state_dict(model["target_actor"]) 131 | self.target_critic.load_state_dict(model["target_critic"]) 132 | self.actor_optim.load_state_dict(model["actor_optim"]) 133 | self.critic_optim.load_state_dict(model["critic_optim"]) 134 | 135 | 136 | def save_weights(self, filepath, overwrite=False): 137 | torch.save({"actor": self.actor, "critic":self.critic, 138 | "target_actor": self.target_actor,"target_critic": self.target_critic, 139 | "actor_optim": self.actor_optim, "critic_optim": self.critic_optim 140 | }, filepath + "DDPG.pkl") 141 | 142 | def cuda(self): 143 | self.actor.to_gpu() 144 | self.critic.to_gpu() 145 | self.target_actor = deepcopy(self.actor) 146 | self.target_critic = deepcopy(self.critic) 147 | self.gpu = True 148 | 149 | 150 | -------------------------------------------------------------------------------- /Torch_rl/agent/DDPG_2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from Torch_rl.common.memory import ReplayMemory 4 | from Torch_rl.agent.core_value import Agent_value_based 5 | from copy import deepcopy 6 | from torch.optim import Adam 7 | from torch import nn 8 | from Torch_rl.common.loss import huber_loss 9 | from torch.autograd import Variable 10 | 11 | class actor_critic(nn.Module): 12 | def __init__(self, actor, critic, GCN_enable): 13 | super(actor_critic, self).__init__() 14 | self.actor = actor 15 | self.critic = critic 16 | self.GCN_enable = GCN_enable 17 | 18 | def forward(self, obs): 19 | a = self.actor(obs) 20 | if self.GCN_enable: 21 | Q = self.critic(obs, a) 22 | else: 23 | input = torch.cat((obs, a), axis=-1) 24 | Q = self.critic(input) 25 | return Q 26 | 27 | 28 | class DDPG_Agent(Agent_value_based): 29 | def __init__(self, env, actor_model, critic_model, 30 | actor_lr=1e-4, critic_lr=1e-3, 31 | actor_target_network_update_freq=1000, critic_target_network_update_freq=1000, 32 | actor_training_freq=1, critic_training_freq=1, sperate_critic = False, 33 | ## hyper-parameter 34 | gamma=0.99, batch_size=32, buffer_size=50000, learning_starts=1000, 35 | ## lr_decay 36 | decay=False, decay_rate=0.9, critic_l2_reg=1e-2, clip_norm =None, 37 | ## 38 | path=None): 39 | 40 | self.gpu = False 41 | self.env = env 42 | self.sperate_critic = sperate_critic 43 | self.gamma = gamma 44 | self.batch_size = batch_size 45 | self.learning_starts = learning_starts 46 | 47 | self.replay_buffer = ReplayMemory(buffer_size) 48 | 49 | self.actor_training_freq, self.critic_training_freq = actor_training_freq, critic_training_freq 50 | self.actor_target_network_update_freq = actor_target_network_update_freq 51 | self.critic_target_network_update_freq = critic_target_network_update_freq 52 | self.actor = actor_model 53 | self.critic = critic_model 54 | self.target_actor = deepcopy(actor_model) 55 | self.target_critic = 
deepcopy(critic_model) 56 | 57 | self.actor_critic = actor_critic(self.actor, self.critic, self.GCN) 58 | 59 | actor_optim = Adam(self.actor.parameters(), lr=actor_lr) 60 | critic_optim = Adam(self.critic.parameters(), lr=critic_lr, weight_decay=critic_l2_reg) 61 | if decay: 62 | self.actor_optim = torch.optim.lr_scheduler.ExponentialLR(actor_optim, decay_rate, last_epoch=-1) 63 | self.critic_optim = torch.optim.lr_scheduler.ExponentialLR(critic_optim, decay_rate, last_epoch=-1) 64 | else: 65 | self.actor_optim = actor_optim 66 | self.critic_optim = critic_optim 67 | 68 | super(DDPG_Agent, self).__init__(path) 69 | #example_input = Variable(torch.rand(100, self.env.observation_space.shape[0])) 70 | #self.writer.add_graph(self.actor_critic, input_to_model=example_input) 71 | self.forward_step_show_list = [] 72 | self.backward_step_show_list =[] 73 | self.forward_ep_show_list = [] 74 | self.backward_ep_show_list = [] 75 | 76 | def forward(self, observation): 77 | observation = observation[np.newaxis, :].astype(np.float32) 78 | observation = torch.from_numpy(observation) 79 | action = self.actor.forward(observation) 80 | action = torch.normal(action, torch.ones_like(action)) 81 | if self.sperate_critic: 82 | Q = self.critic.forward(observation, action).squeeze().detach().numpy() 83 | else: 84 | Q = self.critic(torch.cat((observation, action), dim=1)).squeeze().detach().numpy() 85 | return action.cpu().squeeze(0).detach().numpy(), Q, {} 86 | 87 | def backward(self, sample_): 88 | self.replay_buffer.push(sample_) 89 | if self.step > self.learning_starts and self.learning: 90 | sample = self.replay_buffer.sample(self.batch_size) 91 | if self.gpu: 92 | for key in sample.keys(): 93 | sample[key] = sample[key].cuda() 94 | assert len(sample["s"]) == self.batch_size 95 | "update the critic " 96 | if self.step % self.critic_training_freq == 0: 97 | if self.sperate_critic: 98 | Q = self.critic.forward(sample["s"], sample["a"]) 99 | else: 100 | input = torch.cat((sample["s"], sample["a"]), -1) 101 | Q = self.critic.forward(input) 102 | target_a = self.target_actor(sample["s_"]) 103 | if self.sperate_critic: 104 | targetQ = self.target_critic(sample["s_"], target_a) 105 | else: 106 | target_input = torch.cat((sample["s_"], target_a), -1) 107 | targetQ = self.target_critic(target_input) 108 | targetQ = targetQ.squeeze(1) 109 | Q = Q.squeeze(1) 110 | expected_q_values = sample["r"] + self.gamma * targetQ * (1.0 - sample["tr"]) 111 | loss = torch.mean(huber_loss(expected_q_values - Q)) 112 | self.critic_optim.zero_grad() 113 | loss.backward() 114 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1, norm_type=2) 115 | self.critic_optim.step() 116 | "training the actor" 117 | if self.step % self.actor_training_freq == 0: 118 | Q = self.actor_critic.forward(sample["s"]) 119 | Q = -torch.mean(Q) 120 | self.actor_optim.zero_grad() 121 | Q.backward() 122 | torch.nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 1, norm_type=2) 123 | self.actor_optim.step() 124 | if self.step % self.actor_target_network_update_freq == 0: 125 | self.target_actor_net_update() 126 | if self.step % self.critic_target_network_update_freq == 0: 127 | self.target_critic_net_update() 128 | loss = loss.data.numpy() 129 | return loss, {} 130 | return 0, {} 131 | 132 | def target_actor_net_update(self): 133 | self.target_actor.load_state_dict(self.actor.state_dict()) 134 | 135 | def target_critic_net_update(self): 136 | self.target_critic.load_state_dict(self.critic.state_dict()) 137 | 138 | def load_weights(self, filepath): 
139 | model = torch.load(filepath) 140 | self.actor.load_state_dict(model["actor"]) 141 | self.critic.load_state_dict(model["critic"]) 142 | self.target_actor.load_state_dict(model["target_actor"]) 143 | self.target_critic.load_state_dict(model["target_critic"]) 144 | self.actor_optim.load_state_dict(model["actor_optim"]) 145 | self.critic_optim.load_state_dict(model["critic_optim"]) 146 | 147 | 148 | def save_weights(self, filepath, overwrite=False): 149 | torch.save({"actor": self.actor, "critic":self.critic, 150 | "target_actor": self.target_actor,"target_critic": self.target_critic, 151 | "actor_optim": self.actor_optim, "critic_optim": self.critic_optim 152 | }, filepath + "DDPG.pkl") 153 | 154 | def cuda(self): 155 | self.actor.to_gpu() 156 | self.critic.to_gpu() 157 | self.target_actor = deepcopy(self.actor) 158 | self.target_critic = deepcopy(self.critic) 159 | self.gpu = True 160 | 161 | 162 | -------------------------------------------------------------------------------- /Torch_rl/agent/DQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from Torch_rl.common.memory import ReplayMemory 4 | from Torch_rl.agent.core_value import Agent_value_based 5 | from copy import deepcopy 6 | from torch.optim import Adam 7 | from torch import nn 8 | import torch.nn.functional as F 9 | from Torch_rl.common.loss import huber_loss 10 | from torch.autograd import Variable 11 | 12 | class Dueling_dqn(nn.Module): 13 | def __init__(self, model, dueling_way): 14 | super(Dueling_dqn, self).__init__() 15 | self.dueling_way = dueling_way 16 | self.model_layer = model.linears[:-1] 17 | layer_infor = model.layer_infor 18 | self.A_est = nn.Linear(layer_infor[-2], layer_infor[-1]) 19 | self.V_est = nn.Linear(layer_infor[-2], 1) 20 | 21 | def forward(self, obs): 22 | x = obs 23 | for layer in self.model_layer: 24 | x = layer(x) 25 | A = F.relu(self.A_est(x)) 26 | V = self.V_est(x) 27 | if self.dueling_way == "native": 28 | A = A 29 | elif self.dueling_way == "mean": 30 | A = A - torch.max(A) 31 | elif self.dueling_way == "avg": 32 | A = A - torch.mean(A) 33 | return V - A 34 | 35 | 36 | class DQN_Agent(Agent_value_based): 37 | def __init__(self, env, model, policy, 38 | ## hyper-parameter 39 | gamma=0.90, lr=1e-3, batch_size=32, buffer_size=50000, learning_starts=1000, 40 | target_network_update_freq=500, 41 | ## decay 42 | decay=False, decay_rate=0.9, 43 | ## DDqn && DuelingDQN 44 | double_dqn=True, dueling_dqn=False, dueling_way="native", 45 | ## prioritized_replay 46 | prioritized_replay=False, 47 | prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, 48 | prioritized_replay_eps=1e-6, param_noise=False, 49 | ## 50 | path=None): 51 | 52 | """ 53 | 54 | :param env: the GYM environment 55 | :param model: the Torch NN model 56 | :param policy: the policy when choosing action 57 | :param ep: the MAX episode time 58 | :param step: the MAx step time 59 | .........................hyper-parameter.................................. 60 | :param gamma: 61 | :param lr: 62 | :param batchsize: 63 | :param buffer_size: 64 | :param target_network_update_freq: 65 | .........................further improve way.................................. 
66 | :param double_dqn: whether enable DDQN 67 | :param dueling_dqn: whether dueling DDQN 68 | :param dueling_way: the Dueling DQN method 69 | it can choose the following three ways 70 | `avg`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-Avg_a(A(s,a;theta))) 71 | `max`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-max_a(A(s,a;theta))) 72 | `naive`: Q(s,a;theta) = V(s;theta) + A(s,a;theta) 73 | .........................prioritized-part.................................. 74 | :param prioritized_replay: (bool) if True prioritized replay buffer will be used. 75 | :param prioritized_replay_alpha: (float)alpha parameter for prioritized replay buffer. 76 | It determines how much prioritization is used, with alpha=0 corresponding to the uniform case. 77 | :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer 78 | :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial 79 | value to 1.0. If set to None equals to max_timesteps. 80 | :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. 81 | .........................imitation_learning_part.................................. 82 | :param imitation_learning_policy: To initial the network with the given policy 83 | which is supervised way to training the network 84 | :param IL_time: supervised training times 85 | :param network_kwargs: 86 | """ 87 | self.gpu = False 88 | self.env = env 89 | self.policy = policy 90 | 91 | self.gamma = gamma 92 | self.batch_size = batch_size 93 | self.learning_starts = learning_starts 94 | self.target_network_update_freq = target_network_update_freq 95 | self.double_dqn = double_dqn 96 | 97 | if dueling_dqn: 98 | self.Q_net = Dueling_dqn(model, dueling_way) 99 | else: 100 | self.Q_net = model 101 | 102 | self.target_Q_net = deepcopy(self.Q_net) 103 | 104 | q_net_optim = Adam(self.Q_net.parameters(), lr=lr) 105 | if decay: 106 | self.optim = torch.optim.lr_scheduler.ExponentialLR(q_net_optim, decay_rate, last_epoch=-1) 107 | else: 108 | self.optim = q_net_optim 109 | 110 | self.replay_buffer = ReplayMemory(buffer_size) 111 | self.learning = False 112 | super(DQN_Agent, self).__init__(path) 113 | example_input = Variable(torch.rand((100,)+self.env.observation_space.shape)) 114 | self.writer.add_graph(self.Q_net, input_to_model=example_input) 115 | self.forward_step_show_list = [] 116 | self.backward_step_show_list =[] 117 | self.forward_ep_show_list = [] 118 | self.backward_ep_show_list = [] 119 | 120 | def forward(self, observation): 121 | observation = observation[np.newaxis, :].astype(np.float32) 122 | observation = torch.from_numpy(observation) 123 | Q_value = self.Q_net.forward(observation) 124 | Q_value = Q_value.cpu().squeeze().detach().numpy() 125 | if self.policy is not None: 126 | action = self.policy.select_action(Q_value) 127 | else: 128 | action = np.argmax(Q_value) 129 | return action, np.max(Q_value), {} 130 | 131 | def backward(self, sample_): 132 | self.replay_buffer.push(sample_) 133 | if self.step > self.learning_starts and self.learning: 134 | sample = self.replay_buffer.sample(self.batch_size) 135 | if self.gpu: 136 | for key in sample.keys(): 137 | sample[key] = sample[key].cuda() 138 | assert len(sample["s"]) == self.batch_size 139 | a = sample["a"].long().unsqueeze(1) 140 | Q = self.Q_net(sample["s"]).gather(1, a) 141 | if self.double_dqn: 142 | _, next_actions = self.Q_net(sample["s_"]).max(1, keepdim=True) 143 | targetQ = self.target_Q_net(sample["s_"]).gather(1, 
next_actions) 144 | else: 145 | _, next_actions = self.target_Q_net(sample["s_"]).max(1, keepdim=True) 146 | targetQ = self.target_Q_net(sample["s_"]).gather(1, next_actions) 147 | targetQ = targetQ.squeeze(1) 148 | Q = Q.squeeze(1) 149 | expected_q_values = sample["r"] + self.gamma * targetQ * (1.0 - sample["tr"]) 150 | loss = torch.mean(huber_loss(expected_q_values-Q)) 151 | self.optim.zero_grad() 152 | loss.backward() 153 | torch.nn.utils.clip_grad_norm_(self.Q_net.parameters(), 1, norm_type=2) 154 | self.optim.step() 155 | if self.step % self.target_network_update_freq == 0: 156 | self.target_net_update() 157 | loss = loss.data.numpy() 158 | return loss, {} 159 | return 0, {} 160 | 161 | def target_net_update(self): 162 | self.target_Q_net.load_state_dict(self.Q_net.state_dict()) 163 | 164 | def load_weights(self, filepath): 165 | model = torch.load(filepath+'DQN.pkl') 166 | self.Q_net.load_state_dict(model["Q_net"].state_dict()) 167 | self.target_Q_net.load_state_dict(model["target_Q_net"].state_dict()) 168 | # self.optim.load_state_dict(model["optim"]) 169 | 170 | def save_weights(self, filepath, overwrite=True): 171 | torch.save({"Q_net": self.Q_net, 172 | "target_Q_net": self.target_Q_net, 173 | "optim": self.optim 174 | }, filepath+"DQN.pkl") 175 | 176 | 177 | def cuda(self): 178 | self.Q_net.to_gpu() 179 | self.target_Q_net = deepcopy(self.Q_net) 180 | self.gpu = True 181 | 182 | 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /Torch_rl/agent/DRQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from Torch_rl.common.memory import ReplayMemory_Sequence 4 | from Torch_rl.agent.core_value import Agent_value_based 5 | from copy import deepcopy 6 | from torch.optim import Adam 7 | from torch import nn 8 | import torch.nn.functional as F 9 | from Torch_rl.common.loss import huber_loss 10 | from torch.autograd import Variable 11 | 12 | class Dueling_dqn(nn.Module): 13 | def __init__(self, model, dueling_way): 14 | super(Dueling_dqn, self).__init__() 15 | self.dueling_way = dueling_way 16 | self.model_layer = model.linears[:-1] 17 | layer_infor = model.layer_infor 18 | self.A_est = nn.Linear(layer_infor[-2], layer_infor[-1]) 19 | self.V_est = nn.Linear(layer_infor[-2], 1) 20 | 21 | def forward(self, obs): 22 | x = obs 23 | for layer in self.model_layer: 24 | x = layer(x) 25 | A = F.relu(self.A_est(x)) 26 | V = self.V_est(x) 27 | if self.dueling_way == "native": 28 | A = A 29 | elif self.dueling_way == "mean": 30 | A = A - torch.max(A) 31 | elif self.dueling_way == "avg": 32 | A = A - torch.mean(A) 33 | return V - A 34 | 35 | 36 | class DRQN_Agent(Agent_value_based): 37 | def __init__(self, env, model, policy, 38 | ## hyper-parameter 39 | gamma=0.90, lr=1e-3, learning_starts=1000, 40 | target_network_update_freq=500, 41 | ## memory 42 | batch_size=32, buffer_size=50000, max_seq_len=2000, replay_len=100, 43 | ## decay 44 | decay=False, decay_rate=0.9, 45 | ## DDqn && DuelingDQN 46 | double_dqn=True, dueling_dqn=False, dueling_way="native", 47 | ## prioritized_replay 48 | prioritized_replay=False, 49 | prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, 50 | prioritized_replay_eps=1e-6, param_noise=False, 51 | ## 52 | path=None): 53 | 54 | """ 55 | 56 | :param env: the GYM environment 57 | :param model: the Torch NN model 58 | :param policy: the policy when choosing action 59 | :param ep: the MAX episode 
time 60 | :param step: the MAx step time 61 | .........................hyper-parameter.................................. 62 | :param gamma: 63 | :param lr: 64 | :param batchsize: 65 | :param buffer_size: 66 | :param target_network_update_freq: 67 | .........................further improve way.................................. 68 | :param double_dqn: whether enable DDQN 69 | :param dueling_dqn: whether dueling DDQN 70 | :param dueling_way: the Dueling DQN method 71 | it can choose the following three ways 72 | `avg`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-Avg_a(A(s,a;theta))) 73 | `max`: Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-max_a(A(s,a;theta))) 74 | `naive`: Q(s,a;theta) = V(s;theta) + A(s,a;theta) 75 | .........................prioritized-part.................................. 76 | :param prioritized_replay: (bool) if True prioritized replay buffer will be used. 77 | :param prioritized_replay_alpha: (float)alpha parameter for prioritized replay buffer. 78 | It determines how much prioritization is used, with alpha=0 corresponding to the uniform case. 79 | :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer 80 | :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial 81 | value to 1.0. If set to None equals to max_timesteps. 82 | :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. 83 | .........................imitation_learning_part.................................. 84 | :param imitation_learning_policy: To initial the network with the given policy 85 | which is supervised way to training the network 86 | :param IL_time: supervised training times 87 | :param network_kwargs: 88 | """ 89 | self.gpu = False 90 | self.env = env 91 | self.policy = policy 92 | 93 | self.gamma = gamma 94 | self.batch_size = batch_size 95 | self.learning_starts = learning_starts 96 | self.target_network_update_freq = target_network_update_freq 97 | self.double_dqn = double_dqn 98 | if dueling_dqn: 99 | self.Q_net = Dueling_dqn(model, dueling_way) 100 | else: 101 | self.Q_net = model 102 | 103 | self.target_Q_net = deepcopy(self.Q_net) 104 | 105 | q_net_optim = Adam(self.Q_net.parameters(), lr=lr) 106 | if decay: 107 | self.optim = torch.optim.lr_scheduler.ExponentialLR(q_net_optim, decay_rate, last_epoch=-1) 108 | else: 109 | self.optim = q_net_optim 110 | 111 | self.replay_buffer = ReplayMemory_Sequence(buffer_size, max_seq_len, other_record=["h","c"]) 112 | 113 | self.replay_buffer.batch_size = batch_size 114 | self.replay_buffer.sequence_len = replay_len 115 | if replay_len < max_seq_len: 116 | self.replay_sample = self.replay_buffer.sample_sequence 117 | else: 118 | self.replay_sample = self.replay_buffer.sample_ep 119 | self.learning = False 120 | super(DRQN_Agent, self).__init__(path) 121 | example_input = Variable(torch.rand((replay_len, 100)+self.env.observation_space.shape)) 122 | self.writer.add_graph(self.Q_net, input_to_model=example_input) 123 | self.forward_step_show_list = [] 124 | self.backward_step_show_list =[] 125 | self.forward_ep_show_list = [] 126 | self.backward_ep_show_list = [] 127 | 128 | self.h_state = model.init_H_C(1) 129 | 130 | def forward(self, observation): 131 | observation = observation[np.newaxis, np.newaxis, :].astype(np.float32) 132 | observation = torch.from_numpy(observation) 133 | Q_value, self.h_state = self.Q_net.forward(observation, self.h_state) 134 | Q_value = Q_value.cpu().squeeze().detach().numpy() 135 | if self.policy 
is not None: 136 | action = self.policy.select_action(Q_value) 137 | else: 138 | action = np.argmax(Q_value) 139 | return action, np.max(Q_value), {} 140 | 141 | def backward(self, sample_): 142 | sample_["h"] = self.h_state[0].detach().numpy() 143 | sample_["c"] = self.h_state[1].detach().numpy() 144 | self.replay_buffer.push(sample_) 145 | if self.step > self.learning_starts and self.learning: 146 | sample = self.replay_sample() 147 | if self.gpu: 148 | for key in sample.keys(): 149 | sample[key] = sample[key].cuda() 150 | assert sample["s"].size(1) == self.batch_size 151 | a = sample["a"].long() 152 | Q, H = self.Q_net(sample["s"]) 153 | Q = Q.gather(2, a) 154 | if self.double_dqn: 155 | Q_next, H = self.Q_net(sample["s_"]) 156 | _, next_actions = Q_next.max(2, keepdim=True) 157 | Qtarget_next, H = self.Q_net(sample["s_"]) 158 | targetQ = Qtarget_next.gather(2, next_actions) 159 | else: 160 | Qtarget_next, H = self.target_Q_net(sample["s_"]) 161 | targetQ, next_actions = Qtarget_next.max(2, keepdim=True) 162 | 163 | expected_q_values = sample["r"] + self.gamma * targetQ * (1.0 - sample["tr"]) 164 | loss = torch.mean(huber_loss(expected_q_values-Q)) 165 | self.optim.zero_grad() 166 | loss.backward() 167 | torch.nn.utils.clip_grad_norm_(self.Q_net.parameters(), 1, norm_type=2) 168 | self.optim.step() 169 | if self.step % self.target_network_update_freq == 0: 170 | self.target_net_update() 171 | loss = loss.data.numpy() 172 | return loss, {} 173 | return 0, {} 174 | 175 | def target_net_update(self): 176 | self.target_Q_net.load_state_dict(self.Q_net.state_dict()) 177 | 178 | def load_weights(self, filepath): 179 | model = torch.load(filepath+'DQN.pkl') 180 | self.Q_net.load_state_dict(model["Q_net"].state_dict()) 181 | self.target_Q_net.load_state_dict(model["target_Q_net"].state_dict()) 182 | # self.optim.load_state_dict(model["optim"]) 183 | 184 | def save_weights(self, filepath, overwrite=True): 185 | torch.save({"Q_net": self.Q_net, 186 | "target_Q_net": self.target_Q_net, 187 | "optim": self.optim 188 | }, filepath+"DQN.pkl") 189 | 190 | def cuda(self): 191 | self.Q_net.to_gpu() 192 | self.target_Q_net = deepcopy(self.Q_net) 193 | self.gpu = True 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | -------------------------------------------------------------------------------- /Torch_rl/agent/PPO.py: -------------------------------------------------------------------------------- 1 | 2 | from Torch_rl.agent.core_policy import Agent_policy_based 3 | import torch.nn as nn 4 | from copy import deepcopy 5 | from Torch_rl.common.distribution import * 6 | from torch.optim import Adam 7 | from torch.autograd import Variable 8 | from Torch_rl.common.util import get_gae 9 | 10 | class PPO_Agent(Agent_policy_based): 11 | def __init__(self, env, policy_model, value_model, 12 | lr=5e-4, ent_coef=0.01, vf_coef=0.5, 13 | ## hyper-parawmeter 14 | gamma=0.99, lam=0.95, cliprange=0.2, batch_size=64, value_train_round=10, 15 | running_step=2048, running_ep=20, value_regular=0.01, 16 | ## decay 17 | decay=False, decay_rate=0.9, lstm_enable=False, 18 | ## 19 | path=None): 20 | self.gpu = False 21 | self.env = env 22 | self.gamma = gamma 23 | self.lam = lam 24 | self.ent_coef = ent_coef 25 | self.vf_coef = vf_coef 26 | self.cliprange = cliprange 27 | 28 | self.value_train_step = value_train_round 29 | 30 | self.sample_rollout = running_step 31 | self.sample_ep = running_ep 32 | self.batch_size = batch_size 33 | self.lstm_enable = lstm_enable 34 | 35 | self.loss_cal = 
torch.nn.SmoothL1Loss() 36 | 37 | self.policy = policy_model 38 | if value_model == "shared": 39 | self.value = policy_model 40 | elif value_model == "copy": 41 | self.value = deepcopy(policy_model) 42 | else: 43 | self.value = value_model 44 | 45 | self.dist = make_pdtype(env.action_space, policy_model) 46 | 47 | self.policy_model_optim = Adam(self.policy.parameters(), lr=lr) 48 | self.value_model_optim = Adam(self.value.parameters(), lr=lr, weight_decay=value_regular) 49 | if decay: 50 | self.policy_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.policy_model_optim, decay_rate, 51 | last_epoch=-1) 52 | self.value_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.value_model_optim, decay_rate, 53 | last_epoch=-1) 54 | 55 | #torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2) 56 | #torch.nn.utils.clip_grad_norm_(self.value.parameters(), 1, norm_type=2) 57 | 58 | super(PPO_Agent, self).__init__(path) 59 | example_input = Variable(torch.rand((100,)+self.env.observation_space.shape)) 60 | self.writer.add_graph(self.policy, input_to_model=example_input) 61 | 62 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"] 63 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"] 64 | 65 | self.training_round = 0 66 | self.running_step = 0 67 | self.record_sample = None 68 | self.training_step = 0 69 | 70 | 71 | def update(self, sample): 72 | 73 | returns, advants = get_gae(sample["r"], sample["tr"], sample["value"], self.gamma, 74 | self.lam) 75 | sample["advs"] = advants.unsqueeze(1) 76 | sample["return"] = returns.unsqueeze(1) 77 | 78 | step_len = len(sample["s"]) 79 | if self.lstm_enable: 80 | flagin = [time for time in range(step_len) if sample["tr"][time]==1] 81 | time_round = len(flagin) 82 | array_index = [] 83 | for train_time in range(int(time_round)-1): 84 | array_index.append(range(flagin[train_time], flagin[train_time+1])) 85 | else: 86 | time_round = np.ceil(step_len/self.batch_size) 87 | time_left = time_round*self.batch_size-step_len 88 | array = list(range(step_len)) +list(range(int(time_left))) 89 | array_index = [] 90 | for train_time in range(int(time_round)): 91 | array_index.append(array[train_time * self.batch_size: (train_time + 1) * self.batch_size]) 92 | loss_re, pgloss_re, enloss_re, vfloss_re = [], [], [], [] 93 | 94 | for key in sample.keys(): 95 | temp = torch.stack(list(sample[key]), 0) 96 | if self.gpu: 97 | sample[key] = temp.cuda() 98 | else: 99 | sample[key] = temp 100 | for train_time in range(int(time_round)): 101 | index = array_index[train_time] 102 | # for index in range(step_len): 103 | training_s = sample["s"][index].detach() 104 | training_a = sample["a"][index].detach() 105 | training_r = sample["r"][index].detach() 106 | R = sample["return"][index].detach() 107 | old_value = sample["value"][index].detach() 108 | old_neglogp = sample["logp"][index].detach() 109 | advs = sample["advs"][index].detach() 110 | 111 | " CALCULATE THE LOSS" 112 | " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss" 113 | 114 | " the value loss" 115 | value_now = self.value.forward(training_s) 116 | # value loss 117 | value_clip = old_value + torch.clamp(old_value - value_now, min=-self.cliprange, 118 | max=self.cliprange) # Clipped value 119 | vf_loss1 = self.loss_cal(value_now, R) # Unclipped loss 120 | vf_loss2 = self.loss_cal(value_clip, R) # clipped loss 121 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2) 122 | 123 | #generate Policy gradient loss 124 | 
outcome = self.policy.forward(training_s) 125 | new_policy = self.dist(outcome) 126 | new_neg_lop = new_policy.log_prob(training_a) 127 | ratio = torch.exp(new_neg_lop - old_neglogp) 128 | pg_loss1 = -advs * ratio 129 | pg_loss2 = -advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange) 130 | pg_loss = .5 * torch.max(pg_loss1, pg_loss2).mean() 131 | 132 | # entropy 133 | entropy = new_policy.entropy().mean() 134 | loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef 135 | 136 | self.policy_model_optim.zero_grad() 137 | pg_loss.backward() 138 | self.policy_model_optim.step() 139 | for _ in range(self.value_train_step): 140 | value_now = self.value.forward(training_s) 141 | # value loss 142 | value_clip = old_value + torch.clamp(old_value - value_now, min=-self.cliprange, 143 | max=self.cliprange) # Clipped value 144 | vf_loss1 = self.loss_cal(value_now, R) # Unclipped loss 145 | vf_loss2 = self.loss_cal(value_clip, R) # clipped loss 146 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2) 147 | self.value_model_optim.zero_grad() 148 | vf_loss1.backward() 149 | self.value_model_optim.step() 150 | # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"]) 151 | # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange) 152 | loss_re = loss.cpu().detach().numpy() 153 | pgloss_re.append(pg_loss.cpu().detach().numpy()) 154 | enloss_re.append(entropy.cpu().detach().numpy()) 155 | vfloss_re.append(vf_loss1.cpu().detach().numpy()) 156 | return np.sum(loss_re), {"pg_loss": np.sum(pgloss_re), 157 | "entropy": np.sum(enloss_re), 158 | "vf_loss": np.sum(vfloss_re)} 159 | 160 | def load_weights(self, filepath): 161 | model = torch.load(filepath+"/PPO.pkl") 162 | self.policy.load_state_dict(model["policy"].state_dict()) 163 | self.value.load_state_dict(model["value"].state_dict()) 164 | 165 | 166 | def save_weights(self, filepath, overwrite=False): 167 | torch.save({"policy": self.policy,"value": self.value}, filepath + "/PPO.pkl") 168 | 169 | def policy_behavior_clone(self, sample_): 170 | action_label = sample_["a"].squeeze() 171 | if self.gpu: 172 | action_predict = self.policy(sample_["s"].cuda()) 173 | action_label = action_label.cuda() 174 | else: 175 | action_predict = self.policy(sample_["s"]) 176 | loss_bc = self.loss_cal(action_label, action_predict) 177 | del action_label 178 | del action_predict 179 | loss = loss_bc 180 | self.policy_model_optim.zero_grad() 181 | loss.backward() 182 | nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2) 183 | self.policy_model_optim.step() 184 | return loss.cpu().detach().numpy() 185 | 186 | def value_pretrain(self, record_sample, new_sample_len): 187 | train_times = int(np.floor(new_sample_len/128)) 188 | round_loss = 0 189 | for io in range(train_times-1): 190 | index = list(range(128 * io, 128 * (io + 1))) 191 | if self.gpu: 192 | predict = torch.from_numpy(np.array(record_sample["s"])[index]).cuda() 193 | lable = torch.from_numpy(np.array(record_sample["return"]))[index].cuda() 194 | else: 195 | predict = torch.from_numpy(np.array(record_sample["s"])[index]) 196 | lable = torch.from_numpy(np.array(record_sample["return"]))[index] 197 | value_now = self.value.forward(predict) 198 | # value loss 199 | vf_loss = self.loss_cal(value_now, lable) # Unclipped loss 200 | del predict 201 | del lable 202 | self.value_model_optim.zero_grad() 203 | vf_loss.backward() 204 | self.value_model_optim.step() 205 | round_loss += vf_loss.cpu().detach().numpy() 206 | return round_loss 207 | 208 | def cuda(self): 
209 | self.policy.to_gpu() 210 | self.value.to_gpu() 211 | self.loss_cal = self.loss_cal.cuda() 212 | self.gpu = True -------------------------------------------------------------------------------- /Torch_rl/agent/SAC.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/agent/SAC.py -------------------------------------------------------------------------------- /Torch_rl/agent/TD3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from Torch_rl.common.memory import ReplayMemory 4 | from Torch_rl.agent.core_value import Agent_value_based 5 | from copy import deepcopy 6 | from torch.optim import Adam 7 | from torch import nn 8 | from Torch_rl.common.loss import huber_loss 9 | from torch.autograd import Variable 10 | from Torch_rl.common.util import csv_record 11 | 12 | 13 | class critic_build(nn.Module): 14 | def __init__(self, critic): 15 | super(critic_build, self).__init__() 16 | self.critic_q1 = deepcopy(critic) 17 | self.critic_q2 = deepcopy(critic) 18 | 19 | def forward(self, obs): 20 | Q1 = self.critic_q1(obs) 21 | Q2 = self.critic_q2(obs) 22 | return Q1, Q2 23 | 24 | 25 | class actor_critic(nn.Module): 26 | def __init__(self, actor, critic): 27 | super(actor_critic, self).__init__() 28 | self.actor = actor 29 | self.critic = critic 30 | 31 | def forward(self, obs): 32 | a = self.actor(obs) 33 | input = torch.cat((obs, a), dim=-1) 34 | Q1, Q2 = self.critic(input) 35 | return Q1 36 | 37 | 38 | 39 | class TD3_Agent(Agent_value_based): 40 | def __init__(self, env, actor_model, critic_model, 41 | actor_lr=1e-4, critic_lr=3e-4, 42 | actor_target_network_update_freq=0.1, critic_target_network_update_freq=0.1, 43 | actor_training_freq=2, critic_training_freq=1, 44 | ## hyper-parameter 45 | gamma=0.99, batch_size=32, buffer_size=50000, learning_starts=1000, 46 | ## decay 47 | decay=False, decay_rate=0.9, l2_regulization=0.01, 48 | ## 49 | path=None): 50 | 51 | self.gpu = False 52 | self.env = env 53 | self.gamma = gamma 54 | self.batch_size = batch_size 55 | self.learning_starts = learning_starts 56 | self.actor_training_freq, self.critic_training_freq = actor_training_freq, critic_training_freq 57 | self.actor_target_network_update_freq = actor_target_network_update_freq 58 | self.critic_target_network_update_freq = critic_target_network_update_freq 59 | 60 | self.replay_buffer = ReplayMemory(buffer_size) 61 | self.actor = actor_model 62 | self.critic = critic_build(critic_model) 63 | 64 | self.actor_critic = actor_critic(self.actor, self.critic) 65 | 66 | self.target_actor = deepcopy(self.actor) 67 | self.target_critic = deepcopy(self.critic) 68 | 69 | actor_optim = Adam(self.actor.parameters(), lr=actor_lr) 70 | critic_optim = Adam(self.critic.parameters(), lr=critic_lr, weight_decay=l2_regulization) 71 | if decay: 72 | self.actor_optim = torch.optim.lr_scheduler.ExponentialLR(actor_optim, decay_rate, last_epoch=-1) 73 | self.critic_optim = torch.optim.lr_scheduler.ExponentialLR(critic_optim, decay_rate, last_epoch=-1) 74 | else: 75 | self.actor_optim = actor_optim 76 | self.critic_optim = critic_optim 77 | 78 | torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1, norm_type=2) 79 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1, norm_type=2) 80 | 81 | 82 | super(TD3_Agent, self).__init__(path) 83 | example_input = Variable(torch.rand(100, 
self.env.observation_space.shape[0])) 84 | self.writer.add_graph(self.actor_critic, input_to_model=example_input) 85 | self.forward_step_show_list = [] 86 | self.backward_step_show_list = [] 87 | self.forward_ep_show_list = [] 88 | self.backward_ep_show_list = [] 89 | 90 | def forward(self, observation): 91 | observation = observation.astype(np.float32) 92 | observation = torch.from_numpy(observation) 93 | action = self.actor.forward(observation) 94 | csv_record(action.detach().numpy(),"./") 95 | action = torch.normal(action,torch.ones_like(action)) 96 | Q, _ = self.critic(torch.cat((observation, action),axis=0)) 97 | action = action.data.numpy() 98 | return action, Q.detach().numpy(), {} 99 | 100 | def backward(self, sample_): 101 | self.replay_buffer.push(sample_) 102 | if self.step > self.learning_starts and self.learning: 103 | sample = self.replay_buffer.sample(self.batch_size) 104 | if self.gpu: 105 | for key in sample.keys(): 106 | sample[key] = sample[key].cuda() 107 | assert len(sample["s"]) == self.batch_size 108 | "update the critic " 109 | if self.step % self.critic_training_freq == 0: 110 | target_a = self.target_actor(sample["s_"]) 111 | target_input = torch.cat((sample["s_"], target_a), -1) 112 | Q1, Q2 = self.target_critic(target_input) 113 | target_Q = torch.min(Q1, Q2) 114 | expected_q_values = sample["r"] + self.gamma * target_Q * (1.0 - sample["tr"]) 115 | 116 | input = torch.cat((sample["s"], sample["a"]), -1) 117 | Q1, Q2 = self.critic(input) 118 | loss = torch.mean(huber_loss(expected_q_values - Q1))+torch.mean(huber_loss(expected_q_values - Q2)) 119 | self.critic.zero_grad() 120 | loss.backward() 121 | self.critic_optim.step() 122 | "training the actor" 123 | if self.step % self.actor_training_freq == 0: 124 | Q = self.actor_critic(sample["s"]) 125 | Q = -torch.mean(Q) 126 | self.actor.zero_grad() 127 | Q.backward() 128 | self.actor_optim.step() 129 | self.target_net_update() 130 | loss = loss.data.numpy() 131 | return loss, {} 132 | return 0, {} 133 | 134 | def target_net_update(self): 135 | if self.actor_target_network_update_freq>1: 136 | if self.step % self.actor_target_network_update_freq == 0: 137 | self.target_actor.load_state_dict(self.actor.state_dict()) 138 | else: 139 | for param, target_param in zip(self.actor.parameters(), self.target_actor.parameters()): 140 | target_param.data.copy_(self.actor_target_network_update_freq * param.data + 141 | (1 - self.actor_target_network_update_freq) * target_param.data) 142 | if self.critic_target_network_update_freq>1: 143 | if self.step % self.critic_target_network_update_freq == 0: 144 | self.target_critic.load_state_dict(self.critic.state_dict()) 145 | else: 146 | for param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()): 147 | target_param.data.copy_(self.critic_target_network_update_freq * param.data + 148 | (1 - self.critic_target_network_update_freq) * target_param.data) 149 | 150 | 151 | def load_weights(self, filepath): 152 | model = torch.load(filepath + "TD3.pkl") 153 | self.actor.load_state_dict(model["actor"]) 154 | self.critic.load_state_dict(model["critic"]) 155 | self.target_actor.load_state_dict(model["target_actor"]) 156 | self.target_critic.load_state_dict(model["target_critic"]) 157 | self.actor_optim.load_state_dict(model["actor_optim"]) 158 | self.critic_optim.load_state_dict(model["critic_optim"]) 159 | 160 | 161 | def save_weights(self, filepath, overwrite=False): 162 | torch.save({"actor": self.actor, "critic":self.critic, 163 | "target_actor": 
self.target_actor,"target_critic": self.target_critic, 164 | "actor_optim": self.actor_optim, "critic_optim": self.critic_optim 165 | }, filepath + "TD3.pkl") 166 | 167 | def cuda(self): 168 | self.actor.to_gpu() 169 | self.critic.to_gpu() 170 | self.target_actor = deepcopy(self.actor) 171 | self.target_critic = deepcopy(self.critic) 172 | self.gpu = True 173 | 174 | -------------------------------------------------------------------------------- /Torch_rl/agent/TRPO.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from Torch_rl.agent.core_policy import Agent_policy_based 4 | from Torch_rl.common.memory import ReplayMemory 5 | from copy import deepcopy 6 | # from Torch_rl.common.distribution import * 7 | from torch.optim import Adam 8 | from torch.autograd import Variable 9 | from gym import spaces 10 | from Torch_rl.common.util import csv_record 11 | from Torch_rl.common.util import gae 12 | from Torch_rl.common.distribution import * 13 | 14 | 15 | class TRPO_Agent(Agent_policy_based): 16 | def __init__(self, env, policy_model, value_model, 17 | lr=1e-3, ent_coef=0.01, vf_coef=0.5, 18 | ## hyper-parawmeter 19 | gamma=0.90, lam=0.95, cliprange=0.2, 20 | buffer_size=50000, learning_starts=1000, running_step=2000, batch_training_round=20, 21 | value_regular=0.01, 22 | ## decay 23 | decay=False, decay_rate=0.9, 24 | ## 25 | path=None): 26 | 27 | self.env = env 28 | self.gamma = gamma 29 | self.lam = lam 30 | self.ent_coef = ent_coef 31 | self.vf_coef = vf_coef 32 | self.cliprange = cliprange 33 | 34 | self.learning_starts = learning_starts 35 | self.batch_training_round = batch_training_round 36 | self.run_step = running_step 37 | self.sample_training_step = self.batch_training_round * self.run_step 38 | 39 | self.replay_buffer = ReplayMemory(buffer_size, ["value", "logp"]) 40 | self.loss_cal = torch.nn.MSELoss() 41 | 42 | self.dist = make_pdtype(env.action_space, policy_model) 43 | 44 | self.policy_model = policy_model 45 | if value_model == "shared": 46 | self.value_model = policy_model 47 | elif value_model == "copy": 48 | self.value_model = deepcopy(policy_model) 49 | else: 50 | self.value_model = value_model 51 | 52 | policy_model_optim = Adam(self.policy_model.parameters(), lr=lr) 53 | value_model_optim = Adam(self.value_model.parameters(), lr=lr, weight_decay=value_regular) 54 | if decay: 55 | self.policy_model_optim = torch.optim.lr_scheduler.ExponentialLR(policy_model_optim, decay_rate, 56 | last_epoch=-1) 57 | self.value_model_optim = torch.optim.lr_scheduler.ExponentialLR(value_model_optim, decay_rate, 58 | last_epoch=-1) 59 | else: 60 | self.policy_model_optim = policy_model_optim 61 | self.value_model_optim = value_model_optim 62 | 63 | torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1, norm_type=2) 64 | torch.nn.utils.clip_grad_norm_(self.value_model.parameters(), 1, norm_type=2) 65 | 66 | self.run_policy = deepcopy(self.policy_model) 67 | self.run_value = deepcopy(self.value_model) 68 | 69 | super(TRPO_Agent, self).__init__(path) 70 | example_input = Variable(torch.rand((100,)+ self.env.observation_space.shape)) 71 | self.writer.add_graph(self.policy_model, input_to_model=example_input) 72 | self.forward_step_show_list = [] 73 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"] 74 | self.forward_ep_show_list = [] 75 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"] 76 | 77 | self.training_round = 0 78 | self.training_step = 0 79 | self.running_step = 0 80 
| self.record_sample = None 81 | self.train_ticks = np.tile(np.arange(self.run_step), self.batch_training_round) 82 | 83 | def forward(self, observation): 84 | observation = observation[np.newaxis, :].astype(np.float32) 85 | observation = torch.from_numpy(observation) 86 | with torch.no_grad(): 87 | outcome = self.run_policy.forward(observation) 88 | self.pd = self.dist(outcome) 89 | self.action = self.pd.sample() 90 | self.Q = self.run_value.forward(observation) 91 | return self.action.squeeze(0).detach().numpy(), self.Q.squeeze(0).data.numpy(), {} 92 | 93 | def backward(self, sample_): 94 | sample_["logp"] = self.pd.log_prob(self.action) 95 | sample_["value"] = self.Q 96 | self.replay_buffer.push(sample_) 97 | self.running_step += 1 98 | """""""""""""" 99 | "training part" 100 | "in each step, we train for batch batch_training_times" 101 | """""""""""""" 102 | if self.step > self.learning_starts: 103 | if self.running_step % self.run_step == 0 and self.training_step == 0: 104 | " sample advantage generate " 105 | with torch.no_grad(): 106 | sample = self.replay_buffer.recent_step_sample(self.running_step) 107 | last_value = self.value_model.forward(sample["s_"][-1]) 108 | self.record_sample = gae(sample, last_value, self.gamma, self.lam) 109 | self.running_step = 0 110 | 111 | if self.training_step < self.sample_training_step and self.record_sample is not None: 112 | pg_loss_re = 0 113 | entropy_re = 0 114 | vf_loss_re = 0 115 | loss_re = 0 116 | for _ in range(self.batch_training_round): 117 | index = self.train_ticks[self.training_step] 118 | S = self.record_sample["s"][index].detach() 119 | A = self.record_sample["a"][index].detach() 120 | old_log = self.record_sample["logp"][index].detach() 121 | advs = self.record_sample["advs"][index].detach() 122 | value = self.record_sample["value"][index].detach() 123 | returns = self.record_sample["return"][index].detach() 124 | # generate Policy gradient loss 125 | outcome = self.run_policy.forward(S) 126 | new_policy = self.dist(outcome) 127 | new_lop = new_policy.log_prob(A) 128 | ratio = torch.exp(new_lop - old_log) 129 | pg_loss1 = advs * ratio 130 | pg_loss2 = advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange) 131 | pg_loss = -.5 * torch.min(pg_loss1, pg_loss2).mean() 132 | # value loss 133 | value_now = self.run_value.forward(S) 134 | value_clip = value + torch.clamp(value_now - value, min=-self.cliprange, 135 | max=self.cliprange) # Clipped value 136 | vf_loss1 = self.loss_cal(value_now, returns) # Unclipped loss 137 | vf_loss2 = self.loss_cal(value_clip, returns) # clipped loss 138 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2) 139 | # vf_loss = 0.5 * vf_loss1 140 | # entropy 141 | entropy = new_policy.entropy().mean() 142 | loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef 143 | # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"]) 144 | # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange) 145 | 146 | self.value_model_optim.zero_grad() 147 | loss.backward(retain_graph=True) 148 | self.value_model_optim.step() 149 | 150 | self.policy_model_optim.zero_grad() 151 | loss.backward() 152 | self.policy_model_optim.step() 153 | 154 | self.training_step += 1 155 | pg_loss_re += pg_loss.data.numpy() 156 | entropy_re += entropy.data.numpy() 157 | vf_loss_re += vf_loss.data.numpy() 158 | loss_re += loss.data.numpy() 159 | 160 | if self.training_step == self.sample_training_step: 161 | print("the" + str(self.episode) + " round have training finished") 162 | 
self.run_policy.load_state_dict(self.policy_model.state_dict()) 163 | self.run_value.load_state_dict(self.value_model.state_dict()) 164 | self.training_step = 0 165 | self.record_sample = None 166 | return loss_re, {"pg_loss": pg_loss_re, "entropy": entropy_re, "vf_loss": vf_loss_re} 167 | return 0, {"pg_loss": 0, "entropy": 0, "vf_loss": 0} 168 | 169 | def load_weights(self, filepath): 170 | model = torch.load(filepath + "ppo.pkl") 171 | self.policy_model.load_state_dict(model["policy_model"].state_dict()) 172 | self.value_model.load_state_dict(model["value_model"].state_dict()) 173 | 174 | def save_weights(self, filepath, overwrite=False): 175 | torch.save({"policy_model": self.policy_model, "value_model": self.value_model}, filepath + "TRPO.pkl") -------------------------------------------------------------------------------- /Torch_rl/agent/core_policy.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from abc import ABC 3 | import numpy as np 4 | import torch 5 | from Torch_rl.common import logger 6 | from Torch_rl.common.logger import CSVOutputFormat 7 | from Torch_rl.common.memory import ReplayMemory 8 | from Torch_rl.common.distribution import * 9 | 10 | from Torch_rl.common.util import csv_record 11 | 12 | 13 | 14 | class Agent_policy_based(ABC): 15 | """ 16 | 所有算法的父类 17 | Abstract base class for all implemented agents. 18 | 19 | 其中包含了 20 | - `runner` 根据policy 产生 sample 21 | - `learning` 根据Sample 训练网络 22 | - `load_weights` 加载权重 23 | - `save_weights` 存储权重 24 | - `layers` 网络层 25 | - 'forward' 前向传播 26 | - 'backward' 前向传播 27 | 定义 episode 完成一次为一个episode 28 | 定义 step 为交互一次 29 | """ 30 | def __init__(self, path): 31 | self.step = 0 32 | self.episode = 0 33 | """ 34 | config the logfile 35 | """ 36 | configlist = ["stdout", "log", 'tensorboard', "csv"] 37 | if path is None: 38 | path = "./" 39 | logger.configure(path, configlist) 40 | self.csvwritter = CSVOutputFormat(path+"record_trajectory.csv") 41 | loggerCEN = logger.get_current().output_formats[configlist.index('tensorboard')] 42 | self.writer = loggerCEN.writer 43 | self.path = path 44 | 45 | def imitation_learning(self): 46 | pass 47 | 48 | def train(self, max_step=None, max_ep_cycle=2000, verbose=2, learning_start=1000, render=False, record_ep_inter=None): 49 | self.learning = True 50 | print("the train phase ........") 51 | self.interact(max_step=max_step, max_ep_cycle=max_ep_cycle, learning_start=learning_start, render=render, 52 | verbose=verbose, record_ep_inter=record_ep_inter) 53 | 54 | def test(self, max_step=None, max_ep_cycle=2000, verbose=2, render=False, record_ep_inter=None): 55 | self.learning = False 56 | self.learning_starts = 0 57 | self.step = 0 58 | self.episode = 0 59 | print("the test phase ........") 60 | self.interact(max_step=max_step, max_ep_cycle=max_ep_cycle, render=render, 61 | verbose=verbose, record_ep_inter=record_ep_inter) 62 | 63 | def interact(self, max_step=50000, max_ep_cycle=2000, train_rollout=10,learning_start=1000, 64 | render = False, verbose=1, record_ep_inter=None): 65 | ''' 66 | :param max_step: 67 | :param max_ep_time: 68 | :param max_ep_cycle: max step in per circle 69 | .........................show parameter.................................. 
70 | :param verbose 71 | if verbose == 1 show every ep 72 | if verbose == 2 show every step 73 | :param record_ep_inter 74 | record_ep_interact data 75 | :return: None 76 | ''' 77 | # if IL_time is not None: 78 | self.render = render 79 | 80 | # .....................initially——recode...........................# 81 | rollout = 0 82 | now_best_reward = -np.inf 83 | 84 | self.dist = make_pdtype(self.env.action_space, self.policy) 85 | sample_generate = self.runner(self.sample_rollout, self.sample_ep, max_ep_cycle, record_ep_inter, lstm_enable=self.lstm_enable) 86 | while self.step < max_step: 87 | sample = next(sample_generate) 88 | logger.record_tabular("01.step", self.step) 89 | logger.record_tabular("02.episode",self.episode) 90 | logger.record_tabular("03.rollout", rollout) 91 | logger.record_tabular("04.rollout/ep", sample["ep_used"]) 92 | logger.record_tabular("05.rollout/step", sum(sample["ep_step_used"])) 93 | logger.record_tabular("06.mean_episode_reward", np.mean(sample["ep_reward"])) 94 | logger.record_tabular("07.mean_step_reward", np.mean(sample["buffer"]["r"])) 95 | logger.record_tabular("08.mean_ep_step_used", np.mean(sample["ep_step_used"])) 96 | logger.dump_tabular() 97 | csv_record(sample["ep_reward"], self.path) 98 | record_sample = sample["buffer"] 99 | 100 | rollout += 1 101 | 102 | if self.step > learning_start and self.learning: 103 | ep_show = {} 104 | if self.backward_ep_show_list: 105 | for key in self.backward_ep_show_list: 106 | ep_show[key] = 0 107 | rollout_loss = 0 108 | for time in range(train_rollout): 109 | loss, other_infor = self.update(record_sample) 110 | if verbose == 1: 111 | logger.record_tabular("06.train_rollout", time) 112 | logger.record_tabular("07.loss", loss) 113 | flag = 10 114 | if self.backward_step_show_list: 115 | for key in self.backward_step_show_list: 116 | logger.record_tabular(str(flag) +"."+ key, other_infor[key]) 117 | flag += 1 118 | logger.dump_tabular() 119 | rollout_loss += loss 120 | if self.backward_ep_show_list: 121 | for key in self.backward_ep_show_list: 122 | ep_show[key] += other_infor[key] 123 | if verbose == 2: 124 | logger.record_tabular("06.rollouts/loss", rollout_loss) 125 | logger.record_tabular("07.rollouts/episode_Q_value", torch.mean( 126 | torch.tensor(sample["ep_Q_value"])).cpu().detach().numpy()) 127 | # logger.record_tabular("05.episode_loss_per_step", rollout_loss / samole["step_used"]) 128 | # logger.record_tabular("06.episode_Q_value", sample["ep_Q_value"]) 129 | # logger.record_tabular("07.episode_Q_value_per_ep", np.mean(sample["ep_Q_value"])) 130 | 131 | flag = 10 132 | if self.backward_ep_show_list: 133 | for key in self.backward_ep_show_list: 134 | logger.record_tabular(str(flag) + "." 
+ key, ep_show[key]) 135 | flag += 1 136 | logger.dump_tabular() 137 | if np.mean(sample["ep_reward"])>now_best_reward: 138 | self.save_weights(self.path) 139 | print("the best mean ep reward is ", np.mean(sample["ep_reward"]), "the weight is saved") 140 | now_best_reward = np.mean(sample["ep_reward"]) 141 | 142 | def runner(self, sample_step=None, sample_ep=None, max_ep_step=2000, record_ep_inter=None, lstm_enable=False): 143 | if sample_step is not None: 144 | buffer = ReplayMemory(sample_step, ["value", "logp","info"]) 145 | else: 146 | buffer = ReplayMemory(sample_ep*max_ep_step, ["value", "logp","info"]) 147 | s = self.env.reset() 148 | ep_reward, ep_Q_value, ep_step_used = [], [], [] 149 | ep_r, ep_q, ep_cycle = 0, 0, 0 150 | while True: 151 | s = torch.from_numpy(s.astype(np.float32)) 152 | with torch.no_grad(): 153 | outcome = self.policy.forward(s.unsqueeze(0)) 154 | Q = self.value.forward(s.unsqueeze(0)) 155 | pd = self.dist(outcome) 156 | a = pd.sample() 157 | s_, r, done, info = self.env.step(a.cpu().squeeze(0).numpy()) 158 | if self.render: 159 | self.env.render() 160 | ep_r += r 161 | ep_q += Q 162 | ep_cycle +=1 163 | self.step += 1 164 | logp = pd.log_prob(a) 165 | sample_ = { 166 | "s": s, 167 | "a": a.squeeze(0), 168 | "r": torch.tensor(np.array([r]).astype(np.float32)), 169 | "tr": torch.tensor([int(done)]), 170 | "s_":torch.from_numpy(s_), 171 | "logp": logp.squeeze(0), 172 | "value": Q.squeeze(0), 173 | "info": info} 174 | buffer.push(sample_) 175 | s = deepcopy(s_) 176 | 177 | if record_ep_inter is not None: 178 | if self.episode % record_ep_inter == 0: 179 | kvs = {"s": s, "a": a, "s_": s_, "r": r, 180 | "tr": done, "ep": self.episode, "step": self.step, "ep_step": ep_cycle} 181 | self.csvwritter.writekvs(kvs) 182 | 183 | if done: 184 | s = self.env.reset() 185 | self.episode += 1 186 | ep_reward.append(ep_r) 187 | ep_Q_value.append(ep_q) 188 | ep_step_used.append(ep_cycle) 189 | ep_r, ep_q, ep_cycle = 0, 0, 0 190 | if lstm_enable: 191 | self.policy.reset_h() 192 | 193 | if sample_step is not None: 194 | if self.step > 0 and self.step % sample_step==0: 195 | s_ = torch.from_numpy(s_[np.newaxis,:].astype(np.float32)) 196 | with torch.no_grad(): 197 | last_Q = self.value.forward(s_).squeeze() 198 | #print("now is we have sampled for :", self.step , "and" , self.episode,"\n", 199 | # "this round have sampled for " + str(sample_step) + " steps, ", len(ep_reward), "episode", 200 | # "and the mean reward per step is", np.mean(buffer.memory["r"]), 201 | # "the mean ep reward is ", np.mean(ep_reward)) 202 | yield {"buffer": buffer.memory, 203 | "ep_reward": ep_reward, 204 | "ep_Q_value": ep_Q_value, 205 | "ep_step_used": ep_step_used, 206 | "ep_used": len(ep_reward), 207 | "step_used": sample_step, 208 | "last_Q" : last_Q 209 | } 210 | ep_reward, ep_Q_value, ep_step_used = [], [], [] 211 | if sample_step is not None: 212 | buffer = ReplayMemory(sample_step, ["value", "logp","info"]) 213 | else: 214 | buffer = ReplayMemory(sample_ep * max_ep_step, ["value", "logp","info"]) 215 | 216 | else: 217 | if self.step > 0 and self.episode % sample_ep==0: 218 | s_ = torch.from_numpy(s_.astype(np.float32)) 219 | last_Q = self.value.forward(s_) 220 | #print("now is we have sampled for :", self.step , "and" , self.episode,"\n", 221 | # "this round have sampled for " + str(sample_step) + " steps, ", len(ep_reward), "episode", 222 | # "and the mean reward per step is", np.mean(buffer.memory["r"]), 223 | # "the mean ep reward is ", np.mean(ep_reward)) 224 | yield {"buffer": buffer.memory, 
225 | "ep_reward": ep_reward, 226 | "ep_Q_value": ep_Q_value, 227 | "ep_step_used": ep_step_used, 228 | "ep_used": sample_ep, 229 | "step_used": len(buffer.memory["tr"]), 230 | "last_Q": last_Q 231 | } 232 | ep_reward, ep_Q_value = [], [] 233 | if sample_step is not None: 234 | buffer = ReplayMemory(sample_step, ["value", "logp","info"]) 235 | else: 236 | buffer = ReplayMemory(sample_ep * max_ep_step, ["value", "logp","info"]) 237 | 238 | 239 | def update(self, sample): 240 | """Updates the agent after having executed the action returned by `forward`. 241 | If the policy is implemented by a neural network, this corresponds to a weight update using back-prop. 242 | 243 | # Argument 244 | reward (float): The observed reward after executing the action returned by `forward`. 245 | terminal (boolean): `True` if the new state of the environment is terminal. 246 | 247 | # Returns 248 | List of metrics values 249 | """ 250 | raise NotImplementedError() 251 | 252 | def load_weights(self, filepath): 253 | """Loads the weights of an agent from an HDF5 file. 254 | 255 | # Arguments 256 | filepath (str): The path to the HDF5 file. 257 | """ 258 | raise NotImplementedError() 259 | 260 | def save_weights(self, filepath, overwrite=False): 261 | """Saves the weights of an agent as an HDF5 file. 262 | 263 | # Arguments 264 | filepath (str): The path to where the weights should be saved. 265 | overwrite (boolean): If `False` and `filepath` already exists, raises an error. 266 | """ 267 | raise NotImplementedError() 268 | 269 | def cuda(self): 270 | """ 271 | use the cuda 272 | """ 273 | raise NotImplementedError() 274 | 275 | 276 | def Imitation_Learning(self, step_time, data=None, policy=None,learning_start=1000, 277 | buffer_size = 5000, value_training_round = 10, value_training_fre = 2500, 278 | verbose=2,render = False): 279 | ''' 280 | :param data: the data is a list, and each element is a dict with 5 keys s,a,r,s_,tr 281 | sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done} 282 | :param policy: 283 | :return: 284 | ''' 285 | if data is not None and policy is not None: 286 | raise Exception("The IL only need one way to guide, Please make sure the input ") 287 | 288 | if data is not None: 289 | for time in step_time: 290 | self.step += 1 291 | loss = self.backward(data[time]) 292 | if verbose == 1: 293 | logger.record_tabular("steps", self.step) 294 | logger.record_tabular("loss", loss) 295 | logger.dumpkvs() 296 | 297 | if policy is not None: 298 | buffer = ReplayMemory(buffer_size) 299 | s = self.env.reset() 300 | loss_BC = 0 301 | ep_step,ep_reward = 0, 0 302 | for _ in range(step_time): 303 | self.step += 1 304 | ep_step += 1 305 | a = policy(self.env) 306 | s_, r, done, info = self.env.step(a) 307 | #print(r,info) 308 | ep_reward += r 309 | if render: 310 | self.env.render() 311 | sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done} 312 | buffer.push(sample) 313 | s = s_[:] 314 | if self.step > learning_start: 315 | sample_ = buffer.sample(self.batch_size) 316 | loss = self.policy_behavior_clone(sample_) 317 | if self.step % value_training_fre==0: 318 | record_sample = {} 319 | for key in buffer.memory.keys(): 320 | record_sample[key] = np.array(buffer.memory[key]).astype(np.float32)[-value_training_fre:] 321 | record_sample["value"] = self.value.forward(torch.from_numpy(record_sample["s"])) 322 | returns, advants = get_gae(record_sample["r"], record_sample["tr"], record_sample["value"], 323 | self.gamma, self.lam) 324 | record_sample["advs"] = advants 325 | record_sample["return"] = returns 
326 | for round_ in range(value_training_round): 327 | loss_value = self.value_pretrain(record_sample, value_training_fre) 328 | print(round_, loss_value) 329 | 330 | if verbose == 1: 331 | logger.record_tabular("learning_steps", self.step) 332 | logger.record_tabular("loss", loss) 333 | logger.record_tabular("rewrad",r) 334 | logger.dumpkvs() 335 | loss_BC += loss 336 | if done: 337 | if verbose == 2: 338 | logger.record_tabular("learning_steps", self.step) 339 | logger.record_tabular("step_used", ep_step) 340 | logger.record_tabular("loss", loss_BC/ep_step) 341 | logger.record_tabular("ep_reward",ep_reward ) 342 | logger.dumpkvs() 343 | 344 | s = self.env.reset() 345 | loss_BC = 0 346 | ep_step,ep_reward = 0, 0 347 | 348 | def policy_behavior_clone(self, sample_): 349 | raise NotImplementedError() 350 | 351 | def value_pretrain(self, sample_): 352 | raise NotImplementedError() 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | -------------------------------------------------------------------------------- /Torch_rl/agent/core_value.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from abc import ABC 3 | import numpy as np 4 | from Torch_rl.common import logger 5 | from Torch_rl.common.logger import CSVOutputFormat 6 | import torch 7 | 8 | 9 | class Agent_value_based(ABC): 10 | """ 11 | 所有算法的父类 12 | Abstract base class for all implemented agents. 13 | 14 | 其中包含了 15 | - `forward` 前向传播、计算action 16 | - `backward` 后向传播、更新网络 17 | - `load_weights` 加载权重 18 | - `save_weights` 存储权重 19 | - `layers` 网络层 20 | - 'forward' 前向传播 21 | - 'backward' 前向传播 22 | 定义 episode 完成一次为一个episode 23 | 定义 step 为交互一次 24 | """ 25 | def __init__(self, path): 26 | self.step = 0 27 | self.episode = 0 28 | """ 29 | config the logfile 30 | """ 31 | configlist = ["stdout", "log", 'tensorboard', "csv"] 32 | if path is None: 33 | path = "./" 34 | configlist = ["stdout", "log", 'tensorboard', "csv"] 35 | logger.configure(path, configlist) 36 | self.csvwritter = CSVOutputFormat(path+"record_trajectory.csv") 37 | loggerCEN = logger.get_current().output_formats[configlist.index('tensorboard')] 38 | self.writer = loggerCEN.writer 39 | self.path = path 40 | 41 | def imitation_learning(self): 42 | pass 43 | 44 | def train(self, max_step=None, max_ep_cycle=2000, verbose=2, render=False, record_ep_inter=None): 45 | self.learning = True 46 | print("the train phase ........") 47 | self.interact(max_step=max_step, max_ep_cycle=max_ep_cycle, render=render, 48 | verbose=verbose, record_ep_inter=record_ep_inter) 49 | 50 | def test(self, max_step=None, max_ep_cycle=2000, verbose=2, render=False, record_ep_inter=None): 51 | self.learning = False 52 | self.learning_starts = 0 53 | self.step = 0 54 | self.episode = 0 55 | print("the test phase ........") 56 | self.interact(max_step=max_step, max_ep_cycle=max_ep_cycle, render=render, 57 | verbose=verbose, record_ep_inter=record_ep_inter) 58 | 59 | def interact(self, max_step=50000, max_ep_cycle=2000, render = False, 60 | verbose=1, record_ep_inter=None): 61 | ''' 62 | :param max_step: 63 | :param max_ep_time: 64 | :param max_ep_cycle: max step in per circle 65 | .........................show parameter.................................. 
66 | :param verbose 67 | if verbose == 1 show every ep 68 | if verbose == 2 show every step 69 | :param record_ep_inter 70 | record_ep_interact data 71 | :return: None 72 | ''' 73 | # if IL_time is not None: 74 | 75 | # .....................initially——recode...........................# 76 | ep_reward = [] 77 | ep_Q_value = [] 78 | ep_loss = [] 79 | now_best_reward = -np.inf 80 | while self.step < max_step: 81 | s = self.env.reset() 82 | 'reset the ep record' 83 | ep_r, ep_q, ep_l = 0, 0, 0 84 | 'reset the RL flag' 85 | ep_cycle, done = 0, 0 86 | ep_show={} 87 | if self.backward_ep_show_list: 88 | for key in self.backward_ep_show_list: 89 | ep_show[key] = 0 90 | self.episode += 1 91 | while done == 0 and ep_cycle < max_ep_cycle: 92 | self.step += 1 93 | ep_cycle += 1 94 | 'the interaction part' 95 | a, Q, info_forward = self.forward(s) 96 | # print(a) 97 | s_, r, done, info = self.env.step(a) 98 | sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done} 99 | s = deepcopy(s_) 100 | loss, info_backward = self.backward(sample) 101 | if render: 102 | self.env.render() 103 | 'the record part' 104 | 105 | if verbose == 1 and self.step > self.learning_starts: 106 | logger.record_tabular("steps", self.step) 107 | logger.record_tabular("episodes", self.episode) 108 | logger.record_tabular("loss", loss) 109 | logger.record_tabular("reward", r) 110 | logger.record_tabular("Q", Q) 111 | if self.forward_step_show_list: 112 | for key in self.forward_step_show_list: 113 | logger.record_tabular(key, info_forward[key]) 114 | if self.backward_step_show_list: 115 | for key in self.backward_step_show_list: 116 | logger.record_tabular(key, info_backward[key]) 117 | logger.dump_tabular() 118 | if record_ep_inter is not None: 119 | if self.episode % record_ep_inter == 0: 120 | kvs = {"s": s, "a": a, "s_": s_, "r": r, 121 | "tr": done, "ep": self.episode, "step": self.step, "ep_step": ep_cycle} 122 | self.csvwritter.writekvs(kvs) 123 | ep_r += r 124 | ep_q += Q 125 | ep_l += loss 126 | if self.backward_ep_show_list: 127 | for key in self.backward_ep_show_list: 128 | ep_show[key] += info_backward[key] 129 | if done: 130 | ep_reward.append(ep_r) 131 | ep_Q_value.append(ep_q) 132 | ep_loss.append(ep_l) 133 | mean_100ep_reward = round(np.mean(ep_reward[-101:-1]), 1) 134 | if verbose == 2 and self.step > self.learning_starts: 135 | logger.record_tabular("01.steps", self.step) 136 | logger.record_tabular("02.episodes", self.episode) 137 | logger.record_tabular("03.episode_reward", ep_reward[-1]) 138 | # logger.record_tabular("04.episode_reward_per_step", ep_reward[-1] / ep_cycle) 139 | logger.record_tabular("05.episode_loss", ep_l) 140 | # logger.record_tabular("06.episode_loss_per_step", ep_l / ep_cycle) 141 | # logger.record_tabular("07.episode_Q_value", ep_q) 142 | logger.record_tabular("08.episode_Q_value_per_step", ep_q / ep_cycle) 143 | # logger.record_tabular("09.mean 100 episode reward", mean_100ep_reward) 144 | # logger.record_tabular("10.step_used", ep_cycle) 145 | flag = 11 146 | if self.forward_ep_show_list: 147 | for key in self.forward_ep_show_list: 148 | logger.record_tabular(str(flag) + "." + key, info_forward[key]) 149 | flag += 1 150 | if self.backward_ep_show_list: 151 | for key in self.backward_ep_show_list: 152 | logger.record_tabular(str(flag) + "." 
+ key, ep_show[key]) 153 | flag += 1 154 | logger.dump_tabular() 155 | if np.mean(ep_r)>now_best_reward: 156 | self.save_weights(self.path) 157 | print("the best mean ep reward is ", np.mean(ep_r), "the weight is saved") 158 | now_best_reward = np.mean(ep_r) 159 | 160 | 161 | def forward(self, observation): 162 | """Takes the an observation from the environment and returns the action to be taken next. 163 | If the policy is implemented by a neural network, this corresponds to a forward (inference) pass. 164 | 165 | # Argument 166 | observation (object): The current observation from the environment. 167 | 168 | # Returns 169 | The next action to be executed in the environment. 170 | """ 171 | raise NotImplementedError() 172 | 173 | def backward(self, sample): 174 | """Updates the agent after having executed the action returned by `forward`. 175 | If the policy is implemented by a neural network, this corresponds to a weight update using back-prop. 176 | 177 | # Argument 178 | reward (float): The observed reward after executing the action returned by `forward`. 179 | terminal (boolean): `True` if the new state of the environment is terminal. 180 | 181 | # Returns 182 | List of metrics values 183 | """ 184 | raise NotImplementedError() 185 | 186 | def load_weights(self, filepath): 187 | """Loads the weights of an agent from an HDF5 file. 188 | 189 | # Arguments 190 | filepath (str): The path to the HDF5 file. 191 | """ 192 | raise NotImplementedError() 193 | 194 | def save_weights(self, filepath, overwrite=False): 195 | """Saves the weights of an agent as an HDF5 file. 196 | 197 | # Arguments 198 | filepath (str): The path to where the weights should be saved. 199 | overwrite (boolean): If `False` and `filepath` already exists, raises an error. 200 | """ 201 | raise NotImplementedError() 202 | 203 | def cuda(self): 204 | """ 205 | use the cuda 206 | """ 207 | raise NotImplementedError() 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /Torch_rl/algorithm/PPO_LSTM.py: -------------------------------------------------------------------------------- 1 | 2 | from Torch_rl.agent.core_policy import Agent_policy_based 3 | import torch.nn as nn 4 | from copy import deepcopy 5 | from Torch_rl.common.distribution import * 6 | from torch.optim import Adam 7 | from torch.autograd import Variable 8 | from Torch_rl.common.memory import ReplayMemory 9 | 10 | 11 | class PPO_Agent(Agent_policy_based): 12 | def __init__(self, env, policy_model, value_model, 13 | lr=5e-4, ent_coef=0.01, vf_coef=0.5, 14 | ## hyper-parawmeter 15 | gamma=0.99, lam=0.95, cliprange=0.2, batch_size=64, value_train_round=200, 16 | running_step=2048, running_ep=20, value_regular=0.01, buffer_size=50000, 17 | ## decay 18 | decay=False, decay_rate=0.9, lstm_enable=False, 19 | ## 20 | path=None): 21 | self.gpu = False 22 | self.env = env 23 | self.gamma = gamma 24 | self.lam = lam 25 | self.ent_coef = ent_coef 26 | self.vf_coef = vf_coef 27 | self.cliprange = cliprange 28 | 29 | self.value_train_step = value_train_round 30 | 31 | self.sample_rollout = running_step 32 | self.sample_ep = running_ep 33 | self.batch_size = batch_size 34 | self.lstm_enable = lstm_enable 35 | self.replay_buffer = ReplayMemory(buffer_size, other_record=["value", "return"]) 36 | 37 | self.loss_cal = torch.nn.SmoothL1Loss() 38 | 39 | self.policy = policy_model 40 | if value_model == "shared": 41 | self.value = policy_model 42 | elif value_model == "copy": 43 | 
self.value = deepcopy(policy_model) 44 | else: 45 | self.value = value_model 46 | 47 | self.dist = make_pdtype(env.action_space, policy_model) 48 | 49 | self.policy_model_optim = Adam(self.policy.parameters(), lr=lr) 50 | self.value_model_optim = Adam(self.value.parameters(), lr=lr, weight_decay=value_regular) 51 | if decay: 52 | self.policy_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.policy_model_optim, decay_rate, 53 | last_epoch=-1) 54 | self.value_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.value_model_optim, decay_rate, 55 | last_epoch=-1) 56 | 57 | super(PPO_Agent, self).__init__(path) 58 | #example_input = Variable(torch.rand((100,)+self.env.observation_space.shape)) 59 | #self.writer.add_graph(self.policy, input_to_model=example_input) 60 | 61 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"] 62 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"] 63 | 64 | self.training_round = 0 65 | self.running_step = 0 66 | self.record_sample = None 67 | self.training_step = 0 68 | self.lstm_enable = True 69 | 70 | def update(self, sample): 71 | step_len = len(sample["s"]) 72 | for ki in range(step_len): 73 | sample_ = { 74 | "s": sample["s"][ki].cpu().numpy(), 75 | "a": sample["a"][ki].cpu().numpy(), 76 | "r": sample["r"][ki].cpu().numpy(), 77 | "tr": sample["tr"][ki].cpu().numpy(), 78 | "s_": sample["s_"][ki].cpu().numpy(), 79 | "value": sample["value"][ki].cpu().numpy(), 80 | "return": sample["return"][ki].cpu().numpy() 81 | } 82 | self.replay_buffer.push(sample_) 83 | ''' 84 | train the value part 85 | ''' 86 | vfloss_re = [] 87 | for _ in range(self.value_train_step): 88 | tarin_value_sample = self.replay_buffer.sample(self.batch_size) 89 | for key in tarin_value_sample.keys(): 90 | if self.gpu: 91 | tarin_value_sample[key] = tarin_value_sample[key].cuda() 92 | else: 93 | tarin_value_sample[key] = tarin_value_sample[key] 94 | old_value = tarin_value_sample["value"] 95 | training_s = tarin_value_sample["s"] 96 | R = tarin_value_sample["return"].squeeze() 97 | value_now = self.value.forward(training_s).squeeze() 98 | # value loss 99 | value_clip = old_value + torch.clamp(old_value - value_now, min=-self.cliprange, 100 | max=self.cliprange) # Clipped value 101 | vf_loss1 = self.loss_cal(value_now, R) # Unclipped loss 102 | # vf_loss2 = self.loss_cal(value_clip, R) # clipped loss 103 | # vf_loss = .5 * torch.max(vf_loss1, vf_loss2) 104 | self.value_model_optim.zero_grad() 105 | vf_loss1.backward() 106 | self.value_model_optim.step() 107 | vfloss_re.append(vf_loss1.cpu().detach().numpy()) 108 | 109 | ''' 110 | train the policy part 111 | ''' 112 | 113 | for key in sample.keys(): 114 | temp = torch.stack(list(sample[key]), 0).squeeze() 115 | if self.gpu: 116 | sample[key] = temp.cuda() 117 | else: 118 | sample[key] = temp 119 | 120 | array_index = [] 121 | if self.lstm_enable: 122 | for time in range(step_len): 123 | array_index.append([time]) 124 | "训练前重制" 125 | self.policy.reset_h() 126 | time_round = step_len 127 | else: 128 | time_round = np.ceil(step_len / self.batch_size) 129 | time_left = time_round * self.batch_size - step_len 130 | array = list(range(step_len)) + list(range(int(time_left))) 131 | array_index = [] 132 | for train_time in range(int(time_round)): 133 | array_index.append(array[train_time * self.batch_size: (train_time + 1) * self.batch_size]) 134 | 135 | loss_re, pgloss_re, enloss_re = [], [], [] 136 | for train_time in range(int(time_round)): 137 | index = array_index[train_time] 138 | training_s = 
sample["s"][index].detach() 139 | training_a = sample["a"][index].detach() 140 | old_neglogp = sample["logp"][index].detach() 141 | advs = sample["advs"][index].detach() 142 | 143 | " CALCULATE THE LOSS" 144 | " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss" 145 | 146 | #generate Policy gradient loss 147 | outcome = self.policy.forward(training_s).squeeze() 148 | # new_neg_lop = torch.empty(size=(self.batch_size,)) 149 | # for time in range(self.batch_size): 150 | # new_policy = self.dist(outcome[time]) 151 | # new_neg_lop[time] = new_policy.log_prob(training_a[time]) 152 | new_policy = self.dist(outcome) 153 | new_neg_lop = new_policy.log_prob(training_a) 154 | ratio = torch.exp(new_neg_lop - old_neglogp) 155 | pg_loss1 = -advs * ratio 156 | pg_loss2 = -advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange) 157 | pg_loss = .5 * torch.max(pg_loss1, pg_loss2).mean() 158 | 159 | # entropy 160 | entropy = new_policy.entropy().mean() 161 | # loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef 162 | loss = pg_loss - entropy * self.ent_coef 163 | self.policy_model_optim.zero_grad() 164 | loss.backward() 165 | self.policy_model_optim.step() 166 | # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"]) 167 | # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange) 168 | loss_re = loss.cpu().detach().numpy() 169 | pgloss_re.append(pg_loss.cpu().detach().numpy()) 170 | enloss_re.append(entropy.cpu().detach().numpy()) 171 | if self.lstm_enable: 172 | if sample["tr"][index] == 1: 173 | self.policy.reset_h() 174 | return np.sum(loss_re), {"pg_loss": np.sum(pgloss_re), 175 | "entropy": np.sum(enloss_re), 176 | "vf_loss": np.sum(vfloss_re)} 177 | 178 | 179 | def load_weights(self, filepath): 180 | model = torch.load(filepath+"/PPO.pkl") 181 | self.policy.load_state_dict(model["policy"].state_dict()) 182 | self.value.load_state_dict(model["value"].state_dict()) 183 | 184 | 185 | def save_weights(self, filepath, overwrite=False): 186 | torch.save({"policy": self.policy,"value": self.value}, filepath + "/PPO.pkl") 187 | 188 | def policy_behavior_clone(self, sample_): 189 | action_label = sample_["a"].squeeze() 190 | if self.gpu: 191 | action_predict = self.policy(sample_["s"].cuda()) 192 | action_label = action_label.cuda() 193 | else: 194 | action_predict = self.policy(sample_["s"]) 195 | loss_bc = self.loss_cal(action_label, action_predict) 196 | del action_label 197 | del action_predict 198 | loss = loss_bc 199 | self.policy_model_optim.zero_grad() 200 | loss.backward() 201 | nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2) 202 | self.policy_model_optim.step() 203 | return loss.cpu().detach().numpy() 204 | 205 | def value_pretrain(self, record_sample, new_sample_len): 206 | train_times = int(np.floor(new_sample_len/128)) 207 | round_loss = 0 208 | for io in range(train_times-1): 209 | index = list(range(128 * io, 128 * (io + 1))) 210 | if self.gpu: 211 | predict = torch.from_numpy(np.array(record_sample["s"])[index]).cuda() 212 | lable = torch.from_numpy(np.array(record_sample["return"]))[index].cuda() 213 | else: 214 | predict = torch.from_numpy(np.array(record_sample["s"])[index]) 215 | lable = torch.from_numpy(np.array(record_sample["return"]))[index] 216 | value_now = self.value.forward(predict) 217 | # value loss 218 | vf_loss = self.loss_cal(value_now, lable) # Unclipped loss 219 | del predict 220 | del lable 221 | self.value_model_optim.zero_grad() 222 | 
vf_loss.backward() 223 | self.value_model_optim.step() 224 | round_loss += vf_loss.cpu().detach().numpy() 225 | return round_loss 226 | 227 | def cuda(self, device=None): 228 | self.policy.to_gpu(device) 229 | self.value.to_gpu(device) 230 | self.loss_cal = self.loss_cal.cuda(device) 231 | self.gpu = True -------------------------------------------------------------------------------- /Torch_rl/algorithm/PPO_Lagrangian.py: -------------------------------------------------------------------------------- 1 | 2 | from Torch_rl.agent.core_policy import Agent_policy_based 3 | import torch.nn as nn 4 | from copy import deepcopy 5 | from Torch_rl.common.distribution import * 6 | from torch.optim import Adam 7 | from torch.autograd import Variable 8 | from Torch_rl.common.util import get_gae 9 | 10 | 11 | class ui_model(torch.nn.Module): 12 | def __init__(self): 13 | self.ui = torch.nn.parameter.Parameter([1], require_grad=True) 14 | def forward(self, x): 15 | x = self.ui * x 16 | return x 17 | 18 | class PPO_LAGRANGIAN_Agent(Agent_policy_based): 19 | def __init__(self, env, policy_model, value_model, 20 | lr=5e-4, ent_coef=0.01, vf_coef=0.5, 21 | ## hyper-parawmeter 22 | gamma=0.99, lam=0.95, cliprange=0.2, batch_size=64, value_train_round=10, 23 | running_step=2048, running_ep=20, value_regular=0.01, 24 | ## decay 25 | decay=False, decay_rate=0.9, lstm_enable=False, 26 | ## 27 | path=None): 28 | self.gpu = False 29 | self.env = env 30 | self.gamma = gamma 31 | self.lam = lam 32 | self.ent_coef = ent_coef 33 | self.vf_coef = vf_coef 34 | self.cliprange = cliprange 35 | 36 | self.value_train_step = value_train_round 37 | 38 | self.sample_rollout = running_step 39 | self.sample_ep = running_ep 40 | self.batch_size = batch_size 41 | self.lstm_enable = lstm_enable 42 | 43 | self.loss_cal = torch.nn.SmoothL1Loss() 44 | 45 | self.policy = policy_model 46 | if value_model == "shared": 47 | self.value = policy_model 48 | elif value_model == "copy": 49 | self.value = deepcopy(policy_model) 50 | else: 51 | self.value = value_model 52 | 53 | self.cost_value = deepcopy(self.value) 54 | 55 | self.dist = make_pdtype(env.action_space, policy_model) 56 | 57 | self.policy_model_optim = Adam(self.policy.parameters(), lr=lr) 58 | self.value_model_optim = Adam(self.value.parameters(), lr=lr, weight_decay=value_regular) 59 | self.cost_value_model_optim = Adam(self.cost_value.parameters(), lr=lr, weight_decay=value_regular) 60 | if decay: 61 | self.policy_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.policy_model_optim, decay_rate, 62 | last_epoch=-1) 63 | self.value_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.value_model_optim, decay_rate, 64 | last_epoch=-1) 65 | self.cost_value_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.value_model_optim, decay_rate, 66 | last_epoch=-1) 67 | 68 | #torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2) 69 | #torch.nn.utils.clip_grad_norm_(self.value.parameters(), 1, norm_type=2) 70 | 71 | super(PPO_LAGRANGIAN_Agent, self).__init__(path) 72 | example_input = Variable(torch.rand((100,)+self.env.observation_space.shape)) 73 | self.writer.add_graph(self.policy, input_to_model=example_input) 74 | 75 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss","cost_value"] 76 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss","cost_value"] 77 | 78 | self.training_round = 0 79 | self.running_step = 0 80 | self.record_sample = None 81 | self.training_step = 0 82 | 83 | self.ui = ui_model() 84 | 
self.ui_optim = Adam(self.ui.parameters(), lr=lr) 85 | 86 | 87 | def update(self, sample): 88 | 89 | returns, advants = get_gae(sample["r"], sample["tr"], sample["value"], self.gamma, 90 | self.lam) 91 | sample["advs"] = advants.unsqueeze(1) 92 | sample["return"] = returns.unsqueeze(1) 93 | 94 | sample["cost"] = [] 95 | for info in sample["info"]: 96 | sample["cost"].append(info["cost"]) 97 | 98 | sample["cost_value"] = self.cost_value.forward(sample["s"]) 99 | 100 | returns, advants = get_gae(sample["cost"], sample["tr"], sample["cost_value"], self.gamma, 101 | self.lam) 102 | sample["cost_advs"] = advants.unsqueeze(1) 103 | sample["cost_return"] = returns.unsqueeze(1) 104 | 105 | 106 | step_len = len(sample["s"]) 107 | if self.lstm_enable: 108 | flagin = [time for time in range(step_len) if sample["tr"][time]==1] 109 | time_round = len(flagin) 110 | array_index = [] 111 | for train_time in range(int(time_round)-1): 112 | array_index.append(range(flagin[train_time], flagin[train_time+1])) 113 | else: 114 | time_round = np.ceil(step_len/self.batch_size) 115 | time_left = time_round*self.batch_size-step_len 116 | array = list(range(step_len)) +list(range(int(time_left))) 117 | array_index = [] 118 | for train_time in range(int(time_round)): 119 | array_index.append(array[train_time * self.batch_size: (train_time + 1) * self.batch_size]) 120 | loss_re, pgloss_re, enloss_re, vfloss_re = [], [], [], [] 121 | 122 | for key in sample.keys(): 123 | temp = torch.stack(list(sample[key]), 0) 124 | if self.gpu: 125 | sample[key] = temp.cuda() 126 | else: 127 | sample[key] = temp 128 | for train_time in range(int(time_round)): 129 | index = array_index[train_time] 130 | # for index in range(step_len): 131 | training_s = sample["s"][index].detach() 132 | training_a = sample["a"][index].detach() 133 | training_r = sample["r"][index].detach() 134 | R = sample["return"][index].detach() 135 | old_value = sample["value"][index].detach() 136 | old_neglogp = sample["logp"][index].detach() 137 | advs = sample["advs"][index].detach() 138 | c_advs = sample["cost_advs"][index].detach() 139 | c_value = sample["cost_value"][index].detach() 140 | cost = sample["cost"][index].detach() 141 | 142 | " CALCULATE THE LOSS" 143 | " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss" 144 | 145 | " the value loss" 146 | value_now = self.value.forward(training_s) 147 | # value loss 148 | value_clip = old_value + torch.clamp(old_value - value_now, min=-self.cliprange, 149 | max=self.cliprange) # Clipped value 150 | vf_loss1 = self.loss_cal(value_now, R) # Unclipped loss 151 | vf_loss2 = self.loss_cal(value_clip, R) # clipped loss 152 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2) 153 | 154 | # generate Policy gradient loss 155 | outcome = self.policy.forward(training_s) 156 | new_policy = self.dist(outcome) 157 | new_neg_lop = new_policy.log_prob(training_a) 158 | ratio = torch.exp(new_neg_lop - old_neglogp) 159 | pg_loss1 = -advs * ratio 160 | pg_loss2 = -advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange) 161 | pg_loss = .5 * torch.max(pg_loss1, pg_loss2).mean() 162 | 163 | # generate Policy gradient loss 164 | c_pg_loss1 = -c_advs * ratio 165 | c_pg_loss2 = -c_advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange) 166 | c_pg_loss = .5 * torch.max(c_pg_loss1, c_pg_loss2).mean() 167 | 168 | 169 | 170 | # entropy 171 | entropy = new_policy.entropy().mean() 172 | loss = pg_loss- self.ui * c_pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef 
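# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): one common way to maintain
# the Lagrange multiplier that weights the cost surrogate in the combined loss
# above. The cost_limit constant and the softplus parameterisation are assumptions
# for this example, not the repository's implementation.
import torch

class LagrangeMultiplier(torch.nn.Module):
    def __init__(self, init=1.0, lr=5e-4, cost_limit=25.0):
        super().__init__()   # nn.Module subclasses must call this before registering parameters
        self.raw = torch.nn.Parameter(torch.tensor(float(init)))
        self.cost_limit = cost_limit
        self.optim = torch.optim.Adam([self.raw], lr=lr)

    def value(self):
        # softplus keeps the effective multiplier non-negative
        return torch.nn.functional.softplus(self.raw)

    def update(self, mean_ep_cost):
        # minimising -lambda * (cost - limit) raises lambda while the observed
        # cost exceeds the limit and lowers it once the constraint is satisfied
        loss = -self.value() * (mean_ep_cost - self.cost_limit)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

# usage example: cost above the limit -> the multiplier grows
lam_mult = LagrangeMultiplier()
lam_mult.update(mean_ep_cost=30.0)
print(float(lam_mult.value()))
# ------------------------------------------------------------------------------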
173 | 174 | self.policy_model_optim.zero_grad() 175 | pg_loss.backward() 176 | self.policy_model_optim.step() 177 | for _ in range(self.value_train_step): 178 | value_now = self.value.forward(training_s) 179 | # value loss 180 | value_clip = old_value + torch.clamp(old_value - value_now, min=-self.cliprange, 181 | max=self.cliprange) # Clipped value 182 | vf_loss1 = self.loss_cal(value_now, R) # Unclipped loss 183 | vf_loss2 = self.loss_cal(value_clip, R) # clipped loss 184 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2) 185 | self.value_model_optim.zero_grad() 186 | vf_loss1.backward() 187 | self.value_model_optim.step() 188 | 189 | cost_now = self.cost_value.forward(training_s) 190 | cost_vloss = self.loss_cal(cost_now, cost) 191 | 192 | self.cost_value_model_optim.zero_grad() 193 | cost_vloss.backward() 194 | self.cost_value_model_optim.step() 195 | 196 | 197 | loss_re = loss.cpu().detach().numpy() 198 | pgloss_re.append(pg_loss.cpu().detach().numpy()) 199 | enloss_re.append(entropy.cpu().detach().numpy()) 200 | vfloss_re.append(vf_loss1.cpu().detach().numpy()) 201 | "training the weights ui" 202 | for i in sample["cost"]: 203 | cost = self.ui*sample["cost_value"] 204 | self.ui_optim.zero_grad() 205 | cost.backward() 206 | self.ui_optim.step() 207 | 208 | 209 | return np.sum(loss_re), {"pg_loss": np.sum(pgloss_re), 210 | "entropy": np.sum(enloss_re), 211 | "vf_loss": np.sum(vfloss_re)} 212 | 213 | def load_weights(self, filepath): 214 | model = torch.load(filepath+"/PPO.pkl") 215 | self.policy.load_state_dict(model["policy"].state_dict()) 216 | self.value.load_state_dict(model["value"].state_dict()) 217 | 218 | 219 | def save_weights(self, filepath, overwrite=False): 220 | torch.save({"policy": self.policy,"value": self.value}, filepath + "/PPO.pkl") 221 | 222 | 223 | def cuda(self): 224 | self.policy.to_gpu() 225 | self.value.to_gpu() 226 | self.loss_cal = self.loss_cal.cuda() 227 | self.gpu = True 228 | 229 | 230 | 231 | 232 | -------------------------------------------------------------------------------- /Torch_rl/algorithm/SPPO.py: -------------------------------------------------------------------------------- 1 | 2 | from Torch_rl.agent.core_policy import Agent_policy_based 3 | import torch.nn as nn 4 | from copy import deepcopy 5 | from Torch_rl.common.distribution import * 6 | from torch.optim import Adam 7 | from torch.autograd import Variable 8 | from Torch_rl.common.memory import ReplayMemory 9 | 10 | 11 | class PPO_Agent(Agent_policy_based): 12 | def __init__(self, env, policy_model, value_model, 13 | lr=5e-4, ent_coef=0.01, vf_coef=0.5, 14 | ## hyper-parawmeter 15 | gamma=0.99, lam=0.95, cliprange=0.2, batch_size=64, value_train_round=200, 16 | running_step=2048, running_ep=20, value_regular=0.01, buffer_size=50000, 17 | ## decay 18 | decay=False, decay_rate=0.9, lstm_enable=False, 19 | ## 20 | path=None): 21 | self.gpu = False 22 | self.env = env 23 | self.gamma = gamma 24 | self.lam = lam 25 | self.ent_coef = ent_coef 26 | self.vf_coef = vf_coef 27 | self.cliprange = cliprange 28 | 29 | self.value_train_step = value_train_round 30 | 31 | self.sample_rollout = running_step 32 | self.sample_ep = running_ep 33 | self.batch_size = batch_size 34 | self.lstm_enable = lstm_enable 35 | self.replay_buffer = ReplayMemory(buffer_size, other_record=["value", "return"]) 36 | 37 | self.loss_cal = torch.nn.SmoothL1Loss() 38 | 39 | self.policy = policy_model 40 | if value_model == "shared": 41 | self.value = policy_model 42 | elif value_model == "copy": 43 | self.value = 
deepcopy(policy_model) 44 | else: 45 | self.value = value_model 46 | 47 | self.dist = make_pdtype(env.action_space, policy_model) 48 | 49 | self.policy_model_optim = Adam(self.policy.parameters(), lr=lr) 50 | self.value_model_optim = Adam(self.value.parameters(), lr=lr, weight_decay=value_regular) 51 | if decay: 52 | self.policy_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.policy_model_optim, decay_rate, 53 | last_epoch=-1) 54 | self.value_model_decay_optim = torch.optim.lr_scheduler.ExponentialLR(self.value_model_optim, decay_rate, 55 | last_epoch=-1) 56 | 57 | #torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2) 58 | #torch.nn.utils.clip_grad_norm_(self.value.parameters(), 1, norm_type=2) 59 | 60 | super(PPO_Agent, self).__init__(path) 61 | #example_input = Variable(torch.rand((100,)+self.env.observation_space.shape)) 62 | #self.writer.add_graph(self.policy, input_to_model=example_input) 63 | 64 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"] 65 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"] 66 | 67 | self.training_round = 0 68 | self.running_step = 0 69 | self.record_sample = None 70 | self.training_step = 0 71 | 72 | 73 | def update(self, sample): 74 | step_len = len(sample["s"]) 75 | for ki in range(step_len): 76 | sample_ = { 77 | "s": sample["s"][ki].cpu().numpy(), 78 | "a": sample["a"][ki].cpu().numpy(), 79 | "r": sample["r"][ki].cpu().numpy(), 80 | "tr": sample["tr"][ki].cpu().numpy(), 81 | "s_": sample["s_"][ki].cpu().numpy(), 82 | "value": sample["value"][ki].cpu().numpy(), 83 | "return": sample["return"][ki].cpu().numpy() 84 | } 85 | self.replay_buffer.push(sample_) 86 | ''' 87 | train the value part 88 | ''' 89 | vfloss_re = [] 90 | for _ in range(self.value_train_step): 91 | tarin_value_sample = self.replay_buffer.sample(self.batch_size) 92 | for key in tarin_value_sample.keys(): 93 | if self.gpu: 94 | tarin_value_sample[key] = tarin_value_sample[key].cuda() 95 | else: 96 | tarin_value_sample[key] = tarin_value_sample[key] 97 | old_value = tarin_value_sample["value"] 98 | training_s = tarin_value_sample["s"] 99 | R = tarin_value_sample["return"] 100 | value_now = self.value.forward(training_s).squeeze() 101 | # value loss 102 | value_clip = old_value + torch.clamp(old_value - value_now, min=-self.cliprange, 103 | max=self.cliprange) # Clipped value 104 | vf_loss1 = self.loss_cal(value_now, R) # Unclipped loss 105 | vf_loss2 = self.loss_cal(value_clip, R) # clipped loss 106 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2) 107 | self.value_model_optim.zero_grad() 108 | vf_loss1.backward() 109 | self.value_model_optim.step() 110 | vfloss_re.append(vf_loss1.cpu().detach().numpy()) 111 | 112 | ''' 113 | train the policy part 114 | ''' 115 | 116 | for key in sample.keys(): 117 | temp = torch.stack(list(sample[key]), 0).squeeze() 118 | if self.gpu: 119 | sample[key] = temp.cuda() 120 | else: 121 | sample[key] = temp 122 | 123 | array_index = [] 124 | time_round = np.ceil(step_len / self.batch_size) 125 | time_left = time_round * self.batch_size - step_len 126 | array = list(range(step_len)) + list(range(int(time_left))) 127 | array_index = [] 128 | for train_time in range(int(time_round)): 129 | array_index.append(array[train_time * self.batch_size: (train_time + 1) * self.batch_size]) 130 | 131 | loss_re, pgloss_re, enloss_re = [], [], [] 132 | for train_time in range(int(time_round)): 133 | index = array_index[train_time] 134 | # for index in range(step_len): 135 | training_s = sample["s"][index].detach() 
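# The minibatch slices below drive the clipped-surrogate policy update:
# ratio = exp(logp_new - logp_old) measures how far the current policy has moved
# from the behaviour policy that collected the batch, clamping the ratio to
# [1 - cliprange, 1 + cliprange] bounds the effective step size, and the entropy
# bonus discourages premature collapse onto a deterministic policy.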
136 | training_a = sample["a"][index].detach() 137 | old_neglogp = sample["logp"][index].detach() 138 | advs = sample["advs"][index].detach() 139 | 140 | " CALCULATE THE LOSS" 141 | " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss" 142 | 143 | #generate Policy gradient loss 144 | outcome = self.policy.forward(training_s).squeeze() 145 | # new_neg_lop = torch.empty(size=(self.batch_size,)) 146 | # for time in range(self.batch_size): 147 | # new_policy = self.dist(outcome[time]) 148 | # new_neg_lop[time] = new_policy.log_prob(training_a[time]) 149 | new_policy = self.dist(outcome) 150 | new_neg_lop = new_policy.log_prob(training_a) 151 | ratio = torch.exp(new_neg_lop - old_neglogp) 152 | pg_loss1 = -advs * ratio 153 | pg_loss2 = -advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange) 154 | pg_loss = .5 * torch.max(pg_loss1, pg_loss2).mean() 155 | 156 | # entropy 157 | entropy = new_policy.entropy().mean() 158 | # loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef 159 | loss = pg_loss - entropy * self.ent_coef 160 | self.policy_model_optim.zero_grad() 161 | loss.backward() 162 | self.policy_model_optim.step() 163 | # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"]) 164 | # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange) 165 | loss_re = loss.cpu().detach().numpy() 166 | pgloss_re.append(pg_loss.cpu().detach().numpy()) 167 | enloss_re.append(entropy.cpu().detach().numpy()) 168 | 169 | return np.sum(loss_re), {"pg_loss": np.sum(pgloss_re), 170 | "entropy": np.sum(enloss_re), 171 | "vf_loss": np.sum(vfloss_re)} 172 | 173 | 174 | def load_weights(self, filepath): 175 | model = torch.load(filepath+"/PPO.pkl") 176 | self.policy.load_state_dict(model["policy"].state_dict()) 177 | self.value.load_state_dict(model["value"].state_dict()) 178 | 179 | 180 | def save_weights(self, filepath, overwrite=False): 181 | torch.save({"policy": self.policy,"value": self.value}, filepath + "/PPO.pkl") 182 | 183 | def policy_behavior_clone(self, sample_): 184 | action_label = sample_["a"].squeeze() 185 | if self.gpu: 186 | action_predict = self.policy(sample_["s"].cuda()) 187 | action_label = action_label.cuda() 188 | else: 189 | action_predict = self.policy(sample_["s"]) 190 | loss_bc = self.loss_cal(action_label, action_predict) 191 | del action_label 192 | del action_predict 193 | loss = loss_bc 194 | self.policy_model_optim.zero_grad() 195 | loss.backward() 196 | nn.utils.clip_grad_norm_(self.policy.parameters(), 1, norm_type=2) 197 | self.policy_model_optim.step() 198 | return loss.cpu().detach().numpy() 199 | 200 | def value_pretrain(self, record_sample, new_sample_len): 201 | train_times = int(np.floor(new_sample_len/128)) 202 | round_loss = 0 203 | for io in range(train_times-1): 204 | index = list(range(128 * io, 128 * (io + 1))) 205 | if self.gpu: 206 | predict = torch.from_numpy(np.array(record_sample["s"])[index]).cuda() 207 | lable = torch.from_numpy(np.array(record_sample["return"]))[index].cuda() 208 | else: 209 | predict = torch.from_numpy(np.array(record_sample["s"])[index]) 210 | lable = torch.from_numpy(np.array(record_sample["return"]))[index] 211 | value_now = self.value.forward(predict) 212 | # value loss 213 | vf_loss = self.loss_cal(value_now, lable) # Unclipped loss 214 | del predict 215 | del lable 216 | self.value_model_optim.zero_grad() 217 | vf_loss.backward() 218 | self.value_model_optim.step() 219 | round_loss += vf_loss.cpu().detach().numpy() 220 | return 
round_loss 221 | 222 | def cuda(self, device=None): 223 | self.policy.to_gpu(device) 224 | self.value.to_gpu(device) 225 | self.loss_cal = self.loss_cal.cuda(device) 226 | self.gpu = True -------------------------------------------------------------------------------- /Torch_rl/algorithm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/algorithm/__init__.py -------------------------------------------------------------------------------- /Torch_rl/common/Policy_for_DQN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def get_object_config(o): 4 | if o is None: 5 | return None 6 | 7 | config = { 8 | 'class_name': o.__class__.__name__, 9 | 'config': o.get_config() 10 | } 11 | return config 12 | 13 | 14 | class Policy(object): 15 | """Abstract base class for all implemented policies. 16 | 17 | Each policy helps with selection of action to take on an environment. 18 | 19 | Do not use this abstract base class directly but instead use one of the concrete policies implemented. 20 | To implement your own policy, you have to implement the following methods: 21 | 22 | - `select_action` 23 | 24 | # Arguments 25 | agent (rl.core.Agent): Agent used 26 | """ 27 | def _set_agent(self, agent): 28 | self.agent = agent 29 | 30 | @property 31 | def metrics_names(self): 32 | return [] 33 | 34 | @property 35 | def metrics(self): 36 | return [] 37 | 38 | def select_action(self, **kwargs): 39 | raise NotImplementedError() 40 | 41 | def get_config(self): 42 | """Return configuration of the policy 43 | 44 | # Returns 45 | Configuration as dict 46 | """ 47 | return {} 48 | 49 | 50 | class LinearAnnealedPolicy(Policy): 51 | """Implement the linear annealing policy 52 | 53 | Linear Annealing Policy computes a current threshold value and 54 | transfers it to an inner policy which chooses the action. The threshold 55 | value is following a linear function decreasing over time.""" 56 | def __init__(self, inner_policy, attr, value_max, value_min, value_test, nb_steps): 57 | if not hasattr(inner_policy, attr): 58 | raise ValueError('Policy does not have attribute "{}".'.format(attr)) 59 | 60 | super(LinearAnnealedPolicy, self).__init__() 61 | 62 | self.inner_policy = inner_policy 63 | self.attr = attr 64 | self.value_max = value_max 65 | self.value_min = value_min 66 | self.value_test = value_test 67 | self.nb_steps = nb_steps 68 | 69 | def get_current_value(self): 70 | """Return current annealing value 71 | 72 | # Returns 73 | Value to use in annealing 74 | """ 75 | if self.agent.training: 76 | # Linear annealed: f(x) = ax + b. 
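            # Slope a = -(value_max - value_min) / nb_steps and intercept b = value_max, so the
            # annealed attribute starts at value_max and decreases linearly with agent.step,
            # floored at value_min by the max() below.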
77 | a = -float(self.value_max - self.value_min) / float(self.nb_steps) 78 | b = float(self.value_max) 79 | value = max(self.value_min, a * float(self.agent.step) + b) 80 | else: 81 | value = self.value_test 82 | return value 83 | 84 | def select_action(self, **kwargs): 85 | """Choose an action to perform 86 | 87 | # Returns 88 | Action to take (int) 89 | """ 90 | setattr(self.inner_policy, self.attr, self.get_current_value()) 91 | return self.inner_policy.select_action(**kwargs) 92 | 93 | @property 94 | def metrics_names(self): 95 | """Return names of metrics 96 | 97 | # Returns 98 | List of metric names 99 | """ 100 | return ['mean_{}'.format(self.attr)] 101 | 102 | @property 103 | def metrics(self): 104 | """Return metrics values 105 | 106 | # Returns 107 | List of metric values 108 | """ 109 | 110 | return [getattr(self.inner_policy, self.attr)] 111 | 112 | def get_config(self): 113 | """Return configurations of LinearAnnealedPolicy 114 | 115 | # Returns 116 | Dict of config 117 | """ 118 | config = super(LinearAnnealedPolicy, self).get_config() 119 | config['attr'] = self.attr 120 | config['value_max'] = self.value_max 121 | config['value_min'] = self.value_min 122 | config['value_test'] = self.value_test 123 | config['nb_steps'] = self.nb_steps 124 | config['inner_policy'] = get_object_config(self.inner_policy) 125 | return config 126 | 127 | class SoftmaxPolicy(Policy): 128 | """ Implement softmax policy for multinimial distribution 129 | 130 | Simple Policy 131 | 132 | - takes action according to the pobability distribution 133 | 134 | """ 135 | def select_action(self, nb_actions, probs): 136 | """Return the selected action 137 | 138 | # Arguments 139 | probs (np.ndarray) : Probabilty for each action 140 | 141 | # Returns 142 | action 143 | 144 | """ 145 | action = np.random.choice(range(nb_actions), p=probs) 146 | return action 147 | 148 | class EpsGreedyQPolicy(Policy): 149 | """Implement the epsilon greedy policy 150 | 151 | Eps Greedy policy either: 152 | 153 | - takes a random action with probability epsilon 154 | - takes current best action with prob (1 - epsilon) 155 | """ 156 | def __init__(self, eps=.1): 157 | super(EpsGreedyQPolicy, self).__init__() 158 | self.eps = eps 159 | 160 | def select_action(self, q_values): 161 | """Return the selected action 162 | 163 | # Arguments 164 | q_values (np.ndarray): List of the estimations of Q for each action 165 | 166 | # Returns 167 | Selection action 168 | """ 169 | assert q_values.ndim == 1 170 | nb_actions = q_values.shape[0] 171 | 172 | if np.random.uniform() < self.eps: 173 | action = np.random.randint(0, nb_actions) 174 | else: 175 | action = np.argmax(q_values) 176 | return action 177 | 178 | def get_config(self): 179 | """Return configurations of EpsGreedyQPolicy 180 | 181 | # Returns 182 | Dict of config 183 | """ 184 | config = super(EpsGreedyQPolicy, self).get_config() 185 | config['eps'] = self.eps 186 | return config 187 | 188 | 189 | class GreedyQPolicy(Policy): 190 | """Implement the greedy policy 191 | 192 | Greedy policy returns the current best action according to q_values 193 | """ 194 | def select_action(self, q_values): 195 | """Return the selected action 196 | 197 | # Arguments 198 | q_values (np.ndarray): List of the estimations of Q for each action 199 | 200 | # Returns 201 | Selection action 202 | """ 203 | assert q_values.ndim == 1 204 | action = np.argmax(q_values) 205 | return action 206 | 207 | 208 | 209 | class BoltzmannQPolicy(Policy): 210 | """Implement the Boltzmann Q Policy 211 | 212 | Boltzmann Q 
Policy builds a probability law on q values and returns 213 | an action selected randomly according to this law. 214 | """ 215 | def __init__(self, tau=1., clip=(-500., 500.)): 216 | super(BoltzmannQPolicy, self).__init__() 217 | self.tau = tau 218 | self.clip = clip 219 | 220 | def select_action(self, q_values): 221 | """Return the selected action 222 | 223 | # Arguments 224 | q_values (np.ndarray): List of the estimations of Q for each action 225 | 226 | # Returns 227 | Selection action 228 | """ 229 | assert q_values.ndim == 1 230 | q_values = q_values.astype('float64') 231 | nb_actions = q_values.shape[0] 232 | 233 | exp_values = np.exp(np.clip(q_values / self.tau, self.clip[0], self.clip[1])) 234 | probs = exp_values / np.sum(exp_values) 235 | action = np.random.choice(range(nb_actions), p=probs) 236 | return action 237 | 238 | def get_config(self): 239 | """Return configurations of BoltzmannQPolicy 240 | 241 | # Returns 242 | Dict of config 243 | """ 244 | config = super(BoltzmannQPolicy, self).get_config() 245 | config['tau'] = self.tau 246 | config['clip'] = self.clip 247 | return config 248 | 249 | 250 | class MaxBoltzmannQPolicy(Policy): 251 | """ 252 | A combination of the eps-greedy and Boltzman q-policy. 253 | 254 | Wiering, M.: Explorations in Efficient Reinforcement Learning. 255 | PhD thesis, University of Amsterdam, Amsterdam (1999) 256 | 257 | https://pure.uva.nl/ws/files/3153478/8461_UBA003000033.pdf 258 | """ 259 | def __init__(self, eps=.1, tau=1., clip=(-500., 500.)): 260 | super(MaxBoltzmannQPolicy, self).__init__() 261 | self.eps = eps 262 | self.tau = tau 263 | self.clip = clip 264 | 265 | def select_action(self, q_values): 266 | """Return the selected action 267 | The selected action follows the BoltzmannQPolicy with probability epsilon 268 | or return the Greedy Policy with probability (1 - epsilon) 269 | 270 | # Arguments 271 | q_values (np.ndarray): List of the estimations of Q for each action 272 | 273 | # Returns 274 | Selection action 275 | """ 276 | assert q_values.ndim == 1 277 | q_values = q_values.astype('float64') 278 | nb_actions = q_values.shape[0] 279 | 280 | if np.random.uniform() < self.eps: 281 | exp_values = np.exp(np.clip(q_values / self.tau, self.clip[0], self.clip[1])) 282 | probs = exp_values / np.sum(exp_values) 283 | action = np.random.choice(range(nb_actions), p=probs) 284 | else: 285 | action = np.argmax(q_values) 286 | return action 287 | 288 | def get_config(self): 289 | """Return configurations of MaxBoltzmannQPolicy 290 | 291 | # Returns 292 | Dict of config 293 | """ 294 | config = super(MaxBoltzmannQPolicy, self).get_config() 295 | config['eps'] = self.eps 296 | config['tau'] = self.tau 297 | config['clip'] = self.clip 298 | return config 299 | 300 | 301 | class BoltzmannGumbelQPolicy(Policy): 302 | """Implements Boltzmann-Gumbel exploration (BGE) adapted for Q learning 303 | based on the paper Boltzmann Exploration Done Right 304 | (https://arxiv.org/pdf/1705.10257.pdf). 305 | 306 | BGE is invariant with respect to the mean of the rewards but not their 307 | variance. The parameter C, which defaults to 1, can be used to correct for 308 | this, and should be set to the least upper bound on the standard deviation 309 | of the rewards. 310 | 311 | BGE is only available for training, not testing. 
For testing purposes, you 312 | can achieve approximately the same result as BGE after training for N steps 313 | on K actions with parameter C by using the BoltzmannQPolicy and setting 314 | tau = C/sqrt(N/K).""" 315 | 316 | def __init__(self, C=1.0): 317 | assert C > 0, "BoltzmannGumbelQPolicy C parameter must be > 0, not " + repr(C) 318 | super(BoltzmannGumbelQPolicy, self).__init__() 319 | self.C = C 320 | self.action_counts = None 321 | 322 | def select_action(self, q_values): 323 | """Return the selected action 324 | 325 | # Arguments 326 | q_values (np.ndarray): List of the estimations of Q for each action 327 | 328 | # Returns 329 | Selection action 330 | """ 331 | # We can't use BGE during testing, since we don't have access to the 332 | # action_counts at the end of training. 333 | assert self.agent.training, "BoltzmannGumbelQPolicy should only be used for training, not testing" 334 | 335 | assert q_values.ndim == 1, q_values.ndim 336 | q_values = q_values.astype('float64') 337 | 338 | # If we are starting training, we should reset the action_counts. 339 | # Otherwise, action_counts should already be initialized, since we 340 | # always do so when we begin training. 341 | if self.agent.step == 0: 342 | self.action_counts = np.ones(q_values.shape) 343 | assert self.action_counts is not None, self.agent.step 344 | assert self.action_counts.shape == q_values.shape, (self.action_counts.shape, q_values.shape) 345 | 346 | beta = self.C/np.sqrt(self.action_counts) 347 | Z = np.random.gumbel(size=q_values.shape) 348 | 349 | perturbation = beta * Z 350 | perturbed_q_values = q_values + perturbation 351 | action = np.argmax(perturbed_q_values) 352 | 353 | self.action_counts[action] += 1 354 | return action 355 | 356 | def get_config(self): 357 | """Return configurations of BoltzmannGumbelQPolicy 358 | 359 | # Returns 360 | Dict of config 361 | """ 362 | config = super(BoltzmannGumbelQPolicy, self).get_config() 363 | config['C'] = self.C 364 | return config 365 | -------------------------------------------------------------------------------- /Torch_rl/common/distribution.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def make_pdtype(ac_space,actor): 5 | from gym import spaces 6 | if isinstance(ac_space, spaces.Box): 7 | shape = ac_space.shape[0] 8 | layer_infor = [] 9 | for name, param in actor.named_parameters(): 10 | if "weight" in name: 11 | layer_infor.append(list(param.size())) 12 | output_layer = layer_infor[-1][0] 13 | return DiagGaussianPd_type(shape, output_layer) 14 | elif isinstance(ac_space, spaces.Discrete): 15 | return CategoricalPd(ac_space.n, actor) 16 | elif isinstance(ac_space, spaces.MultiDiscrete): 17 | return MultiCategoricalPd(ac_space.nvec) 18 | elif isinstance(ac_space, spaces.MultiBinary): 19 | return BernoulliPd(ac_space.n, actor) 20 | else: 21 | raise NotImplementedError 22 | 23 | 24 | class DiagGaussianPd_type(): 25 | def __init__(self, shape, output_layer): 26 | self.shape = shape 27 | self.output_layer = output_layer 28 | 29 | def __call__(self, output, *args, **kwargs): 30 | if self.output_layer == self.shape: 31 | self.mean = output 32 | self.logstd = torch.ones_like(output) 33 | elif self.output_layer == self.shape*2: 34 | self.mean = torch.index_select(output.cpu(), -1, torch.arange(0, self.shape)) 35 | self.logstd = torch.index_select(output.cpu(), -1, torch.arange(self.shape, self.shape*2)) 36 | self.std = torch.exp(self.logstd) 37 | return DiagGaussianPd(self.mean, 
self.std) 38 | 39 | 40 | 41 | class Pd(object): 42 | """ 43 | A particular probability distribution 44 | """ 45 | def log_prob(self, x): 46 | return torch.sum(self.pd.log_prob(x), dim=-1, keepdim=True) 47 | 48 | def sample(self): 49 | return self.pd.sample() 50 | 51 | def neglogp(self, x): 52 | return -self.pd.log_prob(x) 53 | 54 | def kl(self, other): 55 | return torch.distributions.kl.kl_vergence(self.pd, other) 56 | 57 | def entropy(self): 58 | return self.pd.entropy() 59 | 60 | 61 | 62 | class DiagGaussianPd(Pd): 63 | def __init__(self, mean, std): 64 | from torch.distributions import Normal 65 | self.pd = Normal(mean, std) 66 | 67 | 68 | class CategoricalPd(Pd): 69 | 70 | 71 | def neglogp(self, x): 72 | # Usually it's easier to define the negative logprob 73 | raise NotImplementedError 74 | 75 | def kl(self, other): 76 | raise NotImplementedError 77 | 78 | def entropy(self): 79 | raise NotImplementedError 80 | 81 | def sample(self): 82 | raise NotImplementedError 83 | class MultiCategoricalPd(Pd): 84 | def flatparam(self): 85 | raise NotImplementedError 86 | 87 | def mode(self): 88 | raise NotImplementedError 89 | 90 | def neglogp(self, x): 91 | # Usually it's easier to define the negative logprob 92 | raise NotImplementedError 93 | 94 | def kl(self, other): 95 | raise NotImplementedError 96 | 97 | def entropy(self): 98 | raise NotImplementedError 99 | 100 | def sample(self): 101 | raise NotImplementedError 102 | 103 | class BernoulliPd(Pd): 104 | def flatparam(self): 105 | raise NotImplementedError 106 | 107 | def mode(self): 108 | raise NotImplementedError 109 | 110 | def neglogp(self, x): 111 | # Usually it's easier to define the negative logprob 112 | raise NotImplementedError 113 | 114 | def kl(self, other): 115 | raise NotImplementedError 116 | 117 | def entropy(self): 118 | raise NotImplementedError 119 | 120 | def sample(self): 121 | raise NotImplementedError -------------------------------------------------------------------------------- /Torch_rl/common/loss.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | 5 | def huber_loss(x, delta=1.0): 6 | """Reference: https://en.wikipedia.org/wiki/Huber_loss""" 7 | return torch.where( 8 | torch.abs(x) < delta, 9 | torch.pow(x, 2) * 0.5, 10 | delta * (torch.abs(x) - 0.5 * delta) 11 | ) 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /Torch_rl/common/memory.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import random 4 | import numpy as np 5 | from abc import ABC 6 | from copy import deepcopy 7 | from collections import deque 8 | 9 | class Memory(ABC): 10 | 11 | def __init__(self, capacity, other_record=None): 12 | self.capacity = capacity 13 | self.memory = {"s": deque(maxlen=capacity), 14 | "a": deque(maxlen=capacity), 15 | "s_": deque(maxlen=capacity), 16 | "r": deque(maxlen=capacity), 17 | "tr": deque(maxlen=capacity)} 18 | if other_record is not None: 19 | for key in other_record: 20 | self.memory[key] = [] 21 | self.position = 0 22 | 23 | def push(self, sample): 24 | raise NotImplementedError() 25 | 26 | def sample(self, batch_size): 27 | raise NotImplementedError() 28 | 29 | class ReplayMemory(Memory): 30 | def __init__(self, capacity, other_record=None): 31 | super(ReplayMemory, self).__init__(capacity, other_record=other_record) 32 | 33 | def push(self, sample): 34 | """Saves a transition.""" 35 | for key in sample.keys(): 36 | 
self.memory[key].append(sample[key]) 37 | self.position = (self.position + 1) % self.capacity 38 | def sample(self, batch_size): 39 | sample_index = random.sample(range(len(self.memory["s"])), batch_size) 40 | sample = {} 41 | for key in self.memory.keys(): 42 | sample[key] = [] 43 | for key in self.memory.keys(): 44 | for index in sample_index: 45 | sample[key].append(self.memory[key][index]) 46 | sample[key] = np.array(sample[key],dtype=np.float32) 47 | sample[key] = torch.from_numpy(sample[key]) 48 | return sample 49 | 50 | def recent_step_sample(self, batch_size): 51 | sample = {} 52 | for key in self.memory.keys(): 53 | sample[key] = [] 54 | for key in self.memory.keys(): 55 | sample[key] = self.memory[key][-batch_size:] 56 | sample[key] = np.array(sample[key], dtype=np.float32) 57 | sample[key] = torch.from_numpy(sample[key]) 58 | return sample 59 | 60 | def sample_episode(self): 61 | flag = [] 62 | for f,i in enumerate(self.memory["tr"]): 63 | if i: 64 | flag.append(f) 65 | end_place = 0 66 | while end_place == 0: 67 | end_place = random.choice(list(range(len(flag)))) 68 | sample = {} 69 | for key in self.memory.keys(): 70 | sample[key] = [] 71 | for key in self.memory.keys(): 72 | sample[key] = self.memory[key][flag[end_place-1]:flag[end_place]] 73 | sample[key] = np.array(sample[key], dtype=np.float32) 74 | sample[key] = torch.from_numpy(sample[key]) 75 | return sample 76 | 77 | def sample_fragment(self,length): 78 | end_place = 0 79 | while end_place < 0: 80 | end_place = random.choice(list(range(self.position))) 81 | sample = {} 82 | for key in self.memory.keys(): 83 | sample[key] = [] 84 | for key in self.memory.keys(): 85 | sample[key] = self.memory[key][end_place-length:end_place] 86 | sample[key] = np.array(sample[key], dtype=np.float32) 87 | sample[key] = torch.from_numpy(sample[key]) 88 | return sample 89 | 90 | 91 | 92 | def __len__(self): 93 | return len(self.memory) 94 | 95 | 96 | class ReplayMemory_HIRO(Memory): 97 | def __init__(self, capacity, other_record=None): 98 | super(ReplayMemory, self).__init__(capacity, other_record) 99 | self.memory = {"s": [],"g":[], "a": [], "s_": [], "r": [], "tr": []} 100 | def push(self, sample): 101 | """Saves a transition.""" 102 | for key in self.memory.keys(): 103 | self.memory[key].append(sample[key]) 104 | if len(self.memory["s"]) > self.capacity: 105 | for key in self.memory.keys(): 106 | del self.memory[key][0] 107 | self.position = (self.position + 1) % self.capacity 108 | 109 | def sample(self, batch_size): 110 | sample_index = random.sample(range(len(self.memory["s"])), batch_size) 111 | sample = {"s": [], "a": [], "s_": [], "r": [], "tr": []} 112 | for key in sample.keys(): 113 | for index in sample_index: 114 | if key == "s": 115 | temp = np.array(self.memory["s"][index]+self.memory["g"][index], dtype=np.float32) 116 | sample[key].append(torch.from_numpy(temp)) 117 | else: 118 | temp = np.array(self.memory[key][index], dtype=np.float32) 119 | sample[key].append(torch.from_numpy(temp)) 120 | return sample 121 | def H_sample(self, batch_size): 122 | sample_index = random.sample(range(len(self.memory["s"])), batch_size) 123 | sample = {"s": [], "g": [], "s_": [], "r": [], "tr": []} 124 | for key in sample.keys(): 125 | for index in sample_index: 126 | temp = np.array(self.memory[key][index], dtype=np.float32) 127 | sample[key].append(torch.from_numpy(temp)) 128 | return sample 129 | 130 | 131 | class ReplayMemory_Sequence(): 132 | def __init__(self, capacity, max_seq_len, other_record=None): 133 | self.capacity = capacity 134 
| Sequence = {"s": [], 135 | "a": [], 136 | "s_": [], 137 | "r": [], 138 | "tr": []} 139 | if other_record is not None: 140 | for key in other_record: 141 | Sequence[key] = [] 142 | self.Sequence = Sequence 143 | self.memory = [deepcopy(Sequence)] 144 | self.position = 0 145 | self.max_position = 0 146 | self.max_seq_len = max_seq_len 147 | self.batch = 32 148 | self.sequence_len = 100 149 | 150 | def push(self, sample): 151 | """Saves a transition.""" 152 | for key in self.memory[self.position].keys(): 153 | if sample[key] is np.ndarray: 154 | self.memory[self.position][key].append(sample[key]) 155 | else: 156 | self.memory[self.position][key].append(np.array(sample[key])) 157 | if sample["tr"] == 1: 158 | self.position = (self.position + 1) % self.capacity 159 | if self.max_position <= self.capacity: 160 | self.memory.append(deepcopy(self.Sequence)) 161 | self.max_position += 1 162 | 163 | def sample_ep(self, batch_size=None): 164 | if batch_size is not None: 165 | self.batch_size = batch_size 166 | sample_index = random.sample(range(self.max_position), self.batch_size) 167 | sample = {} 168 | for key in self.Sequence.keys(): 169 | sample[key] = torch.empty((self.max_seq_len, self.batch_size, self.memory[0][key][0].shape[0]), dtype=torch.float32) 170 | for flag, index in enumerate(sample_index): 171 | ep_len = len(self.memory[index]['s']) 172 | for time_step in range(self.max_seq_len): 173 | if ep_len > self.max_seq_len: 174 | print("the memory size is longer than max_seq_len") 175 | sample[key][time_step, flag, :] = torch.from_numpy(self.memory[index][key][time_step]) 176 | elif ep_len == self.max_seq_len: 177 | sample[key][time_step, flag, :] = torch.from_numpy(self.memory[index][key][time_step]) 178 | else: 179 | for time_step in range(ep_len): 180 | sample[key][time_step, flag, :] = torch.from_numpy(self.memory[index][key][time_step]) 181 | for time_step in range(ep_len, self.max_seq_len): 182 | sample[key][time_step, flag, :] = torch.zeros(size=len(self.memory[index][key][0])) 183 | return sample 184 | 185 | def sample_sequence(self, batch_size=None, sequence_len=None): 186 | if batch_size is not None: 187 | self.batch_size = batch_size 188 | if sequence_len is not None: 189 | self.sequence_len = sequence_len 190 | sample = {} 191 | for key in self.Sequence.keys(): 192 | temp_len = self.memory[0][key][0].size 193 | sample[key] = torch.empty((self.sequence_len, self.batch_size, temp_len), dtype=torch.float32) 194 | for loop in range(self.batch_size): 195 | index = random.sample(range(self.max_position), 1)[0] 196 | ep_len = len(self.memory[index]['s']) 197 | if ep_len <= self.sequence_len: 198 | for time_step in range(ep_len): 199 | sample[key][time_step, loop, :] = \ 200 | torch.from_numpy(self.memory[index][key][time_step]) 201 | for time_step in range(ep_len, self.sequence_len): 202 | sample[key][time_step, loop, :] = torch.zeros(temp_len) 203 | else: 204 | start_ = random.sample(range(0, ep_len - self.sequence_len), 1)[0] 205 | end_ = start_ + self.sequence_len 206 | for (time_step, time) in enumerate(range(start_, end_)): 207 | sample[key][time_step, loop, :] = \ 208 | torch.from_numpy(self.memory[index][key][time_step]) 209 | return sample 210 | 211 | def recent_ep_sample(self): 212 | return self.memory[self.position] 213 | 214 | def __len__(self): 215 | return len(self.memory) -------------------------------------------------------------------------------- /Torch_rl/common/util.py: -------------------------------------------------------------------------------- 1 | import csv 
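# csv_record appends one row per call to <path>record.csv; gae() implements Generalized Advantage
# Estimation, delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t), accumulated with factor
# gamma * lam; the return is advs + value, and the advantages are then mean/std-normalized in place.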
2 | def csv_record(data,path): 3 | with open(path+"record.csv", "a+") as csvfile: 4 | writer = csv.writer(csvfile) 5 | writer.writerow(data) 6 | 7 | import torch 8 | def gae(sample, last_value, gamma, lam): 9 | running_step = len(sample["s"]) 10 | sample["advs"] = torch.zeros((running_step), dtype=torch.float32) 11 | value_cal = torch.cat(sample["value"]).squeeze() 12 | 13 | last_gaelam = 0 14 | last_return = 0 15 | 16 | value = torch.cat((value_cal, last_value)) 17 | for t in reversed(range(running_step)): 18 | # sample["return"][t] = last_return = sample["r"][t] + gamma * last_return * (1-sample["tr"][t]) 19 | delta = sample["r"][t] + gamma * value[t+1] * (1-sample["tr"][t]) - value[t] 20 | last_gaelam = delta + gamma * lam * (1-sample["tr"][t]) * last_gaelam 21 | sample["advs"][t] = last_gaelam 22 | sample["return"] = sample["advs"]+value_cal 23 | 24 | adv = sample["advs"] # Normalize the advantages 25 | adv = (adv - torch.mean(adv))/(torch.std(adv)+1e-8) 26 | sample["advs"] = adv 27 | # mean_ep_reward = torch.sum(sample["r"])/torch.sum(torch.eq(sample["tr"],1)) 28 | # print("the runner have sampled "+str(running_step)+" data and the mean_ep_reward is ", mean_ep_reward) 29 | return sample 30 | 31 | def generate_reture(sample, last_value, gamma, lam): 32 | running_step = sample["s"].size()[0] 33 | sample["advs"] = torch.zeros((running_step, 1), dtype=torch.float32) 34 | sample["return"] = torch.zeros((running_step, 1), dtype=torch.float32) 35 | 36 | last_return = 0 37 | for t in reversed(range(running_step)): 38 | sample["return"][t] = last_return = sample["r"][t] + gamma * last_return * (1 - sample["tr"][t]) 39 | 40 | r = sample["return"] 41 | r = (r - torch.mean(r)) / (torch.std(r) + 1e-8) 42 | sample["return"] = r 43 | sample["advs"] = sample["return"]-sample["value"] 44 | mean_ep_reward = torch.sum(sample["r"]) / torch.sum(torch.eq(sample["tr"], 1)) 45 | print("the runner have sampled " + str(running_step) + " data and the mean_ep_reward is ", mean_ep_reward) 46 | return sample 47 | 48 | 49 | 50 | def get_gae(rewards, masks, values, gamma, lamda): 51 | rewards = torch.Tensor(rewards) 52 | masks = torch.Tensor(masks) 53 | returns = torch.zeros_like(rewards) 54 | advants = torch.zeros_like(rewards) 55 | 56 | running_returns = 0 57 | previous_value = 0 58 | running_advants = 0 59 | 60 | for t in reversed(range(0, len(rewards))): 61 | running_returns = rewards[t] + gamma * running_returns * (1-masks[t]) 62 | running_tderror = rewards[t] + gamma * previous_value * (1-masks[t]) - \ 63 | values[t] 64 | running_advants = running_tderror + gamma * lamda * \ 65 | running_advants * (1-masks[t]) 66 | 67 | returns[t] = running_returns 68 | previous_value = values[t] 69 | advants[t] = running_advants 70 | 71 | advants = (advants - advants.mean()) / advants.std() 72 | return returns, advants -------------------------------------------------------------------------------- /Torch_rl/example/agent_example/RUN_Catrpole_with_DQN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import time 3 | from Torch_rl.agent.DQN import DQN_Agent 4 | from Torch_rl.model.Network import DenseNet 5 | from torch import nn 6 | from Torch_rl.common.Policy_for_DQN import MaxBoltzmannQPolicy 7 | #%% 8 | envID = "CartPole-v0" 9 | env = gym.make(envID) 10 | 11 | nowtime = time.strftime('%y%m%d%H%M',time.localtime()) 12 | path = "savedate" + '/' + envID + "dqn" + nowtime+'/' 13 | #%% 14 | 15 | policy = MaxBoltzmannQPolicy() 16 | model = 
DenseNet(env.observation_space.shape[0], env.action_space.n, hidden_activate=nn.Tanh()) 17 | 18 | Agent = DQN_Agent(env, model, policy, gamma=0.99, lr=1e-3, path=path) 19 | 20 | Agent.train(max_step=50000, render=False, verbose=2) 21 | Agent.test(max_step=10000, render=True, verbose=2) 22 | -------------------------------------------------------------------------------- /Torch_rl/example/agent_example/RUN_Pendulum_with_PPO.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import time 3 | from Torch_rl import PPO 4 | from Torch_rl.model.Network import DenseNet 5 | from torch import nn 6 | 7 | #%% 8 | envID ="Pendulum-v0" 9 | env = gym.make(envID) 10 | 11 | nowtime = time.strftime('%y%m%d%H%M',time.localtime()) 12 | path = "../savedate" + '/' + envID + "ppo" + nowtime+'/' 13 | #%% 14 | policy_model = DenseNet(env.observation_space.shape[0], env.action_space.shape[0], 15 | hidden_activate=nn.Tanh(), hidden_layer=[64, 64]) 16 | value_model = DenseNet(env.observation_space.shape[0], 1, 17 | hidden_activate=nn.Tanh(), hidden_layer=[64, 64]) 18 | 19 | Agent = PPO(env, policy_model, value_model, gamma=0.90, 20 | lr=1e-4, running_step=512, batch_size=64, value_train_round=10, path=path) 21 | 22 | Agent.train(max_step=1500000, render=False, verbose=0, record_ep_inter=1) 23 | Agent.test(max_step=10000, render=True, verbose=2) 24 | -------------------------------------------------------------------------------- /Torch_rl/example/agent_example/RUN_Pendulum_with_TD3.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import time 3 | from Torch_rl import TD3 4 | from Torch_rl.model.Network import DenseNet 5 | from torch import nn 6 | #%% 7 | envID = "Pendulum-v0" 8 | env = gym.make(envID) 9 | 10 | nowtime = time.strftime('%y%m%d%H%M', time.localtime()) 11 | path = "savedate" + '/' + envID + "-ddpg-" + nowtime+'/' 12 | #%% 13 | actor = DenseNet(env.observation_space.shape[0], env.action_space.shape[0], 14 | hidden_activate=nn.ReLU(), hidden_layer=[64, 64]) 15 | critic = DenseNet(env.observation_space.shape[0]+env.action_space.shape[0], 1, 16 | hidden_activate=nn.ReLU(), hidden_layer=[64, 64]) 17 | Agent = TD3(env, actor, critic, gamma=0.99, path=path) 18 | 19 | Agent.train(max_step=50000, render=False, verbose=2) 20 | Agent.test(max_step=10000, render=True, verbose=2) 21 | 22 | 23 | -------------------------------------------------------------------------------- /Torch_rl/example/agent_example/RUN_mountaincar_with_DQN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import time 3 | from Torch_rl import DQN 4 | from Torch_rl.model.Network import DenseNet 5 | from torch import nn 6 | from Torch_rl.common.Policy_for_DQN import EpsGreedyQPolicy 7 | #%% 8 | envID = "MountainCar-v0" 9 | env = gym.make(envID) 10 | 11 | nowtime = time.strftime('%y%m%d%H%M',time.localtime()) 12 | path = "savedate" + '/' + envID + "-dqn-" + nowtime+'/' 13 | #%% 14 | 15 | policy = EpsGreedyQPolicy() 16 | model = DenseNet(env.observation_space.shape[0], env.action_space.n, hidden_activate=nn.Tanh()) 17 | 18 | Agent = DQN(env, model, policy, gamma=0.90, lr=1e-3, path=path) 19 | 20 | Agent.train(max_step=100000, render=False, verbose=2) 21 | Agent.save_weights(path) 22 | Agent.test(max_step=10000, render=True, verbose=2) 23 | -------------------------------------------------------------------------------- /Torch_rl/example/agent_example/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/example/agent_example/__init__.py -------------------------------------------------------------------------------- /Torch_rl/example/algorithm_example/RUN_Pendulum_with_PPO_LSTM.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import time 3 | from Torch_rl import PPO 4 | from Torch_rl.model.Network import DenseNet, LSTM_Dense_Hin 5 | from torch import nn 6 | 7 | #%% 8 | envID ="Pendulum-v0" 9 | env = gym.make(envID) 10 | 11 | nowtime = time.strftime('%y%m%d%H%M',time.localtime()) 12 | path = "../savedate" + '/' + envID + "ppo" + nowtime+'/' 13 | #%% 14 | policy_model = LSTM_Dense_Hin(env.observation_space.shape[0], env.action_space.shape[0], 15 | lstm_unit=64, lstm_layer=1, dense_layer=[64], 16 | hidden_activate=nn.Tanh()) 17 | value_model = DenseNet(env.observation_space.shape[0], 1, 18 | hidden_activate=nn.Tanh(), hidden_layer=[64, 64]) 19 | 20 | Agent = PPO(env, policy_model, value_model, gamma=0.90, 21 | lr=1e-4, running_step=2048, batch_size=64, value_train_round=10, path=path, lstm_enable= True) 22 | 23 | Agent.train(max_step=1500000, render=False, verbose=0, record_ep_inter=1) 24 | Agent.test(max_step=10000, render=True, verbose=2) 25 | -------------------------------------------------------------------------------- /Torch_rl/example/algorithm_example/RUN_Pendulum_with_PPO_largrangian.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import time 3 | from Torch_rl.algorithm.PPO_Lagrangian import PPO_LAGRANGIAN_Agent 4 | from Torch_rl.model.Network import DenseNet, LSTM_Dense_Hin 5 | from torch import nn 6 | 7 | #%% 8 | envID ="Pendulum-v0" 9 | env = gym.make(envID) 10 | 11 | nowtime = time.strftime('%y%m%d%H%M',time.localtime()) 12 | path = "../savedate" + '/' + envID + "ppo" + nowtime+'/' 13 | #%% 14 | policy_model = LSTM_Dense_Hin(env.observation_space.shape[0], env.action_space.shape[0], 15 | lstm_unit=64, lstm_layer=1, dense_layer=[64], 16 | hidden_activate=nn.Tanh()) 17 | value_model = DenseNet(env.observation_space.shape[0], 1, 18 | hidden_activate=nn.Tanh(), hidden_layer=[64, 64]) 19 | 20 | Agent = PPO_LAGRANGIAN_Agent(env, policy_model, value_model, gamma=0.90, 21 | lr=1e-4, running_step=2048, batch_size=64, value_train_round=10, path=path, lstm_enable= True) 22 | 23 | Agent.train(max_step=1500000, render=False, verbose=0, record_ep_inter=1) 24 | Agent.test(max_step=10000, render=True, verbose=2) 25 | -------------------------------------------------------------------------------- /Torch_rl/example/algorithm_example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/example/algorithm_example/__init__.py -------------------------------------------------------------------------------- /Torch_rl/model/GNN_layer.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | import scipy.sparse as sp 7 | from torch.nn.parameter import Parameter 8 | from torch.nn.modules.module import Module 9 | from scipy.sparse import coo_matrix 10 | import torch.nn.functional as F 11 | 12 | 13 | " The GCN layer" 14 | def normalize(adj): 15 | 
"""Row-normalize sparse matrix""" 16 | rowsum = np.array(adj.sum(1)) 17 | r_inv = np.power(rowsum, -1).flatten() 18 | r_inv[np.isinf(r_inv)] = 0. 19 | r_mat_inv = np.diag(r_inv) 20 | mx = r_mat_inv.dot(adj) 21 | return mx 22 | 23 | def normalize_sparse(adj): 24 | """Symmetrically normalize adjacency matrix.""" 25 | adj = sp.coo_matrix(adj) 26 | rowsum = np.array(adj.sum(1)) 27 | d_inv_sqrt = np.power(rowsum, -0.5).flatten() 28 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 29 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 30 | return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo() 31 | 32 | class GraphConvolution(Module): 33 | def __init__(self, adj, in_features, out_features, 34 | activate=nn.ReLU(), sparse_inputs=False, chebyshev_polynomials = 0, dropout = 0.5, bias=True): 35 | super(GraphConvolution, self).__init__() 36 | self.in_features = in_features 37 | self.out_features = out_features 38 | self.activate = activate 39 | self.weight = Parameter(torch.FloatTensor(in_features, out_features)) 40 | if bias: 41 | self.bias = Parameter(torch.FloatTensor(out_features)) 42 | else: 43 | self.register_parameter('bias', None) 44 | self.reset_parameters() 45 | 46 | 47 | if sparse_inputs: 48 | adj = adj.toarray() 49 | if chebyshev_polynomials>0: 50 | T_K = [] 51 | T_K.append(np.eye(adj.shape[0])) 52 | laplacian = np.eye(adj.shape[0]) - normalize(adj) 53 | largest_eigval, _ = np.linalg.eig(laplacian) 54 | scaled_laplacian = (2. / largest_eigval[0]) * laplacian - np.eye(adj.shape[0]) 55 | T_K.append(scaled_laplacian) 56 | for i in range(2, chebyshev_polynomials+1): 57 | T_K.append(2 * np.dot(scaled_laplacian,T_K[-1])-T_K[-2]) 58 | self.T_k = T_K 59 | else: 60 | self.T_k = [normalize(adj)] 61 | if sparse_inputs: 62 | self.adj = [coo_matrix(T) for T in self.T_k] 63 | else: 64 | self.adj = self.T_k 65 | 66 | def reset_parameters(self): 67 | stdv = 1. 
/ math.sqrt(self.weight.size(1)) 68 | self.weight.data.uniform_(-stdv, stdv) 69 | if self.bias is not None: 70 | self.bias.data.uniform_(-stdv, stdv) 71 | 72 | def forward(self, input): 73 | support = torch.mm(input, self.weight) 74 | output = torch.zeros_like(support) 75 | if self.sparse_inputs: 76 | for adj in self.adj: 77 | output = output + torch.sparse.mm(adj, support) 78 | else: 79 | for adj in self.adj: 80 | output = output + torch.mm(adj, support) 81 | if self.bias is not None: 82 | return self.activate(output + self.bias) 83 | else: 84 | return self.activate(output) 85 | 86 | def __repr__(self): 87 | return self.__class__.__name__ + ' (' \ 88 | + str(self.in_features) + ' -> ' \ 89 | + str(self.out_features) + ')' 90 | 91 | "The Graph SAGE" 92 | 93 | 94 | 95 | 96 | "The Graph Attention Network" 97 | 98 | "the following code was implemented by https://github.com/Diego999/pyGAT" 99 | 100 | class GraphAttentionLayer(nn.Module): 101 | """ 102 | Simple GAT layer, similar to https://arxiv.org/abs/1710.10903 103 | """ 104 | 105 | def __init__(self, in_features, out_features, dropout, alpha, concat=True): 106 | super(GraphAttentionLayer, self).__init__() 107 | self.dropout = dropout 108 | self.in_features = in_features 109 | self.out_features = out_features 110 | self.alpha = alpha 111 | self.concat = concat 112 | 113 | self.W = nn.Parameter(torch.zeros(size=(in_features, out_features))) 114 | nn.init.xavier_uniform_(self.W.data, gain=1.414) 115 | self.a = nn.Parameter(torch.zeros(size=(2*out_features, 1))) 116 | nn.init.xavier_uniform_(self.a.data, gain=1.414) 117 | 118 | self.leakyrelu = nn.LeakyReLU(self.alpha) 119 | 120 | def forward(self, input, adj): 121 | 122 | h = torch.mm(input, self.W) 123 | N = h.size()[0] 124 | 125 | a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * self.out_features) 126 | e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2)) 127 | 128 | zero_vec = -9e15*torch.ones_like(e) 129 | attention = torch.where(adj > 0, e, zero_vec) 130 | attention = F.softmax(attention, dim=1) 131 | attention = F.dropout(attention, self.dropout, training=self.training) 132 | h_prime = torch.matmul(attention, h) 133 | 134 | if self.concat: 135 | return F.elu(h_prime) 136 | else: 137 | return h_prime 138 | 139 | def __repr__(self): 140 | return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')' 141 | 142 | 143 | class SpecialSpmmFunction(torch.autograd.Function): 144 | """Special function for only sparse region backpropataion layer.""" 145 | 146 | @staticmethod 147 | def forward(ctx, indices, values, shape, b): 148 | assert indices.requires_grad == False 149 | a = torch.sparse_coo_tensor(indices, values, shape) 150 | ctx.save_for_backward(a, b) 151 | ctx.N = shape[0] 152 | return torch.matmul(a, b) 153 | 154 | @staticmethod 155 | def backward(ctx, grad_output): 156 | a, b = ctx.saved_tensors 157 | grad_values = grad_b = None 158 | if ctx.needs_input_grad[1]: 159 | grad_a_dense = grad_output.matmul(b.t()) 160 | edge_idx = a._indices()[0, :] * ctx.N + a._indices()[1, :] 161 | grad_values = grad_a_dense.view(-1)[edge_idx] 162 | if ctx.needs_input_grad[3]: 163 | grad_b = a.t().matmul(grad_output) 164 | return None, grad_values, None, grad_b 165 | 166 | 167 | class SpecialSpmm(nn.Module): 168 | def forward(self, indices, values, shape, b): 169 | return SpecialSpmmFunction.apply(indices, values, shape, b) 170 | 171 | 172 | class SpGraphAttentionLayer(nn.Module): 173 | """ 174 | Sparse version GAT 
layer, similar to https://arxiv.org/abs/1710.10903 175 | """ 176 | 177 | def __init__(self, in_features, out_features, dropout, alpha, concat=True): 178 | super(SpGraphAttentionLayer, self).__init__() 179 | self.in_features = in_features 180 | self.out_features = out_features 181 | self.alpha = alpha 182 | self.concat = concat 183 | 184 | self.W = nn.Parameter(torch.zeros(size=(in_features, out_features))) 185 | nn.init.xavier_normal_(self.W.data, gain=1.414) 186 | 187 | self.a = nn.Parameter(torch.zeros(size=(1, 2 * out_features))) 188 | nn.init.xavier_normal_(self.a.data, gain=1.414) 189 | 190 | self.dropout = nn.Dropout(dropout) 191 | self.leakyrelu = nn.LeakyReLU(self.alpha) 192 | self.special_spmm = SpecialSpmm() 193 | 194 | def forward(self, input, adj): 195 | dv = 'cuda' if input.is_cuda else 'cpu' 196 | 197 | N = input.size()[0] 198 | edge = adj.nonzero().t() 199 | 200 | h = torch.mm(input, self.W) 201 | # h: N x out 202 | assert not torch.isnan(h).any() 203 | 204 | # Self-attention on the nodes - Shared attention mechanism 205 | edge_h = torch.cat((h[edge[0, :], :], h[edge[1, :], :]), dim=1).t() 206 | # edge: 2*D x E 207 | 208 | edge_e = torch.exp(-self.leakyrelu(self.a.mm(edge_h).squeeze())) 209 | assert not torch.isnan(edge_e).any() 210 | # edge_e: E 211 | 212 | e_rowsum = self.special_spmm(edge, edge_e, torch.Size([N, N]), torch.ones(size=(N, 1), device=dv)) 213 | # e_rowsum: N x 1 214 | 215 | edge_e = self.dropout(edge_e) 216 | # edge_e: E 217 | 218 | h_prime = self.special_spmm(edge, edge_e, torch.Size([N, N]), h) 219 | assert not torch.isnan(h_prime).any() 220 | # h_prime: N x out 221 | 222 | h_prime = h_prime.div(e_rowsum) 223 | # h_prime: N x out 224 | assert not torch.isnan(h_prime).any() 225 | 226 | if self.concat: 227 | # if this layer is not last layer, 228 | return F.elu(h_prime) 229 | else: 230 | # if this layer is last layer, 231 | return h_prime 232 | 233 | def __repr__(self): 234 | return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')' -------------------------------------------------------------------------------- /Torch_rl/model/GNN_network.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | import scipy.sparse as sp 7 | from torch.nn.parameter import Parameter 8 | from torch.nn.modules.module import Module 9 | from scipy.sparse import coo_matrix 10 | 11 | 12 | 13 | " The GCN Part" 14 | def normalize(adj): 15 | """Row-normalize sparse matrix""" 16 | rowsum = np.array(adj.sum(1)) 17 | r_inv = np.power(rowsum, -1).flatten() 18 | r_inv[np.isinf(r_inv)] = 0. 19 | r_mat_inv = np.diag(r_inv) 20 | mx = r_mat_inv.dot(adj) 21 | return mx 22 | 23 | def normalize_sparse(adj): 24 | """Symmetrically normalize adjacency matrix.""" 25 | adj = sp.coo_matrix(adj) 26 | rowsum = np.array(adj.sum(1)) 27 | d_inv_sqrt = np.power(rowsum, -0.5).flatten() 28 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 
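    # d_inv_sqrt holds degree^(-1/2) per node (infinities from isolated nodes were zeroed above);
    # the two .dot() calls below produce the symmetric normalization D^(-1/2) A D^(-1/2) in COO form.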
29 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 30 | return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo() 31 | 32 | class GraphConvolution(Module): 33 | def __init__(self, adj, in_features, out_features, 34 | activate=nn.ReLU(), sparse_inputs=False, chebyshev_polynomials = 0, dropout = 0.5, bias=True): 35 | super(GraphConvolution, self).__init__() 36 | self.in_features = in_features 37 | self.out_features = out_features 38 | self.activate = activate 39 | self.weight = Parameter(torch.FloatTensor(in_features, out_features)) 40 | if bias: 41 | self.bias = Parameter(torch.FloatTensor(out_features)) 42 | else: 43 | self.register_parameter('bias', None) 44 | self.reset_parameters() 45 | 46 | 47 | if sparse_inputs: 48 | adj = adj.toarray() 49 | if chebyshev_polynomials>0: 50 | T_K = [] 51 | T_K.append(np.eye(adj.shape[0])) 52 | laplacian = np.eye(adj.shape[0]) - normalize(adj) 53 | largest_eigval, _ = np.linalg.eig(laplacian) 54 | scaled_laplacian = (2. / largest_eigval[0]) * laplacian - np.eye(adj.shape[0]) 55 | T_K.append(scaled_laplacian) 56 | for i in range(2, chebyshev_polynomials+1): 57 | T_K.append(2 * np.dot(scaled_laplacian,T_K[-1])-T_K[-2]) 58 | self.T_k = T_K 59 | else: 60 | self.T_k = [normalize(adj)] 61 | if sparse_inputs: 62 | self.adj = [coo_matrix(T) for T in self.T_k] 63 | else: 64 | self.adj = self.T_k 65 | 66 | def reset_parameters(self): 67 | stdv = 1. / math.sqrt(self.weight.size(1)) 68 | self.weight.data.uniform_(-stdv, stdv) 69 | if self.bias is not None: 70 | self.bias.data.uniform_(-stdv, stdv) 71 | 72 | def forward(self, input): 73 | support = torch.mm(input, self.weight) 74 | output = torch.zeros_like(support) 75 | if self.sparse_inputs: 76 | for adj in self.adj: 77 | output = output + torch.sparse.mm(adj, support) 78 | else: 79 | for adj in self.adj: 80 | output = output + torch.mm(adj, support) 81 | if self.bias is not None: 82 | return self.activate(output + self.bias) 83 | else: 84 | return self.activate(output) 85 | 86 | def __repr__(self): 87 | return self.__class__.__name__ + ' (' \ 88 | + str(self.in_features) + ' -> ' \ 89 | + str(self.out_features) + ')' 90 | 91 | 92 | 93 | "The Graph SAGE" 94 | 95 | 96 | 97 | 98 | "The Graph Attention Network" 99 | -------------------------------------------------------------------------------- /Torch_rl/model/special_model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import numpy as np 4 | from torch import nn 5 | import torch.nn.functional as F 6 | from collections import OrderedDict 7 | from torch.distributions import Normal, Categorical 8 | from torch.autograd import Variable 9 | from copy import deepcopy 10 | 11 | 12 | class Multi_in(nn.Module): 13 | def __init__(self, observation_size, action_size, hidden_up_layer=[64, 64], hidden_down_layer=[64, 64], 14 | hidden_activate=nn.ReLU(), output_activate=None, 15 | BatchNorm = False): 16 | super(Multi_in, self).__init__() 17 | 18 | self.up_layer1 = nn.Linear(observation_size, hidden_up_layer[0], bias=True) 19 | self.up_layer2 = nn.Linear(hidden_up_layer[0], hidden_up_layer[1], bias=True) 20 | self.down_layer1 = nn.Linear(hidden_up_layer[1]*3, hidden_down_layer[0], bias=True) 21 | self.down_layer2 = nn.Linear(hidden_down_layer[0], hidden_up_layer[1], bias=True) 22 | self.outpu_layer3 = nn.Linear(hidden_down_layer[1], action_size+1, bias=True) 23 | 24 | self.hidden_activate = hidden_activate 25 | self.output_activate = output_activate 26 | 27 | self.gpu = False 28 | 29 | def forward(self, x1,x2,x3): 
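        # The three input streams are encoded by the same shared up_layer weights, concatenated,
        # and passed through the down layers; the first output unit plus the mean of the remaining
        # units is returned as a single scalar Q (a dueling-style aggregation).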
30 | 31 | x1 = self.hidden_activate(self.up_layer1(x1)) 32 | x1 = self.hidden_activate(self.up_layer2(x1)) 33 | 34 | x2 = self.hidden_activate(self.up_layer1(x2)) 35 | x2 = self.hidden_activate(self.up_layer2(x2)) 36 | 37 | x3 = self.hidden_activate(self.up_layer1(x3)) 38 | x3 = self.hidden_activate(self.up_layer2(x3)) 39 | x = torch.cat([x1,x2,x3], dim=-1) 40 | x = self.hidden_activate(self.down_layer1(x)) 41 | x = self.hidden_activate(self.down_layer2(x)) 42 | x = self.hidden_activate(self.output_layer1(x)) 43 | Q = x[0]+torch.mean(x[1:]) 44 | return Q 45 | 46 | -------------------------------------------------------------------------------- /Torch_rl/temp_file/PPO.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from Torch_rl.agent.core_value import Agent_value_based 4 | from Torch_rl.common.memory import ReplayMemory 5 | from copy import deepcopy 6 | from Torch_rl.common.distribution import * 7 | from torch.optim import Adam 8 | from torch.autograd import Variable 9 | import random 10 | from Torch_rl.common.util import csv_record 11 | from Torch_rl.common.util import generate_reture,gae 12 | 13 | 14 | class PPO_Agent(Agent_value_based): 15 | def __init__(self, env, policy_model, value_model, 16 | lr=1e-4, ent_coef=0.01, vf_coef=0.5, 17 | ## hyper-parawmeter 18 | gamma=0.99, lam=0.95, cliprange=0.2, batch_size = 32, 19 | buffer_size=50000, learning_starts=1000, running_step="synchronization", batch_training_round=10, 20 | value_regular=0.01, train_value_round = 1, 21 | ## decay 22 | decay=False, decay_rate=0.9, 23 | ## 24 | path=None): 25 | 26 | self.env = env 27 | self.gamma = gamma 28 | self.lam = lam 29 | self.ent_coef = ent_coef 30 | self.vf_coef = vf_coef 31 | self.cliprange = cliprange 32 | self.batch_size = batch_size 33 | self.batch_training_round = batch_training_round 34 | self.learning_starts = learning_starts 35 | self.train_value_round = train_value_round 36 | if running_step =="synchronization": 37 | self.run_step = 1 38 | else: 39 | self.run_step = running_step 40 | 41 | 42 | self.replay_buffer = ReplayMemory(buffer_size) 43 | self.loss_cal = torch.nn.MSELoss() 44 | 45 | self.policy_model = policy_model 46 | if value_model == "shared": 47 | self.value_model = policy_model 48 | elif value_model == "copy": 49 | self.value_model = deepcopy(policy_model) 50 | else: 51 | self.value_model = value_model 52 | 53 | self.run_policy_model,self.run_value_model = deepcopy(self.policy_model), deepcopy(self.value_model) 54 | 55 | self.dist = make_pdtype(env.action_space, policy_model) 56 | 57 | policy_model_optim = Adam(self.policy_model.parameters(), lr=lr) 58 | value_model_optim = Adam(self.value_model.parameters(), lr=lr, weight_decay=value_regular) 59 | if decay: 60 | self.policy_model_optim = torch.optim.lr_scheduler.ExponentialLR(policy_model_optim, decay_rate, 61 | last_epoch=-1) 62 | self.value_model_optim = torch.optim.lr_scheduler.ExponentialLR(value_model_optim, decay_rate, 63 | last_epoch=-1) 64 | else: 65 | self.policy_model_optim = policy_model_optim 66 | self.value_model_optim = value_model_optim 67 | 68 | torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1, norm_type=2) 69 | torch.nn.utils.clip_grad_norm_(self.value_model.parameters(), 1, norm_type=2) 70 | 71 | super(PPO_Agent, self).__init__(path) 72 | example_input = Variable(torch.rand(100, self.env.observation_space.shape[0])) 73 | self.writer.add_graph(self.policy_model, input_to_model=example_input) 74 | 
self.forward_step_show_list = [] 75 | self.backward_step_show_list = ["pg_loss", "entropy", "vf_loss"] 76 | self.forward_ep_show_list = [] 77 | self.backward_ep_show_list = ["pg_loss", "entropy", "vf_loss"] 78 | 79 | self.training_round = 0 80 | self.running_step = 0 81 | self.record_sample = None 82 | self.loss_record = {"pg_loss": [], "entropy": [], "vf_loss": [], "loss": []} 83 | 84 | def forward(self, observation): 85 | observation = observation[np.newaxis, :].astype(np.float32) 86 | observation = torch.from_numpy(observation) 87 | outcome = self.policy_model.forward(observation) 88 | self.pd = self.dist(outcome) 89 | self.action = self.pd.sample() 90 | self.Q = self.value_model.forward(observation).squeeze() 91 | return self.action.squeeze(0).detach().numpy(), self.Q.squeeze(0).detach().numpy(), {} 92 | 93 | def backward(self, sample_): 94 | self.replay_buffer.push(sample_) 95 | self.running_step += 1 96 | """""""""""""" 97 | "training part" 98 | """""""""""""" 99 | if self.step > self.learning_starts and self.learning: 100 | if self.record_sample is None and self.running_step > self.run_step: 101 | print("***************************************") 102 | print("In the ", self.episode, "ep") 103 | sample = self.replay_buffer.recent_step_sample(self.running_step) 104 | " sample advantage generate " 105 | sample["value"] = self.value_model.forward(sample["s"]).squeeze() 106 | last_value = self.value_model.forward(sample["s_"][-1]) 107 | self.record_sample = gae(sample, last_value, self.gamma, self.lam) 108 | " sample log_probabilty generate" 109 | outcome = self.policy_model.forward(sample["s"]) 110 | self.pd = self.dist(outcome) 111 | sample["logp"] = self.pd.log_prob(sample["a"]) 112 | self.loss_record = {"pg_loss": [], "entropy": [], "vf_loss": [], "loss": []} 113 | self.running_step = 0 114 | if self.record_sample is not None: 115 | print("the learning has start...........") 116 | while self.training_round < self.batch_training_round: 117 | start = (self.batch_size * self.training_round) % self.record_sample["s"].size()[0] 118 | if start+self.batch_size >= self.record_sample["s"].size()[0]: 119 | end = self.record_sample["s"].size()[0] 120 | else: 121 | end = start+self.batch_size 122 | index = np.arange(start, end) 123 | S = self.record_sample["s"][index] 124 | A = self.record_sample["a"][index] 125 | old_log = self.record_sample["logp"][index].detach() 126 | advs = self.record_sample["advs"][index].detach() 127 | value = self.record_sample["value"][index].detach() 128 | returns = self.record_sample["return"][index].detach() 129 | 130 | " traning the value model" 131 | 132 | value_now = self.value_model.forward(S) 133 | value_clip = value + torch.clamp(value_now - value, min=-self.cliprange, max=self.cliprange) # Clipped value 134 | vf_loss1 = self.loss_cal(value_now, returns) # Unclipped loss 135 | vf_loss2 = self.loss_cal(value_clip, returns) # clipped loss 136 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2) # value loss 137 | vf_loss = 0.5 * vf_loss1 138 | " CALCULATE THE LOSS" 139 | " Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss" 140 | 141 | #generate Policy gradient loss 142 | outcome = self.policy_model.forward(S) 143 | new_policy = self.dist(outcome) 144 | new_lop = new_policy.log_prob(A) 145 | ratio = torch.exp(new_lop-old_log) 146 | pg_loss1 = advs * ratio 147 | pg_loss2 = advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange) 148 | pg_loss = -.5 * torch.min(pg_loss1, pg_loss2).mean() 149 | 150 | # entropy 
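                    # (the entropy bonus keeps the policy stochastic; the total objective below is
                    #  pg_loss - ent_coef * entropy + vf_coef * vf_loss, and the same loss is
                    #  backpropagated through both the value and policy optimizers)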
151 | entropy = new_policy.entropy().mean() 152 | loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef 153 | 154 | self.value_model_optim.zero_grad() 155 | loss.backward(retain_graph=True) 156 | self.value_model_optim.step() 157 | 158 | self.policy_model_optim.zero_grad() 159 | loss.backward() 160 | self.policy_model_optim.step() 161 | 162 | 163 | # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"]) 164 | # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange) 165 | self.training_round += 1 166 | print("round:", self.training_round, 167 | "pg_loss:", pg_loss.data.numpy(), "entropy:", entropy.data.numpy(), "vf_loss", vf_loss.data.numpy()) 168 | self.loss_record["pg_loss"].append(pg_loss.data.numpy()) 169 | self.loss_record["entropy"].append(entropy.data.numpy()) 170 | self.loss_record["vf_loss"].append(vf_loss.data.numpy()) 171 | self.loss_record["loss"].append(loss.data.numpy()) 172 | self.training_round = 0 173 | self.record_sample = None 174 | 175 | if self.loss_record["loss"] and self.running_step self.learning_starts: 103 | if self.running_step % self.run_step == 0 and self.training_step == 0: 104 | " sample advantage generate " 105 | with torch.no_grad(): 106 | sample = self.replay_buffer.recent_step_sample(self.running_step) 107 | last_value = self.value_model.forward(sample["s_"][-1]) 108 | self.record_sample = gae(sample, last_value, self.gamma, self.lam) 109 | self.running_step = 0 110 | 111 | if self.training_step < self.sample_training_step and self.record_sample is not None: 112 | pg_loss_re = 0 113 | entropy_re = 0 114 | vf_loss_re = 0 115 | loss_re = 0 116 | for _ in range(self.batch_training_round): 117 | index = self.train_ticks[self.training_step] 118 | S = self.record_sample["s"][index].detach() 119 | A = self.record_sample["a"][index].detach() 120 | old_log = self.record_sample["logp"][index].detach() 121 | advs = self.record_sample["advs"][index].detach() 122 | value = self.record_sample["value"][index].detach() 123 | returns = self.record_sample["return"][index].detach() 124 | # generate Policy gradient loss 125 | outcome = self.run_policy.forward(S) 126 | new_policy = self.dist(outcome) 127 | new_lop = new_policy.log_prob(A) 128 | ratio = torch.exp(new_lop - old_log) 129 | pg_loss1 = advs * ratio 130 | pg_loss2 = advs * torch.clamp(ratio, 1.0 - self.cliprange, 1.0 + self.cliprange) 131 | pg_loss = -.5 * torch.min(pg_loss1, pg_loss2).mean() 132 | # value loss 133 | value_now = self.run_value.forward(S) 134 | value_clip = value + torch.clamp(value_now - value, min=-self.cliprange, 135 | max=self.cliprange) # Clipped value 136 | vf_loss1 = self.loss_cal(value_now, returns) # Unclipped loss 137 | vf_loss2 = self.loss_cal(value_clip, returns) # clipped loss 138 | vf_loss = .5 * torch.max(vf_loss1, vf_loss2) 139 | # vf_loss = 0.5 * vf_loss1 140 | # entropy 141 | entropy = new_policy.entropy().mean() 142 | loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef 143 | # approxkl = self.loss_cal(neg_log_pac, self.record_sample["neglogp"]) 144 | # self.cliprange = torch.gt(torch.abs(ratio - 1.0).mean(), self.cliprange) 145 | 146 | self.value_model_optim.zero_grad() 147 | loss.backward(retain_graph=True) 148 | self.value_model_optim.step() 149 | 150 | self.policy_model_optim.zero_grad() 151 | loss.backward() 152 | self.policy_model_optim.step() 153 | 154 | self.training_step += 1 155 | pg_loss_re += pg_loss.data.numpy() 156 | entropy_re += entropy.data.numpy() 157 | vf_loss_re += vf_loss.data.numpy() 158 | loss_re += 
loss.data.numpy() 159 | 160 | if self.training_step == self.sample_training_step: 161 | print("the " + str(self.episode) + " round has finished training") 162 | self.run_policy.load_state_dict(self.policy_model.state_dict()) 163 | self.run_value.load_state_dict(self.value_model.state_dict()) 164 | self.training_step = 0 165 | self.record_sample = None 166 | return loss_re, {"pg_loss": pg_loss_re, "entropy": entropy_re, "vf_loss": vf_loss_re} 167 | return 0, {"pg_loss": 0, "entropy": 0, "vf_loss": 0} 168 | 169 | def load_weights(self, filepath): 170 | model = torch.load(filepath + "PPO.pkl") 171 | self.policy_model.load_state_dict(model["policy_model"].state_dict()) 172 | self.value_model.load_state_dict(model["value_model"].state_dict()) 173 | 174 | def save_weights(self, filepath, overwrite=False): 175 | torch.save({"policy_model": self.policy_model, "value_model": self.value_model}, filepath + "PPO.pkl") -------------------------------------------------------------------------------- /Torch_rl/temp_file/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/temp_file/__init__.py -------------------------------------------------------------------------------- /Torch_rl/test_file/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zachary2wave/Torch-rl/cdf2128c5415b2e3d2c1f4f8861a1346f6c4dcd5/Torch_rl/test_file/__init__.py -------------------------------------------------------------------------------- /Torch_rl/test_file/run_DP.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import time 3 | from Torch_rl.agent.DQN import DQN_Agent 4 | from Torch_rl.model.Network import DenseNet 5 | from torch import nn 6 | from Torch_rl.common.Policy_for_DQN import BoltzmannQPolicy 7 | #%% 8 | envID = 'D_place_action-v0' 9 | env = gym.make(envID) 10 | 11 | nowtime = time.strftime('%y%m%d%H%M', time.localtime()) 12 | path = "savedate" + '/' + envID + "-DQN-" + nowtime+'/' 13 | #%% 14 | 15 | actor = DenseNet(env.observation_space.shape[0], env.action_space.shape[0], hidden_activate=nn.Tanh()) 16 | critic = DenseNet(env.observation_space.shape[0]+env.action_space.shape[0], 1, hidden_activate=nn.Tanh()) 17 | Agent = DQN_Agent(env, actor, critic, gamma=0.99, path=path) 18 | 19 | Agent.train(max_step=10000, render=True, verbose=2) 20 | Agent.test(max_step=10000, render=True, verbose=2) 21 | 22 | 23 | -------------------------------------------------------------------------------- /Torch_rl/test_file/run_HIRO.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import time 3 | from agent.HIRO import HIRO_Agent 4 | from model.Network import DenseNet 5 | from torch import nn 6 | from common.Policy_for_DQN import BoltzmannQPolicy 7 | #%% 8 | envID = 'D_place_action-v0' 9 | env = gym.make(envID) 10 | 11 | nowtime = time.strftime('%y%m%d%H%M', time.localtime()) 12 | path = "savedate" + '/' + envID + "-DQN-" + nowtime+'/' 13 | #%% 14 | goal = gym.spaces.Box(low=-1, high=1, shape=(5,)) 15 | H_model = DenseNet(env.observation_space.shape[0], 32, hidden_activate=nn.Tanh()) 16 | L_model = DenseNet(env.observation_space.shape[0]+goal.shape[0], env.action_space.n, hidden_activate=nn.Tanh()) 17 | Agent = HIRO_Agent(env, "DDPG", H_model, "DQN", L_model, goal=goal, gamma=0.99, path=path) 18 | 19 |
Agent.train(max_step=10000, render=True, verbose=2) 20 | Agent.test(max_step=10000, render=True, verbose=2) 21 | 22 | 23 | -------------------------------------------------------------------------------- /Torch_rl/test_file/run_dp_dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import time 3 | from agent.DQN import DQN_Agent 4 | from model.Network import DenseNet 5 | from torch import nn 6 | from common.Policy_for_DQN import EpsGreedyQPolicy 7 | #%% 8 | envID = 'D_place_action-v0' 9 | env = gym.make(envID) 10 | 11 | nowtime = time.strftime('%y%m%d%H%M', time.localtime()) 12 | path = "savedate" + '/' + envID + "-dqn-" + nowtime+'/' 13 | #%% 14 | 15 | policy = EpsGreedyQPolicy() 16 | model = DenseNet(env.observation_space.shape[0], env.action_space.n, hidden_activate=nn.Tanh()) 17 | 18 | Agent = DQN_Agent(env, model, policy, gamma=0.90, lr=1e-3, path=path) 19 | 20 | # Agent.train(max_step=1e6, render=False, verbose=2) 21 | # Agent.save_weights(path) 22 | #%% 23 | path = "savedate" + '/' + envID + "-dqn-" + "2002191728" 24 | Agent.load_weights(path) 25 | Agent.test(max_step=10000, render=True, verbose=2) 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /Torch_rl/test_file/testbackward.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from Torch_rl.model.Network import DenseNet 4 | from torch import nn 5 | import torch 6 | from torch.optim import Adam 7 | policy_model = DenseNet(12, 2, 8 | hidden_activate=nn.ReLU(), hidden_layer=[64, 64]) 9 | input = torch.rand(size=(32,12)) 10 | output = torch.rand(size=(32,2)) 11 | loss_cal1 = torch.nn.SmoothL1Loss() 12 | loss_cal2 = torch.nn.MSELoss() 13 | policy_model_optim = Adam(policy_model.parameters(), lr=1e-4) 14 | for time in range(100): 15 | y = policy_model.forward(input) 16 | loss = loss_cal1(y, output) 17 | policy_model_optim.zero_grad() 18 | loss.backward(retain_graph=True) 19 | policy_model_optim.step() 20 | loss = loss_cal2(y, output) 21 | policy_model_optim.zero_grad() 22 | loss.backward() 23 | policy_model_optim.step() -------------------------------------------------------------------------------- /Torch_rl/test_file/testtt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from model.Network import DenseNet 3 | from torch import nn 4 | from torch.optim import Adam 5 | 6 | actor = DenseNet(5, 2, hidden_activate=nn.ReLU()) 7 | critic = DenseNet(7, 1, hidden_activate=nn.ReLU()) 8 | 9 | class actor_critic(nn.Module): 10 | def __init__(self, actor, critic): 11 | super(actor_critic, self).__init__() 12 | self.actor = actor 13 | self.critic = critic 14 | 15 | def forward(self, obs): 16 | a = self.actor(obs) 17 | input = torch.cat((obs, a), dim=-1) 18 | Q = self.critic(input) 19 | return Q 20 | 21 | actor_optim = Adam(actor.parameters(), lr=1e-1) 22 | critic_optim = Adam(critic.parameters(), lr=1e-1) 23 | 24 | input = torch.rand(10, 5) 25 | tgt = torch.rand(10, 1) 26 | loss_fun = torch.nn.MSELoss() 27 | 28 | a = actor(input) 29 | innn = torch.cat((input, a), dim=-1) 30 | b = critic(innn) 31 | 32 | actor.zero_grad() 33 | torch.mean(b).backward() 34 | actor_optim.step() 35 | 36 | ab = actor(input) 37 | bb = critic(innn) 38 | 39 | totalmodel = actor_critic(actor, critic) 40 | totalmodel(input) -------------------------------------------------------------------------------- /Torch_rl/test_file/testttttt.py:
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.autograd as autograd # torch module for automatic gradient computation 3 | import torch.nn as nn # neural network module 4 | import torch.nn.functional as F # common functions of the neural network module 5 | import torch.optim as optim # model optimizer module 6 | 7 | training_data = [ 8 | ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]), 9 | ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]) 10 | ] 11 | word_to_ix = {} # word-to-index dictionary 12 | for sent, tags in training_data: 13 | for word in sent: 14 | if word not in word_to_ix: 15 | word_to_ix[word] = len(word_to_ix) 16 | print(word_to_ix) 17 | tag_to_ix = {"DET": 0, "NN": 1, "V": 2} # hand-crafted dictionary of POS tag labels 18 | 19 | def prepare_sequence(seq, to_ix): 20 | idxs = [to_ix[w] for w in seq] 21 | tensor = torch.LongTensor(idxs) 22 | return autograd.Variable(tensor) 23 | 24 | inputs = prepare_sequence(training_data[0][0], word_to_ix) 25 | 26 | 27 | class LSTMTagger(nn.Module): 28 | 29 | def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size): 30 | super(LSTMTagger, self).__init__() 31 | self.hidden_dim = hidden_dim 32 | 33 | self.word_embeddings = nn.Embedding(vocab_size, embedding_dim) 34 | 35 | self.lstm = nn.LSTM(embedding_dim, hidden_dim) 36 | 37 | self.hidden2tag = nn.Linear(hidden_dim, tagset_size) 38 | self.hidden = self.init_hidden() 39 | 40 | def init_hidden(self): 41 | return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)), 42 | autograd.Variable(torch.zeros(1, 1, self.hidden_dim))) 43 | 44 | def forward(self, sentence): 45 | embeds = self.word_embeddings(sentence) 46 | lstm_out, self.hidden = self.lstm( 47 | embeds.view(len(sentence), 1, -1), self.hidden) 48 | tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1)) 49 | tag_scores = F.log_softmax(tag_space, dim=1) 50 | return tag_scores 51 | 52 | 53 | model = LSTMTagger(10, 100, len(word_to_ix), len(tag_to_ix)) 54 | loss_function = nn.NLLLoss() 55 | optimizer = optim.SGD(model.parameters(), lr=0.1) --------------------------------------------------------------------------------
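
The clipped surrogate objective and the clipped value loss that the PPO agents above build inside backward() can also be exercised in isolation. The following is a minimal, self-contained sketch that mirrors those formulas on dummy tensors; the batch size, the random inputs, and the standalone variable names are illustrative assumptions and are not part of the repository code.

# Minimal sketch of the clipped PPO losses used in the agents above (dummy data, illustrative only).
import torch

cliprange = 0.2
loss_cal = torch.nn.MSELoss()

advs = torch.randn(32)                       # GAE advantages for a sampled batch
old_log = torch.randn(32)                    # log-probs recorded at rollout time
new_log = old_log + 0.05 * torch.randn(32)   # log-probs under the updated policy

# Clipped policy-gradient (surrogate) loss, in the same form as the agent code.
ratio = torch.exp(new_log - old_log)
pg_loss1 = advs * ratio
pg_loss2 = advs * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)
pg_loss = -.5 * torch.min(pg_loss1, pg_loss2).mean()   # the .5 factor follows the repository code

# Clipped value loss, in the same form as the agent code.
value = torch.randn(32)                      # V(s) recorded at rollout time
returns = torch.randn(32)                    # targets produced by gae()
value_now = value + 0.1 * torch.randn(32)    # V(s) from the updated critic
value_clip = value + torch.clamp(value_now - value, min=-cliprange, max=cliprange)
vf_loss = .5 * torch.max(loss_cal(value_now, returns), loss_cal(value_clip, returns))

print(pg_loss.item(), vf_loss.item())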