├── src ├── NN │ ├── RL │ │ ├── __init__.py │ │ ├── deterministic.py │ │ ├── base.py │ │ └── stochastic.py │ ├── __init__.py │ └── model.py ├── RLalgo │ ├── __init__.py │ ├── base.py │ ├── ddpg.py │ ├── td3.py │ └── sac.py ├── utility │ ├── __init__.py │ ├── utils.py │ └── RL.py ├── ModelBase │ ├── dyna.py │ └── dynav2.py └── envs.py ├── .env ├── ks_init.tensor ├── requirements.txt ├── docs ├── controlled.png ├── performance.png └── uncontrolled.png ├── .gitignore ├── LICENSE ├── examples ├── pendu.py ├── cartpole.py ├── ks.py └── burgers.py └── README.md /src/NN/RL/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/NN/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/RLalgo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utility/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | PYTHONPATH=${PYTHONPATH}:./src/ -------------------------------------------------------------------------------- /ks_init.tensor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianxun-Wang/PIMBRL/HEAD/ks_init.tensor -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python>=3.8.8 2 | pytorch==1.8.1 3 | gym==0.19.0 4 | numpy>=1.19.2 5 | -------------------------------------------------------------------------------- /docs/controlled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianxun-Wang/PIMBRL/HEAD/docs/controlled.png -------------------------------------------------------------------------------- /docs/performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianxun-Wang/PIMBRL/HEAD/docs/performance.png -------------------------------------------------------------------------------- /docs/uncontrolled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianxun-Wang/PIMBRL/HEAD/docs/uncontrolled.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | *.png 4 | !docs/*.png 5 | *.jpg 6 | *.gif 7 | *.pdf 8 | 9 | *.npy 10 | 11 | *settings.json -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Xin-yang Liu, Jian-xun Wang (jwang33@nd.edu) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to 
use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /examples/pendu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from src.ModelBase.dyna import * 4 | from src.envs import pendulum 5 | from src.NN.RL import model 6 | from RLalgo.td3 import TD3 7 | 8 | if __name__=='__main__': 9 | from src.NN.RL.base import MLPActor, MLPQ 10 | import os 11 | torch.manual_seed(10) 12 | np.random.seed(10) 13 | torch.backends.cuda.matmul.allow_tf32 = False 14 | torch.backends.cudnn.allow_tf32 = False 15 | # torch.set_default_tensor_type('torch.cuda.FloatTensor') 16 | RL_batchsize=128 17 | env_batchsize=200 18 | realenv = pendulum()#lambda:gym.make('Pendulum-v0') 19 | fakeenv = model.fake_pendu_env(env_batchsize) 20 | os.chdir('/home/lxy/store/RL/pendulum/td3/') 21 | RLinp = {"env":fakeenv, 22 | 'Actor':MLPActor, 23 | 'Q': MLPQ, 24 | 'act_space_type':'c', 25 | 'a_kwargs':dict(activation=nn.ReLU, 26 | hidden_sizes=[256]*2, 27 | output_activation=nn.Tanh), 28 | 'ep_type':'inf', 'max_ep_len':200, 29 | 'replay_size':int(5e5)} 30 | RL = TD3(**RLinp) 31 | mb = dyna(RL,realenv,False,env_batchsize,real_buffer_size=int(5e5)) 32 | mb(1000,20000,20000,12000,update_every=200 ,RL_batch_size=RL_batchsize,test_every=4, 33 | num_test_episodes=100,RL_update_iter=50,RL_loop_per_epoch=1600, 34 | env_train_start_size=6000,noiselist=torch.linspace(0.1,0.0,int(16e4)), 35 | data_train_max_iter=50,mixed_train=False, 36 | fake_env_loss_criteria=1e7,usemodel=True) -------------------------------------------------------------------------------- /examples/cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import numpy as np 4 | from src.ModelBase.dyna import * 5 | from src.NN import model 6 | from RLalgo.td3 import TD3 7 | 8 | if __name__=='__main__': 9 | import random 10 | import os 11 | from src.NN.RL.base import MLPActor, MLPQ 12 | random.seed(0) 13 | torch.manual_seed(0) 14 | np.random.seed(0) 15 | test_episodes=100 16 | RL_batchsize=128 17 | env_batchsize=200 18 | realenv = gym.make('CartPole-v0') 19 | fakeenv = model.fake_cartpole_env(env_batchsize) 20 | # os.chdir() 21 | RLinp = {"env":fakeenv, # the surogate environment defined above 22 | 'Actor':MLPActor, # the type of policy network, defined in src/NN/RL 23 | 'Q': MLPQ, # the type of value function network, defined in src/NN/RL 24 | 'act_space_type':'d', # the type of action space, 'c' for continuous, 'd' for discrete 25 | 'a_kwargs':dict(activation=nn.ReLU, 26 | hidden_sizes=[256]*2, 27 | output_activation=nn.Tanh),# the hyperparameters of the network 28 | 
'ep_type':'inf', # the type of episode, 'inf' for infinite, 'finite' for finite (only inf is supported for now) 29 | 'max_ep_len':400, # the maximum length of an episode 30 | 'replay_size':int(5e5) # the max size of the replay buffer 31 | } 32 | RL = TD3(**RLinp) 33 | mb = dyna(RL,realenv,True,env_batchsize,real_buffer_size=int(5e5)) 34 | mb(80,1000,1000,1000,update_every=100 ,RL_batch_size=RL_batchsize,test_every=4, 35 | num_test_episodes=test_episodes,RL_update_iter=50,RL_loop_per_epoch=4000, 36 | env_train_start_size=800,noiselist=torch.zeros(16000),mixed_train=False, 37 | data_train_max_iter=100, fake_env_loss_criteria=1e-4,env_num_batch=10) 38 | 39 | 40 | -------------------------------------------------------------------------------- /src/NN/RL/deterministic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from utility.utils import clip_tensor 5 | from .base import MLPActor,ActorCritic,MLPQ 6 | 7 | 8 | class dActorCritic(ActorCritic): 9 | ''' 10 | Deterministic variant of ActorCritic 11 | ''' 12 | def act(self, obs): 13 | with torch.no_grad(): 14 | return self.pi(obs) 15 | 16 | def __get_action_d__(self,o, noise_scale=0): 17 | a = self.act(o) 18 | a += noise_scale*torch.randn(a.shape) 19 | return clip_tensor(a,self.act_limit[:,0],self.act_limit[:,1]).round().int() 20 | 21 | def __get_action_c__(self,o, noise_scale=0): 22 | a = self.act(o) 23 | a += noise_scale*torch.randn(a.shape) 24 | return clip_tensor(a,self.act_limit[:,0],self.act_limit[:,1]) 25 | 26 | 27 | class DDPG_net(dActorCritic): 28 | ''' 29 | networks for DDPG 30 | ''' 31 | def __init__(self, 32 | act_limit, act_space_type:str, 33 | Actor_type:MLPActor, Actor_para:dict, 34 | Q_type=MLPQ, Q_para:dict=None): 35 | # check_dict_valid(net_type,NET_TYPE) 36 | super().__init__(act_limit, act_space_type) 37 | # build policy and value function 38 | self.pi = Actor_type(act_limit,**Actor_para) 39 | self.q = Q_type(**Q_para) 40 | 41 | 42 | class TD3_net(dActorCritic): 43 | ''' 44 | networks for TD3 45 | ''' 46 | def __init__(self, act_limit, act_space_type:str, 47 | Actor_type:MLPActor, Actor_para:dict, 48 | Q_type=MLPQ, Q_para:dict=None): 49 | # check_dict_valid(net_type, NET_TYPE) 50 | super().__init__(act_limit, act_space_type) 51 | # build policy and value functions 52 | self.pi = Actor_type(act_limit,**Actor_para) 53 | self.q1 = Q_type(**Q_para) 54 | self.q2 = Q_type(**Q_para) 55 | -------------------------------------------------------------------------------- /src/NN/RL/base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import torch 4 | import torch.nn as nn 5 | from utility.utils import check_dict_valid 6 | 7 | def mlp(qp:str, 8 | activation, 9 | obs_dim:int, 10 | act_dim:int, 11 | hidden_sizes:list, 12 | output_activation,): 13 | 14 | sizes_dict = {'p':[obs_dim] + hidden_sizes + [act_dim], 15 | 'q':[obs_dim + act_dim] + hidden_sizes + [1], 16 | 'c':[obs_dim] + hidden_sizes} 17 | check_dict_valid(qp,sizes_dict) 18 | if qp == 'q': output_activation = nn.Identity 19 | sizes = sizes_dict[qp] 20 | 21 | layers = [] 22 | for j in range(len(sizes)-1): 23 | act = activation if j < len(sizes)-2 else output_activation 24 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 25 | return nn.Sequential(*layers) 26 | 27 | def cnn1d(): 28 | raise NotImplementedError 29 | 30 | 31 | 32 | ######################### Actor Network ######################### 33 | class 
MLPActor(nn.Module):
 34 |     def __init__(self, act_limit, **para):
 35 |         super().__init__()
 36 |         self.act_min = act_limit[:,0]
 37 |         self.act_len = act_limit[:,1]-act_limit[:,0]
 38 |         self.net = mlp(qp='p',**para)
 39 | 
 40 |     def forward(self, obs):
 41 |         return self.act_min+self.act_len*(self.net(obs)+1)/2  # map the tanh output onto [act_min, act_max]
 42 | 
 43 | ######################### Q Function ############################
 44 | class MLPQ(nn.Module):
 45 |     def __init__(self, **para):
 46 |         super().__init__()
 47 |         self.net = mlp(qp='q',**para)
 48 | 
 49 |     def forward(self, obs, act):
 50 |         q = self.net(torch.cat([obs, act], dim=-1))
 51 |         return torch.squeeze(q, -1)
 52 | 
 53 | 
 54 | ###################### Actor-Critic ##############################
 55 | class ActorCritic(nn.Module):
 56 |     '''
 57 |     Base class for the actor-critic structure
 58 |     '''
 59 |     def __init__(self, act_limit, act_space_type:str):
 60 |         super().__init__()
 61 |         act_space_types = {'d':self.__get_action_d__,'c':self.__get_action_c__}
 62 |         check_dict_valid(act_space_type,act_space_types)
 63 |         self.get_action = act_space_types[act_space_type]
 64 |         self.act_limit = act_limit
 65 |         self.act_min = act_limit[:,0]
 66 |         self.act_len = act_limit[:,1]-act_limit[:,0]
 67 | 
 68 | 
 69 | 
 70 | 
 71 | if __name__=="__main__":
 72 |     pass
 73 | 
--------------------------------------------------------------------------------
/src/utility/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import numpy as np
 3 | 
 4 | 
 5 | def combined_shape(length, shape=None):
 6 |     if shape is None:
 7 |         return (length,)
 8 |     return (length, shape) if np.isscalar(shape) else (length, *shape)
 9 | 
10 | 
11 | def clip_tensor(inp, min, max):  # element-wise clip of inp to [min, max]
12 |     result = (inp >= min)*inp + (inp < min)*min
13 |     result = (result <= max)*result + (result > max)*max
14 |     return result
15 | 
16 | 
17 | def nptotorch(x):
18 |     if isinstance(x, torch.Tensor):
19 |         return x
20 |     elif isinstance(x, np.ndarray):
21 |         return torch.from_numpy(x.astype(np.float32))
22 |     else: return torch.tensor(x)
23 | 
24 | def check_dict_valid(typ, type_dict):
25 |     if not(typ in type_dict):
26 |         raise ValueError(typ + ' is not supported\n\
27 |             Supported are: '+ str([types for types in type_dict]))
28 | 
29 | 
30 | def FD_Central_CoefficientMatrix(c:list, meshx:int, periodic:bool=False):
31 |     '''
32 |     c is the list of FD coefficients,
33 |     e.g. for the 1st derivative with 2nd-order-accurate central differences:
34 |     c=[-0.5,0]
35 |     '''
36 |     if 2*len(c)-1>=meshx: raise ValueError
37 |     acc = len(c)
38 | 
39 |     tmp=[]
40 |     c.reverse()
41 |     for i in range(acc):
42 |         x = torch.cat((torch.cat((torch.zeros((i,meshx-i)),
43 |                                    c[i]*torch.eye(meshx-i)),dim=0),
44 |                        torch.zeros((meshx,i))
45 |                       ),dim=1)
46 |         tmp.append(x)
47 |     re=tmp[0]
48 |     for k in tmp[1:]:
49 |         re+=k+k.T
50 | 
51 |     if periodic:
52 |         re[:acc,-acc:]=re[acc:2*acc,:acc]
53 |         re[-acc:,:acc]=re[:acc,acc:2*acc]
54 |     return re
55 | 
56 | def FD_upwind_CoefficientMatrix(c:list, meshx:int, periodic:bool=False):
57 |     '''
58 |     c is the list of backward FD coefficients,
59 |     e.g.
for 1st derivative with 1st accuracy: 60 | c=[-1,1] 61 | ''' 62 | if len(c)>=meshx: raise ValueError 63 | acc = len(c) 64 | 65 | tmp=[] 66 | 67 | c.reverse() 68 | for i in range(acc): 69 | x = torch.cat((torch.cat((torch.zeros((i,meshx-i)), 70 | c[i]*torch.eye(meshx-i)),dim=0), 71 | torch.zeros((meshx,i)) 72 | ),dim=1) 73 | tmp.append(x) 74 | 75 | bre=tmp[0] 76 | fre=-tmp[0] 77 | 78 | for k in tmp[1:]: 79 | fre+=-k.T 80 | bre+=k 81 | 82 | if periodic: 83 | fre[-acc:,:acc]=fre[:acc,acc:2*acc] 84 | bre[:acc,-acc:]=bre[acc:2*acc,:acc] 85 | return fre,bre 86 | -------------------------------------------------------------------------------- /src/NN/RL/stochastic.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.distributions.normal import Normal 6 | 7 | 8 | from .base import MLPActor, ActorCritic, MLPQ, mlp 9 | from utility.utils import check_dict_valid, clip_tensor 10 | 11 | 12 | 13 | TENSOR2 = torch.tensor(2) 14 | LOG_STD_MIN = -20 15 | LOG_STD_MAX = 2 16 | 17 | 18 | class sActorCritic(ActorCritic): 19 | ''' 20 | Deterministic variant of ActorCritic 21 | ''' 22 | def act(self, obs,stochastic): 23 | with torch.no_grad(): 24 | return self.pi(obs,stochastic) 25 | 26 | def __get_action_d__(self,o, stochastic=True): 27 | a,_ = self.act(o,stochastic) 28 | return clip_tensor(a,self.act_limit[:,0],self.act_limit[:,1]).round().int() 29 | 30 | def __get_action_c__(self,o, stochastic=True): 31 | a,_ = self.act(o, stochastic) 32 | return clip_tensor(a,self.act_limit[:,0],self.act_limit[:,1]) 33 | 34 | 35 | class GaussianMLPActor(MLPActor): 36 | def __init__(self, act_limit, **para): 37 | super().__init__(act_limit, **para) 38 | self.net = mlp(qp='c', **para) 39 | hidden_sizes, act_dim = para['hidden_sizes'], para['act_dim'] 40 | self.mu_layer = nn.Linear(hidden_sizes[-1], act_dim) 41 | self.log_std_layer = nn.Linear(hidden_sizes[-1], act_dim) 42 | # self.act_limit = act_limit 43 | 44 | def forward(self, obs, stochastic=True): 45 | net_out = self.net(obs) 46 | mu = self.mu_layer(net_out) 47 | log_std = self.log_std_layer(net_out) 48 | if (log_std>LOG_STD_MAX).any(): 49 | print('std clamped') 50 | log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) 51 | std = torch.exp(log_std) 52 | 53 | # Pre-squash distribution and sample 54 | pi_distribution = Normal(mu, std) 55 | if stochastic: 56 | pi_action = pi_distribution.rsample() 57 | 58 | logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)\ 59 | - (2*(torch.log(TENSOR2) - pi_action - F.softplus(-2*pi_action))).sum(axis=-1) 60 | 61 | pi_action = torch.tanh(pi_action) 62 | pi_action = self.act_len * pi_action 63 | 64 | return pi_action, logp_pi 65 | 66 | else: 67 | # Only used at test. 
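# With stochastic=False the Gaussian is not sampled: the mean is squashed
# through tanh and rescaled by act_len, so evaluation rollouts are
# deterministic and no log-probability is returned.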
68 | pi_action = torch.tanh(mu) 69 | pi_action = self.act_len * pi_action 70 | return pi_action, None 71 | 72 | 73 | 74 | 75 | 76 | class SAC_net(sActorCritic): 77 | 78 | def __init__(self, act_limit, act_space_type:str, 79 | Actor_type, Actor_para:dict, 80 | Q_type, Q_para:dict): 81 | 82 | super().__init__(act_limit, act_space_type) 83 | self.pi = Actor_type(act_limit,**Actor_para) 84 | self.q1 = Q_type(**Q_para) 85 | self.q2 = Q_type(**Q_para) 86 | 87 | -------------------------------------------------------------------------------- /examples/ks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import torch 4 | import numpy as np 5 | from src.ModelBase.dynav2 import * 6 | from src.envs import * 7 | from src.NN import model 8 | from RLalgo.td3 import TD3 9 | 10 | if __name__=='__main__': 11 | from src.NN.RL.base import MLPActor, MLPQ 12 | random.seed(0) 13 | torch.manual_seed(0) 14 | np.random.seed(0) 15 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 16 | 17 | # set GPU device or use CPU 18 | device=torch.device('cuda:0') 19 | torch.cuda.set_device(device) 20 | 21 | # disable TF32 22 | torch.backends.cuda.matmul.allow_tf32 = False 23 | torch.backends.cudnn.allow_tf32 = False 24 | 25 | # set batch size 26 | RL_batchsize=128 27 | env_batchsize=400 28 | 29 | # set environment, OpenAI Gym is supported, you can define your own environment in envs.py 30 | realenv = ks(device=device) 31 | 32 | # set the model(support both physicas-informed and non-physicas-informed) 33 | fakeenv = model.fake_ks_env(env_batchsize,ratio=1,forward_size=400) 34 | 35 | # change the directory to save results 36 | # os.chdir(os.path.expanduser("~")+'/store/RL/ks/td3/free3') 37 | 38 | # define the RL network and other hyperparameters 39 | RLinp = {"env":fakeenv, # the surogate environment defined above 40 | 'Actor':MLPActor, # the type of policy network, src/NN/RL 41 | 'Q': MLPQ, # the type of value function network, src/NN/RL 42 | 'act_space_type':'c', # the type of action space, 'c' for continuous, 'd' for discrete 43 | 'a_kwargs':dict(activation=nn.ReLU, 44 | hidden_sizes=[256]*2, 45 | output_activation=nn.Tanh),# the hyperparameters of the network 46 | 'ep_type':'inf', # the type of episode, 'inf' for infinite, 'finite' for finite (only inf is supported for now) 47 | 'max_ep_len':400, # the maximum length of an episode 48 | 'gamma':0.977, # the discount factor 49 | 'replay_size':int(5e5) # the max size of the replay buffer 50 | } 51 | RL = TD3(**RLinp) 52 | 53 | # define the dyna hyperparameters 54 | mb = dyna(RL, 55 | realenv, 56 | False, # whether to use the physicas-informed model 57 | env_batchsize, 58 | real_buffer_size=int(5e5)) 59 | 60 | mb(epoch=1600000, 61 | real_policy_action_after = 16000, 62 | fake_policy_action_after = 16000, 63 | update_after = 12000, 64 | RL_batch_size=RL_batchsize, 65 | test_every=3, 66 | num_test_episodes=200, # the number of episodes used to test the performance of the RL agent 67 | RL_update_iter=50, # the number of RL update for each iteration 68 | noiselist=torch.linspace(0.2,0.2,int(16e5)), # artificial noise added to actions 69 | phy_train_max_iter=21, 70 | fake_env_loss_criteria=0.01, # the criteria for the beginning of using the fake environment 71 | env_train_start_size=6000, 72 | fake_len=3, 73 | usemodel=True, # use model-based RL or model-free RL 74 | RL_loop=10, 75 | refresh_RL_buffer_interval=3) -------------------------------------------------------------------------------- 
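The comments in `examples/ks.py` note that custom environments can be added in `src/envs.py`. As a rough sketch (not part of the repository), the rest of the code appears to expect an environment exposing `obs_dim`, `act_dim`, an `act_limit` tensor with one `[min, max]` row per action dimension, `reset()`, `step()`, and an `action_space` with `sample()`, judging from how `OffPolicy.__init__` and the interact helpers use it. The class name and the dynamics/reward below are placeholders:

import torch

class MyCustomEnv:
    """Placeholder environment sketch; the dynamics and reward are illustrative only."""
    def __init__(self):
        self.obs_dim = 4
        self.act_dim = 1
        # one [min, max] row per action dimension, as in the built-in examples
        self.act_limit = torch.tensor([[-1.0, 1.0]])
        self.action_space = self          # so env.action_space.sample() works
        self.state = torch.zeros(self.obs_dim)

    def sample(self):
        lo, hi = self.act_limit[:, 0], self.act_limit[:, 1]
        return lo + (hi - lo) * torch.rand(self.act_dim)

    def reset(self):
        self.state = torch.zeros(self.obs_dim)
        return self.state

    def step(self, action):
        # placeholder linear dynamics with a quadratic penalty as the reward
        self.state = 0.99 * self.state + 0.01 * action.mean()
        reward = -(self.state ** 2).sum()
        done = False
        return self.state, reward, done, {}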
/src/RLalgo/base.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from copy import deepcopy 3 | import torch.nn as nn 4 | import src.utility.RL as RL 5 | from src.envs import env_base 6 | from utility.utils import * 7 | from utility.RL import * 8 | from NN.RL.base import ActorCritic 9 | 10 | 11 | class OnPolicy(object): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | class OffPolicy(object): 16 | """ 17 | The base class for off policy 18 | --------- 19 | - update 20 | 21 | """ 22 | def __init__(self, 23 | env:Union[gym.wrappers.time_limit.TimeLimit, env_base, nn.Module], 24 | replay_size:int=int(1e6), batch_size:int=100, 25 | interact_type:str = 's', max_ep_len=200, 26 | ep_type:str='inf', actlimit=None,device=None): 27 | 28 | super().__init__() 29 | self.ac:ActorCritic 30 | self.mseloss = nn.MSELoss() 31 | self.env = env 32 | self.batch_size = batch_size 33 | if isinstance(self.env,gym.wrappers.time_limit.TimeLimit): 34 | self.obs_dim = self.env.observation_space.shape[0] 35 | self.act_dim = 1 36 | self.act_limit = actlimit 37 | 38 | else: 39 | self.obs_dim = self.env.obs_dim 40 | self.act_dim = self.env.act_dim 41 | self.act_limit = self.env.act_limit 42 | self.buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=replay_size,device=device) 43 | self.interact_type = interact_type 44 | if type(interact_type)==str: 45 | self.interact_env = getattr(RL,'interact_env_'+interact_type) 46 | self.max_ep_len = max_ep_len 47 | self.ep_type = ep_type 48 | 49 | def update(self): 50 | """ 51 | update RL policy 52 | """ 53 | raise NotImplementedError 54 | 55 | def __call__(self, epoch,policy_action_after,update_after,update_every, 56 | RL_update_iter,batch_size,test_every,num_test_episodes,noiselist): 57 | 58 | o, ep_ret, ep_len = self.env.reset(), 0, 0 59 | self.num_test_episodes = num_test_episodes 60 | testenv = deepcopy(self.env) 61 | o = nptotorch(o) 62 | testcount = 0 63 | returnlist=[] 64 | sizelist=[] 65 | import time 66 | for i in range(epoch): 67 | with torch.no_grad(): 68 | if i > policy_action_after: 69 | a = self.ac.get_action(o, noiselist[0]) 70 | else: 71 | a = self.env.action_space.sample() 72 | o, ep_ret, ep_len = self.interact_env(a, o, ep_ret, ep_len,self.env,self.buffer,self.max_ep_len,self.ep_type) 73 | 74 | if i >= update_after and i % update_every == 0: 75 | start = time.time() 76 | for _ in range(RL_update_iter): 77 | batch = self.buffer.sample_batch(batch_size) 78 | self.update(data=batch) 79 | print(time.time()-start) 80 | if (i+1)%test_every==0 and i>=update_after or i==0: 81 | testcount+=1 82 | ret,max,min=test_RL(testenv,num_test_episodes,self.max_ep_len, self.ac,i=self.buffer.size) 83 | returnlist.append([ret,max,min]) 84 | sizelist.append(self.buffer.size) 85 | print('RL buffer size: {}\tTest: {} th\t Retrun: {}'.format(self.buffer.size,testcount,ret)) 86 | returnhis = np.zeros([4,len(returnlist)]) 87 | returnhis[1:] = np.array(returnlist).T 88 | returnhis[0]=np.array(sizelist) 89 | np.save('free',returnhis) 90 | -------------------------------------------------------------------------------- /examples/burgers.py: -------------------------------------------------------------------------------- 1 | from src.NN import model 2 | from utility.RL import * 3 | from envs import * 4 | from ModelBase.dyna import dyna 5 | from torch.utils.data import DataLoader 6 | 7 | class phyburgers(dyna): 8 | def __init__(self, RL, env, phyloss_flag, env_batch_size, fake_buffer_size) -> None: 9 | 
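# phyburgers specializes dyna for the Burgers case: both buffers are replaced
# with BufferforRef instances (see src/utility/RL.py), which additionally store
# a `len` field that trainenv passes to the surrogate model as an extra input.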
super().__init__(RL, env, phyloss_flag, env_batch_size=env_batch_size, real_buffer_size=fake_buffer_size) 10 | self.real_buffer = BufferforRef(obs_dim=self.RL.obs_dim, act_dim=self.RL.act_dim, size=fake_buffer_size) 11 | self.RL.buffer = BufferforRef(obs_dim=self.RL.obs_dim, act_dim=self.RL.act_dim, size=self.RL.buffer.max_size) 12 | 13 | def trainenv(self,buffer,max_train_iter,dataloss=True,phyloss=False,num_batch=5,printflag=False,trainflag=True): 14 | i=0 15 | self.RL.env.train() 16 | for p in self.RL.env.parameters(): 17 | p.requires_grad = True 18 | data = bufferdataref(buffer,self.env_batch_size*num_batch) 19 | loader = DataLoader(dataset=data, batch_size=self.env_batch_size,shuffle=True) 20 | 21 | while True: 22 | j=0 23 | losssum=0 24 | for o, o2, a, len in loader: 25 | j+=1 26 | self.optimizer.zero_grad() 27 | myo2=self.RL.env(o, a, len) 28 | # loss = (dataloss)*self.mseloss(myo2,o2) + (phyloss)*self.RL.env.phyloss_f(o, a) 29 | 30 | # loss.backward() 31 | # self.optimizer.step() 32 | if phyloss: 33 | if not dataloss: 34 | loss = self.RL.env.phyloss_f(o,a) 35 | else: loss = self.mseloss(myo2,o2) + self.RL.env.phyloss_f(o,a) 36 | else: 37 | loss = self.mseloss(myo2,o2) 38 | losssum+=loss.detach() 39 | if trainflag==True: 40 | loss.backward() 41 | self.optimizer.step() 42 | i+=1 43 | avgloss = (losssum/j).item() 44 | 45 | i+=1 46 | if avgloss<1e-6 or i>max_train_iter: 47 | if printflag: 48 | print( 49 | 'Epoch: {}\tCase in Buffer: {}\tModel loss: {}'.format( 50 | i,buffer.size,avgloss)) 51 | break 52 | return avgloss 53 | 54 | 55 | if __name__=='__main__': 56 | 57 | from NN.RL.base import MLPActor, MLPQ 58 | from RLalgo.td3 import TD3 59 | import torch.nn as nn 60 | import random 61 | import os 62 | 63 | random.seed(0) 64 | torch.manual_seed(0) 65 | np.random.seed(0) 66 | 67 | RL_batchsize=120 68 | env_batchsize=120 69 | realenv = burgers() 70 | fakeenv = model.fake_burgers_env(env_batchsize,ratio=1) 71 | #os.chdir('/home/lxy/store/RL/burgers/td3') 72 | 73 | RLinp = {"env":fakeenv, 74 | 'Actor':MLPActor, 75 | "Q":MLPQ, 76 | 'act_space_type':'c', 77 | 'a_kwargs':dict(activation=nn.ReLU, 78 | hidden_sizes=[256]*2, 79 | output_activation=nn.Tanh), 80 | 'ep_type':'inf', 81 | 'max_ep_len':60} 82 | RL = TD3(**RLinp) 83 | mb = phyburgers(RL,realenv,False,env_batchsize,50000) 84 | 85 | mb(150,240,240,120,update_every=120 ,RL_batch_size=RL_batchsize, 86 | test_every=2, num_test_episodes=10, RL_loop_per_epoch=16, 87 | RL_update_iter = 50, env_train_start_size=120, 88 | noiselist=torch.linspace(0.2,0.2,int(16e4)), 89 | data_train_max_iter=50, mixed_train=False, 90 | fake_env_loss_criteria=0.02, usemodel=False,noref_flag=False) 91 | 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PiMBRL 2 | This repo provides code for our paper [Physics-informed Dyna-style model-based deep reinforcement learning for dynamic control](https://royalsocietypublishing.org/doi/pdf/10.1098/rspa.2021.0618) ([arXiv version](https://arxiv.org/abs/2108.00128)), implemented in Pytorch. 3 | * Authors: Xin-Yang Liu \[ [Google Scholar](https://scholar.google.com/citations?user=DI9KTLoAAAAJ&hl=en) \], Jian-Xun Wang \[ [Google Scholar](https://scholar.google.com/citations?user=1cXHUD4AAAAJ&hl=en) | [Homepage](http://sites.nd.edu/jianxun-wang/) \] 4 | 5 | 6 |
 7 | ![An uncontrolled KS environment](docs/uncontrolled.png)
 8 | 
 9 | *An uncontrolled KS environment.*
10 | 
11 | ![An RL-controlled KS environment](docs/controlled.png)
12 | 
13 | *An RL-controlled KS environment.*
14 | 
15 | ![PiMBRL performance vs. the model-free RL baseline](docs/performance.png)
16 | 
17 | *PiMBRL performance vs. the model-free RL baseline.*
18 | *(Vanilla model-based RL failed to converge and is therefore not shown in this figure.)*
19 | 
20 | 
21 | 
22 | 
23 | 
24 | 
25 | ## Abstract
26 | Model-based reinforcement learning (MBRL) is believed to have much higher sample efficiency than model-free algorithms because it learns a predictive model of the environment. However, the performance of MBRL relies heavily on the quality of the learned model, which is usually built in a black-box manner and may have poor predictive accuracy outside of the data distribution. The deficiencies of the learned model may prevent the policy from being fully optimized. Although some uncertainty-analysis-based remedies have been proposed to alleviate this issue, model bias still poses a great challenge for MBRL. In this work, we propose to leverage prior knowledge of the underlying physics of the environment, where the governing laws are (partially) known. In particular, we developed a physics-informed MBRL framework in which governing equations and physical constraints are used to inform the model learning and policy search. By incorporating this prior information about the environment, the quality of the learned model can be notably improved, while the required interactions with the environment are significantly reduced, leading to better sample efficiency and learning performance. The effectiveness and merit have been demonstrated on a handful of classic control problems, where the environments are governed by canonical ordinary/partial differential equations.
27 | 
28 | ## Code structure:
29 | * `src/` contains the source code of the framework.
30 |     * `NN/` includes the deep neural network related code.
31 |         * `RL/` includes the policy & Q-function networks
32 |         * `model.py` contains the surrogate (learned environment) models
33 |     * `RLalgo/` contains the RL algorithms, including `DDPG`, `TD3`, and `SAC`.
34 |     * `ModelBase/`
35 |         * `dyna.py` & `dynav2.py` contain the code for the dyna-style algorithms.
36 |     * `utility/` useful tools.
37 |     * `envs.py` contains self-defined environments that mimic the Gym environments.
38 | 
39 | * `examples/` contains a set of examples of using the framework.
40 |     * `ks.py` a one-dimensional environment governed by the KS equation.
41 |     * `cartpole.py` the classic cartpole problem
42 |     * `pendu.py` a modified pendulum problem
43 |     * `burgers.py` a one-dimensional environment governed by Burgers' equation.
44 | 
45 | * `.env` adds `./src/` to `PYTHONPATH`.
46 | 
47 | ## Usage:
48 | Please refer to `examples/ks.py` for usage.
49 | 
50 | ## Requirements:
51 | ```
52 | python>=3.8.8
53 | pytorch==1.8.1
54 | gym==0.19.0
55 | numpy>=1.19.2
56 | ```
57 | 
58 | ## Citation
59 | If you find this repo useful in your research, please consider citing our paper: [Physics-informed Dyna-style model-based deep reinforcement learning for dynamic control](https://royalsocietypublishing.org/doi/pdf/10.1098/rspa.2021.0618).
60 | 
61 | ```
62 | @article{liu2021physics,
63 |   title={Physics-informed Dyna-style model-based deep reinforcement learning for dynamic control},
64 |   author={Liu, Xin-Yang and Wang, Jian-Xun},
65 |   journal={Proceedings of the Royal Society A},
66 |   volume={477},
67 |   number={2255},
68 |   pages={20210618},
69 |   year={2021},
70 |   publisher={The Royal Society}
71 | }
72 | ```
73 | 
74 | ## Problems
75 | If you find any bugs in the code or have trouble running PiMBRL on your machine, you are very welcome to [open an issue](https://github.com/Jianxun-Wang/PIMBRL/issues) in this repository.
76 | 
77 | 
78 | ## Acknowledgements
79 | The code in `src/RLalgo` is inspired by OpenAI's [spinningup](https://spinningup.openai.com/en/latest/).
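For orientation, the sketch below condenses the wiring used in `examples/ks.py` (run with `./src/` on `PYTHONPATH`, as set by `.env`). The device and the hyperparameter values here are illustrative placeholders, not the settings used in the paper; see the example scripts for those.

```python
import torch
import torch.nn as nn
from src.ModelBase.dynav2 import dyna      # dyna-style MBRL driver
from src.envs import ks                    # "real" environment (KS equation)
from src.NN import model                   # surrogate (learned) environments
from src.NN.RL.base import MLPActor, MLPQ
from RLalgo.td3 import TD3

env_batchsize = 400
realenv = ks(device=torch.device('cpu'))
fakeenv = model.fake_ks_env(env_batchsize, ratio=1, forward_size=400)

# TD3 agent that trains against the surrogate environment
RL = TD3(env=fakeenv, Actor=MLPActor, Q=MLPQ, act_space_type='c',
         a_kwargs=dict(activation=nn.ReLU, hidden_sizes=[256]*2,
                       output_activation=nn.Tanh),
         ep_type='inf', max_ep_len=400, replay_size=int(5e5))

# dyna wrapper alternating real rollouts, model training, and policy updates
mb = dyna(RL, realenv, False,              # False = no physics-informed loss
          env_batchsize, real_buffer_size=int(5e5))
mb(epoch=100, real_policy_action_after=16000, fake_policy_action_after=16000,
   update_after=12000, RL_batch_size=128, test_every=3, num_test_episodes=10,
   RL_update_iter=50, noiselist=torch.linspace(0.2, 0.2, int(16e5)),
   usemodel=True)
```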
80 | -------------------------------------------------------------------------------- /src/RLalgo/ddpg.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import torch 3 | from torch.optim import Adam 4 | import gym 5 | from NN.RL.deterministic import DDPG_net 6 | from utility.utils import * 7 | from RLalgo.base import OffPolicy 8 | 9 | ''' 10 | Two buffer, Two env 11 | ''' 12 | 13 | class DDPG(OffPolicy): 14 | def __init__(self,env, Actor,Q, act_space_type,a_kwargs:dict, q_kwargs=None, 15 | replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, 16 | interact_type = 's', max_ep_len=200, ep_type='inf',actlimit=None): 17 | 18 | super(DDPG,self).__init__(env, replay_size, batch_size, 19 | interact_type, max_ep_len, ep_type,actlimit) 20 | a_kwargs = {'obs_dim':self.obs_dim, 21 | 'act_dim':self.act_dim, 22 | **a_kwargs} 23 | if q_kwargs == None: 24 | q_kwargs={'obs_dim':self.obs_dim, 25 | 'act_dim':self.act_dim, 26 | **a_kwargs} 27 | # Create actor-critic module and target networks 28 | self.ac = DDPG_net(self.act_limit, act_space_type, 29 | Actor, 30 | {'obs_dim':self.obs_dim, 31 | 'act_dim':self.act_dim, 32 | **a_kwargs}, 33 | Q, 34 | {'obs_dim':self.obs_dim, 35 | 'act_dim':self.act_dim, 36 | **q_kwargs}) 37 | # self.ac.float() 38 | self.ac_targ = deepcopy(self.ac) 39 | 40 | self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=pi_lr) 41 | self.q_optimizer = Adam(self.ac.q.parameters(), lr=q_lr) 42 | 43 | self.gamma = gamma 44 | self.polyak = polyak 45 | for p in self.ac_targ.parameters(): 46 | p.requires_grad = False 47 | 48 | 49 | def compute_loss_q(self,data): 50 | o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] 51 | 52 | q = self.ac.q(o,a) 53 | 54 | # Bellman backup for Q function 55 | with torch.no_grad(): 56 | q_pi_targ = self.ac_targ.q(o2, self.ac_targ.pi(o2)) 57 | backup = r + self.gamma * (1 - d) * q_pi_targ 58 | 59 | # MSE loss against Bellman backup 60 | loss_q = self.mseloss(q,backup) 61 | return loss_q 62 | 63 | # Set up function for computing DDPG pi loss 64 | def compute_loss_pi(self,data): 65 | o = data['obs'] 66 | q_pi = self.ac.q(o, self.ac.pi(o)) 67 | return -q_pi.mean() 68 | 69 | # Set up optimizers for policy and q-function 70 | 71 | def update(self,data): 72 | # First run one gradient descent step for Q. 73 | self.q_optimizer.zero_grad() 74 | loss_q = self.compute_loss_q(data) 75 | loss_q.backward() 76 | self.q_optimizer.step() 77 | 78 | # Freeze Q-network 79 | for p in self.ac.q.parameters(): 80 | p.requires_grad = False 81 | 82 | self.pi_optimizer.zero_grad() 83 | loss_pi = self.compute_loss_pi(data) 84 | loss_pi.backward() 85 | self.pi_optimizer.step() 86 | 87 | # Unfreeze Q-network 88 | for p in self.ac.q.parameters(): 89 | p.requires_grad = True 90 | 91 | # Update target networks by polyak averaging. 
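# The target parameters follow the online parameters as an exponential moving
# average, p_targ <- polyak * p_targ + (1 - polyak) * p. With the default
# polyak=0.995 the targets drift slowly, which keeps the Bellman backup in
# compute_loss_q from chasing a rapidly changing Q estimate.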
92 | with torch.no_grad(): 93 | for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()): 94 | # in-place operations "mul_", "add_" to update target 95 | p_targ.data.mul_(self.polyak) 96 | p_targ.data.add_((1 - self.polyak) * p.data) 97 | 98 | 99 | 100 | if __name__=='__main__': 101 | import torch.nn as nn 102 | from NN.RL.base import MLPActor, MLPQ 103 | RL = DDPG(gym.make('CartPole-v0'), 104 | Actor=MLPActor, Q=MLPQ, 105 | a_kwargs=dict(activation=nn.ReLU, 106 | hidden_sizes=[256]*2, 107 | output_activation=nn.Tanh), 108 | act_space_type= 'd', 109 | actlimit=torch.tensor([[0,1]])) 110 | testenv = gym.make('CartPole-v0') 111 | para={'epoch':100000,'policy_action_after':1600,'update_after':1600, 112 | 'update_every':100,'RL_update_iter':50,'batch_size':128, 113 | 'test_every':2000,'num_test_episodes':100, 114 | 'noiselist':torch.linspace(0.2,0.2,int(16e4))} 115 | RL(**para) -------------------------------------------------------------------------------- /src/RLalgo/td3.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import itertools 3 | import torch 4 | from torch.optim import Adam 5 | from NN.RL.deterministic import TD3_net 6 | from utility.utils import * 7 | from RLalgo.base import OffPolicy 8 | 9 | class TD3(OffPolicy): 10 | def __init__(self,env, Actor, Q, act_space_type, a_kwargs:dict, q_kwargs=None, 11 | replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 12 | batch_size=100, interact_type = 's',max_ep_len=1000, policy_delay=2, 13 | ep_type='inf',actlimit=None): 14 | 15 | super(TD3, self).__init__(env, replay_size, batch_size, 16 | interact_type, max_ep_len, ep_type, actlimit) 17 | a_kwargs = {'obs_dim':self.obs_dim, 18 | 'act_dim':self.act_dim, 19 | **a_kwargs} 20 | if q_kwargs == None: 21 | q_kwargs={'obs_dim':self.obs_dim, 22 | 'act_dim':self.act_dim, 23 | **a_kwargs} 24 | self.ac = TD3_net(self.act_limit, 25 | act_space_type, 26 | Actor, a_kwargs, 27 | Q, q_kwargs) 28 | 29 | self.ac_targ = deepcopy(self.ac) 30 | self.gamma = gamma 31 | self.polyak = polyak 32 | 33 | self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()) 34 | self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=pi_lr) 35 | self.q_optimizer = Adam(self.q_params, lr=q_lr) 36 | 37 | for p in self.ac_targ.parameters(): 38 | p.requires_grad = False 39 | self.policy_delay = policy_delay 40 | self.timer=0 41 | 42 | def compute_loss_q(self,data): 43 | o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] 44 | 45 | q1 = self.ac.q1(o,a) 46 | q2 = self.ac.q2(o,a) 47 | 48 | # Bellman backup for Q functions 49 | with torch.no_grad(): 50 | 51 | a2 = self.ac_targ.get_action(o2) 52 | 53 | # Target Q-values 54 | q1_pi_targ = self.ac_targ.q1(o2, a2) 55 | q2_pi_targ = self.ac_targ.q2(o2, a2) 56 | q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) 57 | backup = r + self.gamma * (1 - d) * q_pi_targ 58 | 59 | # MSE loss against Bellman backup 60 | loss_q1 = ((q1 - backup)**2).mean() 61 | loss_q2 = ((q2 - backup)**2).mean() 62 | loss_q = loss_q1 + loss_q2 63 | 64 | return loss_q 65 | 66 | # Set up function for computing TD3 pi loss 67 | def compute_loss_pi(self, data): 68 | o = data['obs'] 69 | q1_pi = self.ac.q1(o, self.ac.pi(o)) 70 | return -q1_pi.mean() 71 | 72 | 73 | def update(self, data): 74 | # First run one gradient descent step for Q1 and Q2 75 | self.q_optimizer.zero_grad() 76 | loss_q = self.compute_loss_q(data) 77 | loss_q.backward() 78 | self.q_optimizer.step() 79 | 80 | 
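# TD3 departs from DDPG at this point: compute_loss_q above already takes the
# minimum of the two critics (clipped double-Q), and the actor and target
# networks below are refreshed only on every policy_delay-th call to update(),
# tracked by self.timer (delayed policy updates).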
81 | # Possibly update pi and target networks 82 | if self.timer % self.policy_delay == 0: 83 | 84 | # Freeze Q-networks so you don't waste computational effort 85 | # computing gradients for them during the policy learning step. 86 | for p in self.q_params: 87 | p.requires_grad = False 88 | 89 | # Next run one gradient descent step for pi. 90 | self.pi_optimizer.zero_grad() 91 | loss_pi = self.compute_loss_pi(data) 92 | loss_pi.backward() 93 | self.pi_optimizer.step() 94 | 95 | # Unfreeze Q-networks so you can optimize it at next DDPG step. 96 | for p in self.q_params: 97 | p.requires_grad = True 98 | 99 | # Finally, update target networks by polyak averaging. 100 | with torch.no_grad(): 101 | for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()): 102 | # NB: We use an in-place operations "mul_", "add_" to update target 103 | # params, as opposed to "mul" and "add", which would make new tensors. 104 | p_targ.data.mul_(self.polyak) 105 | p_targ.data.add_((1 - self.polyak) * p.data) 106 | self.timer += 1 107 | 108 | 109 | 110 | if __name__=='__main__': 111 | import torch.nn as nn 112 | from NN.RL.base import MLPActor, MLPQ 113 | import gym 114 | RL = TD3(gym.make('CartPole-v0'), 115 | Actor=MLPActor, Q=MLPQ, 116 | a_kwargs=dict(activation=nn.ReLU, 117 | hidden_sizes=[256]*2, 118 | output_activation=nn.Tanh), 119 | act_space_type= 'd', 120 | actlimit=torch.tensor([[0,1]]), 121 | max_ep_len=200,) 122 | para={'epoch':100000,'policy_action_after':1600,'update_after':1600, 123 | 'update_every':100,'RL_update_iter':50,'batch_size':128, 124 | 'test_every':2000,'num_test_episodes':100, 125 | 'noiselist':torch.linspace(0.2,0.2,int(16e4))} 126 | RL(**para) -------------------------------------------------------------------------------- /src/RLalgo/sac.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import List 3 | import itertools 4 | import torch 5 | from torch.optim import Adam 6 | from NN.RL.stochastic import SAC_net 7 | from utility.utils import * 8 | from RLalgo.base import OffPolicy 9 | 10 | class SAC(OffPolicy): 11 | 12 | def __init__(self, env, Actor, Q, act_space_type, a_kwargs:dict, q_kwargs:dict=None, 13 | replay_size=int(1e6), batch_size=100, interact_type = 's', max_ep_len = 1000, 14 | ep_type='inf', pi_lr=1e-3, q_lr=1e-3, gamma=0.99, 15 | polyak=0.995, alpha=0.2, device=None): 16 | 17 | super(SAC, self).__init__(env, replay_size, batch_size, 18 | interact_type, max_ep_len, ep_type, env.act_limit,device) 19 | 20 | a_kwargs = {'obs_dim':self.obs_dim, 21 | 'act_dim':self.act_dim, 22 | **a_kwargs} 23 | if q_kwargs == None: 24 | q_kwargs={'obs_dim':self.obs_dim, 25 | 'act_dim':self.act_dim, 26 | **a_kwargs} 27 | self.ac:SAC_net = SAC_net(self.act_limit, 28 | act_space_type, 29 | Actor, a_kwargs, 30 | Q, q_kwargs) 31 | if device != None: 32 | self.ac=self.ac.to(device) 33 | self.ac.pi.act_len = self.ac.pi.act_len.to(device) 34 | self.ac_targ = deepcopy(self.ac) 35 | 36 | self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()) 37 | self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=pi_lr) 38 | self.q_optimizer = Adam(self.q_params, lr=q_lr) 39 | 40 | 41 | # Freeze target networks with respect to optimizers (only update via polyak averaging) 42 | for p in self.ac_targ.parameters(): 43 | p.requires_grad = False 44 | 45 | self.gamma = gamma 46 | self.polyak = polyak 47 | self.alpha = alpha 48 | 49 | # Set up function for computing SAC Q-losses 50 | def 
compute_loss_q(self, data): 51 | o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] 52 | 53 | q1 = self.ac.q1(o,a) 54 | q2 = self.ac.q2(o,a) 55 | 56 | # Bellman backup for Q functions 57 | with torch.no_grad(): 58 | # Target actions come from *current* policy 59 | a2, logp_a2 = self.ac.pi(o2) 60 | 61 | # Target Q-values 62 | q1_pi_targ = self.ac_targ.q1(o2, a2) 63 | q2_pi_targ = self.ac_targ.q2(o2, a2) 64 | q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) 65 | backup = r + self.gamma * (1 - d) * (q_pi_targ - self.alpha * logp_a2) 66 | 67 | # MSE loss against Bellman backup 68 | loss_q1 = ((q1 - backup)**2).mean() 69 | loss_q2 = ((q2 - backup)**2).mean() 70 | loss_q = loss_q1 + loss_q2 71 | 72 | return loss_q 73 | 74 | # Set up function for computing SAC pi loss 75 | def compute_loss_pi(self, data): 76 | o = data['obs'] 77 | pi, logp_pi = self.ac.pi(o) 78 | q1_pi = self.ac.q1(o, pi) 79 | q2_pi = self.ac.q2(o, pi) 80 | q_pi = torch.min(q1_pi, q2_pi) 81 | 82 | # Entropy-regularized policy loss 83 | loss_pi = (self.alpha * logp_pi - q_pi).mean() 84 | 85 | return loss_pi 86 | 87 | 88 | def update(self, data): 89 | # First run one gradient descent step for Q1 and Q2 90 | self.q_optimizer.zero_grad() 91 | loss_q = self.compute_loss_q(data) 92 | loss_q.backward() 93 | self.q_optimizer.step() 94 | 95 | # Freeze Q-networks so you don't waste computational effort 96 | # computing gradients for them during the policy learning step. 97 | for p in self.q_params: 98 | p.requires_grad = False 99 | 100 | # Next run one gradient descent step for pi. 101 | self.pi_optimizer.zero_grad() 102 | loss_pi = self.compute_loss_pi(data) 103 | loss_pi.backward() 104 | self.pi_optimizer.step() 105 | 106 | # Unfreeze Q-networks so you can optimize it at next DDPG step. 107 | for p in self.q_params: 108 | p.requires_grad = True 109 | 110 | 111 | # Finally, update target networks by polyak averaging. 112 | with torch.no_grad(): 113 | for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()): 114 | # NB: We use an in-place operations "mul_", "add_" to update target 115 | # params, as opposed to "mul" and "add", which would make new tensors. 
116 | p_targ.data.mul_(self.polyak) 117 | p_targ.data.add_((1 - self.polyak) * p.data) 118 | 119 | if __name__=='__main__': 120 | import torch.nn as nn 121 | from NN.RL.base import MLPQ 122 | from NN.RL.stochastic import GaussianMLPActor 123 | import gym 124 | RL = SAC(gym.make('CartPole-v0'), 125 | Actor=GaussianMLPActor, Q=MLPQ, 126 | a_kwargs=dict(activation=nn.ReLU, 127 | hidden_sizes=[256]*2, 128 | output_activation=nn.Tanh), 129 | act_space_type= 'd',) 130 | testenv = gym.make('CartPole-v0') 131 | para={'epoch':100000,'policy_action_after':1600,'update_after':1600, 132 | 'update_every':100,'RL_update_iter':50,'batch_size':128, 133 | 'test_every':2000,'num_test_episodes':100, 134 | 'noiselist':torch.linspace(0.2,0.2,int(16e4))} 135 | RL(**para) -------------------------------------------------------------------------------- /src/ModelBase/dyna.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import torch 3 | import torch.nn as nn 4 | from torch.utils.data import DataLoader 5 | from src.RLalgo.base import OffPolicy 6 | 7 | from utility.utils import * 8 | from utility.RL import * 9 | 10 | class dyna(object): 11 | ''' 12 | class for dyna-style MBRL 13 | ''' 14 | def __init__(self,RL: OffPolicy, env,phyloss_flag, 15 | env_batch_size=200,real_buffer_size = 10000) -> None: 16 | super().__init__() 17 | self.RL = RL 18 | self.real_env = env 19 | self.test_env = deepcopy(self.real_env) 20 | self.optimizer = torch.optim.Adam(self.RL.env.parameters(),lr=1e-3) 21 | 22 | self.phyloss_flag = phyloss_flag 23 | self.env_batch_size = env_batch_size 24 | self.mseloss = nn.MSELoss() 25 | 26 | self.interact_env = self.RL.interact_env 27 | self.real_buffer = ReplayBuffer(obs_dim=self.RL.obs_dim, act_dim=self.RL.act_dim, size=real_buffer_size) 28 | 29 | def trainenv(self,buffer,max_train_iter,dataloss=True,phyloss=False,num_batch=20,printflag=False,trainflag=True): 30 | i=0 31 | data = bufferdata(buffer,self.env_batch_size*num_batch) 32 | loader = DataLoader(dataset=data, batch_size=self.env_batch_size,shuffle=True) 33 | while True: 34 | j=0 35 | losssum=0 36 | for o, o2, a in loader: 37 | j+=1 38 | self.optimizer.zero_grad() 39 | myo2=self.RL.env(o,a) 40 | if phyloss: 41 | if not dataloss: 42 | loss = self.RL.env.phyloss_f(o,myo2,a) 43 | else: loss = self.mseloss(myo2,o2) + self.RL.env.phyloss_f(o,myo2,a) 44 | else: 45 | loss = self.mseloss(myo2,o2) 46 | losssum+=loss.detach() 47 | if trainflag==True: 48 | loss.backward() 49 | self.optimizer.step() 50 | i+=1 51 | avgloss = (losssum/j).item() 52 | if avgloss < 1e-6 or i >= max_train_iter: 53 | if printflag: 54 | print( 55 | 'Epoch: {}\tCase in Buffer: {}\tModel loss: {}' 56 | .format(i,buffer.size,avgloss)) 57 | break 58 | return avgloss 59 | 60 | def __call__(self, epoch, real_policy_action_after,fake_policy_action_after,update_after,update_every, 61 | RL_batch_size,test_every,num_test_episodes,RL_update_iter,RL_loop_per_epoch, 62 | noiselist,fake_env_loss_criteria=0.008,env_train_start_size=4000,env_num_batch=20, 63 | data_train_max_iter=20,phy_train_max_iter=50,mixed_train=False,usemodel=True, noref_flag=True): 64 | # torch.manual_seed(0) 65 | # np.random.seed(0) 66 | dataloss=1e6 67 | RL_trained_flag = False 68 | fake_o, fake_ep_ret, fake_ep_len = self.RL.env.reset(), 0, 0 69 | o, ep_ret, ep_len = self.real_env.reset(), 0, 0 70 | o=nptotorch(o) 71 | returnlist=[] 72 | sizelist=[] 73 | for i in range(epoch): 74 | for j in range(1,self.env_batch_size+1): 75 | with torch.no_grad(): 76 | 
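# Real-environment data collection: actions are sampled randomly until roughly
# real_policy_action_after interactions have been gathered (the i*j check),
# after which the current policy plus exploration noise from noiselist is used.
# Each transition is stored in both the real buffer and the RL buffer.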
if i*j>=real_policy_action_after: a = self.RL.ac.get_action(o, noiselist[0]) 77 | else: a = self.real_env.action_space.sample() 78 | o, ep_ret, ep_len = self.interact_env(a, o, ep_ret, ep_len,self.real_env, 79 | self.real_buffer,self.RL.max_ep_len,self.RL.ep_type,secondbuffer=self.RL.buffer,noref_flag=noref_flag) 80 | 81 | # if self.real_buffer.size>=env_train_start_size and usemodel: 82 | # # for _ in range(5): 83 | # if self.phyloss_flag: self.trainenv(self.real_buffer,max_train_iter=phy_train_max_iter,dataloss=False,phyloss=True,num_batch=env_num_batch) 84 | # else: self.trainenv(self.real_buffer,max_train_iter=data_train_max_iter,phyloss=False,num_batch=env_num_batch,printflag=False) 85 | # dataloss=self.trainenv(self.real_buffer,max_train_iter=data_train_max_iter,phyloss=False,num_batch=env_num_batch,printflag=True) 86 | 87 | if self.real_buffer.size>=update_after and (self.real_buffer.size % update_every) == 0: 88 | for _ in range(RL_update_iter): 89 | batch = self.real_buffer.sample_batch(RL_batch_size) 90 | self.RL.update(data=batch) 91 | RL_trained_flag=True 92 | 93 | if dataloss= update_after: 96 | 97 | # with torch.no_grad(): 98 | # if self.RL.buffer.size >= fake_policy_action_after: 99 | # a = self.RL.ac.get_action(fake_o,noiselist[t]) 100 | # else: a = self.RL.env.action_space.sample() 101 | # # if self.RL.buffer.size > fake_policy_action_after:policy = self.RL.ac.get_action 102 | # # else: policy=None 103 | # self.RL.env.eval() 104 | # # interact_fakeenvRef(self.real_buffer,self.RL.buffer,self.RL.env,self.env_batch_size,noiselist[self.RL.buffer.size],policy) 105 | # fake_o, fake_ep_ret, fake_ep_len = self.interact_env(a, 106 | # fake_o, fake_ep_ret, fake_ep_len,self.RL.env,self.RL.buffer,self.RL.max_ep_len,'d') 107 | # self.RL.env.train() 108 | 109 | # if self.RL.buffer.size % update_every == 0 and self.RL.buffer.size>update_after: 110 | if (t+1) % update_every == 0 and self.RL.buffer.size>update_after: 111 | for _ in range(RL_update_iter): 112 | batch = self.RL.buffer.sample_batch(RL_batch_size) 113 | self.RL.update(data=batch) 114 | RL_trained_flag=True 115 | 116 | 117 | # if (t+1) % update_every == 0 and self.phyloss_flag and self.RL.buffer.size>env_train_start_size: 118 | # for _ in range(1): 119 | # self.trainenv(self.RL.buffer,phy_train_max_iter,dataloss=False,phyloss=True,num_batch=env_num_batch) 120 | # # self.trainenv(self.RL.buffer,max_train_iter=1,phyloss=False,num_batch=env_num_batch,printflag=True,trainflag=False) 121 | # if (t+1) % update_every == 0 and (not self.phyloss_flag) and self.RL.buffer.size>env_train_start_size: 122 | # # for _ in range(int(10/self.RL.buffer.size*self.real_buffer.size)): 123 | # self.trainenv(self.real_buffer,max_train_iter=phy_train_max_iter,num_batch=env_num_batch) 124 | self.RL.buffer.size,self.RL.buffer.ptr = self.real_buffer.size, self.real_buffer.ptr 125 | 126 | 127 | if (RL_trained_flag and i%test_every==0)or i==0: 128 | ret,max,min=test_RL(self.test_env,num_test_episodes,self.RL.max_ep_len, self.RL.ac,i=self.real_buffer.size) 129 | print('\nbuffer size: {}\t Retrun: {}'.format(self.real_buffer.size,ret)) 130 | returnlist.append([ret,max,min]) 131 | sizelist.append(self.real_buffer.size) 132 | returnhis = np.zeros([4,len(returnlist)]) 133 | returnhis[1:] = np.array(returnlist).T 134 | returnhis[0]=np.array(sizelist) 135 | 136 | # torch.save(self.RL.env,'modelphy1'+str(i)) 137 | # torch.save(self.real_buffer,'bufferphy1') 138 | np.save('free3',returnhis) 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | if 
__name__=='__main__': 149 | import gym 150 | from src.NN import core,model 151 | from src.RLalgo import ddpg 152 | realenv = gym.make('CartPole-v0') 153 | fakeenv = model.fake_cartpole_env() 154 | RLinp = {"env":fakeenv,'actor_critic':core.MLPDDPG,'ac_kwargs':dict(hidden_sizes=[256]*2,act_space_type='d'),'ep_type':'finite'} 155 | mb = dyna(ddpg.DDPG,RLinp,realenv,False) 156 | mb(10,400,1000,50,100,1,10,400,1000) 157 | -------------------------------------------------------------------------------- /src/ModelBase/dynav2.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from random import randint 3 | import torch 4 | import torch.nn as nn 5 | from torch.utils.data import DataLoader 6 | import RLalgo as rla 7 | from .dyna import dyna 8 | from utility.utils import * 9 | from utility.RL import * 10 | import envs 11 | 12 | class dyna(dyna): 13 | 14 | def __call__(self, epoch, real_policy_action_after,fake_policy_action_after,update_after, 15 | RL_batch_size,test_every,num_test_episodes,RL_update_iter, noiselist, 16 | fake_env_loss_criteria=0.008,env_train_start_size=4000,env_num_batch=20, 17 | data_train_max_iter=21,phy_train_max_iter=50,mixed_train=False,fake_len=3, 18 | usemodel = True,RL_loop=5,refresh_RL_buffer_interval=3): 19 | import time 20 | import math 21 | RL_trained_flag = False 22 | avgret = -float('inf') 23 | dataloss = 1e6#float('inf') 24 | count = 0 25 | o, ep_ret, ep_len = self.real_env.reset(), 0, 0 26 | returnlist = [] 27 | sizelist = [] 28 | returngoal = -55 29 | modelgoal = -120 30 | policygoal = -90 31 | rethis = torch.zeros([3],device='cpu') 32 | testcount = 0 33 | modellock = False 34 | k = 0 35 | precollect=True 36 | assert RL_loop > refresh_RL_buffer_interval 37 | if usemodel: 38 | if fake_len >= RL_loop: 39 | assert fake_len % refresh_RL_buffer_interval == 0 40 | block_len = fake_len//refresh_RL_buffer_interval 41 | 42 | else: 43 | block_len = 1 44 | assert RL_loop > fake_len 45 | refresh_RL_buffer_interval = fake_len 46 | 47 | 48 | # main loop 49 | for i in range(epoch): 50 | # interact with real env 51 | for j in range(1,self.env_batch_size+1): 52 | with torch.no_grad(): 53 | if i*j>real_policy_action_after: a = self.RL.ac.get_action(o, noiselist[i*j]) 54 | else: a = self.real_env.action_space.sample() 55 | o, ep_ret, ep_len = self.interact_env(a, o, ep_ret, ep_len,self.real_env, 56 | self.real_buffer,self.RL.max_ep_len,self.RL.ep_type,secondbuffer=self.RL.buffer) 57 | 58 | # train model with data loss 59 | if usemodel and self.real_buffer.size>env_train_start_size: 60 | dataloss = self.trainenv(self.real_buffer,max_train_iter=data_train_max_iter, 61 | phyloss=mixed_train,num_batch=env_num_batch,printflag=True) 62 | 63 | # update policy & value function with real data 64 | if self.real_buffer.size>update_after: 65 | for _ in range(RL_update_iter): 66 | batch = self.real_buffer.sample_batch(RL_batch_size) 67 | self.RL.update(data=batch) 68 | RL_trained_flag=True # to determine wether to test 69 | 70 | # train model with equation loss or balance between data & equation training iterations 71 | if usemodel and self.RL.buffer.size>env_train_start_size: 72 | if self.phyloss_flag: 73 | for _ in range(20): 74 | self.trainenv(self.RL.buffer,phy_train_max_iter, 75 | dataloss=False,phyloss=True,num_batch=env_num_batch) 76 | else: 77 | for _ in range(20): 78 | self.trainenv(self.RL.buffer,phy_train_max_iter, 79 | dataloss=True,phyloss=False,num_batch=env_num_batch) 80 | 81 | # Fine-tune flag 82 | if 
avgret>returngoal: usemodel=False 83 | 84 | # train with data generated by model 85 | 86 | while usemodel: 87 | count +=1 88 | if dataloss<=fake_env_loss_criteria: 89 | # generate data with model 90 | self.RL.env.eval() 91 | # if avgret>=policygoal:policy = self.RL.ac.get_action 92 | if self.RL.buffer.size>=fake_policy_action_after: policy = self.RL.ac.get_action 93 | else: policy=None 94 | for _ in range(block_len): 95 | previoussize=self.RL.buffer.size 96 | with torch.no_grad(): 97 | interact_fakeenv(self.RL.buffer,self.RL.buffer, 98 | self.RL.env,self.RL.env.forward_size,noiselist[0], 99 | policy,end=previoussize) 100 | 101 | self.RL.env.train() 102 | if self.phyloss_flag: 103 | for _ in range(20): 104 | self.trainenv(self.RL.buffer,phy_train_max_iter, 105 | dataloss=False,phyloss=True,num_batch=env_num_batch) 106 | else: 107 | for _ in range(int(20/self.RL.buffer.size*self.real_buffer.size)): 108 | self.trainenv(self.real_buffer,phy_train_max_iter, 109 | dataloss=True,phyloss=False,num_batch=env_num_batch) 110 | 111 | if self.RL.buffer.size > update_after and dataloss<=fake_env_loss_criteria: 112 | precollect = False 113 | self.RL.env.eval() 114 | # update policy & value with data from model 115 | for _ in range(RL_update_iter): 116 | batch = self.RL.buffer.sample_batch(RL_batch_size) 117 | self.RL.update(data=batch) 118 | self.RL.env.train() 119 | # train model with equation loss or balance data loss iterations 120 | 121 | 122 | # test model on all data and remove fake data by resetting pointer 123 | if count%refresh_RL_buffer_interval==0: 124 | self.trainenv(self.RL.buffer,max_train_iter=1,phyloss=False, 125 | num_batch=env_num_batch,printflag=True,trainflag=False) 126 | self.RL.buffer.size,self.RL.buffer.ptr=self.real_buffer.size, self.real_buffer.ptr 127 | 128 | # test agent 129 | if count%test_every==0: 130 | ret,_,_=test_RL(self.test_env,num_test_episodes,self.RL.max_ep_len, self.RL.ac,self.real_buffer.size,self.RL.gamma) 131 | rethis[testcount%3]=ret 132 | testcount+=1 133 | avgret = ret#rethis.sum()/(rethis<0).sum() 134 | print('\nModel: RL buffer size: {}\t Retrun: {} \t avgret:{}'.format(self.RL.buffer.size,ret,avgret)) 135 | 136 | 137 | # exit control 138 | if (count>=RL_loop and (modellock or precollect)) or \ 139 | (avgret>modelgoal and not modellock) or count>=50 : 140 | if not precollect: modellock = True 141 | count=0 142 | self.RL.buffer.size,self.RL.buffer.ptr=self.real_buffer.size, self.real_buffer.ptr 143 | break 144 | 145 | # test & save 146 | if (RL_trained_flag and i%test_every==0)or i==0: 147 | ret,max,min=test_RL(self.test_env,num_test_episodes,self.RL.max_ep_len, self.RL.ac,i=self.real_buffer.size,parallel=True) 148 | rethis[(testcount)%3]=ret 149 | testcount+=1 150 | returnlist.append([ret,max,min]) 151 | sizelist.append(self.real_buffer.size) 152 | # if RL_trained_flag: torch.save(self.RL.env,str(i)+' '+f"{dataloss:.9f}") 153 | returnhis = np.zeros([4,len(returnlist)]) 154 | returnhis[1:] = np.array(returnlist).T 155 | returnhis[0]=np.array(sizelist) 156 | np.save('phy',returnhis) 157 | if usemodel:torch.save(self.RL.env,'model') 158 | torch.save(self.RL.ac,'agent') 159 | avgret = ret#rethis.sum()/(rethis<0).sum() 160 | print('\nReal buffer size: {}\t Retrun: {} \t avgret:{}'.format(self.real_buffer.size,ret,avgret)) 161 | 162 | 163 | 164 | 165 | 166 | if __name__=='__main__': 167 | import gym 168 | from src.NN import core,model 169 | from src.RLalgo import ddpg 170 | realenv = gym.make('CartPole-v0') 171 | fakeenv = model.fake_cartpole_env() 172 | RLinp = 
{"env":fakeenv,'actor_critic':core.MLPActorCritic,'ac_kwargs':dict(hidden_sizes=[256]*2,act_space_type='d'),'ep_type':'finite'} 173 | mb = dyna(ddpg.DDPG,RLinp,realenv,False) 174 | mb(10,400,1000,50,100,1,10,400,1000) 175 | -------------------------------------------------------------------------------- /src/utility/RL.py: -------------------------------------------------------------------------------- 1 | r"""RL related tools 2 | 3 | - ReplayBuffer 4 | - interact_env 5 | - interact_fakeenv 6 | - test_RL 7 | """ 8 | 9 | import torch 10 | import numpy as np 11 | from torch.utils.data import Dataset 12 | import gym 13 | from .utils import combined_shape, nptotorch 14 | 15 | class ReplayBuffer(object): 16 | """ 17 | First In First Out experience replay buffer agents. 18 | """ 19 | 20 | def __init__(self, obs_dim, act_dim, size, device=None): 21 | super(ReplayBuffer,self).__init__() 22 | self.obs_buf = torch.zeros(combined_shape(size, obs_dim), dtype=torch.float) 23 | self.obs2_buf = torch.zeros(combined_shape(size, obs_dim), dtype=torch.float) 24 | self.act_buf = torch.zeros(combined_shape(size, act_dim), dtype=torch.float) 25 | self.rew_buf = torch.zeros(size, dtype=torch.float) 26 | self.done_buf = torch.zeros(size, dtype=torch.float) 27 | if device is not None: 28 | self.obs_buf = self.obs_buf.to(device) 29 | self.obs2_buf = self.obs2_buf.to(device) 30 | self.act_buf = self.act_buf.to(device) 31 | self.rew_buf = self.rew_buf.to(device) 32 | self.done_buf = self.done_buf.to(device) 33 | 34 | self.ptr, self.size, self.max_size = 0, 0, size 35 | 36 | def store(self, obs, act, rew, next_obs, done,store_size=1): 37 | self.obs_buf[self.ptr:self.ptr+store_size] = obs 38 | self.obs2_buf[self.ptr:self.ptr+store_size] = next_obs 39 | self.act_buf[self.ptr:self.ptr+store_size] = act 40 | self.rew_buf[self.ptr:self.ptr+store_size] = rew 41 | self.done_buf[self.ptr:self.ptr+store_size] = done 42 | self.ptr = (self.ptr+store_size) % self.max_size 43 | self.size = min(self.size+store_size, self.max_size) 44 | 45 | def sample_batch(self, batch_size=32,start=0,end=int(1e8)): 46 | idxs = torch.randint(start, min(self.size,end), size=(batch_size,)) 47 | return dict(obs=self.obs_buf[idxs], 48 | obs2=self.obs2_buf[idxs], 49 | act=self.act_buf[idxs], 50 | rew=self.rew_buf[idxs], 51 | done=self.done_buf[idxs]) 52 | 53 | class BufferforRef(ReplayBuffer): 54 | ''' 55 | ReplayBuffer for environments with a reference (e.g. 
Burgers) 56 | ''' 57 | def __init__(self, obs_dim, act_dim, size): 58 | super().__init__(obs_dim, act_dim, size) 59 | self.len = torch.zeros(size,dtype=torch.long) 60 | 61 | def store(self, obs, act, rew, next_obs, done,len,store_size=1): 62 | self.len[self.ptr:self.ptr+store_size] = len 63 | super().store(obs, act, rew, next_obs, done,store_size) 64 | 65 | def sample_batch(self, batch_size=32,start=0,end=int(1e8)): 66 | idxs = torch.randint(start, min(self.size,end), size=(batch_size,)) 67 | return dict(obs=self.obs_buf[idxs], 68 | obs2=self.obs2_buf[idxs], 69 | act=self.act_buf[idxs], 70 | rew=self.rew_buf[idxs], 71 | done=self.done_buf[idxs], 72 | len=self.len[idxs]) 73 | 74 | 75 | class bufferdata(Dataset): 76 | def __init__(self,buffer,size=float('inf')): 77 | super(bufferdata,self).__init__() 78 | self.size = min(size,buffer.size) 79 | self.idxs = torch.randint(0, buffer.size, size=(self.size,)) 80 | self.obs = buffer.obs_buf[self.idxs] 81 | self.obs2 = buffer.obs2_buf[self.idxs]#[:self.size] 82 | self.act = buffer.act_buf[self.idxs]#[:self.size] 83 | self.rew = buffer.rew_buf[self.idxs] 84 | 85 | 86 | def __getitem__(self, index): 87 | return self.obs[index], self.obs2[index], self.act[index] 88 | 89 | def __len__(self): 90 | return self.size 91 | 92 | class bufferdataref(bufferdata): 93 | def __init__(self, buffer: BufferforRef,size=None): 94 | super().__init__(buffer,size=size) 95 | self.len = buffer.len[self.idxs] 96 | 97 | def __getitem__(self, index): 98 | return *super().__getitem__(index), self.len[index] 99 | 100 | 101 | def test_RL(env,num_test_episode,max_len,RLNN,parallel=False,i=0): 102 | returnlist=[] 103 | # logger = np.zeros((60,150)) 104 | # rwlogger=np.zeros(60) 105 | # actlogger=np.zeros((60,2)) 106 | RLNN.eval() 107 | if parallel: 108 | o, ep_ret, ep_len = env.test_reset(), 0, 0 109 | while(ep_len < max_len): 110 | a = RLNN.get_action(torch.Tensor(o.squeeze(-1)), 0) 111 | o, r = env.step_p(o,a.cpu().numpy()) 112 | ep_ret = r + ep_ret 113 | ep_len += 1 114 | returnlist=ep_ret 115 | return_=np.array(returnlist,dtype=float) 116 | else: 117 | for j in range(num_test_episode): 118 | o, d, ep_ret, ep_len = env.reset(), False, 0, 0 119 | o = nptotorch(o) 120 | while not(d or (ep_len >= max_len)): 121 | a = RLNN.get_action(o, 0) 122 | if isinstance(env, gym.wrappers.time_limit.TimeLimit): 123 | if isinstance(env.action_space,gym.spaces.discrete.Discrete): 124 | if isinstance(a,torch.Tensor):a=int(a) 125 | else:a=a.numpy() 126 | o, r, d, _ = env.step(a) 127 | o = nptotorch(o) 128 | ep_ret = r + ep_ret 129 | ep_len += 1 130 | # if j==0: 131 | # logger[ep_len-1]=o.cpu() 132 | # rwlogger[ep_len-1]=r.cpu() 133 | # actlogger[ep_len-1]=a.cpu() 134 | # if j==0: 135 | # np.save('act'+str(i),actlogger) 136 | # np.save('state'+str(i),logger) 137 | # np.save('rew'+str(i),rwlogger) 138 | # plt.pcolormesh(logger) 139 | # plt.colorbar() 140 | # plt.savefig(str(i)) 141 | # plt.close() 142 | # plt.plot(actlogger) 143 | # plt.savefig('act'+str(i)) 144 | # plt.close() 145 | returnlist.append(ep_ret) 146 | return_=torch.tensor(returnlist,dtype=torch.float) 147 | mean,max,min=return_.mean().item(),return_.max().item(),return_.min().item() 148 | RLNN.train() 149 | return mean,max,min 150 | 151 | 152 | def interact_env_s(a, o, ep_ret, ep_len, env, buffer, max_len,ep_type,noref_flag=True,secondbuffer=None): 153 | ''' 154 | Sequentially interact with env and save data pairs to the buffer 155 | ''' 156 | # Step the env 157 | if isinstance(env, gym.wrappers.time_limit.TimeLimit): 158 | if 
isinstance(env.action_space,gym.spaces.discrete.Discrete): 159 | if isinstance(a,torch.Tensor):a=int(a) 160 | if isinstance(a,torch.Tensor):a=a.numpy() 161 | # else: a=nptotorch(a) 162 | o2, r, d, len = env.step(a) 163 | o2 = nptotorch(o2) 164 | ep_ret += r 165 | ep_len += 1 166 | 167 | if ep_type=='finite': 168 | d = False if ep_len==max_len else d 169 | if isinstance(a,np.ndarray):a=nptotorch(a) 170 | # Store experience to replay buffer 171 | if noref_flag: 172 | if secondbuffer!=None: 173 | secondbuffer.store(o,a,r,o2,d) 174 | buffer.store(o, a, r, o2, d) 175 | else: 176 | if secondbuffer!=None: 177 | secondbuffer.store(o,a,r,o2,d,len) 178 | buffer.store(o, a, r, o2, d,len) 179 | 180 | o = o2 181 | 182 | # End of trajectory handling 183 | if d or (ep_len == max_len): 184 | o, ep_ret, ep_len = env.reset(), 0, 0 185 | return nptotorch(o), ep_ret, ep_len 186 | 187 | 188 | def interact_env_p(a, o, ep_ret, ep_len, env, 189 | buffer, max_len,parallel_size,noref_flag=True): 190 | ''' 191 | Parallel variant of interact_env_s; should be used for all continuous trajectories 192 | policy action delay should be divided by the batch_size 193 | 194 | Parallel in the batch dimension, but iterates over time steps 195 | ''' 196 | raise NotImplementedError 197 | 198 | 199 | def interact_fakeenv(source_buffer:ReplayBuffer,save_buffer:ReplayBuffer, 200 | fake_env:torch.nn.Module,batch_size:int, noise_scale,policy=None, 201 | end=int(1e8)): 202 | data = source_buffer.sample_batch(batch_size,end=end) 203 | o,d = data['obs2'],data['done'] 204 | if policy==None: a = fake_env.action_space.sample(batch_size) 205 | else: a = policy(o,noise_scale) 206 | o2,r,_,_ = fake_env.step(o,a) 207 | save_buffer.store(o,a,r,o2,d,store_size=batch_size) 208 | 209 | def step_fakeenv(source_buffer:ReplayBuffer,fake_env:torch.nn.Module, 210 | batch_size:int, noise_scale,policy=None): 211 | data = source_buffer.sample_batch(batch_size) 212 | o,d = data['obs2'],data['done'] 213 | if policy==None: a = fake_env.action_space.sample(batch_size) 214 | else: a = policy(o, noise_scale) 215 | o2,r,_,_ = fake_env.step(o,a) 216 | return dict(obs=o, obs2=o2, act=a, rew=r, done=d) 217 | 218 | 219 | def interact_fakeenvRef(real_buffer:ReplayBuffer,fake_buffer:ReplayBuffer, 220 | fake_env:torch.nn.Module,batch_size:int, noise_scale,policy=None, 221 | end=int(1e8)): 222 | data = real_buffer.sample_batch(batch_size,end=end) 223 | o,len = data['obs2'],data['len'] 224 | if policy==None: a = fake_env.action_space.sample(batch_size) 225 | else: a = policy(o,noise_scale) 226 | o,a,o2,r,d,len,batch_size = fake_env.step(o,a,len) 227 | fake_buffer.store(o,a,r,o2,d,len,store_size=batch_size) -------------------------------------------------------------------------------- /src/envs.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from utility.utils import * 7 | 8 | 9 | ''' 10 | envs should have the following: 11 | variables: 12 | obs_dim: tuple or int 13 | act_dim: tuple or int 14 | act_limit: tensor, shape act_dim*2 15 | ''' 16 | 17 | 18 | class env_base(object): 19 | def __init__(self): 20 | super(env_base, self).__init__() 21 | self.action_space = action_space() 22 | self.act_dim = 0 23 | self.obs_dim = 0 24 | self.act_limit = 0 25 | # self. 
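# A minimal sketch of a concrete environment, assuming only the interface listed in
# the module docstring above; the class name and numbers below are illustrative and
# are not part of this package:
#
#     class toy_env(env_base):
#         def __init__(self):
#             super().__init__()
#             self.obs_dim = 2
#             self.act_dim = 1
#             self.act_limit = torch.tensor([[-1., 1.]])
#             self.state = torch.zeros(self.obs_dim)
#
#         def step(self, a):
#             self.state = self.state + 0.1 * a     # toy dynamics
#             rew = -(self.state ** 2).mean()       # quadratic penalty
#             return self.state, rew, False, {}
#
#         def reset(self):
#             self.state = torch.zeros(self.obs_dim)
#             return self.state
#
# A real subclass would also replace self.action_space with one whose sample()
# draws valid random actions (compare pen_action_space below).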
26 | 27 | def step(self): 28 | raise NotImplementedError 29 | 30 | def step_p(self): 31 | raise NotImplementedError 32 | 33 | def reset(self): 34 | raise NotImplementedError 35 | 36 | 37 | class action_space(object): 38 | def __init__(self, device='cpu'): 39 | super(action_space, self).__init__() 40 | self.device = device 41 | 42 | def sample(self): 43 | raise NotImplementedError 44 | 45 | 46 | class pendulum(env_base): 47 | def __init__(self): 48 | super(pendulum, self).__init__() 49 | self.max_speed = 8 50 | self.max_torque = 2. 51 | self.dt = .05 52 | self.g = 10. 53 | self.m = 1. 54 | self.l = 1. 55 | self.pi = torch.tensor(np.pi) 56 | self.action_space = pen_action_space() 57 | self.act_dim = 1 58 | self.obs_dim = 2 59 | self.act_limit = torch.tensor([[-2, 2], ]) 60 | 61 | def step(self, a): 62 | a = clip_tensor(a, -self.max_torque, self.max_torque) 63 | costs = self.angle_normalize( 64 | self.state[0]) ** 2 + .1 * self.state[1] ** 2 + .001 * (a * a) 65 | 66 | newthdot = self.state[1] + (-3 * self.g / (2 * self.l) * torch.sin( 67 | self.state[0] + self.pi) + 3. / (self.m * self.l ** 2) * a) * self.dt 68 | newth = self.state[0] + newthdot * self.dt 69 | newthdot = clip_tensor(newthdot, -self.max_speed, self.max_speed) 70 | 71 | self.state = torch.tensor([newth, newthdot]) 72 | return self.state, -costs, False, {} 73 | 74 | def angle_normalize(self, x): 75 | return (((x+self.pi) % (2*self.pi)) - self.pi) 76 | 77 | def reset(self): 78 | x = 2*torch.rand((2))-1 79 | self.state = torch.tensor([self.pi*x[0], x[1]]) 80 | return self.state 81 | 82 | 83 | class pen_action_space(action_space): 84 | def sample(self): 85 | return 4*torch.rand((1))-2 86 | 87 | 88 | class burgers(env_base): 89 | ''' 90 | meshx: 100 91 | max_t: 500 92 | ''' 93 | 94 | def __init__(self): 95 | super(burgers, self).__init__() 96 | l = 2 97 | 98 | self.meshx = 150 99 | dx = l/self.meshx 100 | self.maxt = 60 101 | self.dx = 1/self.meshx 102 | self.dt = 0.001 103 | self.nu = 0.01 104 | 105 | x = torch.linspace(0, l-dx, self.meshx) 106 | self.f1 = torch.exp(-225*(x/l-.25)*(x/l-.25)) 107 | self.f2 = torch.exp(-225*(x/l-.75)*(x/l-.75)) 108 | self.init1 = 0.2*torch.exp(-25*(x/l-0.5)*(x/l-0.5)) 109 | self.init2 = 0.2*torch.sin(4*math.pi*x/l) 110 | 111 | self.len = 0 112 | self.d = False 113 | 114 | self.loss = torch.nn.MSELoss() 115 | self.info = {} 116 | self.num_steps = 500 117 | ref = torch.arange(0, 30.5, self.dt*self.num_steps) 118 | self.ref = (0.05*torch.sin(np.pi/15*ref) + 119 | 0.5).reshape(-1, 1).repeat([1, self.meshx]) 120 | 121 | self.action_space = burgers_action_space() 122 | self.act_dim = 2 123 | self.obs_dim = int(*x.shape) 124 | self.act_limit = torch.tensor([[-.025, .075], [-.025, .075]]) 125 | 126 | def step(self, act): 127 | self.len += 1 128 | for _ in range(self.num_steps): 129 | self.pdestate = torch.cat( 130 | (self.pdestate[-2:], self.pdestate, self.pdestate[:2])) 131 | lapa = -1/12*self.pdestate[:-4]+4/3*self.pdestate[1:-3]-5/2 * \ 132 | self.pdestate[2:-2]+4/3 * \ 133 | self.pdestate[3:-1]-1/12*self.pdestate[4:] 134 | state2 = self.pdestate**2/2 135 | gradient = 0.5*state2[:-4]-2*state2[1:-3]+1.5*state2[2:-2] 136 | 137 | self.pdestate = self.pdestate[2:-2] + self.dt * ( 138 | self.nu * lapa / self.dx**2 - gradient / self.dx 139 | + act[0]*self.f1 + act[1]*self.f2) 140 | 141 | self.state = self.pdestate-self.ref[self.len] 142 | # TODO when nan occurs, treat as a normal condition requires reset() and rew=-inf 143 | if torch.any(self.state.isnan()) == True: 144 | raise ValueError 145 | self.d = False 
if self.len < self.maxt else True 146 | 147 | rew = self.compute_rew() 148 | return self.state, rew, self.d, self.len 149 | 150 | def compute_rew(self): 151 | return -10*((self.state**2).mean()) 152 | 153 | def reset(self): 154 | a = torch.rand(1) 155 | self.pdestate = a*self.init1 + (1-a)*self.init2 + 0.2 156 | self.state = self.pdestate - self.ref[0] 157 | self.d = False 158 | self.len = 0 159 | return self.state 160 | 161 | 162 | class burgers_action_space(action_space): 163 | def sample(self, batch=None): 164 | if batch == None: 165 | shape = 2 166 | else: 167 | shape = (batch, 2) 168 | return 0.1*(torch.rand(shape))-0.025 169 | # return (0.1*torch.rand((2))-0.025) 170 | 171 | 172 | @torch.jit.script 173 | def RHS(u, lapa_c, lapa2_c, gradient_fc, gradient_bc, dx: float, dx2: float, dx4: float, f): 174 | u2 = u*u 175 | lapa = torch.matmul(lapa_c, u) 176 | lapa2 = torch.matmul(lapa2_c, u) 177 | gradient = torch.matmul(gradient_fc, u2)*(u < 0)\ 178 | + torch.matmul(gradient_bc, u2)*(u >= 0) 179 | return -lapa2/dx4 - lapa/dx2 - gradient/2./dx + f, lapa, gradient 180 | 181 | 182 | @torch.jit.script 183 | def __calculate__(state, act, lapa_c, lapa2_c, gradient_fc, gradient_bc, 184 | dt: float, dx: float, dx2: float, dx4: float, f0, f1, f2, f3, r, num_steps: int): 185 | 186 | f = act[0]*f0 + act[1]*f1 + act[2]*f2 + act[3]*f3 187 | for _ in range(num_steps): 188 | k1, lapa, gradient = RHS( 189 | state, lapa_c, lapa2_c, gradient_fc, gradient_bc, dx, dx2, dx4, f) 190 | k2, _, _ = RHS(state + dt*k1/2, lapa_c, lapa2_c, 191 | gradient_fc, gradient_bc, dx, dx2, dx4, f) 192 | k3, _, _ = RHS(state + dt*k2/2, lapa_c, lapa2_c, 193 | gradient_fc, gradient_bc, dx, dx2, dx4, f) 194 | k4, _, _ = RHS(state + dt*k3, lapa_c, lapa2_c, 195 | gradient_fc, gradient_bc, dx, dx2, dx4, f) 196 | r += (lapa*lapa).mean() + (gradient*gradient).mean() + (state*f).mean() 197 | state = state + dt*(k1 + 2*k2 + 2*k3 + k4)/6 198 | return state, -r/num_steps 199 | 200 | 201 | class ks(env_base): 202 | ''' 203 | meshx: 100 204 | max_t: 500 205 | ''' 206 | 207 | def __init__(self, device='cuda:0'): 208 | super(ks, self).__init__() 209 | 210 | l = 8*math.pi 211 | 212 | self.meshx = 64 213 | dx = l/self.meshx 214 | self.maxt = 400 215 | self.dx = l/self.meshx 216 | self.dx2 = self.dx**2 217 | self.dx4 = self.dx**4 218 | self.dt = 0.001 219 | 220 | x = torch.linspace(0, l-dx, self.meshx).to(device) 221 | self.f0 = (torch.exp(-x**2/2)/math.sqrt(2*math.pi)).to(device) 222 | self.f1 = (torch.exp(-(x - 0.25*l)**2/2) / 223 | math.sqrt(2*math.pi)).to(device) 224 | self.f2 = (torch.exp(-(x - 0.50*l)**2/2) / 225 | math.sqrt(2*math.pi)).to(device) 226 | self.f3 = (torch.exp(-(x - 0.75*l)**2/2) / 227 | math.sqrt(2*math.pi)).to(device) 228 | 229 | # self.init1 = torch.exp(-25*(x/l-0.5)**2) 230 | # self.init = torch.cos(x/16)*(1+torch.sin(x/16)) 231 | self.init = torch.load('ks_init.tensor', map_location=device) 232 | # self.init2 = 1+torch.sin(1*math.pi*x/l) 233 | # self.init3 = torch.cos(1*math.pi*x/l) 234 | # self.init4 = torch.sin(2*math.pi*x/l) 235 | # self.init5 = torch.sin(8*math.pi*x/l) 236 | self.len = 0 237 | self.d = False 238 | 239 | # self.loss = torch.nn.MSELoss() 240 | self.info = {} 241 | self.num_steps = 250 242 | 243 | self.action_space = ks_action_space() 244 | self.act_dim = 4 245 | self.obs_dim = int(*x.shape) 246 | self.act_limit = torch.tensor( 247 | [[-.5, .5], [-.5, .5], [-.5, .5], [-.5, .5]]) 248 | 249 | self.lapa_c = FD_Central_CoefficientMatrix( 250 | [1/90, -3/20, 3/2, -49/18], self.meshx, periodic=True) 251 | 
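# lapa_c above holds the 6th-order central-difference stencil for the 2nd derivative
# (centre weight -49/18), and lapa2_c below the matching 6th-order stencil for the
# 4th derivative (centre weight 91/8). FD_Central_CoefficientMatrix comes from
# utility/utils.py (not shown here) and is assumed to expand such a one-sided
# coefficient list into a periodic banded matrix, so that, for a 1-D state u,
#     d2u_dx2 ~= torch.matmul(self.lapa_c, u) / self.dx2
# which is how RHS() above consumes these matrices.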
self.lapa2_c = FD_Central_CoefficientMatrix( 252 | [7/240, -2/5, 169/60, -122/15, 91/8], self.meshx, periodic=True) 253 | self.gradient_fc, self.gradient_bc = FD_upwind_CoefficientMatrix( 254 | [1/4, -4/3, 3, -4, 25/12], self.meshx, periodic=True) 255 | self.numpy = dict(f=torch.cat( 256 | (self.f0.unsqueeze(0), self.f1.unsqueeze(0), 257 | self.f2.unsqueeze(0), self.f3.unsqueeze(0)), dim=0).cpu().numpy(), 258 | init=self.init.cpu().numpy(), 259 | lapa_c=self.lapa_c.cpu().numpy(), 260 | lapa2_c=self.lapa2_c.cpu().numpy(), 261 | gradient_fc=self.gradient_fc.cpu().numpy(), 262 | gradient_bc=self.gradient_bc.cpu().numpy()) 263 | 264 | def step(self, act): 265 | r = torch.zeros(1) 266 | self.state, rew = __calculate__(self.state, act, self.lapa_c, self.lapa2_c, 267 | self.gradient_fc, self.gradient_bc, 268 | self.dt, self.dx, self.dx2, self.dx4, self.f0, 269 | self.f1, self.f2, self.f3, r, self.num_steps) 270 | 271 | self.d = False if self.len < self.maxt else True 272 | self.len += 1 273 | return self.state, rew, self.d, self.info 274 | 275 | def step_p(self, state, act): 276 | state, rew = self.__calculate__(state, act) 277 | return state, rew 278 | 279 | def reset(self, shape=()): 280 | self.state = self.init[torch.randint(0, 200, size=shape)] 281 | self.d = False 282 | self.len = 0 283 | return self.state 284 | 285 | def test_reset(self, num_test=5): 286 | # return self.init[torch.multinomial(torch.ones(200), num_samples=num_test, 287 | # replacement=False)].unsqueeze(2).cpu().numpy() 288 | return self.init[torch.linspace(0,199,num_test,dtype=torch.long)].unsqueeze(2).cpu().numpy() 289 | 290 | def RHS(self, u, f): 291 | u2 = u*u 292 | lapa = np.matmul(self.numpy['lapa_c'], u) 293 | lapa2 = np.matmul(self.numpy['lapa2_c'], u) 294 | gradient = np.matmul(self.numpy['gradient_fc'], u2)*(u < 0)\ 295 | + np.matmul(self.numpy['gradient_bc'], u2)*(u >= 0) 296 | return -lapa2/self.dx4 - lapa/self.dx2 - gradient/2./self.dx + f, lapa, gradient 297 | 298 | def __calculate__(self, state, act): 299 | f = np.matmul(act, self.numpy['f']) 300 | f = np.expand_dims(f, axis=-1) 301 | r = 0 302 | for _ in range(self.num_steps): 303 | k1, lapa, gradient = self.RHS(state, f) 304 | k2, _, _ = self.RHS(state + self.dt*k1/2, f) 305 | k3, _, _ = self.RHS(state + self.dt*k2/2, f) 306 | k4, _, _ = self.RHS(state + self.dt*k3, f) 307 | r += (lapa*lapa).mean(axis=(1, 2)) + (gradient*gradient).mean(axis=(1, 2))\ 308 | + (state*f).mean(axis=(1, 2)) 309 | state = state + self.dt*(k1 + 2*k2 + 2*k3 + k4)/6 310 | return state, -r/self.num_steps 311 | 312 | 313 | class ks_action_space(action_space): 314 | def sample(self, batch=None): 315 | if batch == None: 316 | shape = 4 317 | else: 318 | shape = (batch, 4) 319 | return (torch.rand(shape, device=self.device)-0.5) 320 | 321 | 322 | if __name__ == '__main__': 323 | # torch.set_default_tensor_type('torch.cuda.FloatTensor') 324 | import os 325 | 326 | torch.manual_seed(0) 327 | np.random.seed(0) 328 | import time 329 | result = [] 330 | result2 = [] 331 | env = ks(device='cpu') 332 | s = env.reset() 333 | a = torch.zeros(4) 334 | timelog = [] 335 | startt = time.time() 336 | n = 2 337 | state = s.reshape(1, -1, 1).repeat([n, 1, 1]).cpu().numpy() 338 | act = torch.zeros((n, 4)) 339 | for i in range(60): 340 | _, r, _, _ = env.step(a) 341 | # result.append(r) 342 | # state,r2=env.step_p(state,act.cpu().numpy()) 343 | # result2.append(r2) 344 | tensortime = time.time()-startt 345 | print(tensortime) 346 | 347 | for n in [2, 4, 8, 16, 32, 64, 128, 256]: 348 | state = s.reshape(1, 
-1, 1).repeat([n, 1, 1]).cpu().numpy() 349 | act = torch.zeros((n, 4)) 350 | start = time.time() 351 | for i in range(60): 352 | state, _ = env.step_p(state, act.cpu().numpy()) 353 | state1 = torch.from_numpy(state) 354 | nptime = time.time()-start 355 | print(nptime) 356 | timelog.append(nptime/tensortime) 357 | plt.plot(timelog) 358 | plt.show() 359 | # x=torch.cat([i.unsqueeze(0) for i in result]) 360 | # y=np.concatenate([j[0:1].squeeze(-1) for j in result2],axis=0) 361 | # plt.pcolormesh(x.detach().cpu()) 362 | # plt.colorbar() 363 | # plt.show() 364 | -------------------------------------------------------------------------------- /src/NN/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import envs 5 | from utility.utils import clip_tensor 6 | 7 | pi = np.pi 8 | 9 | class action_space_d(object): 10 | def __init__(self, low,high,size): 11 | self.low = low 12 | self.high = high 13 | self.size = size 14 | 15 | def sample(self,batch=None): 16 | if batch==None:shape = (self.size,) 17 | else: shape=(batch,self.size) 18 | return torch.randint(low=self.low,high=self.high,size=shape) 19 | 20 | class action_space_c(object): 21 | def __init__(self, low,high,size): 22 | self.low = low 23 | self.high = high 24 | self.size = size 25 | 26 | def sample(self,batch=None): 27 | if batch==None:shape = (self.size,) 28 | else: shape=(batch,self.size) 29 | return (self.high-self.low)*torch.rand(shape)+self.low 30 | ################################ CartPole ################################# 31 | 32 | class fake_cartpole_env(nn.Module): 33 | def __init__(self,batch_size): 34 | super(fake_cartpole_env, self).__init__() 35 | self.l1 = nn.Linear(5,12) 36 | self.l2 = nn.Linear(12,32) 37 | self.l3 = nn.Linear(32,64) 38 | self.l4 = nn.Linear(64,4) 39 | self.action_space = action_space_d(low=0,high=2,size=1) 40 | self.state = torch.zeros([4]) 41 | self.d = False 42 | self.high = torch.tensor([2.4,float('inf'),0.209,float('inf')]) 43 | self.low = torch.tensor([-2.4,float('-inf'),-0.209,float('-inf')]) 44 | self.maxstep=200 45 | self.act = nn.ReLU() 46 | self.info = 'info is not provided for fake env' 47 | self.obs_dim=4 48 | self.act_dim=1 49 | self.act_limit=torch.tensor([[0,1],]) 50 | self.gravity = 9.8 51 | self.masscart = 1.0 52 | self.masspole = 0.1 53 | self.total_mass = (self.masspole + self.masscart) 54 | self.length = 0.5 55 | self.polemass_length = (self.masspole * self.length) 56 | self.force_mag = 10.0 57 | self.tau = 0.02 58 | # self.zero = torch.zeros((1,4)) 59 | self.loss_f = nn.MSELoss() 60 | 61 | 62 | def forward(self, state, act): 63 | # if not isinstance(act, torch.Tensor): act = torch.tensor([act]) 64 | if state.dim()==1:state=state.unsqueeze(0) 65 | x = torch.cat((state,act),-1) 66 | x = self.act(self.l1(x)) 67 | x = self.act(self.l2(x)) 68 | x = self.act(self.l3(x)) 69 | x = self.l4(x) 70 | return x 71 | 72 | def step(self,a): 73 | if isinstance(a,int): 74 | a=torch.tensor([[a]]) 75 | else:a = a.unsqueeze(0) 76 | self.state = self.forward(self.state,a) 77 | self.d = ((self.state>self.high)+(self.state self.max_speed): velocity = self.max_speed 157 | if (velocity < -self.max_speed): velocity = -self.max_speed 158 | if (position > self.max_position): position = self.max_position 159 | if (position < self.min_position): position = self.min_position 160 | if (position == self.min_position and velocity < 0): velocity = 0 161 | self.state = torch.tensor((position,velocity)).unsqueeze(0) 162 
| self.d = (self.state[:,0]>=self.goal_position)*(self.state[:,1]>=self.goal_velocity) 163 | rew = -1#0 164 | # if self.d: 165 | # rew = 100.0 166 | # rew -= a[0]*a[0] * 0.1 167 | return self.state[0], rew, self.d, self.info 168 | 169 | def reset(self): 170 | self.state = torch.rand((1,2))-0.6 171 | self.d=False 172 | return self.state[0] 173 | 174 | def phyloss_f(self,o,o2,a): 175 | 176 | vloss=o2[:,1]-o[:,1] - a.squeeze()*self.force + torch.cos(3*o[:,0])*self.gravity 177 | ploss=o2[:,0]-o[:,0] - o[:,1] 178 | return self.loss_f(ploss,0.)+self.loss_f(vloss,0.) 179 | ############################ End of MountainCar ################################# 180 | 181 | @torch.jit.script 182 | def cost(th,thdot,a,pi:float=pi): 183 | return (((th+pi) % (2*pi)) - pi) ** 2 + .1 * thdot *thdot + .001 * (a *a) 184 | 185 | ############################ Pendulum ################################# 186 | class fake_pendu_env(envs.pendulum,nn.Module): 187 | ''' 188 | in this env, obs is different from state. 189 | state is only for internal use (phyloss) 190 | ''' 191 | def __init__(self,batch_size): 192 | super(fake_pendu_env, self).__init__() 193 | self.net = nn.Sequential( 194 | nn.Linear(3,15), 195 | nn.ReLU(), 196 | nn.Linear(15,32), 197 | nn.ReLU(), 198 | nn.Linear(32,64), 199 | nn.ReLU(), 200 | nn.Linear(64,2)) 201 | 202 | self.d = False 203 | self.high = torch.tensor([1,1,8]) 204 | self.low = torch.tensor([-1,-1,-8]) 205 | self.maxstep=200 206 | self.act = nn.ReLU() 207 | self.info = 'info is not provided for fake env' 208 | 209 | self.zeros = torch.zeros((batch_size)) 210 | self.loss_f = nn.MSELoss() 211 | self.maxT = 2*torch.ones([batch_size,1]) 212 | self.maxspeed = 8*torch.ones([batch_size,1]) 213 | 214 | def forward(self, obs, act, maxT=None,maxspeed=None): 215 | if maxT==None: maxT = self.maxT 216 | if maxspeed==None: maxspeed=self.maxspeed 217 | act = clip_tensor(act,-maxT, maxT) 218 | state = torch.cat((obs,act),-1) 219 | x = self.net(state) 220 | xdot = clip_tensor(x[:,1:],-maxspeed,maxspeed) 221 | return torch.cat((x[:,0:1],xdot),1) 222 | 223 | def step(self,a): 224 | a=a.unsqueeze(0) 225 | self.state=self.forward(self.state,a,self.maxT[0:1],self.maxspeed[0:1]) 226 | return self.state[0], -cost(self.state[:,0],self.state[:,1],a), False, self.info 227 | 228 | def reset(self): 229 | x = torch.rand((1,2)) 230 | self.state = torch.cat((pi*x[:,0:1],x[:,1:]),1) 231 | return self.state[0] 232 | 233 | def phyloss_f(self,o,o2,a): 234 | a=a.squeeze() 235 | ids = ((o2[:,1]-self.maxspeed[:,0])).nonzero().squeeze() 236 | loss1=(o2[ids,1] - o[ids,1] - (-3 * self.g / (2 * self.l) * torch.sin(o[ids,0] + pi) 237 | + 3. 
/ (self.m * self.l ** 2) * a[ids]) * self.dt) 238 | loss2=o2[ids,0] - o[ids,0] - o[ids,1] * self.dt 239 | return self.loss_f(loss1,self.zeros[ids]) + self.loss_f(loss2,self.zeros[ids]) 240 | ############################ End of Pendulum ################################# 241 | 242 | ############################ Burgers ################################# 243 | 244 | class fake_burgers_env(envs.burgers, nn.Module): 245 | def __init__(self,batch_size,ratio=50): 246 | super(fake_burgers_env,self).__init__() 247 | self.encoder = nn.Sequential( 248 | nn.Conv1d(1,8,6,stride=3), 249 | nn.ReLU(), 250 | nn.Conv1d(8,16,7,stride=3), 251 | nn.ReLU(), 252 | nn.Conv1d(16,32,5,stride=2), 253 | nn.ReLU(), 254 | nn.Conv1d(32,48,6)) 255 | 256 | self.l = nn.Sequential(nn.Linear(48,24),nn.ReLU(),nn.Linear(24,10)) 257 | self.lstm = nn.LSTM(2,10,1,batch_first=True) 258 | self.decoder = nn.Sequential( 259 | nn.Linear(10,36), 260 | nn.ReLU(), 261 | nn.Linear(36,72), 262 | nn.ReLU(), 263 | nn.Linear(72,150)) 264 | 265 | self.dt = self.dt*ratio 266 | self.info = 'No info for fake env' 267 | self.seq_len = int(self.num_steps/ratio) 268 | self.loss = nn.MSELoss() 269 | self.f1 = self.f1.reshape(1,1,-1).repeat([batch_size,self.seq_len,1]) 270 | self.f2 = self.f2.reshape(1,1,-1).repeat([batch_size,self.seq_len,1]) 271 | self.zero = torch.zeros((batch_size,self.seq_len,self.meshx)) 272 | self.mseloss = nn.MSELoss() 273 | self.maxt=self.maxt*torch.ones(batch_size) 274 | 275 | def forward(self, state, act, len): 276 | pdestate = state + self.ref[len-1] 277 | x = self.encoder(pdestate.unsqueeze(1)) 278 | x = self.l(x.squeeze(-1)).unsqueeze(1) 279 | h,c = x.permute(1,0,2),x.permute(1,0,2) 280 | act = act.unsqueeze(1).repeat([1,self.seq_len,1]) 281 | x,(h,c) = self.lstm(act,(h,c)) 282 | self.result = self.decoder(x) 283 | return self.result[:,-1] - self.ref[len] 284 | 285 | 286 | def step(self,o,a,len): 287 | d = (len=self.maxt[filterid]) 293 | rew = -10*((o2**2).mean(axis=1)) 294 | return o,a,o2, rew, d, len, filterid.shape[0] 295 | 296 | 297 | def reset(self): 298 | tmp=super().reset() 299 | self.state = self.state.unsqueeze(0) 300 | # self.pdestate = self.pdestate.unsqueeze(0) 301 | return tmp 302 | 303 | def phyloss_f(self,o,a): 304 | 305 | a = a.unsqueeze(1).repeat([1,self.seq_len,1]) 306 | x = torch.cat((self.result[:,:,-2:],self.result,self.result[:,:,:2]),2) 307 | lapa = -1/12*x[:,:,:-4]+4/3*x[:,:,1:-3]-5/2*x[:,:,2:-2]+4/3*x[:,:,3:-1]-1/12*x[:,:,4:] 308 | state2 = x**2/2 309 | gradient = 0.5*state2[:,:,:-4]-2*state2[:,:,1:-3]+1.5*state2[:,:,2:-2] 310 | x_grad = self.nu * lapa / self.dx**2 - gradient / self.dx \ 311 | +a[:,:,:1]*self.f1 + a[:,:,1:]*self.f2 312 | x = torch.cat((o.unsqueeze(1),x[:,:,2:-2]),1) 313 | residual = (x[:,1:] - x[:,:-1])/self.dt - x_grad 314 | 315 | return self.mseloss(residual,self.zero) 316 | 317 | 318 | ############################ End of Burgers ################################# 319 | @torch.jit.script 320 | def __rew__(lapa_c,gradient_fc,gradient_bc,u,a,f0,f1,f2,f3): 321 | ur = u.unsqueeze(3) 322 | u2 = ur*ur 323 | lapa = torch.matmul(lapa_c,ur).squeeze() 324 | gradient = torch.matmul(gradient_fc,u2).squeeze()*(u<0)\ 325 | +torch.matmul(gradient_bc,u2).squeeze()*(u>=0) 326 | f = a[:,:,0:1]*f0 + a[:,:,1:2]*f1 + a[:,:,2:3]*f2 + a[:,:,3:]*f3 327 | return -(lapa*lapa).mean(dim=(1,2)) - (gradient*gradient).mean(dim=(1,2)) - (u*f).mean(dim=(1,2)) 328 | 329 | @torch.jit.script 330 | def RHS(u,lapa_c,lapa2_c,gradient_fc,gradient_bc,dx:float,dx2:float,dx4:float,f): 331 | u1=u.unsqueeze(3) 332 | 
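# Batched counterpart of envs.RHS: u is assumed to have shape [batch, seq_len, meshx],
# so the unsqueeze above lets the [meshx, meshx] stencil matrices broadcast over the
# batch and sequence dimensions before the squeeze. __loss__ below uses this to form
# the residual of one RK4 step,
#     r = u_next - u - dt/6 * (k1 + 2*k2 + 2*k3 + k4),
# and driving r toward zero is presumably the equation (physics) loss minimised when
# the surrogate model is trained with phyloss=True.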
state2=u1*u1 333 | lapa=torch.matmul(lapa_c,u1).squeeze() 334 | lapa2=torch.matmul(lapa2_c,u1).squeeze() 335 | gradient=torch.matmul(gradient_fc,state2).squeeze()*(u<0)\ 336 | +torch.matmul(gradient_bc,state2).squeeze()*(u>=0) 337 | return -lapa2/dx4 - lapa/dx2 - gradient/2./dx + f 338 | 339 | @torch.jit.script 340 | def __loss__(u,u1,a,lapa_c,lapa2_c,gradient_fc,gradient_bc, 341 | dx:float,dx2:float,dx4:float,dt:float,f0,f1,f2,f3): 342 | f = a[:,:,0:1]*f0 + a[:,:,1:2]*f1 + a[:,:,2:3]*f2 + a[:,:,3:]*f3 343 | k1 = RHS(u,lapa_c,lapa2_c,gradient_fc,gradient_bc,dx,dx2,dx4,f) 344 | k2 = RHS(u + dt*k1/2,lapa_c,lapa2_c,gradient_fc,gradient_bc,dx,dx2,dx4,f) 345 | k3 = RHS(u + dt*k2/2,lapa_c,lapa2_c,gradient_fc,gradient_bc,dx,dx2,dx4,f) 346 | k4 = RHS(u + dt*k3,lapa_c,lapa2_c,gradient_fc,gradient_bc,dx,dx2,dx4,f) 347 | 348 | return u1-u - dt*(k1 + 2*k2 + 2*k3 + k4)/6 349 | 350 | ############################ KS ################################# 351 | 352 | class fake_ks_env(envs.ks,nn.Module): 353 | def __init__(self,batch_size,ratio=5,forward_size=1): 354 | super(fake_ks_env,self).__init__() 355 | self.encoder = nn.Sequential( 356 | nn.Conv1d(1,8,7,stride=3), 357 | nn.ReLU(), 358 | nn.Conv1d(8,16,6,stride=2), 359 | nn.ReLU(), 360 | nn.Conv1d(16,32,5,stride=1), 361 | nn.ReLU(), 362 | nn.Conv1d(32,48,4)) 363 | 364 | self.l = nn.Sequential(nn.Linear(48,24),nn.ReLU(),nn.Linear(24,12)) 365 | self.lstm = nn.LSTM(4,12,1,batch_first=True) 366 | self.decoder = nn.Sequential( 367 | nn.Linear(12,24), 368 | nn.ReLU(), 369 | nn.Linear(24,48), 370 | nn.ReLU(), 371 | nn.Linear(48,64)) 372 | 373 | self.dt = self.dt*ratio 374 | self.info = 'No info for fake env' 375 | self.seq_len = int(self.num_steps/ratio) 376 | self.loss = nn.MSELoss() 377 | self.f0f = self.f0.reshape(1,1,-1).repeat([forward_size,self.seq_len,1]) 378 | self.f1f = self.f1.reshape(1,1,-1).repeat([forward_size,self.seq_len,1]) 379 | self.f2f = self.f2.reshape(1,1,-1).repeat([forward_size,self.seq_len,1]) 380 | self.f3f = self.f3.reshape(1,1,-1).repeat([forward_size,self.seq_len,1]) 381 | self.f0 = self.f0.reshape(1,1,-1).repeat([batch_size,self.seq_len,1]) 382 | self.f1 = self.f1.reshape(1,1,-1).repeat([batch_size,self.seq_len,1]) 383 | self.f2 = self.f2.reshape(1,1,-1).repeat([batch_size,self.seq_len,1]) 384 | self.f3 = self.f3.reshape(1,1,-1).repeat([batch_size,self.seq_len,1]) 385 | self.zero = torch.zeros((batch_size,self.seq_len,self.meshx)) 386 | self.mseloss = nn.MSELoss() 387 | self.forward_size = forward_size 388 | 389 | def forward(self, state, act): 390 | 391 | x = self.encoder(state.unsqueeze(1)) 392 | x = self.l(x.squeeze(-1)).unsqueeze(0) 393 | h,c = x,x 394 | act = act.unsqueeze(1).repeat([1,self.seq_len,1]) 395 | x,(h,c) = self.lstm(act,(h,c)) 396 | self.result = self.decoder(x) 397 | return self.result[:,-1] 398 | 399 | 400 | def step(self,state,a): 401 | # a=a.unsqueeze(0) 402 | state = self.forward(state,a) 403 | 404 | # self.d = False if self.len