├── .gitattributes ├── .gitignore ├── Code ├── __pycache__ │ ├── core.cpython-36.pyc │ ├── core_DDPG.cpython-36.pyc │ ├── core_PPO.cpython-36.pyc │ ├── core_VPG.cpython-36.pyc │ ├── enviroment.cpython-36.pyc │ └── user_config.cpython-36.pyc ├── core_DDPG.py ├── core_PPO.py ├── core_VPG.py ├── ddpg.py ├── enviroment.py ├── user_config.py └── utils │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── logx.cpython-36.pyc │ ├── mpi_pytorch.cpython-36.pyc │ ├── mpi_tools.cpython-36.pyc │ ├── run_utils.cpython-36.pyc │ └── serialization_utils.cpython-36.pyc │ ├── logx.py │ ├── mpi_pytorch.py │ ├── mpi_tf.py │ ├── mpi_tools.py │ ├── plot.py │ ├── run_entrypoint.py │ ├── run_utils.py │ ├── serialization_utils.py │ └── test_policy.py ├── Data ├── Solar │ ├── Hamza_Elsheikh_solar_generation.csv │ ├── Tannah_solar_generation.csv │ └── Um_Bader_solar_generation.csv ├── usage_trends.csv └── wind │ ├── Hamza_Elsheikh_wind_generation.csv │ ├── Tannah_wind_generation.csv │ └── Um_Bader_wind_generation.csv ├── Final Version ISA ├── Bitchplzwork.mdl ├── Bitchplzwork.mdl.autosave ├── Bitchplzwork.mdl.original ├── Bitchplzwork.slxc ├── Finally.csv ├── Finally_with_not_changed_time_step.csv ├── Hybrid.slx ├── Hybrid.slxc ├── MPPT_algorithm.m ├── Microgrid.mdl ├── Microgrid.mdl.original ├── Microgrid.slxc ├── Microgrid_24h_Simulation.mdl ├── Microgrid_24h_Simulation.mdl.original ├── Microgrid_grt_rtw │ └── build_exception.mat ├── New Text Document.txt ├── Solar.mdl ├── Solar.slxc ├── Wind-54.csv ├── Wind-960.csv ├── Wind │ ├── Wind-54.csv │ ├── Wind-960.csv │ ├── Wind2gen-100.csv │ ├── Wind2gen-150.csv │ ├── Wind2gen-200.csv │ ├── Wind2gen-250.csv │ ├── Wind2gen-300.csv │ ├── Wind2gen-350.csv │ ├── Wind2gen-400.csv │ ├── Wind2gen-450.csv │ ├── Wind2gen-50.csv │ ├── Wind2gen-500.csv │ ├── Wind2gen-550.csv │ ├── Wind2gen-600.csv │ ├── Wind2gen-650.xlsx │ ├── Wind2gen-700.xlsx │ ├── Wind2gen-750.xlsx │ ├── Wind2gen-800.xlsx │ ├── Wind3gen-150.csv │ ├── Wind3gen-200.csv │ ├── Wind3gen-250.csv │ ├── Wind3gen-350.csv │ ├── Wind3gen-400.csv │ ├── Wind3gen-450.csv │ ├── Wind3gen-50.csv │ ├── Wind3gen-500.csv │ ├── Wind3gen-550.csv │ ├── Wind3gen-600.xlsx │ ├── Wind3gen-650.csv │ ├── Wind3gen-700.csv │ ├── Wind3gen-750.xlsx │ ├── Wind3gen-800.xlsx │ ├── Wind960.xlsx │ ├── Windgen-200.csv │ ├── Windgen-300.csv │ ├── Windgen-50.csv │ ├── Windgen-550.xlsx │ ├── Windgen-600.xlsx │ ├── Windgen-650.xlsx │ ├── Windgen-700.xlsx │ ├── Windgen-750.xlsx │ ├── Windgen-800.xlsx │ ├── Windgen150.csv │ ├── Windgen350.csv │ ├── windgen-100.csv │ ├── windgen400.csv │ ├── windgen450.csv │ └── windgen500.csv ├── anfismicrogrid.slx ├── forUsage.csv ├── inverter.m ├── license.txt ├── main1.mdl ├── power_PVarray_250kW.slxc ├── pvwindupfc11.slx ├── pvwindupfc11.slx.original ├── slprj │ ├── Wind960.csv │ ├── _jitprj │ │ ├── jitEngineAccessInfo.mat │ │ ├── s9PYPNMBhHdO6QryHA5PViF.l │ │ ├── s9PYPNMBhHdO6QryHA5PViF.mat │ │ ├── sCkFIU2kiuEAIU96i73GX7E.l │ │ ├── sCkFIU2kiuEAIU96i73GX7E.mat │ │ ├── sFQgwjAbPyPHrsWx5nIcBCF.l │ │ ├── sFQgwjAbPyPHrsWx5nIcBCF.mat │ │ ├── sHTsCkXfsaxgGvp5B6gIsDH.l │ │ ├── sHTsCkXfsaxgGvp5B6gIsDH.mat │ │ ├── sJpzdckcHW32W9btqkbe87E.l │ │ ├── sJpzdckcHW32W9btqkbe87E.mat │ │ ├── sMNlS4WSQlq2BjqTB1MhFN.l │ │ ├── sMNlS4WSQlq2BjqTB1MhFN.mat │ │ ├── sMsIGN7kQJ7vM1eACdbvrHF.l │ │ ├── sMsIGN7kQJ7vM1eACdbvrHF.mat │ │ ├── sZ4lfP1e5BI7PUlKP0L1IiG.l │ │ ├── sZ4lfP1e5BI7PUlKP0L1IiG.mat │ │ ├── sdV6jeitGTsO8Ess5ig6gPC.l │ │ ├── sdV6jeitGTsO8Ess5ig6gPC.mat │ │ ├── snBTaQ6nSCACl4z6OgoKUdC.l │ │ └── 
snBTaQ6nSCACl4z6OgoKUdC.mat │ ├── _sfprj │ │ ├── Bitchplzwork │ │ │ ├── _self │ │ │ │ └── sfun │ │ │ │ │ └── info │ │ │ │ │ └── binfo.mat │ │ │ └── amsi_serial.mat │ │ ├── EMLReport │ │ │ ├── emlReportAccessInfo.mat │ │ │ ├── s9PYPNMBhHdO6QryHA5PViF.mat │ │ │ ├── sCkFIU2kiuEAIU96i73GX7E.mat │ │ │ ├── sFQgwjAbPyPHrsWx5nIcBCF.mat │ │ │ ├── sHTsCkXfsaxgGvp5B6gIsDH.mat │ │ │ ├── sJpzdckcHW32W9btqkbe87E.mat │ │ │ ├── sMNlS4WSQlq2BjqTB1MhFN.mat │ │ │ ├── sMsIGN7kQJ7vM1eACdbvrHF.mat │ │ │ ├── sZ4lfP1e5BI7PUlKP0L1IiG.mat │ │ │ ├── sdV6jeitGTsO8Ess5ig6gPC.mat │ │ │ └── snBTaQ6nSCACl4z6OgoKUdC.mat │ │ ├── Hybrid │ │ │ ├── _self │ │ │ │ └── sfun │ │ │ │ │ └── info │ │ │ │ │ └── binfo.mat │ │ │ └── amsi_serial.mat │ │ ├── Microgrid │ │ │ ├── _self │ │ │ │ └── sfun │ │ │ │ │ └── info │ │ │ │ │ └── binfo.mat │ │ │ └── amsi_serial.mat │ │ ├── loadchrge │ │ │ ├── _self │ │ │ │ └── sfun │ │ │ │ │ └── info │ │ │ │ │ └── binfo.mat │ │ │ └── amsi_serial.mat │ │ ├── power_PVarray_250kW │ │ │ ├── _self │ │ │ │ └── sfun │ │ │ │ │ └── info │ │ │ │ │ └── binfo.mat │ │ │ └── amsi_serial.mat │ │ └── precompile │ │ │ ├── 45Nz1moTM3gFMUVl155FEF.mat │ │ │ ├── 7uidPLBlbuJVtI1v11OWIC.mat │ │ │ ├── NEmFLVdIaIDOOAzSH3YT1B.mat │ │ │ ├── NvPlnt6L2iOdf1Jwkfdh5F.mat │ │ │ ├── RYvFsnwz0IyeTG5zuBZAYE.mat │ │ │ ├── S2OZXTJXMrHV6AqcBi8ZoF.mat │ │ │ ├── ZmkJN8jBDV9LXomvv3HUVD.mat │ │ │ ├── autoInferAccessInfo.mat │ │ │ ├── fW4c5frv0z318KTM3NEwLF.mat │ │ │ ├── fq8kkmRZGkv3H4UCFo0tQG.mat │ │ │ ├── jGVoJDT4nRc7uyAddd0mBD.mat │ │ │ ├── l1jRDt3SKHwBvKsrqidaeE.mat │ │ │ └── zjUDhUWRKJD1aq4Ci2NxnH.mat │ ├── grt │ │ └── Microgrid │ │ │ └── tmwinternal │ │ │ └── minfo.mat │ ├── modeladvisor │ │ └── UpgradeAdv_ │ │ │ └── anfismicrogrid1 │ │ │ └── ModelAdvisorData │ ├── sim │ │ └── varcache │ │ │ ├── Bitchplzwork │ │ │ ├── checksumOfCache.mat │ │ │ ├── tmwinternal │ │ │ │ └── simulink_cache.xml │ │ │ └── varInfo.mat │ │ │ ├── Hybrid │ │ │ ├── checksumOfCache.mat │ │ │ ├── tmwinternal │ │ │ │ └── simulink_cache.xml │ │ │ └── varInfo.mat │ │ │ ├── Microgrid │ │ │ ├── checksumOfCache.mat │ │ │ ├── tmwinternal │ │ │ │ └── simulink_cache.xml │ │ │ └── varInfo.mat │ │ │ ├── Solar │ │ │ ├── checksumOfCache.mat │ │ │ ├── tmwinternal │ │ │ │ └── simulink_cache.xml │ │ │ └── varInfo.mat │ │ │ ├── power_PVarray_250kW │ │ │ ├── checksumOfCache.mat │ │ │ ├── tmwinternal │ │ │ │ └── simulink_cache.xml │ │ │ └── varInfo.mat │ │ │ └── windgen │ │ │ ├── checksumOfCache.mat │ │ │ ├── tmwinternal │ │ │ └── simulink_cache.xml │ │ │ └── varInfo.mat │ └── sl_proj.tmw ├── wIND.m ├── wind.csv ├── wind.mdl ├── wind.mdl.original ├── windgen.mdl ├── windgen.mdl.original └── windgen.slxc ├── README.md ├── main_DDPG.py ├── main_PPO.py └── main_vpg.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | spinningup/ 3 | -------------------------------------------------------------------------------- /Code/__pycache__/core.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/__pycache__/core.cpython-36.pyc 
-------------------------------------------------------------------------------- /Code/__pycache__/core_DDPG.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/__pycache__/core_DDPG.cpython-36.pyc -------------------------------------------------------------------------------- /Code/__pycache__/core_PPO.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/__pycache__/core_PPO.cpython-36.pyc -------------------------------------------------------------------------------- /Code/__pycache__/core_VPG.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/__pycache__/core_VPG.cpython-36.pyc -------------------------------------------------------------------------------- /Code/__pycache__/enviroment.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/__pycache__/enviroment.cpython-36.pyc -------------------------------------------------------------------------------- /Code/__pycache__/user_config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/__pycache__/user_config.cpython-36.pyc -------------------------------------------------------------------------------- /Code/core_DDPG.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def mlp(sizes, activation, output_activation=nn.Identity): 14 | layers = [] 15 | for j in range(len(sizes)-1): 16 | act = activation if j < len(sizes)-2 else output_activation 17 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 18 | return nn.Sequential(*layers) 19 | 20 | 21 | def pi_newtork(obs_size, act_size, hidden_size): 22 | net = nn.Sequential( 23 | nn.Linear(obs_size, hidden_size), 24 | nn.ReLU(), 25 | nn.Linear(hidden_size, hidden_size), 26 | nn.ReLU(), 27 | nn.Linear(hidden_size, act_size), 28 | nn.Tanh()) 29 | return net 30 | 31 | def q_newtork(obs_size, act_size, hidden_size): 32 | in_size = obs_size + act_size 33 | net = nn.Sequential( 34 | nn.Linear(in_size, hidden_size), 35 | nn.ReLU(), 36 | nn.Linear(hidden_size, hidden_size), 37 | nn.ReLU(), 38 | nn.Linear(hidden_size, 1), 39 | nn.Identity()) 40 | return net 41 | 42 | def count_vars(module): 43 | return sum([np.prod(p.shape) for p in module.parameters()]) 
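# Note: with the default hidden_sizes=(256, 256) and ReLU activation, the generic
# mlp() helper above builds the same Linear-ReLU-Linear-ReLU-Linear-Tanh stack that
# pi_newtork spells out explicitly, which is why the commented-out mlp(...) calls in
# MLPActor/MLPQFunction below are interchangeable with the explicit builders.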
44 | 45 | class MLPActor(nn.Module): 46 | 47 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 48 | super().__init__() 49 | device='cuda' 50 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 51 | #print(obs_dim, act_dim, hidden_sizes) 52 | self.pi = pi_newtork(obs_dim,act_dim, hidden_sizes[0])#mlp(pi_sizes, activation, nn.Tanh) 53 | self.act_limit = torch.from_numpy(act_limit).to(device) 54 | 55 | def forward(self, obs): 56 | # Return output from network scaled to action space limits. 57 | return self.act_limit * self.pi(obs) 58 | 59 | class MLPQFunction(nn.Module): 60 | 61 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 62 | super().__init__() 63 | self.q = q_newtork(obs_dim,act_dim, hidden_sizes[0])#mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 64 | 65 | def forward(self, obs, act): 66 | q = self.q(torch.cat([obs, act], dim=-1)) 67 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 68 | 69 | class MLPActorCritic(nn.Module): 70 | 71 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 72 | activation=nn.ReLU): 73 | super().__init__() 74 | 75 | obs_dim = observation_space.shape[0] 76 | act_dim = action_space.shape[0] 77 | act_limit = action_space.high 78 | device = 'cuda' 79 | # build policy and value functions 80 | self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit).to(device) 81 | self.q = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation).to(device) 82 | 83 | def act(self, obs): 84 | with torch.no_grad(): 85 | return self.pi(obs).cpu().data.numpy() 86 | -------------------------------------------------------------------------------- /Code/core_PPO.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | from gym.spaces import Box, Discrete 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.distributions.normal import Normal 8 | from torch.distributions.categorical import Categorical 9 | 10 | 11 | def combined_shape(length, shape=None): 12 | if shape is None: 13 | return (length,) 14 | return (length, shape) if np.isscalar(shape) else (length, *shape) 15 | 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | def count_vars(module): 26 | return sum([np.prod(p.shape) for p in module.parameters()]) 27 | 28 | 29 | def discount_cumsum(x, discount): 30 | """ 31 | magic from rllab for computing discounted cumulative sums of vectors. 32 | 33 | input: 34 | vector x, 35 | [x0, 36 | x1, 37 | x2] 38 | 39 | output: 40 | [x0 + discount * x1 + discount^2 * x2, 41 | x1 + discount * x2, 42 | x2] 43 | """ 44 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 45 | 46 | 47 | class Actor(nn.Module): 48 | 49 | def _distribution(self, obs): 50 | raise NotImplementedError 51 | 52 | def _log_prob_from_distribution(self, pi, act): 53 | raise NotImplementedError 54 | 55 | def forward(self, obs, act=None): 56 | # Produce action distributions for given observations, and 57 | # optionally compute the log likelihood of given actions under 58 | # those distributions. 
59 | pi = self._distribution(obs) 60 | logp_a = None 61 | if act is not None: 62 | logp_a = self._log_prob_from_distribution(pi, act) 63 | return pi, logp_a 64 | 65 | def actor_newtork(obs_size, act_size, hidden_size): 66 | net = nn.Sequential( 67 | nn.Linear(obs_size, hidden_size), 68 | nn.ReLU(), 69 | nn.Linear(hidden_size, hidden_size), 70 | nn.ReLU(), 71 | nn.Linear(hidden_size, act_size), 72 | nn.Tanh()) 73 | return net 74 | 75 | 76 | def q_newtork(obs_size,hidden_size): 77 | net = nn.Sequential( 78 | nn.Linear(obs_size, hidden_size), 79 | nn.ReLU(), 80 | nn.Linear(hidden_size, hidden_size), 81 | nn.ReLU(), 82 | nn.Linear(hidden_size, 1), 83 | nn.Tanh()) 84 | return net 85 | 86 | class MLPCategoricalActor(Actor): 87 | 88 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 89 | super().__init__() 90 | self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation)#actor_newtork(obs_dim, act_dim, hidden_sizes[0])# 91 | 92 | def _distribution(self, obs): 93 | logits = self.logits_net(obs) 94 | return Categorical(logits=logits) 95 | 96 | def _log_prob_from_distribution(self, pi, act): 97 | return pi.log_prob(act) 98 | 99 | 100 | class MLPGaussianActor(Actor): 101 | 102 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 103 | super().__init__() 104 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 105 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 106 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation)#actor_newtork(obs_dim, act_dim, hidden_sizes[0])# 107 | 108 | def _distribution(self, obs): 109 | mu = self.mu_net(obs) 110 | std = torch.exp(self.log_std) 111 | return Normal(mu, std) 112 | 113 | def _log_prob_from_distribution(self, pi, act): 114 | return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution 115 | 116 | 117 | class MLPCritic(nn.Module): 118 | 119 | def __init__(self, obs_dim, hidden_sizes, activation): 120 | super().__init__() 121 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation)#q_newtork(obs_dim,hidden_sizes[0])# 122 | 123 | def forward(self, obs): 124 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 
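# Shape note: for a batch of observations of shape (B, obs_dim), v_net returns a
# tensor of shape (B, 1); the squeeze(-1) above flattens this to (B,) so each batch
# element yields a single scalar value estimate.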
125 | 126 | 127 | 128 | class MLPActorCritic(nn.Module): 129 | 130 | 131 | def __init__(self, observation_space, action_space, 132 | hidden_sizes=(64,64), activation=nn.Tanh): 133 | super().__init__() 134 | 135 | obs_dim = observation_space.shape[0] 136 | 137 | # policy builder depends on action space 138 | if isinstance(action_space, Box): 139 | self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation).to('cuda') 140 | elif isinstance(action_space, Discrete): 141 | self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation).to('cuda') 142 | 143 | # build value function 144 | self.v = MLPCritic(obs_dim, hidden_sizes, activation).to('cuda') 145 | 146 | def step(self, obs): 147 | with torch.no_grad(): 148 | pi = self.pi._distribution(obs) 149 | a = pi.sample() 150 | logp_a = self.pi._log_prob_from_distribution(pi, a) 151 | v = self.v(obs) 152 | return a.cpu().data.numpy(), v.cpu().data.numpy(), logp_a.cpu().data.numpy() 153 | 154 | def act(self, obs): 155 | return self.step(obs)[0] -------------------------------------------------------------------------------- /Code/core_VPG.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | from gym.spaces import Box, Discrete 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.distributions.normal import Normal 8 | from torch.distributions.categorical import Categorical 9 | 10 | 11 | def combined_shape(length, shape=None): 12 | if shape is None: 13 | return (length,) 14 | return (length, shape) if np.isscalar(shape) else (length, *shape) 15 | 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | def count_vars(module): 26 | return sum([np.prod(p.shape) for p in module.parameters()]) 27 | 28 | 29 | def discount_cumsum(x, discount): 30 | """ 31 | magic from rllab for computing discounted cumulative sums of vectors. 32 | 33 | input: 34 | vector x, 35 | [x0, 36 | x1, 37 | x2] 38 | 39 | output: 40 | [x0 + discount * x1 + discount^2 * x2, 41 | x1 + discount * x2, 42 | x2] 43 | """ 44 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 45 | 46 | 47 | class Actor(nn.Module): 48 | 49 | def _distribution(self, obs): 50 | raise NotImplementedError 51 | 52 | def _log_prob_from_distribution(self, pi, act): 53 | raise NotImplementedError 54 | 55 | def forward(self, obs, act=None): 56 | # Produce action distributions for given observations, and 57 | # optionally compute the log likelihood of given actions under 58 | # those distributions. 
59 | pi = self._distribution(obs) 60 | logp_a = None 61 | if act is not None: 62 | logp_a = self._log_prob_from_distribution(pi, act) 63 | return pi, logp_a 64 | 65 | 66 | class MLPCategoricalActor(Actor): 67 | 68 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 69 | super().__init__() 70 | self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 71 | 72 | def _distribution(self, obs): 73 | logits = self.logits_net(obs) 74 | return Categorical(logits=logits) 75 | 76 | def _log_prob_from_distribution(self, pi, act): 77 | return pi.log_prob(act) 78 | 79 | 80 | class MLPGaussianActor(Actor): 81 | 82 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 83 | super().__init__() 84 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 85 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 86 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 87 | 88 | def _distribution(self, obs): 89 | mu = self.mu_net(obs) 90 | std = torch.exp(self.log_std) 91 | return Normal(mu, std) 92 | 93 | def _log_prob_from_distribution(self, pi, act): 94 | return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution 95 | 96 | 97 | class MLPCritic(nn.Module): 98 | 99 | def __init__(self, obs_dim, hidden_sizes, activation): 100 | super().__init__() 101 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 102 | 103 | def forward(self, obs): 104 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 105 | 106 | 107 | 108 | class MLPActorCritic(nn.Module): 109 | 110 | 111 | def __init__(self, observation_space, action_space, 112 | hidden_sizes=(64,64), activation=nn.Tanh): 113 | super().__init__() 114 | 115 | obs_dim = observation_space.shape[0] 116 | 117 | # policy builder depends on action space 118 | if isinstance(action_space, Box): 119 | self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation).to('cuda') 120 | elif isinstance(action_space, Discrete): 121 | self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation).to('cuda') 122 | 123 | # build value function 124 | self.v = MLPCritic(obs_dim, hidden_sizes, activation).to('cuda') 125 | 126 | def step(self, obs): 127 | with torch.no_grad(): 128 | pi = self.pi._distribution(obs) 129 | a = pi.sample() 130 | logp_a = self.pi._log_prob_from_distribution(pi, a) 131 | v = self.v(obs) 132 | return a.cpu().data.numpy(), v.cpu().data.numpy(), logp_a.cpu().data.numpy() 133 | 134 | def act(self, obs): 135 | return self.step(obs)[0] -------------------------------------------------------------------------------- /Code/ddpg.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | import torch 4 | from torch.optim import Adam 5 | import gym 6 | import time 7 | from spinup.algos.pytorch.ddpg import core 8 | from Code.environment import MicrogridEnv 9 | from spinup.utils.logx import EpochLogger 10 | 11 | 12 | class ReplayBuffer: 13 | """ 14 | A simple FIFO experience replay buffer for DDPG agents. 
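    Transitions are kept in fixed-size numpy arrays; once the buffer is full,
    the write pointer wraps around and the oldest entries are overwritten first.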
15 | """ 16 | 17 | def __init__(self, obs_dim, act_dim, size): 18 | self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32) 19 | self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32) 20 | self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32) 21 | self.rew_buf = np.zeros(size, dtype=np.float32) 22 | self.done_buf = np.zeros(size, dtype=np.float32) 23 | self.ptr, self.size, self.max_size = 0, 0, size 24 | 25 | def store(self, obs, act, rew, next_obs, done): 26 | self.obs_buf[self.ptr] = obs 27 | self.obs2_buf[self.ptr] = next_obs 28 | self.act_buf[self.ptr] = act 29 | self.rew_buf[self.ptr] = rew 30 | self.done_buf[self.ptr] = done 31 | self.ptr = (self.ptr+1) % self.max_size 32 | self.size = min(self.size+1, self.max_size) 33 | 34 | def sample_batch(self, batch_size=32): 35 | idxs = np.random.randint(0, self.size, size=batch_size) 36 | batch = dict(obs=self.obs_buf[idxs], 37 | obs2=self.obs2_buf[idxs], 38 | act=self.act_buf[idxs], 39 | rew=self.rew_buf[idxs], 40 | done=self.done_buf[idxs]) 41 | return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()} 42 | 43 | 44 | 45 | def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 46 | steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 47 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, 48 | update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, 49 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 50 | """ 51 | Deep Deterministic Policy Gradient (DDPG) 52 | 53 | 54 | Args: 55 | env_fn : A function which creates a copy of the environment. 56 | The environment must satisfy the OpenAI Gym API. 57 | 58 | actor_critic: The constructor method for a PyTorch Module with an ``act`` 59 | method, a ``pi`` module, and a ``q`` module. The ``act`` method and 60 | ``pi`` module should accept batches of observations as inputs, 61 | and ``q`` should accept a batch of observations and a batch of 62 | actions as inputs. When called, these should return: 63 | 64 | =========== ================ ====================================== 65 | Call Output Shape Description 66 | =========== ================ ====================================== 67 | ``act`` (batch, act_dim) | Numpy array of actions for each 68 | | observation. 69 | ``pi`` (batch, act_dim) | Tensor containing actions from policy 70 | | given observations. 71 | ``q`` (batch,) | Tensor containing the current estimate 72 | | of Q* for the provided observations 73 | | and actions. (Critical: make sure to 74 | | flatten this!) 75 | =========== ================ ====================================== 76 | 77 | ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 78 | you provided to DDPG. if using a custom ActorCritic or changing the 79 | Hidden size of activation function. 80 | 81 | seed (int): Seed for random number generators. 82 | 83 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 84 | for the agent and the environment in each epoch. 85 | 86 | epochs (int): Number of epochs to run and train agent. 87 | 88 | replay_size (int): Maximum length of replay buffer. 89 | 90 | gamma (float): Discount factor. (Always between 0 and 1.) 91 | 92 | polyak (float): Interpolation factor in polyak averaging for target 93 | networks. Target networks are updated towards main networks 94 | according to: 95 | 96 | .. 
math:: \\theta_{\\text{targ}} \\leftarrow 97 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 98 | 99 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 100 | close to 1.) 101 | 102 | pi_lr (float): Learning rate for policy. 103 | 104 | q_lr (float): Learning rate for Q-networks. 105 | 106 | batch_size (int): Minibatch size for SGD. 107 | 108 | start_steps (int): Number of steps for uniform-random action selection, 109 | before running real policy. Helps exploration. 110 | 111 | update_after (int): Number of env interactions to collect before 112 | starting to do gradient descent updates. Ensures replay buffer 113 | is full enough for useful updates. 114 | 115 | update_every (int): Number of env interactions that should elapse 116 | between gradient descent updates. Note: Regardless of how long 117 | you wait between updates, the ratio of env steps to gradient steps 118 | is locked to 1. 119 | 120 | act_noise (float): Stddev for Gaussian exploration noise added to 121 | policy at training time. (At test time, no noise is added.) 122 | 123 | num_test_episodes (int): Number of episodes to test the deterministic 124 | policy at the end of each epoch. 125 | 126 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 127 | 128 | logger_kwargs (dict): Keyword args for EpochLogger. 129 | 130 | save_freq (int): How often (in terms of gap between epochs) to save 131 | the current policy and value function. 132 | 133 | """ 134 | 135 | logger = EpochLogger(**logger_kwargs) 136 | logger.save_config(locals()) 137 | 138 | torch.manual_seed(seed) 139 | np.random.seed(seed) 140 | 141 | env, test_env = env_fn(), env_fn() 142 | obs_dim = env.observation_space.shape 143 | act_dim = env.action_space.shape[0] 144 | 145 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 146 | act_limit = [env.action_space.low, env.action_space.high] 147 | 148 | # Create actor-critic module and target networks 149 | ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) 150 | ac_targ = deepcopy(ac) 151 | 152 | # Freeze target networks with respect to optimizers (only update via polyak averaging) 153 | for p in ac_targ.parameters(): 154 | p.requires_grad = False 155 | 156 | # Experience buffer 157 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 158 | 159 | # Count variables (protip: try to get a feel for how different size networks behave!) 
160 | var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q]) 161 | logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n'%var_counts) 162 | 163 | # Set up function for computing DDPG Q-loss 164 | def compute_loss_q(data): 165 | o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] 166 | 167 | q = ac.q(o,a) 168 | 169 | # Bellman backup for Q function 170 | with torch.no_grad(): 171 | q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2)) 172 | backup = r + gamma * (1 - d) * q_pi_targ 173 | 174 | # MSE loss against Bellman backup 175 | loss_q = ((q - backup)**2).mean() 176 | 177 | # Useful info for logging 178 | loss_info = dict(QVals=q.detach().numpy()) 179 | 180 | return loss_q, loss_info 181 | 182 | # Set up function for computing DDPG pi loss 183 | def compute_loss_pi(data): 184 | o = data['obs'] 185 | q_pi = ac.q(o, ac.pi(o)) 186 | return -q_pi.mean() 187 | 188 | # Set up optimizers for policy and q-function 189 | pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) 190 | q_optimizer = Adam(ac.q.parameters(), lr=q_lr) 191 | 192 | # Set up model saving 193 | logger.setup_pytorch_saver(ac) 194 | 195 | def update(data): 196 | # First run one gradient descent step for Q. 197 | q_optimizer.zero_grad() 198 | loss_q, loss_info = compute_loss_q(data) 199 | loss_q.backward() 200 | q_optimizer.step() 201 | 202 | # Freeze Q-network so you don't waste computational effort 203 | # computing gradients for it during the policy learning step. 204 | for p in ac.q.parameters(): 205 | p.requires_grad = False 206 | 207 | # Next run one gradient descent step for pi. 208 | pi_optimizer.zero_grad() 209 | loss_pi = compute_loss_pi(data) 210 | loss_pi.backward() 211 | pi_optimizer.step() 212 | 213 | # Unfreeze Q-network so you can optimize it at next DDPG step. 214 | for p in ac.q.parameters(): 215 | p.requires_grad = True 216 | 217 | # Record things 218 | logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) 219 | 220 | # Finally, update target networks by polyak averaging. 221 | with torch.no_grad(): 222 | for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): 223 | # NB: We use an in-place operations "mul_", "add_" to update target 224 | # params, as opposed to "mul" and "add", which would make new tensors. 225 | p_targ.data.mul_(polyak) 226 | p_targ.data.add_((1 - polyak) * p.data) 227 | 228 | def get_action(o, noise_scale): 229 | a = ac.act(torch.as_tensor(o, dtype=torch.float32)) 230 | a += noise_scale * np.random.randn(act_dim) 231 | return np.clip(a, act_limit[0], act_limit[1]) 232 | 233 | def test_agent(): 234 | for j in range(num_test_episodes): 235 | o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 236 | while not(d or (ep_len == max_ep_len)): 237 | # Take deterministic actions at test time (noise_scale=0) 238 | o, r, d, _ = test_env.step(get_action(o, 0)) 239 | ep_ret += r 240 | ep_len += 1 241 | logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 242 | 243 | # Prepare for interaction with environment 244 | total_steps = steps_per_epoch * epochs 245 | start_time = time.time() 246 | o, ep_ret, ep_len = env.reset(), 0, 0 247 | 248 | # Main loop: collect experience in env and update/log each epoch 249 | for t in range(total_steps): 250 | 251 | # Until start_steps have elapsed, randomly sample actions 252 | # from a uniform distribution for better exploration. Afterwards, 253 | # use the learned policy (with some noise, via act_noise). 
254 | if t > start_steps: 255 | a = get_action(o, act_noise) 256 | else: 257 | a = env.action_space.sample() 258 | 259 | # Step the env 260 | o2, r, d, _ = env.step(a) 261 | ep_ret += r 262 | ep_len += 1 263 | 264 | # Ignore the "done" signal if it comes from hitting the time 265 | # horizon (that is, when it's an artificial terminal signal 266 | # that isn't based on the agent's state) 267 | d = False if ep_len==max_ep_len else d 268 | 269 | # Store experience to replay buffer 270 | replay_buffer.store(o, a, r, o2, d) 271 | 272 | # Super critical, easy to overlook step: make sure to update 273 | # most recent observation! 274 | o = o2 275 | 276 | # End of trajectory handling 277 | if d or (ep_len == max_ep_len): 278 | logger.store(EpRet=ep_ret, EpLen=ep_len) 279 | o, ep_ret, ep_len = env.reset(), 0, 0 280 | 281 | # Update handling 282 | if t >= update_after and t % update_every == 0: 283 | for _ in range(update_every): 284 | batch = replay_buffer.sample_batch(batch_size) 285 | update(data=batch) 286 | 287 | # End of epoch handling 288 | if (t+1) % steps_per_epoch == 0: 289 | epoch = (t+1) // steps_per_epoch 290 | 291 | # Save model 292 | if (epoch % save_freq == 0) or (epoch == epochs): 293 | logger.save_state({'env': env}, None) 294 | 295 | # Test the performance of the deterministic version of the agent. 296 | test_agent() 297 | 298 | # Log info about epoch 299 | logger.log_tabular('Epoch', epoch) 300 | logger.log_tabular('EpRet', with_min_and_max=True) 301 | logger.log_tabular('TestEpRet', with_min_and_max=True) 302 | logger.log_tabular('EpLen', average_only=True) 303 | logger.log_tabular('TestEpLen', average_only=True) 304 | logger.log_tabular('TotalEnvInteracts', t) 305 | logger.log_tabular('QVals', with_min_and_max=True) 306 | logger.log_tabular('LossPi', average_only=True) 307 | logger.log_tabular('LossQ', average_only=True) 308 | logger.log_tabular('Time', time.time()-start_time) 309 | logger.dump_tabular() 310 | 311 | if __name__ == '__main__': 312 | import argparse 313 | parser = argparse.ArgumentParser() 314 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 315 | parser.add_argument('--hid', type=int, default=256) 316 | parser.add_argument('--l', type=int, default=2) 317 | parser.add_argument('--gamma', type=float, default=0.99) 318 | parser.add_argument('--seed', '-s', type=int, default=0) 319 | parser.add_argument('--epochs', type=int, default=50) 320 | parser.add_argument('--exp_name', type=str, default='ddpg') 321 | args = parser.parse_args() 322 | 323 | from spinup.utils.run_utils import setup_logger_kwargs 324 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 325 | 326 | env= MicrogridEnv() 327 | 328 | ddpg(lambda : env, actor_critic=core.MLPActorCritic, 329 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), 330 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 331 | logger_kwargs=logger_kwargs) 332 | -------------------------------------------------------------------------------- /Code/enviroment.py: -------------------------------------------------------------------------------- 1 | #Imports 2 | import pandas as pd 3 | import numpy as np 4 | from matplotlib import pyplot 5 | import gym 6 | from gym import spaces 7 | import random 8 | import math 9 | 10 | #CONSTANTS 11 | #MAXIMUM LOADS 12 | SCHOOL_MAX_LOAD = 6.012 13 | HOUSE_MAX_LOAD = 5.678 14 | MOSQUE_MAX_LOAD = 4.324 15 | HEALTH_CENTER_MAX_LOAD = 5.8 16 | WATER_PUMP_MAX_LOAD = 0.77 17 | #MICROGRIDS PARAMETERS 18 | UM_BADER_LOAD_PARAMETERS = [70, 1, 2, 1, 2] 19 | UM_BADER_MAX_LOAD = 
500 20 | UM_BADER_BATTERY_PARAMETERS = [500, 0.02, 300, 0.3] 21 | HAMZA_ELSHEIKH_LOAD_PARAMETERS = [50, 1, 1, 0, 1] 22 | HAMZA_ELSHEIKH_MAX_LOAD =350 23 | HAMZA_ELSHEIKH_BATTERY_PARAMETERS = [350, 0.02, 200, 0.3] 24 | TANNAH_LOAD_PARAMETERS = [45, 0, 1, 0, 1] 25 | TANNAH_MAX_LOAD = 300 26 | TANNAH_BATTERY_PARAMETERS = [300, 0.02, 150, 0.3] 27 | 28 | #DISTANCES AND PRICES 29 | distances = {"Um_Bader_Tannah": 10, "Um_Bader_Hamza_Elsheikh": 50, "Tannah_Hamza_Elsheikh": 30, "Tannah_Um_Bader": 10, "Hamza_Elsheikh_Um_Bader": 50, "Hamza_Elsheikh_Tannah": 30} 30 | NETWORK_PRICE = 19 #In cents 31 | 32 | #Helper Classes 33 | 34 | #defining a load, ie. schools, houses, health centers and water pumps 35 | class Load: 36 | def __init__(self, name, max_load, num_of_units): 37 | self.name = name #name of the load item to get the usage trend 38 | self.max_load = max_load #maximum_load_needed_by_load_category 39 | self.usage_trends_df = pd.read_csv("Data/usage_trends.csv") 40 | self.usage_trends_values = np.array(self.usage_trends_df[name]) #trend_of_percentage_of_usage_during_a_day 41 | self.num_of_units = num_of_units #number_of_units_of_load_available_in_area 42 | 43 | def _current_single_Load(self, time): 44 | idx = self.usage_trends_df[self.usage_trends_df["Time"] == time].index.values 45 | current_load = self.max_load * self.usage_trends_values[idx] 46 | return current_load 47 | 48 | def current_total_load(self, time): 49 | single_load = self._current_single_Load(time) 50 | return single_load * self.num_of_units 51 | 52 | #battery Class, to define the storage of the system 53 | class Battery: 54 | def __init__(self, max_capacity, discharge_cofficient, remaining_capacity, charge_rate): 55 | self.max_capacity = max_capacity #full_charge_capacity 56 | self.discharge_cofficient = discharge_cofficient #Discharge_coefficient 57 | self.remaining_capacity = remaining_capacity #Remaining_Capacity 58 | self.charge_rate = charge_rate #percentage_charged_from_inputed_amount 59 | 60 | #charges the battery with given amount and returns leftover amount to be returned if amount + currnet capacity > max capacity 61 | def charge(self, amount): 62 | empty = self.max_capacity - self.remaining_capacity 63 | if empty <=0: 64 | return amount 65 | else: 66 | self.remaining_capacity+= amount 67 | leftover = self.remaining_capacity - self.max_capacity 68 | self.remaining_capacity = min(self.max_capacity, self.remaining_capacity) 69 | return max(leftover,0) 70 | 71 | #takes energy from the battery providing the needed amount and returns amount provided form the battery 72 | 73 | def supply(self, amount): 74 | remaining = self.remaining_capacity 75 | self.remaining_capacity -= amount 76 | self.remaining_capacity = max(self.remaining_capacity,0) 77 | return min(amount, remaining) 78 | 79 | class Generation: 80 | def __init__(self, name, maxCapacity = None): 81 | self.solar_df = pd.read_csv("Data/Solar/" + name + "_solar_generation.csv") 82 | self.wind_df = pd.read_csv("Data/wind/" + name + "_wind_generation.csv") 83 | self.solar_generation = np.array(self.solar_df["value"], dtype = np.float32) 84 | self.wind_generation = np.array(self.wind_df["value"], dtype = np.float32) 85 | for i in range(len(self.wind_generation)): 86 | if self.wind_generation[i] <0: 87 | self.wind_generation[i] = 0 88 | #self.wind_generation = 0 89 | self.generation = self.solar_generation + self.wind_generation 90 | self.max_generation = max(self.generation) 91 | #given current time, give the total generation of the solar and wind units 92 | def 
current_generation(self, time): 93 | idx = self.solar_df[self.solar_df["Time"] == time].index.values 94 | return (self.generation[idx]/1000) #KW 95 | 96 | class Microgrid: 97 | def __init__(self, name, load_parameters, battery_parameters): 98 | self.name = name #name of the microgrid used for data loading 99 | self.load_parameters = load_parameters #np array of the parameters to create the load of the microgrid that is the number of schools, houses, mosques, health centers and water pumps 100 | self.battery_parameters = battery_parameters #np array of the parameters to create the battery of the microgrid 101 | self.battery = self._create_battery(battery_parameters) 102 | self.houses, self.schools, self.mosques, self.health_centers, self.water_pumps= self._create_loads(load_parameters) 103 | self.generation = Generation(name) 104 | self.unit_price = 10 105 | 106 | 107 | #creats a battery given its battery parameters ie max cap, dis coeff, initial rem cap and it's charge rate 108 | def _create_battery(self, battery_parameters): 109 | max_capacity = battery_parameters[0] 110 | discharge_cofficient = battery_parameters[1] 111 | remaining_capacity = battery_parameters[2] 112 | charge_rate = battery_parameters[3] 113 | battery = Battery(max_capacity, discharge_cofficient, remaining_capacity, charge_rate) 114 | return battery 115 | 116 | #creates the loads of the MG, using the number of units for each load and its name for data reasons 117 | def _create_loads(self, load_parameters): 118 | num_houses, num_schools, num_mosques, num_health_centers, num_water_pumps = load_parameters 119 | houses_load = Load("House", HOUSE_MAX_LOAD, num_houses) 120 | schools_load = Load("School", SCHOOL_MAX_LOAD, num_schools) 121 | mosques_load = Load("Mosque", MOSQUE_MAX_LOAD, num_mosques) 122 | health_centers_load = Load("Health_center", HEALTH_CENTER_MAX_LOAD, num_health_centers) 123 | water_pumps_load = Load("Water_pump", WATER_PUMP_MAX_LOAD, num_water_pumps) 124 | return houses_load, schools_load, mosques_load, health_centers_load, water_pumps_load 125 | 126 | #returns the total load by all MG load units 127 | def total_load(self, time): 128 | houses_load = self.houses.current_total_load(time) 129 | schools_load = self.schools.current_total_load(time) 130 | mosques_load = self.mosques.current_total_load(time) 131 | health_centers_load = self.health_centers.current_total_load(time) 132 | water_pumps_load = self.water_pumps.current_total_load(time) 133 | total_load = houses_load + schools_load + mosques_load + health_centers_load + water_pumps_load 134 | return total_load 135 | 136 | #current status of the MG containing it's battery's remaining capacity, it's current power generation and its current total load 137 | def state (self, time): 138 | total_generation = self.generation.current_generation(time) 139 | total_load = self.total_load(time) 140 | battery_status = self.battery.remaining_capacity 141 | return total_load, total_generation, battery_status 142 | 143 | def to_trade(self, time): 144 | load, generation, battery = self.state(time) 145 | to_trade = load - (generation + battery) 146 | return to_trade 147 | 148 | def supply(self, load, time): 149 | if load >= self.generation.current_generation(time): 150 | load -= self.generation.current_generation(time) 151 | if load <= self.battery.remaining_capacity: 152 | self.battery.remaining_capacity -= load 153 | load = 0 154 | else: 155 | load -= self.battery.remaining_capacity 156 | self.battery.remaining_capacity = 0 157 | else: 158 | load = 0 159 | return load 
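#sign convention used by Microgrid.to_trade above: a positive value means the
#microgrid is in deficit (its load exceeds generation plus stored energy), while a
#negative value means it has a surplus available to trade away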
160 | 161 | class MicrogridEnv (gym.Env): 162 | def __init__(self): 163 | self.main_mG = Microgrid("Hamza_Elsheikh", HAMZA_ELSHEIKH_LOAD_PARAMETERS, HAMZA_ELSHEIKH_BATTERY_PARAMETERS) 164 | self.first_mg = Microgrid("Um_Bader", UM_BADER_LOAD_PARAMETERS, UM_BADER_BATTERY_PARAMETERS) 165 | self.second_mg= Microgrid("Tannah", TANNAH_LOAD_PARAMETERS, TANNAH_BATTERY_PARAMETERS) 166 | self.time_step = 0 167 | self.dates = np.array(pd.read_csv("Data/Solar/" + self.main_mG.name + "_solar_generation.csv")["Time"]) 168 | self.start_date = self.dates[self.time_step] 169 | self.current_price = NETWORK_PRICE 170 | self.action_space = spaces.Box(low=np.array([0,0,0,self.main_mG.unit_price]), high=np.array([3, 2, self.main_mG.battery.max_capacity, NETWORK_PRICE]), dtype = np.float32) 171 | self.observation_space = spaces.Box(low =np.array([0.0, 0.0, 0.0, 0.0]), high =np.array([1, HAMZA_ELSHEIKH_MAX_LOAD, self.main_mG.generation.max_generation, NETWORK_PRICE]), dtype = np.float32) 172 | 173 | def _status(self): 174 | if self.time_step + self.start_date_idx >= len(self.dates): 175 | self.time_step = 0 176 | self.start_date_idx = 0 177 | self.current_date = self.dates[self.time_step + self.start_date_idx] 178 | current_load, current_generation, remaining_capacity = self.main_mG.state(self.current_date) 179 | time_s = self.time_step 180 | previous_price = self.current_price 181 | a = np.array([1,2,3]) 182 | if type(remaining_capacity) == type(a): 183 | state = [remaining_capacity[0], current_load[0], current_generation[0], previous_price] 184 | else: 185 | state = [remaining_capacity, current_load[0], current_generation[0], previous_price] 186 | return state 187 | 188 | def to_trade_m (self, mg): 189 | cl, cg, rc = mg.state(self.current_date) 190 | a = np.array([1,2,3]) 191 | if type(rc) == type(a): 192 | state = rc[0] + cg[0] 193 | state = cl[0] - state 194 | else: 195 | state = rc + cg[0] 196 | state -= cl[0] 197 | return state 198 | 199 | def reset(self): 200 | self.start_date_idx = random.randint(0,len(self.dates)) 201 | self.start_date = self.dates[self.start_date_idx] 202 | self.current_date = self.start_date 203 | self.main_mG.battery = self.main_mG._create_battery(HAMZA_ELSHEIKH_BATTERY_PARAMETERS) 204 | self.current_price = NETWORK_PRICE 205 | self.energy_bought = [] 206 | self.energy_sold = [] 207 | self.prices = [] 208 | self.tot = [] 209 | state = self._status() 210 | return state 211 | 212 | def _travel_loss(self, target_mg, amount): 213 | 214 | src_name = self.main_mG.name 215 | dist_name = target_mg.name 216 | final_name = src_name +"_"+ dist_name 217 | distance = distances[final_name] 218 | base_res = 1.1 #25mm aluminium 219 | voltage = 33000#use sub_transmission? 
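#approximate resistive (I^2 * R) line loss: with power P transferred at line voltage V,
#the current is roughly I = P/V, so loss = I^2 * R = P^2 * (base_res * distance) / V^2,
#which is what the next line computes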
220 | loss = ((amount**2) * (base_res*distance))/(voltage **2) 221 | return loss 222 | 223 | def step(self, action): 224 | action_type = action[0] 225 | target_mg_idx = action[1] 226 | amount = action[2] 227 | price = action[3] 228 | reward = 0 229 | is_done = False 230 | main_mg = self.main_mG 231 | if target_mg_idx <1: 232 | target_mg = self.first_mg 233 | else: 234 | target_mg = self.second_mg 235 | 236 | needed_main_mg = self.to_trade_m(main_mg) 237 | if amount > abs(needed_main_mg): 238 | reward -= 10 239 | amount += self._travel_loss(target_mg, amount) 240 | offer = abs(target_mg.to_trade(self.current_date)) 241 | if action_type <1:#buy from target MG 242 | if main_mg.to_trade(self.current_date) < 0: 243 | reward -= 10 244 | if price >= target_mg.unit_price: 245 | if offer != 0: 246 | if amount == 0: 247 | reward -=10 248 | elif offer >= amount: 249 | target_mg.battery.supply(amount) 250 | main_mg.battery.charge(amount) 251 | rem_amount = 0 252 | reward -= rem_amount / amount 253 | reward += (NETWORK_PRICE - price)/main_mg.unit_price 254 | self.energy_bought.append(amount - rem_amount) 255 | self.energy_sold.append(0) 256 | 257 | else: 258 | target_mg.battery.supply(offer) 259 | main_mg.battery.charge(offer) 260 | rem_amount = amount - offer 261 | reward -= rem_amount / amount 262 | reward += (NETWORK_PRICE - price)/main_mg.unit_price 263 | reward = reward[0] 264 | self.energy_bought.append(amount - rem_amount) 265 | self.energy_sold.append(0) 266 | else: 267 | reward -= 1 268 | else: 269 | reward -= 1 270 | self.prices.append(price) 271 | self.energy_bought.append(0) 272 | self.energy_sold.append(0) 273 | main_mg.supply(main_mg.total_load(self.current_date), self.current_date) 274 | 275 | 276 | elif action_type < 2:#Sell to target MG 277 | if main_mg.to_trade(self.current_date) > 0: 278 | reward -= 10 279 | if price >= main_mg.unit_price and price <= NETWORK_PRICE: 280 | if offer != 0: 281 | if amount == 0: 282 | reward -=10 283 | elif offer >= amount: 284 | main_mg.battery.supply(amount) 285 | target_mg.battery.charge(amount) 286 | rem_amount = 0 287 | reward -= rem_amount / amount 288 | reward += (price - main_mg.unit_price)/main_mg.unit_price 289 | self.energy_sold.append(amount - rem_amount) 290 | self.energy_bought.append(0) 291 | else: 292 | main_mg.battery.supply(offer) 293 | target_mg.battery.charge(offer) 294 | rem_amount = amount - offer 295 | reward -= rem_amount / amount 296 | reward += (price - main_mg.unit_price)/main_mg.unit_price 297 | reward = reward[0] 298 | self.energy_sold.append(amount - rem_amount) 299 | self.energy_bought.append(0) 300 | else: 301 | reward -= 1 302 | else: 303 | reward -= 1 304 | self.prices.append(price) 305 | self.energy_bought.append(0) 306 | self.energy_sold.append(0) 307 | main_mg.supply(main_mg.total_load(self.current_date), self.current_date) 308 | 309 | else: 310 | self.prices.append(price) 311 | self.energy_bought.append(0) 312 | self.energy_sold.append(0) 313 | main_mg.supply(main_mg.total_load(self.current_date), self.current_date) 314 | 315 | tot = self.to_trade_m(main_mg) 316 | if tot >0: 317 | reward += 100 318 | self.time_step +=1 319 | self.tot.append(tot) 320 | state = self._status() 321 | tgt_total_load, tgt_total_generation, tgt_battery_status = target_mg.state(self.current_date) 322 | target_mg.battery.charge(tgt_total_load - tgt_total_load) 323 | is_done = False 324 | return state, reward, is_done, {} 325 | 326 | def render(self): 327 | pass -------------------------------------------------------------------------------- 
/Code/user_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | 4 | # Default neural network backend for each algo 5 | # (Must be either 'tf1' or 'pytorch') 6 | DEFAULT_BACKEND = { 7 | 'vpg': 'pytorch', 8 | 'trpo': 'tf1', 9 | 'ppo': 'pytorch', 10 | 'ddpg': 'pytorch', 11 | 'td3': 'pytorch', 12 | 'sac': 'pytorch' 13 | } 14 | 15 | # Where experiment outputs are saved by default: 16 | DEFAULT_DATA_DIR = osp.join(osp.abspath(osp.dirname(osp.dirname(__file__))),'data') 17 | 18 | # Whether to automatically insert a date and time stamp into the names of 19 | # save directories: 20 | FORCE_DATESTAMP = False 21 | 22 | # Whether GridSearch provides automatically-generated default shorthands: 23 | DEFAULT_SHORTHAND = True 24 | 25 | # Tells the GridSearch how many seconds to pause for before launching 26 | # experiments. 27 | WAIT_BEFORE_LAUNCH = 5 -------------------------------------------------------------------------------- /Code/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/utils/__init__.py -------------------------------------------------------------------------------- /Code/utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/utils/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /Code/utils/__pycache__/logx.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/utils/__pycache__/logx.cpython-36.pyc -------------------------------------------------------------------------------- /Code/utils/__pycache__/mpi_pytorch.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/utils/__pycache__/mpi_pytorch.cpython-36.pyc -------------------------------------------------------------------------------- /Code/utils/__pycache__/mpi_tools.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/utils/__pycache__/mpi_tools.cpython-36.pyc -------------------------------------------------------------------------------- /Code/utils/__pycache__/run_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/utils/__pycache__/run_utils.cpython-36.pyc 
-------------------------------------------------------------------------------- /Code/utils/__pycache__/serialization_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Code/utils/__pycache__/serialization_utils.cpython-36.pyc -------------------------------------------------------------------------------- /Code/utils/logx.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Some simple logging functionality, inspired by rllab's logging. 4 | 5 | Logs to a tab-separated-values file (path/to/output_directory/progress.txt) 6 | 7 | """ 8 | import json 9 | import joblib 10 | import shutil 11 | import numpy as np 12 | import tensorflow as tf 13 | import torch 14 | import os.path as osp, time, atexit, os 15 | import warnings 16 | from Code.utils.mpi_tools import proc_id, mpi_statistics_scalar 17 | from Code.utils.serialization_utils import convert_json 18 | 19 | color2num = dict( 20 | gray=30, 21 | red=31, 22 | green=32, 23 | yellow=33, 24 | blue=34, 25 | magenta=35, 26 | cyan=36, 27 | white=37, 28 | crimson=38 29 | ) 30 | 31 | def colorize(string, color, bold=False, highlight=False): 32 | """ 33 | Colorize a string. 34 | 35 | This function was originally written by John Schulman. 36 | """ 37 | attr = [] 38 | num = color2num[color] 39 | if highlight: num += 10 40 | attr.append(str(num)) 41 | if bold: attr.append('1') 42 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 43 | 44 | def restore_tf_graph(sess, fpath): 45 | """ 46 | Loads graphs saved by Logger. 47 | 48 | Will output a dictionary whose keys and values are from the 'inputs' 49 | and 'outputs' dict you specified with logger.setup_tf_saver(). 50 | 51 | Args: 52 | sess: A Tensorflow session. 53 | fpath: Filepath to save directory. 54 | 55 | Returns: 56 | A dictionary mapping from keys to tensors in the computation graph 57 | loaded from ``fpath``. 58 | """ 59 | tf.saved_model.loader.load( 60 | sess, 61 | [tf.saved_model.tag_constants.SERVING], 62 | fpath 63 | ) 64 | model_info = joblib.load(osp.join(fpath, 'model_info.pkl')) 65 | graph = tf.get_default_graph() 66 | model = dict() 67 | model.update({k: graph.get_tensor_by_name(v) for k,v in model_info['inputs'].items()}) 68 | model.update({k: graph.get_tensor_by_name(v) for k,v in model_info['outputs'].items()}) 69 | return model 70 | 71 | class Logger: 72 | """ 73 | A general-purpose logger. 74 | 75 | Makes it easy to save diagnostics, hyperparameter configurations, the 76 | state of a training run, and the trained model. 77 | """ 78 | 79 | def __init__(self, output_dir=None, output_fname='progress.txt', exp_name=None): 80 | """ 81 | Initialize a Logger. 82 | 83 | Args: 84 | output_dir (string): A directory for saving results to. If 85 | ``None``, defaults to a temp directory of the form 86 | ``/tmp/experiments/somerandomnumber``. 87 | 88 | output_fname (string): Name for the tab-separated-value file 89 | containing metrics logged throughout a training run. 90 | Defaults to ``progress.txt``. 91 | 92 | exp_name (string): Experiment name. If you run multiple training 93 | runs and give them all the same ``exp_name``, the plotter 94 | will know to group them. (Use case: if you run the same 95 | hyperparameter configuration with multiple random seeds, you 96 | should give them all the same ``exp_name``.) 
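        Example use:

        .. code-block:: python

            logger = Logger(output_dir='/tmp/experiments/ddpg', exp_name='ddpg')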
97 | """ 98 | if proc_id()==0: 99 | self.output_dir = output_dir or "/tmp/experiments/%i"%int(time.time()) 100 | if osp.exists(self.output_dir): 101 | print("Warning: Log dir %s already exists! Storing info there anyway."%self.output_dir) 102 | else: 103 | os.makedirs(self.output_dir) 104 | self.output_file = open(osp.join(self.output_dir, output_fname), 'w') 105 | atexit.register(self.output_file.close) 106 | print(colorize("Logging data to %s"%self.output_file.name, 'green', bold=True)) 107 | else: 108 | self.output_dir = None 109 | self.output_file = None 110 | self.first_row=True 111 | self.log_headers = [] 112 | self.log_current_row = {} 113 | self.exp_name = exp_name 114 | 115 | def log(self, msg, color='green'): 116 | """Print a colorized message to stdout.""" 117 | if proc_id()==0: 118 | print(colorize(msg, color, bold=True)) 119 | 120 | def log_tabular(self, key, val): 121 | """ 122 | Log a value of some diagnostic. 123 | 124 | Call this only once for each diagnostic quantity, each iteration. 125 | After using ``log_tabular`` to store values for each diagnostic, 126 | make sure to call ``dump_tabular`` to write them out to file and 127 | stdout (otherwise they will not get saved anywhere). 128 | """ 129 | if self.first_row: 130 | self.log_headers.append(key) 131 | else: 132 | assert key in self.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 133 | assert key not in self.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 134 | self.log_current_row[key] = val 135 | 136 | def save_config(self, config): 137 | """ 138 | Log an experiment configuration. 139 | 140 | Call this once at the top of your experiment, passing in all important 141 | config vars as a dict. This will serialize the config to JSON, while 142 | handling anything which can't be serialized in a graceful way (writing 143 | as informative a string as possible). 144 | 145 | Example use: 146 | 147 | .. code-block:: python 148 | 149 | logger = EpochLogger(**logger_kwargs) 150 | logger.save_config(locals()) 151 | """ 152 | config_json = convert_json(config) 153 | if self.exp_name is not None: 154 | config_json['exp_name'] = self.exp_name 155 | if proc_id()==0: 156 | output = json.dumps(config_json, separators=(',',':\t'), indent=4, sort_keys=True) 157 | print(colorize('Saving config:\n', color='cyan', bold=True)) 158 | print(output) 159 | with open(osp.join(self.output_dir, "config.json"), 'w') as out: 160 | out.write(output) 161 | 162 | def save_state(self, state_dict, itr=None): 163 | """ 164 | Saves the state of an experiment. 165 | 166 | To be clear: this is about saving *state*, not logging diagnostics. 167 | All diagnostic logging is separate from this function. This function 168 | will save whatever is in ``state_dict``---usually just a copy of the 169 | environment---and the most recent parameters for the model you 170 | previously set up saving for with ``setup_tf_saver``. 171 | 172 | Call with any frequency you prefer. If you only want to maintain a 173 | single state and overwrite it at each call with the most recent 174 | version, leave ``itr=None``. If you want to keep all of the states you 175 | save, provide unique (increasing) values for 'itr'. 176 | 177 | Args: 178 | state_dict (dict): Dictionary containing essential elements to 179 | describe the current state of training. 180 | 181 | itr: An int, or None. Current iteration of training. 
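        For example, ``logger.save_state({'env': env})`` overwrites a single
        ``vars.pkl`` snapshot on every call, while ``logger.save_state({'env': env},
        itr=epoch)`` keeps a separate ``vars{itr}.pkl`` file for each epoch.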
182 | """ 183 | if proc_id()==0: 184 | fname = 'vars.pkl' if itr is None else 'vars%d.pkl'%itr 185 | try: 186 | joblib.dump(state_dict, osp.join(self.output_dir, fname)) 187 | except: 188 | self.log('Warning: could not pickle state_dict.', color='red') 189 | if hasattr(self, 'tf_saver_elements'): 190 | self._tf_simple_save(itr) 191 | if hasattr(self, 'pytorch_saver_elements'): 192 | self._pytorch_simple_save(itr) 193 | 194 | def setup_tf_saver(self, sess, inputs, outputs): 195 | """ 196 | Set up easy model saving for tensorflow. 197 | 198 | Call once, after defining your computation graph but before training. 199 | 200 | Args: 201 | sess: The Tensorflow session in which you train your computation 202 | graph. 203 | 204 | inputs (dict): A dictionary that maps from keys of your choice 205 | to the tensorflow placeholders that serve as inputs to the 206 | computation graph. Make sure that *all* of the placeholders 207 | needed for your outputs are included! 208 | 209 | outputs (dict): A dictionary that maps from keys of your choice 210 | to the outputs from your computation graph. 211 | """ 212 | self.tf_saver_elements = dict(session=sess, inputs=inputs, outputs=outputs) 213 | self.tf_saver_info = {'inputs': {k:v.name for k,v in inputs.items()}, 214 | 'outputs': {k:v.name for k,v in outputs.items()}} 215 | 216 | def _tf_simple_save(self, itr=None): 217 | """ 218 | Uses simple_save to save a trained model, plus info to make it easy 219 | to associated tensors to variables after restore. 220 | """ 221 | if proc_id()==0: 222 | assert hasattr(self, 'tf_saver_elements'), \ 223 | "First have to setup saving with self.setup_tf_saver" 224 | fpath = 'tf1_save' + ('%d'%itr if itr is not None else '') 225 | fpath = osp.join(self.output_dir, fpath) 226 | if osp.exists(fpath): 227 | # simple_save refuses to be useful if fpath already exists, 228 | # so just delete fpath if it's there. 229 | shutil.rmtree(fpath) 230 | tf.saved_model.simple_save(export_dir=fpath, **self.tf_saver_elements) 231 | joblib.dump(self.tf_saver_info, osp.join(fpath, 'model_info.pkl')) 232 | 233 | 234 | def setup_pytorch_saver(self, what_to_save): 235 | """ 236 | Set up easy model saving for a single PyTorch model. 237 | 238 | Because PyTorch saving and loading is especially painless, this is 239 | very minimal; we just need references to whatever we would like to 240 | pickle. This is integrated into the logger because the logger 241 | knows where the user would like to save information about this 242 | training run. 243 | 244 | Args: 245 | what_to_save: Any PyTorch model or serializable object containing 246 | PyTorch models. 247 | """ 248 | self.pytorch_saver_elements = what_to_save 249 | 250 | def _pytorch_simple_save(self, itr=None): 251 | """ 252 | Saves the PyTorch model (or models). 253 | """ 254 | if proc_id()==0: 255 | assert hasattr(self, 'pytorch_saver_elements'), \ 256 | "First have to setup saving with self.setup_pytorch_saver" 257 | fpath = 'pyt_save' 258 | fpath = osp.join(self.output_dir, fpath) 259 | fname = 'model' + ('%d'%itr if itr is not None else '') + '.pt' 260 | fname = osp.join(fpath, fname) 261 | os.makedirs(fpath, exist_ok=True) 262 | with warnings.catch_warnings(): 263 | warnings.simplefilter("ignore") 264 | # We are using a non-recommended way of saving PyTorch models, 265 | # by pickling whole objects (which are dependent on the exact 266 | # directory structure at the time of saving) as opposed to 267 | # just saving network weights. 
This works sufficiently well 268 | # for the purposes of Spinning Up, but you may want to do 269 | # something different for your personal PyTorch project. 270 | # We use a catch_warnings() context to avoid the warnings about 271 | # not being able to save the source code. 272 | torch.save(self.pytorch_saver_elements, fname) 273 | 274 | 275 | def dump_tabular(self): 276 | """ 277 | Write all of the diagnostics from the current iteration. 278 | 279 | Writes both to stdout, and to the output file. 280 | """ 281 | if proc_id()==0: 282 | vals = [] 283 | key_lens = [len(key) for key in self.log_headers] 284 | max_key_len = max(15,max(key_lens)) 285 | keystr = '%'+'%d'%max_key_len 286 | fmt = "| " + keystr + "s | %15s |" 287 | n_slashes = 22 + max_key_len 288 | print("-"*n_slashes) 289 | for key in self.log_headers: 290 | val = self.log_current_row.get(key, "") 291 | valstr = "%8.3g"%val if hasattr(val, "__float__") else val 292 | print(fmt%(key, valstr)) 293 | vals.append(val) 294 | print("-"*n_slashes, flush=True) 295 | if self.output_file is not None: 296 | if self.first_row: 297 | self.output_file.write("\t".join(self.log_headers)+"\n") 298 | self.output_file.write("\t".join(map(str,vals))+"\n") 299 | self.output_file.flush() 300 | self.log_current_row.clear() 301 | self.first_row=False 302 | 303 | class EpochLogger(Logger): 304 | """ 305 | A variant of Logger tailored for tracking average values over epochs. 306 | 307 | Typical use case: there is some quantity which is calculated many times 308 | throughout an epoch, and at the end of the epoch, you would like to 309 | report the average / std / min / max value of that quantity. 310 | 311 | With an EpochLogger, each time the quantity is calculated, you would 312 | use 313 | 314 | .. code-block:: python 315 | 316 | epoch_logger.store(NameOfQuantity=quantity_value) 317 | 318 | to load it into the EpochLogger's state. Then at the end of the epoch, you 319 | would use 320 | 321 | .. code-block:: python 322 | 323 | epoch_logger.log_tabular(NameOfQuantity, **options) 324 | 325 | to record the desired values. 326 | """ 327 | 328 | def __init__(self, *args, **kwargs): 329 | super().__init__(*args, **kwargs) 330 | self.epoch_dict = dict() 331 | 332 | def store(self, **kwargs): 333 | """ 334 | Save something into the epoch_logger's current state. 335 | 336 | Provide an arbitrary number of keyword arguments with numerical 337 | values. 338 | """ 339 | for k,v in kwargs.items(): 340 | if not(k in self.epoch_dict.keys()): 341 | self.epoch_dict[k] = [] 342 | self.epoch_dict[k].append(v) 343 | 344 | def log_tabular(self, key, val=None, with_min_and_max=False, average_only=False): 345 | """ 346 | Log a value or possibly the mean/std/min/max values of a diagnostic. 347 | 348 | Args: 349 | key (string): The name of the diagnostic. If you are logging a 350 | diagnostic whose state has previously been saved with 351 | ``store``, the key here has to match the key you used there. 352 | 353 | val: A value for the diagnostic. If you have previously saved 354 | values for this key via ``store``, do *not* provide a ``val`` 355 | here. 356 | 357 | with_min_and_max (bool): If true, log min and max values of the 358 | diagnostic over the epoch. 359 | 360 | average_only (bool): If true, do not log the standard deviation 361 | of the diagnostic over the epoch. 
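
        Example (a minimal sketch of the ``store`` / ``log_tabular`` pattern
        described above; ``ep_ret`` stands in for whatever quantity you track
        during the epoch):

        .. code-block:: python

            epoch_logger.store(EpRet=ep_ret)
            # ...at the end of the epoch:
            epoch_logger.log_tabular('EpRet', with_min_and_max=True)
            epoch_logger.dump_tabular()
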
362 | """ 363 | if val is not None: 364 | super().log_tabular(key,val) 365 | else: 366 | v = self.epoch_dict[key] 367 | vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape)>0 else v 368 | stats = mpi_statistics_scalar(vals, with_min_and_max=with_min_and_max) 369 | super().log_tabular(key if average_only else 'Average' + key, stats[0]) 370 | if not(average_only): 371 | super().log_tabular('Std'+key, stats[1]) 372 | if with_min_and_max: 373 | super().log_tabular('Max'+key, stats[3]) 374 | super().log_tabular('Min'+key, stats[2]) 375 | self.epoch_dict[key] = [] 376 | 377 | def get_stats(self, key): 378 | """ 379 | Lets an algorithm ask the logger for mean/std/min/max of a diagnostic. 380 | """ 381 | v = self.epoch_dict[key] 382 | vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape)>0 else v 383 | return mpi_statistics_scalar(vals) -------------------------------------------------------------------------------- /Code/utils/mpi_pytorch.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import numpy as np 3 | import os 4 | import torch 5 | from mpi4py import MPI 6 | from Code.utils.mpi_tools import broadcast, mpi_avg, num_procs, proc_id 7 | 8 | def setup_pytorch_for_mpi(): 9 | """ 10 | Avoid slowdowns caused by each separate process's PyTorch using 11 | more than its fair share of CPU resources. 12 | """ 13 | #print('Proc %d: Reporting original number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True) 14 | if torch.get_num_threads()==1: 15 | return 16 | fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1) 17 | torch.set_num_threads(fair_num_threads) 18 | #print('Proc %d: Reporting new number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True) 19 | 20 | def mpi_avg_grads(module): 21 | """ Average contents of gradient buffers across MPI processes. """ 22 | if num_procs()==1: 23 | return 24 | for p in module.parameters(): 25 | p_grad_numpy = p.grad.numpy() # numpy view of tensor data 26 | avg_p_grad = mpi_avg(p.grad) 27 | p_grad_numpy[:] = avg_p_grad[:] 28 | 29 | def sync_params(module): 30 | """ Sync all parameters of module across all MPI processes. 
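
    Typically called once right after model construction so that every process
    starts from identical weights. Illustrative sketch (``ac`` stands for any
    ``torch.nn.Module``):

    .. code-block:: python

        sync_params(ac)
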
""" 31 | if num_procs()==1: 32 | return 33 | for p in module.parameters(): 34 | p_numpy = p.data.numpy() 35 | broadcast(p_numpy) -------------------------------------------------------------------------------- /Code/utils/mpi_tf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | from spinup.utils.mpi_tools import broadcast 5 | 6 | 7 | def flat_concat(xs): 8 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 9 | 10 | def assign_params_from_flat(x, params): 11 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 12 | splits = tf.split(x, [flat_size(p) for p in params]) 13 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 14 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 15 | 16 | def sync_params(params): 17 | get_params = flat_concat(params) 18 | def _broadcast(x): 19 | broadcast(x) 20 | return x 21 | synced_params = tf.py_func(_broadcast, [get_params], tf.float32) 22 | return assign_params_from_flat(synced_params, params) 23 | 24 | def sync_all_params(): 25 | """Sync all tf variables across MPI processes.""" 26 | return sync_params(tf.global_variables()) 27 | 28 | 29 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 30 | """ 31 | Adam optimizer that averages gradients across MPI processes. 32 | 33 | The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_. 34 | For documentation on method arguments, see the Tensorflow docs page for 35 | the base `AdamOptimizer`_. 36 | 37 | .. _`MpiAdamOptimizer`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_adam_optimizer.py 38 | .. _`AdamOptimizer`: https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 39 | """ 40 | 41 | def __init__(self, **kwargs): 42 | self.comm = MPI.COMM_WORLD 43 | tf.train.AdamOptimizer.__init__(self, **kwargs) 44 | 45 | def compute_gradients(self, loss, var_list, **kwargs): 46 | """ 47 | Same as normal compute_gradients, except average grads over processes. 48 | """ 49 | grads_and_vars = super().compute_gradients(loss, var_list, **kwargs) 50 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 51 | flat_grad = flat_concat([g for g, v in grads_and_vars]) 52 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 53 | sizes = [int(np.prod(s)) for s in shapes] 54 | 55 | num_tasks = self.comm.Get_size() 56 | buf = np.zeros(flat_grad.shape, np.float32) 57 | 58 | def _collect_grads(flat_grad): 59 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 60 | np.divide(buf, float(num_tasks), out=buf) 61 | return buf 62 | 63 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 64 | avg_flat_grad.set_shape(flat_grad.shape) 65 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 66 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 67 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 68 | 69 | return avg_grads_and_vars 70 | 71 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 72 | """ 73 | Same as normal apply_gradients, except sync params after update. 
74 | """ 75 | opt = super().apply_gradients(grads_and_vars, global_step, name) 76 | with tf.control_dependencies([opt]): 77 | sync = sync_params([v for g,v in grads_and_vars]) 78 | return tf.group([opt, sync]) -------------------------------------------------------------------------------- /Code/utils/mpi_tools.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import os, subprocess, sys 3 | import numpy as np 4 | 5 | 6 | def mpi_fork(n, bind_to_core=False): 7 | """ 8 | Re-launches the current script with workers linked by MPI. 9 | 10 | Also, terminates the original process that launched it. 11 | 12 | Taken almost without modification from the Baselines function of the 13 | `same name`_. 14 | 15 | .. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py 16 | 17 | Args: 18 | n (int): Number of process to split into. 19 | 20 | bind_to_core (bool): Bind each MPI process to a core. 21 | """ 22 | if n<=1: 23 | return 24 | if os.getenv("IN_MPI") is None: 25 | env = os.environ.copy() 26 | env.update( 27 | MKL_NUM_THREADS="1", 28 | OMP_NUM_THREADS="1", 29 | IN_MPI="1" 30 | ) 31 | args = ["mpirun", "-np", str(n)] 32 | if bind_to_core: 33 | args += ["-bind-to", "core"] 34 | args += [sys.executable] + sys.argv 35 | subprocess.check_call(args, env=env) 36 | sys.exit() 37 | 38 | 39 | def msg(m, string=''): 40 | print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m)) 41 | 42 | def proc_id(): 43 | """Get rank of calling process.""" 44 | return MPI.COMM_WORLD.Get_rank() 45 | 46 | def allreduce(*args, **kwargs): 47 | return MPI.COMM_WORLD.Allreduce(*args, **kwargs) 48 | 49 | def num_procs(): 50 | """Count active MPI processes.""" 51 | return MPI.COMM_WORLD.Get_size() 52 | 53 | def broadcast(x, root=0): 54 | MPI.COMM_WORLD.Bcast(x, root=root) 55 | 56 | def mpi_op(x, op): 57 | x, scalar = ([x], True) if np.isscalar(x) else (x, False) 58 | x = np.asarray(x, dtype=np.float32) 59 | buff = np.zeros_like(x, dtype=np.float32) 60 | allreduce(x, buff, op=op) 61 | return buff[0] if scalar else buff 62 | 63 | def mpi_sum(x): 64 | return mpi_op(x, MPI.SUM) 65 | 66 | def mpi_avg(x): 67 | """Average a scalar or vector over MPI processes.""" 68 | return mpi_sum(x) / num_procs() 69 | 70 | def mpi_statistics_scalar(x, with_min_and_max=False): 71 | """ 72 | Get mean/std and optional min/max of scalar x across MPI processes. 73 | 74 | Args: 75 | x: An array containing samples of the scalar to produce statistics 76 | for. 77 | 78 | with_min_and_max (bool): If true, return min and max of x in 79 | addition to mean and std. 
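
    Example (a sketch; ``ep_returns`` stands for an array of samples collected
    locally by each process):

    .. code-block:: python

        mean, std = mpi_statistics_scalar(ep_returns)
        mean, std, lo, hi = mpi_statistics_scalar(ep_returns, with_min_and_max=True)
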
80 | """ 81 | x = np.array(x, dtype=np.float32) 82 | global_sum, global_n = mpi_sum([np.sum(x), len(x)]) 83 | mean = global_sum / global_n 84 | 85 | global_sum_sq = mpi_sum(np.sum((x - mean)**2)) 86 | std = np.sqrt(global_sum_sq / global_n) # compute global std 87 | 88 | if with_min_and_max: 89 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN) 90 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX) 91 | return mean, std, global_min, global_max 92 | return mean, std -------------------------------------------------------------------------------- /Code/utils/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | import os.path as osp 7 | import numpy as np 8 | 9 | DIV_LINE_WIDTH = 50 10 | 11 | # Global vars for tracking and labeling data at load time. 12 | exp_idx = 0 13 | units = dict() 14 | 15 | def plot_data(data, xaxis='Epoch', value="AverageEpRet", condition="Condition1", smooth=1, **kwargs): 16 | if smooth > 1: 17 | """ 18 | smooth data with moving window average. 19 | that is, 20 | smoothed_y[t] = average(y[t-k], y[t-k+1], ..., y[t+k-1], y[t+k]) 21 | where the "smooth" param is width of that window (2k+1) 22 | """ 23 | y = np.ones(smooth) 24 | for datum in data: 25 | x = np.asarray(datum[value]) 26 | z = np.ones(len(x)) 27 | smoothed_x = np.convolve(x,y,'same') / np.convolve(z,y,'same') 28 | datum[value] = smoothed_x 29 | 30 | if isinstance(data, list): 31 | data = pd.concat(data, ignore_index=True) 32 | sns.set(style="darkgrid", font_scale=1.5) 33 | sns.tsplot(data=data, time=xaxis, value=value, unit="Unit", condition=condition, ci='sd', **kwargs) 34 | """ 35 | If you upgrade to any version of Seaborn greater than 0.8.1, switch from 36 | tsplot to lineplot replacing L29 with: 37 | 38 | sns.lineplot(data=data, x=xaxis, y=value, hue=condition, ci='sd', **kwargs) 39 | 40 | Changes the colorscheme and the default legend style, though. 41 | """ 42 | plt.legend(loc='best').set_draggable(True) 43 | #plt.legend(loc='upper center', ncol=3, handlelength=1, 44 | # borderaxespad=0., prop={'size': 13}) 45 | 46 | """ 47 | For the version of the legend used in the Spinning Up benchmarking page, 48 | swap L38 with: 49 | 50 | plt.legend(loc='upper center', ncol=6, handlelength=1, 51 | mode="expand", borderaxespad=0., prop={'size': 13}) 52 | """ 53 | 54 | xscale = np.max(np.asarray(data[xaxis])) > 5e3 55 | if xscale: 56 | # Just some formatting niceness: x-axis scale in scientific notation if max x is large 57 | plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0)) 58 | 59 | plt.tight_layout(pad=0.5) 60 | 61 | def get_datasets(logdir, condition=None): 62 | """ 63 | Recursively look through logdir for output files produced by 64 | spinup.logx.Logger. 65 | 66 | Assumes that any file "progress.txt" is a valid hit. 
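
    Example (illustrative only; the directory name is hypothetical):

    .. code-block:: python

        datasets = get_datasets('data/ddpg_microgrid', condition='DDPG')
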
67 | """ 68 | global exp_idx 69 | global units 70 | datasets = [] 71 | for root, _, files in os.walk(logdir): 72 | if 'progress.txt' in files: 73 | exp_name = None 74 | try: 75 | config_path = open(os.path.join(root,'config.json')) 76 | config = json.load(config_path) 77 | if 'exp_name' in config: 78 | exp_name = config['exp_name'] 79 | except: 80 | print('No file named config.json') 81 | condition1 = condition or exp_name or 'exp' 82 | condition2 = condition1 + '-' + str(exp_idx) 83 | exp_idx += 1 84 | if condition1 not in units: 85 | units[condition1] = 0 86 | unit = units[condition1] 87 | units[condition1] += 1 88 | 89 | try: 90 | exp_data = pd.read_table(os.path.join(root,'progress.txt')) 91 | except: 92 | print('Could not read from %s'%os.path.join(root,'progress.txt')) 93 | continue 94 | performance = 'AverageTestEpRet' if 'AverageTestEpRet' in exp_data else 'AverageEpRet' 95 | exp_data.insert(len(exp_data.columns),'Unit',unit) 96 | exp_data.insert(len(exp_data.columns),'Condition1',condition1) 97 | exp_data.insert(len(exp_data.columns),'Condition2',condition2) 98 | exp_data.insert(len(exp_data.columns),'Performance',exp_data[performance]) 99 | datasets.append(exp_data) 100 | return datasets 101 | 102 | 103 | def get_all_datasets(all_logdirs, legend=None, select=None, exclude=None): 104 | """ 105 | For every entry in all_logdirs, 106 | 1) check if the entry is a real directory and if it is, 107 | pull data from it; 108 | 109 | 2) if not, check to see if the entry is a prefix for a 110 | real directory, and pull data from that. 111 | """ 112 | logdirs = [] 113 | for logdir in all_logdirs: 114 | if osp.isdir(logdir) and logdir[-1]==os.sep: 115 | logdirs += [logdir] 116 | else: 117 | basedir = osp.dirname(logdir) 118 | fulldir = lambda x : osp.join(basedir, x) 119 | prefix = logdir.split(os.sep)[-1] 120 | listdir= os.listdir(basedir) 121 | logdirs += sorted([fulldir(x) for x in listdir if prefix in x]) 122 | 123 | """ 124 | Enforce selection rules, which check logdirs for certain substrings. 125 | Makes it easier to look at graphs from particular ablations, if you 126 | launch many jobs at once with similar names. 127 | """ 128 | if select is not None: 129 | logdirs = [log for log in logdirs if all(x in log for x in select)] 130 | if exclude is not None: 131 | logdirs = [log for log in logdirs if all(not(x in log) for x in exclude)] 132 | 133 | # Verify logdirs 134 | print('Plotting from...\n' + '='*DIV_LINE_WIDTH + '\n') 135 | for logdir in logdirs: 136 | print(logdir) 137 | print('\n' + '='*DIV_LINE_WIDTH) 138 | 139 | # Make sure the legend is compatible with the logdirs 140 | assert not(legend) or (len(legend) == len(logdirs)), \ 141 | "Must give a legend title for each set of experiments." 142 | 143 | # Load data from logdirs 144 | data = [] 145 | if legend: 146 | for log, leg in zip(logdirs, legend): 147 | data += get_datasets(log, leg) 148 | else: 149 | for log in logdirs: 150 | data += get_datasets(log) 151 | return data 152 | 153 | 154 | def make_plots(all_logdirs, legend=None, xaxis=None, values=None, count=False, 155 | font_scale=1.5, smooth=1, select=None, exclude=None, estimator='mean'): 156 | data = get_all_datasets(all_logdirs, legend, select, exclude) 157 | values = values if isinstance(values, list) else [values] 158 | condition = 'Condition2' if count else 'Condition1' 159 | estimator = getattr(np, estimator) # choose what to show on main curve: mean? max? min? 
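    # e.g. estimator='mean' resolves to np.mean; 'max' and 'min' work the same way.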
160 | for value in values: 161 | plt.figure() 162 | plot_data(data, xaxis=xaxis, value=value, condition=condition, smooth=smooth, estimator=estimator) 163 | plt.show() 164 | 165 | 166 | def main(): 167 | import argparse 168 | parser = argparse.ArgumentParser() 169 | parser.add_argument('logdir', nargs='*') 170 | parser.add_argument('--legend', '-l', nargs='*') 171 | parser.add_argument('--xaxis', '-x', default='TotalEnvInteracts') 172 | parser.add_argument('--value', '-y', default='Performance', nargs='*') 173 | parser.add_argument('--count', action='store_true') 174 | parser.add_argument('--smooth', '-s', type=int, default=1) 175 | parser.add_argument('--select', nargs='*') 176 | parser.add_argument('--exclude', nargs='*') 177 | parser.add_argument('--est', default='mean') 178 | args = parser.parse_args() 179 | """ 180 | 181 | Args: 182 | logdir (strings): As many log directories (or prefixes to log 183 | directories, which the plotter will autocomplete internally) as 184 | you'd like to plot from. 185 | 186 | legend (strings): Optional way to specify legend for the plot. The 187 | plotter legend will automatically use the ``exp_name`` from the 188 | config.json file, unless you tell it otherwise through this flag. 189 | This only works if you provide a name for each directory that 190 | will get plotted. (Note: this may not be the same as the number 191 | of logdir args you provide! Recall that the plotter looks for 192 | autocompletes of the logdir args: there may be more than one 193 | match for a given logdir prefix, and you will need to provide a 194 | legend string for each one of those matches---unless you have 195 | removed some of them as candidates via selection or exclusion 196 | rules (below).) 197 | 198 | xaxis (string): Pick what column from data is used for the x-axis. 199 | Defaults to ``TotalEnvInteracts``. 200 | 201 | value (strings): Pick what columns from data to graph on the y-axis. 202 | Submitting multiple values will produce multiple graphs. Defaults 203 | to ``Performance``, which is not an actual output of any algorithm. 204 | Instead, ``Performance`` refers to either ``AverageEpRet``, the 205 | correct performance measure for the on-policy algorithms, or 206 | ``AverageTestEpRet``, the correct performance measure for the 207 | off-policy algorithms. The plotter will automatically figure out 208 | which of ``AverageEpRet`` or ``AverageTestEpRet`` to report for 209 | each separate logdir. 210 | 211 | count: Optional flag. By default, the plotter shows y-values which 212 | are averaged across all results that share an ``exp_name``, 213 | which is typically a set of identical experiments that only vary 214 | in random seed. But if you'd like to see all of those curves 215 | separately, use the ``--count`` flag. 216 | 217 | smooth (int): Smooth data by averaging it over a fixed window. This 218 | parameter says how wide the averaging window will be. 219 | 220 | select (strings): Optional selection rule: the plotter will only show 221 | curves from logdirs that contain all of these substrings. 222 | 223 | exclude (strings): Optional exclusion rule: plotter will only show 224 | curves from logdirs that do not contain these substrings. 
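
        Example invocation (the log directory names here are hypothetical):
        ``python Code/utils/plot.py data/ddpg_run data/ppo_run --legend DDPG PPO --smooth 5``
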
225 | 226 | """ 227 | 228 | make_plots(args.logdir, args.legend, args.xaxis, args.value, args.count, 229 | smooth=args.smooth, select=args.select, exclude=args.exclude, 230 | estimator=args.est) 231 | 232 | if __name__ == "__main__": 233 | main() -------------------------------------------------------------------------------- /Code/utils/run_entrypoint.py: -------------------------------------------------------------------------------- 1 | import zlib 2 | import pickle 3 | import base64 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('encoded_thunk') 9 | args = parser.parse_args() 10 | thunk = pickle.loads(zlib.decompress(base64.b64decode(args.encoded_thunk))) 11 | thunk() -------------------------------------------------------------------------------- /Code/utils/run_utils.py: -------------------------------------------------------------------------------- 1 | from Code.user_config import DEFAULT_DATA_DIR, FORCE_DATESTAMP, \ 2 | DEFAULT_SHORTHAND, WAIT_BEFORE_LAUNCH 3 | from Code.utils.logx import colorize 4 | from Code.utils.mpi_tools import mpi_fork, msg 5 | from Code.utils.serialization_utils import convert_json 6 | import base64 7 | from copy import deepcopy 8 | import cloudpickle 9 | import json 10 | import numpy as np 11 | import os 12 | import os.path as osp 13 | import psutil 14 | import string 15 | import subprocess 16 | from subprocess import CalledProcessError 17 | import sys 18 | from textwrap import dedent 19 | import time 20 | from tqdm import trange 21 | import zlib 22 | 23 | DIV_LINE_WIDTH = 80 24 | 25 | def setup_logger_kwargs(exp_name, seed=None, data_dir=None, datestamp=False): 26 | """ 27 | Sets up the output_dir for a logger and returns a dict for logger kwargs. 28 | 29 | If no seed is given and datestamp is false, 30 | 31 | :: 32 | 33 | output_dir = data_dir/exp_name 34 | 35 | If a seed is given and datestamp is false, 36 | 37 | :: 38 | 39 | output_dir = data_dir/exp_name/exp_name_s[seed] 40 | 41 | If datestamp is true, amend to 42 | 43 | :: 44 | 45 | output_dir = data_dir/YY-MM-DD_exp_name/YY-MM-DD_HH-MM-SS_exp_name_s[seed] 46 | 47 | You can force datestamp=True by setting ``FORCE_DATESTAMP=True`` in 48 | ``spinup/user_config.py``. 49 | 50 | Args: 51 | 52 | exp_name (string): Name for experiment. 53 | 54 | seed (int): Seed for random number generators used by experiment. 55 | 56 | data_dir (string): Path to folder where results should be saved. 57 | Default is the ``DEFAULT_DATA_DIR`` in ``spinup/user_config.py``. 58 | 59 | datestamp (bool): Whether to include a date and timestamp in the 60 | name of the save directory. 61 | 62 | Returns: 63 | 64 | logger_kwargs, a dict containing output_dir and exp_name. 65 | """ 66 | 67 | # Datestamp forcing 68 | datestamp = datestamp or FORCE_DATESTAMP 69 | 70 | # Make base path 71 | ymd_time = time.strftime("%Y-%m-%d_") if datestamp else '' 72 | relpath = ''.join([ymd_time, exp_name]) 73 | 74 | if seed is not None: 75 | # Make a seed-specific subfolder in the experiment directory. 
76 | if datestamp: 77 | hms_time = time.strftime("%Y-%m-%d_%H-%M-%S") 78 | subfolder = ''.join([hms_time, '-', exp_name, '_s', str(seed)]) 79 | else: 80 | subfolder = ''.join([exp_name, '_s', str(seed)]) 81 | relpath = osp.join(relpath, subfolder) 82 | 83 | data_dir = data_dir or DEFAULT_DATA_DIR 84 | logger_kwargs = dict(output_dir=osp.join(data_dir, relpath), 85 | exp_name=exp_name) 86 | return logger_kwargs 87 | 88 | 89 | def call_experiment(exp_name, thunk, seed=0, num_cpu=1, data_dir=None, 90 | datestamp=False, **kwargs): 91 | """ 92 | Run a function (thunk) with hyperparameters (kwargs), plus configuration. 93 | 94 | This wraps a few pieces of functionality which are useful when you want 95 | to run many experiments in sequence, including logger configuration and 96 | splitting into multiple processes for MPI. 97 | 98 | There's also a SpinningUp-specific convenience added into executing the 99 | thunk: if ``env_name`` is one of the kwargs passed to call_experiment, it's 100 | assumed that the thunk accepts an argument called ``env_fn``, and that 101 | the ``env_fn`` should make a gym environment with the given ``env_name``. 102 | 103 | The way the experiment is actually executed is slightly complicated: the 104 | function is serialized to a string, and then ``run_entrypoint.py`` is 105 | executed in a subprocess call with the serialized string as an argument. 106 | ``run_entrypoint.py`` unserializes the function call and executes it. 107 | We choose to do it this way---instead of just calling the function 108 | directly here---to avoid leaking state between successive experiments. 109 | 110 | Args: 111 | 112 | exp_name (string): Name for experiment. 113 | 114 | thunk (callable): A python function. 115 | 116 | seed (int): Seed for random number generators. 117 | 118 | num_cpu (int): Number of MPI processes to split into. Also accepts 119 | 'auto', which will set up as many procs as there are cpus on 120 | the machine. 121 | 122 | data_dir (string): Used in configuring the logger, to decide where 123 | to store experiment results. Note: if left as None, data_dir will 124 | default to ``DEFAULT_DATA_DIR`` from ``spinup/user_config.py``. 125 | 126 | **kwargs: All kwargs to pass to thunk. 
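
        Example (a sketch only; it assumes ``ddpg`` can be imported from
        ``main_DDPG`` without side effects and that ``MicrogridEnv`` needs no
        required constructor arguments):

        .. code-block:: python

            from main_DDPG import ddpg
            from Code.enviroment import MicrogridEnv

            call_experiment('ddpg_microgrid', ddpg, seed=0, num_cpu=1,
                            env_fn=lambda: MicrogridEnv(), epochs=50)
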
127 | 128 | """ 129 | 130 | # Determine number of CPU cores to run on 131 | num_cpu = psutil.cpu_count(logical=False) if num_cpu=='auto' else num_cpu 132 | 133 | # Send random seed to thunk 134 | kwargs['seed'] = seed 135 | 136 | # Be friendly and print out your kwargs, so we all know what's up 137 | print(colorize('Running experiment:\n', color='cyan', bold=True)) 138 | print(exp_name + '\n') 139 | print(colorize('with kwargs:\n', color='cyan', bold=True)) 140 | kwargs_json = convert_json(kwargs) 141 | print(json.dumps(kwargs_json, separators=(',',':\t'), indent=4, sort_keys=True)) 142 | print('\n') 143 | 144 | # Set up logger output directory 145 | if 'logger_kwargs' not in kwargs: 146 | kwargs['logger_kwargs'] = setup_logger_kwargs(exp_name, seed, data_dir, datestamp) 147 | else: 148 | print('Note: Call experiment is not handling logger_kwargs.\n') 149 | 150 | def thunk_plus(): 151 | # Make 'env_fn' from 'env_name' 152 | if 'env_name' in kwargs: 153 | import gym 154 | env_name = kwargs['env_name'] 155 | kwargs['env_fn'] = lambda : gym.make(env_name) 156 | del kwargs['env_name'] 157 | 158 | # Fork into multiple processes 159 | mpi_fork(num_cpu) 160 | 161 | # Run thunk 162 | thunk(**kwargs) 163 | 164 | # Prepare to launch a script to run the experiment 165 | pickled_thunk = cloudpickle.dumps(thunk_plus) 166 | encoded_thunk = base64.b64encode(zlib.compress(pickled_thunk)).decode('utf-8') 167 | 168 | entrypoint = osp.join(osp.abspath(osp.dirname(__file__)),'run_entrypoint.py') 169 | cmd = [sys.executable if sys.executable else 'python', entrypoint, encoded_thunk] 170 | try: 171 | subprocess.check_call(cmd, env=os.environ) 172 | except CalledProcessError: 173 | err_msg = '\n'*3 + '='*DIV_LINE_WIDTH + '\n' + dedent(""" 174 | 175 | There appears to have been an error in your experiment. 176 | 177 | Check the traceback above to see what actually went wrong. The 178 | traceback below, included for completeness (but probably not useful 179 | for diagnosing the error), shows the stack leading up to the 180 | experiment launch. 181 | 182 | """) + '='*DIV_LINE_WIDTH + '\n'*3 183 | print(err_msg) 184 | raise 185 | 186 | # Tell the user about where results are, and how to check them 187 | logger_kwargs = kwargs['logger_kwargs'] 188 | 189 | plot_cmd = 'python -m spinup.run plot '+logger_kwargs['output_dir'] 190 | plot_cmd = colorize(plot_cmd, 'green') 191 | 192 | test_cmd = 'python -m spinup.run test_policy '+logger_kwargs['output_dir'] 193 | test_cmd = colorize(test_cmd, 'green') 194 | 195 | output_msg = '\n'*5 + '='*DIV_LINE_WIDTH +'\n' + dedent("""\ 196 | End of experiment. 197 | 198 | 199 | Plot results from this run with: 200 | 201 | %s 202 | 203 | 204 | Watch the trained agent with: 205 | 206 | %s 207 | 208 | 209 | """%(plot_cmd,test_cmd)) + '='*DIV_LINE_WIDTH + '\n'*5 210 | 211 | print(output_msg) 212 | 213 | 214 | def all_bools(vals): 215 | return all([isinstance(v,bool) for v in vals]) 216 | 217 | def valid_str(v): 218 | """ 219 | Convert a value or values to a string which could go in a filepath. 220 | 221 | Partly based on `this gist`_. 222 | 223 | .. _`this gist`: https://gist.github.com/seanh/93666 224 | 225 | """ 226 | if hasattr(v, '__name__'): 227 | return valid_str(v.__name__) 228 | 229 | if isinstance(v, tuple) or isinstance(v, list): 230 | return '-'.join([valid_str(x) for x in v]) 231 | 232 | # Valid characters are '-', '_', and alphanumeric. Replace invalid chars 233 | # with '-'. 
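    # e.g. valid_str((3, 'Adam')) -> '3-adam' (illustrative).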
234 | str_v = str(v).lower() 235 | valid_chars = "-_%s%s" % (string.ascii_letters, string.digits) 236 | str_v = ''.join(c if c in valid_chars else '-' for c in str_v) 237 | return str_v 238 | 239 | 240 | class ExperimentGrid: 241 | """ 242 | Tool for running many experiments given hyperparameter ranges. 243 | """ 244 | 245 | def __init__(self, name=''): 246 | self.keys = [] 247 | self.vals = [] 248 | self.shs = [] 249 | self.in_names = [] 250 | self.name(name) 251 | 252 | def name(self, _name): 253 | assert isinstance(_name, str), "Name has to be a string." 254 | self._name = _name 255 | 256 | def print(self): 257 | """Print a helpful report about the experiment grid.""" 258 | print('='*DIV_LINE_WIDTH) 259 | 260 | # Prepare announcement at top of printing. If the ExperimentGrid has a 261 | # short name, write this as one line. If the name is long, break the 262 | # announcement over two lines. 263 | base_msg = 'ExperimentGrid %s runs over parameters:\n' 264 | name_insert = '['+self._name+']' 265 | if len(base_msg%name_insert) <= 80: 266 | msg = base_msg%name_insert 267 | else: 268 | msg = base_msg%(name_insert+'\n') 269 | print(colorize(msg, color='green', bold=True)) 270 | 271 | # List off parameters, shorthands, and possible values. 272 | for k, v, sh in zip(self.keys, self.vals, self.shs): 273 | color_k = colorize(k.ljust(40), color='cyan', bold=True) 274 | print('', color_k, '['+sh+']' if sh is not None else '', '\n') 275 | for i, val in enumerate(v): 276 | print('\t' + str(convert_json(val))) 277 | print() 278 | 279 | # Count up the number of variants. The number counting seeds 280 | # is the total number of experiments that will run; the number not 281 | # counting seeds is the total number of otherwise-unique configs 282 | # being investigated. 283 | nvars_total = int(np.prod([len(v) for v in self.vals])) 284 | if 'seed' in self.keys: 285 | num_seeds = len(self.vals[self.keys.index('seed')]) 286 | nvars_seedless = int(nvars_total / num_seeds) 287 | else: 288 | nvars_seedless = nvars_total 289 | print(' Variants, counting seeds: '.ljust(40), nvars_total) 290 | print(' Variants, not counting seeds: '.ljust(40), nvars_seedless) 291 | print() 292 | print('='*DIV_LINE_WIDTH) 293 | 294 | 295 | def _default_shorthand(self, key): 296 | # Create a default shorthand for the key, built from the first 297 | # three letters of each colon-separated part. 298 | # But if the first three letters contains something which isn't 299 | # alphanumeric, shear that off. 300 | valid_chars = "%s%s" % (string.ascii_letters, string.digits) 301 | def shear(x): 302 | return ''.join(z for z in x[:3] if z in valid_chars) 303 | sh = '-'.join([shear(x) for x in key.split(':')]) 304 | return sh 305 | 306 | def add(self, key, vals, shorthand=None, in_name=False): 307 | """ 308 | Add a parameter (key) to the grid config, with potential values (vals). 309 | 310 | By default, if a shorthand isn't given, one is automatically generated 311 | from the key using the first three letters of each colon-separated 312 | term. To disable this behavior, change ``DEFAULT_SHORTHAND`` in the 313 | ``spinup/user_config.py`` file to ``False``. 314 | 315 | Args: 316 | key (string): Name of parameter. 317 | 318 | vals (value or list of values): Allowed values of parameter. 319 | 320 | shorthand (string): Optional, shortened name of parameter. For 321 | example, maybe the parameter ``steps_per_epoch`` is shortened 322 | to ``steps``. 
323 | 324 | in_name (bool): When constructing variant names, force the 325 | inclusion of this parameter into the name. 326 | """ 327 | assert isinstance(key, str), "Key must be a string." 328 | assert shorthand is None or isinstance(shorthand, str), \ 329 | "Shorthand must be a string." 330 | if not isinstance(vals, list): 331 | vals = [vals] 332 | if DEFAULT_SHORTHAND and shorthand is None: 333 | shorthand = self._default_shorthand(key) 334 | self.keys.append(key) 335 | self.vals.append(vals) 336 | self.shs.append(shorthand) 337 | self.in_names.append(in_name) 338 | 339 | def variant_name(self, variant): 340 | """ 341 | Given a variant (dict of valid param/value pairs), make an exp_name. 342 | 343 | A variant's name is constructed as the grid name (if you've given it 344 | one), plus param names (or shorthands if available) and values 345 | separated by underscores. 346 | 347 | Note: if ``seed`` is a parameter, it is not included in the name. 348 | """ 349 | 350 | def get_val(v, k): 351 | # Utility method for getting the correct value out of a variant 352 | # given as a nested dict. Assumes that a parameter name, k, 353 | # describes a path into the nested dict, such that k='a:b:c' 354 | # corresponds to value=variant['a']['b']['c']. Uses recursion 355 | # to get this. 356 | if k in v: 357 | return v[k] 358 | else: 359 | splits = k.split(':') 360 | k0, k1 = splits[0], ':'.join(splits[1:]) 361 | return get_val(v[k0], k1) 362 | 363 | # Start the name off with the name of the variant generator. 364 | var_name = self._name 365 | 366 | # Build the rest of the name by looping through all parameters, 367 | # and deciding which ones need to go in there. 368 | for k, v, sh, inn in zip(self.keys, self.vals, self.shs, self.in_names): 369 | 370 | # Include a parameter in a name if either 1) it can take multiple 371 | # values, or 2) the user specified that it must appear in the name. 372 | # Except, however, when the parameter is 'seed'. Seed is handled 373 | # differently so that runs of the same experiment, with different 374 | # seeds, will be grouped by experiment name. 375 | if (len(v)>1 or inn) and not(k=='seed'): 376 | 377 | # Use the shorthand if available, otherwise the full name. 378 | param_name = sh if sh is not None else k 379 | param_name = valid_str(param_name) 380 | 381 | # Get variant value for parameter k 382 | variant_val = get_val(variant, k) 383 | 384 | # Append to name 385 | if all_bools(v): 386 | # If this is a param which only takes boolean values, 387 | # only include in the name if it's True for this variant. 388 | var_name += ('_' + param_name) if variant_val else '' 389 | else: 390 | var_name += '_' + param_name + valid_str(variant_val) 391 | 392 | return var_name.lstrip('_') 393 | 394 | def _variants(self, keys, vals): 395 | """ 396 | Recursively builds list of valid variants. 397 | """ 398 | if len(keys)==1: 399 | pre_variants = [dict()] 400 | else: 401 | pre_variants = self._variants(keys[1:], vals[1:]) 402 | 403 | variants = [] 404 | for val in vals[0]: 405 | for pre_v in pre_variants: 406 | v = {} 407 | v[keys[0]] = val 408 | v.update(pre_v) 409 | variants.append(v) 410 | return variants 411 | 412 | def variants(self): 413 | """ 414 | Makes a list of dicts, where each dict is a valid config in the grid. 415 | 416 | There is special handling for variant parameters whose names take 417 | the form 418 | 419 | ``'full:param:name'``. 420 | 421 | The colons are taken to indicate that these parameters should 422 | have a nested dict structure. 
eg, if there are two params, 423 | 424 | ==================== === 425 | Key Val 426 | ==================== === 427 | ``'base:param:a'`` 1 428 | ``'base:param:b'`` 2 429 | ==================== === 430 | 431 | the variant dict will have the structure 432 | 433 | .. parsed-literal:: 434 | 435 | variant = { 436 | base: { 437 | param : { 438 | a : 1, 439 | b : 2 440 | } 441 | } 442 | } 443 | """ 444 | flat_variants = self._variants(self.keys, self.vals) 445 | 446 | def unflatten_var(var): 447 | """ 448 | Build the full nested dict version of var, based on key names. 449 | """ 450 | new_var = dict() 451 | unflatten_set = set() 452 | 453 | for k,v in var.items(): 454 | if ':' in k: 455 | splits = k.split(':') 456 | k0 = splits[0] 457 | assert k0 not in new_var or isinstance(new_var[k0], dict), \ 458 | "You can't assign multiple values to the same key." 459 | 460 | if not(k0 in new_var): 461 | new_var[k0] = dict() 462 | 463 | sub_k = ':'.join(splits[1:]) 464 | new_var[k0][sub_k] = v 465 | unflatten_set.add(k0) 466 | else: 467 | assert not(k in new_var), \ 468 | "You can't assign multiple values to the same key." 469 | new_var[k] = v 470 | 471 | # Make sure to fill out the nested dicts. 472 | for k in unflatten_set: 473 | new_var[k] = unflatten_var(new_var[k]) 474 | 475 | return new_var 476 | 477 | new_variants = [unflatten_var(var) for var in flat_variants] 478 | return new_variants 479 | 480 | def run(self, thunk, num_cpu=1, data_dir=None, datestamp=False): 481 | """ 482 | Run each variant in the grid with function 'thunk'. 483 | 484 | Note: 'thunk' must be either a callable function, or a string. If it is 485 | a string, it must be the name of a parameter whose values are all 486 | callable functions. 487 | 488 | Uses ``call_experiment`` to actually launch each experiment, and gives 489 | each variant a name using ``self.variant_name()``. 490 | 491 | Maintenance note: the args for ExperimentGrid.run should track closely 492 | to the args for call_experiment. However, ``seed`` is omitted because 493 | we presume the user may add it as a parameter in the grid. 494 | """ 495 | 496 | # Print info about self. 497 | self.print() 498 | 499 | # Make the list of all variants. 500 | variants = self.variants() 501 | 502 | # Print variant names for the user. 503 | var_names = set([self.variant_name(var) for var in variants]) 504 | var_names = sorted(list(var_names)) 505 | line = '='*DIV_LINE_WIDTH 506 | preparing = colorize('Preparing to run the following experiments...', 507 | color='green', bold=True) 508 | joined_var_names = '\n'.join(var_names) 509 | announcement = f"\n{preparing}\n\n{joined_var_names}\n\n{line}" 510 | print(announcement) 511 | 512 | 513 | if WAIT_BEFORE_LAUNCH > 0: 514 | delay_msg = colorize(dedent(""" 515 | Launch delayed to give you a few seconds to review your experiments. 516 | 517 | To customize or disable this behavior, change WAIT_BEFORE_LAUNCH in 518 | spinup/user_config.py. 519 | 520 | """), color='cyan', bold=True)+line 521 | print(delay_msg) 522 | wait, steps = WAIT_BEFORE_LAUNCH, 100 523 | prog_bar = trange(steps, desc='Launching in...', 524 | leave=False, ncols=DIV_LINE_WIDTH, 525 | mininterval=0.25, 526 | bar_format='{desc}: {bar}| {remaining} {elapsed}') 527 | for _ in prog_bar: 528 | time.sleep(wait/steps) 529 | 530 | # Run the variants. 531 | for var in variants: 532 | exp_name = self.variant_name(var) 533 | 534 | # Figure out what the thunk is. 
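            # e.g. eg.run(ddpg) passes the callable directly, while eg.run('algo')
            # looks the callable up in the grid's 'algo' column (illustrative).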
535 | if isinstance(thunk, str): 536 | # Assume one of the variant parameters has the same 537 | # name as the string you passed for thunk, and that 538 | # variant[thunk] is a valid callable function. 539 | thunk_ = var[thunk] 540 | del var[thunk] 541 | else: 542 | # Assume thunk is given as a function. 543 | thunk_ = thunk 544 | 545 | call_experiment(exp_name, thunk_, num_cpu=num_cpu, 546 | data_dir=data_dir, datestamp=datestamp, **var) 547 | 548 | 549 | def test_eg(): 550 | eg = ExperimentGrid() 551 | eg.add('test:a', [1,2,3], 'ta', True) 552 | eg.add('test:b', [1,2,3]) 553 | eg.add('some', [4,5]) 554 | eg.add('why', [True,False]) 555 | eg.add('huh', 5) 556 | eg.add('no', 6, in_name=True) 557 | return eg.variants() 558 | -------------------------------------------------------------------------------- /Code/utils/serialization_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def convert_json(obj): 4 | """ Convert obj to a version which can be serialized with JSON. """ 5 | if is_json_serializable(obj): 6 | return obj 7 | else: 8 | if isinstance(obj, dict): 9 | return {convert_json(k): convert_json(v) 10 | for k,v in obj.items()} 11 | 12 | elif isinstance(obj, tuple): 13 | return (convert_json(x) for x in obj) 14 | 15 | elif isinstance(obj, list): 16 | return [convert_json(x) for x in obj] 17 | 18 | elif hasattr(obj,'__name__') and not('lambda' in obj.__name__): 19 | return convert_json(obj.__name__) 20 | 21 | elif hasattr(obj,'__dict__') and obj.__dict__: 22 | obj_dict = {convert_json(k): convert_json(v) 23 | for k,v in obj.__dict__.items()} 24 | return {str(obj): obj_dict} 25 | 26 | return str(obj) 27 | 28 | def is_json_serializable(v): 29 | try: 30 | json.dumps(v) 31 | return True 32 | except: 33 | return False -------------------------------------------------------------------------------- /Code/utils/test_policy.py: -------------------------------------------------------------------------------- 1 | import time 2 | import joblib 3 | import os 4 | import os.path as osp 5 | import tensorflow as tf 6 | import torch 7 | from spinup import EpochLogger 8 | from spinup.utils.logx import restore_tf_graph 9 | 10 | 11 | def load_policy_and_env(fpath, itr='last', deterministic=False): 12 | """ 13 | Load a policy from save, whether it's TF or PyTorch, along with RL env. 14 | 15 | Not exceptionally future-proof, but it will suffice for basic uses of the 16 | Spinning Up implementations. 17 | 18 | Checks to see if there's a tf1_save folder. If yes, assumes the model 19 | is tensorflow and loads it that way. Otherwise, loads as if there's a 20 | PyTorch save. 21 | """ 22 | 23 | # determine if tf save or pytorch save 24 | if any(['tf1_save' in x for x in os.listdir(fpath)]): 25 | backend = 'tf1' 26 | else: 27 | backend = 'pytorch' 28 | 29 | # handle which epoch to load from 30 | if itr=='last': 31 | # check filenames for epoch (AKA iteration) numbers, find maximum value 32 | 33 | if backend == 'tf1': 34 | saves = [int(x[8:]) for x in os.listdir(fpath) if 'tf1_save' in x and len(x)>8] 35 | 36 | elif backend == 'pytorch': 37 | pytsave_path = osp.join(fpath, 'pyt_save') 38 | # Each file in this folder has naming convention 'modelXX.pt', where 39 | # 'XX' is either an integer or empty string. Empty string case 40 | # corresponds to len(x)==8, hence that case is excluded. 
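            # e.g. 'model10.pt' -> 10, while the bare 'model.pt' checkpoint is skipped (illustrative).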
41 | saves = [int(x.split('.')[0][5:]) for x in os.listdir(pytsave_path) if len(x)>8 and 'model' in x] 42 | 43 | itr = '%d'%max(saves) if len(saves) > 0 else '' 44 | 45 | else: 46 | assert isinstance(itr, int), \ 47 | "Bad value provided for itr (needs to be int or 'last')." 48 | itr = '%d'%itr 49 | 50 | # load the get_action function 51 | if backend == 'tf1': 52 | get_action = load_tf_policy(fpath, itr, deterministic) 53 | else: 54 | get_action = load_pytorch_policy(fpath, itr, deterministic) 55 | 56 | # try to load environment from save 57 | # (sometimes this will fail because the environment could not be pickled) 58 | try: 59 | state = joblib.load(osp.join(fpath, 'vars'+itr+'.pkl')) 60 | env = state['env'] 61 | except: 62 | env = None 63 | 64 | return env, get_action 65 | 66 | 67 | def load_tf_policy(fpath, itr, deterministic=False): 68 | """ Load a tensorflow policy saved with Spinning Up Logger.""" 69 | 70 | fname = osp.join(fpath, 'tf1_save'+itr) 71 | print('\n\nLoading from %s.\n\n'%fname) 72 | 73 | # load the things! 74 | sess = tf.Session() 75 | model = restore_tf_graph(sess, fname) 76 | 77 | # get the correct op for executing actions 78 | if deterministic and 'mu' in model.keys(): 79 | # 'deterministic' is only a valid option for SAC policies 80 | print('Using deterministic action op.') 81 | action_op = model['mu'] 82 | else: 83 | print('Using default action op.') 84 | action_op = model['pi'] 85 | 86 | # make function for producing an action given a single state 87 | get_action = lambda x : sess.run(action_op, feed_dict={model['x']: x[None,:]})[0] 88 | 89 | return get_action 90 | 91 | 92 | def load_pytorch_policy(fpath, itr, deterministic=False): 93 | """ Load a pytorch policy saved with Spinning Up Logger.""" 94 | 95 | fname = osp.join(fpath, 'pyt_save', 'model'+itr+'.pt') 96 | print('\n\nLoading from %s.\n\n'%fname) 97 | 98 | model = torch.load(fname) 99 | 100 | # make function for producing an action given a single state 101 | def get_action(x): 102 | with torch.no_grad(): 103 | x = torch.as_tensor(x, dtype=torch.float32) 104 | action = model.act(x) 105 | return action 106 | 107 | return get_action 108 | 109 | 110 | def run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True): 111 | 112 | assert env is not None, \ 113 | "Environment not found!\n\n It looks like the environment wasn't saved, " + \ 114 | "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \ 115 | "page on Experiment Outputs for how to handle this situation." 
116 | 117 | logger = EpochLogger() 118 | o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0 119 | while n < num_episodes: 120 | if render: 121 | env.render() 122 | time.sleep(1e-3) 123 | 124 | a = get_action(o) 125 | o, r, d, _ = env.step(a) 126 | ep_ret += r 127 | ep_len += 1 128 | 129 | if d or (ep_len == max_ep_len): 130 | logger.store(EpRet=ep_ret, EpLen=ep_len) 131 | print('Episode %d \t EpRet %.3f \t EpLen %d'%(n, ep_ret, ep_len)) 132 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 133 | n += 1 134 | 135 | logger.log_tabular('EpRet', with_min_and_max=True) 136 | logger.log_tabular('EpLen', average_only=True) 137 | logger.dump_tabular() 138 | 139 | 140 | if __name__ == '__main__': 141 | import argparse 142 | parser = argparse.ArgumentParser() 143 | parser.add_argument('fpath', type=str) 144 | parser.add_argument('--len', '-l', type=int, default=0) 145 | parser.add_argument('--episodes', '-n', type=int, default=100) 146 | parser.add_argument('--norender', '-nr', action='store_true') 147 | parser.add_argument('--itr', '-i', type=int, default=-1) 148 | parser.add_argument('--deterministic', '-d', action='store_true') 149 | args = parser.parse_args() 150 | env, get_action = load_policy_and_env(args.fpath, 151 | args.itr if args.itr >=0 else 'last', 152 | args.deterministic) 153 | run_policy(env, get_action, args.len, args.episodes, not(args.norender)) -------------------------------------------------------------------------------- /Final Version ISA/Bitchplzwork.slxc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/Bitchplzwork.slxc -------------------------------------------------------------------------------- /Final Version ISA/Hybrid.slx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/Hybrid.slx -------------------------------------------------------------------------------- /Final Version ISA/Hybrid.slxc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/Hybrid.slxc -------------------------------------------------------------------------------- /Final Version ISA/MPPT_algorithm.m: -------------------------------------------------------------------------------- 1 | function duty = MPPT_algorithm(vpv,ipv,delta) 2 | 3 | duty_init = 0.1; 4 | duty_min = 0; 5 | duty_max = 0.85; 6 | 7 | 8 | persistent Vold Pold duty_old; 9 | 10 | 11 | if isempty(Vold) 12 | Vold=0; 13 | Pold=0; 14 | duty_old=duty_init; 15 | end 16 | P=vpv*ipv; 17 | dV= vpv - Vold; 18 | dP= P - Pold; 19 | 20 | if dP == 0 && vpv>30 21 | if dP < 0 22 | if dV < 0 23 | duty = duty_old - delta ; 24 | else 25 | duty = duty_old + delta ; 26 | end 27 | else 28 | if dV < 0 29 | duty = duty_old + delta; 30 | else 31 | duty = duty_old - delta; 32 | end 33 | end 34 | else 35 | duty = duty_old; 36 | end 37 | if duty >= duty_max 38 | duty=duty_max; 39 | elseif duty 2 | 3 | 4 | xLFpLMrm6hGf8byEPcXhCA== 5 | 6 | 
-------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/Bitchplzwork/varInfo.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/slprj/sim/varcache/Bitchplzwork/varInfo.mat -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/Hybrid/checksumOfCache.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/slprj/sim/varcache/Hybrid/checksumOfCache.mat -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/Hybrid/tmwinternal/simulink_cache.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /Z3GBmUXc1Zkma8yxRkU/g== 5 | 6 | -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/Hybrid/varInfo.mat: -------------------------------------------------------------------------------- 1 | MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Fri May 8 15:43:14 2020 IM( GlobalWorkspace8Cfmax@base workspace8globalX$$00000000-0000-0000-0000-000000000000000Lf@base workspace8globalX$$00000000-0000-0000-0000-000000000000000RLf@base workspace8globalX$$00000000-0000-0000-0000-000000000000000Tss@base workspace8globalX$$00000000-0000-0000-0000-000000000000008irrad@base workspace8globalX$$00000000-0000-0000-0000-000000000000000temp@base workspace8globalX$$00000000-0000-0000-0000-00000000000000@  timeStamp@base workspace8globalX$$00000000-0000-0000-0000-00000000000000@  windSpeed@base workspace8globalX$$00000000-0000-0000-0000-000000000000008 ConfigSetRef8ModelWorkspace -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/Microgrid/checksumOfCache.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/slprj/sim/varcache/Microgrid/checksumOfCache.mat -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/Microgrid/tmwinternal/simulink_cache.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 79zZ+U9q702OyWlBSq2WOg== 5 | 6 | -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/Microgrid/varInfo.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/slprj/sim/varcache/Microgrid/varInfo.mat -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/Solar/checksumOfCache.mat: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/slprj/sim/varcache/Solar/checksumOfCache.mat -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/Solar/tmwinternal/simulink_cache.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | HYEWkLSyPw63ZJNRQKmncA== 5 | 6 | -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/Solar/varInfo.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/slprj/sim/varcache/Solar/varInfo.mat -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/power_PVarray_250kW/checksumOfCache.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/slprj/sim/varcache/power_PVarray_250kW/checksumOfCache.mat -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/power_PVarray_250kW/tmwinternal/simulink_cache.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /W7swpNEa+rDzqy/qdnqMw== 5 | 6 | -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/power_PVarray_250kW/varInfo.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/slprj/sim/varcache/power_PVarray_250kW/varInfo.mat -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/windgen/checksumOfCache.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/slprj/sim/varcache/windgen/checksumOfCache.mat -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/windgen/tmwinternal/simulink_cache.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | TjsfiNG4JH+faQ6yfxU1bw== 5 | 6 | -------------------------------------------------------------------------------- /Final Version ISA/slprj/sim/varcache/windgen/varInfo.mat: -------------------------------------------------------------------------------- 1 | MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Thu Sep 3 15:21:43 2020 IMGlobalWorkspace@  timeStamp@base workspace8globalX$$00000000-0000-0000-0000-000000000000008wSpeed@base 
workspace8globalX$$00000000-0000-0000-0000-000000000000008 ConfigSetRef8ModelWorkspace -------------------------------------------------------------------------------- /Final Version ISA/slprj/sl_proj.tmw: -------------------------------------------------------------------------------- 1 | Simulink Coder project marker file. Please don't change it. 2 | slprjVersion: 9.3_074 -------------------------------------------------------------------------------- /Final Version ISA/wIND.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | data=readtable("wind.csv"); 3 | 4 | 5 | timeStamp = data.TimeStamp; 6 | wSpeed = data.wSpeed100; -------------------------------------------------------------------------------- /Final Version ISA/windgen.slxc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moayad-hsn/Enhancing-energy-trading-between-different-Islanded-Microgrids-A-Reinforcement-Learning-Algorithm/aafef27a983e05ec71bd4cf9b1d159727d55551e/Final Version ISA/windgen.slxc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Microgrid Trading Using Reinforcement Learning 2 | This code began as a graduation project and was later developed into three papers presented at ICCCEEE20 (available on IEEE Xplore) on the management and control of a trading game between islanded microgrids, using different deep reinforcement learning techniques run on a custom environment. 3 | 4 | The papers present an algorithm that acts as a trading controller for islanded microgrids, applied to data from Sudanese villages. They use two Deep Reinforcement Learning algorithms, DDPG and PPO, on the environment designed by the researchers.
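A minimal sketch of how the training entry points in this repository are wired together (it mirrors the `__main__` blocks of `main_DDPG.py` and `main_PPO.py` below, with those scripts' default hyperparameters; it assumes the repository root is on the Python path and that PyTorch with CUDA, gym, and numpy are installed, since the scripts place tensors on `'cuda'`):

```python
from Code.enviroment import MicrogridEnv
from Code.utils.run_utils import setup_logger_kwargs
import Code.core_DDPG as core
from main_DDPG import ddpg

# Build the custom microgrid-trading environment and train a DDPG agent on it,
# much as main_DDPG.py does when run as a script (hid=256, l=2 are its defaults).
env = MicrogridEnv()
logger_kwargs = setup_logger_kwargs('ddpg', 0)
ddpg(lambda: env, actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[256, 256]),
     gamma=0.99, seed=0, epochs=50, logger_kwargs=logger_kwargs)
```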
5 | The DOIs for the three papers are: 6 | 7 | Enhancing Energy Trading Between Different Islanded Microgrids A Reinforcement Learning Algorithm Case Study in Northern Kordofan State: [10.1109/ICCCEEE49695.2021.9429584](https://doi.org/10.1109/ICCCEEE49695.2021.9429584) 8 | 9 | Comparison of Deep Reinforcement Learning Algorithms in Enhancing Energy Trading in Microgrids: [10.1109/ICCCEEE49695.2021.9429565](https://doi.org/10.1109/ICCCEEE49695.2021.9429565) 10 | 11 | An Economic Evaluation of Islanded Microgrids Implementation in Northern Kordofan State: [10.1109/ICCCEEE49695.2021.9429680](https://doi.org/10.1109/ICCCEEE49695.2021.9429680) 12 | 13 | -------------------------------------------------------------------------------- /main_DDPG.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | import torch 4 | from torch.optim import Adam 5 | import gym 6 | import time 7 | import Code.core_DDPG as core 8 | from Code.enviroment import MicrogridEnv 9 | from Code.utils.logx import EpochLogger 10 | 11 | class ReplayBuffer: 12 | 13 | def __init__(self, obs_dim, act_dim, size): 14 | self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32) 15 | self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32) 16 | self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32) 17 | self.rew_buf = np.zeros(size, dtype=np.float32) 18 | self.done_buf = np.zeros(size, dtype=np.float32) 19 | self.ptr, self.size, self.max_size = 0, 0, size 20 | 21 | def store(self, obs, act, rew, next_obs, done): 22 | self.obs_buf[self.ptr] = obs 23 | self.obs2_buf[self.ptr] = next_obs 24 | self.act_buf[self.ptr] = act 25 | self.rew_buf[self.ptr] = rew 26 | self.done_buf[self.ptr] = done 27 | self.ptr = (self.ptr+1) % self.max_size 28 | self.size = min(self.size+1, self.max_size) 29 | 30 | def sample_batch(self, batch_size=32): 31 | idxs = np.random.randint(0, self.size, size=batch_size) 32 | device = 'cuda' 33 | batch = dict(obs=self.obs_buf[idxs], 34 | obs2=self.obs2_buf[idxs], 35 | act=self.act_buf[idxs], 36 | rew=self.rew_buf[idxs], 37 | done=self.done_buf[idxs]) 38 | return {k: torch.as_tensor(v, dtype=torch.float32).to(device) for k,v in batch.items()} 39 | 40 | 41 | def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 42 | steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 43 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, 44 | update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, 45 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 46 | 47 | 48 | logger = EpochLogger(**logger_kwargs) 49 | logger.save_config(locals()) 50 | 51 | torch.cuda.manual_seed(seed) 52 | np.random.seed(seed) 53 | 54 | env, test_env = env_fn(), env_fn() 55 | obs_dim = env.observation_space.shape 56 | act_dim = env.action_space.shape[0] 57 | 58 | # Action limits for clipping: store the action space's low and high bounds (used by get_action below).
59 | act_limit = [env.action_space.low, env.action_space.high] 60 | 61 | # Create actor-critic module and target networks 62 | ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) 63 | ac_targ = deepcopy(ac) 64 | 65 | # Freeze target networks with respect to optimizers (only update via polyak averaging) 66 | for p in ac_targ.parameters(): 67 | p.requires_grad = False 68 | 69 | # Experience buffer 70 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 71 | 72 | # Count variables (protip: try to get a feel for how different size networks behave!) 73 | var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q]) 74 | logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n'%var_counts) 75 | 76 | # Set up function for computing DDPG Q-loss 77 | def compute_loss_q(data): 78 | o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] 79 | 80 | q = ac.q(o,a) 81 | 82 | # Bellman backup for Q function 83 | with torch.no_grad(): 84 | q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2)) 85 | backup = r + gamma * (1 - d) * q_pi_targ 86 | 87 | # MSE loss against Bellman backup 88 | loss_q = ((q - backup)**2).mean() 89 | 90 | # Useful info for logging 91 | loss_info = dict(QVals=q.detach().cpu().data.numpy()) 92 | 93 | return loss_q, loss_info 94 | 95 | # Set up function for computing DDPG pi loss 96 | def compute_loss_pi(data): 97 | o = data['obs'] 98 | q_pi = ac.q(o, ac.pi(o)) 99 | return -q_pi.mean() 100 | 101 | # Set up optimizers for policy and q-function 102 | pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) 103 | q_optimizer = Adam(ac.q.parameters(), lr=q_lr) 104 | 105 | # Set up model saving 106 | logger.setup_pytorch_saver(ac) 107 | 108 | def update(data): 109 | # First run one gradient descent step for Q. 110 | q_optimizer.zero_grad() 111 | loss_q, loss_info = compute_loss_q(data) 112 | loss_q.backward() 113 | q_optimizer.step() 114 | 115 | # Freeze Q-network so you don't waste computational effort 116 | # computing gradients for it during the policy learning step. 117 | for p in ac.q.parameters(): 118 | p.requires_grad = False 119 | 120 | # Next run one gradient descent step for pi. 121 | pi_optimizer.zero_grad() 122 | loss_pi = compute_loss_pi(data) 123 | loss_pi.backward() 124 | pi_optimizer.step() 125 | 126 | # Unfreeze Q-network so you can optimize it at next DDPG step. 127 | for p in ac.q.parameters(): 128 | p.requires_grad = True 129 | 130 | # Record things 131 | logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) 132 | 133 | # Finally, update target networks by polyak averaging. 134 | with torch.no_grad(): 135 | for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): 136 | # NB: We use an in-place operations "mul_", "add_" to update target 137 | # params, as opposed to "mul" and "add", which would make new tensors. 
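# With the default polyak=0.995, the two in-place ops below form an exponential
# moving average, p_targ <- 0.995 * p_targ + 0.005 * p: each update moves the
# target network only 0.5% of the way toward the online network's parameters.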
138 | p_targ.data.mul_(polyak) 139 | p_targ.data.add_((1 - polyak) * p.data) 140 | 141 | def get_action(o, noise_scale): 142 | a = ac.act(torch.as_tensor(o, dtype=torch.float32).to('cuda')) 143 | a += noise_scale * np.random.randn(act_dim) 144 | return np.clip(a, act_limit[0], act_limit[1]) 145 | 146 | def test_agent(): 147 | for j in range(num_test_episodes): 148 | o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 149 | while not(d or (ep_len == max_ep_len)): 150 | # Take deterministic actions at test time (noise_scale=0) 151 | o, r, d, _ = test_env.step(get_action(o, 0)) 152 | ep_ret += r 153 | ep_len += 1 154 | logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 155 | 156 | # Prepare for interaction with environment 157 | total_steps = steps_per_epoch * epochs 158 | start_time = time.time() 159 | o, ep_ret, ep_len = env.reset(), 0, 0 160 | 161 | # Main loop: collect experience in env and update/log each epoch 162 | for t in range(total_steps): 163 | 164 | # Until start_steps have elapsed, randomly sample actions 165 | # from a uniform distribution for better exploration. Afterwards, 166 | # use the learned policy (with some noise, via act_noise). 167 | if t > start_steps: 168 | a = get_action(o, act_noise) 169 | #print(a) 170 | else: 171 | a = env.action_space.sample() 172 | #print("t:",t) 173 | if t == 10: 174 | print(ac) 175 | # Step the env 176 | o2, r, d, _ = env.step(a) 177 | ep_ret += r 178 | ep_len += 1 179 | 180 | # Ignore the "done" signal if it comes from hitting the time 181 | # horizon (that is, when it's an artificial terminal signal 182 | # that isn't based on the agent's state) 183 | d = False if ep_len==max_ep_len else d 184 | 185 | # Store experience to replay buffer 186 | replay_buffer.store(o, a, r, o2, d) 187 | 188 | # Super critical, easy to overlook step: make sure to update 189 | # most recent observation! 190 | o = o2 191 | 192 | # End of trajectory handling 193 | if d or (ep_len == max_ep_len): 194 | logger.store(EpRet=ep_ret, EpLen=ep_len) 195 | o, ep_ret, ep_len = env.reset(), 0, 0 196 | 197 | # Update handling 198 | if t >= update_after and t % update_every == 0: 199 | for _ in range(update_every): 200 | batch = replay_buffer.sample_batch(batch_size) 201 | update(data=batch) 202 | 203 | # End of epoch handling 204 | if (t+1) % steps_per_epoch == 0: 205 | epoch = (t+1) // steps_per_epoch 206 | 207 | # Save model 208 | if (epoch % save_freq == 0) or (epoch == epochs): 209 | logger.save_state({'env': env}, None) 210 | 211 | # Test the performance of the deterministic version of the agent. 
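# (test_agent runs num_test_episodes rollouts on the separate test_env using
# get_action(o, 0), i.e. the learned policy with no exploration noise, and logs
# TestEpRet / TestEpLen for this epoch.)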
212 | test_agent() 213 | 214 | # Log info about epoch 215 | logger.log_tabular('Epoch', epoch) 216 | logger.log_tabular('EpRet', with_min_and_max=False) 217 | logger.log_tabular('TestEpRet', with_min_and_max=False) 218 | logger.log_tabular('EpLen', average_only=True) 219 | logger.log_tabular('TestEpLen', average_only=True) 220 | logger.log_tabular('TotalEnvInteracts', t) 221 | logger.log_tabular('AvgEnergyBought', np.mean(env.energy_bought)) 222 | logger.log_tabular('AvgEnergySold', np.mean(env.energy_sold)) 223 | logger.log_tabular('AvgPriceBought', np.mean(env.prices)) 224 | logger.log_tabular('QVals', with_min_and_max=False) 225 | logger.log_tabular('LossPi', average_only=True) 226 | logger.log_tabular('LossQ', average_only=True) 227 | logger.log_tabular('Time', time.time()-start_time) 228 | logger.dump_tabular() 229 | 230 | if __name__ == '__main__': 231 | import argparse 232 | parser = argparse.ArgumentParser() 233 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 234 | parser.add_argument('--hid', type=int, default=256) 235 | parser.add_argument('--l', type=int, default=2) 236 | parser.add_argument('--gamma', type=float, default=0.99) 237 | parser.add_argument('--seed', '-s', type=int, default=0) 238 | parser.add_argument('--epochs', type=int, default=50) 239 | parser.add_argument('--exp_name', type=str, default='ddpg') 240 | args = parser.parse_args() 241 | 242 | from Code.utils.run_utils import setup_logger_kwargs 243 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 244 | env = MicrogridEnv() 245 | ''' 246 | o = env.reset() 247 | rewards = [] 248 | t=0 249 | t1 = time.time() 250 | while True: 251 | amount = o[1] - (o[2] + o[0]) 252 | if amount == 0: 253 | o, r, d, _ = env.step([0,0,0,0]) 254 | elif amount>0: 255 | o, r, d, _ = env.step([0,0,amount,10]) 256 | else: 257 | o, r, d, _ = env.step([1,0,abs(amount), 19]) 258 | rewards.append(r) 259 | t+=1 260 | if t > 4000: 261 | break 262 | #for i in range(99): 263 | # print("Bought, sold, prices, rewards tot_sold", (env.energy_bought[i], env.energy_sold[i], env.prices[i], rewards[i], env.tot[i])) 264 | t2= time.time() 265 | print("time:",t2-t1) 266 | print("sum rewards: ",sum(rewards)) 267 | print("avg reward: ", np.mean(rewards)) 268 | ''' 269 | ddpg(lambda : env, actor_critic=core.MLPActorCritic, 270 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), 271 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 272 | logger_kwargs=logger_kwargs) -------------------------------------------------------------------------------- /main_PPO.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.optim import Adam 4 | import gym 5 | import time 6 | import Code.core_PPO as core 7 | from Code.utils.logx import EpochLogger 8 | from Code.enviroment import MicrogridEnv 9 | from Code.utils.mpi_pytorch import setup_pytorch_for_mpi, sync_params, mpi_avg_grads 10 | from Code.utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs 11 | 12 | 13 | class PPOBuffer: 14 | 15 | def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95): 16 | self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32) 17 | self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32) 18 | self.adv_buf = np.zeros(size, dtype=np.float32) 19 | self.rew_buf = np.zeros(size, dtype=np.float32) 20 | self.ret_buf = np.zeros(size, dtype=np.float32) 21 | self.val_buf = np.zeros(size, dtype=np.float32) 22 | 
self.logp_buf = np.zeros(size, dtype=np.float32) 23 | self.gamma, self.lam = gamma, lam 24 | self.ptr, self.path_start_idx, self.max_size = 0, 0, size 25 | 26 | def store(self, obs, act, rew, val, logp): 27 | """ 28 | Append one timestep of agent-environment interaction to the buffer. 29 | """ 30 | assert self.ptr < self.max_size # buffer has to have room so you can store 31 | self.obs_buf[self.ptr] = obs 32 | self.act_buf[self.ptr] = act 33 | self.rew_buf[self.ptr] = rew 34 | self.val_buf[self.ptr] = val 35 | self.logp_buf[self.ptr] = logp 36 | self.ptr += 1 37 | 38 | def finish_path(self, last_val=0): 39 | 40 | path_slice = slice(self.path_start_idx, self.ptr) 41 | rews = np.append(self.rew_buf[path_slice], last_val) 42 | vals = np.append(self.val_buf[path_slice], last_val) 43 | 44 | # the next two lines implement GAE-Lambda advantage calculation 45 | deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1] 46 | self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam) 47 | 48 | # the next line computes rewards-to-go, to be targets for the value function 49 | self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1] 50 | 51 | self.path_start_idx = self.ptr 52 | 53 | def get(self): 54 | 55 | assert self.ptr == self.max_size # buffer has to be full before you can get 56 | self.ptr, self.path_start_idx = 0, 0 57 | # the next two lines implement the advantage normalization trick 58 | adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf) 59 | self.adv_buf = (self.adv_buf - adv_mean) / adv_std 60 | data = dict(obs=self.obs_buf, act=self.act_buf, ret=self.ret_buf, 61 | adv=self.adv_buf, logp=self.logp_buf) 62 | return {k: torch.as_tensor(v, dtype=torch.float32).to('cuda') for k,v in data.items()} 63 | 64 | def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 65 | steps_per_epoch=4000, epochs=100, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, 66 | vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, 67 | target_kl=0.01, logger_kwargs=dict(), save_freq=10): 68 | 69 | setup_pytorch_for_mpi() 70 | 71 | # Set up logger and save configuration 72 | logger = EpochLogger(**logger_kwargs) 73 | logger.save_config(locals()) 74 | 75 | # Random seed 76 | seed += 10000 * proc_id() 77 | torch.cuda.manual_seed(seed) 78 | np.random.seed(seed) 79 | 80 | # Instantiate environment 81 | env = env_fn() 82 | obs_dim = env.observation_space.shape 83 | act_dim = env.action_space.shape 84 | 85 | # Create actor-critic module 86 | ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) 87 | 88 | # Sync params across processes 89 | sync_params(ac) 90 | 91 | # Count variables 92 | var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) 93 | logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) 94 | 95 | # Set up experience buffer 96 | local_steps_per_epoch = int(steps_per_epoch / num_procs()) 97 | buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) 98 | 99 | # Set up function for computing PPO policy loss 100 | def compute_loss_pi(data): 101 | obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] 102 | 103 | # Policy loss 104 | pi, logp = ac.pi(obs, act) 105 | ratio = torch.exp(logp - logp_old) 106 | clip_adv = torch.clamp(ratio, 1-clip_ratio, 1+clip_ratio) * adv 107 | loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() 108 | 109 | # Useful extra info 110 | approx_kl = (logp_old - logp).mean().item() 111 | ent = pi.entropy().mean().item() 112 | clipped = 
ratio.gt(1+clip_ratio) | ratio.lt(1-clip_ratio) 113 | clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() 114 | pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) 115 | 116 | return loss_pi, pi_info 117 | 118 | # Set up function for computing value loss 119 | def compute_loss_v(data): 120 | obs, ret = data['obs'], data['ret'] 121 | return ((ac.v(obs) - ret)**2).mean() 122 | 123 | # Set up optimizers for policy and value function 124 | pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) 125 | vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) 126 | 127 | # Set up model saving 128 | logger.setup_pytorch_saver(ac) 129 | 130 | def update(): 131 | data = buf.get() 132 | 133 | pi_l_old, pi_info_old = compute_loss_pi(data) 134 | pi_l_old = pi_l_old.item() 135 | v_l_old = compute_loss_v(data).item() 136 | 137 | # Train policy with multiple steps of gradient descent 138 | for i in range(train_pi_iters): 139 | pi_optimizer.zero_grad() 140 | loss_pi, pi_info = compute_loss_pi(data) 141 | kl = mpi_avg(pi_info['kl']) 142 | if kl > 1.5 * target_kl: 143 | logger.log('Early stopping at step %d due to reaching max kl.'%i) 144 | break 145 | loss_pi.backward() 146 | mpi_avg_grads(ac.pi) # average grads across MPI processes 147 | pi_optimizer.step() 148 | 149 | logger.store(StopIter=i) 150 | 151 | # Value function learning 152 | for i in range(train_v_iters): 153 | vf_optimizer.zero_grad() 154 | loss_v = compute_loss_v(data) 155 | loss_v.backward() 156 | mpi_avg_grads(ac.v) # average grads across MPI processes 157 | vf_optimizer.step() 158 | 159 | # Log changes from update 160 | kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] 161 | logger.store(LossPi=pi_l_old, LossV=v_l_old, 162 | KL=kl, Entropy=ent, ClipFrac=cf, 163 | DeltaLossPi=(loss_pi.item() - pi_l_old), 164 | DeltaLossV=(loss_v.item() - v_l_old)) 165 | 166 | # Prepare for interaction with environment 167 | start_time = time.time() 168 | o, ep_ret, ep_len = env.reset(), 0, 0 169 | 170 | # Main loop: collect experience in env and update/log each epoch 171 | for epoch in range(epochs): 172 | for t in range(local_steps_per_epoch): 173 | a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32).to('cuda'))#.to('cuda') 174 | 175 | next_o, r, d, _ = env.step(a) 176 | ep_ret += r 177 | ep_len += 1 178 | 179 | # save and log 180 | buf.store(o, a, r, v, logp) 181 | logger.store(VVals=v) 182 | 183 | # Update obs (critical!) 184 | o = next_o 185 | 186 | timeout = ep_len == max_ep_len 187 | terminal = d or timeout 188 | epoch_ended = t==local_steps_per_epoch-1 189 | 190 | if terminal or epoch_ended: 191 | if epoch_ended and not(terminal): 192 | print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True) 193 | # if trajectory didn't reach terminal state, bootstrap value target 194 | if timeout or epoch_ended: 195 | _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32).to('cuda')) 196 | else: 197 | v = 0 198 | buf.finish_path(v) 199 | if terminal: 200 | # only save EpRet / EpLen if trajectory finished 201 | logger.store(EpRet=ep_ret, EpLen=ep_len) 202 | o, ep_ret, ep_len = env.reset(), 0, 0 203 | 204 | 205 | # Save model 206 | if (epoch % save_freq == 0) or (epoch == epochs-1): 207 | logger.save_state({'env': env}, None) 208 | 209 | # Perform PPO update! 
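# update() takes up to train_pi_iters clipped-surrogate policy steps, stopping
# early once the averaged approximate KL exceeds 1.5 * target_kl, then runs
# train_v_iters value-function steps; gradients are averaged across MPI
# processes before each optimizer step.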
210 | update() 211 | 212 | # Log info about epoch 213 | logger.log_tabular('Epoch', epoch) 214 | logger.log_tabular('EpRet', with_min_and_max=True) 215 | logger.log_tabular('EpLen', average_only=True) 216 | logger.log_tabular('VVals', with_min_and_max=True) 217 | logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) 218 | logger.log_tabular('LossPi', average_only=True) 219 | logger.log_tabular('LossV', average_only=True) 220 | logger.log_tabular('DeltaLossPi', average_only=True) 221 | logger.log_tabular('DeltaLossV', average_only=True) 222 | logger.log_tabular('Entropy', average_only=True) 223 | logger.log_tabular('KL', average_only=True) 224 | logger.log_tabular('ClipFrac', average_only=True) 225 | logger.log_tabular('StopIter', average_only=True) 226 | logger.log_tabular('Time', time.time()-start_time) 227 | logger.dump_tabular() 228 | 229 | if __name__ == '__main__': 230 | import argparse 231 | parser = argparse.ArgumentParser() 232 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 233 | parser.add_argument('--hid', type=int, default=64) 234 | parser.add_argument('--l', type=int, default=2) 235 | parser.add_argument('--gamma', type=float, default=0.99) 236 | parser.add_argument('--seed', '-s', type=int, default=0) 237 | parser.add_argument('--cpu', type=int, default=4) 238 | parser.add_argument('--steps', type=int, default=4000) 239 | parser.add_argument('--epochs', type=int, default=50) 240 | parser.add_argument('--exp_name', type=str, default='ppo') 241 | args = parser.parse_args() 242 | 243 | #mpi_fork(2) # run parallel code with mpi 244 | 245 | from Code.utils.run_utils import setup_logger_kwargs 246 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 247 | env = MicrogridEnv() 248 | ''' 249 | o = env.reset() 250 | rewards = [] 251 | t=0 252 | while True: 253 | amount = o[1] - (o[2] + o[0]) 254 | if amount == 0: 255 | o, r, d, _ = env.step([0,0,0,0]) 256 | elif amount>0: 257 | o, r, d, _ = env.step([0,0,amount,10]) 258 | else: 259 | o, r, d, _ = env.step([1,0,abs(amount), 19]) 260 | rewards.append(r) 261 | t+=1 262 | if t > 100: 263 | break 264 | for i in range(99): 265 | print("Bought, sold, prices, rewards tot_sold", (env.energy_bought[i], env.energy_sold[i], env.prices[i], rewards[i], env.tot[i])) 266 | print("sum rewards: ",sum(rewards)) 267 | print("avg reward: ", np.mean(rewards)) 268 | ''' 269 | 270 | ppo(lambda : env, actor_critic=core.MLPActorCritic, 271 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma, 272 | seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs, 273 | logger_kwargs=logger_kwargs) -------------------------------------------------------------------------------- /main_vpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.optim import Adam 4 | import gym 5 | import time 6 | import Code.core_VPG as core 7 | from Code.enviroment import MicrogridEnv 8 | from Code.utils.logx import EpochLogger 9 | from Code.utils.mpi_pytorch import setup_pytorch_for_mpi, sync_params, mpi_avg_grads 10 | from Code.utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs 11 | 12 | 13 | class VPGBuffer: 14 | """ 15 | A buffer for storing trajectories experienced by a VPG agent interacting 16 | with the environment, and using Generalized Advantage Estimation (GAE-Lambda) 17 | for calculating the advantages of state-action pairs. 
18 | """ 19 | 20 | def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95): 21 | self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32) 22 | self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32) 23 | self.adv_buf = np.zeros(size, dtype=np.float32) 24 | self.rew_buf = np.zeros(size, dtype=np.float32) 25 | self.ret_buf = np.zeros(size, dtype=np.float32) 26 | self.val_buf = np.zeros(size, dtype=np.float32) 27 | self.logp_buf = np.zeros(size, dtype=np.float32) 28 | self.gamma, self.lam = gamma, lam 29 | self.ptr, self.path_start_idx, self.max_size = 0, 0, size 30 | 31 | def store(self, obs, act, rew, val, logp): 32 | """ 33 | Append one timestep of agent-environment interaction to the buffer. 34 | """ 35 | assert self.ptr < self.max_size # buffer has to have room so you can store 36 | self.obs_buf[self.ptr] = obs 37 | self.act_buf[self.ptr] = act 38 | self.rew_buf[self.ptr] = rew 39 | self.val_buf[self.ptr] = val 40 | self.logp_buf[self.ptr] = logp 41 | self.ptr += 1 42 | 43 | def finish_path(self, last_val=0): 44 | """ 45 | Call this at the end of a trajectory, or when one gets cut off 46 | by an epoch ending. This looks back in the buffer to where the 47 | trajectory started, and uses rewards and value estimates from 48 | the whole trajectory to compute advantage estimates with GAE-Lambda, 49 | as well as compute the rewards-to-go for each state, to use as 50 | the targets for the value function. 51 | 52 | The "last_val" argument should be 0 if the trajectory ended 53 | because the agent reached a terminal state (died), and otherwise 54 | should be V(s_T), the value function estimated for the last state. 55 | This allows us to bootstrap the reward-to-go calculation to account 56 | for timesteps beyond the arbitrary episode horizon (or epoch cutoff). 57 | """ 58 | 59 | path_slice = slice(self.path_start_idx, self.ptr) 60 | rews = np.append(self.rew_buf[path_slice], last_val) 61 | vals = np.append(self.val_buf[path_slice], last_val) 62 | 63 | # the next two lines implement GAE-Lambda advantage calculation 64 | deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1] 65 | self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam) 66 | 67 | # the next line computes rewards-to-go, to be targets for the value function 68 | self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1] 69 | 70 | self.path_start_idx = self.ptr 71 | 72 | def get(self): 73 | """ 74 | Call this at the end of an epoch to get all of the data from 75 | the buffer, with advantages appropriately normalized (shifted to have 76 | mean zero and std one). Also, resets some pointers in the buffer. 
77 | """ 78 | assert self.ptr == self.max_size # buffer has to be full before you can get 79 | self.ptr, self.path_start_idx = 0, 0 80 | # the next two lines implement the advantage normalization trick 81 | adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf) 82 | self.adv_buf = (self.adv_buf - adv_mean) / adv_std 83 | data = dict(obs=self.obs_buf, act=self.act_buf, ret=self.ret_buf, 84 | adv=self.adv_buf, logp=self.logp_buf) 85 | return {k: torch.as_tensor(v, dtype=torch.float32).to('cuda') for k,v in data.items()} 86 | 87 | 88 | 89 | def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 90 | steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, 91 | vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, 92 | logger_kwargs=dict(), save_freq=10): 93 | """ 94 | Vanilla Policy Gradient 95 | 96 | (with GAE-Lambda for advantage estimation) 97 | 98 | Args: 99 | env_fn : A function which creates a copy of the environment. 100 | The environment must satisfy the OpenAI Gym API. 101 | 102 | actor_critic: The constructor method for a PyTorch Module with a 103 | ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 104 | module. The ``step`` method should accept a batch of observations 105 | and return: 106 | 107 | =========== ================ ====================================== 108 | Symbol Shape Description 109 | =========== ================ ====================================== 110 | ``a`` (batch, act_dim) | Numpy array of actions for each 111 | | observation. 112 | ``v`` (batch,) | Numpy array of value estimates 113 | | for the provided observations. 114 | ``logp_a`` (batch,) | Numpy array of log probs for the 115 | | actions in ``a``. 116 | =========== ================ ====================================== 117 | 118 | The ``act`` method behaves the same as ``step`` but only returns ``a``. 119 | 120 | The ``pi`` module's forward call should accept a batch of 121 | observations and optionally a batch of actions, and return: 122 | 123 | =========== ================ ====================================== 124 | Symbol Shape Description 125 | =========== ================ ====================================== 126 | ``pi`` N/A | Torch Distribution object, containing 127 | | a batch of distributions describing 128 | | the policy for the provided observations. 129 | ``logp_a`` (batch,) | Optional (only returned if batch of 130 | | actions is given). Tensor containing 131 | | the log probability, according to 132 | | the policy, of the provided actions. 133 | | If actions not given, will contain 134 | | ``None``. 135 | =========== ================ ====================================== 136 | 137 | The ``v`` module's forward call should accept a batch of observations 138 | and return: 139 | 140 | =========== ================ ====================================== 141 | Symbol Shape Description 142 | =========== ================ ====================================== 143 | ``v`` (batch,) | Tensor containing the value estimates 144 | | for the provided observations. (Critical: 145 | | make sure to flatten this!) 146 | =========== ================ ====================================== 147 | 148 | ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 149 | you provided to VPG. 150 | 151 | seed (int): Seed for random number generators. 152 | 153 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 154 | for the agent and the environment in each epoch. 
155 | 156 | epochs (int): Number of epochs of interaction (equivalent to 157 | number of policy updates) to perform. 158 | 159 | gamma (float): Discount factor. (Always between 0 and 1.) 160 | 161 | pi_lr (float): Learning rate for policy optimizer. 162 | 163 | vf_lr (float): Learning rate for value function optimizer. 164 | 165 | train_v_iters (int): Number of gradient descent steps to take on 166 | value function per epoch. 167 | 168 | lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, 169 | close to 1.) 170 | 171 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 172 | 173 | logger_kwargs (dict): Keyword args for EpochLogger. 174 | 175 | save_freq (int): How often (in terms of gap between epochs) to save 176 | the current policy and value function. 177 | 178 | """ 179 | 180 | # Special function to avoid certain slowdowns from PyTorch + MPI combo. 181 | setup_pytorch_for_mpi() 182 | 183 | # Set up logger and save configuration 184 | logger = EpochLogger(**logger_kwargs) 185 | logger.save_config(locals()) 186 | 187 | # Random seed 188 | seed += 10000 * proc_id() 189 | torch.cuda.manual_seed(seed) 190 | np.random.seed(seed) 191 | 192 | # Instantiate environment 193 | env = env_fn() 194 | obs_dim = env.observation_space.shape 195 | act_dim = env.action_space.shape 196 | 197 | # Create actor-critic module 198 | ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) 199 | print(ac) 200 | 201 | # Sync params across processes 202 | sync_params(ac) 203 | 204 | # Count variables 205 | var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) 206 | logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) 207 | 208 | # Set up experience buffer 209 | local_steps_per_epoch = int(steps_per_epoch / num_procs()) 210 | buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) 211 | 212 | # Set up function for computing VPG policy loss 213 | def compute_loss_pi(data): 214 | obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] 215 | 216 | # Policy loss 217 | pi, logp = ac.pi(obs, act) 218 | loss_pi = -(logp * adv).mean() 219 | 220 | # Useful extra info 221 | approx_kl = (logp_old - logp).mean().item() 222 | ent = pi.entropy().mean().item() 223 | pi_info = dict(kl=approx_kl, ent=ent) 224 | 225 | return loss_pi, pi_info 226 | 227 | # Set up function for computing value loss 228 | def compute_loss_v(data): 229 | obs, ret = data['obs'], data['ret'] 230 | return ((ac.v(obs) - ret)**2).mean() 231 | 232 | # Set up optimizers for policy and value function 233 | pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) 234 | vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) 235 | 236 | # Set up model saving 237 | logger.setup_pytorch_saver(ac) 238 | 239 | def update(): 240 | data = buf.get() 241 | 242 | # Get loss and info values before update 243 | pi_l_old, pi_info_old = compute_loss_pi(data) 244 | pi_l_old = pi_l_old.item() 245 | v_l_old = compute_loss_v(data).item() 246 | 247 | # Train policy with a single step of gradient descent 248 | pi_optimizer.zero_grad() 249 | loss_pi, pi_info = compute_loss_pi(data) 250 | loss_pi.backward() 251 | mpi_avg_grads(ac.pi) # average grads across MPI processes 252 | pi_optimizer.step() 253 | 254 | # Value function learning 255 | for i in range(train_v_iters): 256 | vf_optimizer.zero_grad() 257 | loss_v = compute_loss_v(data) 258 | loss_v.backward() 259 | mpi_avg_grads(ac.v) # average grads across MPI processes 260 | vf_optimizer.step() 261 | 262 | # Log changes from update 263 
| kl, ent = pi_info['kl'], pi_info_old['ent'] 264 | logger.store(LossPi=pi_l_old, LossV=v_l_old, 265 | KL=kl, Entropy=ent, 266 | DeltaLossPi=(loss_pi.item() - pi_l_old), 267 | DeltaLossV=(loss_v.item() - v_l_old)) 268 | 269 | # Prepare for interaction with environment 270 | start_time = time.time() 271 | o, ep_ret, ep_len = env.reset(), 0, 0 272 | 273 | # Main loop: collect experience in env and update/log each epoch 274 | for epoch in range(epochs): 275 | for t in range(local_steps_per_epoch): 276 | a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32).to('cuda')) 277 | 278 | next_o, r, d, _ = env.step(a) 279 | ep_ret += r 280 | ep_len += 1 281 | 282 | # save and log 283 | buf.store(o, a, r, v, logp) 284 | logger.store(VVals=v) 285 | 286 | # Update obs (critical!) 287 | o = next_o 288 | 289 | timeout = ep_len == max_ep_len 290 | terminal = d or timeout 291 | epoch_ended = t==local_steps_per_epoch-1 292 | 293 | if terminal or epoch_ended: 294 | if epoch_ended and not(terminal): 295 | print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True) 296 | # if trajectory didn't reach terminal state, bootstrap value target 297 | if timeout or epoch_ended: 298 | _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32).to('cuda')) 299 | else: 300 | v = 0 301 | buf.finish_path(v) 302 | if terminal: 303 | # only save EpRet / EpLen if trajectory finished 304 | logger.store(EpRet=ep_ret, EpLen=ep_len) 305 | o, ep_ret, ep_len = env.reset(), 0, 0 306 | 307 | 308 | # Save model 309 | if (epoch % save_freq == 0) or (epoch == epochs-1): 310 | logger.save_state({'env': env}, None) 311 | 312 | # Perform VPG update! 313 | update() 314 | 315 | # Log info about epoch 316 | logger.log_tabular('Epoch', epoch) 317 | logger.log_tabular('EpRet', with_min_and_max=True) 318 | logger.log_tabular('EpLen', average_only=True) 319 | logger.log_tabular('VVals', with_min_and_max=True) 320 | logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) 321 | logger.log_tabular('LossPi', average_only=True) 322 | logger.log_tabular('LossV', average_only=True) 323 | logger.log_tabular('DeltaLossPi', average_only=True) 324 | logger.log_tabular('DeltaLossV', average_only=True) 325 | logger.log_tabular('Entropy', average_only=True) 326 | logger.log_tabular('KL', average_only=True) 327 | logger.log_tabular('Time', time.time()-start_time) 328 | logger.dump_tabular() 329 | 330 | if __name__ == '__main__': 331 | import argparse 332 | parser = argparse.ArgumentParser() 333 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 334 | parser.add_argument('--hid', type=int, default=64) 335 | parser.add_argument('--l', type=int, default=2) 336 | parser.add_argument('--gamma', type=float, default=0.99) 337 | parser.add_argument('--seed', '-s', type=int, default=0) 338 | parser.add_argument('--cpu', type=int, default=4) 339 | parser.add_argument('--steps', type=int, default=4000) 340 | parser.add_argument('--epochs', type=int, default=50) 341 | parser.add_argument('--exp_name', type=str, default='vpg') 342 | args = parser.parse_args() 343 | 344 | #mpi_fork(args.cpu) # run parallel code with mpi 345 | 346 | from Code.utils.run_utils import setup_logger_kwargs 347 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 348 | env = MicrogridEnv() 349 | vpg(lambda : env, actor_critic=core.MLPActorCritic, 350 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma, 351 | seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs, 352 | logger_kwargs=logger_kwargs) 
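Both PPOBuffer.finish_path and VPGBuffer.finish_path above reduce the GAE-Lambda advantages and the rewards-to-go to discounted cumulative sums. The following is a minimal, self-contained sketch of that calculation, using plain NumPy in place of the repository's core.discount_cumsum helper and made-up reward/value numbers purely for illustration:

```python
import numpy as np

def discount_cumsum(x, discount):
    # out[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

gamma, lam = 0.99, 0.97            # the defaults passed in by ppo() and vpg() above
rews = np.array([1.0, 0.5, 2.0])   # toy rewards along one trajectory
vals = np.array([0.8, 0.9, 1.1])   # toy value estimates V(s_t) for the same states
last_val = 0.0                     # 0 because the trajectory ended in a terminal state

r = np.append(rews, last_val)
v = np.append(vals, last_val)

# TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
deltas = r[:-1] + gamma * v[1:] - v[:-1]

adv = discount_cumsum(deltas, gamma * lam)   # GAE-Lambda advantage estimates
ret = discount_cumsum(r, gamma)[:-1]         # rewards-to-go (value-function targets)
print(adv, ret)
```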
--------------------------------------------------------------------------------
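Finally, a standalone sketch of the clipped-surrogate policy loss computed by compute_loss_pi in main_PPO.py, with made-up log-probabilities and advantages standing in for a real minibatch:

```python
import torch

clip_ratio = 0.2  # the default used by ppo() above

# Toy per-sample data: log pi_old(a|s), log pi_new(a|s), and advantage estimates.
logp_old = torch.tensor([-1.20, -0.80, -2.10])
logp     = torch.tensor([-1.00, -0.95, -1.60])
adv      = torch.tensor([ 0.50, -0.30,  1.20])

ratio = torch.exp(logp - logp_old)                         # pi_new(a|s) / pi_old(a|s)
clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()       # PPO-clip objective (minimized)

approx_kl = (logp_old - logp).mean().item()                # early-stopping diagnostic
clipfrac = (ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)).float().mean().item()
print(loss_pi.item(), approx_kl, clipfrac)
```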