├── example_video
│   ├── ep1.mp4
│   ├── ep2.mp4
│   ├── ep3.mp4
│   ├── ep4.mp4
│   └── ep5.mp4
├── demo.py
├── README.md
├── utils.py
└── modelbased.py

/example_video/ep1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielwillemsen/PendulumDemo/HEAD/example_video/ep1.mp4
--------------------------------------------------------------------------------
/example_video/ep2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielwillemsen/PendulumDemo/HEAD/example_video/ep2.mp4
--------------------------------------------------------------------------------
/example_video/ep3.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielwillemsen/PendulumDemo/HEAD/example_video/ep3.mp4
--------------------------------------------------------------------------------
/example_video/ep4.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielwillemsen/PendulumDemo/HEAD/example_video/ep4.mp4
--------------------------------------------------------------------------------
/example_video/ep5.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielwillemsen/PendulumDemo/HEAD/example_video/ep5.mp4
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
import gym
from modelbased import ModelAgent
import random
import numpy as np
import torch
from gym import wrappers


def scale_action(env, action):
    # Map the agent's action in [0, 1] (sigmoid output) to the env's action range.
    return (env.action_space.high - env.action_space.low)*action + env.action_space.low


def run_episode(env, agent):
    obs = env.reset()
    reward_tot = 0.0
    done = False
    reward = 0.0
    while not done:
        action = scale_action(env, agent.step(obs, reward))
        obs, reward, done, _ = env.step(action)
        reward_tot += reward
        env.render()
    # Let the agent store the final transition, then clear its per-episode state.
    agent.step(obs, reward)
    agent.reset()
    return reward_tot

if __name__ == '__main__':
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    env_to_wrap = gym.make("Pendulum-v0")
    env = wrappers.Monitor(env_to_wrap, 'logging/', force=True, video_callable=lambda episode_id: True)

    agent = ModelAgent(env.observation_space.shape[0], env.action_space.shape[0])

    for i in range(10):
        reward_tot = run_episode(env, agent)
        print("Episode: ", i+1, "---", "Total Reward: ", reward_tot)
    env.close()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PendulumDemo
Model-Based RL Demo for Pendulum-v0.

This demo is very roughly inspired by the following paper by Janner et al.: When to Trust Your Model: Model-Based Policy Optimization (https://arxiv.org/abs/1906.08253).

Known requirements:
OpenAI Gym (pip install gym),
torch (pip install torch)

Note: the demo uses the old Gym API (Pendulum-v0, gym.wrappers.Monitor), so an older gym release is required.

To run:
python3 demo.py

This will save videos of each episode in the "logging" folder.

Additional information:
The default settings use 200 gradient steps every 51 environment steps and an ensemble of 25 models. On my laptop, this means each episode takes roughly 10 minutes of compute. To speed up training, these numbers can be reduced, though possibly at the cost of sample efficiency; the parameters have not been tuned thoroughly.
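
If you want to cut down the computation, one quick (untested) option is to override these settings after constructing the agent in demo.py; `update_steps`, `update_every_n_steps` and the model list are plain attributes of `ModelAgent`. A rough sketch, where the smaller numbers are arbitrary examples rather than tuned values:

```python
# Hypothetical tweak in demo.py, right after the agent is created.
agent = ModelAgent(env.observation_space.shape[0], env.action_space.shape[0])
agent.update_steps = 50                                # default: 200 gradient updates per update phase
agent.update_every_n_steps = 51                        # default: update every 51 environment steps
agent.models = agent.models[:5]                        # default: ensemble of 25 models
agent.optimizer_models = agent.optimizer_models[:5]    # keep the optimizers in sync with the models
```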

Disclaimer: this code is just a small demonstration for now. There may well be mistakes in the code, and documentation is nearly non-existent. Questions and/or contributions are always welcome, though! After a few additional runs, I have also noticed that performance is rather inconsistent: sometimes the agent learns to balance in around 3 episodes, and sometimes it learns nothing even after 5 episodes.

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import torch
import random

from torch import nn


class ReplayBuffer():
    def __init__(self):
        self.buffer = []
        self.n_samples = 128
        self.max_size = 1000000

    def len(self):
        return len(self.buffer)

    def add(self, sample):
        self.buffer.append(sample)
        # Drop the oldest sample once the buffer is full.
        if len(self.buffer) > self.max_size:
            del self.buffer[0]

    def sample(self):
        samples = random.choices(self.buffer, k=self.n_samples)
        data = [*zip(*samples)]
        data_dict = {"o": data[0], "a": data[1], "r": data[2], "o_next": data[3], "done": data[4]}
        return data_dict

    def sample_tensors(self, n=128):
        samples = random.choices(self.buffer, k=n)
        data = [*zip(*samples)]
        data_dict = {"o": torch.stack(data[0]), "a": torch.stack(data[1]), "r": torch.stack(data[2]), "o_next": torch.stack(data[3]), "done": torch.stack(data[4])}
        return data_dict


class ActorCritic(nn.Module):
    def __init__(self, obs_dim, action_dim, hidden_dims_actor=(164, 164), hidden_dims_critic=(164, 164)):
        super().__init__()
        self.actor = Actor(obs_dim, hidden_dims_actor, action_dim)
        self.critic = Critic(obs_dim + action_dim, hidden_dims_critic)


class Actor(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim):
        super().__init__()
        layers = []
        layers += [nn.Linear(input_dim, hidden_dims[0]), nn.ReLU()]
        for i in range(len(hidden_dims)-1):
            layers += [nn.Linear(hidden_dims[i], hidden_dims[i+1]), nn.ReLU()]
        # The sigmoid keeps actions in [0, 1]; demo.py rescales them to the env's action range.
        layers += [nn.Linear(hidden_dims[-1], output_dim), nn.Sigmoid()]
        self.net = nn.Sequential(*layers)

    def forward(self, observation):
        return self.net(observation)


class Critic(nn.Module):
    def __init__(self, input_dim, hidden_dims):
        super().__init__()
        layers = []
        layers += [nn.Linear(input_dim, hidden_dims[0]), nn.ReLU()]
        for i in range(len(hidden_dims) - 1):
            layers += [nn.Linear(hidden_dims[i], hidden_dims[i+1]), nn.ReLU()]
        layers += [nn.Linear(hidden_dims[-1], 1), nn.Identity()]
        self.net = nn.Sequential(*layers)

    def forward(self, observation, action):
        x = torch.cat([observation, action], dim=-1)
        return self.net(x)


class Model(nn.Module):
    """Model.

    Contains a probabilistic world model. The forward pass outputs two lists: the first
    holds the mean and standard deviation of the next observation, the second holds the
    mean and standard deviation of the reward.
    """
    def __init__(self, input_dim, hidden_dims, obs_dim):
        """__init__.

        :param input_dim: dimension of the concatenated observation and action
        :param hidden_dims: sizes of the hidden layers
        :param obs_dim: dimension of the predicted observation
        """
        super().__init__()
        layers = []
        layers += [nn.Linear(input_dim, hidden_dims[0]), nn.ReLU()]
        for i in range(len(hidden_dims) - 1):
            layers += [nn.Linear(hidden_dims[i], hidden_dims[i+1]), nn.ReLU()]
        self.net = nn.Sequential(*layers)
        self.mu_output = nn.Linear(hidden_dims[-1], obs_dim)
        self.sigma_output = nn.Linear(hidden_dims[-1], obs_dim)
        self.mu_reward = nn.Linear(hidden_dims[-1], 1)
        self.sigma_reward = nn.Linear(hidden_dims[-1], 1)

    def forward(self, observation, action):
        x = torch.cat([observation, action], dim=-1)
        x = self.net(x)
        # exp() keeps the predicted standard deviations positive.
        return [self.mu_output(x), torch.exp(self.sigma_output(x))], [self.mu_reward(x)*5, torch.exp(self.sigma_reward(x))]

    def sample(self, observation, action):
        with torch.no_grad():
            new_o, r = self.forward(observation, action)
            new_o = torch.normal(new_o[0], new_o[1])
            r = torch.normal(r[0], r[1])
            return new_o, r

--------------------------------------------------------------------------------
/modelbased.py:
--------------------------------------------------------------------------------
import time
import torch
import copy
import numpy as np
from utils import ReplayBuffer
from utils import Critic
from utils import Actor
from utils import Model


class ModelAgent:
    def __init__(self, obs_dim, action_dim, *args, **kwargs):
        # Initialize arguments
        hidden_dims_actor = tuple(kwargs.get("hidden_dims_actor",
                                             (256, 256)))
        hidden_dims_critic = tuple(kwargs.get("hidden_dims_critic",
                                              (256, 256)))
        hidden_dims_model = tuple(kwargs.get("hidden_dims_model",
                                             (256, 256)))

        self.gamma = 0.99
        self.tau = 0.005
        self.delay = 2
        lr_actor = 0.001
        lr_critic = 0.001
        lr_model = 0.0001
        self.step_random = 500  # How many random actions to take before using actor for action selection
        self.update_every_n_steps = 51  # How often to update model, actor and critics
        self.update_steps = 200  # How many gradient updates to perform, per model, when updating
        self.time = time.time()

        # Initialize actor
        self.actor = Actor(obs_dim, hidden_dims_actor, action_dim)
        self.actor_target = copy.deepcopy(self.actor)
        self.optimizer_actor = torch.optim.Adam(self.actor.parameters(),
                                                lr=lr_actor)
        for par in self.actor_target.parameters():
            par.requires_grad = False

        # Initialize 2 critics
        self.critics = []
        self.critics_target = []
        self.optimizer_critics = []
        for k in range(2):
            critic = Critic(obs_dim + action_dim, hidden_dims_critic)
            self.critics.append(critic)
            self.critics_target.append(copy.deepcopy(critic))
            self.optimizer_critics.append(torch.optim.Adam(critic.parameters(),
                                                           lr=lr_critic))

            for par in self.critics_target[k].parameters():
                par.requires_grad = False

        # Initialize models
        self.models = []
        self.optimizer_models = []
        for k in range(25):
            model = Model(obs_dim + action_dim, hidden_dims_model, obs_dim)
            self.models.append(model)
            self.optimizer_models.append(torch.optim.Adam(model.parameters(),
                                                          lr=lr_model))

        # Setup Replay Buffer
        self.buffer = ReplayBuffer()
        self.o_old = None
        self.a_old = None

        self.step_i = 0

    def reset(self):
        self.o_old = None
        self.a_old = None

    def loss_critic(self, val, target):
        # Plain mean-squared error between the critic value and the TD target.
        diffs = target - val
        return torch.mean(diffs**2)

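    # step() is called once per environment time step (see demo.py). It stores the latest
    # transition in the replay buffer, periodically trains the model ensemble, the critics
    # and the actor, and returns an action in [0, 1] that demo.py rescales to the
    # environment's action range.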
    def step(self, o, r, eval=False, done=False):
        o = torch.Tensor(o)
        r = torch.Tensor(np.array(float(r)))
        done = torch.Tensor(np.array(float(done)))
        if not eval:
            # Add to replay buffer
            if self.o_old is not None:
                self.buffer.add((self.o_old, self.a_old, r, o, done))

            if self.step_i % self.update_every_n_steps == 0 and self.buffer.len() > self.buffer.n_samples:
                print("Performing update steps...")
                for step in range(self.update_steps):
                    # Train Model
                    self.update_models()

                    # Train actor and critics.
                    # Using one minibatch of samples from every model.
                    for model in self.models:
                        b = self.sample_from_model(model)

                        # Update Critic
                        self.update_critics(b)

                        # Update Actor
                        if self.step_i % self.delay == 0:
                            self.update_actor(b)

                            # Update Target Networks
                            self.update_target_networks()

            # Select Action
            with torch.no_grad():
                action = self.actor(o.unsqueeze(0)).squeeze()
                # Add exploration noise and clamp to the actor's [0, 1] output range.
                action_noisy = action + torch.randn(action.size())*0.3
                action = torch.clamp(action_noisy, 0., 1.0)
                # Act uniformly at random during the first step_random steps.
                if self.step_i < self.step_random:
                    action = torch.rand(action_noisy.shape)
                self.o_old = o
                if action.size() == torch.Size([]):
                    self.a_old = action.unsqueeze(0)
                else:
                    self.a_old = action
            self.step_i += 1
        else:
            action = self.actor(o.unsqueeze(0)).squeeze()
            action = torch.clamp(action, 0., 1.0)

        return action.detach().numpy()

    def update_target_networks(self):
        with torch.no_grad():
            for par, par_target in zip(self.actor.parameters(), self.actor_target.parameters()):
                par_target.data.copy_(
                    (1 - self.tau) * par_target + self.tau * par.data)
            for k in range(2):
                for par, par_target in zip(self.critics[k].parameters(), self.critics_target[k].parameters()):
                    par_target.data.copy_(
                        (1 - self.tau) * par_target + self.tau * par.data)

    def update_actor(self, b):
        # Freeze the critic while differentiating through it with respect to the actor.
        for par in self.critics[0].parameters():
            par.requires_grad = False
        self.optimizer_actor.zero_grad()
        loss_actor = -torch.mean(self.critics[0](b["o"], self.actor(b["o"])))
        loss_actor.backward()
        self.optimizer_actor.step()
        for par in self.critics[0].parameters():
            par.requires_grad = True

    def update_critics(self, b):
        with torch.no_grad():
            a_target = self.actor_target(b["o_next"])
            a_target = torch.clamp(
                a_target + torch.clamp(torch.randn(a_target.size()) * 0.1, -0.5, 0.5), 0., 1.)
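            # TD3-style target: a smoothed target action (clipped noise added above), the
            # minimum over the two target critics, and bootstrapping only on non-terminal
            # transitions.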
            y = b["r"].unsqueeze(-1) + (1 - b["done"]).unsqueeze(-1) * self.gamma * torch.min(
                *[critic_target(b["o_next"], a_target) for critic_target in self.critics_target])
        for optimizer, critic in zip(self.optimizer_critics, self.critics):
            loss = self.loss_critic(critic(b["o"], b["a"]), y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    def sample_from_model(self, model):
        # Sample a minibatch of real observations from the buffer, then build an
        # "imagined" transition: actions come from the current policy (with exploration
        # noise), next observations and rewards are sampled from the given world model.
        b = self.buffer.sample_tensors(n=128)
        with torch.no_grad():
            action = self.actor(b["o"])
            action_noisy = action + torch.randn(action.size()) * 0.3
            b["a"] = torch.clamp(action_noisy, 0., 1.0)
            new_o, r = model.sample(b["o"], b["a"])
            b["o_next"] = new_o
            b["r"] = r.squeeze()
        return b

    def update_models(self):
        # One gradient step for every model in the ensemble, each on the same real minibatch.
        samples = self.buffer.sample_tensors()
        for optim, model in zip(self.optimizer_models, self.models):
            self.model_step(model, optim, samples)

    def model_step(self, model, optim, samples):
        # Do one gradient update for a model, using a Gaussian negative-log-likelihood-style
        # loss (squared error weighted by the predicted variance, plus log-variance terms)
        # for both the next observation and the reward.
        o_next_pred, r_pred = model(samples["o"], samples["a"])
        sigma = o_next_pred[1]
        sigma_2 = r_pred[1]
        mu = o_next_pred[0]
        target = samples["o_next"]
        loss1 = torch.mean((mu - target) / sigma ** 2 * (mu - target))
        loss2 = torch.mean(torch.log(torch.prod(sigma ** 2, 1) * torch.prod(sigma_2 ** 2, 1)))
        mu = r_pred[0]
        target = samples["r"].unsqueeze(1)
        loss3 = torch.mean((mu - target) / sigma_2 ** 2 * (mu - target))
        loss = loss1 + loss2 + loss3
        optim.zero_grad()
        loss.backward()
        optim.step()

--------------------------------------------------------------------------------